author    Alexander Graf <agraf@suse.de>	2013-08-28 18:41:59 -0400
committer Alexander Graf <agraf@suse.de>	2013-08-28 18:41:59 -0400
commit    bf550fc93d9855872a95e69e4002256110d89858
tree      10876bb4304bffe54c4160a132e7b8de6577ac4e /mm
parent    7e48c101e0c53e6095c5f4f5e63d14df50aae8fc
parent    cc2df20c7c4ce594c3e17e9cc260c330646012c8

Merge remote-tracking branch 'origin/next' into kvm-ppc-next

Conflicts:
	mm/Kconfig

CMA DMA split and ZSWAP introduction were conflicting, fix up manually.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             42
-rw-r--r--  mm/Makefile             2
-rw-r--r--  mm/backing-dev.c        5
-rw-r--r--  mm/bootmem.c           39
-rw-r--r--  mm/filemap.c            6
-rw-r--r--  mm/huge_memory.c       30
-rw-r--r--  mm/hugetlb.c            4
-rw-r--r--  mm/internal.h           5
-rw-r--r--  mm/memblock.c           2
-rw-r--r--  mm/memcontrol.c       363
-rw-r--r--  mm/memory-failure.c    22
-rw-r--r--  mm/memory.c            15
-rw-r--r--  mm/memory_hotplug.c   139
-rw-r--r--  mm/mm_init.c           47
-rw-r--r--  mm/mmap.c              40
-rw-r--r--  mm/mmu_notifier.c       2
-rw-r--r--  mm/mremap.c            20
-rw-r--r--  mm/nobootmem.c         35
-rw-r--r--  mm/nommu.c             10
-rw-r--r--  mm/page_alloc.c       384
-rw-r--r--  mm/page_io.c           50
-rw-r--r--  mm/pgtable-generic.c    5
-rw-r--r--  mm/rmap.c               9
-rw-r--r--  mm/shmem.c             16
-rw-r--r--  mm/slab.c              51
-rw-r--r--  mm/slab.h               3
-rw-r--r--  mm/slab_common.c       18
-rw-r--r--  mm/slob.c               4
-rw-r--r--  mm/slub.c              38
-rw-r--r--  mm/sparse.c             8
-rw-r--r--  mm/swap.c             106
-rw-r--r--  mm/swapfile.c          55
-rw-r--r--  mm/util.c               1
-rw-r--r--  mm/vmalloc.c          164
-rw-r--r--  mm/vmscan.c           605
-rw-r--r--  mm/zbud.c             527
-rw-r--r--  mm/zswap.c            943
37 files changed, 2862 insertions, 953 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 81bcb4bd422d..6cdd27043303 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -501,3 +501,45 @@ config CMA_DEBUG
 	  messages for every CMA call as well as various messages while
 	  processing calls such as dma_alloc_from_contiguous().
 	  This option does not affect warning and error messages.
+
+config ZBUD
+	tristate
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to two compressed pages per physical
+	  page.  While this design limits storage density, it has simple and
+	  deterministic reclaim properties that make it preferable to a higher
+	  density approach when reclaim will be used.
+
+config ZSWAP
+	bool "Compressed cache for swap pages (EXPERIMENTAL)"
+	depends on FRONTSWAP && CRYPTO=y
+	select CRYPTO_LZO
+	select ZBUD
+	default n
+	help
+	  A lightweight compressed cache for swap pages.  It takes
+	  pages that are in the process of being swapped out and attempts to
+	  compress them into a dynamically allocated RAM-based memory pool.
+	  This can result in a significant I/O reduction on swap device and,
+	  in the case where decompressing from RAM is faster that swap device
+	  reads, can also improve workload performance.
+
+	  This is marked experimental because it is a new feature (as of
+	  v3.11) that interacts heavily with memory reclaim.  While these
+	  interactions don't cause any known issues on simple memory setups,
+	  they have not be fully explored on the large set of potential
+	  configurations and workloads that exist.
+
+config MEM_SOFT_DIRTY
+	bool "Track memory changes"
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	select PROC_PAGE_MONITOR
+	help
+	  This option enables memory changes tracking by introducing a
+	  soft-dirty bit on pte-s. This bit it set when someone writes
+	  into a page just as regular dirty bit, but unlike the latter
+	  it can be cleared by hands.
+
+	  See Documentation/vm/soft-dirty.txt for more details.
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
+obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
@@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD)	+= zbud.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
			   unsigned int cap)
 {
-	char tmp[32];
 	int err;
 
 	bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
 	if (err)
 		return err;
 
-	sprintf(tmp, "%.28s%s", name, "-%d");
-	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+			   atomic_long_inc_return(&bdi_seq));
 	if (err) {
 		bdi_destroy(bdi);
 		return err;
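The hunk above drops the intermediate tmp buffer and instead hands a complete format string ("%.28s-%ld") plus its arguments straight to bdi_register(). Below is a minimal user-space sketch of the same format-forwarding idea; register_name() is a hypothetical stand-in for bdi_register(), not kernel API.

#include <stdarg.h>
#include <stdio.h>

/* Hypothetical printf-style consumer standing in for bdi_register(). */
static int register_name(const char *fmt, ...)
{
	char buf[64];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	return puts(buf) < 0 ? -1 : 0;
}

int main(void)
{
	long seq = 42;
	char tmp[32];

	/* Old pattern: pre-build "loop-%d" and pass it along as a format string. */
	sprintf(tmp, "%.28s%s", "loop", "-%d");
	register_name(tmp, (int)seq);		/* prints "loop-42" */

	/* New pattern: one call, one format, all arguments forwarded directly;
	 * "%.28s" keeps the name bounded and "%ld" matches the long counter. */
	register_name("%.28s-%ld", "loop", seq);
	return 0;
}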
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
+
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+void __init reset_all_zones_managed_pages(void)
 {
-	register_page_bootmem_info_node(pgdat);
-	reset_node_lowmem_managed_pages(pgdat);
-	return free_all_bootmem_core(pgdat->bdata);
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
 {
 	unsigned long total_pages = 0;
 	bootmem_data_t *bdata;
-	struct pglist_data *pgdat;
 
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	list_for_each_entry(bdata, &bdata_list, list)
		total_pages += free_all_bootmem_core(bdata);
 
+	totalram_pages += total_pages;
+
 	return total_pages;
 }
 
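The new reset_managed_pages_done flag makes the managed_pages reset idempotent, so it is safe to reach it from more than one initialization path. A toy user-space model of that run-once reset follows; zone_t, zones[] and reset_done are illustrative stand-ins, not kernel structures.

#include <stdio.h>

typedef struct { unsigned long managed_pages; } zone_t;

#define NZONES 4
static zone_t zones[NZONES] = { {10}, {20}, {30}, {40} };

static int reset_done;	/* plays the role of reset_managed_pages_done */

static void reset_all_managed_pages(void)
{
	if (reset_done)			/* later calls become no-ops */
		return;
	for (int i = 0; i < NZONES; i++)
		zones[i].managed_pages = 0;
	reset_done = 1;
}

int main(void)
{
	reset_all_managed_pages();	/* zeroes every zone once */
	zones[0].managed_pages = 5;	/* re-counted by the "buddy" side */
	reset_all_managed_pages();	/* must not wipe the new count */
	printf("%lu\n", zones[0].managed_pages);	/* prints 5 */
	return 0;
}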
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe721aa8..4b51ac1acae7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (!ra->ra_pages)
 		return;
 
-	if (VM_SequentialReadHint(vma)) {
+	if (vma->vm_flags & VM_SEQ_READ) {
 		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
 		return;
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (ra->mmap_miss > 0)
 		ra->mmap_miss--;
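For reference, the wrappers being removed here are understood to have been plain flag tests in include/linux/mm.h along the following lines (reconstructed from memory, not shown in this diff), which is why open-coding them is a behaviour-neutral cleanup:

#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)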
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..243e710c6039 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pmd_t entry;
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
-		pgtable_trans_huge_deposit(mm, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	mm->nr_ptes++;
 	return true;
 }
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
+	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		 * young bit, instead of the current set_pmd_at.
 		 */
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+					  pmd, _pmd,  1))
+			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+		/*
+		 * For architectures like ppc64 we look at deposited pgtable
+		 * when calling pmdp_get_and_clear. So do the
+		 * pgtable_trans_huge_withdraw after finishing pmdp related
+		 * operations.
+		 */
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
@@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 	if (ret == 1) {
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd);
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 		spin_unlock(&mm->page_table_lock);
 	}
 out:
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = pgtable_trans_huge_withdraw(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 		pmd_populate(mm, &_pmd, pgtable);
 
 		haddr = address;
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(&mm->page_table_lock);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
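All of these hunks enforce one ordering rule: deposit the preallocated page table before publishing the huge pmd with set_pmd_at(), and withdraw it only after pmdp_get_and_clear()/TLB teardown, because ppc64 consults the deposited table from its pmdp helpers. A toy model of that contract is sketched below; pmd_slot, deposit() and withdraw() are illustrative stand-ins, not the kernel interfaces.

#include <assert.h>
#include <stddef.h>

struct pmd_slot {
	int huge_mapped;	/* stands in for the installed huge pmd */
	void *deposited;	/* stands in for the per-pmd deposited page table */
};

static void deposit(struct pmd_slot *pmd, void *pgtable)
{
	pmd->deposited = pgtable;
}

static void *withdraw(struct pmd_slot *pmd)
{
	void *p = pmd->deposited;

	pmd->deposited = NULL;
	return p;
}

static void map_huge(struct pmd_slot *pmd, void *pgtable)
{
	deposit(pmd, pgtable);		/* new order: deposit first ... */
	pmd->huge_mapped = 1;		/* ... then publish the mapping */
}

static void *unmap_huge(struct pmd_slot *pmd)
{
	/* an arch like ppc64 may still look at pmd->deposited right here */
	assert(pmd->deposited != NULL);
	pmd->huge_mapped = 0;		/* pmdp_get_and_clear() analogue */
	return withdraw(pmd);		/* withdraw only afterwards */
}

int main(void)
{
	struct pmd_slot pmd = { 0, NULL };
	int pgtable;

	map_huge(&pmd, &pgtable);
	return unmap_huge(&pmd) == &pgtable ? 0 : 1;
}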
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
 
 	hstate = hstate_vma(vma);
 
-	return 1UL << (hstate->order + PAGE_SHIFT);
+	return 1UL << huge_page_shift(hstate);
 }
 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
 
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
 		 * side-effects, like CommitLimit going negative.
 		 */
 		if (h->order > (MAX_ORDER - 1))
-			totalram_pages += 1 << h->order;
+			adjust_managed_page_count(page, 1 << h->order);
 	}
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0a5197..4390ac6c106e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page)
 	set_page_count(page, 1);
 }
 
-static inline void __put_page(struct page *page)
-{
-	atomic_dec(&page->_count);
-}
-
 static inline void __get_page_tail_foll(struct page *page,
					bool get_page_head)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad932fa51..a847bfe6f3ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %MAX_NUMNODES for all nodes
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @out_nid: ptr to int for nid of the range, can be %NULL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..d12ca6f3c293 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-struct mem_cgroup_lru_info {
-	struct mem_cgroup_per_node *nodeinfo[0];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -267,28 +263,10 @@ struct mem_cgroup {
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
 
-	union {
-		/*
-		 * the counter to account for mem+swap usage.
-		 */
-		struct res_counter memsw;
-
-		/*
-		 * rcu_freeing is used only when freeing struct mem_cgroup,
-		 * so put it into a union to avoid wasting more memory.
-		 * It must be disjoint from the css field.  It could be
-		 * in a union with the res field, but res plays a much
-		 * larger part in mem_cgroup life than memsw, and might
-		 * be of interest, even at time of free, when debugging.
-		 * So share rcu_head with the less interesting memsw.
-		 */
-		struct rcu_head rcu_freeing;
-		/*
-		 * We also need some space for a worker in deferred freeing.
-		 * By the time we call it, rcu_freeing is no longer in use.
-		 */
-		struct work_struct work_freeing;
-	};
+	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
 
 	/*
	 * the counter to account for kernel memory usage.
@@ -303,8 +281,6 @@ struct mem_cgroup {
 	bool		oom_lock;
 	atomic_t	under_oom;
 
-	atomic_t	refcnt;
-
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -366,14 +342,8 @@ struct mem_cgroup {
 	atomic_t	numainfo_updating;
 #endif
 
-	/*
-	 * Per cgroup active and inactive list, similar to the
-	 * per zone LRU lists.
-	 *
-	 * WARNING: This has to be the last element of the struct. Don't
-	 * add new fields after this point.
-	 */
-	struct mem_cgroup_lru_info info;
+	struct mem_cgroup_per_node *nodeinfo[0];
+	/* WARNING: nodeinfo must be the last member here */
 };
 
 static size_t memcg_size(void)
@@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
+	/*
+	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
+	 * will call css_put() if it sees the memcg is dead.
+	 */
+	smp_wmb();
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 }
@@ -508,9 +483,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
@@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk)
 		 */
 		if (sk->sk_cgrp) {
 			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-			mem_cgroup_get(sk->sk_cgrp->memcg);
+			css_get(&sk->sk_cgrp->memcg->css);
 			return;
 		}
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
-			mem_cgroup_get(memcg);
+		if (!mem_cgroup_is_root(memcg) &&
+		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
@@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk)
 		struct mem_cgroup *memcg;
 		WARN_ON(!sk->sk_cgrp->memcg);
 		memcg = sk->sk_cgrp->memcg;
-		mem_cgroup_put(memcg);
+		css_put(&sk->sk_cgrp->memcg->css);
 	}
 }
 
@@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
-	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -1148,6 +1120,58 @@ skip_node:
 	return NULL;
 }
 
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+{
+	/*
+	 * When a group in the hierarchy below root is destroyed, the
+	 * hierarchy iterator can no longer be trusted since it might
+	 * have pointed to the destroyed group.  Invalidate it.
+	 */
+	atomic_inc(&root->dead_count);
+}
+
+static struct mem_cgroup *
+mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+		     struct mem_cgroup *root,
+		     int *sequence)
+{
+	struct mem_cgroup *position = NULL;
+	/*
+	 * A cgroup destruction happens in two stages: offlining and
+	 * release.  They are separated by a RCU grace period.
+	 *
+	 * If the iterator is valid, we may still race with an
+	 * offlining.  The RCU lock ensures the object won't be
+	 * released, tryget will fail if we lost the race.
+	 */
+	*sequence = atomic_read(&root->dead_count);
+	if (iter->last_dead_count == *sequence) {
+		smp_rmb();
+		position = iter->last_visited;
+		if (position && !css_tryget(&position->css))
+			position = NULL;
+	}
+	return position;
+}
+
+static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+				   struct mem_cgroup *last_visited,
+				   struct mem_cgroup *new_position,
+				   int sequence)
+{
+	if (last_visited)
+		css_put(&last_visited->css);
+	/*
+	 * We store the sequence count from the time @last_visited was
+	 * loaded successfully instead of rereading it here so that we
+	 * don't lose destruction events in between.  We could have
+	 * raced with the destruction of @new_position after all.
+	 */
+	iter->last_visited = new_position;
+	smp_wmb();
+	iter->last_dead_count = sequence;
+}
+
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1171,7 +1195,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
-	unsigned long uninitialized_var(dead_count);
 
 	if (mem_cgroup_disabled())
		return NULL;
@@ -1191,6 +1214,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	rcu_read_lock();
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+		int uninitialized_var(seq);
 
 		if (reclaim) {
			int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1228,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				goto out_unlock;
			}
 
-			/*
-			 * If the dead_count mismatches, a destruction
-			 * has happened or is happening concurrently.
-			 * If the dead_count matches, a destruction
-			 * might still happen concurrently, but since
-			 * we checked under RCU, that destruction
-			 * won't free the object until we release the
-			 * RCU reader lock.  Thus, the dead_count
-			 * check verifies the pointer is still valid,
-			 * css_tryget() verifies the cgroup pointed to
-			 * is alive.
-			 */
-			dead_count = atomic_read(&root->dead_count);
-			if (dead_count == iter->last_dead_count) {
-				smp_rmb();
-				last_visited = iter->last_visited;
-				if (last_visited &&
-				    !css_tryget(&last_visited->css))
-					last_visited = NULL;
-			}
+			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
 		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
-			if (last_visited)
-				css_put(&last_visited->css);
-
-			iter->last_visited = memcg;
-			smp_wmb();
-			iter->last_dead_count = dead_count;
+			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
 
			if (!memcg)
				iter->generation++;
@@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 	return ret;
 }
 
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+			const struct mem_cgroup *memcg)
 {
-	int ret;
 	struct mem_cgroup *curr = NULL;
 	struct task_struct *p;
+	bool ret;
 
 	p = find_lock_task_mm(task);
 	if (p) {
@@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
-		task_lock(task);
+		rcu_read_lock();
		curr = mem_cgroup_from_task(task);
		if (curr)
			css_get(&curr->css);
-		task_unlock(task);
+		rcu_read_unlock();
 	}
 	if (!curr)
-		return 0;
+		return false;
 	/*
	 * We should check use_hierarchy of "memcg" not "curr". Because checking
	 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -3031,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 	if (res_counter_uncharge(&memcg->kmem, size))
		return;
 
+	/*
+	 * Releases a reference taken in kmem_cgroup_css_offline in case
+	 * this last uncharge is racing with the offlining code or it is
+	 * outliving the memcg existence.
+	 *
+	 * The memory barrier imposed by test&clear is paired with the
+	 * explicit one in memcg_kmem_mark_dead().
+	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
@@ -3223,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s)
 	list_del(&s->memcg_params->list);
 	mutex_unlock(&memcg->slab_caches_mutex);
 
-	mem_cgroup_put(memcg);
+	css_put(&memcg->css);
 out:
 	kfree(s->memcg_params);
 }
@@ -3383,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	mutex_lock(&memcg_cache_mutex);
 	new_cachep = cachep->memcg_params->memcg_caches[idx];
-	if (new_cachep)
+	if (new_cachep) {
+		css_put(&memcg->css);
		goto out;
+	}
 
 	new_cachep = kmem_cache_dup(memcg, cachep);
 	if (new_cachep == NULL) {
		new_cachep = cachep;
+		css_put(&memcg->css);
		goto out;
 	}
 
-	mem_cgroup_get(memcg);
 	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
 
 	cachep->memcg_params->memcg_caches[idx] = new_cachep;
@@ -3480,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
 
 	cw = container_of(w, struct create_work, work);
 	memcg_create_kmem_cache(cw->memcg, cw->cachep);
-	/* Drop the reference gotten when we enqueued. */
-	css_put(&cw->memcg->css);
 	kfree(cw);
 }
 
@@ -3618,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 	int ret;
 
 	*_memcg = NULL;
+
+	/*
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such
+	 * check here, since direct calls to the page allocator that are marked
+	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
+	 * concerned with cache allocations, and by having this test at
+	 * memcg_kmem_get_cache, we are already able to relay the allocation to
+	 * the root cache and bypass the memcg cache altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather service large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 * memcg_stop_kmem_account();
+	 * kmalloc(<large_number>)
+	 * memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
+	 */
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return true;
+
 	memcg = try_get_mem_cgroup_from_mm(current->mm);
 
 	/*
@@ -4171,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 	unlock_page_cgroup(pc);
 	/*
	 * even after unlock, we have memcg->res.usage here and this memcg
-	 * will never be freed.
+	 * will never be freed, so it's safe to call css_get().
	 */
 	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		mem_cgroup_swap_statistics(memcg, true);
-		mem_cgroup_get(memcg);
+		css_get(&memcg->css);
 	}
 	/*
	 * Migration does not charge the res_counter for the
@@ -4288,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	/*
	 * record memcg information,  if swapout && memcg != NULL,
-	 * mem_cgroup_get() was called in uncharge().
+	 * css_get() was called in uncharge().
	 */
 	if (do_swap_account && swapout && memcg)
		swap_cgroup_record(ent, css_id(&memcg->css));
@@ -4319,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
		if (!mem_cgroup_is_root(memcg))
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_swap_statistics(memcg, false);
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 	}
 	rcu_read_unlock();
 }
@@ -4353,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
		 * This function is only called from task migration context now.
		 * It postpones res_counter and refcount handling till the end
		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone mem_cgroup_get(to)
-		 * because if the process that has been moved to @to does
-		 * swap-in, the refcount of @to might be decreased to 0.
+		 * improvement. But we cannot postpone css_get(to)  because if
+		 * the process that has been moved to @to does swap-in, the
+		 * refcount of @to might be decreased to 0.
+		 *
+		 * We are in attach() phase, so the cgroup is guaranteed to be
+		 * alive, so we can just call css_get().
		 */
-		mem_cgroup_get(to);
+		css_get(&to->css);
		return 0;
 	}
 	return -EINVAL;
@@ -5136,14 +5176,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
		 * starts accounting before all call sites are patched
		 */
		memcg_kmem_set_active(memcg);
-
-		/*
-		 * kmem charges can outlive the cgroup. In the case of slab
-		 * pages, for instance, a page contain objects from various
-		 * processes, so it is unfeasible to migrate them away. We
-		 * need to reference count the memcg because of that.
-		 */
-		mem_cgroup_get(memcg);
 	} else
		ret = res_counter_set_limit(&memcg->kmem, val);
 out:
@@ -5176,16 +5208,16 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
		goto out;
 
 	/*
-	 * destroy(), called if we fail, will issue static_key_slow_inc() and
-	 * mem_cgroup_put() if kmem is enabled. We have to either call them
-	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
-	 * this more consistent, since it always leads to the same destroy path
+	 * __mem_cgroup_free() will issue static_key_slow_dec() because this
+	 * memcg is active already. If the later initialization fails then the
+	 * cgroup core triggers the cleanup so we do not have to do it here.
	 */
-	mem_cgroup_get(memcg);
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 
 	mutex_lock(&set_limit_mutex);
+	memcg_stop_kmem_account();
 	ret = memcg_update_cache_sizes(memcg);
+	memcg_resume_kmem_account();
 	mutex_unlock(&set_limit_mutex);
 out:
 	return ret;
@@ -5864,23 +5896,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 	mem_cgroup_sockets_destroy(memcg);
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+{
+	if (!memcg_kmem_is_active(memcg))
+		return;
+
+	/*
+	 * kmem charges can outlive the cgroup. In the case of slab
+	 * pages, for instance, a page contain objects from various
+	 * processes. As we prevent from taking a reference for every
+	 * such allocation we have to be careful when doing uncharge
+	 * (see memcg_uncharge_kmem) and here during offlining.
+	 *
+	 * The idea is that that only the _last_ uncharge which sees
+	 * the dead memcg will drop the last reference. An additional
+	 * reference is taken here before the group is marked dead
+	 * which is then paired with css_put during uncharge resp. here.
+	 *
+	 * Although this might sound strange as this path is called from
+	 * css_offline() when the referencemight have dropped down to 0
+	 * and shouldn't be incremented anymore (css_tryget would fail)
+	 * we do not have other options because of the kmem allocations
+	 * lifetime.
+	 */
+	css_get(&memcg->css);
 
 	memcg_kmem_mark_dead(memcg);
 
 	if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
		return;
 
-	/*
-	 * Charges already down to 0, undo mem_cgroup_get() done in the charge
-	 * path here, being careful not to race with memcg_uncharge_kmem: it is
-	 * possible that the charges went down to 0 between mark_dead and the
-	 * res_counter read, so in that case, we don't need the put
-	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5888,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return 0;
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+{
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -6058,13 +6114,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
		mz->on_tree = false;
		mz->memcg = memcg;
 	}
-	memcg->info.nodeinfo[node] = pn;
+	memcg->nodeinfo[node] = pn;
 	return 0;
 }
 
 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
-	kfree(memcg->info.nodeinfo[node]);
+	kfree(memcg->nodeinfo[node]);
 }
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -6137,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	vfree(memcg);
 }
 
-
-/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context.  The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
- */
-static void free_work(struct work_struct *work)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	__mem_cgroup_free(memcg);
-}
-
-static void free_rcu(struct rcu_head *rcu_head)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, free_work);
-	schedule_work(&memcg->work_freeing);
-}
-
-static void mem_cgroup_get(struct mem_cgroup *memcg)
-{
-	atomic_inc(&memcg->refcnt);
-}
-
-static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
-{
-	if (atomic_sub_and_test(count, &memcg->refcnt)) {
-		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-		call_rcu(&memcg->rcu_freeing, free_rcu);
-		if (parent)
-			mem_cgroup_put(parent);
-	}
-}
-
-static void mem_cgroup_put(struct mem_cgroup *memcg)
-{
-	__mem_cgroup_put(memcg, 1);
-}
-
 /*
  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
  */
@@ -6239,7 +6252,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
@@ -6275,12 +6287,9 @@ mem_cgroup_css_online(struct cgroup *cont)
		res_counter_init(&memcg->kmem, &parent->kmem);
 
		/*
-		 * We increment refcnt of the parent to ensure that we can
-		 * safely access it on res_counter_charge/uncharge.
-		 * This refcnt will be decremented when freeing this
-		 * mem_cgroup(see mem_cgroup_put).
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
		 */
-		mem_cgroup_get(parent);
 	} else {
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
@@ -6296,16 +6305,6 @@ mem_cgroup_css_online(struct cgroup *cont)
 
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	mutex_unlock(&memcg_create_mutex);
-	if (error) {
-		/*
-		 * We call put now because our (and parent's) refcnts
-		 * are already in place. mem_cgroup_put() will internally
-		 * call __mem_cgroup_free, so return directly
-		 */
-		mem_cgroup_put(memcg);
-		if (parent->use_hierarchy)
-			mem_cgroup_put(parent);
-	}
 	return error;
 }
 
@@ -6317,20 +6316,22 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 	struct mem_cgroup *parent = memcg;
 
 	while ((parent = parent_mem_cgroup(parent)))
-		atomic_inc(&parent->dead_count);
+		mem_cgroup_iter_invalidate(parent);
 
 	/*
	 * if the root memcg is not hierarchical we have to check it
	 * explicitely.
	 */
 	if (!root_mem_cgroup->use_hierarchy)
-		atomic_inc(&root_mem_cgroup->dead_count);
+		mem_cgroup_iter_invalidate(root_mem_cgroup);
 }
 
 static void mem_cgroup_css_offline(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	kmem_cgroup_css_offline(memcg);
+
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
 	mem_cgroup_destroy_all_caches(memcg);
@@ -6340,9 +6341,8 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	kmem_cgroup_destroy(memcg);
-
-	mem_cgroup_put(memcg);
+	memcg_destroy_kmem(memcg);
+	__mem_cgroup_free(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -6651,6 +6651,7 @@ static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
+	int i;
 
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
@@ -6671,7 +6672,9 @@ static void __mem_cgroup_clear_mc(void)
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
					     PAGE_SIZE * mc.moved_swap);
-		__mem_cgroup_put(mc.from, mc.moved_swap);
+
+		for (i = 0; i < mc.moved_swap; i++)
+			css_put(&mc.from->css);
 
		if (!mem_cgroup_is_root(mc.to)) {
			/*
@@ -6681,7 +6684,7 @@ static void __mem_cgroup_clear_mc(void)
			res_counter_uncharge(&mc.to->res,
					     PAGE_SIZE * mc.moved_swap);
		}
-		/* we've already done mem_cgroup_get(mc.to) */
+		/* we've already done css_get(mc.to) */
		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
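The mem_cgroup_iter_load()/mem_cgroup_iter_update() pair added above caches the last visited group and only trusts the cached pointer when the hierarchy's dead_count is unchanged and a css_tryget() still succeeds; the rest of the file then retires the private refcnt in favour of css_get()/css_put(). A stripped-down, user-space model of that validation scheme follows; the names and the use of C11 seq_cst atomics (in place of the kernel's css primitives and smp_rmb()/smp_wmb() pairing) are illustrative assumptions, not the kernel API.

#include <stdatomic.h>
#include <stddef.h>

struct group {
	atomic_int refs;		/* css reference count analogue */
};

struct iter_cache {
	struct group *last_visited;
	int last_dead_count;
};

static atomic_int dead_count;		/* bumped whenever any group is destroyed */

static int tryget(struct group *g)	/* css_tryget() analogue */
{
	int r = atomic_load(&g->refs);

	while (r > 0)
		if (atomic_compare_exchange_weak(&g->refs, &r, r + 1))
			return 1;
	return 0;			/* group already dying: refuse the reference */
}

static struct group *iter_load(struct iter_cache *it, int *seq)
{
	struct group *pos = NULL;

	*seq = atomic_load(&dead_count);
	if (it->last_dead_count == *seq) {	/* nothing destroyed since we cached it */
		pos = it->last_visited;
		if (pos && !tryget(pos))
			pos = NULL;		/* cached group died under us */
	}
	return pos;
}

static void iter_update(struct iter_cache *it, struct group *new_pos, int seq)
{
	it->last_visited = new_pos;
	it->last_dead_count = seq;	/* the value read at load time, deliberately */
}

int main(void)
{
	static struct group g;
	struct iter_cache it = { &g, 0 };
	int seq;

	atomic_store(&g.refs, 1);
	struct group *p = iter_load(&it, &seq);		/* valid: returns &g */
	iter_update(&it, p, seq);

	atomic_fetch_add(&dead_count, 1);		/* a destruction elsewhere ... */
	return iter_load(&it, &seq) == NULL ? 0 : 1;	/* ... invalidates the cache */
}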
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 
 	/*
	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free.
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
	 */
 	set_migratetype_isolate(p, true);
 	/*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
		/* Not a free page */
		ret = 1;
 	}
-	unset_migratetype_isolate(p, MIGRATE_MOVABLE);
 	unlock_memory_hotplug();
 	return ret;
 }
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
			atomic_long_add(1 << compound_trans_order(hpage),
					&num_poisoned_pages);
 	}
-	/* keep elevated page count for bad page */
 	return ret;
 }
 
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
			atomic_long_inc(&num_poisoned_pages);
		}
 	}
-	/* keep elevated page count for bad page */
+	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 	return ret;
 }
 
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
			if (ret > 0)
				ret = -EIO;
		} else {
+			/*
+			 * After page migration succeeds, the source page can
+			 * be trapped in pagevec and actual freeing is delayed.
+			 * Freeing code works differently based on PG_hwpoison,
+			 * so there's a race. We need to make sure that the
+			 * source page should be freed back to buddy before
+			 * setting PG_hwpoison.
+			 */
+			if (!is_free_buddy_page(page))
+				lru_add_drain_all();
+			if (!is_free_buddy_page(page))
+				drain_all_pages();
			SetPageHWPoison(page);
+			if (!is_free_buddy_page(page))
+				pr_info("soft offline: %#lx: page leaked\n",
+					pfn);
			atomic_long_inc(&num_poisoned_pages);
		}
 	} else {
diff --git a/mm/memory.c b/mm/memory.c
index 95d0cce63583..1ce2e2a734fc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
 EXPORT_SYMBOL(mem_map);
 #endif
 
-unsigned long num_physpages;
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
  * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
  */
 void * high_memory;
 
-EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 
 /*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	unsigned long range_start = addr;
 
 again:
 	init_rss_vec(rss);
@@ -1151,7 +1150,7 @@ again:
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
-				    likely(!VM_SequentialReadHint(vma)))
+				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
@@ -1206,12 +1205,14 @@ again:
		force_flush = 0;
 
 #ifdef HAVE_GENERIC_MMU_GATHER
-		tlb->start = addr;
-		tlb->end = end;
+		tlb->start = range_start;
+		tlb->end = addr;
 #endif
		tlb_flush_mmu(tlb);
-		if (addr != end)
+		if (addr != end) {
+			range_start = addr;
			goto again;
+		}
 	}
 
 	return addr;
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
			details->first_index, details->last_index) {
 
		vba = vma->vm_pgoff;
-		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+		vea = vba + vma_pages(vma) - 1;
		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
		zba = details->first_index;
		if (zba < vba)
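The range_start bookkeeping added to zap_pte_range() makes each intermediate TLB flush cover exactly the span unmapped since the previous flush, rather than the current position through the final end. A small self-contained model of that loop structure follows; flush() and the page-at-a-time "unmap" step are stand-ins, not the mmu_gather API.

#include <assert.h>

static unsigned long flushed_from, flushed_to;

static void flush(unsigned long start, unsigned long end)
{
	flushed_from = start;
	flushed_to = end;
}

static void zap_range(unsigned long addr, unsigned long end, unsigned long chunk)
{
	unsigned long range_start = addr;	/* what the patch adds */

again:
	while (addr < end && addr - range_start < chunk)
		addr++;				/* "unmap" one unit */

	flush(range_start, addr);		/* was effectively flush(addr, end) */

	if (addr != end) {
		range_start = addr;		/* restart bookkeeping for next chunk */
		goto again;
	}
}

int main(void)
{
	zap_range(0, 10, 4);
	/* the final flush covers only the last chunk, [8, 10) */
	assert(flushed_from == 8 && flushed_to == 10);
	return 0;
}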
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1ad92b46753e..ca1dd3aa5eee 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
 	res->end = start + size - 1;
 	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (request_resource(&iomem_resource, res) < 0) {
-		printk("System RAM resource %pR cannot be added\n", res);
+		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
 	}
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info,  struct page *page,
 	atomic_inc(&page->_count);
 }
 
-/* reference to __meminit __free_pages_bootmem is valid
- * so use __ref to tell modpost not to generate a warning */
-void __ref put_page_bootmem(struct page *page)
+void put_page_bootmem(struct page *page)
 {
 	unsigned long type;
-	static DEFINE_MUTEX(ppb_lock);
 
 	type = (unsigned long) page->lru.next;
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
-
-		/*
-		 * Please refer to comment for __free_pages_bootmem()
-		 * for why we serialize here.
-		 */
-		mutex_lock(&ppb_lock);
-		__free_pages_bootmem(page, 0);
-		mutex_unlock(&ppb_lock);
-		totalram_pages++;
+		free_reserved_page(page);
 	}
-
 }
 
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -220,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	pfn = pgdat->node_start_pfn;
 	end_pfn = pgdat_end_pfn(pgdat);
 
-	/* register_section info */
+	/* register section info */
 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
-		 * reside in some other node.
+		 * reside in some other nodes.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 	/* can't move pfns which are higher than @z2 */
 	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
-	/* the move out part mast at the left most of @z2 */
+	/* the move out part must be at the left most of @z2 */
 	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
 	/* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
 void __online_page_set_limits(struct page *page)
 {
-	unsigned long pfn = page_to_pfn(page);
-
-	if (pfn >= num_physpages)
-		num_physpages = pfn + 1;
 }
 EXPORT_SYMBOL_GPL(__online_page_set_limits);
 
 void __online_page_increment_counters(struct page *page)
 {
-	totalram_pages++;
-
-#ifdef CONFIG_HIGHMEM
-	if (PageHighMem(page))
-		totalhigh_pages++;
-#endif
+	adjust_managed_page_count(page, 1);
 }
 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
-	ClearPageReserved(page);
-	init_page_count(page);
-	__free_page(page);
+	__free_reserved_page(page);
 }
 EXPORT_SYMBOL_GPL(__online_page_free);
 
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
920{ 897{
898 unsigned long flags;
921 unsigned long onlined_pages = 0; 899 unsigned long onlined_pages = 0;
922 struct zone *zone; 900 struct zone *zone;
923 int need_zonelists_rebuild = 0; 901 int need_zonelists_rebuild = 0;
@@ -936,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
936 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 914 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
937 !can_online_high_movable(zone)) { 915 !can_online_high_movable(zone)) {
938 unlock_memory_hotplug(); 916 unlock_memory_hotplug();
939 return -1; 917 return -EINVAL;
940 } 918 }
941 919
942 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 920 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
943 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 921 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
944 unlock_memory_hotplug(); 922 unlock_memory_hotplug();
945 return -1; 923 return -EINVAL;
946 } 924 }
947 } 925 }
948 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 926 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
949 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 927 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
950 unlock_memory_hotplug(); 928 unlock_memory_hotplug();
951 return -1; 929 return -EINVAL;
952 } 930 }
953 } 931 }
954 932
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
994 return ret; 972 return ret;
995 } 973 }
996 974
997 zone->managed_pages += onlined_pages;
998 zone->present_pages += onlined_pages; 975 zone->present_pages += onlined_pages;
976
977 pgdat_resize_lock(zone->zone_pgdat, &flags);
999 zone->zone_pgdat->node_present_pages += onlined_pages; 978 zone->zone_pgdat->node_present_pages += onlined_pages;
979 pgdat_resize_unlock(zone->zone_pgdat, &flags);
980
1000 if (onlined_pages) { 981 if (onlined_pages) {
1001 node_states_set_node(zone_to_nid(zone), &arg); 982 node_states_set_node(zone_to_nid(zone), &arg);
1002 if (need_zonelists_rebuild) 983 if (need_zonelists_rebuild)
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1487 unsigned long pfn, nr_pages, expire; 1468 unsigned long pfn, nr_pages, expire;
1488 long offlined_pages; 1469 long offlined_pages;
1489 int ret, drain, retry_max, node; 1470 int ret, drain, retry_max, node;
1471 unsigned long flags;
1490 struct zone *zone; 1472 struct zone *zone;
1491 struct memory_notify arg; 1473 struct memory_notify arg;
1492 1474
@@ -1578,10 +1560,12 @@ repeat:
1578 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1560 /* reset pagetype flags and makes migrate type to be MOVABLE */
1579 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1561 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1580 /* removal success */ 1562 /* removal success */
1581 zone->managed_pages -= offlined_pages; 1563 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1582 zone->present_pages -= offlined_pages; 1564 zone->present_pages -= offlined_pages;
1565
1566 pgdat_resize_lock(zone->zone_pgdat, &flags);
1583 zone->zone_pgdat->node_present_pages -= offlined_pages; 1567 zone->zone_pgdat->node_present_pages -= offlined_pages;
1584 totalram_pages -= offlined_pages; 1568 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1585 1569
1586 init_per_zone_wmark_min(); 1570 init_per_zone_wmark_min();
1587 1571
@@ -1621,6 +1605,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1621{ 1605{
1622 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1606 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1623} 1607}
1608#endif /* CONFIG_MEMORY_HOTREMOVE */
1624 1609
1625/** 1610/**
1626 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1611 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
@@ -1634,7 +1619,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1634 * 1619 *
1635 * Returns the return value of func. 1620 * Returns the return value of func.
1636 */ 1621 */
1637static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1622int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1638 void *arg, int (*func)(struct memory_block *, void *)) 1623 void *arg, int (*func)(struct memory_block *, void *))
1639{ 1624{
1640 struct memory_block *mem = NULL; 1625 struct memory_block *mem = NULL;
@@ -1671,24 +1656,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1671 return 0; 1656 return 0;
1672} 1657}
1673 1658
1674/** 1659#ifdef CONFIG_MEMORY_HOTREMOVE
1675 * offline_memory_block_cb - callback function for offlining memory block
1676 * @mem: the memory block to be offlined
1677 * @arg: buffer to hold error msg
1678 *
1679 * Always return 0, and put the error msg in arg if any.
1680 */
1681static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1682{
1683 int *ret = arg;
1684 int error = offline_memory_block(mem);
1685
1686 if (error != 0 && *ret == 0)
1687 *ret = error;
1688
1689 return 0;
1690}
1691
1692static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) 1660static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1693{ 1661{
1694 int ret = !is_memblock_offlined(mem); 1662 int ret = !is_memblock_offlined(mem);
@@ -1814,54 +1782,22 @@ void try_offline_node(int nid)
1814} 1782}
1815EXPORT_SYMBOL(try_offline_node); 1783EXPORT_SYMBOL(try_offline_node);
1816 1784
1817int __ref remove_memory(int nid, u64 start, u64 size) 1785void __ref remove_memory(int nid, u64 start, u64 size)
1818{ 1786{
1819 unsigned long start_pfn, end_pfn; 1787 int ret;
1820 int ret = 0;
1821 int retry = 1;
1822
1823 start_pfn = PFN_DOWN(start);
1824 end_pfn = PFN_UP(start + size - 1);
1825
1826 /*
1827 * When CONFIG_MEMCG is on, one memory block may be used by other
1828 * blocks to store page cgroup when onlining pages. But we don't know
1829 * in what order pages are onlined. So we iterate twice to offline
1830 * memory:
1831 * 1st iterate: offline every non primary memory block.
1832 * 2nd iterate: offline primary (i.e. first added) memory block.
1833 */
1834repeat:
1835 walk_memory_range(start_pfn, end_pfn, &ret,
1836 offline_memory_block_cb);
1837 if (ret) {
1838 if (!retry)
1839 return ret;
1840
1841 retry = 0;
1842 ret = 0;
1843 goto repeat;
1844 }
1845 1788
1846 lock_memory_hotplug(); 1789 lock_memory_hotplug();
1847 1790
1848 /* 1791 /*
1849 * we have offlined all memory blocks like this: 1792 * All memory blocks must be offlined before removing memory. Check
1850 * 1. lock memory hotplug 1793 * whether all memory blocks in question are offline and trigger a BUG()
1851 * 2. offline a memory block 1794 * if this is not the case.
1852 * 3. unlock memory hotplug
1853 *
1854 * repeat step1-3 to offline the memory block. All memory blocks
1855 * must be offlined before removing memory. But we don't hold the
1856 * lock in the whole operation. So we should check whether all
1857 * memory blocks are offlined.
1858 */ 1795 */
1859 1796 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1860 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1861 is_memblock_offlined_cb); 1797 is_memblock_offlined_cb);
1862 if (ret) { 1798 if (ret) {
1863 unlock_memory_hotplug(); 1799 unlock_memory_hotplug();
1864 return ret; 1800 BUG();
1865 } 1801 }
1866 1802
1867 /* remove memmap entry */ 1803 /* remove memmap entry */
@@ -1872,17 +1808,6 @@ repeat:
1872 try_offline_node(nid); 1808 try_offline_node(nid);
1873 1809
1874 unlock_memory_hotplug(); 1810 unlock_memory_hotplug();
1875
1876 return 0;
1877} 1811}
1878#else
1879int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1880{
1881 return -EINVAL;
1882}
1883int remove_memory(int nid, u64 start, u64 size)
1884{
1885 return -EINVAL;
1886}
1887#endif /* CONFIG_MEMORY_HOTREMOVE */
1888EXPORT_SYMBOL_GPL(remove_memory); 1812EXPORT_SYMBOL_GPL(remove_memory);
1813#endif /* CONFIG_MEMORY_HOTREMOVE */
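The memory_hotplug.c changes above replace the raw -1 error returns with -EINVAL, move the totalram/highmem bookkeeping into adjust_managed_page_count(), and wrap the node_present_pages updates in pgdat_resize_lock()/pgdat_resize_unlock(). A minimal userspace sketch of that last pattern follows; the pthread mutex stands in for the pgdat resize lock (which in the kernel also saves and restores IRQ flags), and all names are illustrative.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t resize_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long node_present_pages;

static void online_pages_demo(unsigned long onlined)
{
        pthread_mutex_lock(&resize_lock);       /* pgdat_resize_lock() */
        node_present_pages += onlined;
        pthread_mutex_unlock(&resize_lock);     /* pgdat_resize_unlock() */
}

static void offline_pages_demo(unsigned long offlined)
{
        pthread_mutex_lock(&resize_lock);
        node_present_pages -= offlined;
        pthread_mutex_unlock(&resize_lock);
}

int main(void)
{
        online_pages_demo(512);
        offline_pages_demo(128);
        printf("node_present_pages = %lu\n", node_present_pages);
        return 0;
}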
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/memory.h>
13#include <linux/notifier.h>
12#include "internal.h" 14#include "internal.h"
13 15
14#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
147struct kobject *mm_kobj; 149struct kobject *mm_kobj;
148EXPORT_SYMBOL_GPL(mm_kobj); 150EXPORT_SYMBOL_GPL(mm_kobj);
149 151
152#ifdef CONFIG_SMP
153s32 vm_committed_as_batch = 32;
154
155static void __meminit mm_compute_batch(void)
156{
157 u64 memsized_batch;
158 s32 nr = num_present_cpus();
159 s32 batch = max_t(s32, nr*2, 32);
160
161 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
162 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
163
164 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
165}
166
167static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
168 unsigned long action, void *arg)
169{
170 switch (action) {
171 case MEM_ONLINE:
172 case MEM_OFFLINE:
173 mm_compute_batch();
174 default:
175 break;
176 }
177 return NOTIFY_OK;
178}
179
180static struct notifier_block compute_batch_nb __meminitdata = {
181 .notifier_call = mm_compute_batch_notifier,
182 .priority = IPC_CALLBACK_PRI, /* use lowest priority */
183};
184
185static int __init mm_compute_batch_init(void)
186{
187 mm_compute_batch();
188 register_hotmemory_notifier(&compute_batch_nb);
189
190 return 0;
191}
192
193__initcall(mm_compute_batch_init);
194
195#endif
196
150static int __init mm_sysfs_init(void) 197static int __init mm_sysfs_init(void)
151{ 198{
152 mm_kobj = kobject_create_and_add("mm", kernel_kobj); 199 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
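The mm_compute_batch() code added above sizes the per-CPU batch of the vm_committed_as counter from the machine: at least 32, at least twice the number of present CPUs, and roughly 0.4% of the pages per CPU, capped at INT32_MAX, recomputed on every memory online/offline event via the hotmemory notifier. A self-contained sketch of the same arithmetic, with made-up example inputs:

#include <stdint.h>
#include <stdio.h>

static int32_t compute_batch_demo(uint64_t totalram_pages, int32_t ncpus)
{
        int32_t batch = ncpus * 2 > 32 ? ncpus * 2 : 32;
        uint64_t memsized = (totalram_pages / ncpus) / 256;  /* ~0.4% */

        if (memsized > INT32_MAX)
                memsized = INT32_MAX;

        return (int32_t)memsized > batch ? (int32_t)memsized : batch;
}

int main(void)
{
        /* 4 GiB of 4 KiB pages on an 8-CPU machine -> batch of 512 */
        printf("vm_committed_as_batch = %d\n", compute_batch_demo(1048576, 8));
        return 0;
}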
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fad..fbad7b091090 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = vma_pages(vma);
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
@@ -1358,18 +1358,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1358 1358
1359 if (!(flags & MAP_ANONYMOUS)) { 1359 if (!(flags & MAP_ANONYMOUS)) {
1360 audit_mmap_fd(fd, flags); 1360 audit_mmap_fd(fd, flags);
1361 if (unlikely(flags & MAP_HUGETLB))
1362 return -EINVAL;
1363 file = fget(fd); 1361 file = fget(fd);
1364 if (!file) 1362 if (!file)
1365 goto out; 1363 goto out;
1366 if (is_file_hugepages(file)) 1364 if (is_file_hugepages(file))
1367 len = ALIGN(len, huge_page_size(hstate_file(file))); 1365 len = ALIGN(len, huge_page_size(hstate_file(file)));
1366 retval = -EINVAL;
1367 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1368 goto out_fput;
1368 } else if (flags & MAP_HUGETLB) { 1369 } else if (flags & MAP_HUGETLB) {
1369 struct user_struct *user = NULL; 1370 struct user_struct *user = NULL;
1370 struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & 1371 struct hstate *hs;
1371 SHM_HUGE_MASK);
1372 1372
1373 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
1373 if (!hs) 1374 if (!hs)
1374 return -EINVAL; 1375 return -EINVAL;
1375 1376
@@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1391 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1392 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1392 1393
1393 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1394 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1395out_fput:
1394 if (file) 1396 if (file)
1395 fput(file); 1397 fput(file);
1396out: 1398out:
@@ -1876,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1876} 1878}
1877#endif 1879#endif
1878 1880
1879void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1880{
1881 /*
1882 * Is this a new hole at the lowest possible address?
1883 */
1884 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1885 mm->free_area_cache = addr;
1886}
1887
1888/* 1881/*
1889 * This mmap-allocator allocates new areas top-down from below the 1882 * This mmap-allocator allocates new areas top-down from below the
1890 * stack's low limit (the base): 1883 * stack's low limit (the base):
@@ -1941,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1941} 1934}
1942#endif 1935#endif
1943 1936
1944void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1945{
1946 /*
1947 * Is this a new hole at the highest possible address?
1948 */
1949 if (addr > mm->free_area_cache)
1950 mm->free_area_cache = addr;
1951
1952 /* dont allow allocations above current base */
1953 if (mm->free_area_cache > mm->mmap_base)
1954 mm->free_area_cache = mm->mmap_base;
1955}
1956
1957unsigned long 1937unsigned long
1958get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1938get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1959 unsigned long pgoff, unsigned long flags) 1939 unsigned long pgoff, unsigned long flags)
@@ -2374,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2374{ 2354{
2375 struct vm_area_struct **insertion_point; 2355 struct vm_area_struct **insertion_point;
2376 struct vm_area_struct *tail_vma = NULL; 2356 struct vm_area_struct *tail_vma = NULL;
2377 unsigned long addr;
2378 2357
2379 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2358 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2380 vma->vm_prev = NULL; 2359 vma->vm_prev = NULL;
@@ -2391,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2391 } else 2370 } else
2392 mm->highest_vm_end = prev ? prev->vm_end : 0; 2371 mm->highest_vm_end = prev ? prev->vm_end : 0;
2393 tail_vma->vm_next = NULL; 2372 tail_vma->vm_next = NULL;
2394 if (mm->unmap_area == arch_unmap_area)
2395 addr = prev ? prev->vm_end : mm->mmap_base;
2396 else
2397 addr = vma ? vma->vm_start : mm->mmap_base;
2398 mm->unmap_area(mm, addr);
2399 mm->mmap_cache = NULL; /* Kill the cache. */ 2373 mm->mmap_cache = NULL; /* Kill the cache. */
2400} 2374}
2401 2375
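The mmap.c hunks above drop the old free_area_cache hooks and rework the MAP_HUGETLB handling in mmap_pgoff(): a file-backed mapping with MAP_HUGETLB is no longer rejected outright, only when the descriptor does not refer to a hugetlbfs file, and the fd is released through the new out_fput label. A userspace sketch of just that decision; file_is_hugetlb() and the flag values are illustrative stand-ins, not is_file_hugepages() or the real MAP_* bits.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAP_ANONYMOUS 0x1
#define DEMO_MAP_HUGETLB   0x2

/* pretend fd 42 happens to be a hugetlbfs file */
static bool file_is_hugetlb(int fd) { return fd == 42; }

static long check_hugetlb_flags(unsigned long flags, int fd)
{
        if (!(flags & DEMO_MAP_ANONYMOUS) &&
            (flags & DEMO_MAP_HUGETLB) && !file_is_hugetlb(fd))
                return -EINVAL;  /* MAP_HUGETLB needs a hugetlbfs file */
        return 0;
}

int main(void)
{
        printf("%ld\n", check_hugetlb_flags(DEMO_MAP_HUGETLB, 7));   /* -22 */
        printf("%ld\n", check_hugetlb_flags(DEMO_MAP_HUGETLB, 42));  /* 0   */
        return 0;
}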
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 6725ff183374..93e6089cb456 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -315,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
315 315
316 /* 316 /*
317 * Wait for any running method to finish, of course including 317 * Wait for any running method to finish, of course including
318 * ->release if it was run by mmu_notifier_relase instead of us. 318 * ->release if it was run by mmu_notifier_release instead of us.
319 */ 319 */
320 synchronize_srcu(&srcu); 320 synchronize_srcu(&srcu);
321 321
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..457d34ef3bf2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
126 continue; 126 continue;
127 pte = ptep_get_and_clear(mm, old_addr, old_pte); 127 pte = ptep_get_and_clear(mm, old_addr, old_pte);
128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
129 set_pte_at(mm, new_addr, new_pte, pte); 129 set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
130 } 130 }
131 131
132 arch_leave_lazy_mmu_mode(); 132 arch_leave_lazy_mmu_mode();
@@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
456 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false; 457 bool locked = false;
458 458
459 down_write(&current->mm->mmap_sem);
460
461 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 459 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
462 goto out; 460 return ret;
461
462 if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
463 return ret;
463 464
464 if (addr & ~PAGE_MASK) 465 if (addr & ~PAGE_MASK)
465 goto out; 466 return ret;
466 467
467 old_len = PAGE_ALIGN(old_len); 468 old_len = PAGE_ALIGN(old_len);
468 new_len = PAGE_ALIGN(new_len); 469 new_len = PAGE_ALIGN(new_len);
@@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
473 * a zero new-len is nonsensical. 474 * a zero new-len is nonsensical.
474 */ 475 */
475 if (!new_len) 476 if (!new_len)
476 goto out; 477 return ret;
478
479 down_write(&current->mm->mmap_sem);
477 480
478 if (flags & MREMAP_FIXED) { 481 if (flags & MREMAP_FIXED) {
479 if (flags & MREMAP_MAYMOVE) 482 ret = mremap_to(addr, old_len, new_addr, new_len,
480 ret = mremap_to(addr, old_len, new_addr, new_len, 483 &locked);
481 &locked);
482 goto out; 484 goto out;
483 } 485 }
484 486
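The mremap.c hunk above hoists the cheap argument checks in sys_mremap() ahead of down_write(&current->mm->mmap_sem), so bad flags, an unaligned address or a zero length return before the lock is ever taken, and MREMAP_FIXED without MREMAP_MAYMOVE is now rejected explicitly. A small userspace model of that ordering; the mutex stands in for mmap_sem, and the page mask and return values are illustrative.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define DEMO_PAGE_MASK (~0xFFFUL)

static pthread_mutex_t mmap_sem = PTHREAD_MUTEX_INITIALIZER;

static long do_mremap_demo(unsigned long addr, unsigned long new_len)
{
        /* validate first: no lock held on the early-error paths */
        if (addr & ~DEMO_PAGE_MASK)
                return -EINVAL;
        if (!new_len)
                return -EINVAL;

        pthread_mutex_lock(&mmap_sem);
        /* ... the actual remap work happens here in the kernel ... */
        pthread_mutex_unlock(&mmap_sem);
        return 0;
}

int main(void)
{
        printf("%ld\n", do_mremap_demo(0x1001, 8192)); /* -22, lock untouched */
        printf("%ld\n", do_mremap_demo(0x1000, 8192)); /* 0 */
        return 0;
}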
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..61107cf55bb3 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 140static int reset_managed_pages_done __initdata;
141
142static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
141{ 143{
142 struct zone *z; 144 struct zone *z;
143 145
144 /* 146 if (reset_managed_pages_done)
145 * In free_area_init_core(), highmem zone's managed_pages is set to 147 return;
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 148 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z)) 149 z->managed_pages = 0;
153 z->managed_pages = 0; 150}
151
152void __init reset_all_zones_managed_pages(void)
153{
154 struct pglist_data *pgdat;
155
156 for_each_online_pgdat(pgdat)
157 reset_node_managed_pages(pgdat);
158 reset_managed_pages_done = 1;
154} 159}
155 160
156/** 161/**
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
160 */ 165 */
161unsigned long __init free_all_bootmem(void) 166unsigned long __init free_all_bootmem(void)
162{ 167{
163 struct pglist_data *pgdat; 168 unsigned long pages;
164 169
165 for_each_online_pgdat(pgdat) 170 reset_all_zones_managed_pages();
166 reset_node_lowmem_managed_pages(pgdat);
167 171
168 /* 172 /*
169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 173 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
170 * because in some case like Node0 doesn't have RAM installed 174 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 175 * low ram will be on Node1
172 */ 176 */
173 return free_low_memory_core_early(); 177 pages = free_low_memory_core_early();
178 totalram_pages += pages;
179
180 return pages;
174} 181}
175 182
176/** 183/**
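The nobootmem.c hunk above splits the managed-page reset into reset_all_zones_managed_pages(), guards it so the zeroing happens only once, and makes free_all_bootmem() add the pages it frees to totalram_pages. A toy model of the run-once guard (simplified: in the kernel the flag is checked per node and set after the whole pgdat walk):

#include <stdio.h>

static int reset_done;   /* models reset_managed_pages_done */

static void reset_all_zones_managed_pages_demo(void)
{
        if (reset_done)
                return;                 /* later calls are no-ops */
        puts("zeroing managed_pages in every zone");
        reset_done = 1;
}

int main(void)
{
        reset_all_zones_managed_pages_demo();  /* does the work */
        reset_all_zones_managed_pages_demo();  /* skipped */
        return 0;
}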
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..ecd1f158548e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
56void *high_memory; 56void *high_memory;
57struct page *mem_map; 57struct page *mem_map;
58unsigned long max_mapnr; 58unsigned long max_mapnr;
59unsigned long num_physpages;
60unsigned long highest_memmap_pfn; 59unsigned long highest_memmap_pfn;
61struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
62int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
85EXPORT_SYMBOL_GPL(vm_memory_committed); 84EXPORT_SYMBOL_GPL(vm_memory_committed);
86 85
87EXPORT_SYMBOL(mem_map); 86EXPORT_SYMBOL(mem_map);
88EXPORT_SYMBOL(num_physpages);
89 87
90/* list of mapped, potentially shareable regions */ 88/* list of mapped, potentially shareable regions */
91static struct kmem_cache *vm_region_jar; 89static struct kmem_cache *vm_region_jar;
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
282 280
283long vread(char *buf, char *addr, unsigned long count) 281long vread(char *buf, char *addr, unsigned long count)
284{ 282{
283 /* Don't allow overflow */
284 if ((unsigned long) buf + count < count)
285 count = -(unsigned long) buf;
286
285 memcpy(buf, addr, count); 287 memcpy(buf, addr, count);
286 return count; 288 return count;
287} 289}
@@ -1869,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1869 return -ENOMEM; 1871 return -ENOMEM;
1870} 1872}
1871 1873
1872void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1873{
1874}
1875
1876void unmap_mapping_range(struct address_space *mapping, 1874void unmap_mapping_range(struct address_space *mapping,
1877 loff_t const holebegin, loff_t const holelen, 1875 loff_t const holebegin, loff_t const holelen,
1878 int even_cows) 1876 int even_cows)
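The nommu vread() hunk above clamps the copy length against address-space wraparound: if buf + count overflows, count is reduced to -(unsigned long)buf so the copy ends exactly at the top of the address space. The arithmetic in isolation, with example inputs:

#include <stdio.h>

static unsigned long clamp_count(unsigned long buf, unsigned long count)
{
        if (buf + count < count)   /* unsigned addition wrapped around */
                count = -buf;      /* bytes left before the wrap point */
        return count;
}

int main(void)
{
        /* only 101 bytes remain between buf and the top of the address space */
        unsigned long buf = ~0UL - 100;

        printf("clamped count = %lu\n", clamp_count(buf, 4096)); /* 101 */
        return 0;
}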
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..b100255dedda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
61#include <linux/hugetlb.h> 61#include <linux/hugetlb.h>
62#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
63 63
64#include <asm/sections.h>
64#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
65#include <asm/div64.h> 66#include <asm/div64.h>
66#include "internal.h" 67#include "internal.h"
67 68
69/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
70static DEFINE_MUTEX(pcp_batch_high_lock);
71
68#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 72#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
69DEFINE_PER_CPU(int, numa_node); 73DEFINE_PER_CPU(int, numa_node);
70EXPORT_PER_CPU_SYMBOL(numa_node); 74EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
100}; 104};
101EXPORT_SYMBOL(node_states); 105EXPORT_SYMBOL(node_states);
102 106
107/* Protect totalram_pages and zone->managed_pages */
108static DEFINE_SPINLOCK(managed_page_count_lock);
109
103unsigned long totalram_pages __read_mostly; 110unsigned long totalram_pages __read_mostly;
104unsigned long totalreserve_pages __read_mostly; 111unsigned long totalreserve_pages __read_mostly;
105/* 112/*
@@ -197,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
197}; 204};
198 205
199int min_free_kbytes = 1024; 206int min_free_kbytes = 1024;
207int user_min_free_kbytes;
200 208
201static unsigned long __meminitdata nr_kernel_pages; 209static unsigned long __meminitdata nr_kernel_pages;
202static unsigned long __meminitdata nr_all_pages; 210static unsigned long __meminitdata nr_all_pages;
@@ -739,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
739 local_irq_restore(flags); 747 local_irq_restore(flags);
740} 748}
741 749
742/* 750void __init __free_pages_bootmem(struct page *page, unsigned int order)
743 * Read access to zone->managed_pages is safe because it's unsigned long,
744 * but we still need to serialize writers. Currently all callers of
745 * __free_pages_bootmem() except put_page_bootmem() should only be used
746 * at boot time. So for shorter boot time, we shift the burden to
747 * put_page_bootmem() to serialize writers.
748 */
749void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
750{ 751{
751 unsigned int nr_pages = 1 << order; 752 unsigned int nr_pages = 1 << order;
752 unsigned int loop; 753 unsigned int loop;
@@ -781,11 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
781 set_page_refcounted(page); 782 set_page_refcounted(page);
782 set_pageblock_migratetype(page, MIGRATE_CMA); 783 set_pageblock_migratetype(page, MIGRATE_CMA);
783 __free_pages(page, pageblock_order); 784 __free_pages(page, pageblock_order);
784 totalram_pages += pageblock_nr_pages; 785 adjust_managed_page_count(page, pageblock_nr_pages);
785#ifdef CONFIG_HIGHMEM
786 if (PageHighMem(page))
787 totalhigh_pages += pageblock_nr_pages;
788#endif
789} 786}
790#endif 787#endif
791 788
@@ -1050,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1050 * MIGRATE_CMA areas. 1047 * MIGRATE_CMA areas.
1051 */ 1048 */
1052 if (!is_migrate_cma(migratetype) && 1049 if (!is_migrate_cma(migratetype) &&
1053 (unlikely(current_order >= pageblock_order / 2) || 1050 (current_order >= pageblock_order / 2 ||
1054 start_migratetype == MIGRATE_RECLAIMABLE || 1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1055 page_group_by_mobility_disabled)) { 1052 page_group_by_mobility_disabled)) {
1056 int pages; 1053 int pages;
@@ -1179,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1179{ 1176{
1180 unsigned long flags; 1177 unsigned long flags;
1181 int to_drain; 1178 int to_drain;
1179 unsigned long batch;
1182 1180
1183 local_irq_save(flags); 1181 local_irq_save(flags);
1184 if (pcp->count >= pcp->batch) 1182 batch = ACCESS_ONCE(pcp->batch);
1185 to_drain = pcp->batch; 1183 if (pcp->count >= batch)
1184 to_drain = batch;
1186 else 1185 else
1187 to_drain = pcp->count; 1186 to_drain = pcp->count;
1188 if (to_drain > 0) { 1187 if (to_drain > 0) {
@@ -1350,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold)
1350 list_add(&page->lru, &pcp->lists[migratetype]); 1349 list_add(&page->lru, &pcp->lists[migratetype]);
1351 pcp->count++; 1350 pcp->count++;
1352 if (pcp->count >= pcp->high) { 1351 if (pcp->count >= pcp->high) {
1353 free_pcppages_bulk(zone, pcp->batch, pcp); 1352 unsigned long batch = ACCESS_ONCE(pcp->batch);
1354 pcp->count -= pcp->batch; 1353 free_pcppages_bulk(zone, batch, pcp);
1354 pcp->count -= batch;
1355 } 1355 }
1356 1356
1357out: 1357out:
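The two hunks above stop re-reading pcp->batch inside the free paths: because pageset_update() may change it concurrently, the value is loaded once through ACCESS_ONCE() into a local and that snapshot is used for both the comparison and the bulk free. A userspace sketch of the idiom, with a volatile cast standing in for ACCESS_ONCE() and illustrative names:

#include <stdio.h>

#define READ_ONCE_UL(x) (*(volatile unsigned long *)&(x))

static unsigned long pcp_batch = 31;   /* may be rewritten by another thread */

static unsigned long pages_to_drain(unsigned long count)
{
        unsigned long batch = READ_ONCE_UL(pcp_batch);  /* read exactly once */

        return count >= batch ? batch : count;          /* reuse the snapshot */
}

int main(void)
{
        printf("draining %lu pages\n", pages_to_drain(100));
        return 0;
}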
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL(free_pages_exact);
2839 * nr_free_zone_pages() counts the number of counts pages which are beyond the 2839 * nr_free_zone_pages() counts the number of counts pages which are beyond the
2840 * high watermark within all zones at or below a given zone index. For each 2840 * high watermark within all zones at or below a given zone index. For each
2841 * zone, the number of pages is calculated as: 2841 * zone, the number of pages is calculated as:
2842 * present_pages - high_pages 2842 * managed_pages - high_pages
2843 */ 2843 */
2844static unsigned long nr_free_zone_pages(int offset) 2844static unsigned long nr_free_zone_pages(int offset)
2845{ 2845{
@@ -2906,9 +2906,13 @@ EXPORT_SYMBOL(si_meminfo);
2906#ifdef CONFIG_NUMA 2906#ifdef CONFIG_NUMA
2907void si_meminfo_node(struct sysinfo *val, int nid) 2907void si_meminfo_node(struct sysinfo *val, int nid)
2908{ 2908{
2909 int zone_type; /* needs to be signed */
2910 unsigned long managed_pages = 0;
2909 pg_data_t *pgdat = NODE_DATA(nid); 2911 pg_data_t *pgdat = NODE_DATA(nid);
2910 2912
2911 val->totalram = pgdat->node_present_pages; 2913 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
2914 managed_pages += pgdat->node_zones[zone_type].managed_pages;
2915 val->totalram = managed_pages;
2912 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2916 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2913#ifdef CONFIG_HIGHMEM 2917#ifdef CONFIG_HIGHMEM
2914 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 2918 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3150,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3150 * Add all populated zones of a node to the zonelist. 3154 * Add all populated zones of a node to the zonelist.
3151 */ 3155 */
3152static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3156static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3153 int nr_zones, enum zone_type zone_type) 3157 int nr_zones)
3154{ 3158{
3155 struct zone *zone; 3159 struct zone *zone;
3156 3160 enum zone_type zone_type = MAX_NR_ZONES;
3157 BUG_ON(zone_type >= MAX_NR_ZONES);
3158 zone_type++;
3159 3161
3160 do { 3162 do {
3161 zone_type--; 3163 zone_type--;
@@ -3165,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3165 &zonelist->_zonerefs[nr_zones++]); 3167 &zonelist->_zonerefs[nr_zones++]);
3166 check_highest_zone(zone_type); 3168 check_highest_zone(zone_type);
3167 } 3169 }
3168
3169 } while (zone_type); 3170 } while (zone_type);
3171
3170 return nr_zones; 3172 return nr_zones;
3171} 3173}
3172 3174
@@ -3250,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3250 static DEFINE_MUTEX(zl_order_mutex); 3252 static DEFINE_MUTEX(zl_order_mutex);
3251 3253
3252 mutex_lock(&zl_order_mutex); 3254 mutex_lock(&zl_order_mutex);
3253 if (write) 3255 if (write) {
3254 strcpy(saved_string, (char*)table->data); 3256 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3257 ret = -EINVAL;
3258 goto out;
3259 }
3260 strcpy(saved_string, (char *)table->data);
3261 }
3255 ret = proc_dostring(table, write, buffer, length, ppos); 3262 ret = proc_dostring(table, write, buffer, length, ppos);
3256 if (ret) 3263 if (ret)
3257 goto out; 3264 goto out;
3258 if (write) { 3265 if (write) {
3259 int oldval = user_zonelist_order; 3266 int oldval = user_zonelist_order;
3260 if (__parse_numa_zonelist_order((char*)table->data)) { 3267
3268 ret = __parse_numa_zonelist_order((char *)table->data);
3269 if (ret) {
3261 /* 3270 /*
3262 * bogus value. restore saved string 3271 * bogus value. restore saved string
3263 */ 3272 */
3264 strncpy((char*)table->data, saved_string, 3273 strncpy((char *)table->data, saved_string,
3265 NUMA_ZONELIST_ORDER_LEN); 3274 NUMA_ZONELIST_ORDER_LEN);
3266 user_zonelist_order = oldval; 3275 user_zonelist_order = oldval;
3267 } else if (oldval != user_zonelist_order) { 3276 } else if (oldval != user_zonelist_order) {
@@ -3353,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3353 zonelist = &pgdat->node_zonelists[0]; 3362 zonelist = &pgdat->node_zonelists[0];
3354 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3363 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3355 ; 3364 ;
3356 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3365 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3357 MAX_NR_ZONES - 1);
3358 zonelist->_zonerefs[j].zone = NULL; 3366 zonelist->_zonerefs[j].zone = NULL;
3359 zonelist->_zonerefs[j].zone_idx = 0; 3367 zonelist->_zonerefs[j].zone_idx = 0;
3360} 3368}
@@ -3368,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
3368 struct zonelist *zonelist; 3376 struct zonelist *zonelist;
3369 3377
3370 zonelist = &pgdat->node_zonelists[1]; 3378 zonelist = &pgdat->node_zonelists[1];
3371 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3379 j = build_zonelists_node(pgdat, zonelist, 0);
3372 zonelist->_zonerefs[j].zone = NULL; 3380 zonelist->_zonerefs[j].zone = NULL;
3373 zonelist->_zonerefs[j].zone_idx = 0; 3381 zonelist->_zonerefs[j].zone_idx = 0;
3374} 3382}
@@ -3425,8 +3433,8 @@ static int default_zonelist_order(void)
3425 z = &NODE_DATA(nid)->node_zones[zone_type]; 3433 z = &NODE_DATA(nid)->node_zones[zone_type];
3426 if (populated_zone(z)) { 3434 if (populated_zone(z)) {
3427 if (zone_type < ZONE_NORMAL) 3435 if (zone_type < ZONE_NORMAL)
3428 low_kmem_size += z->present_pages; 3436 low_kmem_size += z->managed_pages;
3429 total_size += z->present_pages; 3437 total_size += z->managed_pages;
3430 } else if (zone_type == ZONE_NORMAL) { 3438 } else if (zone_type == ZONE_NORMAL) {
3431 /* 3439 /*
3432 * If any node has only lowmem, then node order 3440 * If any node has only lowmem, then node order
@@ -3576,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat)
3576 local_node = pgdat->node_id; 3584 local_node = pgdat->node_id;
3577 3585
3578 zonelist = &pgdat->node_zonelists[0]; 3586 zonelist = &pgdat->node_zonelists[0];
3579 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3587 j = build_zonelists_node(pgdat, zonelist, 0);
3580 3588
3581 /* 3589 /*
3582 * Now we build the zonelist so that it contains the zones 3590 * Now we build the zonelist so that it contains the zones
@@ -3589,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat)
3589 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3597 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3590 if (!node_online(node)) 3598 if (!node_online(node))
3591 continue; 3599 continue;
3592 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3600 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3593 MAX_NR_ZONES - 1);
3594 } 3601 }
3595 for (node = 0; node < local_node; node++) { 3602 for (node = 0; node < local_node; node++) {
3596 if (!node_online(node)) 3603 if (!node_online(node))
3597 continue; 3604 continue;
3598 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3605 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3599 MAX_NR_ZONES - 1);
3600 } 3606 }
3601 3607
3602 zonelist->_zonerefs[j].zone = NULL; 3608 zonelist->_zonerefs[j].zone = NULL;
@@ -3705,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3705 mminit_verify_zonelist(); 3711 mminit_verify_zonelist();
3706 cpuset_init_current_mems_allowed(); 3712 cpuset_init_current_mems_allowed();
3707 } else { 3713 } else {
3708 /* we have to stop all cpus to guarantee there is no user
3709 of zonelist */
3710#ifdef CONFIG_MEMORY_HOTPLUG 3714#ifdef CONFIG_MEMORY_HOTPLUG
3711 if (zone) 3715 if (zone)
3712 setup_zone_pageset(zone); 3716 setup_zone_pageset(zone);
3713#endif 3717#endif
3718 /* we have to stop all cpus to guarantee there is no user
3719 of zonelist */
3714 stop_machine(__build_all_zonelists, pgdat, NULL); 3720 stop_machine(__build_all_zonelists, pgdat, NULL);
3715 /* cpuset refresh routine should be here */ 3721 /* cpuset refresh routine should be here */
3716 } 3722 }
@@ -4032,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone)
4032#endif 4038#endif
4033} 4039}
4034 4040
4035static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4041/*
4042 * pcp->high and pcp->batch values are related and dependent on one another:
4043 * ->batch must never be higher then ->high.
4044 * The following function updates them in a safe manner without read side
4045 * locking.
4046 *
4047 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4048 * those fields changing asynchronously (acording the the above rule).
4049 *
4050 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4051 * outside of boot time (or some other assurance that no concurrent updaters
4052 * exist).
4053 */
4054static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4055 unsigned long batch)
4056{
4057 /* start with a fail safe value for batch */
4058 pcp->batch = 1;
4059 smp_wmb();
4060
4061 /* Update high, then batch, in order */
4062 pcp->high = high;
4063 smp_wmb();
4064
4065 pcp->batch = batch;
4066}
4067
4068/* a companion to pageset_set_high() */
4069static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4070{
4071 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4072}
4073
4074static void pageset_init(struct per_cpu_pageset *p)
4036{ 4075{
4037 struct per_cpu_pages *pcp; 4076 struct per_cpu_pages *pcp;
4038 int migratetype; 4077 int migratetype;
@@ -4041,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4041 4080
4042 pcp = &p->pcp; 4081 pcp = &p->pcp;
4043 pcp->count = 0; 4082 pcp->count = 0;
4044 pcp->high = 6 * batch;
4045 pcp->batch = max(1UL, 1 * batch);
4046 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4083 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4047 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4084 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4048} 4085}
4049 4086
4087static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4088{
4089 pageset_init(p);
4090 pageset_set_batch(p, batch);
4091}
4092
4050/* 4093/*
4051 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 4094 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4052 * to the value high for the pageset p. 4095 * to the value high for the pageset p.
4053 */ 4096 */
4054 4097static void pageset_set_high(struct per_cpu_pageset *p,
4055static void setup_pagelist_highmark(struct per_cpu_pageset *p,
4056 unsigned long high) 4098 unsigned long high)
4057{ 4099{
4058 struct per_cpu_pages *pcp; 4100 unsigned long batch = max(1UL, high / 4);
4101 if ((high / 4) > (PAGE_SHIFT * 8))
4102 batch = PAGE_SHIFT * 8;
4059 4103
4060 pcp = &p->pcp; 4104 pageset_update(&p->pcp, high, batch);
4061 pcp->high = high;
4062 pcp->batch = max(1UL, high/4);
4063 if ((high/4) > (PAGE_SHIFT * 8))
4064 pcp->batch = PAGE_SHIFT * 8;
4065} 4105}
4066 4106
4067static void __meminit setup_zone_pageset(struct zone *zone) 4107static void __meminit pageset_set_high_and_batch(struct zone *zone,
4108 struct per_cpu_pageset *pcp)
4068{ 4109{
4069 int cpu; 4110 if (percpu_pagelist_fraction)
4070 4111 pageset_set_high(pcp,
4071 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4112 (zone->managed_pages /
4113 percpu_pagelist_fraction));
4114 else
4115 pageset_set_batch(pcp, zone_batchsize(zone));
4116}
4072 4117
4073 for_each_possible_cpu(cpu) { 4118static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4074 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4119{
4120 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4075 4121
4076 setup_pageset(pcp, zone_batchsize(zone)); 4122 pageset_init(pcp);
4123 pageset_set_high_and_batch(zone, pcp);
4124}
4077 4125
4078 if (percpu_pagelist_fraction) 4126static void __meminit setup_zone_pageset(struct zone *zone)
4079 setup_pagelist_highmark(pcp, 4127{
4080 (zone->managed_pages / 4128 int cpu;
4081 percpu_pagelist_fraction)); 4129 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4082 } 4130 for_each_possible_cpu(cpu)
4131 zone_pageset_init(zone, cpu);
4083} 4132}
4084 4133
4085/* 4134/*
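The pageset_update() helper introduced above changes ->high and ->batch while other CPUs keep using them: ->batch is first dropped to the fail-safe value 1, then the new high is published, then the new batch, with smp_wmb() between the stores so a reader can never observe a batch larger than the current high. A rough userspace analogue using C11 fences; struct pcp and the atomics here are illustrative, not the kernel's types or barriers.

#include <stdatomic.h>
#include <stdio.h>

struct pcp {
        _Atomic unsigned long high;
        _Atomic unsigned long batch;
};

static void pageset_update_demo(struct pcp *p, unsigned long high,
                                unsigned long batch)
{
        /* fail-safe batch so batch <= high holds throughout the update */
        atomic_store_explicit(&p->batch, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);          /* smp_wmb() */

        atomic_store_explicit(&p->high, high, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);          /* smp_wmb() */

        atomic_store_explicit(&p->batch, batch, memory_order_relaxed);
}

int main(void)
{
        struct pcp p = { 6, 1 };

        pageset_update_demo(&p, 6 * 31, 31);
        printf("high=%lu batch=%lu\n",
               (unsigned long)atomic_load(&p.high),
               (unsigned long)atomic_load(&p.batch));
        return 0;
}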
@@ -4368,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
4368 */ 4417 */
4369static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4418static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4370 unsigned long zone_type, 4419 unsigned long zone_type,
4420 unsigned long node_start_pfn,
4421 unsigned long node_end_pfn,
4371 unsigned long *ignored) 4422 unsigned long *ignored)
4372{ 4423{
4373 unsigned long node_start_pfn, node_end_pfn;
4374 unsigned long zone_start_pfn, zone_end_pfn; 4424 unsigned long zone_start_pfn, zone_end_pfn;
4375 4425
4376 /* Get the start and end of the node and zone */ 4426 /* Get the start and end of the zone */
4377 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4378 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4427 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4379 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4428 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4380 adjust_zone_range_for_zone_movable(nid, zone_type, 4429 adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4429,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4429/* Return the number of page frames in holes in a zone on a node */ 4478/* Return the number of page frames in holes in a zone on a node */
4430static unsigned long __meminit zone_absent_pages_in_node(int nid, 4479static unsigned long __meminit zone_absent_pages_in_node(int nid,
4431 unsigned long zone_type, 4480 unsigned long zone_type,
4481 unsigned long node_start_pfn,
4482 unsigned long node_end_pfn,
4432 unsigned long *ignored) 4483 unsigned long *ignored)
4433{ 4484{
4434 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4485 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4435 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4486 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4436 unsigned long node_start_pfn, node_end_pfn;
4437 unsigned long zone_start_pfn, zone_end_pfn; 4487 unsigned long zone_start_pfn, zone_end_pfn;
4438 4488
4439 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4440 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4489 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4441 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4490 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4442 4491
@@ -4449,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4449#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4498#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4450static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4499static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4451 unsigned long zone_type, 4500 unsigned long zone_type,
4501 unsigned long node_start_pfn,
4502 unsigned long node_end_pfn,
4452 unsigned long *zones_size) 4503 unsigned long *zones_size)
4453{ 4504{
4454 return zones_size[zone_type]; 4505 return zones_size[zone_type];
@@ -4456,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4456 4507
4457static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4508static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4458 unsigned long zone_type, 4509 unsigned long zone_type,
4510 unsigned long node_start_pfn,
4511 unsigned long node_end_pfn,
4459 unsigned long *zholes_size) 4512 unsigned long *zholes_size)
4460{ 4513{
4461 if (!zholes_size) 4514 if (!zholes_size)
@@ -4467,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4467#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4520#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4468 4521
4469static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4522static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4470 unsigned long *zones_size, unsigned long *zholes_size) 4523 unsigned long node_start_pfn,
4524 unsigned long node_end_pfn,
4525 unsigned long *zones_size,
4526 unsigned long *zholes_size)
4471{ 4527{
4472 unsigned long realtotalpages, totalpages = 0; 4528 unsigned long realtotalpages, totalpages = 0;
4473 enum zone_type i; 4529 enum zone_type i;
4474 4530
4475 for (i = 0; i < MAX_NR_ZONES; i++) 4531 for (i = 0; i < MAX_NR_ZONES; i++)
4476 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4532 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4477 zones_size); 4533 node_start_pfn,
4534 node_end_pfn,
4535 zones_size);
4478 pgdat->node_spanned_pages = totalpages; 4536 pgdat->node_spanned_pages = totalpages;
4479 4537
4480 realtotalpages = totalpages; 4538 realtotalpages = totalpages;
4481 for (i = 0; i < MAX_NR_ZONES; i++) 4539 for (i = 0; i < MAX_NR_ZONES; i++)
4482 realtotalpages -= 4540 realtotalpages -=
4483 zone_absent_pages_in_node(pgdat->node_id, i, 4541 zone_absent_pages_in_node(pgdat->node_id, i,
4484 zholes_size); 4542 node_start_pfn, node_end_pfn,
4543 zholes_size);
4485 pgdat->node_present_pages = realtotalpages; 4544 pgdat->node_present_pages = realtotalpages;
4486 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4545 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4487 realtotalpages); 4546 realtotalpages);
@@ -4590,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4590 * NOTE: pgdat should get zeroed by caller. 4649 * NOTE: pgdat should get zeroed by caller.
4591 */ 4650 */
4592static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4651static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4652 unsigned long node_start_pfn, unsigned long node_end_pfn,
4593 unsigned long *zones_size, unsigned long *zholes_size) 4653 unsigned long *zones_size, unsigned long *zholes_size)
4594{ 4654{
4595 enum zone_type j; 4655 enum zone_type j;
@@ -4611,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4611 struct zone *zone = pgdat->node_zones + j; 4671 struct zone *zone = pgdat->node_zones + j;
4612 unsigned long size, realsize, freesize, memmap_pages; 4672 unsigned long size, realsize, freesize, memmap_pages;
4613 4673
4614 size = zone_spanned_pages_in_node(nid, j, zones_size); 4674 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4675 node_end_pfn, zones_size);
4615 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4676 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4677 node_start_pfn,
4678 node_end_pfn,
4616 zholes_size); 4679 zholes_size);
4617 4680
4618 /* 4681 /*
@@ -4726,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4726 unsigned long node_start_pfn, unsigned long *zholes_size) 4789 unsigned long node_start_pfn, unsigned long *zholes_size)
4727{ 4790{
4728 pg_data_t *pgdat = NODE_DATA(nid); 4791 pg_data_t *pgdat = NODE_DATA(nid);
4792 unsigned long start_pfn = 0;
4793 unsigned long end_pfn = 0;
4729 4794
4730 /* pg_data_t should be reset to zero when it's allocated */ 4795 /* pg_data_t should be reset to zero when it's allocated */
4731 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4796 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
@@ -4733,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4733 pgdat->node_id = nid; 4798 pgdat->node_id = nid;
4734 pgdat->node_start_pfn = node_start_pfn; 4799 pgdat->node_start_pfn = node_start_pfn;
4735 init_zone_allows_reclaim(nid); 4800 init_zone_allows_reclaim(nid);
4736 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4801#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4802 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4803#endif
4804 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4805 zones_size, zholes_size);
4737 4806
4738 alloc_node_mem_map(pgdat); 4807 alloc_node_mem_map(pgdat);
4739#ifdef CONFIG_FLAT_NODE_MEM_MAP 4808#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4742,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4742 (unsigned long)pgdat->node_mem_map); 4811 (unsigned long)pgdat->node_mem_map);
4743#endif 4812#endif
4744 4813
4745 free_area_init_core(pgdat, zones_size, zholes_size); 4814 free_area_init_core(pgdat, start_pfn, end_pfn,
4815 zones_size, zholes_size);
4746} 4816}
4747 4817
4748#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4818#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5150,35 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore);
5150 5220
5151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5221#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5152 5222
5153unsigned long free_reserved_area(unsigned long start, unsigned long end, 5223void adjust_managed_page_count(struct page *page, long count)
5154 int poison, char *s)
5155{ 5224{
5156 unsigned long pages, pos; 5225 spin_lock(&managed_page_count_lock);
5226 page_zone(page)->managed_pages += count;
5227 totalram_pages += count;
5228#ifdef CONFIG_HIGHMEM
5229 if (PageHighMem(page))
5230 totalhigh_pages += count;
5231#endif
5232 spin_unlock(&managed_page_count_lock);
5233}
5234EXPORT_SYMBOL(adjust_managed_page_count);
5157 5235
5158 pos = start = PAGE_ALIGN(start); 5236unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5159 end &= PAGE_MASK; 5237{
5160 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { 5238 void *pos;
5161 if (poison) 5239 unsigned long pages = 0;
5162 memset((void *)pos, poison, PAGE_SIZE); 5240
5163 free_reserved_page(virt_to_page((void *)pos)); 5241 start = (void *)PAGE_ALIGN((unsigned long)start);
5242 end = (void *)((unsigned long)end & PAGE_MASK);
5243 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5244 if ((unsigned int)poison <= 0xFF)
5245 memset(pos, poison, PAGE_SIZE);
5246 free_reserved_page(virt_to_page(pos));
5164 } 5247 }
5165 5248
5166 if (pages && s) 5249 if (pages && s)
5167 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", 5250 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5168 s, pages << (PAGE_SHIFT - 10), start, end); 5251 s, pages << (PAGE_SHIFT - 10), start, end);
5169 5252
5170 return pages; 5253 return pages;
5171} 5254}
5255EXPORT_SYMBOL(free_reserved_area);
5172 5256
5173#ifdef CONFIG_HIGHMEM 5257#ifdef CONFIG_HIGHMEM
5174void free_highmem_page(struct page *page) 5258void free_highmem_page(struct page *page)
5175{ 5259{
5176 __free_reserved_page(page); 5260 __free_reserved_page(page);
5177 totalram_pages++; 5261 totalram_pages++;
5262 page_zone(page)->managed_pages++;
5178 totalhigh_pages++; 5263 totalhigh_pages++;
5179} 5264}
5180#endif 5265#endif
5181 5266
5267
5268void __init mem_init_print_info(const char *str)
5269{
5270 unsigned long physpages, codesize, datasize, rosize, bss_size;
5271 unsigned long init_code_size, init_data_size;
5272
5273 physpages = get_num_physpages();
5274 codesize = _etext - _stext;
5275 datasize = _edata - _sdata;
5276 rosize = __end_rodata - __start_rodata;
5277 bss_size = __bss_stop - __bss_start;
5278 init_data_size = __init_end - __init_begin;
5279 init_code_size = _einittext - _sinittext;
5280
5281 /*
5282 * Detect special cases and adjust section sizes accordingly:
5283 * 1) .init.* may be embedded into .data sections
5284 * 2) .init.text.* may be out of [__init_begin, __init_end],
5285 * please refer to arch/tile/kernel/vmlinux.lds.S.
5286 * 3) .rodata.* may be embedded into .text or .data sections.
5287 */
5288#define adj_init_size(start, end, size, pos, adj) \
5289 if (start <= pos && pos < end && size > adj) \
5290 size -= adj;
5291
5292 adj_init_size(__init_begin, __init_end, init_data_size,
5293 _sinittext, init_code_size);
5294 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5295 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5296 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5297 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5298
5299#undef adj_init_size
5300
5301 printk("Memory: %luK/%luK available "
5302 "(%luK kernel code, %luK rwdata, %luK rodata, "
5303 "%luK init, %luK bss, %luK reserved"
5304#ifdef CONFIG_HIGHMEM
5305 ", %luK highmem"
5306#endif
5307 "%s%s)\n",
5308 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5309 codesize >> 10, datasize >> 10, rosize >> 10,
5310 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5311 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5312#ifdef CONFIG_HIGHMEM
5313 totalhigh_pages << (PAGE_SHIFT-10),
5314#endif
5315 str ? ", " : "", str ? str : "");
5316}
5317
5182/** 5318/**
5183 * set_dma_reserve - set the specified number of pages reserved in the first zone 5319 * set_dma_reserve - set the specified number of pages reserved in the first zone
5184 * @new_dma_reserve: The number of pages to mark reserved 5320 * @new_dma_reserve: The number of pages to mark reserved
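Among the hunks above, free_reserved_area() now takes void * bounds and only poisons the freed range when the poison value fits in a single byte, so an out-of-range value such as -1 skips the memset; adjust_managed_page_count() centralises the managed_pages/totalram_pages accounting under managed_page_count_lock. A small userspace sketch of the poison test; the page size, buffer and names are illustrative.

#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096

static unsigned long free_area_demo(unsigned char *start, unsigned char *end,
                                    int poison)
{
        unsigned long pages = 0;
        unsigned char *pos;

        for (pos = start; pos < end; pos += DEMO_PAGE_SIZE, pages++) {
                if ((unsigned int)poison <= 0xFF)  /* fits in one byte? */
                        memset(pos, poison, DEMO_PAGE_SIZE);
        }
        return pages;
}

int main(void)
{
        static unsigned char buf[4 * DEMO_PAGE_SIZE];
        unsigned long n;

        n = free_area_demo(buf, buf + sizeof(buf), 0xCC);  /* poisoned */
        printf("%lu pages, first byte 0x%02x\n", n, buf[0]);

        n = free_area_demo(buf, buf + sizeof(buf), -1);    /* memset skipped */
        printf("%lu pages, first byte 0x%02x\n", n, buf[0]);
        return 0;
}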
@@ -5454,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void)
5454int __meminit init_per_zone_wmark_min(void) 5590int __meminit init_per_zone_wmark_min(void)
5455{ 5591{
5456 unsigned long lowmem_kbytes; 5592 unsigned long lowmem_kbytes;
5593 int new_min_free_kbytes;
5457 5594
5458 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5595 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5459 5596 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5460 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5597
5461 if (min_free_kbytes < 128) 5598 if (new_min_free_kbytes > user_min_free_kbytes) {
5462 min_free_kbytes = 128; 5599 min_free_kbytes = new_min_free_kbytes;
5463 if (min_free_kbytes > 65536) 5600 if (min_free_kbytes < 128)
5464 min_free_kbytes = 65536; 5601 min_free_kbytes = 128;
5602 if (min_free_kbytes > 65536)
5603 min_free_kbytes = 65536;
5604 } else {
5605 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5606 new_min_free_kbytes, user_min_free_kbytes);
5607 }
5465 setup_per_zone_wmarks(); 5608 setup_per_zone_wmarks();
5466 refresh_zone_stat_thresholds(); 5609 refresh_zone_stat_thresholds();
5467 setup_per_zone_lowmem_reserve(); 5610 setup_per_zone_lowmem_reserve();
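The init_per_zone_wmark_min() hunk above makes the automatically computed watermark respect a user-supplied value: the new estimate (int_sqrt(lowmem_kbytes * 16), clamped to 128..65536 kB) is applied only when it exceeds user_min_free_kbytes, which the sysctl handler now records; otherwise a warning is printed and the user's setting stays. The same policy in a standalone sketch; the integer square root helper and the inputs are illustrative.

#include <stdio.h>

static int min_free_kbytes_demo = 1024;
static int user_min_free_kbytes_demo;   /* written by the sysctl handler */

static unsigned long int_sqrt_demo(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

static void recompute_min_free(unsigned long lowmem_kbytes)
{
        int new_val = (int)int_sqrt_demo(lowmem_kbytes * 16);

        if (new_val > user_min_free_kbytes_demo) {
                if (new_val < 128)
                        new_val = 128;
                if (new_val > 65536)
                        new_val = 65536;
                min_free_kbytes_demo = new_val;
        } else {
                printf("keeping user value %d, ignoring %d\n",
                       user_min_free_kbytes_demo, new_val);
        }
}

int main(void)
{
        recompute_min_free(4UL * 1024 * 1024);  /* ~4 GiB of lowmem -> 8192 */
        printf("min_free_kbytes = %d\n", min_free_kbytes_demo);
        return 0;
}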
@@ -5479,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5479 void __user *buffer, size_t *length, loff_t *ppos) 5622 void __user *buffer, size_t *length, loff_t *ppos)
5480{ 5623{
5481 proc_dointvec(table, write, buffer, length, ppos); 5624 proc_dointvec(table, write, buffer, length, ppos);
5482 if (write) 5625 if (write) {
5626 user_min_free_kbytes = min_free_kbytes;
5483 setup_per_zone_wmarks(); 5627 setup_per_zone_wmarks();
5628 }
5484 return 0; 5629 return 0;
5485} 5630}
5486 5631
@@ -5540,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5540 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5685 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5541 * can have before it gets flushed back to buddy allocator. 5686 * can have before it gets flushed back to buddy allocator.
5542 */ 5687 */
5543
5544int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5688int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5545 void __user *buffer, size_t *length, loff_t *ppos) 5689 void __user *buffer, size_t *length, loff_t *ppos)
5546{ 5690{
@@ -5551,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5551 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5695 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5552 if (!write || (ret < 0)) 5696 if (!write || (ret < 0))
5553 return ret; 5697 return ret;
5698
5699 mutex_lock(&pcp_batch_high_lock);
5554 for_each_populated_zone(zone) { 5700 for_each_populated_zone(zone) {
5555 for_each_possible_cpu(cpu) { 5701 unsigned long high;
5556 unsigned long high; 5702 high = zone->managed_pages / percpu_pagelist_fraction;
5557 high = zone->managed_pages / percpu_pagelist_fraction; 5703 for_each_possible_cpu(cpu)
5558 setup_pagelist_highmark( 5704 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5559 per_cpu_ptr(zone->pageset, cpu), high); 5705 high);
5560 }
5561 } 5706 }
5707 mutex_unlock(&pcp_batch_high_lock);
5562 return 0; 5708 return 0;
5563} 5709}
5564 5710
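For reference, the per-cpu pagelist "high" mark computed inside the loop above is simply managed_pages divided by the sysctl fraction; a tiny illustration with a hypothetical zone (plain C, not kernel code):

/* A 1 GiB zone (262144 4K pages) with percpu_pagelist_fraction = 8 yields a
 * high mark of 32768 pages per CPU list. */
static unsigned long pcp_high_for(unsigned long managed_pages, int fraction)
{
	return managed_pages / fraction;
}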
@@ -6047,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
6047#endif 6193#endif
6048 6194
6049#ifdef CONFIG_MEMORY_HOTPLUG 6195#ifdef CONFIG_MEMORY_HOTPLUG
6050static int __meminit __zone_pcp_update(void *data) 6196/*
6051{ 6197 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6052 struct zone *zone = data; 6198 * page high values need to be recalculated.
6053 int cpu; 6199 */
6054 unsigned long batch = zone_batchsize(zone), flags;
6055
6056 for_each_possible_cpu(cpu) {
6057 struct per_cpu_pageset *pset;
6058 struct per_cpu_pages *pcp;
6059
6060 pset = per_cpu_ptr(zone->pageset, cpu);
6061 pcp = &pset->pcp;
6062
6063 local_irq_save(flags);
6064 if (pcp->count > 0)
6065 free_pcppages_bulk(zone, pcp->count, pcp);
6066 drain_zonestat(zone, pset);
6067 setup_pageset(pset, batch);
6068 local_irq_restore(flags);
6069 }
6070 return 0;
6071}
6072
6073void __meminit zone_pcp_update(struct zone *zone) 6200void __meminit zone_pcp_update(struct zone *zone)
6074{ 6201{
6075 stop_machine(__zone_pcp_update, zone, NULL); 6202 unsigned cpu;
6203 mutex_lock(&pcp_batch_high_lock);
6204 for_each_possible_cpu(cpu)
6205 pageset_set_high_and_batch(zone,
6206 per_cpu_ptr(zone->pageset, cpu));
6207 mutex_unlock(&pcp_batch_high_lock);
6076} 6208}
6077#endif 6209#endif
6078 6210
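The rewrite above drops stop_machine() in favour of recomputing each CPU's pageset under pcp_batch_high_lock, the same mutex the percpu_pagelist_fraction sysctl handler now takes. A generic, hedged pthread sketch of that "recompute per-CPU values under one lock" pattern, using stand-in types rather than the kernel's:

#include <pthread.h>

struct pcp { unsigned long high, batch; };

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;

/* Update every CPU's cached values atomically with respect to other writers
 * (e.g. a sysctl handler) that take the same lock. */
static void update_all(struct pcp *sets, int ncpus,
		       unsigned long high, unsigned long batch)
{
	pthread_mutex_lock(&pcp_lock);
	for (int c = 0; c < ncpus; c++) {
		sets[c].high = high;
		sets[c].batch = batch;
	}
	pthread_mutex_unlock(&pcp_lock);
}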
@@ -6142,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6142 list_del(&page->lru); 6274 list_del(&page->lru);
6143 rmv_page_order(page); 6275 rmv_page_order(page);
6144 zone->free_area[order].nr_free--; 6276 zone->free_area[order].nr_free--;
6277#ifdef CONFIG_HIGHMEM
6278 if (PageHighMem(page))
6279 totalhigh_pages -= 1 << order;
6280#endif
6145 for (i = 0; i < (1 << order); i++) 6281 for (i = 0; i < (1 << order); i++)
6146 SetPageReserved((page+i)); 6282 SetPageReserved((page+i));
6147 pfn += (1 << order); 6283 pfn += (1 << order);
diff --git a/mm/page_io.c b/mm/page_io.c
index a8a3ef45fed7..ba05b64e5d8d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <linux/blkdev.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25 26
26static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err)
80 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
81 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
82 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_sector);
83 } else { 84 goto out;
84 SetPageUptodate(page);
85 } 85 }
86
87 SetPageUptodate(page);
88
89 /*
90 * There is no guarantee that the page is in swap cache - the software
91 * suspend code (at least) uses end_swap_bio_read() against a non-
92 * swapcache page. So we must check PG_swapcache before proceeding with
93 * this optimization.
94 */
95 if (likely(PageSwapCache(page))) {
96 struct swap_info_struct *sis;
97
98 sis = page_swap_info(page);
99 if (sis->flags & SWP_BLKDEV) {
100 /*
101 * The swap subsystem performs lazy swap slot freeing,
102 * expecting that the page will be swapped out again.
103 * So we can avoid an unnecessary write if the page
104 * isn't redirtied.
105 * This is good for real swap storage because we can
106 * reduce unnecessary I/O and enhance wear-leveling
107 * if an SSD is used as the swap device.
108 * But if an in-memory swap device (e.g. zram) is used,
109 * this causes a duplicated copy between uncompressed
110 * data in VM-owned memory and compressed data in
111 * zram-owned memory. So let's free zram-owned memory
112 * and make the VM-owned decompressed page *dirty*,
113 * so the page should be swapped out somewhere again if
114 * we again wish to reclaim it.
115 */
116 struct gendisk *disk = sis->bdev->bd_disk;
117 if (disk->fops->swap_slot_free_notify) {
118 swp_entry_t entry;
119 unsigned long offset;
120
121 entry.val = page_private(page);
122 offset = swp_offset(entry);
123
124 SetPageDirty(page);
125 disk->fops->swap_slot_free_notify(sis->bdev,
126 offset);
127 }
128 }
129 }
130
131out:
86 unlock_page(page); 132 unlock_page(page);
87 bio_put(bio); 133 bio_put(bio);
88} 134}
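The comment block above boils down to one condition; here is a hedged stand-alone predicate, with stand-in types rather than the kernel's structures, that captures when the early slot free plus redirty is worth doing:

#include <stdbool.h>

struct swap_area {
	bool on_blkdev;			/* swap area flagged SWP_BLKDEV */
	bool has_slot_free_notify;	/* device provides swap_slot_free_notify() */
};

/* Only free the backing slot early (and redirty the page) when the page is
 * still in the swap cache and the backing device, e.g. zram, can reclaim its
 * compressed copy immediately. */
static bool should_free_slot_early(bool page_in_swapcache,
				   const struct swap_area *sis)
{
	return page_in_swapcache && sis->on_blkdev && sis->has_slot_free_notify;
}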
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323fe6c8f..e1a6e4fab016 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
124 124
125#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 125#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
126#ifdef CONFIG_TRANSPARENT_HUGEPAGE 126#ifdef CONFIG_TRANSPARENT_HUGEPAGE
127void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) 127void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
128 pgtable_t pgtable)
128{ 129{
129 assert_spin_locked(&mm->page_table_lock); 130 assert_spin_locked(&mm->page_table_lock);
130 131
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
141#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW 142#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
142#ifdef CONFIG_TRANSPARENT_HUGEPAGE 143#ifdef CONFIG_TRANSPARENT_HUGEPAGE
143/* no "address" argument so destroys page coloring of some arch */ 144/* no "address" argument so destroys page coloring of some arch */
144pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) 145pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
145{ 146{
146 pgtable_t pgtable; 147 pgtable_t pgtable;
147 148
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..cd356df4f71a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
720 * mapping is already gone, the unmap path will have 720 * mapping is already gone, the unmap path will have
721 * set PG_referenced or activated the page. 721 * set PG_referenced or activated the page.
722 */ 722 */
723 if (likely(!VM_SequentialReadHint(vma))) 723 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
724 referenced++; 724 referenced++;
725 } 725 }
726 pte_unmap_unlock(pte, ptl); 726 pte_unmap_unlock(pte, ptl);
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page,
1093 else 1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1095 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
1096 if (!mlocked_vma_newpage(vma, page)) 1096 if (!mlocked_vma_newpage(vma, page)) {
1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 SetPageActive(page);
1098 else 1098 lru_cache_add(page);
1099 } else
1099 add_page_to_unevictable_list(page); 1100 add_page_to_unevictable_list(page);
1100} 1101}
1101 1102
diff --git a/mm/shmem.c b/mm/shmem.c
index 118dfa4952f4..a87990cf9f94 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1936,6 +1936,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1936 1936
1937 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1937 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1938 if (inode) { 1938 if (inode) {
1939#ifdef CONFIG_TMPFS_POSIX_ACL
1940 error = generic_acl_init(inode, dir);
1941 if (error) {
1942 iput(inode);
1943 return error;
1944 }
1945#endif
1939 error = security_inode_init_security(inode, dir, 1946 error = security_inode_init_security(inode, dir,
1940 &dentry->d_name, 1947 &dentry->d_name,
1941 shmem_initxattrs, NULL); 1948 shmem_initxattrs, NULL);
@@ -1945,15 +1952,8 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1945 return error; 1952 return error;
1946 } 1953 }
1947 } 1954 }
1948#ifdef CONFIG_TMPFS_POSIX_ACL 1955
1949 error = generic_acl_init(inode, dir);
1950 if (error) {
1951 iput(inode);
1952 return error;
1953 }
1954#else
1955 error = 0; 1956 error = 0;
1956#endif
1957 dir->i_size += BOGO_DIRENT_SIZE; 1957 dir->i_size += BOGO_DIRENT_SIZE;
1958 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1958 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1959 d_instantiate(dentry, inode); 1959 d_instantiate(dentry, inode);
diff --git a/mm/slab.c b/mm/slab.c
index 8ccd296c6d9c..35cb0c861508 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -565,7 +565,7 @@ static void init_node_lock_keys(int q)
565 if (slab_state < UP) 565 if (slab_state < UP)
566 return; 566 return;
567 567
568 for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { 568 for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
569 struct kmem_cache_node *n; 569 struct kmem_cache_node *n;
570 struct kmem_cache *cache = kmalloc_caches[i]; 570 struct kmem_cache *cache = kmalloc_caches[i];
571 571
@@ -1180,6 +1180,12 @@ static int init_cache_node_node(int node)
1180 return 0; 1180 return 0;
1181} 1181}
1182 1182
1183static inline int slabs_tofree(struct kmem_cache *cachep,
1184 struct kmem_cache_node *n)
1185{
1186 return (n->free_objects + cachep->num - 1) / cachep->num;
1187}
1188
1183static void __cpuinit cpuup_canceled(long cpu) 1189static void __cpuinit cpuup_canceled(long cpu)
1184{ 1190{
1185 struct kmem_cache *cachep; 1191 struct kmem_cache *cachep;
@@ -1241,7 +1247,7 @@ free_array_cache:
1241 n = cachep->node[node]; 1247 n = cachep->node[node];
1242 if (!n) 1248 if (!n)
1243 continue; 1249 continue;
1244 drain_freelist(cachep, n, n->free_objects); 1250 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1245 } 1251 }
1246} 1252}
1247 1253
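The slabs_tofree() helper introduced above is a ceiling division; a minimal illustration in plain C, without kernel types:

/* How many whole slabs cover n free objects when each slab holds per_slab
 * objects, e.g. 10 free objects at 4 per slab -> drain up to 3 slabs. */
static int slabs_for(int free_objects, int per_slab)
{
	return (free_objects + per_slab - 1) / per_slab;
}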
@@ -1408,7 +1414,7 @@ static int __meminit drain_cache_node_node(int node)
1408 if (!n) 1414 if (!n)
1409 continue; 1415 continue;
1410 1416
1411 drain_freelist(cachep, n, n->free_objects); 1417 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1412 1418
1413 if (!list_empty(&n->slabs_full) || 1419 if (!list_empty(&n->slabs_full) ||
1414 !list_empty(&n->slabs_partial)) { 1420 !list_empty(&n->slabs_partial)) {
@@ -2532,7 +2538,7 @@ static int __cache_shrink(struct kmem_cache *cachep)
2532 if (!n) 2538 if (!n)
2533 continue; 2539 continue;
2534 2540
2535 drain_freelist(cachep, n, n->free_objects); 2541 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2536 2542
2537 ret += !list_empty(&n->slabs_full) || 2543 ret += !list_empty(&n->slabs_full) ||
2538 !list_empty(&n->slabs_partial); 2544 !list_empty(&n->slabs_partial);
@@ -3338,18 +3344,6 @@ done:
3338 return obj; 3344 return obj;
3339} 3345}
3340 3346
3341/**
3342 * kmem_cache_alloc_node - Allocate an object on the specified node
3343 * @cachep: The cache to allocate from.
3344 * @flags: See kmalloc().
3345 * @nodeid: node number of the target node.
3346 * @caller: return address of caller, used for debug information
3347 *
3348 * Identical to kmem_cache_alloc but it will allocate memory on the given
3349 * node, which can improve the performance for cpu bound structures.
3350 *
3351 * Fallback to other node is possible if __GFP_THISNODE is not set.
3352 */
3353static __always_inline void * 3347static __always_inline void *
3354slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3348slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3355 unsigned long caller) 3349 unsigned long caller)
@@ -3643,6 +3637,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
3643#endif 3637#endif
3644 3638
3645#ifdef CONFIG_NUMA 3639#ifdef CONFIG_NUMA
3640/**
3641 * kmem_cache_alloc_node - Allocate an object on the specified node
3642 * @cachep: The cache to allocate from.
3643 * @flags: See kmalloc().
3644 * @nodeid: node number of the target node.
3645 *
3646 * Identical to kmem_cache_alloc but it will allocate memory on the given
3647 * node, which can improve the performance for cpu bound structures.
3648 *
3649 * Fallback to other node is possible if __GFP_THISNODE is not set.
3650 */
3646void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3651void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3647{ 3652{
3648 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3653 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
@@ -4431,20 +4436,10 @@ static int leaks_show(struct seq_file *m, void *p)
4431 return 0; 4436 return 0;
4432} 4437}
4433 4438
4434static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4435{
4436 return seq_list_next(p, &slab_caches, pos);
4437}
4438
4439static void s_stop(struct seq_file *m, void *p)
4440{
4441 mutex_unlock(&slab_mutex);
4442}
4443
4444static const struct seq_operations slabstats_op = { 4439static const struct seq_operations slabstats_op = {
4445 .start = leaks_start, 4440 .start = leaks_start,
4446 .next = s_next, 4441 .next = slab_next,
4447 .stop = s_stop, 4442 .stop = slab_stop,
4448 .show = leaks_show, 4443 .show = leaks_show,
4449}; 4444};
4450 4445
diff --git a/mm/slab.h b/mm/slab.h
index f96b49e4704e..620ceeddbe1a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -271,3 +271,6 @@ struct kmem_cache_node {
271#endif 271#endif
272 272
273}; 273};
274
275void *slab_next(struct seq_file *m, void *p, loff_t *pos);
276void slab_stop(struct seq_file *m, void *p);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2d414508e9ec..538bade6df7d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -497,6 +497,13 @@ void __init create_kmalloc_caches(unsigned long flags)
497 497
498 498
499#ifdef CONFIG_SLABINFO 499#ifdef CONFIG_SLABINFO
500
501#ifdef CONFIG_SLAB
502#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
503#else
504#define SLABINFO_RIGHTS S_IRUSR
505#endif
506
500void print_slabinfo_header(struct seq_file *m) 507void print_slabinfo_header(struct seq_file *m)
501{ 508{
502 /* 509 /*
@@ -531,12 +538,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
531 return seq_list_start(&slab_caches, *pos); 538 return seq_list_start(&slab_caches, *pos);
532} 539}
533 540
534static void *s_next(struct seq_file *m, void *p, loff_t *pos) 541void *slab_next(struct seq_file *m, void *p, loff_t *pos)
535{ 542{
536 return seq_list_next(p, &slab_caches, pos); 543 return seq_list_next(p, &slab_caches, pos);
537} 544}
538 545
539static void s_stop(struct seq_file *m, void *p) 546void slab_stop(struct seq_file *m, void *p)
540{ 547{
541 mutex_unlock(&slab_mutex); 548 mutex_unlock(&slab_mutex);
542} 549}
@@ -613,8 +620,8 @@ static int s_show(struct seq_file *m, void *p)
613 */ 620 */
614static const struct seq_operations slabinfo_op = { 621static const struct seq_operations slabinfo_op = {
615 .start = s_start, 622 .start = s_start,
616 .next = s_next, 623 .next = slab_next,
617 .stop = s_stop, 624 .stop = slab_stop,
618 .show = s_show, 625 .show = s_show,
619}; 626};
620 627
@@ -633,7 +640,8 @@ static const struct file_operations proc_slabinfo_operations = {
633 640
634static int __init slab_proc_init(void) 641static int __init slab_proc_init(void)
635{ 642{
636 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 643 proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
644 &proc_slabinfo_operations);
637 return 0; 645 return 0;
638} 646}
639module_init(slab_proc_init); 647module_init(slab_proc_init);
diff --git a/mm/slob.c b/mm/slob.c
index eeed4a05a2ef..91bd3f2dd2f0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -122,7 +122,7 @@ static inline void clear_slob_page_free(struct page *sp)
122} 122}
123 123
124#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
126 126
127/* 127/*
128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -554,7 +554,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
554 flags, node); 554 flags, node);
555 } 555 }
556 556
557 if (c->ctor) 557 if (b && c->ctor)
558 c->ctor(b); 558 c->ctor(b);
559 559
560 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); 560 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 57707f01bcfb..3b482c863002 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -123,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
123#endif 123#endif
124} 124}
125 125
126static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
127{
128#ifdef CONFIG_SLUB_CPU_PARTIAL
129 return !kmem_cache_debug(s);
130#else
131 return false;
132#endif
133}
134
126/* 135/*
127 * Issues still to be resolved: 136 * Issues still to be resolved:
128 * 137 *
@@ -1573,7 +1582,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1573 put_cpu_partial(s, page, 0); 1582 put_cpu_partial(s, page, 0);
1574 stat(s, CPU_PARTIAL_NODE); 1583 stat(s, CPU_PARTIAL_NODE);
1575 } 1584 }
1576 if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1585 if (!kmem_cache_has_cpu_partial(s)
1586 || available > s->cpu_partial / 2)
1577 break; 1587 break;
1578 1588
1579 } 1589 }
@@ -1884,6 +1894,7 @@ redo:
1884static void unfreeze_partials(struct kmem_cache *s, 1894static void unfreeze_partials(struct kmem_cache *s,
1885 struct kmem_cache_cpu *c) 1895 struct kmem_cache_cpu *c)
1886{ 1896{
1897#ifdef CONFIG_SLUB_CPU_PARTIAL
1887 struct kmem_cache_node *n = NULL, *n2 = NULL; 1898 struct kmem_cache_node *n = NULL, *n2 = NULL;
1888 struct page *page, *discard_page = NULL; 1899 struct page *page, *discard_page = NULL;
1889 1900
@@ -1938,6 +1949,7 @@ static void unfreeze_partials(struct kmem_cache *s,
1938 discard_slab(s, page); 1949 discard_slab(s, page);
1939 stat(s, FREE_SLAB); 1950 stat(s, FREE_SLAB);
1940 } 1951 }
1952#endif
1941} 1953}
1942 1954
1943/* 1955/*
@@ -1951,10 +1963,14 @@ static void unfreeze_partials(struct kmem_cache *s,
1951 */ 1963 */
1952static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1964static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1953{ 1965{
1966#ifdef CONFIG_SLUB_CPU_PARTIAL
1954 struct page *oldpage; 1967 struct page *oldpage;
1955 int pages; 1968 int pages;
1956 int pobjects; 1969 int pobjects;
1957 1970
1971 if (!s->cpu_partial)
1972 return;
1973
1958 do { 1974 do {
1959 pages = 0; 1975 pages = 0;
1960 pobjects = 0; 1976 pobjects = 0;
@@ -1987,6 +2003,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1987 page->next = oldpage; 2003 page->next = oldpage;
1988 2004
1989 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 2005 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
2006#endif
1990} 2007}
1991 2008
1992static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 2009static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2358,7 +2375,7 @@ redo:
2358 2375
2359 object = c->freelist; 2376 object = c->freelist;
2360 page = c->page; 2377 page = c->page;
2361 if (unlikely(!object || !node_match(page, node))) 2378 if (unlikely(!object || !page || !node_match(page, node)))
2362 object = __slab_alloc(s, gfpflags, node, addr, c); 2379 object = __slab_alloc(s, gfpflags, node, addr, c);
2363 2380
2364 else { 2381 else {
@@ -2495,7 +2512,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2495 new.inuse--; 2512 new.inuse--;
2496 if ((!new.inuse || !prior) && !was_frozen) { 2513 if ((!new.inuse || !prior) && !was_frozen) {
2497 2514
2498 if (!kmem_cache_debug(s) && !prior) 2515 if (kmem_cache_has_cpu_partial(s) && !prior)
2499 2516
2500 /* 2517 /*
2501 * Slab was on no list before and will be partially empty 2518 * Slab was on no list before and will be partially empty
@@ -2550,8 +2567,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2550 * Objects left in the slab. If it was not on the partial list before 2567 * Objects left in the slab. If it was not on the partial list before
2551 * then add it. 2568 * then add it.
2552 */ 2569 */
2553 if (kmem_cache_debug(s) && unlikely(!prior)) { 2570 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2554 remove_full(s, page); 2571 if (kmem_cache_debug(s))
2572 remove_full(s, page);
2555 add_partial(n, page, DEACTIVATE_TO_TAIL); 2573 add_partial(n, page, DEACTIVATE_TO_TAIL);
2556 stat(s, FREE_ADD_PARTIAL); 2574 stat(s, FREE_ADD_PARTIAL);
2557 } 2575 }
@@ -3059,7 +3077,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3059 * per node list when we run out of per cpu objects. We only fetch 50% 3077 * per node list when we run out of per cpu objects. We only fetch 50%
3060 * to keep some capacity around for frees. 3078 * to keep some capacity around for frees.
3061 */ 3079 */
3062 if (kmem_cache_debug(s)) 3080 if (!kmem_cache_has_cpu_partial(s))
3063 s->cpu_partial = 0; 3081 s->cpu_partial = 0;
3064 else if (s->size >= PAGE_SIZE) 3082 else if (s->size >= PAGE_SIZE)
3065 s->cpu_partial = 2; 3083 s->cpu_partial = 2;
@@ -4456,7 +4474,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4456 err = strict_strtoul(buf, 10, &objects); 4474 err = strict_strtoul(buf, 10, &objects);
4457 if (err) 4475 if (err)
4458 return err; 4476 return err;
4459 if (objects && kmem_cache_debug(s)) 4477 if (objects && !kmem_cache_has_cpu_partial(s))
4460 return -EINVAL; 4478 return -EINVAL;
4461 4479
4462 s->cpu_partial = objects; 4480 s->cpu_partial = objects;
@@ -5269,7 +5287,6 @@ __initcall(slab_sysfs_init);
5269#ifdef CONFIG_SLABINFO 5287#ifdef CONFIG_SLABINFO
5270void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5288void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5271{ 5289{
5272 unsigned long nr_partials = 0;
5273 unsigned long nr_slabs = 0; 5290 unsigned long nr_slabs = 0;
5274 unsigned long nr_objs = 0; 5291 unsigned long nr_objs = 0;
5275 unsigned long nr_free = 0; 5292 unsigned long nr_free = 0;
@@ -5281,9 +5298,8 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5281 if (!n) 5298 if (!n)
5282 continue; 5299 continue;
5283 5300
5284 nr_partials += n->nr_partial; 5301 nr_slabs += node_nr_slabs(n);
5285 nr_slabs += atomic_long_read(&n->nr_slabs); 5302 nr_objs += node_nr_objs(n);
5286 nr_objs += atomic_long_read(&n->total_objects);
5287 nr_free += count_partial(n, count_free); 5303 nr_free += count_partial(n, count_free);
5288 } 5304 }
5289 5305
diff --git a/mm/sparse.c b/mm/sparse.c
index 1c91f0d3f6ab..308d50331bc3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -79,7 +79,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
79{ 79{
80 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
81 struct mem_section *section; 81 struct mem_section *section;
82 int ret = 0;
83 82
84 if (mem_section[root]) 83 if (mem_section[root])
85 return -EEXIST; 84 return -EEXIST;
@@ -90,7 +89,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 89
91 mem_section[root] = section; 90 mem_section[root] = section;
92 91
93 return ret; 92 return 0;
94} 93}
95#else /* !SPARSEMEM_EXTREME */ 94#else /* !SPARSEMEM_EXTREME */
96static inline int sparse_index_init(unsigned long section_nr, int nid) 95static inline int sparse_index_init(unsigned long section_nr, int nid)
@@ -481,6 +480,9 @@ void __init sparse_init(void)
481 struct page **map_map; 480 struct page **map_map;
482#endif 481#endif
483 482
483 /* see include/linux/mmzone.h 'struct mem_section' definition */
484 BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
485
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ 486 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order(); 487 set_pageblock_order();
486 488
@@ -751,6 +753,7 @@ out:
751 return ret; 753 return ret;
752} 754}
753 755
756#ifdef CONFIG_MEMORY_HOTREMOVE
754#ifdef CONFIG_MEMORY_FAILURE 757#ifdef CONFIG_MEMORY_FAILURE
755static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) 758static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
756{ 759{
@@ -772,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
772} 775}
773#endif 776#endif
774 777
775#ifdef CONFIG_MEMORY_HOTREMOVE
776static void free_section_usemap(struct page *memmap, unsigned long *usemap) 778static void free_section_usemap(struct page *memmap, unsigned long *usemap)
777{ 779{
778 struct page *usemap_page; 780 struct page *usemap_page;
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d6841..4a1d0d2c52fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,10 +34,13 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37#define CREATE_TRACE_POINTS
38#include <trace/events/pagemap.h>
39
37/* How many pages do we try to swap or page in/out together? */ 40/* How many pages do we try to swap or page in/out together? */
38int page_cluster; 41int page_cluster;
39 42
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
43 46
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
384 SetPageActive(page); 387 SetPageActive(page);
385 lru += LRU_ACTIVE; 388 lru += LRU_ACTIVE;
386 add_page_to_lru_list(page, lruvec, lru); 389 add_page_to_lru_list(page, lruvec, lru);
390 trace_mm_lru_activate(page, page_to_pfn(page));
387 391
388 __count_vm_event(PGACTIVATE); 392 __count_vm_event(PGACTIVATE);
389 update_page_reclaim_stat(lruvec, file, 1); 393 update_page_reclaim_stat(lruvec, file, 1);
@@ -428,6 +432,33 @@ void activate_page(struct page *page)
428} 432}
429#endif 433#endif
430 434
435static void __lru_cache_activate_page(struct page *page)
436{
437 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
438 int i;
439
440 /*
441 * Search backwards on the optimistic assumption that the page being
442 * activated has just been added to this pagevec. Note that only
443 * the local pagevec is examined as a !PageLRU page could be in the
444 * process of being released, reclaimed, migrated or on a remote
445 * pagevec that is currently being drained. Furthermore, marking
446 * a remote pagevec's page PageActive potentially hits a race where
447 * a page is marked PageActive just after it is added to the inactive
448 * list causing accounting errors and BUG_ON checks to trigger.
449 */
450 for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
451 struct page *pagevec_page = pvec->pages[i];
452
453 if (pagevec_page == page) {
454 SetPageActive(page);
455 break;
456 }
457 }
458
459 put_cpu_var(lru_add_pvec);
460}
461
431/* 462/*
432 * Mark a page as having seen activity. 463 * Mark a page as having seen activity.
433 * 464 *
@@ -438,8 +469,18 @@ void activate_page(struct page *page)
438void mark_page_accessed(struct page *page) 469void mark_page_accessed(struct page *page)
439{ 470{
440 if (!PageActive(page) && !PageUnevictable(page) && 471 if (!PageActive(page) && !PageUnevictable(page) &&
441 PageReferenced(page) && PageLRU(page)) { 472 PageReferenced(page)) {
442 activate_page(page); 473
474 /*
475 * If the page is on the LRU, queue it for activation via
476 * activate_page_pvecs. Otherwise, assume the page is on a
477 * pagevec, mark it active and it'll be moved to the active
478 * LRU on the next drain.
479 */
480 if (PageLRU(page))
481 activate_page(page);
482 else
483 __lru_cache_activate_page(page);
443 ClearPageReferenced(page); 484 ClearPageReferenced(page);
444 } else if (!PageReferenced(page)) { 485 } else if (!PageReferenced(page)) {
445 SetPageReferenced(page); 486 SetPageReferenced(page);
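A hedged sketch of the "search the CPU-local batch backwards" idea that __lru_cache_activate_page() relies on, using plain C stand-ins: a fixed-size buffer instead of a pagevec, with the batch size chosen arbitrarily for the sketch.

#include <stdbool.h>
#include <stddef.h>

#define BATCH_SIZE 14	/* arbitrary for the sketch */

struct batch {
	void *pages[BATCH_SIZE];
	size_t count;
};

/* A just-added page is most likely at the tail, so scan from the end; if it
 * is found, mark it active so it lands on the active LRU when the batch is
 * drained.  Returns false when the caller must fall back (page not local). */
static bool activate_if_batched(struct batch *b, void *page, bool *active)
{
	for (size_t i = b->count; i-- > 0; ) {
		if (b->pages[i] == page) {
			*active = true;
			return true;
		}
	}
	return false;
}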
@@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page)
448EXPORT_SYMBOL(mark_page_accessed); 489EXPORT_SYMBOL(mark_page_accessed);
449 490
450/* 491/*
451 * Order of operations is important: flush the pagevec when it's already 492 * Queue the page for addition to the LRU via pagevec. The decision on whether
452 * full, not when adding the last page, to make sure that last page is 493 * to add the page to the [in]active [file|anon] list is deferred until the
453 * not added to the LRU directly when passed to this function. Because 494 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
454 * mark_page_accessed() (called after this when writing) only activates 495 * to have the page added to the active list using mark_page_accessed().
455 * pages that are on the LRU, linear writes in subpage chunks would see
456 * every PAGEVEC_SIZE page activated, which is unexpected.
457 */ 496 */
458void __lru_cache_add(struct page *page, enum lru_list lru) 497void __lru_cache_add(struct page *page)
459{ 498{
460 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 499 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
461 500
462 page_cache_get(page); 501 page_cache_get(page);
463 if (!pagevec_space(pvec)) 502 if (!pagevec_space(pvec))
464 __pagevec_lru_add(pvec, lru); 503 __pagevec_lru_add(pvec);
465 pagevec_add(pvec, page); 504 pagevec_add(pvec, page);
466 put_cpu_var(lru_add_pvecs); 505 put_cpu_var(lru_add_pvec);
467} 506}
468EXPORT_SYMBOL(__lru_cache_add); 507EXPORT_SYMBOL(__lru_cache_add);
469 508
470/** 509/**
471 * lru_cache_add_lru - add a page to a page list 510 * lru_cache_add - add a page to a page list
472 * @page: the page to be added to the LRU. 511 * @page: the page to be added to the LRU.
473 * @lru: the LRU list to which the page is added.
474 */ 512 */
475void lru_cache_add_lru(struct page *page, enum lru_list lru) 513void lru_cache_add(struct page *page)
476{ 514{
477 if (PageActive(page)) { 515 if (PageActive(page)) {
478 VM_BUG_ON(PageUnevictable(page)); 516 VM_BUG_ON(PageUnevictable(page));
479 ClearPageActive(page);
480 } else if (PageUnevictable(page)) { 517 } else if (PageUnevictable(page)) {
481 VM_BUG_ON(PageActive(page)); 518 VM_BUG_ON(PageActive(page));
482 ClearPageUnevictable(page);
483 } 519 }
484 520
485 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); 521 VM_BUG_ON(PageLRU(page));
486 __lru_cache_add(page, lru); 522 __lru_cache_add(page);
487} 523}
488 524
489/** 525/**
@@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
583 */ 619 */
584void lru_add_drain_cpu(int cpu) 620void lru_add_drain_cpu(int cpu)
585{ 621{
586 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 622 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
587 struct pagevec *pvec;
588 int lru;
589 623
590 for_each_lru(lru) { 624 if (pagevec_count(pvec))
591 pvec = &pvecs[lru - LRU_BASE]; 625 __pagevec_lru_add(pvec);
592 if (pagevec_count(pvec))
593 __pagevec_lru_add(pvec, lru);
594 }
595 626
596 pvec = &per_cpu(lru_rotate_pvecs, cpu); 627 pvec = &per_cpu(lru_rotate_pvecs, cpu);
597 if (pagevec_count(pvec)) { 628 if (pagevec_count(pvec)) {
@@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold)
708 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 739 del_page_from_lru_list(page, lruvec, page_off_lru(page));
709 } 740 }
710 741
742 /* Clear Active bit in case of parallel mark_page_accessed */
743 ClearPageActive(page);
744
711 list_add(&page->lru, &pages_to_free); 745 list_add(&page->lru, &pages_to_free);
712 } 746 }
713 if (zone) 747 if (zone)
@@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
795static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 829static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
796 void *arg) 830 void *arg)
797{ 831{
798 enum lru_list lru = (enum lru_list)arg; 832 int file = page_is_file_cache(page);
799 int file = is_file_lru(lru); 833 int active = PageActive(page);
800 int active = is_active_lru(lru); 834 enum lru_list lru = page_lru(page);
801 835
802 VM_BUG_ON(PageActive(page));
803 VM_BUG_ON(PageUnevictable(page)); 836 VM_BUG_ON(PageUnevictable(page));
804 VM_BUG_ON(PageLRU(page)); 837 VM_BUG_ON(PageLRU(page));
805 838
806 SetPageLRU(page); 839 SetPageLRU(page);
807 if (active)
808 SetPageActive(page);
809 add_page_to_lru_list(page, lruvec, lru); 840 add_page_to_lru_list(page, lruvec, lru);
810 update_page_reclaim_stat(lruvec, file, active); 841 update_page_reclaim_stat(lruvec, file, active);
842 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
811} 843}
812 844
813/* 845/*
814 * Add the passed pages to the LRU, then drop the caller's refcount 846 * Add the passed pages to the LRU, then drop the caller's refcount
815 * on them. Reinitialises the caller's pagevec. 847 * on them. Reinitialises the caller's pagevec.
816 */ 848 */
817void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 849void __pagevec_lru_add(struct pagevec *pvec)
818{ 850{
819 VM_BUG_ON(is_unevictable_lru(lru)); 851 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
820
821 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
822} 852}
823EXPORT_SYMBOL(__pagevec_lru_add); 853EXPORT_SYMBOL(__pagevec_lru_add);
824 854
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746af55b8455..36af6eeaa67e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 213 goto checks;
214 } 214 }
215 if (si->flags & SWP_DISCARDABLE) { 215 if (si->flags & SWP_PAGE_DISCARD) {
216 /* 216 /*
217 * Start range check on racing allocations, in case 217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on 218 * they overlap the cluster we eventually decide on
@@ -322,7 +322,7 @@ checks:
322 322
323 if (si->lowest_alloc) { 323 if (si->lowest_alloc) {
324 /* 324 /*
325 * Only set when SWP_DISCARDABLE, and there's a scan 325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed. 326 * for a free cluster in progress or just completed.
327 */ 327 */
328 if (found_free_cluster) { 328 if (found_free_cluster) {
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2016 return nr_extents; 2016 return nr_extents;
2017} 2017}
2018 2018
2019/*
2020 * Helper to sys_swapon determining if a given swap
2021 * backing device queue supports DISCARD operations.
2022 */
2023static bool swap_discardable(struct swap_info_struct *si)
2024{
2025 struct request_queue *q = bdev_get_queue(si->bdev);
2026
2027 if (!q || !blk_queue_discard(q))
2028 return false;
2029
2030 return true;
2031}
2032
2019SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2033SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2020{ 2034{
2021 struct swap_info_struct *p; 2035 struct swap_info_struct *p;
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2123 p->flags |= SWP_SOLIDSTATE; 2137 p->flags |= SWP_SOLIDSTATE;
2124 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2138 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2125 } 2139 }
2126 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2140
2127 p->flags |= SWP_DISCARDABLE; 2141 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2142 /*
2143 * When discard is enabled for swap with no particular
2144 * policy flagged, we set all swap discard flags here in
2145 * order to sustain backward compatibility with older
2146 * swapon(8) releases.
2147 */
2148 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2149 SWP_PAGE_DISCARD);
2150
2151 /*
2152 * By flagging sys_swapon, a sysadmin can tell us to
2153 * either do single-time area discards only, or to just
2154 * perform discards for released swap page-clusters.
2155 * Now it's time to adjust the p->flags accordingly.
2156 */
2157 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2158 p->flags &= ~SWP_PAGE_DISCARD;
2159 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2160 p->flags &= ~SWP_AREA_DISCARD;
2161
2162 /* issue a swapon-time discard if it's still required */
2163 if (p->flags & SWP_AREA_DISCARD) {
2164 int err = discard_swap(p);
2165 if (unlikely(err))
2166 printk(KERN_ERR
2167 "swapon: discard_swap(%p): %d\n",
2168 p, err);
2169 }
2170 }
2128 } 2171 }
2129 2172
2130 mutex_lock(&swapon_mutex); 2173 mutex_lock(&swapon_mutex);
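A user-space sketch of the flag policy added above, using stand-in constants (the real SWAP_FLAG_* and SWP_* values differ): plain SWAP_FLAG_DISCARD enables both discard behaviours for backward compatibility, and the _ONCE/_PAGES modifiers then strip one of them.

#include <stdio.h>

#define DISCARD        0x1
#define DISCARD_ONCE   0x2	/* area (swapon-time) discard only */
#define DISCARD_PAGES  0x4	/* per page-cluster discard only */

#define AREA_DISCARD   0x10
#define PAGE_DISCARD   0x20

static unsigned int discard_policy(unsigned int swap_flags)
{
	unsigned int p;

	if (!(swap_flags & DISCARD))
		return 0;
	p = AREA_DISCARD | PAGE_DISCARD;	/* default: both behaviours */
	if (swap_flags & DISCARD_ONCE)
		p &= ~PAGE_DISCARD;
	else if (swap_flags & DISCARD_PAGES)
		p &= ~AREA_DISCARD;
	return p;
}

int main(void)
{
	/* prints "30 10 20": both, area-only, page-cluster-only */
	printf("%x %x %x\n",
	       discard_policy(DISCARD),
	       discard_policy(DISCARD | DISCARD_ONCE),
	       discard_policy(DISCARD | DISCARD_PAGES));
	return 0;
}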
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2135 enable_swap_info(p, prio, swap_map, frontswap_map); 2178 enable_swap_info(p, prio, swap_map, frontswap_map);
2136 2179
2137 printk(KERN_INFO "Adding %uk swap on %s. " 2180 printk(KERN_INFO "Adding %uk swap on %s. "
2138 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2181 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2139 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2182 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2140 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2183 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2141 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2184 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2142 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2185 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2186 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2187 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2143 (frontswap_map) ? "FS" : ""); 2188 (frontswap_map) ? "FS" : "");
2144 2189
2145 mutex_unlock(&swapon_mutex); 2190 mutex_unlock(&swapon_mutex);
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..7441c41d00f6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
295{ 295{
296 mm->mmap_base = TASK_UNMAPPED_BASE; 296 mm->mmap_base = TASK_UNMAPPED_BASE;
297 mm->get_unmapped_area = arch_get_unmapped_area; 297 mm->get_unmapped_area = arch_get_unmapped_area;
298 mm->unmap_area = arch_unmap_area;
299} 298}
300#endif 299#endif
301 300
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb05..13a54953a273 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
292 va = rb_entry(n, struct vmap_area, rb_node); 292 va = rb_entry(n, struct vmap_area, rb_node);
293 if (addr < va->va_start) 293 if (addr < va->va_start)
294 n = n->rb_left; 294 n = n->rb_left;
295 else if (addr > va->va_start) 295 else if (addr >= va->va_end)
296 n = n->rb_right; 296 n = n->rb_right;
297 else 297 else
298 return va; 298 return va;
@@ -388,12 +388,12 @@ nocache:
388 addr = ALIGN(first->va_end, align); 388 addr = ALIGN(first->va_end, align);
389 if (addr < vstart) 389 if (addr < vstart)
390 goto nocache; 390 goto nocache;
391 if (addr + size - 1 < addr) 391 if (addr + size < addr)
392 goto overflow; 392 goto overflow;
393 393
394 } else { 394 } else {
395 addr = ALIGN(vstart, align); 395 addr = ALIGN(vstart, align);
396 if (addr + size - 1 < addr) 396 if (addr + size < addr)
397 goto overflow; 397 goto overflow;
398 398
399 n = vmap_area_root.rb_node; 399 n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
420 if (addr + cached_hole_size < first->va_start) 420 if (addr + cached_hole_size < first->va_start)
421 cached_hole_size = first->va_start - addr; 421 cached_hole_size = first->va_start - addr;
422 addr = ALIGN(first->va_end, align); 422 addr = ALIGN(first->va_end, align);
423 if (addr + size - 1 < addr) 423 if (addr + size < addr)
424 goto overflow; 424 goto overflow;
425 425
426 if (list_is_last(&first->list, &vmap_area_list)) 426 if (list_is_last(&first->list, &vmap_area_list))
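The hunk above also changes the address-range overflow guards from "addr + size - 1 < addr" to "addr + size < addr"; the new form is the standard unsigned wrap-around test and, unlike the old one, also flags a range whose end wraps to exactly zero. A one-liner sketch:

#include <stdbool.h>

/* True when addr + size wraps past the top of the address space (unsigned
 * arithmetic wraps modulo 2^N, so the sum comes out smaller than addr). */
static bool range_wraps(unsigned long addr, unsigned long size)
{
	return addr + size < addr;
}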
@@ -754,7 +754,6 @@ struct vmap_block {
754 struct vmap_area *va; 754 struct vmap_area *va;
755 struct vmap_block_queue *vbq; 755 struct vmap_block_queue *vbq;
756 unsigned long free, dirty; 756 unsigned long free, dirty;
757 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
758 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 757 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
759 struct list_head free_list; 758 struct list_head free_list;
760 struct rcu_head rcu_head; 759 struct rcu_head rcu_head;
@@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
820 vb->va = va; 819 vb->va = va;
821 vb->free = VMAP_BBMAP_BITS; 820 vb->free = VMAP_BBMAP_BITS;
822 vb->dirty = 0; 821 vb->dirty = 0;
823 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
824 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 822 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
825 INIT_LIST_HEAD(&vb->free_list); 823 INIT_LIST_HEAD(&vb->free_list);
826 824
@@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu)
873 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 871 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
874 vb->free = 0; /* prevent further allocs after releasing lock */ 872 vb->free = 0; /* prevent further allocs after releasing lock */
875 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 873 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
876 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
877 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 874 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
878 spin_lock(&vbq->lock); 875 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list); 876 list_del_rcu(&vb->free_list);
@@ -891,11 +888,6 @@ static void purge_fragmented_blocks(int cpu)
891 } 888 }
892} 889}
893 890
894static void purge_fragmented_blocks_thiscpu(void)
895{
896 purge_fragmented_blocks(smp_processor_id());
897}
898
899static void purge_fragmented_blocks_allcpus(void) 891static void purge_fragmented_blocks_allcpus(void)
900{ 892{
901 int cpu; 893 int cpu;
@@ -910,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
910 struct vmap_block *vb; 902 struct vmap_block *vb;
911 unsigned long addr = 0; 903 unsigned long addr = 0;
912 unsigned int order; 904 unsigned int order;
913 int purge = 0;
914 905
915 BUG_ON(size & ~PAGE_MASK); 906 BUG_ON(size & ~PAGE_MASK);
916 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 907 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -934,17 +925,7 @@ again:
934 if (vb->free < 1UL << order) 925 if (vb->free < 1UL << order)
935 goto next; 926 goto next;
936 927
937 i = bitmap_find_free_region(vb->alloc_map, 928 i = VMAP_BBMAP_BITS - vb->free;
938 VMAP_BBMAP_BITS, order);
939
940 if (i < 0) {
941 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
942 /* fragmented and no outstanding allocations */
943 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
944 purge = 1;
945 }
946 goto next;
947 }
948 addr = vb->va->va_start + (i << PAGE_SHIFT); 929 addr = vb->va->va_start + (i << PAGE_SHIFT);
949 BUG_ON(addr_to_vb_idx(addr) != 930 BUG_ON(addr_to_vb_idx(addr) !=
950 addr_to_vb_idx(vb->va->va_start)); 931 addr_to_vb_idx(vb->va->va_start));
@@ -960,9 +941,6 @@ next:
960 spin_unlock(&vb->lock); 941 spin_unlock(&vb->lock);
961 } 942 }
962 943
963 if (purge)
964 purge_fragmented_blocks_thiscpu();
965
966 put_cpu_var(vmap_block_queue); 944 put_cpu_var(vmap_block_queue);
967 rcu_read_unlock(); 945 rcu_read_unlock();
968 946
@@ -1311,22 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1311 spin_unlock(&vmap_area_lock); 1289 spin_unlock(&vmap_area_lock);
1312} 1290}
1313 1291
1314static void clear_vm_unlist(struct vm_struct *vm) 1292static void clear_vm_uninitialized_flag(struct vm_struct *vm)
1315{ 1293{
1316 /* 1294 /*
1317 * Before removing VM_UNLIST, 1295 * Before removing VM_UNINITIALIZED,
1318 * we should make sure that vm has proper values. 1296 * we should make sure that vm has proper values.
1319 * Pair with smp_rmb() in show_numa_info(). 1297 * Pair with smp_rmb() in show_numa_info().
1320 */ 1298 */
1321 smp_wmb(); 1299 smp_wmb();
1322 vm->flags &= ~VM_UNLIST; 1300 vm->flags &= ~VM_UNINITIALIZED;
1323}
1324
1325static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1326 unsigned long flags, const void *caller)
1327{
1328 setup_vmalloc_vm(vm, va, flags, caller);
1329 clear_vm_unlist(vm);
1330} 1301}
1331 1302
1332static struct vm_struct *__get_vm_area_node(unsigned long size, 1303static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1337,16 +1308,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1337 struct vm_struct *area; 1308 struct vm_struct *area;
1338 1309
1339 BUG_ON(in_interrupt()); 1310 BUG_ON(in_interrupt());
1340 if (flags & VM_IOREMAP) { 1311 if (flags & VM_IOREMAP)
1341 int bit = fls(size); 1312 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
1342
1343 if (bit > IOREMAP_MAX_ORDER)
1344 bit = IOREMAP_MAX_ORDER;
1345 else if (bit < PAGE_SHIFT)
1346 bit = PAGE_SHIFT;
1347
1348 align = 1ul << bit;
1349 }
1350 1313
1351 size = PAGE_ALIGN(size); 1314 size = PAGE_ALIGN(size);
1352 if (unlikely(!size)) 1315 if (unlikely(!size))
@@ -1367,16 +1330,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1367 return NULL; 1330 return NULL;
1368 } 1331 }
1369 1332
1370 /* 1333 setup_vmalloc_vm(area, va, flags, caller);
1371 * When this function is called from __vmalloc_node_range,
1372 * we add VM_UNLIST flag to avoid accessing uninitialized
1373 * members of vm_struct such as pages and nr_pages fields.
1374 * They will be set later.
1375 */
1376 if (flags & VM_UNLIST)
1377 setup_vmalloc_vm(area, va, flags, caller);
1378 else
1379 insert_vmalloc_vm(area, va, flags, caller);
1380 1334
1381 return area; 1335 return area;
1382} 1336}
@@ -1476,10 +1430,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1476 if (!addr) 1430 if (!addr)
1477 return; 1431 return;
1478 1432
1479 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1433 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1480 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1434 addr))
1481 return; 1435 return;
1482 }
1483 1436
1484 area = remove_vm_area(addr); 1437 area = remove_vm_area(addr);
1485 if (unlikely(!area)) { 1438 if (unlikely(!area)) {
@@ -1524,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
1524 * conventions for vfree() arch-dependent would be a really bad idea) 1477 *
1525 * 1478 *
1526 * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) 1479 * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
1527 *
1528 */ 1480 */
1529void vfree(const void *addr) 1481void vfree(const void *addr)
1530{ 1482{
@@ -1536,8 +1488,8 @@ void vfree(const void *addr)
1536 return; 1488 return;
1537 if (unlikely(in_interrupt())) { 1489 if (unlikely(in_interrupt())) {
1538 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); 1490 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
1539 llist_add((struct llist_node *)addr, &p->list); 1491 if (llist_add((struct llist_node *)addr, &p->list))
1540 schedule_work(&p->wq); 1492 schedule_work(&p->wq);
1541 } else 1493 } else
1542 __vunmap(addr, 1); 1494 __vunmap(addr, 1);
1543} 1495}
@@ -1682,21 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1682 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1634 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1683 goto fail; 1635 goto fail;
1684 1636
1685 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, 1637 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
1686 start, end, node, gfp_mask, caller); 1638 start, end, node, gfp_mask, caller);
1687 if (!area) 1639 if (!area)
1688 goto fail; 1640 goto fail;
1689 1641
1690 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1642 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1691 if (!addr) 1643 if (!addr)
1692 return NULL; 1644 goto fail;
1693 1645
1694 /* 1646 /*
1695 * In this function, newly allocated vm_struct has VM_UNLIST flag. 1647 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
1696 * It means that vm_struct is not fully initialized. 1648 * flag. It means that vm_struct is not fully initialized.
1697 * Now, it is fully initialized, so remove this flag here. 1649 * Now, it is fully initialized, so remove this flag here.
1698 */ 1650 */
1699 clear_vm_unlist(area); 1651 clear_vm_uninitialized_flag(area);
1700 1652
1701 /* 1653 /*
1702 * A ref_count = 3 is needed because the vm_struct and vmap_area 1654 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2148,42 +2100,43 @@ finished:
2148} 2100}
2149 2101
2150/** 2102/**
2151 * remap_vmalloc_range - map vmalloc pages to userspace 2103 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2152 * @vma: vma to cover (map full range of vma) 2104 * @vma: vma to cover
2153 * @addr: vmalloc memory 2105 * @uaddr: target user address to start at
2154 * @pgoff: number of pages into addr before first page to map 2106 * @kaddr: virtual address of vmalloc kernel memory
2107 * @size: size of map area
2155 * 2108 *
2156 * Returns: 0 for success, -Exxx on failure 2109 * Returns: 0 for success, -Exxx on failure
2157 * 2110 *
2158 * This function checks that addr is a valid vmalloc'ed area, and 2111 * This function checks that @kaddr is a valid vmalloc'ed area,
2159 * that it is big enough to cover the vma. Will return failure if 2112 * and that it is big enough to cover the range starting at
2160 * that criteria isn't met. 2113 * @uaddr in @vma. Will return failure if that criteria isn't
2114 * met.
2161 * 2115 *
2162 * Similar to remap_pfn_range() (see mm/memory.c) 2116 * Similar to remap_pfn_range() (see mm/memory.c)
2163 */ 2117 */
2164int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2118int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2165 unsigned long pgoff) 2119 void *kaddr, unsigned long size)
2166{ 2120{
2167 struct vm_struct *area; 2121 struct vm_struct *area;
2168 unsigned long uaddr = vma->vm_start;
2169 unsigned long usize = vma->vm_end - vma->vm_start;
2170 2122
2171 if ((PAGE_SIZE-1) & (unsigned long)addr) 2123 size = PAGE_ALIGN(size);
2124
2125 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2172 return -EINVAL; 2126 return -EINVAL;
2173 2127
2174 area = find_vm_area(addr); 2128 area = find_vm_area(kaddr);
2175 if (!area) 2129 if (!area)
2176 return -EINVAL; 2130 return -EINVAL;
2177 2131
2178 if (!(area->flags & VM_USERMAP)) 2132 if (!(area->flags & VM_USERMAP))
2179 return -EINVAL; 2133 return -EINVAL;
2180 2134
2181 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 2135 if (kaddr + size > area->addr + area->size)
2182 return -EINVAL; 2136 return -EINVAL;
2183 2137
2184 addr += pgoff << PAGE_SHIFT;
2185 do { 2138 do {
2186 struct page *page = vmalloc_to_page(addr); 2139 struct page *page = vmalloc_to_page(kaddr);
2187 int ret; 2140 int ret;
2188 2141
2189 ret = vm_insert_page(vma, uaddr, page); 2142 ret = vm_insert_page(vma, uaddr, page);
@@ -2191,14 +2144,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2191 return ret; 2144 return ret;
2192 2145
2193 uaddr += PAGE_SIZE; 2146 uaddr += PAGE_SIZE;
2194 addr += PAGE_SIZE; 2147 kaddr += PAGE_SIZE;
2195 usize -= PAGE_SIZE; 2148 size -= PAGE_SIZE;
2196 } while (usize > 0); 2149 } while (size > 0);
2197 2150
2198 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 2151 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2199 2152
2200 return 0; 2153 return 0;
2201} 2154}
2155EXPORT_SYMBOL(remap_vmalloc_range_partial);
2156
2157/**
2158 * remap_vmalloc_range - map vmalloc pages to userspace
2159 * @vma: vma to cover (map full range of vma)
2160 * @addr: vmalloc memory
2161 * @pgoff: number of pages into addr before first page to map
2162 *
2163 * Returns: 0 for success, -Exxx on failure
2164 *
2165 * This function checks that addr is a valid vmalloc'ed area, and
2166 * that it is big enough to cover the vma. Will return failure if
2167 * that criteria isn't met.
2168 *
2169 * Similar to remap_pfn_range() (see mm/memory.c)
2170 */
2171int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2172 unsigned long pgoff)
2173{
2174 return remap_vmalloc_range_partial(vma, vma->vm_start,
2175 addr + (pgoff << PAGE_SHIFT),
2176 vma->vm_end - vma->vm_start);
2177}
2202EXPORT_SYMBOL(remap_vmalloc_range); 2178EXPORT_SYMBOL(remap_vmalloc_range);
2203 2179
2204/* 2180/*
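A hedged kernel-context fragment (not part of this patch) showing the typical caller the unchanged remap_vmalloc_range() interface serves: a driver mmap handler exposing a vmalloc_user() buffer, which after this change funnels through remap_vmalloc_range_partial() covering the whole VMA. drv_buf and drv_mmap are hypothetical names.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Assumed to have been allocated with vmalloc_user() at probe time, so the
 * area carries VM_USERMAP and is pre-zeroed for userspace exposure. */
static void *drv_buf;

static int drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	return remap_vmalloc_range(vma, drv_buf, vma->vm_pgoff);
}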
@@ -2512,8 +2488,8 @@ found:
2512 2488
2513 /* insert all vm's */ 2489 /* insert all vm's */
2514 for (area = 0; area < nr_vms; area++) 2490 for (area = 0; area < nr_vms; area++)
2515 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2491 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2516 pcpu_get_vm_areas); 2492 pcpu_get_vm_areas);
2517 2493
2518 kfree(vas); 2494 kfree(vas);
2519 return vms; 2495 return vms;
@@ -2592,11 +2568,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2592 if (!counters) 2568 if (!counters)
2593 return; 2569 return;
2594 2570
2595 /* Pair with smp_wmb() in clear_vm_unlist() */
2596 smp_rmb();
2597 if (v->flags & VM_UNLIST)
2598 return;
2599
2600 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2571 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2601 2572
2602 for (nr = 0; nr < v->nr_pages; nr++) 2573 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2625,6 +2596,11 @@ static int s_show(struct seq_file *m, void *p)
2625 2596
2626 v = va->vm; 2597 v = va->vm;
2627 2598
2599 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2600 smp_rmb();
2601 if (v->flags & VM_UNINITIALIZED)
2602 return 0;
2603
2628 seq_printf(m, "0x%pK-0x%pK %7ld", 2604 seq_printf(m, "0x%pK-0x%pK %7ld",
2629 v->addr, v->addr + v->size, v->size); 2605 v->addr, v->addr + v->size, v->size);
2630 2606
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..2cff0d491c6d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
546void putback_lru_page(struct page *page) 546void putback_lru_page(struct page *page)
547{ 547{
548 int lru; 548 int lru;
549 int active = !!TestClearPageActive(page);
550 int was_unevictable = PageUnevictable(page); 549 int was_unevictable = PageUnevictable(page);
551 550
552 VM_BUG_ON(PageLRU(page)); 551 VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
561 * unevictable page on [in]active list. 560 * unevictable page on [in]active list.
562 * We know how to handle that. 561 * We know how to handle that.
563 */ 562 */
564 lru = active + page_lru_base_type(page); 563 lru = page_lru_base_type(page);
565 lru_cache_add_lru(page, lru); 564 lru_cache_add(page);
566 } else { 565 } else {
567 /* 566 /*
568 * Put unevictable pages directly on zone's unevictable 567 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
669 return PAGEREF_RECLAIM; 668 return PAGEREF_RECLAIM;
670} 669}
671 670
671/* Check if a page is dirty or under writeback */
672static void page_check_dirty_writeback(struct page *page,
673 bool *dirty, bool *writeback)
674{
675 struct address_space *mapping;
676
677 /*
678 * Anonymous pages are not handled by flushers and must be written
679 * from reclaim context. Do not stall reclaim based on them
680 */
681 if (!page_is_file_cache(page)) {
682 *dirty = false;
683 *writeback = false;
684 return;
685 }
686
687 /* By default assume that the page flags are accurate */
688 *dirty = PageDirty(page);
689 *writeback = PageWriteback(page);
690
691 /* Verify dirty/writeback state if the filesystem supports it */
692 if (!page_has_private(page))
693 return;
694
695 mapping = page_mapping(page);
696 if (mapping && mapping->a_ops->is_dirty_writeback)
697 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
698}
699
672/* 700/*
673 * shrink_page_list() returns the number of reclaimed pages 701 * shrink_page_list() returns the number of reclaimed pages
674 */ 702 */
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
677 struct scan_control *sc, 705 struct scan_control *sc,
678 enum ttu_flags ttu_flags, 706 enum ttu_flags ttu_flags,
679 unsigned long *ret_nr_dirty, 707 unsigned long *ret_nr_dirty,
708 unsigned long *ret_nr_unqueued_dirty,
709 unsigned long *ret_nr_congested,
680 unsigned long *ret_nr_writeback, 710 unsigned long *ret_nr_writeback,
711 unsigned long *ret_nr_immediate,
681 bool force_reclaim) 712 bool force_reclaim)
682{ 713{
683 LIST_HEAD(ret_pages); 714 LIST_HEAD(ret_pages);
684 LIST_HEAD(free_pages); 715 LIST_HEAD(free_pages);
685 int pgactivate = 0; 716 int pgactivate = 0;
717 unsigned long nr_unqueued_dirty = 0;
686 unsigned long nr_dirty = 0; 718 unsigned long nr_dirty = 0;
687 unsigned long nr_congested = 0; 719 unsigned long nr_congested = 0;
688 unsigned long nr_reclaimed = 0; 720 unsigned long nr_reclaimed = 0;
689 unsigned long nr_writeback = 0; 721 unsigned long nr_writeback = 0;
722 unsigned long nr_immediate = 0;
690 723
691 cond_resched(); 724 cond_resched();
692 725
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
696 struct page *page; 729 struct page *page;
697 int may_enter_fs; 730 int may_enter_fs;
698 enum page_references references = PAGEREF_RECLAIM_CLEAN; 731 enum page_references references = PAGEREF_RECLAIM_CLEAN;
732 bool dirty, writeback;
699 733
700 cond_resched(); 734 cond_resched();
701 735
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
723 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 757 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
724 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 758 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
725 759
760 /*
761 * The number of dirty pages determines if a zone is marked
762 * reclaim_congested which affects wait_iff_congested. kswapd
763 * will stall and start writing pages if the tail of the LRU
764 * is all dirty unqueued pages.
765 */
766 page_check_dirty_writeback(page, &dirty, &writeback);
767 if (dirty || writeback)
768 nr_dirty++;
769
770 if (dirty && !writeback)
771 nr_unqueued_dirty++;
772
773 /*
774 * Treat this page as congested if the underlying BDI is or if
775 * pages are cycling through the LRU so quickly that the
776 * pages marked for immediate reclaim are making it to the
777 * end of the LRU a second time.
778 */
779 mapping = page_mapping(page);
780 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
781 (writeback && PageReclaim(page)))
782 nr_congested++;
783
784 /*
785 * If a page at the tail of the LRU is under writeback, there
786 * are three cases to consider.
787 *
788 * 1) If reclaim is encountering an excessive number of pages
789 * under writeback and this page is both under writeback and
790 * PageReclaim then it indicates that pages are being queued
791 * for IO but are being recycled through the LRU before the
792 * IO can complete. Waiting on the page itself risks an
793 * indefinite stall if it is impossible to writeback the
794 * page due to IO error or disconnected storage so instead
795 * note that the LRU is being scanned too quickly and the
796 * caller can stall after page list has been processed.
797 *
798 * 2) Global reclaim encounters a page, memcg encounters a
799 * page that is not marked for immediate reclaim or
800 * the caller does not have __GFP_IO. In this case mark
801 * the page for immediate reclaim and continue scanning.
802 *
803 * __GFP_IO is checked because a loop driver thread might
804 * enter reclaim, and deadlock if it waits on a page for
805 * which it is needed to do the write (loop masks off
806 * __GFP_IO|__GFP_FS for this reason); but more thought
807 * would probably show more reasons.
808 *
809 * Don't require __GFP_FS, since we're not going into the
810 * FS, just waiting on its writeback completion. Worryingly,
811 * ext4 gfs2 and xfs allocate pages with
812 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
813 * may_enter_fs here is liable to OOM on them.
814 *
815 * 3) memcg encounters a page that is not already marked
816 * PageReclaim. memcg does not have any dirty pages
817 * throttling so we could easily OOM just because too many
818 * pages are in writeback and there is nothing else to
819 * reclaim. Wait for the writeback to complete.
820 */
726 if (PageWriteback(page)) { 821 if (PageWriteback(page)) {
727 /* 822 /* Case 1 above */
728 * memcg doesn't have any dirty pages throttling so we 823 if (current_is_kswapd() &&
729 * could easily OOM just because too many pages are in 824 PageReclaim(page) &&
730 * writeback and there is nothing else to reclaim. 825 zone_is_reclaim_writeback(zone)) {
731 * 826 nr_immediate++;
732 * Check __GFP_IO, certainly because a loop driver 827 goto keep_locked;
733 * thread might enter reclaim, and deadlock if it waits 828
734 * on a page for which it is needed to do the write 829 /* Case 2 above */
735 * (loop masks off __GFP_IO|__GFP_FS for this reason); 830 } else if (global_reclaim(sc) ||
736 * but more thought would probably show more reasons.
737 *
738 * Don't require __GFP_FS, since we're not going into
739 * the FS, just waiting on its writeback completion.
740 * Worryingly, ext4 gfs2 and xfs allocate pages with
741 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
742 * testing may_enter_fs here is liable to OOM on them.
743 */
744 if (global_reclaim(sc) ||
745 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 831 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
746 /* 832 /*
747 * This is slightly racy - end_page_writeback() 833 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
756 */ 842 */
757 SetPageReclaim(page); 843 SetPageReclaim(page);
758 nr_writeback++; 844 nr_writeback++;
845
759 goto keep_locked; 846 goto keep_locked;
847
848 /* Case 3 above */
849 } else {
850 wait_on_page_writeback(page);
760 } 851 }
761 wait_on_page_writeback(page);
762 } 852 }
763 853
764 if (!force_reclaim) 854 if (!force_reclaim)
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
784 if (!add_to_swap(page, page_list)) 874 if (!add_to_swap(page, page_list))
785 goto activate_locked; 875 goto activate_locked;
786 may_enter_fs = 1; 876 may_enter_fs = 1;
787 }
788 877
789 mapping = page_mapping(page); 878 /* Adding to swap updated mapping */
879 mapping = page_mapping(page);
880 }
790 881
791 /* 882 /*
792 * The page is mapped into the page tables of one or more 883 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 } 897 }
807 898
808 if (PageDirty(page)) { 899 if (PageDirty(page)) {
809 nr_dirty++;
810
811 /* 900 /*
812 * Only kswapd can writeback filesystem pages to 901 * Only kswapd can writeback filesystem pages to
813 * avoid risk of stack overflow but do not writeback 902 * avoid risk of stack overflow but only writeback
814 * unless under significant pressure. 903 * if many dirty pages have been encountered.
815 */ 904 */
816 if (page_is_file_cache(page) && 905 if (page_is_file_cache(page) &&
817 (!current_is_kswapd() || 906 (!current_is_kswapd() ||
818 sc->priority >= DEF_PRIORITY - 2)) { 907 !zone_is_reclaim_dirty(zone))) {
819 /* 908 /*
820 * Immediately reclaim when written back. 909 * Immediately reclaim when written back.
821 * Similar in principle to deactivate_page() 910 * Similar in principle to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
838 /* Page is dirty, try to write it out here */ 927 /* Page is dirty, try to write it out here */
839 switch (pageout(page, mapping, sc)) { 928 switch (pageout(page, mapping, sc)) {
840 case PAGE_KEEP: 929 case PAGE_KEEP:
841 nr_congested++;
842 goto keep_locked; 930 goto keep_locked;
843 case PAGE_ACTIVATE: 931 case PAGE_ACTIVATE:
844 goto activate_locked; 932 goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1034 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
947 } 1035 }
948 1036
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
958 free_hot_cold_page_list(&free_pages, 1); 1037 free_hot_cold_page_list(&free_pages, 1);
959 1038
960 list_splice(&ret_pages, page_list); 1039 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1040 count_vm_events(PGACTIVATE, pgactivate);
962 mem_cgroup_uncharge_end(); 1041 mem_cgroup_uncharge_end();
963 *ret_nr_dirty += nr_dirty; 1042 *ret_nr_dirty += nr_dirty;
1043 *ret_nr_congested += nr_congested;
1044 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
964 *ret_nr_writeback += nr_writeback; 1045 *ret_nr_writeback += nr_writeback;
1046 *ret_nr_immediate += nr_immediate;
965 return nr_reclaimed; 1047 return nr_reclaimed;
966} 1048}
967 1049
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
973 .priority = DEF_PRIORITY, 1055 .priority = DEF_PRIORITY,
974 .may_unmap = 1, 1056 .may_unmap = 1,
975 }; 1057 };
976 unsigned long ret, dummy1, dummy2; 1058 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
977 struct page *page, *next; 1059 struct page *page, *next;
978 LIST_HEAD(clean_pages); 1060 LIST_HEAD(clean_pages);
979 1061
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
985 } 1067 }
986 1068
987 ret = shrink_page_list(&clean_pages, zone, &sc, 1069 ret = shrink_page_list(&clean_pages, zone, &sc,
988 TTU_UNMAP|TTU_IGNORE_ACCESS, 1070 TTU_UNMAP|TTU_IGNORE_ACCESS,
989 &dummy1, &dummy2, true); 1071 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
990 list_splice(&clean_pages, page_list); 1072 list_splice(&clean_pages, page_list);
991 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1073 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
992 return ret; 1074 return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1281 unsigned long nr_reclaimed = 0; 1363 unsigned long nr_reclaimed = 0;
1282 unsigned long nr_taken; 1364 unsigned long nr_taken;
1283 unsigned long nr_dirty = 0; 1365 unsigned long nr_dirty = 0;
1366 unsigned long nr_congested = 0;
1367 unsigned long nr_unqueued_dirty = 0;
1284 unsigned long nr_writeback = 0; 1368 unsigned long nr_writeback = 0;
1369 unsigned long nr_immediate = 0;
1285 isolate_mode_t isolate_mode = 0; 1370 isolate_mode_t isolate_mode = 0;
1286 int file = is_file_lru(lru); 1371 int file = is_file_lru(lru);
1287 struct zone *zone = lruvec_zone(lruvec); 1372 struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1323 return 0; 1408 return 0;
1324 1409
1325 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1410 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1326 &nr_dirty, &nr_writeback, false); 1411 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1412 &nr_writeback, &nr_immediate,
1413 false);
1327 1414
1328 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1329 1416
@@ -1356,21 +1443,51 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1356 * as there is no guarantee the dirtying process is throttled in the 1443 * as there is no guarantee the dirtying process is throttled in the
1357 * same way balance_dirty_pages() manages. 1444 * same way balance_dirty_pages() manages.
1358 * 1445 *
1359 * This scales the number of dirty pages that must be under writeback 1446 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1360 * before throttling depending on priority. It is a simple backoff 1447 * of pages under writeback flagged for immediate reclaim and stall if any
1361 * function that has the most effect in the range DEF_PRIORITY to 1448 * are encountered in the nr_immediate check below.
1362 * DEF_PRIORITY-2 which is the priority reclaim is considered to be 1449 */
1363 * in trouble and reclaim is considered to be in trouble. 1450 if (nr_writeback && nr_writeback == nr_taken)
1364 * 1451 zone_set_flag(zone, ZONE_WRITEBACK);
1365 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle 1452
1366 * DEF_PRIORITY-1 50% must be PageWriteback 1453 /*
1367 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble 1454 * memcg will stall in page writeback so only consider forcibly
1368 * ... 1455 * stalling for global reclaim
1369 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1370 * isolated page is PageWriteback
1371 */ 1456 */
1372 if (nr_writeback && nr_writeback >= 1457 if (global_reclaim(sc)) {
1373 (nr_taken >> (DEF_PRIORITY - sc->priority))) 1458 /*
1459 * Tag a zone as congested if all the dirty pages scanned were
1460 * backed by a congested BDI and wait_iff_congested will stall.
1461 */
1462 if (nr_dirty && nr_dirty == nr_congested)
1463 zone_set_flag(zone, ZONE_CONGESTED);
1464
1465 /*
1466 * If dirty pages are scanned that are not queued for IO, it
1467 * implies that flushers are not keeping up. In this case, flag
1468 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1469 * pages from reclaim context. It will forcibly stall in the
1470 * next check.
1471 */
1472 if (nr_unqueued_dirty == nr_taken)
1473 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1474
1475 /*
1476 * In addition, if kswapd scans pages marked for
1477 * immediate reclaim and under writeback (nr_immediate), it
1478 * implies that pages are cycling through the LRU faster than
1479 * they are written so also forcibly stall.
1480 */
1481 if (nr_unqueued_dirty == nr_taken || nr_immediate)
1482 congestion_wait(BLK_RW_ASYNC, HZ/10);
1483 }
1484
1485 /*
1486 * Stall direct reclaim for IO completions if underlying BDIs or zone
1487 * is congested. Allow kswapd to continue until it starts encountering
1488 * unqueued dirty pages or cycling through the LRU too quickly.
1489 */
1490 if (!sc->hibernation_mode && !current_is_kswapd())
1374 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1491 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1375 1492
1376 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1493 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1822,17 +1939,25 @@ out:
1822static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 1939static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1823{ 1940{
1824 unsigned long nr[NR_LRU_LISTS]; 1941 unsigned long nr[NR_LRU_LISTS];
1942 unsigned long targets[NR_LRU_LISTS];
1825 unsigned long nr_to_scan; 1943 unsigned long nr_to_scan;
1826 enum lru_list lru; 1944 enum lru_list lru;
1827 unsigned long nr_reclaimed = 0; 1945 unsigned long nr_reclaimed = 0;
1828 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1946 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1829 struct blk_plug plug; 1947 struct blk_plug plug;
1948 bool scan_adjusted = false;
1830 1949
1831 get_scan_count(lruvec, sc, nr); 1950 get_scan_count(lruvec, sc, nr);
1832 1951
1952 /* Record the original scan target for proportional adjustments later */
1953 memcpy(targets, nr, sizeof(nr));
1954
1833 blk_start_plug(&plug); 1955 blk_start_plug(&plug);
1834 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1956 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1835 nr[LRU_INACTIVE_FILE]) { 1957 nr[LRU_INACTIVE_FILE]) {
1958 unsigned long nr_anon, nr_file, percentage;
1959 unsigned long nr_scanned;
1960
1836 for_each_evictable_lru(lru) { 1961 for_each_evictable_lru(lru) {
1837 if (nr[lru]) { 1962 if (nr[lru]) {
1838 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 1963 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1967,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1842 lruvec, sc); 1967 lruvec, sc);
1843 } 1968 }
1844 } 1969 }
1970
1971 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
1972 continue;
1973
1845 /* 1974 /*
1846 * On large memory systems, scan >> priority can become 1975 * For global direct reclaim, reclaim only the number of pages
1847 * really large. This is fine for the starting priority; 1976 * requested. Less care is taken to scan proportionally as it
1848 * we want to put equal scanning pressure on each zone. 1977 * is more important to minimise direct reclaim stall latency
1849 * However, if the VM has a harder time of freeing pages, 1978 * than it is to properly age the LRU lists.
1850 * with multiple processes reclaiming pages, the total
1851 * freeing target can get unreasonably large.
1852 */ 1979 */
1853 if (nr_reclaimed >= nr_to_reclaim && 1980 if (global_reclaim(sc) && !current_is_kswapd())
1854 sc->priority < DEF_PRIORITY)
1855 break; 1981 break;
1982
1983 /*
1984 * For kswapd and memcg, reclaim at least the number of pages
1985 * requested. Ensure that the anon and file LRUs shrink
1986 * proportionally what was requested by get_scan_count(). We
1987 * stop reclaiming one LRU and reduce the amount scanning
1988 * proportional to the original scan target.
1989 */
1990 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
1991 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
1992
1993 if (nr_file > nr_anon) {
1994 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
1995 targets[LRU_ACTIVE_ANON] + 1;
1996 lru = LRU_BASE;
1997 percentage = nr_anon * 100 / scan_target;
1998 } else {
1999 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2000 targets[LRU_ACTIVE_FILE] + 1;
2001 lru = LRU_FILE;
2002 percentage = nr_file * 100 / scan_target;
2003 }
2004
2005 /* Stop scanning the smaller of the LRU */
2006 nr[lru] = 0;
2007 nr[lru + LRU_ACTIVE] = 0;
2008
2009 /*
2010 * Recalculate the other LRU scan count based on its original
2011 * scan target and the percentage scanning already complete
2012 */
2013 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2014 nr_scanned = targets[lru] - nr[lru];
2015 nr[lru] = targets[lru] * (100 - percentage) / 100;
2016 nr[lru] -= min(nr[lru], nr_scanned);
2017
2018 lru += LRU_ACTIVE;
2019 nr_scanned = targets[lru] - nr[lru];
2020 nr[lru] = targets[lru] * (100 - percentage) / 100;
2021 nr[lru] -= min(nr[lru], nr_scanned);
2022
2023 scan_adjusted = true;
1856 } 2024 }
1857 blk_finish_plug(&plug); 2025 blk_finish_plug(&plug);
1858 sc->nr_reclaimed += nr_reclaimed; 2026 sc->nr_reclaimed += nr_reclaimed;
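The proportional rescan above can be illustrated with a small standalone sketch (not kernel code; the four-entry array standing in for NR_LRU_LISTS, with inactive/active anon at 0/1 and inactive/active file at 2/3, and the starting numbers are invented for the example):

#include <stdio.h>

int main(void)
{
	/* invented figures: anon targets 100/100, file targets 400/400,
	 * 32 pages already scanned from each list */
	unsigned long targets[4] = { 100, 100, 400, 400 };
	unsigned long nr[4]      = {  68,  68, 368, 368 };
	unsigned long nr_anon = nr[0] + nr[1], nr_file = nr[2] + nr[3];
	unsigned long percentage, scan_target;
	int smaller, other, i;

	if (nr_file > nr_anon) {			/* anon is the smaller side */
		scan_target = targets[0] + targets[1] + 1;
		percentage = nr_anon * 100 / scan_target;
		smaller = 0;
	} else {
		scan_target = targets[2] + targets[3] + 1;
		percentage = nr_file * 100 / scan_target;
		smaller = 2;
	}

	nr[smaller] = nr[smaller + 1] = 0;		/* stop scanning the smaller side */

	other = (smaller == 0) ? 2 : 0;			/* rescale the other side */
	for (i = other; i <= other + 1; i++) {
		unsigned long nr_scanned = targets[i] - nr[i];

		nr[i] = targets[i] * (100 - percentage) / 100;
		nr[i] -= (nr[i] < nr_scanned) ? nr[i] : nr_scanned;
	}

	/* prints "0 0 100 100": the file lists are cut back in proportion
	 * to how much of the anon target has already been scanned */
	printf("%lu %lu %lu %lu\n", nr[0], nr[1], nr[2], nr[3]);
	return 0;
}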
@@ -2179,8 +2347,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2179 aborted_reclaim = shrink_zones(zonelist, sc); 2347 aborted_reclaim = shrink_zones(zonelist, sc);
2180 2348
2181 /* 2349 /*
2182 * Don't shrink slabs when reclaiming memory from 2350 * Don't shrink slabs when reclaiming memory from over limit
2183 * over limit cgroups 2351 * cgroups but do shrink slab at least once when aborting
2352 * reclaim for compaction to avoid unevenly scanning file/anon
2353 * LRU pages over slab pages.
2184 */ 2354 */
2185 if (global_reclaim(sc)) { 2355 if (global_reclaim(sc)) {
2186 unsigned long lru_pages = 0; 2356 unsigned long lru_pages = 0;
@@ -2222,18 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2222 WB_REASON_TRY_TO_FREE_PAGES); 2392 WB_REASON_TRY_TO_FREE_PAGES);
2223 sc->may_writepage = 1; 2393 sc->may_writepage = 1;
2224 } 2394 }
2225 2395 } while (--sc->priority >= 0 && !aborted_reclaim);
2226 /* Take a nap, wait for some writeback to complete */
2227 if (!sc->hibernation_mode && sc->nr_scanned &&
2228 sc->priority < DEF_PRIORITY - 2) {
2229 struct zone *preferred_zone;
2230
2231 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2232 &cpuset_current_mems_allowed,
2233 &preferred_zone);
2234 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2235 }
2236 } while (--sc->priority >= 0);
2237 2396
2238out: 2397out:
2239 delayacct_freepages_end(); 2398 delayacct_freepages_end();
@@ -2601,6 +2760,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2601} 2760}
2602 2761
2603/* 2762/*
2763 * kswapd shrinks the zone by the number of pages required to reach
2764 * the high watermark.
2765 *
2766 * Returns true if kswapd scanned at least the requested number of pages to
2767 * reclaim or if the lack of progress was due to pages under writeback.
2768 * This is used to determine if the scanning priority needs to be raised.
2769 */
2770static bool kswapd_shrink_zone(struct zone *zone,
2771 int classzone_idx,
2772 struct scan_control *sc,
2773 unsigned long lru_pages,
2774 unsigned long *nr_attempted)
2775{
2776 unsigned long nr_slab;
2777 int testorder = sc->order;
2778 unsigned long balance_gap;
2779 struct reclaim_state *reclaim_state = current->reclaim_state;
2780 struct shrink_control shrink = {
2781 .gfp_mask = sc->gfp_mask,
2782 };
2783 bool lowmem_pressure;
2784
2785 /* Reclaim above the high watermark. */
2786 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2787
2788 /*
2789 * Kswapd reclaims only single pages with compaction enabled. Trying
2790 * too hard to reclaim until contiguous free pages have become
2791 * available can hurt performance by evicting too much useful data
2792 * from memory. Do not reclaim more than needed for compaction.
2793 */
2794 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2795 compaction_suitable(zone, sc->order) !=
2796 COMPACT_SKIPPED)
2797 testorder = 0;
2798
2799 /*
2800 * We put equal pressure on every zone, unless one zone has way too
2801 * many pages free already. The "too many pages" is defined as the
2802 * high wmark plus a "gap" where the gap is either the low
2803 * watermark or 1% of the zone, whichever is smaller.
2804 */
2805 balance_gap = min(low_wmark_pages(zone),
2806 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2807 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2808
2809 /*
2810 * If there is no low memory pressure or the zone is balanced then no
2811 * reclaim is necessary
2812 */
2813 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2814 if (!lowmem_pressure && zone_balanced(zone, testorder,
2815 balance_gap, classzone_idx))
2816 return true;
2817
2818 shrink_zone(zone, sc);
2819
2820 reclaim_state->reclaimed_slab = 0;
2821 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2822 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2823
2824 /* Account for the number of pages attempted to reclaim */
2825 *nr_attempted += sc->nr_to_reclaim;
2826
2827 if (nr_slab == 0 && !zone_reclaimable(zone))
2828 zone->all_unreclaimable = 1;
2829
2830 zone_clear_flag(zone, ZONE_WRITEBACK);
2831
2832 /*
2833 * If a zone reaches its high watermark, consider it to be no longer
2834 * congested. It's possible there are dirty pages backed by congested
2835 * BDIs but as pressure is relieved, speculatively avoid congestion
2836 * waits.
2837 */
2838 if (!zone->all_unreclaimable &&
2839 zone_balanced(zone, testorder, 0, classzone_idx)) {
2840 zone_clear_flag(zone, ZONE_CONGESTED);
2841 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2842 }
2843
2844 return sc->nr_scanned >= sc->nr_to_reclaim;
2845}
2846
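As a worked example of the balance_gap arithmetic above (assuming KSWAPD_ZONE_BALANCE_GAP_RATIO keeps its usual definition of 100): for a zone with managed_pages = 1,048,576 (4 GiB of 4 KiB pages) and a low watermark of 12,288 pages, the one-percent term is (1048576 + 99) / 100 = 10,486, so balance_gap = min(12288, 10486) = 10,486 pages, and the zone only counts as balanced here once it clears the high watermark plus that gap at the effective order.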
2847/*
2604 * For kswapd, balance_pgdat() will work across all this node's zones until 2848 * For kswapd, balance_pgdat() will work across all this node's zones until
2605 * they are all at high_wmark_pages(zone). 2849 * they are all at high_wmark_pages(zone).
2606 * 2850 *
@@ -2624,35 +2868,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2624static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2868static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2625 int *classzone_idx) 2869 int *classzone_idx)
2626{ 2870{
2627 bool pgdat_is_balanced = false;
2628 int i; 2871 int i;
2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2872 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2631 unsigned long nr_soft_reclaimed; 2873 unsigned long nr_soft_reclaimed;
2632 unsigned long nr_soft_scanned; 2874 unsigned long nr_soft_scanned;
2633 struct scan_control sc = { 2875 struct scan_control sc = {
2634 .gfp_mask = GFP_KERNEL, 2876 .gfp_mask = GFP_KERNEL,
2877 .priority = DEF_PRIORITY,
2635 .may_unmap = 1, 2878 .may_unmap = 1,
2636 .may_swap = 1, 2879 .may_swap = 1,
2637 /* 2880 .may_writepage = !laptop_mode,
2638 * kswapd doesn't want to be bailed out while reclaim. because
2639 * we want to put equal scanning pressure on each zone.
2640 */
2641 .nr_to_reclaim = ULONG_MAX,
2642 .order = order, 2881 .order = order,
2643 .target_mem_cgroup = NULL, 2882 .target_mem_cgroup = NULL,
2644 }; 2883 };
2645 struct shrink_control shrink = {
2646 .gfp_mask = sc.gfp_mask,
2647 };
2648loop_again:
2649 sc.priority = DEF_PRIORITY;
2650 sc.nr_reclaimed = 0;
2651 sc.may_writepage = !laptop_mode;
2652 count_vm_event(PAGEOUTRUN); 2884 count_vm_event(PAGEOUTRUN);
2653 2885
2654 do { 2886 do {
2655 unsigned long lru_pages = 0; 2887 unsigned long lru_pages = 0;
2888 unsigned long nr_attempted = 0;
2889 bool raise_priority = true;
2890 bool pgdat_needs_compaction = (order > 0);
2891
2892 sc.nr_reclaimed = 0;
2656 2893
2657 /* 2894 /*
2658 * Scan in the highmem->dma direction for the highest 2895 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2926,46 @@ loop_again:
2689 end_zone = i; 2926 end_zone = i;
2690 break; 2927 break;
2691 } else { 2928 } else {
2692 /* If balanced, clear the congested flag */ 2929 /*
2930 * If balanced, clear the dirty and congested
2931 * flags
2932 */
2693 zone_clear_flag(zone, ZONE_CONGESTED); 2933 zone_clear_flag(zone, ZONE_CONGESTED);
2934 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2694 } 2935 }
2695 } 2936 }
2696 2937
2697 if (i < 0) { 2938 if (i < 0)
2698 pgdat_is_balanced = true;
2699 goto out; 2939 goto out;
2700 }
2701 2940
2702 for (i = 0; i <= end_zone; i++) { 2941 for (i = 0; i <= end_zone; i++) {
2703 struct zone *zone = pgdat->node_zones + i; 2942 struct zone *zone = pgdat->node_zones + i;
2704 2943
2944 if (!populated_zone(zone))
2945 continue;
2946
2705 lru_pages += zone_reclaimable_pages(zone); 2947 lru_pages += zone_reclaimable_pages(zone);
2948
2949 /*
2950 * If any zone is currently balanced then kswapd will
2951 * not call compaction as it is expected that the
2952 * necessary pages are already available.
2953 */
2954 if (pgdat_needs_compaction &&
2955 zone_watermark_ok(zone, order,
2956 low_wmark_pages(zone),
2957 *classzone_idx, 0))
2958 pgdat_needs_compaction = false;
2706 } 2959 }
2707 2960
2708 /* 2961 /*
2962 * If we're getting trouble reclaiming, start doing writepage
2963 * even in laptop mode.
2964 */
2965 if (sc.priority < DEF_PRIORITY - 2)
2966 sc.may_writepage = 1;
2967
2968 /*
2709 * Now scan the zone in the dma->highmem direction, stopping 2969 * Now scan the zone in the dma->highmem direction, stopping
2710 * at the last zone which needs scanning. 2970 * at the last zone which needs scanning.
2711 * 2971 *
@@ -2716,8 +2976,6 @@ loop_again:
2716 */ 2976 */
2717 for (i = 0; i <= end_zone; i++) { 2977 for (i = 0; i <= end_zone; i++) {
2718 struct zone *zone = pgdat->node_zones + i; 2978 struct zone *zone = pgdat->node_zones + i;
2719 int nr_slab, testorder;
2720 unsigned long balance_gap;
2721 2979
2722 if (!populated_zone(zone)) 2980 if (!populated_zone(zone))
2723 continue; 2981 continue;
@@ -2738,65 +2996,14 @@ loop_again:
2738 sc.nr_reclaimed += nr_soft_reclaimed; 2996 sc.nr_reclaimed += nr_soft_reclaimed;
2739 2997
2740 /* 2998 /*
2741 * We put equal pressure on every zone, unless 2999 * There should be no need to raise the scanning
2742 * one zone has way too many pages free 3000 * priority if enough pages are already being scanned
2743 * already. The "too many pages" is defined 3001 * that the high watermark would be met at 100%
2744 * as the high wmark plus a "gap" where the 3002 * efficiency.
2745 * gap is either the low watermark or 1%
2746 * of the zone, whichever is smaller.
2747 */ 3003 */
2748 balance_gap = min(low_wmark_pages(zone), 3004 if (kswapd_shrink_zone(zone, end_zone, &sc,
2749 (zone->managed_pages + 3005 lru_pages, &nr_attempted))
2750 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 3006 raise_priority = false;
2751 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2752 /*
2753 * Kswapd reclaims only single pages with compaction
2754 * enabled. Trying too hard to reclaim until contiguous
2755 * free pages have become available can hurt performance
2756 * by evicting too much useful data from memory.
2757 * Do not reclaim more than needed for compaction.
2758 */
2759 testorder = order;
2760 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2761 compaction_suitable(zone, order) !=
2762 COMPACT_SKIPPED)
2763 testorder = 0;
2764
2765 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2766 !zone_balanced(zone, testorder,
2767 balance_gap, end_zone)) {
2768 shrink_zone(zone, &sc);
2769
2770 reclaim_state->reclaimed_slab = 0;
2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2773
2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2775 zone->all_unreclaimable = 1;
2776 }
2777
2778 /*
2779 * If we're getting trouble reclaiming, start doing
2780 * writepage even in laptop mode.
2781 */
2782 if (sc.priority < DEF_PRIORITY - 2)
2783 sc.may_writepage = 1;
2784
2785 if (zone->all_unreclaimable) {
2786 if (end_zone && end_zone == i)
2787 end_zone--;
2788 continue;
2789 }
2790
2791 if (zone_balanced(zone, testorder, 0, end_zone))
2792 /*
2793 * If a zone reaches its high watermark,
2794 * consider it to be no longer congested. It's
2795 * possible there are dirty pages backed by
2796 * congested BDIs but as pressure is relieved,
2797 * speculatively avoid congestion waits
2798 */
2799 zone_clear_flag(zone, ZONE_CONGESTED);
2800 } 3007 }
2801 3008
2802 /* 3009 /*
@@ -2808,74 +3015,38 @@ loop_again:
2808 pfmemalloc_watermark_ok(pgdat)) 3015 pfmemalloc_watermark_ok(pgdat))
2809 wake_up(&pgdat->pfmemalloc_wait); 3016 wake_up(&pgdat->pfmemalloc_wait);
2810 3017
2811 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2812 pgdat_is_balanced = true;
2813 break; /* kswapd: all done */
2814 }
2815
2816 /* 3018 /*
2817 * We do this so kswapd doesn't build up large priorities for 3019 * Fragmentation may mean that the system cannot be rebalanced
2818 * example when it is freeing in parallel with allocators. It 3020 * for high-order allocations in all zones. If twice the
2819 * matches the direct reclaim path behaviour in terms of impact 3021 * allocation size has been reclaimed and the zones are still
2820 * on zone->*_priority. 3022 * not balanced then recheck the watermarks at order-0 to
3023 * prevent kswapd reclaiming excessively. Assume that a
3024 * process requesting a high-order allocation can direct reclaim/compact.
2821 */ 3025 */
2822 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 3026 if (order && sc.nr_reclaimed >= 2UL << order)
2823 break; 3027 order = sc.order = 0;
2824 } while (--sc.priority >= 0);
2825
2826out:
2827 if (!pgdat_is_balanced) {
2828 cond_resched();
2829 3028
2830 try_to_freeze(); 3029 /* Check if kswapd should be suspending */
3030 if (try_to_freeze() || kthread_should_stop())
3031 break;
2831 3032
2832 /* 3033 /*
2833 * Fragmentation may mean that the system cannot be 3034 * Compact if necessary and kswapd is reclaiming at least the
2834 * rebalanced for high-order allocations in all zones. 3035 * high watermark number of pages as requested
2835 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
2836 * it means the zones have been fully scanned and are still
2837 * not balanced. For high-order allocations, there is
2838 * little point trying all over again as kswapd may
2839 * infinite loop.
2840 *
2841 * Instead, recheck all watermarks at order-0 as they
2842 * are the most important. If watermarks are ok, kswapd will go
2843 * back to sleep. High-order users can still perform direct
2844 * reclaim if they wish.
2845 */ 3036 */
2846 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 3037 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
2847 order = sc.order = 0;
2848
2849 goto loop_again;
2850 }
2851
2852 /*
2853 * If kswapd was reclaiming at a higher order, it has the option of
2854 * sleeping without all zones being balanced. Before it does, it must
2855 * ensure that the watermarks for order-0 on *all* zones are met and
2856 * that the congestion flags are cleared. The congestion flag must
2857 * be cleared as kswapd is the only mechanism that clears the flag
2858 * and it is potentially going to sleep here.
2859 */
2860 if (order) {
2861 int zones_need_compaction = 1;
2862
2863 for (i = 0; i <= end_zone; i++) {
2864 struct zone *zone = pgdat->node_zones + i;
2865
2866 if (!populated_zone(zone))
2867 continue;
2868
2869 /* Check if the memory needs to be defragmented. */
2870 if (zone_watermark_ok(zone, order,
2871 low_wmark_pages(zone), *classzone_idx, 0))
2872 zones_need_compaction = 0;
2873 }
2874
2875 if (zones_need_compaction)
2876 compact_pgdat(pgdat, order); 3038 compact_pgdat(pgdat, order);
2877 }
2878 3039
3040 /*
3041 * Raise priority if scanning rate is too low or there was no
3042 * progress in reclaiming pages
3043 */
3044 if (raise_priority || !sc.nr_reclaimed)
3045 sc.priority--;
3046 } while (sc.priority >= 1 &&
3047 !pgdat_balanced(pgdat, order, *classzone_idx));
3048
3049out:
2879 /* 3050 /*
2880 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3051 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2881 * makes a decision on the order we were last reclaiming at. However, 3052 * makes a decision on the order we were last reclaiming at. However,
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000000..9bb4710e3589
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,527 @@
1/*
2 * zbud.c
3 *
4 * Copyright (C) 2013, Seth Jennings, IBM
5 *
6 * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
7 *
8 * zbud is a special purpose allocator for storing compressed pages. Contrary
9 * to what its name may suggest, zbud is not a buddy allocator, but rather an
10 * allocator that "buddies" two compressed pages together in a single memory
11 * page.
12 *
13 * While this design limits storage density, it has simple and deterministic
14 * reclaim properties that make it preferable to a higher density approach when
15 * reclaim will be used.
16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left
19 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region
23 * within the zbud page.
24 *
25 * zbud also provides an attractive lower bound on density. The ratio of zpages
26 * to zbud pages can not be less than 1. This ensures that zbud can never "do
27 * harm" by using more pages to store zpages than the uncompressed zpages would
28 * have used on their own.
29 *
30 * zbud pages are divided into "chunks". The size of the chunks is fixed at
31 * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
32 * into chunks allows organizing unbuddied zbud pages into a manageable number
33 * of unbuddied lists according to the number of free chunks available in the
34 * zbud page.
35 *
36 * The zbud API differs from that of conventional allocators in that the
37 * allocation function, zbud_alloc(), returns an opaque handle to the user,
38 * not a dereferenceable pointer. The user must map the handle using
39 * zbud_map() in order to get a usable pointer by which to access the
40 * allocation data and unmap the handle with zbud_unmap() when operations
41 * on the allocation data are complete.
42 */
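A minimal usage sketch of the handle-based API described above (illustrative only: it assumes <linux/zbud.h> plus the usual kernel headers, trims error handling, and passes a NULL zbud_ops, which simply makes zbud_reclaim_page() return -EINVAL because no eviction handler is registered):

static int zbud_usage_example(void)
{
	struct zbud_pool *pool;
	unsigned long handle;
	char *buf;
	int ret;

	pool = zbud_create_pool(GFP_KERNEL, NULL);
	if (!pool)
		return -ENOMEM;

	ret = zbud_alloc(pool, 100, GFP_KERNEL, &handle);	/* a 100-byte zpage */
	if (!ret) {
		buf = zbud_map(pool, handle);	/* opaque handle -> usable pointer */
		memset(buf, 0, 100);		/* ... store compressed data ... */
		zbud_unmap(pool, handle);
		zbud_free(pool, handle);
	}

	zbud_destroy_pool(pool);
	return ret;
}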
43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
46#include <linux/atomic.h>
47#include <linux/list.h>
48#include <linux/mm.h>
49#include <linux/module.h>
50#include <linux/preempt.h>
51#include <linux/slab.h>
52#include <linux/spinlock.h>
53#include <linux/zbud.h>
54
55/*****************
56 * Structures
57*****************/
58/*
59 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
60 * adjusting internal fragmentation. It also determines the number of
61 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
62 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
63 * will be 64 freelists per pool.
64 */
65#define NCHUNKS_ORDER 6
66
67#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
68#define CHUNK_SIZE (1 << CHUNK_SHIFT)
69#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
70#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
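For example, with 4 KiB pages (PAGE_SHIFT = 12) these defaults give CHUNK_SHIFT = 6, CHUNK_SIZE = 64 bytes and NCHUNKS = 64 chunks per zbud page, one of which is taken by the header; size_to_chunks() below would round a 100-byte allocation up to (100 + 63) >> 6 = 2 chunks.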
71
72/**
73 * struct zbud_pool - stores metadata for each zbud pool
74 * @lock: protects all pool fields and first|last_chunk fields of any
75 * zbud page in the pool
76 * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
77 * the list each zbud page is added to depends on the size of
78 * its free region.
79 * @buddied: list tracking the zbud pages that contain two buddies;
80 * these zbud pages are full
81 * @lru: list tracking the zbud pages in LRU order by most recently
82 * added buddy.
83 * @pages_nr: number of zbud pages in the pool.
84 * @ops: pointer to a structure of user defined operations specified at
85 * pool creation time.
86 *
87 * This structure is allocated at pool creation time and maintains metadata
88 * pertaining to a particular zbud pool.
89 */
90struct zbud_pool {
91 spinlock_t lock;
92 struct list_head unbuddied[NCHUNKS];
93 struct list_head buddied;
94 struct list_head lru;
95 u64 pages_nr;
96 struct zbud_ops *ops;
97};
98
99/*
100 * struct zbud_header - zbud page metadata occupying the first chunk of each
101 * zbud page.
102 * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
103 * @lru: links the zbud page into the lru list in the pool
104 * @first_chunks: the size of the first buddy in chunks, 0 if free
105 * @last_chunks: the size of the last buddy in chunks, 0 if free
106 */
107struct zbud_header {
108 struct list_head buddy;
109 struct list_head lru;
110 unsigned int first_chunks;
111 unsigned int last_chunks;
112 bool under_reclaim;
113};
114
115/*****************
116 * Helpers
117*****************/
118/* Just to make the code easier to read */
119enum buddy {
120 FIRST,
121 LAST
122};
123
124/* Converts an allocation size in bytes to size in zbud chunks */
125static int size_to_chunks(int size)
126{
127 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
128}
129
130#define for_each_unbuddied_list(_iter, _begin) \
131 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
132
133/* Initializes the zbud header of a newly allocated zbud page */
134static struct zbud_header *init_zbud_page(struct page *page)
135{
136 struct zbud_header *zhdr = page_address(page);
137 zhdr->first_chunks = 0;
138 zhdr->last_chunks = 0;
139 INIT_LIST_HEAD(&zhdr->buddy);
140 INIT_LIST_HEAD(&zhdr->lru);
141 zhdr->under_reclaim = 0;
142 return zhdr;
143}
144
145/* Resets the struct page fields and frees the page */
146static void free_zbud_page(struct zbud_header *zhdr)
147{
148 __free_page(virt_to_page(zhdr));
149}
150
151/*
152 * Encodes the handle of a particular buddy within a zbud page
153 * Pool lock should be held as this function accesses first|last_chunks
154 */
155static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
156{
157 unsigned long handle;
158
159 /*
160 * For now, the encoded handle is actually just the pointer to the data
161 * but this might not always be the case. A little information hiding.
162 * Add CHUNK_SIZE to the handle if it is the first allocation to jump
163 * over the zbud header in the first chunk.
164 */
165 handle = (unsigned long)zhdr;
166 if (bud == FIRST)
167 /* skip over zbud header */
168 handle += ZHDR_SIZE_ALIGNED;
169 else /* bud == LAST */
170 handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
171 return handle;
172}
173
174/* Returns the zbud page where a given handle is stored */
175static struct zbud_header *handle_to_zbud_header(unsigned long handle)
176{
177 return (struct zbud_header *)(handle & PAGE_MASK);
178}
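To make the encoding concrete (reusing the 4 KiB page / 64-byte chunk figures above and a hypothetical zbud page at a page-aligned kernel address): a FIRST handle is the page address plus ZHDR_SIZE_ALIGNED (64 bytes), while a LAST buddy of three chunks gets the page address plus 4096 - 192; in either case handle_to_zbud_header() recovers the header simply by masking the handle with PAGE_MASK.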
179
180/* Returns the number of free chunks in a zbud page */
181static int num_free_chunks(struct zbud_header *zhdr)
182{
183 /*
184 * Rather than branch for different situations, just use the fact that
185 * free buddies have a length of zero to simplify everything. -1 at the
186 * end for the zbud header.
187 */
188 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
189}
190
191/*****************
192 * API Functions
193*****************/
194/**
195 * zbud_create_pool() - create a new zbud pool
196 * @gfp: gfp flags when allocating the zbud pool structure
197 * @ops: user-defined operations for the zbud pool
198 *
199 * Return: pointer to the new zbud pool or NULL if the metadata allocation
200 * failed.
201 */
202struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
203{
204 struct zbud_pool *pool;
205 int i;
206
207 pool = kmalloc(sizeof(struct zbud_pool), gfp);
208 if (!pool)
209 return NULL;
210 spin_lock_init(&pool->lock);
211 for_each_unbuddied_list(i, 0)
212 INIT_LIST_HEAD(&pool->unbuddied[i]);
213 INIT_LIST_HEAD(&pool->buddied);
214 INIT_LIST_HEAD(&pool->lru);
215 pool->pages_nr = 0;
216 pool->ops = ops;
217 return pool;
218}
219
220/**
221 * zbud_destroy_pool() - destroys an existing zbud pool
222 * @pool: the zbud pool to be destroyed
223 *
224 * The pool should be emptied before this function is called.
225 */
226void zbud_destroy_pool(struct zbud_pool *pool)
227{
228 kfree(pool);
229}
230
231/**
232 * zbud_alloc() - allocates a region of a given size
233 * @pool: zbud pool from which to allocate
234 * @size: size in bytes of the desired allocation
235 * @gfp: gfp flags used if the pool needs to grow
236 * @handle: handle of the new allocation
237 *
238 * This function will attempt to find a free region in the pool large enough to
239 * satisfy the allocation request. A search of the unbuddied lists is
240 * performed first. If no suitable free region is found, then a new page is
241 * allocated and added to the pool to satisfy the request.
242 *
243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
244 * as zbud pool pages.
245 *
246 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page.
249 */
250int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
251 unsigned long *handle)
252{
253 int chunks, i, freechunks;
254 struct zbud_header *zhdr = NULL;
255 enum buddy bud;
256 struct page *page;
257
258 if (size <= 0 || gfp & __GFP_HIGHMEM)
259 return -EINVAL;
260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
261 return -ENOSPC;
262 chunks = size_to_chunks(size);
263 spin_lock(&pool->lock);
264
265 /* First, try to find an unbuddied zbud page. */
266 zhdr = NULL;
267 for_each_unbuddied_list(i, chunks) {
268 if (!list_empty(&pool->unbuddied[i])) {
269 zhdr = list_first_entry(&pool->unbuddied[i],
270 struct zbud_header, buddy);
271 list_del(&zhdr->buddy);
272 if (zhdr->first_chunks == 0)
273 bud = FIRST;
274 else
275 bud = LAST;
276 goto found;
277 }
278 }
279
280 /* Couldn't find unbuddied zbud page, create new one */
281 spin_unlock(&pool->lock);
282 page = alloc_page(gfp);
283 if (!page)
284 return -ENOMEM;
285 spin_lock(&pool->lock);
286 pool->pages_nr++;
287 zhdr = init_zbud_page(page);
288 bud = FIRST;
289
290found:
291 if (bud == FIRST)
292 zhdr->first_chunks = chunks;
293 else
294 zhdr->last_chunks = chunks;
295
296 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
297 /* Add to unbuddied list */
298 freechunks = num_free_chunks(zhdr);
299 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
300 } else {
301 /* Add to buddied list */
302 list_add(&zhdr->buddy, &pool->buddied);
303 }
304
305 /* Add/move zbud page to beginning of LRU */
306 if (!list_empty(&zhdr->lru))
307 list_del(&zhdr->lru);
308 list_add(&zhdr->lru, &pool->lru);
309
310 *handle = encode_handle(zhdr, bud);
311 spin_unlock(&pool->lock);
312
313 return 0;
314}
315
316/**
317 * zbud_free() - frees the allocation associated with the given handle
318 * @pool: pool in which the allocation resided
319 * @handle: handle associated with the allocation returned by zbud_alloc()
320 *
321 * In the case that the zbud page in which the allocation resides is under
322 * reclaim, as indicated by the under_reclaim flag being set, this function
323 * only sets the first|last_chunks to 0. The page is actually freed
324 * once both buddies are evicted (see zbud_reclaim_page() below).
325 */
326void zbud_free(struct zbud_pool *pool, unsigned long handle)
327{
328 struct zbud_header *zhdr;
329 int freechunks;
330
331 spin_lock(&pool->lock);
332 zhdr = handle_to_zbud_header(handle);
333
334 /* If first buddy, handle will be page aligned */
335 if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
336 zhdr->last_chunks = 0;
337 else
338 zhdr->first_chunks = 0;
339
340 if (zhdr->under_reclaim) {
341 /* zbud page is under reclaim, reclaim will free */
342 spin_unlock(&pool->lock);
343 return;
344 }
345
346 /* Remove from existing buddy list */
347 list_del(&zhdr->buddy);
348
349 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
350 /* zbud page is empty, free */
351 list_del(&zhdr->lru);
352 free_zbud_page(zhdr);
353 pool->pages_nr--;
354 } else {
355 /* Add to unbuddied list */
356 freechunks = num_free_chunks(zhdr);
357 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
358 }
359
360 spin_unlock(&pool->lock);
361}
362
363#define list_tail_entry(ptr, type, member) \
364 list_entry((ptr)->prev, type, member)
365
366/**
367 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
368 * @pool: pool from which a page will attempt to be evicted
369 * @retries: number of pages on the LRU list for which eviction will
370 * be attempted before failing
371 *
372 * zbud reclaim is different from normal system reclaim in that the reclaim is
373 * done from the bottom, up. This is because only the bottom layer, zbud, has
374 * information on how the allocations are organized within each zbud page. This
375 * has the potential to create interesting locking situations between zbud and
376 * the user, however.
377 *
378 * To avoid these, this is how zbud_reclaim_page() should be called:
379
380 * The user detects a page should be reclaimed and calls zbud_reclaim_page().
381 * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
382 * the user-defined eviction handler with the pool and handle as arguments.
383 *
384 * If the handle can not be evicted, the eviction handler should return
385 * non-zero. zbud_reclaim_page() will add the zbud page back to the
386 * appropriate list and try the next zbud page on the LRU up to
387 * a user defined number of retries.
388 *
389 * If the handle is successfully evicted, the eviction handler should
390 * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
391 * contains logic to delay freeing the page if the page is under reclaim,
392 * as indicated by the setting of the PG_reclaim flag on the underlying page.
393 *
394 * If all buddies in the zbud page are successfully evicted, then the
395 * zbud page can be freed.
396 *
397 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
398 * no pages to evict or an eviction handler is not registered, -EAGAIN if
399 * the retry limit was hit.
400 */
401int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
402{
403 int i, ret, freechunks;
404 struct zbud_header *zhdr;
405 unsigned long first_handle = 0, last_handle = 0;
406
407 spin_lock(&pool->lock);
408 if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
409 retries == 0) {
410 spin_unlock(&pool->lock);
411 return -EINVAL;
412 }
413 for (i = 0; i < retries; i++) {
414 zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
415 list_del(&zhdr->lru);
416 list_del(&zhdr->buddy);
417 /* Protect zbud page against free */
418 zhdr->under_reclaim = true;
419 /*
420 * We need to encode the handles before unlocking, since we can
421 * race with free that will set (first|last)_chunks to 0
422 */
423 first_handle = 0;
424 last_handle = 0;
425 if (zhdr->first_chunks)
426 first_handle = encode_handle(zhdr, FIRST);
427 if (zhdr->last_chunks)
428 last_handle = encode_handle(zhdr, LAST);
429 spin_unlock(&pool->lock);
430
431 /* Issue the eviction callback(s) */
432 if (first_handle) {
433 ret = pool->ops->evict(pool, first_handle);
434 if (ret)
435 goto next;
436 }
437 if (last_handle) {
438 ret = pool->ops->evict(pool, last_handle);
439 if (ret)
440 goto next;
441 }
442next:
443 spin_lock(&pool->lock);
444 zhdr->under_reclaim = false;
445 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
446 /*
447 * Both buddies are now free, free the zbud page and
448 * return success.
449 */
450 free_zbud_page(zhdr);
451 pool->pages_nr--;
452 spin_unlock(&pool->lock);
453 return 0;
454 } else if (zhdr->first_chunks == 0 ||
455 zhdr->last_chunks == 0) {
456 /* add to unbuddied list */
457 freechunks = num_free_chunks(zhdr);
458 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
459 } else {
460 /* add to buddied list */
461 list_add(&zhdr->buddy, &pool->buddied);
462 }
463
464 /* add to beginning of LRU */
465 list_add(&zhdr->lru, &pool->lru);
466 }
467 spin_unlock(&pool->lock);
468 return -EAGAIN;
469}
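A sketch of an eviction handler that follows the protocol described above (illustrative; write_back_zpage() is a hypothetical placeholder for whatever the pool user does with the data, for example zswap writing it out to the backing swap device):

/* hypothetical helper, not part of zbud */
static int write_back_zpage(void *data);

static int example_evict(struct zbud_pool *pool, unsigned long handle)
{
	void *data = zbud_map(pool, handle);
	int err = write_back_zpage(data);

	zbud_unmap(pool, handle);
	if (err)
		return err;		/* refuse eviction; the page is re-listed */

	zbud_free(pool, handle);	/* the success path must free the handle */
	return 0;
}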
470
471/**
472 * zbud_map() - maps the allocation associated with the given handle
473 * @pool: pool in which the allocation resides
474 * @handle: handle associated with the allocation to be mapped
475 *
476 * While trivial for zbud, the mapping functions for other allocators
477 * implementing this allocation API could have more complex information encoded
478 * in the handle and could create temporary mappings to make the data
479 * accessible to the user.
480 *
481 * Returns: a pointer to the mapped allocation
482 */
483void *zbud_map(struct zbud_pool *pool, unsigned long handle)
484{
485 return (void *)(handle);
486}
487
488/**
489 * zbud_unmap() - unmaps the allocation associated with the given handle
490 * @pool: pool in which the allocation resides
491 * @handle: handle associated with the allocation to be unmapped
492 */
493void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
494{
495}
496
497/**
498 * zbud_get_pool_size() - gets the zbud pool size in pages
499 * @pool: pool whose size is being queried
500 *
501 * Returns: size in pages of the given pool. The pool lock need not be
502 * taken to access pages_nr.
503 */
504u64 zbud_get_pool_size(struct zbud_pool *pool)
505{
506 return pool->pages_nr;
507}
508
509static int __init init_zbud(void)
510{
511 /* Make sure the zbud header will fit in one chunk */
512 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
513 pr_info("loaded\n");
514 return 0;
515}
516
517static void __exit exit_zbud(void)
518{
519 pr_info("unloaded\n");
520}
521
522module_init(init_zbud);
523module_exit(exit_zbud);
524
525MODULE_LICENSE("GPL");
526MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
527MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 000000000000..deda2b671e12
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,943 @@
1/*
2 * zswap.c - zswap driver file
3 *
4 * zswap is a backend for frontswap that takes pages that are in the process
5 * of being swapped out and attempts to compress and store them in a
6 * RAM-based memory pool. This can result in a significant I/O reduction on
7 * the swap device and, in the case where decompressing from RAM is faster
8 * than reading from the swap device, can also improve workload performance.
9 *
10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21*/
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/highmem.h>
28#include <linux/slab.h>
29#include <linux/spinlock.h>
30#include <linux/types.h>
31#include <linux/atomic.h>
32#include <linux/frontswap.h>
33#include <linux/rbtree.h>
34#include <linux/swap.h>
35#include <linux/crypto.h>
36#include <linux/mempool.h>
37#include <linux/zbud.h>
38
39#include <linux/mm_types.h>
40#include <linux/page-flags.h>
41#include <linux/swapops.h>
42#include <linux/writeback.h>
43#include <linux/pagemap.h>
44
45/*********************************
46* statistics
47**********************************/
48/* Number of memory pages used by the compressed pool */
49static u64 zswap_pool_pages;
50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52
53/*
54 * The statistics below are not protected from concurrent access for
55 * performance reasons, so they may not be 100% accurate. However,
56 * they do provide useful information on roughly how many times a
57 * certain event is occurring.
58*/
59
60/* Pool limit was hit (see zswap_max_pool_percent) */
61static u64 zswap_pool_limit_hit;
62/* Pages written back when pool limit was reached */
63static u64 zswap_written_back_pages;
64/* Store failed due to a reclaim failure after pool limit was reached */
65static u64 zswap_reject_reclaim_fail;
66/* Compressed page was too big for the allocator to (optimally) store */
67static u64 zswap_reject_compress_poor;
68/* Store failed because underlying allocator could not get memory */
69static u64 zswap_reject_alloc_fail;
70/* Store failed because the entry metadata could not be allocated (rare) */
71static u64 zswap_reject_kmemcache_fail;
72/* Duplicate store was encountered (rare) */
73static u64 zswap_duplicate_entry;
74
75/*********************************
76* tunables
77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0);
81
82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0);
86
87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644);
91
92/*********************************
93* compression functions
94**********************************/
95/* per-cpu compression transforms */
96static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
97
98enum comp_op {
99 ZSWAP_COMPOP_COMPRESS,
100 ZSWAP_COMPOP_DECOMPRESS
101};
102
103static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
104 u8 *dst, unsigned int *dlen)
105{
106 struct crypto_comp *tfm;
107 int ret;
108
109 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
110 switch (op) {
111 case ZSWAP_COMPOP_COMPRESS:
112 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
113 break;
114 case ZSWAP_COMPOP_DECOMPRESS:
115 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
116 break;
117 default:
118 ret = -EINVAL;
119 }
120
121 put_cpu();
122 return ret;
123}
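/*
 * Illustrative call sequence (editor's note): this sketch mirrors how the
 * store path later in this file drives the helper above, pairing it with
 * the per-cpu zswap_dstmem buffer set up in the per-cpu section.  That
 * buffer is allocated as 2 * PAGE_SIZE, presumably to leave headroom for
 * data that expands rather than compresses.
 *
 *   unsigned int dlen = PAGE_SIZE;
 *   u8 *dst = get_cpu_var(zswap_dstmem);
 *   u8 *src = kmap_atomic(page);
 *   int ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE,
 *                           dst, &dlen);
 *   kunmap_atomic(src);
 *   ...
 *   put_cpu_var(zswap_dstmem);
 */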
124
125static int __init zswap_comp_init(void)
126{
127 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
128 pr_info("%s compressor not available\n", zswap_compressor);
129 /* fall back to default compressor */
130 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
131 if (!crypto_has_comp(zswap_compressor, 0, 0))
132 /* can't even load the default compressor */
133 return -ENODEV;
134 }
135 pr_info("using %s compressor\n", zswap_compressor);
136
137 /* alloc percpu transforms */
138 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
139 if (!zswap_comp_pcpu_tfms)
140 return -ENOMEM;
141 return 0;
142}
143
144static void zswap_comp_exit(void)
145{
146 /* free percpu transforms */
147 if (zswap_comp_pcpu_tfms)
148 free_percpu(zswap_comp_pcpu_tfms);
149}
150
151/*********************************
152* data structures
153**********************************/
154/*
155 * struct zswap_entry
156 *
157 * This structure contains the metadata for tracking a single compressed
158 * page within zswap.
159 *
160 * rbnode - links the entry into the red-black tree for the appropriate swap type
161 * refcount - the number of outstanding references to the entry. This is needed
162 * to protect against premature freeing of the entry by concurrent
163 * calls to load, invalidate, and writeback. The lock
164 * for the zswap_tree structure that contains the entry must
165 * be held while changing the refcount. Since the lock must
166 * be held, there is no reason to also make refcount atomic.
167 * offset - the swap offset for the entry. Index into the red-black tree.
168 * handle - zbud allocation handle that stores the compressed page data
169 * length - the length in bytes of the compressed page data. Needed during
170 * decompression
171 */
172struct zswap_entry {
173 struct rb_node rbnode;
174 pgoff_t offset;
175 int refcount;
176 unsigned int length;
177 unsigned long handle;
178};
179
180struct zswap_header {
181 swp_entry_t swpentry;
182};
183
184/*
185 * The tree lock in the zswap_tree struct protects a few things:
186 * - the rbtree
187 * - the refcount field of each entry in the tree
188 */
189struct zswap_tree {
190 struct rb_root rbroot;
191 spinlock_t lock;
192 struct zbud_pool *pool;
193};
194
195static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
196
197/*********************************
198* zswap entry functions
199**********************************/
200static struct kmem_cache *zswap_entry_cache;
201
202static int zswap_entry_cache_create(void)
203{
204 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
205 return (zswap_entry_cache == NULL);
206}
207
208static void zswap_entry_cache_destroy(void)
209{
210 kmem_cache_destroy(zswap_entry_cache);
211}
212
213static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
214{
215 struct zswap_entry *entry;
216 entry = kmem_cache_alloc(zswap_entry_cache, gfp);
217 if (!entry)
218 return NULL;
219 entry->refcount = 1;
220 return entry;
221}
222
223static void zswap_entry_cache_free(struct zswap_entry *entry)
224{
225 kmem_cache_free(zswap_entry_cache, entry);
226}
227
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
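/*
 * Illustrative pattern (editor's note): the functions below take a
 * temporary reference like this, with every refcount change made under
 * tree->lock as required by the struct zswap_entry documentation above;
 * whoever drops the last reference frees the entry.
 *
 *   spin_lock(&tree->lock);
 *   entry = zswap_rb_search(&tree->rbroot, offset);
 *   if (entry)
 *           zswap_entry_get(entry);
 *   spin_unlock(&tree->lock);
 *
 *   ... use entry->handle and entry->length without the lock held ...
 *
 *   spin_lock(&tree->lock);
 *   refcount = zswap_entry_put(entry);
 *   spin_unlock(&tree->lock);
 *   if (refcount == 0)
 *           zswap_free_entry(tree, entry);
 */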
240
241/*********************************
242* rbtree functions
243**********************************/
244static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
245{
246 struct rb_node *node = root->rb_node;
247 struct zswap_entry *entry;
248
249 while (node) {
250 entry = rb_entry(node, struct zswap_entry, rbnode);
251 if (entry->offset > offset)
252 node = node->rb_left;
253 else if (entry->offset < offset)
254 node = node->rb_right;
255 else
256 return entry;
257 }
258 return NULL;
259}
260
261/*
262 * In the case that an entry with the same offset is found, a pointer to
263 * the existing entry is stored in dupentry and the function returns -EEXIST
264 */
265static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
266 struct zswap_entry **dupentry)
267{
268 struct rb_node **link = &root->rb_node, *parent = NULL;
269 struct zswap_entry *myentry;
270
271 while (*link) {
272 parent = *link;
273 myentry = rb_entry(parent, struct zswap_entry, rbnode);
274 if (myentry->offset > entry->offset)
275 link = &(*link)->rb_left;
276 else if (myentry->offset < entry->offset)
277 link = &(*link)->rb_right;
278 else {
279 *dupentry = myentry;
280 return -EEXIST;
281 }
282 }
283 rb_link_node(&entry->rbnode, parent, link);
284 rb_insert_color(&entry->rbnode, root);
285 return 0;
286}
287
288/*********************************
289* per-cpu code
290**********************************/
291static DEFINE_PER_CPU(u8 *, zswap_dstmem);
292
293static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
294{
295 struct crypto_comp *tfm;
296 u8 *dst;
297
298 switch (action) {
299 case CPU_UP_PREPARE:
300 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
301 if (IS_ERR(tfm)) {
302 pr_err("can't allocate compressor transform\n");
303 return NOTIFY_BAD;
304 }
305 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
306 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
307 if (!dst) {
308 pr_err("can't allocate compressor buffer\n");
309 crypto_free_comp(tfm);
310 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
311 return NOTIFY_BAD;
312 }
313 per_cpu(zswap_dstmem, cpu) = dst;
314 break;
315 case CPU_DEAD:
316 case CPU_UP_CANCELED:
317 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
318 if (tfm) {
319 crypto_free_comp(tfm);
320 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
321 }
322 dst = per_cpu(zswap_dstmem, cpu);
323 kfree(dst);
324 per_cpu(zswap_dstmem, cpu) = NULL;
325 break;
326 default:
327 break;
328 }
329 return NOTIFY_OK;
330}
331
332static int zswap_cpu_notifier(struct notifier_block *nb,
333 unsigned long action, void *pcpu)
334{
335 unsigned long cpu = (unsigned long)pcpu;
336 return __zswap_cpu_notifier(action, cpu);
337}
338
339static struct notifier_block zswap_cpu_notifier_block = {
340 .notifier_call = zswap_cpu_notifier
341};
342
343static int zswap_cpu_init(void)
344{
345 unsigned long cpu;
346
347 get_online_cpus();
348 for_each_online_cpu(cpu)
349 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
350 goto cleanup;
351 register_cpu_notifier(&zswap_cpu_notifier_block);
352 put_online_cpus();
353 return 0;
354
355cleanup:
356 for_each_online_cpu(cpu)
357 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
358 put_online_cpus();
359 return -ENOMEM;
360}
361
362/*********************************
363* helpers
364**********************************/
365static bool zswap_is_full(void)
366{
367 return (totalram_pages * zswap_max_pool_percent / 100 <
368 zswap_pool_pages);
369}
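/*
 * Worked example of the limit check above (editor's note; the numbers
 * are illustrative assumptions): with 1048576 total RAM pages (4GB of
 * RAM and 4KB pages) and the default zswap_max_pool_percent of 20, the
 * pool counts as full once zswap_pool_pages exceeds
 * 1048576 * 20 / 100 = 209715 pages, roughly 800MB of pool memory.
 */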
370
371/*
372 * Carries out the common pattern of freeing an entry's zbud allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/*********************************
384* writeback code
385**********************************/
386/* return enum for zswap_get_swap_cache_page */
387enum zswap_get_swap_ret {
388 ZSWAP_SWAPCACHE_NEW,
389 ZSWAP_SWAPCACHE_EXIST,
390 ZSWAP_SWAPCACHE_NOMEM
391};
392
393/*
394 * zswap_get_swap_cache_page
395 *
396 * This is an adaptation of read_swap_cache_async()
397 *
398 * This function tries to find a page with the given swap entry
399 * in the swapper_space address space (the swap cache). If the page
400 * is found, it is returned in retpage. Otherwise, a page is allocated,
401 * added to the swap cache, and returned in retpage.
402 *
403 * On success, the swap cache page is returned in retpage.
404 * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache; the page is not locked
405 * Returns ZSWAP_SWAPCACHE_NEW if a new page was added to the swap cache and needs to be populated; the page is locked
406 * Returns ZSWAP_SWAPCACHE_NOMEM if the page could not be allocated or added to the swap cache
407 */
408static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage)
410{
411 struct page *found_page, *new_page = NULL;
412 struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
413 int err;
414
415 *retpage = NULL;
416 do {
417 /*
418 * First check the swap cache. Since this is normally
419 * called after lookup_swap_cache() failed, re-calling
420 * that would confuse statistics.
421 */
422 found_page = find_get_page(swapper_space, entry.val);
423 if (found_page)
424 break;
425
426 /*
427 * Get a new page to read into from swap.
428 */
429 if (!new_page) {
430 new_page = alloc_page(GFP_KERNEL);
431 if (!new_page)
432 break; /* Out of memory */
433 }
434
435 /*
436 * call radix_tree_preload() while we can wait.
437 */
438 err = radix_tree_preload(GFP_KERNEL);
439 if (err)
440 break;
441
442 /*
443 * Swap entry may have been freed since our caller observed it.
444 */
445 err = swapcache_prepare(entry);
446 if (err == -EEXIST) { /* seems racy */
447 radix_tree_preload_end();
448 continue;
449 }
450 if (err) { /* swp entry is obsolete ? */
451 radix_tree_preload_end();
452 break;
453 }
454
455 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
456 __set_page_locked(new_page);
457 SetPageSwapBacked(new_page);
458 err = __add_to_swap_cache(new_page, entry);
459 if (likely(!err)) {
460 radix_tree_preload_end();
461 lru_cache_add_anon(new_page);
462 *retpage = new_page;
463 return ZSWAP_SWAPCACHE_NEW;
464 }
465 radix_tree_preload_end();
466 ClearPageSwapBacked(new_page);
467 __clear_page_locked(new_page);
468 /*
469 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
470 * clear SWAP_HAS_CACHE flag.
471 */
472 swapcache_free(entry, NULL);
473 } while (err != -ENOMEM);
474
475 if (new_page)
476 page_cache_release(new_page);
477 if (!found_page)
478 return ZSWAP_SWAPCACHE_NOMEM;
479 *retpage = found_page;
480 return ZSWAP_SWAPCACHE_EXIST;
481}
482
483/*
484 * Attempts to free an entry by adding a page to the swap cache,
485 * decompressing the entry data into the page, and issuing a
486 * bio write to write the page back to the swap device.
487 *
488 * This can be thought of as a "resumed writeback" of the page
489 * to the swap device. We are basically resuming the same swap
490 * writeback path that was intercepted by frontswap_store()
491 * in the first place. After the page has been decompressed into
492 * the swap cache, the compressed version stored by zswap can be
493 * freed.
494 */
495static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
496{
497 struct zswap_header *zhdr;
498 swp_entry_t swpentry;
499 struct zswap_tree *tree;
500 pgoff_t offset;
501 struct zswap_entry *entry;
502 struct page *page;
503 u8 *src, *dst;
504 unsigned int dlen;
505 int ret, refcount;
506 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE,
508 };
509
510 /* extract swpentry from data */
511 zhdr = zbud_map(pool, handle);
512 swpentry = zhdr->swpentry; /* here */
513 zbud_unmap(pool, handle);
514 tree = zswap_trees[swp_type(swpentry)];
515 offset = swp_offset(swpentry);
516 BUG_ON(pool != tree->pool);
517
518 /* find and ref zswap entry */
519 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset);
521 if (!entry) {
522 /* entry was invalidated */
523 spin_unlock(&tree->lock);
524 return 0;
525 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset);
529
530 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
533 ret = -ENOMEM;
534 goto fail;
535
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
537 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page);
539 ret = -EEXIST;
540 goto fail;
541
542 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
543 /* decompress */
544 dlen = PAGE_SIZE;
545 src = (u8 *)zbud_map(tree->pool, entry->handle) +
546 sizeof(struct zswap_header);
547 dst = kmap_atomic(page);
548 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
549 entry->length, dst, &dlen);
550 kunmap_atomic(dst);
551 zbud_unmap(tree->pool, entry->handle);
552 BUG_ON(ret);
553 BUG_ON(dlen != PAGE_SIZE);
554
555 /* page is up to date */
556 SetPageUptodate(page);
557 }
558
559 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page);
562 zswap_written_back_pages++;
563
564 spin_lock(&tree->lock);
565
566 /* drop local reference */
567 zswap_entry_put(entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570
571 /*
572 * There are three possible values for refcount here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree,
574 * load will free
575 * (2) refcount is 0, (normal case) entry is valid,
576 * remove from rbtree and free entry
577 * (3) refcount is -1, invalidate happened during writeback,
578 * free entry
579 */
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591
592fail:
593 spin_lock(&tree->lock);
594 zswap_entry_put(entry);
595 spin_unlock(&tree->lock);
596 return ret;
597}
598
599/*********************************
600* frontswap hooks
601**********************************/
602/* attempts to compress and store a single page */
603static int zswap_frontswap_store(unsigned type, pgoff_t offset,
604 struct page *page)
605{
606 struct zswap_tree *tree = zswap_trees[type];
607 struct zswap_entry *entry, *dupentry;
608 int ret;
609 unsigned int dlen = PAGE_SIZE, len;
610 unsigned long handle;
611 char *buf;
612 u8 *src, *dst;
613 struct zswap_header *zhdr;
614
615 if (!tree) {
616 ret = -ENODEV;
617 goto reject;
618 }
619
620 /* reclaim space if needed */
621 if (zswap_is_full()) {
622 zswap_pool_limit_hit++;
623 if (zbud_reclaim_page(tree->pool, 8)) {
624 zswap_reject_reclaim_fail++;
625 ret = -ENOMEM;
626 goto reject;
627 }
628 }
629
630 /* allocate entry */
631 entry = zswap_entry_cache_alloc(GFP_KERNEL);
632 if (!entry) {
633 zswap_reject_kmemcache_fail++;
634 ret = -ENOMEM;
635 goto reject;
636 }
637
638 /* compress */
639 dst = get_cpu_var(zswap_dstmem);
640 src = kmap_atomic(page);
641 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
642 kunmap_atomic(src);
643 if (ret) {
644 ret = -EINVAL;
645 goto freepage;
646 }
647
648 /* store */
649 len = dlen + sizeof(struct zswap_header);
650 ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
651 &handle);
652 if (ret == -ENOSPC) {
653 zswap_reject_compress_poor++;
654 goto freepage;
655 }
656 if (ret) {
657 zswap_reject_alloc_fail++;
658 goto freepage;
659 }
660 zhdr = zbud_map(tree->pool, handle);
661 zhdr->swpentry = swp_entry(type, offset);
662 buf = (u8 *)(zhdr + 1);
663 memcpy(buf, dst, dlen);
664 zbud_unmap(tree->pool, handle);
665 put_cpu_var(zswap_dstmem);
666
667 /* populate entry */
668 entry->offset = offset;
669 entry->handle = handle;
670 entry->length = dlen;
671
672 /* map */
673 spin_lock(&tree->lock);
674 do {
675 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
676 if (ret == -EEXIST) {
677 zswap_duplicate_entry++;
678 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot);
680 if (!zswap_entry_put(dupentry)) {
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 }
685 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock);
687
688 /* update stats */
689 atomic_inc(&zswap_stored_pages);
690 zswap_pool_pages = zbud_get_pool_size(tree->pool);
691
692 return 0;
693
694freepage:
695 put_cpu_var(zswap_dstmem);
696 zswap_entry_cache_free(entry);
697reject:
698 return ret;
699}
700
701/*
702 * returns 0 if the page was successfully decompressed,
703 * or -1 if the entry was not found or on error
704*/
705static int zswap_frontswap_load(unsigned type, pgoff_t offset,
706 struct page *page)
707{
708 struct zswap_tree *tree = zswap_trees[type];
709 struct zswap_entry *entry;
710 u8 *src, *dst;
711 unsigned int dlen;
712 int refcount, ret;
713
714 /* find */
715 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset);
717 if (!entry) {
718 /* entry was written back */
719 spin_unlock(&tree->lock);
720 return -1;
721 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock);
724
725 /* decompress */
726 dlen = PAGE_SIZE;
727 src = (u8 *)zbud_map(tree->pool, entry->handle) +
728 sizeof(struct zswap_header);
729 dst = kmap_atomic(page);
730 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
731 dst, &dlen);
732 kunmap_atomic(dst);
733 zbud_unmap(tree->pool, entry->handle);
734 BUG_ON(ret);
735
736 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock);
743
744 /*
745 * We don't have to unlink from the rbtree because
746 * zswap_writeback_entry() or zswap_frontswap_invalidate_page()
747 * has already done this for us if we are the last reference.
748 */
749 /* free */
750
751 zswap_free_entry(tree, entry);
752
753 return 0;
754}
755
756/* frees an entry in zswap */
757static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{
759 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry;
761 int refcount;
762
763 /* find */
764 spin_lock(&tree->lock);
765 entry = zswap_rb_search(&tree->rbroot, offset);
766 if (!entry) {
767 /* entry was written back */
768 spin_unlock(&tree->lock);
769 return;
770 }
771
772 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot);
774
775 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry);
777
778 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787}
788
789/* frees all zswap entries for the given swap type */
790static void zswap_frontswap_invalidate_area(unsigned type)
791{
792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node;
794 struct zswap_entry *entry;
795
796 if (!tree)
797 return;
798
799 /* walk the tree and free everything */
800 spin_lock(&tree->lock);
801 /*
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages);
816 }
817 tree->rbroot = RB_ROOT;
818 spin_unlock(&tree->lock);
819}
820
821static struct zbud_ops zswap_zbud_ops = {
822 .evict = zswap_writeback_entry
823};
824
825static void zswap_frontswap_init(unsigned type)
826{
827 struct zswap_tree *tree;
828
829 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
830 if (!tree)
831 goto err;
832 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
833 if (!tree->pool)
834 goto freetree;
835 tree->rbroot = RB_ROOT;
836 spin_lock_init(&tree->lock);
837 zswap_trees[type] = tree;
838 return;
839
840freetree:
841 kfree(tree);
842err:
843 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
844}
845
846static struct frontswap_ops zswap_frontswap_ops = {
847 .store = zswap_frontswap_store,
848 .load = zswap_frontswap_load,
849 .invalidate_page = zswap_frontswap_invalidate_page,
850 .invalidate_area = zswap_frontswap_invalidate_area,
851 .init = zswap_frontswap_init
852};
853
854/*********************************
855* debugfs functions
856**********************************/
857#ifdef CONFIG_DEBUG_FS
858#include <linux/debugfs.h>
859
860static struct dentry *zswap_debugfs_root;
861
862static int __init zswap_debugfs_init(void)
863{
864 if (!debugfs_initialized())
865 return -ENODEV;
866
867 zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
868 if (!zswap_debugfs_root)
869 return -ENOMEM;
870
871 debugfs_create_u64("pool_limit_hit", S_IRUGO,
872 zswap_debugfs_root, &zswap_pool_limit_hit);
873 debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
874 zswap_debugfs_root, &zswap_reject_reclaim_fail);
875 debugfs_create_u64("reject_alloc_fail", S_IRUGO,
876 zswap_debugfs_root, &zswap_reject_alloc_fail);
877 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
878 zswap_debugfs_root, &zswap_reject_kmemcache_fail);
879 debugfs_create_u64("reject_compress_poor", S_IRUGO,
880 zswap_debugfs_root, &zswap_reject_compress_poor);
881 debugfs_create_u64("written_back_pages", S_IRUGO,
882 zswap_debugfs_root, &zswap_written_back_pages);
883 debugfs_create_u64("duplicate_entry", S_IRUGO,
884 zswap_debugfs_root, &zswap_duplicate_entry);
885 debugfs_create_u64("pool_pages", S_IRUGO,
886 zswap_debugfs_root, &zswap_pool_pages);
887 debugfs_create_atomic_t("stored_pages", S_IRUGO,
888 zswap_debugfs_root, &zswap_stored_pages);
889
890 return 0;
891}
892
893static void __exit zswap_debugfs_exit(void)
894{
895 debugfs_remove_recursive(zswap_debugfs_root);
896}
897#else
898static int __init zswap_debugfs_init(void)
899{
900 return 0;
901}
902
903static void __exit zswap_debugfs_exit(void) { }
904#endif
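/*
 * Illustrative usage (editor's note): with CONFIG_DEBUG_FS enabled and
 * debugfs mounted in its conventional location (an assumption about the
 * system setup, not something this file controls), the counters created
 * above can be read from userspace:
 *
 *   cat /sys/kernel/debug/zswap/pool_pages
 *   cat /sys/kernel/debug/zswap/stored_pages
 */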
905
906/*********************************
907* module init and exit
908**********************************/
909static int __init init_zswap(void)
910{
911 if (!zswap_enabled)
912 return 0;
913
914 pr_info("loading zswap\n");
915 if (zswap_entry_cache_create()) {
916 pr_err("entry cache creation failed\n");
917 goto error;
918 }
919 if (zswap_comp_init()) {
920 pr_err("compressor initialization failed\n");
921 goto compfail;
922 }
923 if (zswap_cpu_init()) {
924 pr_err("per-cpu initialization failed\n");
925 goto pcpufail;
926 }
927 frontswap_register_ops(&zswap_frontswap_ops);
928 if (zswap_debugfs_init())
929 pr_warn("debugfs initialization failed\n");
930 return 0;
931pcpufail:
932 zswap_comp_exit();
933compfail:
934 zswap_entry_cache_destroy();
935error:
936 return -ENOMEM;
937}
938/* must be late so crypto has time to come up */
939late_initcall(init_zswap);
940
941MODULE_LICENSE("GPL");
942MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
943MODULE_DESCRIPTION("Compressed cache for swap pages");