Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 38
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/compaction.c | 174
-rw-r--r--  mm/dmapool.c | 16
-rw-r--r--  mm/filemap.c | 20
-rw-r--r--  mm/huge_memory.c | 2346
-rw-r--r--  mm/hugetlb.c | 111
-rw-r--r--  mm/internal.h | 16
-rw-r--r--  mm/ksm.c | 81
-rw-r--r--  mm/madvise.c | 10
-rw-r--r--  mm/memcontrol.c | 258
-rw-r--r--  mm/memory-failure.c | 22
-rw-r--r--  mm/memory.c | 336
-rw-r--r--  mm/memory_hotplug.c | 17
-rw-r--r--  mm/mempolicy.c | 23
-rw-r--r--  mm/migrate.c | 123
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 163
-rw-r--r--  mm/mmap.c | 17
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mmzone.c | 21
-rw-r--r--  mm/mprotect.c | 20
-rw-r--r--  mm/mremap.c | 9
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/page-writeback.c | 9
-rw-r--r--  mm/page_alloc.c | 165
-rw-r--r--  mm/pagewalk.c | 1
-rw-r--r--  mm/percpu-vm.c | 2
-rw-r--r--  mm/percpu.c | 10
-rw-r--r--  mm/pgtable-generic.c | 123
-rw-r--r--  mm/rmap.c | 93
-rw-r--r--  mm/shmem.c | 9
-rw-r--r--  mm/slab.c | 76
-rw-r--r--  mm/slob.c | 5
-rw-r--r--  mm/slub.c | 81
-rw-r--r--  mm/sparse-vmemmap.c | 2
-rw-r--r--  mm/sparse.c | 4
-rw-r--r--  mm/swap.c | 322
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/swapfile.c | 9
-rw-r--r--  mm/util.c | 21
-rw-r--r--  mm/vmalloc.c | 89
-rw-r--r--  mm/vmscan.c | 432
-rw-r--r--  mm/vmstat.c | 202
44 files changed, 4508 insertions, 1010 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304 304
305config TRANSPARENT_HUGEPAGE
306 bool "Transparent Hugepage Support"
307 depends on X86 && MMU
308 select COMPACTION
309 help
310	  Transparent Hugepage support allows the kernel to use huge pages
311	  and huge TLB entries transparently for applications whenever
312	  possible. This feature can improve performance for certain
313	  applications by speeding up page faults during memory
314	  allocation, by reducing the number of TLB misses and by speeding
315	  up page table walks.
316
317	  If memory is constrained, e.g. on embedded systems, you may want to say N.
318
319choice
320 prompt "Transparent Hugepage Support sysfs defaults"
321 depends on TRANSPARENT_HUGEPAGE
322 default TRANSPARENT_HUGEPAGE_ALWAYS
323 help
324 Selects the sysfs defaults for Transparent Hugepage Support.
325
326 config TRANSPARENT_HUGEPAGE_ALWAYS
327 bool "always"
328 help
329	  Enabling Transparent Hugepage always can increase the
330	  memory footprint of applications without a guaranteed
331	  benefit, but it will work automatically for all applications.
332
333 config TRANSPARENT_HUGEPAGE_MADVISE
334 bool "madvise"
335 help
336	  Enabling Transparent Hugepage madvise will only provide a
337	  performance benefit to the applications that use
338	  madvise(MADV_HUGEPAGE), but it won't risk increasing the
339	  memory footprint of applications without a guaranteed
340	  benefit.
341endchoice
342
305# 343#
306# UP and nommu archs use km based percpu allocator 344# UP and nommu archs use km based percpu allocator
307# 345#
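Note: the Kconfig choice above only picks the compile-time default; the effective policy is visible (and changeable) at runtime through sysfs. A minimal userspace sketch, not part of this patch, that prints the current setting; the path comes from the "transparent_hugepage" kobject and "enabled" attribute added in mm/huge_memory.c below, and the bracketed word marks the active mode, e.g. "[always] madvise never".

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "r");

	if (!f) {
		perror("transparent_hugepage/enabled");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "[always] madvise never" */
	fclose(f);
	return 0;
}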
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 41obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 42obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
42obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 43obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 1a8894eadf72..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h>
21
19/* 22/*
20 * compact_control is used to track pages being migrated and the free pages 23 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts 24 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
30 unsigned long nr_migratepages; /* Number of pages to migrate */ 33 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */ 34 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
33 37
34 /* Account for isolated anon and file pages */ 38 /* Account for isolated anon and file pages */
35 unsigned long nr_anon; 39 unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
38 unsigned int order; /* order a direct compactor needs */ 42 unsigned int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 43 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 44 struct zone *zone;
45
46 int compact_mode;
41}; 47};
42 48
43static unsigned long release_freepages(struct list_head *freelist) 49static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
60 struct list_head *freelist) 66 struct list_head *freelist)
61{ 67{
62 unsigned long zone_end_pfn, end_pfn; 68 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0; 69 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 70 struct page *cursor;
65 71
66 /* Get the last PFN we should scan for free pages at */ 72 /* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
81 87
82 if (!pfn_valid_within(blockpfn)) 88 if (!pfn_valid_within(blockpfn))
83 continue; 89 continue;
90 nr_scanned++;
84 91
85 if (!PageBuddy(page)) 92 if (!PageBuddy(page))
86 continue; 93 continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
100 } 107 }
101 } 108 }
102 109
110 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
103 return total_isolated; 111 return total_isolated;
104} 112}
105 113
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
234 struct compact_control *cc) 242 struct compact_control *cc)
235{ 243{
236 unsigned long low_pfn, end_pfn; 244 unsigned long low_pfn, end_pfn;
245 unsigned long last_pageblock_nr = 0, pageblock_nr;
246 unsigned long nr_scanned = 0, nr_isolated = 0;
237 struct list_head *migratelist = &cc->migratepages; 247 struct list_head *migratelist = &cc->migratepages;
238 248
239 /* Do not scan outside zone boundaries */ 249 /* Do not scan outside zone boundaries */
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
266 struct page *page; 276 struct page *page;
267 if (!pfn_valid_within(low_pfn)) 277 if (!pfn_valid_within(low_pfn))
268 continue; 278 continue;
279 nr_scanned++;
269 280
270 /* Get the page and skip if free */ 281 /* Get the page and skip if free */
271 page = pfn_to_page(low_pfn); 282 page = pfn_to_page(low_pfn);
272 if (PageBuddy(page)) 283 if (PageBuddy(page))
273 continue; 284 continue;
274 285
286 /*
287 * For async migration, also only scan in MOVABLE blocks. Async
288 * migration is optimistic to see if the minimum amount of work
289 * satisfies the allocation
290 */
291 pageblock_nr = low_pfn >> pageblock_order;
292 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
293 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
294 low_pfn += pageblock_nr_pages;
295 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
296 last_pageblock_nr = pageblock_nr;
297 continue;
298 }
299
300 if (!PageLRU(page))
301 continue;
302
303 /*
304 * PageLRU is set, and lru_lock excludes isolation,
305 * splitting and collapsing (collapsing has already
306 * happened if PageLRU is set).
307 */
308 if (PageTransHuge(page)) {
309 low_pfn += (1 << compound_order(page)) - 1;
310 continue;
311 }
312
275 /* Try isolate the page */ 313 /* Try isolate the page */
276 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 314 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
277 continue; 315 continue;
278 316
317 VM_BUG_ON(PageTransCompound(page));
318
279 /* Successfully isolated */ 319 /* Successfully isolated */
280 del_page_from_lru_list(zone, page, page_lru(page)); 320 del_page_from_lru_list(zone, page, page_lru(page));
281 list_add(&page->lru, migratelist); 321 list_add(&page->lru, migratelist);
282 cc->nr_migratepages++; 322 cc->nr_migratepages++;
323 nr_isolated++;
283 324
284 /* Avoid isolating too much */ 325 /* Avoid isolating too much */
285 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 326 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
291 spin_unlock_irq(&zone->lru_lock); 332 spin_unlock_irq(&zone->lru_lock);
292 cc->migrate_pfn = low_pfn; 333 cc->migrate_pfn = low_pfn;
293 334
335 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
336
294 return cc->nr_migratepages; 337 return cc->nr_migratepages;
295} 338}
296 339
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
341} 384}
342 385
343static int compact_finished(struct zone *zone, 386static int compact_finished(struct zone *zone,
344 struct compact_control *cc) 387 struct compact_control *cc)
345{ 388{
346 unsigned int order; 389 unsigned int order;
347 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); 390 unsigned long watermark;
348 391
349 if (fatal_signal_pending(current)) 392 if (fatal_signal_pending(current))
350 return COMPACT_PARTIAL; 393 return COMPACT_PARTIAL;
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone,
354 return COMPACT_COMPLETE; 397 return COMPACT_COMPLETE;
355 398
356 /* Compaction run is not finished if the watermark is not met */ 399 /* Compaction run is not finished if the watermark is not met */
400 if (cc->compact_mode != COMPACT_MODE_KSWAPD)
401 watermark = low_wmark_pages(zone);
402 else
403 watermark = high_wmark_pages(zone);
404 watermark += (1 << cc->order);
405
357 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 406 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
358 return COMPACT_CONTINUE; 407 return COMPACT_CONTINUE;
359 408
360 if (cc->order == -1) 409 if (cc->order == -1)
361 return COMPACT_CONTINUE; 410 return COMPACT_CONTINUE;
362 411
412 /*
413 * Generating only one page of the right order is not enough
414 * for kswapd, we must continue until we're above the high
415 * watermark as a pool for high order GFP_ATOMIC allocations
416 * too.
417 */
418 if (cc->compact_mode == COMPACT_MODE_KSWAPD)
419 return COMPACT_CONTINUE;
420
363 /* Direct compactor: Is a suitable page free? */ 421 /* Direct compactor: Is a suitable page free? */
364 for (order = cc->order; order < MAX_ORDER; order++) { 422 for (order = cc->order; order < MAX_ORDER; order++) {
365 /* Job done if page is free of the right migratetype */ 423 /* Job done if page is free of the right migratetype */
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone,
374 return COMPACT_CONTINUE; 432 return COMPACT_CONTINUE;
375} 433}
376 434
435/*
436 * compaction_suitable: Is this suitable to run compaction on this zone now?
437 * Returns
438 * COMPACT_SKIPPED - If there are too few free pages for compaction
439 * COMPACT_PARTIAL - If the allocation would succeed without compaction
440 * COMPACT_CONTINUE - If compaction should run now
441 */
442unsigned long compaction_suitable(struct zone *zone, int order)
443{
444 int fragindex;
445 unsigned long watermark;
446
447 /*
448 * Watermarks for order-0 must be met for compaction. Note the 2UL.
449 * This is because during migration, copies of pages need to be
450 * allocated and for a short time, the footprint is higher
451 */
452 watermark = low_wmark_pages(zone) + (2UL << order);
453 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
454 return COMPACT_SKIPPED;
455
456 /*
457 * fragmentation index determines if allocation failures are due to
458 * low memory or external fragmentation
459 *
 460 * index of -1 implies allocations might succeed depending on watermarks
461 * index towards 0 implies failure is due to lack of memory
462 * index towards 1000 implies failure is due to fragmentation
463 *
464 * Only compact if a failure would be due to fragmentation.
465 */
466 fragindex = fragmentation_index(zone, order);
467 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
468 return COMPACT_SKIPPED;
469
470 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
471 return COMPACT_PARTIAL;
472
473 return COMPACT_CONTINUE;
474}
475
377static int compact_zone(struct zone *zone, struct compact_control *cc) 476static int compact_zone(struct zone *zone, struct compact_control *cc)
378{ 477{
379 int ret; 478 int ret;
380 479
480 ret = compaction_suitable(zone, cc->order);
481 switch (ret) {
482 case COMPACT_PARTIAL:
483 case COMPACT_SKIPPED:
484 /* Compaction is likely to fail */
485 return ret;
486 case COMPACT_CONTINUE:
487 /* Fall through to compaction */
488 ;
489 }
490
381 /* Setup to move all movable pages to the end of the zone */ 491 /* Setup to move all movable pages to the end of the zone */
382 cc->migrate_pfn = zone->zone_start_pfn; 492 cc->migrate_pfn = zone->zone_start_pfn;
383 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 493 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
393 503
394 nr_migrate = cc->nr_migratepages; 504 nr_migrate = cc->nr_migratepages;
395 migrate_pages(&cc->migratepages, compaction_alloc, 505 migrate_pages(&cc->migratepages, compaction_alloc,
396 (unsigned long)cc, 0); 506 (unsigned long)cc, false,
507 cc->sync);
397 update_nr_listpages(cc); 508 update_nr_listpages(cc);
398 nr_remaining = cc->nr_migratepages; 509 nr_remaining = cc->nr_migratepages;
399 510
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
401 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 512 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
402 if (nr_remaining) 513 if (nr_remaining)
403 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 514 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
515 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
516 nr_remaining);
404 517
405 /* Release LRU pages not migrated */ 518 /* Release LRU pages not migrated */
406 if (!list_empty(&cc->migratepages)) { 519 if (!list_empty(&cc->migratepages)) {
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
417 return ret; 530 return ret;
418} 531}
419 532
420static unsigned long compact_zone_order(struct zone *zone, 533unsigned long compact_zone_order(struct zone *zone,
421 int order, gfp_t gfp_mask) 534 int order, gfp_t gfp_mask,
535 bool sync,
536 int compact_mode)
422{ 537{
423 struct compact_control cc = { 538 struct compact_control cc = {
424 .nr_freepages = 0, 539 .nr_freepages = 0,
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone,
426 .order = order, 541 .order = order,
427 .migratetype = allocflags_to_migratetype(gfp_mask), 542 .migratetype = allocflags_to_migratetype(gfp_mask),
428 .zone = zone, 543 .zone = zone,
544 .sync = sync,
545 .compact_mode = compact_mode,
429 }; 546 };
430 INIT_LIST_HEAD(&cc.freepages); 547 INIT_LIST_HEAD(&cc.freepages);
431 INIT_LIST_HEAD(&cc.migratepages); 548 INIT_LIST_HEAD(&cc.migratepages);
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500;
441 * @order: The order of the current allocation 558 * @order: The order of the current allocation
442 * @gfp_mask: The GFP mask of the current allocation 559 * @gfp_mask: The GFP mask of the current allocation
443 * @nodemask: The allowed nodes to allocate from 560 * @nodemask: The allowed nodes to allocate from
561 * @sync: Whether migration is synchronous or not
444 * 562 *
445 * This is the main entry point for direct page compaction. 563 * This is the main entry point for direct page compaction.
446 */ 564 */
447unsigned long try_to_compact_pages(struct zonelist *zonelist, 565unsigned long try_to_compact_pages(struct zonelist *zonelist,
448 int order, gfp_t gfp_mask, nodemask_t *nodemask) 566 int order, gfp_t gfp_mask, nodemask_t *nodemask,
567 bool sync)
449{ 568{
450 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 569 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
451 int may_enter_fs = gfp_mask & __GFP_FS; 570 int may_enter_fs = gfp_mask & __GFP_FS;
452 int may_perform_io = gfp_mask & __GFP_IO; 571 int may_perform_io = gfp_mask & __GFP_IO;
453 unsigned long watermark;
454 struct zoneref *z; 572 struct zoneref *z;
455 struct zone *zone; 573 struct zone *zone;
456 int rc = COMPACT_SKIPPED; 574 int rc = COMPACT_SKIPPED;
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
460 * made because an assumption is made that the page allocator can satisfy 578 * made because an assumption is made that the page allocator can satisfy
461 * the "cheaper" orders without taking special steps 579 * the "cheaper" orders without taking special steps
462 */ 580 */
463 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) 581 if (!order || !may_enter_fs || !may_perform_io)
464 return rc; 582 return rc;
465 583
466 count_vm_event(COMPACTSTALL); 584 count_vm_event(COMPACTSTALL);
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
468 /* Compact each zone in the list */ 586 /* Compact each zone in the list */
469 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 587 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
470 nodemask) { 588 nodemask) {
471 int fragindex;
472 int status; 589 int status;
473 590
474 /* 591 status = compact_zone_order(zone, order, gfp_mask, sync,
475 * Watermarks for order-0 must be met for compaction. Note 592 COMPACT_MODE_DIRECT_RECLAIM);
476 * the 2UL. This is because during migration, copies of
477 * pages need to be allocated and for a short time, the
478 * footprint is higher
479 */
480 watermark = low_wmark_pages(zone) + (2UL << order);
481 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
482 continue;
483
484 /*
485 * fragmentation index determines if allocation failures are
486 * due to low memory or external fragmentation
487 *
488 * index of -1 implies allocations might succeed depending
489 * on watermarks
490 * index towards 0 implies failure is due to lack of memory
491 * index towards 1000 implies failure is due to fragmentation
492 *
493 * Only compact if a failure would be due to fragmentation.
494 */
495 fragindex = fragmentation_index(zone, order);
496 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
497 continue;
498
499 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
500 rc = COMPACT_PARTIAL;
501 break;
502 }
503
504 status = compact_zone_order(zone, order, gfp_mask);
505 rc = max(status, rc); 593 rc = max(status, rc);
506 594
507 if (zone_watermark_ok(zone, order, watermark, 0, 0)) 595 /* If a normal allocation would succeed, stop compacting */
596 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
508 break; 597 break;
509 } 598 }
510 599
@@ -531,6 +620,7 @@ static int compact_node(int nid)
531 .nr_freepages = 0, 620 .nr_freepages = 0,
532 .nr_migratepages = 0, 621 .nr_migratepages = 0,
533 .order = -1, 622 .order = -1,
623 .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
534 }; 624 };
535 625
536 zone = &pgdat->node_zones[zoneid]; 626 zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
324 if (mem_flags & __GFP_WAIT) { 324 if (mem_flags & __GFP_WAIT) {
325 DECLARE_WAITQUEUE(wait, current); 325 DECLARE_WAITQUEUE(wait, current);
326 326
327 __set_current_state(TASK_INTERRUPTIBLE); 327 __set_current_state(TASK_UNINTERRUPTIBLE);
328 __add_wait_queue(&pool->waitq, &wait); 328 __add_wait_queue(&pool->waitq, &wait);
329 spin_unlock_irqrestore(&pool->lock, flags); 329 spin_unlock_irqrestore(&pool->lock, flags);
330 330
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
355 355
356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) 356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
357{ 357{
358 unsigned long flags;
359 struct dma_page *page; 358 struct dma_page *page;
360 359
361 spin_lock_irqsave(&pool->lock, flags);
362 list_for_each_entry(page, &pool->page_list, page_list) { 360 list_for_each_entry(page, &pool->page_list, page_list) {
363 if (dma < page->dma) 361 if (dma < page->dma)
364 continue; 362 continue;
365 if (dma < (page->dma + pool->allocation)) 363 if (dma < (page->dma + pool->allocation))
366 goto done; 364 return page;
367 } 365 }
368 page = NULL; 366 return NULL;
369 done:
370 spin_unlock_irqrestore(&pool->lock, flags);
371 return page;
372} 367}
373 368
374/** 369/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
386 unsigned long flags; 381 unsigned long flags;
387 unsigned int offset; 382 unsigned int offset;
388 383
384 spin_lock_irqsave(&pool->lock, flags);
389 page = pool_find_page(pool, dma); 385 page = pool_find_page(pool, dma);
390 if (!page) { 386 if (!page) {
387 spin_unlock_irqrestore(&pool->lock, flags);
391 if (pool->dev) 388 if (pool->dev)
392 dev_err(pool->dev, 389 dev_err(pool->dev,
393 "dma_pool_free %s, %p/%lx (bad dma)\n", 390 "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
401 offset = vaddr - page->vaddr; 398 offset = vaddr - page->vaddr;
402#ifdef DMAPOOL_DEBUG 399#ifdef DMAPOOL_DEBUG
403 if ((dma - page->dma) != offset) { 400 if ((dma - page->dma) != offset) {
401 spin_unlock_irqrestore(&pool->lock, flags);
404 if (pool->dev) 402 if (pool->dev)
405 dev_err(pool->dev, 403 dev_err(pool->dev,
406 "dma_pool_free %s, %p (bad vaddr)/%Lx\n", 404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
418 chain = *(int *)(page->vaddr + chain); 416 chain = *(int *)(page->vaddr + chain);
419 continue; 417 continue;
420 } 418 }
419 spin_unlock_irqrestore(&pool->lock, flags);
421 if (pool->dev) 420 if (pool->dev)
422 dev_err(pool->dev, "dma_pool_free %s, dma %Lx " 421 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
423 "already free\n", pool->name, 422 "already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
432 memset(vaddr, POOL_POISON_FREED, pool->size); 431 memset(vaddr, POOL_POISON_FREED, pool->size);
433#endif 432#endif
434 433
435 spin_lock_irqsave(&pool->lock, flags);
436 page->in_use--; 434 page->in_use--;
437 *(int *)vaddr = page->offset; 435 *(int *)vaddr = page->offset;
438 page->offset = offset; 436 page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6b9aee20f242..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
102 * ->inode_lock (zap_pte_range->set_page_dirty) 102 * ->inode_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 104 *
105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around) 105 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao) 106 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock 107 * ->i_mmap_lock
@@ -301,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
301 continue; 298 continue;
302 299
303 wait_on_page_writeback(page); 300 wait_on_page_writeback(page);
304 if (PageError(page)) 301 if (TestClearPageError(page))
305 ret = -EIO; 302 ret = -EIO;
306 } 303 }
307 pagevec_release(&pvec); 304 pagevec_release(&pvec);
@@ -840,9 +837,6 @@ repeat:
840 if (radix_tree_deref_retry(page)) 837 if (radix_tree_deref_retry(page))
841 goto restart; 838 goto restart;
842 839
843 if (page->mapping == NULL || page->index != index)
844 break;
845
846 if (!page_cache_get_speculative(page)) 840 if (!page_cache_get_speculative(page))
847 goto repeat; 841 goto repeat;
848 842
@@ -852,6 +846,16 @@ repeat:
852 goto repeat; 846 goto repeat;
853 } 847 }
854 848
849 /*
850 * must check mapping and index after taking the ref.
851 * otherwise we can get both false positives and false
852 * negatives, which is just confusing to the caller.
853 */
854 if (page->mapping == NULL || page->index != index) {
855 page_cache_release(page);
856 break;
857 }
858
855 pages[ret] = page; 859 pages[ret] = page;
856 ret++; 860 ret++;
857 index++; 861 index++;
@@ -2223,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2223 gfp_notmask = __GFP_FS; 2227 gfp_notmask = __GFP_FS;
2224repeat: 2228repeat:
2225 page = find_lock_page(mapping, index); 2229 page = find_lock_page(mapping, index);
2226 if (likely(page)) 2230 if (page)
2227 return page; 2231 return page;
2228 2232
2229 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2233 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
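Note: the find_get_pages() fix above moves the mapping/index check to after page_cache_get_speculative(), because the page can be freed and reused between the radix-tree lookup and the reference grab. A stand-alone sketch of that lockless pattern with hypothetical types; C11 atomics stand in for the page refcount.

#include <stdatomic.h>
#include <stdbool.h>

struct object {
	atomic_int refcount;	/* 0 means the object is free for reuse */
	void *owner;		/* plays the role of page->mapping */
	unsigned long index;	/* plays the role of page->index */
};

/* like get_page_unless_zero(): only take a reference if one is already held */
static bool ref_get_unless_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0)
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;
	return false;
}

/* speculative lookup step: grab a reference, then re-validate identity */
static bool get_if_still(struct object *obj, void *owner, unsigned long index)
{
	if (!ref_get_unless_zero(&obj->refcount))
		return false;				/* already freed: caller retries */
	if (obj->owner != owner || obj->index != index) {
		atomic_fetch_sub(&obj->refcount, 1);	/* wrong object: drop and retry */
		return false;
	}
	return true;
}

int main(void)
{
	struct object obj = { .refcount = 2, .owner = &obj, .index = 7 };

	return get_if_still(&obj, &obj, 7) ? 0 : 1;
}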
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
 41/* default scan 8*512 pte (or vmas) every 10 seconds */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * default collapse hugepages if there is at least one pte mapped like
54 * it would have happened if the vma was large enough during page
55 * fault.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only the one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
119 * second to avoid subsequent fallbacks of other types There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* don't ever allow to reserve more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 if (test_bit(flag, &transparent_hugepage_flags))
248 return sprintf(buf, "[yes] no\n");
249 else
250 return sprintf(buf, "yes [no]\n");
251}
252static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr,
254 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag)
256{
257 if (!memcmp("yes", buf,
258 min(sizeof("yes")-1, count))) {
259 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf,
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265
266 return count;
267}
268
269/*
270 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
271 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
272 * memory just to allocate one more hugepage.
273 */
274static ssize_t defrag_show(struct kobject *kobj,
275 struct kobj_attribute *attr, char *buf)
276{
277 return double_flag_show(kobj, attr, buf,
278 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
279 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
280}
281static ssize_t defrag_store(struct kobject *kobj,
282 struct kobj_attribute *attr,
283 const char *buf, size_t count)
284{
285 return double_flag_store(kobj, attr, buf, count,
286 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
287 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
288}
289static struct kobj_attribute defrag_attr =
290 __ATTR(defrag, 0644, defrag_show, defrag_store);
291
292#ifdef CONFIG_DEBUG_VM
293static ssize_t debug_cow_show(struct kobject *kobj,
294 struct kobj_attribute *attr, char *buf)
295{
296 return single_flag_show(kobj, attr, buf,
297 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
298}
299static ssize_t debug_cow_store(struct kobject *kobj,
300 struct kobj_attribute *attr,
301 const char *buf, size_t count)
302{
303 return single_flag_store(kobj, attr, buf, count,
304 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
305}
306static struct kobj_attribute debug_cow_attr =
307 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
308#endif /* CONFIG_DEBUG_VM */
309
310static struct attribute *hugepage_attr[] = {
311 &enabled_attr.attr,
312 &defrag_attr.attr,
313#ifdef CONFIG_DEBUG_VM
314 &debug_cow_attr.attr,
315#endif
316 NULL,
317};
318
319static struct attribute_group hugepage_attr_group = {
320 .attrs = hugepage_attr,
321};
322
323static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
324 struct kobj_attribute *attr,
325 char *buf)
326{
327 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
328}
329
330static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
331 struct kobj_attribute *attr,
332 const char *buf, size_t count)
333{
334 unsigned long msecs;
335 int err;
336
337 err = strict_strtoul(buf, 10, &msecs);
338 if (err || msecs > UINT_MAX)
339 return -EINVAL;
340
341 khugepaged_scan_sleep_millisecs = msecs;
342 wake_up_interruptible(&khugepaged_wait);
343
344 return count;
345}
346static struct kobj_attribute scan_sleep_millisecs_attr =
347 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
348 scan_sleep_millisecs_store);
349
350static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
351 struct kobj_attribute *attr,
352 char *buf)
353{
354 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
355}
356
357static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
358 struct kobj_attribute *attr,
359 const char *buf, size_t count)
360{
361 unsigned long msecs;
362 int err;
363
364 err = strict_strtoul(buf, 10, &msecs);
365 if (err || msecs > UINT_MAX)
366 return -EINVAL;
367
368 khugepaged_alloc_sleep_millisecs = msecs;
369 wake_up_interruptible(&khugepaged_wait);
370
371 return count;
372}
373static struct kobj_attribute alloc_sleep_millisecs_attr =
374 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
375 alloc_sleep_millisecs_store);
376
377static ssize_t pages_to_scan_show(struct kobject *kobj,
378 struct kobj_attribute *attr,
379 char *buf)
380{
381 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
382}
383static ssize_t pages_to_scan_store(struct kobject *kobj,
384 struct kobj_attribute *attr,
385 const char *buf, size_t count)
386{
387 int err;
388 unsigned long pages;
389
390 err = strict_strtoul(buf, 10, &pages);
391 if (err || !pages || pages > UINT_MAX)
392 return -EINVAL;
393
394 khugepaged_pages_to_scan = pages;
395
396 return count;
397}
398static struct kobj_attribute pages_to_scan_attr =
399 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
400 pages_to_scan_store);
401
402static ssize_t pages_collapsed_show(struct kobject *kobj,
403 struct kobj_attribute *attr,
404 char *buf)
405{
406 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
407}
408static struct kobj_attribute pages_collapsed_attr =
409 __ATTR_RO(pages_collapsed);
410
411static ssize_t full_scans_show(struct kobject *kobj,
412 struct kobj_attribute *attr,
413 char *buf)
414{
415 return sprintf(buf, "%u\n", khugepaged_full_scans);
416}
417static struct kobj_attribute full_scans_attr =
418 __ATTR_RO(full_scans);
419
420static ssize_t khugepaged_defrag_show(struct kobject *kobj,
421 struct kobj_attribute *attr, char *buf)
422{
423 return single_flag_show(kobj, attr, buf,
424 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
425}
426static ssize_t khugepaged_defrag_store(struct kobject *kobj,
427 struct kobj_attribute *attr,
428 const char *buf, size_t count)
429{
430 return single_flag_store(kobj, attr, buf, count,
431 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
432}
433static struct kobj_attribute khugepaged_defrag_attr =
434 __ATTR(defrag, 0644, khugepaged_defrag_show,
435 khugepaged_defrag_store);
436
437/*
438 * max_ptes_none controls if khugepaged should collapse hugepages over
439 * any unmapped ptes in turn potentially increasing the memory
440 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
441 * reduce the available free memory in the system as it
442 * runs. Increasing max_ptes_none will instead potentially reduce the
443 * free memory in the system during the khugepaged scan.
444 */
445static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
446 struct kobj_attribute *attr,
447 char *buf)
448{
449 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
450}
451static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
452 struct kobj_attribute *attr,
453 const char *buf, size_t count)
454{
455 int err;
456 unsigned long max_ptes_none;
457
458 err = strict_strtoul(buf, 10, &max_ptes_none);
459 if (err || max_ptes_none > HPAGE_PMD_NR-1)
460 return -EINVAL;
461
462 khugepaged_max_ptes_none = max_ptes_none;
463
464 return count;
465}
466static struct kobj_attribute khugepaged_max_ptes_none_attr =
467 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
468 khugepaged_max_ptes_none_store);
469
470static struct attribute *khugepaged_attr[] = {
471 &khugepaged_defrag_attr.attr,
472 &khugepaged_max_ptes_none_attr.attr,
473 &pages_to_scan_attr.attr,
474 &pages_collapsed_attr.attr,
475 &full_scans_attr.attr,
476 &scan_sleep_millisecs_attr.attr,
477 &alloc_sleep_millisecs_attr.attr,
478 NULL,
479};
480
481static struct attribute_group khugepaged_attr_group = {
482 .attrs = khugepaged_attr,
483 .name = "khugepaged",
484};
485#endif /* CONFIG_SYSFS */
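Note: once hugepage_init() below registers khugepaged_attr_group, the attributes above appear under /sys/kernel/mm/transparent_hugepage/khugepaged/ (the group name plus the attribute names from the array). A small userspace sketch, not kernel code, that dumps the current tunables.

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"defrag", "max_ptes_none", "pages_to_scan", "pages_collapsed",
		"full_scans", "scan_sleep_millisecs", "alloc_sleep_millisecs",
	};
	char path[128], val[64];
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s",
			 names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* file absent: THP not built in or older kernel */
		if (fgets(val, sizeof(val), f))
			printf("%s: %s", names[i], val);
		fclose(f);
	}
	return 0;
}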
486
487static int __init hugepage_init(void)
488{
489 int err;
490#ifdef CONFIG_SYSFS
491 static struct kobject *hugepage_kobj;
492#endif
493
494 err = -EINVAL;
495 if (!has_transparent_hugepage()) {
496 transparent_hugepage_flags = 0;
497 goto out;
498 }
499
500#ifdef CONFIG_SYSFS
501 err = -ENOMEM;
502 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
503 if (unlikely(!hugepage_kobj)) {
504 printk(KERN_ERR "hugepage: failed kobject create\n");
505 goto out;
506 }
507
508 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
509 if (err) {
 510 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
511 goto out;
512 }
513
514 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
515 if (err) {
 516 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
517 goto out;
518 }
519#endif
520
521 err = khugepaged_slab_init();
522 if (err)
523 goto out;
524
525 err = mm_slots_hash_init();
526 if (err) {
527 khugepaged_slab_free();
528 goto out;
529 }
530
531 /*
532 * By default disable transparent hugepages on smaller systems,
533 * where the extra memory used could hurt more than TLB overhead
534 * is likely to save. The admin can still enable it through /sys.
535 */
536 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
537 transparent_hugepage_flags = 0;
538
539 start_khugepaged();
540
541 set_recommended_min_free_kbytes();
542
543out:
544 return err;
545}
546module_init(hugepage_init)
547
548static int __init setup_transparent_hugepage(char *str)
549{
550 int ret = 0;
551 if (!str)
552 goto out;
553 if (!strcmp(str, "always")) {
554 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
555 &transparent_hugepage_flags);
556 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
557 &transparent_hugepage_flags);
558 ret = 1;
559 } else if (!strcmp(str, "madvise")) {
560 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
561 &transparent_hugepage_flags);
562 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
563 &transparent_hugepage_flags);
564 ret = 1;
565 } else if (!strcmp(str, "never")) {
566 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
567 &transparent_hugepage_flags);
568 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
569 &transparent_hugepage_flags);
570 ret = 1;
571 }
572out:
573 if (!ret)
574 printk(KERN_WARNING
575 "transparent_hugepage= cannot parse, ignored\n");
576 return ret;
577}
578__setup("transparent_hugepage=", setup_transparent_hugepage);
579
580static void prepare_pmd_huge_pte(pgtable_t pgtable,
581 struct mm_struct *mm)
582{
583 assert_spin_locked(&mm->page_table_lock);
584
585 /* FIFO */
586 if (!mm->pmd_huge_pte)
587 INIT_LIST_HEAD(&pgtable->lru);
588 else
589 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
590 mm->pmd_huge_pte = pgtable;
591}
592
593static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
594{
595 if (likely(vma->vm_flags & VM_WRITE))
596 pmd = pmd_mkwrite(pmd);
597 return pmd;
598}
599
600static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
601 struct vm_area_struct *vma,
602 unsigned long haddr, pmd_t *pmd,
603 struct page *page)
604{
605 int ret = 0;
606 pgtable_t pgtable;
607
608 VM_BUG_ON(!PageCompound(page));
609 pgtable = pte_alloc_one(mm, haddr);
610 if (unlikely(!pgtable)) {
611 mem_cgroup_uncharge_page(page);
612 put_page(page);
613 return VM_FAULT_OOM;
614 }
615
616 clear_huge_page(page, haddr, HPAGE_PMD_NR);
617 __SetPageUptodate(page);
618
619 spin_lock(&mm->page_table_lock);
620 if (unlikely(!pmd_none(*pmd))) {
621 spin_unlock(&mm->page_table_lock);
622 mem_cgroup_uncharge_page(page);
623 put_page(page);
624 pte_free(mm, pgtable);
625 } else {
626 pmd_t entry;
627 entry = mk_pmd(page, vma->vm_page_prot);
628 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
629 entry = pmd_mkhuge(entry);
630 /*
631 * The spinlocking to take the lru_lock inside
632 * page_add_new_anon_rmap() acts as a full memory
633 * barrier to be sure clear_huge_page writes become
634 * visible after the set_pmd_at() write.
635 */
636 page_add_new_anon_rmap(page, vma, haddr);
637 set_pmd_at(mm, haddr, pmd, entry);
638 prepare_pmd_huge_pte(pgtable, mm);
639 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
640 spin_unlock(&mm->page_table_lock);
641 }
642
643 return ret;
644}
645
646static inline gfp_t alloc_hugepage_gfpmask(int defrag)
647{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
649}
650
651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma,
653 unsigned long haddr)
654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr);
657}
658
659#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag)
661{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag),
663 HPAGE_PMD_ORDER);
664}
665#endif
666
667int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
668 unsigned long address, pmd_t *pmd,
669 unsigned int flags)
670{
671 struct page *page;
672 unsigned long haddr = address & HPAGE_PMD_MASK;
673 pte_t *pte;
674
675 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
676 if (unlikely(anon_vma_prepare(vma)))
677 return VM_FAULT_OOM;
678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr);
682 if (unlikely(!page))
683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page);
686 goto out;
687 }
688
689 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
690 }
691out:
692 /*
693 * Use __pte_alloc instead of pte_alloc_map, because we can't
694 * run pte_offset_map on the pmd, if an huge pmd could
695 * materialize from under us from a different thread.
696 */
697 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
698 return VM_FAULT_OOM;
699 /* if an huge pmd materialized from under us just retry later */
700 if (unlikely(pmd_trans_huge(*pmd)))
701 return 0;
702 /*
703 * A regular pmd is established and it can't morph into a huge pmd
704 * from under us anymore at this point because we hold the mmap_sem
705 * read mode and khugepaged takes it in write mode. So now it's
706 * safe to run pte_offset_map().
707 */
708 pte = pte_offset_map(pmd, address);
709 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
710}
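Note: do_huge_pmd_anonymous_page() above only installs a huge pmd when a whole PMD-sized range fits inside the vma. A userspace sketch, not part of this patch, of a mapping that gives the fault path that chance: it over-allocates so at least one PMD-aligned 2 MiB range falls inside the region and marks it with MADV_HUGEPAGE for the madvise policy; the fallback define is only for building against pre-THP headers.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* added by this patch series */
#endif

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB: contains a PMD-aligned 2 MiB range */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");	/* may fail on older kernels */
	memset(p, 0x5a, len);	/* first touch takes the anonymous fault path */
	printf("touched %zu bytes at %p\n", len, p);
	return 0;
}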
711
712int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
713 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
714 struct vm_area_struct *vma)
715{
716 struct page *src_page;
717 pmd_t pmd;
718 pgtable_t pgtable;
719 int ret;
720
721 ret = -ENOMEM;
722 pgtable = pte_alloc_one(dst_mm, addr);
723 if (unlikely(!pgtable))
724 goto out;
725
726 spin_lock(&dst_mm->page_table_lock);
727 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
728
729 ret = -EAGAIN;
730 pmd = *src_pmd;
731 if (unlikely(!pmd_trans_huge(pmd))) {
732 pte_free(dst_mm, pgtable);
733 goto out_unlock;
734 }
735 if (unlikely(pmd_trans_splitting(pmd))) {
736 /* split huge page running from under us */
737 spin_unlock(&src_mm->page_table_lock);
738 spin_unlock(&dst_mm->page_table_lock);
739 pte_free(dst_mm, pgtable);
740
741 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
742 goto out;
743 }
744 src_page = pmd_page(pmd);
745 VM_BUG_ON(!PageHead(src_page));
746 get_page(src_page);
747 page_dup_rmap(src_page);
748 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
749
750 pmdp_set_wrprotect(src_mm, addr, src_pmd);
751 pmd = pmd_mkold(pmd_wrprotect(pmd));
752 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
753 prepare_pmd_huge_pte(pgtable, dst_mm);
754
755 ret = 0;
756out_unlock:
757 spin_unlock(&src_mm->page_table_lock);
758 spin_unlock(&dst_mm->page_table_lock);
759out:
760 return ret;
761}
762
763/* no "address" argument so destroys page coloring of some arch */
764pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
765{
766 pgtable_t pgtable;
767
768 assert_spin_locked(&mm->page_table_lock);
769
770 /* FIFO */
771 pgtable = mm->pmd_huge_pte;
772 if (list_empty(&pgtable->lru))
773 mm->pmd_huge_pte = NULL;
774 else {
775 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
776 struct page, lru);
777 list_del(&pgtable->lru);
778 }
779 return pgtable;
780}
781
782static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
783 struct vm_area_struct *vma,
784 unsigned long address,
785 pmd_t *pmd, pmd_t orig_pmd,
786 struct page *page,
787 unsigned long haddr)
788{
789 pgtable_t pgtable;
790 pmd_t _pmd;
791 int ret = 0, i;
792 struct page **pages;
793
794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
795 GFP_KERNEL);
796 if (unlikely(!pages)) {
797 ret |= VM_FAULT_OOM;
798 goto out;
799 }
800
801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
803 vma, address);
804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) {
807 if (pages[i])
808 put_page(pages[i]);
809 mem_cgroup_uncharge_start();
810 while (--i >= 0) {
811 mem_cgroup_uncharge_page(pages[i]);
812 put_page(pages[i]);
813 }
814 mem_cgroup_uncharge_end();
815 kfree(pages);
816 ret |= VM_FAULT_OOM;
817 goto out;
818 }
819 }
820
821 for (i = 0; i < HPAGE_PMD_NR; i++) {
822 copy_user_highpage(pages[i], page + i,
 823 haddr + PAGE_SIZE*i, vma);
824 __SetPageUptodate(pages[i]);
825 cond_resched();
826 }
827
828 spin_lock(&mm->page_table_lock);
829 if (unlikely(!pmd_same(*pmd, orig_pmd)))
830 goto out_free_pages;
831 VM_BUG_ON(!PageHead(page));
832
833 pmdp_clear_flush_notify(vma, haddr, pmd);
834 /* leave pmd empty until pte is filled */
835
836 pgtable = get_pmd_huge_pte(mm);
837 pmd_populate(mm, &_pmd, pgtable);
838
839 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
840 pte_t *pte, entry;
841 entry = mk_pte(pages[i], vma->vm_page_prot);
842 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
843 page_add_new_anon_rmap(pages[i], vma, haddr);
844 pte = pte_offset_map(&_pmd, haddr);
845 VM_BUG_ON(!pte_none(*pte));
846 set_pte_at(mm, haddr, pte, entry);
847 pte_unmap(pte);
848 }
849 kfree(pages);
850
851 mm->nr_ptes++;
852 smp_wmb(); /* make pte visible before pmd */
853 pmd_populate(mm, pmd, pgtable);
854 page_remove_rmap(page);
855 spin_unlock(&mm->page_table_lock);
856
857 ret |= VM_FAULT_WRITE;
858 put_page(page);
859
860out:
861 return ret;
862
863out_free_pages:
864 spin_unlock(&mm->page_table_lock);
865 mem_cgroup_uncharge_start();
866 for (i = 0; i < HPAGE_PMD_NR; i++) {
867 mem_cgroup_uncharge_page(pages[i]);
868 put_page(pages[i]);
869 }
870 mem_cgroup_uncharge_end();
871 kfree(pages);
872 goto out;
873}
874
875int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
876 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
877{
878 int ret = 0;
879 struct page *page, *new_page;
880 unsigned long haddr;
881
882 VM_BUG_ON(!vma->anon_vma);
883 spin_lock(&mm->page_table_lock);
884 if (unlikely(!pmd_same(*pmd, orig_pmd)))
885 goto out_unlock;
886
887 page = pmd_page(orig_pmd);
888 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
889 haddr = address & HPAGE_PMD_MASK;
890 if (page_mapcount(page) == 1) {
891 pmd_t entry;
892 entry = pmd_mkyoung(orig_pmd);
893 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
894 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
895 update_mmu_cache(vma, address, entry);
896 ret |= VM_FAULT_WRITE;
897 goto out_unlock;
898 }
899 get_page(page);
900 spin_unlock(&mm->page_table_lock);
901
902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr);
906 else
907 new_page = NULL;
908
909 if (unlikely(!new_page)) {
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr);
912 put_page(page);
913 goto out;
914 }
915
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page);
918 put_page(page);
919 ret |= VM_FAULT_OOM;
920 goto out;
921 }
922
923 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
924 __SetPageUptodate(new_page);
925
926 spin_lock(&mm->page_table_lock);
927 put_page(page);
928 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
929 mem_cgroup_uncharge_page(new_page);
930 put_page(new_page);
931 } else {
932 pmd_t entry;
933 VM_BUG_ON(!PageHead(page));
934 entry = mk_pmd(new_page, vma->vm_page_prot);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 entry = pmd_mkhuge(entry);
937 pmdp_clear_flush_notify(vma, haddr, pmd);
938 page_add_new_anon_rmap(new_page, vma, haddr);
939 set_pmd_at(mm, haddr, pmd, entry);
940 update_mmu_cache(vma, address, entry);
941 page_remove_rmap(page);
942 put_page(page);
943 ret |= VM_FAULT_WRITE;
944 }
945out_unlock:
946 spin_unlock(&mm->page_table_lock);
947out:
948 return ret;
949}
950
951struct page *follow_trans_huge_pmd(struct mm_struct *mm,
952 unsigned long addr,
953 pmd_t *pmd,
954 unsigned int flags)
955{
956 struct page *page = NULL;
957
958 assert_spin_locked(&mm->page_table_lock);
959
960 if (flags & FOLL_WRITE && !pmd_write(*pmd))
961 goto out;
962
963 page = pmd_page(*pmd);
964 VM_BUG_ON(!PageHead(page));
965 if (flags & FOLL_TOUCH) {
966 pmd_t _pmd;
967 /*
968 * We should set the dirty bit only for FOLL_WRITE but
969 * for now the dirty bit in the pmd is meaningless.
970 * And if the dirty bit will become meaningful and
971 * we'll only set it with FOLL_WRITE, an atomic
972 * set_bit will be required on the pmd to set the
973 * young bit, instead of the current set_pmd_at.
974 */
975 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
976 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
977 }
978 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
979 VM_BUG_ON(!PageCompound(page));
980 if (flags & FOLL_GET)
981 get_page(page);
982
983out:
984 return page;
985}
986
987int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
988 pmd_t *pmd)
989{
990 int ret = 0;
991
992 spin_lock(&tlb->mm->page_table_lock);
993 if (likely(pmd_trans_huge(*pmd))) {
994 if (unlikely(pmd_trans_splitting(*pmd))) {
995 spin_unlock(&tlb->mm->page_table_lock);
996 wait_split_huge_page(vma->anon_vma,
997 pmd);
998 } else {
999 struct page *page;
1000 pgtable_t pgtable;
1001 pgtable = get_pmd_huge_pte(tlb->mm);
1002 page = pmd_page(*pmd);
1003 pmd_clear(pmd);
1004 page_remove_rmap(page);
1005 VM_BUG_ON(page_mapcount(page) < 0);
1006 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1007 VM_BUG_ON(!PageHead(page));
1008 spin_unlock(&tlb->mm->page_table_lock);
1009 tlb_remove_page(tlb, page);
1010 pte_free(tlb->mm, pgtable);
1011 ret = 1;
1012 }
1013 } else
1014 spin_unlock(&tlb->mm->page_table_lock);
1015
1016 return ret;
1017}
1018
1019int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1020 unsigned long addr, unsigned long end,
1021 unsigned char *vec)
1022{
1023 int ret = 0;
1024
1025 spin_lock(&vma->vm_mm->page_table_lock);
1026 if (likely(pmd_trans_huge(*pmd))) {
1027 ret = !pmd_trans_splitting(*pmd);
1028 spin_unlock(&vma->vm_mm->page_table_lock);
1029 if (unlikely(!ret))
1030 wait_split_huge_page(vma->anon_vma, pmd);
1031 else {
1032 /*
1033 * All logical pages in the range are present
1034 * if backed by a huge page.
1035 */
1036 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1037 }
1038 } else
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040
1041 return ret;
1042}
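
Because a huge pmd maps one aligned HPAGE_PMD_SIZE range, mincore can mark every covered byte of the output vector in a single memset, as above. A user-space model of the length calculation, assuming 4 KiB pages and one 2 MiB huge page (illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT	12

int main(void)
{
	unsigned long addr = 0x700000000000UL;		/* made-up aligned start */
	unsigned long end  = addr + (2UL << 20);	/* one 2 MiB huge page */
	unsigned char vec[512];

	memset(vec, 1, (end - addr) >> PAGE_SHIFT);	/* all 512 subpages present */
	printf("%lu pages reported present\n", (end - addr) >> PAGE_SHIFT);
	return 0;
}
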
1043
1044int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1045 unsigned long addr, pgprot_t newprot)
1046{
1047 struct mm_struct *mm = vma->vm_mm;
1048 int ret = 0;
1049
1050 spin_lock(&mm->page_table_lock);
1051 if (likely(pmd_trans_huge(*pmd))) {
1052 if (unlikely(pmd_trans_splitting(*pmd))) {
1053 spin_unlock(&mm->page_table_lock);
1054 wait_split_huge_page(vma->anon_vma, pmd);
1055 } else {
1056 pmd_t entry;
1057
1058 entry = pmdp_get_and_clear(mm, addr, pmd);
1059 entry = pmd_modify(entry, newprot);
1060 set_pmd_at(mm, addr, pmd, entry);
1061 spin_unlock(&vma->vm_mm->page_table_lock);
1062 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1063 ret = 1;
1064 }
1065 } else
1066 spin_unlock(&vma->vm_mm->page_table_lock);
1067
1068 return ret;
1069}
1070
1071pmd_t *page_check_address_pmd(struct page *page,
1072 struct mm_struct *mm,
1073 unsigned long address,
1074 enum page_check_address_pmd_flag flag)
1075{
1076 pgd_t *pgd;
1077 pud_t *pud;
1078 pmd_t *pmd, *ret = NULL;
1079
1080 if (address & ~HPAGE_PMD_MASK)
1081 goto out;
1082
1083 pgd = pgd_offset(mm, address);
1084 if (!pgd_present(*pgd))
1085 goto out;
1086
1087 pud = pud_offset(pgd, address);
1088 if (!pud_present(*pud))
1089 goto out;
1090
1091 pmd = pmd_offset(pud, address);
1092 if (pmd_none(*pmd))
1093 goto out;
1094 if (pmd_page(*pmd) != page)
1095 goto out;
1096 /*
1097 * split_vma() may create temporary aliased mappings. There is
1098 * no risk as long as all huge pmds are found and have their
1099 * splitting bit set before __split_huge_page_refcount
1100 * runs. Finding the same huge pmd more than once during the
1101 * same rmap walk is not a problem.
1102 */
1103 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1104 pmd_trans_splitting(*pmd))
1105 goto out;
1106 if (pmd_trans_huge(*pmd)) {
1107 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1108 !pmd_trans_splitting(*pmd));
1109 ret = pmd;
1110 }
1111out:
1112 return ret;
1113}
1114
1115static int __split_huge_page_splitting(struct page *page,
1116 struct vm_area_struct *vma,
1117 unsigned long address)
1118{
1119 struct mm_struct *mm = vma->vm_mm;
1120 pmd_t *pmd;
1121 int ret = 0;
1122
1123 spin_lock(&mm->page_table_lock);
1124 pmd = page_check_address_pmd(page, mm, address,
1125 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1126 if (pmd) {
1127 /*
1128 * We can't temporarily set the pmd to null in order
1129 * to split it; the pmd must remain marked huge at all
1130 * times or the VM won't take the pmd_trans_huge paths
1131 * and it won't wait on the anon_vma->root->lock to
1132 * serialize against split_huge_page*.
1133 */
1134 pmdp_splitting_flush_notify(vma, address, pmd);
1135 ret = 1;
1136 }
1137 spin_unlock(&mm->page_table_lock);
1138
1139 return ret;
1140}
1141
1142static void __split_huge_page_refcount(struct page *page)
1143{
1144 int i;
1145 unsigned long head_index = page->index;
1146 struct zone *zone = page_zone(page);
1147 int zonestat;
1148
1149 /* prevent PageLRU from going away from under us, and freeze lru stats */
1150 spin_lock_irq(&zone->lru_lock);
1151 compound_lock(page);
1152
1153 for (i = 1; i < HPAGE_PMD_NR; i++) {
1154 struct page *page_tail = page + i;
1155
1156 /* tail_page->_count cannot change */
1157 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1158 BUG_ON(page_count(page) <= 0);
1159 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1160 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1161
1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb();
1164
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1166 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) |
1169 (1L << PG_mlocked) |
1170 (1L << PG_uptodate)));
1171 page_tail->flags |= (1L << PG_dirty);
1172
1173 /*
1174 * 1) clear PageTail before overwriting first_page
1175 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1176 */
1177 smp_wmb();
1178
1179 /*
1180 * __split_huge_page_splitting() already set the
1181 * splitting bit in all pmds that could map this
1182 * hugepage; that ensures no CPU can alter the
1183 * mapcount on the head page. The mapcount is only
1184 * accounted in the head page and it has to be
1185 * transferred to all tail pages in the code below. So
1186 * for this code to be safe, the mapcount can't change
1187 * during the split. But that doesn't mean userland can't
1188 * keep changing and reading the page contents while
1189 * we transfer the mapcount, so the pmd splitting
1190 * status is achieved setting a reserved bit in the
1191 * pmd, not by clearing the present bit.
1192 */
1193 BUG_ON(page_mapcount(page_tail));
1194 page_tail->_mapcount = page->_mapcount;
1195
1196 BUG_ON(page_tail->mapping);
1197 page_tail->mapping = page->mapping;
1198
1199 page_tail->index = ++head_index;
1200
1201 BUG_ON(!PageAnon(page_tail));
1202 BUG_ON(!PageUptodate(page_tail));
1203 BUG_ON(!PageDirty(page_tail));
1204 BUG_ON(!PageSwapBacked(page_tail));
1205
1206 lru_add_page_tail(zone, page, page_tail);
1207 }
1208
1209 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1210 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1211
1212 /*
1213 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1214 * so adjust those appropriately if this page is on the LRU.
1215 */
1216 if (PageLRU(page)) {
1217 zonestat = NR_LRU_BASE + page_lru(page);
1218 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1219 }
1220
1221 ClearPageCompound(page);
1222 compound_unlock(page);
1223 spin_unlock_irq(&zone->lru_lock);
1224
1225 for (i = 1; i < HPAGE_PMD_NR; i++) {
1226 struct page *page_tail = page + i;
1227 BUG_ON(page_count(page_tail) <= 0);
1228 /*
1229 * Tail pages may be freed if there wasn't any mapping,
1230 * for example if add_to_swap() is running on a lru page that
1231 * had its mapping zapped. And freeing these pages
1232 * requires taking the lru_lock so we do the put_page
1233 * of the tail pages after the split is complete.
1234 */
1235 put_page(page_tail);
1236 }
1237
1238 /*
1239 * Only the head page (now become a regular page) is required
1240 * to be pinned by the caller.
1241 */
1242 BUG_ON(page_count(page) <= 0);
1243}
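
The atomic_sub()/atomic_add() pair above redistributes the head page's pins: speculative or gup references already sitting on a tail are subtracted from the head, and each tail then gains mapcount + 1 references (its mappings plus the reference dropped by the put_page() loop at the end). A small user-space model of that bookkeeping, with invented example counts (illustrative only, not part of the patch):

/* Illustrative sketch only -- the counts are made up. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int head_count = 10;	/* head refcount before the split (example) */
	int mapcount   = 4;	/* head mapcount, inherited by every tail */
	int tail_count = 2;	/* gup/speculative refs already on this tail */

	head_count -= tail_count;	/* mirrors the atomic_sub() above */
	tail_count += mapcount + 1;	/* mirrors the atomic_add() above */

	assert(head_count > 0 && tail_count > 0);
	printf("head=%d tail=%d\n", head_count, tail_count);
	return 0;
}
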
1244
1245static int __split_huge_page_map(struct page *page,
1246 struct vm_area_struct *vma,
1247 unsigned long address)
1248{
1249 struct mm_struct *mm = vma->vm_mm;
1250 pmd_t *pmd, _pmd;
1251 int ret = 0, i;
1252 pgtable_t pgtable;
1253 unsigned long haddr;
1254
1255 spin_lock(&mm->page_table_lock);
1256 pmd = page_check_address_pmd(page, mm, address,
1257 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1258 if (pmd) {
1259 pgtable = get_pmd_huge_pte(mm);
1260 pmd_populate(mm, &_pmd, pgtable);
1261
1262 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1263 i++, haddr += PAGE_SIZE) {
1264 pte_t *pte, entry;
1265 BUG_ON(PageCompound(page+i));
1266 entry = mk_pte(page + i, vma->vm_page_prot);
1267 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1268 if (!pmd_write(*pmd))
1269 entry = pte_wrprotect(entry);
1270 else
1271 BUG_ON(page_mapcount(page) != 1);
1272 if (!pmd_young(*pmd))
1273 entry = pte_mkold(entry);
1274 pte = pte_offset_map(&_pmd, haddr);
1275 BUG_ON(!pte_none(*pte));
1276 set_pte_at(mm, haddr, pte, entry);
1277 pte_unmap(pte);
1278 }
1279
1280 mm->nr_ptes++;
1281 smp_wmb(); /* make pte visible before pmd */
1282 /*
1283 * Up to this point the pmd is present and huge and
1284 * userland has full access to the hugepage
1285 * during the split (which happens in place). If we
1286 * overwrite the pmd with the not-huge version
1287 * pointing to the pte here (which of course we could
1288 * if all CPUs were bug free), userland could trigger
1289 * a small page size TLB miss on the small sized TLB
1290 * while the hugepage TLB entry is still established
1291 * in the huge TLB. Some CPUs don't like that. See
1292 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1293 * Erratum 383 on page 93. Intel should be safe, but it
1294 * also warns that it's only safe if the permission
1295 * and cache attributes of the two entries loaded in
1296 * the two TLBs are identical (which should be the case
1297 * here). But it is generally safer to never allow
1298 * small and huge TLB entries for the same virtual
1299 * address to be loaded simultaneously. So instead of
1300 * doing "pmd_populate(); flush_tlb_range();" we first
1301 * mark the current pmd notpresent (atomically because
1302 * here the pmd_trans_huge and pmd_trans_splitting
1303 * must remain set at all times on the pmd until the
1304 * split is complete for this pmd), then we flush the
1305 * SMP TLB and finally we write the non-huge version
1306 * of the pmd entry with pmd_populate.
1307 */
1308 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1309 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1310 pmd_populate(mm, pmd, pgtable);
1311 ret = 1;
1312 }
1313 spin_unlock(&mm->page_table_lock);
1314
1315 return ret;
1316}
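
The long comment above boils down to an ordering constraint: the huge translation must be invalidated on all CPUs before the regular pte page becomes visible, so that no CPU ever holds a huge and a small TLB entry for the same address at once. A stub sketch of that sequence (illustrative only; nothing here is kernel API):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

static void mark_pmd_notpresent(void)
{
	puts("1. pmd marked not-present, still trans_huge + trans_splitting");
}

static void flush_smp_tlb(void)
{
	puts("2. flush_tlb_range() over the 2 MiB range on all CPUs");
}

static void populate_regular_pmd(void)
{
	puts("3. pmd_populate() points the pmd at the regular pte page");
}

int main(void)
{
	mark_pmd_notpresent();
	flush_smp_tlb();
	populate_regular_pmd();
	return 0;
}
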
1317
1318/* must be called with anon_vma->root->lock held */
1319static void __split_huge_page(struct page *page,
1320 struct anon_vma *anon_vma)
1321{
1322 int mapcount, mapcount2;
1323 struct anon_vma_chain *avc;
1324
1325 BUG_ON(!PageHead(page));
1326 BUG_ON(PageTail(page));
1327
1328 mapcount = 0;
1329 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1330 struct vm_area_struct *vma = avc->vma;
1331 unsigned long addr = vma_address(page, vma);
1332 BUG_ON(is_vma_temporary_stack(vma));
1333 if (addr == -EFAULT)
1334 continue;
1335 mapcount += __split_huge_page_splitting(page, vma, addr);
1336 }
1337 /*
1338 * It is critical that new vmas are added to the tail of the
1339 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1340 * and establishes a child pmd before
1341 * __split_huge_page_splitting() freezes the parent pmd (so if
1342 * we fail to prevent copy_huge_pmd() from running until the
1343 * whole __split_huge_page() is complete), we will still see
1344 * the newly established pmd of the child later during the
1345 * walk, to be able to set it as pmd_trans_splitting too.
1346 */
1347 if (mapcount != page_mapcount(page))
1348 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1349 mapcount, page_mapcount(page));
1350 BUG_ON(mapcount != page_mapcount(page));
1351
1352 __split_huge_page_refcount(page);
1353
1354 mapcount2 = 0;
1355 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1356 struct vm_area_struct *vma = avc->vma;
1357 unsigned long addr = vma_address(page, vma);
1358 BUG_ON(is_vma_temporary_stack(vma));
1359 if (addr == -EFAULT)
1360 continue;
1361 mapcount2 += __split_huge_page_map(page, vma, addr);
1362 }
1363 if (mapcount != mapcount2)
1364 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1365 mapcount, mapcount2, page_mapcount(page));
1366 BUG_ON(mapcount != mapcount2);
1367}
1368
1369int split_huge_page(struct page *page)
1370{
1371 struct anon_vma *anon_vma;
1372 int ret = 1;
1373
1374 BUG_ON(!PageAnon(page));
1375 anon_vma = page_lock_anon_vma(page);
1376 if (!anon_vma)
1377 goto out;
1378 ret = 0;
1379 if (!PageCompound(page))
1380 goto out_unlock;
1381
1382 BUG_ON(!PageSwapBacked(page));
1383 __split_huge_page(page, anon_vma);
1384
1385 BUG_ON(PageCompound(page));
1386out_unlock:
1387 page_unlock_anon_vma(anon_vma);
1388out:
1389 return ret;
1390}
1391
1392int hugepage_madvise(struct vm_area_struct *vma,
1393 unsigned long *vm_flags, int advice)
1394{
1395 switch (advice) {
1396 case MADV_HUGEPAGE:
1397 /*
1398 * Be somewhat over-protective like KSM for now!
1399 */
1400 if (*vm_flags & (VM_HUGEPAGE |
1401 VM_SHARED | VM_MAYSHARE |
1402 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1403 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1404 VM_MIXEDMAP | VM_SAO))
1405 return -EINVAL;
1406 *vm_flags &= ~VM_NOHUGEPAGE;
1407 *vm_flags |= VM_HUGEPAGE;
1408 /*
1409 * If the vma becomes good for khugepaged to scan,
1410 * register it here without waiting for a page fault
1411 * that may not happen any time soon.
1412 */
1413 if (unlikely(khugepaged_enter_vma_merge(vma)))
1414 return -ENOMEM;
1415 break;
1416 case MADV_NOHUGEPAGE:
1417 /*
1418 * Be somewhat over-protective like KSM for now!
1419 */
1420 if (*vm_flags & (VM_NOHUGEPAGE |
1421 VM_SHARED | VM_MAYSHARE |
1422 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1423 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1424 VM_MIXEDMAP | VM_SAO))
1425 return -EINVAL;
1426 *vm_flags &= ~VM_HUGEPAGE;
1427 *vm_flags |= VM_NOHUGEPAGE;
1428 /*
1429 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1430 * this vma even if we leave the mm registered in khugepaged if
1431 * it got registered before VM_NOHUGEPAGE was set.
1432 */
1433 break;
1434 }
1435
1436 return 0;
1437}
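
From user space the two new advice values are used like any other madvise() hint. A hedged usage sketch follows; the fallback #defines are an assumption in case the libc headers predate this patch (14/15 match asm-generic/mman-common.h):

/* Illustrative sketch only -- not part of the patch. */
#include <sys/mman.h>
#include <stdio.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* assumed fallback, see note above */
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15	/* assumed fallback, see note above */
#endif

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MiB anonymous region */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* With the sysfs default set to "madvise", only regions flagged
	 * this way are eligible for huge pages and khugepaged. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	munmap(p, len);
	return 0;
}
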
1438
1439static int __init khugepaged_slab_init(void)
1440{
1441 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1442 sizeof(struct mm_slot),
1443 __alignof__(struct mm_slot), 0, NULL);
1444 if (!mm_slot_cache)
1445 return -ENOMEM;
1446
1447 return 0;
1448}
1449
1450static void __init khugepaged_slab_free(void)
1451{
1452 kmem_cache_destroy(mm_slot_cache);
1453 mm_slot_cache = NULL;
1454}
1455
1456static inline struct mm_slot *alloc_mm_slot(void)
1457{
1458 if (!mm_slot_cache) /* initialization failed */
1459 return NULL;
1460 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1461}
1462
1463static inline void free_mm_slot(struct mm_slot *mm_slot)
1464{
1465 kmem_cache_free(mm_slot_cache, mm_slot);
1466}
1467
1468static int __init mm_slots_hash_init(void)
1469{
1470 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1471 GFP_KERNEL);
1472 if (!mm_slots_hash)
1473 return -ENOMEM;
1474 return 0;
1475}
1476
1477#if 0
1478static void __init mm_slots_hash_free(void)
1479{
1480 kfree(mm_slots_hash);
1481 mm_slots_hash = NULL;
1482}
1483#endif
1484
1485static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1486{
1487 struct mm_slot *mm_slot;
1488 struct hlist_head *bucket;
1489 struct hlist_node *node;
1490
1491 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1492 % MM_SLOTS_HASH_HEADS];
1493 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1494 if (mm == mm_slot->mm)
1495 return mm_slot;
1496 }
1497 return NULL;
1498}
1499
1500static void insert_to_mm_slots_hash(struct mm_struct *mm,
1501 struct mm_slot *mm_slot)
1502{
1503 struct hlist_head *bucket;
1504
1505 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506 % MM_SLOTS_HASH_HEADS];
1507 mm_slot->mm = mm;
1508 hlist_add_head(&mm_slot->hash, bucket);
1509}
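
The bucket index above divides the mm pointer by sizeof(struct mm_struct) before taking the modulus, which strips the slab-alignment bits that would otherwise leave most buckets unused. A user-space model with made-up values (MM_SLOTS_HASH_HEADS and the structure size are assumptions for the example):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define MM_SLOTS_HASH_HEADS	1024	/* assumed for the example */
#define SIZEOF_MM_STRUCT	896	/* made-up sizeof(struct mm_struct) */

int main(void)
{
	unsigned long mm = 0xffff880123456000UL;	/* made-up slab pointer */
	unsigned long bucket =
		(mm / SIZEOF_MM_STRUCT) % MM_SLOTS_HASH_HEADS;

	printf("mm %#lx hashes to bucket %lu\n", mm, bucket);
	return 0;
}
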
1510
1511static inline int khugepaged_test_exit(struct mm_struct *mm)
1512{
1513 return atomic_read(&mm->mm_users) == 0;
1514}
1515
1516int __khugepaged_enter(struct mm_struct *mm)
1517{
1518 struct mm_slot *mm_slot;
1519 int wakeup;
1520
1521 mm_slot = alloc_mm_slot();
1522 if (!mm_slot)
1523 return -ENOMEM;
1524
1525 /* __khugepaged_exit() must not run from under us */
1526 VM_BUG_ON(khugepaged_test_exit(mm));
1527 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1528 free_mm_slot(mm_slot);
1529 return 0;
1530 }
1531
1532 spin_lock(&khugepaged_mm_lock);
1533 insert_to_mm_slots_hash(mm, mm_slot);
1534 /*
1535 * Insert just behind the scanning cursor, to let the area settle
1536 * down a little.
1537 */
1538 wakeup = list_empty(&khugepaged_scan.mm_head);
1539 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1540 spin_unlock(&khugepaged_mm_lock);
1541
1542 atomic_inc(&mm->mm_count);
1543 if (wakeup)
1544 wake_up_interruptible(&khugepaged_wait);
1545
1546 return 0;
1547}
1548
1549int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1550{
1551 unsigned long hstart, hend;
1552 if (!vma->anon_vma)
1553 /*
1554 * Not yet faulted in so we will register later in the
1555 * page fault if needed.
1556 */
1557 return 0;
1558 if (vma->vm_file || vma->vm_ops)
1559 /* khugepaged not yet working on file or special mappings */
1560 return 0;
1561 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1562 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1563 hend = vma->vm_end & HPAGE_PMD_MASK;
1564 if (hstart < hend)
1565 return khugepaged_enter(vma);
1566 return 0;
1567}
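
The hstart/hend computation rounds the vma start up and the end down to HPAGE_PMD_SIZE boundaries; khugepaged registers the vma only if at least one aligned range fits entirely inside it. A user-space check of that rounding, with a made-up vma and 2 MiB huge pages assumed (illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x600000123000UL;	/* made-up, unaligned */
	unsigned long vm_end   = vm_start + (8UL << 20);

	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend   = vm_end & HPAGE_PMD_MASK;

	printf("hstart=%#lx hend=%#lx eligible=%d\n",
	       hstart, hend, hstart < hend);
	return 0;
}
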
1568
1569void __khugepaged_exit(struct mm_struct *mm)
1570{
1571 struct mm_slot *mm_slot;
1572 int free = 0;
1573
1574 spin_lock(&khugepaged_mm_lock);
1575 mm_slot = get_mm_slot(mm);
1576 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1577 hlist_del(&mm_slot->hash);
1578 list_del(&mm_slot->mm_node);
1579 free = 1;
1580 }
1581
1582 if (free) {
1583 spin_unlock(&khugepaged_mm_lock);
1584 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1585 free_mm_slot(mm_slot);
1586 mmdrop(mm);
1587 } else if (mm_slot) {
1588 spin_unlock(&khugepaged_mm_lock);
1589 /*
1590 * This is required to serialize against
1591 * khugepaged_test_exit() (which is guaranteed to run
1592 * under mmap_sem read mode). Stop here (once we
1593 * return, all pagetables will be destroyed) until
1594 * khugepaged has finished working on the pagetables
1595 * under the mmap_sem.
1596 */
1597 down_write(&mm->mmap_sem);
1598 up_write(&mm->mmap_sem);
1599 } else
1600 spin_unlock(&khugepaged_mm_lock);
1601}
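
The otherwise empty down_write()/up_write() pair above is a drain barrier: it cannot complete until every khugepaged walker that took mmap_sem for reading has let go. A rough user-space analogue using a pthread rwlock (an analogy for illustration, not the kernel primitive):

/* Illustrative sketch only -- build with -pthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static void *scanner(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&mmap_sem);	/* khugepaged walking the mm */
	/* ... scan page tables ... */
	pthread_rwlock_unlock(&mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, scanner, NULL);

	/* Taking and dropping the lock for writing guarantees any reader
	 * that entered before us has finished. */
	pthread_rwlock_wrlock(&mmap_sem);
	pthread_rwlock_unlock(&mmap_sem);
	puts("no scanner can still be inside the page tables");

	pthread_join(t, NULL);
	return 0;
}
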
1602
1603static void release_pte_page(struct page *page)
1604{
1605 /* 0 stands for page_is_file_cache(page) == false */
1606 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1607 unlock_page(page);
1608 putback_lru_page(page);
1609}
1610
1611static void release_pte_pages(pte_t *pte, pte_t *_pte)
1612{
1613 while (--_pte >= pte) {
1614 pte_t pteval = *_pte;
1615 if (!pte_none(pteval))
1616 release_pte_page(pte_page(pteval));
1617 }
1618}
1619
1620static void release_all_pte_pages(pte_t *pte)
1621{
1622 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1623}
1624
1625static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1626 unsigned long address,
1627 pte_t *pte)
1628{
1629 struct page *page;
1630 pte_t *_pte;
1631 int referenced = 0, isolated = 0, none = 0;
1632 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1633 _pte++, address += PAGE_SIZE) {
1634 pte_t pteval = *_pte;
1635 if (pte_none(pteval)) {
1636 if (++none <= khugepaged_max_ptes_none)
1637 continue;
1638 else {
1639 release_pte_pages(pte, _pte);
1640 goto out;
1641 }
1642 }
1643 if (!pte_present(pteval) || !pte_write(pteval)) {
1644 release_pte_pages(pte, _pte);
1645 goto out;
1646 }
1647 page = vm_normal_page(vma, address, pteval);
1648 if (unlikely(!page)) {
1649 release_pte_pages(pte, _pte);
1650 goto out;
1651 }
1652 VM_BUG_ON(PageCompound(page));
1653 BUG_ON(!PageAnon(page));
1654 VM_BUG_ON(!PageSwapBacked(page));
1655
1656 /* cannot use mapcount: can't collapse if there's a gup pin */
1657 if (page_count(page) != 1) {
1658 release_pte_pages(pte, _pte);
1659 goto out;
1660 }
1661 /*
1662 * We can do it before isolate_lru_page because the
1663 * page can't be freed from under us. NOTE: PG_lock
1664 * is needed to serialize against split_huge_page
1665 * when invoked from the VM.
1666 */
1667 if (!trylock_page(page)) {
1668 release_pte_pages(pte, _pte);
1669 goto out;
1670 }
1671 /*
1672 * Isolate the page to avoid collapsing an hugepage
1673 * currently in use by the VM.
1674 */
1675 if (isolate_lru_page(page)) {
1676 unlock_page(page);
1677 release_pte_pages(pte, _pte);
1678 goto out;
1679 }
1680 /* 0 stands for page_is_file_cache(page) == false */
1681 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1682 VM_BUG_ON(!PageLocked(page));
1683 VM_BUG_ON(PageLRU(page));
1684
1685 /* If no mapped pte is young, don't collapse the page */
1686 if (pte_young(pteval) || PageReferenced(page) ||
1687 mmu_notifier_test_young(vma->vm_mm, address))
1688 referenced = 1;
1689 }
1690 if (unlikely(!referenced))
1691 release_all_pte_pages(pte);
1692 else
1693 isolated = 1;
1694out:
1695 return isolated;
1696}
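
Whether the isolate pass gives up depends on how many empty ptes it has tolerated so far versus khugepaged_max_ptes_none. A user-space model of that cutoff (HPAGE_PMD_NR - 1 is the assumed default limit; illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define HPAGE_PMD_NR	512

int main(void)
{
	int max_ptes_none = HPAGE_PMD_NR - 1;	/* assumed default limit */
	int mapped = 1;				/* ptes actually populated */
	int none = HPAGE_PMD_NR - mapped;

	/* With the assumed default, a single populated pte already makes
	 * the 2 MiB range a collapse candidate. */
	printf("collapse allowed: %d\n", none <= max_ptes_none);
	return 0;
}
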
1697
1698static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1699 struct vm_area_struct *vma,
1700 unsigned long address,
1701 spinlock_t *ptl)
1702{
1703 pte_t *_pte;
1704 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1705 pte_t pteval = *_pte;
1706 struct page *src_page;
1707
1708 if (pte_none(pteval)) {
1709 clear_user_highpage(page, address);
1710 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1711 } else {
1712 src_page = pte_page(pteval);
1713 copy_user_highpage(page, src_page, address, vma);
1714 VM_BUG_ON(page_mapcount(src_page) != 1);
1715 VM_BUG_ON(page_count(src_page) != 2);
1716 release_pte_page(src_page);
1717 /*
1718 * ptl mostly unnecessary, but preempt has to
1719 * be disabled to update the per-cpu stats
1720 * inside page_remove_rmap().
1721 */
1722 spin_lock(ptl);
1723 /*
1724 * paravirt calls inside pte_clear here are
1725 * superfluous.
1726 */
1727 pte_clear(vma->vm_mm, address, _pte);
1728 page_remove_rmap(src_page);
1729 spin_unlock(ptl);
1730 free_page_and_swap_cache(src_page);
1731 }
1732
1733 address += PAGE_SIZE;
1734 page++;
1735 }
1736}
1737
1738static void collapse_huge_page(struct mm_struct *mm,
1739 unsigned long address,
1740 struct page **hpage,
1741 struct vm_area_struct *vma)
1742{
1743 pgd_t *pgd;
1744 pud_t *pud;
1745 pmd_t *pmd, _pmd;
1746 pte_t *pte;
1747 pgtable_t pgtable;
1748 struct page *new_page;
1749 spinlock_t *ptl;
1750 int isolated;
1751 unsigned long hstart, hend;
1752
1753 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1754#ifndef CONFIG_NUMA
1755 VM_BUG_ON(!*hpage);
1756 new_page = *hpage;
1757#else
1758 VM_BUG_ON(*hpage);
1759 /*
1760 * Allocate the page while the vma is still valid and under
1761 * the mmap_sem read mode so there is no memory allocation
1762 * later when we take the mmap_sem in write mode. This is more
1763 * friendly behavior (OTOH it may actually hide bugs) towards
1764 * filesystems in userland with daemons allocating memory in
1765 * the userland I/O paths. Allocating memory with the
1766 * mmap_sem in read mode is also a good idea to allow greater
1767 * scalability.
1768 */
1769 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
1770 if (unlikely(!new_page)) {
1771 up_read(&mm->mmap_sem);
1772 *hpage = ERR_PTR(-ENOMEM);
1773 return;
1774 }
1775#endif
1776 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1777 up_read(&mm->mmap_sem);
1778 put_page(new_page);
1779 return;
1780 }
1781
1782 /* after allocating the hugepage upgrade to mmap_sem write mode */
1783 up_read(&mm->mmap_sem);
1784
1785 /*
1786 * Prevent all access to pagetables with the exception of
1787 * gup_fast later handled by the ptep_clear_flush and the VM
1788 * handled by the anon_vma lock + PG_lock.
1789 */
1790 down_write(&mm->mmap_sem);
1791 if (unlikely(khugepaged_test_exit(mm)))
1792 goto out;
1793
1794 vma = find_vma(mm, address);
1795 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1796 hend = vma->vm_end & HPAGE_PMD_MASK;
1797 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1798 goto out;
1799
1800 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1801 (vma->vm_flags & VM_NOHUGEPAGE))
1802 goto out;
1803
1804 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1805 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1806 goto out;
1807 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1808
1809 pgd = pgd_offset(mm, address);
1810 if (!pgd_present(*pgd))
1811 goto out;
1812
1813 pud = pud_offset(pgd, address);
1814 if (!pud_present(*pud))
1815 goto out;
1816
1817 pmd = pmd_offset(pud, address);
1818 /* pmd can't go away or become huge under us */
1819 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1820 goto out;
1821
1822 anon_vma_lock(vma->anon_vma);
1823
1824 pte = pte_offset_map(pmd, address);
1825 ptl = pte_lockptr(mm, pmd);
1826
1827 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1828 /*
1829 * After this gup_fast can't run anymore. This also removes
1830 * any huge TLB entry from the CPU so we won't allow
1831 * huge and small TLB entries for the same virtual address
1832 * to avoid the risk of CPU bugs in that area.
1833 */
1834 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1835 spin_unlock(&mm->page_table_lock);
1836
1837 spin_lock(ptl);
1838 isolated = __collapse_huge_page_isolate(vma, address, pte);
1839 spin_unlock(ptl);
1840 pte_unmap(pte);
1841
1842 if (unlikely(!isolated)) {
1843 spin_lock(&mm->page_table_lock);
1844 BUG_ON(!pmd_none(*pmd));
1845 set_pmd_at(mm, address, pmd, _pmd);
1846 spin_unlock(&mm->page_table_lock);
1847 anon_vma_unlock(vma->anon_vma);
1848 mem_cgroup_uncharge_page(new_page);
1849 goto out;
1850 }
1851
1852 /*
1853 * All pages are isolated and locked so anon_vma rmap
1854 * can't run anymore.
1855 */
1856 anon_vma_unlock(vma->anon_vma);
1857
1858 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1859 __SetPageUptodate(new_page);
1860 pgtable = pmd_pgtable(_pmd);
1861 VM_BUG_ON(page_count(pgtable) != 1);
1862 VM_BUG_ON(page_mapcount(pgtable) != 0);
1863
1864 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1865 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1866 _pmd = pmd_mkhuge(_pmd);
1867
1868 /*
1869 * spin_lock() below is not the equivalent of smp_wmb(), so
1870 * this is needed to keep the copy_huge_page writes from becoming
1871 * visible after the set_pmd_at() write.
1872 */
1873 smp_wmb();
1874
1875 spin_lock(&mm->page_table_lock);
1876 BUG_ON(!pmd_none(*pmd));
1877 page_add_new_anon_rmap(new_page, vma, address);
1878 set_pmd_at(mm, address, pmd, _pmd);
1879 update_mmu_cache(vma, address, entry);
1880 prepare_pmd_huge_pte(pgtable, mm);
1881 mm->nr_ptes--;
1882 spin_unlock(&mm->page_table_lock);
1883
1884#ifndef CONFIG_NUMA
1885 *hpage = NULL;
1886#endif
1887 khugepaged_pages_collapsed++;
1888out_up_write:
1889 up_write(&mm->mmap_sem);
1890 return;
1891
1892out:
1893#ifdef CONFIG_NUMA
1894 put_page(new_page);
1895#endif
1896 goto out_up_write;
1897}
1898
1899static int khugepaged_scan_pmd(struct mm_struct *mm,
1900 struct vm_area_struct *vma,
1901 unsigned long address,
1902 struct page **hpage)
1903{
1904 pgd_t *pgd;
1905 pud_t *pud;
1906 pmd_t *pmd;
1907 pte_t *pte, *_pte;
1908 int ret = 0, referenced = 0, none = 0;
1909 struct page *page;
1910 unsigned long _address;
1911 spinlock_t *ptl;
1912
1913 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1914
1915 pgd = pgd_offset(mm, address);
1916 if (!pgd_present(*pgd))
1917 goto out;
1918
1919 pud = pud_offset(pgd, address);
1920 if (!pud_present(*pud))
1921 goto out;
1922
1923 pmd = pmd_offset(pud, address);
1924 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1925 goto out;
1926
1927 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1928 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1929 _pte++, _address += PAGE_SIZE) {
1930 pte_t pteval = *_pte;
1931 if (pte_none(pteval)) {
1932 if (++none <= khugepaged_max_ptes_none)
1933 continue;
1934 else
1935 goto out_unmap;
1936 }
1937 if (!pte_present(pteval) || !pte_write(pteval))
1938 goto out_unmap;
1939 page = vm_normal_page(vma, _address, pteval);
1940 if (unlikely(!page))
1941 goto out_unmap;
1942 VM_BUG_ON(PageCompound(page));
1943 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1944 goto out_unmap;
1945 /* cannot use mapcount: can't collapse if there's a gup pin */
1946 if (page_count(page) != 1)
1947 goto out_unmap;
1948 if (pte_young(pteval) || PageReferenced(page) ||
1949 mmu_notifier_test_young(vma->vm_mm, address))
1950 referenced = 1;
1951 }
1952 if (referenced)
1953 ret = 1;
1954out_unmap:
1955 pte_unmap_unlock(pte, ptl);
1956 if (ret)
1957 /* collapse_huge_page will return with the mmap_sem released */
1958 collapse_huge_page(mm, address, hpage, vma);
1959out:
1960 return ret;
1961}
1962
1963static void collect_mm_slot(struct mm_slot *mm_slot)
1964{
1965 struct mm_struct *mm = mm_slot->mm;
1966
1967 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1968
1969 if (khugepaged_test_exit(mm)) {
1970 /* free mm_slot */
1971 hlist_del(&mm_slot->hash);
1972 list_del(&mm_slot->mm_node);
1973
1974 /*
1975 * Not strictly needed because the mm exited already.
1976 *
1977 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1978 */
1979
1980 /* khugepaged_mm_lock actually not necessary for the below */
1981 free_mm_slot(mm_slot);
1982 mmdrop(mm);
1983 }
1984}
1985
1986static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1987 struct page **hpage)
1988{
1989 struct mm_slot *mm_slot;
1990 struct mm_struct *mm;
1991 struct vm_area_struct *vma;
1992 int progress = 0;
1993
1994 VM_BUG_ON(!pages);
1995 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1996
1997 if (khugepaged_scan.mm_slot)
1998 mm_slot = khugepaged_scan.mm_slot;
1999 else {
2000 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2001 struct mm_slot, mm_node);
2002 khugepaged_scan.address = 0;
2003 khugepaged_scan.mm_slot = mm_slot;
2004 }
2005 spin_unlock(&khugepaged_mm_lock);
2006
2007 mm = mm_slot->mm;
2008 down_read(&mm->mmap_sem);
2009 if (unlikely(khugepaged_test_exit(mm)))
2010 vma = NULL;
2011 else
2012 vma = find_vma(mm, khugepaged_scan.address);
2013
2014 progress++;
2015 for (; vma; vma = vma->vm_next) {
2016 unsigned long hstart, hend;
2017
2018 cond_resched();
2019 if (unlikely(khugepaged_test_exit(mm))) {
2020 progress++;
2021 break;
2022 }
2023
2024 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2025 !khugepaged_always()) ||
2026 (vma->vm_flags & VM_NOHUGEPAGE)) {
2027 progress++;
2028 continue;
2029 }
2030
2031 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2032 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
2033 khugepaged_scan.address = vma->vm_end;
2034 progress++;
2035 continue;
2036 }
2037 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2038
2039 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2040 hend = vma->vm_end & HPAGE_PMD_MASK;
2041 if (hstart >= hend) {
2042 progress++;
2043 continue;
2044 }
2045 if (khugepaged_scan.address < hstart)
2046 khugepaged_scan.address = hstart;
2047 if (khugepaged_scan.address > hend) {
2048 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2049 progress++;
2050 continue;
2051 }
2052 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2053
2054 while (khugepaged_scan.address < hend) {
2055 int ret;
2056 cond_resched();
2057 if (unlikely(khugepaged_test_exit(mm)))
2058 goto breakouterloop;
2059
2060 VM_BUG_ON(khugepaged_scan.address < hstart ||
2061 khugepaged_scan.address + HPAGE_PMD_SIZE >
2062 hend);
2063 ret = khugepaged_scan_pmd(mm, vma,
2064 khugepaged_scan.address,
2065 hpage);
2066 /* move to next address */
2067 khugepaged_scan.address += HPAGE_PMD_SIZE;
2068 progress += HPAGE_PMD_NR;
2069 if (ret)
2070 /* we released mmap_sem so break loop */
2071 goto breakouterloop_mmap_sem;
2072 if (progress >= pages)
2073 goto breakouterloop;
2074 }
2075 }
2076breakouterloop:
2077 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2078breakouterloop_mmap_sem:
2079
2080 spin_lock(&khugepaged_mm_lock);
2081 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2082 /*
2083 * Release the current mm_slot if this mm is about to die, or
2084 * if we scanned all vmas of this mm.
2085 */
2086 if (khugepaged_test_exit(mm) || !vma) {
2087 /*
2088 * Make sure that if mm_users is reaching zero while
2089 * khugepaged runs here, khugepaged_exit will find
2090 * mm_slot not pointing to the exiting mm.
2091 */
2092 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2093 khugepaged_scan.mm_slot = list_entry(
2094 mm_slot->mm_node.next,
2095 struct mm_slot, mm_node);
2096 khugepaged_scan.address = 0;
2097 } else {
2098 khugepaged_scan.mm_slot = NULL;
2099 khugepaged_full_scans++;
2100 }
2101
2102 collect_mm_slot(mm_slot);
2103 }
2104
2105 return progress;
2106}
2107
2108static int khugepaged_has_work(void)
2109{
2110 return !list_empty(&khugepaged_scan.mm_head) &&
2111 khugepaged_enabled();
2112}
2113
2114static int khugepaged_wait_event(void)
2115{
2116 return !list_empty(&khugepaged_scan.mm_head) ||
2117 !khugepaged_enabled();
2118}
2119
2120static void khugepaged_do_scan(struct page **hpage)
2121{
2122 unsigned int progress = 0, pass_through_head = 0;
2123 unsigned int pages = khugepaged_pages_to_scan;
2124
2125 barrier(); /* write khugepaged_pages_to_scan to local stack */
2126
2127 while (progress < pages) {
2128 cond_resched();
2129
2130#ifndef CONFIG_NUMA
2131 if (!*hpage) {
2132 *hpage = alloc_hugepage(khugepaged_defrag());
2133 if (unlikely(!*hpage))
2134 break;
2135 }
2136#else
2137 if (IS_ERR(*hpage))
2138 break;
2139#endif
2140
2141 if (unlikely(kthread_should_stop() || freezing(current)))
2142 break;
2143
2144 spin_lock(&khugepaged_mm_lock);
2145 if (!khugepaged_scan.mm_slot)
2146 pass_through_head++;
2147 if (khugepaged_has_work() &&
2148 pass_through_head < 2)
2149 progress += khugepaged_scan_mm_slot(pages - progress,
2150 hpage);
2151 else
2152 progress = pages;
2153 spin_unlock(&khugepaged_mm_lock);
2154 }
2155}
2156
2157static void khugepaged_alloc_sleep(void)
2158{
2159 DEFINE_WAIT(wait);
2160 add_wait_queue(&khugepaged_wait, &wait);
2161 schedule_timeout_interruptible(
2162 msecs_to_jiffies(
2163 khugepaged_alloc_sleep_millisecs));
2164 remove_wait_queue(&khugepaged_wait, &wait);
2165}
2166
2167#ifndef CONFIG_NUMA
2168static struct page *khugepaged_alloc_hugepage(void)
2169{
2170 struct page *hpage;
2171
2172 do {
2173 hpage = alloc_hugepage(khugepaged_defrag());
2174 if (!hpage)
2175 khugepaged_alloc_sleep();
2176 } while (unlikely(!hpage) &&
2177 likely(khugepaged_enabled()));
2178 return hpage;
2179}
2180#endif
2181
2182static void khugepaged_loop(void)
2183{
2184 struct page *hpage;
2185
2186#ifdef CONFIG_NUMA
2187 hpage = NULL;
2188#endif
2189 while (likely(khugepaged_enabled())) {
2190#ifndef CONFIG_NUMA
2191 hpage = khugepaged_alloc_hugepage();
2192 if (unlikely(!hpage))
2193 break;
2194#else
2195 if (IS_ERR(hpage)) {
2196 khugepaged_alloc_sleep();
2197 hpage = NULL;
2198 }
2199#endif
2200
2201 khugepaged_do_scan(&hpage);
2202#ifndef CONFIG_NUMA
2203 if (hpage)
2204 put_page(hpage);
2205#endif
2206 try_to_freeze();
2207 if (unlikely(kthread_should_stop()))
2208 break;
2209 if (khugepaged_has_work()) {
2210 DEFINE_WAIT(wait);
2211 if (!khugepaged_scan_sleep_millisecs)
2212 continue;
2213 add_wait_queue(&khugepaged_wait, &wait);
2214 schedule_timeout_interruptible(
2215 msecs_to_jiffies(
2216 khugepaged_scan_sleep_millisecs));
2217 remove_wait_queue(&khugepaged_wait, &wait);
2218 } else if (khugepaged_enabled())
2219 wait_event_freezable(khugepaged_wait,
2220 khugepaged_wait_event());
2221 }
2222}
2223
2224static int khugepaged(void *none)
2225{
2226 struct mm_slot *mm_slot;
2227
2228 set_freezable();
2229 set_user_nice(current, 19);
2230
2231 /* serialize with start_khugepaged() */
2232 mutex_lock(&khugepaged_mutex);
2233
2234 for (;;) {
2235 mutex_unlock(&khugepaged_mutex);
2236 BUG_ON(khugepaged_thread != current);
2237 khugepaged_loop();
2238 BUG_ON(khugepaged_thread != current);
2239
2240 mutex_lock(&khugepaged_mutex);
2241 if (!khugepaged_enabled())
2242 break;
2243 if (unlikely(kthread_should_stop()))
2244 break;
2245 }
2246
2247 spin_lock(&khugepaged_mm_lock);
2248 mm_slot = khugepaged_scan.mm_slot;
2249 khugepaged_scan.mm_slot = NULL;
2250 if (mm_slot)
2251 collect_mm_slot(mm_slot);
2252 spin_unlock(&khugepaged_mm_lock);
2253
2254 khugepaged_thread = NULL;
2255 mutex_unlock(&khugepaged_mutex);
2256
2257 return 0;
2258}
2259
2260void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2261{
2262 struct page *page;
2263
2264 spin_lock(&mm->page_table_lock);
2265 if (unlikely(!pmd_trans_huge(*pmd))) {
2266 spin_unlock(&mm->page_table_lock);
2267 return;
2268 }
2269 page = pmd_page(*pmd);
2270 VM_BUG_ON(!page_count(page));
2271 get_page(page);
2272 spin_unlock(&mm->page_table_lock);
2273
2274 split_huge_page(page);
2275
2276 put_page(page);
2277 BUG_ON(pmd_trans_huge(*pmd));
2278}
2279
2280static void split_huge_page_address(struct mm_struct *mm,
2281 unsigned long address)
2282{
2283 pgd_t *pgd;
2284 pud_t *pud;
2285 pmd_t *pmd;
2286
2287 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2288
2289 pgd = pgd_offset(mm, address);
2290 if (!pgd_present(*pgd))
2291 return;
2292
2293 pud = pud_offset(pgd, address);
2294 if (!pud_present(*pud))
2295 return;
2296
2297 pmd = pmd_offset(pud, address);
2298 if (!pmd_present(*pmd))
2299 return;
2300 /*
2301 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2302 * materialize from under us.
2303 */
2304 split_huge_page_pmd(mm, pmd);
2305}
2306
2307void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2308 unsigned long start,
2309 unsigned long end,
2310 long adjust_next)
2311{
2312 /*
2313 * If the new start address isn't hpage aligned and it could
2314 * previously contain an hugepage: check if we need to split
2315 * an huge pmd.
2316 */
2317 if (start & ~HPAGE_PMD_MASK &&
2318 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2319 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2320 split_huge_page_address(vma->vm_mm, start);
2321
2322 /*
2323 * If the new end address isn't hpage aligned and it could
2324 * previously contain an hugepage: check if we need to split
2325 * an huge pmd.
2326 */
2327 if (end & ~HPAGE_PMD_MASK &&
2328 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2329 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2330 split_huge_page_address(vma->vm_mm, end);
2331
2332 /*
2333 * If we're also updating the vma->vm_next->vm_start, if the new
2334 * vm_next->vm_start isn't page aligned and it could previously
2335 * contain an hugepage: check if we need to split an huge pmd.
2336 */
2337 if (adjust_next > 0) {
2338 struct vm_area_struct *next = vma->vm_next;
2339 unsigned long nstart = next->vm_start;
2340 nstart += adjust_next << PAGE_SHIFT;
2341 if (nstart & ~HPAGE_PMD_MASK &&
2342 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2343 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2344 split_huge_page_address(next->vm_mm, nstart);
2345 }
2346}
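
Each of the three checks above asks the same question: does the new boundary fall strictly inside a huge-page-aligned range that the vma fully covers? A user-space version of the test for a made-up layout, assuming 2 MiB huge pages (illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x600000000000UL;	/* made-up vma */
	unsigned long vm_end   = vm_start + (4UL << 20);
	unsigned long start    = vm_start + 0x123000;	/* new, unaligned start */

	int needs_split = (start & ~HPAGE_PMD_MASK) &&
			  (start & HPAGE_PMD_MASK) >= vm_start &&
			  (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;

	printf("split required: %d\n", needs_split);	/* 1 for this layout */
	return 0;
}
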
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85855240933d..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page,
398 unsigned long addr, unsigned long sz)
399{
400 int i;
401 struct page *p = page;
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst;
432 struct page *src_base = src;
433
434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437
438 i++;
439 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i);
441 }
442}
443
444static void copy_user_huge_page(struct page *dst, struct page *src,
445 unsigned long addr, struct vm_area_struct *vma)
446{
447 int i;
448 struct hstate *h = hstate_vma(vma);
449
450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
451 copy_user_gigantic_page(dst, src, addr, vma);
452 return;
453 }
454
455 might_sleep();
456 for (i = 0; i < pages_per_huge_page(h); i++) {
457 cond_resched();
458 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
459 }
460}
461
462static void copy_gigantic_page(struct page *dst, struct page *src) 397static void copy_gigantic_page(struct page *dst, struct page *src)
463{ 398{
464 int i; 399 int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1428 1363
1429 return sprintf(buf, "%lu\n", nr_huge_pages); 1364 return sprintf(buf, "%lu\n", nr_huge_pages);
1430} 1365}
1366
1431static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1367static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1432 struct kobject *kobj, struct kobj_attribute *attr, 1368 struct kobject *kobj, struct kobj_attribute *attr,
1433 const char *buf, size_t len) 1369 const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1440 1376
1441 err = strict_strtoul(buf, 10, &count); 1377 err = strict_strtoul(buf, 10, &count);
1442 if (err) 1378 if (err)
1443 return 0; 1379 goto out;
1444 1380
1445 h = kobj_to_hstate(kobj, &nid); 1381 h = kobj_to_hstate(kobj, &nid);
1382 if (h->order >= MAX_ORDER) {
1383 err = -EINVAL;
1384 goto out;
1385 }
1386
1446 if (nid == NUMA_NO_NODE) { 1387 if (nid == NUMA_NO_NODE) {
1447 /* 1388 /*
1448 * global hstate attribute 1389 * global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1468 NODEMASK_FREE(nodes_allowed); 1409 NODEMASK_FREE(nodes_allowed);
1469 1410
1470 return len; 1411 return len;
1412out:
1413 NODEMASK_FREE(nodes_allowed);
1414 return err;
1471} 1415}
1472 1416
1473static ssize_t nr_hugepages_show(struct kobject *kobj, 1417static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1510 struct hstate *h = kobj_to_hstate(kobj, NULL); 1454 struct hstate *h = kobj_to_hstate(kobj, NULL);
1511 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1455 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1512} 1456}
1457
1513static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1514 struct kobj_attribute *attr, const char *buf, size_t count) 1459 struct kobj_attribute *attr, const char *buf, size_t count)
1515{ 1460{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1517 unsigned long input; 1462 unsigned long input;
1518 struct hstate *h = kobj_to_hstate(kobj, NULL); 1463 struct hstate *h = kobj_to_hstate(kobj, NULL);
1519 1464
1465 if (h->order >= MAX_ORDER)
1466 return -EINVAL;
1467
1520 err = strict_strtoul(buf, 10, &input); 1468 err = strict_strtoul(buf, 10, &input);
1521 if (err) 1469 if (err)
1522 return 0; 1470 return err;
1523 1471
1524 spin_lock(&hugetlb_lock); 1472 spin_lock(&hugetlb_lock);
1525 h->nr_overcommit_huge_pages = input; 1473 h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1922{ 1870{
1923 struct hstate *h = &default_hstate; 1871 struct hstate *h = &default_hstate;
1924 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret;
1925 1874
1926 if (!write) 1875 if (!write)
1927 tmp = h->max_huge_pages; 1876 tmp = h->max_huge_pages;
1928 1877
1878 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL;
1880
1929 table->data = &tmp; 1881 table->data = &tmp;
1930 table->maxlen = sizeof(unsigned long); 1882 table->maxlen = sizeof(unsigned long);
1931 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1883 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1884 if (ret)
1885 goto out;
1932 1886
1933 if (write) { 1887 if (write) {
1934 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1888 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1943 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1897 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1944 NODEMASK_FREE(nodes_allowed); 1898 NODEMASK_FREE(nodes_allowed);
1945 } 1899 }
1946 1900out:
1947 return 0; 1901 return ret;
1948} 1902}
1949 1903
1950int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1904int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1982{ 1936{
1983 struct hstate *h = &default_hstate; 1937 struct hstate *h = &default_hstate;
1984 unsigned long tmp; 1938 unsigned long tmp;
1939 int ret;
1985 1940
1986 if (!write) 1941 if (!write)
1987 tmp = h->nr_overcommit_huge_pages; 1942 tmp = h->nr_overcommit_huge_pages;
1988 1943
1944 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL;
1946
1989 table->data = &tmp; 1947 table->data = &tmp;
1990 table->maxlen = sizeof(unsigned long); 1948 table->maxlen = sizeof(unsigned long);
1991 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1949 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1950 if (ret)
1951 goto out;
1992 1952
1993 if (write) { 1953 if (write) {
1994 spin_lock(&hugetlb_lock); 1954 spin_lock(&hugetlb_lock);
1995 h->nr_overcommit_huge_pages = tmp; 1955 h->nr_overcommit_huge_pages = tmp;
1996 spin_unlock(&hugetlb_lock); 1956 spin_unlock(&hugetlb_lock);
1997 } 1957 }
1998 1958out:
1999 return 0; 1959 return ret;
2000} 1960}
2001 1961
2002#endif /* CONFIG_SYSCTL */ 1962#endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
2454 return VM_FAULT_OOM; 2414 return VM_FAULT_OOM;
2455 } 2415 }
2456 2416
2457 copy_user_huge_page(new_page, old_page, address, vma); 2417 copy_user_huge_page(new_page, old_page, address, vma,
2418 pages_per_huge_page(h));
2458 __SetPageUptodate(new_page); 2419 __SetPageUptodate(new_page);
2459 2420
2460 /* 2421 /*
@@ -2558,7 +2519,7 @@ retry:
2558 ret = -PTR_ERR(page); 2519 ret = -PTR_ERR(page);
2559 goto out; 2520 goto out;
2560 } 2521 }
2561 clear_huge_page(page, address, huge_page_size(h)); 2522 clear_huge_page(page, address, pages_per_huge_page(h));
2562 __SetPageUptodate(page); 2523 __SetPageUptodate(page);
2563 2524
2564 if (vma->vm_flags & VM_MAYSHARE) { 2525 if (vma->vm_flags & VM_MAYSHARE) {
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..4c98630f0f77 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
39 39
40extern unsigned long highest_memmap_pfn; 40extern unsigned long highest_memmap_pfn;
41 41
42#ifdef CONFIG_SMP
43extern int putback_active_lru_page(struct zone *zone, struct page *page);
44#else
45static inline int putback_active_lru_page(struct zone *zone, struct page *page)
46{
47 return 0;
48}
49#endif
50
42/* 51/*
43 * in mm/vmscan.c: 52 * in mm/vmscan.c:
44 */ 53 */
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 143 }
135} 144}
136 145
146#ifdef CONFIG_TRANSPARENT_HUGEPAGE
147extern unsigned long vma_address(struct page *page,
148 struct vm_area_struct *vma);
149#endif
137#else /* !CONFIG_MMU */ 150#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 151static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 152{
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
243 256
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 257int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags, 258 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas); 259 struct page **pages, struct vm_area_struct **vmas,
260 int *nonblocking);
247 261
248#define ZONE_RECLAIM_NOSCAN -2 262#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 263#define ZONE_RECLAIM_FULL -1
diff --git a/mm/ksm.c b/mm/ksm.c
index 43bc893470b4..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include "internal.h" 40#include "internal.h"
@@ -411,6 +412,20 @@ out:
411 up_read(&mm->mmap_sem); 412 up_read(&mm->mmap_sem);
412} 413}
413 414
415static struct page *page_trans_compound_anon(struct page *page)
416{
417 if (PageTransCompound(page)) {
418 struct page *head = compound_trans_head(page);
419 /*
419 * head may actually be split and freed from under
421 * us but it's ok here.
422 */
423 if (PageAnon(head))
424 return head;
425 }
426 return NULL;
427}
428
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 429static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 430{
416 struct mm_struct *mm = rmap_item->mm; 431 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 445 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 446 if (IS_ERR_OR_NULL(page))
432 goto out; 447 goto out;
433 if (PageAnon(page)) { 448 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 449 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 450 flush_dcache_page(page);
436 } else { 451 } else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 723 if (addr == -EFAULT)
709 goto out; 724 goto out;
710 725
726 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 727 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 728 if (!ptep)
713 goto out; 729 goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 799 goto out;
784 800
785 pmd = pmd_offset(pud, addr); 801 pmd = pmd_offset(pud, addr);
802 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 803 if (!pmd_present(*pmd))
787 goto out; 804 goto out;
788 805
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 818
802 page_remove_rmap(page); 819 page_remove_rmap(page);
820 if (!page_mapped(page))
821 try_to_free_swap(page);
803 put_page(page); 822 put_page(page);
804 823
805 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
808 return err; 827 return err;
809} 828}
810 829
830static int page_trans_compound_anon_split(struct page *page)
831{
832 int ret = 0;
833 struct page *transhuge_head = page_trans_compound_anon(page);
834 if (transhuge_head) {
835 /* Get the reference on the head to split it. */
836 if (get_page_unless_zero(transhuge_head)) {
837 /*
838 * Recheck we got the reference while the head
839 * was still anonymous.
840 */
841 if (PageAnon(transhuge_head))
842 ret = split_huge_page(transhuge_head);
843 else
844 /*
845 * Retry later if split_huge_page ran
846 * from under us.
847 */
848 ret = 1;
849 put_page(transhuge_head);
850 } else
851 /* Retry later if split_huge_page ran from under us. */
852 ret = 1;
853 }
854 return ret;
855}
856
811/* 857/*
812 * try_to_merge_one_page - take two pages and merge them into one 858 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 859 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 874
829 if (!(vma->vm_flags & VM_MERGEABLE)) 875 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 876 goto out;
877 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
878 goto out;
879 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 880 if (!PageAnon(page))
832 goto out; 881 goto out;
833 882
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1296
1248 slot = ksm_scan.mm_slot; 1297 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1298 if (slot == &ksm_mm_head) {
1299 /*
1300 * A number of pages can hang around indefinitely on per-cpu
1301 * pagevecs, raised page count preventing write_protect_page
1302 * from merging them. Though it doesn't really matter much,
1303 * it is puzzling to see some stuck in pages_volatile until
1304 * other activity jostles them out, and they also prevented
1305 * LTP's KSM test from succeeding deterministically; so drain
1306 * them here (here rather than on entry to ksm_do_scan(),
1307 * so we don't IPI too often when pages_to_scan is set low).
1308 */
1309 lru_add_drain_all();
1310
1250 root_unstable_tree = RB_ROOT; 1311 root_unstable_tree = RB_ROOT;
1251 1312
1252 spin_lock(&ksm_mmlist_lock); 1313 spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1338 if (ksm_test_exit(mm))
1278 break; 1339 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1340 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1341 if (IS_ERR_OR_NULL(*page)) {
1342 ksm_scan.address += PAGE_SIZE;
1343 cond_resched();
1344 continue;
1345 }
1346 if (PageAnon(*page) ||
1347 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1348 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1349 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1350 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1358 up_read(&mm->mmap_sem);
1292 return rmap_item; 1359 return rmap_item;
1293 } 1360 }
1294 if (!IS_ERR_OR_NULL(*page)) 1361 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1362 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1363 cond_resched();
1298 } 1364 }
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1418 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1419 struct page *uninitialized_var(page);
1354 1420
1355 while (scan_npages--) { 1421 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1422 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1423 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1424 if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
1370 1436
1371static int ksm_scan_thread(void *nothing) 1437static int ksm_scan_thread(void *nothing)
1372{ 1438{
1439 set_freezable();
1373 set_user_nice(current, 5); 1440 set_user_nice(current, 5);
1374 1441
1375 while (!kthread_should_stop()) { 1442 while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1445 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1446 mutex_unlock(&ksm_thread_mutex);
1380 1447
1448 try_to_freeze();
1449
1381 if (ksmd_should_run()) { 1450 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1451 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1452 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1453 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1454 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1455 ksmd_should_run() || kthread_should_stop());
1387 } 1456 }
1388 } 1457 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00bb8a64d028..8ab841031436 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
292 unsigned long moved_charge; 292 unsigned long moved_charge;
293 unsigned long moved_swap; 293 unsigned long moved_swap;
294 struct task_struct *moving_task; /* a task moving charges */ 294 struct task_struct *moving_task; /* a task moving charges */
295 struct mm_struct *mm;
296 wait_queue_head_t waitq; /* a waitq for other context */ 295 wait_queue_head_t waitq; /* a waitq for other context */
297} mc = { 296} mc = {
298 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 297 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
821 return; 820 return;
822 VM_BUG_ON(list_empty(&pc->lru)); 821 VM_BUG_ON(list_empty(&pc->lru));
823 list_del_init(&pc->lru); 822 list_del_init(&pc->lru);
824 return;
825} 823}
826 824
827void mem_cgroup_del_lru(struct page *page) 825void mem_cgroup_del_lru(struct page *page)
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1087 case 0: 1085 case 0:
1088 list_move(&page->lru, dst); 1086 list_move(&page->lru, dst);
1089 mem_cgroup_del_lru(page); 1087 mem_cgroup_del_lru(page);
1090 nr_taken++; 1088 nr_taken += hpage_nr_pages(page);
1091 break; 1089 break;
1092 case -EBUSY: 1090 case -EBUSY:
1093 /* we don't affect global LRU but rotate in our LRU */ 1091 /* we don't affect global LRU but rotate in our LRU */
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1312 u64 limit; 1310 u64 limit;
1313 u64 memsw; 1311 u64 memsw;
1314 1312
1315 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1313 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1316 total_swap_pages; 1314 limit += total_swap_pages << PAGE_SHIFT;
1315
1317 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1316 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1318 /* 1317 /*
1319 * If memsw is finite and limits the amount of swap space available 1318 * If memsw is finite and limits the amount of swap space available
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1600 * possibility of race condition. If there is, we take a lock. 1599 * possibility of race condition. If there is, we take a lock.
1601 */ 1600 */
1602 1601
1603static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) 1602void mem_cgroup_update_page_stat(struct page *page,
1603 enum mem_cgroup_page_stat_item idx, int val)
1604{ 1604{
1605 struct mem_cgroup *mem; 1605 struct mem_cgroup *mem;
1606 struct page_cgroup *pc = lookup_page_cgroup(page); 1606 struct page_cgroup *pc = lookup_page_cgroup(page);
1607 bool need_unlock = false; 1607 bool need_unlock = false;
1608 unsigned long uninitialized_var(flags);
1608 1609
1609 if (unlikely(!pc)) 1610 if (unlikely(!pc))
1610 return; 1611 return;
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1616 /* pc->mem_cgroup is unstable ? */ 1617 /* pc->mem_cgroup is unstable ? */
1617 if (unlikely(mem_cgroup_stealed(mem))) { 1618 if (unlikely(mem_cgroup_stealed(mem))) {
1618 /* take a lock against to access pc->mem_cgroup */ 1619 /* take a lock against to access pc->mem_cgroup */
1619 lock_page_cgroup(pc); 1620 move_lock_page_cgroup(pc, &flags);
1620 need_unlock = true; 1621 need_unlock = true;
1621 mem = pc->mem_cgroup; 1622 mem = pc->mem_cgroup;
1622 if (!mem || !PageCgroupUsed(pc)) 1623 if (!mem || !PageCgroupUsed(pc))
1623 goto out; 1624 goto out;
1624 } 1625 }
1625 1626
1626 this_cpu_add(mem->stat->count[idx], val);
1627
1628 switch (idx) { 1627 switch (idx) {
1629 case MEM_CGROUP_STAT_FILE_MAPPED: 1628 case MEMCG_NR_FILE_MAPPED:
1630 if (val > 0) 1629 if (val > 0)
1631 SetPageCgroupFileMapped(pc); 1630 SetPageCgroupFileMapped(pc);
1632 else if (!page_mapped(page)) 1631 else if (!page_mapped(page))
1633 ClearPageCgroupFileMapped(pc); 1632 ClearPageCgroupFileMapped(pc);
1633 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1634 break; 1634 break;
1635 default: 1635 default:
1636 BUG(); 1636 BUG();
1637 } 1637 }
1638 1638
1639 this_cpu_add(mem->stat->count[idx], val);
1640
1639out: 1641out:
1640 if (unlikely(need_unlock)) 1642 if (unlikely(need_unlock))
1641 unlock_page_cgroup(pc); 1643 move_unlock_page_cgroup(pc, &flags);
1642 rcu_read_unlock(); 1644 rcu_read_unlock();
1643 return; 1645 return;
1644} 1646}
1645 1647EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1646void mem_cgroup_update_file_mapped(struct page *page, int val)
1647{
1648 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1649}
1650 1648
1651/* 1649/*
1652 * size of first charge trial. "32" comes from vmscan.c's magic value. 1650 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1887 * oom-killer can be invoked. 1885 * oom-killer can be invoked.
1888 */ 1886 */
1889static int __mem_cgroup_try_charge(struct mm_struct *mm, 1887static int __mem_cgroup_try_charge(struct mm_struct *mm,
1890 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1888 gfp_t gfp_mask,
1889 struct mem_cgroup **memcg, bool oom,
1890 int page_size)
1891{ 1891{
1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1893 struct mem_cgroup *mem = NULL; 1893 struct mem_cgroup *mem = NULL;
1894 int ret; 1894 int ret;
1895 int csize = CHARGE_SIZE; 1895 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1896 1896
1897 /* 1897 /*
 1898 * Unlike global VM's OOM-kill, we're not in memory shortage 1898 * Unlike global VM's OOM-kill, we're not in memory shortage
@@ -1917,7 +1917,7 @@ again:
1917 VM_BUG_ON(css_is_removed(&mem->css)); 1917 VM_BUG_ON(css_is_removed(&mem->css));
1918 if (mem_cgroup_is_root(mem)) 1918 if (mem_cgroup_is_root(mem))
1919 goto done; 1919 goto done;
1920 if (consume_stock(mem)) 1920 if (page_size == PAGE_SIZE && consume_stock(mem))
1921 goto done; 1921 goto done;
1922 css_get(&mem->css); 1922 css_get(&mem->css);
1923 } else { 1923 } else {
@@ -1940,7 +1940,7 @@ again:
1940 rcu_read_unlock(); 1940 rcu_read_unlock();
1941 goto done; 1941 goto done;
1942 } 1942 }
1943 if (consume_stock(mem)) { 1943 if (page_size == PAGE_SIZE && consume_stock(mem)) {
1944 /* 1944 /*
 1945 * It seems dangerous to access memcg without css_get(). 1945 * It seems dangerous to access memcg without css_get().
 1946 * But considering how consume_stock works, it's not 1946 * But considering how consume_stock works, it's not
@@ -1981,7 +1981,7 @@ again:
1981 case CHARGE_OK: 1981 case CHARGE_OK:
1982 break; 1982 break;
1983 case CHARGE_RETRY: /* not in OOM situation but retry */ 1983 case CHARGE_RETRY: /* not in OOM situation but retry */
1984 csize = PAGE_SIZE; 1984 csize = page_size;
1985 css_put(&mem->css); 1985 css_put(&mem->css);
1986 mem = NULL; 1986 mem = NULL;
1987 goto again; 1987 goto again;
@@ -2002,8 +2002,8 @@ again:
2002 } 2002 }
2003 } while (ret != CHARGE_OK); 2003 } while (ret != CHARGE_OK);
2004 2004
2005 if (csize > PAGE_SIZE) 2005 if (csize > page_size)
2006 refill_stock(mem, csize - PAGE_SIZE); 2006 refill_stock(mem, csize - page_size);
2007 css_put(&mem->css); 2007 css_put(&mem->css);
2008done: 2008done:
2009 *memcg = mem; 2009 *memcg = mem;
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2031 } 2031 }
2032} 2032}
2033 2033
2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2035 int page_size)
2035{ 2036{
2036 __mem_cgroup_cancel_charge(mem, 1); 2037 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2037} 2038}
2038 2039
2039/* 2040/*
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2087 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 2088 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
2088 * USED state. If already USED, uncharge and return. 2089 * USED state. If already USED, uncharge and return.
2089 */ 2090 */
2090 2091static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
2091static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2092 struct page_cgroup *pc,
2092 struct page_cgroup *pc, 2093 enum charge_type ctype)
2093 enum charge_type ctype)
2094{ 2094{
2095 /* try_charge() can return NULL to *memcg, taking care of it. */
2096 if (!mem)
2097 return;
2098
2099 lock_page_cgroup(pc);
2100 if (unlikely(PageCgroupUsed(pc))) {
2101 unlock_page_cgroup(pc);
2102 mem_cgroup_cancel_charge(mem);
2103 return;
2104 }
2105
2106 pc->mem_cgroup = mem; 2095 pc->mem_cgroup = mem;
2107 /* 2096 /*
2108 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2097 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2127 } 2116 }
2128 2117
2129 mem_cgroup_charge_statistics(mem, pc, true); 2118 mem_cgroup_charge_statistics(mem, pc, true);
2119}
2120
2121static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2122 struct page_cgroup *pc,
2123 enum charge_type ctype,
2124 int page_size)
2125{
2126 int i;
2127 int count = page_size >> PAGE_SHIFT;
2128
2129 /* try_charge() can return NULL to *memcg, taking care of it. */
2130 if (!mem)
2131 return;
2132
2133 lock_page_cgroup(pc);
2134 if (unlikely(PageCgroupUsed(pc))) {
2135 unlock_page_cgroup(pc);
2136 mem_cgroup_cancel_charge(mem, page_size);
2137 return;
2138 }
2139
2140 /*
 2141 * we don't need page_cgroup_lock for tail pages, because they are not
2142 * accessed by any other context at this point.
2143 */
2144 for (i = 0; i < count; i++)
2145 ____mem_cgroup_commit_charge(mem, pc + i, ctype);
2130 2146
2131 unlock_page_cgroup(pc); 2147 unlock_page_cgroup(pc);
2132 /* 2148 /*
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2173 mem_cgroup_charge_statistics(from, pc, false); 2189 mem_cgroup_charge_statistics(from, pc, false);
2174 if (uncharge) 2190 if (uncharge)
2175 /* This is not "cancel", but cancel_charge does all we need. */ 2191 /* This is not "cancel", but cancel_charge does all we need. */
2176 mem_cgroup_cancel_charge(from); 2192 mem_cgroup_cancel_charge(from, PAGE_SIZE);
2177 2193
2178 /* caller should have done css_get */ 2194 /* caller should have done css_get */
2179 pc->mem_cgroup = to; 2195 pc->mem_cgroup = to;
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2195 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2211 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2196{ 2212{
2197 int ret = -EINVAL; 2213 int ret = -EINVAL;
2214 unsigned long flags;
2215
2198 lock_page_cgroup(pc); 2216 lock_page_cgroup(pc);
2199 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2217 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2218 move_lock_page_cgroup(pc, &flags);
2200 __mem_cgroup_move_account(pc, from, to, uncharge); 2219 __mem_cgroup_move_account(pc, from, to, uncharge);
2220 move_unlock_page_cgroup(pc, &flags);
2201 ret = 0; 2221 ret = 0;
2202 } 2222 }
2203 unlock_page_cgroup(pc); 2223 unlock_page_cgroup(pc);
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2234 goto put; 2254 goto put;
2235 2255
2236 parent = mem_cgroup_from_cont(pcg); 2256 parent = mem_cgroup_from_cont(pcg);
2237 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2257 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
2258 PAGE_SIZE);
2238 if (ret || !parent) 2259 if (ret || !parent)
2239 goto put_back; 2260 goto put_back;
2240 2261
2241 ret = mem_cgroup_move_account(pc, child, parent, true); 2262 ret = mem_cgroup_move_account(pc, child, parent, true);
2242 if (ret) 2263 if (ret)
2243 mem_cgroup_cancel_charge(parent); 2264 mem_cgroup_cancel_charge(parent, PAGE_SIZE);
2244put_back: 2265put_back:
2245 putback_lru_page(page); 2266 putback_lru_page(page);
2246put: 2267put:
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2261 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *mem = NULL;
2262 struct page_cgroup *pc; 2283 struct page_cgroup *pc;
2263 int ret; 2284 int ret;
2285 int page_size = PAGE_SIZE;
2286
2287 if (PageTransHuge(page)) {
2288 page_size <<= compound_order(page);
2289 VM_BUG_ON(!PageTransHuge(page));
2290 }
2264 2291
2265 pc = lookup_page_cgroup(page); 2292 pc = lookup_page_cgroup(page);
2266 /* can happen at boot */ 2293 /* can happen at boot */
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2268 return 0; 2295 return 0;
2269 prefetchw(pc); 2296 prefetchw(pc);
2270 2297
2271 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
2272 if (ret || !mem) 2299 if (ret || !mem)
2273 return ret; 2300 return ret;
2274 2301
2275 __mem_cgroup_commit_charge(mem, pc, ctype); 2302 __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
2276 return 0; 2303 return 0;
2277} 2304}
2278 2305
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2281{ 2308{
2282 if (mem_cgroup_disabled()) 2309 if (mem_cgroup_disabled())
2283 return 0; 2310 return 0;
2284 if (PageCompound(page))
2285 return 0;
2286 /* 2311 /*
2287 * If already mapped, we don't have to account. 2312 * If already mapped, we don't have to account.
2288 * If page cache, page->mapping has address_space. 2313 * If page cache, page->mapping has address_space.
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2388 if (!mem) 2413 if (!mem)
2389 goto charge_cur_mm; 2414 goto charge_cur_mm;
2390 *ptr = mem; 2415 *ptr = mem;
2391 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2416 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
2392 css_put(&mem->css); 2417 css_put(&mem->css);
2393 return ret; 2418 return ret;
2394charge_cur_mm: 2419charge_cur_mm:
2395 if (unlikely(!mm)) 2420 if (unlikely(!mm))
2396 mm = &init_mm; 2421 mm = &init_mm;
2397 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2422 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
2398} 2423}
2399 2424
2400static void 2425static void
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2410 cgroup_exclude_rmdir(&ptr->css); 2435 cgroup_exclude_rmdir(&ptr->css);
2411 pc = lookup_page_cgroup(page); 2436 pc = lookup_page_cgroup(page);
2412 mem_cgroup_lru_del_before_commit_swapcache(page); 2437 mem_cgroup_lru_del_before_commit_swapcache(page);
2413 __mem_cgroup_commit_charge(ptr, pc, ctype); 2438 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2414 mem_cgroup_lru_add_after_commit_swapcache(page); 2439 mem_cgroup_lru_add_after_commit_swapcache(page);
2415 /* 2440 /*
2416 * Now swap is on-memory. This means this page may be 2441 * Now swap is on-memory. This means this page may be
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2459 return; 2484 return;
2460 if (!mem) 2485 if (!mem)
2461 return; 2486 return;
2462 mem_cgroup_cancel_charge(mem); 2487 mem_cgroup_cancel_charge(mem, PAGE_SIZE);
2463} 2488}
2464 2489
2465static void 2490static void
2466__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2491__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2492 int page_size)
2467{ 2493{
2468 struct memcg_batch_info *batch = NULL; 2494 struct memcg_batch_info *batch = NULL;
2469 bool uncharge_memsw = true; 2495 bool uncharge_memsw = true;
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2490 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2516 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2491 goto direct_uncharge; 2517 goto direct_uncharge;
2492 2518
2519 if (page_size != PAGE_SIZE)
2520 goto direct_uncharge;
2521
2493 /* 2522 /*
2494 * In typical case, batch->memcg == mem. This means we can 2523 * In typical case, batch->memcg == mem. This means we can
2495 * merge a series of uncharges to an uncharge of res_counter. 2524 * merge a series of uncharges to an uncharge of res_counter.
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2503 batch->memsw_bytes += PAGE_SIZE; 2532 batch->memsw_bytes += PAGE_SIZE;
2504 return; 2533 return;
2505direct_uncharge: 2534direct_uncharge:
2506 res_counter_uncharge(&mem->res, PAGE_SIZE); 2535 res_counter_uncharge(&mem->res, page_size);
2507 if (uncharge_memsw) 2536 if (uncharge_memsw)
2508 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2537 res_counter_uncharge(&mem->memsw, page_size);
2509 if (unlikely(batch->memcg != mem)) 2538 if (unlikely(batch->memcg != mem))
2510 memcg_oom_recover(mem); 2539 memcg_oom_recover(mem);
2511 return; 2540 return;
@@ -2517,8 +2546,11 @@ direct_uncharge:
2517static struct mem_cgroup * 2546static struct mem_cgroup *
2518__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2547__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2519{ 2548{
2549 int i;
2550 int count;
2520 struct page_cgroup *pc; 2551 struct page_cgroup *pc;
2521 struct mem_cgroup *mem = NULL; 2552 struct mem_cgroup *mem = NULL;
2553 int page_size = PAGE_SIZE;
2522 2554
2523 if (mem_cgroup_disabled()) 2555 if (mem_cgroup_disabled())
2524 return NULL; 2556 return NULL;
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2526 if (PageSwapCache(page)) 2558 if (PageSwapCache(page))
2527 return NULL; 2559 return NULL;
2528 2560
2561 if (PageTransHuge(page)) {
2562 page_size <<= compound_order(page);
2563 VM_BUG_ON(!PageTransHuge(page));
2564 }
2565
2566 count = page_size >> PAGE_SHIFT;
2529 /* 2567 /*
2530 * Check if our page_cgroup is valid 2568 * Check if our page_cgroup is valid
2531 */ 2569 */
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2558 break; 2596 break;
2559 } 2597 }
2560 2598
2561 mem_cgroup_charge_statistics(mem, pc, false); 2599 for (i = 0; i < count; i++)
2600 mem_cgroup_charge_statistics(mem, pc + i, false);
2562 2601
2563 ClearPageCgroupUsed(pc); 2602 ClearPageCgroupUsed(pc);
2564 /* 2603 /*
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2579 mem_cgroup_get(mem); 2618 mem_cgroup_get(mem);
2580 } 2619 }
2581 if (!mem_cgroup_is_root(mem)) 2620 if (!mem_cgroup_is_root(mem))
2582 __do_uncharge(mem, ctype); 2621 __do_uncharge(mem, ctype, page_size);
2583 2622
2584 return mem; 2623 return mem;
2585 2624
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2774 enum charge_type ctype; 2813 enum charge_type ctype;
2775 int ret = 0; 2814 int ret = 0;
2776 2815
2816 VM_BUG_ON(PageTransHuge(page));
2777 if (mem_cgroup_disabled()) 2817 if (mem_cgroup_disabled())
2778 return 0; 2818 return 0;
2779 2819
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2823 return 0; 2863 return 0;
2824 2864
2825 *ptr = mem; 2865 *ptr = mem;
2826 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2866 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
2827 css_put(&mem->css);/* drop extra refcnt */ 2867 css_put(&mem->css);/* drop extra refcnt */
2828 if (ret || *ptr == NULL) { 2868 if (ret || *ptr == NULL) {
2829 if (PageAnon(page)) { 2869 if (PageAnon(page)) {
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2850 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2890 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2851 else 2891 else
2852 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2892 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2853 __mem_cgroup_commit_charge(mem, pc, ctype); 2893 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
2854 return ret; 2894 return ret;
2855} 2895}
2856 2896
2857/* remove redundant charge if migration failed*/ 2897/* remove redundant charge if migration failed*/
2858void mem_cgroup_end_migration(struct mem_cgroup *mem, 2898void mem_cgroup_end_migration(struct mem_cgroup *mem,
2859 struct page *oldpage, struct page *newpage) 2899 struct page *oldpage, struct page *newpage, bool migration_ok)
2860{ 2900{
2861 struct page *used, *unused; 2901 struct page *used, *unused;
2862 struct page_cgroup *pc; 2902 struct page_cgroup *pc;
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2865 return; 2905 return;
2866 /* blocks rmdir() */ 2906 /* blocks rmdir() */
2867 cgroup_exclude_rmdir(&mem->css); 2907 cgroup_exclude_rmdir(&mem->css);
2868 /* at migration success, oldpage->mapping is NULL. */ 2908 if (!migration_ok) {
2869 if (oldpage->mapping) {
2870 used = oldpage; 2909 used = oldpage;
2871 unused = newpage; 2910 unused = newpage;
2872 } else { 2911 } else {
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4176 */ 4215 */
4177 if (!node_state(node, N_NORMAL_MEMORY)) 4216 if (!node_state(node, N_NORMAL_MEMORY))
4178 tmp = -1; 4217 tmp = -1;
4179 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4218 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4180 if (!pn) 4219 if (!pn)
4181 return 1; 4220 return 1;
4182 4221
4183 mem->info.nodeinfo[node] = pn; 4222 mem->info.nodeinfo[node] = pn;
4184 memset(pn, 0, sizeof(*pn));
4185
4186 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4223 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4187 mz = &pn->zoneinfo[zone]; 4224 mz = &pn->zoneinfo[zone];
4188 for_each_lru(l) 4225 for_each_lru(l)
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4206 4243
4207 /* Can be very big if MAX_NUMNODES is very big */ 4244 /* Can be very big if MAX_NUMNODES is very big */
4208 if (size < PAGE_SIZE) 4245 if (size < PAGE_SIZE)
4209 mem = kmalloc(size, GFP_KERNEL); 4246 mem = kzalloc(size, GFP_KERNEL);
4210 else 4247 else
4211 mem = vmalloc(size); 4248 mem = vzalloc(size);
4212 4249
4213 if (!mem) 4250 if (!mem)
4214 return NULL; 4251 return NULL;
4215 4252
4216 memset(mem, 0, size);
4217 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4253 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4218 if (!mem->stat) 4254 if (!mem->stat)
4219 goto out_free; 4255 goto out_free;
@@ -4461,7 +4497,8 @@ one_by_one:
4461 batch_count = PRECHARGE_COUNT_AT_ONCE; 4497 batch_count = PRECHARGE_COUNT_AT_ONCE;
4462 cond_resched(); 4498 cond_resched();
4463 } 4499 }
4464 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4500 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
4501 PAGE_SIZE);
4465 if (ret || !mem) 4502 if (ret || !mem)
4466 /* mem_cgroup_clear_mc() will do uncharge later */ 4503 /* mem_cgroup_clear_mc() will do uncharge later */
4467 return -ENOMEM; 4504 return -ENOMEM;
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4623 pte_t *pte; 4660 pte_t *pte;
4624 spinlock_t *ptl; 4661 spinlock_t *ptl;
4625 4662
4663 VM_BUG_ON(pmd_trans_huge(*pmd));
4626 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4664 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4627 for (; addr != end; pte++, addr += PAGE_SIZE) 4665 for (; addr != end; pte++, addr += PAGE_SIZE)
4628 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4666 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4638 unsigned long precharge; 4676 unsigned long precharge;
4639 struct vm_area_struct *vma; 4677 struct vm_area_struct *vma;
4640 4678
4641 /* We've already held the mmap_sem */ 4679 down_read(&mm->mmap_sem);
4642 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4680 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4643 struct mm_walk mem_cgroup_count_precharge_walk = { 4681 struct mm_walk mem_cgroup_count_precharge_walk = {
4644 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4682 .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4650 walk_page_range(vma->vm_start, vma->vm_end, 4688 walk_page_range(vma->vm_start, vma->vm_end,
4651 &mem_cgroup_count_precharge_walk); 4689 &mem_cgroup_count_precharge_walk);
4652 } 4690 }
4691 up_read(&mm->mmap_sem);
4653 4692
4654 precharge = mc.precharge; 4693 precharge = mc.precharge;
4655 mc.precharge = 0; 4694 mc.precharge = 0;
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4659 4698
4660static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4699static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4661{ 4700{
4662 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4701 unsigned long precharge = mem_cgroup_count_precharge(mm);
4702
4703 VM_BUG_ON(mc.moving_task);
4704 mc.moving_task = current;
4705 return mem_cgroup_do_precharge(precharge);
4663} 4706}
4664 4707
4665static void mem_cgroup_clear_mc(void) 4708/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4709static void __mem_cgroup_clear_mc(void)
4666{ 4710{
4667 struct mem_cgroup *from = mc.from; 4711 struct mem_cgroup *from = mc.from;
4668 struct mem_cgroup *to = mc.to; 4712 struct mem_cgroup *to = mc.to;
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void)
4697 PAGE_SIZE * mc.moved_swap); 4741 PAGE_SIZE * mc.moved_swap);
4698 } 4742 }
4699 /* we've already done mem_cgroup_get(mc.to) */ 4743 /* we've already done mem_cgroup_get(mc.to) */
4700
4701 mc.moved_swap = 0; 4744 mc.moved_swap = 0;
4702 } 4745 }
4703 if (mc.mm) { 4746 memcg_oom_recover(from);
4704 up_read(&mc.mm->mmap_sem); 4747 memcg_oom_recover(to);
4705 mmput(mc.mm); 4748 wake_up_all(&mc.waitq);
4706 } 4749}
4750
4751static void mem_cgroup_clear_mc(void)
4752{
4753 struct mem_cgroup *from = mc.from;
4754
4755 /*
4756 * we must clear moving_task before waking up waiters at the end of
4757 * task migration.
4758 */
4759 mc.moving_task = NULL;
4760 __mem_cgroup_clear_mc();
4707 spin_lock(&mc.lock); 4761 spin_lock(&mc.lock);
4708 mc.from = NULL; 4762 mc.from = NULL;
4709 mc.to = NULL; 4763 mc.to = NULL;
4710 spin_unlock(&mc.lock); 4764 spin_unlock(&mc.lock);
4711 mc.moving_task = NULL;
4712 mc.mm = NULL;
4713 mem_cgroup_end_move(from); 4765 mem_cgroup_end_move(from);
4714 memcg_oom_recover(from);
4715 memcg_oom_recover(to);
4716 wake_up_all(&mc.waitq);
4717} 4766}
4718 4767
4719static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4768static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4735 return 0; 4784 return 0;
 4736 /* We move charges only when we move an owner of the mm */ 4785 /* We move charges only when we move an owner of the mm */
4737 if (mm->owner == p) { 4786 if (mm->owner == p) {
4738 /*
4739 * We do all the move charge works under one mmap_sem to
4740 * avoid deadlock with down_write(&mmap_sem)
4741 * -> try_charge() -> if (mc.moving_task) -> sleep.
4742 */
4743 down_read(&mm->mmap_sem);
4744
4745 VM_BUG_ON(mc.from); 4787 VM_BUG_ON(mc.from);
4746 VM_BUG_ON(mc.to); 4788 VM_BUG_ON(mc.to);
4747 VM_BUG_ON(mc.precharge); 4789 VM_BUG_ON(mc.precharge);
4748 VM_BUG_ON(mc.moved_charge); 4790 VM_BUG_ON(mc.moved_charge);
4749 VM_BUG_ON(mc.moved_swap); 4791 VM_BUG_ON(mc.moved_swap);
4750 VM_BUG_ON(mc.moving_task);
4751 VM_BUG_ON(mc.mm);
4752
4753 mem_cgroup_start_move(from); 4792 mem_cgroup_start_move(from);
4754 spin_lock(&mc.lock); 4793 spin_lock(&mc.lock);
4755 mc.from = from; 4794 mc.from = from;
4756 mc.to = mem; 4795 mc.to = mem;
4757 mc.precharge = 0;
4758 mc.moved_charge = 0;
4759 mc.moved_swap = 0;
4760 spin_unlock(&mc.lock); 4796 spin_unlock(&mc.lock);
4761 mc.moving_task = current; 4797 /* We set mc.moving_task later */
4762 mc.mm = mm;
4763 4798
4764 ret = mem_cgroup_precharge_mc(mm); 4799 ret = mem_cgroup_precharge_mc(mm);
4765 if (ret) 4800 if (ret)
4766 mem_cgroup_clear_mc(); 4801 mem_cgroup_clear_mc();
4767 /* We call up_read() and mmput() in clear_mc(). */ 4802 }
4768 } else 4803 mmput(mm);
4769 mmput(mm);
4770 } 4804 }
4771 return ret; 4805 return ret;
4772} 4806}
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4789 spinlock_t *ptl; 4823 spinlock_t *ptl;
4790 4824
4791retry: 4825retry:
4826 VM_BUG_ON(pmd_trans_huge(*pmd));
4792 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4827 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4793 for (; addr != end; addr += PAGE_SIZE) { 4828 for (; addr != end; addr += PAGE_SIZE) {
4794 pte_t ptent = *(pte++); 4829 pte_t ptent = *(pte++);
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4854 struct vm_area_struct *vma; 4889 struct vm_area_struct *vma;
4855 4890
4856 lru_add_drain_all(); 4891 lru_add_drain_all();
4857 /* We've already held the mmap_sem */ 4892retry:
4893 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
4894 /*
 4895 * Someone holding the mmap_sem might be waiting in
4896 * waitq. So we cancel all extra charges, wake up all waiters,
4897 * and retry. Because we cancel precharges, we might not be able
4898 * to move enough charges, but moving charge is a best-effort
4899 * feature anyway, so it wouldn't be a big problem.
4900 */
4901 __mem_cgroup_clear_mc();
4902 cond_resched();
4903 goto retry;
4904 }
4858 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4905 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4859 int ret; 4906 int ret;
4860 struct mm_walk mem_cgroup_move_charge_walk = { 4907 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4873 */ 4920 */
4874 break; 4921 break;
4875 } 4922 }
4923 up_read(&mm->mmap_sem);
4876} 4924}
4877 4925
4878static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4926static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4881 struct task_struct *p, 4929 struct task_struct *p,
4882 bool threadgroup) 4930 bool threadgroup)
4883{ 4931{
4884 if (!mc.mm) 4932 struct mm_struct *mm;
4933
4934 if (!mc.to)
4885 /* no need to move charge */ 4935 /* no need to move charge */
4886 return; 4936 return;
4887 4937
4888 mem_cgroup_move_charge(mc.mm); 4938 mm = get_task_mm(p);
4939 if (mm) {
4940 mem_cgroup_move_charge(mm);
4941 mmput(mm);
4942 }
4889 mem_cgroup_clear_mc(); 4943 mem_cgroup_clear_mc();
4890} 4944}
4891#else /* !CONFIG_MMU */ 4945#else /* !CONFIG_MMU */
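
The common thread in the memcontrol.c changes is that the charge, commit and uncharge paths now take a page_size, so a THP is accounted against the res_counter in a single operation instead of once per 4 KB page. Roughly, the size is derived from the compound page as below (a condensed sketch reusing the names from the hunks above, not the full charge path):

	int page_size = PAGE_SIZE;

	if (PageTransHuge(page))			/* THP head page */
		page_size <<= compound_order(page);	/* e.g. 4 KB << 9 = 2 MB on x86 */

	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
	if (!ret && mem)
		__mem_cgroup_commit_charge(mem, pc, ctype, page_size);
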
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c044b0e..548fbd70f026 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
203#ifdef __ARCH_SI_TRAPNO 203#ifdef __ARCH_SI_TRAPNO
204 si.si_trapno = trapno; 204 si.si_trapno = trapno;
205#endif 205#endif
206 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 206 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
207 /* 207 /*
208 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
209 * can be temporarily blocked. 209 * can be temporarily blocked.
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
389 read_lock(&tasklist_lock); 391 read_lock(&tasklist_lock);
390 av = page_lock_anon_vma(page); 392 av = page_lock_anon_vma(page);
391 if (av == NULL) /* Not actually mapped anymore */ 393 if (av == NULL) /* Not actually mapped anymore */
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
928static void set_page_hwpoison_huge_page(struct page *hpage) 930static void set_page_hwpoison_huge_page(struct page *hpage)
929{ 931{
930 int i; 932 int i;
931 int nr_pages = 1 << compound_order(hpage); 933 int nr_pages = 1 << compound_trans_order(hpage);
932 for (i = 0; i < nr_pages; i++) 934 for (i = 0; i < nr_pages; i++)
933 SetPageHWPoison(hpage + i); 935 SetPageHWPoison(hpage + i);
934} 936}
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
936static void clear_page_hwpoison_huge_page(struct page *hpage) 938static void clear_page_hwpoison_huge_page(struct page *hpage)
937{ 939{
938 int i; 940 int i;
939 int nr_pages = 1 << compound_order(hpage); 941 int nr_pages = 1 << compound_trans_order(hpage);
940 for (i = 0; i < nr_pages; i++) 942 for (i = 0; i < nr_pages; i++)
941 ClearPageHWPoison(hpage + i); 943 ClearPageHWPoison(hpage + i);
942} 944}
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
966 return 0; 968 return 0;
967 } 969 }
968 970
969 nr_pages = 1 << compound_order(hpage); 971 nr_pages = 1 << compound_trans_order(hpage);
970 atomic_long_add(nr_pages, &mce_bad_pages); 972 atomic_long_add(nr_pages, &mce_bad_pages);
971 973
972 /* 974 /*
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn)
1164 return 0; 1166 return 0;
1165 } 1167 }
1166 1168
1167 nr_pages = 1 << compound_order(page); 1169 nr_pages = 1 << compound_trans_order(page);
1168 1170
1169 if (!get_page_unless_zero(page)) { 1171 if (!get_page_unless_zero(page)) {
1170 /* 1172 /*
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1290 /* Keep page count to indicate a given hugepage is isolated. */ 1292 /* Keep page count to indicate a given hugepage is isolated. */
1291 1293
1292 list_add(&hpage->lru, &pagelist); 1294 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true);
1294 if (ret) { 1297 if (ret) {
1295 putback_lru_pages(&pagelist); 1298 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags); 1300 pfn, ret, page->flags);
1298 if (ret > 0) 1301 if (ret > 0)
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1301 } 1304 }
1302done: 1305done:
1303 if (!PageHWPoison(hpage)) 1306 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); 1307 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage); 1308 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage); 1309 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */ 1310 /* keep elevated page count for bad page */
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags)
1413 LIST_HEAD(pagelist); 1416 LIST_HEAD(pagelist);
1414 1417
1415 list_add(&page->lru, &pagelist); 1418 list_add(&page->lru, &pagelist);
1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true);
1417 if (ret) { 1421 if (ret) {
1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1419 pfn, ret, page->flags); 1423 pfn, ret, page->flags);
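
memory-failure.c switches from compound_order() to compound_trans_order() wherever it sizes the poisoned range, since the page may be a THP that can be split while hwpoison inspects it; the accounting it feeds stays a plain page count, e.g. (a short sketch restating the calls from the hunks above):

	int nr_pages = 1 << compound_trans_order(compound_head(p));

	atomic_long_add(nr_pages, &mce_bad_pages);	/* 512 pages for a 2 MB THP */
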
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..31250faff390 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 394 }
395} 395}
396 396
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 397int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address)
398{ 399{
399 pgtable_t new = pte_alloc_one(mm, address); 400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page;
400 if (!new) 402 if (!new)
401 return -ENOMEM; 403 return -ENOMEM;
402 404
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 419
418 spin_lock(&mm->page_table_lock); 420 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 423 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 424 pmd_populate(mm, pmd, new);
422 new = NULL; 425 new = NULL;
423 } 426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 428 spin_unlock(&mm->page_table_lock);
425 if (new) 429 if (new)
426 pte_free(mm, new); 430 pte_free(mm, new);
431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 433 return 0;
428} 434}
429 435
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 442 smp_wmb(); /* See comment in __pte_alloc */
437 443
438 spin_lock(&init_mm.page_table_lock); 444 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 446 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 447 new = NULL;
442 } 448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 450 spin_unlock(&init_mm.page_table_lock);
444 if (new) 451 if (new)
445 pte_free_kernel(&init_mm, new); 452 pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
719 return 0; 726 return 0;
720} 727}
721 728
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 729int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 731 unsigned long addr, unsigned long end)
725{ 732{
726 pte_t *orig_src_pte, *orig_dst_pte; 733 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 734 pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 802 src_pmd = pmd_offset(src_pud, addr);
796 do { 803 do {
797 next = pmd_addr_end(addr, end); 804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) {
806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM)
811 return -ENOMEM;
812 if (!err)
813 continue;
814 /* fall through */
815 }
798 if (pmd_none_or_clear_bad(src_pmd)) 816 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 817 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1015 pmd = pmd_offset(pud, addr);
998 do { 1016 do {
999 next = pmd_addr_end(addr, end); 1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--;
1024 continue;
1025 }
1026 /* fall through */
1027 }
1000 if (pmd_none_or_clear_bad(pmd)) { 1028 if (pmd_none_or_clear_bad(pmd)) {
1001 (*zap_work)--; 1029 (*zap_work)--;
1002 continue; 1030 continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1290 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1291 if (pud_none(*pud))
1264 goto no_page_table; 1292 goto no_page_table;
1265 if (pud_huge(*pud)) { 1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1294 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1296 goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1301 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1302 if (pmd_none(*pmd))
1275 goto no_page_table; 1303 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1305 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1307 goto out;
1280 } 1308 }
1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough;
1313 }
1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else {
1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock);
1323 goto out;
1324 }
1325 } else
1326 spin_unlock(&mm->page_table_lock);
1327 /* fall through */
1328 }
1329split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1330 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1331 goto no_page_table;
1283 1332
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1359 */
1311 mark_page_accessed(page); 1360 mark_page_accessed(page);
1312 } 1361 }
1362 if (flags & FOLL_MLOCK) {
1363 /*
1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE
1366 * which might bounce very badly if there is contention.
1367 *
1368 * If the page is already locked, we don't need to
1369 * handle it now - vmscan will handle it later if and
1370 * when it attempts to reclaim the page.
1371 */
1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain(); /* push cached pages to LRU */
1374 /*
1375 * Because we lock page here and migration is
1376 * blocked by the pte's page reference, we need
1377 * only check for file-cache page truncation.
1378 */
1379 if (page->mapping)
1380 mlock_vma_page(page);
1381 unlock_page(page);
1382 }
1383 }
1313unlock: 1384unlock:
1314 pte_unmap_unlock(ptep, ptl); 1385 pte_unmap_unlock(ptep, ptl);
1315out: 1386out:
@@ -1341,7 +1412,8 @@ no_page_table:
1341 1412
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1414 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas,
1416 int *nonblocking)
1345{ 1417{
1346 int i; 1418 int i;
1347 unsigned long vm_flags; 1419 unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1458 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1459 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1460 return i ? : -EFAULT;
1461 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1462 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1463 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1464 pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1514 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1515 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1516 int ret;
1517 unsigned int fault_flags = 0;
1518
1519 if (foll_flags & FOLL_WRITE)
1520 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1444 1523
1445 ret = handle_mm_fault(mm, vma, start, 1524 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1525 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1526
1449 if (ret & VM_FAULT_ERROR) { 1527 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1528 if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1460 else 1538 else
1461 tsk->min_flt++; 1539 tsk->min_flt++;
1462 1540
1541 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0;
1543 return i;
1544 }
1545
1463 /* 1546 /*
1464 * The VM_FAULT_WRITE bit tells us that 1547 * The VM_FAULT_WRITE bit tells us that
1465 * do_wp_page has broken COW when necessary, 1548 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1559 if (force) 1642 if (force)
1560 flags |= FOLL_FORCE; 1643 flags |= FOLL_FORCE;
1561 1644
1562 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1645 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1646 NULL);
1563} 1647}
1564EXPORT_SYMBOL(get_user_pages); 1648EXPORT_SYMBOL(get_user_pages);
1565 1649
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
1584 struct page *page; 1668 struct page *page;
1585 1669
1586 if (__get_user_pages(current, current->mm, addr, 1, 1670 if (__get_user_pages(current, current->mm, addr, 1,
1587 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1671 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1672 NULL) < 1)
1588 return NULL; 1673 return NULL;
1589 flush_cache_page(vma, addr, page_to_pfn(page)); 1674 flush_cache_page(vma, addr, page_to_pfn(page));
1590 return page; 1675 return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1598 pud_t * pud = pud_alloc(mm, pgd, addr); 1683 pud_t * pud = pud_alloc(mm, pgd, addr);
1599 if (pud) { 1684 if (pud) {
1600 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1685 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1601 if (pmd) 1686 if (pmd) {
1687 VM_BUG_ON(pmd_trans_huge(*pmd));
1602 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1688 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1689 }
1603 } 1690 }
1604 return NULL; 1691 return NULL;
1605} 1692}
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1818 pmd = pmd_alloc(mm, pud, addr); 1905 pmd = pmd_alloc(mm, pud, addr);
1819 if (!pmd) 1906 if (!pmd)
1820 return -ENOMEM; 1907 return -ENOMEM;
1908 VM_BUG_ON(pmd_trans_huge(*pmd));
1821 do { 1909 do {
1822 next = pmd_addr_end(addr, end); 1910 next = pmd_addr_end(addr, end);
1823 if (remap_pte_range(mm, pmd, addr, next, 1911 if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2048 return same; 2136 return same;
2049} 2137}
2050 2138
2051/*
2052 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2053 * servicing faults for write access. In the normal case, do always want
2054 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2055 * that do not have writing enabled, when used by access_process_vm.
2056 */
2057static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2058{
2059 if (likely(vma->vm_flags & VM_WRITE))
2060 pte = pte_mkwrite(pte);
2061 return pte;
2062}
2063
2064static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2139static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2065{ 2140{
2066 /* 2141 /*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2112{ 2187{
2113 struct page *old_page, *new_page; 2188 struct page *old_page, *new_page;
2114 pte_t entry; 2189 pte_t entry;
2115 int reuse = 0, ret = 0; 2190 int ret = 0;
2116 int page_mkwrite = 0; 2191 int page_mkwrite = 0;
2117 struct page *dirty_page = NULL; 2192 struct page *dirty_page = NULL;
2118 2193
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2149 } 2224 }
2150 page_cache_release(old_page); 2225 page_cache_release(old_page);
2151 } 2226 }
2152 reuse = reuse_swap_page(old_page); 2227 if (reuse_swap_page(old_page)) {
2153 if (reuse)
2154 /* 2228 /*
2155 * The page is all ours. Move it to our anon_vma so 2229 * The page is all ours. Move it to our anon_vma so
2156 * the rmap code will not search our parent or siblings. 2230 * the rmap code will not search our parent or siblings.
2157 * Protected against the rmap code by the page lock. 2231 * Protected against the rmap code by the page lock.
2158 */ 2232 */
2159 page_move_anon_rmap(old_page, vma, address); 2233 page_move_anon_rmap(old_page, vma, address);
2234 unlock_page(old_page);
2235 goto reuse;
2236 }
2160 unlock_page(old_page); 2237 unlock_page(old_page);
2161 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2238 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2162 (VM_WRITE|VM_SHARED))) { 2239 (VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2220 } 2297 }
2221 dirty_page = old_page; 2298 dirty_page = old_page;
2222 get_page(dirty_page); 2299 get_page(dirty_page);
2223 reuse = 1;
2224 }
2225 2300
2226 if (reuse) {
2227reuse: 2301reuse:
2228 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2302 flush_cache_page(vma, address, pte_pfn(orig_pte));
2229 entry = pte_mkyoung(orig_pte); 2303 entry = pte_mkyoung(orig_pte);
2230 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2304 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2231 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2305 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2232 update_mmu_cache(vma, address, page_table); 2306 update_mmu_cache(vma, address, page_table);
2307 pte_unmap_unlock(page_table, ptl);
2233 ret |= VM_FAULT_WRITE; 2308 ret |= VM_FAULT_WRITE;
2234 goto unlock; 2309
2310 if (!dirty_page)
2311 return ret;
2312
2313 /*
2314 * Yes, Virginia, this is actually required to prevent a race
2315 * with clear_page_dirty_for_io() from clearing the page dirty
2316 * bit after it clear all dirty ptes, but before a racing
2317 * do_wp_page installs a dirty pte.
2318 *
2319 * do_no_page is protected similarly.
2320 */
2321 if (!page_mkwrite) {
2322 wait_on_page_locked(dirty_page);
2323 set_page_dirty_balance(dirty_page, page_mkwrite);
2324 }
2325 put_page(dirty_page);
2326 if (page_mkwrite) {
2327 struct address_space *mapping = dirty_page->mapping;
2328
2329 set_page_dirty(dirty_page);
2330 unlock_page(dirty_page);
2331 page_cache_release(dirty_page);
2332 if (mapping) {
2333 /*
2334 * Some device drivers do not set page.mapping
2335 * but still dirty their pages
2336 */
2337 balance_dirty_pages_ratelimited(mapping);
2338 }
2339 }
2340
2341 /* file_update_time outside page_lock */
2342 if (vma->vm_file)
2343 file_update_time(vma->vm_file);
2344
2345 return ret;
2235 } 2346 }
2236 2347
2237 /* 2348 /*
@@ -2337,39 +2448,6 @@ gotten:
2337 page_cache_release(old_page); 2448 page_cache_release(old_page);
2338unlock: 2449unlock:
2339 pte_unmap_unlock(page_table, ptl); 2450 pte_unmap_unlock(page_table, ptl);
2340 if (dirty_page) {
2341 /*
2342 * Yes, Virginia, this is actually required to prevent a race
2343 * with clear_page_dirty_for_io() from clearing the page dirty
2344 * bit after it clear all dirty ptes, but before a racing
2345 * do_wp_page installs a dirty pte.
2346 *
2347 * do_no_page is protected similarly.
2348 */
2349 if (!page_mkwrite) {
2350 wait_on_page_locked(dirty_page);
2351 set_page_dirty_balance(dirty_page, page_mkwrite);
2352 }
2353 put_page(dirty_page);
2354 if (page_mkwrite) {
2355 struct address_space *mapping = dirty_page->mapping;
2356
2357 set_page_dirty(dirty_page);
2358 unlock_page(dirty_page);
2359 page_cache_release(dirty_page);
2360 if (mapping) {
2361 /*
2362 * Some device drivers do not set page.mapping
2363 * but still dirty their pages
2364 */
2365 balance_dirty_pages_ratelimited(mapping);
2366 }
2367 }
2368
2369 /* file_update_time outside page_lock */
2370 if (vma->vm_file)
2371 file_update_time(vma->vm_file);
2372 }
2373 return ret; 2451 return ret;
2374oom_free_new: 2452oom_free_new:
2375 page_cache_release(new_page); 2453 page_cache_release(new_page);
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3147 * but allow concurrent faults), and pte mapped but not yet locked. 3225 * but allow concurrent faults), and pte mapped but not yet locked.
3148 * We return with mmap_sem still held, but pte unmapped and unlocked. 3226 * We return with mmap_sem still held, but pte unmapped and unlocked.
3149 */ 3227 */
3150static inline int handle_pte_fault(struct mm_struct *mm, 3228int handle_pte_fault(struct mm_struct *mm,
3151 struct vm_area_struct *vma, unsigned long address, 3229 struct vm_area_struct *vma, unsigned long address,
3152 pte_t *pte, pmd_t *pmd, unsigned int flags) 3230 pte_t *pte, pmd_t *pmd, unsigned int flags)
3153{ 3231{
3154 pte_t entry; 3232 pte_t entry;
3155 spinlock_t *ptl; 3233 spinlock_t *ptl;
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 pmd = pmd_alloc(mm, pud, address); 3306 pmd = pmd_alloc(mm, pud, address);
3229 if (!pmd) 3307 if (!pmd)
3230 return VM_FAULT_OOM; 3308 return VM_FAULT_OOM;
3231 pte = pte_alloc_map(mm, pmd, address); 3309 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3232 if (!pte) 3310 if (!vma->vm_ops)
3311 return do_huge_pmd_anonymous_page(mm, vma, address,
3312 pmd, flags);
3313 } else {
3314 pmd_t orig_pmd = *pmd;
3315 barrier();
3316 if (pmd_trans_huge(orig_pmd)) {
3317 if (flags & FAULT_FLAG_WRITE &&
3318 !pmd_write(orig_pmd) &&
3319 !pmd_trans_splitting(orig_pmd))
3320 return do_huge_pmd_wp_page(mm, vma, address,
3321 pmd, orig_pmd);
3322 return 0;
3323 }
3324 }
3325
3326 /*
3327 * Use __pte_alloc instead of pte_alloc_map, because we can't
 3328 * run pte_offset_map on the pmd, if a huge pmd could
 3329 * materialize under us from a different thread.
3330 */
3331 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3233 return VM_FAULT_OOM; 3332 return VM_FAULT_OOM;
 3333 /* if a huge pmd materialized under us, just retry later */
3334 if (unlikely(pmd_trans_huge(*pmd)))
3335 return 0;
3336 /*
3337 * A regular pmd is established and it can't morph into a huge pmd
3338 * from under us anymore at this point because we hold the mmap_sem
 3339 * in read mode and khugepaged takes it in write mode. So now it's
3340 * safe to run pte_offset_map().
3341 */
3342 pte = pte_offset_map(pmd, address);
3234 3343
3235 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3344 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3236} 3345}
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3296 vma = find_vma(current->mm, addr); 3405 vma = find_vma(current->mm, addr);
3297 if (!vma) 3406 if (!vma)
3298 return -ENOMEM; 3407 return -ENOMEM;
3299 write = (vma->vm_flags & VM_WRITE) != 0; 3408 /*
3409 * We want to touch writable mappings with a write fault in order
3410 * to break COW, except for shared mappings because these don't COW
3411 * and we would not want to dirty them for nothing.
3412 */
3413 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3300 BUG_ON(addr >= end); 3414 BUG_ON(addr >= end);
3301 BUG_ON(end > vma->vm_end); 3415 BUG_ON(end > vma->vm_end);
3302 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3416 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
3368 goto out; 3482 goto out;
3369 3483
3370 pmd = pmd_offset(pud, address); 3484 pmd = pmd_offset(pud, address);
3485 VM_BUG_ON(pmd_trans_huge(*pmd));
3371 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3486 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3372 goto out; 3487 goto out;
3373 3488
@@ -3608,3 +3723,74 @@ void might_fault(void)
3608} 3723}
3609EXPORT_SYMBOL(might_fault); 3724EXPORT_SYMBOL(might_fault);
3610#endif 3725#endif
3726
3727#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3728static void clear_gigantic_page(struct page *page,
3729 unsigned long addr,
3730 unsigned int pages_per_huge_page)
3731{
3732 int i;
3733 struct page *p = page;
3734
3735 might_sleep();
3736 for (i = 0; i < pages_per_huge_page;
3737 i++, p = mem_map_next(p, page, i)) {
3738 cond_resched();
3739 clear_user_highpage(p, addr + i * PAGE_SIZE);
3740 }
3741}
3742void clear_huge_page(struct page *page,
3743 unsigned long addr, unsigned int pages_per_huge_page)
3744{
3745 int i;
3746
3747 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3748 clear_gigantic_page(page, addr, pages_per_huge_page);
3749 return;
3750 }
3751
3752 might_sleep();
3753 for (i = 0; i < pages_per_huge_page; i++) {
3754 cond_resched();
3755 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3756 }
3757}
3758
3759static void copy_user_gigantic_page(struct page *dst, struct page *src,
3760 unsigned long addr,
3761 struct vm_area_struct *vma,
3762 unsigned int pages_per_huge_page)
3763{
3764 int i;
3765 struct page *dst_base = dst;
3766 struct page *src_base = src;
3767
3768 for (i = 0; i < pages_per_huge_page; ) {
3769 cond_resched();
3770 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3771
3772 i++;
3773 dst = mem_map_next(dst, dst_base, i);
3774 src = mem_map_next(src, src_base, i);
3775 }
3776}
3777
3778void copy_user_huge_page(struct page *dst, struct page *src,
3779 unsigned long addr, struct vm_area_struct *vma,
3780 unsigned int pages_per_huge_page)
3781{
3782 int i;
3783
3784 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3785 copy_user_gigantic_page(dst, src, addr, vma,
3786 pages_per_huge_page);
3787 return;
3788 }
3789
3790 might_sleep();
3791 for (i = 0; i < pages_per_huge_page; i++) {
3792 cond_resched();
3793 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3794 }
3795}
3796#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
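The make_pages_present() hunk above selects a write fault only for private writable mappings with a single mask test. A tiny standalone check of that idiom; the DEMO_* flag values are made up and merely stand in for VM_WRITE and VM_SHARED:

#include <stdio.h>

#define DEMO_VM_WRITE  0x1
#define DEMO_VM_SHARED 0x2

/* true only for private (non-shared) writable mappings */
static int want_write_fault(unsigned long vm_flags)
{
        return (vm_flags & (DEMO_VM_WRITE | DEMO_VM_SHARED)) == DEMO_VM_WRITE;
}

int main(void)
{
        printf("private read-only : %d\n", want_write_fault(0));
        printf("private writable  : %d\n", want_write_fault(DEMO_VM_WRITE));
        printf("shared writable   : %d\n",
               want_write_fault(DEMO_VM_WRITE | DEMO_VM_SHARED));
        return 0;
}

It prints 0, 1, 0: only private writable mappings get the COW-breaking write fault, shared mappings are made present without being dirtied.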
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2c6523af5473..e92f04749fcb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
82 82
83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
84#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
85static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
86{ 87{
87 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
88 SetPagePrivate(page); 89 SetPagePrivate(page);
89 set_page_private(page, info); 90 set_page_private(page, info);
90 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
94 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
95void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
96{ 97{
97 int type; 98 unsigned long type;
98 99
99 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
100 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
101 103
102 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
103 ClearPagePrivate(page); 105 ClearPagePrivate(page);
104 set_page_private(page, 0); 106 set_page_private(page, 0);
105 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
106 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
107 } 109 }
108 110
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
733 goto out; 735 goto out;
734 } 736 }
735 /* this function returns # of failed pages */ 737 /* this function returns # of failed pages */
736 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); 738 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
739 true, true);
737 if (ret) 740 if (ret)
738 putback_lru_pages(&source); 741 putback_lru_pages(&source);
739 } 742 }
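get_page_bootmem()/put_page_bootmem() above move the bootmem type from page->_mapcount into the otherwise unused page->lru.next pointer. A userspace sketch of parking a small integer tag in a spare pointer field; the struct and helpers are invented for the demo:

#include <stdio.h>

struct demo_page {
        struct demo_page *lru_next;     /* stands in for page->lru.next */
};

static void set_bootmem_type(struct demo_page *p, unsigned long type)
{
        p->lru_next = (struct demo_page *)type;    /* tag stored as a fake pointer */
}

static unsigned long get_bootmem_type(struct demo_page *p)
{
        return (unsigned long)p->lru_next;         /* cast back to the tag */
}

int main(void)
{
        struct demo_page page = { 0 };

        set_bootmem_type(&page, 3);                /* e.g. a SECTION_INFO-like tag */
        printf("stored type = %lu\n", get_bootmem_type(&page));
        return 0;
}

The trick is only valid while the structure is off every list, which is exactly the window the bootmem code uses.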
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11ff260fb282..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 514 pmd = pmd_offset(pud, addr);
515 do { 515 do {
516 next = pmd_addr_end(addr, end); 516 next = pmd_addr_end(addr, end);
517 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 518 if (pmd_none_or_clear_bad(pmd))
518 continue; 519 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 520 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
935 return PTR_ERR(vma); 936 return PTR_ERR(vma);
936 937
937 if (!list_empty(&pagelist)) { 938 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 0); 939 err = migrate_pages(&pagelist, new_node_page, dest,
940 false, true);
939 if (err) 941 if (err)
940 putback_lru_pages(&pagelist); 942 putback_lru_pages(&pagelist);
941 } 943 }
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1155 1157
1156 if (!list_empty(&pagelist)) { 1158 if (!list_empty(&pagelist)) {
1157 nr_failed = migrate_pages(&pagelist, new_vma_page, 1159 nr_failed = migrate_pages(&pagelist, new_vma_page,
1158 (unsigned long)vma, 0); 1160 (unsigned long)vma,
1161 false, true);
1159 if (nr_failed) 1162 if (nr_failed)
1160 putback_lru_pages(&pagelist); 1163 putback_lru_pages(&pagelist);
1161 } 1164 }
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1308 1311
1309 /* Find the mm_struct */ 1312 /* Find the mm_struct */
1310 rcu_read_lock(); 1313 rcu_read_lock();
1311 read_lock(&tasklist_lock);
1312 task = pid ? find_task_by_vpid(pid) : current; 1314 task = pid ? find_task_by_vpid(pid) : current;
1313 if (!task) { 1315 if (!task) {
1314 read_unlock(&tasklist_lock);
1315 rcu_read_unlock(); 1316 rcu_read_unlock();
1316 err = -ESRCH; 1317 err = -ESRCH;
1317 goto out; 1318 goto out;
1318 } 1319 }
1319 mm = get_task_mm(task); 1320 mm = get_task_mm(task);
1320 read_unlock(&tasklist_lock);
1321 rcu_read_unlock(); 1321 rcu_read_unlock();
1322 1322
1323 err = -EINVAL; 1323 err = -EINVAL;
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1796} 1796}
1797 1797
1798/** 1798/**
1799 * alloc_page_vma - Allocate a page for a VMA. 1799 * alloc_pages_vma - Allocate a page for a VMA.
1800 * 1800 *
1801 * @gfp: 1801 * @gfp:
1802 * %GFP_USER user allocation. 1802 * %GFP_USER user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1805 * %GFP_FS allocation should not call back into a file system. 1805 * %GFP_FS allocation should not call back into a file system.
1806 * %GFP_ATOMIC don't sleep. 1806 * %GFP_ATOMIC don't sleep.
1807 * 1807 *
1808 * @order:Order of the GFP allocation.
1808 * @vma: Pointer to VMA or NULL if not available. 1809 * @vma: Pointer to VMA or NULL if not available.
1809 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1810 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1810 * 1811 *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1818 * Should be called with the mmap_sem of the vma held. 1819 * Should be called with the mmap_sem of the vma held.
1819 */ 1820 */
1820struct page * 1821struct page *
1821alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr)
1822{ 1824{
1823 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1825 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1824 struct zonelist *zl; 1826 struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1830 1832
1831 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1832 mpol_cond_put(pol); 1834 mpol_cond_put(pol);
1833 page = alloc_page_interleave(gfp, 0, nid); 1835 page = alloc_page_interleave(gfp, order, nid);
1834 put_mems_allowed(); 1836 put_mems_allowed();
1835 return page; 1837 return page;
1836 } 1838 }
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1839 /* 1841 /*
1840 * slow path: ref counted shared policy 1842 * slow path: ref counted shared policy
1841 */ 1843 */
1842 struct page *page = __alloc_pages_nodemask(gfp, 0, 1844 struct page *page = __alloc_pages_nodemask(gfp, order,
1843 zl, policy_nodemask(gfp, pol)); 1845 zl, policy_nodemask(gfp, pol));
1844 __mpol_put(pol); 1846 __mpol_put(pol);
1845 put_mems_allowed(); 1847 put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1848 /* 1850 /*
1849 * fast path: default or task policy 1851 * fast path: default or task policy
1850 */ 1852 */
1851 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1853 page = __alloc_pages_nodemask(gfp, order, zl,
1854 policy_nodemask(gfp, pol));
1852 put_mems_allowed(); 1855 put_mems_allowed();
1853 return page; 1856 return page;
1854} 1857}
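alloc_page_vma() grows into alloc_pages_vma() with an order argument that is forwarded to both the interleave and the nodemask allocation paths. A hedged userspace stand-in showing how an order-0 convenience wrapper can sit on top of the order-aware call; aligned_alloc plays the role of the page allocator and all names are illustrative:

#include <stdio.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE 4096UL

static void *demo_alloc_pages(int order)
{
        size_t bytes = DEMO_PAGE_SIZE << order;    /* 2^order contiguous pages */

        return aligned_alloc(DEMO_PAGE_SIZE, bytes);
}

static void *demo_alloc_page(void)
{
        return demo_alloc_pages(0);                /* old single-page interface */
}

int main(void)
{
        void *one  = demo_alloc_page();
        void *huge = demo_alloc_pages(9);          /* 2 MB worth of pages */

        printf("order 0 -> %lu bytes, order 9 -> %lu bytes\n",
               DEMO_PAGE_SIZE, DEMO_PAGE_SIZE << 9);
        free(one);
        free(huge);
        return 0;
}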
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..46fe8cc13d67 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
113 goto out; 113 goto out;
114 114
115 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
116 if (pmd_trans_huge(*pmd))
117 goto out;
116 if (!pmd_present(*pmd)) 118 if (!pmd_present(*pmd))
117 goto out; 119 goto out;
118 120
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
246 248
247 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
248 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
249 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
250 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
251 return -EAGAIN; 253 return -EAGAIN;
252 } 254 }
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
318 320
319 expected_count = 2 + page_has_private(page); 321 expected_count = 2 + page_has_private(page);
320 if (page_count(page) != expected_count || 322 if (page_count(page) != expected_count ||
321 (struct page *)radix_tree_deref_slot(pslot) != page) { 323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
322 spin_unlock_irq(&mapping->tree_lock); 324 spin_unlock_irq(&mapping->tree_lock);
323 return -EAGAIN; 325 return -EAGAIN;
324 } 326 }
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
614 * to the newly allocated page in newpage. 616 * to the newly allocated page in newpage.
615 */ 617 */
616static int unmap_and_move(new_page_t get_new_page, unsigned long private, 618static int unmap_and_move(new_page_t get_new_page, unsigned long private,
617 struct page *page, int force, int offlining) 619 struct page *page, int force, bool offlining, bool sync)
618{ 620{
619 int rc = 0; 621 int rc = 0;
620 int *result = NULL; 622 int *result = NULL;
621 struct page *newpage = get_new_page(page, private, &result); 623 struct page *newpage = get_new_page(page, private, &result);
622 int remap_swapcache = 1; 624 int remap_swapcache = 1;
623 int rcu_locked = 0;
624 int charge = 0; 625 int charge = 0;
625 struct mem_cgroup *mem = NULL; 626 struct mem_cgroup *mem = NULL;
626 struct anon_vma *anon_vma = NULL; 627 struct anon_vma *anon_vma = NULL;
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
632 /* page was freed from under us. So we are done. */ 633 /* page was freed from under us. So we are done. */
633 goto move_newpage; 634 goto move_newpage;
634 } 635 }
636 if (unlikely(PageTransHuge(page)))
637 if (unlikely(split_huge_page(page)))
638 goto move_newpage;
635 639
636 /* prepare cgroup just returns 0 or -ENOMEM */ 640 /* prepare cgroup just returns 0 or -ENOMEM */
637 rc = -EAGAIN; 641 rc = -EAGAIN;
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 if (!trylock_page(page)) { 643 if (!trylock_page(page)) {
640 if (!force) 644 if (!force)
641 goto move_newpage; 645 goto move_newpage;
646
647 /*
648 * It's not safe for direct compaction to call lock_page.
649 * For example, during page readahead pages are added locked
650 * to the LRU. Later, when the IO completes the pages are
651 * marked uptodate and unlocked. However, the queueing
652 * could be merging multiple pages for one bio (e.g.
653 * mpage_readpages). If an allocation happens for the
654 * second or third page, the process can end up locking
655 * the same page twice and deadlocking. Rather than
656 * trying to be clever about what pages can be locked,
657 * avoid the use of lock_page for direct compaction
658 * altogether.
659 */
660 if (current->flags & PF_MEMALLOC)
661 goto move_newpage;
662
642 lock_page(page); 663 lock_page(page);
643 } 664 }
644 665
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
665 BUG_ON(charge); 686 BUG_ON(charge);
666 687
667 if (PageWriteback(page)) { 688 if (PageWriteback(page)) {
668 if (!force) 689 if (!force || !sync)
669 goto uncharge; 690 goto uncharge;
670 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
671 } 692 }
672 /* 693 /*
673 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 694 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
674 * we cannot notice that anon_vma is freed while we migrate a page. 695 * we cannot notice that anon_vma is freed while we migrate a page.
675 * This rcu_read_lock() delays freeing anon_vma pointer until the end 696 * This get_anon_vma() delays freeing anon_vma pointer until the end
676 * of migration. File cache pages are no problem because of page_lock() 697 * of migration. File cache pages are no problem because of page_lock()
677 * File Caches may use write_page() or lock_page() in migration, then, 698 * File Caches may use write_page() or lock_page() in migration, then,
678 * just care Anon page here. 699 * just care Anon page here.
679 */ 700 */
680 if (PageAnon(page)) { 701 if (PageAnon(page)) {
681 rcu_read_lock(); 702 /*
682 rcu_locked = 1; 703 * Only page_lock_anon_vma() understands the subtleties of
683 704 * getting a hold on an anon_vma from outside one of its mms.
684 /* Determine how to safely use anon_vma */ 705 */
685 if (!page_mapped(page)) { 706 anon_vma = page_lock_anon_vma(page);
686 if (!PageSwapCache(page)) 707 if (anon_vma) {
687 goto rcu_unlock; 708 /*
688 709 * Take a reference count on the anon_vma if the
710 * page is mapped so that it is guaranteed to
711 * exist when the page is remapped later
712 */
713 get_anon_vma(anon_vma);
714 page_unlock_anon_vma(anon_vma);
715 } else if (PageSwapCache(page)) {
689 /* 716 /*
690 * We cannot be sure that the anon_vma of an unmapped 717 * We cannot be sure that the anon_vma of an unmapped
691 * swapcache page is safe to use because we don't 718 * swapcache page is safe to use because we don't
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
700 */ 727 */
701 remap_swapcache = 0; 728 remap_swapcache = 0;
702 } else { 729 } else {
703 /* 730 goto uncharge;
704 * Take a reference count on the anon_vma if the
705 * page is mapped so that it is guaranteed to
706 * exist when the page is remapped later
707 */
708 anon_vma = page_anon_vma(page);
709 get_anon_vma(anon_vma);
710 } 731 }
711 } 732 }
712 733
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
723 * free the metadata, so the page can be freed. 744 * free the metadata, so the page can be freed.
724 */ 745 */
725 if (!page->mapping) { 746 if (!page->mapping) {
726 if (!PageAnon(page) && page_has_private(page)) { 747 VM_BUG_ON(PageAnon(page));
727 /* 748 if (page_has_private(page)) {
728 * Go direct to try_to_free_buffers() here because
729 * a) that's what try_to_release_page() would do anyway
730 * b) we may be under rcu_read_lock() here, so we can't
731 * use GFP_KERNEL which is what try_to_release_page()
732 * needs to be effective.
733 */
734 try_to_free_buffers(page); 749 try_to_free_buffers(page);
735 goto rcu_unlock; 750 goto uncharge;
736 } 751 }
737 goto skip_unmap; 752 goto skip_unmap;
738 } 753 }
@@ -746,17 +761,14 @@ skip_unmap:
746 761
747 if (rc && remap_swapcache) 762 if (rc && remap_swapcache)
748 remove_migration_ptes(page, page); 763 remove_migration_ptes(page, page);
749rcu_unlock:
750 764
751 /* Drop an anon_vma reference if we took one */ 765 /* Drop an anon_vma reference if we took one */
752 if (anon_vma) 766 if (anon_vma)
753 drop_anon_vma(anon_vma); 767 drop_anon_vma(anon_vma);
754 768
755 if (rcu_locked)
756 rcu_read_unlock();
757uncharge: 769uncharge:
758 if (!charge) 770 if (!charge)
759 mem_cgroup_end_migration(mem, page, newpage); 771 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
760unlock: 772unlock:
761 unlock_page(page); 773 unlock_page(page);
762 774
@@ -810,12 +822,11 @@ move_newpage:
810 */ 822 */
811static int unmap_and_move_huge_page(new_page_t get_new_page, 823static int unmap_and_move_huge_page(new_page_t get_new_page,
812 unsigned long private, struct page *hpage, 824 unsigned long private, struct page *hpage,
813 int force, int offlining) 825 int force, bool offlining, bool sync)
814{ 826{
815 int rc = 0; 827 int rc = 0;
816 int *result = NULL; 828 int *result = NULL;
817 struct page *new_hpage = get_new_page(hpage, private, &result); 829 struct page *new_hpage = get_new_page(hpage, private, &result);
818 int rcu_locked = 0;
819 struct anon_vma *anon_vma = NULL; 830 struct anon_vma *anon_vma = NULL;
820 831
821 if (!new_hpage) 832 if (!new_hpage)
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
824 rc = -EAGAIN; 835 rc = -EAGAIN;
825 836
826 if (!trylock_page(hpage)) { 837 if (!trylock_page(hpage)) {
827 if (!force) 838 if (!force || !sync)
828 goto out; 839 goto out;
829 lock_page(hpage); 840 lock_page(hpage);
830 } 841 }
831 842
832 if (PageAnon(hpage)) { 843 if (PageAnon(hpage)) {
833 rcu_read_lock(); 844 anon_vma = page_lock_anon_vma(hpage);
834 rcu_locked = 1; 845 if (anon_vma) {
835 846 get_anon_vma(anon_vma);
836 if (page_mapped(hpage)) { 847 page_unlock_anon_vma(anon_vma);
837 anon_vma = page_anon_vma(hpage);
838 atomic_inc(&anon_vma->external_refcount);
839 } 848 }
840 } 849 }
841 850
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
847 if (rc) 856 if (rc)
848 remove_migration_ptes(hpage, hpage); 857 remove_migration_ptes(hpage, hpage);
849 858
850 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, 859 if (anon_vma)
851 &anon_vma->lock)) { 860 drop_anon_vma(anon_vma);
852 int empty = list_empty(&anon_vma->head);
853 spin_unlock(&anon_vma->lock);
854 if (empty)
855 anon_vma_free(anon_vma);
856 }
857
858 if (rcu_locked)
859 rcu_read_unlock();
860out: 861out:
861 unlock_page(hpage); 862 unlock_page(hpage);
862 863
@@ -892,7 +893,8 @@ out:
892 * Return: Number of pages not migrated or error code. 893 * Return: Number of pages not migrated or error code.
893 */ 894 */
894int migrate_pages(struct list_head *from, 895int migrate_pages(struct list_head *from,
895 new_page_t get_new_page, unsigned long private, int offlining) 896 new_page_t get_new_page, unsigned long private, bool offlining,
897 bool sync)
896{ 898{
897 int retry = 1; 899 int retry = 1;
898 int nr_failed = 0; 900 int nr_failed = 0;
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from,
912 cond_resched(); 914 cond_resched();
913 915
914 rc = unmap_and_move(get_new_page, private, 916 rc = unmap_and_move(get_new_page, private,
915 page, pass > 2, offlining); 917 page, pass > 2, offlining,
918 sync);
916 919
917 switch(rc) { 920 switch(rc) {
918 case -ENOMEM: 921 case -ENOMEM:
@@ -941,7 +944,8 @@ out:
941} 944}
942 945
943int migrate_huge_pages(struct list_head *from, 946int migrate_huge_pages(struct list_head *from,
944 new_page_t get_new_page, unsigned long private, int offlining) 947 new_page_t get_new_page, unsigned long private, bool offlining,
948 bool sync)
945{ 949{
946 int retry = 1; 950 int retry = 1;
947 int nr_failed = 0; 951 int nr_failed = 0;
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from,
957 cond_resched(); 961 cond_resched();
958 962
959 rc = unmap_and_move_huge_page(get_new_page, 963 rc = unmap_and_move_huge_page(get_new_page,
960 private, page, pass > 2, offlining); 964 private, page, pass > 2, offlining,
965 sync);
961 966
962 switch(rc) { 967 switch(rc) {
963 case -ENOMEM: 968 case -ENOMEM:
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1042 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1047 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1043 goto set_status; 1048 goto set_status;
1044 1049
1045 page = follow_page(vma, pp->addr, FOLL_GET); 1050 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1046 1051
1047 err = PTR_ERR(page); 1052 err = PTR_ERR(page);
1048 if (IS_ERR(page)) 1053 if (IS_ERR(page))
@@ -1090,7 +1095,7 @@ set_status:
1090 err = 0; 1095 err = 0;
1091 if (!list_empty(&pagelist)) { 1096 if (!list_empty(&pagelist)) {
1092 err = migrate_pages(&pagelist, new_page_node, 1097 err = migrate_pages(&pagelist, new_page_node,
1093 (unsigned long)pm, 0); 1098 (unsigned long)pm, 0, true);
1094 if (err) 1099 if (err)
1095 putback_lru_pages(&pagelist); 1100 putback_lru_pages(&pagelist);
1096 } 1101 }
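The migrate.c changes above make the page-lock policy explicit: trylock first, block only when the caller both insists (force) and is allowed to sleep for it (sync), and never from a PF_MEMALLOC context such as direct compaction. A compact pthread sketch of that decision tree, with an invented in_reclaim flag standing in for PF_MEMALLOC (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static bool try_lock_for_migration(bool force, bool sync, bool in_reclaim)
{
        if (pthread_mutex_trylock(&page_lock) == 0)
                return true;                    /* got it without blocking */
        if (!force || !sync)
                return false;                   /* async or gentle pass: give up */
        if (in_reclaim)
                return false;                   /* avoid the readahead-style deadlock */
        pthread_mutex_lock(&page_lock);         /* last resort: block for the lock */
        return true;
}

int main(void)
{
        if (try_lock_for_migration(true, true, false)) {
                puts("locked the page");
                pthread_mutex_unlock(&page_lock);
        }
        return 0;
}

Only the lock decision is modeled; the real code additionally takes the anon_vma reference and uncharges the memcg on the bail-out paths.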
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
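Whether mincore_pmd_range() answers for a whole huge pmd at once or walks the individual ptes, userspace still receives one residency byte per small page. A plain mincore(2) demo on an anonymous mapping where only the first four pages have been touched:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        size_t len = 16 * psize;
        unsigned char vec[16];
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, 4 * psize);        /* touch only the first 4 pages */
        if (mincore(p, len, vec) == 0) {
                for (int i = 0; i < 16; i++)
                        printf("%d", vec[i] & 1);   /* 1 = resident */
                putchar('\n');
        }
        munmap(p, len);
        return 0;
}

Typical output is 1111000000000000: the touched pages are resident, the rest of the anonymous range is not.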
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 155 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 156 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 157static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end,
159 int *nonblocking)
159{ 160{
160 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 162 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 163 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 169 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 170 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 171
173 gup_flags = FOLL_TOUCH | FOLL_GET; 172 gup_flags = FOLL_TOUCH;
174 if (vma->vm_flags & VM_WRITE) 173 /*
174 * We want to touch writable mappings with a write fault in order
175 * to break COW, except for shared mappings because these don't COW
176 * and we would not want to dirty them for nothing.
177 */
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
176 180
181 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK;
183
177 /* We don't try to access the guard page of a stack vma */ 184 /* We don't try to access the guard page of a stack vma */
178 if (stack_guard_page(vma, start)) { 185 if (stack_guard_page(vma, start)) {
179 addr += PAGE_SIZE; 186 addr += PAGE_SIZE;
180 nr_pages--; 187 nr_pages--;
181 } 188 }
182 189
183 while (nr_pages > 0) { 190 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
184 int i; 191 NULL, NULL, nonblocking);
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238
239 return ret; /* 0 or negative error code */
240} 192}
241 193
242/* 194/*
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
280 is_vm_hugetlb_page(vma) || 232 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 233 vma == get_gate_vma(current))) {
282 234
283 __mlock_vma_pages_range(vma, start, end); 235 __mlock_vma_pages_range(vma, start, end, NULL);
284 236
285 /* Hide errors from mmap() and other callers */ 237 /* Hide errors from mmap() and other callers */
286 return 0; 238 return 0;
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
372 int ret = 0; 324 int ret = 0;
373 int lock = newflags & VM_LOCKED; 325 int lock = newflags & VM_LOCKED;
374 326
375 if (newflags == vma->vm_flags || 327 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 328 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
377 goto out; /* don't set VM_LOCKED, don't count */ 329 goto out; /* don't set VM_LOCKED, don't count */
378 330
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 331 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 332 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 333 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +363,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 363 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 364 */
421 365
422 if (lock) { 366 if (lock)
423 vma->vm_flags = newflags; 367 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 368 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 369 munlock_vma_pages_range(vma, start, end);
429 }
430 370
431out: 371out:
432 *prev = vma; 372 *prev = vma;
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 379 struct vm_area_struct * vma, * prev;
440 int error; 380 int error;
441 381
442 len = PAGE_ALIGN(len); 382 VM_BUG_ON(start & ~PAGE_MASK);
383 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 384 end = start + len;
444 if (end < start) 385 if (end < start)
445 return -EINVAL; 386 return -EINVAL;
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 423 return error;
483} 424}
484 425
426static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
427{
428 struct mm_struct *mm = current->mm;
429 unsigned long end, nstart, nend;
430 struct vm_area_struct *vma = NULL;
431 int locked = 0;
432 int ret = 0;
433
434 VM_BUG_ON(start & ~PAGE_MASK);
435 VM_BUG_ON(len != PAGE_ALIGN(len));
436 end = start + len;
437
438 for (nstart = start; nstart < end; nstart = nend) {
439 /*
440 * We want to fault in pages for [nstart; end) address range.
441 * Find first corresponding VMA.
442 */
443 if (!locked) {
444 locked = 1;
445 down_read(&mm->mmap_sem);
446 vma = find_vma(mm, nstart);
447 } else if (nstart >= vma->vm_end)
448 vma = vma->vm_next;
449 if (!vma || vma->vm_start >= end)
450 break;
451 /*
452 * Set [nstart; nend) to intersection of desired address
453 * range with the first VMA. Also, skip undesirable VMA types.
454 */
455 nend = min(end, vma->vm_end);
456 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
457 continue;
458 if (nstart < vma->vm_start)
459 nstart = vma->vm_start;
460 /*
461 * Now fault in a range of pages. __mlock_vma_pages_range()
462 * double checks the vma flags, so that it won't mlock pages
463 * if the vma was already munlocked.
464 */
465 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
466 if (ret < 0) {
467 if (ignore_errors) {
468 ret = 0;
469 continue; /* continue at next VMA */
470 }
471 ret = __mlock_posix_error_return(ret);
472 break;
473 }
474 nend = nstart + ret * PAGE_SIZE;
475 ret = 0;
476 }
477 if (locked)
478 up_read(&mm->mmap_sem);
479 return ret; /* 0 or negative error code */
480}
481
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 482SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 483{
487 unsigned long locked; 484 unsigned long locked;
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 504 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 505 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 506 up_write(&current->mm->mmap_sem);
507 if (!error)
508 error = do_mlock_pages(start, len, 0);
510 return error; 509 return error;
511} 510}
512 511
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 570 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 571 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 572 up_write(&current->mm->mmap_sem);
573 if (!ret && (flags & MCL_CURRENT)) {
574 /* Ignore errors */
575 do_mlock_pages(0, TASK_SIZE, 1);
576 }
574out: 577out:
575 return ret; 578 return ret;
576} 579}
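With do_mlock_pages() the fault-in step runs after mmap_sem is dropped from write mode, but the userspace contract is unchanged: once mlock() returns successfully the range is resident. A small demo pairing mlock(2) with mincore(2); it may need a raised RLIMIT_MEMLOCK to succeed:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        size_t len = 8 * psize;
        unsigned char vec[8];
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        if (mlock(p, len) != 0) {
                perror("mlock");        /* RLIMIT_MEMLOCK too low? */
                return 1;
        }
        mincore(p, len, vec);
        for (int i = 0; i < 8; i++)
                printf("%d", vec[i] & 1);   /* expect all 1s after mlock */
        putchar('\n');
        munlock(p, len);
        munmap(p, len);
        return 0;
}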
diff --git a/mm/mmap.c b/mm/mmap.c
index 50a4aa0255a0..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
253 down_write(&mm->mmap_sem); 254 down_write(&mm->mmap_sem);
254 255
255#ifdef CONFIG_COMPAT_BRK 256#ifdef CONFIG_COMPAT_BRK
256 min_brk = mm->end_code; 257 /*
258 * CONFIG_COMPAT_BRK can still be overridden by setting
259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted
261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data))
263 min_brk = mm->start_brk;
264 else
265 min_brk = mm->end_data;
257#else 266#else
258 min_brk = mm->start_brk; 267 min_brk = mm->start_brk;
259#endif 268#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
588 } 597 }
589 } 598 }
590 599
600 vma_adjust_trans_huge(vma, start, end, adjust_next);
601
591 /* 602 /*
592 * When changing only vma->vm_end, we don't really need anon_vma 603 * When changing only vma->vm_end, we don't really need anon_vma
593 * lock. This is a fairly rare case by itself, but the anon_vma 604 * lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
815 end, prev->vm_pgoff, NULL); 826 end, prev->vm_pgoff, NULL);
816 if (err) 827 if (err)
817 return NULL; 828 return NULL;
829 khugepaged_enter_vma_merge(prev);
818 return prev; 830 return prev;
819 } 831 }
820 832
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
833 next->vm_pgoff - pglen, NULL); 845 next->vm_pgoff - pglen, NULL);
834 if (err) 846 if (err)
835 return NULL; 847 return NULL;
848 khugepaged_enter_vma_merge(area);
836 return area; 849 return area;
837 } 850 }
838 851
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1761 } 1774 }
1762 } 1775 }
1763 vma_unlock_anon_vma(vma); 1776 vma_unlock_anon_vma(vma);
1777 khugepaged_enter_vma_merge(vma);
1764 return error; 1778 return error;
1765} 1779}
1766#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1780#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1808 } 1822 }
1809 } 1823 }
1810 vma_unlock_anon_vma(vma); 1824 vma_unlock_anon_vma(vma);
1825 khugepaged_enter_vma_merge(vma);
1811 return error; 1826 return error;
1812} 1827}
1813 1828
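The brk() hunk above picks min_brk depending on whether CONFIG_COMPAT_BRK is in effect and whether the heap start was still randomized above the data segment. A standalone restatement of just that choice, with a made-up page-align helper and example addresses:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL
#define demo_page_align(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

static unsigned long pick_min_brk(unsigned long start_brk,
                                  unsigned long end_data,
                                  int compat_brk)
{
        if (!compat_brk)
                return start_brk;
        /* COMPAT_BRK, but the heap start was still shifted above end_data */
        if (start_brk > demo_page_align(end_data))
                return start_brk;
        return end_data;
}

int main(void)
{
        printf("%#lx\n", pick_min_brk(0x605000, 0x601234, 1));  /* randomized heap */
        printf("%#lx\n", pick_min_brk(0x602000, 0x601234, 1));  /* classic layout  */
        return 0;
}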
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
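__mmu_notifier_test_young() above polls every registered notifier and stops at the first one that reports the address as young. The same early-exit walk over a plain callback table, as an illustrative sketch (the real list is RCU protected and per-mm):

#include <stdio.h>

typedef int (*test_young_fn)(unsigned long address);

static int never_young(unsigned long addr)  { (void)addr; return 0; }
static int always_young(unsigned long addr) { (void)addr; return 1; }

static test_young_fn notifiers[] = { never_young, always_young, NULL };

static int test_young(unsigned long address)
{
        int young = 0;

        for (int i = 0; notifiers[i]; i++) {
                young = notifiers[i](address);
                if (young)
                        break;          /* one positive answer is enough */
        }
        return young;
}

int main(void)
{
        printf("young = %d\n", test_young(0x1000));
        return 0;
}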
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
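change_pmd_range() above splits a huge pmd whenever the mprotect range does not cover exactly HPAGE_PMD_SIZE. From userspace that case is just an ordinary partial-range mprotect(2), as in this demo; whether a huge pmd was backing the page and had to be split underneath is invisible to the caller:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        size_t len = 4UL << 20;                 /* 4 MB, THP-eligible size */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                               /* fault the region in */
        if (mprotect(p + psize, psize, PROT_READ) != 0)
                perror("mprotect");
        else
                puts("one page downgraded to read-only");
        munmap(p, len);
        return 0;
}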
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 150 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 151 if (!old_pmd)
149 continue; 152 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 153 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 154 if (!new_pmd)
152 break; 155 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 156 next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index ef4045d010d5..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
127 127
128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
129 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
130 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
131{ 132{
132 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
133 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 if (force) 186 if (force)
186 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
187 188
188 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
189} 191}
190EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
191 193
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b4edfe7ce06c..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void)
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks. 407 * real-time tasks.
408 */ 408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else 421 else
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
1103int __set_page_dirty_no_writeback(struct page *page) 1106int __set_page_dirty_no_writeback(struct page *page)
1104{ 1107{
1105 if (!PageDirty(page)) 1108 if (!PageDirty(page))
1106 SetPageDirty(page); 1109 return !TestSetPageDirty(page);
1107 return 0; 1110 return 0;
1108} 1111}
1109 1112
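global_dirty_limits() above now computes the dirtyable-memory estimate only when at least one of the dirty limits is ratio based rather than byte based. A back-of-the-envelope model of the limit selection itself, with illustrative numbers:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL

static unsigned long dirty_limit_pages(unsigned long dirty_bytes,
                                       unsigned long dirty_ratio,
                                       unsigned long dirtyable_pages)
{
        if (dirty_bytes)        /* an explicit byte value wins */
                return (dirty_bytes + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
        return dirtyable_pages * dirty_ratio / 100;
}

int main(void)
{
        unsigned long dirtyable = (1UL << 30) / DEMO_PAGE_SIZE;  /* ~1 GB */

        printf("ratio limit : %lu pages\n", dirty_limit_pages(0, 20, dirtyable));
        printf("byte limit  : %lu pages\n",
               dirty_limit_pages(64UL << 20, 20, dirtyable));
        return 0;
}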
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff7e15872398..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
357 } 357 }
358} 358}
359 359
360/* update __split_huge_page_refcount if you change this function */
360static int destroy_compound_page(struct page *page, unsigned long order) 361static int destroy_compound_page(struct page *page, unsigned long order)
361{ 362{
362 int i; 363 int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
426 * 427 *
427 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 428 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
428 */ 429 */
429static inline struct page *
430__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
431{
432 unsigned long buddy_idx = page_idx ^ (1 << order);
433
434 return page + (buddy_idx - page_idx);
435}
436
437static inline unsigned long 430static inline unsigned long
438__find_combined_index(unsigned long page_idx, unsigned int order) 431__find_buddy_index(unsigned long page_idx, unsigned int order)
439{ 432{
440 return (page_idx & ~(1 << order)); 433 return page_idx ^ (1 << order);
441} 434}
442 435
443/* 436/*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
448 * (c) a page and its buddy have the same order && 441 * (c) a page and its buddy have the same order &&
449 * (d) a page and its buddy are in the same zone. 442 * (d) a page and its buddy are in the same zone.
450 * 443 *
451 * For recording whether a page is in the buddy system, we use PG_buddy. 444 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
452 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 445 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
453 * 446 *
454 * For recording page's order, we use page_private(page). 447 * For recording page's order, we use page_private(page).
455 */ 448 */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
482 * as necessary, plus some accounting needed to play nicely with other 475 * as necessary, plus some accounting needed to play nicely with other
483 * parts of the VM system. 476 * parts of the VM system.
484 * At each level, we keep a list of pages, which are heads of continuous 477 * At each level, we keep a list of pages, which are heads of continuous
485 * free pages of length of (1 << order) and marked with PG_buddy. Page's 478 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
486 * order is recorded in page_private(page) field. 479 * order is recorded in page_private(page) field.
487 * So when we are allocating or freeing one, we can derive the state of the 480 * So when we are allocating or freeing one, we can derive the state of the
488 * other. That is, if we allocate a small block, and both were 481 * other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
499{ 492{
500 unsigned long page_idx; 493 unsigned long page_idx;
501 unsigned long combined_idx; 494 unsigned long combined_idx;
495 unsigned long uninitialized_var(buddy_idx);
502 struct page *buddy; 496 struct page *buddy;
503 497
504 if (unlikely(PageCompound(page))) 498 if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
513 VM_BUG_ON(bad_range(zone, page)); 507 VM_BUG_ON(bad_range(zone, page));
514 508
515 while (order < MAX_ORDER-1) { 509 while (order < MAX_ORDER-1) {
516 buddy = __page_find_buddy(page, page_idx, order); 510 buddy_idx = __find_buddy_index(page_idx, order);
511 buddy = page + (buddy_idx - page_idx);
517 if (!page_is_buddy(page, buddy, order)) 512 if (!page_is_buddy(page, buddy, order))
518 break; 513 break;
519 514
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
521 list_del(&buddy->lru); 516 list_del(&buddy->lru);
522 zone->free_area[order].nr_free--; 517 zone->free_area[order].nr_free--;
523 rmv_page_order(buddy); 518 rmv_page_order(buddy);
524 combined_idx = __find_combined_index(page_idx, order); 519 combined_idx = buddy_idx & page_idx;
525 page = page + (combined_idx - page_idx); 520 page = page + (combined_idx - page_idx);
526 page_idx = combined_idx; 521 page_idx = combined_idx;
527 order++; 522 order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
538 */ 533 */
539 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
540 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
541 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = buddy_idx & page_idx;
542 higher_page = page + combined_idx - page_idx; 537 higher_page = page + (combined_idx - page_idx);
543 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 538 buddy_idx = __find_buddy_index(combined_idx, order + 1);
539 higher_buddy = page + (buddy_idx - combined_idx);
544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 540 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
545 list_add_tail(&page->lru, 541 list_add_tail(&page->lru,
546 &zone->free_area[order].free_list[migratetype]); 542 &zone->free_area[order].free_list[migratetype]);
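__find_buddy_index() above replaces the pointer-returning helper with pure index arithmetic: XOR with (1 << order) names the buddy, and AND of the pair is the first index of the merged block. A standalone check of that identity:

#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 12;            /* 0b1100 */

        for (unsigned int order = 0; order < 4; order++) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined  = buddy_idx & page_idx;

                printf("order %u: buddy of %lu is %lu, merged block starts at %lu\n",
                       order, page_idx, buddy_idx, combined);
        }
        return 0;
}

For page 12 this prints buddies 13, 14, 8, 4 with merge start indexes 12, 12, 8, 4, i.e. the page index with bit 'order' cleared.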
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
651 trace_mm_page_free_direct(page, order); 647 trace_mm_page_free_direct(page, order);
652 kmemcheck_free_shadow(page, order); 648 kmemcheck_free_shadow(page, order);
653 649
654 for (i = 0; i < (1 << order); i++) { 650 if (PageAnon(page))
655 struct page *pg = page + i; 651 page->mapping = NULL;
656 652 for (i = 0; i < (1 << order); i++)
657 if (PageAnon(pg)) 653 bad += free_pages_check(page + i);
658 pg->mapping = NULL;
659 bad += free_pages_check(pg);
660 }
661 if (bad) 654 if (bad)
662 return false; 655 return false;
663 656
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1460#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1453#endif /* CONFIG_FAIL_PAGE_ALLOC */
1461 1454
1462/* 1455/*
1463 * Return 1 if free pages are above 'mark'. This takes into account the order 1456 * Return true if free pages are above 'mark'. This takes into account the order
1464 * of the allocation. 1457 * of the allocation.
1465 */ 1458 */
1466int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1459static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1467 int classzone_idx, int alloc_flags) 1460 int classzone_idx, int alloc_flags, long free_pages)
1468{ 1461{
1469 /* free_pages may go negative - that's OK */ 1462 /* free_pages may go negative - that's OK */
1470 long min = mark; 1463 long min = mark;
1471 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1472 int o; 1464 int o;
1473 1465
1466 free_pages -= (1 << order) + 1;
1474 if (alloc_flags & ALLOC_HIGH) 1467 if (alloc_flags & ALLOC_HIGH)
1475 min -= min / 2; 1468 min -= min / 2;
1476 if (alloc_flags & ALLOC_HARDER) 1469 if (alloc_flags & ALLOC_HARDER)
1477 min -= min / 4; 1470 min -= min / 4;
1478 1471
1479 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1472 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1480 return 0; 1473 return false;
1481 for (o = 0; o < order; o++) { 1474 for (o = 0; o < order; o++) {
1482 /* At the next order, this order's pages become unavailable */ 1475 /* At the next order, this order's pages become unavailable */
1483 free_pages -= z->free_area[o].nr_free << o; 1476 free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1486 min >>= 1; 1479 min >>= 1;
1487 1480
1488 if (free_pages <= min) 1481 if (free_pages <= min)
1489 return 0; 1482 return false;
1490 } 1483 }
1491 return 1; 1484 return true;
1485}
1486
1487bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1488 int classzone_idx, int alloc_flags)
1489{
1490 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1491 zone_page_state(z, NR_FREE_PAGES));
1492}
1493
1494bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1495 int classzone_idx, int alloc_flags)
1496{
1497 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1498
1499 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1500 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1501
1502 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1503 free_pages);
1492} 1504}
1493 1505
1494#ifdef CONFIG_NUMA 1506#ifdef CONFIG_NUMA
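zone_watermark_ok() is split above into a __zone_watermark_ok() core that takes the free-page count as an argument, plus a _safe variant that falls back to the drift-corrected snapshot near the percpu drift mark. A condensed model of the core per-order check over made-up free counts:

#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(int order, long mark, long lowmem_reserve,
                         long free_pages, const long *nr_free_per_order)
{
        long min = mark;

        free_pages -= (1L << order) + 1;        /* charge the request, as in the patch */
        if (free_pages <= min + lowmem_reserve)
                return false;
        for (int o = 0; o < order; o++) {
                /* blocks of lower orders cannot satisfy this request */
                free_pages -= nr_free_per_order[o] << o;
                min >>= 1;                      /* relax the minimum per order */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        const long nr_free[4] = { 300, 40, 10, 2 };

        printf("order-2 request ok? %d\n",
               watermark_ok(2, 128, 0, 800, nr_free));
        return 0;
}

The ALLOC_HIGH/ALLOC_HARDER adjustments and the snapshot fallback of zone_watermark_ok_safe() are left out; only the per-order accounting is modeled.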
@@ -1793,15 +1805,18 @@ static struct page *
1793__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1805__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1794 struct zonelist *zonelist, enum zone_type high_zoneidx, 1806 struct zonelist *zonelist, enum zone_type high_zoneidx,
1795 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1807 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1796 int migratetype, unsigned long *did_some_progress) 1808 int migratetype, unsigned long *did_some_progress,
1809 bool sync_migration)
1797{ 1810{
1798 struct page *page; 1811 struct page *page;
1799 1812
1800 if (!order || compaction_deferred(preferred_zone)) 1813 if (!order || compaction_deferred(preferred_zone))
1801 return NULL; 1814 return NULL;
1802 1815
1816 current->flags |= PF_MEMALLOC;
1803 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1817 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1804 nodemask); 1818 nodemask, sync_migration);
1819 current->flags &= ~PF_MEMALLOC;
1805 if (*did_some_progress != COMPACT_SKIPPED) { 1820 if (*did_some_progress != COMPACT_SKIPPED) {
1806 1821
1807 /* Page migration frees to the PCP lists but we want merging */ 1822 /* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1852,8 @@ static inline struct page *
1837__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1852__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1838 struct zonelist *zonelist, enum zone_type high_zoneidx, 1853 struct zonelist *zonelist, enum zone_type high_zoneidx,
1839 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1854 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1840 int migratetype, unsigned long *did_some_progress) 1855 int migratetype, unsigned long *did_some_progress,
1856 bool sync_migration)
1841{ 1857{
1842 return NULL; 1858 return NULL;
1843} 1859}
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1852{ 1868{
1853 struct page *page = NULL; 1869 struct page *page = NULL;
1854 struct reclaim_state reclaim_state; 1870 struct reclaim_state reclaim_state;
1855 struct task_struct *p = current;
1856 bool drained = false; 1871 bool drained = false;
1857 1872
1858 cond_resched(); 1873 cond_resched();
1859 1874
1860 /* We now go into synchronous reclaim */ 1875 /* We now go into synchronous reclaim */
1861 cpuset_memory_pressure_bump(); 1876 cpuset_memory_pressure_bump();
1862 p->flags |= PF_MEMALLOC; 1877 current->flags |= PF_MEMALLOC;
1863 lockdep_set_current_reclaim_state(gfp_mask); 1878 lockdep_set_current_reclaim_state(gfp_mask);
1864 reclaim_state.reclaimed_slab = 0; 1879 reclaim_state.reclaimed_slab = 0;
1865 p->reclaim_state = &reclaim_state; 1880 current->reclaim_state = &reclaim_state;
1866 1881
1867 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1882 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1868 1883
1869 p->reclaim_state = NULL; 1884 current->reclaim_state = NULL;
1870 lockdep_clear_current_reclaim_state(); 1885 lockdep_clear_current_reclaim_state();
1871 p->flags &= ~PF_MEMALLOC; 1886 current->flags &= ~PF_MEMALLOC;
1872 1887
1873 cond_resched(); 1888 cond_resched();
1874 1889
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1920 1935
1921static inline 1936static inline
1922void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 1937void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1923 enum zone_type high_zoneidx) 1938 enum zone_type high_zoneidx,
1939 enum zone_type classzone_idx)
1924{ 1940{
1925 struct zoneref *z; 1941 struct zoneref *z;
1926 struct zone *zone; 1942 struct zone *zone;
1927 1943
1928 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1944 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1929 wakeup_kswapd(zone, order); 1945 wakeup_kswapd(zone, order, classzone_idx);
1930} 1946}
1931 1947
1932static inline int 1948static inline int
1933gfp_to_alloc_flags(gfp_t gfp_mask) 1949gfp_to_alloc_flags(gfp_t gfp_mask)
1934{ 1950{
1935 struct task_struct *p = current;
1936 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 1951 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1937 const gfp_t wait = gfp_mask & __GFP_WAIT; 1952 const gfp_t wait = gfp_mask & __GFP_WAIT;
1938 1953
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1948 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 1963 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1949 1964
1950 if (!wait) { 1965 if (!wait) {
1951 alloc_flags |= ALLOC_HARDER; 1966 /*
1967 * Not worth trying to allocate harder for
1968 * __GFP_NOMEMALLOC even if it can't schedule.
1969 */
1970 if (!(gfp_mask & __GFP_NOMEMALLOC))
1971 alloc_flags |= ALLOC_HARDER;
1952 /* 1972 /*
1953 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1973 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1954 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1974 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1955 */ 1975 */
1956 alloc_flags &= ~ALLOC_CPUSET; 1976 alloc_flags &= ~ALLOC_CPUSET;
1957 } else if (unlikely(rt_task(p)) && !in_interrupt()) 1977 } else if (unlikely(rt_task(current)) && !in_interrupt())
1958 alloc_flags |= ALLOC_HARDER; 1978 alloc_flags |= ALLOC_HARDER;
1959 1979
1960 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1980 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1961 if (!in_interrupt() && 1981 if (!in_interrupt() &&
1962 ((p->flags & PF_MEMALLOC) || 1982 ((current->flags & PF_MEMALLOC) ||
1963 unlikely(test_thread_flag(TIF_MEMDIE)))) 1983 unlikely(test_thread_flag(TIF_MEMDIE))))
1964 alloc_flags |= ALLOC_NO_WATERMARKS; 1984 alloc_flags |= ALLOC_NO_WATERMARKS;
1965 } 1985 }
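
Two hypothetical atomic-context callers, shown only to illustrate the distinction the hunk above introduces (neither function is part of this patch):

#include <linux/slab.h>

/* Ordinary atomic allocation: the slow path may set ALLOC_HARDER and
 * dig deeper into the reserves on this caller's behalf. */
static void *grab_atomic_buffer(size_t len)
{
	return kmalloc(len, GFP_ATOMIC);
}

/* Atomic allocation that explicitly refuses the emergency reserves:
 * after this change the allocator no longer tries harder for it either. */
static void *grab_atomic_buffer_no_reserves(size_t len)
{
	return kmalloc(len, GFP_ATOMIC | __GFP_NOMEMALLOC);
}
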
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1978 int alloc_flags; 1998 int alloc_flags;
1979 unsigned long pages_reclaimed = 0; 1999 unsigned long pages_reclaimed = 0;
1980 unsigned long did_some_progress; 2000 unsigned long did_some_progress;
1981 struct task_struct *p = current; 2001 bool sync_migration = false;
1982 2002
1983 /* 2003 /*
1984 * In the slowpath, we sanity check order to avoid ever trying to 2004 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2003 goto nopage; 2023 goto nopage;
2004 2024
2005restart: 2025restart:
2006 wake_all_kswapd(order, zonelist, high_zoneidx); 2026 if (!(gfp_mask & __GFP_NO_KSWAPD))
2027 wake_all_kswapd(order, zonelist, high_zoneidx,
2028 zone_idx(preferred_zone));
2007 2029
2008 /* 2030 /*
2009 * OK, we're below the kswapd watermark and have kicked background 2031 * OK, we're below the kswapd watermark and have kicked background
@@ -2034,21 +2056,26 @@ rebalance:
2034 goto nopage; 2056 goto nopage;
2035 2057
2036 /* Avoid recursion of direct reclaim */ 2058 /* Avoid recursion of direct reclaim */
2037 if (p->flags & PF_MEMALLOC) 2059 if (current->flags & PF_MEMALLOC)
2038 goto nopage; 2060 goto nopage;
2039 2061
2040 /* Avoid allocations with no watermarks from looping endlessly */ 2062 /* Avoid allocations with no watermarks from looping endlessly */
2041 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2063 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2042 goto nopage; 2064 goto nopage;
2043 2065
2044 /* Try direct compaction */ 2066 /*
2067 * Try direct compaction. The first pass is asynchronous. Subsequent
2068 * attempts after direct reclaim are synchronous
2069 */
2045 page = __alloc_pages_direct_compact(gfp_mask, order, 2070 page = __alloc_pages_direct_compact(gfp_mask, order,
2046 zonelist, high_zoneidx, 2071 zonelist, high_zoneidx,
2047 nodemask, 2072 nodemask,
2048 alloc_flags, preferred_zone, 2073 alloc_flags, preferred_zone,
2049 migratetype, &did_some_progress); 2074 migratetype, &did_some_progress,
2075 sync_migration);
2050 if (page) 2076 if (page)
2051 goto got_pg; 2077 goto got_pg;
2078 sync_migration = true;
2052 2079
2053 /* Try direct reclaim and then allocating */ 2080 /* Try direct reclaim and then allocating */
2054 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2081 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2129,27 @@ rebalance:
2102 /* Wait for some write requests to complete then retry */ 2129 /* Wait for some write requests to complete then retry */
2103 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2130 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2104 goto rebalance; 2131 goto rebalance;
2132 } else {
2133 /*
 2134 * High-order allocations do not necessarily loop after
 2135 * direct reclaim, and reclaim/compaction depends on compaction
 2136 * being called after reclaim, so call it directly here if necessary.
2137 */
2138 page = __alloc_pages_direct_compact(gfp_mask, order,
2139 zonelist, high_zoneidx,
2140 nodemask,
2141 alloc_flags, preferred_zone,
2142 migratetype, &did_some_progress,
2143 sync_migration);
2144 if (page)
2145 goto got_pg;
2105 } 2146 }
2106 2147
2107nopage: 2148nopage:
2108 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2149 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2109 printk(KERN_WARNING "%s: page allocation failure." 2150 printk(KERN_WARNING "%s: page allocation failure."
2110 " order:%d, mode:0x%x\n", 2151 " order:%d, mode:0x%x\n",
2111 p->comm, order, gfp_mask); 2152 current->comm, order, gfp_mask);
2112 dump_stack(); 2153 dump_stack();
2113 show_mem(); 2154 show_mem();
2114 } 2155 }
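
A toy model of the ordering established by the two rebalance hunks above, with the allocator internals stubbed out behind function pointers; COMPACT_ASYNC/COMPACT_SYNC are invented names and the code only illustrates when synchronous migration is used, not the real control flow:

enum compact_mode { COMPACT_ASYNC, COMPACT_SYNC };

static void *slowpath_model(void *(*compact)(enum compact_mode),
			    void *(*reclaim_then_alloc)(void),
			    int will_loop)
{
	/* the first compaction pass never waits on page migration */
	void *page = compact(COMPACT_ASYNC);

	if (page)
		return page;

	/* direct reclaim, then retry the allocation */
	page = reclaim_then_alloc();
	if (page || will_loop)
		return page;	/* retries come back with sync_migration == true */

	/* costly orders that will not loop get one synchronous pass here */
	return compact(COMPACT_SYNC);
}
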
@@ -2442,7 +2483,7 @@ void show_free_areas(void)
2442 " all_unreclaimable? %s" 2483 " all_unreclaimable? %s"
2443 "\n", 2484 "\n",
2444 zone->name, 2485 zone->name,
2445 K(zone_nr_free_pages(zone)), 2486 K(zone_page_state(zone, NR_FREE_PAGES)),
2446 K(min_wmark_pages(zone)), 2487 K(min_wmark_pages(zone)),
2447 K(low_wmark_pages(zone)), 2488 K(low_wmark_pages(zone)),
2448 K(high_wmark_pages(zone)), 2489 K(high_wmark_pages(zone)),
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
2585 2626
2586static __init int setup_numa_zonelist_order(char *s) 2627static __init int setup_numa_zonelist_order(char *s)
2587{ 2628{
2588 if (s) 2629 int ret;
2589 return __parse_numa_zonelist_order(s); 2630
2590 return 0; 2631 if (!s)
2632 return 0;
2633
2634 ret = __parse_numa_zonelist_order(s);
2635 if (ret == 0)
2636 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2637
2638 return ret;
2591} 2639}
2592early_param("numa_zonelist_order", setup_numa_zonelist_order); 2640early_param("numa_zonelist_order", setup_numa_zonelist_order);
2593 2641
@@ -4014,7 +4062,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4014 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4062 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
4015} 4063}
4016#else 4064#else
4017static void inline setup_usemap(struct pglist_data *pgdat, 4065static inline void setup_usemap(struct pglist_data *pgdat,
4018 struct zone *zone, unsigned long zonesize) {} 4066 struct zone *zone, unsigned long zonesize) {}
4019#endif /* CONFIG_SPARSEMEM */ 4067#endif /* CONFIG_SPARSEMEM */
4020 4068
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = {
5517 {1UL << PG_swapcache, "swapcache" }, 5565 {1UL << PG_swapcache, "swapcache" },
5518 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5566 {1UL << PG_mappedtodisk, "mappedtodisk" },
5519 {1UL << PG_reclaim, "reclaim" }, 5567 {1UL << PG_reclaim, "reclaim" },
5520 {1UL << PG_buddy, "buddy" },
5521 {1UL << PG_swapbacked, "swapbacked" }, 5568 {1UL << PG_swapbacked, "swapbacked" },
5522 {1UL << PG_unevictable, "unevictable" }, 5569 {1UL << PG_unevictable, "unevictable" },
5523#ifdef CONFIG_MMU 5570#ifdef CONFIG_MMU
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page)
5565{ 5612{
5566 printk(KERN_ALERT 5613 printk(KERN_ALERT
5567 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5614 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5568 page, page_count(page), page_mapcount(page), 5615 page, atomic_read(&page->_count), page_mapcount(page),
5569 page->mapping, page->index); 5616 page->mapping, page->index);
5570 dump_page_flags(page->flags); 5617 dump_page_flags(page->flags);
5571} 5618}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 38cc58b8b2b0..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36 next = pmd_addr_end(addr, end); 36 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none_or_clear_bad(pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 02ba91230b99..3f930018aa60 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
258 258
259/* 259/*
260 * (Un)populated page region iterators. Iterate over (un)populated 260 * (Un)populated page region iterators. Iterate over (un)populated
261 * page regions betwen @start and @end in @chunk. @rs and @re should 261 * page regions between @start and @end in @chunk. @rs and @re should
262 * be integer variables and will be set to start and end page index of 262 * be integer variables and will be set to start and end page index of
263 * the current region. 263 * the current region.
264 */ 264 */
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
293 293
294 if (size <= PAGE_SIZE) 294 if (size <= PAGE_SIZE)
295 return kzalloc(size, GFP_KERNEL); 295 return kzalloc(size, GFP_KERNEL);
296 else { 296 else
297 void *ptr = vmalloc(size); 297 return vzalloc(size);
298 if (ptr)
299 memset(ptr, 0, size);
300 return ptr;
301 }
302} 298}
303 299
304/** 300/**
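
For reference, a stand-alone helper equivalent to the open-coded sequence that vzalloc() replaces in pcpu_mem_alloc() above (sketch only; the real vzalloc() lives in mm/vmalloc.c):

#include <linux/vmalloc.h>
#include <linux/string.h>

static void *vzalloc_open_coded(unsigned long size)
{
	void *ptr = vmalloc(size);

	if (ptr)
		memset(ptr, 0, size);
	return ptr;
}
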
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..d030548047e2
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,123 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <asm/tlb.h>
10#include <asm-generic/pgtable.h>
11
12#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
13/*
14 * Only sets the access flags (dirty, accessed, and
15 * writable). Furthermore, we know it always gets set to a "more
16 * permissive" setting, which allows most architectures to optimize
17 * this. We return whether the PTE actually changed, which in turn
 18 * instructs the caller to do things like update_mmu_cache. This
19 * used to be done in the caller, but sparc needs minor faults to
20 * force that call on sun4c so we changed this macro slightly
21 */
22int ptep_set_access_flags(struct vm_area_struct *vma,
23 unsigned long address, pte_t *ptep,
24 pte_t entry, int dirty)
25{
26 int changed = !pte_same(*ptep, entry);
27 if (changed) {
28 set_pte_at(vma->vm_mm, address, ptep, entry);
29 flush_tlb_page(vma, address);
30 }
31 return changed;
32}
33#endif
34
35#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
36int pmdp_set_access_flags(struct vm_area_struct *vma,
37 unsigned long address, pmd_t *pmdp,
38 pmd_t entry, int dirty)
39{
40#ifdef CONFIG_TRANSPARENT_HUGEPAGE
41 int changed = !pmd_same(*pmdp, entry);
42 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
43 if (changed) {
44 set_pmd_at(vma->vm_mm, address, pmdp, entry);
45 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
46 }
47 return changed;
48#else /* CONFIG_TRANSPARENT_HUGEPAGE */
49 BUG();
50 return 0;
51#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
52}
53#endif
54
55#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
56int ptep_clear_flush_young(struct vm_area_struct *vma,
57 unsigned long address, pte_t *ptep)
58{
59 int young;
60 young = ptep_test_and_clear_young(vma, address, ptep);
61 if (young)
62 flush_tlb_page(vma, address);
63 return young;
64}
65#endif
66
67#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
68int pmdp_clear_flush_young(struct vm_area_struct *vma,
69 unsigned long address, pmd_t *pmdp)
70{
71 int young;
72#ifndef CONFIG_TRANSPARENT_HUGEPAGE
73 BUG();
74#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
75 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
76 young = pmdp_test_and_clear_young(vma, address, pmdp);
77 if (young)
78 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
79 return young;
80}
81#endif
82
83#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
84pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
85 pte_t *ptep)
86{
87 pte_t pte;
88 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
89 flush_tlb_page(vma, address);
90 return pte;
91}
92#endif
93
94#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
95pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
96 pmd_t *pmdp)
97{
98 pmd_t pmd;
99#ifndef CONFIG_TRANSPARENT_HUGEPAGE
100 BUG();
101#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
102 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
103 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
104 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
105 return pmd;
106}
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
111 pmd_t *pmdp)
112{
113#ifdef CONFIG_TRANSPARENT_HUGEPAGE
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119#else /* CONFIG_TRANSPARENT_HUGEPAGE */
120 BUG();
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122}
123#endif
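
The #ifndef guards in the new file implement the usual opt-out convention: an architecture with its own, faster implementation defines the matching __HAVE_ARCH_* macro in its asm/pgtable.h and provides the function itself, so the generic body above is compiled out. A sketch for an imaginary architecture (the path and declaration are illustrative; only the macro naming is the real convention):

/* arch/example/include/asm/pgtable.h */
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
				 unsigned long address, pte_t *ptep,
				 pte_t entry, int dirty);
/* ...the architecture then defines ptep_set_access_flags() in its own
 * mm code, flushing only what its TLB actually requires. */
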
diff --git a/mm/rmap.c b/mm/rmap.c
index 1a8bf76bfd03..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -94,7 +94,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
94 * anonymous pages mapped into it with that anon_vma. 94 * anonymous pages mapped into it with that anon_vma.
95 * 95 *
96 * The common case will be that we already have one, but if 96 * The common case will be that we already have one, but if
97 * if not we either need to find an adjacent mapping that we 97 * not we either need to find an adjacent mapping that we
98 * can re-use the anon_vma from (very common when the only 98 * can re-use the anon_vma from (very common when the only
99 * reason for splitting a vma has been mprotect()), or we 99 * reason for splitting a vma has been mprotect()), or we
100 * allocate a new one. 100 * allocate a new one.
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 178
179 anon_vma_lock(anon_vma); 179 anon_vma_lock(anon_vma);
180 /*
181 * It's critical to add new vmas to the tail of the anon_vma,
182 * see comment in huge_memory.c:__split_huge_page().
183 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 184 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma); 185 anon_vma_unlock(anon_vma);
182} 186}
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 * Returns virtual address or -EFAULT if page's index/offset is not 364 * Returns virtual address or -EFAULT if page's index/offset is not
361 * within the range mapped the @vma. 365 * within the range mapped the @vma.
362 */ 366 */
363static inline unsigned long 367inline unsigned long
364vma_address(struct page *page, struct vm_area_struct *vma) 368vma_address(struct page *page, struct vm_area_struct *vma)
365{ 369{
366 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 370 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
435 pmd = pmd_offset(pud, address); 439 pmd = pmd_offset(pud, address);
436 if (!pmd_present(*pmd)) 440 if (!pmd_present(*pmd))
437 return NULL; 441 return NULL;
442 if (pmd_trans_huge(*pmd))
443 return NULL;
438 444
439 pte = pte_offset_map(pmd, address); 445 pte = pte_offset_map(pmd, address);
440 /* Make a quick check before getting the lock */ 446 /* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
489 unsigned long *vm_flags) 495 unsigned long *vm_flags)
490{ 496{
491 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
492 pte_t *pte;
493 spinlock_t *ptl;
494 int referenced = 0; 498 int referenced = 0;
495 499
496 pte = page_check_address(page, mm, address, &ptl, 0);
497 if (!pte)
498 goto out;
499
500 /* 500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far, 501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the 502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list. 503 * unevictable list.
504 */ 504 */
505 if (vma->vm_flags & VM_LOCKED) { 505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 1; /* break early from loop */ 506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED; 507 *vm_flags |= VM_LOCKED;
508 goto out_unmap; 508 goto out;
509 }
510
511 if (ptep_clear_flush_young_notify(vma, address, pte)) {
512 /*
513 * Don't treat a reference through a sequentially read
514 * mapping as such. If the page has been used in
515 * another mapping, we will catch it; if this other
516 * mapping is already gone, the unmap path will have
517 * set PG_referenced or activated the page.
518 */
519 if (likely(!VM_SequentialReadHint(vma)))
520 referenced++;
521 } 509 }
522 510
523 /* Pretend the page is referenced if the task has the 511 /* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
526 rwsem_is_locked(&mm->mmap_sem)) 514 rwsem_is_locked(&mm->mmap_sem))
527 referenced++; 515 referenced++;
528 516
529out_unmap: 517 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd;
519
520 spin_lock(&mm->page_table_lock);
521 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) &&
524 pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++;
526 spin_unlock(&mm->page_table_lock);
527 } else {
528 pte_t *pte;
529 spinlock_t *ptl;
530
531 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte)
533 goto out;
534
535 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /*
537 * Don't treat a reference through a sequentially read
538 * mapping as such. If the page has been used in
539 * another mapping, we will catch it; if this other
540 * mapping is already gone, the unmap path will have
541 * set PG_referenced or activated the page.
542 */
543 if (likely(!VM_SequentialReadHint(vma)))
544 referenced++;
545 }
546 pte_unmap_unlock(pte, ptl);
547 }
548
530 (*mapcount)--; 549 (*mapcount)--;
531 pte_unmap_unlock(pte, ptl);
532 550
533 if (referenced) 551 if (referenced)
534 *vm_flags |= vma->vm_flags; 552 *vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
864 struct vm_area_struct *vma, unsigned long address, int exclusive) 882 struct vm_area_struct *vma, unsigned long address, int exclusive)
865{ 883{
866 int first = atomic_inc_and_test(&page->_mapcount); 884 int first = atomic_inc_and_test(&page->_mapcount);
867 if (first) 885 if (first) {
868 __inc_zone_page_state(page, NR_ANON_PAGES); 886 if (!PageTransHuge(page))
887 __inc_zone_page_state(page, NR_ANON_PAGES);
888 else
889 __inc_zone_page_state(page,
890 NR_ANON_TRANSPARENT_HUGEPAGES);
891 }
869 if (unlikely(PageKsm(page))) 892 if (unlikely(PageKsm(page)))
870 return; 893 return;
871 894
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
893 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 916 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
894 SetPageSwapBacked(page); 917 SetPageSwapBacked(page);
895 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 918 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
896 __inc_zone_page_state(page, NR_ANON_PAGES); 919 if (!PageTransHuge(page))
920 __inc_zone_page_state(page, NR_ANON_PAGES);
921 else
922 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
897 __page_set_anon_rmap(page, vma, address, 1); 923 __page_set_anon_rmap(page, vma, address, 1);
898 if (page_evictable(page, vma)) 924 if (page_evictable(page, vma))
899 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 925 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
911{ 937{
912 if (atomic_inc_and_test(&page->_mapcount)) { 938 if (atomic_inc_and_test(&page->_mapcount)) {
913 __inc_zone_page_state(page, NR_FILE_MAPPED); 939 __inc_zone_page_state(page, NR_FILE_MAPPED);
914 mem_cgroup_update_file_mapped(page, 1); 940 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
915 } 941 }
916} 942}
917 943
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
946 return; 972 return;
947 if (PageAnon(page)) { 973 if (PageAnon(page)) {
948 mem_cgroup_uncharge_page(page); 974 mem_cgroup_uncharge_page(page);
949 __dec_zone_page_state(page, NR_ANON_PAGES); 975 if (!PageTransHuge(page))
976 __dec_zone_page_state(page, NR_ANON_PAGES);
977 else
978 __dec_zone_page_state(page,
979 NR_ANON_TRANSPARENT_HUGEPAGES);
950 } else { 980 } else {
951 __dec_zone_page_state(page, NR_FILE_MAPPED); 981 __dec_zone_page_state(page, NR_FILE_MAPPED);
952 mem_cgroup_update_file_mapped(page, -1); 982 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
953 } 983 }
954 /* 984 /*
955 * It would be tidy to reset the PageAnon mapping here, 985 * It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1202 return ret; 1232 return ret;
1203} 1233}
1204 1234
1205static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1235bool is_vma_temporary_stack(struct vm_area_struct *vma)
1206{ 1236{
1207 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1237 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1208 1238
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1400 int ret; 1430 int ret;
1401 1431
1402 BUG_ON(!PageLocked(page)); 1432 BUG_ON(!PageLocked(page));
1433 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1403 1434
1404 if (unlikely(PageKsm(page))) 1435 if (unlikely(PageKsm(page)))
1405 ret = try_to_unmap_ksm(page, flags); 1436 ret = try_to_unmap_ksm(page, flags);
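
The new NR_ANON_TRANSPARENT_HUGEPAGES counter is bumped once per compound page rather than per subpage, which is why only the anon rmap add/remove sites change. A minimal sketch of reading it from other kernel code, assuming the counter is added to enum zone_stat_item elsewhere in this series (global_page_state() is the existing vmstat accessor; the function below is purely illustrative):

#include <linux/kernel.h>
#include <linux/vmstat.h>

static void report_thp_usage(void)
{
	unsigned long thps = global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES);

	pr_info("anon transparent hugepages currently mapped: %lu\n", thps);
}
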
diff --git a/mm/shmem.c b/mm/shmem.c
index 47fdeeb9d636..5ee67c990602 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
2415 return &p->vfs_inode; 2415 return &p->vfs_inode;
2416} 2416}
2417 2417
2418static void shmem_i_callback(struct rcu_head *head)
2419{
2420 struct inode *inode = container_of(head, struct inode, i_rcu);
2421 INIT_LIST_HEAD(&inode->i_dentry);
2422 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2423}
2424
2418static void shmem_destroy_inode(struct inode *inode) 2425static void shmem_destroy_inode(struct inode *inode)
2419{ 2426{
2420 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2427 if ((inode->i_mode & S_IFMT) == S_IFREG) {
2421 /* only struct inode is valid if it's an inline symlink */ 2428 /* only struct inode is valid if it's an inline symlink */
2422 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2429 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2423 } 2430 }
2424 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2431 call_rcu(&inode->i_rcu, shmem_i_callback);
2425} 2432}
2426 2433
2427static void init_once(void *foo) 2434static void init_once(void *foo)
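
The shmem hunk above switches to RCU-delayed freeing of the inode via the i_rcu head, matching how other filesystems with private inode caches free theirs in this kernel. The same shape for a hypothetical filesystem, with foo_inode_info, foo_inode_cachep and FOO_I() as made-up stand-ins for the filesystem's private inode type, cache and container accessor:

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo_inode_info {
	/* ...filesystem-private fields... */
	struct inode vfs_inode;
};

static struct kmem_cache *foo_inode_cachep;

static inline struct foo_inode_info *FOO_I(struct inode *inode)
{
	return container_of(inode, struct foo_inode_info, vfs_inode);
}

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* the actual free happens only after an RCU grace period */
	call_rcu(&inode->i_rcu, foo_i_callback);
}
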
diff --git a/mm/slab.c b/mm/slab.c
index b1e40dafbab3..264037449f08 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)
829 829
830static void next_reap_node(void) 830static void next_reap_node(void)
831{ 831{
832 int node = __get_cpu_var(slab_reap_node); 832 int node = __this_cpu_read(slab_reap_node);
833 833
834 node = next_node(node, node_online_map); 834 node = next_node(node, node_online_map);
835 if (unlikely(node >= MAX_NUMNODES)) 835 if (unlikely(node >= MAX_NUMNODES))
836 node = first_node(node_online_map); 836 node = first_node(node_online_map);
837 __get_cpu_var(slab_reap_node) = node; 837 __this_cpu_write(slab_reap_node, node);
838} 838}
839 839
840#else 840#else
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1012 */ 1012 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{ 1014{
1015 int node = __get_cpu_var(slab_reap_node); 1015 int node = __this_cpu_read(slab_reap_node);
1016 1016
1017 if (l3->alien) { 1017 if (l3->alien) {
1018 struct array_cache *ac = l3->alien[node]; 1018 struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1293 * anything expensive but will only modify reap_work 1293 * anything expensive but will only modify reap_work
1294 * and reschedule the timer. 1294 * and reschedule the timer.
1295 */ 1295 */
1296 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); 1296 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1297 /* Now the cache_reaper is guaranteed to be not running. */ 1297 /* Now the cache_reaper is guaranteed to be not running. */
1298 per_cpu(slab_reap_work, cpu).work.func = NULL; 1298 per_cpu(slab_reap_work, cpu).work.func = NULL;
1299 break; 1299 break;
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2781/* 2781/*
2782 * Map pages beginning at addr to the given cache and slab. This is required 2782 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a 2783 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2784 * virtual address for kfree, ksize, and slab debugging.
2785 */ 2785 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787 void *addr) 2787 void *addr)
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3653EXPORT_SYMBOL(kmem_cache_alloc); 3653EXPORT_SYMBOL(kmem_cache_alloc);
3654 3654
3655#ifdef CONFIG_TRACING 3655#ifdef CONFIG_TRACING
3656void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3656void *
3657kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3657{ 3658{
3658 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3659 void *ret;
3659}
3660EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3661#endif
3662 3660
3663/** 3661 ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3664 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3665 * @cachep: the cache we're checking against
3666 * @ptr: pointer to validate
3667 *
3668 * This verifies that the untrusted pointer looks sane;
3669 * it is _not_ a guarantee that the pointer is actually
3670 * part of the slab cache in question, but it at least
3671 * validates that the pointer can be dereferenced and
3672 * looks half-way sane.
3673 *
3674 * Currently only used for dentry validation.
3675 */
3676int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3677{
3678 unsigned long size = cachep->buffer_size;
3679 struct page *page;
3680 3662
3681 if (unlikely(!kern_ptr_validate(ptr, size))) 3663 trace_kmalloc(_RET_IP_, ret,
3682 goto out; 3664 size, slab_buffer_size(cachep), flags);
3683 page = virt_to_page(ptr); 3665 return ret;
3684 if (unlikely(!PageSlab(page)))
3685 goto out;
3686 if (unlikely(page_get_cache(page) != cachep))
3687 goto out;
3688 return 1;
3689out:
3690 return 0;
3691} 3666}
3667EXPORT_SYMBOL(kmem_cache_alloc_trace);
3668#endif
3692 3669
3693#ifdef CONFIG_NUMA 3670#ifdef CONFIG_NUMA
3694void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3671void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3705EXPORT_SYMBOL(kmem_cache_alloc_node); 3682EXPORT_SYMBOL(kmem_cache_alloc_node);
3706 3683
3707#ifdef CONFIG_TRACING 3684#ifdef CONFIG_TRACING
3708void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3685void *kmem_cache_alloc_node_trace(size_t size,
3709 gfp_t flags, 3686 struct kmem_cache *cachep,
3710 int nodeid) 3687 gfp_t flags,
3688 int nodeid)
3711{ 3689{
3712 return __cache_alloc_node(cachep, flags, nodeid, 3690 void *ret;
3691
3692 ret = __cache_alloc_node(cachep, flags, nodeid,
3713 __builtin_return_address(0)); 3693 __builtin_return_address(0));
3694 trace_kmalloc_node(_RET_IP_, ret,
3695 size, slab_buffer_size(cachep),
3696 flags, nodeid);
3697 return ret;
3714} 3698}
3715EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3699EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3716#endif 3700#endif
3717 3701
3718static __always_inline void * 3702static __always_inline void *
3719__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3703__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3720{ 3704{
3721 struct kmem_cache *cachep; 3705 struct kmem_cache *cachep;
3722 void *ret;
3723 3706
3724 cachep = kmem_find_general_cachep(size, flags); 3707 cachep = kmem_find_general_cachep(size, flags);
3725 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3708 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3726 return cachep; 3709 return cachep;
3727 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3710 return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3728
3729 trace_kmalloc_node((unsigned long) caller, ret,
3730 size, cachep->buffer_size, flags, node);
3731
3732 return ret;
3733} 3711}
3734 3712
3735#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3713#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
diff --git a/mm/slob.c b/mm/slob.c
index 617b6d6c42c7..3588eaaef726 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 678}
679EXPORT_SYMBOL(kmem_cache_shrink); 679EXPORT_SYMBOL(kmem_cache_shrink);
680 680
681int kmem_ptr_validate(struct kmem_cache *a, const void *b)
682{
683 return 0;
684}
685
686static unsigned int slob_ready __read_mostly; 681static unsigned int slob_ready __read_mostly;
687 682
688int slab_is_available(void) 683int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index bec0e355fbad..c7ef0070dd86 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30 30
31#include <trace/events/kmem.h>
32
31/* 33/*
32 * Lock order: 34 * Lock order:
33 * 1. slab_lock(page) 35 * 1. slab_lock(page)
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1774EXPORT_SYMBOL(kmem_cache_alloc); 1776EXPORT_SYMBOL(kmem_cache_alloc);
1775 1777
1776#ifdef CONFIG_TRACING 1778#ifdef CONFIG_TRACING
1777void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1779void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1780{
1781 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1782 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1783 return ret;
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_trace);
1786
1787void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1778{ 1788{
1779 return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 1789 void *ret = kmalloc_order(size, flags, order);
1790 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1791 return ret;
1780} 1792}
1781EXPORT_SYMBOL(kmem_cache_alloc_notrace); 1793EXPORT_SYMBOL(kmalloc_order_trace);
1782#endif 1794#endif
1783 1795
1784#ifdef CONFIG_NUMA 1796#ifdef CONFIG_NUMA
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1794EXPORT_SYMBOL(kmem_cache_alloc_node); 1806EXPORT_SYMBOL(kmem_cache_alloc_node);
1795 1807
1796#ifdef CONFIG_TRACING 1808#ifdef CONFIG_TRACING
1797void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1809void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1798 gfp_t gfpflags, 1810 gfp_t gfpflags,
1799 int node) 1811 int node, size_t size)
1800{ 1812{
1801 return slab_alloc(s, gfpflags, node, _RET_IP_); 1813 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1814
1815 trace_kmalloc_node(_RET_IP_, ret,
1816 size, s->size, gfpflags, node);
1817 return ret;
1802} 1818}
1803EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 1819EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
1804#endif 1820#endif
1805#endif 1821#endif
1806 1822
@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1917} 1933}
1918EXPORT_SYMBOL(kmem_cache_free); 1934EXPORT_SYMBOL(kmem_cache_free);
1919 1935
1920/* Figure out on which slab page the object resides */
1921static struct page *get_object_page(const void *x)
1922{
1923 struct page *page = virt_to_head_page(x);
1924
1925 if (!PageSlab(page))
1926 return NULL;
1927
1928 return page;
1929}
1930
1931/* 1936/*
1932 * Object placement in a slab is made very easy because we always start at 1937 * Object placement in a slab is made very easy because we always start at
1933 * offset 0. If we tune the size of the object to the alignment then we can 1938 * offset 0. If we tune the size of the object to the alignment then we can
@@ -2386,35 +2391,6 @@ error:
2386} 2391}
2387 2392
2388/* 2393/*
2389 * Check if a given pointer is valid
2390 */
2391int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2392{
2393 struct page *page;
2394
2395 if (!kern_ptr_validate(object, s->size))
2396 return 0;
2397
2398 page = get_object_page(object);
2399
2400 if (!page || s != page->slab)
2401 /* No slab or wrong slab */
2402 return 0;
2403
2404 if (!check_valid_pointer(s, page, object))
2405 return 0;
2406
2407 /*
2408 * We could also check if the object is on the slabs freelist.
2409 * But this would be too expensive and it seems that the main
2410 * purpose of kmem_ptr_valid() is to check if the object belongs
2411 * to a certain slab.
2412 */
2413 return 1;
2414}
2415EXPORT_SYMBOL(kmem_ptr_validate);
2416
2417/*
2418 * Determine the size of a slab object 2394 * Determine the size of a slab object
2419 */ 2395 */
2420unsigned int kmem_cache_size(struct kmem_cache *s) 2396unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -3660,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3660 len += sprintf(buf + len, "%7ld ", l->count); 3636 len += sprintf(buf + len, "%7ld ", l->count);
3661 3637
3662 if (l->addr) 3638 if (l->addr)
3663 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3639 len += sprintf(buf + len, "%pS", (void *)l->addr);
3664 else 3640 else
3665 len += sprintf(buf + len, "<not-available>"); 3641 len += sprintf(buf + len, "<not-available>");
3666 3642
@@ -3970,12 +3946,9 @@ SLAB_ATTR(min_partial);
3970 3946
3971static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3947static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3972{ 3948{
3973 if (s->ctor) { 3949 if (!s->ctor)
3974 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3950 return 0;
3975 3951 return sprintf(buf, "%pS\n", s->ctor);
3976 return n + sprintf(buf + n, "\n");
3977 }
3978 return 0;
3979} 3952}
3980SLAB_ATTR_RO(ctor); 3953SLAB_ATTR_RO(ctor);
3981 3954
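
Both slub hunks above lean on the %pS printk extension, which formats a kernel text address as symbol+offset and so replaces the explicit sprint_symbol() calls into a buffer. A minimal, unrelated example of the same idiom:

#include <linux/kernel.h>

static void report_callback(void (*fn)(void))
{
	/* resolves the address of whatever callback was registered */
	pr_info("registered callback: %pS\n", fn);
}
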
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 29d6cbffb283..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux 10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings 11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memmory map is essentially 12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that 13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are 14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap. 15 * allocated to create a view of memory for vmemmap.
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..bbc1ce9f9460 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 58 }
59}
60
61static void __put_single_page(struct page *page)
62{
63 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 64 free_hot_cold_page(page, 0);
60} 65}
61 66
62static void put_compound_page(struct page *page) 67static void __put_compound_page(struct page *page)
63{ 68{
64 page = compound_head(page); 69 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 70
68 dtor = get_compound_page_dtor(page); 71 __page_cache_release(page);
69 (*dtor)(page); 72 dtor = get_compound_page_dtor(page);
73 (*dtor)(page);
74}
75
76static void put_compound_page(struct page *page)
77{
78 if (unlikely(PageTail(page))) {
79 /* __split_huge_page_refcount can run under us */
80 struct page *page_head = page->first_page;
81 smp_rmb();
82 /*
83 * If PageTail is still set after smp_rmb() we can be sure
84 * that the page->first_page we read wasn't a dangling pointer.
85 * See __split_huge_page_refcount() smp_wmb().
86 */
87 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
88 unsigned long flags;
89 /*
90 * Verify that our page_head wasn't converted
 91 * to a regular page before we got a
92 * reference on it.
93 */
94 if (unlikely(!PageHead(page_head))) {
95 /* PageHead is cleared after PageTail */
96 smp_rmb();
97 VM_BUG_ON(PageTail(page));
98 goto out_put_head;
99 }
100 /*
101 * Only run compound_lock on a valid PageHead,
102 * after having it pinned with
103 * get_page_unless_zero() above.
104 */
105 smp_mb();
106 /* page_head wasn't a dangling pointer */
107 flags = compound_lock_irqsave(page_head);
108 if (unlikely(!PageTail(page))) {
109 /* __split_huge_page_refcount run before us */
110 compound_unlock_irqrestore(page_head, flags);
111 VM_BUG_ON(PageHead(page_head));
112 out_put_head:
113 if (put_page_testzero(page_head))
114 __put_single_page(page_head);
115 out_put_single:
116 if (put_page_testzero(page))
117 __put_single_page(page);
118 return;
119 }
120 VM_BUG_ON(page_head != page->first_page);
121 /*
122 * We can release the refcount taken by
123 * get_page_unless_zero now that
124 * split_huge_page_refcount is blocked on the
125 * compound_lock.
126 */
127 if (put_page_testzero(page_head))
128 VM_BUG_ON(1);
129 /* __split_huge_page_refcount will wait now */
130 VM_BUG_ON(atomic_read(&page->_count) <= 0);
131 atomic_dec(&page->_count);
132 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
133 compound_unlock_irqrestore(page_head, flags);
134 if (put_page_testzero(page_head)) {
135 if (PageHead(page_head))
136 __put_compound_page(page_head);
137 else
138 __put_single_page(page_head);
139 }
140 } else {
141 /* page_head is a dangling pointer */
142 VM_BUG_ON(PageTail(page));
143 goto out_put_single;
144 }
145 } else if (put_page_testzero(page)) {
146 if (PageHead(page))
147 __put_compound_page(page);
148 else
149 __put_single_page(page);
70 } 150 }
71} 151}
72 152
@@ -75,7 +155,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 155 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 156 put_compound_page(page);
77 else if (put_page_testzero(page)) 157 else if (put_page_testzero(page))
78 __page_cache_release(page); 158 __put_single_page(page);
79} 159}
80EXPORT_SYMBOL(put_page); 160EXPORT_SYMBOL(put_page);
81 161
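
The point of the refcounting dance above is that ordinary users of get_page()/put_page() need no THP awareness: a reference taken on what later turns out to be a tail page is still dropped correctly even while __split_huge_page_refcount() runs concurrently. An illustrative caller (not from this patch):

#include <linux/mm.h>

/* e.g. releasing pages previously pinned with get_user_pages(); the
 * caller does not care whether any of them are THP tail pages. */
static void release_pinned_pages(struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		put_page(pages[i]);
}
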
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages)
98} 178}
99EXPORT_SYMBOL(put_pages_list); 179EXPORT_SYMBOL(put_pages_list);
100 180
101/* 181static void pagevec_lru_move_fn(struct pagevec *pvec,
102 * pagevec_move_tail() must be called with IRQ disabled. 182 void (*move_fn)(struct page *page, void *arg),
103 * Otherwise this may cause nasty races. 183 void *arg)
104 */
105static void pagevec_move_tail(struct pagevec *pvec)
106{ 184{
107 int i; 185 int i;
108 int pgmoved = 0;
109 struct zone *zone = NULL; 186 struct zone *zone = NULL;
187 unsigned long flags = 0;
110 188
111 for (i = 0; i < pagevec_count(pvec); i++) { 189 for (i = 0; i < pagevec_count(pvec); i++) {
112 struct page *page = pvec->pages[i]; 190 struct page *page = pvec->pages[i];
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec)
114 192
115 if (pagezone != zone) { 193 if (pagezone != zone) {
116 if (zone) 194 if (zone)
117 spin_unlock(&zone->lru_lock); 195 spin_unlock_irqrestore(&zone->lru_lock, flags);
118 zone = pagezone; 196 zone = pagezone;
119 spin_lock(&zone->lru_lock); 197 spin_lock_irqsave(&zone->lru_lock, flags);
120 }
121 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
122 int lru = page_lru_base_type(page);
123 list_move_tail(&page->lru, &zone->lru[lru].list);
124 pgmoved++;
125 } 198 }
199
200 (*move_fn)(page, arg);
126 } 201 }
127 if (zone) 202 if (zone)
128 spin_unlock(&zone->lru_lock); 203 spin_unlock_irqrestore(&zone->lru_lock, flags);
129 __count_vm_events(PGROTATED, pgmoved); 204 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
130 release_pages(pvec->pages, pvec->nr, pvec->cold);
131 pagevec_reinit(pvec); 205 pagevec_reinit(pvec);
132} 206}
133 207
208static void pagevec_move_tail_fn(struct page *page, void *arg)
209{
210 int *pgmoved = arg;
211 struct zone *zone = page_zone(page);
212
213 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
214 int lru = page_lru_base_type(page);
215 list_move_tail(&page->lru, &zone->lru[lru].list);
216 (*pgmoved)++;
217 }
218}
219
220/*
221 * pagevec_move_tail() must be called with IRQ disabled.
222 * Otherwise this may cause nasty races.
223 */
224static void pagevec_move_tail(struct pagevec *pvec)
225{
226 int pgmoved = 0;
227
228 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
229 __count_vm_events(PGROTATED, pgmoved);
230}
231
134/* 232/*
135 * Writeback is about to end against a page which has been marked for immediate 233 * Writeback is about to end against a page which has been marked for immediate
136 * reclaim. If it still appears to be reclaimable, move it to the tail of the 234 * reclaim. If it still appears to be reclaimable, move it to the tail of the
137 * inactive list. 235 * inactive list.
138 */ 236 */
139void rotate_reclaimable_page(struct page *page) 237void rotate_reclaimable_page(struct page *page)
140{ 238{
141 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 239 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
142 !PageUnevictable(page) && PageLRU(page)) { 240 !PageUnevictable(page) && PageLRU(page)) {
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
173} 271}
174 272
175/* 273/*
176 * FIXME: speed this up? 274 * A page will go to active list either by activate_page or putback_lru_page.
 275 * In the activate_page case, the page does not yet have the active bit set.
 276 * It may not be on an LRU list because it was isolated before it got a chance
 277 * to be moved to the active list; the window is small because a pagevec only
 278 * stores a few pages. In that case we do nothing for the page.
 279 * In the putback_lru_page case, the page is not on an LRU list but already
 280 * has the active bit set.
177 */ 281 */
178void activate_page(struct page *page) 282static void __activate_page(struct page *page, void *arg)
179{ 283{
180 struct zone *zone = page_zone(page); 284 struct zone *zone = page_zone(page);
285 int file = page_is_file_cache(page);
286 int lru = page_lru_base_type(page);
287 bool putback = !PageLRU(page);
181 288
182 spin_lock_irq(&zone->lru_lock); 289 /* The page is isolated before it's moved to active list */
183 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 290 if (!PageLRU(page) && !PageActive(page))
184 int file = page_is_file_cache(page); 291 return;
185 int lru = page_lru_base_type(page); 292 if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page))
293 return;
294
295 if (!putback)
186 del_page_from_lru_list(zone, page, lru); 296 del_page_from_lru_list(zone, page, lru);
297 else
298 SetPageLRU(page);
187 299
188 SetPageActive(page); 300 SetPageActive(page);
189 lru += LRU_ACTIVE; 301 lru += LRU_ACTIVE;
190 add_page_to_lru_list(zone, page, lru); 302 add_page_to_lru_list(zone, page, lru);
191 __count_vm_event(PGACTIVATE);
192 303
193 update_page_reclaim_stat(zone, page, file, 1); 304 if (putback)
305 return;
306 __count_vm_event(PGACTIVATE);
307 update_page_reclaim_stat(zone, page, file, 1);
308}
309
310#ifdef CONFIG_SMP
311static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
312
313static void activate_page_drain(int cpu)
314{
315 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
316
317 if (pagevec_count(pvec))
318 pagevec_lru_move_fn(pvec, __activate_page, NULL);
319}
320
321void activate_page(struct page *page)
322{
323 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
324 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
325
326 page_cache_get(page);
327 if (!pagevec_add(pvec, page))
328 pagevec_lru_move_fn(pvec, __activate_page, NULL);
329 put_cpu_var(activate_page_pvecs);
330 }
331}
332
333/* Caller should hold zone->lru_lock */
334int putback_active_lru_page(struct zone *zone, struct page *page)
335{
336 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
337
338 if (!pagevec_add(pvec, page)) {
339 spin_unlock_irq(&zone->lru_lock);
340 pagevec_lru_move_fn(pvec, __activate_page, NULL);
341 spin_lock_irq(&zone->lru_lock);
194 } 342 }
343 put_cpu_var(activate_page_pvecs);
344 return 1;
345}
346
347#else
348static inline void activate_page_drain(int cpu)
349{
350}
351
352void activate_page(struct page *page)
353{
354 struct zone *zone = page_zone(page);
355
356 spin_lock_irq(&zone->lru_lock);
357 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page))
358 __activate_page(page, NULL);
195 spin_unlock_irq(&zone->lru_lock); 359 spin_unlock_irq(&zone->lru_lock);
196} 360}
361#endif
197 362
198/* 363/*
199 * Mark a page as having seen activity. 364 * Mark a page as having seen activity.
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu)
292 pagevec_move_tail(pvec); 457 pagevec_move_tail(pvec);
293 local_irq_restore(flags); 458 local_irq_restore(flags);
294 } 459 }
460 activate_page_drain(cpu);
295} 461}
296 462
297void lru_add_drain(void) 463void lru_add_drain(void)
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec)
399 565
400EXPORT_SYMBOL(__pagevec_release); 566EXPORT_SYMBOL(__pagevec_release);
401 567
568/* used by __split_huge_page_refcount() */
569void lru_add_page_tail(struct zone* zone,
570 struct page *page, struct page *page_tail)
571{
572 int active;
573 enum lru_list lru;
574 const int file = 0;
575 struct list_head *head;
576
577 VM_BUG_ON(!PageHead(page));
578 VM_BUG_ON(PageCompound(page_tail));
579 VM_BUG_ON(PageLRU(page_tail));
580 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
581
582 SetPageLRU(page_tail);
583
584 if (page_evictable(page_tail, NULL)) {
585 if (PageActive(page)) {
586 SetPageActive(page_tail);
587 active = 1;
588 lru = LRU_ACTIVE_ANON;
589 } else {
590 active = 0;
591 lru = LRU_INACTIVE_ANON;
592 }
593 update_page_reclaim_stat(zone, page_tail, file, active);
594 if (likely(PageLRU(page)))
595 head = page->lru.prev;
596 else
597 head = &zone->lru[lru].list;
598 __add_page_to_lru_list(zone, page_tail, lru, head);
599 } else {
600 SetPageUnevictable(page_tail);
601 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
602 }
603}
604
605static void ____pagevec_lru_add_fn(struct page *page, void *arg)
606{
607 enum lru_list lru = (enum lru_list)arg;
608 struct zone *zone = page_zone(page);
609 int file = is_file_lru(lru);
610 int active = is_active_lru(lru);
611
612 VM_BUG_ON(PageActive(page));
613 VM_BUG_ON(PageUnevictable(page));
614 VM_BUG_ON(PageLRU(page));
615
616 SetPageLRU(page);
617 if (active)
618 SetPageActive(page);
619 update_page_reclaim_stat(zone, page, file, active);
620 add_page_to_lru_list(zone, page, lru);
621}
622
402/* 623/*
403 * Add the passed pages to the LRU, then drop the caller's refcount 624 * Add the passed pages to the LRU, then drop the caller's refcount
404 * on them. Reinitialises the caller's pagevec. 625 * on them. Reinitialises the caller's pagevec.
405 */ 626 */
406void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 627void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
407{ 628{
408 int i;
409 struct zone *zone = NULL;
410
411 VM_BUG_ON(is_unevictable_lru(lru)); 629 VM_BUG_ON(is_unevictable_lru(lru));
412 630
413 for (i = 0; i < pagevec_count(pvec); i++) { 631 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
414 struct page *page = pvec->pages[i];
415 struct zone *pagezone = page_zone(page);
416 int file;
417 int active;
418
419 if (pagezone != zone) {
420 if (zone)
421 spin_unlock_irq(&zone->lru_lock);
422 zone = pagezone;
423 spin_lock_irq(&zone->lru_lock);
424 }
425 VM_BUG_ON(PageActive(page));
426 VM_BUG_ON(PageUnevictable(page));
427 VM_BUG_ON(PageLRU(page));
428 SetPageLRU(page);
429 active = is_active_lru(lru);
430 file = is_file_lru(lru);
431 if (active)
432 SetPageActive(page);
433 update_page_reclaim_stat(zone, page, file, active);
434 add_page_to_lru_list(zone, page, lru);
435 }
436 if (zone)
437 spin_unlock_irq(&zone->lru_lock);
438 release_pages(pvec->pages, pvec->nr, pvec->cold);
439 pagevec_reinit(pvec);
440} 632}
441 633
442EXPORT_SYMBOL(____pagevec_lru_add); 634EXPORT_SYMBOL(____pagevec_lru_add);
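
The new pagevec_lru_move_fn() factors out the "walk a pagevec, taking each zone's lru_lock only when the zone changes" boilerplate, so the per-page work is supplied as a callback; pagevec_move_tail(), activate_page_drain() and ____pagevec_lru_add() above are all expressed through it. A sketch of what a further batched LRU operation would look like (example_move_fn/example_drain are invented names, and such a helper would have to live in mm/swap.c since pagevec_lru_move_fn() is static there):

static void example_move_fn(struct page *page, void *arg)
{
	int *moved = arg;

	/* zone->lru_lock for this page is already held by the caller */
	if (PageLRU(page) && !PageUnevictable(page)) {
		struct zone *zone = page_zone(page);

		list_move(&page->lru, &zone->lru[page_lru(page)].list);
		(*moved)++;
	}
}

static void example_drain(struct pagevec *pvec)
{
	int moved = 0;

	pagevec_lru_move_fn(pvec, example_move_fn, &moved);
}
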
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 157 if (!entry.val)
158 return 0; 158 return 0;
159 159
160 if (unlikely(PageTransHuge(page)))
161 if (unlikely(split_huge_page(page))) {
162 swapcache_free(entry, NULL);
163 return 0;
164 }
165
160 /* 166 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 167 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 168 * completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67ddaaf98c74..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
964 pmd = pmd_offset(pud, addr); 964 pmd = pmd_offset(pud, addr);
965 do { 965 do {
966 next = pmd_addr_end(addr, end); 966 next = pmd_addr_end(addr, end);
967 if (unlikely(pmd_trans_huge(*pmd)))
968 continue;
967 if (pmd_none_or_clear_bad(pmd)) 969 if (pmd_none_or_clear_bad(pmd))
968 continue; 970 continue;
969 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 971 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1677,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1677 if (S_ISBLK(inode->i_mode)) { 1679 if (S_ISBLK(inode->i_mode)) {
1678 struct block_device *bdev = I_BDEV(inode); 1680 struct block_device *bdev = I_BDEV(inode);
1679 set_blocksize(bdev, p->old_block_size); 1681 set_blocksize(bdev, p->old_block_size);
1680 bd_release(bdev); 1682 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1681 } else { 1683 } else {
1682 mutex_lock(&inode->i_mutex); 1684 mutex_lock(&inode->i_mutex);
1683 inode->i_flags &= ~S_SWAPFILE; 1685 inode->i_flags &= ~S_SWAPFILE;
@@ -1939,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1939 error = -EINVAL; 1941 error = -EINVAL;
1940 if (S_ISBLK(inode->i_mode)) { 1942 if (S_ISBLK(inode->i_mode)) {
1941 bdev = I_BDEV(inode); 1943 bdev = I_BDEV(inode);
1942 error = bd_claim(bdev, sys_swapon); 1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon);
1943 if (error < 0) { 1946 if (error < 0) {
1944 bdev = NULL; 1947 bdev = NULL;
1945 error = -EINVAL; 1948 error = -EINVAL;
@@ -2136,7 +2139,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2136bad_swap: 2139bad_swap:
2137 if (bdev) { 2140 if (bdev) {
2138 set_blocksize(bdev, p->old_block_size); 2141 set_blocksize(bdev, p->old_block_size);
2139 bd_release(bdev); 2142 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2140 } 2143 }
2141 destroy_swap_extents(p); 2144 destroy_swap_extents(p);
2142 swap_cgroup_swapoff(type); 2145 swap_cgroup_swapoff(type);
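
The swapon/swapoff hunks replace the old bd_claim()/bd_release() pair with claiming the block device exclusively at open time. The shape of the new API as used above, wrapped in two invented helpers (the holder cookie passed to blkdev_get() is what identifies the exclusive claimant):

#include <linux/fs.h>
#include <linux/blkdev.h>

static int claim_bdev_exclusive(struct block_device *bdev, void *holder)
{
	return blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, holder);
}

static void release_bdev_exclusive(struct block_device *bdev)
{
	/* FMODE_EXCL must be passed again so the exclusive claim is dropped */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
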
diff --git a/mm/util.c b/mm/util.c
index 73dac81e9f78..f126975ef23e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
186} 186}
187EXPORT_SYMBOL(kzfree); 187EXPORT_SYMBOL(kzfree);
188 188
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
210/* 189/*
211 * strndup_user - duplicate an existing string from user space 190 * strndup_user - duplicate an existing string from user space
212 * @s: The string to duplicate 191 * @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 816f074fb4e1..f9b166732e70 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
749 VMALLOC_START, VMALLOC_END, 749 VMALLOC_START, VMALLOC_END,
750 node, gfp_mask); 750 node, gfp_mask);
751 if (unlikely(IS_ERR(va))) { 751 if (IS_ERR(va)) {
752 kfree(vb); 752 kfree(vb);
753 return ERR_CAST(va); 753 return ERR_CAST(va);
754 } 754 }
@@ -1316,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1316 -1, GFP_KERNEL, caller); 1316 -1, GFP_KERNEL, caller);
1317} 1317}
1318 1318
1319struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1320 int node, gfp_t gfp_mask)
1321{
1322 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1323 node, gfp_mask, __builtin_return_address(0));
1324}
1325
1326static struct vm_struct *find_vm_area(const void *addr) 1319static struct vm_struct *find_vm_area(const void *addr)
1327{ 1320{
1328 struct vmap_area *va; 1321 struct vmap_area *va;
@@ -1538,25 +1531,12 @@ fail:
1538 return NULL; 1531 return NULL;
1539} 1532}
1540 1533
1541void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1542{
1543 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1544 __builtin_return_address(0));
1545
1546 /*
1547 * A ref_count = 3 is needed because the vm_struct and vmap_area
1548 * structures allocated in the __get_vm_area_node() function contain
1549 * references to the virtual address of the vmalloc'ed block.
1550 */
1551 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1552
1553 return addr;
1554}
1555
1556/** 1534/**
1557 * __vmalloc_node - allocate virtually contiguous memory 1535 * __vmalloc_node_range - allocate virtually contiguous memory
1558 * @size: allocation size 1536 * @size: allocation size
1559 * @align: desired alignment 1537 * @align: desired alignment
1538 * @start: vm area range start
1539 * @end: vm area range end
1560 * @gfp_mask: flags for the page level allocator 1540 * @gfp_mask: flags for the page level allocator
1561 * @prot: protection mask for the allocated pages 1541 * @prot: protection mask for the allocated pages
1562 * @node: node to use for allocation or -1 1542 * @node: node to use for allocation or -1
@@ -1566,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1566 * allocator with @gfp_mask flags. Map them into contiguous 1546 * allocator with @gfp_mask flags. Map them into contiguous
1567 * kernel virtual space, using a pagetable protection of @prot. 1547 * kernel virtual space, using a pagetable protection of @prot.
1568 */ 1548 */
1569static void *__vmalloc_node(unsigned long size, unsigned long align, 1549void *__vmalloc_node_range(unsigned long size, unsigned long align,
1570 gfp_t gfp_mask, pgprot_t prot, 1550 unsigned long start, unsigned long end, gfp_t gfp_mask,
1571 int node, void *caller) 1551 pgprot_t prot, int node, void *caller)
1572{ 1552{
1573 struct vm_struct *area; 1553 struct vm_struct *area;
1574 void *addr; 1554 void *addr;
@@ -1578,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1578 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1558 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1579 return NULL; 1559 return NULL;
1580 1560
1581 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1561 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1582 VMALLOC_END, node, gfp_mask, caller); 1562 gfp_mask, caller);
1583 1563
1584 if (!area) 1564 if (!area)
1585 return NULL; 1565 return NULL;
@@ -1596,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1596 return addr; 1576 return addr;
1597} 1577}
1598 1578
1579/**
1580 * __vmalloc_node - allocate virtually contiguous memory
1581 * @size: allocation size
1582 * @align: desired alignment
1583 * @gfp_mask: flags for the page level allocator
1584 * @prot: protection mask for the allocated pages
1585 * @node: node to use for allocation or -1
1586 * @caller: caller's return address
1587 *
1588 * Allocate enough pages to cover @size from the page level
1589 * allocator with @gfp_mask flags. Map them into contiguous
1590 * kernel virtual space, using a pagetable protection of @prot.
1591 */
1592static void *__vmalloc_node(unsigned long size, unsigned long align,
1593 gfp_t gfp_mask, pgprot_t prot,
1594 int node, void *caller)
1595{
1596 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1597 gfp_mask, prot, node, caller);
1598}
1599
1599void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1600void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1600{ 1601{
1601 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1602 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
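With the hunks above, __vmalloc_node() is reduced to a wrapper around the new __vmalloc_node_range(), which takes the virtual range as explicit start/end parameters. As a hedged sketch, this is how an architecture's module allocator might target its own VA window with the new helper (MODULES_VADDR/MODULES_END and the exec protection are assumptions about the arch, not part of this patch):

#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <asm/pgtable.h>

/* Sketch: back 'size' bytes with pages and map them into the module
 * address range rather than the generic vmalloc range. */
static void *alloc_in_module_range(unsigned long size)
{
        return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
                                    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
                                    __builtin_return_address(0));
}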
@@ -2204,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2204 * @sizes: array containing size of each area 2205 * @sizes: array containing size of each area
2205 * @nr_vms: the number of areas to allocate 2206 * @nr_vms: the number of areas to allocate
2206 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2207 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2207 * @gfp_mask: allocation mask
2208 * 2208 *
2209 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2209 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2210 * vm_structs on success, %NULL on failure 2210 * vm_structs on success, %NULL on failure
2211 * 2211 *
2212 * Percpu allocator wants to use congruent vm areas so that it can 2212 * Percpu allocator wants to use congruent vm areas so that it can
2213 * maintain the offsets among percpu areas. This function allocates 2213 * maintain the offsets among percpu areas. This function allocates
2214 * congruent vmalloc areas for it. These areas tend to be scattered 2214 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2215 * pretty far, distance between two areas easily going up to 2215 * be scattered pretty far, distance between two areas easily going up
2216 * gigabytes. To avoid interacting with regular vmallocs, these areas 2216 * to gigabytes. To avoid interacting with regular vmallocs, these
2217 * are allocated from top. 2217 * areas are allocated from top.
2218 * 2218 *
2219 * Despite its complicated look, this allocator is rather simple. It 2219 * Despite its complicated look, this allocator is rather simple. It
2220 * does everything top-down and scans areas from the end looking for 2220 * does everything top-down and scans areas from the end looking for
@@ -2225,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2225 */ 2225 */
2226struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2226struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2227 const size_t *sizes, int nr_vms, 2227 const size_t *sizes, int nr_vms,
2228 size_t align, gfp_t gfp_mask) 2228 size_t align)
2229{ 2229{
2230 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2230 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2231 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2231 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2235,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2235 unsigned long base, start, end, last_end; 2235 unsigned long base, start, end, last_end;
2236 bool purged = false; 2236 bool purged = false;
2237 2237
2238 gfp_mask &= GFP_RECLAIM_MASK;
2239
2240 /* verify parameters and allocate data structures */ 2238 /* verify parameters and allocate data structures */
2241 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2239 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2242 for (last_area = 0, area = 0; area < nr_vms; area++) { 2240 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2269,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2269 return NULL; 2267 return NULL;
2270 } 2268 }
2271 2269
2272 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2270 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2273 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2271 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2274 if (!vas || !vms) 2272 if (!vas || !vms)
2275 goto err_free; 2273 goto err_free;
2276 2274
2277 for (area = 0; area < nr_vms; area++) { 2275 for (area = 0; area < nr_vms; area++) {
2278 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2276 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2279 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2277 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2280 if (!vas[area] || !vms[area]) 2278 if (!vas[area] || !vms[area])
2281 goto err_free; 2279 goto err_free;
2282 } 2280 }
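The pcpu_get_vm_areas() hunks drop the gfp_mask argument entirely; the bookkeeping structures are now always allocated with GFP_KERNEL. A hedged sketch of a call under the new four-argument signature (the offsets and sizes are made up for illustration; the real caller is the percpu first-chunk setup code):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: request two congruent areas, 64K each, 1M apart, page aligned. */
static struct vm_struct **grab_congruent_areas(void)
{
        static const unsigned long offsets[] = { 0, 1UL << 20 };
        static const size_t sizes[] = { 1UL << 16, 1UL << 16 };

        return pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
}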
@@ -2457,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p)
2457 seq_printf(m, "0x%p-0x%p %7ld", 2455 seq_printf(m, "0x%p-0x%p %7ld",
2458 v->addr, v->addr + v->size, v->size); 2456 v->addr, v->addr + v->size, v->size);
2459 2457
2460 if (v->caller) { 2458 if (v->caller)
2461 char buff[KSYM_SYMBOL_LEN]; 2459 seq_printf(m, " %pS", v->caller);
2462
2463 seq_putc(m, ' ');
2464 sprint_symbol(buff, (unsigned long)v->caller);
2465 seq_puts(m, buff);
2466 }
2467 2460
2468 if (v->nr_pages) 2461 if (v->nr_pages)
2469 seq_printf(m, " pages=%d", v->nr_pages); 2462 seq_printf(m, " pages=%d", v->nr_pages);
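The s_show() change above relies on the %pS vsprintf extension, which resolves a text address to symbol+offset and so replaces the manual sprint_symbol() into a stack buffer. A generic illustration of the same format specifier (not code from this patch):

#include <linux/kernel.h>

/* Sketch: print the caller of this function as symbol+offset/size,
 * e.g. "some_function+0x1a/0x90". */
static void report_caller(void)
{
        printk(KERN_DEBUG "called from %pS\n", __builtin_return_address(0));
}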
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..99999a9b2b0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,7 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/compaction.h>
43 45
44#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
45#include <asm/div64.h> 47#include <asm/div64.h>
@@ -51,11 +53,23 @@
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
53 55
54enum lumpy_mode { 56/*
55 LUMPY_MODE_NONE, 57 * reclaim_mode determines how the inactive list is shrunk
56 LUMPY_MODE_ASYNC, 58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
57 LUMPY_MODE_SYNC, 59 * RECLAIM_MODE_ASYNC: Do not block
58}; 60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
59 73
60struct scan_control { 74struct scan_control {
61 /* Incremented by the number of inactive pages that were scanned */ 75 /* Incremented by the number of inactive pages that were scanned */
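The reclaim_mode_t flags above replace the three-state lumpy_mode enum with a bitmask, so "how to reclaim" (single, lumpy, or compaction) and "whether to block" (async vs. sync) can be combined independently; __bitwise__/__force are sparse annotations only and have no runtime effect. A small user-space sketch of the same flag arithmetic (names shortened, purely illustrative):

#include <assert.h>

#define MODE_SINGLE     0x01u
#define MODE_ASYNC      0x02u
#define MODE_SYNC       0x04u
#define MODE_LUMPY      0x08u
#define MODE_COMPACTION 0x10u

int main(void)
{
        unsigned mode = MODE_COMPACTION;   /* set_reclaim_mode() with COMPACTION_BUILD */

        mode |= MODE_SYNC;                 /* costly order: allow blocking */
        assert(mode & MODE_COMPACTION);
        assert(!(mode & MODE_LUMPY));      /* lumpy and compaction are alternatives */

        mode = MODE_SINGLE | MODE_ASYNC;   /* reset_reclaim_mode() */
        assert(!(mode & MODE_SYNC));
        return 0;
}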
@@ -88,7 +102,7 @@ struct scan_control {
88 * Intend to reclaim enough continuous memory rather than reclaim 102 * Intend to reclaim enough continuous memory rather than reclaim
89 * enough amount of memory. i.e, mode for high order allocation. 103 * enough amount of memory. i.e, mode for high order allocation.
90 */ 104 */
91 enum lumpy_mode lumpy_reclaim_mode; 105 reclaim_mode_t reclaim_mode;
92 106
93 /* Which cgroup do we reclaim from */ 107 /* Which cgroup do we reclaim from */
94 struct mem_cgroup *mem_cgroup; 108 struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
271 return ret; 285 return ret;
272} 286}
273 287
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, 288static void set_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync) 289 bool sync)
276{ 290{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; 291 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
278 292
279 /* 293 /*
280 * Some reclaim have alredy been failed. No worth to try synchronous 294 * Initially assume we are entering either lumpy reclaim or
 281 * lumpy reclaim. 295 * reclaim/compaction. Depending on the order, we will either set the
296 * sync mode or just reclaim order-0 pages later.
282 */ 297 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 298 if (COMPACTION_BUILD)
284 return; 299 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
300 else
301 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
285 302
286 /* 303 /*
287 * If we need a large contiguous chunk of memory, or have 304 * Avoid using lumpy reclaim or reclaim/compaction if possible by
 288 * trouble getting a small set of contiguous pages, we 305 * restricting when it is set to either costly allocations or when
289 * will reclaim both active and inactive pages. 306 * under memory pressure
290 */ 307 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 308 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode; 309 sc->reclaim_mode |= syncmode;
293 else if (sc->order && priority < DEF_PRIORITY - 2) 310 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode; 311 sc->reclaim_mode |= syncmode;
295 else 312 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 313 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
297} 314}
298 315
299static void disable_lumpy_reclaim_mode(struct scan_control *sc) 316static void reset_reclaim_mode(struct scan_control *sc)
300{ 317{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 318 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
302} 319}
303 320
304static inline int is_page_cache_freeable(struct page *page) 321static inline int is_page_cache_freeable(struct page *page)
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
429 * first attempt to free a range of pages fails. 446 * first attempt to free a range of pages fails.
430 */ 447 */
431 if (PageWriteback(page) && 448 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) 449 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
433 wait_on_page_writeback(page); 450 wait_on_page_writeback(page);
434 451
435 if (!PageWriteback(page)) { 452 if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
437 ClearPageReclaim(page); 454 ClearPageReclaim(page);
438 } 455 }
439 trace_mm_vmscan_writepage(page, 456 trace_mm_vmscan_writepage(page,
440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); 457 trace_reclaim_flags(page, sc->reclaim_mode));
441 inc_zone_page_state(page, NR_VMSCAN_WRITE); 458 inc_zone_page_state(page, NR_VMSCAN_WRITE);
442 return PAGE_SUCCESS; 459 return PAGE_SUCCESS;
443 } 460 }
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
622 referenced_page = TestClearPageReferenced(page); 639 referenced_page = TestClearPageReferenced(page);
623 640
624 /* Lumpy reclaim - ignore references */ 641 /* Lumpy reclaim - ignore references */
625 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) 642 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
626 return PAGEREF_RECLAIM; 643 return PAGEREF_RECLAIM;
627 644
628 /* 645 /*
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
739 * for any page for which writeback has already 756 * for any page for which writeback has already
740 * started. 757 * started.
741 */ 758 */
742 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && 759 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
743 may_enter_fs) 760 may_enter_fs)
744 wait_on_page_writeback(page); 761 wait_on_page_writeback(page);
745 else { 762 else {
@@ -895,7 +912,7 @@ cull_mlocked:
895 try_to_free_swap(page); 912 try_to_free_swap(page);
896 unlock_page(page); 913 unlock_page(page);
897 putback_lru_page(page); 914 putback_lru_page(page);
898 disable_lumpy_reclaim_mode(sc); 915 reset_reclaim_mode(sc);
899 continue; 916 continue;
900 917
901activate_locked: 918activate_locked:
@@ -908,7 +925,7 @@ activate_locked:
908keep_locked: 925keep_locked:
909 unlock_page(page); 926 unlock_page(page);
910keep: 927keep:
911 disable_lumpy_reclaim_mode(sc); 928 reset_reclaim_mode(sc);
912keep_lumpy: 929keep_lumpy:
913 list_add(&page->lru, &ret_pages); 930 list_add(&page->lru, &ret_pages);
914 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 931 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1028 case 0: 1045 case 0:
1029 list_move(&page->lru, dst); 1046 list_move(&page->lru, dst);
1030 mem_cgroup_del_lru(page); 1047 mem_cgroup_del_lru(page);
1031 nr_taken++; 1048 nr_taken += hpage_nr_pages(page);
1032 break; 1049 break;
1033 1050
1034 case -EBUSY: 1051 case -EBUSY:
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1086 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1103 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1087 list_move(&cursor_page->lru, dst); 1104 list_move(&cursor_page->lru, dst);
1088 mem_cgroup_del_lru(cursor_page); 1105 mem_cgroup_del_lru(cursor_page);
1089 nr_taken++; 1106 nr_taken += hpage_nr_pages(page);
1090 nr_lumpy_taken++; 1107 nr_lumpy_taken++;
1091 if (PageDirty(cursor_page)) 1108 if (PageDirty(cursor_page))
1092 nr_lumpy_dirty++; 1109 nr_lumpy_dirty++;
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1141 struct page *page; 1158 struct page *page;
1142 1159
1143 list_for_each_entry(page, page_list, lru) { 1160 list_for_each_entry(page, page_list, lru) {
1161 int numpages = hpage_nr_pages(page);
1144 lru = page_lru_base_type(page); 1162 lru = page_lru_base_type(page);
1145 if (PageActive(page)) { 1163 if (PageActive(page)) {
1146 lru += LRU_ACTIVE; 1164 lru += LRU_ACTIVE;
1147 ClearPageActive(page); 1165 ClearPageActive(page);
1148 nr_active++; 1166 nr_active += numpages;
1149 } 1167 }
1150 if (count) 1168 if (count)
1151 count[lru]++; 1169 count[lru] += numpages;
1152 } 1170 }
1153 1171
1154 return nr_active; 1172 return nr_active;
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1253 spin_lock_irq(&zone->lru_lock); 1271 spin_lock_irq(&zone->lru_lock);
1254 continue; 1272 continue;
1255 } 1273 }
1256 SetPageLRU(page);
1257 lru = page_lru(page); 1274 lru = page_lru(page);
1258 add_page_to_lru_list(zone, page, lru);
1259 if (is_active_lru(lru)) { 1275 if (is_active_lru(lru)) {
1260 int file = is_file_lru(lru); 1276 int file = is_file_lru(lru);
1261 reclaim_stat->recent_rotated[file]++; 1277 int numpages = hpage_nr_pages(page);
1278 reclaim_stat->recent_rotated[file] += numpages;
1279 if (putback_active_lru_page(zone, page))
1280 continue;
1262 } 1281 }
1282 SetPageLRU(page);
1283 add_page_to_lru_list(zone, page, lru);
1263 if (!pagevec_add(&pvec, page)) { 1284 if (!pagevec_add(&pvec, page)) {
1264 spin_unlock_irq(&zone->lru_lock); 1285 spin_unlock_irq(&zone->lru_lock);
1265 __pagevec_release(&pvec); 1286 __pagevec_release(&pvec);
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1324 return false; 1345 return false;
1325 1346
1326 /* Only stall on lumpy reclaim */ 1347 /* Only stall on lumpy reclaim */
1327 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 1348 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1328 return false; 1349 return false;
1329 1350
 1330 /* If we have reclaimed everything on the isolated list, no stall */ 1351
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1368 return SWAP_CLUSTER_MAX; 1389 return SWAP_CLUSTER_MAX;
1369 } 1390 }
1370 1391
1371 set_lumpy_reclaim_mode(priority, sc, false); 1392 set_reclaim_mode(priority, sc, false);
1372 lru_add_drain(); 1393 lru_add_drain();
1373 spin_lock_irq(&zone->lru_lock); 1394 spin_lock_irq(&zone->lru_lock);
1374 1395
1375 if (scanning_global_lru(sc)) { 1396 if (scanning_global_lru(sc)) {
1376 nr_taken = isolate_pages_global(nr_to_scan, 1397 nr_taken = isolate_pages_global(nr_to_scan,
1377 &page_list, &nr_scanned, sc->order, 1398 &page_list, &nr_scanned, sc->order,
1378 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1399 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1379 ISOLATE_INACTIVE : ISOLATE_BOTH, 1400 ISOLATE_BOTH : ISOLATE_INACTIVE,
1380 zone, 0, file); 1401 zone, 0, file);
1381 zone->pages_scanned += nr_scanned; 1402 zone->pages_scanned += nr_scanned;
1382 if (current_is_kswapd()) 1403 if (current_is_kswapd())
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1388 } else { 1409 } else {
1389 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1410 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1390 &page_list, &nr_scanned, sc->order, 1411 &page_list, &nr_scanned, sc->order,
1391 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1412 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1392 ISOLATE_INACTIVE : ISOLATE_BOTH, 1413 ISOLATE_BOTH : ISOLATE_INACTIVE,
1393 zone, sc->mem_cgroup, 1414 zone, sc->mem_cgroup,
1394 0, file); 1415 0, file);
1395 /* 1416 /*
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1411 1432
 1412 /* Check if we should synchronously wait for writeback */ 1433
1413 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1434 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1414 set_lumpy_reclaim_mode(priority, sc, true); 1435 set_reclaim_mode(priority, sc, true);
1415 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1436 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1416 } 1437 }
1417 1438
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1426 zone_idx(zone), 1447 zone_idx(zone),
1427 nr_scanned, nr_reclaimed, 1448 nr_scanned, nr_reclaimed,
1428 priority, 1449 priority,
1429 trace_shrink_flags(file, sc->lumpy_reclaim_mode)); 1450 trace_shrink_flags(file, sc->reclaim_mode));
1430 return nr_reclaimed; 1451 return nr_reclaimed;
1431} 1452}
1432 1453
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1466 1487
1467 list_move(&page->lru, &zone->lru[lru].list); 1488 list_move(&page->lru, &zone->lru[lru].list);
1468 mem_cgroup_add_lru_list(page, lru); 1489 mem_cgroup_add_lru_list(page, lru);
1469 pgmoved++; 1490 pgmoved += hpage_nr_pages(page);
1470 1491
1471 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1492 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1472 spin_unlock_irq(&zone->lru_lock); 1493 spin_unlock_irq(&zone->lru_lock);
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1534 } 1555 }
1535 1556
1536 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1557 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1537 nr_rotated++; 1558 nr_rotated += hpage_nr_pages(page);
1538 /* 1559 /*
1539 * Identify referenced, file-backed active pages and 1560 * Identify referenced, file-backed active pages and
1540 * give them one more trip around the active list. So 1561 * give them one more trip around the active list. So
@@ -1805,6 +1826,57 @@ out:
1805} 1826}
1806 1827
1807/* 1828/*
1829 * Reclaim/compaction depends on a number of pages being freed. To avoid
1830 * disruption to the system, a small number of order-0 pages continue to be
1831 * rotated and reclaimed in the normal fashion. However, by the time we get
1832 * back to the allocator and call try_to_compact_zone(), we ensure that
1833 * there are enough free pages for it to be likely successful
1834 */
1835static inline bool should_continue_reclaim(struct zone *zone,
1836 unsigned long nr_reclaimed,
1837 unsigned long nr_scanned,
1838 struct scan_control *sc)
1839{
1840 unsigned long pages_for_compaction;
1841 unsigned long inactive_lru_pages;
1842
1843 /* If not in reclaim/compaction mode, stop */
1844 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1845 return false;
1846
1847 /*
1848 * If we failed to reclaim and have scanned the full list, stop.
1849 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
 1850 * faster but obviously would be less likely to satisfy the
 1851 * allocation. If this is desirable, use __GFP_REPEAT to decide
 1852 * whether both reclaimed and scanned should be checked or just
 1853 * reclaimed
1854 */
1855 if (!nr_reclaimed && !nr_scanned)
1856 return false;
1857
1858 /*
1859 * If we have not reclaimed enough pages for compaction and the
1860 * inactive lists are large enough, continue reclaiming
1861 */
1862 pages_for_compaction = (2UL << sc->order);
1863 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1864 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1865 if (sc->nr_reclaimed < pages_for_compaction &&
1866 inactive_lru_pages > pages_for_compaction)
1867 return true;
1868
1869 /* If compaction would go ahead or the allocation would succeed, stop */
1870 switch (compaction_suitable(zone, sc->order)) {
1871 case COMPACT_PARTIAL:
1872 case COMPACT_CONTINUE:
1873 return false;
1874 default:
1875 return true;
1876 }
1877}
1878
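The pages_for_compaction target in should_continue_reclaim() is 2UL << sc->order, i.e. twice the requested block. A quick check of the numbers for a THP-sized request (a worked sketch assuming 4K base pages):

#include <stdio.h>

int main(void)
{
        unsigned int order = 9;                     /* 2MB huge page on x86 with 4K pages */
        unsigned long pages_for_compaction = 2UL << order;

        printf("order-%u: keep reclaiming until %lu pages (%lu KB) are free\n",
               order, pages_for_compaction, pages_for_compaction * 4);
        /* prints: order-9: keep reclaiming until 1024 pages (4096 KB) are free */
        return 0;
}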
1879/*
1808 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1880 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1809 */ 1881 */
1810static void shrink_zone(int priority, struct zone *zone, 1882static void shrink_zone(int priority, struct zone *zone,
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone,
1813 unsigned long nr[NR_LRU_LISTS]; 1885 unsigned long nr[NR_LRU_LISTS];
1814 unsigned long nr_to_scan; 1886 unsigned long nr_to_scan;
1815 enum lru_list l; 1887 enum lru_list l;
1816 unsigned long nr_reclaimed = sc->nr_reclaimed; 1888 unsigned long nr_reclaimed;
1817 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1889 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1890 unsigned long nr_scanned = sc->nr_scanned;
1818 1891
1892restart:
1893 nr_reclaimed = 0;
1819 get_scan_count(zone, sc, nr, priority); 1894 get_scan_count(zone, sc, nr, priority);
1820 1895
1821 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1896 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone,
1841 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1916 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1842 break; 1917 break;
1843 } 1918 }
1844 1919 sc->nr_reclaimed += nr_reclaimed;
1845 sc->nr_reclaimed = nr_reclaimed;
1846 1920
1847 /* 1921 /*
1848 * Even if we did not try to evict anon pages at all, we want to 1922 * Even if we did not try to evict anon pages at all, we want to
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone,
1851 if (inactive_anon_is_low(zone, sc)) 1925 if (inactive_anon_is_low(zone, sc))
1852 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1926 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1853 1927
1928 /* reclaim/compaction might need reclaim to continue */
1929 if (should_continue_reclaim(zone, nr_reclaimed,
1930 sc->nr_scanned - nr_scanned, sc))
1931 goto restart;
1932
1854 throttle_vm_writeout(sc->gfp_mask); 1933 throttle_vm_writeout(sc->gfp_mask);
1855} 1934}
1856 1935
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2124} 2203}
2125#endif 2204#endif
2126 2205
2206/*
2207 * pgdat_balanced is used when checking if a node is balanced for high-order
2208 * allocations. Only zones that meet watermarks and are in a zone allowed
 2209 * by the caller's classzone_idx are added to balanced_pages. The total of
2210 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2211 * for the node to be considered balanced. Forcing all zones to be balanced
2212 * for high orders can cause excessive reclaim when there are imbalanced zones.
2213 * The choice of 25% is due to
2214 * o a 16M DMA zone that is balanced will not balance a zone on any
 2215 * reasonably sized machine
 2216 * o On all other machines, the top zone must be at least a reasonable
 2217 * percentage of the middle zones. For example, on 32-bit x86, highmem
 2218 * would need to be at least 256M for it to balance a whole node.
2219 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2220 * to balance a node on its own. These seemed like reasonable ratios.
2221 */
2222static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2223 int classzone_idx)
2224{
2225 unsigned long present_pages = 0;
2226 int i;
2227
2228 for (i = 0; i <= classzone_idx; i++)
2229 present_pages += pgdat->node_zones[i].present_pages;
2230
2231 return balanced_pages > (present_pages >> 2);
2232}
2233
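pgdat_balanced() sums present_pages for every zone up to classzone_idx and requires the balanced zones to exceed a quarter of that total. A small user-space sketch of the same shift-based check with made-up zone sizes (illustrative only):

#include <stdbool.h>
#include <stdio.h>

/* Same 25% test as pgdat_balanced(). */
static bool balanced_enough(unsigned long balanced_pages,
                            const unsigned long *zone_pages, int classzone_idx)
{
        unsigned long present_pages = 0;
        int i;

        for (i = 0; i <= classzone_idx; i++)
                present_pages += zone_pages[i];
        return balanced_pages > (present_pages >> 2);
}

int main(void)
{
        /* Hypothetical 32-bit layout in 4K pages: 16M DMA, 880M Normal, 3G HighMem. */
        unsigned long zones[] = { 4096, 225280, 786432 };

        /* DMA + Normal balanced: 229376 <= 1015808/4, so not enough. */
        printf("%d\n", balanced_enough(229376, zones, 2));     /* 0 */
        /* HighMem balanced on its own: 786432 > 253952, node counts as balanced. */
        printf("%d\n", balanced_enough(786432, zones, 2));     /* 1 */
        return 0;
}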
2127/* is kswapd sleeping prematurely? */ 2234/* is kswapd sleeping prematurely? */
2128static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2235static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2236 int classzone_idx)
2129{ 2237{
2130 int i; 2238 int i;
2239 unsigned long balanced = 0;
2240 bool all_zones_ok = true;
2131 2241
2132 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2242 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2133 if (remaining) 2243 if (remaining)
2134 return 1; 2244 return true;
2135 2245
2136 /* If after HZ/10, a zone is below the high mark, it's premature */ 2246 /* Check the watermark levels */
2137 for (i = 0; i < pgdat->nr_zones; i++) { 2247 for (i = 0; i < pgdat->nr_zones; i++) {
2138 struct zone *zone = pgdat->node_zones + i; 2248 struct zone *zone = pgdat->node_zones + i;
2139 2249
2140 if (!populated_zone(zone)) 2250 if (!populated_zone(zone))
2141 continue; 2251 continue;
2142 2252
2143 if (zone->all_unreclaimable) 2253 /*
2254 * balance_pgdat() skips over all_unreclaimable after
2255 * DEF_PRIORITY. Effectively, it considers them balanced so
2256 * they must be considered balanced here as well if kswapd
2257 * is to sleep
2258 */
2259 if (zone->all_unreclaimable) {
2260 balanced += zone->present_pages;
2144 continue; 2261 continue;
2262 }
2145 2263
2146 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2264 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2147 0, 0)) 2265 classzone_idx, 0))
2148 return 1; 2266 all_zones_ok = false;
2267 else
2268 balanced += zone->present_pages;
2149 } 2269 }
2150 2270
2151 return 0; 2271 /*
2272 * For high-order requests, the balanced zones must contain at least
 2273 * 25% of the node's pages for kswapd to sleep. For order-0, all zones
2274 * must be balanced
2275 */
2276 if (order)
2277 return pgdat_balanced(pgdat, balanced, classzone_idx);
2278 else
2279 return !all_zones_ok;
2152} 2280}
2153 2281
2154/* 2282/*
2155 * For kswapd, balance_pgdat() will work across all this node's zones until 2283 * For kswapd, balance_pgdat() will work across all this node's zones until
2156 * they are all at high_wmark_pages(zone). 2284 * they are all at high_wmark_pages(zone).
2157 * 2285 *
2158 * Returns the number of pages which were actually freed. 2286 * Returns the final order kswapd was reclaiming at
2159 * 2287 *
2160 * There is special handling here for zones which are full of pinned pages. 2288 * There is special handling here for zones which are full of pinned pages.
2161 * This can happen if the pages are all mlocked, or if they are all used by 2289 * This can happen if the pages are all mlocked, or if they are all used by
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2172 * interoperates with the page allocator fallback scheme to ensure that aging 2300 * interoperates with the page allocator fallback scheme to ensure that aging
2173 * of pages is balanced across the zones. 2301 * of pages is balanced across the zones.
2174 */ 2302 */
2175static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2303static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2304 int *classzone_idx)
2176{ 2305{
2177 int all_zones_ok; 2306 int all_zones_ok;
2307 unsigned long balanced;
2178 int priority; 2308 int priority;
2179 int i; 2309 int i;
2310 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2180 unsigned long total_scanned; 2311 unsigned long total_scanned;
2181 struct reclaim_state *reclaim_state = current->reclaim_state; 2312 struct reclaim_state *reclaim_state = current->reclaim_state;
2182 struct scan_control sc = { 2313 struct scan_control sc = {
@@ -2199,7 +2330,6 @@ loop_again:
2199 count_vm_event(PAGEOUTRUN); 2330 count_vm_event(PAGEOUTRUN);
2200 2331
2201 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2332 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2202 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2203 unsigned long lru_pages = 0; 2333 unsigned long lru_pages = 0;
2204 int has_under_min_watermark_zone = 0; 2334 int has_under_min_watermark_zone = 0;
2205 2335
@@ -2208,6 +2338,7 @@ loop_again:
2208 disable_swap_token(); 2338 disable_swap_token();
2209 2339
2210 all_zones_ok = 1; 2340 all_zones_ok = 1;
2341 balanced = 0;
2211 2342
2212 /* 2343 /*
2213 * Scan in the highmem->dma direction for the highest 2344 * Scan in the highmem->dma direction for the highest
@@ -2230,9 +2361,10 @@ loop_again:
2230 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2361 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2231 &sc, priority, 0); 2362 &sc, priority, 0);
2232 2363
2233 if (!zone_watermark_ok(zone, order, 2364 if (!zone_watermark_ok_safe(zone, order,
2234 high_wmark_pages(zone), 0, 0)) { 2365 high_wmark_pages(zone), 0, 0)) {
2235 end_zone = i; 2366 end_zone = i;
2367 *classzone_idx = i;
2236 break; 2368 break;
2237 } 2369 }
2238 } 2370 }
@@ -2255,6 +2387,7 @@ loop_again:
2255 * cause too much scanning of the lower zones. 2387 * cause too much scanning of the lower zones.
2256 */ 2388 */
2257 for (i = 0; i <= end_zone; i++) { 2389 for (i = 0; i <= end_zone; i++) {
2390 int compaction;
2258 struct zone *zone = pgdat->node_zones + i; 2391 struct zone *zone = pgdat->node_zones + i;
2259 int nr_slab; 2392 int nr_slab;
2260 2393
@@ -2276,7 +2409,7 @@ loop_again:
2276 * We put equal pressure on every zone, unless one 2409 * We put equal pressure on every zone, unless one
2277 * zone has way too many pages free already. 2410 * zone has way too many pages free already.
2278 */ 2411 */
2279 if (!zone_watermark_ok(zone, order, 2412 if (!zone_watermark_ok_safe(zone, order,
2280 8*high_wmark_pages(zone), end_zone, 0)) 2413 8*high_wmark_pages(zone), end_zone, 0))
2281 shrink_zone(priority, zone, &sc); 2414 shrink_zone(priority, zone, &sc);
2282 reclaim_state->reclaimed_slab = 0; 2415 reclaim_state->reclaimed_slab = 0;
@@ -2284,9 +2417,26 @@ loop_again:
2284 lru_pages); 2417 lru_pages);
2285 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2418 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2286 total_scanned += sc.nr_scanned; 2419 total_scanned += sc.nr_scanned;
2420
2421 compaction = 0;
2422 if (order &&
2423 zone_watermark_ok(zone, 0,
2424 high_wmark_pages(zone),
2425 end_zone, 0) &&
2426 !zone_watermark_ok(zone, order,
2427 high_wmark_pages(zone),
2428 end_zone, 0)) {
2429 compact_zone_order(zone,
2430 order,
2431 sc.gfp_mask, false,
2432 COMPACT_MODE_KSWAPD);
2433 compaction = 1;
2434 }
2435
2287 if (zone->all_unreclaimable) 2436 if (zone->all_unreclaimable)
2288 continue; 2437 continue;
2289 if (nr_slab == 0 && !zone_reclaimable(zone)) 2438 if (!compaction && nr_slab == 0 &&
2439 !zone_reclaimable(zone))
2290 zone->all_unreclaimable = 1; 2440 zone->all_unreclaimable = 1;
2291 /* 2441 /*
2292 * If we've done a decent amount of scanning and 2442 * If we've done a decent amount of scanning and
@@ -2297,7 +2447,7 @@ loop_again:
2297 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2447 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2298 sc.may_writepage = 1; 2448 sc.may_writepage = 1;
2299 2449
2300 if (!zone_watermark_ok(zone, order, 2450 if (!zone_watermark_ok_safe(zone, order,
2301 high_wmark_pages(zone), end_zone, 0)) { 2451 high_wmark_pages(zone), end_zone, 0)) {
2302 all_zones_ok = 0; 2452 all_zones_ok = 0;
2303 /* 2453 /*
@@ -2305,7 +2455,7 @@ loop_again:
2305 * means that we have a GFP_ATOMIC allocation 2455 * means that we have a GFP_ATOMIC allocation
2306 * failure risk. Hurry up! 2456 * failure risk. Hurry up!
2307 */ 2457 */
2308 if (!zone_watermark_ok(zone, order, 2458 if (!zone_watermark_ok_safe(zone, order,
2309 min_wmark_pages(zone), end_zone, 0)) 2459 min_wmark_pages(zone), end_zone, 0))
2310 has_under_min_watermark_zone = 1; 2460 has_under_min_watermark_zone = 1;
2311 } else { 2461 } else {
@@ -2317,10 +2467,12 @@ loop_again:
 2318 * speculatively avoid congestion waits 2468
2318 */ 2468 */
2319 zone_clear_flag(zone, ZONE_CONGESTED); 2469 zone_clear_flag(zone, ZONE_CONGESTED);
2470 if (i <= *classzone_idx)
2471 balanced += zone->present_pages;
2320 } 2472 }
2321 2473
2322 } 2474 }
2323 if (all_zones_ok) 2475 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2324 break; /* kswapd: all done */ 2476 break; /* kswapd: all done */
2325 /* 2477 /*
2326 * OK, kswapd is getting into trouble. Take a nap, then take 2478 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2343,7 +2495,13 @@ loop_again:
2343 break; 2495 break;
2344 } 2496 }
2345out: 2497out:
2346 if (!all_zones_ok) { 2498
2499 /*
2500 * order-0: All zones must meet high watermark for a balanced node
2501 * high-order: Balanced zones must make up at least 25% of the node
2502 * for the node to be balanced
2503 */
2504 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2347 cond_resched(); 2505 cond_resched();
2348 2506
2349 try_to_freeze(); 2507 try_to_freeze();
@@ -2368,7 +2526,88 @@ out:
2368 goto loop_again; 2526 goto loop_again;
2369 } 2527 }
2370 2528
2371 return sc.nr_reclaimed; 2529 /*
2530 * If kswapd was reclaiming at a higher order, it has the option of
2531 * sleeping without all zones being balanced. Before it does, it must
2532 * ensure that the watermarks for order-0 on *all* zones are met and
2533 * that the congestion flags are cleared. The congestion flag must
2534 * be cleared as kswapd is the only mechanism that clears the flag
2535 * and it is potentially going to sleep here.
2536 */
2537 if (order) {
2538 for (i = 0; i <= end_zone; i++) {
2539 struct zone *zone = pgdat->node_zones + i;
2540
2541 if (!populated_zone(zone))
2542 continue;
2543
2544 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2545 continue;
2546
2547 /* Confirm the zone is balanced for order-0 */
2548 if (!zone_watermark_ok(zone, 0,
2549 high_wmark_pages(zone), 0, 0)) {
2550 order = sc.order = 0;
2551 goto loop_again;
2552 }
2553
2554 /* If balanced, clear the congested flag */
2555 zone_clear_flag(zone, ZONE_CONGESTED);
2556 }
2557 }
2558
2559 /*
2560 * Return the order we were reclaiming at so sleeping_prematurely()
2561 * makes a decision on the order we were last reclaiming at. However,
2562 * if another caller entered the allocator slow path while kswapd
2563 * was awake, order will remain at the higher level
2564 */
2565 *classzone_idx = end_zone;
2566 return order;
2567}
2568
2569static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2570{
2571 long remaining = 0;
2572 DEFINE_WAIT(wait);
2573
2574 if (freezing(current) || kthread_should_stop())
2575 return;
2576
2577 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2578
2579 /* Try to sleep for a short interval */
2580 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2581 remaining = schedule_timeout(HZ/10);
2582 finish_wait(&pgdat->kswapd_wait, &wait);
2583 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2584 }
2585
2586 /*
2587 * After a short sleep, check if it was a premature sleep. If not, then
2588 * go fully to sleep until explicitly woken up.
2589 */
2590 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2591 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2592
2593 /*
2594 * vmstat counters are not perfectly accurate and the estimated
2595 * value for counters such as NR_FREE_PAGES can deviate from the
2596 * true value by nr_online_cpus * threshold. To avoid the zone
2597 * watermarks being breached while under pressure, we reduce the
2598 * per-cpu vmstat threshold while kswapd is awake and restore
2599 * them before going back to sleep.
2600 */
2601 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2602 schedule();
2603 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2604 } else {
2605 if (remaining)
2606 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2607 else
2608 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2609 }
2610 finish_wait(&pgdat->kswapd_wait, &wait);
2372} 2611}
2373 2612
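kswapd_try_to_sleep() above follows the standard prepare_to_wait()/schedule()/finish_wait() idiom, with the extra twist of lowering the per-cpu vmstat thresholds around the deep sleep. Stripped of the kswapd specifics, the underlying wait-queue pattern looks roughly like this (a generic sketch, not code from this patch):

#include <linux/types.h>
#include <linux/wait.h>
#include <linux/sched.h>

/* Sketch: sleep on 'wq' until 'condition' becomes true or we are woken. */
static void sleep_until(wait_queue_head_t *wq, bool (*condition)(void))
{
        DEFINE_WAIT(wait);

        prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
        if (!condition())
                schedule();             /* woken by wake_up_interruptible(wq) */
        finish_wait(wq, &wait);
}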
2374/* 2613/*
@@ -2387,9 +2626,10 @@ out:
2387static int kswapd(void *p) 2626static int kswapd(void *p)
2388{ 2627{
2389 unsigned long order; 2628 unsigned long order;
2629 int classzone_idx;
2390 pg_data_t *pgdat = (pg_data_t*)p; 2630 pg_data_t *pgdat = (pg_data_t*)p;
2391 struct task_struct *tsk = current; 2631 struct task_struct *tsk = current;
2392 DEFINE_WAIT(wait); 2632
2393 struct reclaim_state reclaim_state = { 2633 struct reclaim_state reclaim_state = {
2394 .reclaimed_slab = 0, 2634 .reclaimed_slab = 0,
2395 }; 2635 };
@@ -2417,49 +2657,30 @@ static int kswapd(void *p)
2417 set_freezable(); 2657 set_freezable();
2418 2658
2419 order = 0; 2659 order = 0;
2660 classzone_idx = MAX_NR_ZONES - 1;
2420 for ( ; ; ) { 2661 for ( ; ; ) {
2421 unsigned long new_order; 2662 unsigned long new_order;
2663 int new_classzone_idx;
2422 int ret; 2664 int ret;
2423 2665
2424 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2425 new_order = pgdat->kswapd_max_order; 2666 new_order = pgdat->kswapd_max_order;
2667 new_classzone_idx = pgdat->classzone_idx;
2426 pgdat->kswapd_max_order = 0; 2668 pgdat->kswapd_max_order = 0;
2427 if (order < new_order) { 2669 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2670 if (order < new_order || classzone_idx > new_classzone_idx) {
2428 /* 2671 /*
2429 * Don't sleep if someone wants a larger 'order' 2672 * Don't sleep if someone wants a larger 'order'
 2431 * allocation 2673 * allocation or has tighter zone constraints
2431 */ 2674 */
2432 order = new_order; 2675 order = new_order;
2676 classzone_idx = new_classzone_idx;
2433 } else { 2677 } else {
2434 if (!freezing(current) && !kthread_should_stop()) { 2678 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2435 long remaining = 0;
2436
2437 /* Try to sleep for a short interval */
2438 if (!sleeping_prematurely(pgdat, order, remaining)) {
2439 remaining = schedule_timeout(HZ/10);
2440 finish_wait(&pgdat->kswapd_wait, &wait);
2441 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2442 }
2443
2444 /*
2445 * After a short sleep, check if it was a
2446 * premature sleep. If not, then go fully
2447 * to sleep until explicitly woken up
2448 */
2449 if (!sleeping_prematurely(pgdat, order, remaining)) {
2450 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2451 schedule();
2452 } else {
2453 if (remaining)
2454 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2455 else
2456 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2457 }
2458 }
2459
2460 order = pgdat->kswapd_max_order; 2679 order = pgdat->kswapd_max_order;
2680 classzone_idx = pgdat->classzone_idx;
2681 pgdat->kswapd_max_order = 0;
2682 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2461 } 2683 }
2462 finish_wait(&pgdat->kswapd_wait, &wait);
2463 2684
2464 ret = try_to_freeze(); 2685 ret = try_to_freeze();
2465 if (kthread_should_stop()) 2686 if (kthread_should_stop())
@@ -2471,7 +2692,7 @@ static int kswapd(void *p)
2471 */ 2692 */
2472 if (!ret) { 2693 if (!ret) {
2473 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2694 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2474 balance_pgdat(pgdat, order); 2695 order = balance_pgdat(pgdat, order, &classzone_idx);
2475 } 2696 }
2476 } 2697 }
2477 return 0; 2698 return 0;
@@ -2480,23 +2701,26 @@ static int kswapd(void *p)
2480/* 2701/*
2481 * A zone is low on free memory, so wake its kswapd task to service it. 2702 * A zone is low on free memory, so wake its kswapd task to service it.
2482 */ 2703 */
2483void wakeup_kswapd(struct zone *zone, int order) 2704void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2484{ 2705{
2485 pg_data_t *pgdat; 2706 pg_data_t *pgdat;
2486 2707
2487 if (!populated_zone(zone)) 2708 if (!populated_zone(zone))
2488 return; 2709 return;
2489 2710
2490 pgdat = zone->zone_pgdat;
2491 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2492 return;
2493 if (pgdat->kswapd_max_order < order)
2494 pgdat->kswapd_max_order = order;
2495 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2496 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2711 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2497 return; 2712 return;
2713 pgdat = zone->zone_pgdat;
2714 if (pgdat->kswapd_max_order < order) {
2715 pgdat->kswapd_max_order = order;
2716 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2717 }
2498 if (!waitqueue_active(&pgdat->kswapd_wait)) 2718 if (!waitqueue_active(&pgdat->kswapd_wait))
2499 return; 2719 return;
2720 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2721 return;
2722
2723 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2500 wake_up_interruptible(&pgdat->kswapd_wait); 2724 wake_up_interruptible(&pgdat->kswapd_wait);
2501} 2725}
2502 2726
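wakeup_kswapd() now takes the classzone index so kswapd knows which zones the waker actually cares about. A hedged sketch of how a caller in the allocator slow path might wake all relevant nodes (zonelist iteration as in the page allocator; simplified, not code from this patch):

#include <linux/mmzone.h>
#include <linux/swap.h>

/* Sketch: wake kswapd for every zone usable by this allocation. */
static void wake_kswapds(struct zonelist *zonelist, int order,
                         enum zone_type high_zoneidx)
{
        struct zoneref *z;
        struct zone *zone;

        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
                wakeup_kswapd(zone, order, high_zoneidx);
}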
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8f62f17ee1c7..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
85 85
86static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
109
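calculate_pressure_threshold() sizes the per-cpu drift so that all CPUs together cannot eat the low-to-min watermark gap. A small worked example of the same arithmetic (user-space sketch, made-up watermark numbers):

#include <stdio.h>

static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
{
        int watermark_distance = (int)(low_wmark - min_wmark);
        int threshold = watermark_distance / online_cpus;

        if (threshold < 1)
                threshold = 1;
        return threshold < 125 ? threshold : 125;   /* same cap as the kernel code */
}

int main(void)
{
        /* low=4000 pages, min=3000 pages, 16 CPUs -> per-cpu threshold 62 */
        printf("%d\n", pressure_threshold(4000, 3000, 16));
        /* a huge gap on a small box still caps at 125 */
        printf("%d\n", pressure_threshold(60000, 20000, 4));
        return 0;
}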
110int calculate_normal_threshold(struct zone *zone)
87{ 111{
88 int threshold; 112 int threshold;
89 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
142 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
143 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
144 168
145 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
146 170
147 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
148 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
161 } 185 }
162} 186}
163 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
164/* 208/*
165 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
166 */ 210 */
167void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 211void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
168 int delta) 212 int delta)
169{ 213{
170 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 214 struct per_cpu_pageset __percpu *pcp = zone->pageset;
171 215 s8 __percpu *p = pcp->vm_stat_diff + item;
172 s8 *p = pcp->vm_stat_diff + item;
173 long x; 216 long x;
217 long t;
218
219 x = delta + __this_cpu_read(*p);
174 220
175 x = delta + *p; 221 t = __this_cpu_read(pcp->stat_threshold);
176 222
177 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 223 if (unlikely(x > t || x < -t)) {
178 zone_page_state_add(x, zone, item); 224 zone_page_state_add(x, zone, item);
179 x = 0; 225 x = 0;
180 } 226 }
181 *p = x; 227 __this_cpu_write(*p, x);
182} 228}
183EXPORT_SYMBOL(__mod_zone_page_state); 229EXPORT_SYMBOL(__mod_zone_page_state);
184 230
185/* 231/*
186 * For an unknown interrupt state
187 */
188void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
189 int delta)
190{
191 unsigned long flags;
192
193 local_irq_save(flags);
194 __mod_zone_page_state(zone, item, delta);
195 local_irq_restore(flags);
196}
197EXPORT_SYMBOL(mod_zone_page_state);
198
199/*
200 * Optimized increment and decrement functions. 232 * Optimized increment and decrement functions.
201 * 233 *
202 * These are only for a single page and therefore can take a struct page * 234 * These are only for a single page and therefore can take a struct page *
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
221 */ 253 */
222void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 254void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
223{ 255{
224 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 256 struct per_cpu_pageset __percpu *pcp = zone->pageset;
225 s8 *p = pcp->vm_stat_diff + item; 257 s8 __percpu *p = pcp->vm_stat_diff + item;
258 s8 v, t;
226 259
227 (*p)++; 260 v = __this_cpu_inc_return(*p);
261 t = __this_cpu_read(pcp->stat_threshold);
262 if (unlikely(v > t)) {
263 s8 overstep = t >> 1;
228 264
229 if (unlikely(*p > pcp->stat_threshold)) { 265 zone_page_state_add(v + overstep, zone, item);
230 int overstep = pcp->stat_threshold / 2; 266 __this_cpu_write(*p, -overstep);
231
232 zone_page_state_add(*p + overstep, zone, item);
233 *p = -overstep;
234 } 267 }
235} 268}
236 269
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
242 275
243void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 276void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
244{ 277{
245 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 278 struct per_cpu_pageset __percpu *pcp = zone->pageset;
246 s8 *p = pcp->vm_stat_diff + item; 279 s8 __percpu *p = pcp->vm_stat_diff + item;
247 280 s8 v, t;
248 (*p)--;
249 281
250 if (unlikely(*p < - pcp->stat_threshold)) { 282 v = __this_cpu_dec_return(*p);
251 int overstep = pcp->stat_threshold / 2; 283 t = __this_cpu_read(pcp->stat_threshold);
284 if (unlikely(v < - t)) {
285 s8 overstep = t >> 1;
252 286
253 zone_page_state_add(*p - overstep, zone, item); 287 zone_page_state_add(v - overstep, zone, item);
254 *p = overstep; 288 __this_cpu_write(*p, overstep);
255 } 289 }
256} 290}
257 291
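The rewritten __inc_zone_state()/__dec_zone_state() above use the raw __this_cpu_*() operations, which are only safe because their callers already run with interrupts (and hence preemption) disabled. A compact, hedged sketch of the same read/threshold/fold pattern on a standalone per-cpu counter (names are illustrative):

#include <linux/percpu.h>
#include <linux/types.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(s8, local_diff);
static atomic_t global_count = ATOMIC_INIT(0);

/* Sketch: a per-cpu differential counter folded into a shared atomic once
 * it crosses 'threshold'.  Like __inc_zone_state(), this relies on the
 * caller having preemption (or interrupts) disabled around the update. */
static void count_one(s8 threshold)
{
        s8 v = __this_cpu_inc_return(local_diff);

        if (unlikely(v > threshold)) {
                s8 overstep = threshold >> 1;

                atomic_add(v + overstep, &global_count);
                __this_cpu_write(local_diff, -overstep);
        }
}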
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
261} 295}
262EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
263 297
298#ifdef CONFIG_CMPXCHG_LOCAL
299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
302 *
303 * mod_state() modifies the zone counter state through atomic per cpu
304 * operations.
305 *
 306 * Overstep mode specifies how overstep should be handled:
307 * 0 No overstepping
308 * 1 Overstepping half of threshold
309 * -1 Overstepping minus half of threshold
310*/
311static inline void mod_state(struct zone *zone,
312 enum zone_stat_item item, int delta, int overstep_mode)
313{
314 struct per_cpu_pageset __percpu *pcp = zone->pageset;
315 s8 __percpu *p = pcp->vm_stat_diff + item;
316 long o, n, t, z;
317
318 do {
319 z = 0; /* overflow to zone counters */
320
321 /*
322 * The fetching of the stat_threshold is racy. We may apply
 323 * a counter threshold to the wrong cpu if we get
324 * rescheduled while executing here. However, the following
325 * will apply the threshold again and therefore bring the
326 * counter under the threshold.
327 */
328 t = this_cpu_read(pcp->stat_threshold);
329
330 o = this_cpu_read(*p);
331 n = delta + o;
332
333 if (n > t || n < -t) {
334 int os = overstep_mode * (t >> 1) ;
335
336 /* Overflow must be added to zone counters */
337 z = n + os;
338 n = -os;
339 }
340 } while (this_cpu_cmpxchg(*p, o, n) != o);
341
342 if (z)
343 zone_page_state_add(z, zone, item);
344}
345
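mod_state() retries with this_cpu_cmpxchg() until the per-cpu diff is updated atomically with respect to migration on that CPU; anything over the threshold is folded into the zone counter with a half-threshold overstep so the next few updates stay local. One iteration with concrete numbers (a worked sketch; overstep_mode 1 as used by the increment paths):

#include <stdio.h>

int main(void)
{
        long t = 125;           /* stat_threshold read for this cpu */
        long o = 120;           /* current per-cpu diff */
        long delta = 10;        /* requested update */
        long n = delta + o;     /* 130: crosses the threshold */
        long z = 0;

        if (n > t || n < -t) {
                long os = 1 * (t >> 1);   /* overstep_mode * (t >> 1) = 62 */

                z = n + os;               /* 192 pages folded into the zone counter */
                n = -os;                  /* per-cpu diff restarts at -62 */
        }
        printf("fold %ld into zone counter, per-cpu diff becomes %ld\n", z, n);
        return 0;
}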
346void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
347 int delta)
348{
349 mod_state(zone, item, delta, 0);
350}
351EXPORT_SYMBOL(mod_zone_page_state);
352
353void inc_zone_state(struct zone *zone, enum zone_stat_item item)
354{
355 mod_state(zone, item, 1, 1);
356}
357
358void inc_zone_page_state(struct page *page, enum zone_stat_item item)
359{
360 mod_state(page_zone(page), item, 1, 1);
361}
362EXPORT_SYMBOL(inc_zone_page_state);
363
364void dec_zone_page_state(struct page *page, enum zone_stat_item item)
365{
366 mod_state(page_zone(page), item, -1, -1);
367}
368EXPORT_SYMBOL(dec_zone_page_state);
369#else
370/*
371 * Use interrupt disable to serialize counter updates
372 */
373void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
374 int delta)
375{
376 unsigned long flags;
377
378 local_irq_save(flags);
379 __mod_zone_page_state(zone, item, delta);
380 local_irq_restore(flags);
381}
382EXPORT_SYMBOL(mod_zone_page_state);
383
264void inc_zone_state(struct zone *zone, enum zone_stat_item item) 384void inc_zone_state(struct zone *zone, enum zone_stat_item item)
265{ 385{
266 unsigned long flags; 386 unsigned long flags;
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
291 local_irq_restore(flags); 411 local_irq_restore(flags);
292} 412}
293EXPORT_SYMBOL(dec_zone_page_state); 413EXPORT_SYMBOL(dec_zone_page_state);
414#endif
294 415
295/* 416/*
296 * Update the zone counters for one cpu. 417 * Update the zone counters for one cpu.
@@ -759,6 +880,7 @@ static const char * const vmstat_text[] = {
759 "numa_local", 880 "numa_local",
760 "numa_other", 881 "numa_other",
761#endif 882#endif
883 "nr_anon_transparent_hugepages",
762 "nr_dirty_threshold", 884 "nr_dirty_threshold",
763 "nr_dirty_background_threshold", 885 "nr_dirty_background_threshold",
764 886
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
834 "\n scanned %lu" 956 "\n scanned %lu"
835 "\n spanned %lu" 957 "\n spanned %lu"
836 "\n present %lu", 958 "\n present %lu",
837 zone_nr_free_pages(zone), 959 zone_page_state(zone, NR_FREE_PAGES),
838 min_wmark_pages(zone), 960 min_wmark_pages(zone),
839 low_wmark_pages(zone), 961 low_wmark_pages(zone),
840 high_wmark_pages(zone), 962 high_wmark_pages(zone),
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1033 break; 1155 break;
1034 case CPU_DOWN_PREPARE: 1156 case CPU_DOWN_PREPARE:
1035 case CPU_DOWN_PREPARE_FROZEN: 1157 case CPU_DOWN_PREPARE_FROZEN:
1036 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); 1158 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1037 per_cpu(vmstat_work, cpu).work.func = NULL; 1159 per_cpu(vmstat_work, cpu).work.func = NULL;
1038 break; 1160 break;
1039 case CPU_DOWN_FAILED: 1161 case CPU_DOWN_FAILED: