author		Linus Torvalds <torvalds@linux-foundation.org>	2012-10-09 03:23:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-09 03:23:15 -0400
commit		9e2d8656f5e8aa214e66b462680cf86b210b74a8 (patch)
tree		f67d62e896cedf75599ea45f9ecf9999c6ad24cd /mm
parent		1ea4f4f8405cc1ceec23f2d261bc3775785e6712 (diff)
parent		9e695d2ecc8451cc2c1603d60b5c8e7f5581923a (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge patches from Andrew Morton:
 "A few misc things and very nearly all of the MM tree. A tremendous
  amount of stuff (again), including a significant rbtree library
  rework."

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (160 commits)
  sparc64: Support transparent huge pages.
  mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd().
  mm: Add and use update_mmu_cache_pmd() in transparent huge page code.
  sparc64: Document PGD and PMD layout.
  sparc64: Eliminate PTE table memory wastage.
  sparc64: Halve the size of PTE tables
  sparc64: Only support 4MB huge pages and 8KB base pages.
  memory-hotplug: suppress "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" warning
  mm: memcg: clean up mm_match_cgroup() signature
  mm: document PageHuge somewhat
  mm: use %pK for /proc/vmallocinfo
  mm, thp: fix mlock statistics
  mm, thp: fix mapped pages avoiding unevictable list on mlock
  memory-hotplug: update memory block's state and notify userspace
  memory-hotplug: preparation to notify memory block's state at memory hot remove
  mm: avoid section mismatch warning for memblock_type_name
  make GFP_NOTRACK definition unconditional
  cma: decrease cc.nr_migratepages after reclaiming pagelist
  CMA: migrate mlocked pages
  kpageflags: fix wrong KPF_THP on non-huge compound pages
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    3
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/bootmem.c         |   10
-rw-r--r--  mm/compaction.c      |  562
-rw-r--r--  mm/filemap.c         |    6
-rw-r--r--  mm/filemap_xip.c     |   10
-rw-r--r--  mm/fremap.c          |   16
-rw-r--r--  mm/huge_memory.c     |  440
-rw-r--r--  mm/hugetlb.c         |   34
-rw-r--r--  mm/internal.h        |   52
-rw-r--r--  mm/interval_tree.c   |  112
-rw-r--r--  mm/kmemleak.c        |  100
-rw-r--r--  mm/ksm.c             |   40
-rw-r--r--  mm/madvise.c         |    8
-rw-r--r--  mm/memblock.c        |    5
-rw-r--r--  mm/memcontrol.c      |   22
-rw-r--r--  mm/memory-failure.c  |    8
-rw-r--r--  mm/memory.c          |  115
-rw-r--r--  mm/memory_hotplug.c  |   77
-rw-r--r--  mm/mempolicy.c       |  148
-rw-r--r--  mm/mlock.c           |   27
-rw-r--r--  mm/mmap.c            |  207
-rw-r--r--  mm/mmu_notifier.c    |  103
-rw-r--r--  mm/mremap.c          |   73
-rw-r--r--  mm/nobootmem.c       |    5
-rw-r--r--  mm/nommu.c           |   33
-rw-r--r--  mm/oom_kill.c        |    4
-rw-r--r--  mm/page_alloc.c      |  317
-rw-r--r--  mm/page_isolation.c  |   43
-rw-r--r--  mm/pgtable-generic.c |   50
-rw-r--r--  mm/prio_tree.c       |  208
-rw-r--r--  mm/rmap.c            |  159
-rw-r--r--  mm/shmem.c           |    3
-rw-r--r--  mm/swap.c            |   13
-rw-r--r--  mm/truncate.c        |    3
-rw-r--r--  mm/vmalloc.c         |    5
-rw-r--r--  mm/vmscan.c          |  111
-rw-r--r--  mm/vmstat.c          |   14
38 files changed, 1830 insertions, 1320 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
 # support for memory compaction
 config COMPACTION
 	bool "Allow for memory compaction"
+	def_bool y
 	select MIGRATION
 	depends on MMU
 	help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
-	depends on X86 && MMU
+	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select COMPACTION
 	help
 	  Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
-			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o $(mmu-y)
+			   compaction.o interval_tree.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 			int order = ilog2(BITS_PER_LONG);
 
 			__free_pages_bootmem(pfn_to_page(start), order);
+			fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
+					start, start + BITS_PER_LONG);
 			count += BITS_PER_LONG;
 			start += BITS_PER_LONG;
 		} else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
+					fixup_zone_present_pages(
+						page_to_nid(page),
+						start + off, start + off + 1);
 					count++;
 				}
 				vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;
 	pages = bootmem_bootmap_pages(pages);
 	count += pages;
-	while (pages--)
+	while (pages--) {
+		fixup_zone_present_pages(page_to_nid(page),
+				page_to_pfn(page), page_to_pfn(page) + 1);
 		__free_pages_bootmem(page++, 0);
+	}
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..2c4ce17651d8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+					struct page *page)
+{
+	if (cc->ignore_skip_hint)
+		return true;
+
+	return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long pfn;
+
+	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_free_pfn = end_pfn;
+	zone->compact_blockskip_flush = false;
+
+	/* Walk the zone and mark every pageblock as suitable for isolation */
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		struct page *page;
+
+		cond_resched();
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (zone != page_zone(page))
+			continue;
+
+		clear_pageblock_skip(page);
+	}
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+	int zoneid;
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		struct zone *zone = &pgdat->node_zones[zoneid];
+		if (!populated_zone(zone))
+			continue;
+
+		/* Only flush if a full compaction finished recently */
+		if (zone->compact_blockskip_flush)
+			__reset_isolation_suitable(zone);
+	}
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+			struct page *page, unsigned long nr_isolated,
+			bool migrate_scanner)
+{
+	struct zone *zone = cc->zone;
+	if (!page)
+		return;
+
+	if (!nr_isolated) {
+		unsigned long pfn = page_to_pfn(page);
+		set_pageblock_skip(page);
+
+		/* Update where compaction should restart */
+		if (migrate_scanner) {
+			if (!cc->finished_update_migrate &&
+			    pfn > zone->compact_cached_migrate_pfn)
+				zone->compact_cached_migrate_pfn = pfn;
+		} else {
+			if (!cc->finished_update_free &&
+			    pfn < zone->compact_cached_free_pfn)
+				zone->compact_cached_free_pfn = pfn;
+		}
+	}
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+					struct page *page)
+{
+	return true;
+}
+
+static void update_pageblock_skip(struct compact_control *cc,
+			struct page *page, unsigned long nr_isolated,
+			bool migrate_scanner)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
+static inline bool should_release_lock(spinlock_t *lock)
+{
+	return need_resched() || spin_is_contended(lock);
+}
+
 /*
  * Compaction requires the taking of some coarse locks that are potentially
  * very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
 static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 				      bool locked, struct compact_control *cc)
 {
-	if (need_resched() || spin_is_contended(lock)) {
+	if (should_release_lock(lock)) {
 		if (locked) {
 			spin_unlock_irqrestore(lock, *flags);
 			locked = false;
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 
 		/* async aborts if taking too long or contended */
 		if (!cc->sync) {
-			if (cc->contended)
-				*cc->contended = true;
+			cc->contended = true;
 			return false;
 		}
 
 		cond_resched();
-		if (fatal_signal_pending(current))
-			return false;
 	}
 
 	if (!locked)
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
 	return compact_checklock_irqsave(lock, flags, false, cc);
 }
 
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+	int migratetype = get_pageblock_migratetype(page);
+
+	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+		return false;
+
+	/* If the page is a large free page, then allow migration */
+	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+		return true;
+
+	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+	if (migrate_async_suitable(migratetype))
+		return true;
+
+	/* Otherwise skip the block */
+	return false;
+}
+
+static void compact_capture_page(struct compact_control *cc)
+{
+	unsigned long flags;
+	int mtype, mtype_low, mtype_high;
+
+	if (!cc->page || *cc->page)
+		return;
+
+	/*
+	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+	 * regardless of the migratetype of the freelist is is captured from.
+	 * This is fine because the order for a high-order MIGRATE_MOVABLE
+	 * allocation is typically at least a pageblock size and overall
+	 * fragmentation is not impaired. Other allocation types must
+	 * capture pages from their own migratelist because otherwise they
+	 * could pollute other pageblocks like MIGRATE_MOVABLE with
+	 * difficult to move pages and making fragmentation worse overall.
+	 */
+	if (cc->migratetype == MIGRATE_MOVABLE) {
+		mtype_low = 0;
+		mtype_high = MIGRATE_PCPTYPES;
+	} else {
+		mtype_low = cc->migratetype;
+		mtype_high = cc->migratetype + 1;
+	}
+
+	/* Speculatively examine the free lists without zone lock */
+	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+		int order;
+		for (order = cc->order; order < MAX_ORDER; order++) {
+			struct page *page;
+			struct free_area *area;
+			area = &(cc->zone->free_area[order]);
+			if (list_empty(&area->free_list[mtype]))
+				continue;
+
+			/* Take the lock and attempt capture of the page */
+			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+				return;
+			if (!list_empty(&area->free_list[mtype])) {
+				page = list_entry(area->free_list[mtype].next,
+							struct page, lru);
+				if (capture_free_page(page, cc->order, mtype)) {
+					spin_unlock_irqrestore(&cc->zone->lock,
+									flags);
+					*cc->page = page;
+					return;
+				}
+			}
+			spin_unlock_irqrestore(&cc->zone->lock, flags);
+		}
+	}
+}
+
 /*
  * Isolate free pages onto a private freelist. Caller must hold zone->lock.
  * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
  * pages inside of the pageblock (even though it may still end up isolating
  * some pages).
  */
-static unsigned long isolate_freepages_block(unsigned long blockpfn,
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+				unsigned long blockpfn,
 				unsigned long end_pfn,
 				struct list_head *freelist,
 				bool strict)
 {
 	int nr_scanned = 0, total_isolated = 0;
-	struct page *cursor;
+	struct page *cursor, *valid_page = NULL;
+	unsigned long nr_strict_required = end_pfn - blockpfn;
+	unsigned long flags;
+	bool locked = false;
 
 	cursor = pfn_to_page(blockpfn);
 
-	/* Isolate free pages. This assumes the block is valid */
+	/* Isolate free pages. */
 	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 		int isolated, i;
 		struct page *page = cursor;
 
-		if (!pfn_valid_within(blockpfn)) {
-			if (strict)
-				return 0;
-			continue;
-		}
 		nr_scanned++;
+		if (!pfn_valid_within(blockpfn))
+			continue;
+		if (!valid_page)
+			valid_page = page;
+		if (!PageBuddy(page))
+			continue;
 
-		if (!PageBuddy(page)) {
-			if (strict)
-				return 0;
+		/*
+		 * The zone lock must be held to isolate freepages.
+		 * Unfortunately this is a very coarse lock and can be
+		 * heavily contended if there are parallel allocations
+		 * or parallel compactions. For async compaction do not
+		 * spin on the lock and we acquire the lock as late as
+		 * possible.
+		 */
+		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+								locked, cc);
+		if (!locked)
+			break;
+
+		/* Recheck this is a suitable migration target under lock */
+		if (!strict && !suitable_migration_target(page))
+			break;
+
+		/* Recheck this is a buddy page under lock */
+		if (!PageBuddy(page))
 			continue;
-		}
 
 		/* Found a free page, break it into order-0 pages */
 		isolated = split_free_page(page);
 		if (!isolated && strict)
-			return 0;
+			break;
 		total_isolated += isolated;
 		for (i = 0; i < isolated; i++) {
 			list_add(&page->lru, freelist);
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
 	}
 
 	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+	/*
+	 * If strict isolation is requested by CMA then check that all the
+	 * pages requested were isolated. If there were any failures, 0 is
+	 * returned and CMA will fail.
+	 */
+	if (strict && nr_strict_required != total_isolated)
+		total_isolated = 0;
+
+	if (locked)
+		spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+	/* Update the pageblock-skip if the whole pageblock was scanned */
+	if (blockpfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, total_isolated, false);
+
 	return total_isolated;
 }
 
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
  * a free page).
  */
 unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
+isolate_freepages_range(struct compact_control *cc,
+			unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long isolated, pfn, block_end_pfn, flags;
-	struct zone *zone = NULL;
+	unsigned long isolated, pfn, block_end_pfn;
 	LIST_HEAD(freelist);
 
-	if (pfn_valid(start_pfn))
-		zone = page_zone(pfn_to_page(start_pfn));
-
 	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
-		if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
+		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
 			break;
 
 		/*
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		spin_lock_irqsave(&zone->lock, flags);
-		isolated = isolate_freepages_block(pfn, block_end_pfn,
+		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
 						   &freelist, true);
-		spin_unlock_irqrestore(&zone->lock, flags);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
  * @cc: Compaction control structure.
  * @low_pfn: The first PFN of the range.
  * @end_pfn: The one-past-the-last PFN of the range.
+ * @unevictable: true if it allows to isolate unevictable pages
  *
  * Isolate all pages that can be migrated from the range specified by
  * [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
  */
 unsigned long
 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
-			   unsigned long low_pfn, unsigned long end_pfn)
+			   unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
 {
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	isolate_mode_t mode = 0;
 	struct lruvec *lruvec;
 	unsigned long flags;
-	bool locked;
+	bool locked = false;
+	struct page *page = NULL, *valid_page = NULL;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	cond_resched();
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	locked = true;
 	for (; low_pfn < end_pfn; low_pfn++) {
-		struct page *page;
-
 		/* give a chance to irqs before checking need_resched() */
-		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			locked = false;
+		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+			if (should_release_lock(&zone->lru_lock)) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				locked = false;
+			}
 		}
 
-		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-							locked, cc);
-		if (!locked)
-			break;
-
 		/*
 		 * migrate_pfn does not necessarily start aligned to a
 		 * pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (page_zone(page) != zone)
 			continue;
 
+		if (!valid_page)
+			valid_page = page;
+
+		/* If isolation recently failed, do not retry */
+		pageblock_nr = low_pfn >> pageblock_order;
+		if (!isolation_suitable(cc, page))
+			goto next_pageblock;
+
 		/* Skip if free */
 		if (PageBuddy(page))
 			continue;
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 * migration is optimistic to see if the minimum amount of work
 		 * satisfies the allocation
 		 */
-		pageblock_nr = low_pfn >> pageblock_order;
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
-			low_pfn += pageblock_nr_pages;
-			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
-			last_pageblock_nr = pageblock_nr;
-			continue;
+			cc->finished_update_migrate = true;
+			goto next_pageblock;
 		}
 
+		/* Check may be lockless but that's ok as we recheck later */
 		if (!PageLRU(page))
 			continue;
 
 		/*
-		 * PageLRU is set, and lru_lock excludes isolation,
-		 * splitting and collapsing (collapsing has already
-		 * happened if PageLRU is set).
+		 * PageLRU is set. lru_lock normally excludes isolation
+		 * splitting and collapsing (collapsing has already happened
+		 * if PageLRU is set) but the lock is not necessarily taken
+		 * here and it is wasteful to take it just to check transhuge.
+		 * Check TransHuge without lock and skip the whole pageblock if
+		 * it's either a transhuge or hugetlbfs page, as calling
+		 * compound_order() without preventing THP from splitting the
+		 * page underneath us may return surprising results.
 		 */
 		if (PageTransHuge(page)) {
+			if (!locked)
+				goto next_pageblock;
+			low_pfn += (1 << compound_order(page)) - 1;
+			continue;
+		}
+
+		/* Check if it is ok to still hold the lock */
+		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+								locked, cc);
+		if (!locked || fatal_signal_pending(current))
+			break;
+
+		/* Recheck PageLRU and PageTransHuge under lock */
+		if (!PageLRU(page))
+			continue;
+		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
 			continue;
 		}
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (!cc->sync)
 			mode |= ISOLATE_ASYNC_MIGRATE;
 
+		if (unevictable)
+			mode |= ISOLATE_UNEVICTABLE;
+
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		VM_BUG_ON(PageTransCompound(page));
 
 		/* Successfully isolated */
+		cc->finished_update_migrate = true;
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		list_add(&page->lru, migratelist);
 		cc->nr_migratepages++;
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			++low_pfn;
 			break;
 		}
+
+		continue;
+
+next_pageblock:
+		low_pfn += pageblock_nr_pages;
+		low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+		last_pageblock_nr = pageblock_nr;
 	}
 
 	acct_isolated(zone, locked, cc);
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
+	/* Update the pageblock-skip if the whole pageblock was scanned */
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, nr_isolated, true);
+
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
 	return low_pfn;
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
-
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-
-	int migratetype = get_pageblock_migratetype(page);
-
-	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
-	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
-		return false;
-
-	/* If the page is a large free page, then allow migration */
-	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return true;
-
-	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(migratetype))
-		return true;
-
-	/* Otherwise skip the block */
-	return false;
-}
-
-/*
- * Returns the start pfn of the last page block in a zone. This is the starting
- * point for full compaction of a zone. Compaction searches for free pages from
- * the end of each zone, while isolate_freepages_block scans forward inside each
- * page block.
- */
-static unsigned long start_free_pfn(struct zone *zone)
-{
-	unsigned long free_pfn;
-	free_pfn = zone->zone_start_pfn + zone->spanned_pages;
-	free_pfn &= ~(pageblock_nr_pages-1);
-	return free_pfn;
-}
-
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
 {
 	struct page *page;
 	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
-	unsigned long flags;
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
 		if (!suitable_migration_target(page))
 			continue;
 
-		/*
-		 * Found a block suitable for isolating free pages from. Now
-		 * we disabled interrupts, double check things are ok and
-		 * isolate the pages. This is to minimise the time IRQs
-		 * are disabled
-		 */
-		isolated = 0;
+		/* If isolation recently failed, do not retry */
+		if (!isolation_suitable(cc, page))
+			continue;
 
-		/*
-		 * The zone lock must be held to isolate freepages. This
-		 * unfortunately this is a very coarse lock and can be
-		 * heavily contended if there are parallel allocations
-		 * or parallel compactions. For async compaction do not
-		 * spin on the lock
-		 */
-		if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
-			break;
-		if (suitable_migration_target(page)) {
-			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
-			isolated = isolate_freepages_block(pfn, end_pfn,
-							   freelist, false);
-			nr_freepages += isolated;
-		}
-		spin_unlock_irqrestore(&zone->lock, flags);
+		/* Found a block suitable for isolating free pages from */
+		isolated = 0;
+		end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
+		isolated = isolate_freepages_block(cc, pfn, end_pfn,
+						   freelist, false);
+		nr_freepages += isolated;
 
 		/*
 		 * Record the highest PFN we isolated pages from. When next
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
 		 * page migration may have returned some pages to the allocator
 		 */
 		if (isolated) {
+			cc->finished_update_free = true;
 			high_pfn = max(high_pfn, pfn);
-
-			/*
-			 * If the free scanner has wrapped, update
-			 * compact_cached_free_pfn to point to the highest
-			 * pageblock with free pages. This reduces excessive
-			 * scanning of full pageblocks near the end of the
-			 * zone
-			 */
-			if (cc->order > 0 && cc->wrapped)
-				zone->compact_cached_free_pfn = high_pfn;
 		}
 	}
 
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
 
 	cc->free_pfn = high_pfn;
 	cc->nr_freepages = nr_freepages;
-
-	/* If compact_cached_free_pfn is reset then set it now */
-	if (cc->order > 0 && !cc->wrapped &&
-	    zone->compact_cached_free_pfn == start_free_pfn(zone))
-		zone->compact_cached_free_pfn = high_pfn;
 }
 
 /*
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	}
 
 	/* Perform the isolation */
-	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
-	if (!low_pfn)
+	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
+	if (!low_pfn || cc->contended)
 		return ISOLATE_ABORT;
 
 	cc->migrate_pfn = low_pfn;
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
-	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
-	/*
-	 * A full (order == -1) compaction run starts at the beginning and
-	 * end of a zone; it completes when the migrate and free scanner meet.
-	 * A partial (order > 0) compaction can start with the free scanner
-	 * at a random point in the zone, and may have to restart.
-	 */
+	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
-		if (cc->order > 0 && !cc->wrapped) {
-			/* We started partway through; restart at the end. */
-			unsigned long free_pfn = start_free_pfn(zone);
-			zone->compact_cached_free_pfn = free_pfn;
-			cc->free_pfn = free_pfn;
-			cc->wrapped = 1;
-			return COMPACT_CONTINUE;
-		}
-		return COMPACT_COMPLETE;
-	}
+		/*
+		 * Mark that the PG_migrate_skip information should be cleared
+		 * by kswapd when it goes to sleep. kswapd does not set the
+		 * flag itself as the decision to be clear should be directly
+		 * based on an allocation request.
+		 */
+		if (!current_is_kswapd())
+			zone->compact_blockskip_flush = true;
 
-	/* We wrapped around and ended up where we started. */
-	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
 		return COMPACT_COMPLETE;
+	}
 
 	/*
 	 * order == -1 is expected when compacting via
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	for (order = cc->order; order < MAX_ORDER; order++) {
-		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
-			return COMPACT_PARTIAL;
-
-		/* Job done if allocation would set block type */
-		if (order >= pageblock_order && zone->free_area[order].nr_free)
+	if (cc->page) {
+		/* Was a suitable page captured? */
+		if (*cc->page)
 			return COMPACT_PARTIAL;
+	} else {
+		unsigned int order;
+		for (order = cc->order; order < MAX_ORDER; order++) {
+			struct free_area *area = &zone->free_area[cc->order];
+			/* Job done if page is free of the right migratetype */
+			if (!list_empty(&area->free_list[cc->migratetype]))
+				return COMPACT_PARTIAL;
+
+			/* Job done if allocation would set block type */
+			if (cc->order >= pageblock_order && area->nr_free)
+				return COMPACT_PARTIAL;
+		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		;
 	}
 
-	/* Setup to move all movable pages to the end of the zone */
-	cc->migrate_pfn = zone->zone_start_pfn;
-
-	if (cc->order > 0) {
-		/* Incremental compaction. Start where the last one stopped. */
-		cc->free_pfn = zone->compact_cached_free_pfn;
-		cc->start_free_pfn = cc->free_pfn;
-	} else {
-		/* Order == -1 starts at the end of the zone. */
-		cc->free_pfn = start_free_pfn(zone);
+	/*
+	 * Setup to move all movable pages to the end of the zone. Used cached
+	 * information on where the scanners should start but check that it
+	 * is initialised by ensuring the values are within zone boundaries.
+	 */
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->free_pfn = zone->compact_cached_free_pfn;
+	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+		zone->compact_cached_free_pfn = cc->free_pfn;
+	}
+	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+		cc->migrate_pfn = start_pfn;
+		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
 	}
 
+	/*
+	 * Clear pageblock skip if there were failures recently and compaction
+	 * is about to be retried after being deferred. kswapd does not do
+	 * this reset as it'll reset the cached information when going to sleep.
+	 */
+	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+		__reset_isolation_suitable(zone);
+
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		switch (isolate_migratepages(zone, cc)) {
 		case ISOLATE_ABORT:
 			ret = COMPACT_PARTIAL;
+			putback_lru_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
 			goto out;
 		case ISOLATE_NONE:
 			continue;
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
+
+		/* Capture a page now if it is a suitable size */
+		compact_capture_page(cc);
 	}
 
 out:
@@ -829,8 +1025,10 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 					 int order, gfp_t gfp_mask,
-					 bool sync, bool *contended)
+					 bool sync, bool *contended,
+					 struct page **page)
 {
+	unsigned long ret;
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.contended = contended,
+		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
 
-	return compact_zone(zone, &cc);
+	ret = compact_zone(zone, &cc);
+
+	VM_BUG_ON(!list_empty(&cc.freepages));
+	VM_BUG_ON(!list_empty(&cc.migratepages));
+
+	*contended = cc.contended;
+	return ret;
 }
 
 int sysctl_extfrag_threshold = 500;
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
  * @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
  *
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			bool sync, bool *contended, struct page **page)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
+	int alloc_flags = 0;
 
-	/*
-	 * Check whether it is worth even starting compaction. The order check is
-	 * made because an assumption is made that the page allocator can satisfy
-	 * the "cheaper" orders without taking special steps
-	 */
+	/* Check if the GFP flags allow compaction */
 	if (!order || !may_enter_fs || !may_perform_io)
 		return rc;
 
 	count_vm_event(COMPACTSTALL);
 
+#ifdef CONFIG_CMA
+	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended);
+						contended, page);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
-		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+				      alloc_flags))
 			break;
 	}
 
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
+		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
+		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
-	if (likely(page)) {
+	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-	} else {
+	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
 		count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
+	.remap_pages	= generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
-	struct prio_tree_iter iter;
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
 
 retry:
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush_notify(vma, address, pte);
+			pteval = ptep_clear_flush(vma, address, pte);
 			page_remove_rmap(page);
 			dec_mm_counter(mm, MM_FILEPAGES);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
 			page_cache_release(page);
 		}
 	}
@@ -305,6 +306,7 @@ out:
 static const struct vm_operations_struct xip_file_vm_ops = {
 	.fault	= xip_file_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
 	file_accessed(file);
 	vma->vm_ops = &xip_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+	vma->vm_flags |= VM_MIXEDMAP;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 048659c0c03d..3899a86851ce 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
  *
  * started by Ingo Molnar, Copyright (C) 2002, 2003
  */
+#include <linux/export.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
 	return err;
 }
 
-static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long addr, unsigned long size, pgoff_t pgoff)
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long size, pgoff_t pgoff)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
 	do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff++;
 	} while (size);
 
 	return 0;
-
 }
+EXPORT_SYMBOL(generic_file_remap_pages);
 
 /**
  * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
 		goto out;
 
-	if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+	if (!vma->vm_ops->remap_pages)
 		goto out;
 
 	if (start < vma->vm_start || start + size > vma->vm_end)
@@ -212,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
 		mutex_unlock(&mapping->i_mmap_mutex);
@@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	}
 
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
-	err = populate_range(mm, vma, start, size, pgoff);
+	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..a863af26c79c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void)
 	unsigned long recommended_min;
 	extern int min_free_kbytes;
 
-	if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) &&
-	    !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags))
+	if (!khugepaged_enabled())
 		return 0;
 
 	for_each_populated_zone(zone)
@@ -139,12 +136,6 @@ static int start_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
-		int wakeup;
-		if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
-			err = -ENOMEM;
-			goto out;
-		}
-		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -154,16 +145,16 @@ static int start_khugepaged(void)
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
 		}
-		wakeup = !list_empty(&khugepaged_scan.mm_head);
-		mutex_unlock(&khugepaged_mutex);
-		if (wakeup)
+
+		if (!list_empty(&khugepaged_scan.mm_head))
 			wake_up_interruptible(&khugepaged_wait);
 
 		set_recommended_min_free_kbytes();
-	} else
-		/* wakeup to exit */
-		wake_up_interruptible(&khugepaged_wait);
-out:
+	} else if (khugepaged_thread) {
+		kthread_stop(khugepaged_thread);
+		khugepaged_thread = NULL;
+	}
+
 	return err;
 }
 
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj,
 			 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 
 	if (ret > 0) {
-		int err = start_khugepaged();
+		int err;
+
+		mutex_lock(&khugepaged_mutex);
+		err = start_khugepaged();
+		mutex_unlock(&khugepaged_mutex);
+
 		if (err)
 			ret = err;
 	}
 
-	if (ret > 0 &&
-	    (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) ||
-	     test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags)))
-		set_recommended_min_free_kbytes();
-
 	return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -570,8 +559,6 @@ static int __init hugepage_init(void)
 
 	start_khugepaged();
 
-	set_recommended_min_free_kbytes();
-
 	return 0;
 out:
 	hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +598,6 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
-				 struct mm_struct *mm)
-{
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	if (!mm->pmd_huge_pte)
-		INIT_LIST_HEAD(&pgtable->lru);
-	else
-		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-	mm->pmd_huge_pte = pgtable;
-}
-
 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	 */
 	page_add_new_anon_rmap(page, vma, haddr);
 	set_pmd_at(mm, haddr, pmd, entry);
-	prepare_pmd_huge_pte(pgtable, mm);
+	pgtable_trans_huge_deposit(mm, pgtable);
 	add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 	mm->nr_ptes++;
 	spin_unlock(&mm->page_table_lock);
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	prepare_pmd_huge_pte(pgtable, dst_mm);
+	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -802,25 +776,6 @@ out:
 	return ret;
 }
 
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
-	pgtable_t pgtable;
-
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	pgtable = mm->pmd_huge_pte;
-	if (list_empty(&pgtable->lru))
-		mm->pmd_huge_pte = NULL;
-	else {
-		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
-					      struct page, lru);
-		list_del(&pgtable->lru);
-	}
-	return pgtable;
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmd_t _pmd;
 	int ret = 0, i;
 	struct page **pages;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;	/* For mmu_notifiers */
 
 	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
 			GFP_KERNEL);
@@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
868 cond_resched(); 825 cond_resched();
869 } 826 }
870 827
828 mmun_start = haddr;
829 mmun_end = haddr + HPAGE_PMD_SIZE;
830 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
831
871 spin_lock(&mm->page_table_lock); 832 spin_lock(&mm->page_table_lock);
872 if (unlikely(!pmd_same(*pmd, orig_pmd))) 833 if (unlikely(!pmd_same(*pmd, orig_pmd)))
873 goto out_free_pages; 834 goto out_free_pages;
874 VM_BUG_ON(!PageHead(page)); 835 VM_BUG_ON(!PageHead(page));
875 836
876 pmdp_clear_flush_notify(vma, haddr, pmd); 837 pmdp_clear_flush(vma, haddr, pmd);
877 /* leave pmd empty until pte is filled */ 838 /* leave pmd empty until pte is filled */
878 839
879 pgtable = get_pmd_huge_pte(mm); 840 pgtable = pgtable_trans_huge_withdraw(mm);
880 pmd_populate(mm, &_pmd, pgtable); 841 pmd_populate(mm, &_pmd, pgtable);
881 842
882 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 843 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
896 page_remove_rmap(page); 857 page_remove_rmap(page);
897 spin_unlock(&mm->page_table_lock); 858 spin_unlock(&mm->page_table_lock);
898 859
860 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
861
899 ret |= VM_FAULT_WRITE; 862 ret |= VM_FAULT_WRITE;
900 put_page(page); 863 put_page(page);
901 864
@@ -904,6 +867,7 @@ out:
904 867
905out_free_pages: 868out_free_pages:
906 spin_unlock(&mm->page_table_lock); 869 spin_unlock(&mm->page_table_lock);
870 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
907 mem_cgroup_uncharge_start(); 871 mem_cgroup_uncharge_start();
908 for (i = 0; i < HPAGE_PMD_NR; i++) { 872 for (i = 0; i < HPAGE_PMD_NR; i++) {
909 mem_cgroup_uncharge_page(pages[i]); 873 mem_cgroup_uncharge_page(pages[i]);
@@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
920 int ret = 0; 884 int ret = 0;
921 struct page *page, *new_page; 885 struct page *page, *new_page;
922 unsigned long haddr; 886 unsigned long haddr;
887 unsigned long mmun_start; /* For mmu_notifiers */
888 unsigned long mmun_end; /* For mmu_notifiers */
923 889
924 VM_BUG_ON(!vma->anon_vma); 890 VM_BUG_ON(!vma->anon_vma);
925 spin_lock(&mm->page_table_lock); 891 spin_lock(&mm->page_table_lock);
@@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
934 entry = pmd_mkyoung(orig_pmd); 900 entry = pmd_mkyoung(orig_pmd);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 901 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 902 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
937 update_mmu_cache(vma, address, entry); 903 update_mmu_cache_pmd(vma, address, pmd);
938 ret |= VM_FAULT_WRITE; 904 ret |= VM_FAULT_WRITE;
939 goto out_unlock; 905 goto out_unlock;
940 } 906 }
@@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
970 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 936 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
971 __SetPageUptodate(new_page); 937 __SetPageUptodate(new_page);
972 938
939 mmun_start = haddr;
940 mmun_end = haddr + HPAGE_PMD_SIZE;
941 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
942
973 spin_lock(&mm->page_table_lock); 943 spin_lock(&mm->page_table_lock);
974 put_page(page); 944 put_page(page);
975 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 945 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock); 946 spin_unlock(&mm->page_table_lock);
977 mem_cgroup_uncharge_page(new_page); 947 mem_cgroup_uncharge_page(new_page);
978 put_page(new_page); 948 put_page(new_page);
979 goto out; 949 goto out_mn;
980 } else { 950 } else {
981 pmd_t entry; 951 pmd_t entry;
982 VM_BUG_ON(!PageHead(page)); 952 VM_BUG_ON(!PageHead(page));
983 entry = mk_pmd(new_page, vma->vm_page_prot); 953 entry = mk_pmd(new_page, vma->vm_page_prot);
984 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 954 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
985 entry = pmd_mkhuge(entry); 955 entry = pmd_mkhuge(entry);
986 pmdp_clear_flush_notify(vma, haddr, pmd); 956 pmdp_clear_flush(vma, haddr, pmd);
987 page_add_new_anon_rmap(new_page, vma, haddr); 957 page_add_new_anon_rmap(new_page, vma, haddr);
988 set_pmd_at(mm, haddr, pmd, entry); 958 set_pmd_at(mm, haddr, pmd, entry);
989 update_mmu_cache(vma, address, entry); 959 update_mmu_cache_pmd(vma, address, pmd);
990 page_remove_rmap(page); 960 page_remove_rmap(page);
991 put_page(page); 961 put_page(page);
992 ret |= VM_FAULT_WRITE; 962 ret |= VM_FAULT_WRITE;
993 } 963 }
994out_unlock:
995 spin_unlock(&mm->page_table_lock); 964 spin_unlock(&mm->page_table_lock);
965out_mn:
966 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
996out: 967out:
997 return ret; 968 return ret;
969out_unlock:
970 spin_unlock(&mm->page_table_lock);
971 return ret;
998} 972}
999 973
1000struct page *follow_trans_huge_pmd(struct mm_struct *mm, 974struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1001 unsigned long addr, 975 unsigned long addr,
1002 pmd_t *pmd, 976 pmd_t *pmd,
1003 unsigned int flags) 977 unsigned int flags)
1004{ 978{
979 struct mm_struct *mm = vma->vm_mm;
1005 struct page *page = NULL; 980 struct page *page = NULL;
1006 981
1007 assert_spin_locked(&mm->page_table_lock); 982 assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
1024 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 999 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1025 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1000 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1026 } 1001 }
1002 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1003 if (page->mapping && trylock_page(page)) {
1004 lru_add_drain();
1005 if (page->mapping)
1006 mlock_vma_page(page);
1007 unlock_page(page);
1008 }
1009 }
1027 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1010 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1028 VM_BUG_ON(!PageCompound(page)); 1011 VM_BUG_ON(!PageCompound(page));
1029 if (flags & FOLL_GET) 1012 if (flags & FOLL_GET)
@@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1041 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1024 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1042 struct page *page; 1025 struct page *page;
1043 pgtable_t pgtable; 1026 pgtable_t pgtable;
1044 pgtable = get_pmd_huge_pte(tlb->mm); 1027 pmd_t orig_pmd;
1045 page = pmd_page(*pmd); 1028 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1046 pmd_clear(pmd); 1029 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1030 page = pmd_page(orig_pmd);
1047 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1031 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1048 page_remove_rmap(page); 1032 page_remove_rmap(page);
1049 VM_BUG_ON(page_mapcount(page) < 0); 1033 VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page,
1207 struct mm_struct *mm = vma->vm_mm; 1191 struct mm_struct *mm = vma->vm_mm;
1208 pmd_t *pmd; 1192 pmd_t *pmd;
1209 int ret = 0; 1193 int ret = 0;
1194 /* For mmu_notifiers */
1195 const unsigned long mmun_start = address;
1196 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1210 1197
1198 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1211 spin_lock(&mm->page_table_lock); 1199 spin_lock(&mm->page_table_lock);
1212 pmd = page_check_address_pmd(page, mm, address, 1200 pmd = page_check_address_pmd(page, mm, address,
1213 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1201 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page,
1219 * and it won't wait on the anon_vma->root->mutex to 1207 * and it won't wait on the anon_vma->root->mutex to
1220 * serialize against split_huge_page*. 1208 * serialize against split_huge_page*.
1221 */ 1209 */
1222 pmdp_splitting_flush_notify(vma, address, pmd); 1210 pmdp_splitting_flush(vma, address, pmd);
1223 ret = 1; 1211 ret = 1;
1224 } 1212 }
1225 spin_unlock(&mm->page_table_lock); 1213 spin_unlock(&mm->page_table_lock);
1214 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1226 1215
1227 return ret; 1216 return ret;
1228} 1217}
@@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page,
1358 pmd = page_check_address_pmd(page, mm, address, 1347 pmd = page_check_address_pmd(page, mm, address,
1359 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1348 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1360 if (pmd) { 1349 if (pmd) {
1361 pgtable = get_pmd_huge_pte(mm); 1350 pgtable = pgtable_trans_huge_withdraw(mm);
1362 pmd_populate(mm, &_pmd, pgtable); 1351 pmd_populate(mm, &_pmd, pgtable);
1363 1352
1364 for (i = 0, haddr = address; i < HPAGE_PMD_NR; 1353 haddr = address;
1365 i++, haddr += PAGE_SIZE) { 1354 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1366 pte_t *pte, entry; 1355 pte_t *pte, entry;
1367 BUG_ON(PageCompound(page+i)); 1356 BUG_ON(PageCompound(page+i));
1368 entry = mk_pte(page + i, vma->vm_page_prot); 1357 entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page,
1406 * SMP TLB and finally we write the non-huge version 1395 * SMP TLB and finally we write the non-huge version
1407 * of the pmd entry with pmd_populate. 1396 * of the pmd entry with pmd_populate.
1408 */ 1397 */
1409 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); 1398 pmdp_invalidate(vma, address, pmd);
1410 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1411 pmd_populate(mm, pmd, pgtable); 1399 pmd_populate(mm, pmd, pgtable);
1412 ret = 1; 1400 ret = 1;
1413 } 1401 }
@@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page,
1421 struct anon_vma *anon_vma) 1409 struct anon_vma *anon_vma)
1422{ 1410{
1423 int mapcount, mapcount2; 1411 int mapcount, mapcount2;
1412 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1424 struct anon_vma_chain *avc; 1413 struct anon_vma_chain *avc;
1425 1414
1426 BUG_ON(!PageHead(page)); 1415 BUG_ON(!PageHead(page));
1427 BUG_ON(PageTail(page)); 1416 BUG_ON(PageTail(page));
1428 1417
1429 mapcount = 0; 1418 mapcount = 0;
1430 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1419 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1431 struct vm_area_struct *vma = avc->vma; 1420 struct vm_area_struct *vma = avc->vma;
1432 unsigned long addr = vma_address(page, vma); 1421 unsigned long addr = vma_address(page, vma);
1433 BUG_ON(is_vma_temporary_stack(vma)); 1422 BUG_ON(is_vma_temporary_stack(vma));
1434 if (addr == -EFAULT)
1435 continue;
1436 mapcount += __split_huge_page_splitting(page, vma, addr); 1423 mapcount += __split_huge_page_splitting(page, vma, addr);
1437 } 1424 }
1438 /* 1425 /*
@@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page,
1453 __split_huge_page_refcount(page); 1440 __split_huge_page_refcount(page);
1454 1441
1455 mapcount2 = 0; 1442 mapcount2 = 0;
1456 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1443 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1457 struct vm_area_struct *vma = avc->vma; 1444 struct vm_area_struct *vma = avc->vma;
1458 unsigned long addr = vma_address(page, vma); 1445 unsigned long addr = vma_address(page, vma);
1459 BUG_ON(is_vma_temporary_stack(vma)); 1446 BUG_ON(is_vma_temporary_stack(vma));
1460 if (addr == -EFAULT)
1461 continue;
1462 mapcount2 += __split_huge_page_map(page, vma, addr); 1447 mapcount2 += __split_huge_page_map(page, vma, addr);
1463 } 1448 }
1464 if (mapcount != mapcount2) 1449 if (mapcount != mapcount2)
@@ -1491,12 +1476,13 @@ out:
1491 return ret; 1476 return ret;
1492} 1477}
1493 1478
1494#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ 1479#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1495 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1496 1480
1497int hugepage_madvise(struct vm_area_struct *vma, 1481int hugepage_madvise(struct vm_area_struct *vma,
1498 unsigned long *vm_flags, int advice) 1482 unsigned long *vm_flags, int advice)
1499{ 1483{
1484 struct mm_struct *mm = vma->vm_mm;
1485
1500 switch (advice) { 1486 switch (advice) {
1501 case MADV_HUGEPAGE: 1487 case MADV_HUGEPAGE:
1502 /* 1488 /*
@@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
1504 */ 1490 */
1505 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1491 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1506 return -EINVAL; 1492 return -EINVAL;
1493 if (mm->def_flags & VM_NOHUGEPAGE)
1494 return -EINVAL;
1507 *vm_flags &= ~VM_NOHUGEPAGE; 1495 *vm_flags &= ~VM_NOHUGEPAGE;
1508 *vm_flags |= VM_HUGEPAGE; 1496 *vm_flags |= VM_HUGEPAGE;
1509 /* 1497 /*
@@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1655 if (vma->vm_ops) 1643 if (vma->vm_ops)
1656 /* khugepaged not yet working on file or special mappings */ 1644 /* khugepaged not yet working on file or special mappings */
1657 return 0; 1645 return 0;
1658 /* 1646 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1659 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1660 * true too, verify it here.
1661 */
1662 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1663 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1647 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1664 hend = vma->vm_end & HPAGE_PMD_MASK; 1648 hend = vma->vm_end & HPAGE_PMD_MASK;
1665 if (hstart < hend) 1649 if (hstart < hend)
@@ -1833,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1833 } 1817 }
1834} 1818}
1835 1819
1836static void collapse_huge_page(struct mm_struct *mm, 1820static void khugepaged_alloc_sleep(void)
1837 unsigned long address,
1838 struct page **hpage,
1839 struct vm_area_struct *vma,
1840 int node)
1841{ 1821{
1842 pgd_t *pgd; 1822 wait_event_freezable_timeout(khugepaged_wait, false,
1843 pud_t *pud; 1823 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
1844 pmd_t *pmd, _pmd; 1824}
1845 pte_t *pte;
1846 pgtable_t pgtable;
1847 struct page *new_page;
1848 spinlock_t *ptl;
1849 int isolated;
1850 unsigned long hstart, hend;
1851 1825
1852 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1826#ifdef CONFIG_NUMA
1853#ifndef CONFIG_NUMA 1827static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1854 up_read(&mm->mmap_sem); 1828{
1855 VM_BUG_ON(!*hpage); 1829 if (IS_ERR(*hpage)) {
1856 new_page = *hpage; 1830 if (!*wait)
1857#else 1831 return false;
1832
1833 *wait = false;
1834 *hpage = NULL;
1835 khugepaged_alloc_sleep();
1836 } else if (*hpage) {
1837 put_page(*hpage);
1838 *hpage = NULL;
1839 }
1840
1841 return true;
1842}
1843
1844static struct page
1845*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1846 struct vm_area_struct *vma, unsigned long address,
1847 int node)
1848{
1858 VM_BUG_ON(*hpage); 1849 VM_BUG_ON(*hpage);
1859 /* 1850 /*
1860 * Allocate the page while the vma is still valid and under 1851 * Allocate the page while the vma is still valid and under
@@ -1866,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1866 * mmap_sem in read mode is good idea also to allow greater 1857 * mmap_sem in read mode is good idea also to allow greater
1867 * scalability. 1858 * scalability.
1868 */ 1859 */
1869 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1860 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1870 node, __GFP_OTHER_NODE); 1861 node, __GFP_OTHER_NODE);
1871 1862
1872 /* 1863 /*
@@ -1874,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm,
1874 * preparation for taking it in write mode. 1865 * preparation for taking it in write mode.
1875 */ 1866 */
1876 up_read(&mm->mmap_sem); 1867 up_read(&mm->mmap_sem);
1877 if (unlikely(!new_page)) { 1868 if (unlikely(!*hpage)) {
1878 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1869 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1879 *hpage = ERR_PTR(-ENOMEM); 1870 *hpage = ERR_PTR(-ENOMEM);
1880 return; 1871 return NULL;
1881 } 1872 }
1882#endif
1883 1873
1884 count_vm_event(THP_COLLAPSE_ALLOC); 1874 count_vm_event(THP_COLLAPSE_ALLOC);
1885 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1875 return *hpage;
1886#ifdef CONFIG_NUMA 1876}
1887 put_page(new_page); 1877#else
1878static struct page *khugepaged_alloc_hugepage(bool *wait)
1879{
1880 struct page *hpage;
1881
1882 do {
1883 hpage = alloc_hugepage(khugepaged_defrag());
1884 if (!hpage) {
1885 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1886 if (!*wait)
1887 return NULL;
1888
1889 *wait = false;
1890 khugepaged_alloc_sleep();
1891 } else
1892 count_vm_event(THP_COLLAPSE_ALLOC);
1893 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
1894
1895 return hpage;
1896}
1897
1898static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1899{
1900 if (!*hpage)
1901 *hpage = khugepaged_alloc_hugepage(wait);
1902
1903 if (unlikely(!*hpage))
1904 return false;
1905
1906 return true;
1907}
1908
1909static struct page
1910*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1911 struct vm_area_struct *vma, unsigned long address,
1912 int node)
1913{
1914 up_read(&mm->mmap_sem);
1915 VM_BUG_ON(!*hpage);
1916 return *hpage;
1917}
1888#endif 1918#endif
1919
1920static void collapse_huge_page(struct mm_struct *mm,
1921 unsigned long address,
1922 struct page **hpage,
1923 struct vm_area_struct *vma,
1924 int node)
1925{
1926 pgd_t *pgd;
1927 pud_t *pud;
1928 pmd_t *pmd, _pmd;
1929 pte_t *pte;
1930 pgtable_t pgtable;
1931 struct page *new_page;
1932 spinlock_t *ptl;
1933 int isolated;
1934 unsigned long hstart, hend;
1935 unsigned long mmun_start; /* For mmu_notifiers */
1936 unsigned long mmun_end; /* For mmu_notifiers */
1937
1938 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1939
1940 /* release the mmap_sem read lock. */
1941 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
1942 if (!new_page)
1943 return;
1944
1945 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
1889 return; 1946 return;
1890 }
1891 1947
1892 /* 1948 /*
1893 * Prevent all access to pagetables with the exception of 1949 * Prevent all access to pagetables with the exception of
@@ -1912,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1912 goto out; 1968 goto out;
1913 if (is_vma_temporary_stack(vma)) 1969 if (is_vma_temporary_stack(vma))
1914 goto out; 1970 goto out;
1915 /* 1971 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1916 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1917 * true too, verify it here.
1918 */
1919 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1920 1972
1921 pgd = pgd_offset(mm, address); 1973 pgd = pgd_offset(mm, address);
1922 if (!pgd_present(*pgd)) 1974 if (!pgd_present(*pgd))
@@ -1936,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1936 pte = pte_offset_map(pmd, address); 1988 pte = pte_offset_map(pmd, address);
1937 ptl = pte_lockptr(mm, pmd); 1989 ptl = pte_lockptr(mm, pmd);
1938 1990
1991 mmun_start = address;
1992 mmun_end = address + HPAGE_PMD_SIZE;
1993 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1939 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1994 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1940 /* 1995 /*
1941 * After this gup_fast can't run anymore. This also removes 1996 * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1943 * huge and small TLB entries for the same virtual address 1998 * huge and small TLB entries for the same virtual address
1944 * to avoid the risk of CPU bugs in that area. 1999 * to avoid the risk of CPU bugs in that area.
1945 */ 2000 */
1946 _pmd = pmdp_clear_flush_notify(vma, address, pmd); 2001 _pmd = pmdp_clear_flush(vma, address, pmd);
1947 spin_unlock(&mm->page_table_lock); 2002 spin_unlock(&mm->page_table_lock);
2003 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1948 2004
1949 spin_lock(ptl); 2005 spin_lock(ptl);
1950 isolated = __collapse_huge_page_isolate(vma, address, pte); 2006 isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1970 pte_unmap(pte); 2026 pte_unmap(pte);
1971 __SetPageUptodate(new_page); 2027 __SetPageUptodate(new_page);
1972 pgtable = pmd_pgtable(_pmd); 2028 pgtable = pmd_pgtable(_pmd);
1973 VM_BUG_ON(page_count(pgtable) != 1);
1974 VM_BUG_ON(page_mapcount(pgtable) != 0);
1975 2029
1976 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2030 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1977 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2031 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 BUG_ON(!pmd_none(*pmd)); 2042 BUG_ON(!pmd_none(*pmd));
1989 page_add_new_anon_rmap(new_page, vma, address); 2043 page_add_new_anon_rmap(new_page, vma, address);
1990 set_pmd_at(mm, address, pmd, _pmd); 2044 set_pmd_at(mm, address, pmd, _pmd);
1991 update_mmu_cache(vma, address, _pmd); 2045 update_mmu_cache_pmd(vma, address, pmd);
1992 prepare_pmd_huge_pte(pgtable, mm); 2046 pgtable_trans_huge_deposit(mm, pgtable);
1993 spin_unlock(&mm->page_table_lock); 2047 spin_unlock(&mm->page_table_lock);
1994 2048
1995#ifndef CONFIG_NUMA
1996 *hpage = NULL; 2049 *hpage = NULL;
1997#endif 2050
1998 khugepaged_pages_collapsed++; 2051 khugepaged_pages_collapsed++;
1999out_up_write: 2052out_up_write:
2000 up_write(&mm->mmap_sem); 2053 up_write(&mm->mmap_sem);
@@ -2002,9 +2055,6 @@ out_up_write:
2002 2055
2003out: 2056out:
2004 mem_cgroup_uncharge_page(new_page); 2057 mem_cgroup_uncharge_page(new_page);
2005#ifdef CONFIG_NUMA
2006 put_page(new_page);
2007#endif
2008 goto out_up_write; 2058 goto out_up_write;
2009} 2059}
2010 2060
@@ -2154,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2154 goto skip; 2204 goto skip;
2155 if (is_vma_temporary_stack(vma)) 2205 if (is_vma_temporary_stack(vma))
2156 goto skip; 2206 goto skip;
2157 /* 2207 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2158 * If is_pfn_mapping() is true is_learn_pfn_mapping()
2159 * must be true too, verify it here.
2160 */
2161 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2162 vma->vm_flags & VM_NO_THP);
2163 2208
2164 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2209 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2165 hend = vma->vm_end & HPAGE_PMD_MASK; 2210 hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2279,23 @@ static int khugepaged_has_work(void)
2234static int khugepaged_wait_event(void) 2279static int khugepaged_wait_event(void)
2235{ 2280{
2236 return !list_empty(&khugepaged_scan.mm_head) || 2281 return !list_empty(&khugepaged_scan.mm_head) ||
2237 !khugepaged_enabled(); 2282 kthread_should_stop();
2238} 2283}
2239 2284
2240static void khugepaged_do_scan(struct page **hpage) 2285static void khugepaged_do_scan(void)
2241{ 2286{
2287 struct page *hpage = NULL;
2242 unsigned int progress = 0, pass_through_head = 0; 2288 unsigned int progress = 0, pass_through_head = 0;
2243 unsigned int pages = khugepaged_pages_to_scan; 2289 unsigned int pages = khugepaged_pages_to_scan;
2290 bool wait = true;
2244 2291
2245 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2292 barrier(); /* write khugepaged_pages_to_scan to local stack */
2246 2293
2247 while (progress < pages) { 2294 while (progress < pages) {
2248 cond_resched(); 2295 if (!khugepaged_prealloc_page(&hpage, &wait))
2249
2250#ifndef CONFIG_NUMA
2251 if (!*hpage) {
2252 *hpage = alloc_hugepage(khugepaged_defrag());
2253 if (unlikely(!*hpage)) {
2254 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2255 break;
2256 }
2257 count_vm_event(THP_COLLAPSE_ALLOC);
2258 }
2259#else
2260 if (IS_ERR(*hpage))
2261 break; 2296 break;
2262#endif 2297
2298 cond_resched();
2263 2299
2264 if (unlikely(kthread_should_stop() || freezing(current))) 2300 if (unlikely(kthread_should_stop() || freezing(current)))
2265 break; 2301 break;
@@ -2270,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage)
2270 if (khugepaged_has_work() && 2306 if (khugepaged_has_work() &&
2271 pass_through_head < 2) 2307 pass_through_head < 2)
2272 progress += khugepaged_scan_mm_slot(pages - progress, 2308 progress += khugepaged_scan_mm_slot(pages - progress,
2273 hpage); 2309 &hpage);
2274 else 2310 else
2275 progress = pages; 2311 progress = pages;
2276 spin_unlock(&khugepaged_mm_lock); 2312 spin_unlock(&khugepaged_mm_lock);
2277 } 2313 }
2278}
2279 2314
2280static void khugepaged_alloc_sleep(void) 2315 if (!IS_ERR_OR_NULL(hpage))
2281{ 2316 put_page(hpage);
2282 wait_event_freezable_timeout(khugepaged_wait, false,
2283 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2284} 2317}
2285 2318
2286#ifndef CONFIG_NUMA 2319static void khugepaged_wait_work(void)
2287static struct page *khugepaged_alloc_hugepage(void)
2288{ 2320{
2289 struct page *hpage; 2321 try_to_freeze();
2290
2291 do {
2292 hpage = alloc_hugepage(khugepaged_defrag());
2293 if (!hpage) {
2294 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2295 khugepaged_alloc_sleep();
2296 } else
2297 count_vm_event(THP_COLLAPSE_ALLOC);
2298 } while (unlikely(!hpage) &&
2299 likely(khugepaged_enabled()));
2300 return hpage;
2301}
2302#endif
2303 2322
2304static void khugepaged_loop(void) 2323 if (khugepaged_has_work()) {
2305{ 2324 if (!khugepaged_scan_sleep_millisecs)
2306 struct page *hpage; 2325 return;
2307 2326
2308#ifdef CONFIG_NUMA 2327 wait_event_freezable_timeout(khugepaged_wait,
2309 hpage = NULL; 2328 kthread_should_stop(),
2310#endif 2329 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2311 while (likely(khugepaged_enabled())) { 2330 return;
2312#ifndef CONFIG_NUMA
2313 hpage = khugepaged_alloc_hugepage();
2314 if (unlikely(!hpage))
2315 break;
2316#else
2317 if (IS_ERR(hpage)) {
2318 khugepaged_alloc_sleep();
2319 hpage = NULL;
2320 }
2321#endif
2322
2323 khugepaged_do_scan(&hpage);
2324#ifndef CONFIG_NUMA
2325 if (hpage)
2326 put_page(hpage);
2327#endif
2328 try_to_freeze();
2329 if (unlikely(kthread_should_stop()))
2330 break;
2331 if (khugepaged_has_work()) {
2332 if (!khugepaged_scan_sleep_millisecs)
2333 continue;
2334 wait_event_freezable_timeout(khugepaged_wait, false,
2335 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2336 } else if (khugepaged_enabled())
2337 wait_event_freezable(khugepaged_wait,
2338 khugepaged_wait_event());
2339 } 2331 }
2332
2333 if (khugepaged_enabled())
2334 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2340} 2335}
2341 2336
2342static int khugepaged(void *none) 2337static int khugepaged(void *none)
@@ -2346,20 +2341,9 @@ static int khugepaged(void *none)
2346 set_freezable(); 2341 set_freezable();
2347 set_user_nice(current, 19); 2342 set_user_nice(current, 19);
2348 2343
2349 /* serialize with start_khugepaged() */ 2344 while (!kthread_should_stop()) {
2350 mutex_lock(&khugepaged_mutex); 2345 khugepaged_do_scan();
2351 2346 khugepaged_wait_work();
2352 for (;;) {
2353 mutex_unlock(&khugepaged_mutex);
2354 VM_BUG_ON(khugepaged_thread != current);
2355 khugepaged_loop();
2356 VM_BUG_ON(khugepaged_thread != current);
2357
2358 mutex_lock(&khugepaged_mutex);
2359 if (!khugepaged_enabled())
2360 break;
2361 if (unlikely(kthread_should_stop()))
2362 break;
2363 } 2347 }
2364 2348
2365 spin_lock(&khugepaged_mm_lock); 2349 spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2352,6 @@ static int khugepaged(void *none)
2368 if (mm_slot) 2352 if (mm_slot)
2369 collect_mm_slot(mm_slot); 2353 collect_mm_slot(mm_slot);
2370 spin_unlock(&khugepaged_mm_lock); 2354 spin_unlock(&khugepaged_mm_lock);
2371
2372 khugepaged_thread = NULL;
2373 mutex_unlock(&khugepaged_mutex);
2374
2375 return 0; 2355 return 0;
2376} 2356}
2377 2357
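Editor's note on the huge_memory.c hunks above: the open-coded prepare_pmd_huge_pte()/get_pmd_huge_pte() pair is replaced by the generic pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers. As a rough userspace analogue of that interface only (not the kernel implementation; struct names and fields here are invented for illustration), the pattern is a per-mm stash where a preallocated page table is deposited when the huge PMD is installed and withdrawn again when the PMD is zapped or split:

#include <stdio.h>

/* Stand-in for a deposited page table; purely illustrative names. */
struct pgtable_stub {
        struct pgtable_stub *next;
        int id;
};

/* Stand-in for the per-mm stash (mm->pmd_huge_pte in the kernel). */
struct mm_stub {
        struct pgtable_stub *head;
        struct pgtable_stub *tail;
};

/* Deposit: stash a preallocated table at huge-PMD install time. */
static void deposit(struct mm_stub *mm, struct pgtable_stub *pt)
{
        pt->next = NULL;
        if (mm->tail)
                mm->tail->next = pt;
        else
                mm->head = pt;
        mm->tail = pt;
}

/* Withdraw: take the oldest stashed table back (FIFO), NULL if none. */
static struct pgtable_stub *withdraw(struct mm_stub *mm)
{
        struct pgtable_stub *pt = mm->head;

        if (pt) {
                mm->head = pt->next;
                if (!mm->head)
                        mm->tail = NULL;
        }
        return pt;
}

int main(void)
{
        struct mm_stub mm = { NULL, NULL };
        struct pgtable_stub a = { NULL, 1 }, b = { NULL, 2 };
        struct pgtable_stub *pt;

        deposit(&mm, &a);
        deposit(&mm, &b);
        while ((pt = withdraw(&mm)))
                printf("withdrew table %d\n", pt->id);
        return 0;
}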
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..59a0059b39e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
30#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h> 31#include <linux/hugetlb_cgroup.h>
32#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
34#include "internal.h" 33#include "internal.h"
35 34
36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
637 h->surplus_huge_pages--; 636 h->surplus_huge_pages--;
638 h->surplus_huge_pages_node[nid]--; 637 h->surplus_huge_pages_node[nid]--;
639 } else { 638 } else {
639 arch_clear_hugepage_flags(page);
640 enqueue_huge_page(h, page); 640 enqueue_huge_page(h, page);
641 } 641 }
642 spin_unlock(&hugetlb_lock); 642 spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
671 } 671 }
672} 672}
673 673
674/*
675 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
676 * transparent huge pages. See the PageTransHuge() documentation for more
677 * details.
678 */
674int PageHuge(struct page *page) 679int PageHuge(struct page *page)
675{ 680{
676 compound_page_dtor *dtor; 681 compound_page_dtor *dtor;
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2355 struct page *page; 2360 struct page *page;
2356 struct hstate *h = hstate_vma(vma); 2361 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2362 unsigned long sz = huge_page_size(h);
2363 const unsigned long mmun_start = start; /* For mmu_notifiers */
2364 const unsigned long mmun_end = end; /* For mmu_notifiers */
2358 2365
2359 WARN_ON(!is_vm_hugetlb_page(vma)); 2366 WARN_ON(!is_vm_hugetlb_page(vma));
2360 BUG_ON(start & ~huge_page_mask(h)); 2367 BUG_ON(start & ~huge_page_mask(h));
2361 BUG_ON(end & ~huge_page_mask(h)); 2368 BUG_ON(end & ~huge_page_mask(h));
2362 2369
2363 tlb_start_vma(tlb, vma); 2370 tlb_start_vma(tlb, vma);
2364 mmu_notifier_invalidate_range_start(mm, start, end); 2371 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2365again: 2372again:
2366 spin_lock(&mm->page_table_lock); 2373 spin_lock(&mm->page_table_lock);
2367 for (address = start; address < end; address += sz) { 2374 for (address = start; address < end; address += sz) {
@@ -2425,7 +2432,7 @@ again:
2425 if (address < end && !ref_page) 2432 if (address < end && !ref_page)
2426 goto again; 2433 goto again;
2427 } 2434 }
2428 mmu_notifier_invalidate_range_end(mm, start, end); 2435 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2429 tlb_end_vma(tlb, vma); 2436 tlb_end_vma(tlb, vma);
2430} 2437}
2431 2438
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2473 struct hstate *h = hstate_vma(vma); 2480 struct hstate *h = hstate_vma(vma);
2474 struct vm_area_struct *iter_vma; 2481 struct vm_area_struct *iter_vma;
2475 struct address_space *mapping; 2482 struct address_space *mapping;
2476 struct prio_tree_iter iter;
2477 pgoff_t pgoff; 2483 pgoff_t pgoff;
2478 2484
2479 /* 2485 /*
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2481 * from page cache lookup which is in HPAGE_SIZE units. 2487 * from page cache lookup which is in HPAGE_SIZE units.
2482 */ 2488 */
2483 address = address & huge_page_mask(h); 2489 address = address & huge_page_mask(h);
2484 pgoff = vma_hugecache_offset(h, vma, address); 2490 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2491 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2492 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2486 2493
2487 /* 2494 /*
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2490 * __unmap_hugepage_range() is called as the lock is already held 2497 * __unmap_hugepage_range() is called as the lock is already held
2491 */ 2498 */
2492 mutex_lock(&mapping->i_mmap_mutex); 2499 mutex_lock(&mapping->i_mmap_mutex);
2493 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2500 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2494 /* Do not unmap the current VMA */ 2501 /* Do not unmap the current VMA */
2495 if (iter_vma == vma) 2502 if (iter_vma == vma)
2496 continue; 2503 continue;
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2525 struct page *old_page, *new_page; 2532 struct page *old_page, *new_page;
2526 int avoidcopy; 2533 int avoidcopy;
2527 int outside_reserve = 0; 2534 int outside_reserve = 0;
2535 unsigned long mmun_start; /* For mmu_notifiers */
2536 unsigned long mmun_end; /* For mmu_notifiers */
2528 2537
2529 old_page = pte_page(pte); 2538 old_page = pte_page(pte);
2530 2539
@@ -2611,6 +2620,9 @@ retry_avoidcopy:
2611 pages_per_huge_page(h)); 2620 pages_per_huge_page(h));
2612 __SetPageUptodate(new_page); 2621 __SetPageUptodate(new_page);
2613 2622
2623 mmun_start = address & huge_page_mask(h);
2624 mmun_end = mmun_start + huge_page_size(h);
2625 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2614 /* 2626 /*
2615 * Retake the page_table_lock to check for racing updates 2627 * Retake the page_table_lock to check for racing updates
2616 * before the page tables are altered 2628 * before the page tables are altered
@@ -2619,9 +2631,6 @@ retry_avoidcopy:
2619 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2631 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2620 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2632 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2621 /* Break COW */ 2633 /* Break COW */
2622 mmu_notifier_invalidate_range_start(mm,
2623 address & huge_page_mask(h),
2624 (address & huge_page_mask(h)) + huge_page_size(h));
2625 huge_ptep_clear_flush(vma, address, ptep); 2634 huge_ptep_clear_flush(vma, address, ptep);
2626 set_huge_pte_at(mm, address, ptep, 2635 set_huge_pte_at(mm, address, ptep,
2627 make_huge_pte(vma, new_page, 1)); 2636 make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2638,11 @@ retry_avoidcopy:
2629 hugepage_add_new_anon_rmap(new_page, vma, address); 2638 hugepage_add_new_anon_rmap(new_page, vma, address);
2630 /* Make the old page be freed below */ 2639 /* Make the old page be freed below */
2631 new_page = old_page; 2640 new_page = old_page;
2632 mmu_notifier_invalidate_range_end(mm,
2633 address & huge_page_mask(h),
2634 (address & huge_page_mask(h)) + huge_page_size(h));
2635 } 2641 }
2642 spin_unlock(&mm->page_table_lock);
2643 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2644 /* Caller expects lock to be held */
2645 spin_lock(&mm->page_table_lock);
2636 page_cache_release(new_page); 2646 page_cache_release(new_page);
2637 page_cache_release(old_page); 2647 page_cache_release(old_page);
2638 return 0; 2648 return 0;
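Editor's note on the unmap_ref_private() hunk above: the page offset is now open-coded in base-page (PAGE_SIZE) units, ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff, so it can be used directly as a key into the new vma_interval_tree. A minimal userspace check of that arithmetic, with invented addresses and a 4 KiB page size assumed:

#include <stdio.h>

#define PAGE_SHIFT 12UL   /* 4 KiB base pages assumed for this example */

int main(void)
{
        /* Hypothetical VMA: maps file offset page 512 at 0x700000000000. */
        unsigned long vm_start = 0x700000000000UL;
        unsigned long vm_pgoff = 512;
        unsigned long address  = vm_start + (3UL << 21); /* third 2 MiB page */

        unsigned long pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

        printf("page offset into the mapping: %lu\n", pgoff);
        return 0;
}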
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..a4fa284f6bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,26 +118,27 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */ 121 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */ 122 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are 123 bool ignore_skip_hint; /* Scan blocks even if marked skip */
125 incremental, once free_pfn 124 bool finished_update_free; /* True when the zone cached pfns are
126 and migrate_pfn meet, we restart 125 * no longer being updated
127 from the top of the zone; 126 */
128 remember we wrapped around. */ 127 bool finished_update_migrate;
129 128
130 int order; /* order a direct compactor needs */ 129 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 130 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone; 131 struct zone *zone;
133 bool *contended; /* True if a lock was contended */ 132 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 134};
135 135
136unsigned long 136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 137isolate_freepages_range(struct compact_control *cc,
138 unsigned long start_pfn, unsigned long end_pfn);
138unsigned long 139unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 140isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn); 141 unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
141 142
142#endif 143#endif
143 144
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
167} 168}
168 169
169/* 170/*
170 * Called only in fault path via page_evictable() for a new page 171 * Called only in fault path, to determine if a new page is being
171 * to determine if it's being mapped into a LOCKED vma. 172 * mapped into a LOCKED vma. If it is, mark page as mlocked.
172 * If so, mark page as mlocked.
173 */ 173 */
174static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page) 175 struct page *page)
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
180 return 0; 180 return 0;
181 181
182 if (!TestSetPageMlocked(page)) { 182 if (!TestSetPageMlocked(page)) {
183 inc_zone_page_state(page, NR_MLOCK); 183 mod_zone_page_state(page_zone(page), NR_MLOCK,
184 hpage_nr_pages(page));
184 count_vm_event(UNEVICTABLE_PGMLOCKED); 185 count_vm_event(UNEVICTABLE_PGMLOCKED);
185 } 186 }
186 return 1; 187 return 1;
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page);
201 * If called for a page that is still mapped by mlocked vmas, all we do 202 * If called for a page that is still mapped by mlocked vmas, all we do
202 * is revert to lazy LRU behaviour -- semantics are not broken. 203 * is revert to lazy LRU behaviour -- semantics are not broken.
203 */ 204 */
204extern void __clear_page_mlock(struct page *page); 205extern void clear_page_mlock(struct page *page);
205static inline void clear_page_mlock(struct page *page)
206{
207 if (unlikely(TestClearPageMlocked(page)))
208 __clear_page_mlock(page);
209}
210 206
211/* 207/*
212 * mlock_migrate_page - called only from migrate_page_copy() to 208 * mlock_migrate_page - called only from migrate_page_copy() to
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
340#define ZONE_RECLAIM_FULL -1 336#define ZONE_RECLAIM_FULL -1
341#define ZONE_RECLAIM_SOME 0 337#define ZONE_RECLAIM_SOME 0
342#define ZONE_RECLAIM_SUCCESS 1 338#define ZONE_RECLAIM_SUCCESS 1
343#endif
344 339
345extern int hwpoison_filter(struct page *p); 340extern int hwpoison_filter(struct page *p);
346 341
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
356 unsigned long, unsigned long); 351 unsigned long, unsigned long);
357 352
358extern void set_pageblock_order(void); 353extern void set_pageblock_order(void);
354unsigned long reclaim_clean_pages_from_list(struct zone *zone,
355 struct list_head *page_list);
356/* The ALLOC_WMARK bits are used as an index to zone->watermark */
357#define ALLOC_WMARK_MIN WMARK_MIN
358#define ALLOC_WMARK_LOW WMARK_LOW
359#define ALLOC_WMARK_HIGH WMARK_HIGH
360#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
361
362/* Mask to get the watermark bits */
363#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
364
365#define ALLOC_HARDER 0x10 /* try to alloc harder */
366#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
367#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
368#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
369
370#endif /* __MM_INTERNAL_H */
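Editor's note on the ALLOC_* definitions moved into mm/internal.h above: the low bits of alloc_flags index zone->watermark[], ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS - 1) extracts that index, and the higher bits are independent modifiers. A standalone sketch of how the bits combine; the WMARK_* indices are assumed to be 0, 1, 2 as in the zone watermark enum:

#include <stdio.h>

#define WMARK_MIN              0
#define WMARK_LOW              1
#define WMARK_HIGH             2

#define ALLOC_WMARK_MIN        WMARK_MIN
#define ALLOC_WMARK_LOW        WMARK_LOW
#define ALLOC_WMARK_HIGH       WMARK_HIGH
#define ALLOC_NO_WATERMARKS    0x04
#define ALLOC_WMARK_MASK       (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER           0x10
#define ALLOC_HIGH             0x20
#define ALLOC_CPUSET           0x40
#define ALLOC_CMA              0x80

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

        /* The low two bits select the watermark; the rest are modifiers. */
        printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
        printf("checks watermarks: %s\n",
               (alloc_flags & ALLOC_NO_WATERMARKS) ? "no" : "yes");
        return 0;
}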
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
1/*
2 * mm/interval_tree.c - interval tree for mapping->i_mmap
3 *
4 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
5 *
6 * This file is released under the GPL v2.
7 */
8
9#include <linux/mm.h>
10#include <linux/fs.h>
11#include <linux/rmap.h>
12#include <linux/interval_tree_generic.h>
13
14static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
15{
16 return v->vm_pgoff;
17}
18
19static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
20{
21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
22}
23
24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
25 unsigned long, shared.linear.rb_subtree_last,
26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
27
28/* Insert node immediately after prev in the interval tree */
29void vma_interval_tree_insert_after(struct vm_area_struct *node,
30 struct vm_area_struct *prev,
31 struct rb_root *root)
32{
33 struct rb_node **link;
34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node);
36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
38
39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev;
41 link = &prev->shared.linear.rb.rb_right;
42 } else {
43 parent = rb_entry(prev->shared.linear.rb.rb_right,
44 struct vm_area_struct, shared.linear.rb);
45 if (parent->shared.linear.rb_subtree_last < last)
46 parent->shared.linear.rb_subtree_last = last;
47 while (parent->shared.linear.rb.rb_left) {
48 parent = rb_entry(parent->shared.linear.rb.rb_left,
49 struct vm_area_struct, shared.linear.rb);
50 if (parent->shared.linear.rb_subtree_last < last)
51 parent->shared.linear.rb_subtree_last = last;
52 }
53 link = &parent->shared.linear.rb.rb_left;
54 }
55
56 node->shared.linear.rb_subtree_last = last;
57 rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
58 rb_insert_augmented(&node->shared.linear.rb, root,
59 &vma_interval_tree_augment);
60}
61
62static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
63{
64 return vma_start_pgoff(avc->vma);
65}
66
67static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
68{
69 return vma_last_pgoff(avc->vma);
70}
71
72INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
73 avc_start_pgoff, avc_last_pgoff,
74 static inline, __anon_vma_interval_tree)
75
76void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
77 struct rb_root *root)
78{
79#ifdef CONFIG_DEBUG_VM_RB
80 node->cached_vma_start = avc_start_pgoff(node);
81 node->cached_vma_last = avc_last_pgoff(node);
82#endif
83 __anon_vma_interval_tree_insert(node, root);
84}
85
86void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
87 struct rb_root *root)
88{
89 __anon_vma_interval_tree_remove(node, root);
90}
91
92struct anon_vma_chain *
93anon_vma_interval_tree_iter_first(struct rb_root *root,
94 unsigned long first, unsigned long last)
95{
96 return __anon_vma_interval_tree_iter_first(root, first, last);
97}
98
99struct anon_vma_chain *
100anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
101 unsigned long first, unsigned long last)
102{
103 return __anon_vma_interval_tree_iter_next(node, first, last);
104}
105
106#ifdef CONFIG_DEBUG_VM_RB
107void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
108{
109 WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
110 WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
111}
112#endif
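Editor's note on the new mm/interval_tree.c above: VMAs are keyed by a closed pgoff interval, [vm_pgoff, vm_pgoff + pages - 1], and the *_foreach iterators return every VMA whose interval overlaps the query. A userspace restatement of those two key functions and the overlap condition, with an invented struct and a 4 KiB page size assumed:

#include <stdio.h>

#define PAGE_SHIFT 12UL   /* 4 KiB pages assumed for this example */

/* Cut-down stand-in for the fields vma_interval_tree keys on. */
struct vma_stub {
        unsigned long vm_start, vm_end;  /* byte addresses */
        unsigned long vm_pgoff;          /* file offset, in pages */
};

static unsigned long vma_start_pgoff(const struct vma_stub *v)
{
        return v->vm_pgoff;
}

static unsigned long vma_last_pgoff(const struct vma_stub *v)
{
        return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* Closed-interval overlap test: what the iterators implement via the tree. */
static int vma_overlaps(const struct vma_stub *v,
                        unsigned long first, unsigned long last)
{
        return vma_start_pgoff(v) <= last && first <= vma_last_pgoff(v);
}

int main(void)
{
        struct vma_stub v = { 0x10000, 0x14000, 8 };  /* 4 pages at pgoff 8 */

        printf("last pgoff: %lu\n", vma_last_pgoff(&v));      /* prints 11 */
        printf("overlaps [10,20]: %d\n", vma_overlaps(&v, 10, 20)); /* 1 */
        printf("overlaps [12,20]: %d\n", vma_overlaps(&v, 12, 20)); /* 0 */
        return 0;
}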
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 0de83b4541e9..a217cc544060 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
29 * - kmemleak_lock (rwlock): protects the object_list modifications and 29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list 30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory 31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up 32 * blocks. The object_tree_root is a red black tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The 33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and 34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the 35 * object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
71#include <linux/delay.h> 71#include <linux/delay.h>
72#include <linux/export.h> 72#include <linux/export.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/rbtree.h>
75#include <linux/fs.h> 75#include <linux/fs.h>
76#include <linux/debugfs.h> 76#include <linux/debugfs.h>
77#include <linux/seq_file.h> 77#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
132 * Structure holding the metadata for each allocated memory block. 132 * Structure holding the metadata for each allocated memory block.
133 * Modifications to such objects should be made while holding the 133 * Modifications to such objects should be made while holding the
134 * object->lock. Insertions or deletions from object_list, gray_list or 134 * object->lock. Insertions or deletions from object_list, gray_list or
135 * tree_node are already protected by the corresponding locks or mutex (see 135 * rb_node are already protected by the corresponding locks or mutex (see
136 * the notes on locking above). These objects are reference-counted 136 * the notes on locking above). These objects are reference-counted
137 * (use_count) and freed using the RCU mechanism. 137 * (use_count) and freed using the RCU mechanism.
138 */ 138 */
@@ -141,7 +141,7 @@ struct kmemleak_object {
141 unsigned long flags; /* object status flags */ 141 unsigned long flags; /* object status flags */
142 struct list_head object_list; 142 struct list_head object_list;
143 struct list_head gray_list; 143 struct list_head gray_list;
144 struct prio_tree_node tree_node; 144 struct rb_node rb_node;
145 struct rcu_head rcu; /* object_list lockless traversal */ 145 struct rcu_head rcu; /* object_list lockless traversal */
146 /* object usage count; object freed when use_count == 0 */ 146 /* object usage count; object freed when use_count == 0 */
147 atomic_t use_count; 147 atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
182static LIST_HEAD(object_list); 182static LIST_HEAD(object_list);
183/* the list of gray-colored objects (see color_gray comment below) */ 183/* the list of gray-colored objects (see color_gray comment below) */
184static LIST_HEAD(gray_list); 184static LIST_HEAD(gray_list);
185/* prio search tree for object boundaries */ 185/* search tree for object boundaries */
186static struct prio_tree_root object_tree_root; 186static struct rb_root object_tree_root = RB_ROOT;
187/* rw_lock protecting the access to object_list and prio_tree_root */ 187/* rw_lock protecting the access to object_list and object_tree_root */
188static DEFINE_RWLOCK(kmemleak_lock); 188static DEFINE_RWLOCK(kmemleak_lock);
189 189
190/* allocation caches for kmemleak internal data */ 190/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
380 trace.entries = object->trace; 380 trace.entries = object->trace;
381 381
382 pr_notice("Object 0x%08lx (size %zu):\n", 382 pr_notice("Object 0x%08lx (size %zu):\n",
383 object->tree_node.start, object->size); 383 object->pointer, object->size);
384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", 384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
385 object->comm, object->pid, object->jiffies); 385 object->comm, object->pid, object->jiffies);
386 pr_notice(" min_count = %d\n", object->min_count); 386 pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
392} 392}
393 393
394/* 394/*
395 * Look-up a memory block metadata (kmemleak_object) in the priority search 395 * Look-up a memory block metadata (kmemleak_object) in the object search
396 * tree based on a pointer value. If alias is 0, only values pointing to the 396 * tree based on a pointer value. If alias is 0, only values pointing to the
397 * beginning of the memory block are allowed. The kmemleak_lock must be held 397 * beginning of the memory block are allowed. The kmemleak_lock must be held
398 * when calling this function. 398 * when calling this function.
399 */ 399 */
400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) 400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
401{ 401{
402 struct prio_tree_node *node; 402 struct rb_node *rb = object_tree_root.rb_node;
403 struct prio_tree_iter iter; 403
404 struct kmemleak_object *object; 404 while (rb) {
405 405 struct kmemleak_object *object =
406 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); 406 rb_entry(rb, struct kmemleak_object, rb_node);
407 node = prio_tree_next(&iter); 407 if (ptr < object->pointer)
408 if (node) { 408 rb = object->rb_node.rb_left;
409 object = prio_tree_entry(node, struct kmemleak_object, 409 else if (object->pointer + object->size <= ptr)
410 tree_node); 410 rb = object->rb_node.rb_right;
411 if (!alias && object->pointer != ptr) { 411 else if (object->pointer == ptr || alias)
412 return object;
413 else {
412 kmemleak_warn("Found object by alias at 0x%08lx\n", 414 kmemleak_warn("Found object by alias at 0x%08lx\n",
413 ptr); 415 ptr);
414 dump_object_info(object); 416 dump_object_info(object);
415 object = NULL; 417 break;
416 } 418 }
417 } else 419 }
418 object = NULL; 420 return NULL;
419
420 return object;
421} 421}
422 422
423/* 423/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
471} 471}
472 472
473/* 473/*
474 * Look up an object in the prio search tree and increase its use_count. 474 * Look up an object in the object search tree and increase its use_count.
475 */ 475 */
476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) 476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
477{ 477{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
516 int min_count, gfp_t gfp) 516 int min_count, gfp_t gfp)
517{ 517{
518 unsigned long flags; 518 unsigned long flags;
519 struct kmemleak_object *object; 519 struct kmemleak_object *object, *parent;
520 struct prio_tree_node *node; 520 struct rb_node **link, *rb_parent;
521 521
522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
523 if (!object) { 523 if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
560 /* kernel backtrace */ 560 /* kernel backtrace */
561 object->trace_len = __save_stack_trace(object->trace); 561 object->trace_len = __save_stack_trace(object->trace);
562 562
563 INIT_PRIO_TREE_NODE(&object->tree_node);
564 object->tree_node.start = ptr;
565 object->tree_node.last = ptr + size - 1;
566
567 write_lock_irqsave(&kmemleak_lock, flags); 563 write_lock_irqsave(&kmemleak_lock, flags);
568 564
569 min_addr = min(min_addr, ptr); 565 min_addr = min(min_addr, ptr);
570 max_addr = max(max_addr, ptr + size); 566 max_addr = max(max_addr, ptr + size);
571 node = prio_tree_insert(&object_tree_root, &object->tree_node); 567 link = &object_tree_root.rb_node;
572 /* 568 rb_parent = NULL;
573 * The code calling the kernel does not yet have the pointer to the 569 while (*link) {
574 * memory block to be able to free it. However, we still hold the 570 rb_parent = *link;
575 * kmemleak_lock here in case parts of the kernel started freeing 571 parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
576 * random memory blocks. 572 if (ptr + size <= parent->pointer)
577 */ 573 link = &parent->rb_node.rb_left;
578 if (node != &object->tree_node) { 574 else if (parent->pointer + parent->size <= ptr)
579 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 575 link = &parent->rb_node.rb_right;
580 "(already existing)\n", ptr); 576 else {
581 object = lookup_object(ptr, 1); 577 kmemleak_stop("Cannot insert 0x%lx into the object "
582 spin_lock(&object->lock); 578 "search tree (overlaps existing)\n",
583 dump_object_info(object); 579 ptr);
584 spin_unlock(&object->lock); 580 kmem_cache_free(object_cache, object);
585 581 object = parent;
586 goto out; 582 spin_lock(&object->lock);
583 dump_object_info(object);
584 spin_unlock(&object->lock);
585 goto out;
586 }
587 } 587 }
588 rb_link_node(&object->rb_node, rb_parent, link);
589 rb_insert_color(&object->rb_node, &object_tree_root);
590
588 list_add_tail_rcu(&object->object_list, &object_list); 591 list_add_tail_rcu(&object->object_list, &object_list);
589out: 592out:
590 write_unlock_irqrestore(&kmemleak_lock, flags); 593 write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
600 unsigned long flags; 603 unsigned long flags;
601 604
602 write_lock_irqsave(&kmemleak_lock, flags); 605 write_lock_irqsave(&kmemleak_lock, flags);
603 prio_tree_remove(&object_tree_root, &object->tree_node); 606 rb_erase(&object->rb_node, &object_tree_root);
604 list_del_rcu(&object->object_list); 607 list_del_rcu(&object->object_list);
605 write_unlock_irqrestore(&kmemleak_lock, flags); 608 write_unlock_irqrestore(&kmemleak_lock, flags);
606 609
@@ -1766,7 +1769,6 @@ void __init kmemleak_init(void)
1766 1769
1767 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1770 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1768 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1771 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1769 INIT_PRIO_TREE_ROOT(&object_tree_root);
1770 1772
1771 if (crt_early_log >= ARRAY_SIZE(early_log)) 1773 if (crt_early_log >= ARRAY_SIZE(early_log))
1772 pr_warning("Early log buffer exceeded (%d), please increase " 1774 pr_warning("Early log buffer exceeded (%d), please increase "
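Editor's note on the kmemleak.c hunks above: the prio_tree lookup is replaced by a walk of an address-ordered tree of non-overlapping [pointer, pointer + size) blocks. The kernel uses a red-black tree (struct rb_node); the sketch below reproduces only the comparison discipline of the new lookup_object(), using a plain unbalanced binary search tree and invented names:

#include <stdio.h>
#include <stdlib.h>

struct block {
        unsigned long start;
        size_t size;
        struct block *left, *right;
};

static struct block *insert(struct block *root, unsigned long start, size_t size)
{
        if (!root) {
                struct block *b = calloc(1, sizeof(*b));

                if (!b)
                        exit(1);
                b->start = start;
                b->size = size;
                return b;
        }
        if (start + size <= root->start)
                root->left = insert(root->left, start, size);
        else if (root->start + root->size <= start)
                root->right = insert(root->right, start, size);
        /* overlapping inserts are silently ignored in this sketch */
        return root;
}

/* Find the block containing ptr; with alias == 0 only the block start matches. */
static struct block *lookup(struct block *root, unsigned long ptr, int alias)
{
        while (root) {
                if (ptr < root->start)
                        root = root->left;
                else if (root->start + root->size <= ptr)
                        root = root->right;
                else
                        return (alias || root->start == ptr) ? root : NULL;
        }
        return NULL;
}

int main(void)
{
        struct block *root = NULL;

        root = insert(root, 0x1000, 0x100);
        root = insert(root, 0x2000, 0x200);

        printf("0x2080 -> %p\n", (void *)lookup(root, 0x2080, 1));
        printf("0x2080 (no alias) -> %p\n", (void *)lookup(root, 0x2080, 0));
        return 0;
}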
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..ae539f0b8aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
709 spinlock_t *ptl; 709 spinlock_t *ptl;
710 int swapped; 710 int swapped;
711 int err = -EFAULT; 711 int err = -EFAULT;
712 unsigned long mmun_start; /* For mmu_notifiers */
713 unsigned long mmun_end; /* For mmu_notifiers */
712 714
713 addr = page_address_in_vma(page, vma); 715 addr = page_address_in_vma(page, vma);
714 if (addr == -EFAULT) 716 if (addr == -EFAULT)
715 goto out; 717 goto out;
716 718
717 BUG_ON(PageTransCompound(page)); 719 BUG_ON(PageTransCompound(page));
720
721 mmun_start = addr;
722 mmun_end = addr + PAGE_SIZE;
723 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
724
718 ptep = page_check_address(page, mm, addr, &ptl, 0); 725 ptep = page_check_address(page, mm, addr, &ptl, 0);
719 if (!ptep) 726 if (!ptep)
720 goto out; 727 goto out_mn;
721 728
722 if (pte_write(*ptep) || pte_dirty(*ptep)) { 729 if (pte_write(*ptep) || pte_dirty(*ptep)) {
723 pte_t entry; 730 pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
752 759
753out_unlock: 760out_unlock:
754 pte_unmap_unlock(ptep, ptl); 761 pte_unmap_unlock(ptep, ptl);
762out_mn:
763 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
755out: 764out:
756 return err; 765 return err;
757} 766}
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
776 spinlock_t *ptl; 785 spinlock_t *ptl;
777 unsigned long addr; 786 unsigned long addr;
778 int err = -EFAULT; 787 int err = -EFAULT;
788 unsigned long mmun_start; /* For mmu_notifiers */
789 unsigned long mmun_end; /* For mmu_notifiers */
779 790
780 addr = page_address_in_vma(page, vma); 791 addr = page_address_in_vma(page, vma);
781 if (addr == -EFAULT) 792 if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
794 if (!pmd_present(*pmd)) 805 if (!pmd_present(*pmd))
795 goto out; 806 goto out;
796 807
808 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE;
810 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
811
797 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 812 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
798 if (!pte_same(*ptep, orig_pte)) { 813 if (!pte_same(*ptep, orig_pte)) {
799 pte_unmap_unlock(ptep, ptl); 814 pte_unmap_unlock(ptep, ptl);
800 goto out; 815 goto out_mn;
801 } 816 }
802 817
803 get_page(kpage); 818 get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
814 829
815 pte_unmap_unlock(ptep, ptl); 830 pte_unmap_unlock(ptep, ptl);
816 err = 0; 831 err = 0;
832out_mn:
833 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
817out: 834out:
818 return err; 835 return err;
819} 836}
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1469 */ 1486 */
1470 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1487 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1471 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1488 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1472 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1489 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
1473 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1474 return 0; /* just ignore the advice */ 1490 return 0; /* just ignore the advice */
1475 1491
1492#ifdef VM_SAO
1493 if (*vm_flags & VM_SAO)
1494 return 0;
1495#endif
1496
1476 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1497 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1477 err = __ksm_enter(mm); 1498 err = __ksm_enter(mm);
1478 if (err) 1499 if (err)
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
1582 SetPageSwapBacked(new_page); 1603 SetPageSwapBacked(new_page);
1583 __set_page_locked(new_page); 1604 __set_page_locked(new_page);
1584 1605
1585 if (page_evictable(new_page, vma)) 1606 if (!mlocked_vma_newpage(vma, new_page))
1586 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1607 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1587 else 1608 else
1588 add_page_to_unevictable_list(new_page); 1609 add_page_to_unevictable_list(new_page);
@@ -1614,7 +1635,8 @@ again:
1614 struct vm_area_struct *vma; 1635 struct vm_area_struct *vma;
1615 1636
1616 anon_vma_lock(anon_vma); 1637 anon_vma_lock(anon_vma);
1617 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) {
1618 vma = vmac->vma; 1640 vma = vmac->vma;
1619 if (rmap_item->address < vma->vm_start || 1641 if (rmap_item->address < vma->vm_start ||
1620 rmap_item->address >= vma->vm_end) 1642 rmap_item->address >= vma->vm_end)
@@ -1667,7 +1689,8 @@ again:
1667 struct vm_area_struct *vma; 1689 struct vm_area_struct *vma;
1668 1690
1669 anon_vma_lock(anon_vma); 1691 anon_vma_lock(anon_vma);
1670 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) {
1671 vma = vmac->vma; 1694 vma = vmac->vma;
1672 if (rmap_item->address < vma->vm_start || 1695 if (rmap_item->address < vma->vm_start ||
1673 rmap_item->address >= vma->vm_end) 1696 rmap_item->address >= vma->vm_end)
@@ -1719,7 +1742,8 @@ again:
1719 struct vm_area_struct *vma; 1742 struct vm_area_struct *vma;
1720 1743
1721 anon_vma_lock(anon_vma); 1744 anon_vma_lock(anon_vma);
1722 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) {
1723 vma = vmac->vma; 1747 vma = vmac->vma;
1724 if (rmap_item->address < vma->vm_start || 1748 if (rmap_item->address < vma->vm_start ||
1725 rmap_item->address >= vma->vm_end) 1749 rmap_item->address >= vma->vm_end)
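
Note on the ksm.c hunks: write_protect_page() and replace_page() now bracket the PTE update with mmu_notifier_invalidate_range_start()/_end() over the single page being touched, and every failure path taken after the start call is routed through the matching end call (the new out_mn labels). A small userspace sketch of that pairing discipline; begin_notify()/end_notify() and map_pte() are invented stand-ins, not kernel interfaces.

/* Sketch of the notifier-bracketing pattern: once "range start" has been
 * issued, every exit path must pass through the matching "range end". */
#include <stdio.h>
#include <stdbool.h>

static void begin_notify(unsigned long start, unsigned long end)
{
    printf("invalidate range start: [%#lx, %#lx)\n", start, end);
}

static void end_notify(unsigned long start, unsigned long end)
{
    printf("invalidate range end:   [%#lx, %#lx)\n", start, end);
}

static bool map_pte(unsigned long addr)
{
    return (addr & 0xfff) == 0;     /* pretend only aligned addresses map */
}

static int write_protect(unsigned long addr, unsigned long page_size)
{
    unsigned long start = addr, end = addr + page_size;
    int err = -1;

    begin_notify(start, end);       /* issued before touching the PTE */

    if (!map_pte(addr))
        goto out_notify;            /* failure path still ends the range */

    /* ... clear the writable bit here ... */
    err = 0;

out_notify:
    end_notify(start, end);         /* always paired with begin_notify() */
    return err;
}

int main(void)
{
    write_protect(0x1000UL, 4096);
    write_protect(0x2004UL, 4096);
    return 0;
}
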
diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
69 new_flags &= ~VM_DONTCOPY; 69 new_flags &= ~VM_DONTCOPY;
70 break; 70 break;
71 case MADV_DONTDUMP: 71 case MADV_DONTDUMP:
72 new_flags |= VM_NODUMP; 72 new_flags |= VM_DONTDUMP;
73 break; 73 break;
74 case MADV_DODUMP: 74 case MADV_DODUMP:
75 new_flags &= ~VM_NODUMP; 75 if (new_flags & VM_SPECIAL) {
76 error = -EINVAL;
77 goto out;
78 }
79 new_flags &= ~VM_DONTDUMP;
76 break; 80 break;
77 case MADV_MERGEABLE: 81 case MADV_MERGEABLE:
78 case MADV_UNMERGEABLE: 82 case MADV_UNMERGEABLE:
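
Note on the madvise() hunk: the vma flag becomes VM_DONTDUMP and MADV_DODUMP now fails with -EINVAL on special mappings, but the userspace interface is unchanged. The sketch below simply exercises the MADV_DONTDUMP/MADV_DODUMP round trip on an anonymous mapping, guarded by #ifdef in case the libc headers predate the flags.

/* Userspace view of the flags this hunk touches: MADV_DONTDUMP marks a
 * mapping to be skipped by core dumps, MADV_DODUMP undoes it.  The
 * in-kernel VM_DONTDUMP / VM_SPECIAL handling is invisible from here. */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 1 << 20;
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

#ifdef MADV_DONTDUMP
    if (madvise(p, len, MADV_DONTDUMP))
        perror("madvise(MADV_DONTDUMP)");
    else
        puts("region excluded from core dumps");

    if (madvise(p, len, MADV_DODUMP))
        perror("madvise(MADV_DODUMP)");     /* EINVAL on special mappings */
    else
        puts("region included in core dumps again");
#else
    puts("MADV_DONTDUMP not available in these headers");
#endif
    munmap(p, len);
    return 0;
}
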
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..931eef145af5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0; 41static int memblock_reserved_in_slab __initdata_memblock = 0;
42 42
43/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
44static inline const char *memblock_type_name(struct memblock_type *type) 44static __init_memblock const char *
45memblock_type_name(struct memblock_type *type)
45{ 46{
46 if (type == &memblock.memory) 47 if (type == &memblock.memory)
47 return "memory"; 48 return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
756 return ret; 757 return ret;
757 758
758 for (i = start_rgn; i < end_rgn; i++) 759 for (i = start_rgn; i < end_rgn; i++)
759 type->regions[i].nid = nid; 760 memblock_set_region_node(&type->regions[i], nid);
760 761
761 memblock_merge_regions(type); 762 memblock_merge_regions(type);
762 return 0; 763 return 0;
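
Note on the memblock hunk: the direct write of regions[i].nid is replaced by the memblock_set_region_node() accessor, so configurations that compile the node id out only need a stub helper and callers stay unchanged. A hedged userspace sketch of that accessor pattern; CONFIG_FAKE_NUMA, struct region and the helper names are invented for illustration.

/* Sketch: wrap the optional field behind set/get helpers so the caller
 * never references a member that may not exist in some configurations. */
#include <stdio.h>

#define CONFIG_FAKE_NUMA 1

struct region {
    unsigned long base;
    unsigned long size;
#if CONFIG_FAKE_NUMA
    int nid;
#endif
};

static inline void set_region_node(struct region *r, int nid)
{
#if CONFIG_FAKE_NUMA
    r->nid = nid;
#else
    (void)r;
    (void)nid;      /* field doesn't exist; helper compiles to nothing */
#endif
}

static inline int region_node(const struct region *r)
{
#if CONFIG_FAKE_NUMA
    return r->nid;
#else
    (void)r;
    return 0;
#endif
}

int main(void)
{
    struct region r = { .base = 0x100000, .size = 0x200000 };

    set_region_node(&r, 1);
    printf("region %#lx+%#lx on node %d\n", r.base, r.size, region_node(&r));
    return 0;
}
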
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a72f2ffdc3d0..7acf43bf04a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
51#include <linux/oom.h> 51#include <linux/oom.h>
52#include "internal.h" 52#include "internal.h"
53#include <net/sock.h> 53#include <net/sock.h>
54#include <net/ip.h>
54#include <net/tcp_memcontrol.h> 55#include <net/tcp_memcontrol.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
326 struct mem_cgroup_stat_cpu nocpu_base; 327 struct mem_cgroup_stat_cpu nocpu_base;
327 spinlock_t pcp_counter_lock; 328 spinlock_t pcp_counter_lock;
328 329
329#ifdef CONFIG_INET 330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct tcp_memcontrol tcp_mem; 331 struct tcp_memcontrol tcp_mem;
331#endif 332#endif
332}; 333};
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
411 return container_of(s, struct mem_cgroup, css); 412 return container_of(s, struct mem_cgroup, css);
412} 413}
413 414
415static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
416{
417 return (memcg == root_mem_cgroup);
418}
419
414/* Writing them here to avoid exposing memcg's inner layout */ 420/* Writing them here to avoid exposing memcg's inner layout */
415#ifdef CONFIG_MEMCG_KMEM 421#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
416#include <net/sock.h>
417#include <net/ip.h>
418 422
419static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
420void sock_update_memcg(struct sock *sk) 423void sock_update_memcg(struct sock *sk)
421{ 424{
422 if (mem_cgroup_sockets_enabled) { 425 if (mem_cgroup_sockets_enabled) {
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk)
461 } 464 }
462} 465}
463 466
464#ifdef CONFIG_INET
465struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 467struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
466{ 468{
467 if (!memcg || mem_cgroup_is_root(memcg)) 469 if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
470 return &memcg->tcp_mem.cg_proto; 472 return &memcg->tcp_mem.cg_proto;
471} 473}
472EXPORT_SYMBOL(tcp_proto_cgroup); 474EXPORT_SYMBOL(tcp_proto_cgroup);
473#endif /* CONFIG_INET */
474#endif /* CONFIG_MEMCG_KMEM */
475 475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg) 476static void disarm_sock_keys(struct mem_cgroup *memcg)
478{ 477{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 478 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1016 iter != NULL; \ 1015 iter != NULL; \
1017 iter = mem_cgroup_iter(NULL, iter, NULL)) 1016 iter = mem_cgroup_iter(NULL, iter, NULL))
1018 1017
1019static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1020{
1021 return (memcg == root_mem_cgroup);
1022}
1023
1024void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1025{ 1019{
1026 struct mem_cgroup *memcg; 1020 struct mem_cgroup *memcg;
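
Note on the memcontrol.c hunks: the TCP accounting pieces are gathered under a single CONFIG_MEMCG_KMEM && CONFIG_INET guard, the needed #includes move to the top of the file, and mem_cgroup_is_root() is defined unconditionally ahead of its guarded users instead of being forward-declared inside the guarded block. A toy sketch of that layout; the CONFIG_DEMO_* macros, struct fields and function names are invented purely to show the guard structure.

/* Sketch: the hooks only exist when both options are enabled, and the
 * helper they rely on is defined unconditionally before the guarded code. */
#include <stdio.h>

#define CONFIG_DEMO_MEMCG_KMEM 1
#define CONFIG_DEMO_INET 1

struct memcg {
    int is_root;
#if defined(CONFIG_DEMO_MEMCG_KMEM) && defined(CONFIG_DEMO_INET)
    long tcp_bytes;     /* stands in for struct tcp_memcontrol */
#endif
};

/* defined unconditionally, ahead of any guarded user */
static inline int memcg_is_root(const struct memcg *m)
{
    return m->is_root;
}

#if defined(CONFIG_DEMO_MEMCG_KMEM) && defined(CONFIG_DEMO_INET)
static void account_tcp(struct memcg *m, long bytes)
{
    if (memcg_is_root(m))
        return;         /* the root group is never limited */
    m->tcp_bytes += bytes;
}
#endif

int main(void)
{
    struct memcg m = { .is_root = 0 };

#if defined(CONFIG_DEMO_MEMCG_KMEM) && defined(CONFIG_DEMO_INET)
    account_tcp(&m, 4096);
    printf("tcp bytes charged: %ld\n", m.tcp_bytes);
#else
    printf("tcp accounting compiled out\n");
#endif
    return 0;
}
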
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..6c5899b9034a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400 struct vm_area_struct *vma; 400 struct vm_area_struct *vma;
401 struct task_struct *tsk; 401 struct task_struct *tsk;
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff;
403 404
404 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma(page);
405 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
406 return; 407 return;
407 408
409 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
408 read_lock(&tasklist_lock); 410 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 411 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 412 struct anon_vma_chain *vmac;
411 413
412 if (!task_early_kill(tsk)) 414 if (!task_early_kill(tsk))
413 continue; 415 continue;
414 list_for_each_entry(vmac, &av->head, same_anon_vma) { 416 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
417 pgoff, pgoff) {
415 vma = vmac->vma; 418 vma = vmac->vma;
416 if (!page_mapped_in_vma(page, vma)) 419 if (!page_mapped_in_vma(page, vma))
417 continue; 420 continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
431{ 434{
432 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
433 struct task_struct *tsk; 436 struct task_struct *tsk;
434 struct prio_tree_iter iter;
435 struct address_space *mapping = page->mapping; 437 struct address_space *mapping = page->mapping;
436 438
437 mutex_lock(&mapping->i_mmap_mutex); 439 mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
442 if (!task_early_kill(tsk)) 444 if (!task_early_kill(tsk))
443 continue; 445 continue;
444 446
445 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, 447 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 448 pgoff) {
447 /* 449 /*
448 * Send early kill signal to tasks where a vma covers 450 * Send early kill signal to tasks where a vma covers
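
Note on the memory-failure.c hunks: both collect_procs_anon() and collect_procs_file() now walk an interval tree with the poisoned page's offset as both the start and end of the query, i.e. a stabbing query for every mapping that covers that one offset. A userspace sketch of that query shape; a linear scan over an array stands in for the rb-tree-backed interval tree, and all names are invented.

/* Sketch: visit every range whose [first, last] covers the query range.
 * With start == last this degenerates to a point (stabbing) query. */
#include <stdio.h>

struct mapping_range {
    const char *owner;
    unsigned long first;    /* first page offset covered */
    unsigned long last;     /* last page offset covered, inclusive */
};

static void for_each_covering(const struct mapping_range *v, int n,
                              unsigned long start, unsigned long last)
{
    for (int i = 0; i < n; i++)
        if (v[i].first <= last && start <= v[i].last)
            printf("  %s maps offsets [%lu, %lu]\n",
                   v[i].owner, v[i].first, v[i].last);
}

int main(void)
{
    struct mapping_range vmas[] = {
        { "task A", 0, 15 },
        { "task B", 8, 11 },
        { "task C", 32, 47 },
    };
    unsigned long pgoff = 10;   /* offset of the poisoned page */

    printf("mappings covering page offset %lu:\n", pgoff);
    for_each_covering(vmas, 3, pgoff, pgoff);   /* stabbing query */
    return 0;
}
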
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..fb135ba4aba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
712 add_taint(TAINT_BAD_PAGE); 712 add_taint(TAINT_BAD_PAGE);
713} 713}
714 714
715static inline int is_cow_mapping(vm_flags_t flags) 715static inline bool is_cow_mapping(vm_flags_t flags)
716{ 716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 718}
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1039 unsigned long next; 1039 unsigned long next;
1040 unsigned long addr = vma->vm_start; 1040 unsigned long addr = vma->vm_start;
1041 unsigned long end = vma->vm_end; 1041 unsigned long end = vma->vm_end;
1042 unsigned long mmun_start; /* For mmu_notifiers */
1043 unsigned long mmun_end; /* For mmu_notifiers */
1044 bool is_cow;
1042 int ret; 1045 int ret;
1043 1046
1044 /* 1047 /*
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1047 * readonly mappings. The tradeoff is that copy_page_range is more 1050 * readonly mappings. The tradeoff is that copy_page_range is more
1048 * efficient than faulting. 1051 * efficient than faulting.
1049 */ 1052 */
1050 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 1053 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1054 VM_PFNMAP | VM_MIXEDMAP))) {
1051 if (!vma->anon_vma) 1055 if (!vma->anon_vma)
1052 return 0; 1056 return 0;
1053 } 1057 }
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1055 if (is_vm_hugetlb_page(vma)) 1059 if (is_vm_hugetlb_page(vma))
1056 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1060 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1057 1061
1058 if (unlikely(is_pfn_mapping(vma))) { 1062 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1059 /* 1063 /*
1060 * We do not free on error cases below as remove_vma 1064 * We do not free on error cases below as remove_vma
1061 * gets called on error from higher level routine 1065 * gets called on error from higher level routine
1062 */ 1066 */
1063 ret = track_pfn_vma_copy(vma); 1067 ret = track_pfn_copy(vma);
1064 if (ret) 1068 if (ret)
1065 return ret; 1069 return ret;
1066 } 1070 }
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1071 * parent mm. And a permission downgrade will only happen if 1075 * parent mm. And a permission downgrade will only happen if
1072 * is_cow_mapping() returns true. 1076 * is_cow_mapping() returns true.
1073 */ 1077 */
1074 if (is_cow_mapping(vma->vm_flags)) 1078 is_cow = is_cow_mapping(vma->vm_flags);
1075 mmu_notifier_invalidate_range_start(src_mm, addr, end); 1079 mmun_start = addr;
1080 mmun_end = end;
1081 if (is_cow)
1082 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1083 mmun_end);
1076 1084
1077 ret = 0; 1085 ret = 0;
1078 dst_pgd = pgd_offset(dst_mm, addr); 1086 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1088 } 1096 }
1089 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1097 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1090 1098
1091 if (is_cow_mapping(vma->vm_flags)) 1099 if (is_cow)
1092 mmu_notifier_invalidate_range_end(src_mm, 1100 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1093 vma->vm_start, end);
1094 return ret; 1101 return ret;
1095} 1102}
1096 1103
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1327 if (vma->vm_file) 1334 if (vma->vm_file)
1328 uprobe_munmap(vma, start, end); 1335 uprobe_munmap(vma, start, end);
1329 1336
1330 if (unlikely(is_pfn_mapping(vma))) 1337 if (unlikely(vma->vm_flags & VM_PFNMAP))
1331 untrack_pfn_vma(vma, 0, 0); 1338 untrack_pfn(vma, 0, 0);
1332 1339
1333 if (start != end) { 1340 if (start != end) {
1334 if (unlikely(is_vm_hugetlb_page(vma))) { 1341 if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1521 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(&mm->page_table_lock);
1522 wait_split_huge_page(vma->anon_vma, pmd); 1529 wait_split_huge_page(vma->anon_vma, pmd);
1523 } else { 1530 } else {
1524 page = follow_trans_huge_pmd(mm, address, 1531 page = follow_trans_huge_pmd(vma, address,
1525 pmd, flags); 1532 pmd, flags);
1526 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(&mm->page_table_lock);
1527 goto out; 1534 goto out;
@@ -1576,12 +1583,12 @@ split_fallthrough:
1576 if (page->mapping && trylock_page(page)) { 1583 if (page->mapping && trylock_page(page)) {
1577 lru_add_drain(); /* push cached pages to LRU */ 1584 lru_add_drain(); /* push cached pages to LRU */
1578 /* 1585 /*
1579 * Because we lock page here and migration is 1586 * Because we lock page here, and migration is
1580 * blocked by the pte's page reference, we need 1587 * blocked by the pte's page reference, and we
1581 * only check for file-cache page truncation. 1588 * know the page is still mapped, we don't even
1589 * need to check for file-cache page truncation.
1582 */ 1590 */
1583 if (page->mapping) 1591 mlock_vma_page(page);
1584 mlock_vma_page(page);
1585 unlock_page(page); 1592 unlock_page(page);
1586 } 1593 }
1587 } 1594 }
@@ -2085,6 +2092,11 @@ out:
2085 * ask for a shared writable mapping! 2092 * ask for a shared writable mapping!
2086 * 2093 *
2087 * The page does not need to be reserved. 2094 * The page does not need to be reserved.
2095 *
2096 * Usually this function is called from f_op->mmap() handler
2097 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2098 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2099 * function from other places, for example from page-fault handler.
2088 */ 2100 */
2089int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2101int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2090 struct page *page) 2102 struct page *page)
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2093 return -EFAULT; 2105 return -EFAULT;
2094 if (!page_count(page)) 2106 if (!page_count(page))
2095 return -EINVAL; 2107 return -EINVAL;
2096 vma->vm_flags |= VM_INSERTPAGE; 2108 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2109 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2110 BUG_ON(vma->vm_flags & VM_PFNMAP);
2111 vma->vm_flags |= VM_MIXEDMAP;
2112 }
2097 return insert_page(vma, addr, page, vma->vm_page_prot); 2113 return insert_page(vma, addr, page, vma->vm_page_prot);
2098} 2114}
2099EXPORT_SYMBOL(vm_insert_page); 2115EXPORT_SYMBOL(vm_insert_page);
@@ -2132,7 +2148,7 @@ out:
2132 * @addr: target user address of this page 2148 * @addr: target user address of this page
2133 * @pfn: source kernel pfn 2149 * @pfn: source kernel pfn
2134 * 2150 *
2135 * Similar to vm_inert_page, this allows drivers to insert individual pages 2151 * Similar to vm_insert_page, this allows drivers to insert individual pages
2136 * they've allocated into a user vma. Same comments apply. 2152 * they've allocated into a user vma. Same comments apply.
2137 * 2153 *
2138 * This function should only be called from a vm_ops->fault handler, and 2154 * This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2162 2178
2163 if (addr < vma->vm_start || addr >= vma->vm_end) 2179 if (addr < vma->vm_start || addr >= vma->vm_end)
2164 return -EFAULT; 2180 return -EFAULT;
2165 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 2181 if (track_pfn_insert(vma, &pgprot, pfn))
2166 return -EINVAL; 2182 return -EINVAL;
2167 2183
2168 ret = insert_pfn(vma, addr, pfn, pgprot); 2184 ret = insert_pfn(vma, addr, pfn, pgprot);
2169 2185
2170 if (ret)
2171 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2172
2173 return ret; 2186 return ret;
2174} 2187}
2175EXPORT_SYMBOL(vm_insert_pfn); 2188EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2290 * rest of the world about it: 2303 * rest of the world about it:
2291 * VM_IO tells people not to look at these pages 2304 * VM_IO tells people not to look at these pages
2292 * (accesses can have side effects). 2305 * (accesses can have side effects).
2293 * VM_RESERVED is specified all over the place, because
2294 * in 2.4 it kept swapout's vma scan off this vma; but
2295 * in 2.6 the LRU scan won't even find its pages, so this
2296 * flag means no more than count its pages in reserved_vm,
2297 * and omit it from core dump, even when VM_IO turned off.
2298 * VM_PFNMAP tells the core MM that the base pages are just 2306 * VM_PFNMAP tells the core MM that the base pages are just
2299 * raw PFN mappings, and do not have a "struct page" associated 2307 * raw PFN mappings, and do not have a "struct page" associated
2300 * with them. 2308 * with them.
2309 * VM_DONTEXPAND
2310 * Disable vma merging and expanding with mremap().
2311 * VM_DONTDUMP
2312 * Omit vma from core dump, even when VM_IO turned off.
2301 * 2313 *
2302 * There's a horrible special case to handle copy-on-write 2314 * There's a horrible special case to handle copy-on-write
2303 * behaviour that some programs depend on. We mark the "original" 2315 * behaviour that some programs depend on. We mark the "original"
2304 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2316 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2317 * See vm_normal_page() for details.
2305 */ 2318 */
2306 if (addr == vma->vm_start && end == vma->vm_end) { 2319 if (is_cow_mapping(vma->vm_flags)) {
2320 if (addr != vma->vm_start || end != vma->vm_end)
2321 return -EINVAL;
2307 vma->vm_pgoff = pfn; 2322 vma->vm_pgoff = pfn;
2308 vma->vm_flags |= VM_PFN_AT_MMAP; 2323 }
2309 } else if (is_cow_mapping(vma->vm_flags))
2310 return -EINVAL;
2311
2312 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2313 2324
2314 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 2325 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2315 if (err) { 2326 if (err)
2316 /*
2317 * To indicate that track_pfn related cleanup is not
2318 * needed from higher level routine calling unmap_vmas
2319 */
2320 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2321 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2322 return -EINVAL; 2327 return -EINVAL;
2323 } 2328
2329 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2324 2330
2325 BUG_ON(addr >= end); 2331 BUG_ON(addr >= end);
2326 pfn -= addr >> PAGE_SHIFT; 2332 pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2335 } while (pgd++, addr = next, addr != end); 2341 } while (pgd++, addr = next, addr != end);
2336 2342
2337 if (err) 2343 if (err)
2338 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 2344 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2339 2345
2340 return err; 2346 return err;
2341} 2347}
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2516 spinlock_t *ptl, pte_t orig_pte) 2522 spinlock_t *ptl, pte_t orig_pte)
2517 __releases(ptl) 2523 __releases(ptl)
2518{ 2524{
2519 struct page *old_page, *new_page; 2525 struct page *old_page, *new_page = NULL;
2520 pte_t entry; 2526 pte_t entry;
2521 int ret = 0; 2527 int ret = 0;
2522 int page_mkwrite = 0; 2528 int page_mkwrite = 0;
2523 struct page *dirty_page = NULL; 2529 struct page *dirty_page = NULL;
2530 unsigned long mmun_start; /* For mmu_notifiers */
2531 unsigned long mmun_end; /* For mmu_notifiers */
2532 bool mmun_called = false; /* For mmu_notifiers */
2524 2533
2525 old_page = vm_normal_page(vma, address, orig_pte); 2534 old_page = vm_normal_page(vma, address, orig_pte);
2526 if (!old_page) { 2535 if (!old_page) {
@@ -2698,6 +2707,11 @@ gotten:
2698 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2707 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2699 goto oom_free_new; 2708 goto oom_free_new;
2700 2709
2710 mmun_start = address & PAGE_MASK;
2711 mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
2712 mmun_called = true;
2713 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2714
2701 /* 2715 /*
2702 * Re-check the pte - we dropped the lock 2716 * Re-check the pte - we dropped the lock
2703 */ 2717 */
@@ -2764,6 +2778,8 @@ gotten:
2764 page_cache_release(new_page); 2778 page_cache_release(new_page);
2765unlock: 2779unlock:
2766 pte_unmap_unlock(page_table, ptl); 2780 pte_unmap_unlock(page_table, ptl);
2781 if (mmun_called)
2782 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2767 if (old_page) { 2783 if (old_page) {
2768 /* 2784 /*
2769 * Don't let another task, with possibly unlocked vma, 2785 * Don't let another task, with possibly unlocked vma,
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2801 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2817 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2802} 2818}
2803 2819
2804static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2820static inline void unmap_mapping_range_tree(struct rb_root *root,
2805 struct zap_details *details) 2821 struct zap_details *details)
2806{ 2822{
2807 struct vm_area_struct *vma; 2823 struct vm_area_struct *vma;
2808 struct prio_tree_iter iter;
2809 pgoff_t vba, vea, zba, zea; 2824 pgoff_t vba, vea, zba, zea;
2810 2825
2811 vma_prio_tree_foreach(vma, &iter, root, 2826 vma_interval_tree_foreach(vma, root,
2812 details->first_index, details->last_index) { 2827 details->first_index, details->last_index) {
2813 2828
2814 vba = vma->vm_pgoff; 2829 vba = vma->vm_pgoff;
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2839 * across *all* the pages in each nonlinear VMA, not just the pages 2854 * across *all* the pages in each nonlinear VMA, not just the pages
2840 * whose virtual address lies outside the file truncation point. 2855 * whose virtual address lies outside the file truncation point.
2841 */ 2856 */
2842 list_for_each_entry(vma, head, shared.vm_set.list) { 2857 list_for_each_entry(vma, head, shared.nonlinear) {
2843 details->nonlinear_vma = vma; 2858 details->nonlinear_vma = vma;
2844 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2859 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2845 } 2860 }
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping,
2883 2898
2884 2899
2885 mutex_lock(&mapping->i_mmap_mutex); 2900 mutex_lock(&mapping->i_mmap_mutex);
2886 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2901 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2887 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2902 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2888 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2903 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2889 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2904 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
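
Note on the memory.c hunks: several of them pivot on is_cow_mapping() — only private-writable mappings are copy-on-write, so only they need the mmu_notifier bracket in copy_page_range(), and remap_pfn_range() only stashes the pfn in vm_pgoff for them. The check itself is just (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; the sketch below tabulates it with local stand-in flag values, not the kernel's.

/* Sketch: a mapping is COW when it may be written but is not shared. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_VM_SHARED   0x1UL
#define DEMO_VM_MAYWRITE 0x2UL

static inline bool is_cow_mapping(unsigned long flags)
{
    return (flags & (DEMO_VM_SHARED | DEMO_VM_MAYWRITE)) == DEMO_VM_MAYWRITE;
}

int main(void)
{
    struct { const char *what; unsigned long flags; } cases[] = {
        { "private read-only", 0 },
        { "private writable ", DEMO_VM_MAYWRITE },
        { "shared writable  ", DEMO_VM_SHARED | DEMO_VM_MAYWRITE },
        { "shared read-only ", DEMO_VM_SHARED },
    };

    for (int i = 0; i < 4; i++)
        printf("%s -> %s\n", cases[i].what,
               is_cow_mapping(cases[i].flags) ? "COW" : "not COW");
    return 0;
}

Only the "private writable" row is COW, which is why fork() write-protects PTEs (and so must notify secondary MMUs) for those mappings alone.
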
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..56b758ae57d2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 struct zone *zone;
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
118 __free_pages_bootmem(page, 0); 119 __free_pages_bootmem(page, 0);
120
121 zone = page_zone(page);
122 zone_span_writelock(zone);
123 zone->present_pages++;
124 zone_span_writeunlock(zone);
125 totalram_pages++;
119 } 126 }
120 127
121} 128}
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
362 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 369 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
363 BUG_ON(nr_pages % PAGES_PER_SECTION); 370 BUG_ON(nr_pages % PAGES_PER_SECTION);
364 371
372 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
373
365 sections_to_remove = nr_pages / PAGES_PER_SECTION; 374 sections_to_remove = nr_pages / PAGES_PER_SECTION;
366 for (i = 0; i < sections_to_remove; i++) { 375 for (i = 0; i < sections_to_remove; i++) {
367 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 376 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
368 release_mem_region(pfn << PAGE_SHIFT,
369 PAGES_PER_SECTION << PAGE_SHIFT);
370 ret = __remove_section(zone, __pfn_to_section(pfn)); 377 ret = __remove_section(zone, __pfn_to_section(pfn));
371 if (ret) 378 if (ret)
372 break; 379 break;
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
756 return 0; 763 return 0;
757} 764}
758 765
759static struct page *
760hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
761{
762 /* This should be improooooved!! */
763 return alloc_page(GFP_HIGHUSER_MOVABLE);
764}
765
766#define NR_OFFLINE_AT_ONCE_PAGES (256) 766#define NR_OFFLINE_AT_ONCE_PAGES (256)
767static int 767static int
768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
813 putback_lru_pages(&source); 813 putback_lru_pages(&source);
814 goto out; 814 goto out;
815 } 815 }
816 /* this function returns # of failed pages */ 816
817 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 817 /*
818 * alloc_migrate_target should be improooooved!!
819 * migrate_pages returns # of failed pages.
820 */
821 ret = migrate_pages(&source, alloc_migrate_target, 0,
818 true, MIGRATE_SYNC); 822 true, MIGRATE_SYNC);
819 if (ret) 823 if (ret)
820 putback_lru_pages(&source); 824 putback_lru_pages(&source);
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
870 return offlined; 874 return offlined;
871} 875}
872 876
873static int __ref offline_pages(unsigned long start_pfn, 877static int __ref __offline_pages(unsigned long start_pfn,
874 unsigned long end_pfn, unsigned long timeout) 878 unsigned long end_pfn, unsigned long timeout)
875{ 879{
876 unsigned long pfn, nr_pages, expire; 880 unsigned long pfn, nr_pages, expire;
@@ -970,8 +974,13 @@ repeat:
970 974
971 init_per_zone_wmark_min(); 975 init_per_zone_wmark_min();
972 976
973 if (!populated_zone(zone)) 977 if (!populated_zone(zone)) {
974 zone_pcp_reset(zone); 978 zone_pcp_reset(zone);
979 mutex_lock(&zonelists_mutex);
980 build_all_zonelists(NULL, NULL);
981 mutex_unlock(&zonelists_mutex);
982 } else
983 zone_pcp_update(zone);
975 984
976 if (!node_present_pages(node)) { 985 if (!node_present_pages(node)) {
977 node_clear_state(node, N_HIGH_MEMORY); 986 node_clear_state(node, N_HIGH_MEMORY);
@@ -998,15 +1007,55 @@ out:
998 return ret; 1007 return ret;
999} 1008}
1000 1009
1010int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1011{
1012 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1013}
1014
1001int remove_memory(u64 start, u64 size) 1015int remove_memory(u64 start, u64 size)
1002{ 1016{
1017 struct memory_block *mem = NULL;
1018 struct mem_section *section;
1003 unsigned long start_pfn, end_pfn; 1019 unsigned long start_pfn, end_pfn;
1020 unsigned long pfn, section_nr;
1021 int ret;
1004 1022
1005 start_pfn = PFN_DOWN(start); 1023 start_pfn = PFN_DOWN(start);
1006 end_pfn = start_pfn + PFN_DOWN(size); 1024 end_pfn = start_pfn + PFN_DOWN(size);
1007 return offline_pages(start_pfn, end_pfn, 120 * HZ); 1025
1026 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1027 section_nr = pfn_to_section_nr(pfn);
1028 if (!present_section_nr(section_nr))
1029 continue;
1030
1031 section = __nr_to_section(section_nr);
1032 /* same memblock? */
1033 if (mem)
1034 if ((section_nr >= mem->start_section_nr) &&
1035 (section_nr <= mem->end_section_nr))
1036 continue;
1037
1038 mem = find_memory_block_hinted(section, mem);
1039 if (!mem)
1040 continue;
1041
1042 ret = offline_memory_block(mem);
1043 if (ret) {
1044 kobject_put(&mem->dev.kobj);
1045 return ret;
1046 }
1047 }
1048
1049 if (mem)
1050 kobject_put(&mem->dev.kobj);
1051
1052 return 0;
1008} 1053}
1009#else 1054#else
1055int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1056{
1057 return -EINVAL;
1058}
1010int remove_memory(u64 start, u64 size) 1059int remove_memory(u64 start, u64 size)
1011{ 1060{
1012 return -EINVAL; 1061 return -EINVAL;
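
Note on the memory_hotplug.c hunks: the new remove_memory() walks the pfn range one memory section at a time, skips sections that fall inside the memory block it has already handled, and offlines each block once. A simplified userspace sketch of that walk; the section and block sizes and the block lookup are stand-ins for the sparse-memory machinery.

/* Sketch: iterate by section, act once per block of consecutive sections. */
#include <stdio.h>

#define PAGES_PER_SECTION_DEMO  4096UL  /* pfns per section (illustrative) */
#define SECTIONS_PER_BLOCK_DEMO 8UL     /* sections per memory block */

static unsigned long section_of(unsigned long pfn)
{
    return pfn / PAGES_PER_SECTION_DEMO;
}

static unsigned long block_of(unsigned long section_nr)
{
    return section_nr / SECTIONS_PER_BLOCK_DEMO;
}

int main(void)
{
    unsigned long start_pfn = 0x40000, end_pfn = 0x60000;
    long current_block = -1;

    for (unsigned long pfn = start_pfn; pfn < end_pfn;
         pfn += PAGES_PER_SECTION_DEMO) {
        unsigned long sec = section_of(pfn);
        long blk = (long)block_of(sec);

        if (blk == current_block)
            continue;               /* same block, already handled */
        current_block = blk;
        printf("offlining memory block %ld (first section %lu)\n", blk, sec);
    }
    return 0;
}
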
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..0b78fb9ea65b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/*
611 * Apply policy to a single VMA
612 * This must be called with the mmap_sem held for writing.
613 */
614static int vma_replace_policy(struct vm_area_struct *vma,
615 struct mempolicy *pol)
616{
617 int err;
618 struct mempolicy *old;
619 struct mempolicy *new;
620
621 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
622 vma->vm_start, vma->vm_end, vma->vm_pgoff,
623 vma->vm_ops, vma->vm_file,
624 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
625
626 new = mpol_dup(pol);
627 if (IS_ERR(new))
628 return PTR_ERR(new);
629
630 if (vma->vm_ops && vma->vm_ops->set_policy) {
631 err = vma->vm_ops->set_policy(vma, new);
632 if (err)
633 goto err_out;
634 }
635
636 old = vma->vm_policy;
637 vma->vm_policy = new; /* protected by mmap_sem */
638 mpol_put(old);
639
640 return 0;
641 err_out:
642 mpol_put(new);
643 return err;
644}
645
610/* Step 2: apply policy to a range and do splits. */ 646/* Step 2: apply policy to a range and do splits. */
611static int mbind_range(struct mm_struct *mm, unsigned long start, 647static int mbind_range(struct mm_struct *mm, unsigned long start,
612 unsigned long end, struct mempolicy *new_pol) 648 unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
655 if (err) 691 if (err)
656 goto out; 692 goto out;
657 } 693 }
658 694 err = vma_replace_policy(vma, new_pol);
659 /* 695 if (err)
660 * Apply policy to a single VMA. The reference counting of 696 goto out;
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
675 } 697 }
676 698
677 out: 699 out:
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 946 nodemask_t nmask;
925 LIST_HEAD(pagelist); 947 LIST_HEAD(pagelist);
926 int err = 0; 948 int err = 0;
927 struct vm_area_struct *vma;
928 949
929 nodes_clear(nmask); 950 nodes_clear(nmask);
930 node_set(source, nmask); 951 node_set(source, nmask);
931 952
932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 953 /*
954 * This does not "check" the range but isolates all pages that
955 * need migration. Between passing in the full user address
956 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
957 */
958 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
959 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
933 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 960 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
936 961
937 if (!list_empty(&pagelist)) { 962 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 963 err = migrate_pages(&pagelist, new_node_page, dest,
@@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1530 addr); 1555 addr);
1531 if (vpol) 1556 if (vpol)
1532 pol = vpol; 1557 pol = vpol;
1533 } else if (vma->vm_policy) 1558 } else if (vma->vm_policy) {
1534 pol = vma->vm_policy; 1559 pol = vma->vm_policy;
1560
1561 /*
1562 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1563 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1564 * count on these policies which will be dropped by
1565 * mpol_cond_put() later
1566 */
1567 if (mpol_needs_cond_ref(pol))
1568 mpol_get(pol);
1569 }
1535 } 1570 }
1536 if (!pol) 1571 if (!pol)
1537 pol = &default_policy; 1572 pol = &default_policy;
@@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2061 */ 2096 */
2062 2097
2063/* lookup first element intersecting start-end */ 2098/* lookup first element intersecting start-end */
2064/* Caller holds sp->lock */ 2099/* Caller holds sp->mutex */
2065static struct sp_node * 2100static struct sp_node *
2066sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2101sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2067{ 2102{
@@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2125 2160
2126 if (!sp->root.rb_node) 2161 if (!sp->root.rb_node)
2127 return NULL; 2162 return NULL;
2128 spin_lock(&sp->lock); 2163 mutex_lock(&sp->mutex);
2129 sn = sp_lookup(sp, idx, idx+1); 2164 sn = sp_lookup(sp, idx, idx+1);
2130 if (sn) { 2165 if (sn) {
2131 mpol_get(sn->policy); 2166 mpol_get(sn->policy);
2132 pol = sn->policy; 2167 pol = sn->policy;
2133 } 2168 }
2134 spin_unlock(&sp->lock); 2169 mutex_unlock(&sp->mutex);
2135 return pol; 2170 return pol;
2136} 2171}
2137 2172
2173static void sp_free(struct sp_node *n)
2174{
2175 mpol_put(n->policy);
2176 kmem_cache_free(sn_cache, n);
2177}
2178
2138static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2179static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2139{ 2180{
2140 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2181 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2141 rb_erase(&n->nd, &sp->root); 2182 rb_erase(&n->nd, &sp->root);
2142 mpol_put(n->policy); 2183 sp_free(n);
2143 kmem_cache_free(sn_cache, n);
2144} 2184}
2145 2185
2146static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2186static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2147 struct mempolicy *pol) 2187 struct mempolicy *pol)
2148{ 2188{
2149 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2189 struct sp_node *n;
2190 struct mempolicy *newpol;
2150 2191
2192 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2151 if (!n) 2193 if (!n)
2152 return NULL; 2194 return NULL;
2195
2196 newpol = mpol_dup(pol);
2197 if (IS_ERR(newpol)) {
2198 kmem_cache_free(sn_cache, n);
2199 return NULL;
2200 }
2201 newpol->flags |= MPOL_F_SHARED;
2202
2153 n->start = start; 2203 n->start = start;
2154 n->end = end; 2204 n->end = end;
2155 mpol_get(pol); 2205 n->policy = newpol;
2156 pol->flags |= MPOL_F_SHARED; /* for unref */ 2206
2157 n->policy = pol;
2158 return n; 2207 return n;
2159} 2208}
2160 2209
@@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2162static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2211static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2163 unsigned long end, struct sp_node *new) 2212 unsigned long end, struct sp_node *new)
2164{ 2213{
2165 struct sp_node *n, *new2 = NULL; 2214 struct sp_node *n;
2215 int ret = 0;
2166 2216
2167restart: 2217 mutex_lock(&sp->mutex);
2168 spin_lock(&sp->lock);
2169 n = sp_lookup(sp, start, end); 2218 n = sp_lookup(sp, start, end);
2170 /* Take care of old policies in the same range. */ 2219 /* Take care of old policies in the same range. */
2171 while (n && n->start < end) { 2220 while (n && n->start < end) {
@@ -2178,16 +2227,14 @@ restart:
2178 } else { 2227 } else {
2179 /* Old policy spanning whole new range. */ 2228 /* Old policy spanning whole new range. */
2180 if (n->end > end) { 2229 if (n->end > end) {
2230 struct sp_node *new2;
2231 new2 = sp_alloc(end, n->end, n->policy);
2181 if (!new2) { 2232 if (!new2) {
2182 spin_unlock(&sp->lock); 2233 ret = -ENOMEM;
2183 new2 = sp_alloc(end, n->end, n->policy); 2234 goto out;
2184 if (!new2)
2185 return -ENOMEM;
2186 goto restart;
2187 } 2235 }
2188 n->end = start; 2236 n->end = start;
2189 sp_insert(sp, new2); 2237 sp_insert(sp, new2);
2190 new2 = NULL;
2191 break; 2238 break;
2192 } else 2239 } else
2193 n->end = start; 2240 n->end = start;
@@ -2198,12 +2245,9 @@ restart:
2198 } 2245 }
2199 if (new) 2246 if (new)
2200 sp_insert(sp, new); 2247 sp_insert(sp, new);
2201 spin_unlock(&sp->lock); 2248out:
2202 if (new2) { 2249 mutex_unlock(&sp->mutex);
2203 mpol_put(new2->policy); 2250 return ret;
2204 kmem_cache_free(sn_cache, new2);
2205 }
2206 return 0;
2207} 2251}
2208 2252
2209/** 2253/**
@@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2221 int ret; 2265 int ret;
2222 2266
2223 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2267 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2224 spin_lock_init(&sp->lock); 2268 mutex_init(&sp->mutex);
2225 2269
2226 if (mpol) { 2270 if (mpol) {
2227 struct vm_area_struct pvma; 2271 struct vm_area_struct pvma;
@@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2275 } 2319 }
2276 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2320 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2277 if (err && new) 2321 if (err && new)
2278 kmem_cache_free(sn_cache, new); 2322 sp_free(new);
2279 return err; 2323 return err;
2280} 2324}
2281 2325
@@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2287 2331
2288 if (!p->root.rb_node) 2332 if (!p->root.rb_node)
2289 return; 2333 return;
2290 spin_lock(&p->lock); 2334 mutex_lock(&p->mutex);
2291 next = rb_first(&p->root); 2335 next = rb_first(&p->root);
2292 while (next) { 2336 while (next) {
2293 n = rb_entry(next, struct sp_node, nd); 2337 n = rb_entry(next, struct sp_node, nd);
2294 next = rb_next(&n->nd); 2338 next = rb_next(&n->nd);
2295 rb_erase(&n->nd, &p->root); 2339 sp_delete(p, n);
2296 mpol_put(n->policy);
2297 kmem_cache_free(sn_cache, n);
2298 } 2340 }
2299 spin_unlock(&p->lock); 2341 mutex_unlock(&p->mutex);
2300} 2342}
2301 2343
2302/* assumes fs == KERNEL_DS */ 2344/* assumes fs == KERNEL_DS */
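
Note on the mempolicy.c hunks: converting shared_policy from a spinlock to a mutex lets shared_policy_replace() allocate the split node directly under the lock, removing the unlock/allocate/restart loop. The range surgery itself is unchanged: an old node overlapping [start, end) keeps its head, the new policy takes the middle, and any tail past end becomes a freshly allocated node. A toy sketch of that splitting, with a small array in place of the sp_node rb-tree and all values invented.

/* Sketch: replace [start, end) inside one existing policy range. */
#include <stdio.h>

struct range { unsigned long start, end; int policy; };

int main(void)
{
    struct range old = { 0, 100, 1 };       /* one node spanning the file */
    unsigned long start = 30, end = 60;     /* incoming range */
    int new_policy = 2;

    struct range pieces[3];
    int n = 0;

    if (old.start < start)                  /* head keeps the old policy */
        pieces[n++] = (struct range){ old.start, start, old.policy };
    pieces[n++] = (struct range){ start, end, new_policy };
    if (old.end > end)                      /* tail split off, old policy */
        pieces[n++] = (struct range){ end, old.end, old.policy };

    for (int i = 0; i < n; i++)
        printf("[%3lu, %3lu) -> policy %d\n",
               pieces[i].start, pieces[i].end, pieces[i].policy);
    return 0;
}
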
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
51/* 51/*
52 * LRU accounting for clear_page_mlock() 52 * LRU accounting for clear_page_mlock()
53 */ 53 */
54void __clear_page_mlock(struct page *page) 54void clear_page_mlock(struct page *page)
55{ 55{
56 VM_BUG_ON(!PageLocked(page)); 56 if (!TestClearPageMlocked(page))
57
58 if (!page->mapping) { /* truncated ? */
59 return; 57 return;
60 }
61 58
62 dec_zone_page_state(page, NR_MLOCK); 59 mod_zone_page_state(page_zone(page), NR_MLOCK,
60 -hpage_nr_pages(page));
63 count_vm_event(UNEVICTABLE_PGCLEARED); 61 count_vm_event(UNEVICTABLE_PGCLEARED);
64 if (!isolate_lru_page(page)) { 62 if (!isolate_lru_page(page)) {
65 putback_lru_page(page); 63 putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
81 BUG_ON(!PageLocked(page)); 79 BUG_ON(!PageLocked(page));
82 80
83 if (!TestSetPageMlocked(page)) { 81 if (!TestSetPageMlocked(page)) {
84 inc_zone_page_state(page, NR_MLOCK); 82 mod_zone_page_state(page_zone(page), NR_MLOCK,
83 hpage_nr_pages(page));
85 count_vm_event(UNEVICTABLE_PGMLOCKED); 84 count_vm_event(UNEVICTABLE_PGMLOCKED);
86 if (!isolate_lru_page(page)) 85 if (!isolate_lru_page(page))
87 putback_lru_page(page); 86 putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
108 BUG_ON(!PageLocked(page)); 107 BUG_ON(!PageLocked(page));
109 108
110 if (TestClearPageMlocked(page)) { 109 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 110 mod_zone_page_state(page_zone(page), NR_MLOCK,
111 -hpage_nr_pages(page));
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 113 int ret = SWAP_AGAIN;
114 114
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock; 228 goto no_mlock;
229 229
230 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) || 231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) { 232 vma == get_gate_vma(current->mm))) {
233 233
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
291 if (page && !IS_ERR(page)) { 291 if (page && !IS_ERR(page)) {
292 lock_page(page); 292 lock_page(page);
293 /* 293 munlock_vma_page(page);
294 * Like in __mlock_vma_pages_range(),
295 * because we lock page here and migration is
296 * blocked by the elevated reference, we need
297 * only check for file-cache page truncation.
298 */
299 if (page->mapping)
300 munlock_vma_page(page);
301 unlock_page(page); 294 unlock_page(page);
302 put_page(page); 295 put_page(page);
303 } 296 }
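
Note on the mlock.c hunks: the NR_MLOCK updates switch to mod_zone_page_state() with ±hpage_nr_pages(page), so a transparent huge page moves the counter by its full number of base pages rather than by one, and clear_page_mlock() now tests and clears the flag itself. A toy userspace sketch of that accounting; the structures and the 512-page figure for a 2 MiB THP are illustrative stand-ins for the zone statistics.

/* Sketch: move the counter by the page's base-page count, under the same
 * test-and-set / test-and-clear discipline the kernel flags use. */
#include <stdio.h>
#include <stdbool.h>

struct demo_page {
    bool mlocked;
    bool huge;
};

static long nr_mlock;   /* stands in for the per-zone NR_MLOCK counter */

static int nr_base_pages(const struct demo_page *page)
{
    return page->huge ? 512 : 1;    /* hpage_nr_pages() analogue */
}

static void mlock_page(struct demo_page *page)
{
    if (!page->mlocked) {           /* TestSetPageMlocked() analogue */
        page->mlocked = true;
        nr_mlock += nr_base_pages(page);
    }
}

static void munlock_page(struct demo_page *page)
{
    if (page->mlocked) {            /* TestClearPageMlocked() analogue */
        page->mlocked = false;
        nr_mlock -= nr_base_pages(page);
    }
}

int main(void)
{
    struct demo_page small = { false, false }, thp = { false, true };

    mlock_page(&small);
    mlock_page(&thp);
    printf("NR_MLOCK after mlock: %ld base pages\n", nr_mlock);

    munlock_page(&thp);
    munlock_page(&small);
    printf("NR_MLOCK after munlock: %ld base pages\n", nr_mlock);
    return 0;
}
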
diff --git a/mm/mmap.c b/mm/mmap.c
index 872441e81914..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
51 struct vm_area_struct *vma, struct vm_area_struct *prev, 51 struct vm_area_struct *vma, struct vm_area_struct *prev,
52 unsigned long start, unsigned long end); 52 unsigned long start, unsigned long end);
53 53
54/*
55 * WARNING: the debugging will use recursive algorithms so never enable this
56 * unless you know what you are doing.
57 */
58#undef DEBUG_MM_RB
59
60/* description of effects of mapping type and prot in current implementation. 54/* description of effects of mapping type and prot in current implementation.
61 * this is due to the limited x86 page protection hardware. The expected 55 * this is due to the limited x86 page protection hardware. The expected
62 * behavior is in parens: 56 * behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
199 193
200 flush_dcache_mmap_lock(mapping); 194 flush_dcache_mmap_lock(mapping);
201 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 195 if (unlikely(vma->vm_flags & VM_NONLINEAR))
202 list_del_init(&vma->shared.vm_set.list); 196 list_del_init(&vma->shared.nonlinear);
203 else 197 else
204 vma_prio_tree_remove(vma, &mapping->i_mmap); 198 vma_interval_tree_remove(vma, &mapping->i_mmap);
205 flush_dcache_mmap_unlock(mapping); 199 flush_dcache_mmap_unlock(mapping);
206} 200}
207 201
208/* 202/*
209 * Unlink a file-based vm structure from its prio_tree, to hide 203 * Unlink a file-based vm structure from its interval tree, to hide
210 * vma from rmap and vmtruncate before freeing its page tables. 204 * vma from rmap and vmtruncate before freeing its page tables.
211 */ 205 */
212void unlink_file_vma(struct vm_area_struct *vma) 206void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
231 might_sleep(); 225 might_sleep();
232 if (vma->vm_ops && vma->vm_ops->close) 226 if (vma->vm_ops && vma->vm_ops->close)
233 vma->vm_ops->close(vma); 227 vma->vm_ops->close(vma);
234 if (vma->vm_file) { 228 if (vma->vm_file)
235 fput(vma->vm_file); 229 fput(vma->vm_file);
236 if (vma->vm_flags & VM_EXECUTABLE)
237 removed_exe_file_vma(vma->vm_mm);
238 }
239 mpol_put(vma_policy(vma)); 230 mpol_put(vma_policy(vma));
240 kmem_cache_free(vm_area_cachep, vma); 231 kmem_cache_free(vm_area_cachep, vma);
241 return next; 232 return next;
@@ -306,7 +297,7 @@ out:
306 return retval; 297 return retval;
307} 298}
308 299
309#ifdef DEBUG_MM_RB 300#ifdef CONFIG_DEBUG_VM_RB
310static int browse_rb(struct rb_root *root) 301static int browse_rb(struct rb_root *root)
311{ 302{
312 int i = 0, j; 303 int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
340{ 331{
341 int bug = 0; 332 int bug = 0;
342 int i = 0; 333 int i = 0;
343 struct vm_area_struct *tmp = mm->mmap; 334 struct vm_area_struct *vma = mm->mmap;
344 while (tmp) { 335 while (vma) {
345 tmp = tmp->vm_next; 336 struct anon_vma_chain *avc;
337 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
338 anon_vma_interval_tree_verify(avc);
339 vma = vma->vm_next;
346 i++; 340 i++;
347 } 341 }
348 if (i != mm->map_count) 342 if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
356#define validate_mm(mm) do { } while (0) 350#define validate_mm(mm) do { } while (0)
357#endif 351#endif
358 352
359static struct vm_area_struct * 353/*
360find_vma_prepare(struct mm_struct *mm, unsigned long addr, 354 * vma has some anon_vma assigned, and is already inserted on that
361 struct vm_area_struct **pprev, struct rb_node ***rb_link, 355 * anon_vma's interval trees.
362 struct rb_node ** rb_parent) 356 *
357 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
358 * vma must be removed from the anon_vma's interval trees using
359 * anon_vma_interval_tree_pre_update_vma().
360 *
361 * After the update, the vma will be reinserted using
362 * anon_vma_interval_tree_post_update_vma().
363 *
364 * The entire update must be protected by exclusive mmap_sem and by
365 * the root anon_vma's mutex.
366 */
367static inline void
368anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
363{ 369{
364 struct vm_area_struct * vma; 370 struct anon_vma_chain *avc;
365 struct rb_node ** __rb_link, * __rb_parent, * rb_prev; 371
372 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
373 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
374}
375
376static inline void
377anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
378{
379 struct anon_vma_chain *avc;
380
381 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
382 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
383}
384
385static int find_vma_links(struct mm_struct *mm, unsigned long addr,
386 unsigned long end, struct vm_area_struct **pprev,
387 struct rb_node ***rb_link, struct rb_node **rb_parent)
388{
389 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
366 390
367 __rb_link = &mm->mm_rb.rb_node; 391 __rb_link = &mm->mm_rb.rb_node;
368 rb_prev = __rb_parent = NULL; 392 rb_prev = __rb_parent = NULL;
369 vma = NULL;
370 393
371 while (*__rb_link) { 394 while (*__rb_link) {
372 struct vm_area_struct *vma_tmp; 395 struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
375 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 398 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
376 399
377 if (vma_tmp->vm_end > addr) { 400 if (vma_tmp->vm_end > addr) {
378 vma = vma_tmp; 401 /* Fail if an existing vma overlaps the area */
379 if (vma_tmp->vm_start <= addr) 402 if (vma_tmp->vm_start < end)
380 break; 403 return -ENOMEM;
381 __rb_link = &__rb_parent->rb_left; 404 __rb_link = &__rb_parent->rb_left;
382 } else { 405 } else {
383 rb_prev = __rb_parent; 406 rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
390 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 413 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
391 *rb_link = __rb_link; 414 *rb_link = __rb_link;
392 *rb_parent = __rb_parent; 415 *rb_parent = __rb_parent;
393 return vma; 416 return 0;
394} 417}
395 418
396void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 419void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
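
Note on the hunk above: anon_vma_interval_tree_pre_update_vma()/post_update_vma() exist because vm_start/vm_end/vm_pgoff are the keys of the vma's interval-tree entries, so the vma has to come off every tree before those fields change and be reinserted afterwards. A userspace sketch of that remove/mutate/reinsert discipline, using a sorted array keyed by start as a stand-in for the interval tree; names and sizes are illustrative.

/* Sketch: never mutate a key while the element sits in a sorted container. */
#include <stdio.h>
#include <string.h>

struct entry { unsigned long start, end; };

static void insert_sorted(struct entry *arr, int *n, struct entry e)
{
    int i = *n;

    while (i > 0 && arr[i - 1].start > e.start) {   /* keep key order */
        arr[i] = arr[i - 1];
        i--;
    }
    arr[i] = e;
    (*n)++;
}

static void remove_at(struct entry *arr, int *n, int idx)
{
    memmove(&arr[idx], &arr[idx + 1], (*n - idx - 1) * sizeof(*arr));
    (*n)--;
}

int main(void)
{
    struct entry tree[4];
    int n = 0;

    insert_sorted(tree, &n, (struct entry){ 0x1000, 0x2000 });
    insert_sorted(tree, &n, (struct entry){ 0x5000, 0x6000 });

    /* "pre-update": take the entry out before touching its key */
    struct entry vma = tree[0];
    remove_at(tree, &n, 0);

    vma.start = 0x7000;             /* mutate the key fields */
    vma.end = 0x8000;

    /* "post-update": reinsert so ordering invariants still hold */
    insert_sorted(tree, &n, vma);

    for (int i = 0; i < n; i++)
        printf("[%#lx, %#lx)\n", tree[i].start, tree[i].end);
    return 0;
}
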
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
417 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 440 if (unlikely(vma->vm_flags & VM_NONLINEAR))
418 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 441 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
419 else 442 else
420 vma_prio_tree_insert(vma, &mapping->i_mmap); 443 vma_interval_tree_insert(vma, &mapping->i_mmap);
421 flush_dcache_mmap_unlock(mapping); 444 flush_dcache_mmap_unlock(mapping);
422 } 445 }
423} 446}
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
455 478
456/* 479/*
457 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 480 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
458 * mm's list and rbtree. It has already been inserted into the prio_tree. 481 * mm's list and rbtree. It has already been inserted into the interval tree.
459 */ 482 */
460static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 483static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
461{ 484{
462 struct vm_area_struct *__vma, *prev; 485 struct vm_area_struct *prev;
463 struct rb_node **rb_link, *rb_parent; 486 struct rb_node **rb_link, *rb_parent;
464 487
465 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 488 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
466 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 489 &prev, &rb_link, &rb_parent))
490 BUG();
467 __vma_link(mm, vma, prev, rb_link, rb_parent); 491 __vma_link(mm, vma, prev, rb_link, rb_parent);
468 mm->map_count++; 492 mm->map_count++;
469} 493}
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
496 struct vm_area_struct *next = vma->vm_next; 520 struct vm_area_struct *next = vma->vm_next;
497 struct vm_area_struct *importer = NULL; 521 struct vm_area_struct *importer = NULL;
498 struct address_space *mapping = NULL; 522 struct address_space *mapping = NULL;
499 struct prio_tree_root *root = NULL; 523 struct rb_root *root = NULL;
500 struct anon_vma *anon_vma = NULL; 524 struct anon_vma *anon_vma = NULL;
501 struct file *file = vma->vm_file; 525 struct file *file = vma->vm_file;
502 long adjust_next = 0; 526 long adjust_next = 0;
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end);
559 mutex_lock(&mapping->i_mmap_mutex); 583 mutex_lock(&mapping->i_mmap_mutex);
560 if (insert) { 584 if (insert) {
561 /* 585 /*
562 * Put into prio_tree now, so instantiated pages 586 * Put into interval tree now, so instantiated pages
563 * are visible to arm/parisc __flush_dcache_page 587 * are visible to arm/parisc __flush_dcache_page
564 * throughout; but we cannot insert into address 588 * throughout; but we cannot insert into address
565 * space until vma start or end is updated. 589 * space until vma start or end is updated.
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end);
570 594
571 vma_adjust_trans_huge(vma, start, end, adjust_next); 595 vma_adjust_trans_huge(vma, start, end, adjust_next);
572 596
573 /* 597 anon_vma = vma->anon_vma;
574 * When changing only vma->vm_end, we don't really need anon_vma 598 if (!anon_vma && adjust_next)
575 * lock. This is a fairly rare case by itself, but the anon_vma 599 anon_vma = next->anon_vma;
576 * lock may be shared between many sibling processes. Skipping 600 if (anon_vma) {
577 * the lock for brk adjustments makes a difference sometimes. 601 VM_BUG_ON(adjust_next && next->anon_vma &&
578 */ 602 anon_vma != next->anon_vma);
579 if (vma->anon_vma && (importer || start != vma->vm_start)) {
580 anon_vma = vma->anon_vma;
581 anon_vma_lock(anon_vma); 603 anon_vma_lock(anon_vma);
604 anon_vma_interval_tree_pre_update_vma(vma);
605 if (adjust_next)
606 anon_vma_interval_tree_pre_update_vma(next);
582 } 607 }
583 608
584 if (root) { 609 if (root) {
585 flush_dcache_mmap_lock(mapping); 610 flush_dcache_mmap_lock(mapping);
586 vma_prio_tree_remove(vma, root); 611 vma_interval_tree_remove(vma, root);
587 if (adjust_next) 612 if (adjust_next)
588 vma_prio_tree_remove(next, root); 613 vma_interval_tree_remove(next, root);
589 } 614 }
590 615
591 vma->vm_start = start; 616 vma->vm_start = start;
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end);
598 623
599 if (root) { 624 if (root) {
600 if (adjust_next) 625 if (adjust_next)
601 vma_prio_tree_insert(next, root); 626 vma_interval_tree_insert(next, root);
602 vma_prio_tree_insert(vma, root); 627 vma_interval_tree_insert(vma, root);
603 flush_dcache_mmap_unlock(mapping); 628 flush_dcache_mmap_unlock(mapping);
604 } 629 }
605 630
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end);
620 __insert_vm_struct(mm, insert); 645 __insert_vm_struct(mm, insert);
621 } 646 }
622 647
623 if (anon_vma) 648 if (anon_vma) {
649 anon_vma_interval_tree_post_update_vma(vma);
650 if (adjust_next)
651 anon_vma_interval_tree_post_update_vma(next);
624 anon_vma_unlock(anon_vma); 652 anon_vma_unlock(anon_vma);
653 }
625 if (mapping) 654 if (mapping)
626 mutex_unlock(&mapping->i_mmap_mutex); 655 mutex_unlock(&mapping->i_mmap_mutex);
627 656
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end);
636 if (file) { 665 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end); 666 uprobe_munmap(next, next->vm_start, next->vm_end);
638 fput(file); 667 fput(file);
639 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm);
641 } 668 }
642 if (next->anon_vma) 669 if (next->anon_vma)
643 anon_vma_merge(vma, next); 670 anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 696static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 697 struct file *file, unsigned long vm_flags)
671{ 698{
672 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ 699 if (vma->vm_flags ^ vm_flags)
673 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
674 return 0; 700 return 0;
675 if (vma->vm_file != file) 701 if (vma->vm_file != file)
676 return 0; 702 return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
951 mm->exec_vm += pages; 977 mm->exec_vm += pages;
952 } else if (flags & stack_flags) 978 } else if (flags & stack_flags)
953 mm->stack_vm += pages; 979 mm->stack_vm += pages;
954 if (flags & (VM_RESERVED|VM_IO))
955 mm->reserved_vm += pages;
956} 980}
957#endif /* CONFIG_PROC_FS */ 981#endif /* CONFIG_PROC_FS */
958 982
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1190 return 0; 1214 return 0;
1191 1215
1192 /* Specialty mapping? */ 1216 /* Specialty mapping? */
1193 if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) 1217 if (vm_flags & VM_PFNMAP)
1194 return 0; 1218 return 0;
1195 1219
1196 /* Can the mapping track the dirty pages? */ 1220 /* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1229 /* Clear old maps */ 1253 /* Clear old maps */
1230 error = -ENOMEM; 1254 error = -ENOMEM;
1231munmap_back: 1255munmap_back:
1232 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 1256 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1233 if (vma && vma->vm_start < addr + len) {
1234 if (do_munmap(mm, addr, len)) 1257 if (do_munmap(mm, addr, len))
1235 return -ENOMEM; 1258 return -ENOMEM;
1236 goto munmap_back; 1259 goto munmap_back;
@@ -1305,8 +1328,6 @@ munmap_back:
1305 error = file->f_op->mmap(file, vma); 1328 error = file->f_op->mmap(file, vma);
1306 if (error) 1329 if (error)
1307 goto unmap_and_free_vma; 1330 goto unmap_and_free_vma;
1308 if (vm_flags & VM_EXECUTABLE)
1309 added_exe_file_vma(mm);
1310 1331
1311 /* Can addr have changed?? 1332 /* Can addr have changed??
1312 * 1333 *
@@ -1757,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1757 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 1778 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1758 error = acct_stack_growth(vma, size, grow); 1779 error = acct_stack_growth(vma, size, grow);
1759 if (!error) { 1780 if (!error) {
1781 anon_vma_interval_tree_pre_update_vma(vma);
1760 vma->vm_end = address; 1782 vma->vm_end = address;
1783 anon_vma_interval_tree_post_update_vma(vma);
1761 perf_event_mmap(vma); 1784 perf_event_mmap(vma);
1762 } 1785 }
1763 } 1786 }
1764 } 1787 }
1765 vma_unlock_anon_vma(vma); 1788 vma_unlock_anon_vma(vma);
1766 khugepaged_enter_vma_merge(vma); 1789 khugepaged_enter_vma_merge(vma);
1790 validate_mm(vma->vm_mm);
1767 return error; 1791 return error;
1768} 1792}
1769#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1793#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1807,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
1807 if (grow <= vma->vm_pgoff) { 1831 if (grow <= vma->vm_pgoff) {
1808 error = acct_stack_growth(vma, size, grow); 1832 error = acct_stack_growth(vma, size, grow);
1809 if (!error) { 1833 if (!error) {
1834 anon_vma_interval_tree_pre_update_vma(vma);
1810 vma->vm_start = address; 1835 vma->vm_start = address;
1811 vma->vm_pgoff -= grow; 1836 vma->vm_pgoff -= grow;
1837 anon_vma_interval_tree_post_update_vma(vma);
1812 perf_event_mmap(vma); 1838 perf_event_mmap(vma);
1813 } 1839 }
1814 } 1840 }
1815 } 1841 }
1816 vma_unlock_anon_vma(vma); 1842 vma_unlock_anon_vma(vma);
1817 khugepaged_enter_vma_merge(vma); 1843 khugepaged_enter_vma_merge(vma);
1844 validate_mm(vma->vm_mm);
1818 return error; 1845 return error;
1819} 1846}
1820 1847
@@ -1988,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1988 if (anon_vma_clone(new, vma)) 2015 if (anon_vma_clone(new, vma))
1989 goto out_free_mpol; 2016 goto out_free_mpol;
1990 2017
1991 if (new->vm_file) { 2018 if (new->vm_file)
1992 get_file(new->vm_file); 2019 get_file(new->vm_file);
1993 if (vma->vm_flags & VM_EXECUTABLE)
1994 added_exe_file_vma(mm);
1995 }
1996 2020
1997 if (new->vm_ops && new->vm_ops->open) 2021 if (new->vm_ops && new->vm_ops->open)
1998 new->vm_ops->open(new); 2022 new->vm_ops->open(new);
@@ -2010,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2010 /* Clean everything up if vma_adjust failed. */ 2034 /* Clean everything up if vma_adjust failed. */
2011 if (new->vm_ops && new->vm_ops->close) 2035 if (new->vm_ops && new->vm_ops->close)
2012 new->vm_ops->close(new); 2036 new->vm_ops->close(new);
2013 if (new->vm_file) { 2037 if (new->vm_file)
2014 if (vma->vm_flags & VM_EXECUTABLE)
2015 removed_exe_file_vma(mm);
2016 fput(new->vm_file); 2038 fput(new->vm_file);
2017 }
2018 unlink_anon_vmas(new); 2039 unlink_anon_vmas(new);
2019 out_free_mpol: 2040 out_free_mpol:
2020 mpol_put(pol); 2041 mpol_put(pol);
@@ -2199,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2199 * Clear old maps. this also does some error checking for us 2220 * Clear old maps. this also does some error checking for us
2200 */ 2221 */
2201 munmap_back: 2222 munmap_back:
2202 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2223 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2203 if (vma && vma->vm_start < addr + len) {
2204 if (do_munmap(mm, addr, len)) 2224 if (do_munmap(mm, addr, len))
2205 return -ENOMEM; 2225 return -ENOMEM;
2206 goto munmap_back; 2226 goto munmap_back;
@@ -2314,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm)
2314 * and into the inode's i_mmap tree. If vm_file is non-NULL 2334 * and into the inode's i_mmap tree. If vm_file is non-NULL
2315 * then i_mmap_mutex is taken here. 2335 * then i_mmap_mutex is taken here.
2316 */ 2336 */
2317int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2337int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2318{ 2338{
2319 struct vm_area_struct * __vma, * prev; 2339 struct vm_area_struct *prev;
2320 struct rb_node ** rb_link, * rb_parent; 2340 struct rb_node **rb_link, *rb_parent;
2321 2341
2322 /* 2342 /*
2323 * The vm_pgoff of a purely anonymous vma should be irrelevant 2343 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2335,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2335 BUG_ON(vma->anon_vma); 2355 BUG_ON(vma->anon_vma);
2336 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2356 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2337 } 2357 }
2338 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); 2358 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2339 if (__vma && __vma->vm_start < vma->vm_end) 2359 &prev, &rb_link, &rb_parent))
2340 return -ENOMEM; 2360 return -ENOMEM;
2341 if ((vma->vm_flags & VM_ACCOUNT) && 2361 if ((vma->vm_flags & VM_ACCOUNT) &&
2342 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2362 security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2351,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2351 * prior to moving page table entries, to effect an mremap move. 2371 * prior to moving page table entries, to effect an mremap move.
2352 */ 2372 */
2353struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2354 unsigned long addr, unsigned long len, pgoff_t pgoff) 2374 unsigned long addr, unsigned long len, pgoff_t pgoff,
2375 bool *need_rmap_locks)
2355{ 2376{
2356 struct vm_area_struct *vma = *vmap; 2377 struct vm_area_struct *vma = *vmap;
2357 unsigned long vma_start = vma->vm_start; 2378 unsigned long vma_start = vma->vm_start;
@@ -2370,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2370 faulted_in_anon_vma = false; 2391 faulted_in_anon_vma = false;
2371 } 2392 }
2372 2393
2373 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2394 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2395 return NULL; /* should never get here */
2374 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2396 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2375 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2397 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2376 if (new_vma) { 2398 if (new_vma) {
@@ -2392,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2392 * linear if there are no pages mapped yet. 2414 * linear if there are no pages mapped yet.
2393 */ 2415 */
2394 VM_BUG_ON(faulted_in_anon_vma); 2416 VM_BUG_ON(faulted_in_anon_vma);
2395 *vmap = new_vma; 2417 *vmap = vma = new_vma;
2396 } else 2418 }
2397 anon_vma_moveto_tail(new_vma); 2419 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2398 } else { 2420 } else {
2399 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2421 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2400 if (new_vma) { 2422 if (new_vma) {
2401 *new_vma = *vma; 2423 *new_vma = *vma;
2424 new_vma->vm_start = addr;
2425 new_vma->vm_end = addr + len;
2426 new_vma->vm_pgoff = pgoff;
2402 pol = mpol_dup(vma_policy(vma)); 2427 pol = mpol_dup(vma_policy(vma));
2403 if (IS_ERR(pol)) 2428 if (IS_ERR(pol))
2404 goto out_free_vma; 2429 goto out_free_vma;
2430 vma_set_policy(new_vma, pol);
2405 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2431 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2406 if (anon_vma_clone(new_vma, vma)) 2432 if (anon_vma_clone(new_vma, vma))
2407 goto out_free_mempol; 2433 goto out_free_mempol;
2408 vma_set_policy(new_vma, pol); 2434 if (new_vma->vm_file)
2409 new_vma->vm_start = addr;
2410 new_vma->vm_end = addr + len;
2411 new_vma->vm_pgoff = pgoff;
2412 if (new_vma->vm_file) {
2413 get_file(new_vma->vm_file); 2435 get_file(new_vma->vm_file);
2414
2415 if (vma->vm_flags & VM_EXECUTABLE)
2416 added_exe_file_vma(mm);
2417 }
2418 if (new_vma->vm_ops && new_vma->vm_ops->open) 2436 if (new_vma->vm_ops && new_vma->vm_ops->open)
2419 new_vma->vm_ops->open(new_vma); 2437 new_vma->vm_ops->open(new_vma);
2420 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2438 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2439 *need_rmap_locks = false;
2421 } 2440 }
2422 } 2441 }
2423 return new_vma; 2442 return new_vma;
@@ -2535,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2535 2554
2536static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2555static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2537{ 2556{
2538 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2557 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2539 /* 2558 /*
2540 * The LSB of head.next can't change from under us 2559 * The LSB of head.next can't change from under us
2541 * because we hold the mm_all_locks_mutex. 2560 * because we hold the mm_all_locks_mutex.
@@ -2551,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2551 * anon_vma->root->mutex. 2570 * anon_vma->root->mutex.
2552 */ 2571 */
2553 if (__test_and_set_bit(0, (unsigned long *) 2572 if (__test_and_set_bit(0, (unsigned long *)
2554 &anon_vma->root->head.next)) 2573 &anon_vma->root->rb_root.rb_node))
2555 BUG(); 2574 BUG();
2556 } 2575 }
2557} 2576}
@@ -2592,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2592 * A single task can't take more than one mm_take_all_locks() in a row 2611 * A single task can't take more than one mm_take_all_locks() in a row
2593 * or it would deadlock. 2612 * or it would deadlock.
2594 * 2613 *
2595 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in 2614 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2596 * mapping->flags avoid to take the same lock twice, if more than one 2615 * mapping->flags avoid to take the same lock twice, if more than one
2597 * vma in this mm is backed by the same anon_vma or address_space. 2616 * vma in this mm is backed by the same anon_vma or address_space.
2598 * 2617 *
@@ -2639,13 +2658,13 @@ out_unlock:
2639 2658
2640static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2659static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2641{ 2660{
2642 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2661 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2643 /* 2662 /*
2644 * The LSB of head.next can't change to 0 from under 2663 * The LSB of head.next can't change to 0 from under
2645 * us because we hold the mm_all_locks_mutex. 2664 * us because we hold the mm_all_locks_mutex.
2646 * 2665 *
2647 * We must however clear the bitflag before unlocking 2666 * We must however clear the bitflag before unlocking
2648 * the vma so the users using the anon_vma->head will 2667 * the vma so the users using the anon_vma->rb_root will
2649 * never see our bitflag. 2668 * never see our bitflag.
2650 * 2669 *
2651 * No need of atomic instructions here, head.next 2670 * No need of atomic instructions here, head.next
@@ -2653,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2653 * anon_vma->root->mutex. 2672 * anon_vma->root->mutex.
2654 */ 2673 */
2655 if (!__test_and_clear_bit(0, (unsigned long *) 2674 if (!__test_and_clear_bit(0, (unsigned long *)
2656 &anon_vma->root->head.next)) 2675 &anon_vma->root->rb_root.rb_node))
2657 BUG(); 2676 BUG();
2658 anon_vma_unlock(anon_vma); 2677 anon_vma_unlock(anon_vma);
2659 } 2678 }
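
Taken together, the vma_adjust(), expand_upwards() and expand_downwards() hunks above follow one bracketing pattern whenever vm_start, vm_end or vm_pgoff change while the vma sits on the file and anon_vma interval trees: take i_mmap_mutex and the anon_vma lock, pull the vma off both trees (vma_interval_tree_remove() plus the pre_update helper), update the fields, then re-insert (vma_interval_tree_insert() plus the post_update helper) before dropping the locks. A condensed sketch of that ordering — the adjust_next/insert cases and all error handling are deliberately omitted, and resize_mapped_vma() is an illustrative name, not a kernel symbol:

static void resize_mapped_vma(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end,
                              pgoff_t pgoff)
{
        struct address_space *mapping = vma->vm_file ?
                                        vma->vm_file->f_mapping : NULL;

        if (mapping)
                mutex_lock(&mapping->i_mmap_mutex);
        if (vma->anon_vma) {
                anon_vma_lock(vma->anon_vma);
                anon_vma_interval_tree_pre_update_vma(vma);
        }
        if (mapping) {
                flush_dcache_mmap_lock(mapping);
                vma_interval_tree_remove(vma, &mapping->i_mmap);
        }

        /* Both trees key off these fields, so the node must be off-tree. */
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_pgoff = pgoff;

        if (mapping) {
                vma_interval_tree_insert(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
        }
        if (vma->anon_vma) {
                anon_vma_interval_tree_post_update_vma(vma);
                anon_vma_unlock(vma->anon_vma);
        }
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
}
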
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..479a1e751a73 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/srcu.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20 21
22/* global SRCU for all MMs */
23static struct srcu_struct srcu;
24
21/* 25/*
22 * This function can't run concurrently against mmu_notifier_register 26 * This function can't run concurrently against mmu_notifier_register
23 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
25 * in parallel despite there being no task using this mm any more, 29 * in parallel despite there being no task using this mm any more,
26 * through the vmas outside of the exit_mmap context, such as with 30 * through the vmas outside of the exit_mmap context, such as with
27 * vmtruncate. This serializes against mmu_notifier_unregister with 31 * vmtruncate. This serializes against mmu_notifier_unregister with
28 * the mmu_notifier_mm->lock in addition to RCU and it serializes 32 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
29 * against the other mmu notifiers with RCU. struct mmu_notifier_mm 33 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
30 * can't go away from under us as exit_mmap holds an mm_count pin 34 * can't go away from under us as exit_mmap holds an mm_count pin
31 * itself. 35 * itself.
32 */ 36 */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
34{ 38{
35 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
36 struct hlist_node *n; 40 struct hlist_node *n;
41 int id;
37 42
38 /* 43 /*
39 * RCU here will block mmu_notifier_unregister until 44 * SRCU here will block mmu_notifier_unregister until
40 * ->release returns. 45 * ->release returns.
41 */ 46 */
42 rcu_read_lock(); 47 id = srcu_read_lock(&srcu);
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) 48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /* 49 /*
45 * if ->release runs before mmu_notifier_unregister it 50 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
50 */ 55 */
51 if (mn->ops->release) 56 if (mn->ops->release)
52 mn->ops->release(mn, mm); 57 mn->ops->release(mn, mm);
53 rcu_read_unlock(); 58 srcu_read_unlock(&srcu, id);
54 59
55 spin_lock(&mm->mmu_notifier_mm->lock); 60 spin_lock(&mm->mmu_notifier_mm->lock);
56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
68 spin_unlock(&mm->mmu_notifier_mm->lock); 73 spin_unlock(&mm->mmu_notifier_mm->lock);
69 74
70 /* 75 /*
71 * synchronize_rcu here prevents mmu_notifier_release to 76 * synchronize_srcu here prevents mmu_notifier_release to
72 * return to exit_mmap (which would proceed freeing all pages 77 * return to exit_mmap (which would proceed freeing all pages
73 * in the mm) until the ->release method returns, if it was 78 * in the mm) until the ->release method returns, if it was
74 * invoked by mmu_notifier_unregister. 79 * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
76 * The mmu_notifier_mm can't go away from under us because one 81 * The mmu_notifier_mm can't go away from under us because one
77 * mm_count is hold by exit_mmap. 82 * mm_count is hold by exit_mmap.
78 */ 83 */
79 synchronize_rcu(); 84 synchronize_srcu(&srcu);
80} 85}
81 86
82/* 87/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
89{ 94{
90 struct mmu_notifier *mn; 95 struct mmu_notifier *mn;
91 struct hlist_node *n; 96 struct hlist_node *n;
92 int young = 0; 97 int young = 0, id;
93 98
94 rcu_read_lock(); 99 id = srcu_read_lock(&srcu);
95 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
96 if (mn->ops->clear_flush_young) 101 if (mn->ops->clear_flush_young)
97 young |= mn->ops->clear_flush_young(mn, mm, address); 102 young |= mn->ops->clear_flush_young(mn, mm, address);
98 } 103 }
99 rcu_read_unlock(); 104 srcu_read_unlock(&srcu, id);
100 105
101 return young; 106 return young;
102} 107}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
106{ 111{
107 struct mmu_notifier *mn; 112 struct mmu_notifier *mn;
108 struct hlist_node *n; 113 struct hlist_node *n;
109 int young = 0; 114 int young = 0, id;
110 115
111 rcu_read_lock(); 116 id = srcu_read_lock(&srcu);
112 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
113 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
114 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
116 break; 121 break;
117 } 122 }
118 } 123 }
119 rcu_read_unlock(); 124 srcu_read_unlock(&srcu, id);
120 125
121 return young; 126 return young;
122} 127}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
126{ 131{
127 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
128 struct hlist_node *n; 133 struct hlist_node *n;
134 int id;
129 135
130 rcu_read_lock(); 136 id = srcu_read_lock(&srcu);
131 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
132 if (mn->ops->change_pte) 138 if (mn->ops->change_pte)
133 mn->ops->change_pte(mn, mm, address, pte); 139 mn->ops->change_pte(mn, mm, address, pte);
134 /*
135 * Some drivers don't have change_pte,
136 * so we must call invalidate_page in that case.
137 */
138 else if (mn->ops->invalidate_page)
139 mn->ops->invalidate_page(mn, mm, address);
140 } 140 }
141 rcu_read_unlock(); 141 srcu_read_unlock(&srcu, id);
142} 142}
143 143
144void __mmu_notifier_invalidate_page(struct mm_struct *mm, 144void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
146{ 146{
147 struct mmu_notifier *mn; 147 struct mmu_notifier *mn;
148 struct hlist_node *n; 148 struct hlist_node *n;
149 int id;
149 150
150 rcu_read_lock(); 151 id = srcu_read_lock(&srcu);
151 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
152 if (mn->ops->invalidate_page) 153 if (mn->ops->invalidate_page)
153 mn->ops->invalidate_page(mn, mm, address); 154 mn->ops->invalidate_page(mn, mm, address);
154 } 155 }
155 rcu_read_unlock(); 156 srcu_read_unlock(&srcu, id);
156} 157}
157 158
158void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 159void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160{ 161{
161 struct mmu_notifier *mn; 162 struct mmu_notifier *mn;
162 struct hlist_node *n; 163 struct hlist_node *n;
164 int id;
163 165
164 rcu_read_lock(); 166 id = srcu_read_lock(&srcu);
165 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
166 if (mn->ops->invalidate_range_start) 168 if (mn->ops->invalidate_range_start)
167 mn->ops->invalidate_range_start(mn, mm, start, end); 169 mn->ops->invalidate_range_start(mn, mm, start, end);
168 } 170 }
169 rcu_read_unlock(); 171 srcu_read_unlock(&srcu, id);
170} 172}
171 173
172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
174{ 176{
175 struct mmu_notifier *mn; 177 struct mmu_notifier *mn;
176 struct hlist_node *n; 178 struct hlist_node *n;
179 int id;
177 180
178 rcu_read_lock(); 181 id = srcu_read_lock(&srcu);
179 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
180 if (mn->ops->invalidate_range_end) 183 if (mn->ops->invalidate_range_end)
181 mn->ops->invalidate_range_end(mn, mm, start, end); 184 mn->ops->invalidate_range_end(mn, mm, start, end);
182 } 185 }
183 rcu_read_unlock(); 186 srcu_read_unlock(&srcu, id);
184} 187}
185 188
186static int do_mmu_notifier_register(struct mmu_notifier *mn, 189static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
192 195
193 BUG_ON(atomic_read(&mm->mm_users) <= 0); 196 BUG_ON(atomic_read(&mm->mm_users) <= 0);
194 197
195 ret = -ENOMEM; 198 /*
196 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 199 * Verify that mmu_notifier_init() already run and the global srcu is
197 if (unlikely(!mmu_notifier_mm)) 200 * initialized.
198 goto out; 201 */
202 BUG_ON(!srcu.per_cpu_ref);
199 203
200 if (take_mmap_sem) 204 if (take_mmap_sem)
201 down_write(&mm->mmap_sem); 205 down_write(&mm->mmap_sem);
202 ret = mm_take_all_locks(mm); 206 ret = mm_take_all_locks(mm);
203 if (unlikely(ret)) 207 if (unlikely(ret))
204 goto out_cleanup; 208 goto out;
205 209
206 if (!mm_has_notifiers(mm)) { 210 if (!mm_has_notifiers(mm)) {
211 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
212 GFP_KERNEL);
213 if (unlikely(!mmu_notifier_mm)) {
214 ret = -ENOMEM;
215 goto out_of_mem;
216 }
207 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 217 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
208 spin_lock_init(&mmu_notifier_mm->lock); 218 spin_lock_init(&mmu_notifier_mm->lock);
219
209 mm->mmu_notifier_mm = mmu_notifier_mm; 220 mm->mmu_notifier_mm = mmu_notifier_mm;
210 mmu_notifier_mm = NULL;
211 } 221 }
212 atomic_inc(&mm->mm_count); 222 atomic_inc(&mm->mm_count);
213 223
@@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
223 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); 233 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
224 spin_unlock(&mm->mmu_notifier_mm->lock); 234 spin_unlock(&mm->mmu_notifier_mm->lock);
225 235
236out_of_mem:
226 mm_drop_all_locks(mm); 237 mm_drop_all_locks(mm);
227out_cleanup: 238out:
228 if (take_mmap_sem) 239 if (take_mmap_sem)
229 up_write(&mm->mmap_sem); 240 up_write(&mm->mmap_sem);
230 /* kfree() does nothing if mmu_notifier_mm is NULL */ 241
231 kfree(mmu_notifier_mm);
232out:
233 BUG_ON(atomic_read(&mm->mm_users) <= 0); 242 BUG_ON(atomic_read(&mm->mm_users) <= 0);
234 return ret; 243 return ret;
235} 244}
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
274/* 283/*
275 * This releases the mm_count pin automatically and frees the mm 284 * This releases the mm_count pin automatically and frees the mm
276 * structure if it was the last user of it. It serializes against 285 * structure if it was the last user of it. It serializes against
277 * running mmu notifiers with RCU and against mmu_notifier_unregister 286 * running mmu notifiers with SRCU and against mmu_notifier_unregister
278 * with the unregister lock + RCU. All sptes must be dropped before 287 * with the unregister lock + SRCU. All sptes must be dropped before
279 * calling mmu_notifier_unregister. ->release or any other notifier 288 * calling mmu_notifier_unregister. ->release or any other notifier
280 * method may be invoked concurrently with mmu_notifier_unregister, 289 * method may be invoked concurrently with mmu_notifier_unregister,
281 * and only after mmu_notifier_unregister returned we're guaranteed 290 * and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
287 296
288 if (!hlist_unhashed(&mn->hlist)) { 297 if (!hlist_unhashed(&mn->hlist)) {
289 /* 298 /*
290 * RCU here will force exit_mmap to wait ->release to finish 299 * SRCU here will force exit_mmap to wait ->release to finish
291 * before freeing the pages. 300 * before freeing the pages.
292 */ 301 */
293 rcu_read_lock(); 302 int id;
294 303
304 id = srcu_read_lock(&srcu);
295 /* 305 /*
296 * exit_mmap will block in mmu_notifier_release to 306 * exit_mmap will block in mmu_notifier_release to
297 * guarantee ->release is called before freeing the 307 * guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
299 */ 309 */
300 if (mn->ops->release) 310 if (mn->ops->release)
301 mn->ops->release(mn, mm); 311 mn->ops->release(mn, mm);
302 rcu_read_unlock(); 312 srcu_read_unlock(&srcu, id);
303 313
304 spin_lock(&mm->mmu_notifier_mm->lock); 314 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist); 315 hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
310 * Wait any running method to finish, of course including 320 * Wait any running method to finish, of course including
311 * ->release if it was run by mmu_notifier_release instead of us. 321
312 */ 322 */
313 synchronize_rcu(); 323 synchronize_srcu(&srcu);
314 324
315 BUG_ON(atomic_read(&mm->mm_count) <= 0); 325 BUG_ON(atomic_read(&mm->mm_count) <= 0);
316 326
317 mmdrop(mm); 327 mmdrop(mm);
318} 328}
319EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 329EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
330
331static int __init mmu_notifier_init(void)
332{
333 return init_srcu_struct(&srcu);
334}
335
336module_init(mmu_notifier_init);
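
The conversion above swaps the plain RCU read side for SRCU so that notifier callbacks such as ->invalidate_range_start() may sleep: readers take srcu_read_lock() and pass the returned index to srcu_read_unlock(), while unregister removes the entry and waits with synchronize_srcu() before the object may be freed. A stripped-down sketch of the same pattern — my_notifier, my_srcu, my_list, my_lock and the function names are illustrative, not kernel APIs; the four-argument hlist_for_each_entry_rcu() matches the kernel version this diff targets:

#include <linux/module.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

struct my_notifier {
        struct hlist_node hlist;
        void (*callback)(struct my_notifier *mn);
};

static struct srcu_struct my_srcu;
static HLIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void my_register(struct my_notifier *mn)
{
        spin_lock(&my_lock);
        hlist_add_head_rcu(&mn->hlist, &my_list);
        spin_unlock(&my_lock);
}

static void my_notify_all(void)
{
        struct my_notifier *mn;
        struct hlist_node *n;
        int id;

        id = srcu_read_lock(&my_srcu);          /* sleepable read side */
        hlist_for_each_entry_rcu(mn, n, &my_list, hlist)
                mn->callback(mn);               /* callback may block */
        srcu_read_unlock(&my_srcu, id);
}

static void my_unregister(struct my_notifier *mn)
{
        spin_lock(&my_lock);
        hlist_del_rcu(&mn->hlist);
        spin_unlock(&my_lock);
        /* Wait for in-flight callbacks before the caller frees *mn. */
        synchronize_srcu(&my_srcu);
}

static int __init my_srcu_init(void)
{
        return init_srcu_struct(&my_srcu);      /* must run before first use */
}
module_init(my_srcu_init);
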
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..1b61c2d3307a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
72 unsigned long old_addr, unsigned long old_end, 72 unsigned long old_addr, unsigned long old_end,
73 struct vm_area_struct *new_vma, pmd_t *new_pmd, 73 struct vm_area_struct *new_vma, pmd_t *new_pmd,
74 unsigned long new_addr) 74 unsigned long new_addr, bool need_rmap_locks)
75{ 75{
76 struct address_space *mapping = NULL; 76 struct address_space *mapping = NULL;
77 struct anon_vma *anon_vma = NULL;
77 struct mm_struct *mm = vma->vm_mm; 78 struct mm_struct *mm = vma->vm_mm;
78 pte_t *old_pte, *new_pte, pte; 79 pte_t *old_pte, *new_pte, pte;
79 spinlock_t *old_ptl, *new_ptl; 80 spinlock_t *old_ptl, *new_ptl;
80 81
81 if (vma->vm_file) { 82 /*
82 /* 83 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
83 * Subtle point from Rajesh Venkatasubramanian: before 84 * locks to ensure that rmap will always observe either the old or the
84 * moving file-based ptes, we must lock truncate_pagecache 85 * new ptes. This is the easiest way to avoid races with
85 * out, since it might clean the dst vma before the src vma, 86 * truncate_pagecache(), page migration, etc...
86 * and we propagate stale pages into the dst afterward. 87 *
87 */ 88 * When need_rmap_locks is false, we use other ways to avoid
88 mapping = vma->vm_file->f_mapping; 89 * such races:
89 mutex_lock(&mapping->i_mmap_mutex); 90 *
91 * - During exec() shift_arg_pages(), we use a specially tagged vma
92 * which rmap call sites look for using is_vma_temporary_stack().
93 *
94 * - During mremap(), new_vma is often known to be placed after vma
95 * in rmap traversal order. This ensures rmap will always observe
96 * either the old pte, or the new pte, or both (the page table locks
97 * serialize access to individual ptes, but only rmap traversal
98 * order guarantees that we won't miss both the old and new ptes).
99 */
100 if (need_rmap_locks) {
101 if (vma->vm_file) {
102 mapping = vma->vm_file->f_mapping;
103 mutex_lock(&mapping->i_mmap_mutex);
104 }
105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma);
108 }
90 } 109 }
91 110
92 /* 111 /*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
114 spin_unlock(new_ptl); 133 spin_unlock(new_ptl);
115 pte_unmap(new_pte - 1); 134 pte_unmap(new_pte - 1);
116 pte_unmap_unlock(old_pte - 1, old_ptl); 135 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma)
137 anon_vma_unlock(anon_vma);
117 if (mapping) 138 if (mapping)
118 mutex_unlock(&mapping->i_mmap_mutex); 139 mutex_unlock(&mapping->i_mmap_mutex);
119} 140}
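
The comment block just above spells out when move_ptes() still needs the rmap locks; condensed, the bracketing it adds looks like the sketch below. The PTE-copy loop and the page-table locking are elided, and move_ptes_rmap_sketch() is an illustrative name, not a kernel symbol:

static void move_ptes_rmap_sketch(struct vm_area_struct *vma,
                                  bool need_rmap_locks)
{
        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;

        if (need_rmap_locks) {
                if (vma->vm_file) {
                        mapping = vma->vm_file->f_mapping;
                        mutex_lock(&mapping->i_mmap_mutex);
                }
                if (vma->anon_vma) {
                        anon_vma = vma->anon_vma;
                        anon_vma_lock(anon_vma);
                }
        }

        /* ... copy and clear the ptes under both page table locks ... */

        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
}
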
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
122 143
123unsigned long move_page_tables(struct vm_area_struct *vma, 144unsigned long move_page_tables(struct vm_area_struct *vma,
124 unsigned long old_addr, struct vm_area_struct *new_vma, 145 unsigned long old_addr, struct vm_area_struct *new_vma,
125 unsigned long new_addr, unsigned long len) 146 unsigned long new_addr, unsigned long len,
147 bool need_rmap_locks)
126{ 148{
127 unsigned long extent, next, old_end; 149 unsigned long extent, next, old_end;
128 pmd_t *old_pmd, *new_pmd; 150 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false; 151 bool need_flush = false;
152 unsigned long mmun_start; /* For mmu_notifiers */
153 unsigned long mmun_end; /* For mmu_notifiers */
130 154
131 old_end = old_addr + len; 155 old_end = old_addr + len;
132 flush_cache_range(vma, old_addr, old_end); 156 flush_cache_range(vma, old_addr, old_end);
133 157
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); 158 mmun_start = old_addr;
159 mmun_end = old_end;
160 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
135 161
136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 162 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
137 cond_resched(); 163 cond_resched();
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
169 if (extent > LATENCY_LIMIT) 195 if (extent > LATENCY_LIMIT)
170 extent = LATENCY_LIMIT; 196 extent = LATENCY_LIMIT;
171 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 197 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
172 new_vma, new_pmd, new_addr); 198 new_vma, new_pmd, new_addr, need_rmap_locks);
173 need_flush = true; 199 need_flush = true;
174 } 200 }
175 if (likely(need_flush)) 201 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr); 202 flush_tlb_range(vma, old_end-len, old_addr);
177 203
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); 204 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
179 205
180 return len + old_addr - old_end; /* how much done */ 206 return len + old_addr - old_end; /* how much done */
181} 207}
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
193 unsigned long hiwater_vm; 219 unsigned long hiwater_vm;
194 int split = 0; 220 int split = 0;
195 int err; 221 int err;
222 bool need_rmap_locks;
196 223
197 /* 224 /*
198 * We'd prefer to avoid failure later on in do_munmap: 225 * We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
214 return err; 241 return err;
215 242
216 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 243 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
217 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 244 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
245 &need_rmap_locks);
218 if (!new_vma) 246 if (!new_vma)
219 return -ENOMEM; 247 return -ENOMEM;
220 248
221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 249 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
250 need_rmap_locks);
222 if (moved_len < old_len) { 251 if (moved_len < old_len) {
223 /* 252 /*
224 * Before moving the page tables from the new vma to
225 * the old vma, we need to be sure the old vma is
226 * queued after new vma in the same_anon_vma list to
227 * prevent SMP races with rmap_walk (that could lead
228 * rmap_walk to miss some page table).
229 */
230 anon_vma_moveto_tail(vma);
231
232 /*
233 * On error, move entries back from new area to old, 253 * On error, move entries back from new area to old,
234 * which will succeed since page tables still there, 254 * which will succeed since page tables still there,
235 * and then proceed to unmap new area instead of old. 255 * and then proceed to unmap new area instead of old.
236 */ 256 */
237 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); 257 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
258 true);
238 vma = new_vma; 259 vma = new_vma;
239 old_len = new_len; 260 old_len = new_len;
240 old_addr = new_addr; 261 old_addr = new_addr;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..714d5d650470 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
116 return 0; 116 return 0;
117 117
118 __free_pages_memory(start_pfn, end_pfn); 118 __free_pages_memory(start_pfn, end_pfn);
119 fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
120 start_pfn, end_pfn);
119 121
120 return end_pfn - start_pfn; 122 return end_pfn - start_pfn;
121} 123}
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
126 phys_addr_t start, end, size; 128 phys_addr_t start, end, size;
127 u64 i; 129 u64 i;
128 130
131 reset_zone_present_pages();
129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 132 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
130 count += __free_memory_core(start, end); 133 count += __free_memory_core(start, end);
131 134
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void)
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 165 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 166 * because in some case like Node0 doesn't have RAM installed
164 * low ram will be on Node1 167 * low ram will be on Node1
165 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
166 * will be used instead of only Node0 related
167 */ 168 */
168 return free_low_memory_core_early(MAX_NUMNODES); 169 return free_low_memory_core_early(MAX_NUMNODES);
169} 170}
diff --git a/mm/nommu.c b/mm/nommu.c
index dee2ff89fd58..45131b41bcdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
698 698
699 mutex_lock(&mapping->i_mmap_mutex); 699 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_interval_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex); 703 mutex_unlock(&mapping->i_mmap_mutex);
704 } 704 }
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
764 764
765 mutex_lock(&mapping->i_mmap_mutex); 765 mutex_lock(&mapping->i_mmap_mutex);
766 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
767 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_interval_tree_remove(vma, &mapping->i_mmap);
768 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex); 769 mutex_unlock(&mapping->i_mmap_mutex);
770 } 770 }
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
789 kenter("%p", vma); 789 kenter("%p", vma);
790 if (vma->vm_ops && vma->vm_ops->close) 790 if (vma->vm_ops && vma->vm_ops->close)
791 vma->vm_ops->close(vma); 791 vma->vm_ops->close(vma);
792 if (vma->vm_file) { 792 if (vma->vm_file)
793 fput(vma->vm_file); 793 fput(vma->vm_file);
794 if (vma->vm_flags & VM_EXECUTABLE)
795 removed_exe_file_vma(mm);
796 }
797 put_nommu_region(vma->vm_region); 794 put_nommu_region(vma->vm_region);
798 kmem_cache_free(vm_area_cachep, vma); 795 kmem_cache_free(vm_area_cachep, vma);
799} 796}
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1284 if (file) { 1281 if (file) {
1285 region->vm_file = get_file(file); 1282 region->vm_file = get_file(file);
1286 vma->vm_file = get_file(file); 1283 vma->vm_file = get_file(file);
1287 if (vm_flags & VM_EXECUTABLE) {
1288 added_exe_file_vma(current->mm);
1289 vma->vm_mm = current->mm;
1290 }
1291 } 1284 }
1292 1285
1293 down_write(&nommu_region_sem); 1286 down_write(&nommu_region_sem);
@@ -1440,8 +1433,6 @@ error:
1440 kmem_cache_free(vm_region_jar, region); 1433 kmem_cache_free(vm_region_jar, region);
1441 if (vma->vm_file) 1434 if (vma->vm_file)
1442 fput(vma->vm_file); 1435 fput(vma->vm_file);
1443 if (vma->vm_flags & VM_EXECUTABLE)
1444 removed_exe_file_vma(vma->vm_mm);
1445 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1446 kleave(" = %d", ret); 1437 kleave(" = %d", ret);
1447 return ret; 1438 return ret;
@@ -1820,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1820 if (addr != (pfn << PAGE_SHIFT)) 1811 if (addr != (pfn << PAGE_SHIFT))
1821 return -EINVAL; 1812 return -EINVAL;
1822 1813
1823 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1814 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1824 return 0; 1815 return 0;
1825} 1816}
1826EXPORT_SYMBOL(remap_pfn_range); 1817EXPORT_SYMBOL(remap_pfn_range);
@@ -1961,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1961} 1952}
1962EXPORT_SYMBOL(filemap_fault); 1953EXPORT_SYMBOL(filemap_fault);
1963 1954
1955int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1956 unsigned long size, pgoff_t pgoff)
1957{
1958 BUG();
1959 return 0;
1960}
1961EXPORT_SYMBOL(generic_file_remap_pages);
1962
1964static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 1963static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1965 unsigned long addr, void *buf, int len, int write) 1964 unsigned long addr, void *buf, int len, int write)
1966{ 1965{
@@ -2045,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2045 size_t newsize) 2044 size_t newsize)
2046{ 2045{
2047 struct vm_area_struct *vma; 2046 struct vm_area_struct *vma;
2048 struct prio_tree_iter iter;
2049 struct vm_region *region; 2047 struct vm_region *region;
2050 pgoff_t low, high; 2048 pgoff_t low, high;
2051 size_t r_size, r_top; 2049 size_t r_size, r_top;
@@ -2057,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2057 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2055 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2058 2056
2059 /* search for VMAs that fall within the dead zone */ 2057 /* search for VMAs that fall within the dead zone */
2060 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2058 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2061 low, high) {
2062 /* found one - only interested if it's shared out of the page 2059 /* found one - only interested if it's shared out of the page
2063 * cache */ 2060 * cache */
2064 if (vma->vm_flags & VM_SHARED) { 2061 if (vma->vm_flags & VM_SHARED) {
@@ -2074,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2074 * we don't check for any regions that start beyond the EOF as there 2071 * we don't check for any regions that start beyond the EOF as there
2075 * shouldn't be any 2072 * shouldn't be any
2076 */ 2073 */
2077 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2074 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
2078 0, ULONG_MAX) { 2075 0, ULONG_MAX) {
2079 if (!(vma->vm_flags & VM_SHARED)) 2076 if (!(vma->vm_flags & VM_SHARED))
2080 continue; 2077 continue;
2081 2078
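
With the prio_tree iterator object gone, walking the vmas that map a page-offset range of a file reduces to the pattern below; callers hold i_mmap_mutex around the walk, as both hunks above do. for_each_mapping_vma() is an illustrative name, not a kernel symbol:

static void for_each_mapping_vma(struct address_space *mapping,
                                 pgoff_t first, pgoff_t last)
{
        struct vm_area_struct *vma;

        mutex_lock(&mapping->i_mmap_mutex);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
                /* vma overlaps [first, last], measured in pages */
        }
        mutex_unlock(&mapping->i_mmap_mutex);
}
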
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..79e0f3e24831 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 428{
429 task_lock(current); 429 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_adj=%d, oom_score_adj=%d\n", 431 "oom_score_adj=%d\n",
432 current->comm, gfp_mask, order, current->signal->oom_adj, 432 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 433 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 434 cpuset_print_task_mems_allowed(current);
435 task_unlock(current); 435 task_unlock(current);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..bb90971182bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_freepage_state(zone, 1 << order,
562 migratetype);
562 } else { 563 } else {
563 list_del(&buddy->lru); 564 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 565 zone->free_area[order].nr_free--;
@@ -597,17 +598,6 @@ out:
597 zone->free_area[order].nr_free++; 598 zone->free_area[order].nr_free++;
598} 599}
599 600
600/*
601 * free_page_mlock() -- clean up attempts to free and mlocked() page.
602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify...
604 */
605static inline void free_page_mlock(struct page *page)
606{
607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609}
610
611static inline int free_pages_check(struct page *page) 601static inline int free_pages_check(struct page *page)
612{ 602{
613 if (unlikely(page_mapcount(page) | 603 if (unlikely(page_mapcount(page) |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count,
668 batch_free = to_free; 658 batch_free = to_free;
669 659
670 do { 660 do {
661 int mt; /* migratetype of the to-be-freed page */
662
671 page = list_entry(list->prev, struct page, lru); 663 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 664 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 665 list_del(&page->lru);
666 mt = get_freepage_migratetype(page);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 668 __free_one_page(page, zone, 0, mt);
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
677 } while (--to_free && --batch_free && !list_empty(list)); 672 } while (--to_free && --batch_free && !list_empty(list));
678 } 673 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
688 zone->pages_scanned = 0; 683 zone->pages_scanned = 0;
689 684
690 __free_one_page(page, zone, order, migratetype); 685 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 686 if (unlikely(migratetype != MIGRATE_ISOLATE))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype);
692 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
693} 689}
694 690
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721static void __free_pages_ok(struct page *page, unsigned int order) 717static void __free_pages_ok(struct page *page, unsigned int order)
722{ 718{
723 unsigned long flags; 719 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 720 int migratetype;
725 721
726 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
727 return; 723 return;
728 724
729 local_irq_save(flags); 725 local_irq_save(flags);
730 if (unlikely(wasMlocked))
731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 726 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 727 migratetype = get_pageblock_migratetype(page);
734 get_pageblock_migratetype(page)); 728 set_freepage_migratetype(page, migratetype);
729 free_one_page(page_zone(page), page, order, migratetype);
735 local_irq_restore(flags); 730 local_irq_restore(flags);
736} 731}
737 732
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page,
811 set_page_guard_flag(&page[size]); 806 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 807 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 808 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 809 __mod_zone_freepage_state(zone, -(1 << high),
810 migratetype);
815 continue; 811 continue;
816 } 812 }
817#endif 813#endif
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
915 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
917 */ 913 */
918static int move_freepages(struct zone *zone, 914int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
920 int migratetype) 916 int migratetype)
921{ 917{
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone,
951 order = page_order(page); 947 order = page_order(page);
952 list_move(&page->lru, 948 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 set_freepage_migratetype(page, migratetype);
954 page += 1 << order; 951 page += 1 << order;
955 pages_moved += 1 << order; 952 pages_moved += 1 << order;
956 } 953 }
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1132 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1133 mt = migratetype;
1137 } 1134 }
1138 set_page_private(page, mt); 1135 set_freepage_migratetype(page, mt);
1139 list = &page->lru; 1136 list = &page->lru;
1137 if (is_migrate_cma(mt))
1138 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1139 -(1 << order));
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold)
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1299
1301 if (!free_pages_prepare(page, 0)) 1300 if (!free_pages_prepare(page, 0))
1302 return; 1301 return;
1303 1302
1304 migratetype = get_pageblock_migratetype(page); 1303 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1304 set_freepage_migratetype(page, migratetype);
1306 local_irq_save(flags); 1305 local_irq_save(flags);
1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1306 __count_vm_event(PGFREE);
1310 1307
1311 /* 1308 /*
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order)
1380} 1377}
1381 1378
1382/* 1379/*
1383 * Similar to split_page except the page is already free. As this is only 1380 * Similar to the split_page family of functions except that the page
1384 * being used for migration, the migratetype of the block also changes. 1381 * required at the given order and being isolated now to prevent races
1385 * As this is called with interrupts disabled, the caller is responsible 1382 * with parallel allocators
1386 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1387 * are enabled.
1388 *
1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver.
1391 */ 1383 */
1392int split_free_page(struct page *page) 1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1385{
1394 unsigned int order; 1386 unsigned int order;
1395 unsigned long watermark; 1387 unsigned long watermark;
1396 struct zone *zone; 1388 struct zone *zone;
1389 int mt;
1397 1390
1398 BUG_ON(!PageBuddy(page)); 1391 BUG_ON(!PageBuddy(page));
1399 1392
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page)
1409 list_del(&page->lru); 1402 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1403 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1404 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1405
1414 /* Split into individual pages */ 1406 mt = get_pageblock_migratetype(page);
1415 set_page_refcounted(page); 1407 if (unlikely(mt != MIGRATE_ISOLATE))
1416 split_page(page, order); 1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1417 1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1415 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1416 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1417 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page)
1425 } 1422 }
1426 } 1423 }
1427 1424
1428 return 1 << order; 1425 return 1UL << order;
1426}
1427
1428/*
1429 * Similar to split_page except the page is already free. As this is only
1430 * being used for migration, the migratetype of the block also changes.
1431 * As this is called with interrupts disabled, the caller is responsible
1432 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1433 * are enabled.
1434 *
1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver.
1437 */
1438int split_free_page(struct page *page)
1439{
1440 unsigned int order;
1441 int nr_pages;
1442
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page);
1445
1446 nr_pages = capture_free_page(page, order, 0);
1447 if (!nr_pages)
1448 return 0;
1449
1450 /* Split into individual pages */
1451 set_page_refcounted(page);
1452 split_page(page, order);
1453 return nr_pages;
1429} 1454}
1430 1455
1431/* 1456/*
@@ -1484,7 +1509,8 @@ again:
1484 spin_unlock(&zone->lock); 1509 spin_unlock(&zone->lock);
1485 if (!page) 1510 if (!page)
1486 goto failed; 1511 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1512 __mod_zone_freepage_state(zone, -(1 << order),
1513 get_pageblock_migratetype(page));
1488 } 1514 }
1489 1515
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1516 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1527,6 @@ failed:
1501 return NULL; 1527 return NULL;
1502} 1528}
1503 1529
1504/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505#define ALLOC_WMARK_MIN WMARK_MIN
1506#define ALLOC_WMARK_LOW WMARK_LOW
1507#define ALLOC_WMARK_HIGH WMARK_HIGH
1508#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509
1510/* Mask to get the watermark bits */
1511#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512
1513#define ALLOC_HARDER 0x10 /* try to alloc harder */
1514#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516
1517#ifdef CONFIG_FAIL_PAGE_ALLOC 1530#ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1531
1519static struct { 1532static struct {
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1608 min -= min / 2; 1621 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1622 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1623 min -= min / 4;
1611 1624#ifdef CONFIG_CMA
1625 /* If allocation can't use CMA areas don't use free CMA pages */
1626 if (!(alloc_flags & ALLOC_CMA))
1627 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1628#endif
1612 if (free_pages <= min + lowmem_reserve) 1629 if (free_pages <= min + lowmem_reserve)
1613 return false; 1630 return false;
1614 for (o = 0; o < order; o++) { 1631 for (o = 0; o < order; o++) {
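
The CONFIG_CMA block added above keeps free pages that sit in CMA pageblocks from satisfying the watermark check for allocations that are not allowed to use them. Condensed — the per-order free_area loop of the real __zone_watermark_ok() is omitted, and watermark_ok_sketch() is an illustrative name, not a kernel symbol:

static bool watermark_ok_sketch(struct zone *z, unsigned long mark,
                                long lowmem_reserve, int alloc_flags,
                                long free_pages)
{
        long min = mark;

        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
                min -= min / 4;
#ifdef CONFIG_CMA
        /* Allocations without ALLOC_CMA must not lean on CMA free pages. */
        if (!(alloc_flags & ALLOC_CMA))
                free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
        return free_pages > min + lowmem_reserve;
}
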
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1799 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783} 1800}
1784 1801
1802static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1803{
1804 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1805}
1806
1807static void __paginginit init_zone_allows_reclaim(int nid)
1808{
1809 int i;
1810
1811 for_each_online_node(i)
1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
1813 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1814 zone_reclaim_mode = 1;
1815 }
1816}
1817
1785#else /* CONFIG_NUMA */ 1818#else /* CONFIG_NUMA */
1786 1819
1787static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1820static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
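
init_zone_allows_reclaim() above precomputes, once per node, the set of nodes whose distance is within RECLAIM_DISTANCE, so the hot path only has to test a node mask (see the zone_reclaim_mode hunk below). A self-contained model of that precomputation, using an invented four-node distance table and a plain bitmask instead of a nodemask_t; this is a sketch, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES         4
#define RECLAIM_DISTANCE 30   /* the kernel's default threshold */

/* invented SLIT-style distance table: 10 = local, 20 = near, 40 = far */
static const int distance[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};

static unsigned int reclaim_nodes[NR_NODES];    /* one bitmask per node */

static void init_zone_allows_reclaim(int nid)
{
        for (int i = 0; i < NR_NODES; i++)
                if (distance[nid][i] <= RECLAIM_DISTANCE)
                        reclaim_nodes[nid] |= 1u << i;
}

/* simplified: the kernel tests the target node's mask for the local node */
static bool zone_allows_reclaim(int local_nid, int target_nid)
{
        return reclaim_nodes[local_nid] & (1u << target_nid);
}

int main(void)
{
        for (int nid = 0; nid < NR_NODES; nid++)
                init_zone_allows_reclaim(nid);

        printf("%d\n", zone_allows_reclaim(0, 1));  /* 1: distance 20 */
        printf("%d\n", zone_allows_reclaim(0, 2));  /* 0: distance 40 */
        return 0;
}
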
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1802static void zlc_clear_zones_full(struct zonelist *zonelist) 1835static void zlc_clear_zones_full(struct zonelist *zonelist)
1803{ 1836{
1804} 1837}
1838
1839static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1840{
1841 return true;
1842}
1843
1844static inline void init_zone_allows_reclaim(int nid)
1845{
1846}
1805#endif /* CONFIG_NUMA */ 1847#endif /* CONFIG_NUMA */
1806 1848
1807/* 1849/*
@@ -1886,7 +1928,8 @@ zonelist_scan:
1886 did_zlc_setup = 1; 1928 did_zlc_setup = 1;
1887 } 1929 }
1888 1930
1889 if (zone_reclaim_mode == 0) 1931 if (zone_reclaim_mode == 0 ||
1932 !zone_allows_reclaim(preferred_zone, zone))
1890 goto this_zone_full; 1933 goto this_zone_full;
1891 1934
1892 /* 1935 /*
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2105 bool *contended_compaction, bool *deferred_compaction, 2148 bool *contended_compaction, bool *deferred_compaction,
2106 unsigned long *did_some_progress) 2149 unsigned long *did_some_progress)
2107{ 2150{
2108 struct page *page; 2151 struct page *page = NULL;
2109 2152
2110 if (!order) 2153 if (!order)
2111 return NULL; 2154 return NULL;
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2118 current->flags |= PF_MEMALLOC; 2161 current->flags |= PF_MEMALLOC;
2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2120 nodemask, sync_migration, 2163 nodemask, sync_migration,
2121 contended_compaction); 2164 contended_compaction, &page);
2122 current->flags &= ~PF_MEMALLOC; 2165 current->flags &= ~PF_MEMALLOC;
2123 if (*did_some_progress != COMPACT_SKIPPED) {
2124 2166
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) {
2125 /* Page migration frees to the PCP lists but we want merging */ 2174 /* Page migration frees to the PCP lists but we want merging */
2126 drain_pages(get_cpu()); 2175 drain_pages(get_cpu());
2127 put_cpu(); 2176 put_cpu();
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2131 alloc_flags & ~ALLOC_NO_WATERMARKS, 2180 alloc_flags & ~ALLOC_NO_WATERMARKS,
2132 preferred_zone, migratetype); 2181 preferred_zone, migratetype);
2133 if (page) { 2182 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false;
2134 preferred_zone->compact_considered = 0; 2185 preferred_zone->compact_considered = 0;
2135 preferred_zone->compact_defer_shift = 0; 2186 preferred_zone->compact_defer_shift = 0;
2136 if (order >= preferred_zone->compact_order_failed) 2187 if (order >= preferred_zone->compact_order_failed)
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2315 unlikely(test_thread_flag(TIF_MEMDIE)))) 2366 unlikely(test_thread_flag(TIF_MEMDIE))))
2316 alloc_flags |= ALLOC_NO_WATERMARKS; 2367 alloc_flags |= ALLOC_NO_WATERMARKS;
2317 } 2368 }
2318 2369#ifdef CONFIG_CMA
2370 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2371 alloc_flags |= ALLOC_CMA;
2372#endif
2319 return alloc_flags; 2373 return alloc_flags;
2320} 2374}
2321 2375
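
This hunk, together with the matching fast-path hunk in __alloc_pages_nodemask() further down, sets ALLOC_CMA only for movable requests so that the watermark check above may count CMA free pages for them. A toy derivation of the flags follows; it is not kernel code, the migratetype lookup is reduced to a single bit test, and the __GFP_* and ALLOC_CMA values are assumptions made for the sketch.

#include <stdio.h>

#define __GFP_MOVABLE   0x08u   /* assumed to match the 3.6-era values */
#define __GFP_HIGH      0x20u

#define ALLOC_WMARK_MIN 0x00    /* as in the ALLOC_* block removed above */
#define ALLOC_HIGH      0x20
#define ALLOC_CPUSET    0x40
#define ALLOC_CMA       0x80    /* assumed value, for illustration only */

static unsigned int gfp_to_alloc_flags(unsigned int gfp_mask)
{
        unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

        if (gfp_mask & __GFP_HIGH)
                alloc_flags |= ALLOC_HIGH;

        /* only movable allocations may be satisfied from CMA pageblocks */
        if (gfp_mask & __GFP_MOVABLE)
                alloc_flags |= ALLOC_CMA;

        return alloc_flags;
}

int main(void)
{
        printf("movable request:   %#x\n", gfp_to_alloc_flags(__GFP_MOVABLE));
        printf("unmovable request: %#x\n", gfp_to_alloc_flags(0));
        return 0;
}
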
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2362 goto nopage; 2416 goto nopage;
2363 2417
2364restart: 2418restart:
2365 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2419 wake_all_kswapd(order, zonelist, high_zoneidx,
2366 wake_all_kswapd(order, zonelist, high_zoneidx, 2420 zone_idx(preferred_zone));
2367 zone_idx(preferred_zone));
2368 2421
2369 /* 2422 /*
2370 * OK, we're below the kswapd watermark and have kicked background 2423 * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2494,7 @@ rebalance:
2441 * system then fail the allocation instead of entering direct reclaim. 2494 * system then fail the allocation instead of entering direct reclaim.
2442 */ 2495 */
2443 if ((deferred_compaction || contended_compaction) && 2496 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD)) 2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
2445 goto nopage; 2498 goto nopage;
2446 2499
2447 /* Try direct reclaim and then allocating */ 2500 /* Try direct reclaim and then allocating */
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2541 struct page *page = NULL; 2594 struct page *page = NULL;
2542 int migratetype = allocflags_to_migratetype(gfp_mask); 2595 int migratetype = allocflags_to_migratetype(gfp_mask);
2543 unsigned int cpuset_mems_cookie; 2596 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2544 2598
2545 gfp_mask &= gfp_allowed_mask; 2599 gfp_mask &= gfp_allowed_mask;
2546 2600
@@ -2569,9 +2623,13 @@ retry_cpuset:
2569 if (!preferred_zone) 2623 if (!preferred_zone)
2570 goto out; 2624 goto out;
2571 2625
2626#ifdef CONFIG_CMA
2627 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2628 alloc_flags |= ALLOC_CMA;
2629#endif
2572 /* First allocation attempt */ 2630 /* First allocation attempt */
2573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2574 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2632 zonelist, high_zoneidx, alloc_flags,
2575 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2576 if (unlikely(!page)) 2634 if (unlikely(!page))
2577 page = __alloc_pages_slowpath(gfp_mask, order, 2635 page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter)
2852 " unevictable:%lu" 2910 " unevictable:%lu"
2853 " dirty:%lu writeback:%lu unstable:%lu\n" 2911 " dirty:%lu writeback:%lu unstable:%lu\n"
2854 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2912 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2855 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2913 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2914 " free_cma:%lu\n",
2856 global_page_state(NR_ACTIVE_ANON), 2915 global_page_state(NR_ACTIVE_ANON),
2857 global_page_state(NR_INACTIVE_ANON), 2916 global_page_state(NR_INACTIVE_ANON),
2858 global_page_state(NR_ISOLATED_ANON), 2917 global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter)
2869 global_page_state(NR_FILE_MAPPED), 2928 global_page_state(NR_FILE_MAPPED),
2870 global_page_state(NR_SHMEM), 2929 global_page_state(NR_SHMEM),
2871 global_page_state(NR_PAGETABLE), 2930 global_page_state(NR_PAGETABLE),
2872 global_page_state(NR_BOUNCE)); 2931 global_page_state(NR_BOUNCE),
2932 global_page_state(NR_FREE_CMA_PAGES));
2873 2933
2874 for_each_populated_zone(zone) { 2934 for_each_populated_zone(zone) {
2875 int i; 2935 int i;
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter)
2901 " pagetables:%lukB" 2961 " pagetables:%lukB"
2902 " unstable:%lukB" 2962 " unstable:%lukB"
2903 " bounce:%lukB" 2963 " bounce:%lukB"
2964 " free_cma:%lukB"
2904 " writeback_tmp:%lukB" 2965 " writeback_tmp:%lukB"
2905 " pages_scanned:%lu" 2966 " pages_scanned:%lu"
2906 " all_unreclaimable? %s" 2967 " all_unreclaimable? %s"
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter)
2930 K(zone_page_state(zone, NR_PAGETABLE)), 2991 K(zone_page_state(zone, NR_PAGETABLE)),
2931 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2992 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2932 K(zone_page_state(zone, NR_BOUNCE)), 2993 K(zone_page_state(zone, NR_BOUNCE)),
2994 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
2933 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2995 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2934 zone->pages_scanned, 2996 zone->pages_scanned,
2935 (zone->all_unreclaimable ? "yes" : "no") 2997 (zone->all_unreclaimable ? "yes" : "no")
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
3328 j = 0; 3390 j = 0;
3329 3391
3330 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3392 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3331 int distance = node_distance(local_node, node);
3332
3333 /*
3334 * If another node is sufficiently far away then it is better
3335 * to reclaim pages in a zone before going off node.
3336 */
3337 if (distance > RECLAIM_DISTANCE)
3338 zone_reclaim_mode = 1;
3339
3340 /* 3393 /*
3341 * We don't want to pressure a particular node. 3394 * We don't want to pressure a particular node.
3342 * So adding penalty to the first node in same 3395 * So adding penalty to the first node in same
3343 * distance group to make it round-robin. 3396 * distance group to make it round-robin.
3344 */ 3397 */
3345 if (distance != node_distance(local_node, prev_node)) 3398 if (node_distance(local_node, node) !=
3399 node_distance(local_node, prev_node))
3346 node_load[node] = load; 3400 node_load[node] = load;
3347 3401
3348 prev_node = node; 3402 prev_node = node;
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4438 4492
4439 zone->spanned_pages = size; 4493 zone->spanned_pages = size;
4440 zone->present_pages = realsize; 4494 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4446#ifdef CONFIG_NUMA 4495#ifdef CONFIG_NUMA
4447 zone->node = nid; 4496 zone->node = nid;
4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4521 4570
4522 pgdat->node_id = nid; 4571 pgdat->node_id = nid;
4523 pgdat->node_start_pfn = node_start_pfn; 4572 pgdat->node_start_pfn = node_start_pfn;
4573 init_zone_allows_reclaim(nid);
4524 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4574 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4525 4575
4526 alloc_node_mem_map(pgdat); 4576 alloc_node_mem_map(pgdat);
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4879 zone_movable_pfn[i] << PAGE_SHIFT); 4929 zone_movable_pfn[i] << PAGE_SHIFT);
4880 } 4930 }
4881 4931
4882 /* Print out the early_node_map[] */ 4932 /* Print out the early node map */
4883 printk("Early memory node ranges\n"); 4933 printk("Early memory node ranges\n");
4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4934 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4935 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
5619 pageblock_nr_pages)); 5669 pageblock_nr_pages));
5620} 5670}
5621 5671
5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */ 5672/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5673static int __alloc_contig_migrate_range(struct compact_control *cc,
5674 unsigned long start, unsigned long end)
5636{ 5675{
5637 /* This function is based on compact_zone() from compaction.c. */ 5676 /* This function is based on compact_zone() from compaction.c. */
5638 5677 unsigned long nr_reclaimed;
5639 unsigned long pfn = start; 5678 unsigned long pfn = start;
5640 unsigned int tries = 0; 5679 unsigned int tries = 0;
5641 int ret = 0; 5680 int ret = 0;
5642 5681
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local(); 5682 migrate_prep_local();
5652 5683
5653 while (pfn < end || !list_empty(&cc.migratepages)) { 5684 while (pfn < end || !list_empty(&cc->migratepages)) {
5654 if (fatal_signal_pending(current)) { 5685 if (fatal_signal_pending(current)) {
5655 ret = -EINTR; 5686 ret = -EINTR;
5656 break; 5687 break;
5657 } 5688 }
5658 5689
5659 if (list_empty(&cc.migratepages)) { 5690 if (list_empty(&cc->migratepages)) {
5660 cc.nr_migratepages = 0; 5691 cc->nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc, 5692 pfn = isolate_migratepages_range(cc->zone, cc,
5662 pfn, end); 5693 pfn, end, true);
5663 if (!pfn) { 5694 if (!pfn) {
5664 ret = -EINTR; 5695 ret = -EINTR;
5665 break; 5696 break;
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5670 break; 5701 break;
5671 } 5702 }
5672 5703
5673 ret = migrate_pages(&cc.migratepages, 5704 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5674 __alloc_contig_migrate_alloc, 5705 &cc->migratepages);
5706 cc->nr_migratepages -= nr_reclaimed;
5707
5708 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target,
5675 0, false, MIGRATE_SYNC); 5710 0, false, MIGRATE_SYNC);
5676 } 5711 }
5677 5712
5678 putback_lru_pages(&cc.migratepages); 5713 putback_lru_pages(&cc->migratepages);
5679 return ret > 0 ? 0 : ret; 5714 return ret > 0 ? 0 : ret;
5680} 5715}
5681 5716
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5754 unsigned long outer_start, outer_end; 5789 unsigned long outer_start, outer_end;
5755 int ret = 0, order; 5790 int ret = 0, order;
5756 5791
5792 struct compact_control cc = {
5793 .nr_migratepages = 0,
5794 .order = -1,
5795 .zone = page_zone(pfn_to_page(start)),
5796 .sync = true,
5797 .ignore_skip_hint = true,
5798 };
5799 INIT_LIST_HEAD(&cc.migratepages);
5800
5757 /* 5801 /*
5758 * What we do here is we mark all pageblocks in range as 5802 * What we do here is we mark all pageblocks in range as
5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5803 * MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5783,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5783 if (ret) 5827 if (ret)
5784 goto done; 5828 goto done;
5785 5829
5786 ret = __alloc_contig_migrate_range(start, end); 5830 ret = __alloc_contig_migrate_range(&cc, start, end);
5787 if (ret) 5831 if (ret)
5788 goto done; 5832 goto done;
5789 5833
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833 5877
5834 /* Grab isolated pages from freelists. */ 5878 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end); 5879 outer_end = isolate_freepages_range(&cc, outer_start, end);
5836 if (!outer_end) { 5880 if (!outer_end) {
5837 ret = -EBUSY; 5881 ret = -EBUSY;
5838 goto done; 5882 goto done;
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data)
5874 local_irq_save(flags); 5918 local_irq_save(flags);
5875 if (pcp->count > 0) 5919 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp); 5920 free_pcppages_bulk(zone, pcp->count, pcp);
5921 drain_zonestat(zone, pset);
5877 setup_pageset(pset, batch); 5922 setup_pageset(pset, batch);
5878 local_irq_restore(flags); 5923 local_irq_restore(flags);
5879 } 5924 }
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone)
5890void zone_pcp_reset(struct zone *zone) 5935void zone_pcp_reset(struct zone *zone)
5891{ 5936{
5892 unsigned long flags; 5937 unsigned long flags;
5938 int cpu;
5939 struct per_cpu_pageset *pset;
5893 5940
5894 /* avoid races with drain_pages() */ 5941 /* avoid races with drain_pages() */
5895 local_irq_save(flags); 5942 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) { 5943 if (zone->pageset != &boot_pageset) {
5944 for_each_online_cpu(cpu) {
5945 pset = per_cpu_ptr(zone->pageset, cpu);
5946 drain_zonestat(zone, pset);
5947 }
5897 free_percpu(zone->pageset); 5948 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset; 5949 zone->pageset = &boot_pageset;
5899 } 5950 }
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page)
6047 dump_page_flags(page->flags); 6098 dump_page_flags(page->flags);
6048 mem_cgroup_print_bad_page(page); 6099 mem_cgroup_print_bad_page(page);
6049} 6100}
6101
6102/* reset zone->present_pages */
6103void reset_zone_present_pages(void)
6104{
6105 struct zone *z;
6106 int i, nid;
6107
6108 for_each_node_state(nid, N_HIGH_MEMORY) {
6109 for (i = 0; i < MAX_NR_ZONES; i++) {
6110 z = NODE_DATA(nid)->node_zones + i;
6111 z->present_pages = 0;
6112 }
6113 }
6114}
6115
6116/* calculate zone's present pages in buddy system */
6117void fixup_zone_present_pages(int nid, unsigned long start_pfn,
6118 unsigned long end_pfn)
6119{
6120 struct zone *z;
6121 unsigned long zone_start_pfn, zone_end_pfn;
6122 int i;
6123
6124 for (i = 0; i < MAX_NR_ZONES; i++) {
6125 z = NODE_DATA(nid)->node_zones + i;
6126 zone_start_pfn = z->zone_start_pfn;
6127 zone_end_pfn = zone_start_pfn + z->spanned_pages;
6128
6129 /* if the two regions intersect */
6130 if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
6131 z->present_pages += min(end_pfn, zone_end_pfn) -
6132 max(start_pfn, zone_start_pfn);
6133 }
6134}
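
fixup_zone_present_pages() above credits each zone with the portion of a [start_pfn, end_pfn) range that overlaps the zone's span, an intersection test followed by min/max clamping. The same calculation in isolation, over an invented three-zone layout (a sketch, not kernel code):

#include <stdio.h>

struct zone_span { const char *name; unsigned long start_pfn, end_pfn; };

static struct zone_span zones[] = {
        { "DMA",     0,      4096   },   /* invented layout */
        { "Normal",  4096,   262144 },
        { "HighMem", 262144, 524288 },
};
#define NR_ZONES (sizeof(zones) / sizeof(zones[0]))

static unsigned long present[NR_ZONES];

static void fixup_present_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        for (unsigned int i = 0; i < NR_ZONES; i++) {
                unsigned long zs = zones[i].start_pfn;
                unsigned long ze = zones[i].end_pfn;

                /* the two half-open pfn ranges intersect */
                if (!(zs >= end_pfn || ze <= start_pfn)) {
                        unsigned long lo = start_pfn > zs ? start_pfn : zs;
                        unsigned long hi = end_pfn < ze ? end_pfn : ze;

                        present[i] += hi - lo;
                }
        }
}

int main(void)
{
        fixup_present_pages(1024, 300000);      /* spans all three zones */
        for (unsigned int i = 0; i < NR_ZONES; i++)
                printf("%-7s present: %lu pages\n", zones[i].name, present[i]);
        return 0;
}
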
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..f2f5b4818e94 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page)
76 76
77out: 77out:
78 if (!ret) { 78 if (!ret) {
79 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page);
81
79 set_pageblock_isolate(page); 82 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE); 83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
81 } 86 }
82 87
83 spin_unlock_irqrestore(&zone->lock, flags); 88 spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +94,14 @@ out:
89void unset_migratetype_isolate(struct page *page, unsigned migratetype) 94void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{ 95{
91 struct zone *zone; 96 struct zone *zone;
92 unsigned long flags; 97 unsigned long flags, nr_pages;
98
93 zone = page_zone(page); 99 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags); 100 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 101 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out; 102 goto out;
97 move_freepages_block(zone, page, migratetype); 103 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype);
98 restore_pageblock_isolate(page, migratetype); 105 restore_pageblock_isolate(page, migratetype);
99out: 106out:
100 spin_unlock_irqrestore(&zone->lock, flags); 107 spin_unlock_irqrestore(&zone->lock, flags);
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
193 continue; 200 continue;
194 } 201 }
195 page = pfn_to_page(pfn); 202 page = pfn_to_page(pfn);
196 if (PageBuddy(page)) 203 if (PageBuddy(page)) {
204 /*
 205 * If a race between isolation and allocation happens,
 206 * some free pages could be on the MIGRATE_MOVABLE list
 207 * even though the migratetype of the page's pageblock
 208 * is MIGRATE_ISOLATE. Catch it and move the page onto the
 209 * MIGRATE_ISOLATE list.
210 */
211 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
212 struct page *end_page;
213
214 end_page = page + (1 << page_order(page)) - 1;
215 move_freepages(page_zone(page), page, end_page,
216 MIGRATE_ISOLATE);
217 }
197 pfn += 1 << page_order(page); 218 pfn += 1 << page_order(page);
219 }
198 else if (page_count(page) == 0 && 220 else if (page_count(page) == 0 &&
199 page_private(page) == MIGRATE_ISOLATE) 221 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
200 pfn += 1; 222 pfn += 1;
201 else 223 else
202 break; 224 break;
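
The new PageBuddy branch above closes a race: a page freed while its pageblock was being isolated can still sit on its old free list, tagged with its old migratetype, even though the pageblock is now MIGRATE_ISOLATE, so the verification pass moves it rather than reporting failure. A toy rendition of that fix-up; it is not kernel code, and the page/free-list representation is invented.

#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct free_chunk {
        unsigned long pfn;
        int order;
        enum migratetype freelist_type;  /* list the chunk currently sits on */
};

int main(void)
{
        /* the whole pageblock was just marked MIGRATE_ISOLATE, but one
         * chunk was freed concurrently and landed on the movable list */
        struct free_chunk chunks[] = {
                { .pfn = 0, .order = 2, .freelist_type = MIGRATE_ISOLATE },
                { .pfn = 4, .order = 2, .freelist_type = MIGRATE_MOVABLE },
                { .pfn = 8, .order = 3, .freelist_type = MIGRATE_ISOLATE },
        };

        for (unsigned int i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
                struct free_chunk *c = &chunks[i];

                if (c->freelist_type != MIGRATE_ISOLATE) {
                        /* lost the race: move it onto the isolate free list */
                        c->freelist_type = MIGRATE_ISOLATE;
                        printf("moved pfn %lu (order %d) to MIGRATE_ISOLATE\n",
                               c->pfn, c->order);
                }
        }
        return 0;
}
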
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
233 spin_unlock_irqrestore(&zone->lock, flags); 255 spin_unlock_irqrestore(&zone->lock, flags);
234 return ret ? 0 : -EBUSY; 256 return ret ? 0 : -EBUSY;
235} 257}
258
259struct page *alloc_migrate_target(struct page *page, unsigned long private,
260 int **resultp)
261{
262 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
263
264 if (PageHighMem(page))
265 gfp_mask |= __GFP_HIGHMEM;
266
267 return alloc_page(gfp_mask);
268}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..e642627da6b7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
120} 120}
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122#endif 122#endif
123
124#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
125#ifdef CONFIG_TRANSPARENT_HUGEPAGE
126void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
127{
128 assert_spin_locked(&mm->page_table_lock);
129
130 /* FIFO */
131 if (!mm->pmd_huge_pte)
132 INIT_LIST_HEAD(&pgtable->lru);
133 else
134 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
135 mm->pmd_huge_pte = pgtable;
136}
137#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
138#endif
139
140#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142/* no "address" argument, so this destroys page coloring on some arches */
143pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
144{
145 pgtable_t pgtable;
146
147 assert_spin_locked(&mm->page_table_lock);
148
149 /* FIFO */
150 pgtable = mm->pmd_huge_pte;
151 if (list_empty(&pgtable->lru))
152 mm->pmd_huge_pte = NULL;
153 else {
154 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
155 struct page, lru);
156 list_del(&pgtable->lru);
157 }
158 return pgtable;
159}
160#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
161#endif
162
163#ifndef __HAVE_ARCH_PMDP_INVALIDATE
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
166 pmd_t *pmdp)
167{
168 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
169 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
170}
171#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
172#endif
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/mm.h>
15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
17
18/*
19 * See lib/prio_tree.c for details on the general radix priority search tree
20 * code.
21 */
22
23/*
24 * The following #defines are mirrored from lib/prio_tree.c. They're only used
25 * for debugging, and should be removed (along with the debugging code using
26 * them) when switching also VMAs to the regular prio_tree code.
27 */
28
29#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
30#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
31/* avoid overflow */
32#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
33
34/*
35 * Radix priority search tree for address_space->i_mmap
36 *
37 * For each vma that maps a unique set of file pages, i.e., a unique [radix_index,
38 * heap_index] value, we have a corresponding priority search tree node. If
39 * multiple vmas have identical [radix_index, heap_index] value, then one of
40 * them is used as a tree node and others are stored in a vm_set list. The tree
41 * node points to the first vma (head) of the list using vm_set.head.
42 *
43 * prio_tree_root
44 * |
45 * A vm_set.head
46 * / \ /
47 * L R -> H-I-J-K-M-N-O-P-Q-S
48 * ^ ^ <-- vm_set.list -->
49 * tree nodes
50 *
51 * We need some way to identify whether a vma is a tree node, head of a vm_set
52 * list, or just a member of a vm_set list. We cannot use vm_flags to store
53 * such information. The reason is, in the above figure, it is possible that
54 * vm_flags' of R and H are covered by the different mmap_sems. When R is
55 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
56 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
57 * That's why some trick involving shared.vm_set.parent is used for identifying
58 * tree nodes and list head nodes.
59 *
60 * vma radix priority search tree node rules:
61 *
62 * vma->shared.vm_set.parent != NULL ==> a tree node
63 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
64 * vma->shared.vm_set.head == NULL ==> no others map the same range
65 *
66 * vma->shared.vm_set.parent == NULL
67 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
68 * vma->shared.vm_set.head == NULL ==> a list node
69 */
70
71/*
72 * Add a new vma known to map the same set of pages as the old vma:
73 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
74 * Note that it just happens to work correctly on i_mmap_nonlinear too.
75 */
76void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
77{
78 /* Leave these BUG_ONs till prio_tree patch stabilizes */
79 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
80 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
81
82 vma->shared.vm_set.head = NULL;
83 vma->shared.vm_set.parent = NULL;
84
85 if (!old->shared.vm_set.parent)
86 list_add(&vma->shared.vm_set.list,
87 &old->shared.vm_set.list);
88 else if (old->shared.vm_set.head)
89 list_add_tail(&vma->shared.vm_set.list,
90 &old->shared.vm_set.head->shared.vm_set.list);
91 else {
92 INIT_LIST_HEAD(&vma->shared.vm_set.list);
93 vma->shared.vm_set.head = old;
94 old->shared.vm_set.head = vma;
95 }
96}
97
98void vma_prio_tree_insert(struct vm_area_struct *vma,
99 struct prio_tree_root *root)
100{
101 struct prio_tree_node *ptr;
102 struct vm_area_struct *old;
103
104 vma->shared.vm_set.head = NULL;
105
106 ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
107 if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
108 old = prio_tree_entry(ptr, struct vm_area_struct,
109 shared.prio_tree_node);
110 vma_prio_tree_add(vma, old);
111 }
112}
113
114void vma_prio_tree_remove(struct vm_area_struct *vma,
115 struct prio_tree_root *root)
116{
117 struct vm_area_struct *node, *head, *new_head;
118
119 if (!vma->shared.vm_set.head) {
120 if (!vma->shared.vm_set.parent)
121 list_del_init(&vma->shared.vm_set.list);
122 else
123 raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
124 } else {
125 /* Leave this BUG_ON till prio_tree patch stabilizes */
126 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
127 if (vma->shared.vm_set.parent) {
128 head = vma->shared.vm_set.head;
129 if (!list_empty(&head->shared.vm_set.list)) {
130 new_head = list_entry(
131 head->shared.vm_set.list.next,
132 struct vm_area_struct,
133 shared.vm_set.list);
134 list_del_init(&head->shared.vm_set.list);
135 } else
136 new_head = NULL;
137
138 raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
139 &head->shared.prio_tree_node);
140 head->shared.vm_set.head = new_head;
141 if (new_head)
142 new_head->shared.vm_set.head = head;
143
144 } else {
145 node = vma->shared.vm_set.head;
146 if (!list_empty(&vma->shared.vm_set.list)) {
147 new_head = list_entry(
148 vma->shared.vm_set.list.next,
149 struct vm_area_struct,
150 shared.vm_set.list);
151 list_del_init(&vma->shared.vm_set.list);
152 node->shared.vm_set.head = new_head;
153 new_head->shared.vm_set.head = node;
154 } else
155 node->shared.vm_set.head = NULL;
156 }
157 }
158}
159
160/*
161 * Helper function to enumerate vmas that map a given file page or a set of
162 * contiguous file pages. The function returns vmas that at least map a single
163 * page in the given range of contiguous file pages.
164 */
165struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
166 struct prio_tree_iter *iter)
167{
168 struct prio_tree_node *ptr;
169 struct vm_area_struct *next;
170
171 if (!vma) {
172 /*
173 * First call is with NULL vma
174 */
175 ptr = prio_tree_next(iter);
176 if (ptr) {
177 next = prio_tree_entry(ptr, struct vm_area_struct,
178 shared.prio_tree_node);
179 prefetch(next->shared.vm_set.head);
180 return next;
181 } else
182 return NULL;
183 }
184
185 if (vma->shared.vm_set.parent) {
186 if (vma->shared.vm_set.head) {
187 next = vma->shared.vm_set.head;
188 prefetch(next->shared.vm_set.list.next);
189 return next;
190 }
191 } else {
192 next = list_entry(vma->shared.vm_set.list.next,
193 struct vm_area_struct, shared.vm_set.list);
194 if (!next->shared.vm_set.head) {
195 prefetch(next->shared.vm_set.list.next);
196 return next;
197 }
198 }
199
200 ptr = prio_tree_next(iter);
201 if (ptr) {
202 next = prio_tree_entry(ptr, struct vm_area_struct,
203 shared.prio_tree_node);
204 prefetch(next->shared.vm_set.head);
205 return next;
206 } else
207 return NULL;
208}
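
The rmap.c changes that follow replace walks of this removed structure with vma_interval_tree_foreach()/anon_vma_interval_tree_foreach() over the degenerate range [pgoff, pgoff]. The query those walks need is simply "which vmas map this file page"; the sketch below answers it with a linear scan over invented data, whereas the kernel's augmented rbtree interval tree answers the same question in O(log n + matches). This is an illustration only, not the kernel implementation.

#include <stdio.h>

struct vma_span {
        const char *name;
        unsigned long pgoff;    /* first file page mapped by the vma */
        unsigned long pages;    /* number of pages it maps */
};

static const struct vma_span vmas[] = {
        { "vma A", 0,  16 },    /* invented mappings of the same file */
        { "vma B", 8,  4  },
        { "vma C", 32, 8  },
};

/* report every vma whose [pgoff, pgoff + pages) range covers @pgoff */
static void for_each_vma_mapping(unsigned long pgoff)
{
        for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
                unsigned long start = vmas[i].pgoff;
                unsigned long last = start + vmas[i].pages - 1;

                if (start <= pgoff && pgoff <= last)
                        printf("page %lu is mapped by %s\n",
                               pgoff, vmas[i].name);
        }
}

int main(void)
{
        for_each_vma_mapping(10);       /* hits vma A and vma B */
        for_each_vma_mapping(35);       /* hits vma C only */
        return 0;
}
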
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..7df7984d476c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
127 avc->vma = vma; 127 avc->vma = vma;
128 avc->anon_vma = anon_vma; 128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain); 129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130 130 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136} 131}
137 132
138/** 133/**
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
269} 264}
270 265
271/* 266/*
272 * Some rmap walk that needs to find all ptes/hugepmds without false
273 * negatives (like migrate and split_huge_page) running concurrent
274 * with operations that copy or move pagetables (like mremap() and
275 * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
276 * list to be in a certain order: the dst_vma must be placed after the
277 * src_vma in the list. This is always guaranteed by fork() but
278 * mremap() needs to call this function to enforce it in case the
279 * dst_vma isn't newly allocated and chained with the anon_vma_clone()
280 * function but just an extension of a pre-existing vma through
281 * vma_merge.
282 *
283 * NOTE: the same_anon_vma list can still be changed by other
284 * processes while mremap runs because mremap doesn't hold the
285 * anon_vma mutex to prevent modifications to the list while it
286 * runs. All we need to enforce is that the relative order of this
287 * process vmas isn't changing (we don't care about other vmas
288 * order). Each vma corresponds to an anon_vma_chain structure so
289 * there's no risk that other processes calling anon_vma_moveto_tail()
290 * and changing the same_anon_vma list under mremap() will screw with
291 * the relative order of this process vmas in the list, because we
292 * the relative order of this process's vmas in the list, because
293 * they can't alter the order of any vma that belongs to this
294 * concurrently with mremap() coming from this process because we hold
295 * the mmap_sem for the whole mremap(). fork() ordering dependency
296 * also shouldn't be affected because fork() only cares that the
297 * parent vmas are placed in the list before the child vmas and
298 * anon_vma_moveto_tail() won't reorder vmas from either the fork()
299 * parent or child.
300 */
301void anon_vma_moveto_tail(struct vm_area_struct *dst)
302{
303 struct anon_vma_chain *pavc;
304 struct anon_vma *root = NULL;
305
306 list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
307 struct anon_vma *anon_vma = pavc->anon_vma;
308 VM_BUG_ON(pavc->vma != dst);
309 root = lock_anon_vma_root(root, anon_vma);
310 list_del(&pavc->same_anon_vma);
311 list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
312 }
313 unlock_anon_vma_root(root);
314}
315
316/*
317 * Attach vma to its own anon_vma, as well as to the anon_vmas that 267 * Attach vma to its own anon_vma, as well as to the anon_vmas that
318 * the corresponding VMA in the parent process is attached to. 268 * the corresponding VMA in the parent process is attached to.
319 * Returns 0 on success, non-zero on failure. 269 * Returns 0 on success, non-zero on failure.
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
381 struct anon_vma *anon_vma = avc->anon_vma; 331 struct anon_vma *anon_vma = avc->anon_vma;
382 332
383 root = lock_anon_vma_root(root, anon_vma); 333 root = lock_anon_vma_root(root, anon_vma);
384 list_del(&avc->same_anon_vma); 334 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
385 335
386 /* 336 /*
387 * Leave empty anon_vmas on the list - we'll need 337 * Leave empty anon_vmas on the list - we'll need
388 * to free them outside the lock. 338 * to free them outside the lock.
389 */ 339 */
390 if (list_empty(&anon_vma->head)) 340 if (RB_EMPTY_ROOT(&anon_vma->rb_root))
391 continue; 341 continue;
392 342
393 list_del(&avc->same_vma); 343 list_del(&avc->same_vma);
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data)
416 366
417 mutex_init(&anon_vma->mutex); 367 mutex_init(&anon_vma->mutex);
418 atomic_set(&anon_vma->refcount, 0); 368 atomic_set(&anon_vma->refcount, 0);
419 INIT_LIST_HEAD(&anon_vma->head); 369 anon_vma->rb_root = RB_ROOT;
420} 370}
421 371
422void __init anon_vma_init(void) 372void __init anon_vma_init(void)
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
560 510
561/* 511/*
562 * At what user virtual address is page expected in @vma? 512 * At what user virtual address is page expected in @vma?
563 * Returns virtual address or -EFAULT if page's index/offset is not
564 * within the range mapped the @vma.
565 */ 513 */
566inline unsigned long 514static inline unsigned long
567vma_address(struct page *page, struct vm_area_struct *vma) 515__vma_address(struct page *page, struct vm_area_struct *vma)
568{ 516{
569 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 517 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
570 unsigned long address;
571 518
572 if (unlikely(is_vm_hugetlb_page(vma))) 519 if (unlikely(is_vm_hugetlb_page(vma)))
573 pgoff = page->index << huge_page_order(page_hstate(page)); 520 pgoff = page->index << huge_page_order(page_hstate(page));
574 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 521
575 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 522 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
576 /* page should be within @vma mapping range */ 523}
577 return -EFAULT; 524
578 } 525inline unsigned long
526vma_address(struct page *page, struct vm_area_struct *vma)
527{
528 unsigned long address = __vma_address(page, vma);
529
530 /* page should be within @vma mapping range */
531 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
532
579 return address; 533 return address;
580} 534}
581 535
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
585 */ 539 */
586unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 540unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
587{ 541{
542 unsigned long address;
588 if (PageAnon(page)) { 543 if (PageAnon(page)) {
589 struct anon_vma *page__anon_vma = page_anon_vma(page); 544 struct anon_vma *page__anon_vma = page_anon_vma(page);
590 /* 545 /*
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
600 return -EFAULT; 555 return -EFAULT;
601 } else 556 } else
602 return -EFAULT; 557 return -EFAULT;
603 return vma_address(page, vma); 558 address = __vma_address(page, vma);
559 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
560 return -EFAULT;
561 return address;
604} 562}
605 563
606/* 564/*
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
674 pte_t *pte; 632 pte_t *pte;
675 spinlock_t *ptl; 633 spinlock_t *ptl;
676 634
677 address = vma_address(page, vma); 635 address = __vma_address(page, vma);
678 if (address == -EFAULT) /* out of vma range */ 636 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
679 return 0; 637 return 0;
680 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
681 if (!pte) /* the page is not in this mm */ 639 if (!pte) /* the page is not in this mm */
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page,
769{ 727{
770 unsigned int mapcount; 728 unsigned int mapcount;
771 struct anon_vma *anon_vma; 729 struct anon_vma *anon_vma;
730 pgoff_t pgoff;
772 struct anon_vma_chain *avc; 731 struct anon_vma_chain *avc;
773 int referenced = 0; 732 int referenced = 0;
774 733
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page,
777 return referenced; 736 return referenced;
778 737
779 mapcount = page_mapcount(page); 738 mapcount = page_mapcount(page);
780 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 739 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
740 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
781 struct vm_area_struct *vma = avc->vma; 741 struct vm_area_struct *vma = avc->vma;
782 unsigned long address = vma_address(page, vma); 742 unsigned long address = vma_address(page, vma);
783 if (address == -EFAULT)
784 continue;
785 /* 743 /*
786 * If we are reclaiming on behalf of a cgroup, skip 744 * If we are reclaiming on behalf of a cgroup, skip
787 * counting on behalf of references from different 745 * counting on behalf of references from different
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page,
820 struct address_space *mapping = page->mapping; 778 struct address_space *mapping = page->mapping;
821 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 779 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
822 struct vm_area_struct *vma; 780 struct vm_area_struct *vma;
823 struct prio_tree_iter iter;
824 int referenced = 0; 781 int referenced = 0;
825 782
826 /* 783 /*
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page,
846 */ 803 */
847 mapcount = page_mapcount(page); 804 mapcount = page_mapcount(page);
848 805
849 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 806 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
850 unsigned long address = vma_address(page, vma); 807 unsigned long address = vma_address(page, vma);
851 if (address == -EFAULT)
852 continue;
853 /* 808 /*
854 * If we are reclaiming on behalf of a cgroup, skip 809 * If we are reclaiming on behalf of a cgroup, skip
855 * counting on behalf of references from different 810 * counting on behalf of references from different
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
929 pte_t entry; 884 pte_t entry;
930 885
931 flush_cache_page(vma, address, pte_pfn(*pte)); 886 flush_cache_page(vma, address, pte_pfn(*pte));
932 entry = ptep_clear_flush_notify(vma, address, pte); 887 entry = ptep_clear_flush(vma, address, pte);
933 entry = pte_wrprotect(entry); 888 entry = pte_wrprotect(entry);
934 entry = pte_mkclean(entry); 889 entry = pte_mkclean(entry);
935 set_pte_at(mm, address, pte, entry); 890 set_pte_at(mm, address, pte, entry);
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
937 } 892 }
938 893
939 pte_unmap_unlock(pte, ptl); 894 pte_unmap_unlock(pte, ptl);
895
896 if (ret)
897 mmu_notifier_invalidate_page(mm, address);
940out: 898out:
941 return ret; 899 return ret;
942} 900}
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
945{ 903{
946 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 904 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
947 struct vm_area_struct *vma; 905 struct vm_area_struct *vma;
948 struct prio_tree_iter iter;
949 int ret = 0; 906 int ret = 0;
950 907
951 BUG_ON(PageAnon(page)); 908 BUG_ON(PageAnon(page));
952 909
953 mutex_lock(&mapping->i_mmap_mutex); 910 mutex_lock(&mapping->i_mmap_mutex);
954 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 911 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
955 if (vma->vm_flags & VM_SHARED) { 912 if (vma->vm_flags & VM_SHARED) {
956 unsigned long address = vma_address(page, vma); 913 unsigned long address = vma_address(page, vma);
957 if (address == -EFAULT)
958 continue;
959 ret += page_mkclean_one(page, vma, address); 914 ret += page_mkclean_one(page, vma, address);
960 } 915 }
961 } 916 }
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page,
1128 else 1083 else
1129 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1084 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1130 __page_set_anon_rmap(page, vma, address, 1); 1085 __page_set_anon_rmap(page, vma, address, 1);
1131 if (page_evictable(page, vma)) 1086 if (!mlocked_vma_newpage(vma, page))
1132 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1087 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1133 else 1088 else
1134 add_page_to_unevictable_list(page); 1089 add_page_to_unevictable_list(page);
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page)
1203 } else { 1158 } else {
1204 __dec_zone_page_state(page, NR_FILE_MAPPED); 1159 __dec_zone_page_state(page, NR_FILE_MAPPED);
1205 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1160 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1161 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1206 } 1162 }
1163 if (unlikely(PageMlocked(page)))
1164 clear_page_mlock(page);
1207 /* 1165 /*
1208 * It would be tidy to reset the PageAnon mapping here, 1166 * It would be tidy to reset the PageAnon mapping here,
1209 * but that might overwrite a racing page_add_anon_rmap 1167 * but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page)
1213 * Leaving it set also helps swapoff to reinstate ptes 1171 * Leaving it set also helps swapoff to reinstate ptes
1214 * faster for those pages still in swapcache. 1172 * faster for those pages still in swapcache.
1215 */ 1173 */
1174 return;
1216out: 1175out:
1217 if (!anon) 1176 if (!anon)
1218 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1177 mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1256 1215
1257 /* Nuke the page table entry. */ 1216 /* Nuke the page table entry. */
1258 flush_cache_page(vma, address, page_to_pfn(page)); 1217 flush_cache_page(vma, address, page_to_pfn(page));
1259 pteval = ptep_clear_flush_notify(vma, address, pte); 1218 pteval = ptep_clear_flush(vma, address, pte);
1260 1219
1261 /* Move the dirty bit to the physical page now the pte is gone. */ 1220 /* Move the dirty bit to the physical page now the pte is gone. */
1262 if (pte_dirty(pteval)) 1221 if (pte_dirty(pteval))
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1318 1277
1319out_unmap: 1278out_unmap:
1320 pte_unmap_unlock(pte, ptl); 1279 pte_unmap_unlock(pte, ptl);
1280 if (ret != SWAP_FAIL)
1281 mmu_notifier_invalidate_page(mm, address);
1321out: 1282out:
1322 return ret; 1283 return ret;
1323 1284
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1382 spinlock_t *ptl; 1343 spinlock_t *ptl;
1383 struct page *page; 1344 struct page *page;
1384 unsigned long address; 1345 unsigned long address;
1346 unsigned long mmun_start; /* For mmu_notifiers */
1347 unsigned long mmun_end; /* For mmu_notifiers */
1385 unsigned long end; 1348 unsigned long end;
1386 int ret = SWAP_AGAIN; 1349 int ret = SWAP_AGAIN;
1387 int locked_vma = 0; 1350 int locked_vma = 0;
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1405 if (!pmd_present(*pmd)) 1368 if (!pmd_present(*pmd))
1406 return ret; 1369 return ret;
1407 1370
1371 mmun_start = address;
1372 mmun_end = end;
1373 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1374
1408 /* 1375 /*
1409 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1376 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1410 * keep the sem while scanning the cluster for mlocking pages. 1377 * keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1438 1405
1439 /* Nuke the page table entry. */ 1406 /* Nuke the page table entry. */
1440 flush_cache_page(vma, address, pte_pfn(*pte)); 1407 flush_cache_page(vma, address, pte_pfn(*pte));
1441 pteval = ptep_clear_flush_notify(vma, address, pte); 1408 pteval = ptep_clear_flush(vma, address, pte);
1442 1409
1443 /* If nonlinear, store the file page offset in the pte. */ 1410 /* If nonlinear, store the file page offset in the pte. */
1444 if (page->index != linear_page_index(vma, address)) 1411 if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1454 (*mapcount)--; 1421 (*mapcount)--;
1455 } 1422 }
1456 pte_unmap_unlock(pte - 1, ptl); 1423 pte_unmap_unlock(pte - 1, ptl);
1424 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1457 if (locked_vma) 1425 if (locked_vma)
1458 up_read(&vma->vm_mm->mmap_sem); 1426 up_read(&vma->vm_mm->mmap_sem);
1459 return ret; 1427 return ret;
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
1492static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1460static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1493{ 1461{
1494 struct anon_vma *anon_vma; 1462 struct anon_vma *anon_vma;
1463 pgoff_t pgoff;
1495 struct anon_vma_chain *avc; 1464 struct anon_vma_chain *avc;
1496 int ret = SWAP_AGAIN; 1465 int ret = SWAP_AGAIN;
1497 1466
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 if (!anon_vma) 1468 if (!anon_vma)
1500 return ret; 1469 return ret;
1501 1470
1502 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1471 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1472 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1503 struct vm_area_struct *vma = avc->vma; 1473 struct vm_area_struct *vma = avc->vma;
1504 unsigned long address; 1474 unsigned long address;
1505 1475
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1516 continue; 1486 continue;
1517 1487
1518 address = vma_address(page, vma); 1488 address = vma_address(page, vma);
1519 if (address == -EFAULT)
1520 continue;
1521 ret = try_to_unmap_one(page, vma, address, flags); 1489 ret = try_to_unmap_one(page, vma, address, flags);
1522 if (ret != SWAP_AGAIN || !page_mapped(page)) 1490 if (ret != SWAP_AGAIN || !page_mapped(page))
1523 break; 1491 break;
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1547 struct address_space *mapping = page->mapping; 1515 struct address_space *mapping = page->mapping;
1548 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1516 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1549 struct vm_area_struct *vma; 1517 struct vm_area_struct *vma;
1550 struct prio_tree_iter iter;
1551 int ret = SWAP_AGAIN; 1518 int ret = SWAP_AGAIN;
1552 unsigned long cursor; 1519 unsigned long cursor;
1553 unsigned long max_nl_cursor = 0; 1520 unsigned long max_nl_cursor = 0;
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1555 unsigned int mapcount; 1522 unsigned int mapcount;
1556 1523
1557 mutex_lock(&mapping->i_mmap_mutex); 1524 mutex_lock(&mapping->i_mmap_mutex);
1558 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1525 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1559 unsigned long address = vma_address(page, vma); 1526 unsigned long address = vma_address(page, vma);
1560 if (address == -EFAULT)
1561 continue;
1562 ret = try_to_unmap_one(page, vma, address, flags); 1527 ret = try_to_unmap_one(page, vma, address, flags);
1563 if (ret != SWAP_AGAIN || !page_mapped(page)) 1528 if (ret != SWAP_AGAIN || !page_mapped(page))
1564 goto out; 1529 goto out;
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1576 goto out; 1541 goto out;
1577 1542
1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1543 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1579 shared.vm_set.list) { 1544 shared.nonlinear) {
1580 cursor = (unsigned long) vma->vm_private_data; 1545 cursor = (unsigned long) vma->vm_private_data;
1581 if (cursor > max_nl_cursor) 1546 if (cursor > max_nl_cursor)
1582 max_nl_cursor = cursor; 1547 max_nl_cursor = cursor;
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1608 1573
1609 do { 1574 do {
1610 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1575 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1611 shared.vm_set.list) { 1576 shared.nonlinear) {
1612 cursor = (unsigned long) vma->vm_private_data; 1577 cursor = (unsigned long) vma->vm_private_data;
1613 while ( cursor < max_nl_cursor && 1578 while ( cursor < max_nl_cursor &&
1614 cursor < vma->vm_end - vma->vm_start) { 1579 cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1631 * in locked vmas). Reset cursor on all unreserved nonlinear 1596 * in locked vmas). Reset cursor on all unreserved nonlinear
1632 * vmas, now forgetting on which ones it had fallen behind. 1597 * vmas, now forgetting on which ones it had fallen behind.
1633 */ 1598 */
1634 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1599 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1635 vma->vm_private_data = NULL; 1600 vma->vm_private_data = NULL;
1636out: 1601out:
1637 mutex_unlock(&mapping->i_mmap_mutex); 1602 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1716 struct vm_area_struct *, unsigned long, void *), void *arg) 1681 struct vm_area_struct *, unsigned long, void *), void *arg)
1717{ 1682{
1718 struct anon_vma *anon_vma; 1683 struct anon_vma *anon_vma;
1684 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1719 struct anon_vma_chain *avc; 1685 struct anon_vma_chain *avc;
1720 int ret = SWAP_AGAIN; 1686 int ret = SWAP_AGAIN;
1721 1687
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1729 if (!anon_vma) 1695 if (!anon_vma)
1730 return ret; 1696 return ret;
1731 anon_vma_lock(anon_vma); 1697 anon_vma_lock(anon_vma);
1732 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1698 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1733 struct vm_area_struct *vma = avc->vma; 1699 struct vm_area_struct *vma = avc->vma;
1734 unsigned long address = vma_address(page, vma); 1700 unsigned long address = vma_address(page, vma);
1735 if (address == -EFAULT)
1736 continue;
1737 ret = rmap_one(page, vma, address, arg); 1701 ret = rmap_one(page, vma, address, arg);
1738 if (ret != SWAP_AGAIN) 1702 if (ret != SWAP_AGAIN)
1739 break; 1703 break;
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1748 struct address_space *mapping = page->mapping; 1712 struct address_space *mapping = page->mapping;
1749 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1713 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1750 struct vm_area_struct *vma; 1714 struct vm_area_struct *vma;
1751 struct prio_tree_iter iter;
1752 int ret = SWAP_AGAIN; 1715 int ret = SWAP_AGAIN;
1753 1716
1754 if (!mapping) 1717 if (!mapping)
1755 return ret; 1718 return ret;
1756 mutex_lock(&mapping->i_mmap_mutex); 1719 mutex_lock(&mapping->i_mmap_mutex);
1757 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1720 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1758 unsigned long address = vma_address(page, vma); 1721 unsigned long address = vma_address(page, vma);
1759 if (address == -EFAULT)
1760 continue;
1761 ret = rmap_one(page, vma, address, arg); 1722 ret = rmap_one(page, vma, address, arg);
1762 if (ret != SWAP_AGAIN) 1723 if (ret != SWAP_AGAIN)
1763 break; 1724 break;
diff --git a/mm/shmem.c b/mm/shmem.c
index d3752110c8c7..cc12072f8787 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1339{ 1339{
1340 file_accessed(file); 1340 file_accessed(file);
1341 vma->vm_ops = &shmem_vm_ops; 1341 vma->vm_ops = &shmem_vm_ops;
1342 vma->vm_flags |= VM_CAN_NONLINEAR;
1343 return 0; 1342 return 0;
1344} 1343}
1345 1344
@@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
2643 .set_policy = shmem_set_policy, 2642 .set_policy = shmem_set_policy,
2644 .get_policy = shmem_get_policy, 2643 .get_policy = shmem_get_policy,
2645#endif 2644#endif
2645 .remap_pages = generic_file_remap_pages,
2646}; 2646};
2647 2647
2648static struct dentry *shmem_mount(struct file_system_type *fs_type, 2648static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2836 fput(vma->vm_file); 2836 fput(vma->vm_file);
2837 vma->vm_file = file; 2837 vma->vm_file = file;
2838 vma->vm_ops = &shmem_vm_ops; 2838 vma->vm_ops = &shmem_vm_ops;
2839 vma->vm_flags |= VM_CAN_NONLINEAR;
2840 return 0; 2839 return 0;
2841} 2840}
2842 2841
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
446} 446}
447EXPORT_SYMBOL(mark_page_accessed); 447EXPORT_SYMBOL(mark_page_accessed);
448 448
449/*
450 * Order of operations is important: flush the pagevec when it's already
451 * full, not when adding the last page, to make sure that last page is
452 * not added to the LRU directly when passed to this function. Because
453 * mark_page_accessed() (called after this when writing) only activates
454 * pages that are on the LRU, linear writes in subpage chunks would see
455 * every PAGEVEC_SIZE'th page activated, which is unexpected.
456 */
449void __lru_cache_add(struct page *page, enum lru_list lru) 457void __lru_cache_add(struct page *page, enum lru_list lru)
450{ 458{
451 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 459 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
452 460
453 page_cache_get(page); 461 page_cache_get(page);
454 if (!pagevec_add(pvec, page)) 462 if (!pagevec_space(pvec))
455 __pagevec_lru_add(pvec, lru); 463 __pagevec_lru_add(pvec, lru);
464 pagevec_add(pvec, page);
456 put_cpu_var(lru_add_pvecs); 465 put_cpu_var(lru_add_pvecs);
457} 466}
458EXPORT_SYMBOL(__lru_cache_add); 467EXPORT_SYMBOL(__lru_cache_add);
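
The comment added above is the whole point of the __lru_cache_add() change: drain the pagevec when it is already full before adding the new page, so the page being added never lands on the LRU in the same call and a mark_page_accessed() immediately afterwards cannot activate it. A toy batch buffer demonstrating that ordering follows (a sketch, not kernel code; BATCH_SIZE stands in for PAGEVEC_SIZE).

#include <stdio.h>

#define BATCH_SIZE 4    /* stand-in for PAGEVEC_SIZE */

static int batch[BATCH_SIZE];
static int count;

static void flush_batch(void)
{
        printf("flushed to LRU:");
        for (int i = 0; i < count; i++)
                printf(" %d", batch[i]);
        printf("\n");
        count = 0;
}

/* drain first if already full, then buffer the new item (new behaviour) */
static void lru_cache_add(int page)
{
        if (count == BATCH_SIZE)
                flush_batch();
        batch[count++] = page;
}

int main(void)
{
        for (int page = 1; page <= 5; page++)
                lru_cache_add(page);

        /* page 5 is still only buffered: a mark_page_accessed()-style check
         * of the LRU at this point would not see it */
        printf("still buffered: %d item(s), newest = %d\n",
               count, batch[count - 1]);
        return 0;
}

With the previous order (add first, then flush when the add fills the pagevec), page 4 would have reached the LRU in the very call that added it, which is exactly the behaviour the new comment warns about.
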
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 
 	SetPageLRU(page_tail);
 
-	if (page_evictable(page_tail, NULL)) {
+	if (page_evictable(page_tail)) {
 		if (PageActive(page)) {
 			SetPageActive(page_tail);
 			active = 1;
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
-	clear_page_mlock(page);
 	ClearPageMappedToDisk(page);
 	delete_from_page_cache(page);
 	return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	clear_page_mlock(page);
 	ret = remove_mapping(mapping, page);
 
 	return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (PageDirty(page))
 		goto failed;
 
-	clear_page_mlock(page);
 	BUG_ON(page_has_private(page));
 	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..78e08300db21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 		usize -= PAGE_SIZE;
 	} while (usize > 0);
 
-	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
 	return 0;
 }
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
 {
 	struct vm_struct *v = p;
 
-	seq_printf(m, "0x%p-0x%p %7ld",
+	seq_printf(m, "0x%pK-0x%pK %7ld",
 		v->addr, v->addr + v->size, v->size);
 
 	if (v->caller)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..2624edcfb420 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
 redo:
 	ClearPageUnevictable(page);
 
-	if (page_evictable(page, NULL)) {
+	if (page_evictable(page)) {
 		/*
 		 * For evictable pages, we can use the cache.
 		 * In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
 	 * page is on unevictable list, it never be freed. To avoid that,
 	 * check after we added it to the list, again.
 	 */
-	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+	if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
 		if (!isolate_lru_page(page)) {
 			put_page(page);
 			goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
 static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct zone *zone,
 				      struct scan_control *sc,
+				      enum ttu_flags ttu_flags,
 				      unsigned long *ret_nr_dirty,
-				      unsigned long *ret_nr_writeback)
+				      unsigned long *ret_nr_writeback,
+				      bool force_reclaim)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
-		enum page_references references;
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
+		enum page_references references = PAGEREF_RECLAIM_CLEAN;
 
 		cond_resched();
 
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
-		if (unlikely(!page_evictable(page, NULL)))
+		if (unlikely(!page_evictable(page)))
 			goto cull_mlocked;
 
 		if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			wait_on_page_writeback(page);
 		}
 
-		references = page_check_references(page, sc);
+		if (!force_reclaim)
+			references = page_check_references(page, sc);
+
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page, TTU_UNMAP)) {
+			switch (try_to_unmap(page, ttu_flags)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
 	return nr_reclaimed;
 }
 
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    struct list_head *page_list)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_unmap = 1,
+	};
+	unsigned long ret, dummy1, dummy2;
+	struct page *page, *next;
+	LIST_HEAD(clean_pages);
+
+	list_for_each_entry_safe(page, next, page_list, lru) {
+		if (page_is_file_cache(page) && !PageDirty(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &clean_pages);
+		}
+	}
+
+	ret = shrink_page_list(&clean_pages, zone, &sc,
+				TTU_UNMAP|TTU_IGNORE_ACCESS,
+				&dummy1, &dummy2, true);
+	list_splice(&clean_pages, page_list);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+	return ret;
+}
+
 /*
  * Attempt to remove the specified page from its LRU. Only take this page
  * if it is of the appropriate PageActive status. Pages which are being
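
reclaim_clean_pages_from_list() above is new, introduced for the contiguous-allocation (CMA) path, which would rather drop clean file pages than migrate them: it pulls only clean file-backed pages off an already-isolated list, runs them through shrink_page_list() with TTU_IGNORE_ACCESS and force_reclaim set, and splices whatever survived back. A userspace sketch of the same filter/reclaim/splice shape on a hand-rolled singly linked list (struct toy_page and both helpers are invented, and the "reclaim" step simply drops everything it is given):

/* Toy version of the "move clean file pages to a private list, reclaim
 * them, splice the leftovers back" pattern.  Not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_page {
	int id;
	int is_file_cache;
	int dirty;
	struct toy_page *next;
};

/* "Reclaim" every page on *list and empty it; returns how many went away. */
static int toy_shrink_list(struct toy_page **list)
{
	int nr = 0;

	while (*list) {
		printf("reclaimed clean page %d\n", (*list)->id);
		*list = (*list)->next;
		nr++;
	}
	return nr;
}

static int toy_reclaim_clean(struct toy_page **page_list)
{
	struct toy_page *clean = NULL, **pp = page_list;

	/* Pass 1: unlink clean file-cache pages onto a private list. */
	while (*pp) {
		struct toy_page *page = *pp;

		if (page->is_file_cache && !page->dirty) {
			*pp = page->next;	/* unlink from page_list */
			page->next = clean;
			clean = page;
		} else {
			pp = &page->next;
		}
	}

	/* Pass 2: reclaim the private list; anything left over would be
	 * spliced back onto page_list (nothing is left in this toy). */
	return toy_shrink_list(&clean);
}

int main(void)
{
	struct toy_page p2 = { 2, 1, 1, NULL };		/* dirty file page, kept */
	struct toy_page p1 = { 1, 1, 0, &p2 };		/* clean file page, dropped */
	struct toy_page *list = &p1;

	printf("reclaimed %d pages\n", toy_reclaim_clean(&list));
	return 0;
}

The point of the private clean_pages list is that the expensive reclaim step only ever sees pages that are cheap to drop.
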
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 	if (!PageLRU(page))
 		return ret;
 
-	/* Do not give back unevictable pages for compaction */
-	if (PageUnevictable(page))
+	/* Compaction should not handle unevictable pages but CMA can do so */
+	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
 		return ret;
 
 	ret = -EBUSY;
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 
 		VM_BUG_ON(PageLRU(page));
 		list_del(&page->lru);
-		if (unlikely(!page_evictable(page, NULL))) {
+		if (unlikely(!page_evictable(page))) {
 			spin_unlock_irq(&zone->lru_lock);
 			putback_lru_page(page);
 			spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_page_list(&page_list, zone, sc,
-					&nr_dirty, &nr_writeback);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+					&nr_dirty, &nr_writeback, false);
 
 	spin_lock_irq(&zone->lru_lock);
 
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 
-		if (unlikely(!page_evictable(page, NULL))) {
+		if (unlikely(!page_evictable(page))) {
 			putback_lru_page(page);
 			continue;
 		}
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
 	return false;
 }
 
+#ifdef CONFIG_COMPACTION
+/*
+ * If compaction is deferred for sc->order then scale the number of pages
+ * reclaimed based on the number of consecutive allocation failures
+ */
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+			struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct zone *zone = lruvec_zone(lruvec);
+
+	if (zone->compact_order_failed <= sc->order)
+		pages_for_compaction <<= zone->compact_defer_shift;
+	return pages_for_compaction;
+}
+#else
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+			struct lruvec *lruvec, struct scan_control *sc)
+{
+	return pages_for_compaction;
+}
+#endif
+
 /*
  * Reclaim/compaction is used for high-order allocation requests. It reclaims
  * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
+
+	pages_for_compaction = scale_for_compaction(pages_for_compaction,
+						    lruvec, sc);
 	inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
 	if (nr_swap_pages > 0)
 		inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
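
scale_for_compaction(), added a few hunks up and wired in here, left-shifts the baseline reclaim target (2UL << sc->order) by the zone's compact_defer_shift whenever compaction is being deferred for this order, so should_continue_reclaim() keeps reclaiming for longer before compaction is retried. The growth is easy to see with a few concrete numbers (plain arithmetic, no kernel API involved):

/* Shows how the reclaim target grows as compaction keeps being deferred:
 * pages_for_compaction starts at 2 << order and doubles per deferral level.
 * Toy numbers only.
 */
#include <stdio.h>

int main(void)
{
	unsigned long order = 3;		/* a high-order allocation */
	unsigned long base = 2UL << order;	/* 16 pages */

	for (unsigned int defer_shift = 0; defer_shift <= 6; defer_shift++) {
		unsigned long scaled = base << defer_shift;

		printf("compact_defer_shift=%u -> reclaim at least %lu pages\n",
		       defer_shift, scaled);
	}
	return 0;
}

Each additional deferral level doubles the target, so a repeatedly failing order-3 request moves from 16 pages at shift 0 up to 1024 at shift 6.
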
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 */
 	set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
+	/*
+	 * Compaction records what page blocks it recently failed to
+	 * isolate pages from and skips them in the future scanning.
+	 * When kswapd is going to sleep, it is reasonable to assume
+	 * that pages and compaction may succeed so reset the cache.
+	 */
+	reset_isolation_suitable(pgdat);
+
 	if (!kthread_should_stop())
 		schedule();
 
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid)
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state == SYSTEM_BOOTING);
-		printk("Failed to start kswapd on node %d\n",nid);
 		pgdat->kswapd = NULL;
-		ret = -1;
+		pr_err("Failed to start kswapd on node %d\n", nid);
+		ret = PTR_ERR(pgdat->kswapd);
 	}
 	return ret;
 }
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
- * @vma: the VMA in which the page is or will be mapped, may be NULL
  *
  * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list. The vma argument is !NULL when called from the
- * fault path to determine how to instantate a new page.
+ * lists vs unevictable list.
  *
  * Reasons page might not be evictable:
  * (1) page's mapping marked unevictable
  * (2) page is part of an mlocked VMA
  *
  */
-int page_evictable(struct page *page, struct vm_area_struct *vma)
+int page_evictable(struct page *page)
 {
-
-	if (mapping_unevictable(page_mapping(page)))
-		return 0;
-
-	if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
-		return 0;
-
-	return 1;
+	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
 }
 
 #ifdef CONFIG_SHMEM
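
The page_evictable() rewrite above collapses two early-return tests into a single expression and drops the vma argument, which every caller touched by this diff was passing as NULL anyway. For the two documented reasons a page can be unevictable, the before/after logic is equivalent, as a trivial truth-table check shows (toy flags stand in for the real mapping and page tests; the vma/mlocked_vma_newpage() clause is omitted since callers passed vma == NULL):

/* Before/after shape of the evictability test.  The two predicates are
 * stand-ins for mapping_unevictable(page_mapping(page)) and
 * PageMlocked(page); nothing here touches real mm structures.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_page {
	bool mapping_unevictable;
	bool mlocked;
};

/* Old style: a chain of early returns. */
static int page_evictable_old(const struct toy_page *page)
{
	if (page->mapping_unevictable)
		return 0;
	if (page->mlocked)
		return 0;
	return 1;
}

/* New style: one expression, same truth table. */
static int page_evictable_new(const struct toy_page *page)
{
	return !page->mapping_unevictable && !page->mlocked;
}

int main(void)
{
	for (int m = 0; m < 2; m++)
		for (int l = 0; l < 2; l++) {
			struct toy_page page = { m, l };

			printf("unevictable_mapping=%d mlocked=%d -> old=%d new=%d\n",
			       m, l, page_evictable_old(&page),
			       page_evictable_new(&page));
		}
	return 0;
}

Both versions agree on all four combinations.
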
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
 
-		if (page_evictable(page, NULL)) {
+		if (page_evictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
 
 			VM_BUG_ON(PageActive(page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b3e3b9d525d0..c7370579111b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
 			atomic_long_add(global_diff[i], &vm_stat[i]);
 }
 
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+	int i;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (pset->vm_stat_diff[i]) {
+			int v = pset->vm_stat_diff[i];
+			pset->vm_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_stat[i]);
+			atomic_long_add(v, &vm_stat[i]);
+		}
+}
 #endif
 
 #ifdef CONFIG_NUMA
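
drain_zonestat() above walks a per-cpu pageset's pending vm_stat_diff deltas, zeroes each one, and adds it to both the zone-wide and the global counter, so no counts are lost when a pageset is being torn down. A compact sketch of that fold with plain longs standing in for atomic_long_t (all names and sizes here are illustrative):

/* Toy version of folding per-CPU stat deltas into shared counters. */
#include <stdio.h>

#define NR_ITEMS 3

struct toy_pageset {
	long vm_stat_diff[NR_ITEMS];	/* per-CPU pending deltas */
};

struct toy_zone {
	long vm_stat[NR_ITEMS];		/* zone-wide counters */
};

static long global_stat[NR_ITEMS];	/* system-wide counters */

static void toy_drain_zonestat(struct toy_zone *zone, struct toy_pageset *pset)
{
	for (int i = 0; i < NR_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			long v = pset->vm_stat_diff[i];

			pset->vm_stat_diff[i] = 0;	/* delta is consumed */
			zone->vm_stat[i] += v;
			global_stat[i] += v;
		}
}

int main(void)
{
	struct toy_zone zone = { { 100, 50, 7 } };
	struct toy_pageset pset = { { 3, -2, 0 } };

	toy_drain_zonestat(&zone, &pset);
	for (int i = 0; i < NR_ITEMS; i++)
		printf("item %d: zone=%ld global=%ld pending=%ld\n",
		       i, zone.vm_stat[i], global_stat[i],
		       pset.vm_stat_diff[i]);
	return 0;
}

The important property is that each delta is consumed exactly once: it is read, cleared, and only then applied to the shared counters.
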
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
722 "numa_other", 734 "numa_other",
723#endif 735#endif
724 "nr_anon_transparent_hugepages", 736 "nr_anon_transparent_hugepages",
737 "nr_free_cma",
725 "nr_dirty_threshold", 738 "nr_dirty_threshold",
726 "nr_dirty_background_threshold", 739 "nr_dirty_background_threshold",
727 740
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = {
781 "unevictable_pgs_munlocked", 794 "unevictable_pgs_munlocked",
782 "unevictable_pgs_cleared", 795 "unevictable_pgs_cleared",
783 "unevictable_pgs_stranded", 796 "unevictable_pgs_stranded",
784 "unevictable_pgs_mlockfreed",
785 797
786#ifdef CONFIG_TRANSPARENT_HUGEPAGE 798#ifdef CONFIG_TRANSPARENT_HUGEPAGE
787 "thp_fault_alloc", 799 "thp_fault_alloc",