author     J. Bruce Fields <bfields@redhat.com>    2012-10-09 18:35:22 -0400
committer  J. Bruce Fields <bfields@redhat.com>    2012-10-09 18:35:22 -0400
commit     f474af7051212b4efc8267583fad9c4ebf33ccff (patch)
tree       1aa46ebc8065a341f247c2a2d9af2f624ad1d4f8 /mm
parent     0d22f68f02c10d5d10ec5712917e5828b001a822 (diff)
parent     e3dd9a52cb5552c46c2a4ca7ccdfb4dab5c72457 (diff)
nfs: disintegrate UAPI for nfs
This is to complete part of the Userspace API (UAPI) disintegration for which
the preparatory patches were pulled recently. After these patches, userspace
headers will be segregated into:
include/uapi/linux/.../foo.h
for the userspace interface stuff, and:
include/linux/.../foo.h
for the strictly kernel internal stuff.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
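For readers unfamiliar with the new layout, here is a minimal sketch of the split described above. It reuses the placeholder name foo.h from the message itself; the contents are invented for illustration and do not come from any header touched by this pull.

/* include/uapi/linux/foo.h: definitions exported to userspace (illustrative) */
#ifndef _UAPI_LINUX_FOO_H
#define _UAPI_LINUX_FOO_H

#include <linux/types.h>

struct foo_args {		/* part of the user-visible ABI */
	__u32 flags;
	__u64 addr;
};

#endif /* _UAPI_LINUX_FOO_H */

/* include/linux/foo.h: kernel-internal counterpart (illustrative) */
#ifndef _LINUX_FOO_H
#define _LINUX_FOO_H

#include <uapi/linux/foo.h>	/* pull in the exported definitions */

/* kernel-only declarations live below and are never exported */
int foo_handle_args(struct foo_args *args);

#endif /* _LINUX_FOO_H */

With this arrangement, make headers_install exports only the include/uapi/ copies, so kernel-internal declarations can change without affecting what userspace sees.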
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 3
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/bootmem.c | 12
-rw-r--r--  mm/compaction.c | 604
-rw-r--r--  mm/fadvise.c | 34
-rw-r--r--  mm/filemap.c | 13
-rw-r--r--  mm/filemap_xip.c | 10
-rw-r--r--  mm/fremap.c | 19
-rw-r--r--  mm/frontswap.c | 34
-rw-r--r--  mm/huge_memory.c | 441
-rw-r--r--  mm/hugetlb.c | 34
-rw-r--r--  mm/internal.h | 51
-rw-r--r--  mm/interval_tree.c | 112
-rw-r--r--  mm/kmemleak.c | 106
-rw-r--r--  mm/ksm.c | 40
-rw-r--r--  mm/madvise.c | 8
-rw-r--r--  mm/memblock.c | 7
-rw-r--r--  mm/memcontrol.c | 29
-rw-r--r--  mm/memory-failure.c | 8
-rw-r--r--  mm/memory.c | 115
-rw-r--r--  mm/memory_hotplug.c | 93
-rw-r--r--  mm/mempolicy.c | 150
-rw-r--r--  mm/mlock.c | 27
-rw-r--r--  mm/mmap.c | 217
-rw-r--r--  mm/mmu_notifier.c | 103
-rw-r--r--  mm/mremap.c | 73
-rw-r--r--  mm/nobootmem.c | 5
-rw-r--r--  mm/nommu.c | 39
-rw-r--r--  mm/oom_kill.c | 4
-rw-r--r--  mm/page_alloc.c | 353
-rw-r--r--  mm/page_isolation.c | 43
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgtable-generic.c | 50
-rw-r--r--  mm/prio_tree.c | 208
-rw-r--r--  mm/readahead.c | 14
-rw-r--r--  mm/rmap.c | 159
-rw-r--r--  mm/shmem.c | 174
-rw-r--r--  mm/slab.c | 357
-rw-r--r--  mm/slab.h | 19
-rw-r--r--  mm/slab_common.c | 159
-rw-r--r--  mm/slob.c | 91
-rw-r--r--  mm/slub.c | 223
-rw-r--r--  mm/swap.c | 13
-rw-r--r--  mm/truncate.c | 3
-rw-r--r--  mm/util.c | 35
-rw-r--r--  mm/vmalloc.c | 5
-rw-r--r--  mm/vmscan.c | 112
-rw-r--r--  mm/vmstat.c | 16
48 files changed, 2461 insertions, 1970 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c662..a3f8dddaaab 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS | |||
191 | # support for memory compaction | 191 | # support for memory compaction |
192 | config COMPACTION | 192 | config COMPACTION |
193 | bool "Allow for memory compaction" | 193 | bool "Allow for memory compaction" |
194 | def_bool y | ||
194 | select MIGRATION | 195 | select MIGRATION |
195 | depends on MMU | 196 | depends on MMU |
196 | help | 197 | help |
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS | |||
318 | 319 | ||
319 | config TRANSPARENT_HUGEPAGE | 320 | config TRANSPARENT_HUGEPAGE |
320 | bool "Transparent Hugepage Support" | 321 | bool "Transparent Hugepage Support" |
321 | depends on X86 && MMU | 322 | depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE |
322 | select COMPACTION | 323 | select COMPACTION |
323 | help | 324 | help |
324 | Transparent Hugepages allows the kernel to use huge pages and | 325 | Transparent Hugepages allows the kernel to use huge pages and |
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82d..6b025f80af3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif | |||
14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o $(mmu-y) | 19 | compaction.o interval_tree.o $(mmu-y) |
20 | 20 | ||
21 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
22 | 22 | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c
index bcb63ac48cc..434be4ae7a0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
198 | int order = ilog2(BITS_PER_LONG); | 198 | int order = ilog2(BITS_PER_LONG); |
199 | 199 | ||
200 | __free_pages_bootmem(pfn_to_page(start), order); | 200 | __free_pages_bootmem(pfn_to_page(start), order); |
201 | fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), | ||
202 | start, start + BITS_PER_LONG); | ||
201 | count += BITS_PER_LONG; | 203 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | 204 | start += BITS_PER_LONG; |
203 | } else { | 205 | } else { |
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
208 | if (vec & 1) { | 210 | if (vec & 1) { |
209 | page = pfn_to_page(start + off); | 211 | page = pfn_to_page(start + off); |
210 | __free_pages_bootmem(page, 0); | 212 | __free_pages_bootmem(page, 0); |
213 | fixup_zone_present_pages( | ||
214 | page_to_nid(page), | ||
215 | start + off, start + off + 1); | ||
211 | count++; | 216 | count++; |
212 | } | 217 | } |
213 | vec >>= 1; | 218 | vec >>= 1; |
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
221 | pages = bdata->node_low_pfn - bdata->node_min_pfn; | 226 | pages = bdata->node_low_pfn - bdata->node_min_pfn; |
222 | pages = bootmem_bootmap_pages(pages); | 227 | pages = bootmem_bootmap_pages(pages); |
223 | count += pages; | 228 | count += pages; |
224 | while (pages--) | 229 | while (pages--) { |
230 | fixup_zone_present_pages(page_to_nid(page), | ||
231 | page_to_pfn(page), page_to_pfn(page) + 1); | ||
225 | __free_pages_bootmem(page++, 0); | 232 | __free_pages_bootmem(page++, 0); |
233 | } | ||
226 | 234 | ||
227 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); | 235 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); |
228 | 236 | ||
@@ -419,7 +427,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
419 | } | 427 | } |
420 | 428 | ||
421 | /** | 429 | /** |
422 | * reserve_bootmem - mark a page range as usable | 430 | * reserve_bootmem - mark a page range as reserved |
423 | * @addr: starting address of the range | 431 | * @addr: starting address of the range |
424 | * @size: size of the range in bytes | 432 | * @size: size of the range in bytes |
425 | * @flags: reservation flags (see linux/bootmem.h) | 433 | * @flags: reservation flags (see linux/bootmem.h) |
diff --git a/mm/compaction.c b/mm/compaction.c
index e78cb968842..2c4ce17651d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,44 +50,282 @@ static inline bool migrate_async_suitable(int migratetype) | |||
50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
51 | } | 51 | } |
52 | 52 | ||
53 | #ifdef CONFIG_COMPACTION | ||
54 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | ||
55 | static inline bool isolation_suitable(struct compact_control *cc, | ||
56 | struct page *page) | ||
57 | { | ||
58 | if (cc->ignore_skip_hint) | ||
59 | return true; | ||
60 | |||
61 | return !get_pageblock_skip(page); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * This function is called to clear all cached information on pageblocks that | ||
66 | * should be skipped for page isolation when the migrate and free page scanner | ||
67 | * meet. | ||
68 | */ | ||
69 | static void __reset_isolation_suitable(struct zone *zone) | ||
70 | { | ||
71 | unsigned long start_pfn = zone->zone_start_pfn; | ||
72 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
73 | unsigned long pfn; | ||
74 | |||
75 | zone->compact_cached_migrate_pfn = start_pfn; | ||
76 | zone->compact_cached_free_pfn = end_pfn; | ||
77 | zone->compact_blockskip_flush = false; | ||
78 | |||
79 | /* Walk the zone and mark every pageblock as suitable for isolation */ | ||
80 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
81 | struct page *page; | ||
82 | |||
83 | cond_resched(); | ||
84 | |||
85 | if (!pfn_valid(pfn)) | ||
86 | continue; | ||
87 | |||
88 | page = pfn_to_page(pfn); | ||
89 | if (zone != page_zone(page)) | ||
90 | continue; | ||
91 | |||
92 | clear_pageblock_skip(page); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void reset_isolation_suitable(pg_data_t *pgdat) | ||
97 | { | ||
98 | int zoneid; | ||
99 | |||
100 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | ||
101 | struct zone *zone = &pgdat->node_zones[zoneid]; | ||
102 | if (!populated_zone(zone)) | ||
103 | continue; | ||
104 | |||
105 | /* Only flush if a full compaction finished recently */ | ||
106 | if (zone->compact_blockskip_flush) | ||
107 | __reset_isolation_suitable(zone); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * If no pages were isolated then mark this pageblock to be skipped in the | ||
113 | * future. The information is later cleared by __reset_isolation_suitable(). | ||
114 | */ | ||
115 | static void update_pageblock_skip(struct compact_control *cc, | ||
116 | struct page *page, unsigned long nr_isolated, | ||
117 | bool migrate_scanner) | ||
118 | { | ||
119 | struct zone *zone = cc->zone; | ||
120 | if (!page) | ||
121 | return; | ||
122 | |||
123 | if (!nr_isolated) { | ||
124 | unsigned long pfn = page_to_pfn(page); | ||
125 | set_pageblock_skip(page); | ||
126 | |||
127 | /* Update where compaction should restart */ | ||
128 | if (migrate_scanner) { | ||
129 | if (!cc->finished_update_migrate && | ||
130 | pfn > zone->compact_cached_migrate_pfn) | ||
131 | zone->compact_cached_migrate_pfn = pfn; | ||
132 | } else { | ||
133 | if (!cc->finished_update_free && | ||
134 | pfn < zone->compact_cached_free_pfn) | ||
135 | zone->compact_cached_free_pfn = pfn; | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | #else | ||
140 | static inline bool isolation_suitable(struct compact_control *cc, | ||
141 | struct page *page) | ||
142 | { | ||
143 | return true; | ||
144 | } | ||
145 | |||
146 | static void update_pageblock_skip(struct compact_control *cc, | ||
147 | struct page *page, unsigned long nr_isolated, | ||
148 | bool migrate_scanner) | ||
149 | { | ||
150 | } | ||
151 | #endif /* CONFIG_COMPACTION */ | ||
152 | |||
153 | static inline bool should_release_lock(spinlock_t *lock) | ||
154 | { | ||
155 | return need_resched() || spin_is_contended(lock); | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * Compaction requires the taking of some coarse locks that are potentially | ||
160 | * very heavily contended. Check if the process needs to be scheduled or | ||
161 | * if the lock is contended. For async compaction, back out in the event | ||
162 | * if contention is severe. For sync compaction, schedule. | ||
163 | * | ||
164 | * Returns true if the lock is held. | ||
165 | * Returns false if the lock is released and compaction should abort | ||
166 | */ | ||
167 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | ||
168 | bool locked, struct compact_control *cc) | ||
169 | { | ||
170 | if (should_release_lock(lock)) { | ||
171 | if (locked) { | ||
172 | spin_unlock_irqrestore(lock, *flags); | ||
173 | locked = false; | ||
174 | } | ||
175 | |||
176 | /* async aborts if taking too long or contended */ | ||
177 | if (!cc->sync) { | ||
178 | cc->contended = true; | ||
179 | return false; | ||
180 | } | ||
181 | |||
182 | cond_resched(); | ||
183 | } | ||
184 | |||
185 | if (!locked) | ||
186 | spin_lock_irqsave(lock, *flags); | ||
187 | return true; | ||
188 | } | ||
189 | |||
190 | static inline bool compact_trylock_irqsave(spinlock_t *lock, | ||
191 | unsigned long *flags, struct compact_control *cc) | ||
192 | { | ||
193 | return compact_checklock_irqsave(lock, flags, false, cc); | ||
194 | } | ||
195 | |||
196 | /* Returns true if the page is within a block suitable for migration to */ | ||
197 | static bool suitable_migration_target(struct page *page) | ||
198 | { | ||
199 | int migratetype = get_pageblock_migratetype(page); | ||
200 | |||
201 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
202 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
203 | return false; | ||
204 | |||
205 | /* If the page is a large free page, then allow migration */ | ||
206 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
207 | return true; | ||
208 | |||
209 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
210 | if (migrate_async_suitable(migratetype)) | ||
211 | return true; | ||
212 | |||
213 | /* Otherwise skip the block */ | ||
214 | return false; | ||
215 | } | ||
216 | |||
217 | static void compact_capture_page(struct compact_control *cc) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | int mtype, mtype_low, mtype_high; | ||
221 | |||
222 | if (!cc->page || *cc->page) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
227 | * regardless of the migratetype of the freelist is is captured from. | ||
228 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
229 | * allocation is typically at least a pageblock size and overall | ||
230 | * fragmentation is not impaired. Other allocation types must | ||
231 | * capture pages from their own migratelist because otherwise they | ||
232 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
233 | * difficult to move pages and making fragmentation worse overall. | ||
234 | */ | ||
235 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
236 | mtype_low = 0; | ||
237 | mtype_high = MIGRATE_PCPTYPES; | ||
238 | } else { | ||
239 | mtype_low = cc->migratetype; | ||
240 | mtype_high = cc->migratetype + 1; | ||
241 | } | ||
242 | |||
243 | /* Speculatively examine the free lists without zone lock */ | ||
244 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
245 | int order; | ||
246 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
247 | struct page *page; | ||
248 | struct free_area *area; | ||
249 | area = &(cc->zone->free_area[order]); | ||
250 | if (list_empty(&area->free_list[mtype])) | ||
251 | continue; | ||
252 | |||
253 | /* Take the lock and attempt capture of the page */ | ||
254 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
255 | return; | ||
256 | if (!list_empty(&area->free_list[mtype])) { | ||
257 | page = list_entry(area->free_list[mtype].next, | ||
258 | struct page, lru); | ||
259 | if (capture_free_page(page, cc->order, mtype)) { | ||
260 | spin_unlock_irqrestore(&cc->zone->lock, | ||
261 | flags); | ||
262 | *cc->page = page; | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | |||
53 | /* | 271 | /* |
54 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 272 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
55 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 273 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
56 | * pages inside of the pageblock (even though it may still end up isolating | 274 | * pages inside of the pageblock (even though it may still end up isolating |
57 | * some pages). | 275 | * some pages). |
58 | */ | 276 | */ |
59 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | 277 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
278 | unsigned long blockpfn, | ||
60 | unsigned long end_pfn, | 279 | unsigned long end_pfn, |
61 | struct list_head *freelist, | 280 | struct list_head *freelist, |
62 | bool strict) | 281 | bool strict) |
63 | { | 282 | { |
64 | int nr_scanned = 0, total_isolated = 0; | 283 | int nr_scanned = 0, total_isolated = 0; |
65 | struct page *cursor; | 284 | struct page *cursor, *valid_page = NULL; |
285 | unsigned long nr_strict_required = end_pfn - blockpfn; | ||
286 | unsigned long flags; | ||
287 | bool locked = false; | ||
66 | 288 | ||
67 | cursor = pfn_to_page(blockpfn); | 289 | cursor = pfn_to_page(blockpfn); |
68 | 290 | ||
69 | /* Isolate free pages. This assumes the block is valid */ | 291 | /* Isolate free pages. */ |
70 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 292 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { |
71 | int isolated, i; | 293 | int isolated, i; |
72 | struct page *page = cursor; | 294 | struct page *page = cursor; |
73 | 295 | ||
74 | if (!pfn_valid_within(blockpfn)) { | ||
75 | if (strict) | ||
76 | return 0; | ||
77 | continue; | ||
78 | } | ||
79 | nr_scanned++; | 296 | nr_scanned++; |
297 | if (!pfn_valid_within(blockpfn)) | ||
298 | continue; | ||
299 | if (!valid_page) | ||
300 | valid_page = page; | ||
301 | if (!PageBuddy(page)) | ||
302 | continue; | ||
80 | 303 | ||
81 | if (!PageBuddy(page)) { | 304 | /* |
82 | if (strict) | 305 | * The zone lock must be held to isolate freepages. |
83 | return 0; | 306 | * Unfortunately this is a very coarse lock and can be |
307 | * heavily contended if there are parallel allocations | ||
308 | * or parallel compactions. For async compaction do not | ||
309 | * spin on the lock and we acquire the lock as late as | ||
310 | * possible. | ||
311 | */ | ||
312 | locked = compact_checklock_irqsave(&cc->zone->lock, &flags, | ||
313 | locked, cc); | ||
314 | if (!locked) | ||
315 | break; | ||
316 | |||
317 | /* Recheck this is a suitable migration target under lock */ | ||
318 | if (!strict && !suitable_migration_target(page)) | ||
319 | break; | ||
320 | |||
321 | /* Recheck this is a buddy page under lock */ | ||
322 | if (!PageBuddy(page)) | ||
84 | continue; | 323 | continue; |
85 | } | ||
86 | 324 | ||
87 | /* Found a free page, break it into order-0 pages */ | 325 | /* Found a free page, break it into order-0 pages */ |
88 | isolated = split_free_page(page); | 326 | isolated = split_free_page(page); |
89 | if (!isolated && strict) | 327 | if (!isolated && strict) |
90 | return 0; | 328 | break; |
91 | total_isolated += isolated; | 329 | total_isolated += isolated; |
92 | for (i = 0; i < isolated; i++) { | 330 | for (i = 0; i < isolated; i++) { |
93 | list_add(&page->lru, freelist); | 331 | list_add(&page->lru, freelist); |
@@ -102,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, | |||
102 | } | 340 | } |
103 | 341 | ||
104 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 342 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
343 | |||
344 | /* | ||
345 | * If strict isolation is requested by CMA then check that all the | ||
346 | * pages requested were isolated. If there were any failures, 0 is | ||
347 | * returned and CMA will fail. | ||
348 | */ | ||
349 | if (strict && nr_strict_required != total_isolated) | ||
350 | total_isolated = 0; | ||
351 | |||
352 | if (locked) | ||
353 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
354 | |||
355 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
356 | if (blockpfn == end_pfn) | ||
357 | update_pageblock_skip(cc, valid_page, total_isolated, false); | ||
358 | |||
105 | return total_isolated; | 359 | return total_isolated; |
106 | } | 360 | } |
107 | 361 | ||
@@ -119,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, | |||
119 | * a free page). | 373 | * a free page). |
120 | */ | 374 | */ |
121 | unsigned long | 375 | unsigned long |
122 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | 376 | isolate_freepages_range(struct compact_control *cc, |
377 | unsigned long start_pfn, unsigned long end_pfn) | ||
123 | { | 378 | { |
124 | unsigned long isolated, pfn, block_end_pfn, flags; | 379 | unsigned long isolated, pfn, block_end_pfn; |
125 | struct zone *zone = NULL; | ||
126 | LIST_HEAD(freelist); | 380 | LIST_HEAD(freelist); |
127 | 381 | ||
128 | if (pfn_valid(start_pfn)) | ||
129 | zone = page_zone(pfn_to_page(start_pfn)); | ||
130 | |||
131 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 382 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
132 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) | 383 | if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) |
133 | break; | 384 | break; |
134 | 385 | ||
135 | /* | 386 | /* |
@@ -139,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | |||
139 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 390 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
140 | block_end_pfn = min(block_end_pfn, end_pfn); | 391 | block_end_pfn = min(block_end_pfn, end_pfn); |
141 | 392 | ||
142 | spin_lock_irqsave(&zone->lock, flags); | 393 | isolated = isolate_freepages_block(cc, pfn, block_end_pfn, |
143 | isolated = isolate_freepages_block(pfn, block_end_pfn, | ||
144 | &freelist, true); | 394 | &freelist, true); |
145 | spin_unlock_irqrestore(&zone->lock, flags); | ||
146 | 395 | ||
147 | /* | 396 | /* |
148 | * In strict mode, isolate_freepages_block() returns 0 if | 397 | * In strict mode, isolate_freepages_block() returns 0 if |
@@ -173,7 +422,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | |||
173 | } | 422 | } |
174 | 423 | ||
175 | /* Update the number of anon and file isolated pages in the zone */ | 424 | /* Update the number of anon and file isolated pages in the zone */ |
176 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | 425 | static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) |
177 | { | 426 | { |
178 | struct page *page; | 427 | struct page *page; |
179 | unsigned int count[2] = { 0, }; | 428 | unsigned int count[2] = { 0, }; |
@@ -181,8 +430,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) | |||
181 | list_for_each_entry(page, &cc->migratepages, lru) | 430 | list_for_each_entry(page, &cc->migratepages, lru) |
182 | count[!!page_is_file_cache(page)]++; | 431 | count[!!page_is_file_cache(page)]++; |
183 | 432 | ||
184 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | 433 | /* If locked we can use the interrupt unsafe versions */ |
185 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | 434 | if (locked) { |
435 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
436 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
437 | } else { | ||
438 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
439 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
440 | } | ||
186 | } | 441 | } |
187 | 442 | ||
188 | /* Similar to reclaim, but different enough that they don't share logic */ | 443 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -206,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) | |||
206 | * @cc: Compaction control structure. | 461 | * @cc: Compaction control structure. |
207 | * @low_pfn: The first PFN of the range. | 462 | * @low_pfn: The first PFN of the range. |
208 | * @end_pfn: The one-past-the-last PFN of the range. | 463 | * @end_pfn: The one-past-the-last PFN of the range. |
464 | * @unevictable: true if it allows to isolate unevictable pages | ||
209 | * | 465 | * |
210 | * Isolate all pages that can be migrated from the range specified by | 466 | * Isolate all pages that can be migrated from the range specified by |
211 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 467 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
@@ -221,13 +477,16 @@ static bool too_many_isolated(struct zone *zone) | |||
221 | */ | 477 | */ |
222 | unsigned long | 478 | unsigned long |
223 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 479 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
224 | unsigned long low_pfn, unsigned long end_pfn) | 480 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable) |
225 | { | 481 | { |
226 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 482 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
227 | unsigned long nr_scanned = 0, nr_isolated = 0; | 483 | unsigned long nr_scanned = 0, nr_isolated = 0; |
228 | struct list_head *migratelist = &cc->migratepages; | 484 | struct list_head *migratelist = &cc->migratepages; |
229 | isolate_mode_t mode = 0; | 485 | isolate_mode_t mode = 0; |
230 | struct lruvec *lruvec; | 486 | struct lruvec *lruvec; |
487 | unsigned long flags; | ||
488 | bool locked = false; | ||
489 | struct page *page = NULL, *valid_page = NULL; | ||
231 | 490 | ||
232 | /* | 491 | /* |
233 | * Ensure that there are not too many pages isolated from the LRU | 492 | * Ensure that there are not too many pages isolated from the LRU |
@@ -247,25 +506,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
247 | 506 | ||
248 | /* Time to isolate some pages for migration */ | 507 | /* Time to isolate some pages for migration */ |
249 | cond_resched(); | 508 | cond_resched(); |
250 | spin_lock_irq(&zone->lru_lock); | ||
251 | for (; low_pfn < end_pfn; low_pfn++) { | 509 | for (; low_pfn < end_pfn; low_pfn++) { |
252 | struct page *page; | ||
253 | bool locked = true; | ||
254 | |||
255 | /* give a chance to irqs before checking need_resched() */ | 510 | /* give a chance to irqs before checking need_resched() */ |
256 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 511 | if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
257 | spin_unlock_irq(&zone->lru_lock); | 512 | if (should_release_lock(&zone->lru_lock)) { |
258 | locked = false; | 513 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
514 | locked = false; | ||
515 | } | ||
259 | } | 516 | } |
260 | if (need_resched() || spin_is_contended(&zone->lru_lock)) { | ||
261 | if (locked) | ||
262 | spin_unlock_irq(&zone->lru_lock); | ||
263 | cond_resched(); | ||
264 | spin_lock_irq(&zone->lru_lock); | ||
265 | if (fatal_signal_pending(current)) | ||
266 | break; | ||
267 | } else if (!locked) | ||
268 | spin_lock_irq(&zone->lru_lock); | ||
269 | 517 | ||
270 | /* | 518 | /* |
271 | * migrate_pfn does not necessarily start aligned to a | 519 | * migrate_pfn does not necessarily start aligned to a |
@@ -294,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
294 | if (page_zone(page) != zone) | 542 | if (page_zone(page) != zone) |
295 | continue; | 543 | continue; |
296 | 544 | ||
545 | if (!valid_page) | ||
546 | valid_page = page; | ||
547 | |||
548 | /* If isolation recently failed, do not retry */ | ||
549 | pageblock_nr = low_pfn >> pageblock_order; | ||
550 | if (!isolation_suitable(cc, page)) | ||
551 | goto next_pageblock; | ||
552 | |||
297 | /* Skip if free */ | 553 | /* Skip if free */ |
298 | if (PageBuddy(page)) | 554 | if (PageBuddy(page)) |
299 | continue; | 555 | continue; |
@@ -303,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
303 | * migration is optimistic to see if the minimum amount of work | 559 | * migration is optimistic to see if the minimum amount of work |
304 | * satisfies the allocation | 560 | * satisfies the allocation |
305 | */ | 561 | */ |
306 | pageblock_nr = low_pfn >> pageblock_order; | ||
307 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 562 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
308 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 563 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
309 | low_pfn += pageblock_nr_pages; | 564 | cc->finished_update_migrate = true; |
310 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 565 | goto next_pageblock; |
311 | last_pageblock_nr = pageblock_nr; | ||
312 | continue; | ||
313 | } | 566 | } |
314 | 567 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | ||
315 | if (!PageLRU(page)) | 569 | if (!PageLRU(page)) |
316 | continue; | 570 | continue; |
317 | 571 | ||
318 | /* | 572 | /* |
319 | * PageLRU is set, and lru_lock excludes isolation, | 573 | * PageLRU is set. lru_lock normally excludes isolation |
320 | * splitting and collapsing (collapsing has already | 574 | * splitting and collapsing (collapsing has already happened |
321 | * happened if PageLRU is set). | 575 | * if PageLRU is set) but the lock is not necessarily taken |
576 | * here and it is wasteful to take it just to check transhuge. | ||
577 | * Check TransHuge without lock and skip the whole pageblock if | ||
578 | * it's either a transhuge or hugetlbfs page, as calling | ||
579 | * compound_order() without preventing THP from splitting the | ||
580 | * page underneath us may return surprising results. | ||
322 | */ | 581 | */ |
323 | if (PageTransHuge(page)) { | 582 | if (PageTransHuge(page)) { |
583 | if (!locked) | ||
584 | goto next_pageblock; | ||
585 | low_pfn += (1 << compound_order(page)) - 1; | ||
586 | continue; | ||
587 | } | ||
588 | |||
589 | /* Check if it is ok to still hold the lock */ | ||
590 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
591 | locked, cc); | ||
592 | if (!locked || fatal_signal_pending(current)) | ||
593 | break; | ||
594 | |||
595 | /* Recheck PageLRU and PageTransHuge under lock */ | ||
596 | if (!PageLRU(page)) | ||
597 | continue; | ||
598 | if (PageTransHuge(page)) { | ||
324 | low_pfn += (1 << compound_order(page)) - 1; | 599 | low_pfn += (1 << compound_order(page)) - 1; |
325 | continue; | 600 | continue; |
326 | } | 601 | } |
@@ -328,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
328 | if (!cc->sync) | 603 | if (!cc->sync) |
329 | mode |= ISOLATE_ASYNC_MIGRATE; | 604 | mode |= ISOLATE_ASYNC_MIGRATE; |
330 | 605 | ||
606 | if (unevictable) | ||
607 | mode |= ISOLATE_UNEVICTABLE; | ||
608 | |||
331 | lruvec = mem_cgroup_page_lruvec(page, zone); | 609 | lruvec = mem_cgroup_page_lruvec(page, zone); |
332 | 610 | ||
333 | /* Try isolate the page */ | 611 | /* Try isolate the page */ |
@@ -337,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
337 | VM_BUG_ON(PageTransCompound(page)); | 615 | VM_BUG_ON(PageTransCompound(page)); |
338 | 616 | ||
339 | /* Successfully isolated */ | 617 | /* Successfully isolated */ |
618 | cc->finished_update_migrate = true; | ||
340 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 619 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
341 | list_add(&page->lru, migratelist); | 620 | list_add(&page->lru, migratelist); |
342 | cc->nr_migratepages++; | 621 | cc->nr_migratepages++; |
@@ -347,11 +626,23 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
347 | ++low_pfn; | 626 | ++low_pfn; |
348 | break; | 627 | break; |
349 | } | 628 | } |
629 | |||
630 | continue; | ||
631 | |||
632 | next_pageblock: | ||
633 | low_pfn += pageblock_nr_pages; | ||
634 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
635 | last_pageblock_nr = pageblock_nr; | ||
350 | } | 636 | } |
351 | 637 | ||
352 | acct_isolated(zone, cc); | 638 | acct_isolated(zone, locked, cc); |
353 | 639 | ||
354 | spin_unlock_irq(&zone->lru_lock); | 640 | if (locked) |
641 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
642 | |||
643 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
644 | if (low_pfn == end_pfn) | ||
645 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | ||
355 | 646 | ||
356 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 647 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
357 | 648 | ||
@@ -360,29 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
360 | 651 | ||
361 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 652 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
362 | #ifdef CONFIG_COMPACTION | 653 | #ifdef CONFIG_COMPACTION |
363 | |||
364 | /* Returns true if the page is within a block suitable for migration to */ | ||
365 | static bool suitable_migration_target(struct page *page) | ||
366 | { | ||
367 | |||
368 | int migratetype = get_pageblock_migratetype(page); | ||
369 | |||
370 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
371 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
372 | return false; | ||
373 | |||
374 | /* If the page is a large free page, then allow migration */ | ||
375 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
376 | return true; | ||
377 | |||
378 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
379 | if (migrate_async_suitable(migratetype)) | ||
380 | return true; | ||
381 | |||
382 | /* Otherwise skip the block */ | ||
383 | return false; | ||
384 | } | ||
385 | |||
386 | /* | 654 | /* |
387 | * Based on information in the current compact_control, find blocks | 655 | * Based on information in the current compact_control, find blocks |
388 | * suitable for isolating free pages from and then isolate them. | 656 | * suitable for isolating free pages from and then isolate them. |
@@ -392,7 +660,6 @@ static void isolate_freepages(struct zone *zone, | |||
392 | { | 660 | { |
393 | struct page *page; | 661 | struct page *page; |
394 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 662 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; |
395 | unsigned long flags; | ||
396 | int nr_freepages = cc->nr_freepages; | 663 | int nr_freepages = cc->nr_freepages; |
397 | struct list_head *freelist = &cc->freepages; | 664 | struct list_head *freelist = &cc->freepages; |
398 | 665 | ||
@@ -422,17 +689,6 @@ static void isolate_freepages(struct zone *zone, | |||
422 | pfn -= pageblock_nr_pages) { | 689 | pfn -= pageblock_nr_pages) { |
423 | unsigned long isolated; | 690 | unsigned long isolated; |
424 | 691 | ||
425 | /* | ||
426 | * Skip ahead if another thread is compacting in the area | ||
427 | * simultaneously. If we wrapped around, we can only skip | ||
428 | * ahead if zone->compact_cached_free_pfn also wrapped to | ||
429 | * above our starting point. | ||
430 | */ | ||
431 | if (cc->order > 0 && (!cc->wrapped || | ||
432 | zone->compact_cached_free_pfn > | ||
433 | cc->start_free_pfn)) | ||
434 | pfn = min(pfn, zone->compact_cached_free_pfn); | ||
435 | |||
436 | if (!pfn_valid(pfn)) | 692 | if (!pfn_valid(pfn)) |
437 | continue; | 693 | continue; |
438 | 694 | ||
@@ -451,21 +707,16 @@ static void isolate_freepages(struct zone *zone, | |||
451 | if (!suitable_migration_target(page)) | 707 | if (!suitable_migration_target(page)) |
452 | continue; | 708 | continue; |
453 | 709 | ||
454 | /* | 710 | /* If isolation recently failed, do not retry */ |
455 | * Found a block suitable for isolating free pages from. Now | 711 | if (!isolation_suitable(cc, page)) |
456 | * we disabled interrupts, double check things are ok and | 712 | continue; |
457 | * isolate the pages. This is to minimise the time IRQs | 713 | |
458 | * are disabled | 714 | /* Found a block suitable for isolating free pages from */ |
459 | */ | ||
460 | isolated = 0; | 715 | isolated = 0; |
461 | spin_lock_irqsave(&zone->lock, flags); | 716 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); |
462 | if (suitable_migration_target(page)) { | 717 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
463 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | 718 | freelist, false); |
464 | isolated = isolate_freepages_block(pfn, end_pfn, | 719 | nr_freepages += isolated; |
465 | freelist, false); | ||
466 | nr_freepages += isolated; | ||
467 | } | ||
468 | spin_unlock_irqrestore(&zone->lock, flags); | ||
469 | 720 | ||
470 | /* | 721 | /* |
471 | * Record the highest PFN we isolated pages from. When next | 722 | * Record the highest PFN we isolated pages from. When next |
@@ -473,9 +724,8 @@ static void isolate_freepages(struct zone *zone, | |||
473 | * page migration may have returned some pages to the allocator | 724 | * page migration may have returned some pages to the allocator |
474 | */ | 725 | */ |
475 | if (isolated) { | 726 | if (isolated) { |
727 | cc->finished_update_free = true; | ||
476 | high_pfn = max(high_pfn, pfn); | 728 | high_pfn = max(high_pfn, pfn); |
477 | if (cc->order > 0) | ||
478 | zone->compact_cached_free_pfn = high_pfn; | ||
479 | } | 729 | } |
480 | } | 730 | } |
481 | 731 | ||
@@ -561,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
561 | } | 811 | } |
562 | 812 | ||
563 | /* Perform the isolation */ | 813 | /* Perform the isolation */ |
564 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | 814 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); |
565 | if (!low_pfn) | 815 | if (!low_pfn || cc->contended) |
566 | return ISOLATE_ABORT; | 816 | return ISOLATE_ABORT; |
567 | 817 | ||
568 | cc->migrate_pfn = low_pfn; | 818 | cc->migrate_pfn = low_pfn; |
@@ -570,50 +820,27 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
570 | return ISOLATE_SUCCESS; | 820 | return ISOLATE_SUCCESS; |
571 | } | 821 | } |
572 | 822 | ||
573 | /* | ||
574 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
575 | * point for full compaction of a zone. Compaction searches for free pages from | ||
576 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
577 | * page block. | ||
578 | */ | ||
579 | static unsigned long start_free_pfn(struct zone *zone) | ||
580 | { | ||
581 | unsigned long free_pfn; | ||
582 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
583 | free_pfn &= ~(pageblock_nr_pages-1); | ||
584 | return free_pfn; | ||
585 | } | ||
586 | |||
587 | static int compact_finished(struct zone *zone, | 823 | static int compact_finished(struct zone *zone, |
588 | struct compact_control *cc) | 824 | struct compact_control *cc) |
589 | { | 825 | { |
590 | unsigned int order; | ||
591 | unsigned long watermark; | 826 | unsigned long watermark; |
592 | 827 | ||
593 | if (fatal_signal_pending(current)) | 828 | if (fatal_signal_pending(current)) |
594 | return COMPACT_PARTIAL; | 829 | return COMPACT_PARTIAL; |
595 | 830 | ||
596 | /* | 831 | /* Compaction run completes if the migrate and free scanner meet */ |
597 | * A full (order == -1) compaction run starts at the beginning and | ||
598 | * end of a zone; it completes when the migrate and free scanner meet. | ||
599 | * A partial (order > 0) compaction can start with the free scanner | ||
600 | * at a random point in the zone, and may have to restart. | ||
601 | */ | ||
602 | if (cc->free_pfn <= cc->migrate_pfn) { | 832 | if (cc->free_pfn <= cc->migrate_pfn) { |
603 | if (cc->order > 0 && !cc->wrapped) { | 833 | /* |
604 | /* We started partway through; restart at the end. */ | 834 | * Mark that the PG_migrate_skip information should be cleared |
605 | unsigned long free_pfn = start_free_pfn(zone); | 835 | * by kswapd when it goes to sleep. kswapd does not set the |
606 | zone->compact_cached_free_pfn = free_pfn; | 836 | * flag itself as the decision to be clear should be directly |
607 | cc->free_pfn = free_pfn; | 837 | * based on an allocation request. |
608 | cc->wrapped = 1; | 838 | */ |
609 | return COMPACT_CONTINUE; | 839 | if (!current_is_kswapd()) |
610 | } | 840 | zone->compact_blockskip_flush = true; |
611 | return COMPACT_COMPLETE; | ||
612 | } | ||
613 | 841 | ||
614 | /* We wrapped around and ended up where we started. */ | ||
615 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
616 | return COMPACT_COMPLETE; | 842 | return COMPACT_COMPLETE; |
843 | } | ||
617 | 844 | ||
618 | /* | 845 | /* |
619 | * order == -1 is expected when compacting via | 846 | * order == -1 is expected when compacting via |
@@ -630,14 +857,22 @@ static int compact_finished(struct zone *zone, | |||
630 | return COMPACT_CONTINUE; | 857 | return COMPACT_CONTINUE; |
631 | 858 | ||
632 | /* Direct compactor: Is a suitable page free? */ | 859 | /* Direct compactor: Is a suitable page free? */ |
633 | for (order = cc->order; order < MAX_ORDER; order++) { | 860 | if (cc->page) { |
634 | /* Job done if page is free of the right migratetype */ | 861 | /* Was a suitable page captured? */ |
635 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) | 862 | if (*cc->page) |
636 | return COMPACT_PARTIAL; | ||
637 | |||
638 | /* Job done if allocation would set block type */ | ||
639 | if (order >= pageblock_order && zone->free_area[order].nr_free) | ||
640 | return COMPACT_PARTIAL; | 863 | return COMPACT_PARTIAL; |
864 | } else { | ||
865 | unsigned int order; | ||
866 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
867 | struct free_area *area = &zone->free_area[cc->order]; | ||
868 | /* Job done if page is free of the right migratetype */ | ||
869 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
870 | return COMPACT_PARTIAL; | ||
871 | |||
872 | /* Job done if allocation would set block type */ | ||
873 | if (cc->order >= pageblock_order && area->nr_free) | ||
874 | return COMPACT_PARTIAL; | ||
875 | } | ||
641 | } | 876 | } |
642 | 877 | ||
643 | return COMPACT_CONTINUE; | 878 | return COMPACT_CONTINUE; |
@@ -696,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
696 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 931 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
697 | { | 932 | { |
698 | int ret; | 933 | int ret; |
934 | unsigned long start_pfn = zone->zone_start_pfn; | ||
935 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
699 | 936 | ||
700 | ret = compaction_suitable(zone, cc->order); | 937 | ret = compaction_suitable(zone, cc->order); |
701 | switch (ret) { | 938 | switch (ret) { |
@@ -708,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
708 | ; | 945 | ; |
709 | } | 946 | } |
710 | 947 | ||
711 | /* Setup to move all movable pages to the end of the zone */ | 948 | /* |
712 | cc->migrate_pfn = zone->zone_start_pfn; | 949 | * Setup to move all movable pages to the end of the zone. Used cached |
713 | 950 | * information on where the scanners should start but check that it | |
714 | if (cc->order > 0) { | 951 | * is initialised by ensuring the values are within zone boundaries. |
715 | /* Incremental compaction. Start where the last one stopped. */ | 952 | */ |
716 | cc->free_pfn = zone->compact_cached_free_pfn; | 953 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; |
717 | cc->start_free_pfn = cc->free_pfn; | 954 | cc->free_pfn = zone->compact_cached_free_pfn; |
718 | } else { | 955 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
719 | /* Order == -1 starts at the end of the zone. */ | 956 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
720 | cc->free_pfn = start_free_pfn(zone); | 957 | zone->compact_cached_free_pfn = cc->free_pfn; |
958 | } | ||
959 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | ||
960 | cc->migrate_pfn = start_pfn; | ||
961 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | ||
721 | } | 962 | } |
722 | 963 | ||
964 | /* | ||
965 | * Clear pageblock skip if there were failures recently and compaction | ||
966 | * is about to be retried after being deferred. kswapd does not do | ||
967 | * this reset as it'll reset the cached information when going to sleep. | ||
968 | */ | ||
969 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
970 | __reset_isolation_suitable(zone); | ||
971 | |||
723 | migrate_prep_local(); | 972 | migrate_prep_local(); |
724 | 973 | ||
725 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 974 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
@@ -729,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
729 | switch (isolate_migratepages(zone, cc)) { | 978 | switch (isolate_migratepages(zone, cc)) { |
730 | case ISOLATE_ABORT: | 979 | case ISOLATE_ABORT: |
731 | ret = COMPACT_PARTIAL; | 980 | ret = COMPACT_PARTIAL; |
981 | putback_lru_pages(&cc->migratepages); | ||
982 | cc->nr_migratepages = 0; | ||
732 | goto out; | 983 | goto out; |
733 | case ISOLATE_NONE: | 984 | case ISOLATE_NONE: |
734 | continue; | 985 | continue; |
@@ -759,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
759 | goto out; | 1010 | goto out; |
760 | } | 1011 | } |
761 | } | 1012 | } |
1013 | |||
1014 | /* Capture a page now if it is a suitable size */ | ||
1015 | compact_capture_page(cc); | ||
762 | } | 1016 | } |
763 | 1017 | ||
764 | out: | 1018 | out: |
@@ -771,8 +1025,10 @@ out: | |||
771 | 1025 | ||
772 | static unsigned long compact_zone_order(struct zone *zone, | 1026 | static unsigned long compact_zone_order(struct zone *zone, |
773 | int order, gfp_t gfp_mask, | 1027 | int order, gfp_t gfp_mask, |
774 | bool sync) | 1028 | bool sync, bool *contended, |
1029 | struct page **page) | ||
775 | { | 1030 | { |
1031 | unsigned long ret; | ||
776 | struct compact_control cc = { | 1032 | struct compact_control cc = { |
777 | .nr_freepages = 0, | 1033 | .nr_freepages = 0, |
778 | .nr_migratepages = 0, | 1034 | .nr_migratepages = 0, |
@@ -780,11 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
780 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1036 | .migratetype = allocflags_to_migratetype(gfp_mask), |
781 | .zone = zone, | 1037 | .zone = zone, |
782 | .sync = sync, | 1038 | .sync = sync, |
1039 | .page = page, | ||
783 | }; | 1040 | }; |
784 | INIT_LIST_HEAD(&cc.freepages); | 1041 | INIT_LIST_HEAD(&cc.freepages); |
785 | INIT_LIST_HEAD(&cc.migratepages); | 1042 | INIT_LIST_HEAD(&cc.migratepages); |
786 | 1043 | ||
787 | return compact_zone(zone, &cc); | 1044 | ret = compact_zone(zone, &cc); |
1045 | |||
1046 | VM_BUG_ON(!list_empty(&cc.freepages)); | ||
1047 | VM_BUG_ON(!list_empty(&cc.migratepages)); | ||
1048 | |||
1049 | *contended = cc.contended; | ||
1050 | return ret; | ||
788 | } | 1051 | } |
789 | 1052 | ||
790 | int sysctl_extfrag_threshold = 500; | 1053 | int sysctl_extfrag_threshold = 500; |
@@ -796,12 +1059,14 @@ int sysctl_extfrag_threshold = 500; | |||
796 | * @gfp_mask: The GFP mask of the current allocation | 1059 | * @gfp_mask: The GFP mask of the current allocation |
797 | * @nodemask: The allowed nodes to allocate from | 1060 | * @nodemask: The allowed nodes to allocate from |
798 | * @sync: Whether migration is synchronous or not | 1061 | * @sync: Whether migration is synchronous or not |
1062 | * @contended: Return value that is true if compaction was aborted due to lock contention | ||
1063 | * @page: Optionally capture a free page of the requested order during compaction | ||
799 | * | 1064 | * |
800 | * This is the main entry point for direct page compaction. | 1065 | * This is the main entry point for direct page compaction. |
801 | */ | 1066 | */ |
802 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1067 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
803 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1068 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
804 | bool sync) | 1069 | bool sync, bool *contended, struct page **page) |
805 | { | 1070 | { |
806 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1071 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
807 | int may_enter_fs = gfp_mask & __GFP_FS; | 1072 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -809,27 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
809 | struct zoneref *z; | 1074 | struct zoneref *z; |
810 | struct zone *zone; | 1075 | struct zone *zone; |
811 | int rc = COMPACT_SKIPPED; | 1076 | int rc = COMPACT_SKIPPED; |
1077 | int alloc_flags = 0; | ||
812 | 1078 | ||
813 | /* | 1079 | /* Check if the GFP flags allow compaction */ |
814 | * Check whether it is worth even starting compaction. The order check is | ||
815 | * made because an assumption is made that the page allocator can satisfy | ||
816 | * the "cheaper" orders without taking special steps | ||
817 | */ | ||
818 | if (!order || !may_enter_fs || !may_perform_io) | 1080 | if (!order || !may_enter_fs || !may_perform_io) |
819 | return rc; | 1081 | return rc; |
820 | 1082 | ||
821 | count_vm_event(COMPACTSTALL); | 1083 | count_vm_event(COMPACTSTALL); |
822 | 1084 | ||
1085 | #ifdef CONFIG_CMA | ||
1086 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
1087 | alloc_flags |= ALLOC_CMA; | ||
1088 | #endif | ||
823 | /* Compact each zone in the list */ | 1089 | /* Compact each zone in the list */ |
824 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1090 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
825 | nodemask) { | 1091 | nodemask) { |
826 | int status; | 1092 | int status; |
827 | 1093 | ||
828 | status = compact_zone_order(zone, order, gfp_mask, sync); | 1094 | status = compact_zone_order(zone, order, gfp_mask, sync, |
1095 | contended, page); | ||
829 | rc = max(status, rc); | 1096 | rc = max(status, rc); |
830 | 1097 | ||
831 | /* If a normal allocation would succeed, stop compacting */ | 1098 | /* If a normal allocation would succeed, stop compacting */ |
832 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | 1099 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, |
1100 | alloc_flags)) | ||
833 | break; | 1101 | break; |
834 | } | 1102 | } |
835 | 1103 | ||
@@ -861,7 +1129,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
861 | if (cc->order > 0) { | 1129 | if (cc->order > 0) { |
862 | int ok = zone_watermark_ok(zone, cc->order, | 1130 | int ok = zone_watermark_ok(zone, cc->order, |
863 | low_wmark_pages(zone), 0, 0); | 1131 | low_wmark_pages(zone), 0, 0); |
864 | if (ok && cc->order > zone->compact_order_failed) | 1132 | if (ok && cc->order >= zone->compact_order_failed) |
865 | zone->compact_order_failed = cc->order + 1; | 1133 | zone->compact_order_failed = cc->order + 1; |
866 | /* Currently async compaction is never deferred. */ | 1134 | /* Currently async compaction is never deferred. */ |
867 | else if (!ok && cc->sync) | 1135 | else if (!ok && cc->sync) |
@@ -880,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) | |||
880 | struct compact_control cc = { | 1148 | struct compact_control cc = { |
881 | .order = order, | 1149 | .order = order, |
882 | .sync = false, | 1150 | .sync = false, |
1151 | .page = NULL, | ||
883 | }; | 1152 | }; |
884 | 1153 | ||
885 | return __compact_pgdat(pgdat, &cc); | 1154 | return __compact_pgdat(pgdat, &cc); |
@@ -890,6 +1159,7 @@ static int compact_node(int nid) | |||
890 | struct compact_control cc = { | 1159 | struct compact_control cc = { |
891 | .order = -1, | 1160 | .order = -1, |
892 | .sync = true, | 1161 | .sync = true, |
1162 | .page = NULL, | ||
893 | }; | 1163 | }; |
894 | 1164 | ||
895 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1165 | return __compact_pgdat(NODE_DATA(nid), &cc); |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf..a47f0f50c89 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,7 @@ | |||
26 | */ | 26 | */ |
27 | SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | 27 | SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) |
28 | { | 28 | { |
29 | struct file *file = fget(fd); | 29 | struct fd f = fdget(fd); |
30 | struct address_space *mapping; | 30 | struct address_space *mapping; |
31 | struct backing_dev_info *bdi; | 31 | struct backing_dev_info *bdi; |
32 | loff_t endbyte; /* inclusive */ | 32 | loff_t endbyte; /* inclusive */ |
@@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
35 | unsigned long nrpages; | 35 | unsigned long nrpages; |
36 | int ret = 0; | 36 | int ret = 0; |
37 | 37 | ||
38 | if (!file) | 38 | if (!f.file) |
39 | return -EBADF; | 39 | return -EBADF; |
40 | 40 | ||
41 | if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { | 41 | if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { |
42 | ret = -ESPIPE; | 42 | ret = -ESPIPE; |
43 | goto out; | 43 | goto out; |
44 | } | 44 | } |
45 | 45 | ||
46 | mapping = file->f_mapping; | 46 | mapping = f.file->f_mapping; |
47 | if (!mapping || len < 0) { | 47 | if (!mapping || len < 0) { |
48 | ret = -EINVAL; | 48 | ret = -EINVAL; |
49 | goto out; | 49 | goto out; |
@@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
76 | 76 | ||
77 | switch (advice) { | 77 | switch (advice) { |
78 | case POSIX_FADV_NORMAL: | 78 | case POSIX_FADV_NORMAL: |
79 | file->f_ra.ra_pages = bdi->ra_pages; | 79 | f.file->f_ra.ra_pages = bdi->ra_pages; |
80 | spin_lock(&file->f_lock); | 80 | spin_lock(&f.file->f_lock); |
81 | file->f_mode &= ~FMODE_RANDOM; | 81 | f.file->f_mode &= ~FMODE_RANDOM; |
82 | spin_unlock(&file->f_lock); | 82 | spin_unlock(&f.file->f_lock); |
83 | break; | 83 | break; |
84 | case POSIX_FADV_RANDOM: | 84 | case POSIX_FADV_RANDOM: |
85 | spin_lock(&file->f_lock); | 85 | spin_lock(&f.file->f_lock); |
86 | file->f_mode |= FMODE_RANDOM; | 86 | f.file->f_mode |= FMODE_RANDOM; |
87 | spin_unlock(&file->f_lock); | 87 | spin_unlock(&f.file->f_lock); |
88 | break; | 88 | break; |
89 | case POSIX_FADV_SEQUENTIAL: | 89 | case POSIX_FADV_SEQUENTIAL: |
90 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 90 | f.file->f_ra.ra_pages = bdi->ra_pages * 2; |
91 | spin_lock(&file->f_lock); | 91 | spin_lock(&f.file->f_lock); |
92 | file->f_mode &= ~FMODE_RANDOM; | 92 | f.file->f_mode &= ~FMODE_RANDOM; |
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&f.file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
@@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | * Ignore return value because fadvise() shall return | 106 | * Ignore return value because fadvise() shall return |
107 | * success even if filesystem can't retrieve a hint, | 107 | * success even if filesystem can't retrieve a hint, |
108 | */ | 108 | */ |
109 | force_page_cache_readahead(mapping, file, start_index, | 109 | force_page_cache_readahead(mapping, f.file, start_index, |
110 | nrpages); | 110 | nrpages); |
111 | break; | 111 | break; |
112 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
@@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
128 | ret = -EINVAL; | 128 | ret = -EINVAL; |
129 | } | 129 | } |
130 | out: | 130 | out: |
131 | fput(file); | 131 | fdput(f); |
132 | return ret; | 132 | return ret; |
133 | } | 133 | } |
134 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | 134 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS |
diff --git a/mm/filemap.c b/mm/filemap.c
index fa5ca304148..83efee76a5c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1412 | retval = filemap_write_and_wait_range(mapping, pos, | 1412 | retval = filemap_write_and_wait_range(mapping, pos, |
1413 | pos + iov_length(iov, nr_segs) - 1); | 1413 | pos + iov_length(iov, nr_segs) - 1); |
1414 | if (!retval) { | 1414 | if (!retval) { |
1415 | struct blk_plug plug; | ||
1416 | |||
1417 | blk_start_plug(&plug); | ||
1418 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1415 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1419 | iov, pos, nr_segs); | 1416 | iov, pos, nr_segs); |
1420 | blk_finish_plug(&plug); | ||
1421 | } | 1417 | } |
1422 | if (retval > 0) { | 1418 | if (retval > 0) { |
1423 | *ppos = pos + retval; | 1419 | *ppos = pos + retval; |
@@ -1611,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1611 | * Do we have something in the page cache already? | 1607 | * Do we have something in the page cache already? |
1612 | */ | 1608 | */ |
1613 | page = find_get_page(mapping, offset); | 1609 | page = find_get_page(mapping, offset); |
1614 | if (likely(page)) { | 1610 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
1615 | /* | 1611 | /* |
1616 | * We found the page, so try async readahead before | 1612 | * We found the page, so try async readahead before |
1617 | * waiting for the lock. | 1613 | * waiting for the lock. |
1618 | */ | 1614 | */ |
1619 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1615 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1620 | } else { | 1616 | } else if (!page) { |
1621 | /* No page in the page cache at all */ | 1617 | /* No page in the page cache at all */ |
1622 | do_sync_mmap_readahead(vma, ra, file, offset); | 1618 | do_sync_mmap_readahead(vma, ra, file, offset); |
1623 | count_vm_event(PGMAJFAULT); | 1619 | count_vm_event(PGMAJFAULT); |
@@ -1741,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); | |||
1741 | const struct vm_operations_struct generic_file_vm_ops = { | 1737 | const struct vm_operations_struct generic_file_vm_ops = { |
1742 | .fault = filemap_fault, | 1738 | .fault = filemap_fault, |
1743 | .page_mkwrite = filemap_page_mkwrite, | 1739 | .page_mkwrite = filemap_page_mkwrite, |
1740 | .remap_pages = generic_file_remap_pages, | ||
1744 | }; | 1741 | }; |
1745 | 1742 | ||
1746 | /* This is used for a general mmap of a disk file */ | 1743 | /* This is used for a general mmap of a disk file */ |
@@ -1753,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
1753 | return -ENOEXEC; | 1750 | return -ENOEXEC; |
1754 | file_accessed(file); | 1751 | file_accessed(file); |
1755 | vma->vm_ops = &generic_file_vm_ops; | 1752 | vma->vm_ops = &generic_file_vm_ops; |
1756 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1757 | return 0; | 1753 | return 0; |
1758 | } | 1754 | } |
1759 | 1755 | ||
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2527 | { | 2523 | { |
2528 | struct file *file = iocb->ki_filp; | 2524 | struct file *file = iocb->ki_filp; |
2529 | struct inode *inode = file->f_mapping->host; | 2525 | struct inode *inode = file->f_mapping->host; |
2530 | struct blk_plug plug; | ||
2531 | ssize_t ret; | 2526 | ssize_t ret; |
2532 | 2527 | ||
2533 | BUG_ON(iocb->ki_pos != pos); | 2528 | BUG_ON(iocb->ki_pos != pos); |
2534 | 2529 | ||
2535 | sb_start_write(inode->i_sb); | 2530 | sb_start_write(inode->i_sb); |
2536 | mutex_lock(&inode->i_mutex); | 2531 | mutex_lock(&inode->i_mutex); |
2537 | blk_start_plug(&plug); | ||
2538 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 2532 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
2539 | mutex_unlock(&inode->i_mutex); | 2533 | mutex_unlock(&inode->i_mutex); |
2540 | 2534 | ||
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2545 | if (err < 0 && ret > 0) | 2539 | if (err < 0 && ret > 0) |
2546 | ret = err; | 2540 | ret = err; |
2547 | } | 2541 | } |
2548 | blk_finish_plug(&plug); | ||
2549 | sb_end_write(inode->i_sb); | 2542 | sb_end_write(inode->i_sb); |
2550 | return ret; | 2543 | return ret; |
2551 | } | 2544 | } |
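The two filemap.c hunks above also drop the on-stack block plug around ->direct_IO() and __generic_file_aio_write(); presumably the plugging is handled further down the stack, so the wrapper added nothing here. The removed pattern in isolation (plugged_io_example() is illustrative; <linux/blkdev.h> assumed):

	void plugged_io_example(void)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		/* bios submitted here are batched on the plug */
		blk_finish_plug(&plug);		/* flush the batched requests */
	}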
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b1270..a912da6ddfd 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, | |||
167 | { | 167 | { |
168 | struct vm_area_struct *vma; | 168 | struct vm_area_struct *vma; |
169 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
170 | struct prio_tree_iter iter; | ||
171 | unsigned long address; | 170 | unsigned long address; |
172 | pte_t *pte; | 171 | pte_t *pte; |
173 | pte_t pteval; | 172 | pte_t pteval; |
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, | |||
184 | 183 | ||
185 | retry: | 184 | retry: |
186 | mutex_lock(&mapping->i_mmap_mutex); | 185 | mutex_lock(&mapping->i_mmap_mutex); |
187 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 186 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
188 | mm = vma->vm_mm; | 187 | mm = vma->vm_mm; |
189 | address = vma->vm_start + | 188 | address = vma->vm_start + |
190 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
@@ -193,11 +192,13 @@ retry: | |||
193 | if (pte) { | 192 | if (pte) { |
194 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
195 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
196 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush(vma, address, pte); |
197 | page_remove_rmap(page); | 196 | page_remove_rmap(page); |
198 | dec_mm_counter(mm, MM_FILEPAGES); | 197 | dec_mm_counter(mm, MM_FILEPAGES); |
199 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
200 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
200 | /* must invalidate_page _before_ freeing the page */ | ||
201 | mmu_notifier_invalidate_page(mm, address); | ||
201 | page_cache_release(page); | 202 | page_cache_release(page); |
202 | } | 203 | } |
203 | } | 204 | } |
@@ -305,6 +306,7 @@ out: | |||
305 | static const struct vm_operations_struct xip_file_vm_ops = { | 306 | static const struct vm_operations_struct xip_file_vm_ops = { |
306 | .fault = xip_file_fault, | 307 | .fault = xip_file_fault, |
307 | .page_mkwrite = filemap_page_mkwrite, | 308 | .page_mkwrite = filemap_page_mkwrite, |
309 | .remap_pages = generic_file_remap_pages, | ||
308 | }; | 310 | }; |
309 | 311 | ||
310 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 312 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
313 | 315 | ||
314 | file_accessed(file); | 316 | file_accessed(file); |
315 | vma->vm_ops = &xip_file_vm_ops; | 317 | vma->vm_ops = &xip_file_vm_ops; |
316 | vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; | 318 | vma->vm_flags |= VM_MIXEDMAP; |
317 | return 0; | 319 | return 0; |
318 | } | 320 | } |
319 | EXPORT_SYMBOL_GPL(xip_file_mmap); | 321 | EXPORT_SYMBOL_GPL(xip_file_mmap); |
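Two things change in __xip_unmap() above: the prio_tree walk over mapping->i_mmap becomes an interval-tree walk (no separate iterator variable), and the combined ptep_clear_flush_notify() is split into ptep_clear_flush() plus an explicit mmu_notifier_invalidate_page() issued only after the PTE lock is dropped. A sketch of the new iteration pattern (walk_mappers() is a made-up helper; locking as in the kernel of this era):

	static void walk_mappers(struct address_space *mapping, pgoff_t pgoff)
	{
		struct vm_area_struct *vma;

		mutex_lock(&mapping->i_mmap_mutex);
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
			/* every vma returned here maps page index pgoff */
		}
		mutex_unlock(&mapping->i_mmap_mutex);
	}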
diff --git a/mm/fremap.c b/mm/fremap.c index 9ed4fd43246..3899a86851c 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | #include <linux/export.h> | ||
8 | #include <linux/backing-dev.h> | 9 | #include <linux/backing-dev.h> |
9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
@@ -80,9 +81,10 @@ out: | |||
80 | return err; | 81 | return err; |
81 | } | 82 | } |
82 | 83 | ||
83 | static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | 84 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, |
84 | unsigned long addr, unsigned long size, pgoff_t pgoff) | 85 | unsigned long size, pgoff_t pgoff) |
85 | { | 86 | { |
87 | struct mm_struct *mm = vma->vm_mm; | ||
86 | int err; | 88 | int err; |
87 | 89 | ||
88 | do { | 90 | do { |
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
95 | pgoff++; | 97 | pgoff++; |
96 | } while (size); | 98 | } while (size); |
97 | 99 | ||
98 | return 0; | 100 | return 0; |
99 | |||
100 | } | 101 | } |
102 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
101 | 103 | ||
102 | /** | 104 | /** |
103 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | 105 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
167 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | 169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) |
168 | goto out; | 170 | goto out; |
169 | 171 | ||
170 | if (!(vma->vm_flags & VM_CAN_NONLINEAR)) | 172 | if (!vma->vm_ops->remap_pages) |
171 | goto out; | 173 | goto out; |
172 | 174 | ||
173 | if (start < vma->vm_start || start + size > vma->vm_end) | 175 | if (start < vma->vm_start || start + size > vma->vm_end) |
@@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
195 | */ | 197 | */ |
196 | if (mapping_cap_account_dirty(mapping)) { | 198 | if (mapping_cap_account_dirty(mapping)) { |
197 | unsigned long addr; | 199 | unsigned long addr; |
198 | struct file *file = vma->vm_file; | 200 | struct file *file = get_file(vma->vm_file); |
199 | 201 | ||
200 | flags &= MAP_NONBLOCK; | 202 | flags &= MAP_NONBLOCK; |
201 | get_file(file); | ||
202 | addr = mmap_region(file, start, size, | 203 | addr = mmap_region(file, start, size, |
203 | flags, vma->vm_flags, pgoff); | 204 | flags, vma->vm_flags, pgoff); |
204 | fput(file); | 205 | fput(file); |
@@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
213 | mutex_lock(&mapping->i_mmap_mutex); | 214 | mutex_lock(&mapping->i_mmap_mutex); |
214 | flush_dcache_mmap_lock(mapping); | 215 | flush_dcache_mmap_lock(mapping); |
215 | vma->vm_flags |= VM_NONLINEAR; | 216 | vma->vm_flags |= VM_NONLINEAR; |
216 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 217 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
217 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 218 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
218 | flush_dcache_mmap_unlock(mapping); | 219 | flush_dcache_mmap_unlock(mapping); |
219 | mutex_unlock(&mapping->i_mmap_mutex); | 220 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
229 | } | 230 | } |
230 | 231 | ||
231 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 232 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
232 | err = populate_range(mm, vma, start, size, pgoff); | 233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
233 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 234 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
234 | if (!err && !(flags & MAP_NONBLOCK)) { | 235 | if (!err && !(flags & MAP_NONBLOCK)) { |
235 | if (vma->vm_flags & VM_LOCKED) { | 236 | if (vma->vm_flags & VM_LOCKED) { |
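The fremap.c changes above turn nonlinear-remap support from a VM_CAN_NONLINEAR flag into a ->remap_pages method on vm_operations_struct: populate_range() becomes the exported generic_file_remap_pages(), and sys_remap_file_pages() dispatches through vma->vm_ops->remap_pages(). A filesystem happy with the generic behaviour opts in the same way the filemap.c and filemap_xip.c hunks do (example_file_vm_ops is an illustrative name):

	static const struct vm_operations_struct example_file_vm_ops = {
		.fault		= filemap_fault,
		.page_mkwrite	= filemap_page_mkwrite,
		.remap_pages	= generic_file_remap_pages,
	};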
diff --git a/mm/frontswap.c b/mm/frontswap.c index 6b3e71a2cd4..2890e67d602 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled); | |||
44 | */ | 44 | */ |
45 | static bool frontswap_writethrough_enabled __read_mostly; | 45 | static bool frontswap_writethrough_enabled __read_mostly; |
46 | 46 | ||
47 | /* | ||
48 | * If enabled, the underlying tmem implementation is capable of doing | ||
49 | * exclusive gets, so frontswap_load, on a successful tmem_get must | ||
50 | * mark the page as no longer in frontswap AND mark it dirty. | ||
51 | */ | ||
52 | static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; | ||
53 | |||
47 | #ifdef CONFIG_DEBUG_FS | 54 | #ifdef CONFIG_DEBUG_FS |
48 | /* | 55 | /* |
49 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | 56 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is |
@@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable) | |||
97 | EXPORT_SYMBOL(frontswap_writethrough); | 104 | EXPORT_SYMBOL(frontswap_writethrough); |
98 | 105 | ||
99 | /* | 106 | /* |
107 | * Enable/disable frontswap exclusive gets (see above). | ||
108 | */ | ||
109 | void frontswap_tmem_exclusive_gets(bool enable) | ||
110 | { | ||
111 | frontswap_tmem_exclusive_gets_enabled = enable; | ||
112 | } | ||
113 | EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); | ||
114 | |||
115 | /* | ||
100 | * Called when a swap device is swapon'd. | 116 | * Called when a swap device is swapon'd. |
101 | */ | 117 | */ |
102 | void __frontswap_init(unsigned type) | 118 | void __frontswap_init(unsigned type) |
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page) | |||
174 | BUG_ON(sis == NULL); | 190 | BUG_ON(sis == NULL); |
175 | if (frontswap_test(sis, offset)) | 191 | if (frontswap_test(sis, offset)) |
176 | ret = frontswap_ops.load(type, offset, page); | 192 | ret = frontswap_ops.load(type, offset, page); |
177 | if (ret == 0) | 193 | if (ret == 0) { |
178 | inc_frontswap_loads(); | 194 | inc_frontswap_loads(); |
195 | if (frontswap_tmem_exclusive_gets_enabled) { | ||
196 | SetPageDirty(page); | ||
197 | frontswap_clear(sis, offset); | ||
198 | } | ||
199 | } | ||
179 | return ret; | 200 | return ret; |
180 | } | 201 | } |
181 | EXPORT_SYMBOL(__frontswap_load); | 202 | EXPORT_SYMBOL(__frontswap_load); |
@@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
263 | return ret; | 284 | return ret; |
264 | } | 285 | } |
265 | 286 | ||
287 | /* | ||
288 | * Used to check if it's necessary and feasible to unuse pages. | ||
289 | * Return 1 when there is nothing to do, 0 when pages need to be shrunk, | ||
290 | * error code when there is an error. | ||
291 | */ | ||
266 | static int __frontswap_shrink(unsigned long target_pages, | 292 | static int __frontswap_shrink(unsigned long target_pages, |
267 | unsigned long *pages_to_unuse, | 293 | unsigned long *pages_to_unuse, |
268 | int *type) | 294 | int *type) |
@@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages, | |||
275 | if (total_pages <= target_pages) { | 301 | if (total_pages <= target_pages) { |
276 | /* Nothing to do */ | 302 | /* Nothing to do */ |
277 | *pages_to_unuse = 0; | 303 | *pages_to_unuse = 0; |
278 | return 0; | 304 | return 1; |
279 | } | 305 | } |
280 | total_pages_to_unuse = total_pages - target_pages; | 306 | total_pages_to_unuse = total_pages - target_pages; |
281 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); | 307 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); |
@@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages, | |||
292 | void frontswap_shrink(unsigned long target_pages) | 318 | void frontswap_shrink(unsigned long target_pages) |
293 | { | 319 | { |
294 | unsigned long pages_to_unuse = 0; | 320 | unsigned long pages_to_unuse = 0; |
295 | int type, ret; | 321 | int uninitialized_var(type), ret; |
296 | 322 | ||
297 | /* | 323 | /* |
298 | * we don't want to hold swap_lock while doing a very | 324 | * we don't want to hold swap_lock while doing a very |
@@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
302 | spin_lock(&swap_lock); | 328 | spin_lock(&swap_lock); |
303 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 329 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
304 | spin_unlock(&swap_lock); | 330 | spin_unlock(&swap_lock); |
305 | if (ret == 0 && pages_to_unuse) | 331 | if (ret == 0) |
306 | try_to_unuse(type, true, pages_to_unuse); | 332 | try_to_unuse(type, true, pages_to_unuse); |
307 | return; | 333 | return; |
308 | } | 334 | } |
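The frontswap hunks above add an "exclusive gets" mode: a tmem backend whose get also invalidates its own copy calls frontswap_tmem_exclusive_gets(true), after which a successful __frontswap_load() marks the page dirty (it must be written back again) and clears the frontswap bit (the backend copy is gone). A sketch of the backend side (backend_init() is a made-up function):

	static void backend_init(void)
	{
		/* this backend's get also drops its copy of the page */
		frontswap_tmem_exclusive_gets(true);
	}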
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57c4b930901..a863af26c79 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) | |||
102 | unsigned long recommended_min; | 102 | unsigned long recommended_min; |
103 | extern int min_free_kbytes; | 103 | extern int min_free_kbytes; |
104 | 104 | ||
105 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | 105 | if (!khugepaged_enabled()) |
106 | &transparent_hugepage_flags) && | ||
107 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
108 | &transparent_hugepage_flags)) | ||
109 | return 0; | 106 | return 0; |
110 | 107 | ||
111 | for_each_populated_zone(zone) | 108 | for_each_populated_zone(zone) |
@@ -139,12 +136,6 @@ static int start_khugepaged(void) | |||
139 | { | 136 | { |
140 | int err = 0; | 137 | int err = 0; |
141 | if (khugepaged_enabled()) { | 138 | if (khugepaged_enabled()) { |
142 | int wakeup; | ||
143 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
144 | err = -ENOMEM; | ||
145 | goto out; | ||
146 | } | ||
147 | mutex_lock(&khugepaged_mutex); | ||
148 | if (!khugepaged_thread) | 139 | if (!khugepaged_thread) |
149 | khugepaged_thread = kthread_run(khugepaged, NULL, | 140 | khugepaged_thread = kthread_run(khugepaged, NULL, |
150 | "khugepaged"); | 141 | "khugepaged"); |
@@ -154,16 +145,16 @@ static int start_khugepaged(void) | |||
154 | err = PTR_ERR(khugepaged_thread); | 145 | err = PTR_ERR(khugepaged_thread); |
155 | khugepaged_thread = NULL; | 146 | khugepaged_thread = NULL; |
156 | } | 147 | } |
157 | wakeup = !list_empty(&khugepaged_scan.mm_head); | 148 | |
158 | mutex_unlock(&khugepaged_mutex); | 149 | if (!list_empty(&khugepaged_scan.mm_head)) |
159 | if (wakeup) | ||
160 | wake_up_interruptible(&khugepaged_wait); | 150 | wake_up_interruptible(&khugepaged_wait); |
161 | 151 | ||
162 | set_recommended_min_free_kbytes(); | 152 | set_recommended_min_free_kbytes(); |
163 | } else | 153 | } else if (khugepaged_thread) { |
164 | /* wakeup to exit */ | 154 | kthread_stop(khugepaged_thread); |
165 | wake_up_interruptible(&khugepaged_wait); | 155 | khugepaged_thread = NULL; |
166 | out: | 156 | } |
157 | |||
167 | return err; | 158 | return err; |
168 | } | 159 | } |
169 | 160 | ||
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
224 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 215 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
225 | 216 | ||
226 | if (ret > 0) { | 217 | if (ret > 0) { |
227 | int err = start_khugepaged(); | 218 | int err; |
219 | |||
220 | mutex_lock(&khugepaged_mutex); | ||
221 | err = start_khugepaged(); | ||
222 | mutex_unlock(&khugepaged_mutex); | ||
223 | |||
228 | if (err) | 224 | if (err) |
229 | ret = err; | 225 | ret = err; |
230 | } | 226 | } |
231 | 227 | ||
232 | if (ret > 0 && | ||
233 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
234 | &transparent_hugepage_flags) || | ||
235 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
236 | &transparent_hugepage_flags))) | ||
237 | set_recommended_min_free_kbytes(); | ||
238 | |||
239 | return ret; | 228 | return ret; |
240 | } | 229 | } |
241 | static struct kobj_attribute enabled_attr = | 230 | static struct kobj_attribute enabled_attr = |
@@ -570,8 +559,6 @@ static int __init hugepage_init(void) | |||
570 | 559 | ||
571 | start_khugepaged(); | 560 | start_khugepaged(); |
572 | 561 | ||
573 | set_recommended_min_free_kbytes(); | ||
574 | |||
575 | return 0; | 562 | return 0; |
576 | out: | 563 | out: |
577 | hugepage_exit_sysfs(hugepage_kobj); | 564 | hugepage_exit_sysfs(hugepage_kobj); |
@@ -611,19 +598,6 @@ out: | |||
611 | } | 598 | } |
612 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 599 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
613 | 600 | ||
614 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
615 | struct mm_struct *mm) | ||
616 | { | ||
617 | assert_spin_locked(&mm->page_table_lock); | ||
618 | |||
619 | /* FIFO */ | ||
620 | if (!mm->pmd_huge_pte) | ||
621 | INIT_LIST_HEAD(&pgtable->lru); | ||
622 | else | ||
623 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
624 | mm->pmd_huge_pte = pgtable; | ||
625 | } | ||
626 | |||
627 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 601 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
628 | { | 602 | { |
629 | if (likely(vma->vm_flags & VM_WRITE)) | 603 | if (likely(vma->vm_flags & VM_WRITE)) |
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
665 | */ | 639 | */ |
666 | page_add_new_anon_rmap(page, vma, haddr); | 640 | page_add_new_anon_rmap(page, vma, haddr); |
667 | set_pmd_at(mm, haddr, pmd, entry); | 641 | set_pmd_at(mm, haddr, pmd, entry); |
668 | prepare_pmd_huge_pte(pgtable, mm); | 642 | pgtable_trans_huge_deposit(mm, pgtable); |
669 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 643 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
670 | mm->nr_ptes++; | 644 | mm->nr_ptes++; |
671 | spin_unlock(&mm->page_table_lock); | 645 | spin_unlock(&mm->page_table_lock); |
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
791 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 765 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
792 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 766 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
793 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 767 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
794 | prepare_pmd_huge_pte(pgtable, dst_mm); | 768 | pgtable_trans_huge_deposit(dst_mm, pgtable); |
795 | dst_mm->nr_ptes++; | 769 | dst_mm->nr_ptes++; |
796 | 770 | ||
797 | ret = 0; | 771 | ret = 0; |
@@ -802,25 +776,6 @@ out: | |||
802 | return ret; | 776 | return ret; |
803 | } | 777 | } |
804 | 778 | ||
805 | /* no "address" argument so destroys page coloring of some arch */ | ||
806 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
807 | { | ||
808 | pgtable_t pgtable; | ||
809 | |||
810 | assert_spin_locked(&mm->page_table_lock); | ||
811 | |||
812 | /* FIFO */ | ||
813 | pgtable = mm->pmd_huge_pte; | ||
814 | if (list_empty(&pgtable->lru)) | ||
815 | mm->pmd_huge_pte = NULL; | ||
816 | else { | ||
817 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
818 | struct page, lru); | ||
819 | list_del(&pgtable->lru); | ||
820 | } | ||
821 | return pgtable; | ||
822 | } | ||
823 | |||
824 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 779 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
825 | struct vm_area_struct *vma, | 780 | struct vm_area_struct *vma, |
826 | unsigned long address, | 781 | unsigned long address, |
@@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
832 | pmd_t _pmd; | 787 | pmd_t _pmd; |
833 | int ret = 0, i; | 788 | int ret = 0, i; |
834 | struct page **pages; | 789 | struct page **pages; |
790 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
791 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
835 | 792 | ||
836 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | 793 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, |
837 | GFP_KERNEL); | 794 | GFP_KERNEL); |
@@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
868 | cond_resched(); | 825 | cond_resched(); |
869 | } | 826 | } |
870 | 827 | ||
828 | mmun_start = haddr; | ||
829 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
830 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
831 | |||
871 | spin_lock(&mm->page_table_lock); | 832 | spin_lock(&mm->page_table_lock); |
872 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 833 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
873 | goto out_free_pages; | 834 | goto out_free_pages; |
874 | VM_BUG_ON(!PageHead(page)); | 835 | VM_BUG_ON(!PageHead(page)); |
875 | 836 | ||
876 | pmdp_clear_flush_notify(vma, haddr, pmd); | 837 | pmdp_clear_flush(vma, haddr, pmd); |
877 | /* leave pmd empty until pte is filled */ | 838 | /* leave pmd empty until pte is filled */ |
878 | 839 | ||
879 | pgtable = get_pmd_huge_pte(mm); | 840 | pgtable = pgtable_trans_huge_withdraw(mm); |
880 | pmd_populate(mm, &_pmd, pgtable); | 841 | pmd_populate(mm, &_pmd, pgtable); |
881 | 842 | ||
882 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 843 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
896 | page_remove_rmap(page); | 857 | page_remove_rmap(page); |
897 | spin_unlock(&mm->page_table_lock); | 858 | spin_unlock(&mm->page_table_lock); |
898 | 859 | ||
860 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
861 | |||
899 | ret |= VM_FAULT_WRITE; | 862 | ret |= VM_FAULT_WRITE; |
900 | put_page(page); | 863 | put_page(page); |
901 | 864 | ||
@@ -904,6 +867,7 @@ out: | |||
904 | 867 | ||
905 | out_free_pages: | 868 | out_free_pages: |
906 | spin_unlock(&mm->page_table_lock); | 869 | spin_unlock(&mm->page_table_lock); |
870 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
907 | mem_cgroup_uncharge_start(); | 871 | mem_cgroup_uncharge_start(); |
908 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 872 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
909 | mem_cgroup_uncharge_page(pages[i]); | 873 | mem_cgroup_uncharge_page(pages[i]); |
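do_huge_pmd_wp_page_fallback() above now brackets the PMD teardown with an explicit mmu_notifier invalidate-range pair instead of relying on the _notify flush variant, and the error path has to issue the matching range_end as well. The bracketing pattern in isolation (thp_invalidate_bracket() is an illustrative helper; mm, vma, haddr and pmd are assumed to be set up by the caller):

	static void thp_invalidate_bracket(struct mm_struct *mm,
					   struct vm_area_struct *vma,
					   unsigned long haddr, pmd_t *pmd)
	{
		const unsigned long mmun_start = haddr;
		const unsigned long mmun_end = haddr + HPAGE_PMD_SIZE;

		/* range_start is issued before taking the page table lock,
		 * since notifier callbacks may sleep */
		mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
		spin_lock(&mm->page_table_lock);
		pmdp_clear_flush(vma, haddr, pmd);	/* plain flush, no _notify */
		/* ... repopulate [haddr, haddr + HPAGE_PMD_SIZE) ... */
		spin_unlock(&mm->page_table_lock);
		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	}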
@@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
920 | int ret = 0; | 884 | int ret = 0; |
921 | struct page *page, *new_page; | 885 | struct page *page, *new_page; |
922 | unsigned long haddr; | 886 | unsigned long haddr; |
887 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
888 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
923 | 889 | ||
924 | VM_BUG_ON(!vma->anon_vma); | 890 | VM_BUG_ON(!vma->anon_vma); |
925 | spin_lock(&mm->page_table_lock); | 891 | spin_lock(&mm->page_table_lock); |
@@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
934 | entry = pmd_mkyoung(orig_pmd); | 900 | entry = pmd_mkyoung(orig_pmd); |
935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 901 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
936 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 902 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) |
937 | update_mmu_cache(vma, address, entry); | 903 | update_mmu_cache_pmd(vma, address, pmd); |
938 | ret |= VM_FAULT_WRITE; | 904 | ret |= VM_FAULT_WRITE; |
939 | goto out_unlock; | 905 | goto out_unlock; |
940 | } | 906 | } |
@@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
970 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 936 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
971 | __SetPageUptodate(new_page); | 937 | __SetPageUptodate(new_page); |
972 | 938 | ||
939 | mmun_start = haddr; | ||
940 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
941 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
942 | |||
973 | spin_lock(&mm->page_table_lock); | 943 | spin_lock(&mm->page_table_lock); |
974 | put_page(page); | 944 | put_page(page); |
975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 945 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | 946 | spin_unlock(&mm->page_table_lock); |
977 | mem_cgroup_uncharge_page(new_page); | 947 | mem_cgroup_uncharge_page(new_page); |
978 | put_page(new_page); | 948 | put_page(new_page); |
979 | goto out; | 949 | goto out_mn; |
980 | } else { | 950 | } else { |
981 | pmd_t entry; | 951 | pmd_t entry; |
982 | VM_BUG_ON(!PageHead(page)); | 952 | VM_BUG_ON(!PageHead(page)); |
983 | entry = mk_pmd(new_page, vma->vm_page_prot); | 953 | entry = mk_pmd(new_page, vma->vm_page_prot); |
984 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 954 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
985 | entry = pmd_mkhuge(entry); | 955 | entry = pmd_mkhuge(entry); |
986 | pmdp_clear_flush_notify(vma, haddr, pmd); | 956 | pmdp_clear_flush(vma, haddr, pmd); |
987 | page_add_new_anon_rmap(new_page, vma, haddr); | 957 | page_add_new_anon_rmap(new_page, vma, haddr); |
988 | set_pmd_at(mm, haddr, pmd, entry); | 958 | set_pmd_at(mm, haddr, pmd, entry); |
989 | update_mmu_cache(vma, address, entry); | 959 | update_mmu_cache_pmd(vma, address, pmd); |
990 | page_remove_rmap(page); | 960 | page_remove_rmap(page); |
991 | put_page(page); | 961 | put_page(page); |
992 | ret |= VM_FAULT_WRITE; | 962 | ret |= VM_FAULT_WRITE; |
993 | } | 963 | } |
994 | out_unlock: | ||
995 | spin_unlock(&mm->page_table_lock); | 964 | spin_unlock(&mm->page_table_lock); |
965 | out_mn: | ||
966 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
996 | out: | 967 | out: |
997 | return ret; | 968 | return ret; |
969 | out_unlock: | ||
970 | spin_unlock(&mm->page_table_lock); | ||
971 | return ret; | ||
998 | } | 972 | } |
999 | 973 | ||
1000 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 974 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
1001 | unsigned long addr, | 975 | unsigned long addr, |
1002 | pmd_t *pmd, | 976 | pmd_t *pmd, |
1003 | unsigned int flags) | 977 | unsigned int flags) |
1004 | { | 978 | { |
979 | struct mm_struct *mm = vma->vm_mm; | ||
1005 | struct page *page = NULL; | 980 | struct page *page = NULL; |
1006 | 981 | ||
1007 | assert_spin_locked(&mm->page_table_lock); | 982 | assert_spin_locked(&mm->page_table_lock); |
@@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
1024 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 999 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1025 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1000 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); |
1026 | } | 1001 | } |
1002 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1003 | if (page->mapping && trylock_page(page)) { | ||
1004 | lru_add_drain(); | ||
1005 | if (page->mapping) | ||
1006 | mlock_vma_page(page); | ||
1007 | unlock_page(page); | ||
1008 | } | ||
1009 | } | ||
1027 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1010 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1028 | VM_BUG_ON(!PageCompound(page)); | 1011 | VM_BUG_ON(!PageCompound(page)); |
1029 | if (flags & FOLL_GET) | 1012 | if (flags & FOLL_GET) |
@@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1041 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1024 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1042 | struct page *page; | 1025 | struct page *page; |
1043 | pgtable_t pgtable; | 1026 | pgtable_t pgtable; |
1044 | pgtable = get_pmd_huge_pte(tlb->mm); | 1027 | pmd_t orig_pmd; |
1045 | page = pmd_page(*pmd); | 1028 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1046 | pmd_clear(pmd); | 1029 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1030 | page = pmd_page(orig_pmd); | ||
1047 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1031 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1048 | page_remove_rmap(page); | 1032 | page_remove_rmap(page); |
1049 | VM_BUG_ON(page_mapcount(page) < 0); | 1033 | VM_BUG_ON(page_mapcount(page) < 0); |
@@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1207 | struct mm_struct *mm = vma->vm_mm; | 1191 | struct mm_struct *mm = vma->vm_mm; |
1208 | pmd_t *pmd; | 1192 | pmd_t *pmd; |
1209 | int ret = 0; | 1193 | int ret = 0; |
1194 | /* For mmu_notifiers */ | ||
1195 | const unsigned long mmun_start = address; | ||
1196 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | ||
1210 | 1197 | ||
1198 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1211 | spin_lock(&mm->page_table_lock); | 1199 | spin_lock(&mm->page_table_lock); |
1212 | pmd = page_check_address_pmd(page, mm, address, | 1200 | pmd = page_check_address_pmd(page, mm, address, |
1213 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1201 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
@@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1219 | * and it won't wait on the anon_vma->root->mutex to | 1207 | * and it won't wait on the anon_vma->root->mutex to |
1220 | * serialize against split_huge_page*. | 1208 | * serialize against split_huge_page*. |
1221 | */ | 1209 | */ |
1222 | pmdp_splitting_flush_notify(vma, address, pmd); | 1210 | pmdp_splitting_flush(vma, address, pmd); |
1223 | ret = 1; | 1211 | ret = 1; |
1224 | } | 1212 | } |
1225 | spin_unlock(&mm->page_table_lock); | 1213 | spin_unlock(&mm->page_table_lock); |
1214 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1226 | 1215 | ||
1227 | return ret; | 1216 | return ret; |
1228 | } | 1217 | } |
@@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page, | |||
1358 | pmd = page_check_address_pmd(page, mm, address, | 1347 | pmd = page_check_address_pmd(page, mm, address, |
1359 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1348 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); |
1360 | if (pmd) { | 1349 | if (pmd) { |
1361 | pgtable = get_pmd_huge_pte(mm); | 1350 | pgtable = pgtable_trans_huge_withdraw(mm); |
1362 | pmd_populate(mm, &_pmd, pgtable); | 1351 | pmd_populate(mm, &_pmd, pgtable); |
1363 | 1352 | ||
1364 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | 1353 | haddr = address; |
1365 | i++, haddr += PAGE_SIZE) { | 1354 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1366 | pte_t *pte, entry; | 1355 | pte_t *pte, entry; |
1367 | BUG_ON(PageCompound(page+i)); | 1356 | BUG_ON(PageCompound(page+i)); |
1368 | entry = mk_pte(page + i, vma->vm_page_prot); | 1357 | entry = mk_pte(page + i, vma->vm_page_prot); |
@@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page, | |||
1406 | * SMP TLB and finally we write the non-huge version | 1395 | * SMP TLB and finally we write the non-huge version |
1407 | * of the pmd entry with pmd_populate. | 1396 | * of the pmd entry with pmd_populate. |
1408 | */ | 1397 | */ |
1409 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | 1398 | pmdp_invalidate(vma, address, pmd); |
1410 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
1411 | pmd_populate(mm, pmd, pgtable); | 1399 | pmd_populate(mm, pmd, pgtable); |
1412 | ret = 1; | 1400 | ret = 1; |
1413 | } | 1401 | } |
@@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page, | |||
1421 | struct anon_vma *anon_vma) | 1409 | struct anon_vma *anon_vma) |
1422 | { | 1410 | { |
1423 | int mapcount, mapcount2; | 1411 | int mapcount, mapcount2; |
1412 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1424 | struct anon_vma_chain *avc; | 1413 | struct anon_vma_chain *avc; |
1425 | 1414 | ||
1426 | BUG_ON(!PageHead(page)); | 1415 | BUG_ON(!PageHead(page)); |
1427 | BUG_ON(PageTail(page)); | 1416 | BUG_ON(PageTail(page)); |
1428 | 1417 | ||
1429 | mapcount = 0; | 1418 | mapcount = 0; |
1430 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1419 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1431 | struct vm_area_struct *vma = avc->vma; | 1420 | struct vm_area_struct *vma = avc->vma; |
1432 | unsigned long addr = vma_address(page, vma); | 1421 | unsigned long addr = vma_address(page, vma); |
1433 | BUG_ON(is_vma_temporary_stack(vma)); | 1422 | BUG_ON(is_vma_temporary_stack(vma)); |
1434 | if (addr == -EFAULT) | ||
1435 | continue; | ||
1436 | mapcount += __split_huge_page_splitting(page, vma, addr); | 1423 | mapcount += __split_huge_page_splitting(page, vma, addr); |
1437 | } | 1424 | } |
1438 | /* | 1425 | /* |
@@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page, | |||
1453 | __split_huge_page_refcount(page); | 1440 | __split_huge_page_refcount(page); |
1454 | 1441 | ||
1455 | mapcount2 = 0; | 1442 | mapcount2 = 0; |
1456 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1443 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1457 | struct vm_area_struct *vma = avc->vma; | 1444 | struct vm_area_struct *vma = avc->vma; |
1458 | unsigned long addr = vma_address(page, vma); | 1445 | unsigned long addr = vma_address(page, vma); |
1459 | BUG_ON(is_vma_temporary_stack(vma)); | 1446 | BUG_ON(is_vma_temporary_stack(vma)); |
1460 | if (addr == -EFAULT) | ||
1461 | continue; | ||
1462 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1447 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1463 | } | 1448 | } |
1464 | if (mapcount != mapcount2) | 1449 | if (mapcount != mapcount2) |
@@ -1491,12 +1476,13 @@ out: | |||
1491 | return ret; | 1476 | return ret; |
1492 | } | 1477 | } |
1493 | 1478 | ||
1494 | #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ | 1479 | #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) |
1495 | VM_HUGETLB|VM_SHARED|VM_MAYSHARE) | ||
1496 | 1480 | ||
1497 | int hugepage_madvise(struct vm_area_struct *vma, | 1481 | int hugepage_madvise(struct vm_area_struct *vma, |
1498 | unsigned long *vm_flags, int advice) | 1482 | unsigned long *vm_flags, int advice) |
1499 | { | 1483 | { |
1484 | struct mm_struct *mm = vma->vm_mm; | ||
1485 | |||
1500 | switch (advice) { | 1486 | switch (advice) { |
1501 | case MADV_HUGEPAGE: | 1487 | case MADV_HUGEPAGE: |
1502 | /* | 1488 | /* |
@@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma, | |||
1504 | */ | 1490 | */ |
1505 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1491 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1506 | return -EINVAL; | 1492 | return -EINVAL; |
1493 | if (mm->def_flags & VM_NOHUGEPAGE) | ||
1494 | return -EINVAL; | ||
1507 | *vm_flags &= ~VM_NOHUGEPAGE; | 1495 | *vm_flags &= ~VM_NOHUGEPAGE; |
1508 | *vm_flags |= VM_HUGEPAGE; | 1496 | *vm_flags |= VM_HUGEPAGE; |
1509 | /* | 1497 | /* |
@@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
1655 | if (vma->vm_ops) | 1643 | if (vma->vm_ops) |
1656 | /* khugepaged not yet working on file or special mappings */ | 1644 | /* khugepaged not yet working on file or special mappings */ |
1657 | return 0; | 1645 | return 0; |
1658 | /* | 1646 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1659 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1660 | * true too, verify it here. | ||
1661 | */ | ||
1662 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1663 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 1647 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
1664 | hend = vma->vm_end & HPAGE_PMD_MASK; | 1648 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1665 | if (hstart < hend) | 1649 | if (hstart < hend) |
@@ -1811,7 +1795,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
1811 | src_page = pte_page(pteval); | 1795 | src_page = pte_page(pteval); |
1812 | copy_user_highpage(page, src_page, address, vma); | 1796 | copy_user_highpage(page, src_page, address, vma); |
1813 | VM_BUG_ON(page_mapcount(src_page) != 1); | 1797 | VM_BUG_ON(page_mapcount(src_page) != 1); |
1814 | VM_BUG_ON(page_count(src_page) != 2); | ||
1815 | release_pte_page(src_page); | 1798 | release_pte_page(src_page); |
1816 | /* | 1799 | /* |
1817 | * ptl mostly unnecessary, but preempt has to | 1800 | * ptl mostly unnecessary, but preempt has to |
@@ -1834,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
1834 | } | 1817 | } |
1835 | } | 1818 | } |
1836 | 1819 | ||
1837 | static void collapse_huge_page(struct mm_struct *mm, | 1820 | static void khugepaged_alloc_sleep(void) |
1838 | unsigned long address, | ||
1839 | struct page **hpage, | ||
1840 | struct vm_area_struct *vma, | ||
1841 | int node) | ||
1842 | { | 1821 | { |
1843 | pgd_t *pgd; | 1822 | wait_event_freezable_timeout(khugepaged_wait, false, |
1844 | pud_t *pud; | 1823 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
1845 | pmd_t *pmd, _pmd; | 1824 | } |
1846 | pte_t *pte; | ||
1847 | pgtable_t pgtable; | ||
1848 | struct page *new_page; | ||
1849 | spinlock_t *ptl; | ||
1850 | int isolated; | ||
1851 | unsigned long hstart, hend; | ||
1852 | 1825 | ||
1853 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1826 | #ifdef CONFIG_NUMA |
1854 | #ifndef CONFIG_NUMA | 1827 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
1855 | up_read(&mm->mmap_sem); | 1828 | { |
1856 | VM_BUG_ON(!*hpage); | 1829 | if (IS_ERR(*hpage)) { |
1857 | new_page = *hpage; | 1830 | if (!*wait) |
1858 | #else | 1831 | return false; |
1832 | |||
1833 | *wait = false; | ||
1834 | *hpage = NULL; | ||
1835 | khugepaged_alloc_sleep(); | ||
1836 | } else if (*hpage) { | ||
1837 | put_page(*hpage); | ||
1838 | *hpage = NULL; | ||
1839 | } | ||
1840 | |||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static struct page | ||
1845 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1846 | struct vm_area_struct *vma, unsigned long address, | ||
1847 | int node) | ||
1848 | { | ||
1859 | VM_BUG_ON(*hpage); | 1849 | VM_BUG_ON(*hpage); |
1860 | /* | 1850 | /* |
1861 | * Allocate the page while the vma is still valid and under | 1851 | * Allocate the page while the vma is still valid and under |
@@ -1867,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1867 | * mmap_sem in read mode is good idea also to allow greater | 1857 | * mmap_sem in read mode is good idea also to allow greater |
1868 | * scalability. | 1858 | * scalability. |
1869 | */ | 1859 | */ |
1870 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 1860 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
1871 | node, __GFP_OTHER_NODE); | 1861 | node, __GFP_OTHER_NODE); |
1872 | 1862 | ||
1873 | /* | 1863 | /* |
@@ -1875,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1875 | * preparation for taking it in write mode. | 1865 | * preparation for taking it in write mode. |
1876 | */ | 1866 | */ |
1877 | up_read(&mm->mmap_sem); | 1867 | up_read(&mm->mmap_sem); |
1878 | if (unlikely(!new_page)) { | 1868 | if (unlikely(!*hpage)) { |
1879 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 1869 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
1880 | *hpage = ERR_PTR(-ENOMEM); | 1870 | *hpage = ERR_PTR(-ENOMEM); |
1881 | return; | 1871 | return NULL; |
1882 | } | 1872 | } |
1883 | #endif | ||
1884 | 1873 | ||
1885 | count_vm_event(THP_COLLAPSE_ALLOC); | 1874 | count_vm_event(THP_COLLAPSE_ALLOC); |
1886 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1875 | return *hpage; |
1887 | #ifdef CONFIG_NUMA | 1876 | } |
1888 | put_page(new_page); | 1877 | #else |
1878 | static struct page *khugepaged_alloc_hugepage(bool *wait) | ||
1879 | { | ||
1880 | struct page *hpage; | ||
1881 | |||
1882 | do { | ||
1883 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
1884 | if (!hpage) { | ||
1885 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
1886 | if (!*wait) | ||
1887 | return NULL; | ||
1888 | |||
1889 | *wait = false; | ||
1890 | khugepaged_alloc_sleep(); | ||
1891 | } else | ||
1892 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
1893 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | ||
1894 | |||
1895 | return hpage; | ||
1896 | } | ||
1897 | |||
1898 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
1899 | { | ||
1900 | if (!*hpage) | ||
1901 | *hpage = khugepaged_alloc_hugepage(wait); | ||
1902 | |||
1903 | if (unlikely(!*hpage)) | ||
1904 | return false; | ||
1905 | |||
1906 | return true; | ||
1907 | } | ||
1908 | |||
1909 | static struct page | ||
1910 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1911 | struct vm_area_struct *vma, unsigned long address, | ||
1912 | int node) | ||
1913 | { | ||
1914 | up_read(&mm->mmap_sem); | ||
1915 | VM_BUG_ON(!*hpage); | ||
1916 | return *hpage; | ||
1917 | } | ||
1889 | #endif | 1918 | #endif |
1919 | |||
1920 | static void collapse_huge_page(struct mm_struct *mm, | ||
1921 | unsigned long address, | ||
1922 | struct page **hpage, | ||
1923 | struct vm_area_struct *vma, | ||
1924 | int node) | ||
1925 | { | ||
1926 | pgd_t *pgd; | ||
1927 | pud_t *pud; | ||
1928 | pmd_t *pmd, _pmd; | ||
1929 | pte_t *pte; | ||
1930 | pgtable_t pgtable; | ||
1931 | struct page *new_page; | ||
1932 | spinlock_t *ptl; | ||
1933 | int isolated; | ||
1934 | unsigned long hstart, hend; | ||
1935 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1936 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1937 | |||
1938 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1939 | |||
1940 | /* release the mmap_sem read lock. */ | ||
1941 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | ||
1942 | if (!new_page) | ||
1943 | return; | ||
1944 | |||
1945 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | ||
1890 | return; | 1946 | return; |
1891 | } | ||
1892 | 1947 | ||
1893 | /* | 1948 | /* |
1894 | * Prevent all access to pagetables with the exception of | 1949 | * Prevent all access to pagetables with the exception of |
@@ -1913,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1913 | goto out; | 1968 | goto out; |
1914 | if (is_vma_temporary_stack(vma)) | 1969 | if (is_vma_temporary_stack(vma)) |
1915 | goto out; | 1970 | goto out; |
1916 | /* | 1971 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1917 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1918 | * true too, verify it here. | ||
1919 | */ | ||
1920 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1921 | 1972 | ||
1922 | pgd = pgd_offset(mm, address); | 1973 | pgd = pgd_offset(mm, address); |
1923 | if (!pgd_present(*pgd)) | 1974 | if (!pgd_present(*pgd)) |
@@ -1937,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1937 | pte = pte_offset_map(pmd, address); | 1988 | pte = pte_offset_map(pmd, address); |
1938 | ptl = pte_lockptr(mm, pmd); | 1989 | ptl = pte_lockptr(mm, pmd); |
1939 | 1990 | ||
1991 | mmun_start = address; | ||
1992 | mmun_end = address + HPAGE_PMD_SIZE; | ||
1993 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1940 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 1994 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
1941 | /* | 1995 | /* |
1942 | * After this gup_fast can't run anymore. This also removes | 1996 | * After this gup_fast can't run anymore. This also removes |
@@ -1944,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1944 | * huge and small TLB entries for the same virtual address | 1998 | * huge and small TLB entries for the same virtual address |
1945 | * to avoid the risk of CPU bugs in that area. | 1999 | * to avoid the risk of CPU bugs in that area. |
1946 | */ | 2000 | */ |
1947 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | 2001 | _pmd = pmdp_clear_flush(vma, address, pmd); |
1948 | spin_unlock(&mm->page_table_lock); | 2002 | spin_unlock(&mm->page_table_lock); |
2003 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1949 | 2004 | ||
1950 | spin_lock(ptl); | 2005 | spin_lock(ptl); |
1951 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2006 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
@@ -1971,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1971 | pte_unmap(pte); | 2026 | pte_unmap(pte); |
1972 | __SetPageUptodate(new_page); | 2027 | __SetPageUptodate(new_page); |
1973 | pgtable = pmd_pgtable(_pmd); | 2028 | pgtable = pmd_pgtable(_pmd); |
1974 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1975 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1976 | 2029 | ||
1977 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2030 | _pmd = mk_pmd(new_page, vma->vm_page_prot); |
1978 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | 2031 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); |
@@ -1989,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1989 | BUG_ON(!pmd_none(*pmd)); | 2042 | BUG_ON(!pmd_none(*pmd)); |
1990 | page_add_new_anon_rmap(new_page, vma, address); | 2043 | page_add_new_anon_rmap(new_page, vma, address); |
1991 | set_pmd_at(mm, address, pmd, _pmd); | 2044 | set_pmd_at(mm, address, pmd, _pmd); |
1992 | update_mmu_cache(vma, address, _pmd); | 2045 | update_mmu_cache_pmd(vma, address, pmd); |
1993 | prepare_pmd_huge_pte(pgtable, mm); | 2046 | pgtable_trans_huge_deposit(mm, pgtable); |
1994 | spin_unlock(&mm->page_table_lock); | 2047 | spin_unlock(&mm->page_table_lock); |
1995 | 2048 | ||
1996 | #ifndef CONFIG_NUMA | ||
1997 | *hpage = NULL; | 2049 | *hpage = NULL; |
1998 | #endif | 2050 | |
1999 | khugepaged_pages_collapsed++; | 2051 | khugepaged_pages_collapsed++; |
2000 | out_up_write: | 2052 | out_up_write: |
2001 | up_write(&mm->mmap_sem); | 2053 | up_write(&mm->mmap_sem); |
@@ -2003,9 +2055,6 @@ out_up_write: | |||
2003 | 2055 | ||
2004 | out: | 2056 | out: |
2005 | mem_cgroup_uncharge_page(new_page); | 2057 | mem_cgroup_uncharge_page(new_page); |
2006 | #ifdef CONFIG_NUMA | ||
2007 | put_page(new_page); | ||
2008 | #endif | ||
2009 | goto out_up_write; | 2058 | goto out_up_write; |
2010 | } | 2059 | } |
2011 | 2060 | ||
@@ -2155,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2155 | goto skip; | 2204 | goto skip; |
2156 | if (is_vma_temporary_stack(vma)) | 2205 | if (is_vma_temporary_stack(vma)) |
2157 | goto skip; | 2206 | goto skip; |
2158 | /* | 2207 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2159 | * If is_pfn_mapping() is true is_learn_pfn_mapping() | ||
2160 | * must be true too, verify it here. | ||
2161 | */ | ||
2162 | VM_BUG_ON(is_linear_pfn_mapping(vma) || | ||
2163 | vma->vm_flags & VM_NO_THP); | ||
2164 | 2208 | ||
2165 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2209 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2166 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2210 | hend = vma->vm_end & HPAGE_PMD_MASK; |
@@ -2235,32 +2279,23 @@ static int khugepaged_has_work(void) | |||
2235 | static int khugepaged_wait_event(void) | 2279 | static int khugepaged_wait_event(void) |
2236 | { | 2280 | { |
2237 | return !list_empty(&khugepaged_scan.mm_head) || | 2281 | return !list_empty(&khugepaged_scan.mm_head) || |
2238 | !khugepaged_enabled(); | 2282 | kthread_should_stop(); |
2239 | } | 2283 | } |
2240 | 2284 | ||
2241 | static void khugepaged_do_scan(struct page **hpage) | 2285 | static void khugepaged_do_scan(void) |
2242 | { | 2286 | { |
2287 | struct page *hpage = NULL; | ||
2243 | unsigned int progress = 0, pass_through_head = 0; | 2288 | unsigned int progress = 0, pass_through_head = 0; |
2244 | unsigned int pages = khugepaged_pages_to_scan; | 2289 | unsigned int pages = khugepaged_pages_to_scan; |
2290 | bool wait = true; | ||
2245 | 2291 | ||
2246 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | 2292 | barrier(); /* write khugepaged_pages_to_scan to local stack */ |
2247 | 2293 | ||
2248 | while (progress < pages) { | 2294 | while (progress < pages) { |
2249 | cond_resched(); | 2295 | if (!khugepaged_prealloc_page(&hpage, &wait)) |
2250 | |||
2251 | #ifndef CONFIG_NUMA | ||
2252 | if (!*hpage) { | ||
2253 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
2254 | if (unlikely(!*hpage)) { | ||
2255 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2256 | break; | ||
2257 | } | ||
2258 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2259 | } | ||
2260 | #else | ||
2261 | if (IS_ERR(*hpage)) | ||
2262 | break; | 2296 | break; |
2263 | #endif | 2297 | |
2298 | cond_resched(); | ||
2264 | 2299 | ||
2265 | if (unlikely(kthread_should_stop() || freezing(current))) | 2300 | if (unlikely(kthread_should_stop() || freezing(current))) |
2266 | break; | 2301 | break; |
@@ -2271,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage) | |||
2271 | if (khugepaged_has_work() && | 2306 | if (khugepaged_has_work() && |
2272 | pass_through_head < 2) | 2307 | pass_through_head < 2) |
2273 | progress += khugepaged_scan_mm_slot(pages - progress, | 2308 | progress += khugepaged_scan_mm_slot(pages - progress, |
2274 | hpage); | 2309 | &hpage); |
2275 | else | 2310 | else |
2276 | progress = pages; | 2311 | progress = pages; |
2277 | spin_unlock(&khugepaged_mm_lock); | 2312 | spin_unlock(&khugepaged_mm_lock); |
2278 | } | 2313 | } |
2279 | } | ||
2280 | 2314 | ||
2281 | static void khugepaged_alloc_sleep(void) | 2315 | if (!IS_ERR_OR_NULL(hpage)) |
2282 | { | 2316 | put_page(hpage); |
2283 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2284 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | ||
2285 | } | 2317 | } |
2286 | 2318 | ||
2287 | #ifndef CONFIG_NUMA | 2319 | static void khugepaged_wait_work(void) |
2288 | static struct page *khugepaged_alloc_hugepage(void) | ||
2289 | { | 2320 | { |
2290 | struct page *hpage; | 2321 | try_to_freeze(); |
2291 | |||
2292 | do { | ||
2293 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
2294 | if (!hpage) { | ||
2295 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2296 | khugepaged_alloc_sleep(); | ||
2297 | } else | ||
2298 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2299 | } while (unlikely(!hpage) && | ||
2300 | likely(khugepaged_enabled())); | ||
2301 | return hpage; | ||
2302 | } | ||
2303 | #endif | ||
2304 | 2322 | ||
2305 | static void khugepaged_loop(void) | 2323 | if (khugepaged_has_work()) { |
2306 | { | 2324 | if (!khugepaged_scan_sleep_millisecs) |
2307 | struct page *hpage; | 2325 | return; |
2308 | 2326 | ||
2309 | #ifdef CONFIG_NUMA | 2327 | wait_event_freezable_timeout(khugepaged_wait, |
2310 | hpage = NULL; | 2328 | kthread_should_stop(), |
2311 | #endif | 2329 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); |
2312 | while (likely(khugepaged_enabled())) { | 2330 | return; |
2313 | #ifndef CONFIG_NUMA | ||
2314 | hpage = khugepaged_alloc_hugepage(); | ||
2315 | if (unlikely(!hpage)) | ||
2316 | break; | ||
2317 | #else | ||
2318 | if (IS_ERR(hpage)) { | ||
2319 | khugepaged_alloc_sleep(); | ||
2320 | hpage = NULL; | ||
2321 | } | ||
2322 | #endif | ||
2323 | |||
2324 | khugepaged_do_scan(&hpage); | ||
2325 | #ifndef CONFIG_NUMA | ||
2326 | if (hpage) | ||
2327 | put_page(hpage); | ||
2328 | #endif | ||
2329 | try_to_freeze(); | ||
2330 | if (unlikely(kthread_should_stop())) | ||
2331 | break; | ||
2332 | if (khugepaged_has_work()) { | ||
2333 | if (!khugepaged_scan_sleep_millisecs) | ||
2334 | continue; | ||
2335 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2336 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); | ||
2337 | } else if (khugepaged_enabled()) | ||
2338 | wait_event_freezable(khugepaged_wait, | ||
2339 | khugepaged_wait_event()); | ||
2340 | } | 2331 | } |
2332 | |||
2333 | if (khugepaged_enabled()) | ||
2334 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | ||
2341 | } | 2335 | } |
2342 | 2336 | ||
2343 | static int khugepaged(void *none) | 2337 | static int khugepaged(void *none) |
@@ -2347,20 +2341,9 @@ static int khugepaged(void *none) | |||
2347 | set_freezable(); | 2341 | set_freezable(); |
2348 | set_user_nice(current, 19); | 2342 | set_user_nice(current, 19); |
2349 | 2343 | ||
2350 | /* serialize with start_khugepaged() */ | 2344 | while (!kthread_should_stop()) { |
2351 | mutex_lock(&khugepaged_mutex); | 2345 | khugepaged_do_scan(); |
2352 | 2346 | khugepaged_wait_work(); | |
2353 | for (;;) { | ||
2354 | mutex_unlock(&khugepaged_mutex); | ||
2355 | VM_BUG_ON(khugepaged_thread != current); | ||
2356 | khugepaged_loop(); | ||
2357 | VM_BUG_ON(khugepaged_thread != current); | ||
2358 | |||
2359 | mutex_lock(&khugepaged_mutex); | ||
2360 | if (!khugepaged_enabled()) | ||
2361 | break; | ||
2362 | if (unlikely(kthread_should_stop())) | ||
2363 | break; | ||
2364 | } | 2347 | } |
2365 | 2348 | ||
2366 | spin_lock(&khugepaged_mm_lock); | 2349 | spin_lock(&khugepaged_mm_lock); |
@@ -2369,10 +2352,6 @@ static int khugepaged(void *none) | |||
2369 | if (mm_slot) | 2352 | if (mm_slot) |
2370 | collect_mm_slot(mm_slot); | 2353 | collect_mm_slot(mm_slot); |
2371 | spin_unlock(&khugepaged_mm_lock); | 2354 | spin_unlock(&khugepaged_mm_lock); |
2372 | |||
2373 | khugepaged_thread = NULL; | ||
2374 | mutex_unlock(&khugepaged_mutex); | ||
2375 | |||
2376 | return 0; | 2355 | return 0; |
2377 | } | 2356 | } |
2378 | 2357 | ||
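Throughout huge_memory.c the per-mm prepare_pmd_huge_pte()/get_pmd_huge_pte() pair is replaced by the pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers; the generic versions keep the preallocated pte page on the same FIFO hung off mm->pmd_huge_pte that the removed functions used. A simplified sketch under that assumption (the _sketch suffixes are illustrative, not the exported symbol names):

	void pgtable_trans_huge_deposit_sketch(struct mm_struct *mm, pgtable_t pgtable)
	{
		assert_spin_locked(&mm->page_table_lock);

		/* FIFO: the newest preallocated pte page becomes the list head */
		if (!mm->pmd_huge_pte)
			INIT_LIST_HEAD(&pgtable->lru);
		else
			list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
		mm->pmd_huge_pte = pgtable;
	}

	pgtable_t pgtable_trans_huge_withdraw_sketch(struct mm_struct *mm)
	{
		pgtable_t pgtable = mm->pmd_huge_pte;

		assert_spin_locked(&mm->page_table_lock);

		if (list_empty(&pgtable->lru))
			mm->pmd_huge_pte = NULL;
		else {
			mm->pmd_huge_pte = list_entry(pgtable->lru.next,
						      struct page, lru);
			list_del(&pgtable->lru);
		}
		return pgtable;
	}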
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd4..59a0059b39e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | 31 | #include <linux/hugetlb_cgroup.h> |
32 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
34 | #include "internal.h" | 33 | #include "internal.h" |
35 | 34 | ||
36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 35 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page) | |||
637 | h->surplus_huge_pages--; | 636 | h->surplus_huge_pages--; |
638 | h->surplus_huge_pages_node[nid]--; | 637 | h->surplus_huge_pages_node[nid]--; |
639 | } else { | 638 | } else { |
639 | arch_clear_hugepage_flags(page); | ||
640 | enqueue_huge_page(h, page); | 640 | enqueue_huge_page(h, page); |
641 | } | 641 | } |
642 | spin_unlock(&hugetlb_lock); | 642 | spin_unlock(&hugetlb_lock); |
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
671 | } | 671 | } |
672 | } | 672 | } |
673 | 673 | ||
674 | /* | ||
675 | * PageHuge() only returns true for hugetlbfs pages, but not for normal or | ||
676 | * transparent huge pages. See the PageTransHuge() documentation for more | ||
677 | * details. | ||
678 | */ | ||
674 | int PageHuge(struct page *page) | 679 | int PageHuge(struct page *page) |
675 | { | 680 | { |
676 | compound_page_dtor *dtor; | 681 | compound_page_dtor *dtor; |
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2355 | struct page *page; | 2360 | struct page *page; |
2356 | struct hstate *h = hstate_vma(vma); | 2361 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2362 | unsigned long sz = huge_page_size(h); |
2363 | const unsigned long mmun_start = start; /* For mmu_notifiers */ | ||
2364 | const unsigned long mmun_end = end; /* For mmu_notifiers */ | ||
2358 | 2365 | ||
2359 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2366 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2360 | BUG_ON(start & ~huge_page_mask(h)); | 2367 | BUG_ON(start & ~huge_page_mask(h)); |
2361 | BUG_ON(end & ~huge_page_mask(h)); | 2368 | BUG_ON(end & ~huge_page_mask(h)); |
2362 | 2369 | ||
2363 | tlb_start_vma(tlb, vma); | 2370 | tlb_start_vma(tlb, vma); |
2364 | mmu_notifier_invalidate_range_start(mm, start, end); | 2371 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2365 | again: | 2372 | again: |
2366 | spin_lock(&mm->page_table_lock); | 2373 | spin_lock(&mm->page_table_lock); |
2367 | for (address = start; address < end; address += sz) { | 2374 | for (address = start; address < end; address += sz) { |
@@ -2425,7 +2432,7 @@ again: | |||
2425 | if (address < end && !ref_page) | 2432 | if (address < end && !ref_page) |
2426 | goto again; | 2433 | goto again; |
2427 | } | 2434 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | 2435 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2429 | tlb_end_vma(tlb, vma); | 2436 | tlb_end_vma(tlb, vma); |
2430 | } | 2437 | } |
2431 | 2438 | ||
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2473 | struct hstate *h = hstate_vma(vma); | 2480 | struct hstate *h = hstate_vma(vma); |
2474 | struct vm_area_struct *iter_vma; | 2481 | struct vm_area_struct *iter_vma; |
2475 | struct address_space *mapping; | 2482 | struct address_space *mapping; |
2476 | struct prio_tree_iter iter; | ||
2477 | pgoff_t pgoff; | 2483 | pgoff_t pgoff; |
2478 | 2484 | ||
2479 | /* | 2485 | /* |
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2481 | * from page cache lookup which is in HPAGE_SIZE units. | 2487 | * from page cache lookup which is in HPAGE_SIZE units. |
2482 | */ | 2488 | */ |
2483 | address = address & huge_page_mask(h); | 2489 | address = address & huge_page_mask(h); |
2484 | pgoff = vma_hugecache_offset(h, vma, address); | 2490 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + |
2491 | vma->vm_pgoff; | ||
2485 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; | 2492 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2486 | 2493 | ||
2487 | /* | 2494 | /* |
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2490 | * __unmap_hugepage_range() is called as the lock is already held | 2497 | * __unmap_hugepage_range() is called as the lock is already held |
2491 | */ | 2498 | */ |
2492 | mutex_lock(&mapping->i_mmap_mutex); | 2499 | mutex_lock(&mapping->i_mmap_mutex); |
2493 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2500 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { |
2494 | /* Do not unmap the current VMA */ | 2501 | /* Do not unmap the current VMA */ |
2495 | if (iter_vma == vma) | 2502 | if (iter_vma == vma) |
2496 | continue; | 2503 | continue; |
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2525 | struct page *old_page, *new_page; | 2532 | struct page *old_page, *new_page; |
2526 | int avoidcopy; | 2533 | int avoidcopy; |
2527 | int outside_reserve = 0; | 2534 | int outside_reserve = 0; |
2535 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2536 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2528 | 2537 | ||
2529 | old_page = pte_page(pte); | 2538 | old_page = pte_page(pte); |
2530 | 2539 | ||
@@ -2611,6 +2620,9 @@ retry_avoidcopy: | |||
2611 | pages_per_huge_page(h)); | 2620 | pages_per_huge_page(h)); |
2612 | __SetPageUptodate(new_page); | 2621 | __SetPageUptodate(new_page); |
2613 | 2622 | ||
2623 | mmun_start = address & huge_page_mask(h); | ||
2624 | mmun_end = mmun_start + huge_page_size(h); | ||
2625 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2614 | /* | 2626 | /* |
2615 | * Retake the page_table_lock to check for racing updates | 2627 | * Retake the page_table_lock to check for racing updates |
2616 | * before the page tables are altered | 2628 | * before the page tables are altered |
@@ -2619,9 +2631,6 @@ retry_avoidcopy: | |||
2619 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2631 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2620 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2632 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
2621 | /* Break COW */ | 2633 | /* Break COW */ |
2622 | mmu_notifier_invalidate_range_start(mm, | ||
2623 | address & huge_page_mask(h), | ||
2624 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2625 | huge_ptep_clear_flush(vma, address, ptep); | 2634 | huge_ptep_clear_flush(vma, address, ptep); |
2626 | set_huge_pte_at(mm, address, ptep, | 2635 | set_huge_pte_at(mm, address, ptep, |
2627 | make_huge_pte(vma, new_page, 1)); | 2636 | make_huge_pte(vma, new_page, 1)); |
@@ -2629,10 +2638,11 @@ retry_avoidcopy: | |||
2629 | hugepage_add_new_anon_rmap(new_page, vma, address); | 2638 | hugepage_add_new_anon_rmap(new_page, vma, address); |
2630 | /* Make the old page be freed below */ | 2639 | /* Make the old page be freed below */ |
2631 | new_page = old_page; | 2640 | new_page = old_page; |
2632 | mmu_notifier_invalidate_range_end(mm, | ||
2633 | address & huge_page_mask(h), | ||
2634 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2635 | } | 2641 | } |
2642 | spin_unlock(&mm->page_table_lock); | ||
2643 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2644 | /* Caller expects lock to be held */ | ||
2645 | spin_lock(&mm->page_table_lock); | ||
2636 | page_cache_release(new_page); | 2646 | page_cache_release(new_page); |
2637 | page_cache_release(old_page); | 2647 | page_cache_release(old_page); |
2638 | return 0; | 2648 | return 0; |
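The hugetlb_cow() change computes the mmu-notifier range up front as the huge page containing the faulting address. Assuming huge_page_mask(h) is ~(huge_page_size(h) - 1), which is what the sketch below uses, the start is the address rounded down to a huge-page boundary and the end is exactly one huge page later. A quick stand-alone check of that arithmetic with a 2 MB huge page:

#include <stdio.h>

int main(void)
{
        unsigned long long huge_page_size = 2ULL << 20;        /* assume 2 MB */
        unsigned long long huge_page_mask = ~(huge_page_size - 1);
        unsigned long long address = 0x7f1234567123ULL;        /* arbitrary faulting address */

        unsigned long long mmun_start = address & huge_page_mask;    /* round down */
        unsigned long long mmun_end   = mmun_start + huge_page_size; /* one huge page later */

        /* prints: invalidate [0x7f1234400000, 0x7f1234600000) */
        printf("invalidate [%#llx, %#llx)\n", mmun_start, mmun_end);
        return 0;
}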
diff --git a/mm/internal.h b/mm/internal.h index 3314f79d775..a4fa284f6bc 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,25 +118,27 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
123 | bool sync; /* Synchronous migration */ | 122 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | 123 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
125 | incremental, once free_pfn | 124 | bool finished_update_free; /* True when the zone cached pfns are |
126 | and migrate_pfn meet, we restart | 125 | * no longer being updated |
127 | from the top of the zone; | 126 | */ |
128 | remember we wrapped around. */ | 127 | bool finished_update_migrate; |
129 | 128 | ||
130 | int order; /* order a direct compactor needs */ | 129 | int order; /* order a direct compactor needs */ |
131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 130 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
132 | struct zone *zone; | 131 | struct zone *zone; |
132 | bool contended; /* True if a lock was contended */ | ||
133 | struct page **page; /* Page captured of requested size */ | ||
133 | }; | 134 | }; |
134 | 135 | ||
135 | unsigned long | 136 | unsigned long |
136 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | 137 | isolate_freepages_range(struct compact_control *cc, |
138 | unsigned long start_pfn, unsigned long end_pfn); | ||
137 | unsigned long | 139 | unsigned long |
138 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 140 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
139 | unsigned long low_pfn, unsigned long end_pfn); | 141 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable); |
140 | 142 | ||
141 | #endif | 143 | #endif |
142 | 144 | ||
@@ -166,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
166 | } | 168 | } |
167 | 169 | ||
168 | /* | 170 | /* |
169 | * Called only in fault path via page_evictable() for a new page | 171 | * Called only in fault path, to determine if a new page is being |
170 | * to determine if it's being mapped into a LOCKED vma. | 172 | * mapped into a LOCKED vma. If it is, mark page as mlocked. |
171 | * If so, mark page as mlocked. | ||
172 | */ | 173 | */ |
173 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
174 | struct page *page) | 175 | struct page *page) |
@@ -179,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | |||
179 | return 0; | 180 | return 0; |
180 | 181 | ||
181 | if (!TestSetPageMlocked(page)) { | 182 | if (!TestSetPageMlocked(page)) { |
182 | inc_zone_page_state(page, NR_MLOCK); | 183 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
184 | hpage_nr_pages(page)); | ||
183 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 185 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
184 | } | 186 | } |
185 | return 1; | 187 | return 1; |
@@ -200,12 +202,7 @@ extern void munlock_vma_page(struct page *page); | |||
200 | * If called for a page that is still mapped by mlocked vmas, all we do | 202 | * If called for a page that is still mapped by mlocked vmas, all we do |
201 | * is revert to lazy LRU behaviour -- semantics are not broken. | 203 | * is revert to lazy LRU behaviour -- semantics are not broken. |
202 | */ | 204 | */ |
203 | extern void __clear_page_mlock(struct page *page); | 205 | extern void clear_page_mlock(struct page *page); |
204 | static inline void clear_page_mlock(struct page *page) | ||
205 | { | ||
206 | if (unlikely(TestClearPageMlocked(page))) | ||
207 | __clear_page_mlock(page); | ||
208 | } | ||
209 | 206 | ||
210 | /* | 207 | /* |
211 | * mlock_migrate_page - called only from migrate_page_copy() to | 208 | * mlock_migrate_page - called only from migrate_page_copy() to |
@@ -339,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
339 | #define ZONE_RECLAIM_FULL -1 | 336 | #define ZONE_RECLAIM_FULL -1 |
340 | #define ZONE_RECLAIM_SOME 0 | 337 | #define ZONE_RECLAIM_SOME 0 |
341 | #define ZONE_RECLAIM_SUCCESS 1 | 338 | #define ZONE_RECLAIM_SUCCESS 1 |
342 | #endif | ||
343 | 339 | ||
344 | extern int hwpoison_filter(struct page *p); | 340 | extern int hwpoison_filter(struct page *p); |
345 | 341 | ||
@@ -355,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | |||
355 | unsigned long, unsigned long); | 351 | unsigned long, unsigned long); |
356 | 352 | ||
357 | extern void set_pageblock_order(void); | 353 | extern void set_pageblock_order(void); |
354 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
355 | struct list_head *page_list); | ||
356 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
357 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
358 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
359 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
360 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
361 | |||
362 | /* Mask to get the watermark bits */ | ||
363 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
364 | |||
365 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
366 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
367 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
368 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | ||
369 | |||
370 | #endif /* __MM_INTERNAL_H */ | ||
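The ALLOC_* definitions moved into mm/internal.h keep a watermark index in the low bits of the allocation flags and use the higher bits as modifiers, so ALLOC_WMARK_MASK, defined as (ALLOC_NO_WATERMARKS - 1), extracts that index. A minimal stand-alone check of the bit layout, assuming WMARK_MIN/WMARK_LOW/WMARK_HIGH are 0, 1 and 2 as in the zone watermark enum:

#include <stdio.h>

/* Values copied from the hunk above; WMARK_* assumed to be 0/1/2. */
#define WMARK_MIN               0
#define WMARK_LOW               1
#define WMARK_HIGH              2

#define ALLOC_WMARK_MIN         WMARK_MIN
#define ALLOC_WMARK_LOW         WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS     0x04    /* don't check watermarks at all */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS-1)
#define ALLOC_HARDER            0x10
#define ALLOC_HIGH              0x20

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH | ALLOC_HARDER;

        /* The low two bits index zone->watermark[]; higher bits are modifiers. */
        printf("watermark index = %d\n", alloc_flags & ALLOC_WMARK_MASK);  /* 1 */
        printf("skip watermarks = %d\n",
               !!(alloc_flags & ALLOC_NO_WATERMARKS));                     /* 0 */
        return 0;
}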
diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 00000000000..4a5822a586e --- /dev/null +++ b/mm/interval_tree.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * mm/interval_tree.c - interval tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2012, Michel Lespinasse <walken@google.com> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/fs.h> | ||
11 | #include <linux/rmap.h> | ||
12 | #include <linux/interval_tree_generic.h> | ||
13 | |||
14 | static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) | ||
15 | { | ||
16 | return v->vm_pgoff; | ||
17 | } | ||
18 | |||
19 | static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | ||
20 | { | ||
21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | ||
22 | } | ||
23 | |||
24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | ||
25 | unsigned long, shared.linear.rb_subtree_last, | ||
26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | ||
27 | |||
28 | /* Insert node immediately after prev in the interval tree */ | ||
29 | void vma_interval_tree_insert_after(struct vm_area_struct *node, | ||
30 | struct vm_area_struct *prev, | ||
31 | struct rb_root *root) | ||
32 | { | ||
33 | struct rb_node **link; | ||
34 | struct vm_area_struct *parent; | ||
35 | unsigned long last = vma_last_pgoff(node); | ||
36 | |||
37 | VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); | ||
38 | |||
39 | if (!prev->shared.linear.rb.rb_right) { | ||
40 | parent = prev; | ||
41 | link = &prev->shared.linear.rb.rb_right; | ||
42 | } else { | ||
43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | ||
44 | struct vm_area_struct, shared.linear.rb); | ||
45 | if (parent->shared.linear.rb_subtree_last < last) | ||
46 | parent->shared.linear.rb_subtree_last = last; | ||
47 | while (parent->shared.linear.rb.rb_left) { | ||
48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | ||
49 | struct vm_area_struct, shared.linear.rb); | ||
50 | if (parent->shared.linear.rb_subtree_last < last) | ||
51 | parent->shared.linear.rb_subtree_last = last; | ||
52 | } | ||
53 | link = &parent->shared.linear.rb.rb_left; | ||
54 | } | ||
55 | |||
56 | node->shared.linear.rb_subtree_last = last; | ||
57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | ||
58 | rb_insert_augmented(&node->shared.linear.rb, root, | ||
59 | &vma_interval_tree_augment); | ||
60 | } | ||
61 | |||
62 | static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) | ||
63 | { | ||
64 | return vma_start_pgoff(avc->vma); | ||
65 | } | ||
66 | |||
67 | static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) | ||
68 | { | ||
69 | return vma_last_pgoff(avc->vma); | ||
70 | } | ||
71 | |||
72 | INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, | ||
73 | avc_start_pgoff, avc_last_pgoff, | ||
74 | static inline, __anon_vma_interval_tree) | ||
75 | |||
76 | void anon_vma_interval_tree_insert(struct anon_vma_chain *node, | ||
77 | struct rb_root *root) | ||
78 | { | ||
79 | #ifdef CONFIG_DEBUG_VM_RB | ||
80 | node->cached_vma_start = avc_start_pgoff(node); | ||
81 | node->cached_vma_last = avc_last_pgoff(node); | ||
82 | #endif | ||
83 | __anon_vma_interval_tree_insert(node, root); | ||
84 | } | ||
85 | |||
86 | void anon_vma_interval_tree_remove(struct anon_vma_chain *node, | ||
87 | struct rb_root *root) | ||
88 | { | ||
89 | __anon_vma_interval_tree_remove(node, root); | ||
90 | } | ||
91 | |||
92 | struct anon_vma_chain * | ||
93 | anon_vma_interval_tree_iter_first(struct rb_root *root, | ||
94 | unsigned long first, unsigned long last) | ||
95 | { | ||
96 | return __anon_vma_interval_tree_iter_first(root, first, last); | ||
97 | } | ||
98 | |||
99 | struct anon_vma_chain * | ||
100 | anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, | ||
101 | unsigned long first, unsigned long last) | ||
102 | { | ||
103 | return __anon_vma_interval_tree_iter_next(node, first, last); | ||
104 | } | ||
105 | |||
106 | #ifdef CONFIG_DEBUG_VM_RB | ||
107 | void anon_vma_interval_tree_verify(struct anon_vma_chain *node) | ||
108 | { | ||
109 | WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); | ||
110 | WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); | ||
111 | } | ||
112 | #endif | ||
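The new mm/interval_tree.c mostly instantiates the generic interval-tree template; what it buys the callers that appear later in this diff (vma_interval_tree_foreach, anon_vma_interval_tree_foreach) is the query "find every node whose closed [start, last] range overlaps a lookup range". The sketch below is a self-contained user-space analog of that query, with made-up names: an unbalanced BST stands in for the augmented rbtree, but the subtree_last augmentation and the iter_first-style walk are the same idea.

#include <stdio.h>
#include <stdlib.h>

struct itnode {
        unsigned long start, last;      /* closed interval [start, last] */
        unsigned long subtree_last;     /* max .last over this subtree   */
        struct itnode *left, *right;
};

static struct itnode *it_insert(struct itnode *root, struct itnode *n)
{
        n->subtree_last = n->last;
        n->left = n->right = NULL;
        if (!root)
                return n;
        if (n->last > root->subtree_last)
                root->subtree_last = n->last;   /* keep the augmentation valid */
        if (n->start < root->start)
                root->left = it_insert(root->left, n);
        else
                root->right = it_insert(root->right, n);
        return root;
}

/* Leftmost node overlapping [first, last], or NULL. */
static struct itnode *it_iter_first(struct itnode *node,
                                    unsigned long first, unsigned long last)
{
        if (!node || node->subtree_last < first)
                return NULL;
        while (node) {
                if (node->left && node->left->subtree_last >= first) {
                        node = node->left;      /* leftmost match, if any, is down here */
                        continue;
                }
                if (node->start > last)
                        return NULL;            /* this and everything right starts too late */
                if (node->last >= first)
                        return node;            /* overlap found */
                node = node->right;
                if (node && node->subtree_last < first)
                        return NULL;            /* right subtree ends too early */
        }
        return NULL;
}

int main(void)
{
        struct itnode vmas[] = {                /* pgoff ranges of three mappings */
                { .start = 0,  .last = 3  },
                { .start = 10, .last = 19 },
                { .start = 15, .last = 40 },
        };
        struct itnode *root = NULL, *hit;

        for (int i = 0; i < 3; i++)
                root = it_insert(root, &vmas[i]);

        hit = it_iter_first(root, 12, 13);      /* who maps pgoff 12..13? */
        if (hit)
                printf("overlap: [%lu, %lu]\n", hit->start, hit->last);
        return 0;
}

Keeping subtree_last equal to the maximum interval end below each node is what lets the search prune whole subtrees and still return the leftmost overlap first.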
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 45eb6217bf3..a217cc54406 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -29,7 +29,7 @@ | |||
29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | 29 | * - kmemleak_lock (rwlock): protects the object_list modifications and |
30 | * accesses to the object_tree_root. The object_list is the main list | 30 | * accesses to the object_tree_root. The object_list is the main list |
31 | * holding the metadata (struct kmemleak_object) for the allocated memory | 31 | * holding the metadata (struct kmemleak_object) for the allocated memory |
32 | * blocks. The object_tree_root is a priority search tree used to look-up | 32 | * blocks. The object_tree_root is a red black tree used to look-up |
33 | * metadata based on a pointer to the corresponding memory block. The | 33 | * metadata based on a pointer to the corresponding memory block. The |
34 | * kmemleak_object structures are added to the object_list and | 34 | * kmemleak_object structures are added to the object_list and |
35 | * object_tree_root in the create_object() function called from the | 35 | * object_tree_root in the create_object() function called from the |
@@ -71,7 +71,7 @@ | |||
71 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
72 | #include <linux/export.h> | 72 | #include <linux/export.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/rbtree.h> |
75 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
76 | #include <linux/debugfs.h> | 76 | #include <linux/debugfs.h> |
77 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
@@ -132,7 +132,7 @@ struct kmemleak_scan_area { | |||
132 | * Structure holding the metadata for each allocated memory block. | 132 | * Structure holding the metadata for each allocated memory block. |
133 | * Modifications to such objects should be made while holding the | 133 | * Modifications to such objects should be made while holding the |
134 | * object->lock. Insertions or deletions from object_list, gray_list or | 134 | * object->lock. Insertions or deletions from object_list, gray_list or |
135 | * tree_node are already protected by the corresponding locks or mutex (see | 135 | * rb_node are already protected by the corresponding locks or mutex (see |
136 | * the notes on locking above). These objects are reference-counted | 136 | * the notes on locking above). These objects are reference-counted |
137 | * (use_count) and freed using the RCU mechanism. | 137 | * (use_count) and freed using the RCU mechanism. |
138 | */ | 138 | */ |
@@ -141,7 +141,7 @@ struct kmemleak_object { | |||
141 | unsigned long flags; /* object status flags */ | 141 | unsigned long flags; /* object status flags */ |
142 | struct list_head object_list; | 142 | struct list_head object_list; |
143 | struct list_head gray_list; | 143 | struct list_head gray_list; |
144 | struct prio_tree_node tree_node; | 144 | struct rb_node rb_node; |
145 | struct rcu_head rcu; /* object_list lockless traversal */ | 145 | struct rcu_head rcu; /* object_list lockless traversal */ |
146 | /* object usage count; object freed when use_count == 0 */ | 146 | /* object usage count; object freed when use_count == 0 */ |
147 | atomic_t use_count; | 147 | atomic_t use_count; |
@@ -182,9 +182,9 @@ struct kmemleak_object { | |||
182 | static LIST_HEAD(object_list); | 182 | static LIST_HEAD(object_list); |
183 | /* the list of gray-colored objects (see color_gray comment below) */ | 183 | /* the list of gray-colored objects (see color_gray comment below) */ |
184 | static LIST_HEAD(gray_list); | 184 | static LIST_HEAD(gray_list); |
185 | /* prio search tree for object boundaries */ | 185 | /* search tree for object boundaries */ |
186 | static struct prio_tree_root object_tree_root; | 186 | static struct rb_root object_tree_root = RB_ROOT; |
187 | /* rw_lock protecting the access to object_list and prio_tree_root */ | 187 | /* rw_lock protecting the access to object_list and object_tree_root */ |
188 | static DEFINE_RWLOCK(kmemleak_lock); | 188 | static DEFINE_RWLOCK(kmemleak_lock); |
189 | 189 | ||
190 | /* allocation caches for kmemleak internal data */ | 190 | /* allocation caches for kmemleak internal data */ |
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
380 | trace.entries = object->trace; | 380 | trace.entries = object->trace; |
381 | 381 | ||
382 | pr_notice("Object 0x%08lx (size %zu):\n", | 382 | pr_notice("Object 0x%08lx (size %zu):\n", |
383 | object->tree_node.start, object->size); | 383 | object->pointer, object->size); |
384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | 384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", |
385 | object->comm, object->pid, object->jiffies); | 385 | object->comm, object->pid, object->jiffies); |
386 | pr_notice(" min_count = %d\n", object->min_count); | 386 | pr_notice(" min_count = %d\n", object->min_count); |
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) | |||
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Look-up a memory block metadata (kmemleak_object) in the priority search | 395 | * Look-up a memory block metadata (kmemleak_object) in the object search |
396 | * tree based on a pointer value. If alias is 0, only values pointing to the | 396 | * tree based on a pointer value. If alias is 0, only values pointing to the |
397 | * beginning of the memory block are allowed. The kmemleak_lock must be held | 397 | * beginning of the memory block are allowed. The kmemleak_lock must be held |
398 | * when calling this function. | 398 | * when calling this function. |
399 | */ | 399 | */ |
400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | 400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) |
401 | { | 401 | { |
402 | struct prio_tree_node *node; | 402 | struct rb_node *rb = object_tree_root.rb_node; |
403 | struct prio_tree_iter iter; | 403 | |
404 | struct kmemleak_object *object; | 404 | while (rb) { |
405 | 405 | struct kmemleak_object *object = | |
406 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | 406 | rb_entry(rb, struct kmemleak_object, rb_node); |
407 | node = prio_tree_next(&iter); | 407 | if (ptr < object->pointer) |
408 | if (node) { | 408 | rb = object->rb_node.rb_left; |
409 | object = prio_tree_entry(node, struct kmemleak_object, | 409 | else if (object->pointer + object->size <= ptr) |
410 | tree_node); | 410 | rb = object->rb_node.rb_right; |
411 | if (!alias && object->pointer != ptr) { | 411 | else if (object->pointer == ptr || alias) |
412 | return object; | ||
413 | else { | ||
412 | kmemleak_warn("Found object by alias at 0x%08lx\n", | 414 | kmemleak_warn("Found object by alias at 0x%08lx\n", |
413 | ptr); | 415 | ptr); |
414 | dump_object_info(object); | 416 | dump_object_info(object); |
415 | object = NULL; | 417 | break; |
416 | } | 418 | } |
417 | } else | 419 | } |
418 | object = NULL; | 420 | return NULL; |
419 | |||
420 | return object; | ||
421 | } | 421 | } |
422 | 422 | ||
423 | /* | 423 | /* |
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) | |||
471 | } | 471 | } |
472 | 472 | ||
473 | /* | 473 | /* |
474 | * Look up an object in the prio search tree and increase its use_count. | 474 | * Look up an object in the object search tree and increase its use_count. |
475 | */ | 475 | */ |
476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | 476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) |
477 | { | 477 | { |
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
516 | int min_count, gfp_t gfp) | 516 | int min_count, gfp_t gfp) |
517 | { | 517 | { |
518 | unsigned long flags; | 518 | unsigned long flags; |
519 | struct kmemleak_object *object; | 519 | struct kmemleak_object *object, *parent; |
520 | struct prio_tree_node *node; | 520 | struct rb_node **link, *rb_parent; |
521 | 521 | ||
522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); | 522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); |
523 | if (!object) { | 523 | if (!object) { |
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
560 | /* kernel backtrace */ | 560 | /* kernel backtrace */ |
561 | object->trace_len = __save_stack_trace(object->trace); | 561 | object->trace_len = __save_stack_trace(object->trace); |
562 | 562 | ||
563 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
564 | object->tree_node.start = ptr; | ||
565 | object->tree_node.last = ptr + size - 1; | ||
566 | |||
567 | write_lock_irqsave(&kmemleak_lock, flags); | 563 | write_lock_irqsave(&kmemleak_lock, flags); |
568 | 564 | ||
569 | min_addr = min(min_addr, ptr); | 565 | min_addr = min(min_addr, ptr); |
570 | max_addr = max(max_addr, ptr + size); | 566 | max_addr = max(max_addr, ptr + size); |
571 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | 567 | link = &object_tree_root.rb_node; |
572 | /* | 568 | rb_parent = NULL; |
573 | * The code calling the kernel does not yet have the pointer to the | 569 | while (*link) { |
574 | * memory block to be able to free it. However, we still hold the | 570 | rb_parent = *link; |
575 | * kmemleak_lock here in case parts of the kernel started freeing | 571 | parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); |
576 | * random memory blocks. | 572 | if (ptr + size <= parent->pointer) |
577 | */ | 573 | link = &parent->rb_node.rb_left; |
578 | if (node != &object->tree_node) { | 574 | else if (parent->pointer + parent->size <= ptr) |
579 | kmemleak_stop("Cannot insert 0x%lx into the object search tree " | 575 | link = &parent->rb_node.rb_right; |
580 | "(already existing)\n", ptr); | 576 | else { |
581 | object = lookup_object(ptr, 1); | 577 | kmemleak_stop("Cannot insert 0x%lx into the object " |
582 | spin_lock(&object->lock); | 578 | "search tree (overlaps existing)\n", |
583 | dump_object_info(object); | 579 | ptr); |
584 | spin_unlock(&object->lock); | 580 | kmem_cache_free(object_cache, object); |
585 | 581 | object = parent; | |
586 | goto out; | 582 | spin_lock(&object->lock); |
583 | dump_object_info(object); | ||
584 | spin_unlock(&object->lock); | ||
585 | goto out; | ||
586 | } | ||
587 | } | 587 | } |
588 | rb_link_node(&object->rb_node, rb_parent, link); | ||
589 | rb_insert_color(&object->rb_node, &object_tree_root); | ||
590 | |||
588 | list_add_tail_rcu(&object->object_list, &object_list); | 591 | list_add_tail_rcu(&object->object_list, &object_list); |
589 | out: | 592 | out: |
590 | write_unlock_irqrestore(&kmemleak_lock, flags); | 593 | write_unlock_irqrestore(&kmemleak_lock, flags); |
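Both the reworked lookup_object() and the insertion loop above rest on one three-way comparison of an address against a block's [pointer, pointer + size) range; since kmemleak's blocks never overlap, an ordinary ordered search suffices, which is why the prio tree could be dropped. The comparison is easiest to see in isolation; the stand-alone sketch below uses a plain binary search over a sorted array purely for illustration (names are hypothetical).

#include <stdio.h>
#include <stdlib.h>

struct block {
        unsigned long pointer;
        size_t size;
};

static struct block *lookup(struct block *blocks, int n, unsigned long ptr)
{
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;
                struct block *b = &blocks[mid];

                if (ptr < b->pointer)                 /* descend "left"  */
                        hi = mid - 1;
                else if (b->pointer + b->size <= ptr) /* descend "right" */
                        lo = mid + 1;
                else                                  /* ptr falls inside the block */
                        return b;
        }
        return NULL;
}

int main(void)
{
        struct block blocks[] = {               /* sorted, non-overlapping */
                { 0x1000, 0x100 }, { 0x2000, 0x80 }, { 0x8000, 0x1000 },
        };
        struct block *b = lookup(blocks, 3, 0x2040);

        if (b)
                printf("0x2040 belongs to block at 0x%lx (size %zu)\n",
                       b->pointer, b->size);
        return 0;
}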
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) | |||
600 | unsigned long flags; | 603 | unsigned long flags; |
601 | 604 | ||
602 | write_lock_irqsave(&kmemleak_lock, flags); | 605 | write_lock_irqsave(&kmemleak_lock, flags); |
603 | prio_tree_remove(&object_tree_root, &object->tree_node); | 606 | rb_erase(&object->rb_node, &object_tree_root); |
604 | list_del_rcu(&object->object_list); | 607 | list_del_rcu(&object->object_list); |
605 | write_unlock_irqrestore(&kmemleak_lock, flags); | 608 | write_unlock_irqrestore(&kmemleak_lock, flags); |
606 | 609 | ||
@@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
1483 | { | 1486 | { |
1484 | struct kmemleak_object *prev_obj = v; | 1487 | struct kmemleak_object *prev_obj = v; |
1485 | struct kmemleak_object *next_obj = NULL; | 1488 | struct kmemleak_object *next_obj = NULL; |
1486 | struct list_head *n = &prev_obj->object_list; | 1489 | struct kmemleak_object *obj = prev_obj; |
1487 | 1490 | ||
1488 | ++(*pos); | 1491 | ++(*pos); |
1489 | 1492 | ||
1490 | list_for_each_continue_rcu(n, &object_list) { | 1493 | list_for_each_entry_continue_rcu(obj, &object_list, object_list) { |
1491 | struct kmemleak_object *obj = | ||
1492 | list_entry(n, struct kmemleak_object, object_list); | ||
1493 | if (get_object(obj)) { | 1494 | if (get_object(obj)) { |
1494 | next_obj = obj; | 1495 | next_obj = obj; |
1495 | break; | 1496 | break; |
@@ -1768,7 +1769,6 @@ void __init kmemleak_init(void) | |||
1768 | 1769 | ||
1769 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | 1770 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); |
1770 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | 1771 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); |
1771 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
1772 | 1772 | ||
1773 | if (crt_early_log >= ARRAY_SIZE(early_log)) | 1773 | if (crt_early_log >= ARRAY_SIZE(early_log)) |
1774 | pr_warning("Early log buffer exceeded (%d), please increase " | 1774 | pr_warning("Early log buffer exceeded (%d), please increase " |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
709 | spinlock_t *ptl; | 709 | spinlock_t *ptl; |
710 | int swapped; | 710 | int swapped; |
711 | int err = -EFAULT; | 711 | int err = -EFAULT; |
712 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
713 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
712 | 714 | ||
713 | addr = page_address_in_vma(page, vma); | 715 | addr = page_address_in_vma(page, vma); |
714 | if (addr == -EFAULT) | 716 | if (addr == -EFAULT) |
715 | goto out; | 717 | goto out; |
716 | 718 | ||
717 | BUG_ON(PageTransCompound(page)); | 719 | BUG_ON(PageTransCompound(page)); |
720 | |||
721 | mmun_start = addr; | ||
722 | mmun_end = addr + PAGE_SIZE; | ||
723 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
724 | |||
718 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 725 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
719 | if (!ptep) | 726 | if (!ptep) |
720 | goto out; | 727 | goto out_mn; |
721 | 728 | ||
722 | if (pte_write(*ptep) || pte_dirty(*ptep)) { | 729 | if (pte_write(*ptep) || pte_dirty(*ptep)) { |
723 | pte_t entry; | 730 | pte_t entry; |
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
752 | 759 | ||
753 | out_unlock: | 760 | out_unlock: |
754 | pte_unmap_unlock(ptep, ptl); | 761 | pte_unmap_unlock(ptep, ptl); |
762 | out_mn: | ||
763 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
755 | out: | 764 | out: |
756 | return err; | 765 | return err; |
757 | } | 766 | } |
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
776 | spinlock_t *ptl; | 785 | spinlock_t *ptl; |
777 | unsigned long addr; | 786 | unsigned long addr; |
778 | int err = -EFAULT; | 787 | int err = -EFAULT; |
788 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
789 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
779 | 790 | ||
780 | addr = page_address_in_vma(page, vma); | 791 | addr = page_address_in_vma(page, vma); |
781 | if (addr == -EFAULT) | 792 | if (addr == -EFAULT) |
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
794 | if (!pmd_present(*pmd)) | 805 | if (!pmd_present(*pmd)) |
795 | goto out; | 806 | goto out; |
796 | 807 | ||
808 | mmun_start = addr; | ||
809 | mmun_end = addr + PAGE_SIZE; | ||
810 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
811 | |||
797 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 812 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
798 | if (!pte_same(*ptep, orig_pte)) { | 813 | if (!pte_same(*ptep, orig_pte)) { |
799 | pte_unmap_unlock(ptep, ptl); | 814 | pte_unmap_unlock(ptep, ptl); |
800 | goto out; | 815 | goto out_mn; |
801 | } | 816 | } |
802 | 817 | ||
803 | get_page(kpage); | 818 | get_page(kpage); |
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
814 | 829 | ||
815 | pte_unmap_unlock(ptep, ptl); | 830 | pte_unmap_unlock(ptep, ptl); |
816 | err = 0; | 831 | err = 0; |
832 | out_mn: | ||
833 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
817 | out: | 834 | out: |
818 | return err; | 835 | return err; |
819 | } | 836 | } |
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1469 | */ | 1486 | */ |
1470 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1487 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1471 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1488 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1472 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1489 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) |
1473 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) | ||
1474 | return 0; /* just ignore the advice */ | 1490 | return 0; /* just ignore the advice */ |
1475 | 1491 | ||
1492 | #ifdef VM_SAO | ||
1493 | if (*vm_flags & VM_SAO) | ||
1494 | return 0; | ||
1495 | #endif | ||
1496 | |||
1476 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1497 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
1477 | err = __ksm_enter(mm); | 1498 | err = __ksm_enter(mm); |
1478 | if (err) | 1499 | if (err) |
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page, | |||
1582 | SetPageSwapBacked(new_page); | 1603 | SetPageSwapBacked(new_page); |
1583 | __set_page_locked(new_page); | 1604 | __set_page_locked(new_page); |
1584 | 1605 | ||
1585 | if (page_evictable(new_page, vma)) | 1606 | if (!mlocked_vma_newpage(vma, new_page)) |
1586 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | 1607 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); |
1587 | else | 1608 | else |
1588 | add_page_to_unevictable_list(new_page); | 1609 | add_page_to_unevictable_list(new_page); |
@@ -1614,7 +1635,8 @@ again: | |||
1614 | struct vm_area_struct *vma; | 1635 | struct vm_area_struct *vma; |
1615 | 1636 | ||
1616 | anon_vma_lock(anon_vma); | 1637 | anon_vma_lock(anon_vma); |
1617 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1638 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1639 | 0, ULONG_MAX) { | ||
1618 | vma = vmac->vma; | 1640 | vma = vmac->vma; |
1619 | if (rmap_item->address < vma->vm_start || | 1641 | if (rmap_item->address < vma->vm_start || |
1620 | rmap_item->address >= vma->vm_end) | 1642 | rmap_item->address >= vma->vm_end) |
@@ -1667,7 +1689,8 @@ again: | |||
1667 | struct vm_area_struct *vma; | 1689 | struct vm_area_struct *vma; |
1668 | 1690 | ||
1669 | anon_vma_lock(anon_vma); | 1691 | anon_vma_lock(anon_vma); |
1670 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1692 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1693 | 0, ULONG_MAX) { | ||
1671 | vma = vmac->vma; | 1694 | vma = vmac->vma; |
1672 | if (rmap_item->address < vma->vm_start || | 1695 | if (rmap_item->address < vma->vm_start || |
1673 | rmap_item->address >= vma->vm_end) | 1696 | rmap_item->address >= vma->vm_end) |
@@ -1719,7 +1742,8 @@ again: | |||
1719 | struct vm_area_struct *vma; | 1742 | struct vm_area_struct *vma; |
1720 | 1743 | ||
1721 | anon_vma_lock(anon_vma); | 1744 | anon_vma_lock(anon_vma); |
1722 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1745 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1746 | 0, ULONG_MAX) { | ||
1723 | vma = vmac->vma; | 1747 | vma = vmac->vma; |
1724 | if (rmap_item->address < vma->vm_start || | 1748 | if (rmap_item->address < vma->vm_start || |
1725 | rmap_item->address >= vma->vm_end) | 1749 | rmap_item->address >= vma->vm_end) |
diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d1..03dfa5c7adb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
69 | new_flags &= ~VM_DONTCOPY; | 69 | new_flags &= ~VM_DONTCOPY; |
70 | break; | 70 | break; |
71 | case MADV_DONTDUMP: | 71 | case MADV_DONTDUMP: |
72 | new_flags |= VM_NODUMP; | 72 | new_flags |= VM_DONTDUMP; |
73 | break; | 73 | break; |
74 | case MADV_DODUMP: | 74 | case MADV_DODUMP: |
75 | new_flags &= ~VM_NODUMP; | 75 | if (new_flags & VM_SPECIAL) { |
76 | error = -EINVAL; | ||
77 | goto out; | ||
78 | } | ||
79 | new_flags &= ~VM_DONTDUMP; | ||
76 | break; | 80 | break; |
77 | case MADV_MERGEABLE: | 81 | case MADV_MERGEABLE: |
78 | case MADV_UNMERGEABLE: | 82 | case MADV_UNMERGEABLE: |
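The madvise.c hunk renames VM_NODUMP to VM_DONTDUMP and makes MADV_DODUMP return -EINVAL on special mappings rather than silently clearing flags. The userspace calling convention is unchanged; a minimal usage sketch, assuming a libc recent enough to expose MADV_DONTDUMP/MADV_DODUMP and with most error handling elided:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        /* Keep this region (e.g. a key cache) out of core dumps; after this
         * patch the kernel records that as VM_DONTDUMP on the vma. */
        if (madvise(buf, len, MADV_DONTDUMP) != 0)
                perror("madvise(MADV_DONTDUMP)");

        /* ... later, opt back in.  On special mappings (VM_IO and friends)
         * the patch above now makes this fail with EINVAL. */
        if (madvise(buf, len, MADV_DODUMP) != 0)
                perror("madvise(MADV_DODUMP)");

        munmap(buf, len);
        return 0;
}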
diff --git a/mm/memblock.c b/mm/memblock.c index 4d9393c7edc..931eef145af 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; | |||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 41 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
42 | 42 | ||
43 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
44 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static __init_memblock const char * |
45 | memblock_type_name(struct memblock_type *type) | ||
45 | { | 46 | { |
46 | if (type == &memblock.memory) | 47 | if (type == &memblock.memory) |
47 | return "memory"; | 48 | return "memory"; |
@@ -246,7 +247,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
246 | min(new_area_start, memblock.current_limit), | 247 | min(new_area_start, memblock.current_limit), |
247 | new_alloc_size, PAGE_SIZE); | 248 | new_alloc_size, PAGE_SIZE); |
248 | 249 | ||
249 | new_array = addr ? __va(addr) : 0; | 250 | new_array = addr ? __va(addr) : NULL; |
250 | } | 251 | } |
251 | if (!addr) { | 252 | if (!addr) { |
252 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", | 253 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", |
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
756 | return ret; | 757 | return ret; |
757 | 758 | ||
758 | for (i = start_rgn; i < end_rgn; i++) | 759 | for (i = start_rgn; i < end_rgn; i++) |
759 | type->regions[i].nid = nid; | 760 | memblock_set_region_node(&type->regions[i], nid); |
760 | 761 | ||
761 | memblock_merge_regions(type); | 762 | memblock_merge_regions(type); |
762 | return 0; | 763 | return 0; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 795e525afab..7acf43bf04a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/oom.h> | 51 | #include <linux/oom.h> |
52 | #include "internal.h" | 52 | #include "internal.h" |
53 | #include <net/sock.h> | 53 | #include <net/sock.h> |
54 | #include <net/ip.h> | ||
54 | #include <net/tcp_memcontrol.h> | 55 | #include <net/tcp_memcontrol.h> |
55 | 56 | ||
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -326,7 +327,7 @@ struct mem_cgroup { | |||
326 | struct mem_cgroup_stat_cpu nocpu_base; | 327 | struct mem_cgroup_stat_cpu nocpu_base; |
327 | spinlock_t pcp_counter_lock; | 328 | spinlock_t pcp_counter_lock; |
328 | 329 | ||
329 | #ifdef CONFIG_INET | 330 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
330 | struct tcp_memcontrol tcp_mem; | 331 | struct tcp_memcontrol tcp_mem; |
331 | #endif | 332 | #endif |
332 | }; | 333 | }; |
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | |||
411 | return container_of(s, struct mem_cgroup, css); | 412 | return container_of(s, struct mem_cgroup, css); |
412 | } | 413 | } |
413 | 414 | ||
415 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
416 | { | ||
417 | return (memcg == root_mem_cgroup); | ||
418 | } | ||
419 | |||
414 | /* Writing them here to avoid exposing memcg's inner layout */ | 420 | /* Writing them here to avoid exposing memcg's inner layout */ |
415 | #ifdef CONFIG_MEMCG_KMEM | 421 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
416 | #include <net/sock.h> | ||
417 | #include <net/ip.h> | ||
418 | 422 | ||
419 | static bool mem_cgroup_is_root(struct mem_cgroup *memcg); | ||
420 | void sock_update_memcg(struct sock *sk) | 423 | void sock_update_memcg(struct sock *sk) |
421 | { | 424 | { |
422 | if (mem_cgroup_sockets_enabled) { | 425 | if (mem_cgroup_sockets_enabled) { |
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk) | |||
461 | } | 464 | } |
462 | } | 465 | } |
463 | 466 | ||
464 | #ifdef CONFIG_INET | ||
465 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | 467 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) |
466 | { | 468 | { |
467 | if (!memcg || mem_cgroup_is_root(memcg)) | 469 | if (!memcg || mem_cgroup_is_root(memcg)) |
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
470 | return &memcg->tcp_mem.cg_proto; | 472 | return &memcg->tcp_mem.cg_proto; |
471 | } | 473 | } |
472 | EXPORT_SYMBOL(tcp_proto_cgroup); | 474 | EXPORT_SYMBOL(tcp_proto_cgroup); |
473 | #endif /* CONFIG_INET */ | ||
474 | #endif /* CONFIG_MEMCG_KMEM */ | ||
475 | 475 | ||
476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | ||
477 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 476 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
478 | { | 477 | { |
479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 478 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1016 | iter != NULL; \ | 1015 | iter != NULL; \ |
1017 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1018 | 1017 | ||
1019 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
1020 | { | ||
1021 | return (memcg == root_mem_cgroup); | ||
1022 | } | ||
1023 | |||
1024 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1025 | { | 1019 | { |
1026 | struct mem_cgroup *memcg; | 1020 | struct mem_cgroup *memcg; |
@@ -4973,6 +4967,13 @@ mem_cgroup_create(struct cgroup *cont) | |||
4973 | } else { | 4967 | } else { |
4974 | res_counter_init(&memcg->res, NULL); | 4968 | res_counter_init(&memcg->res, NULL); |
4975 | res_counter_init(&memcg->memsw, NULL); | 4969 | res_counter_init(&memcg->memsw, NULL); |
4970 | /* | ||
4971 | * Deeper hierarchy with use_hierarchy == false doesn't make | ||
4972 | * much sense so let cgroup subsystem know about this | ||
4973 | * unfortunate state in our controller. | ||
4974 | */ | ||
4975 | if (parent && parent != root_mem_cgroup) | ||
4976 | mem_cgroup_subsys.broken_hierarchy = true; | ||
4976 | } | 4977 | } |
4977 | memcg->last_scanned_node = MAX_NUMNODES; | 4978 | memcg->last_scanned_node = MAX_NUMNODES; |
4978 | INIT_LIST_HEAD(&memcg->oom_notify); | 4979 | INIT_LIST_HEAD(&memcg->oom_notify); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a661..6c5899b9034 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
400 | struct vm_area_struct *vma; | 400 | struct vm_area_struct *vma; |
401 | struct task_struct *tsk; | 401 | struct task_struct *tsk; |
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | ||
403 | 404 | ||
404 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma(page); |
405 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
406 | return; | 407 | return; |
407 | 408 | ||
409 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
408 | read_lock(&tasklist_lock); | 410 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 411 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 412 | struct anon_vma_chain *vmac; |
411 | 413 | ||
412 | if (!task_early_kill(tsk)) | 414 | if (!task_early_kill(tsk)) |
413 | continue; | 415 | continue; |
414 | list_for_each_entry(vmac, &av->head, same_anon_vma) { | 416 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
417 | pgoff, pgoff) { | ||
415 | vma = vmac->vma; | 418 | vma = vmac->vma; |
416 | if (!page_mapped_in_vma(page, vma)) | 419 | if (!page_mapped_in_vma(page, vma)) |
417 | continue; | 420 | continue; |
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
431 | { | 434 | { |
432 | struct vm_area_struct *vma; | 435 | struct vm_area_struct *vma; |
433 | struct task_struct *tsk; | 436 | struct task_struct *tsk; |
434 | struct prio_tree_iter iter; | ||
435 | struct address_space *mapping = page->mapping; | 437 | struct address_space *mapping = page->mapping; |
436 | 438 | ||
437 | mutex_lock(&mapping->i_mmap_mutex); | 439 | mutex_lock(&mapping->i_mmap_mutex); |
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
442 | if (!task_early_kill(tsk)) | 444 | if (!task_early_kill(tsk)) |
443 | continue; | 445 | continue; |
444 | 446 | ||
445 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | 447 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 448 | pgoff) { |
447 | /* | 449 | /* |
448 | * Send early kill signal to tasks where a vma covers | 450 | * Send early kill signal to tasks where a vma covers |
diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a..fb135ba4aba 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
712 | add_taint(TAINT_BAD_PAGE); | 712 | add_taint(TAINT_BAD_PAGE); |
713 | } | 713 | } |
714 | 714 | ||
715 | static inline int is_cow_mapping(vm_flags_t flags) | 715 | static inline bool is_cow_mapping(vm_flags_t flags) |
716 | { | 716 | { |
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 718 | } |
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1039 | unsigned long next; | 1039 | unsigned long next; |
1040 | unsigned long addr = vma->vm_start; | 1040 | unsigned long addr = vma->vm_start; |
1041 | unsigned long end = vma->vm_end; | 1041 | unsigned long end = vma->vm_end; |
1042 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1043 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1044 | bool is_cow; | ||
1042 | int ret; | 1045 | int ret; |
1043 | 1046 | ||
1044 | /* | 1047 | /* |
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1047 | * readonly mappings. The tradeoff is that copy_page_range is more | 1050 | * readonly mappings. The tradeoff is that copy_page_range is more |
1048 | * efficient than faulting. | 1051 | * efficient than faulting. |
1049 | */ | 1052 | */ |
1050 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { | 1053 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | |
1054 | VM_PFNMAP | VM_MIXEDMAP))) { | ||
1051 | if (!vma->anon_vma) | 1055 | if (!vma->anon_vma) |
1052 | return 0; | 1056 | return 0; |
1053 | } | 1057 | } |
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1055 | if (is_vm_hugetlb_page(vma)) | 1059 | if (is_vm_hugetlb_page(vma)) |
1056 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1060 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
1057 | 1061 | ||
1058 | if (unlikely(is_pfn_mapping(vma))) { | 1062 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
1059 | /* | 1063 | /* |
1060 | * We do not free on error cases below as remove_vma | 1064 | * We do not free on error cases below as remove_vma |
1061 | * gets called on error from higher level routine | 1065 | * gets called on error from higher level routine |
1062 | */ | 1066 | */ |
1063 | ret = track_pfn_vma_copy(vma); | 1067 | ret = track_pfn_copy(vma); |
1064 | if (ret) | 1068 | if (ret) |
1065 | return ret; | 1069 | return ret; |
1066 | } | 1070 | } |
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1071 | * parent mm. And a permission downgrade will only happen if | 1075 | * parent mm. And a permission downgrade will only happen if |
1072 | * is_cow_mapping() returns true. | 1076 | * is_cow_mapping() returns true. |
1073 | */ | 1077 | */ |
1074 | if (is_cow_mapping(vma->vm_flags)) | 1078 | is_cow = is_cow_mapping(vma->vm_flags); |
1075 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | 1079 | mmun_start = addr; |
1080 | mmun_end = end; | ||
1081 | if (is_cow) | ||
1082 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, | ||
1083 | mmun_end); | ||
1076 | 1084 | ||
1077 | ret = 0; | 1085 | ret = 0; |
1078 | dst_pgd = pgd_offset(dst_mm, addr); | 1086 | dst_pgd = pgd_offset(dst_mm, addr); |
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1088 | } | 1096 | } |
1089 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 1097 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
1090 | 1098 | ||
1091 | if (is_cow_mapping(vma->vm_flags)) | 1099 | if (is_cow) |
1092 | mmu_notifier_invalidate_range_end(src_mm, | 1100 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); |
1093 | vma->vm_start, end); | ||
1094 | return ret; | 1101 | return ret; |
1095 | } | 1102 | } |
1096 | 1103 | ||
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1327 | if (vma->vm_file) | 1334 | if (vma->vm_file) |
1328 | uprobe_munmap(vma, start, end); | 1335 | uprobe_munmap(vma, start, end); |
1329 | 1336 | ||
1330 | if (unlikely(is_pfn_mapping(vma))) | 1337 | if (unlikely(vma->vm_flags & VM_PFNMAP)) |
1331 | untrack_pfn_vma(vma, 0, 0); | 1338 | untrack_pfn(vma, 0, 0); |
1332 | 1339 | ||
1333 | if (start != end) { | 1340 | if (start != end) { |
1334 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1341 | if (unlikely(is_vm_hugetlb_page(vma))) { |
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1521 | spin_unlock(&mm->page_table_lock); | 1528 | spin_unlock(&mm->page_table_lock); |
1522 | wait_split_huge_page(vma->anon_vma, pmd); | 1529 | wait_split_huge_page(vma->anon_vma, pmd); |
1523 | } else { | 1530 | } else { |
1524 | page = follow_trans_huge_pmd(mm, address, | 1531 | page = follow_trans_huge_pmd(vma, address, |
1525 | pmd, flags); | 1532 | pmd, flags); |
1526 | spin_unlock(&mm->page_table_lock); | 1533 | spin_unlock(&mm->page_table_lock); |
1527 | goto out; | 1534 | goto out; |
@@ -1576,12 +1583,12 @@ split_fallthrough: | |||
1576 | if (page->mapping && trylock_page(page)) { | 1583 | if (page->mapping && trylock_page(page)) { |
1577 | lru_add_drain(); /* push cached pages to LRU */ | 1584 | lru_add_drain(); /* push cached pages to LRU */ |
1578 | /* | 1585 | /* |
1579 | * Because we lock page here and migration is | 1586 | * Because we lock page here, and migration is |
1580 | * blocked by the pte's page reference, we need | 1587 | * blocked by the pte's page reference, and we |
1581 | * only check for file-cache page truncation. | 1588 | * know the page is still mapped, we don't even |
1589 | * need to check for file-cache page truncation. | ||
1582 | */ | 1590 | */ |
1583 | if (page->mapping) | 1591 | mlock_vma_page(page); |
1584 | mlock_vma_page(page); | ||
1585 | unlock_page(page); | 1592 | unlock_page(page); |
1586 | } | 1593 | } |
1587 | } | 1594 | } |
@@ -2085,6 +2092,11 @@ out: | |||
2085 | * ask for a shared writable mapping! | 2092 | * ask for a shared writable mapping! |
2086 | * | 2093 | * |
2087 | * The page does not need to be reserved. | 2094 | * The page does not need to be reserved. |
2095 | * | ||
2096 | * Usually this function is called from f_op->mmap() handler | ||
2097 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. | ||
2098 | * Caller must set VM_MIXEDMAP on vma if it wants to call this | ||
2099 | * function from other places, for example from page-fault handler. | ||
2088 | */ | 2100 | */ |
2089 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 2101 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
2090 | struct page *page) | 2102 | struct page *page) |
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
2093 | return -EFAULT; | 2105 | return -EFAULT; |
2094 | if (!page_count(page)) | 2106 | if (!page_count(page)) |
2095 | return -EINVAL; | 2107 | return -EINVAL; |
2096 | vma->vm_flags |= VM_INSERTPAGE; | 2108 | if (!(vma->vm_flags & VM_MIXEDMAP)) { |
2109 | BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); | ||
2110 | BUG_ON(vma->vm_flags & VM_PFNMAP); | ||
2111 | vma->vm_flags |= VM_MIXEDMAP; | ||
2112 | } | ||
2097 | return insert_page(vma, addr, page, vma->vm_page_prot); | 2113 | return insert_page(vma, addr, page, vma->vm_page_prot); |
2098 | } | 2114 | } |
2099 | EXPORT_SYMBOL(vm_insert_page); | 2115 | EXPORT_SYMBOL(vm_insert_page); |
@@ -2132,7 +2148,7 @@ out: | |||
2132 | * @addr: target user address of this page | 2148 | * @addr: target user address of this page |
2133 | * @pfn: source kernel pfn | 2149 | * @pfn: source kernel pfn |
2134 | * | 2150 | * |
2135 | * Similar to vm_inert_page, this allows drivers to insert individual pages | 2151 | * Similar to vm_insert_page, this allows drivers to insert individual pages |
2136 | * they've allocated into a user vma. Same comments apply. | 2152 | * they've allocated into a user vma. Same comments apply. |
2137 | * | 2153 | * |
2138 | * This function should only be called from a vm_ops->fault handler, and | 2154 | * This function should only be called from a vm_ops->fault handler, and |
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
2162 | 2178 | ||
2163 | if (addr < vma->vm_start || addr >= vma->vm_end) | 2179 | if (addr < vma->vm_start || addr >= vma->vm_end) |
2164 | return -EFAULT; | 2180 | return -EFAULT; |
2165 | if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) | 2181 | if (track_pfn_insert(vma, &pgprot, pfn)) |
2166 | return -EINVAL; | 2182 | return -EINVAL; |
2167 | 2183 | ||
2168 | ret = insert_pfn(vma, addr, pfn, pgprot); | 2184 | ret = insert_pfn(vma, addr, pfn, pgprot); |
2169 | 2185 | ||
2170 | if (ret) | ||
2171 | untrack_pfn_vma(vma, pfn, PAGE_SIZE); | ||
2172 | |||
2173 | return ret; | 2186 | return ret; |
2174 | } | 2187 | } |
2175 | EXPORT_SYMBOL(vm_insert_pfn); | 2188 | EXPORT_SYMBOL(vm_insert_pfn); |
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2290 | * rest of the world about it: | 2303 | * rest of the world about it: |
2291 | * VM_IO tells people not to look at these pages | 2304 | * VM_IO tells people not to look at these pages |
2292 | * (accesses can have side effects). | 2305 | * (accesses can have side effects). |
2293 | * VM_RESERVED is specified all over the place, because | ||
2294 | * in 2.4 it kept swapout's vma scan off this vma; but | ||
2295 | * in 2.6 the LRU scan won't even find its pages, so this | ||
2296 | * flag means no more than count its pages in reserved_vm, | ||
2297 | * and omit it from core dump, even when VM_IO turned off. | ||
2298 | * VM_PFNMAP tells the core MM that the base pages are just | 2306 | * VM_PFNMAP tells the core MM that the base pages are just |
2299 | * raw PFN mappings, and do not have a "struct page" associated | 2307 | * raw PFN mappings, and do not have a "struct page" associated |
2300 | * with them. | 2308 | * with them. |
2309 | * VM_DONTEXPAND | ||
2310 | * Disable vma merging and expanding with mremap(). | ||
2311 | * VM_DONTDUMP | ||
2312 | * Omit vma from core dump, even when VM_IO turned off. | ||
2301 | * | 2313 | * |
2302 | * There's a horrible special case to handle copy-on-write | 2314 | * There's a horrible special case to handle copy-on-write |
2303 | * behaviour that some programs depend on. We mark the "original" | 2315 | * behaviour that some programs depend on. We mark the "original" |
2304 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". | 2316 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". |
2317 | * See vm_normal_page() for details. | ||
2305 | */ | 2318 | */ |
2306 | if (addr == vma->vm_start && end == vma->vm_end) { | 2319 | if (is_cow_mapping(vma->vm_flags)) { |
2320 | if (addr != vma->vm_start || end != vma->vm_end) | ||
2321 | return -EINVAL; | ||
2307 | vma->vm_pgoff = pfn; | 2322 | vma->vm_pgoff = pfn; |
2308 | vma->vm_flags |= VM_PFN_AT_MMAP; | 2323 | } |
2309 | } else if (is_cow_mapping(vma->vm_flags)) | ||
2310 | return -EINVAL; | ||
2311 | |||
2312 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
2313 | 2324 | ||
2314 | err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); | 2325 | err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); |
2315 | if (err) { | 2326 | if (err) |
2316 | /* | ||
2317 | * To indicate that track_pfn related cleanup is not | ||
2318 | * needed from higher level routine calling unmap_vmas | ||
2319 | */ | ||
2320 | vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); | ||
2321 | vma->vm_flags &= ~VM_PFN_AT_MMAP; | ||
2322 | return -EINVAL; | 2327 | return -EINVAL; |
2323 | } | 2328 | |
2329 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; | ||
2324 | 2330 | ||
2325 | BUG_ON(addr >= end); | 2331 | BUG_ON(addr >= end); |
2326 | pfn -= addr >> PAGE_SHIFT; | 2332 | pfn -= addr >> PAGE_SHIFT; |
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2335 | } while (pgd++, addr = next, addr != end); | 2341 | } while (pgd++, addr = next, addr != end); |
2336 | 2342 | ||
2337 | if (err) | 2343 | if (err) |
2338 | untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); | 2344 | untrack_pfn(vma, pfn, PAGE_ALIGN(size)); |
2339 | 2345 | ||
2340 | return err; | 2346 | return err; |
2341 | } | 2347 | } |
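remap_pfn_range() now sets VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP itself and VM_RESERVED is gone, so a typical caller shrinks to the sketch below (mydev_phys is an illustrative physical base address, not from this patch):

#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t mydev_phys;	/* illustrative device memory base */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* No manual vm_flags fiddling and no untrack cleanup on failure. */
	return remap_pfn_range(vma, vma->vm_start, mydev_phys >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}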
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2516 | spinlock_t *ptl, pte_t orig_pte) | 2522 | spinlock_t *ptl, pte_t orig_pte) |
2517 | __releases(ptl) | 2523 | __releases(ptl) |
2518 | { | 2524 | { |
2519 | struct page *old_page, *new_page; | 2525 | struct page *old_page, *new_page = NULL; |
2520 | pte_t entry; | 2526 | pte_t entry; |
2521 | int ret = 0; | 2527 | int ret = 0; |
2522 | int page_mkwrite = 0; | 2528 | int page_mkwrite = 0; |
2523 | struct page *dirty_page = NULL; | 2529 | struct page *dirty_page = NULL; |
2530 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2531 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2532 | bool mmun_called = false; /* For mmu_notifiers */ | ||
2524 | 2533 | ||
2525 | old_page = vm_normal_page(vma, address, orig_pte); | 2534 | old_page = vm_normal_page(vma, address, orig_pte); |
2526 | if (!old_page) { | 2535 | if (!old_page) { |
@@ -2698,6 +2707,11 @@ gotten: | |||
2698 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2707 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2699 | goto oom_free_new; | 2708 | goto oom_free_new; |
2700 | 2709 | ||
2710 | mmun_start = address & PAGE_MASK; | ||
2711 | mmun_end = (address & PAGE_MASK) + PAGE_SIZE; | ||
2712 | mmun_called = true; | ||
2713 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2714 | |||
2701 | /* | 2715 | /* |
2702 | * Re-check the pte - we dropped the lock | 2716 | * Re-check the pte - we dropped the lock |
2703 | */ | 2717 | */ |
@@ -2764,6 +2778,8 @@ gotten: | |||
2764 | page_cache_release(new_page); | 2778 | page_cache_release(new_page); |
2765 | unlock: | 2779 | unlock: |
2766 | pte_unmap_unlock(page_table, ptl); | 2780 | pte_unmap_unlock(page_table, ptl); |
2781 | if (mmun_called) | ||
2782 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2767 | if (old_page) { | 2783 | if (old_page) { |
2768 | /* | 2784 | /* |
2769 | * Don't let another task, with possibly unlocked vma, | 2785 | * Don't let another task, with possibly unlocked vma, |
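The two hunks above bracket the COW page replacement with mmu_notifier range calls. The general pattern, reduced to a standalone sketch (not the do_wp_page() code itself; update_one_pte is a hypothetical helper):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void update_one_pte(struct mm_struct *mm, unsigned long address)
{
	unsigned long start = address & PAGE_MASK;
	unsigned long end = start + PAGE_SIZE;

	/* Tell secondary MMUs before touching the pte ... */
	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... take the pte lock, install the new pte, flush the TLB ... */
	mmu_notifier_invalidate_range_end(mm, start, end);
}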
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, | |||
2801 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); | 2817 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); |
2802 | } | 2818 | } |
2803 | 2819 | ||
2804 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2820 | static inline void unmap_mapping_range_tree(struct rb_root *root, |
2805 | struct zap_details *details) | 2821 | struct zap_details *details) |
2806 | { | 2822 | { |
2807 | struct vm_area_struct *vma; | 2823 | struct vm_area_struct *vma; |
2808 | struct prio_tree_iter iter; | ||
2809 | pgoff_t vba, vea, zba, zea; | 2824 | pgoff_t vba, vea, zba, zea; |
2810 | 2825 | ||
2811 | vma_prio_tree_foreach(vma, &iter, root, | 2826 | vma_interval_tree_foreach(vma, root, |
2812 | details->first_index, details->last_index) { | 2827 | details->first_index, details->last_index) { |
2813 | 2828 | ||
2814 | vba = vma->vm_pgoff; | 2829 | vba = vma->vm_pgoff; |
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, | |||
2839 | * across *all* the pages in each nonlinear VMA, not just the pages | 2854 | * across *all* the pages in each nonlinear VMA, not just the pages |
2840 | * whose virtual address lies outside the file truncation point. | 2855 | * whose virtual address lies outside the file truncation point. |
2841 | */ | 2856 | */ |
2842 | list_for_each_entry(vma, head, shared.vm_set.list) { | 2857 | list_for_each_entry(vma, head, shared.nonlinear) { |
2843 | details->nonlinear_vma = vma; | 2858 | details->nonlinear_vma = vma; |
2844 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | 2859 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2845 | } | 2860 | } |
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2883 | 2898 | ||
2884 | 2899 | ||
2885 | mutex_lock(&mapping->i_mmap_mutex); | 2900 | mutex_lock(&mapping->i_mmap_mutex); |
2886 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 2901 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2887 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2902 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2888 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2903 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2889 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2904 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
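With the prio_tree gone, walking an inode's shared mappings uses the interval tree directly and needs no iterator struct. A minimal sketch of the new iteration, assuming the caller wants every vma overlapping a pgoff range:

#include <linux/fs.h>
#include <linux/mm.h>

static void walk_shared_vmas(struct address_space *mapping,
			     pgoff_t first, pgoff_t last)
{
	struct vm_area_struct *vma;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
		/* vma maps file pages overlapping [first, last] */
	}
	mutex_unlock(&mapping->i_mmap_mutex);
}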
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3ad25f9d1fc..56b758ae57d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | struct zone *zone; | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) | |||
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
118 | __free_pages_bootmem(page, 0); | 119 | __free_pages_bootmem(page, 0); |
120 | |||
121 | zone = page_zone(page); | ||
122 | zone_span_writelock(zone); | ||
123 | zone->present_pages++; | ||
124 | zone_span_writeunlock(zone); | ||
125 | totalram_pages++; | ||
119 | } | 126 | } |
120 | 127 | ||
121 | } | 128 | } |
@@ -126,9 +133,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) | |||
126 | struct mem_section *ms; | 133 | struct mem_section *ms; |
127 | struct page *page, *memmap; | 134 | struct page *page, *memmap; |
128 | 135 | ||
129 | if (!pfn_valid(start_pfn)) | ||
130 | return; | ||
131 | |||
132 | section_nr = pfn_to_section_nr(start_pfn); | 136 | section_nr = pfn_to_section_nr(start_pfn); |
133 | ms = __nr_to_section(section_nr); | 137 | ms = __nr_to_section(section_nr); |
134 | 138 | ||
@@ -187,9 +191,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
187 | end_pfn = pfn + pgdat->node_spanned_pages; | 191 | end_pfn = pfn + pgdat->node_spanned_pages; |
188 | 192 | ||
189 | /* register_section info */ | 193 | /* register_section info */ |
190 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) | 194 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
191 | register_page_bootmem_info_section(pfn); | 195 | /* |
192 | 196 | * Some platforms can assign the same pfn to multiple nodes - on | |
197 | * node0 as well as nodeN. To avoid registering a pfn against | ||
198 | * multiple nodes we check that this pfn does not already | ||
199 | * reside in some other node. | ||
200 | */ | ||
201 | if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) | ||
202 | register_page_bootmem_info_section(pfn); | ||
203 | } | ||
193 | } | 204 | } |
194 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 205 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
195 | 206 | ||
@@ -358,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
358 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 369 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
359 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 370 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
360 | 371 | ||
372 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
373 | |||
361 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 374 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
362 | for (i = 0; i < sections_to_remove; i++) { | 375 | for (i = 0; i < sections_to_remove; i++) { |
363 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 376 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
364 | release_mem_region(pfn << PAGE_SHIFT, | ||
365 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
366 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 377 | ret = __remove_section(zone, __pfn_to_section(pfn)); |
367 | if (ret) | 378 | if (ret) |
368 | break; | 379 | break; |
@@ -752,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) | |||
752 | return 0; | 763 | return 0; |
753 | } | 764 | } |
754 | 765 | ||
755 | static struct page * | ||
756 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) | ||
757 | { | ||
758 | /* This should be improooooved!! */ | ||
759 | return alloc_page(GFP_HIGHUSER_MOVABLE); | ||
760 | } | ||
761 | |||
762 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
763 | static int | 767 | static int |
764 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
@@ -809,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
809 | putback_lru_pages(&source); | 813 | putback_lru_pages(&source); |
810 | goto out; | 814 | goto out; |
811 | } | 815 | } |
812 | /* this function returns # of failed pages */ | 816 | |
813 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, | 817 | /* |
818 | * alloc_migrate_target should be improooooved!! | ||
819 | * migrate_pages returns # of failed pages. | ||
820 | */ | ||
821 | ret = migrate_pages(&source, alloc_migrate_target, 0, | ||
814 | true, MIGRATE_SYNC); | 822 | true, MIGRATE_SYNC); |
815 | if (ret) | 823 | if (ret) |
816 | putback_lru_pages(&source); | 824 | putback_lru_pages(&source); |
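The private hotremove_migrate_alloc() is dropped in favour of the shared alloc_migrate_target() from page isolation. For reference, a migrate_pages() new-page callback has this shape (same signature and body as the helper removed above):

#include <linux/gfp.h>
#include <linux/migrate.h>

static struct page *simple_migrate_alloc(struct page *page,
					 unsigned long private, int **x)
{
	/* Pick any movable destination; real callbacks honour node placement. */
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}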
@@ -866,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
866 | return offlined; | 874 | return offlined; |
867 | } | 875 | } |
868 | 876 | ||
869 | static int __ref offline_pages(unsigned long start_pfn, | 877 | static int __ref __offline_pages(unsigned long start_pfn, |
870 | unsigned long end_pfn, unsigned long timeout) | 878 | unsigned long end_pfn, unsigned long timeout) |
871 | { | 879 | { |
872 | unsigned long pfn, nr_pages, expire; | 880 | unsigned long pfn, nr_pages, expire; |
@@ -966,8 +974,13 @@ repeat: | |||
966 | 974 | ||
967 | init_per_zone_wmark_min(); | 975 | init_per_zone_wmark_min(); |
968 | 976 | ||
969 | if (!populated_zone(zone)) | 977 | if (!populated_zone(zone)) { |
970 | zone_pcp_reset(zone); | 978 | zone_pcp_reset(zone); |
979 | mutex_lock(&zonelists_mutex); | ||
980 | build_all_zonelists(NULL, NULL); | ||
981 | mutex_unlock(&zonelists_mutex); | ||
982 | } else | ||
983 | zone_pcp_update(zone); | ||
971 | 984 | ||
972 | if (!node_present_pages(node)) { | 985 | if (!node_present_pages(node)) { |
973 | node_clear_state(node, N_HIGH_MEMORY); | 986 | node_clear_state(node, N_HIGH_MEMORY); |
@@ -994,15 +1007,55 @@ out: | |||
994 | return ret; | 1007 | return ret; |
995 | } | 1008 | } |
996 | 1009 | ||
1010 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1011 | { | ||
1012 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | ||
1013 | } | ||
1014 | |||
997 | int remove_memory(u64 start, u64 size) | 1015 | int remove_memory(u64 start, u64 size) |
998 | { | 1016 | { |
1017 | struct memory_block *mem = NULL; | ||
1018 | struct mem_section *section; | ||
999 | unsigned long start_pfn, end_pfn; | 1019 | unsigned long start_pfn, end_pfn; |
1020 | unsigned long pfn, section_nr; | ||
1021 | int ret; | ||
1000 | 1022 | ||
1001 | start_pfn = PFN_DOWN(start); | 1023 | start_pfn = PFN_DOWN(start); |
1002 | end_pfn = start_pfn + PFN_DOWN(size); | 1024 | end_pfn = start_pfn + PFN_DOWN(size); |
1003 | return offline_pages(start_pfn, end_pfn, 120 * HZ); | 1025 | |
1026 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
1027 | section_nr = pfn_to_section_nr(pfn); | ||
1028 | if (!present_section_nr(section_nr)) | ||
1029 | continue; | ||
1030 | |||
1031 | section = __nr_to_section(section_nr); | ||
1032 | /* same memblock? */ | ||
1033 | if (mem) | ||
1034 | if ((section_nr >= mem->start_section_nr) && | ||
1035 | (section_nr <= mem->end_section_nr)) | ||
1036 | continue; | ||
1037 | |||
1038 | mem = find_memory_block_hinted(section, mem); | ||
1039 | if (!mem) | ||
1040 | continue; | ||
1041 | |||
1042 | ret = offline_memory_block(mem); | ||
1043 | if (ret) { | ||
1044 | kobject_put(&mem->dev.kobj); | ||
1045 | return ret; | ||
1046 | } | ||
1047 | } | ||
1048 | |||
1049 | if (mem) | ||
1050 | kobject_put(&mem->dev.kobj); | ||
1051 | |||
1052 | return 0; | ||
1004 | } | 1053 | } |
1005 | #else | 1054 | #else |
1055 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1056 | { | ||
1057 | return -EINVAL; | ||
1058 | } | ||
1006 | int remove_memory(u64 start, u64 size) | 1059 | int remove_memory(u64 start, u64 size) |
1007 | { | 1060 | { |
1008 | return -EINVAL; | 1061 | return -EINVAL; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bd92431d4c4..0b78fb9ea65 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
607 | return first; | 607 | return first; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * Apply policy to a single VMA | ||
612 | * This must be called with the mmap_sem held for writing. | ||
613 | */ | ||
614 | static int vma_replace_policy(struct vm_area_struct *vma, | ||
615 | struct mempolicy *pol) | ||
616 | { | ||
617 | int err; | ||
618 | struct mempolicy *old; | ||
619 | struct mempolicy *new; | ||
620 | |||
621 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
622 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
623 | vma->vm_ops, vma->vm_file, | ||
624 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
625 | |||
626 | new = mpol_dup(pol); | ||
627 | if (IS_ERR(new)) | ||
628 | return PTR_ERR(new); | ||
629 | |||
630 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
631 | err = vma->vm_ops->set_policy(vma, new); | ||
632 | if (err) | ||
633 | goto err_out; | ||
634 | } | ||
635 | |||
636 | old = vma->vm_policy; | ||
637 | vma->vm_policy = new; /* protected by mmap_sem */ | ||
638 | mpol_put(old); | ||
639 | |||
640 | return 0; | ||
641 | err_out: | ||
642 | mpol_put(new); | ||
643 | return err; | ||
644 | } | ||
645 | |||
610 | /* Step 2: apply policy to a range and do splits. */ | 646 | /* Step 2: apply policy to a range and do splits. */ |
611 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 647 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
612 | unsigned long end, struct mempolicy *new_pol) | 648 | unsigned long end, struct mempolicy *new_pol) |
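vma_replace_policy() concentrates the duplicate / set_policy / swap steps in one place. A purely hypothetical caller, looping the same way mbind_range() does below:

static int apply_policy_upto(struct vm_area_struct *vma, unsigned long end,
			     struct mempolicy *pol)
{
	int err = 0;

	/* mmap_sem must be held for writing, as vma_replace_policy() requires. */
	for (; vma && vma->vm_start < end && !err; vma = vma->vm_next)
		err = vma_replace_policy(vma, pol);
	return err;
}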
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
655 | if (err) | 691 | if (err) |
656 | goto out; | 692 | goto out; |
657 | } | 693 | } |
658 | 694 | err = vma_replace_policy(vma, new_pol); | |
659 | /* | 695 | if (err) |
660 | * Apply policy to a single VMA. The reference counting of | 696 | goto out; |
661 | * policy for vma_policy linkages has already been handled by | ||
662 | * vma_merge and split_vma as necessary. If this is a shared | ||
663 | * policy then ->set_policy will increment the reference count | ||
664 | * for an sp node. | ||
665 | */ | ||
666 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
667 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
668 | vma->vm_ops, vma->vm_file, | ||
669 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
670 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
671 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
672 | if (err) | ||
673 | goto out; | ||
674 | } | ||
675 | } | 697 | } |
676 | 698 | ||
677 | out: | 699 | out: |
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
924 | nodemask_t nmask; | 946 | nodemask_t nmask; |
925 | LIST_HEAD(pagelist); | 947 | LIST_HEAD(pagelist); |
926 | int err = 0; | 948 | int err = 0; |
927 | struct vm_area_struct *vma; | ||
928 | 949 | ||
929 | nodes_clear(nmask); | 950 | nodes_clear(nmask); |
930 | node_set(source, nmask); | 951 | node_set(source, nmask); |
931 | 952 | ||
932 | vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | 953 | /* |
954 | * This does not "check" the range but isolates all pages that | ||
955 | * need migration. Between passing in the full user address | ||
956 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. | ||
957 | */ | ||
958 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); | ||
959 | check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | ||
933 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 960 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
934 | if (IS_ERR(vma)) | ||
935 | return PTR_ERR(vma); | ||
936 | 961 | ||
937 | if (!list_empty(&pagelist)) { | 962 | if (!list_empty(&pagelist)) { |
938 | err = migrate_pages(&pagelist, new_node_page, dest, | 963 | err = migrate_pages(&pagelist, new_node_page, dest, |
@@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1530 | addr); | 1555 | addr); |
1531 | if (vpol) | 1556 | if (vpol) |
1532 | pol = vpol; | 1557 | pol = vpol; |
1533 | } else if (vma->vm_policy) | 1558 | } else if (vma->vm_policy) { |
1534 | pol = vma->vm_policy; | 1559 | pol = vma->vm_policy; |
1560 | |||
1561 | /* | ||
1562 | * shmem_alloc_page() passes MPOL_F_SHARED policy with | ||
1563 | * a pseudo vma whose vma->vm_ops=NULL. Take a reference | ||
1564 | * count on these policies which will be dropped by | ||
1565 | * mpol_cond_put() later | ||
1566 | */ | ||
1567 | if (mpol_needs_cond_ref(pol)) | ||
1568 | mpol_get(pol); | ||
1569 | } | ||
1535 | } | 1570 | } |
1536 | if (!pol) | 1571 | if (!pol) |
1537 | pol = &default_policy; | 1572 | pol = &default_policy; |
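The extra mpol_get() above keeps the usual caller pattern balanced: whatever get_vma_policy() returns, a conditional put on the way out is now always safe, including for shmem's pseudo vmas. A sketch with the allocation itself left as a placeholder:

#include <linux/gfp.h>
#include <linux/mempolicy.h>
#include <linux/sched.h>

static struct page *alloc_for_vma(struct vm_area_struct *vma,
				  unsigned long addr, gfp_t gfp)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct page *page;

	page = alloc_page(gfp);		/* placeholder for policy-aware allocation */
	mpol_cond_put(pol);		/* drops the ref only if MPOL_F_SHARED */
	return page;
}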
@@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
2061 | */ | 2096 | */ |
2062 | 2097 | ||
2063 | /* lookup first element intersecting start-end */ | 2098 | /* lookup first element intersecting start-end */ |
2064 | /* Caller holds sp->lock */ | 2099 | /* Caller holds sp->mutex */ |
2065 | static struct sp_node * | 2100 | static struct sp_node * |
2066 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | 2101 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) |
2067 | { | 2102 | { |
@@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | |||
2125 | 2160 | ||
2126 | if (!sp->root.rb_node) | 2161 | if (!sp->root.rb_node) |
2127 | return NULL; | 2162 | return NULL; |
2128 | spin_lock(&sp->lock); | 2163 | mutex_lock(&sp->mutex); |
2129 | sn = sp_lookup(sp, idx, idx+1); | 2164 | sn = sp_lookup(sp, idx, idx+1); |
2130 | if (sn) { | 2165 | if (sn) { |
2131 | mpol_get(sn->policy); | 2166 | mpol_get(sn->policy); |
2132 | pol = sn->policy; | 2167 | pol = sn->policy; |
2133 | } | 2168 | } |
2134 | spin_unlock(&sp->lock); | 2169 | mutex_unlock(&sp->mutex); |
2135 | return pol; | 2170 | return pol; |
2136 | } | 2171 | } |
2137 | 2172 | ||
2173 | static void sp_free(struct sp_node *n) | ||
2174 | { | ||
2175 | mpol_put(n->policy); | ||
2176 | kmem_cache_free(sn_cache, n); | ||
2177 | } | ||
2178 | |||
2138 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2179 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2139 | { | 2180 | { |
2140 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2181 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
2141 | rb_erase(&n->nd, &sp->root); | 2182 | rb_erase(&n->nd, &sp->root); |
2142 | mpol_put(n->policy); | 2183 | sp_free(n); |
2143 | kmem_cache_free(sn_cache, n); | ||
2144 | } | 2184 | } |
2145 | 2185 | ||
2146 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | 2186 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
2147 | struct mempolicy *pol) | 2187 | struct mempolicy *pol) |
2148 | { | 2188 | { |
2149 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 2189 | struct sp_node *n; |
2190 | struct mempolicy *newpol; | ||
2150 | 2191 | ||
2192 | n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | ||
2151 | if (!n) | 2193 | if (!n) |
2152 | return NULL; | 2194 | return NULL; |
2195 | |||
2196 | newpol = mpol_dup(pol); | ||
2197 | if (IS_ERR(newpol)) { | ||
2198 | kmem_cache_free(sn_cache, n); | ||
2199 | return NULL; | ||
2200 | } | ||
2201 | newpol->flags |= MPOL_F_SHARED; | ||
2202 | |||
2153 | n->start = start; | 2203 | n->start = start; |
2154 | n->end = end; | 2204 | n->end = end; |
2155 | mpol_get(pol); | 2205 | n->policy = newpol; |
2156 | pol->flags |= MPOL_F_SHARED; /* for unref */ | 2206 | |
2157 | n->policy = pol; | ||
2158 | return n; | 2207 | return n; |
2159 | } | 2208 | } |
2160 | 2209 | ||
@@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | |||
2162 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | 2211 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, |
2163 | unsigned long end, struct sp_node *new) | 2212 | unsigned long end, struct sp_node *new) |
2164 | { | 2213 | { |
2165 | struct sp_node *n, *new2 = NULL; | 2214 | struct sp_node *n; |
2215 | int ret = 0; | ||
2166 | 2216 | ||
2167 | restart: | 2217 | mutex_lock(&sp->mutex); |
2168 | spin_lock(&sp->lock); | ||
2169 | n = sp_lookup(sp, start, end); | 2218 | n = sp_lookup(sp, start, end); |
2170 | /* Take care of old policies in the same range. */ | 2219 | /* Take care of old policies in the same range. */ |
2171 | while (n && n->start < end) { | 2220 | while (n && n->start < end) { |
@@ -2178,16 +2227,14 @@ restart: | |||
2178 | } else { | 2227 | } else { |
2179 | /* Old policy spanning whole new range. */ | 2228 | /* Old policy spanning whole new range. */ |
2180 | if (n->end > end) { | 2229 | if (n->end > end) { |
2230 | struct sp_node *new2; | ||
2231 | new2 = sp_alloc(end, n->end, n->policy); | ||
2181 | if (!new2) { | 2232 | if (!new2) { |
2182 | spin_unlock(&sp->lock); | 2233 | ret = -ENOMEM; |
2183 | new2 = sp_alloc(end, n->end, n->policy); | 2234 | goto out; |
2184 | if (!new2) | ||
2185 | return -ENOMEM; | ||
2186 | goto restart; | ||
2187 | } | 2235 | } |
2188 | n->end = start; | 2236 | n->end = start; |
2189 | sp_insert(sp, new2); | 2237 | sp_insert(sp, new2); |
2190 | new2 = NULL; | ||
2191 | break; | 2238 | break; |
2192 | } else | 2239 | } else |
2193 | n->end = start; | 2240 | n->end = start; |
@@ -2198,12 +2245,9 @@ restart: | |||
2198 | } | 2245 | } |
2199 | if (new) | 2246 | if (new) |
2200 | sp_insert(sp, new); | 2247 | sp_insert(sp, new); |
2201 | spin_unlock(&sp->lock); | 2248 | out: |
2202 | if (new2) { | 2249 | mutex_unlock(&sp->mutex); |
2203 | mpol_put(new2->policy); | 2250 | return ret; |
2204 | kmem_cache_free(sn_cache, new2); | ||
2205 | } | ||
2206 | return 0; | ||
2207 | } | 2251 | } |
2208 | 2252 | ||
2209 | /** | 2253 | /** |
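The spin_lock to mutex switch is what lets the split case allocate in place: sp_alloc() uses GFP_KERNEL and may sleep, which is fine under sp->mutex but not under the old spinlock, hence the old unlock / allocate / restart loop. Reduced to a sketch (split_overlapping_node is a hypothetical helper mirroring the hunk above):

static int split_overlapping_node(struct shared_policy *sp, struct sp_node *n,
				  unsigned long start, unsigned long end)
{
	/* Caller holds sp->mutex; a sleeping allocation is allowed here. */
	struct sp_node *new2 = sp_alloc(end, n->end, n->policy);

	if (!new2)
		return -ENOMEM;
	n->end = start;
	sp_insert(sp, new2);
	return 0;
}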
@@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
2221 | int ret; | 2265 | int ret; |
2222 | 2266 | ||
2223 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 2267 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
2224 | spin_lock_init(&sp->lock); | 2268 | mutex_init(&sp->mutex); |
2225 | 2269 | ||
2226 | if (mpol) { | 2270 | if (mpol) { |
2227 | struct vm_area_struct pvma; | 2271 | struct vm_area_struct pvma; |
@@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
2275 | } | 2319 | } |
2276 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); | 2320 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); |
2277 | if (err && new) | 2321 | if (err && new) |
2278 | kmem_cache_free(sn_cache, new); | 2322 | sp_free(new); |
2279 | return err; | 2323 | return err; |
2280 | } | 2324 | } |
2281 | 2325 | ||
@@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2287 | 2331 | ||
2288 | if (!p->root.rb_node) | 2332 | if (!p->root.rb_node) |
2289 | return; | 2333 | return; |
2290 | spin_lock(&p->lock); | 2334 | mutex_lock(&p->mutex); |
2291 | next = rb_first(&p->root); | 2335 | next = rb_first(&p->root); |
2292 | while (next) { | 2336 | while (next) { |
2293 | n = rb_entry(next, struct sp_node, nd); | 2337 | n = rb_entry(next, struct sp_node, nd); |
2294 | next = rb_next(&n->nd); | 2338 | next = rb_next(&n->nd); |
2295 | rb_erase(&n->nd, &p->root); | 2339 | sp_delete(p, n); |
2296 | mpol_put(n->policy); | ||
2297 | kmem_cache_free(sn_cache, n); | ||
2298 | } | 2340 | } |
2299 | spin_unlock(&p->lock); | 2341 | mutex_unlock(&p->mutex); |
2300 | } | 2342 | } |
2301 | 2343 | ||
2302 | /* assumes fs == KERNEL_DS */ | 2344 | /* assumes fs == KERNEL_DS */ |
@@ -2562,7 +2604,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) | |||
2562 | break; | 2604 | break; |
2563 | 2605 | ||
2564 | default: | 2606 | default: |
2565 | BUG(); | 2607 | return -EINVAL; |
2566 | } | 2608 | } |
2567 | 2609 | ||
2568 | l = strlen(policy_modes[mode]); | 2610 | l = strlen(policy_modes[mode]); |
diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e..f0b9ce572fc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock); | |||
51 | /* | 51 | /* |
52 | * LRU accounting for clear_page_mlock() | 52 | * LRU accounting for clear_page_mlock() |
53 | */ | 53 | */ |
54 | void __clear_page_mlock(struct page *page) | 54 | void clear_page_mlock(struct page *page) |
55 | { | 55 | { |
56 | VM_BUG_ON(!PageLocked(page)); | 56 | if (!TestClearPageMlocked(page)) |
57 | |||
58 | if (!page->mapping) { /* truncated ? */ | ||
59 | return; | 57 | return; |
60 | } | ||
61 | 58 | ||
62 | dec_zone_page_state(page, NR_MLOCK); | 59 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
60 | -hpage_nr_pages(page)); | ||
63 | count_vm_event(UNEVICTABLE_PGCLEARED); | 61 | count_vm_event(UNEVICTABLE_PGCLEARED); |
64 | if (!isolate_lru_page(page)) { | 62 | if (!isolate_lru_page(page)) { |
65 | putback_lru_page(page); | 63 | putback_lru_page(page); |
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page) | |||
81 | BUG_ON(!PageLocked(page)); | 79 | BUG_ON(!PageLocked(page)); |
82 | 80 | ||
83 | if (!TestSetPageMlocked(page)) { | 81 | if (!TestSetPageMlocked(page)) { |
84 | inc_zone_page_state(page, NR_MLOCK); | 82 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
83 | hpage_nr_pages(page)); | ||
85 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 84 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
86 | if (!isolate_lru_page(page)) | 85 | if (!isolate_lru_page(page)) |
87 | putback_lru_page(page); | 86 | putback_lru_page(page); |
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page) | |||
108 | BUG_ON(!PageLocked(page)); | 107 | BUG_ON(!PageLocked(page)); |
109 | 108 | ||
110 | if (TestClearPageMlocked(page)) { | 109 | if (TestClearPageMlocked(page)) { |
111 | dec_zone_page_state(page, NR_MLOCK); | 110 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
111 | -hpage_nr_pages(page)); | ||
112 | if (!isolate_lru_page(page)) { | 112 | if (!isolate_lru_page(page)) { |
113 | int ret = SWAP_AGAIN; | 113 | int ret = SWAP_AGAIN; |
114 | 114 | ||
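All three mlock accounting sites above now scale NR_MLOCK by the compound page size, so an mlocked transparent hugepage counts as HPAGE_PMD_NR base pages rather than one. The adjustment in isolation, as a sketch (account_mlock is an illustrative helper):

#include <linux/huge_mm.h>
#include <linux/mm.h>
#include <linux/vmstat.h>

static void account_mlock(struct page *page, bool becomes_mlocked)
{
	int nr = hpage_nr_pages(page);	/* 1, or HPAGE_PMD_NR for a THP head */

	mod_zone_page_state(page_zone(page), NR_MLOCK,
			    becomes_mlocked ? nr : -nr);
}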
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
228 | goto no_mlock; | 228 | goto no_mlock; |
229 | 229 | ||
230 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 230 | if (!((vma->vm_flags & VM_DONTEXPAND) || |
231 | is_vm_hugetlb_page(vma) || | 231 | is_vm_hugetlb_page(vma) || |
232 | vma == get_gate_vma(current->mm))) { | 232 | vma == get_gate_vma(current->mm))) { |
233 | 233 | ||
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); | 290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); |
291 | if (page && !IS_ERR(page)) { | 291 | if (page && !IS_ERR(page)) { |
292 | lock_page(page); | 292 | lock_page(page); |
293 | /* | 293 | munlock_vma_page(page); |
294 | * Like in __mlock_vma_pages_range(), | ||
295 | * because we lock page here and migration is | ||
296 | * blocked by the elevated reference, we need | ||
297 | * only check for file-cache page truncation. | ||
298 | */ | ||
299 | if (page->mapping) | ||
300 | munlock_vma_page(page); | ||
301 | unlock_page(page); | 294 | unlock_page(page); |
302 | put_page(page); | 295 | put_page(page); |
303 | } | 296 | } |
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, | |||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, | |||
51 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 51 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
52 | unsigned long start, unsigned long end); | 52 | unsigned long start, unsigned long end); |
53 | 53 | ||
54 | /* | ||
55 | * WARNING: the debugging will use recursive algorithms so never enable this | ||
56 | * unless you know what you are doing. | ||
57 | */ | ||
58 | #undef DEBUG_MM_RB | ||
59 | |||
60 | /* description of effects of mapping type and prot in current implementation. | 54 | /* description of effects of mapping type and prot in current implementation. |
61 | * this is due to the limited x86 page protection hardware. The expected | 55 | * this is due to the limited x86 page protection hardware. The expected |
62 | * behavior is in parens: | 56 | * behavior is in parens: |
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
199 | 193 | ||
200 | flush_dcache_mmap_lock(mapping); | 194 | flush_dcache_mmap_lock(mapping); |
201 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 195 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
202 | list_del_init(&vma->shared.vm_set.list); | 196 | list_del_init(&vma->shared.nonlinear); |
203 | else | 197 | else |
204 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 198 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
205 | flush_dcache_mmap_unlock(mapping); | 199 | flush_dcache_mmap_unlock(mapping); |
206 | } | 200 | } |
207 | 201 | ||
208 | /* | 202 | /* |
209 | * Unlink a file-based vm structure from its prio_tree, to hide | 203 | * Unlink a file-based vm structure from its interval tree, to hide |
210 | * vma from rmap and vmtruncate before freeing its page tables. | 204 | * vma from rmap and vmtruncate before freeing its page tables. |
211 | */ | 205 | */ |
212 | void unlink_file_vma(struct vm_area_struct *vma) | 206 | void unlink_file_vma(struct vm_area_struct *vma) |
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
231 | might_sleep(); | 225 | might_sleep(); |
232 | if (vma->vm_ops && vma->vm_ops->close) | 226 | if (vma->vm_ops && vma->vm_ops->close) |
233 | vma->vm_ops->close(vma); | 227 | vma->vm_ops->close(vma); |
234 | if (vma->vm_file) { | 228 | if (vma->vm_file) |
235 | fput(vma->vm_file); | 229 | fput(vma->vm_file); |
236 | if (vma->vm_flags & VM_EXECUTABLE) | ||
237 | removed_exe_file_vma(vma->vm_mm); | ||
238 | } | ||
239 | mpol_put(vma_policy(vma)); | 230 | mpol_put(vma_policy(vma)); |
240 | kmem_cache_free(vm_area_cachep, vma); | 231 | kmem_cache_free(vm_area_cachep, vma); |
241 | return next; | 232 | return next; |
@@ -306,7 +297,7 @@ out: | |||
306 | return retval; | 297 | return retval; |
307 | } | 298 | } |
308 | 299 | ||
309 | #ifdef DEBUG_MM_RB | 300 | #ifdef CONFIG_DEBUG_VM_RB |
310 | static int browse_rb(struct rb_root *root) | 301 | static int browse_rb(struct rb_root *root) |
311 | { | 302 | { |
312 | int i = 0, j; | 303 | int i = 0, j; |
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm) | |||
340 | { | 331 | { |
341 | int bug = 0; | 332 | int bug = 0; |
342 | int i = 0; | 333 | int i = 0; |
343 | struct vm_area_struct *tmp = mm->mmap; | 334 | struct vm_area_struct *vma = mm->mmap; |
344 | while (tmp) { | 335 | while (vma) { |
345 | tmp = tmp->vm_next; | 336 | struct anon_vma_chain *avc; |
337 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
338 | anon_vma_interval_tree_verify(avc); | ||
339 | vma = vma->vm_next; | ||
346 | i++; | 340 | i++; |
347 | } | 341 | } |
348 | if (i != mm->map_count) | 342 | if (i != mm->map_count) |
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm) | |||
356 | #define validate_mm(mm) do { } while (0) | 350 | #define validate_mm(mm) do { } while (0) |
357 | #endif | 351 | #endif |
358 | 352 | ||
359 | static struct vm_area_struct * | 353 | /* |
360 | find_vma_prepare(struct mm_struct *mm, unsigned long addr, | 354 | * vma has some anon_vma assigned, and is already inserted on that |
361 | struct vm_area_struct **pprev, struct rb_node ***rb_link, | 355 | * anon_vma's interval trees. |
362 | struct rb_node ** rb_parent) | 356 | * |
357 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the | ||
358 | * vma must be removed from the anon_vma's interval trees using | ||
359 | * anon_vma_interval_tree_pre_update_vma(). | ||
360 | * | ||
361 | * After the update, the vma will be reinserted using | ||
362 | * anon_vma_interval_tree_post_update_vma(). | ||
363 | * | ||
364 | * The entire update must be protected by exclusive mmap_sem and by | ||
365 | * the root anon_vma's mutex. | ||
366 | */ | ||
367 | static inline void | ||
368 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) | ||
369 | { | ||
370 | struct anon_vma_chain *avc; | ||
371 | |||
372 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
373 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); | ||
374 | } | ||
375 | |||
376 | static inline void | ||
377 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) | ||
363 | { | 378 | { |
364 | struct vm_area_struct * vma; | 379 | struct anon_vma_chain *avc; |
365 | struct rb_node ** __rb_link, * __rb_parent, * rb_prev; | 380 | |
381 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
382 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); | ||
383 | } | ||
384 | |||
385 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, | ||
386 | unsigned long end, struct vm_area_struct **pprev, | ||
387 | struct rb_node ***rb_link, struct rb_node **rb_parent) | ||
388 | { | ||
389 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; | ||
366 | 390 | ||
367 | __rb_link = &mm->mm_rb.rb_node; | 391 | __rb_link = &mm->mm_rb.rb_node; |
368 | rb_prev = __rb_parent = NULL; | 392 | rb_prev = __rb_parent = NULL; |
369 | vma = NULL; | ||
370 | 393 | ||
371 | while (*__rb_link) { | 394 | while (*__rb_link) { |
372 | struct vm_area_struct *vma_tmp; | 395 | struct vm_area_struct *vma_tmp; |
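Usage sketch for the two helpers introduced above: any change to vm_start, vm_end or vm_pgoff on a vma already linked into anon_vma interval trees is bracketed by a remove and a re-insert, exactly as the vma_adjust() and stack-growth hunks below do. Hypothetical helper; assumes vma->anon_vma is set and mmap_sem is held for writing:

static void grow_vma_end(struct vm_area_struct *vma, unsigned long new_end)
{
	anon_vma_lock(vma->anon_vma);
	anon_vma_interval_tree_pre_update_vma(vma);
	vma->vm_end = new_end;
	anon_vma_interval_tree_post_update_vma(vma);
	anon_vma_unlock(vma->anon_vma);
}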
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
375 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | 398 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); |
376 | 399 | ||
377 | if (vma_tmp->vm_end > addr) { | 400 | if (vma_tmp->vm_end > addr) { |
378 | vma = vma_tmp; | 401 | /* Fail if an existing vma overlaps the area */ |
379 | if (vma_tmp->vm_start <= addr) | 402 | if (vma_tmp->vm_start < end) |
380 | break; | 403 | return -ENOMEM; |
381 | __rb_link = &__rb_parent->rb_left; | 404 | __rb_link = &__rb_parent->rb_left; |
382 | } else { | 405 | } else { |
383 | rb_prev = __rb_parent; | 406 | rb_prev = __rb_parent; |
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
390 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | 413 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
391 | *rb_link = __rb_link; | 414 | *rb_link = __rb_link; |
392 | *rb_parent = __rb_parent; | 415 | *rb_parent = __rb_parent; |
393 | return vma; | 416 | return 0; |
394 | } | 417 | } |
395 | 418 | ||
396 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 419 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
417 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 440 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
418 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 441 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
419 | else | 442 | else |
420 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 443 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
421 | flush_dcache_mmap_unlock(mapping); | 444 | flush_dcache_mmap_unlock(mapping); |
422 | } | 445 | } |
423 | } | 446 | } |
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
455 | 478 | ||
456 | /* | 479 | /* |
457 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the | 480 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
458 | * mm's list and rbtree. It has already been inserted into the prio_tree. | 481 | * mm's list and rbtree. It has already been inserted into the interval tree. |
459 | */ | 482 | */ |
460 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 483 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
461 | { | 484 | { |
462 | struct vm_area_struct *__vma, *prev; | 485 | struct vm_area_struct *prev; |
463 | struct rb_node **rb_link, *rb_parent; | 486 | struct rb_node **rb_link, *rb_parent; |
464 | 487 | ||
465 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 488 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
466 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 489 | &prev, &rb_link, &rb_parent)) |
490 | BUG(); | ||
467 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 491 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
468 | mm->map_count++; | 492 | mm->map_count++; |
469 | } | 493 | } |
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
496 | struct vm_area_struct *next = vma->vm_next; | 520 | struct vm_area_struct *next = vma->vm_next; |
497 | struct vm_area_struct *importer = NULL; | 521 | struct vm_area_struct *importer = NULL; |
498 | struct address_space *mapping = NULL; | 522 | struct address_space *mapping = NULL; |
499 | struct prio_tree_root *root = NULL; | 523 | struct rb_root *root = NULL; |
500 | struct anon_vma *anon_vma = NULL; | 524 | struct anon_vma *anon_vma = NULL; |
501 | struct file *file = vma->vm_file; | 525 | struct file *file = vma->vm_file; |
502 | long adjust_next = 0; | 526 | long adjust_next = 0; |
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
559 | mutex_lock(&mapping->i_mmap_mutex); | 583 | mutex_lock(&mapping->i_mmap_mutex); |
560 | if (insert) { | 584 | if (insert) { |
561 | /* | 585 | /* |
562 | * Put into prio_tree now, so instantiated pages | 586 | * Put into interval tree now, so instantiated pages |
563 | * are visible to arm/parisc __flush_dcache_page | 587 | * are visible to arm/parisc __flush_dcache_page |
564 | * throughout; but we cannot insert into address | 588 | * throughout; but we cannot insert into address |
565 | * space until vma start or end is updated. | 589 | * space until vma start or end is updated. |
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end); | |||
570 | 594 | ||
571 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 595 | vma_adjust_trans_huge(vma, start, end, adjust_next); |
572 | 596 | ||
573 | /* | 597 | anon_vma = vma->anon_vma; |
574 | * When changing only vma->vm_end, we don't really need anon_vma | 598 | if (!anon_vma && adjust_next) |
575 | * lock. This is a fairly rare case by itself, but the anon_vma | 599 | anon_vma = next->anon_vma; |
576 | * lock may be shared between many sibling processes. Skipping | 600 | if (anon_vma) { |
577 | * the lock for brk adjustments makes a difference sometimes. | 601 | VM_BUG_ON(adjust_next && next->anon_vma && |
578 | */ | 602 | anon_vma != next->anon_vma); |
579 | if (vma->anon_vma && (importer || start != vma->vm_start)) { | ||
580 | anon_vma = vma->anon_vma; | ||
581 | anon_vma_lock(anon_vma); | 603 | anon_vma_lock(anon_vma); |
604 | anon_vma_interval_tree_pre_update_vma(vma); | ||
605 | if (adjust_next) | ||
606 | anon_vma_interval_tree_pre_update_vma(next); | ||
582 | } | 607 | } |
583 | 608 | ||
584 | if (root) { | 609 | if (root) { |
585 | flush_dcache_mmap_lock(mapping); | 610 | flush_dcache_mmap_lock(mapping); |
586 | vma_prio_tree_remove(vma, root); | 611 | vma_interval_tree_remove(vma, root); |
587 | if (adjust_next) | 612 | if (adjust_next) |
588 | vma_prio_tree_remove(next, root); | 613 | vma_interval_tree_remove(next, root); |
589 | } | 614 | } |
590 | 615 | ||
591 | vma->vm_start = start; | 616 | vma->vm_start = start; |
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
598 | 623 | ||
599 | if (root) { | 624 | if (root) { |
600 | if (adjust_next) | 625 | if (adjust_next) |
601 | vma_prio_tree_insert(next, root); | 626 | vma_interval_tree_insert(next, root); |
602 | vma_prio_tree_insert(vma, root); | 627 | vma_interval_tree_insert(vma, root); |
603 | flush_dcache_mmap_unlock(mapping); | 628 | flush_dcache_mmap_unlock(mapping); |
604 | } | 629 | } |
605 | 630 | ||
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end); | |||
620 | __insert_vm_struct(mm, insert); | 645 | __insert_vm_struct(mm, insert); |
621 | } | 646 | } |
622 | 647 | ||
623 | if (anon_vma) | 648 | if (anon_vma) { |
649 | anon_vma_interval_tree_post_update_vma(vma); | ||
650 | if (adjust_next) | ||
651 | anon_vma_interval_tree_post_update_vma(next); | ||
624 | anon_vma_unlock(anon_vma); | 652 | anon_vma_unlock(anon_vma); |
653 | } | ||
625 | if (mapping) | 654 | if (mapping) |
626 | mutex_unlock(&mapping->i_mmap_mutex); | 655 | mutex_unlock(&mapping->i_mmap_mutex); |
627 | 656 | ||
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
636 | if (file) { | 665 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | 666 | uprobe_munmap(next, next->vm_start, next->vm_end); |
638 | fput(file); | 667 | fput(file); |
639 | if (next->vm_flags & VM_EXECUTABLE) | ||
640 | removed_exe_file_vma(mm); | ||
641 | } | 668 | } |
642 | if (next->anon_vma) | 669 | if (next->anon_vma) |
643 | anon_vma_merge(vma, next); | 670 | anon_vma_merge(vma, next); |
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
669 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 696 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
670 | struct file *file, unsigned long vm_flags) | 697 | struct file *file, unsigned long vm_flags) |
671 | { | 698 | { |
672 | /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ | 699 | if (vma->vm_flags ^ vm_flags) |
673 | if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) | ||
674 | return 0; | 700 | return 0; |
675 | if (vma->vm_file != file) | 701 | if (vma->vm_file != file) |
676 | return 0; | 702 | return 0; |
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
951 | mm->exec_vm += pages; | 977 | mm->exec_vm += pages; |
952 | } else if (flags & stack_flags) | 978 | } else if (flags & stack_flags) |
953 | mm->stack_vm += pages; | 979 | mm->stack_vm += pages; |
954 | if (flags & (VM_RESERVED|VM_IO)) | ||
955 | mm->reserved_vm += pages; | ||
956 | } | 980 | } |
957 | #endif /* CONFIG_PROC_FS */ | 981 | #endif /* CONFIG_PROC_FS */ |
958 | 982 | ||
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1190 | return 0; | 1214 | return 0; |
1191 | 1215 | ||
1192 | /* Specialty mapping? */ | 1216 | /* Specialty mapping? */ |
1193 | if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) | 1217 | if (vm_flags & VM_PFNMAP) |
1194 | return 0; | 1218 | return 0; |
1195 | 1219 | ||
1196 | /* Can the mapping track the dirty pages? */ | 1220 | /* Can the mapping track the dirty pages? */ |
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1229 | /* Clear old maps */ | 1253 | /* Clear old maps */ |
1230 | error = -ENOMEM; | 1254 | error = -ENOMEM; |
1231 | munmap_back: | 1255 | munmap_back: |
1232 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 1256 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
1233 | if (vma && vma->vm_start < addr + len) { | ||
1234 | if (do_munmap(mm, addr, len)) | 1257 | if (do_munmap(mm, addr, len)) |
1235 | return -ENOMEM; | 1258 | return -ENOMEM; |
1236 | goto munmap_back; | 1259 | goto munmap_back; |
@@ -1301,13 +1324,10 @@ munmap_back: | |||
1301 | goto free_vma; | 1324 | goto free_vma; |
1302 | correct_wcount = 1; | 1325 | correct_wcount = 1; |
1303 | } | 1326 | } |
1304 | vma->vm_file = file; | 1327 | vma->vm_file = get_file(file); |
1305 | get_file(file); | ||
1306 | error = file->f_op->mmap(file, vma); | 1328 | error = file->f_op->mmap(file, vma); |
1307 | if (error) | 1329 | if (error) |
1308 | goto unmap_and_free_vma; | 1330 | goto unmap_and_free_vma; |
1309 | if (vm_flags & VM_EXECUTABLE) | ||
1310 | added_exe_file_vma(mm); | ||
1311 | 1331 | ||
1312 | /* Can addr have changed?? | 1332 | /* Can addr have changed?? |
1313 | * | 1333 | * |
@@ -1356,9 +1376,8 @@ out: | |||
1356 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1376 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
1357 | make_pages_present(addr, addr + len); | 1377 | make_pages_present(addr, addr + len); |
1358 | 1378 | ||
1359 | if (file && uprobe_mmap(vma)) | 1379 | if (file) |
1360 | /* matching probes but cannot insert */ | 1380 | uprobe_mmap(vma); |
1361 | goto unmap_and_free_vma; | ||
1362 | 1381 | ||
1363 | return addr; | 1382 | return addr; |
1364 | 1383 | ||
@@ -1759,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1759 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 1778 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1760 | error = acct_stack_growth(vma, size, grow); | 1779 | error = acct_stack_growth(vma, size, grow); |
1761 | if (!error) { | 1780 | if (!error) { |
1781 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1762 | vma->vm_end = address; | 1782 | vma->vm_end = address; |
1783 | anon_vma_interval_tree_post_update_vma(vma); | ||
1763 | perf_event_mmap(vma); | 1784 | perf_event_mmap(vma); |
1764 | } | 1785 | } |
1765 | } | 1786 | } |
1766 | } | 1787 | } |
1767 | vma_unlock_anon_vma(vma); | 1788 | vma_unlock_anon_vma(vma); |
1768 | khugepaged_enter_vma_merge(vma); | 1789 | khugepaged_enter_vma_merge(vma); |
1790 | validate_mm(vma->vm_mm); | ||
1769 | return error; | 1791 | return error; |
1770 | } | 1792 | } |
1771 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1793 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -1809,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1809 | if (grow <= vma->vm_pgoff) { | 1831 | if (grow <= vma->vm_pgoff) { |
1810 | error = acct_stack_growth(vma, size, grow); | 1832 | error = acct_stack_growth(vma, size, grow); |
1811 | if (!error) { | 1833 | if (!error) { |
1834 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1812 | vma->vm_start = address; | 1835 | vma->vm_start = address; |
1813 | vma->vm_pgoff -= grow; | 1836 | vma->vm_pgoff -= grow; |
1837 | anon_vma_interval_tree_post_update_vma(vma); | ||
1814 | perf_event_mmap(vma); | 1838 | perf_event_mmap(vma); |
1815 | } | 1839 | } |
1816 | } | 1840 | } |
1817 | } | 1841 | } |
1818 | vma_unlock_anon_vma(vma); | 1842 | vma_unlock_anon_vma(vma); |
1819 | khugepaged_enter_vma_merge(vma); | 1843 | khugepaged_enter_vma_merge(vma); |
1844 | validate_mm(vma->vm_mm); | ||
1820 | return error; | 1845 | return error; |
1821 | } | 1846 | } |
1822 | 1847 | ||
@@ -1990,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1990 | if (anon_vma_clone(new, vma)) | 2015 | if (anon_vma_clone(new, vma)) |
1991 | goto out_free_mpol; | 2016 | goto out_free_mpol; |
1992 | 2017 | ||
1993 | if (new->vm_file) { | 2018 | if (new->vm_file) |
1994 | get_file(new->vm_file); | 2019 | get_file(new->vm_file); |
1995 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1996 | added_exe_file_vma(mm); | ||
1997 | } | ||
1998 | 2020 | ||
1999 | if (new->vm_ops && new->vm_ops->open) | 2021 | if (new->vm_ops && new->vm_ops->open) |
2000 | new->vm_ops->open(new); | 2022 | new->vm_ops->open(new); |
@@ -2012,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2012 | /* Clean everything up if vma_adjust failed. */ | 2034 | /* Clean everything up if vma_adjust failed. */ |
2013 | if (new->vm_ops && new->vm_ops->close) | 2035 | if (new->vm_ops && new->vm_ops->close) |
2014 | new->vm_ops->close(new); | 2036 | new->vm_ops->close(new); |
2015 | if (new->vm_file) { | 2037 | if (new->vm_file) |
2016 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2017 | removed_exe_file_vma(mm); | ||
2018 | fput(new->vm_file); | 2038 | fput(new->vm_file); |
2019 | } | ||
2020 | unlink_anon_vmas(new); | 2039 | unlink_anon_vmas(new); |
2021 | out_free_mpol: | 2040 | out_free_mpol: |
2022 | mpol_put(pol); | 2041 | mpol_put(pol); |
@@ -2201,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2201 | * Clear old maps. this also does some error checking for us | 2220 | * Clear old maps. this also does some error checking for us |
2202 | */ | 2221 | */ |
2203 | munmap_back: | 2222 | munmap_back: |
2204 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2223 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
2205 | if (vma && vma->vm_start < addr + len) { | ||
2206 | if (do_munmap(mm, addr, len)) | 2224 | if (do_munmap(mm, addr, len)) |
2207 | return -ENOMEM; | 2225 | return -ENOMEM; |
2208 | goto munmap_back; | 2226 | goto munmap_back; |
@@ -2309,17 +2327,17 @@ void exit_mmap(struct mm_struct *mm) | |||
2309 | } | 2327 | } |
2310 | vm_unacct_memory(nr_accounted); | 2328 | vm_unacct_memory(nr_accounted); |
2311 | 2329 | ||
2312 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 2330 | WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); |
2313 | } | 2331 | } |
2314 | 2332 | ||
2315 | /* Insert vm structure into process list sorted by address | 2333 | /* Insert vm structure into process list sorted by address |
2316 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2334 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2317 | * then i_mmap_mutex is taken here. | 2335 | * then i_mmap_mutex is taken here. |
2318 | */ | 2336 | */ |
2319 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | 2337 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2320 | { | 2338 | { |
2321 | struct vm_area_struct * __vma, * prev; | 2339 | struct vm_area_struct *prev; |
2322 | struct rb_node ** rb_link, * rb_parent; | 2340 | struct rb_node **rb_link, *rb_parent; |
2323 | 2341 | ||
2324 | /* | 2342 | /* |
2325 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2343 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
@@ -2337,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2337 | BUG_ON(vma->anon_vma); | 2355 | BUG_ON(vma->anon_vma); |
2338 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2356 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
2339 | } | 2357 | } |
2340 | __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); | 2358 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
2341 | if (__vma && __vma->vm_start < vma->vm_end) | 2359 | &prev, &rb_link, &rb_parent)) |
2342 | return -ENOMEM; | 2360 | return -ENOMEM; |
2343 | if ((vma->vm_flags & VM_ACCOUNT) && | 2361 | if ((vma->vm_flags & VM_ACCOUNT) && |
2344 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2362 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
@@ -2353,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2353 | * prior to moving page table entries, to effect an mremap move. | 2371 | * prior to moving page table entries, to effect an mremap move. |
2354 | */ | 2372 | */ |
2355 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2373 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2356 | unsigned long addr, unsigned long len, pgoff_t pgoff) | 2374 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2375 | bool *need_rmap_locks) | ||
2357 | { | 2376 | { |
2358 | struct vm_area_struct *vma = *vmap; | 2377 | struct vm_area_struct *vma = *vmap; |
2359 | unsigned long vma_start = vma->vm_start; | 2378 | unsigned long vma_start = vma->vm_start; |
@@ -2372,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2372 | faulted_in_anon_vma = false; | 2391 | faulted_in_anon_vma = false; |
2373 | } | 2392 | } |
2374 | 2393 | ||
2375 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2394 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2395 | return NULL; /* should never get here */ | ||
2376 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2396 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2377 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2397 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); |
2378 | if (new_vma) { | 2398 | if (new_vma) { |
@@ -2394,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2394 | * linear if there are no pages mapped yet. | 2414 | * linear if there are no pages mapped yet. |
2395 | */ | 2415 | */ |
2396 | VM_BUG_ON(faulted_in_anon_vma); | 2416 | VM_BUG_ON(faulted_in_anon_vma); |
2397 | *vmap = new_vma; | 2417 | *vmap = vma = new_vma; |
2398 | } else | 2418 | } |
2399 | anon_vma_moveto_tail(new_vma); | 2419 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
2400 | } else { | 2420 | } else { |
2401 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2421 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2402 | if (new_vma) { | 2422 | if (new_vma) { |
2403 | *new_vma = *vma; | 2423 | *new_vma = *vma; |
2424 | new_vma->vm_start = addr; | ||
2425 | new_vma->vm_end = addr + len; | ||
2426 | new_vma->vm_pgoff = pgoff; | ||
2404 | pol = mpol_dup(vma_policy(vma)); | 2427 | pol = mpol_dup(vma_policy(vma)); |
2405 | if (IS_ERR(pol)) | 2428 | if (IS_ERR(pol)) |
2406 | goto out_free_vma; | 2429 | goto out_free_vma; |
2430 | vma_set_policy(new_vma, pol); | ||
2407 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2431 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2408 | if (anon_vma_clone(new_vma, vma)) | 2432 | if (anon_vma_clone(new_vma, vma)) |
2409 | goto out_free_mempol; | 2433 | goto out_free_mempol; |
2410 | vma_set_policy(new_vma, pol); | 2434 | if (new_vma->vm_file) |
2411 | new_vma->vm_start = addr; | ||
2412 | new_vma->vm_end = addr + len; | ||
2413 | new_vma->vm_pgoff = pgoff; | ||
2414 | if (new_vma->vm_file) { | ||
2415 | get_file(new_vma->vm_file); | 2435 | get_file(new_vma->vm_file); |
2416 | |||
2417 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2418 | added_exe_file_vma(mm); | ||
2419 | } | ||
2420 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2436 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2421 | new_vma->vm_ops->open(new_vma); | 2437 | new_vma->vm_ops->open(new_vma); |
2422 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2438 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2439 | *need_rmap_locks = false; | ||
2423 | } | 2440 | } |
2424 | } | 2441 | } |
2425 | return new_vma; | 2442 | return new_vma; |
@@ -2537,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); | |||
2537 | 2554 | ||
2538 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | 2555 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
2539 | { | 2556 | { |
2540 | if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2557 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2541 | /* | 2558 | /* |
2542 | * The LSB of head.next can't change from under us | 2559 | * The LSB of head.next can't change from under us |
2543 | * because we hold the mm_all_locks_mutex. | 2560 | * because we hold the mm_all_locks_mutex. |
@@ -2553,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2553 | * anon_vma->root->mutex. | 2570 | * anon_vma->root->mutex. |
2554 | */ | 2571 | */ |
2555 | if (__test_and_set_bit(0, (unsigned long *) | 2572 | if (__test_and_set_bit(0, (unsigned long *) |
2556 | &anon_vma->root->head.next)) | 2573 | &anon_vma->root->rb_root.rb_node)) |
2557 | BUG(); | 2574 | BUG(); |
2558 | } | 2575 | } |
2559 | } | 2576 | } |
@@ -2594,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2594 | * A single task can't take more than one mm_take_all_locks() in a row | 2611 | * A single task can't take more than one mm_take_all_locks() in a row |
2595 | * or it would deadlock. | 2612 | * or it would deadlock. |
2596 | * | 2613 | * |
2597 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | 2614 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in |
2598 | * mapping->flags avoid to take the same lock twice, if more than one | 2615 | * mapping->flags avoid to take the same lock twice, if more than one |
2599 | * vma in this mm is backed by the same anon_vma or address_space. | 2616 | * vma in this mm is backed by the same anon_vma or address_space. |
2600 | * | 2617 | * |
@@ -2641,13 +2658,13 @@ out_unlock: | |||
2641 | 2658 | ||
2642 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 2659 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
2643 | { | 2660 | { |
2644 | if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2661 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2645 | /* | 2662 | /* |
2646 | * The LSB of head.next can't change to 0 from under | 2663 | * The LSB of head.next can't change to 0 from under |
2647 | * us because we hold the mm_all_locks_mutex. | 2664 | * us because we hold the mm_all_locks_mutex. |
2648 | * | 2665 | * |
2649 | * We must however clear the bitflag before unlocking | 2666 | * We must however clear the bitflag before unlocking |
2650 | * the vma so the users using the anon_vma->head will | 2667 | * the vma so the users using the anon_vma->rb_root will |
2651 | * never see our bitflag. | 2668 | * never see our bitflag. |
2652 | * | 2669 | * |
2653 | * No need of atomic instructions here, head.next | 2670 | * No need of atomic instructions here, head.next |
@@ -2655,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2655 | * anon_vma->root->mutex. | 2672 | * anon_vma->root->mutex. |
2656 | */ | 2673 | */ |
2657 | if (!__test_and_clear_bit(0, (unsigned long *) | 2674 | if (!__test_and_clear_bit(0, (unsigned long *) |
2658 | &anon_vma->root->head.next)) | 2675 | &anon_vma->root->rb_root.rb_node)) |
2659 | BUG(); | 2676 | BUG(); |
2660 | anon_vma_unlock(anon_vma); | 2677 | anon_vma_unlock(anon_vma); |
2661 | } | 2678 | } |
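The vm_lock_anon_vma()/vm_unlock_anon_vma() hunks above keep the same trick — tagging bit 0 of a pointer-sized field while mm_all_locks_mutex is held — but the field moves from anon_vma->root->head.next to anon_vma->root->rb_root.rb_node, consistent with the anon_vma list head being replaced by an interval-tree rb_root elsewhere in this series. A minimal standalone sketch of the low-bit tagging idea (plain userspace C, names hypothetical, not kernel code; it relies only on node pointers being at least 2-byte aligned, so bit 0 is normally clear):

    #include <assert.h>
    #include <stdint.h>

    struct fake_root { uintptr_t first_node; /* stands in for rb_root.rb_node */ };

    /* like __test_and_set_bit(0, ...): returns whether the flag was already set */
    static int mark_locked(struct fake_root *r)
    {
        int was_set = (int)(r->first_node & 1);
        r->first_node |= 1;
        return was_set;
    }

    /* like __test_and_clear_bit(0, ...) */
    static int clear_locked(struct fake_root *r)
    {
        int was_set = (int)(r->first_node & 1);
        r->first_node &= ~(uintptr_t)1;
        return was_set;
    }

    int main(void)
    {
        struct fake_root root = { .first_node = 0 };

        assert(!mark_locked(&root));  /* first vma using this root: take the lock */
        assert(mark_locked(&root));   /* another vma sharing the root: skip it */
        assert(clear_locked(&root));  /* unlock path must find the flag set */
        return 0;
    }

In the kernel code the non-atomic __test_and_set_bit()/__test_and_clear_bit() variants are sufficient because every writer holds mm_all_locks_mutex, as the surrounding comments note.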
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9..479a1e751a7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -14,10 +14,14 @@ | |||
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/srcu.h> | ||
17 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
20 | 21 | ||
22 | /* global SRCU for all MMs */ | ||
23 | static struct srcu_struct srcu; | ||
24 | |||
21 | /* | 25 | /* |
22 | * This function can't run concurrently against mmu_notifier_register | 26 | * This function can't run concurrently against mmu_notifier_register |
23 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
@@ -25,8 +29,8 @@ | |||
25 | * in parallel despite there being no task using this mm any more, | 29 | * in parallel despite there being no task using this mm any more, |
26 | * through the vmas outside of the exit_mmap context, such as with | 30 | * through the vmas outside of the exit_mmap context, such as with |
27 | * vmtruncate. This serializes against mmu_notifier_unregister with | 31 | * vmtruncate. This serializes against mmu_notifier_unregister with |
28 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | 32 | * the mmu_notifier_mm->lock in addition to SRCU and it serializes |
29 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | 33 | * against the other mmu notifiers with SRCU. struct mmu_notifier_mm |
30 | * can't go away from under us as exit_mmap holds an mm_count pin | 34 | * can't go away from under us as exit_mmap holds an mm_count pin |
31 | * itself. | 35 | * itself. |
32 | */ | 36 | */ |
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
34 | { | 38 | { |
35 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | 40 | struct hlist_node *n; |
41 | int id; | ||
37 | 42 | ||
38 | /* | 43 | /* |
39 | * RCU here will block mmu_notifier_unregister until | 44 | * SRCU here will block mmu_notifier_unregister until |
40 | * ->release returns. | 45 | * ->release returns. |
41 | */ | 46 | */ |
42 | rcu_read_lock(); | 47 | id = srcu_read_lock(&srcu); |
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | 48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) |
44 | /* | 49 | /* |
45 | * if ->release runs before mmu_notifier_unregister it | 50 | * if ->release runs before mmu_notifier_unregister it |
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
50 | */ | 55 | */ |
51 | if (mn->ops->release) | 56 | if (mn->ops->release) |
52 | mn->ops->release(mn, mm); | 57 | mn->ops->release(mn, mm); |
53 | rcu_read_unlock(); | 58 | srcu_read_unlock(&srcu, id); |
54 | 59 | ||
55 | spin_lock(&mm->mmu_notifier_mm->lock); | 60 | spin_lock(&mm->mmu_notifier_mm->lock); |
56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
68 | spin_unlock(&mm->mmu_notifier_mm->lock); | 73 | spin_unlock(&mm->mmu_notifier_mm->lock); |
69 | 74 | ||
70 | /* | 75 | /* |
71 | * synchronize_rcu here prevents mmu_notifier_release to | 76 | * synchronize_srcu here prevents mmu_notifier_release to |
72 | * return to exit_mmap (which would proceed freeing all pages | 77 | * return to exit_mmap (which would proceed freeing all pages |
73 | * in the mm) until the ->release method returns, if it was | 78 | * in the mm) until the ->release method returns, if it was |
74 | * invoked by mmu_notifier_unregister. | 79 | * invoked by mmu_notifier_unregister. |
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
76 | * The mmu_notifier_mm can't go away from under us because one | 81 | * The mmu_notifier_mm can't go away from under us because one |
77 | * mm_count is held by exit_mmap. | 82 | * mm_count is held by exit_mmap. |

78 | */ | 83 | */ |
79 | synchronize_rcu(); | 84 | synchronize_srcu(&srcu); |
80 | } | 85 | } |
81 | 86 | ||
82 | /* | 87 | /* |
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
89 | { | 94 | { |
90 | struct mmu_notifier *mn; | 95 | struct mmu_notifier *mn; |
91 | struct hlist_node *n; | 96 | struct hlist_node *n; |
92 | int young = 0; | 97 | int young = 0, id; |
93 | 98 | ||
94 | rcu_read_lock(); | 99 | id = srcu_read_lock(&srcu); |
95 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 100 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
96 | if (mn->ops->clear_flush_young) | 101 | if (mn->ops->clear_flush_young) |
97 | young |= mn->ops->clear_flush_young(mn, mm, address); | 102 | young |= mn->ops->clear_flush_young(mn, mm, address); |
98 | } | 103 | } |
99 | rcu_read_unlock(); | 104 | srcu_read_unlock(&srcu, id); |
100 | 105 | ||
101 | return young; | 106 | return young; |
102 | } | 107 | } |
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
106 | { | 111 | { |
107 | struct mmu_notifier *mn; | 112 | struct mmu_notifier *mn; |
108 | struct hlist_node *n; | 113 | struct hlist_node *n; |
109 | int young = 0; | 114 | int young = 0, id; |
110 | 115 | ||
111 | rcu_read_lock(); | 116 | id = srcu_read_lock(&srcu); |
112 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 117 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
113 | if (mn->ops->test_young) { | 118 | if (mn->ops->test_young) { |
114 | young = mn->ops->test_young(mn, mm, address); | 119 | young = mn->ops->test_young(mn, mm, address); |
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
116 | break; | 121 | break; |
117 | } | 122 | } |
118 | } | 123 | } |
119 | rcu_read_unlock(); | 124 | srcu_read_unlock(&srcu, id); |
120 | 125 | ||
121 | return young; | 126 | return young; |
122 | } | 127 | } |
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
126 | { | 131 | { |
127 | struct mmu_notifier *mn; | 132 | struct mmu_notifier *mn; |
128 | struct hlist_node *n; | 133 | struct hlist_node *n; |
134 | int id; | ||
129 | 135 | ||
130 | rcu_read_lock(); | 136 | id = srcu_read_lock(&srcu); |
131 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
132 | if (mn->ops->change_pte) | 138 | if (mn->ops->change_pte) |
133 | mn->ops->change_pte(mn, mm, address, pte); | 139 | mn->ops->change_pte(mn, mm, address, pte); |
134 | /* | ||
135 | * Some drivers don't have change_pte, | ||
136 | * so we must call invalidate_page in that case. | ||
137 | */ | ||
138 | else if (mn->ops->invalidate_page) | ||
139 | mn->ops->invalidate_page(mn, mm, address); | ||
140 | } | 140 | } |
141 | rcu_read_unlock(); | 141 | srcu_read_unlock(&srcu, id); |
142 | } | 142 | } |
143 | 143 | ||
144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | 144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, | |||
146 | { | 146 | { |
147 | struct mmu_notifier *mn; | 147 | struct mmu_notifier *mn; |
148 | struct hlist_node *n; | 148 | struct hlist_node *n; |
149 | int id; | ||
149 | 150 | ||
150 | rcu_read_lock(); | 151 | id = srcu_read_lock(&srcu); |
151 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 152 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
152 | if (mn->ops->invalidate_page) | 153 | if (mn->ops->invalidate_page) |
153 | mn->ops->invalidate_page(mn, mm, address); | 154 | mn->ops->invalidate_page(mn, mm, address); |
154 | } | 155 | } |
155 | rcu_read_unlock(); | 156 | srcu_read_unlock(&srcu, id); |
156 | } | 157 | } |
157 | 158 | ||
158 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | 159 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, |
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
160 | { | 161 | { |
161 | struct mmu_notifier *mn; | 162 | struct mmu_notifier *mn; |
162 | struct hlist_node *n; | 163 | struct hlist_node *n; |
164 | int id; | ||
163 | 165 | ||
164 | rcu_read_lock(); | 166 | id = srcu_read_lock(&srcu); |
165 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 167 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
166 | if (mn->ops->invalidate_range_start) | 168 | if (mn->ops->invalidate_range_start) |
167 | mn->ops->invalidate_range_start(mn, mm, start, end); | 169 | mn->ops->invalidate_range_start(mn, mm, start, end); |
168 | } | 170 | } |
169 | rcu_read_unlock(); | 171 | srcu_read_unlock(&srcu, id); |
170 | } | 172 | } |
171 | 173 | ||
172 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
174 | { | 176 | { |
175 | struct mmu_notifier *mn; | 177 | struct mmu_notifier *mn; |
176 | struct hlist_node *n; | 178 | struct hlist_node *n; |
179 | int id; | ||
177 | 180 | ||
178 | rcu_read_lock(); | 181 | id = srcu_read_lock(&srcu); |
179 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 182 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
180 | if (mn->ops->invalidate_range_end) | 183 | if (mn->ops->invalidate_range_end) |
181 | mn->ops->invalidate_range_end(mn, mm, start, end); | 184 | mn->ops->invalidate_range_end(mn, mm, start, end); |
182 | } | 185 | } |
183 | rcu_read_unlock(); | 186 | srcu_read_unlock(&srcu, id); |
184 | } | 187 | } |
185 | 188 | ||
186 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
@@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
192 | 195 | ||
193 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 196 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
194 | 197 | ||
195 | ret = -ENOMEM; | 198 | /* |
196 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 199 | * Verify that mmu_notifier_init() already run and the global srcu is |
197 | if (unlikely(!mmu_notifier_mm)) | 200 | * initialized. |
198 | goto out; | 201 | */ |
202 | BUG_ON(!srcu.per_cpu_ref); | ||
199 | 203 | ||
200 | if (take_mmap_sem) | 204 | if (take_mmap_sem) |
201 | down_write(&mm->mmap_sem); | 205 | down_write(&mm->mmap_sem); |
202 | ret = mm_take_all_locks(mm); | 206 | ret = mm_take_all_locks(mm); |
203 | if (unlikely(ret)) | 207 | if (unlikely(ret)) |
204 | goto out_cleanup; | 208 | goto out; |
205 | 209 | ||
206 | if (!mm_has_notifiers(mm)) { | 210 | if (!mm_has_notifiers(mm)) { |
211 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), | ||
212 | GFP_KERNEL); | ||
213 | if (unlikely(!mmu_notifier_mm)) { | ||
214 | ret = -ENOMEM; | ||
215 | goto out_of_mem; | ||
216 | } | ||
207 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | 217 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); |
208 | spin_lock_init(&mmu_notifier_mm->lock); | 218 | spin_lock_init(&mmu_notifier_mm->lock); |
219 | |||
209 | mm->mmu_notifier_mm = mmu_notifier_mm; | 220 | mm->mmu_notifier_mm = mmu_notifier_mm; |
210 | mmu_notifier_mm = NULL; | ||
211 | } | 221 | } |
212 | atomic_inc(&mm->mm_count); | 222 | atomic_inc(&mm->mm_count); |
213 | 223 | ||
@@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
223 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | 233 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); |
224 | spin_unlock(&mm->mmu_notifier_mm->lock); | 234 | spin_unlock(&mm->mmu_notifier_mm->lock); |
225 | 235 | ||
236 | out_of_mem: | ||
226 | mm_drop_all_locks(mm); | 237 | mm_drop_all_locks(mm); |
227 | out_cleanup: | 238 | out: |
228 | if (take_mmap_sem) | 239 | if (take_mmap_sem) |
229 | up_write(&mm->mmap_sem); | 240 | up_write(&mm->mmap_sem); |
230 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | 241 | |
231 | kfree(mmu_notifier_mm); | ||
232 | out: | ||
233 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 242 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
234 | return ret; | 243 | return ret; |
235 | } | 244 | } |
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
274 | /* | 283 | /* |
275 | * This releases the mm_count pin automatically and frees the mm | 284 | * This releases the mm_count pin automatically and frees the mm |
276 | * structure if it was the last user of it. It serializes against | 285 | * structure if it was the last user of it. It serializes against |
277 | * running mmu notifiers with RCU and against mmu_notifier_unregister | 286 | * running mmu notifiers with SRCU and against mmu_notifier_unregister |
278 | * with the unregister lock + RCU. All sptes must be dropped before | 287 | * with the unregister lock + SRCU. All sptes must be dropped before |
279 | * calling mmu_notifier_unregister. ->release or any other notifier | 288 | * calling mmu_notifier_unregister. ->release or any other notifier |
280 | * method may be invoked concurrently with mmu_notifier_unregister, | 289 | * method may be invoked concurrently with mmu_notifier_unregister, |
281 | * and only after mmu_notifier_unregister returned we're guaranteed | 290 | * and only after mmu_notifier_unregister returned we're guaranteed |
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
287 | 296 | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 297 | if (!hlist_unhashed(&mn->hlist)) { |
289 | /* | 298 | /* |
290 | * RCU here will force exit_mmap to wait for ->release to finish | 299 | * SRCU here will force exit_mmap to wait for ->release to finish |
291 | * before freeing the pages. | 300 | * before freeing the pages. |
292 | */ | 301 | */ |
293 | rcu_read_lock(); | 302 | int id; |
294 | 303 | ||
304 | id = srcu_read_lock(&srcu); | ||
295 | /* | 305 | /* |
296 | * exit_mmap will block in mmu_notifier_release to | 306 | * exit_mmap will block in mmu_notifier_release to |
297 | * guarantee ->release is called before freeing the | 307 | * guarantee ->release is called before freeing the |
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
299 | */ | 309 | */ |
300 | if (mn->ops->release) | 310 | if (mn->ops->release) |
301 | mn->ops->release(mn, mm); | 311 | mn->ops->release(mn, mm); |
302 | rcu_read_unlock(); | 312 | srcu_read_unlock(&srcu, id); |
303 | 313 | ||
304 | spin_lock(&mm->mmu_notifier_mm->lock); | 314 | spin_lock(&mm->mmu_notifier_mm->lock); |
305 | hlist_del_rcu(&mn->hlist); | 315 | hlist_del_rcu(&mn->hlist); |
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
310 | * Wait for any running method to finish, of course including | 320 | * Wait for any running method to finish, of course including |
311 | * ->release if it was run by mmu_notifier_release instead of us. | 321 | * ->release if it was run by mmu_notifier_release instead of us. |
312 | */ | 322 | */ |
313 | synchronize_rcu(); | 323 | synchronize_srcu(&srcu); |
314 | 324 | ||
315 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 325 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
316 | 326 | ||
317 | mmdrop(mm); | 327 | mmdrop(mm); |
318 | } | 328 | } |
319 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 329 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
330 | |||
331 | static int __init mmu_notifier_init(void) | ||
332 | { | ||
333 | return init_srcu_struct(&srcu); | ||
334 | } | ||
335 | |||
336 | module_init(mmu_notifier_init); | ||
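Every notifier walk in this file now follows the same read-side shape: srcu_read_lock() on the single file-global srcu_struct, an RCU-protected hlist traversal, then srcu_read_unlock() with the returned index, with synchronize_srcu() standing in for synchronize_rcu() in the release/unregister paths and init_srcu_struct() run once from the new mmu_notifier_init() initcall. A condensed sketch of that pattern (kernel context assumed; the function name and the walk body are illustrative):

    /* "srcu" is the file-local global declared at the top of mm/mmu_notifier.c */
    static void example_notifier_walk(struct mm_struct *mm)
    {
        struct mmu_notifier *mn;
        struct hlist_node *n;
        int id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
            if (mn->ops->release)       /* any op hook; shown with ->release */
                mn->ops->release(mn, mm);
        }
        srcu_read_unlock(&srcu, id);
    }

Unlike a plain rcu_read_lock() section, an SRCU read-side section may block, which is presumably the point of the conversion: notifier callbacks can sleep while registration and teardown still wait for them via synchronize_srcu(&srcu).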
diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d0..1b61c2d3307 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | 71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
72 | unsigned long old_addr, unsigned long old_end, | 72 | unsigned long old_addr, unsigned long old_end, |
73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, | 73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
74 | unsigned long new_addr) | 74 | unsigned long new_addr, bool need_rmap_locks) |
75 | { | 75 | { |
76 | struct address_space *mapping = NULL; | 76 | struct address_space *mapping = NULL; |
77 | struct anon_vma *anon_vma = NULL; | ||
77 | struct mm_struct *mm = vma->vm_mm; | 78 | struct mm_struct *mm = vma->vm_mm; |
78 | pte_t *old_pte, *new_pte, pte; | 79 | pte_t *old_pte, *new_pte, pte; |
79 | spinlock_t *old_ptl, *new_ptl; | 80 | spinlock_t *old_ptl, *new_ptl; |
80 | 81 | ||
81 | if (vma->vm_file) { | 82 | /* |
82 | /* | 83 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma |
83 | * Subtle point from Rajesh Venkatasubramanian: before | 84 | * locks to ensure that rmap will always observe either the old or the |
84 | * moving file-based ptes, we must lock truncate_pagecache | 85 | * new ptes. This is the easiest way to avoid races with |
85 | * out, since it might clean the dst vma before the src vma, | 86 | * truncate_pagecache(), page migration, etc... |
86 | * and we propagate stale pages into the dst afterward. | 87 | * |
87 | */ | 88 | * When need_rmap_locks is false, we use other ways to avoid |
88 | mapping = vma->vm_file->f_mapping; | 89 | * such races: |
89 | mutex_lock(&mapping->i_mmap_mutex); | 90 | * |
91 | * - During exec() shift_arg_pages(), we use a specially tagged vma | ||
92 | * which rmap call sites look for using is_vma_temporary_stack(). | ||
93 | * | ||
94 | * - During mremap(), new_vma is often known to be placed after vma | ||
95 | * in rmap traversal order. This ensures rmap will always observe | ||
96 | * either the old pte, or the new pte, or both (the page table locks | ||
97 | * serialize access to individual ptes, but only rmap traversal | ||
98 | * order guarantees that we won't miss both the old and new ptes). | ||
99 | */ | ||
100 | if (need_rmap_locks) { | ||
101 | if (vma->vm_file) { | ||
102 | mapping = vma->vm_file->f_mapping; | ||
103 | mutex_lock(&mapping->i_mmap_mutex); | ||
104 | } | ||
105 | if (vma->anon_vma) { | ||
106 | anon_vma = vma->anon_vma; | ||
107 | anon_vma_lock(anon_vma); | ||
108 | } | ||
90 | } | 109 | } |
91 | 110 | ||
92 | /* | 111 | /* |
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
114 | spin_unlock(new_ptl); | 133 | spin_unlock(new_ptl); |
115 | pte_unmap(new_pte - 1); | 134 | pte_unmap(new_pte - 1); |
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 135 | pte_unmap_unlock(old_pte - 1, old_ptl); |
136 | if (anon_vma) | ||
137 | anon_vma_unlock(anon_vma); | ||
117 | if (mapping) | 138 | if (mapping) |
118 | mutex_unlock(&mapping->i_mmap_mutex); | 139 | mutex_unlock(&mapping->i_mmap_mutex); |
119 | } | 140 | } |
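Stripped of the pte-copying details, the locking shape move_ptes() ends up with after this hunk is: take the i_mmap_mutex and the anon_vma lock only when the caller asked for rmap locks, and drop them again on the way out. A condensed sketch (kernel context assumed; the helper name and the elided body are illustrative):

    static void move_ptes_locking_shape(struct vm_area_struct *vma, bool need_rmap_locks)
    {
        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;

        if (need_rmap_locks) {
            if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
                mutex_lock(&mapping->i_mmap_mutex);
            }
            if (vma->anon_vma) {
                anon_vma = vma->anon_vma;
                anon_vma_lock(anon_vma);
            }
        }

        /* ... map both pte ranges, move and clear entries under the
         *     page table locks, exactly as in the hunk above ... */

        if (anon_vma)
            anon_vma_unlock(anon_vma);
        if (mapping)
            mutex_unlock(&mapping->i_mmap_mutex);
    }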
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
122 | 143 | ||
123 | unsigned long move_page_tables(struct vm_area_struct *vma, | 144 | unsigned long move_page_tables(struct vm_area_struct *vma, |
124 | unsigned long old_addr, struct vm_area_struct *new_vma, | 145 | unsigned long old_addr, struct vm_area_struct *new_vma, |
125 | unsigned long new_addr, unsigned long len) | 146 | unsigned long new_addr, unsigned long len, |
147 | bool need_rmap_locks) | ||
126 | { | 148 | { |
127 | unsigned long extent, next, old_end; | 149 | unsigned long extent, next, old_end; |
128 | pmd_t *old_pmd, *new_pmd; | 150 | pmd_t *old_pmd, *new_pmd; |
129 | bool need_flush = false; | 151 | bool need_flush = false; |
152 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
153 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
130 | 154 | ||
131 | old_end = old_addr + len; | 155 | old_end = old_addr + len; |
132 | flush_cache_range(vma, old_addr, old_end); | 156 | flush_cache_range(vma, old_addr, old_end); |
133 | 157 | ||
134 | mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); | 158 | mmun_start = old_addr; |
159 | mmun_end = old_end; | ||
160 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | ||
135 | 161 | ||
136 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 162 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
137 | cond_resched(); | 163 | cond_resched(); |
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
169 | if (extent > LATENCY_LIMIT) | 195 | if (extent > LATENCY_LIMIT) |
170 | extent = LATENCY_LIMIT; | 196 | extent = LATENCY_LIMIT; |
171 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 197 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
172 | new_vma, new_pmd, new_addr); | 198 | new_vma, new_pmd, new_addr, need_rmap_locks); |
173 | need_flush = true; | 199 | need_flush = true; |
174 | } | 200 | } |
175 | if (likely(need_flush)) | 201 | if (likely(need_flush)) |
176 | flush_tlb_range(vma, old_end-len, old_addr); | 202 | flush_tlb_range(vma, old_end-len, old_addr); |
177 | 203 | ||
178 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); | 204 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
179 | 205 | ||
180 | return len + old_addr - old_end; /* how much done */ | 206 | return len + old_addr - old_end; /* how much done */ |
181 | } | 207 | } |
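One detail worth noting in the move_page_tables() hunk: old_addr is advanced by the pmd loop, so the original code had to reconstruct the start of the range as old_end - len when ending the notifier window. Recording the range once in mmun_start/mmun_end keeps both notifier calls symmetric. The bracketing reduces to (kernel context assumed; helper name illustrative, loop elided):

    static void notifier_window_shape(struct vm_area_struct *vma,
                                      unsigned long old_addr, unsigned long len)
    {
        unsigned long old_end = old_addr + len;
        unsigned long mmun_start = old_addr;   /* For mmu_notifiers */
        unsigned long mmun_end = old_end;      /* For mmu_notifiers */

        mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
        /* ... per-pmd loop that advances old_addr and calls move_ptes() ... */
        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
    }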
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
193 | unsigned long hiwater_vm; | 219 | unsigned long hiwater_vm; |
194 | int split = 0; | 220 | int split = 0; |
195 | int err; | 221 | int err; |
222 | bool need_rmap_locks; | ||
196 | 223 | ||
197 | /* | 224 | /* |
198 | * We'd prefer to avoid failure later on in do_munmap: | 225 | * We'd prefer to avoid failure later on in do_munmap: |
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
214 | return err; | 241 | return err; |
215 | 242 | ||
216 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); | 243 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); |
217 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); | 244 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, |
245 | &need_rmap_locks); | ||
218 | if (!new_vma) | 246 | if (!new_vma) |
219 | return -ENOMEM; | 247 | return -ENOMEM; |
220 | 248 | ||
221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 249 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, |
250 | need_rmap_locks); | ||
222 | if (moved_len < old_len) { | 251 | if (moved_len < old_len) { |
223 | /* | 252 | /* |
224 | * Before moving the page tables from the new vma to | ||
225 | * the old vma, we need to be sure the old vma is | ||
226 | * queued after new vma in the same_anon_vma list to | ||
227 | * prevent SMP races with rmap_walk (that could lead | ||
228 | * rmap_walk to miss some page table). | ||
229 | */ | ||
230 | anon_vma_moveto_tail(vma); | ||
231 | |||
232 | /* | ||
233 | * On error, move entries back from new area to old, | 253 | * On error, move entries back from new area to old, |
234 | * which will succeed since page tables still there, | 254 | * which will succeed since page tables still there, |
235 | * and then proceed to unmap new area instead of old. | 255 | * and then proceed to unmap new area instead of old. |
236 | */ | 256 | */ |
237 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); | 257 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, |
258 | true); | ||
238 | vma = new_vma; | 259 | vma = new_vma; |
239 | old_len = new_len; | 260 | old_len = new_len; |
240 | old_addr = new_addr; | 261 | old_addr = new_addr; |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 405573010f9..714d5d65047 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
116 | return 0; | 116 | return 0; |
117 | 117 | ||
118 | __free_pages_memory(start_pfn, end_pfn); | 118 | __free_pages_memory(start_pfn, end_pfn); |
119 | fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), | ||
120 | start_pfn, end_pfn); | ||
119 | 121 | ||
120 | return end_pfn - start_pfn; | 122 | return end_pfn - start_pfn; |
121 | } | 123 | } |
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
126 | phys_addr_t start, end, size; | 128 | phys_addr_t start, end, size; |
127 | u64 i; | 129 | u64 i; |
128 | 130 | ||
131 | reset_zone_present_pages(); | ||
129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 132 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
130 | count += __free_memory_core(start, end); | 133 | count += __free_memory_core(start, end); |
131 | 134 | ||
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void) | |||
162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 165 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
163 | * because in some case like Node0 doesn't have RAM installed | 166 | * because in some case like Node0 doesn't have RAM installed |
164 | * low ram will be on Node1 | 167 | * low ram will be on Node1 |
165 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] | ||
166 | * will be used instead of only Node0 related | ||
167 | */ | 168 | */ |
168 | return free_low_memory_core_early(MAX_NUMNODES); | 169 | return free_low_memory_core_early(MAX_NUMNODES); |
169 | } | 170 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index d4b0c10872d..45131b41bcd 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
698 | 698 | ||
699 | mutex_lock(&mapping->i_mmap_mutex); | 699 | mutex_lock(&mapping->i_mmap_mutex); |
700 | flush_dcache_mmap_lock(mapping); | 700 | flush_dcache_mmap_lock(mapping); |
701 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 701 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
702 | flush_dcache_mmap_unlock(mapping); | 702 | flush_dcache_mmap_unlock(mapping); |
703 | mutex_unlock(&mapping->i_mmap_mutex); | 703 | mutex_unlock(&mapping->i_mmap_mutex); |
704 | } | 704 | } |
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
764 | 764 | ||
765 | mutex_lock(&mapping->i_mmap_mutex); | 765 | mutex_lock(&mapping->i_mmap_mutex); |
766 | flush_dcache_mmap_lock(mapping); | 766 | flush_dcache_mmap_lock(mapping); |
767 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 767 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
768 | flush_dcache_mmap_unlock(mapping); | 768 | flush_dcache_mmap_unlock(mapping); |
769 | mutex_unlock(&mapping->i_mmap_mutex); | 769 | mutex_unlock(&mapping->i_mmap_mutex); |
770 | } | 770 | } |
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | |||
789 | kenter("%p", vma); | 789 | kenter("%p", vma); |
790 | if (vma->vm_ops && vma->vm_ops->close) | 790 | if (vma->vm_ops && vma->vm_ops->close) |
791 | vma->vm_ops->close(vma); | 791 | vma->vm_ops->close(vma); |
792 | if (vma->vm_file) { | 792 | if (vma->vm_file) |
793 | fput(vma->vm_file); | 793 | fput(vma->vm_file); |
794 | if (vma->vm_flags & VM_EXECUTABLE) | ||
795 | removed_exe_file_vma(mm); | ||
796 | } | ||
797 | put_nommu_region(vma->vm_region); | 794 | put_nommu_region(vma->vm_region); |
798 | kmem_cache_free(vm_area_cachep, vma); | 795 | kmem_cache_free(vm_area_cachep, vma); |
799 | } | 796 | } |
@@ -1282,14 +1279,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1282 | vma->vm_pgoff = pgoff; | 1279 | vma->vm_pgoff = pgoff; |
1283 | 1280 | ||
1284 | if (file) { | 1281 | if (file) { |
1285 | region->vm_file = file; | 1282 | region->vm_file = get_file(file); |
1286 | get_file(file); | 1283 | vma->vm_file = get_file(file); |
1287 | vma->vm_file = file; | ||
1288 | get_file(file); | ||
1289 | if (vm_flags & VM_EXECUTABLE) { | ||
1290 | added_exe_file_vma(current->mm); | ||
1291 | vma->vm_mm = current->mm; | ||
1292 | } | ||
1293 | } | 1284 | } |
1294 | 1285 | ||
1295 | down_write(&nommu_region_sem); | 1286 | down_write(&nommu_region_sem); |
@@ -1442,8 +1433,6 @@ error: | |||
1442 | kmem_cache_free(vm_region_jar, region); | 1433 | kmem_cache_free(vm_region_jar, region); |
1443 | if (vma->vm_file) | 1434 | if (vma->vm_file) |
1444 | fput(vma->vm_file); | 1435 | fput(vma->vm_file); |
1445 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1446 | removed_exe_file_vma(vma->vm_mm); | ||
1447 | kmem_cache_free(vm_area_cachep, vma); | 1436 | kmem_cache_free(vm_area_cachep, vma); |
1448 | kleave(" = %d", ret); | 1437 | kleave(" = %d", ret); |
1449 | return ret; | 1438 | return ret; |
@@ -1822,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1822 | if (addr != (pfn << PAGE_SHIFT)) | 1811 | if (addr != (pfn << PAGE_SHIFT)) |
1823 | return -EINVAL; | 1812 | return -EINVAL; |
1824 | 1813 | ||
1825 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 1814 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
1826 | return 0; | 1815 | return 0; |
1827 | } | 1816 | } |
1828 | EXPORT_SYMBOL(remap_pfn_range); | 1817 | EXPORT_SYMBOL(remap_pfn_range); |
@@ -1963,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1963 | } | 1952 | } |
1964 | EXPORT_SYMBOL(filemap_fault); | 1953 | EXPORT_SYMBOL(filemap_fault); |
1965 | 1954 | ||
1955 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
1956 | unsigned long size, pgoff_t pgoff) | ||
1957 | { | ||
1958 | BUG(); | ||
1959 | return 0; | ||
1960 | } | ||
1961 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
1962 | |||
1966 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 1963 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1967 | unsigned long addr, void *buf, int len, int write) | 1964 | unsigned long addr, void *buf, int len, int write) |
1968 | { | 1965 | { |
@@ -2047,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2047 | size_t newsize) | 2044 | size_t newsize) |
2048 | { | 2045 | { |
2049 | struct vm_area_struct *vma; | 2046 | struct vm_area_struct *vma; |
2050 | struct prio_tree_iter iter; | ||
2051 | struct vm_region *region; | 2047 | struct vm_region *region; |
2052 | pgoff_t low, high; | 2048 | pgoff_t low, high; |
2053 | size_t r_size, r_top; | 2049 | size_t r_size, r_top; |
@@ -2059,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2059 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2055 | mutex_lock(&inode->i_mapping->i_mmap_mutex); |
2060 | 2056 | ||
2061 | /* search for VMAs that fall within the dead zone */ | 2057 | /* search for VMAs that fall within the dead zone */ |
2062 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2058 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
2063 | low, high) { | ||
2064 | /* found one - only interested if it's shared out of the page | 2059 | /* found one - only interested if it's shared out of the page |
2065 | * cache */ | 2060 | * cache */ |
2066 | if (vma->vm_flags & VM_SHARED) { | 2061 | if (vma->vm_flags & VM_SHARED) { |
@@ -2076,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2076 | * we don't check for any regions that start beyond the EOF as there | 2071 | * we don't check for any regions that start beyond the EOF as there |
2077 | * shouldn't be any | 2072 | * shouldn't be any |
2078 | */ | 2073 | */ |
2079 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2074 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, |
2080 | 0, ULONG_MAX) { | 2075 | 0, ULONG_MAX) { |
2081 | if (!(vma->vm_flags & VM_SHARED)) | 2076 | if (!(vma->vm_flags & VM_SHARED)) |
2082 | continue; | 2077 | continue; |
2083 | 2078 | ||
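With the prio tree gone, the nommu i_mmap walks no longer need an explicit iterator object: vma_interval_tree_foreach() takes the vma cursor, the tree root and the pgoff range directly, and insertion/removal go through vma_interval_tree_insert()/vma_interval_tree_remove(). The lookup pattern used in nommu_shrink_inode_mappings() reduces to (kernel context assumed; the helper name is illustrative):

    static void walk_shared_mappings(struct address_space *mapping,
                                     pgoff_t low, pgoff_t high)
    {
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, &mapping->i_mmap, low, high) {
            if (!(vma->vm_flags & VM_SHARED))
                continue;
            /* ... act on each shared mapping overlapping [low, high] ... */
        }
    }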
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 19860086163..79e0f3e2483 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 428 | { |
429 | task_lock(current); | 429 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_adj=%d, oom_score_adj=%d\n", | 431 | "oom_score_adj=%d\n", |
432 | current->comm, gfp_mask, order, current->signal->oom_adj, | 432 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 433 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 434 | cpuset_print_task_mems_allowed(current); |
435 | task_unlock(current); | 435 | task_unlock(current); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 009ac285fea..bb90971182b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, | |||
558 | if (page_is_guard(buddy)) { | 558 | if (page_is_guard(buddy)) { |
559 | clear_page_guard_flag(buddy); | 559 | clear_page_guard_flag(buddy); |
560 | set_page_private(page, 0); | 560 | set_page_private(page, 0); |
561 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 561 | __mod_zone_freepage_state(zone, 1 << order, |
562 | migratetype); | ||
562 | } else { | 563 | } else { |
563 | list_del(&buddy->lru); | 564 | list_del(&buddy->lru); |
564 | zone->free_area[order].nr_free--; | 565 | zone->free_area[order].nr_free--; |
@@ -584,7 +585,7 @@ static inline void __free_one_page(struct page *page, | |||
584 | combined_idx = buddy_idx & page_idx; | 585 | combined_idx = buddy_idx & page_idx; |
585 | higher_page = page + (combined_idx - page_idx); | 586 | higher_page = page + (combined_idx - page_idx); |
586 | buddy_idx = __find_buddy_index(combined_idx, order + 1); | 587 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
587 | higher_buddy = page + (buddy_idx - combined_idx); | 588 | higher_buddy = higher_page + (buddy_idx - combined_idx); |
588 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 589 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
589 | list_add_tail(&page->lru, | 590 | list_add_tail(&page->lru, |
590 | &zone->free_area[order].free_list[migratetype]); | 591 | &zone->free_area[order].free_list[migratetype]); |
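Several hunks below replace a bare __mod_zone_page_state(zone, NR_FREE_PAGES, ...) with __mod_zone_freepage_state(zone, ..., migratetype). That helper is defined outside this diff; judging from the NR_FREE_CMA_PAGES updates that appear alongside it in free_pcppages_bulk() and rmqueue_bulk(), it presumably does something like the following (sketch only, name prefixed to make clear it is a guess):

    static inline void sketch_mod_zone_freepage_state(struct zone *zone,
                                                      int nr_pages, int migratetype)
    {
        /* adjust the general free counter ... */
        __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
        /* ... and mirror the change for CMA blocks */
        if (is_migrate_cma(migratetype))
            __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
    }

That keeps the per-zone count of free CMA pages in step with NR_FREE_PAGES wherever the migratetype of the freed or allocated block is known.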
@@ -597,17 +598,6 @@ out: | |||
597 | zone->free_area[order].nr_free++; | 598 | zone->free_area[order].nr_free++; |
598 | } | 599 | } |
599 | 600 | ||
600 | /* | ||
601 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
602 | * Page should not be on lru, so no need to fix that up. | ||
603 | * free_pages_check() will verify... | ||
604 | */ | ||
605 | static inline void free_page_mlock(struct page *page) | ||
606 | { | ||
607 | __dec_zone_page_state(page, NR_MLOCK); | ||
608 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
609 | } | ||
610 | |||
611 | static inline int free_pages_check(struct page *page) | 601 | static inline int free_pages_check(struct page *page) |
612 | { | 602 | { |
613 | if (unlikely(page_mapcount(page) | | 603 | if (unlikely(page_mapcount(page) | |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
668 | batch_free = to_free; | 658 | batch_free = to_free; |
669 | 659 | ||
670 | do { | 660 | do { |
661 | int mt; /* migratetype of the to-be-freed page */ | ||
662 | |||
671 | page = list_entry(list->prev, struct page, lru); | 663 | page = list_entry(list->prev, struct page, lru); |
672 | /* must delete as __free_one_page list manipulates */ | 664 | /* must delete as __free_one_page list manipulates */ |
673 | list_del(&page->lru); | 665 | list_del(&page->lru); |
666 | mt = get_freepage_migratetype(page); | ||
674 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
675 | __free_one_page(page, zone, 0, page_private(page)); | 668 | __free_one_page(page, zone, 0, mt); |
676 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | 669 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | ||
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
677 | } while (--to_free && --batch_free && !list_empty(list)); | 672 | } while (--to_free && --batch_free && !list_empty(list)); |
678 | } | 673 | } |
679 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
688 | zone->pages_scanned = 0; | 683 | zone->pages_scanned = 0; |
689 | 684 | ||
690 | __free_one_page(page, zone, order, migratetype); | 685 | __free_one_page(page, zone, order, migratetype); |
691 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 686 | if (unlikely(migratetype != MIGRATE_ISOLATE)) |
687 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | ||
692 | spin_unlock(&zone->lock); | 688 | spin_unlock(&zone->lock); |
693 | } | 689 | } |
694 | 690 | ||
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
721 | static void __free_pages_ok(struct page *page, unsigned int order) | 717 | static void __free_pages_ok(struct page *page, unsigned int order) |
722 | { | 718 | { |
723 | unsigned long flags; | 719 | unsigned long flags; |
724 | int wasMlocked = __TestClearPageMlocked(page); | 720 | int migratetype; |
725 | 721 | ||
726 | if (!free_pages_prepare(page, order)) | 722 | if (!free_pages_prepare(page, order)) |
727 | return; | 723 | return; |
728 | 724 | ||
729 | local_irq_save(flags); | 725 | local_irq_save(flags); |
730 | if (unlikely(wasMlocked)) | ||
731 | free_page_mlock(page); | ||
732 | __count_vm_events(PGFREE, 1 << order); | 726 | __count_vm_events(PGFREE, 1 << order); |
733 | free_one_page(page_zone(page), page, order, | 727 | migratetype = get_pageblock_migratetype(page); |
734 | get_pageblock_migratetype(page)); | 728 | set_freepage_migratetype(page, migratetype); |
729 | free_one_page(page_zone(page), page, order, migratetype); | ||
735 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
736 | } | 731 | } |
737 | 732 | ||
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page, | |||
811 | set_page_guard_flag(&page[size]); | 806 | set_page_guard_flag(&page[size]); |
812 | set_page_private(&page[size], high); | 807 | set_page_private(&page[size], high); |
813 | /* Guard pages are not available for any usage */ | 808 | /* Guard pages are not available for any usage */ |
814 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | 809 | __mod_zone_freepage_state(zone, -(1 << high), |
810 | migratetype); | ||
815 | continue; | 811 | continue; |
816 | } | 812 | } |
817 | #endif | 813 | #endif |
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
915 | * Note that start_page and end_pages are not aligned on a pageblock | 911 | * Note that start_page and end_pages are not aligned on a pageblock |
916 | * boundary. If alignment is required, use move_freepages_block() | 912 | * boundary. If alignment is required, use move_freepages_block() |
917 | */ | 913 | */ |
918 | static int move_freepages(struct zone *zone, | 914 | int move_freepages(struct zone *zone, |
919 | struct page *start_page, struct page *end_page, | 915 | struct page *start_page, struct page *end_page, |
920 | int migratetype) | 916 | int migratetype) |
921 | { | 917 | { |
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone, | |||
951 | order = page_order(page); | 947 | order = page_order(page); |
952 | list_move(&page->lru, | 948 | list_move(&page->lru, |
953 | &zone->free_area[order].free_list[migratetype]); | 949 | &zone->free_area[order].free_list[migratetype]); |
950 | set_freepage_migratetype(page, migratetype); | ||
954 | page += 1 << order; | 951 | page += 1 << order; |
955 | pages_moved += 1 << order; | 952 | pages_moved += 1 << order; |
956 | } | 953 | } |
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1132 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) |
1136 | mt = migratetype; | 1133 | mt = migratetype; |
1137 | } | 1134 | } |
1138 | set_page_private(page, mt); | 1135 | set_freepage_migratetype(page, mt); |
1139 | list = &page->lru; | 1136 | list = &page->lru; |
1137 | if (is_migrate_cma(mt)) | ||
1138 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | ||
1139 | -(1 << order)); | ||
1140 | } | 1140 | } |
1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
1142 | spin_unlock(&zone->lock); | 1142 | spin_unlock(&zone->lock); |
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1296 | struct per_cpu_pages *pcp; | 1296 | struct per_cpu_pages *pcp; |
1297 | unsigned long flags; | 1297 | unsigned long flags; |
1298 | int migratetype; | 1298 | int migratetype; |
1299 | int wasMlocked = __TestClearPageMlocked(page); | ||
1300 | 1299 | ||
1301 | if (!free_pages_prepare(page, 0)) | 1300 | if (!free_pages_prepare(page, 0)) |
1302 | return; | 1301 | return; |
1303 | 1302 | ||
1304 | migratetype = get_pageblock_migratetype(page); | 1303 | migratetype = get_pageblock_migratetype(page); |
1305 | set_page_private(page, migratetype); | 1304 | set_freepage_migratetype(page, migratetype); |
1306 | local_irq_save(flags); | 1305 | local_irq_save(flags); |
1307 | if (unlikely(wasMlocked)) | ||
1308 | free_page_mlock(page); | ||
1309 | __count_vm_event(PGFREE); | 1306 | __count_vm_event(PGFREE); |
1310 | 1307 | ||
1311 | /* | 1308 | /* |
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order) | |||
1380 | } | 1377 | } |
1381 | 1378 | ||
1382 | /* | 1379 | /* |
1383 | * Similar to split_page except the page is already free. As this is only | 1380 | * Similar to the split_page family of functions except that the page |
1384 | * being used for migration, the migratetype of the block also changes. | 1381 | * required at the given order and being isolated now to prevent races |
1385 | * As this is called with interrupts disabled, the caller is responsible | 1382 | * with parallel allocators |
1386 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1387 | * are enabled. | ||
1388 | * | ||
1389 | * Note: this is probably too low level an operation for use in drivers. | ||
1390 | * Please consult with lkml before using this in your driver. | ||
1391 | */ | 1383 | */ |
1392 | int split_free_page(struct page *page) | 1384 | int capture_free_page(struct page *page, int alloc_order, int migratetype) |
1393 | { | 1385 | { |
1394 | unsigned int order; | 1386 | unsigned int order; |
1395 | unsigned long watermark; | 1387 | unsigned long watermark; |
1396 | struct zone *zone; | 1388 | struct zone *zone; |
1389 | int mt; | ||
1397 | 1390 | ||
1398 | BUG_ON(!PageBuddy(page)); | 1391 | BUG_ON(!PageBuddy(page)); |
1399 | 1392 | ||
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page) | |||
1409 | list_del(&page->lru); | 1402 | list_del(&page->lru); |
1410 | zone->free_area[order].nr_free--; | 1403 | zone->free_area[order].nr_free--; |
1411 | rmv_page_order(page); | 1404 | rmv_page_order(page); |
1412 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | ||
1413 | 1405 | ||
1414 | /* Split into individual pages */ | 1406 | mt = get_pageblock_migratetype(page); |
1415 | set_page_refcounted(page); | 1407 | if (unlikely(mt != MIGRATE_ISOLATE)) |
1416 | split_page(page, order); | 1408 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
1409 | |||
1410 | if (alloc_order != order) | ||
1411 | expand(zone, page, alloc_order, order, | ||
1412 | &zone->free_area[order], migratetype); | ||
1417 | 1413 | ||
1414 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1418 | if (order >= pageblock_order - 1) { | 1415 | if (order >= pageblock_order - 1) { |
1419 | struct page *endpage = page + (1 << order) - 1; | 1416 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1417 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page) | |||
1425 | } | 1422 | } |
1426 | } | 1423 | } |
1427 | 1424 | ||
1428 | return 1 << order; | 1425 | return 1UL << order; |
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Similar to split_page except the page is already free. As this is only | ||
1430 | * being used for migration, the migratetype of the block also changes. | ||
1431 | * As this is called with interrupts disabled, the caller is responsible | ||
1432 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1433 | * are enabled. | ||
1434 | * | ||
1435 | * Note: this is probably too low level an operation for use in drivers. | ||
1436 | * Please consult with lkml before using this in your driver. | ||
1437 | */ | ||
1438 | int split_free_page(struct page *page) | ||
1439 | { | ||
1440 | unsigned int order; | ||
1441 | int nr_pages; | ||
1442 | |||
1443 | BUG_ON(!PageBuddy(page)); | ||
1444 | order = page_order(page); | ||
1445 | |||
1446 | nr_pages = capture_free_page(page, order, 0); | ||
1447 | if (!nr_pages) | ||
1448 | return 0; | ||
1449 | |||
1450 | /* Split into individual pages */ | ||
1451 | set_page_refcounted(page); | ||
1452 | split_page(page, order); | ||
1453 | return nr_pages; | ||
1429 | } | 1454 | } |
1430 | 1455 | ||
1431 | /* | 1456 | /* |
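The net effect of the hunk above: the buddy-removal and accounting logic that used to live in split_free_page() moves into capture_free_page(), which can also hand back a captured high-order page at a smaller alloc_order (expand() re-queues the remainder), while split_free_page() becomes a thin capture-then-split wrapper. Condensed, the wrapper relationship is (kernel context assumed; the name is suffixed to mark it as a restatement):

    int split_free_page_shape(struct page *page)
    {
        unsigned int order = page_order(page);
        int nr_pages;

        /* remove the page from the buddy lists and fix up the free counters */
        nr_pages = capture_free_page(page, order, 0);
        if (!nr_pages)
            return 0;

        /* then break it into individually refcounted base pages */
        set_page_refcounted(page);
        split_page(page, order);
        return nr_pages;
    }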
@@ -1484,7 +1509,8 @@ again: | |||
1484 | spin_unlock(&zone->lock); | 1509 | spin_unlock(&zone->lock); |
1485 | if (!page) | 1510 | if (!page) |
1486 | goto failed; | 1511 | goto failed; |
1487 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | 1512 | __mod_zone_freepage_state(zone, -(1 << order), |
1513 | get_pageblock_migratetype(page)); | ||
1488 | } | 1514 | } |
1489 | 1515 | ||
1490 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1516 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
@@ -1501,19 +1527,6 @@ failed: | |||
1501 | return NULL; | 1527 | return NULL; |
1502 | } | 1528 | } |
1503 | 1529 | ||
1504 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
1505 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
1506 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
1507 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
1508 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1509 | |||
1510 | /* Mask to get the watermark bits */ | ||
1511 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1512 | |||
1513 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
1514 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
1515 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
1516 | |||
1517 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1530 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1518 | 1531 | ||
1519 | static struct { | 1532 | static struct { |
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1608 | min -= min / 2; | 1621 | min -= min / 2; |
1609 | if (alloc_flags & ALLOC_HARDER) | 1622 | if (alloc_flags & ALLOC_HARDER) |
1610 | min -= min / 4; | 1623 | min -= min / 4; |
1611 | 1624 | #ifdef CONFIG_CMA | |
1625 | /* If allocation can't use CMA areas don't use free CMA pages */ | ||
1626 | if (!(alloc_flags & ALLOC_CMA)) | ||
1627 | free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); | ||
1628 | #endif | ||
1612 | if (free_pages <= min + lowmem_reserve) | 1629 | if (free_pages <= min + lowmem_reserve) |
1613 | return false; | 1630 | return false; |
1614 | for (o = 0; o < order; o++) { | 1631 | for (o = 0; o < order; o++) { |
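The watermark hunk above encodes a simple rule: an allocation that is not allowed to use CMA pageblocks (no ALLOC_CMA in alloc_flags) must not count free CMA pages toward the zone's watermark, otherwise it could pass the check and still be unable to find a usable page. In isolation the check looks like this (kernel context assumed; the wrapper function is illustrative and omits the per-order loop that follows in the real code):

    static bool watermark_ok_shape(struct zone *z, long min, long free_pages,
                                   long lowmem_reserve, int alloc_flags)
    {
        /* ... ALLOC_HIGH / ALLOC_HARDER reductions of min, as above ... */
    #ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
        if (!(alloc_flags & ALLOC_CMA))
            free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
    #endif
        return free_pages > min + lowmem_reserve;
    }

ALLOC_CMA itself is set in gfp_to_alloc_flags() further down, and only for movable allocations.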
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) | |||
1782 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1799 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1783 | } | 1800 | } |
1784 | 1801 | ||
1802 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1803 | { | ||
1804 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | ||
1805 | } | ||
1806 | |||
1807 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1808 | { | ||
1809 | int i; | ||
1810 | |||
1811 | for_each_online_node(i) | ||
1812 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) { | ||
1813 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1814 | zone_reclaim_mode = 1; | ||
1815 | } | ||
1816 | } | ||
1817 | |||
1785 | #else /* CONFIG_NUMA */ | 1818 | #else /* CONFIG_NUMA */ |
1786 | 1819 | ||
1787 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1820 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1802 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1835 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1803 | { | 1836 | { |
1804 | } | 1837 | } |
1838 | |||
1839 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1840 | { | ||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static inline void init_zone_allows_reclaim(int nid) | ||
1845 | { | ||
1846 | } | ||
1805 | #endif /* CONFIG_NUMA */ | 1847 | #endif /* CONFIG_NUMA */ |
1806 | 1848 | ||
1807 | /* | 1849 | /* |
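The two NUMA hunks above gate zone_reclaim by node distance: init_zone_allows_reclaim() records, per node, which other nodes sit within RECLAIM_DISTANCE (turning zone_reclaim_mode on in the process), and the zonelist scan further down only attempts reclaim for zones whose node is in that mask. A small helper expressing the allocator-side condition might look like this (the helper name is hypothetical; the real code open-codes the test in get_page_from_freelist()):

    static bool should_try_zone_reclaim(struct zone *preferred_zone, struct zone *zone)
    {
        /* reclaim only when the mode is on and the target zone's node
         * was marked reachable at boot */
        return zone_reclaim_mode != 0 &&
               zone_allows_reclaim(preferred_zone, zone);
    }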
@@ -1886,7 +1928,8 @@ zonelist_scan: | |||
1886 | did_zlc_setup = 1; | 1928 | did_zlc_setup = 1; |
1887 | } | 1929 | } |
1888 | 1930 | ||
1889 | if (zone_reclaim_mode == 0) | 1931 | if (zone_reclaim_mode == 0 || |
1932 | !zone_allows_reclaim(preferred_zone, zone)) | ||
1890 | goto this_zone_full; | 1933 | goto this_zone_full; |
1891 | 1934 | ||
1892 | /* | 1935 | /* |
@@ -1928,6 +1971,17 @@ this_zone_full: | |||
1928 | zlc_active = 0; | 1971 | zlc_active = 0; |
1929 | goto zonelist_scan; | 1972 | goto zonelist_scan; |
1930 | } | 1973 | } |
1974 | |||
1975 | if (page) | ||
1976 | /* | ||
1977 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
1978 | * necessary to allocate the page. The expectation is | ||
1979 | * that the caller is taking steps that will free more | ||
1980 | * memory. The caller should avoid the page being used | ||
1981 | * for !PFMEMALLOC purposes. | ||
1982 | */ | ||
1983 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
1984 | |||
1931 | return page; | 1985 | return page; |
1932 | } | 1986 | } |
1933 | 1987 | ||
@@ -2091,10 +2145,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2091 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2145 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2092 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2146 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2093 | int migratetype, bool sync_migration, | 2147 | int migratetype, bool sync_migration, |
2094 | bool *deferred_compaction, | 2148 | bool *contended_compaction, bool *deferred_compaction, |
2095 | unsigned long *did_some_progress) | 2149 | unsigned long *did_some_progress) |
2096 | { | 2150 | { |
2097 | struct page *page; | 2151 | struct page *page = NULL; |
2098 | 2152 | ||
2099 | if (!order) | 2153 | if (!order) |
2100 | return NULL; | 2154 | return NULL; |
@@ -2106,10 +2160,17 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2106 | 2160 | ||
2107 | current->flags |= PF_MEMALLOC; | 2161 | current->flags |= PF_MEMALLOC; |
2108 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2162 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2109 | nodemask, sync_migration); | 2163 | nodemask, sync_migration, |
2164 | contended_compaction, &page); | ||
2110 | current->flags &= ~PF_MEMALLOC; | 2165 | current->flags &= ~PF_MEMALLOC; |
2111 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2112 | 2166 | ||
2167 | /* If compaction captured a page, prep and use it */ | ||
2168 | if (page) { | ||
2169 | prep_new_page(page, order, gfp_mask); | ||
2170 | goto got_page; | ||
2171 | } | ||
2172 | |||
2173 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2113 | /* Page migration frees to the PCP lists but we want merging */ | 2174 | /* Page migration frees to the PCP lists but we want merging */ |
2114 | drain_pages(get_cpu()); | 2175 | drain_pages(get_cpu()); |
2115 | put_cpu(); | 2176 | put_cpu(); |
@@ -2119,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2119 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2180 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2120 | preferred_zone, migratetype); | 2181 | preferred_zone, migratetype); |
2121 | if (page) { | 2182 | if (page) { |
2183 | got_page: | ||
2184 | preferred_zone->compact_blockskip_flush = false; | ||
2122 | preferred_zone->compact_considered = 0; | 2185 | preferred_zone->compact_considered = 0; |
2123 | preferred_zone->compact_defer_shift = 0; | 2186 | preferred_zone->compact_defer_shift = 0; |
2124 | if (order >= preferred_zone->compact_order_failed) | 2187 | if (order >= preferred_zone->compact_order_failed) |
@@ -2152,7 +2215,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2152 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2215 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2153 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2216 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2154 | int migratetype, bool sync_migration, | 2217 | int migratetype, bool sync_migration, |
2155 | bool *deferred_compaction, | 2218 | bool *contended_compaction, bool *deferred_compaction, |
2156 | unsigned long *did_some_progress) | 2219 | unsigned long *did_some_progress) |
2157 | { | 2220 | { |
2158 | return NULL; | 2221 | return NULL; |
@@ -2303,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2303 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2366 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
2304 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2367 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2305 | } | 2368 | } |
2306 | 2369 | #ifdef CONFIG_CMA | |
2370 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2371 | alloc_flags |= ALLOC_CMA; | ||
2372 | #endif | ||
2307 | return alloc_flags; | 2373 | return alloc_flags; |
2308 | } | 2374 | } |
2309 | 2375 | ||
@@ -2325,6 +2391,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2325 | unsigned long did_some_progress; | 2391 | unsigned long did_some_progress; |
2326 | bool sync_migration = false; | 2392 | bool sync_migration = false; |
2327 | bool deferred_compaction = false; | 2393 | bool deferred_compaction = false; |
2394 | bool contended_compaction = false; | ||
2328 | 2395 | ||
2329 | /* | 2396 | /* |
2330 | * In the slowpath, we sanity check order to avoid ever trying to | 2397 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2349,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2349 | goto nopage; | 2416 | goto nopage; |
2350 | 2417 | ||
2351 | restart: | 2418 | restart: |
2352 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2419 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2353 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2420 | zone_idx(preferred_zone)); |
2354 | zone_idx(preferred_zone)); | ||
2355 | 2421 | ||
2356 | /* | 2422 | /* |
2357 | * OK, we're below the kswapd watermark and have kicked background | 2423 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2389,14 +2455,6 @@ rebalance: | |||
2389 | zonelist, high_zoneidx, nodemask, | 2455 | zonelist, high_zoneidx, nodemask, |
2390 | preferred_zone, migratetype); | 2456 | preferred_zone, migratetype); |
2391 | if (page) { | 2457 | if (page) { |
2392 | /* | ||
2393 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2394 | * necessary to allocate the page. The expectation is | ||
2395 | * that the caller is taking steps that will free more | ||
2396 | * memory. The caller should avoid the page being used | ||
2397 | * for !PFMEMALLOC purposes. | ||
2398 | */ | ||
2399 | page->pfmemalloc = true; | ||
2400 | goto got_pg; | 2458 | goto got_pg; |
2401 | } | 2459 | } |
2402 | } | 2460 | } |
@@ -2422,6 +2480,7 @@ rebalance: | |||
2422 | nodemask, | 2480 | nodemask, |
2423 | alloc_flags, preferred_zone, | 2481 | alloc_flags, preferred_zone, |
2424 | migratetype, sync_migration, | 2482 | migratetype, sync_migration, |
2483 | &contended_compaction, | ||
2425 | &deferred_compaction, | 2484 | &deferred_compaction, |
2426 | &did_some_progress); | 2485 | &did_some_progress); |
2427 | if (page) | 2486 | if (page) |
@@ -2431,10 +2490,11 @@ rebalance: | |||
2431 | /* | 2490 | /* |
2432 | * If compaction is deferred for high-order allocations, it is because | 2491 | * If compaction is deferred for high-order allocations, it is because |
2433 | * sync compaction recently failed. In this is the case and the caller | 2492 | * sync compaction recently failed. In this is the case and the caller |
2434 | * has requested the system not be heavily disrupted, fail the | 2493 | * requested a movable allocation that does not heavily disrupt the |
2435 | * allocation now instead of entering direct reclaim | 2494 | * system then fail the allocation instead of entering direct reclaim. |
2436 | */ | 2495 | */ |
2437 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | 2496 | if ((deferred_compaction || contended_compaction) && |
2497 | (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) | ||
2438 | goto nopage; | 2498 | goto nopage; |
2439 | 2499 | ||
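The new early-exit condition above tests two gfp bits with a single mask-and-compare: it fires only when __GFP_MOVABLE is set and __GFP_REPEAT is clear. A small stand-alone check of that predicate, using placeholder bit values rather than the real definitions from gfp.h:

#include <assert.h>

/* Placeholder bit values; the real ones live in include/linux/gfp.h. */
#define __GFP_MOVABLE 0x08u
#define __GFP_REPEAT  0x400u

static int should_fail_early(unsigned int gfp_mask,
			     int deferred, int contended)
{
	return (deferred || contended) &&
	       (gfp_mask & (__GFP_MOVABLE | __GFP_REPEAT)) == __GFP_MOVABLE;
}

int main(void)
{
	/* movable, no __GFP_REPEAT: give up once compaction is deferred */
	assert(should_fail_early(__GFP_MOVABLE, 1, 0));
	/* caller insisted with __GFP_REPEAT: keep going */
	assert(!should_fail_early(__GFP_MOVABLE | __GFP_REPEAT, 1, 0));
	/* unmovable allocation: keep going too */
	assert(!should_fail_early(0, 0, 1));
	/* compaction neither deferred nor contended: no early exit */
	assert(!should_fail_early(__GFP_MOVABLE, 0, 0));
	return 0;
}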
2440 | /* Try direct reclaim and then allocating */ | 2500 | /* Try direct reclaim and then allocating */ |
@@ -2505,6 +2565,7 @@ rebalance: | |||
2505 | nodemask, | 2565 | nodemask, |
2506 | alloc_flags, preferred_zone, | 2566 | alloc_flags, preferred_zone, |
2507 | migratetype, sync_migration, | 2567 | migratetype, sync_migration, |
2568 | &contended_compaction, | ||
2508 | &deferred_compaction, | 2569 | &deferred_compaction, |
2509 | &did_some_progress); | 2570 | &did_some_progress); |
2510 | if (page) | 2571 | if (page) |
@@ -2533,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2533 | struct page *page = NULL; | 2594 | struct page *page = NULL; |
2534 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2595 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2535 | unsigned int cpuset_mems_cookie; | 2596 | unsigned int cpuset_mems_cookie; |
2597 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | ||
2536 | 2598 | ||
2537 | gfp_mask &= gfp_allowed_mask; | 2599 | gfp_mask &= gfp_allowed_mask; |
2538 | 2600 | ||
@@ -2561,16 +2623,18 @@ retry_cpuset: | |||
2561 | if (!preferred_zone) | 2623 | if (!preferred_zone) |
2562 | goto out; | 2624 | goto out; |
2563 | 2625 | ||
2626 | #ifdef CONFIG_CMA | ||
2627 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2628 | alloc_flags |= ALLOC_CMA; | ||
2629 | #endif | ||
2564 | /* First allocation attempt */ | 2630 | /* First allocation attempt */ |
2565 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2566 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | 2632 | zonelist, high_zoneidx, alloc_flags, |
2567 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
2568 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) |
2569 | page = __alloc_pages_slowpath(gfp_mask, order, | 2635 | page = __alloc_pages_slowpath(gfp_mask, order, |
2570 | zonelist, high_zoneidx, nodemask, | 2636 | zonelist, high_zoneidx, nodemask, |
2571 | preferred_zone, migratetype); | 2637 | preferred_zone, migratetype); |
2572 | else | ||
2573 | page->pfmemalloc = false; | ||
2574 | 2638 | ||
2575 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2639 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2576 | 2640 | ||
@@ -2846,7 +2910,8 @@ void show_free_areas(unsigned int filter) | |||
2846 | " unevictable:%lu" | 2910 | " unevictable:%lu" |
2847 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2911 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2848 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2912 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2849 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2913 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
2914 | " free_cma:%lu\n", | ||
2850 | global_page_state(NR_ACTIVE_ANON), | 2915 | global_page_state(NR_ACTIVE_ANON), |
2851 | global_page_state(NR_INACTIVE_ANON), | 2916 | global_page_state(NR_INACTIVE_ANON), |
2852 | global_page_state(NR_ISOLATED_ANON), | 2917 | global_page_state(NR_ISOLATED_ANON), |
@@ -2863,7 +2928,8 @@ void show_free_areas(unsigned int filter) | |||
2863 | global_page_state(NR_FILE_MAPPED), | 2928 | global_page_state(NR_FILE_MAPPED), |
2864 | global_page_state(NR_SHMEM), | 2929 | global_page_state(NR_SHMEM), |
2865 | global_page_state(NR_PAGETABLE), | 2930 | global_page_state(NR_PAGETABLE), |
2866 | global_page_state(NR_BOUNCE)); | 2931 | global_page_state(NR_BOUNCE), |
2932 | global_page_state(NR_FREE_CMA_PAGES)); | ||
2867 | 2933 | ||
2868 | for_each_populated_zone(zone) { | 2934 | for_each_populated_zone(zone) { |
2869 | int i; | 2935 | int i; |
@@ -2895,6 +2961,7 @@ void show_free_areas(unsigned int filter) | |||
2895 | " pagetables:%lukB" | 2961 | " pagetables:%lukB" |
2896 | " unstable:%lukB" | 2962 | " unstable:%lukB" |
2897 | " bounce:%lukB" | 2963 | " bounce:%lukB" |
2964 | " free_cma:%lukB" | ||
2898 | " writeback_tmp:%lukB" | 2965 | " writeback_tmp:%lukB" |
2899 | " pages_scanned:%lu" | 2966 | " pages_scanned:%lu" |
2900 | " all_unreclaimable? %s" | 2967 | " all_unreclaimable? %s" |
@@ -2924,6 +2991,7 @@ void show_free_areas(unsigned int filter) | |||
2924 | K(zone_page_state(zone, NR_PAGETABLE)), | 2991 | K(zone_page_state(zone, NR_PAGETABLE)), |
2925 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 2992 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
2926 | K(zone_page_state(zone, NR_BOUNCE)), | 2993 | K(zone_page_state(zone, NR_BOUNCE)), |
2994 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | ||
2927 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2995 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2928 | zone->pages_scanned, | 2996 | zone->pages_scanned, |
2929 | (zone->all_unreclaimable ? "yes" : "no") | 2997 | (zone->all_unreclaimable ? "yes" : "no") |
@@ -3322,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) | |||
3322 | j = 0; | 3390 | j = 0; |
3323 | 3391 | ||
3324 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 3392 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
3325 | int distance = node_distance(local_node, node); | ||
3326 | |||
3327 | /* | ||
3328 | * If another node is sufficiently far away then it is better | ||
3329 | * to reclaim pages in a zone before going off node. | ||
3330 | */ | ||
3331 | if (distance > RECLAIM_DISTANCE) | ||
3332 | zone_reclaim_mode = 1; | ||
3333 | |||
3334 | /* | 3393 | /* |
3335 | * We don't want to pressure a particular node. | 3394 | * We don't want to pressure a particular node. |
3336 | * So adding penalty to the first node in same | 3395 | * So adding penalty to the first node in same |
3337 | * distance group to make it round-robin. | 3396 | * distance group to make it round-robin. |
3338 | */ | 3397 | */ |
3339 | if (distance != node_distance(local_node, prev_node)) | 3398 | if (node_distance(local_node, node) != |
3399 | node_distance(local_node, prev_node)) | ||
3340 | node_load[node] = load; | 3400 | node_load[node] = load; |
3341 | 3401 | ||
3342 | prev_node = node; | 3402 | prev_node = node; |
@@ -4432,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4432 | 4492 | ||
4433 | zone->spanned_pages = size; | 4493 | zone->spanned_pages = size; |
4434 | zone->present_pages = realsize; | 4494 | zone->present_pages = realsize; |
4435 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4436 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4437 | zone->spanned_pages; | ||
4438 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4439 | #endif | ||
4440 | #ifdef CONFIG_NUMA | 4495 | #ifdef CONFIG_NUMA |
4441 | zone->node = nid; | 4496 | zone->node = nid; |
4442 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4497 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4515,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4515 | 4570 | ||
4516 | pgdat->node_id = nid; | 4571 | pgdat->node_id = nid; |
4517 | pgdat->node_start_pfn = node_start_pfn; | 4572 | pgdat->node_start_pfn = node_start_pfn; |
4573 | init_zone_allows_reclaim(nid); | ||
4518 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4574 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
4519 | 4575 | ||
4520 | alloc_node_mem_map(pgdat); | 4576 | alloc_node_mem_map(pgdat); |
@@ -4873,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4873 | zone_movable_pfn[i] << PAGE_SHIFT); | 4929 | zone_movable_pfn[i] << PAGE_SHIFT); |
4874 | } | 4930 | } |
4875 | 4931 | ||
4876 | /* Print out the early_node_map[] */ | 4932 | /* Print out the early node map */ |
4877 | printk("Early memory node ranges\n"); | 4933 | printk("Early memory node ranges\n"); |
4878 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4934 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4879 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 4935 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
@@ -5613,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn) | |||
5613 | pageblock_nr_pages)); | 5669 | pageblock_nr_pages)); |
5614 | } | 5670 | } |
5615 | 5671 | ||
5616 | static struct page * | ||
5617 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5618 | int **resultp) | ||
5619 | { | ||
5620 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5621 | |||
5622 | if (PageHighMem(page)) | ||
5623 | gfp_mask |= __GFP_HIGHMEM; | ||
5624 | |||
5625 | return alloc_page(gfp_mask); | ||
5626 | } | ||
5627 | |||
5628 | /* [start, end) must belong to a single zone. */ | 5672 | /* [start, end) must belong to a single zone. */ |
5629 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | 5673 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
5674 | unsigned long start, unsigned long end) | ||
5630 | { | 5675 | { |
5631 | /* This function is based on compact_zone() from compaction.c. */ | 5676 | /* This function is based on compact_zone() from compaction.c. */ |
5632 | 5677 | unsigned long nr_reclaimed; | |
5633 | unsigned long pfn = start; | 5678 | unsigned long pfn = start; |
5634 | unsigned int tries = 0; | 5679 | unsigned int tries = 0; |
5635 | int ret = 0; | 5680 | int ret = 0; |
5636 | 5681 | ||
5637 | struct compact_control cc = { | ||
5638 | .nr_migratepages = 0, | ||
5639 | .order = -1, | ||
5640 | .zone = page_zone(pfn_to_page(start)), | ||
5641 | .sync = true, | ||
5642 | }; | ||
5643 | INIT_LIST_HEAD(&cc.migratepages); | ||
5644 | |||
5645 | migrate_prep_local(); | 5682 | migrate_prep_local(); |
5646 | 5683 | ||
5647 | while (pfn < end || !list_empty(&cc.migratepages)) { | 5684 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5648 | if (fatal_signal_pending(current)) { | 5685 | if (fatal_signal_pending(current)) { |
5649 | ret = -EINTR; | 5686 | ret = -EINTR; |
5650 | break; | 5687 | break; |
5651 | } | 5688 | } |
5652 | 5689 | ||
5653 | if (list_empty(&cc.migratepages)) { | 5690 | if (list_empty(&cc->migratepages)) { |
5654 | cc.nr_migratepages = 0; | 5691 | cc->nr_migratepages = 0; |
5655 | pfn = isolate_migratepages_range(cc.zone, &cc, | 5692 | pfn = isolate_migratepages_range(cc->zone, cc, |
5656 | pfn, end); | 5693 | pfn, end, true); |
5657 | if (!pfn) { | 5694 | if (!pfn) { |
5658 | ret = -EINTR; | 5695 | ret = -EINTR; |
5659 | break; | 5696 | break; |
@@ -5664,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | |||
5664 | break; | 5701 | break; |
5665 | } | 5702 | } |
5666 | 5703 | ||
5667 | ret = migrate_pages(&cc.migratepages, | 5704 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
5668 | __alloc_contig_migrate_alloc, | 5705 | &cc->migratepages); |
5706 | cc->nr_migratepages -= nr_reclaimed; | ||
5707 | |||
5708 | ret = migrate_pages(&cc->migratepages, | ||
5709 | alloc_migrate_target, | ||
5669 | 0, false, MIGRATE_SYNC); | 5710 | 0, false, MIGRATE_SYNC); |
5670 | } | 5711 | } |
5671 | 5712 | ||
5672 | putback_lru_pages(&cc.migratepages); | 5713 | putback_lru_pages(&cc->migratepages); |
5673 | return ret > 0 ? 0 : ret; | 5714 | return ret > 0 ? 0 : ret; |
5674 | } | 5715 | } |
5675 | 5716 | ||
@@ -5748,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5748 | unsigned long outer_start, outer_end; | 5789 | unsigned long outer_start, outer_end; |
5749 | int ret = 0, order; | 5790 | int ret = 0, order; |
5750 | 5791 | ||
5792 | struct compact_control cc = { | ||
5793 | .nr_migratepages = 0, | ||
5794 | .order = -1, | ||
5795 | .zone = page_zone(pfn_to_page(start)), | ||
5796 | .sync = true, | ||
5797 | .ignore_skip_hint = true, | ||
5798 | }; | ||
5799 | INIT_LIST_HEAD(&cc.migratepages); | ||
5800 | |||
5751 | /* | 5801 | /* |
5752 | * What we do here is we mark all pageblocks in range as | 5802 | * What we do here is we mark all pageblocks in range as |
5753 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | 5803 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
@@ -5777,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5777 | if (ret) | 5827 | if (ret) |
5778 | goto done; | 5828 | goto done; |
5779 | 5829 | ||
5780 | ret = __alloc_contig_migrate_range(start, end); | 5830 | ret = __alloc_contig_migrate_range(&cc, start, end); |
5781 | if (ret) | 5831 | if (ret) |
5782 | goto done; | 5832 | goto done; |
5783 | 5833 | ||
@@ -5826,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5826 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | 5876 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); |
5827 | 5877 | ||
5828 | /* Grab isolated pages from freelists. */ | 5878 | /* Grab isolated pages from freelists. */ |
5829 | outer_end = isolate_freepages_range(outer_start, end); | 5879 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
5830 | if (!outer_end) { | 5880 | if (!outer_end) { |
5831 | ret = -EBUSY; | 5881 | ret = -EBUSY; |
5832 | goto done; | 5882 | goto done; |
@@ -5868,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data) | |||
5868 | local_irq_save(flags); | 5918 | local_irq_save(flags); |
5869 | if (pcp->count > 0) | 5919 | if (pcp->count > 0) |
5870 | free_pcppages_bulk(zone, pcp->count, pcp); | 5920 | free_pcppages_bulk(zone, pcp->count, pcp); |
5921 | drain_zonestat(zone, pset); | ||
5871 | setup_pageset(pset, batch); | 5922 | setup_pageset(pset, batch); |
5872 | local_irq_restore(flags); | 5923 | local_irq_restore(flags); |
5873 | } | 5924 | } |
@@ -5884,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5884 | void zone_pcp_reset(struct zone *zone) | 5935 | void zone_pcp_reset(struct zone *zone) |
5885 | { | 5936 | { |
5886 | unsigned long flags; | 5937 | unsigned long flags; |
5938 | int cpu; | ||
5939 | struct per_cpu_pageset *pset; | ||
5887 | 5940 | ||
5888 | /* avoid races with drain_pages() */ | 5941 | /* avoid races with drain_pages() */ |
5889 | local_irq_save(flags); | 5942 | local_irq_save(flags); |
5890 | if (zone->pageset != &boot_pageset) { | 5943 | if (zone->pageset != &boot_pageset) { |
5944 | for_each_online_cpu(cpu) { | ||
5945 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5946 | drain_zonestat(zone, pset); | ||
5947 | } | ||
5891 | free_percpu(zone->pageset); | 5948 | free_percpu(zone->pageset); |
5892 | zone->pageset = &boot_pageset; | 5949 | zone->pageset = &boot_pageset; |
5893 | } | 5950 | } |
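Both drain_zonestat() call sites added above exist to fold each CPU's pending counter deltas back into the zone totals before the pagesets are reset or freed. A toy userspace model of that fold (the struct names and the all-CPUs loop are simplifications invented for the sketch, not the real per_cpu_pageset layout):

#include <assert.h>
#include <stdio.h>

#define NR_CPUS  4
#define NR_ITEMS 2	/* pretend zone stat items */

struct pageset { long vm_stat_diff[NR_ITEMS]; };
struct zone_stats {
	long vm_stat[NR_ITEMS];
	struct pageset pageset[NR_CPUS];
};

/* Fold every CPU's pending deltas into the zone counters and clear
 * them, loosely modelling the drain step above. */
static void drain_zonestat(struct zone_stats *zone)
{
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_ITEMS; i++) {
			zone->vm_stat[i] += zone->pageset[cpu].vm_stat_diff[i];
			zone->pageset[cpu].vm_stat_diff[i] = 0;
		}
}

int main(void)
{
	struct zone_stats zone = { .vm_stat = { 100, 50 } };

	zone.pageset[0].vm_stat_diff[0] = +3;
	zone.pageset[2].vm_stat_diff[0] = -1;
	zone.pageset[1].vm_stat_diff[1] = +7;

	drain_zonestat(&zone);
	assert(zone.vm_stat[0] == 102 && zone.vm_stat[1] == 57);
	printf("drained: %ld %ld\n", zone.vm_stat[0], zone.vm_stat[1]);
	return 0;
}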
@@ -6041,3 +6098,37 @@ void dump_page(struct page *page) | |||
6041 | dump_page_flags(page->flags); | 6098 | dump_page_flags(page->flags); |
6042 | mem_cgroup_print_bad_page(page); | 6099 | mem_cgroup_print_bad_page(page); |
6043 | } | 6100 | } |
6101 | |||
6102 | /* reset zone->present_pages */ | ||
6103 | void reset_zone_present_pages(void) | ||
6104 | { | ||
6105 | struct zone *z; | ||
6106 | int i, nid; | ||
6107 | |||
6108 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
6109 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6110 | z = NODE_DATA(nid)->node_zones + i; | ||
6111 | z->present_pages = 0; | ||
6112 | } | ||
6113 | } | ||
6114 | } | ||
6115 | |||
6116 | /* calculate zone's present pages in buddy system */ | ||
6117 | void fixup_zone_present_pages(int nid, unsigned long start_pfn, | ||
6118 | unsigned long end_pfn) | ||
6119 | { | ||
6120 | struct zone *z; | ||
6121 | unsigned long zone_start_pfn, zone_end_pfn; | ||
6122 | int i; | ||
6123 | |||
6124 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6125 | z = NODE_DATA(nid)->node_zones + i; | ||
6126 | zone_start_pfn = z->zone_start_pfn; | ||
6127 | zone_end_pfn = zone_start_pfn + z->spanned_pages; | ||
6128 | |||
6129 | /* if the two regions intersect */ | ||
6130 | if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) | ||
6131 | z->present_pages += min(end_pfn, zone_end_pfn) - | ||
6132 | max(start_pfn, zone_start_pfn); | ||
6133 | } | ||
6134 | } | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f17573..f2f5b4818e9 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page) | |||
76 | 76 | ||
77 | out: | 77 | out: |
78 | if (!ret) { | 78 | if (!ret) { |
79 | unsigned long nr_pages; | ||
80 | int migratetype = get_pageblock_migratetype(page); | ||
81 | |||
79 | set_pageblock_isolate(page); | 82 | set_pageblock_isolate(page); |
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 83 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); |
84 | |||
85 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | ||
81 | } | 86 | } |
82 | 87 | ||
83 | spin_unlock_irqrestore(&zone->lock, flags); | 88 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -89,12 +94,14 @@ out: | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | 94 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
90 | { | 95 | { |
91 | struct zone *zone; | 96 | struct zone *zone; |
92 | unsigned long flags; | 97 | unsigned long flags, nr_pages; |
98 | |||
93 | zone = page_zone(page); | 99 | zone = page_zone(page); |
94 | spin_lock_irqsave(&zone->lock, flags); | 100 | spin_lock_irqsave(&zone->lock, flags); |
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 101 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
96 | goto out; | 102 | goto out; |
97 | move_freepages_block(zone, page, migratetype); | 103 | nr_pages = move_freepages_block(zone, page, migratetype); |
104 | __mod_zone_freepage_state(zone, nr_pages, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | 105 | restore_pageblock_isolate(page, migratetype); |
99 | out: | 106 | out: |
100 | spin_unlock_irqrestore(&zone->lock, flags); | 107 | spin_unlock_irqrestore(&zone->lock, flags); |
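Both isolation paths above now feed the number of pages actually moved by move_freepages_block() into __mod_zone_freepage_state(), so the watermark-visible free counters (and the CMA counter) stay in step with blocks entering and leaving MIGRATE_ISOLATE. A toy model of that bookkeeping, with counter names invented for the sketch:

#include <assert.h>

enum { MIGRATE_MOVABLE, MIGRATE_CMA, MIGRATE_ISOLATE };

/* Toy zone counters: the point is only that isolating a pageblock must
 * subtract its free pages from the watermark-visible counters, and
 * un-isolating must add them back, with CMA accounted separately. */
struct toy_zone {
	long nr_free_pages;
	long nr_free_cma;
};

static void mod_freepage_state(struct toy_zone *z, long nr, int migratetype)
{
	z->nr_free_pages += nr;
	if (migratetype == MIGRATE_CMA)
		z->nr_free_cma += nr;
}

int main(void)
{
	struct toy_zone zone = { .nr_free_pages = 1024, .nr_free_cma = 256 };
	long nr_pages = 128;		/* pages moved by the block move      */
	int migratetype = MIGRATE_CMA;	/* type the block had before isolate */

	/* set_migratetype_isolate(): block leaves the usable free lists */
	mod_freepage_state(&zone, -nr_pages, migratetype);
	assert(zone.nr_free_pages == 896 && zone.nr_free_cma == 128);

	/* unset_migratetype_isolate(): block becomes allocatable again */
	mod_freepage_state(&zone, nr_pages, migratetype);
	assert(zone.nr_free_pages == 1024 && zone.nr_free_cma == 256);
	return 0;
}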
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
193 | continue; | 200 | continue; |
194 | } | 201 | } |
195 | page = pfn_to_page(pfn); | 202 | page = pfn_to_page(pfn); |
196 | if (PageBuddy(page)) | 203 | if (PageBuddy(page)) { |
204 | /* | ||
205 | * If a race between isolation and allocation happens, | ||
206 | * some free pages could be in the MIGRATE_MOVABLE list | ||
207 | * although the pageblock's migration type of the page | ||
208 | * is MIGRATE_ISOLATE. Catch it and move the page into | ||
209 | * MIGRATE_ISOLATE list. | ||
210 | */ | ||
211 | if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { | ||
212 | struct page *end_page; | ||
213 | |||
214 | end_page = page + (1 << page_order(page)) - 1; | ||
215 | move_freepages(page_zone(page), page, end_page, | ||
216 | MIGRATE_ISOLATE); | ||
217 | } | ||
197 | pfn += 1 << page_order(page); | 218 | pfn += 1 << page_order(page); |
219 | } | ||
198 | else if (page_count(page) == 0 && | 220 | else if (page_count(page) == 0 && |
199 | page_private(page) == MIGRATE_ISOLATE) | 221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
200 | pfn += 1; | 222 | pfn += 1; |
201 | else | 223 | else |
202 | break; | 224 | break; |
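The PageBuddy branch above advances the scan a whole buddy block at a time (1 << page_order(page) pfns) after fixing up any stray free pages. A toy model of just that skip-by-order walk (the descriptors and flags are invented; the isolation fix-up is omitted):

#include <assert.h>
#include <stdio.h>

#define RANGE 16

/* Toy page descriptors: a free buddy head of order o covers 1 << o
 * consecutive pfns, so the scan can jump over the whole block at once. */
struct toy_page {
	int is_buddy_head;	/* head of a free buddy block        */
	int order;		/* valid only when is_buddy_head set */
};

/* Returns 1 if every pfn in [0, RANGE) is covered by free buddy blocks. */
static int range_is_free(const struct toy_page *pages)
{
	unsigned long pfn = 0;

	while (pfn < RANGE) {
		if (!pages[pfn].is_buddy_head)
			return 0;
		pfn += 1UL << pages[pfn].order;	/* skip the whole block */
	}
	return 1;
}

int main(void)
{
	struct toy_page pages[RANGE] = { 0 };

	/* one order-3 block followed by two order-2 blocks */
	pages[0]  = (struct toy_page){ 1, 3 };
	pages[8]  = (struct toy_page){ 1, 2 };
	pages[12] = (struct toy_page){ 1, 2 };
	assert(range_is_free(pages));

	pages[12].is_buddy_head = 0;		/* hole at pfn 12 */
	assert(!range_is_free(pages));
	printf("ok\n");
	return 0;
}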
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
233 | spin_unlock_irqrestore(&zone->lock, flags); | 255 | spin_unlock_irqrestore(&zone->lock, flags); |
234 | return ret ? 0 : -EBUSY; | 256 | return ret ? 0 : -EBUSY; |
235 | } | 257 | } |
258 | |||
259 | struct page *alloc_migrate_target(struct page *page, unsigned long private, | ||
260 | int **resultp) | ||
261 | { | ||
262 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
263 | |||
264 | if (PageHighMem(page)) | ||
265 | gfp_mask |= __GFP_HIGHMEM; | ||
266 | |||
267 | return alloc_page(gfp_mask); | ||
268 | } | ||
diff --git a/mm/percpu.c b/mm/percpu.c index bb4be7435ce..ddc5efb9c5b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1370 | 1370 | ||
1371 | #ifdef CONFIG_SMP | 1371 | #ifdef CONFIG_SMP |
1372 | 1372 | ||
1373 | const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { | 1373 | const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { |
1374 | [PCPU_FC_AUTO] = "auto", | 1374 | [PCPU_FC_AUTO] = "auto", |
1375 | [PCPU_FC_EMBED] = "embed", | 1375 | [PCPU_FC_EMBED] = "embed", |
1376 | [PCPU_FC_PAGE] = "page", | 1376 | [PCPU_FC_PAGE] = "page", |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0ddaa6fa..e642627da6b 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | |||
120 | } | 120 | } |
121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
122 | #endif | 122 | #endif |
123 | |||
124 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | ||
125 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
126 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | ||
127 | { | ||
128 | assert_spin_locked(&mm->page_table_lock); | ||
129 | |||
130 | /* FIFO */ | ||
131 | if (!mm->pmd_huge_pte) | ||
132 | INIT_LIST_HEAD(&pgtable->lru); | ||
133 | else | ||
134 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
135 | mm->pmd_huge_pte = pgtable; | ||
136 | } | ||
137 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
138 | #endif | ||
139 | |||
140 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW | ||
141 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
142 | /* no "address" argument so destroys page coloring of some arch */ | ||
143 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | ||
144 | { | ||
145 | pgtable_t pgtable; | ||
146 | |||
147 | assert_spin_locked(&mm->page_table_lock); | ||
148 | |||
149 | /* FIFO */ | ||
150 | pgtable = mm->pmd_huge_pte; | ||
151 | if (list_empty(&pgtable->lru)) | ||
152 | mm->pmd_huge_pte = NULL; | ||
153 | else { | ||
154 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
155 | struct page, lru); | ||
156 | list_del(&pgtable->lru); | ||
157 | } | ||
158 | return pgtable; | ||
159 | } | ||
160 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
161 | #endif | ||
162 | |||
163 | #ifndef __HAVE_ARCH_PMDP_INVALIDATE | ||
164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
165 | void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | ||
166 | pmd_t *pmdp) | ||
167 | { | ||
168 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); | ||
169 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
170 | } | ||
171 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
172 | #endif | ||
diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8..00000000000 --- a/mm/prio_tree.c +++ /dev/null | |||
@@ -1,208 +0,0 @@ | |||
1 | /* | ||
2 | * mm/prio_tree.c - priority search tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | * | ||
8 | * Based on the radix priority search tree proposed by Edward M. McCreight | ||
9 | * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 | ||
10 | * | ||
11 | * 02Feb2004 Initial version | ||
12 | */ | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/prio_tree.h> | ||
16 | #include <linux/prefetch.h> | ||
17 | |||
18 | /* | ||
19 | * See lib/prio_tree.c for details on the general radix priority search tree | ||
20 | * code. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * The following #defines are mirrored from lib/prio_tree.c. They're only used | ||
25 | * for debugging, and should be removed (along with the debugging code using | ||
26 | * them) when switching also VMAs to the regular prio_tree code. | ||
27 | */ | ||
28 | |||
29 | #define RADIX_INDEX(vma) ((vma)->vm_pgoff) | ||
30 | #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) | ||
31 | /* avoid overflow */ | ||
32 | #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) | ||
33 | |||
34 | /* | ||
35 | * Radix priority search tree for address_space->i_mmap | ||
36 | * | ||
37 | * For each vma that map a unique set of file pages i.e., unique [radix_index, | ||
38 | * heap_index] value, we have a corresponding priority search tree node. If | ||
39 | * multiple vmas have identical [radix_index, heap_index] value, then one of | ||
40 | * them is used as a tree node and others are stored in a vm_set list. The tree | ||
41 | * node points to the first vma (head) of the list using vm_set.head. | ||
42 | * | ||
43 | * prio_tree_root | ||
44 | * | | ||
45 | * A vm_set.head | ||
46 | * / \ / | ||
47 | * L R -> H-I-J-K-M-N-O-P-Q-S | ||
48 | * ^ ^ <-- vm_set.list --> | ||
49 | * tree nodes | ||
50 | * | ||
51 | * We need some way to identify whether a vma is a tree node, head of a vm_set | ||
52 | * list, or just a member of a vm_set list. We cannot use vm_flags to store | ||
53 | * such information. The reason is, in the above figure, it is possible that | ||
54 | * vm_flags' of R and H are covered by the different mmap_sems. When R is | ||
55 | * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold | ||
56 | * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. | ||
57 | * That's why some trick involving shared.vm_set.parent is used for identifying | ||
58 | * tree nodes and list head nodes. | ||
59 | * | ||
60 | * vma radix priority search tree node rules: | ||
61 | * | ||
62 | * vma->shared.vm_set.parent != NULL ==> a tree node | ||
63 | * vma->shared.vm_set.head != NULL ==> list of others mapping same range | ||
64 | * vma->shared.vm_set.head == NULL ==> no others map the same range | ||
65 | * | ||
66 | * vma->shared.vm_set.parent == NULL | ||
67 | * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range | ||
68 | * vma->shared.vm_set.head == NULL ==> a list node | ||
69 | */ | ||
70 | |||
71 | /* | ||
72 | * Add a new vma known to map the same set of pages as the old vma: | ||
73 | * useful for fork's dup_mmap as well as vma_prio_tree_insert below. | ||
74 | * Note that it just happens to work correctly on i_mmap_nonlinear too. | ||
75 | */ | ||
76 | void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) | ||
77 | { | ||
78 | /* Leave these BUG_ONs till prio_tree patch stabilizes */ | ||
79 | BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); | ||
80 | BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); | ||
81 | |||
82 | vma->shared.vm_set.head = NULL; | ||
83 | vma->shared.vm_set.parent = NULL; | ||
84 | |||
85 | if (!old->shared.vm_set.parent) | ||
86 | list_add(&vma->shared.vm_set.list, | ||
87 | &old->shared.vm_set.list); | ||
88 | else if (old->shared.vm_set.head) | ||
89 | list_add_tail(&vma->shared.vm_set.list, | ||
90 | &old->shared.vm_set.head->shared.vm_set.list); | ||
91 | else { | ||
92 | INIT_LIST_HEAD(&vma->shared.vm_set.list); | ||
93 | vma->shared.vm_set.head = old; | ||
94 | old->shared.vm_set.head = vma; | ||
95 | } | ||
96 | } | ||
97 | |||
98 | void vma_prio_tree_insert(struct vm_area_struct *vma, | ||
99 | struct prio_tree_root *root) | ||
100 | { | ||
101 | struct prio_tree_node *ptr; | ||
102 | struct vm_area_struct *old; | ||
103 | |||
104 | vma->shared.vm_set.head = NULL; | ||
105 | |||
106 | ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); | ||
107 | if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { | ||
108 | old = prio_tree_entry(ptr, struct vm_area_struct, | ||
109 | shared.prio_tree_node); | ||
110 | vma_prio_tree_add(vma, old); | ||
111 | } | ||
112 | } | ||
113 | |||
114 | void vma_prio_tree_remove(struct vm_area_struct *vma, | ||
115 | struct prio_tree_root *root) | ||
116 | { | ||
117 | struct vm_area_struct *node, *head, *new_head; | ||
118 | |||
119 | if (!vma->shared.vm_set.head) { | ||
120 | if (!vma->shared.vm_set.parent) | ||
121 | list_del_init(&vma->shared.vm_set.list); | ||
122 | else | ||
123 | raw_prio_tree_remove(root, &vma->shared.prio_tree_node); | ||
124 | } else { | ||
125 | /* Leave this BUG_ON till prio_tree patch stabilizes */ | ||
126 | BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); | ||
127 | if (vma->shared.vm_set.parent) { | ||
128 | head = vma->shared.vm_set.head; | ||
129 | if (!list_empty(&head->shared.vm_set.list)) { | ||
130 | new_head = list_entry( | ||
131 | head->shared.vm_set.list.next, | ||
132 | struct vm_area_struct, | ||
133 | shared.vm_set.list); | ||
134 | list_del_init(&head->shared.vm_set.list); | ||
135 | } else | ||
136 | new_head = NULL; | ||
137 | |||
138 | raw_prio_tree_replace(root, &vma->shared.prio_tree_node, | ||
139 | &head->shared.prio_tree_node); | ||
140 | head->shared.vm_set.head = new_head; | ||
141 | if (new_head) | ||
142 | new_head->shared.vm_set.head = head; | ||
143 | |||
144 | } else { | ||
145 | node = vma->shared.vm_set.head; | ||
146 | if (!list_empty(&vma->shared.vm_set.list)) { | ||
147 | new_head = list_entry( | ||
148 | vma->shared.vm_set.list.next, | ||
149 | struct vm_area_struct, | ||
150 | shared.vm_set.list); | ||
151 | list_del_init(&vma->shared.vm_set.list); | ||
152 | node->shared.vm_set.head = new_head; | ||
153 | new_head->shared.vm_set.head = node; | ||
154 | } else | ||
155 | node->shared.vm_set.head = NULL; | ||
156 | } | ||
157 | } | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Helper function to enumerate vmas that map a given file page or a set of | ||
162 | * contiguous file pages. The function returns vmas that at least map a single | ||
163 | * page in the given range of contiguous file pages. | ||
164 | */ | ||
165 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | ||
166 | struct prio_tree_iter *iter) | ||
167 | { | ||
168 | struct prio_tree_node *ptr; | ||
169 | struct vm_area_struct *next; | ||
170 | |||
171 | if (!vma) { | ||
172 | /* | ||
173 | * First call is with NULL vma | ||
174 | */ | ||
175 | ptr = prio_tree_next(iter); | ||
176 | if (ptr) { | ||
177 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
178 | shared.prio_tree_node); | ||
179 | prefetch(next->shared.vm_set.head); | ||
180 | return next; | ||
181 | } else | ||
182 | return NULL; | ||
183 | } | ||
184 | |||
185 | if (vma->shared.vm_set.parent) { | ||
186 | if (vma->shared.vm_set.head) { | ||
187 | next = vma->shared.vm_set.head; | ||
188 | prefetch(next->shared.vm_set.list.next); | ||
189 | return next; | ||
190 | } | ||
191 | } else { | ||
192 | next = list_entry(vma->shared.vm_set.list.next, | ||
193 | struct vm_area_struct, shared.vm_set.list); | ||
194 | if (!next->shared.vm_set.head) { | ||
195 | prefetch(next->shared.vm_set.list.next); | ||
196 | return next; | ||
197 | } | ||
198 | } | ||
199 | |||
200 | ptr = prio_tree_next(iter); | ||
201 | if (ptr) { | ||
202 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
203 | shared.prio_tree_node); | ||
204 | prefetch(next->shared.vm_set.head); | ||
205 | return next; | ||
206 | } else | ||
207 | return NULL; | ||
208 | } | ||
diff --git a/mm/readahead.c b/mm/readahead.c index ea8f8fa2164..7963f239123 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | 579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) |
580 | { | 580 | { |
581 | ssize_t ret; | 581 | ssize_t ret; |
582 | struct file *file; | 582 | struct fd f; |
583 | 583 | ||
584 | ret = -EBADF; | 584 | ret = -EBADF; |
585 | file = fget(fd); | 585 | f = fdget(fd); |
586 | if (file) { | 586 | if (f.file) { |
587 | if (file->f_mode & FMODE_READ) { | 587 | if (f.file->f_mode & FMODE_READ) { |
588 | struct address_space *mapping = file->f_mapping; | 588 | struct address_space *mapping = f.file->f_mapping; |
589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | 589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
591 | unsigned long len = end - start + 1; | 591 | unsigned long len = end - start + 1; |
592 | ret = do_readahead(mapping, file, start, len); | 592 | ret = do_readahead(mapping, f.file, start, len); |
593 | } | 593 | } |
594 | fput(file); | 594 | fdput(f); |
595 | } | 595 | } |
596 | return ret; | 596 | return ret; |
597 | } | 597 | } |
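The readahead syscall body converts a byte range into an inclusive page-index range before calling do_readahead(): start and end are the byte offsets shifted down by PAGE_CACHE_SHIFT, and the length is end - start + 1 pages. A quick check of that arithmetic, assuming 4 KiB pages:

#include <assert.h>

#define PAGE_CACHE_SHIFT 12	/* assume 4 KiB pages for the example */

int main(void)
{
	long long offset = 5000;	/* byte offset into the file  */
	unsigned long count = 10000;	/* bytes the caller asked for */

	unsigned long start = offset >> PAGE_CACHE_SHIFT;
	unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
	unsigned long len = end - start + 1;

	/* bytes 5000..14999 touch pages 1..3, i.e. 3 pages to read ahead */
	assert(start == 1 && end == 3 && len == 3);
	return 0;
}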
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
127 | avc->vma = vma; | 127 | avc->vma = vma; |
128 | avc->anon_vma = anon_vma; | 128 | avc->anon_vma = anon_vma; |
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 129 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
130 | 130 | anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); | |
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | 131 | } |
137 | 132 | ||
138 | /** | 133 | /** |
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
269 | } | 264 | } |
270 | 265 | ||
271 | /* | 266 | /* |
272 | * Some rmap walk that needs to find all ptes/hugepmds without false | ||
273 | * negatives (like migrate and split_huge_page) running concurrent | ||
274 | * with operations that copy or move pagetables (like mremap() and | ||
275 | * fork()) to be safe. They depend on the anon_vma "same_anon_vma" | ||
276 | * list to be in a certain order: the dst_vma must be placed after the | ||
277 | * src_vma in the list. This is always guaranteed by fork() but | ||
278 | * mremap() needs to call this function to enforce it in case the | ||
279 | * dst_vma isn't newly allocated and chained with the anon_vma_clone() | ||
280 | * function but just an extension of a pre-existing vma through | ||
281 | * vma_merge. | ||
282 | * | ||
283 | * NOTE: the same_anon_vma list can still be changed by other | ||
284 | * processes while mremap runs because mremap doesn't hold the | ||
285 | * anon_vma mutex to prevent modifications to the list while it | ||
286 | * runs. All we need to enforce is that the relative order of this | ||
287 | * process vmas isn't changing (we don't care about other vmas | ||
288 | * order). Each vma corresponds to an anon_vma_chain structure so | ||
289 | * there's no risk that other processes calling anon_vma_moveto_tail() | ||
290 | * and changing the same_anon_vma list under mremap() will screw with | ||
291 | * the relative order of this process vmas in the list, because we | ||
292 | * they can't alter the order of any vma that belongs to this | ||
293 | * process. And there can't be another anon_vma_moveto_tail() running | ||
294 | * concurrently with mremap() coming from this process because we hold | ||
295 | * the mmap_sem for the whole mremap(). fork() ordering dependency | ||
296 | * also shouldn't be affected because fork() only cares that the | ||
297 | * parent vmas are placed in the list before the child vmas and | ||
298 | * anon_vma_moveto_tail() won't reorder vmas from either the fork() | ||
299 | * parent or child. | ||
300 | */ | ||
301 | void anon_vma_moveto_tail(struct vm_area_struct *dst) | ||
302 | { | ||
303 | struct anon_vma_chain *pavc; | ||
304 | struct anon_vma *root = NULL; | ||
305 | |||
306 | list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | ||
307 | struct anon_vma *anon_vma = pavc->anon_vma; | ||
308 | VM_BUG_ON(pavc->vma != dst); | ||
309 | root = lock_anon_vma_root(root, anon_vma); | ||
310 | list_del(&pavc->same_anon_vma); | ||
311 | list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | ||
312 | } | ||
313 | unlock_anon_vma_root(root); | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 267 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
318 | * the corresponding VMA in the parent process is attached to. | 268 | * the corresponding VMA in the parent process is attached to. |
319 | * Returns 0 on success, non-zero on failure. | 269 | * Returns 0 on success, non-zero on failure. |
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
381 | struct anon_vma *anon_vma = avc->anon_vma; | 331 | struct anon_vma *anon_vma = avc->anon_vma; |
382 | 332 | ||
383 | root = lock_anon_vma_root(root, anon_vma); | 333 | root = lock_anon_vma_root(root, anon_vma); |
384 | list_del(&avc->same_anon_vma); | 334 | anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); |
385 | 335 | ||
386 | /* | 336 | /* |
387 | * Leave empty anon_vmas on the list - we'll need | 337 | * Leave empty anon_vmas on the list - we'll need |
388 | * to free them outside the lock. | 338 | * to free them outside the lock. |
389 | */ | 339 | */ |
390 | if (list_empty(&anon_vma->head)) | 340 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) |
391 | continue; | 341 | continue; |
392 | 342 | ||
393 | list_del(&avc->same_vma); | 343 | list_del(&avc->same_vma); |
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data) | |||
416 | 366 | ||
417 | mutex_init(&anon_vma->mutex); | 367 | mutex_init(&anon_vma->mutex); |
418 | atomic_set(&anon_vma->refcount, 0); | 368 | atomic_set(&anon_vma->refcount, 0); |
419 | INIT_LIST_HEAD(&anon_vma->head); | 369 | anon_vma->rb_root = RB_ROOT; |
420 | } | 370 | } |
421 | 371 | ||
422 | void __init anon_vma_init(void) | 372 | void __init anon_vma_init(void) |
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
560 | 510 | ||
561 | /* | 511 | /* |
562 | * At what user virtual address is page expected in @vma? | 512 | * At what user virtual address is page expected in @vma? |
563 | * Returns virtual address or -EFAULT if page's index/offset is not | ||
564 | * within the range mapped the @vma. | ||
565 | */ | 513 | */ |
566 | inline unsigned long | 514 | static inline unsigned long |
567 | vma_address(struct page *page, struct vm_area_struct *vma) | 515 | __vma_address(struct page *page, struct vm_area_struct *vma) |
568 | { | 516 | { |
569 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 517 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
570 | unsigned long address; | ||
571 | 518 | ||
572 | if (unlikely(is_vm_hugetlb_page(vma))) | 519 | if (unlikely(is_vm_hugetlb_page(vma))) |
573 | pgoff = page->index << huge_page_order(page_hstate(page)); | 520 | pgoff = page->index << huge_page_order(page_hstate(page)); |
574 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 521 | |
575 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 522 | return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
576 | /* page should be within @vma mapping range */ | 523 | } |
577 | return -EFAULT; | 524 | |
578 | } | 525 | inline unsigned long |
526 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
527 | { | ||
528 | unsigned long address = __vma_address(page, vma); | ||
529 | |||
530 | /* page should be within @vma mapping range */ | ||
531 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
532 | |||
579 | return address; | 533 | return address; |
580 | } | 534 | } |
581 | 535 | ||
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
585 | */ | 539 | */ |
586 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 540 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
587 | { | 541 | { |
542 | unsigned long address; | ||
588 | if (PageAnon(page)) { | 543 | if (PageAnon(page)) { |
589 | struct anon_vma *page__anon_vma = page_anon_vma(page); | 544 | struct anon_vma *page__anon_vma = page_anon_vma(page); |
590 | /* | 545 | /* |
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
600 | return -EFAULT; | 555 | return -EFAULT; |
601 | } else | 556 | } else |
602 | return -EFAULT; | 557 | return -EFAULT; |
603 | return vma_address(page, vma); | 558 | address = __vma_address(page, vma); |
559 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) | ||
560 | return -EFAULT; | ||
561 | return address; | ||
604 | } | 562 | } |
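The split above separates the pure linear mapping arithmetic (__vma_address()) from the range check: a caller such as page_address_in_vma() returns -EFAULT when the computed address falls outside [vm_start, vm_end), while vma_address() now treats an out-of-range result as a bug. A small model of the arithmetic, with a toy VMA struct invented for the sketch:

#include <assert.h>

#define PAGE_SHIFT 12

/* Toy VMA: maps file pages [vm_pgoff, vm_pgoff + npages) at vm_start. */
struct toy_vma {
	unsigned long vm_start, vm_end;	/* [vm_start, vm_end)     */
	unsigned long vm_pgoff;		/* file page at vm_start  */
};

/* Like __vma_address(): pure linear arithmetic, no range check. */
static unsigned long toy_vma_address(unsigned long pgoff,
				     const struct toy_vma *vma)
{
	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
	/* 16 pages mapped at 0x400000, starting at file page 10 */
	struct toy_vma vma = { 0x400000, 0x400000 + (16 << PAGE_SHIFT), 10 };
	unsigned long addr = toy_vma_address(12, &vma);

	assert(addr == 0x402000);	/* 2 pages into the mapping */
	assert(addr >= vma.vm_start && addr < vma.vm_end);

	/* A page outside the mapping still yields an address, but it
	 * falls outside [vm_start, vm_end); the callers above either
	 * return -EFAULT or skip the vma in that case. */
	addr = toy_vma_address(40, &vma);
	assert(!(addr >= vma.vm_start && addr < vma.vm_end));
	return 0;
}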
605 | 563 | ||
606 | /* | 564 | /* |
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
674 | pte_t *pte; | 632 | pte_t *pte; |
675 | spinlock_t *ptl; | 633 | spinlock_t *ptl; |
676 | 634 | ||
677 | address = vma_address(page, vma); | 635 | address = __vma_address(page, vma); |
678 | if (address == -EFAULT) /* out of vma range */ | 636 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) |
679 | return 0; | 637 | return 0; |
680 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | 638 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); |
681 | if (!pte) /* the page is not in this mm */ | 639 | if (!pte) /* the page is not in this mm */ |
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page, | |||
769 | { | 727 | { |
770 | unsigned int mapcount; | 728 | unsigned int mapcount; |
771 | struct anon_vma *anon_vma; | 729 | struct anon_vma *anon_vma; |
730 | pgoff_t pgoff; | ||
772 | struct anon_vma_chain *avc; | 731 | struct anon_vma_chain *avc; |
773 | int referenced = 0; | 732 | int referenced = 0; |
774 | 733 | ||
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page, | |||
777 | return referenced; | 736 | return referenced; |
778 | 737 | ||
779 | mapcount = page_mapcount(page); | 738 | mapcount = page_mapcount(page); |
780 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 739 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
740 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
781 | struct vm_area_struct *vma = avc->vma; | 741 | struct vm_area_struct *vma = avc->vma; |
782 | unsigned long address = vma_address(page, vma); | 742 | unsigned long address = vma_address(page, vma); |
783 | if (address == -EFAULT) | ||
784 | continue; | ||
785 | /* | 743 | /* |
786 | * If we are reclaiming on behalf of a cgroup, skip | 744 | * If we are reclaiming on behalf of a cgroup, skip |
787 | * counting on behalf of references from different | 745 | * counting on behalf of references from different |
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page, | |||
820 | struct address_space *mapping = page->mapping; | 778 | struct address_space *mapping = page->mapping; |
821 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 779 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
822 | struct vm_area_struct *vma; | 780 | struct vm_area_struct *vma; |
823 | struct prio_tree_iter iter; | ||
824 | int referenced = 0; | 781 | int referenced = 0; |
825 | 782 | ||
826 | /* | 783 | /* |
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page, | |||
846 | */ | 803 | */ |
847 | mapcount = page_mapcount(page); | 804 | mapcount = page_mapcount(page); |
848 | 805 | ||
849 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 806 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
850 | unsigned long address = vma_address(page, vma); | 807 | unsigned long address = vma_address(page, vma); |
851 | if (address == -EFAULT) | ||
852 | continue; | ||
853 | /* | 808 | /* |
854 | * If we are reclaiming on behalf of a cgroup, skip | 809 | * If we are reclaiming on behalf of a cgroup, skip |
855 | * counting on behalf of references from different | 810 | * counting on behalf of references from different |
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
929 | pte_t entry; | 884 | pte_t entry; |
930 | 885 | ||
931 | flush_cache_page(vma, address, pte_pfn(*pte)); | 886 | flush_cache_page(vma, address, pte_pfn(*pte)); |
932 | entry = ptep_clear_flush_notify(vma, address, pte); | 887 | entry = ptep_clear_flush(vma, address, pte); |
933 | entry = pte_wrprotect(entry); | 888 | entry = pte_wrprotect(entry); |
934 | entry = pte_mkclean(entry); | 889 | entry = pte_mkclean(entry); |
935 | set_pte_at(mm, address, pte, entry); | 890 | set_pte_at(mm, address, pte, entry); |
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
937 | } | 892 | } |
938 | 893 | ||
939 | pte_unmap_unlock(pte, ptl); | 894 | pte_unmap_unlock(pte, ptl); |
895 | |||
896 | if (ret) | ||
897 | mmu_notifier_invalidate_page(mm, address); | ||
940 | out: | 898 | out: |
941 | return ret; | 899 | return ret; |
942 | } | 900 | } |
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
945 | { | 903 | { |
946 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 904 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
947 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
948 | struct prio_tree_iter iter; | ||
949 | int ret = 0; | 906 | int ret = 0; |
950 | 907 | ||
951 | BUG_ON(PageAnon(page)); | 908 | BUG_ON(PageAnon(page)); |
952 | 909 | ||
953 | mutex_lock(&mapping->i_mmap_mutex); | 910 | mutex_lock(&mapping->i_mmap_mutex); |
954 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 911 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
955 | if (vma->vm_flags & VM_SHARED) { | 912 | if (vma->vm_flags & VM_SHARED) { |
956 | unsigned long address = vma_address(page, vma); | 913 | unsigned long address = vma_address(page, vma); |
957 | if (address == -EFAULT) | ||
958 | continue; | ||
959 | ret += page_mkclean_one(page, vma, address); | 914 | ret += page_mkclean_one(page, vma, address); |
960 | } | 915 | } |
961 | } | 916 | } |
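This hunk, like the other rmap conversions in the patch, replaces the prio_tree walk with vma_interval_tree_foreach() over the single page offset [pgoff, pgoff]. The sketch below models only the query semantics, using a linear scan and the standard overlap predicate; the kernel walks an augmented rbtree instead, and the struct here is invented for the example:

#include <assert.h>
#include <stddef.h>

/* Each mapping covers file pages [pgoff_first, pgoff_last], like the
 * interval keys derived from vm_pgoff and the vma size. */
struct toy_mapping {
	unsigned long pgoff_first, pgoff_last;
	const char *name;
};

/* Linear-scan stand-in for vma_interval_tree_foreach(..., pgoff, pgoff). */
static int count_mappers(const struct toy_mapping *maps, size_t n,
			 unsigned long first, unsigned long last)
{
	int hits = 0;
	size_t i;

	for (i = 0; i < n; i++)
		if (maps[i].pgoff_first <= last && first <= maps[i].pgoff_last)
			hits++;
	return hits;
}

int main(void)
{
	struct toy_mapping maps[] = {
		{  0,  9, "A" },	/* maps file pages 0..9 */
		{  5, 20, "B" },	/* overlaps A           */
		{ 30, 40, "C" },	/* far away             */
	};

	assert(count_mappers(maps, 3, 7, 7) == 2);	/* page 7: A and B */
	assert(count_mappers(maps, 3, 25, 25) == 0);	/* page 25: nobody */
	assert(count_mappers(maps, 3, 35, 35) == 1);	/* page 35: C only */
	return 0;
}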
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
1128 | else | 1083 | else |
1129 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1084 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1130 | __page_set_anon_rmap(page, vma, address, 1); | 1085 | __page_set_anon_rmap(page, vma, address, 1); |
1131 | if (page_evictable(page, vma)) | 1086 | if (!mlocked_vma_newpage(vma, page)) |
1132 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1087 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
1133 | else | 1088 | else |
1134 | add_page_to_unevictable_list(page); | 1089 | add_page_to_unevictable_list(page); |
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page) | |||
1203 | } else { | 1158 | } else { |
1204 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1159 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1205 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1160 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1161 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1206 | } | 1162 | } |
1163 | if (unlikely(PageMlocked(page))) | ||
1164 | clear_page_mlock(page); | ||
1207 | /* | 1165 | /* |
1208 | * It would be tidy to reset the PageAnon mapping here, | 1166 | * It would be tidy to reset the PageAnon mapping here, |
1209 | * but that might overwrite a racing page_add_anon_rmap | 1167 | * but that might overwrite a racing page_add_anon_rmap |
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page) | |||
1213 | * Leaving it set also helps swapoff to reinstate ptes | 1171 | * Leaving it set also helps swapoff to reinstate ptes |
1214 | * faster for those pages still in swapcache. | 1172 | * faster for those pages still in swapcache. |
1215 | */ | 1173 | */ |
1174 | return; | ||
1216 | out: | 1175 | out: |
1217 | if (!anon) | 1176 | if (!anon) |
1218 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1177 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1256 | 1215 | ||
1257 | /* Nuke the page table entry. */ | 1216 | /* Nuke the page table entry. */ |
1258 | flush_cache_page(vma, address, page_to_pfn(page)); | 1217 | flush_cache_page(vma, address, page_to_pfn(page)); |
1259 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1218 | pteval = ptep_clear_flush(vma, address, pte); |
1260 | 1219 | ||
1261 | /* Move the dirty bit to the physical page now the pte is gone. */ | 1220 | /* Move the dirty bit to the physical page now the pte is gone. */ |
1262 | if (pte_dirty(pteval)) | 1221 | if (pte_dirty(pteval)) |
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1318 | 1277 | ||
1319 | out_unmap: | 1278 | out_unmap: |
1320 | pte_unmap_unlock(pte, ptl); | 1279 | pte_unmap_unlock(pte, ptl); |
1280 | if (ret != SWAP_FAIL) | ||
1281 | mmu_notifier_invalidate_page(mm, address); | ||
1321 | out: | 1282 | out: |
1322 | return ret; | 1283 | return ret; |
1323 | 1284 | ||
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1382 | spinlock_t *ptl; | 1343 | spinlock_t *ptl; |
1383 | struct page *page; | 1344 | struct page *page; |
1384 | unsigned long address; | 1345 | unsigned long address; |
1346 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1347 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1385 | unsigned long end; | 1348 | unsigned long end; |
1386 | int ret = SWAP_AGAIN; | 1349 | int ret = SWAP_AGAIN; |
1387 | int locked_vma = 0; | 1350 | int locked_vma = 0; |
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1405 | if (!pmd_present(*pmd)) | 1368 | if (!pmd_present(*pmd)) |
1406 | return ret; | 1369 | return ret; |
1407 | 1370 | ||
1371 | mmun_start = address; | ||
1372 | mmun_end = end; | ||
1373 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1374 | |||
1408 | /* | 1375 | /* |
1409 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | 1376 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
1410 | * keep the sem while scanning the cluster for mlocking pages. | 1377 | * keep the sem while scanning the cluster for mlocking pages. |
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1438 | 1405 | ||
1439 | /* Nuke the page table entry. */ | 1406 | /* Nuke the page table entry. */ |
1440 | flush_cache_page(vma, address, pte_pfn(*pte)); | 1407 | flush_cache_page(vma, address, pte_pfn(*pte)); |
1441 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1408 | pteval = ptep_clear_flush(vma, address, pte); |
1442 | 1409 | ||
1443 | /* If nonlinear, store the file page offset in the pte. */ | 1410 | /* If nonlinear, store the file page offset in the pte. */ |
1444 | if (page->index != linear_page_index(vma, address)) | 1411 | if (page->index != linear_page_index(vma, address)) |
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1454 | (*mapcount)--; | 1421 | (*mapcount)--; |
1455 | } | 1422 | } |
1456 | pte_unmap_unlock(pte - 1, ptl); | 1423 | pte_unmap_unlock(pte - 1, ptl); |
1424 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1457 | if (locked_vma) | 1425 | if (locked_vma) |
1458 | up_read(&vma->vm_mm->mmap_sem); | 1426 | up_read(&vma->vm_mm->mmap_sem); |
1459 | return ret; | 1427 | return ret; |
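For the cluster case the notification is batched instead: the whole range is announced with mmu_notifier_invalidate_range_start()/..._end() around the PTE walk rather than one call per PTE. A minimal sketch of the bracketing, with an illustrative helper name:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void example_zap_range(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	unsigned long mmun_start = start;	/* For mmu_notifiers */
	unsigned long mmun_end = end;		/* For mmu_notifiers */

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	/* ... walk and clear the PTEs in [start, end) with ptep_clear_flush() ... */
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
}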
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) | |||
1492 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1460 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1493 | { | 1461 | { |
1494 | struct anon_vma *anon_vma; | 1462 | struct anon_vma *anon_vma; |
1463 | pgoff_t pgoff; | ||
1495 | struct anon_vma_chain *avc; | 1464 | struct anon_vma_chain *avc; |
1496 | int ret = SWAP_AGAIN; | 1465 | int ret = SWAP_AGAIN; |
1497 | 1466 | ||
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | if (!anon_vma) | 1468 | if (!anon_vma) |
1500 | return ret; | 1469 | return ret; |
1501 | 1470 | ||
1502 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1471 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1472 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1503 | struct vm_area_struct *vma = avc->vma; | 1473 | struct vm_area_struct *vma = avc->vma; |
1504 | unsigned long address; | 1474 | unsigned long address; |
1505 | 1475 | ||
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1516 | continue; | 1486 | continue; |
1517 | 1487 | ||
1518 | address = vma_address(page, vma); | 1488 | address = vma_address(page, vma); |
1519 | if (address == -EFAULT) | ||
1520 | continue; | ||
1521 | ret = try_to_unmap_one(page, vma, address, flags); | 1489 | ret = try_to_unmap_one(page, vma, address, flags); |
1522 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1490 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1523 | break; | 1491 | break; |
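The anon rmap walk now derives the page's offset and queries the anon_vma interval tree directly, so VMAs that cannot contain the page are never visited and the old address == -EFAULT skip disappears. A sketch of the lookup pattern, assuming the rmap-internal vma_address() helper seen in the hunk; example_walk_anon() is illustrative:

#include <linux/rmap.h>
#include <linux/pagemap.h>

static void example_walk_anon(struct page *page, struct anon_vma *anon_vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct anon_vma_chain *avc;

	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		/* Only VMAs whose range covers pgoff are returned, so the
		 * address is always valid for this page. */
		unsigned long address = vma_address(page, vma);
		(void)address;	/* act on the mapping here */
	}
}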
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1547 | struct address_space *mapping = page->mapping; | 1515 | struct address_space *mapping = page->mapping; |
1548 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1516 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1549 | struct vm_area_struct *vma; | 1517 | struct vm_area_struct *vma; |
1550 | struct prio_tree_iter iter; | ||
1551 | int ret = SWAP_AGAIN; | 1518 | int ret = SWAP_AGAIN; |
1552 | unsigned long cursor; | 1519 | unsigned long cursor; |
1553 | unsigned long max_nl_cursor = 0; | 1520 | unsigned long max_nl_cursor = 0; |
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1555 | unsigned int mapcount; | 1522 | unsigned int mapcount; |
1556 | 1523 | ||
1557 | mutex_lock(&mapping->i_mmap_mutex); | 1524 | mutex_lock(&mapping->i_mmap_mutex); |
1558 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1525 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1559 | unsigned long address = vma_address(page, vma); | 1526 | unsigned long address = vma_address(page, vma); |
1560 | if (address == -EFAULT) | ||
1561 | continue; | ||
1562 | ret = try_to_unmap_one(page, vma, address, flags); | 1527 | ret = try_to_unmap_one(page, vma, address, flags); |
1563 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1528 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1564 | goto out; | 1529 | goto out; |
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1576 | goto out; | 1541 | goto out; |
1577 | 1542 | ||
1578 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1543 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1579 | shared.vm_set.list) { | 1544 | shared.nonlinear) { |
1580 | cursor = (unsigned long) vma->vm_private_data; | 1545 | cursor = (unsigned long) vma->vm_private_data; |
1581 | if (cursor > max_nl_cursor) | 1546 | if (cursor > max_nl_cursor) |
1582 | max_nl_cursor = cursor; | 1547 | max_nl_cursor = cursor; |
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1608 | 1573 | ||
1609 | do { | 1574 | do { |
1610 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1575 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1611 | shared.vm_set.list) { | 1576 | shared.nonlinear) { |
1612 | cursor = (unsigned long) vma->vm_private_data; | 1577 | cursor = (unsigned long) vma->vm_private_data; |
1613 | while ( cursor < max_nl_cursor && | 1578 | while ( cursor < max_nl_cursor && |
1614 | cursor < vma->vm_end - vma->vm_start) { | 1579 | cursor < vma->vm_end - vma->vm_start) { |
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1631 | * in locked vmas). Reset cursor on all unreserved nonlinear | 1596 | * in locked vmas). Reset cursor on all unreserved nonlinear |
1632 | * vmas, now forgetting on which ones it had fallen behind. | 1597 | * vmas, now forgetting on which ones it had fallen behind. |
1633 | */ | 1598 | */ |
1634 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1599 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1635 | vma->vm_private_data = NULL; | 1600 | vma->vm_private_data = NULL; |
1636 | out: | 1601 | out: |
1637 | mutex_unlock(&mapping->i_mmap_mutex); | 1602 | mutex_unlock(&mapping->i_mmap_mutex); |
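File-backed rmap follows the same scheme: the prio tree iterator is gone, vma_interval_tree_foreach() walks mapping->i_mmap, and nonlinear VMAs hang off the renamed shared.nonlinear list head. A condensed sketch (example_walk_file() is illustrative, and vma_address() is again the rmap-internal helper):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void example_walk_file(struct page *page)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;

	mutex_lock(&mapping->i_mmap_mutex);

	/* Linear mappings: interval tree lookup, no iterator struct needed. */
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		(void)address;	/* act on the mapping here */
	}

	/* Nonlinear mappings: same list, renamed field. */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
		;	/* scan cluster by cluster, as try_to_unmap_file() does */

	mutex_unlock(&mapping->i_mmap_mutex);
}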
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1716 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1681 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1717 | { | 1682 | { |
1718 | struct anon_vma *anon_vma; | 1683 | struct anon_vma *anon_vma; |
1684 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1719 | struct anon_vma_chain *avc; | 1685 | struct anon_vma_chain *avc; |
1720 | int ret = SWAP_AGAIN; | 1686 | int ret = SWAP_AGAIN; |
1721 | 1687 | ||
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1729 | if (!anon_vma) | 1695 | if (!anon_vma) |
1730 | return ret; | 1696 | return ret; |
1731 | anon_vma_lock(anon_vma); | 1697 | anon_vma_lock(anon_vma); |
1732 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1698 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1733 | struct vm_area_struct *vma = avc->vma; | 1699 | struct vm_area_struct *vma = avc->vma; |
1734 | unsigned long address = vma_address(page, vma); | 1700 | unsigned long address = vma_address(page, vma); |
1735 | if (address == -EFAULT) | ||
1736 | continue; | ||
1737 | ret = rmap_one(page, vma, address, arg); | 1701 | ret = rmap_one(page, vma, address, arg); |
1738 | if (ret != SWAP_AGAIN) | 1702 | if (ret != SWAP_AGAIN) |
1739 | break; | 1703 | break; |
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1748 | struct address_space *mapping = page->mapping; | 1712 | struct address_space *mapping = page->mapping; |
1749 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1713 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1750 | struct vm_area_struct *vma; | 1714 | struct vm_area_struct *vma; |
1751 | struct prio_tree_iter iter; | ||
1752 | int ret = SWAP_AGAIN; | 1715 | int ret = SWAP_AGAIN; |
1753 | 1716 | ||
1754 | if (!mapping) | 1717 | if (!mapping) |
1755 | return ret; | 1718 | return ret; |
1756 | mutex_lock(&mapping->i_mmap_mutex); | 1719 | mutex_lock(&mapping->i_mmap_mutex); |
1757 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1720 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1758 | unsigned long address = vma_address(page, vma); | 1721 | unsigned long address = vma_address(page, vma); |
1759 | if (address == -EFAULT) | ||
1760 | continue; | ||
1761 | ret = rmap_one(page, vma, address, arg); | 1722 | ret = rmap_one(page, vma, address, arg); |
1762 | if (ret != SWAP_AGAIN) | 1723 | if (ret != SWAP_AGAIN) |
1763 | break; | 1724 | break; |
diff --git a/mm/shmem.c b/mm/shmem.c index d4e184e2a38..cc12072f878 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt; | |||
77 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ | 77 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ |
78 | #define SHORT_SYMLINK_LEN 128 | 78 | #define SHORT_SYMLINK_LEN 128 |
79 | 79 | ||
80 | struct shmem_xattr { | ||
81 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | ||
82 | char *name; /* xattr name */ | ||
83 | size_t size; | ||
84 | char value[0]; | ||
85 | }; | ||
86 | |||
87 | /* | 80 | /* |
88 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | 81 | * shmem_fallocate and shmem_writepage communicate via inode->i_private |
89 | * (with i_mutex making sure that it has only one user at a time): | 82 | * (with i_mutex making sure that it has only one user at a time): |
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
636 | static void shmem_evict_inode(struct inode *inode) | 629 | static void shmem_evict_inode(struct inode *inode) |
637 | { | 630 | { |
638 | struct shmem_inode_info *info = SHMEM_I(inode); | 631 | struct shmem_inode_info *info = SHMEM_I(inode); |
639 | struct shmem_xattr *xattr, *nxattr; | ||
640 | 632 | ||
641 | if (inode->i_mapping->a_ops == &shmem_aops) { | 633 | if (inode->i_mapping->a_ops == &shmem_aops) { |
642 | shmem_unacct_size(info->flags, inode->i_size); | 634 | shmem_unacct_size(info->flags, inode->i_size); |
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode) | |||
650 | } else | 642 | } else |
651 | kfree(info->symlink); | 643 | kfree(info->symlink); |
652 | 644 | ||
653 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | 645 | simple_xattrs_free(&info->xattrs); |
654 | kfree(xattr->name); | ||
655 | kfree(xattr); | ||
656 | } | ||
657 | BUG_ON(inode->i_blocks); | 646 | BUG_ON(inode->i_blocks); |
658 | shmem_free_inode(inode->i_sb); | 647 | shmem_free_inode(inode->i_sb); |
659 | clear_inode(inode); | 648 | clear_inode(inode); |
@@ -1350,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
1350 | { | 1339 | { |
1351 | file_accessed(file); | 1340 | file_accessed(file); |
1352 | vma->vm_ops = &shmem_vm_ops; | 1341 | vma->vm_ops = &shmem_vm_ops; |
1353 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1354 | return 0; | 1342 | return 0; |
1355 | } | 1343 | } |
1356 | 1344 | ||
@@ -1377,7 +1365,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1377 | spin_lock_init(&info->lock); | 1365 | spin_lock_init(&info->lock); |
1378 | info->flags = flags & VM_NORESERVE; | 1366 | info->flags = flags & VM_NORESERVE; |
1379 | INIT_LIST_HEAD(&info->swaplist); | 1367 | INIT_LIST_HEAD(&info->swaplist); |
1380 | INIT_LIST_HEAD(&info->xattr_list); | 1368 | simple_xattrs_init(&info->xattrs); |
1381 | cache_no_acl(inode); | 1369 | cache_no_acl(inode); |
1382 | 1370 | ||
1383 | switch (mode & S_IFMT) { | 1371 | switch (mode & S_IFMT) { |
@@ -2060,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
2060 | */ | 2048 | */ |
2061 | 2049 | ||
2062 | /* | 2050 | /* |
2063 | * Allocate new xattr and copy in the value; but leave the name to callers. | ||
2064 | */ | ||
2065 | static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) | ||
2066 | { | ||
2067 | struct shmem_xattr *new_xattr; | ||
2068 | size_t len; | ||
2069 | |||
2070 | /* wrap around? */ | ||
2071 | len = sizeof(*new_xattr) + size; | ||
2072 | if (len <= sizeof(*new_xattr)) | ||
2073 | return NULL; | ||
2074 | |||
2075 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
2076 | if (!new_xattr) | ||
2077 | return NULL; | ||
2078 | |||
2079 | new_xattr->size = size; | ||
2080 | memcpy(new_xattr->value, value, size); | ||
2081 | return new_xattr; | ||
2082 | } | ||
2083 | |||
2084 | /* | ||
2085 | * Callback for security_inode_init_security() for acquiring xattrs. | 2051 | * Callback for security_inode_init_security() for acquiring xattrs. |
2086 | */ | 2052 | */ |
2087 | static int shmem_initxattrs(struct inode *inode, | 2053 | static int shmem_initxattrs(struct inode *inode, |
@@ -2090,11 +2056,11 @@ static int shmem_initxattrs(struct inode *inode, | |||
2090 | { | 2056 | { |
2091 | struct shmem_inode_info *info = SHMEM_I(inode); | 2057 | struct shmem_inode_info *info = SHMEM_I(inode); |
2092 | const struct xattr *xattr; | 2058 | const struct xattr *xattr; |
2093 | struct shmem_xattr *new_xattr; | 2059 | struct simple_xattr *new_xattr; |
2094 | size_t len; | 2060 | size_t len; |
2095 | 2061 | ||
2096 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | 2062 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
2097 | new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); | 2063 | new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); |
2098 | if (!new_xattr) | 2064 | if (!new_xattr) |
2099 | return -ENOMEM; | 2065 | return -ENOMEM; |
2100 | 2066 | ||
@@ -2111,91 +2077,12 @@ static int shmem_initxattrs(struct inode *inode, | |||
2111 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, | 2077 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, |
2112 | xattr->name, len); | 2078 | xattr->name, len); |
2113 | 2079 | ||
2114 | spin_lock(&info->lock); | 2080 | simple_xattr_list_add(&info->xattrs, new_xattr); |
2115 | list_add(&new_xattr->list, &info->xattr_list); | ||
2116 | spin_unlock(&info->lock); | ||
2117 | } | 2081 | } |
2118 | 2082 | ||
2119 | return 0; | 2083 | return 0; |
2120 | } | 2084 | } |
2121 | 2085 | ||
2122 | static int shmem_xattr_get(struct dentry *dentry, const char *name, | ||
2123 | void *buffer, size_t size) | ||
2124 | { | ||
2125 | struct shmem_inode_info *info; | ||
2126 | struct shmem_xattr *xattr; | ||
2127 | int ret = -ENODATA; | ||
2128 | |||
2129 | info = SHMEM_I(dentry->d_inode); | ||
2130 | |||
2131 | spin_lock(&info->lock); | ||
2132 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2133 | if (strcmp(name, xattr->name)) | ||
2134 | continue; | ||
2135 | |||
2136 | ret = xattr->size; | ||
2137 | if (buffer) { | ||
2138 | if (size < xattr->size) | ||
2139 | ret = -ERANGE; | ||
2140 | else | ||
2141 | memcpy(buffer, xattr->value, xattr->size); | ||
2142 | } | ||
2143 | break; | ||
2144 | } | ||
2145 | spin_unlock(&info->lock); | ||
2146 | return ret; | ||
2147 | } | ||
2148 | |||
2149 | static int shmem_xattr_set(struct inode *inode, const char *name, | ||
2150 | const void *value, size_t size, int flags) | ||
2151 | { | ||
2152 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2153 | struct shmem_xattr *xattr; | ||
2154 | struct shmem_xattr *new_xattr = NULL; | ||
2155 | int err = 0; | ||
2156 | |||
2157 | /* value == NULL means remove */ | ||
2158 | if (value) { | ||
2159 | new_xattr = shmem_xattr_alloc(value, size); | ||
2160 | if (!new_xattr) | ||
2161 | return -ENOMEM; | ||
2162 | |||
2163 | new_xattr->name = kstrdup(name, GFP_KERNEL); | ||
2164 | if (!new_xattr->name) { | ||
2165 | kfree(new_xattr); | ||
2166 | return -ENOMEM; | ||
2167 | } | ||
2168 | } | ||
2169 | |||
2170 | spin_lock(&info->lock); | ||
2171 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2172 | if (!strcmp(name, xattr->name)) { | ||
2173 | if (flags & XATTR_CREATE) { | ||
2174 | xattr = new_xattr; | ||
2175 | err = -EEXIST; | ||
2176 | } else if (new_xattr) { | ||
2177 | list_replace(&xattr->list, &new_xattr->list); | ||
2178 | } else { | ||
2179 | list_del(&xattr->list); | ||
2180 | } | ||
2181 | goto out; | ||
2182 | } | ||
2183 | } | ||
2184 | if (flags & XATTR_REPLACE) { | ||
2185 | xattr = new_xattr; | ||
2186 | err = -ENODATA; | ||
2187 | } else { | ||
2188 | list_add(&new_xattr->list, &info->xattr_list); | ||
2189 | xattr = NULL; | ||
2190 | } | ||
2191 | out: | ||
2192 | spin_unlock(&info->lock); | ||
2193 | if (xattr) | ||
2194 | kfree(xattr->name); | ||
2195 | kfree(xattr); | ||
2196 | return err; | ||
2197 | } | ||
2198 | |||
2199 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 2086 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2200 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2087 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2201 | &generic_acl_access_handler, | 2088 | &generic_acl_access_handler, |
@@ -2226,6 +2113,7 @@ static int shmem_xattr_validate(const char *name) | |||
2226 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | 2113 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, |
2227 | void *buffer, size_t size) | 2114 | void *buffer, size_t size) |
2228 | { | 2115 | { |
2116 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | ||
2229 | int err; | 2117 | int err; |
2230 | 2118 | ||
2231 | /* | 2119 | /* |
@@ -2240,12 +2128,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | |||
2240 | if (err) | 2128 | if (err) |
2241 | return err; | 2129 | return err; |
2242 | 2130 | ||
2243 | return shmem_xattr_get(dentry, name, buffer, size); | 2131 | return simple_xattr_get(&info->xattrs, name, buffer, size); |
2244 | } | 2132 | } |
2245 | 2133 | ||
2246 | static int shmem_setxattr(struct dentry *dentry, const char *name, | 2134 | static int shmem_setxattr(struct dentry *dentry, const char *name, |
2247 | const void *value, size_t size, int flags) | 2135 | const void *value, size_t size, int flags) |
2248 | { | 2136 | { |
2137 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | ||
2249 | int err; | 2138 | int err; |
2250 | 2139 | ||
2251 | /* | 2140 | /* |
@@ -2260,15 +2149,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, | |||
2260 | if (err) | 2149 | if (err) |
2261 | return err; | 2150 | return err; |
2262 | 2151 | ||
2263 | if (size == 0) | 2152 | return simple_xattr_set(&info->xattrs, name, value, size, flags); |
2264 | value = ""; /* empty EA, do not remove */ | ||
2265 | |||
2266 | return shmem_xattr_set(dentry->d_inode, name, value, size, flags); | ||
2267 | |||
2268 | } | 2153 | } |
2269 | 2154 | ||
2270 | static int shmem_removexattr(struct dentry *dentry, const char *name) | 2155 | static int shmem_removexattr(struct dentry *dentry, const char *name) |
2271 | { | 2156 | { |
2157 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | ||
2272 | int err; | 2158 | int err; |
2273 | 2159 | ||
2274 | /* | 2160 | /* |
@@ -2283,45 +2169,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) | |||
2283 | if (err) | 2169 | if (err) |
2284 | return err; | 2170 | return err; |
2285 | 2171 | ||
2286 | return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); | 2172 | return simple_xattr_remove(&info->xattrs, name); |
2287 | } | ||
2288 | |||
2289 | static bool xattr_is_trusted(const char *name) | ||
2290 | { | ||
2291 | return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); | ||
2292 | } | 2173 | } |
2293 | 2174 | ||
2294 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | 2175 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) |
2295 | { | 2176 | { |
2296 | bool trusted = capable(CAP_SYS_ADMIN); | 2177 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); |
2297 | struct shmem_xattr *xattr; | 2178 | return simple_xattr_list(&info->xattrs, buffer, size); |
2298 | struct shmem_inode_info *info; | ||
2299 | size_t used = 0; | ||
2300 | |||
2301 | info = SHMEM_I(dentry->d_inode); | ||
2302 | |||
2303 | spin_lock(&info->lock); | ||
2304 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2305 | size_t len; | ||
2306 | |||
2307 | /* skip "trusted." attributes for unprivileged callers */ | ||
2308 | if (!trusted && xattr_is_trusted(xattr->name)) | ||
2309 | continue; | ||
2310 | |||
2311 | len = strlen(xattr->name) + 1; | ||
2312 | used += len; | ||
2313 | if (buffer) { | ||
2314 | if (size < used) { | ||
2315 | used = -ERANGE; | ||
2316 | break; | ||
2317 | } | ||
2318 | memcpy(buffer, xattr->name, len); | ||
2319 | buffer += len; | ||
2320 | } | ||
2321 | } | ||
2322 | spin_unlock(&info->lock); | ||
2323 | |||
2324 | return used; | ||
2325 | } | 2179 | } |
2326 | #endif /* CONFIG_TMPFS_XATTR */ | 2180 | #endif /* CONFIG_TMPFS_XATTR */ |
2327 | 2181 | ||
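The shmem-private xattr list, its spinlock and the hand-rolled get/set/list code are replaced by the generic simple_xattr helpers operating on info->xattrs, which do their own locking. A small round-trip sketch of that API, assuming it keeps the semantics of the code it replaces; the attribute name and example_* function are arbitrary:

#include <linux/xattr.h>
#include <linux/shmem_fs.h>

static int example_xattr_roundtrip(struct shmem_inode_info *info)
{
	char buf[16];
	int err;

	simple_xattrs_init(&info->xattrs);		/* done at inode creation */

	err = simple_xattr_set(&info->xattrs, "user.example",
			       "value", 5, 0);		/* 0 flags: create or replace */
	if (err)
		return err;

	err = simple_xattr_get(&info->xattrs, "user.example",
			       buf, sizeof(buf));	/* returns size or -ENODATA */
	if (err < 0)
		return err;

	err = simple_xattr_remove(&info->xattrs, "user.example");

	simple_xattrs_free(&info->xattrs);		/* done at inode eviction */
	return err;
}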
@@ -2788,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2788 | .set_policy = shmem_set_policy, | 2642 | .set_policy = shmem_set_policy, |
2789 | .get_policy = shmem_get_policy, | 2643 | .get_policy = shmem_get_policy, |
2790 | #endif | 2644 | #endif |
2645 | .remap_pages = generic_file_remap_pages, | ||
2791 | }; | 2646 | }; |
2792 | 2647 | ||
2793 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2648 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
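Nonlinear-mapping support is no longer advertised by setting VM_CAN_NONLINEAR in ->mmap(); instead the vm_operations_struct provides a .remap_pages method, generic_file_remap_pages() for ordinary page-cache backed mappings. A sketch with illustrative example_* names:

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct example_vm_ops = {
	.fault		= filemap_fault,
	.remap_pages	= generic_file_remap_pages,	/* replaces VM_CAN_NONLINEAR */
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &example_vm_ops;	/* no vm_flags |= VM_CAN_NONLINEAR */
	return 0;
}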
@@ -2981,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2981 | fput(vma->vm_file); | 2836 | fput(vma->vm_file); |
2982 | vma->vm_file = file; | 2837 | vma->vm_file = file; |
2983 | vma->vm_ops = &shmem_vm_ops; | 2838 | vma->vm_ops = &shmem_vm_ops; |
2984 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
2985 | return 0; | 2839 | return 0; |
2986 | } | 2840 | } |
2987 | 2841 | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
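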
498 | 498 | ||
499 | #endif | 499 | #endif |
500 | 500 | ||
501 | #ifdef CONFIG_TRACING | ||
502 | size_t slab_buffer_size(struct kmem_cache *cachep) | ||
503 | { | ||
504 | return cachep->size; | ||
505 | } | ||
506 | EXPORT_SYMBOL(slab_buffer_size); | ||
507 | #endif | ||
508 | |||
509 | /* | 501 | /* |
510 | * Do not go above this order unless 0 objects fit into the slab or | 502 | * Do not go above this order unless 0 objects fit into the slab or |
511 | * overridden on the command line. | 503 | * overridden on the command line. |
@@ -515,13 +507,6 @@ EXPORT_SYMBOL(slab_buffer_size); | |||
515 | static int slab_max_order = SLAB_MAX_ORDER_LO; | 507 | static int slab_max_order = SLAB_MAX_ORDER_LO; |
516 | static bool slab_max_order_set __initdata; | 508 | static bool slab_max_order_set __initdata; |
517 | 509 | ||
518 | static inline struct kmem_cache *page_get_cache(struct page *page) | ||
519 | { | ||
520 | page = compound_head(page); | ||
521 | BUG_ON(!PageSlab(page)); | ||
522 | return page->slab_cache; | ||
523 | } | ||
524 | |||
525 | static inline struct kmem_cache *virt_to_cache(const void *obj) | 510 | static inline struct kmem_cache *virt_to_cache(const void *obj) |
526 | { | 511 | { |
527 | struct page *page = virt_to_head_page(obj); | 512 | struct page *page = virt_to_head_page(obj); |
@@ -585,9 +570,9 @@ static struct arraycache_init initarray_generic = | |||
585 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 570 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
586 | 571 | ||
587 | /* internal cache of cache description objs */ | 572 | /* internal cache of cache description objs */ |
588 | static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; | 573 | static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; |
589 | static struct kmem_cache cache_cache = { | 574 | static struct kmem_cache kmem_cache_boot = { |
590 | .nodelists = cache_cache_nodelists, | 575 | .nodelists = kmem_cache_nodelists, |
591 | .batchcount = 1, | 576 | .batchcount = 1, |
592 | .limit = BOOT_CPUCACHE_ENTRIES, | 577 | .limit = BOOT_CPUCACHE_ENTRIES, |
593 | .shared = 1, | 578 | .shared = 1, |
@@ -810,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
810 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; | 795 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; |
811 | } | 796 | } |
812 | 797 | ||
798 | #if DEBUG | ||
813 | #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) | 799 | #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) |
814 | 800 | ||
815 | static void __slab_error(const char *function, struct kmem_cache *cachep, | 801 | static void __slab_error(const char *function, struct kmem_cache *cachep, |
@@ -818,7 +804,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
818 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 804 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
819 | function, cachep->name, msg); | 805 | function, cachep->name, msg); |
820 | dump_stack(); | 806 | dump_stack(); |
807 | add_taint(TAINT_BAD_PAGE); | ||
821 | } | 808 | } |
809 | #endif | ||
822 | 810 | ||
823 | /* | 811 | /* |
824 | * By default on NUMA we use alien caches to stage the freeing of | 812 | * By default on NUMA we use alien caches to stage the freeing of |
@@ -900,7 +888,7 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
900 | */ | 888 | */ |
901 | if (keventd_up() && reap_work->work.func == NULL) { | 889 | if (keventd_up() && reap_work->work.func == NULL) { |
902 | init_reap_node(cpu); | 890 | init_reap_node(cpu); |
903 | INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); | 891 | INIT_DEFERRABLE_WORK(reap_work, cache_reap); |
904 | schedule_delayed_work_on(cpu, reap_work, | 892 | schedule_delayed_work_on(cpu, reap_work, |
905 | __round_jiffies_relative(HZ, cpu)); | 893 | __round_jiffies_relative(HZ, cpu)); |
906 | } | 894 | } |
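INIT_DELAYED_WORK_DEFERRABLE() was renamed to INIT_DEFERRABLE_WORK() as part of the workqueue API cleanup; the behaviour, a delayed work item whose timer does not wake an idle CPU, is unchanged. A minimal usage sketch with illustrative names (the real cache_reap work item is per-CPU):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct delayed_work example_reap_work;

static void example_reap(struct work_struct *unused)
{
	/* periodic cache maintenance would go here */
}

static void example_start_timer(int cpu)
{
	INIT_DEFERRABLE_WORK(&example_reap_work, example_reap);
	schedule_delayed_work_on(cpu, &example_reap_work,
				 __round_jiffies_relative(HZ, cpu));
}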
@@ -983,7 +971,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | |||
983 | } | 971 | } |
984 | 972 | ||
985 | /* The caller cannot use PFMEMALLOC objects, find another one */ | 973 | /* The caller cannot use PFMEMALLOC objects, find another one */ |
986 | for (i = 1; i < ac->avail; i++) { | 974 | for (i = 0; i < ac->avail; i++) { |
987 | /* If a !PFMEMALLOC object is found, swap them */ | 975 | /* If a !PFMEMALLOC object is found, swap them */ |
988 | if (!is_obj_pfmemalloc(ac->entry[i])) { | 976 | if (!is_obj_pfmemalloc(ac->entry[i])) { |
989 | objp = ac->entry[i]; | 977 | objp = ac->entry[i]; |
@@ -1000,7 +988,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | |||
1000 | l3 = cachep->nodelists[numa_mem_id()]; | 988 | l3 = cachep->nodelists[numa_mem_id()]; |
1001 | if (!list_empty(&l3->slabs_free) && force_refill) { | 989 | if (!list_empty(&l3->slabs_free) && force_refill) { |
1002 | struct slab *slabp = virt_to_slab(objp); | 990 | struct slab *slabp = virt_to_slab(objp); |
1003 | ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); | 991 | ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); |
1004 | clear_obj_pfmemalloc(&objp); | 992 | clear_obj_pfmemalloc(&objp); |
1005 | recheck_pfmemalloc_active(cachep, ac); | 993 | recheck_pfmemalloc_active(cachep, ac); |
1006 | return objp; | 994 | return objp; |
@@ -1032,7 +1020,7 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | |||
1032 | { | 1020 | { |
1033 | if (unlikely(pfmemalloc_active)) { | 1021 | if (unlikely(pfmemalloc_active)) { |
1034 | /* Some pfmemalloc slabs exist, check if this is one */ | 1022 | /* Some pfmemalloc slabs exist, check if this is one */ |
1035 | struct page *page = virt_to_page(objp); | 1023 | struct page *page = virt_to_head_page(objp); |
1036 | if (PageSlabPfmemalloc(page)) | 1024 | if (PageSlabPfmemalloc(page)) |
1037 | set_obj_pfmemalloc(&objp); | 1025 | set_obj_pfmemalloc(&objp); |
1038 | } | 1026 | } |
@@ -1601,15 +1589,17 @@ void __init kmem_cache_init(void) | |||
1601 | int order; | 1589 | int order; |
1602 | int node; | 1590 | int node; |
1603 | 1591 | ||
1592 | kmem_cache = &kmem_cache_boot; | ||
1593 | |||
1604 | if (num_possible_nodes() == 1) | 1594 | if (num_possible_nodes() == 1) |
1605 | use_alien_caches = 0; | 1595 | use_alien_caches = 0; |
1606 | 1596 | ||
1607 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1597 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
1608 | kmem_list3_init(&initkmem_list3[i]); | 1598 | kmem_list3_init(&initkmem_list3[i]); |
1609 | if (i < MAX_NUMNODES) | 1599 | if (i < MAX_NUMNODES) |
1610 | cache_cache.nodelists[i] = NULL; | 1600 | kmem_cache->nodelists[i] = NULL; |
1611 | } | 1601 | } |
1612 | set_up_list3s(&cache_cache, CACHE_CACHE); | 1602 | set_up_list3s(kmem_cache, CACHE_CACHE); |
1613 | 1603 | ||
1614 | /* | 1604 | /* |
1615 | * Fragmentation resistance on low memory - only use bigger | 1605 | * Fragmentation resistance on low memory - only use bigger |
@@ -1621,9 +1611,9 @@ void __init kmem_cache_init(void) | |||
1621 | 1611 | ||
1622 | /* Bootstrap is tricky, because several objects are allocated | 1612 | /* Bootstrap is tricky, because several objects are allocated |
1623 | * from caches that do not exist yet: | 1613 | * from caches that do not exist yet: |
1624 | * 1) initialize the cache_cache cache: it contains the struct | 1614 | * 1) initialize the kmem_cache cache: it contains the struct |
1625 | * kmem_cache structures of all caches, except cache_cache itself: | 1615 | * kmem_cache structures of all caches, except kmem_cache itself: |
1626 | * cache_cache is statically allocated. | 1616 | * kmem_cache is statically allocated. |
1627 | * Initially an __init data area is used for the head array and the | 1617 | * Initially an __init data area is used for the head array and the |
1628 | * kmem_list3 structures, it's replaced with a kmalloc allocated | 1618 | * kmem_list3 structures, it's replaced with a kmalloc allocated |
1629 | * array at the end of the bootstrap. | 1619 | * array at the end of the bootstrap. |
@@ -1632,43 +1622,43 @@ void __init kmem_cache_init(void) | |||
1632 | * An __init data area is used for the head array. | 1622 | * An __init data area is used for the head array. |
1633 | * 3) Create the remaining kmalloc caches, with minimally sized | 1623 | * 3) Create the remaining kmalloc caches, with minimally sized |
1634 | * head arrays. | 1624 | * head arrays. |
1635 | * 4) Replace the __init data head arrays for cache_cache and the first | 1625 | * 4) Replace the __init data head arrays for kmem_cache and the first |
1636 | * kmalloc cache with kmalloc allocated arrays. | 1626 | * kmalloc cache with kmalloc allocated arrays. |
1637 | * 5) Replace the __init data for kmem_list3 for cache_cache and | 1627 | * 5) Replace the __init data for kmem_list3 for kmem_cache and |
1638 | * the other cache's with kmalloc allocated memory. | 1628 | * the other cache's with kmalloc allocated memory. |
1639 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. | 1629 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. |
1640 | */ | 1630 | */ |
1641 | 1631 | ||
1642 | node = numa_mem_id(); | 1632 | node = numa_mem_id(); |
1643 | 1633 | ||
1644 | /* 1) create the cache_cache */ | 1634 | /* 1) create the kmem_cache */ |
1645 | INIT_LIST_HEAD(&slab_caches); | 1635 | INIT_LIST_HEAD(&slab_caches); |
1646 | list_add(&cache_cache.list, &slab_caches); | 1636 | list_add(&kmem_cache->list, &slab_caches); |
1647 | cache_cache.colour_off = cache_line_size(); | 1637 | kmem_cache->colour_off = cache_line_size(); |
1648 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1638 | kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; |
1649 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1639 | kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
1650 | 1640 | ||
1651 | /* | 1641 | /* |
1652 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1642 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1653 | */ | 1643 | */ |
1654 | cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1644 | kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1655 | nr_node_ids * sizeof(struct kmem_list3 *); | 1645 | nr_node_ids * sizeof(struct kmem_list3 *); |
1656 | cache_cache.object_size = cache_cache.size; | 1646 | kmem_cache->object_size = kmem_cache->size; |
1657 | cache_cache.size = ALIGN(cache_cache.size, | 1647 | kmem_cache->size = ALIGN(kmem_cache->object_size, |
1658 | cache_line_size()); | 1648 | cache_line_size()); |
1659 | cache_cache.reciprocal_buffer_size = | 1649 | kmem_cache->reciprocal_buffer_size = |
1660 | reciprocal_value(cache_cache.size); | 1650 | reciprocal_value(kmem_cache->size); |
1661 | 1651 | ||
1662 | for (order = 0; order < MAX_ORDER; order++) { | 1652 | for (order = 0; order < MAX_ORDER; order++) { |
1663 | cache_estimate(order, cache_cache.size, | 1653 | cache_estimate(order, kmem_cache->size, |
1664 | cache_line_size(), 0, &left_over, &cache_cache.num); | 1654 | cache_line_size(), 0, &left_over, &kmem_cache->num); |
1665 | if (cache_cache.num) | 1655 | if (kmem_cache->num) |
1666 | break; | 1656 | break; |
1667 | } | 1657 | } |
1668 | BUG_ON(!cache_cache.num); | 1658 | BUG_ON(!kmem_cache->num); |
1669 | cache_cache.gfporder = order; | 1659 | kmem_cache->gfporder = order; |
1670 | cache_cache.colour = left_over / cache_cache.colour_off; | 1660 | kmem_cache->colour = left_over / kmem_cache->colour_off; |
1671 | cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + | 1661 | kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + |
1672 | sizeof(struct slab), cache_line_size()); | 1662 | sizeof(struct slab), cache_line_size()); |
1673 | 1663 | ||
1674 | /* 2+3) create the kmalloc caches */ | 1664 | /* 2+3) create the kmalloc caches */ |
@@ -1681,19 +1671,22 @@ void __init kmem_cache_init(void) | |||
1681 | * bug. | 1671 | * bug. |
1682 | */ | 1672 | */ |
1683 | 1673 | ||
1684 | sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, | 1674 | sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1685 | sizes[INDEX_AC].cs_size, | 1675 | sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; |
1686 | ARCH_KMALLOC_MINALIGN, | 1676 | sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; |
1687 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1677 | sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; |
1688 | NULL); | 1678 | sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; |
1679 | __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1680 | list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); | ||
1689 | 1681 | ||
1690 | if (INDEX_AC != INDEX_L3) { | 1682 | if (INDEX_AC != INDEX_L3) { |
1691 | sizes[INDEX_L3].cs_cachep = | 1683 | sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1692 | __kmem_cache_create(names[INDEX_L3].name, | 1684 | sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; |
1693 | sizes[INDEX_L3].cs_size, | 1685 | sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; |
1694 | ARCH_KMALLOC_MINALIGN, | 1686 | sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; |
1695 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1687 | sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; |
1696 | NULL); | 1688 | __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); |
1689 | list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); | ||
1697 | } | 1690 | } |
1698 | 1691 | ||
1699 | slab_early_init = 0; | 1692 | slab_early_init = 0; |
@@ -1707,20 +1700,23 @@ void __init kmem_cache_init(void) | |||
1707 | * allow tighter packing of the smaller caches. | 1700 | * allow tighter packing of the smaller caches. |
1708 | */ | 1701 | */ |
1709 | if (!sizes->cs_cachep) { | 1702 | if (!sizes->cs_cachep) { |
1710 | sizes->cs_cachep = __kmem_cache_create(names->name, | 1703 | sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1711 | sizes->cs_size, | 1704 | sizes->cs_cachep->name = names->name; |
1712 | ARCH_KMALLOC_MINALIGN, | 1705 | sizes->cs_cachep->size = sizes->cs_size; |
1713 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1706 | sizes->cs_cachep->object_size = sizes->cs_size; |
1714 | NULL); | 1707 | sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; |
1708 | __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1709 | list_add(&sizes->cs_cachep->list, &slab_caches); | ||
1715 | } | 1710 | } |
1716 | #ifdef CONFIG_ZONE_DMA | 1711 | #ifdef CONFIG_ZONE_DMA |
1717 | sizes->cs_dmacachep = __kmem_cache_create( | 1712 | sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1718 | names->name_dma, | 1713 | sizes->cs_dmacachep->name = names->name_dma; |
1719 | sizes->cs_size, | 1714 | sizes->cs_dmacachep->size = sizes->cs_size; |
1720 | ARCH_KMALLOC_MINALIGN, | 1715 | sizes->cs_dmacachep->object_size = sizes->cs_size; |
1721 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| | 1716 | sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; |
1722 | SLAB_PANIC, | 1717 | __kmem_cache_create(sizes->cs_dmacachep, |
1723 | NULL); | 1718 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); |
1719 | list_add(&sizes->cs_dmacachep->list, &slab_caches); | ||
1724 | #endif | 1720 | #endif |
1725 | sizes++; | 1721 | sizes++; |
1726 | names++; | 1722 | names++; |
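With the statically allocated cache of caches renamed to kmem_cache_boot and published through the common kmem_cache pointer, each early kmalloc cache is now built by hand: allocate a descriptor from kmem_cache, fill in name, sizes and alignment, run __kmem_cache_create() on it, and link it onto slab_caches. The pattern the hunks repeat, condensed into a hypothetical helper; this patch does not add such a function, and the sketch assumes the mm/slab.c context (kmem_cache, slab_caches, ARCH_KMALLOC_* are in scope):

static struct kmem_cache *example_boot_kmalloc_cache(const char *name,
						     size_t size)
{
	struct kmem_cache *s;

	s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);	/* from the boot cache */
	s->name = name;
	s->size = s->object_size = size;
	s->align = ARCH_KMALLOC_MINALIGN;
	/* SLAB_PANIC means a failure here panics, so the return value
	 * is not checked during early boot. */
	__kmem_cache_create(s, ARCH_KMALLOC_FLAGS | SLAB_PANIC);
	list_add(&s->list, &slab_caches);
	return s;
}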
@@ -1731,15 +1727,15 @@ void __init kmem_cache_init(void) | |||
1731 | 1727 | ||
1732 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1728 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1733 | 1729 | ||
1734 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 1730 | BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); |
1735 | memcpy(ptr, cpu_cache_get(&cache_cache), | 1731 | memcpy(ptr, cpu_cache_get(kmem_cache), |
1736 | sizeof(struct arraycache_init)); | 1732 | sizeof(struct arraycache_init)); |
1737 | /* | 1733 | /* |
1738 | * Do not assume that spinlocks can be initialized via memcpy: | 1734 | * Do not assume that spinlocks can be initialized via memcpy: |
1739 | */ | 1735 | */ |
1740 | spin_lock_init(&ptr->lock); | 1736 | spin_lock_init(&ptr->lock); |
1741 | 1737 | ||
1742 | cache_cache.array[smp_processor_id()] = ptr; | 1738 | kmem_cache->array[smp_processor_id()] = ptr; |
1743 | 1739 | ||
1744 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1740 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1745 | 1741 | ||
@@ -1760,7 +1756,7 @@ void __init kmem_cache_init(void) | |||
1760 | int nid; | 1756 | int nid; |
1761 | 1757 | ||
1762 | for_each_online_node(nid) { | 1758 | for_each_online_node(nid) { |
1763 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); | 1759 | init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); |
1764 | 1760 | ||
1765 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1761 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
1766 | &initkmem_list3[SIZE_AC + nid], nid); | 1762 | &initkmem_list3[SIZE_AC + nid], nid); |
@@ -1781,9 +1777,6 @@ void __init kmem_cache_init_late(void) | |||
1781 | 1777 | ||
1782 | slab_state = UP; | 1778 | slab_state = UP; |
1783 | 1779 | ||
1784 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1785 | init_lock_keys(); | ||
1786 | |||
1787 | /* 6) resize the head arrays to their final sizes */ | 1780 | /* 6) resize the head arrays to their final sizes */ |
1788 | mutex_lock(&slab_mutex); | 1781 | mutex_lock(&slab_mutex); |
1789 | list_for_each_entry(cachep, &slab_caches, list) | 1782 | list_for_each_entry(cachep, &slab_caches, list) |
@@ -1791,6 +1784,9 @@ void __init kmem_cache_init_late(void) | |||
1791 | BUG(); | 1784 | BUG(); |
1792 | mutex_unlock(&slab_mutex); | 1785 | mutex_unlock(&slab_mutex); |
1793 | 1786 | ||
1787 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1788 | init_lock_keys(); | ||
1789 | |||
1794 | /* Done! */ | 1790 | /* Done! */ |
1795 | slab_state = FULL; | 1791 | slab_state = FULL; |
1796 | 1792 | ||
@@ -2209,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | |||
2209 | } | 2205 | } |
2210 | } | 2206 | } |
2211 | 2207 | ||
2212 | static void __kmem_cache_destroy(struct kmem_cache *cachep) | ||
2213 | { | ||
2214 | int i; | ||
2215 | struct kmem_list3 *l3; | ||
2216 | |||
2217 | for_each_online_cpu(i) | ||
2218 | kfree(cachep->array[i]); | ||
2219 | |||
2220 | /* NUMA: free the list3 structures */ | ||
2221 | for_each_online_node(i) { | ||
2222 | l3 = cachep->nodelists[i]; | ||
2223 | if (l3) { | ||
2224 | kfree(l3->shared); | ||
2225 | free_alien_cache(l3->alien); | ||
2226 | kfree(l3); | ||
2227 | } | ||
2228 | } | ||
2229 | kmem_cache_free(&cache_cache, cachep); | ||
2230 | } | ||
2231 | |||
2232 | |||
2233 | /** | 2208 | /** |
2234 | * calculate_slab_order - calculate size (page order) of slabs | 2209 | * calculate_slab_order - calculate size (page order) of slabs |
2235 | * @cachep: pointer to the cache that is being created | 2210 | * @cachep: pointer to the cache that is being created |
@@ -2366,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2366 | * Cannot be called within a int, but can be interrupted. | 2341 | * Cannot be called within a int, but can be interrupted. |
2367 | * The @ctor is run when new pages are allocated by the cache. | 2342 | * The @ctor is run when new pages are allocated by the cache. |
2368 | * | 2343 | * |
2369 | * @name must be valid until the cache is destroyed. This implies that | ||
2370 | * the module calling this has to destroy the cache before getting unloaded. | ||
2371 | * | ||
2372 | * The flags are | 2344 | * The flags are |
2373 | * | 2345 | * |
2374 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | 2346 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) |
@@ -2381,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2381 | * cacheline. This can be beneficial if you're counting cycles as closely | 2353 | * cacheline. This can be beneficial if you're counting cycles as closely |
2382 | * as davem. | 2354 | * as davem. |
2383 | */ | 2355 | */ |
2384 | struct kmem_cache * | 2356 | int |
2385 | __kmem_cache_create (const char *name, size_t size, size_t align, | 2357 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) |
2386 | unsigned long flags, void (*ctor)(void *)) | ||
2387 | { | 2358 | { |
2388 | size_t left_over, slab_size, ralign; | 2359 | size_t left_over, slab_size, ralign; |
2389 | struct kmem_cache *cachep = NULL; | ||
2390 | gfp_t gfp; | 2360 | gfp_t gfp; |
2361 | int err; | ||
2362 | size_t size = cachep->size; | ||
2391 | 2363 | ||
2392 | #if DEBUG | 2364 | #if DEBUG |
2393 | #if FORCED_DEBUG | 2365 | #if FORCED_DEBUG |
@@ -2459,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2459 | ralign = ARCH_SLAB_MINALIGN; | 2431 | ralign = ARCH_SLAB_MINALIGN; |
2460 | } | 2432 | } |
2461 | /* 3) caller mandated alignment */ | 2433 | /* 3) caller mandated alignment */ |
2462 | if (ralign < align) { | 2434 | if (ralign < cachep->align) { |
2463 | ralign = align; | 2435 | ralign = cachep->align; |
2464 | } | 2436 | } |
2465 | /* disable debug if necessary */ | 2437 | /* disable debug if necessary */ |
2466 | if (ralign > __alignof__(unsigned long long)) | 2438 | if (ralign > __alignof__(unsigned long long)) |
@@ -2468,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2468 | /* | 2440 | /* |
2469 | * 4) Store it. | 2441 | * 4) Store it. |
2470 | */ | 2442 | */ |
2471 | align = ralign; | 2443 | cachep->align = ralign; |
2472 | 2444 | ||
2473 | if (slab_is_available()) | 2445 | if (slab_is_available()) |
2474 | gfp = GFP_KERNEL; | 2446 | gfp = GFP_KERNEL; |
2475 | else | 2447 | else |
2476 | gfp = GFP_NOWAIT; | 2448 | gfp = GFP_NOWAIT; |
2477 | 2449 | ||
2478 | /* Get cache's description obj. */ | ||
2479 | cachep = kmem_cache_zalloc(&cache_cache, gfp); | ||
2480 | if (!cachep) | ||
2481 | return NULL; | ||
2482 | |||
2483 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2450 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; |
2484 | cachep->object_size = size; | ||
2485 | cachep->align = align; | ||
2486 | #if DEBUG | 2451 | #if DEBUG |
2487 | 2452 | ||
2488 | /* | 2453 | /* |
@@ -2506,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2506 | } | 2471 | } |
2507 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2472 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2508 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size | 2473 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
2509 | && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { | 2474 | && cachep->object_size > cache_line_size() |
2510 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); | 2475 | && ALIGN(size, cachep->align) < PAGE_SIZE) { |
2476 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); | ||
2511 | size = PAGE_SIZE; | 2477 | size = PAGE_SIZE; |
2512 | } | 2478 | } |
2513 | #endif | 2479 | #endif |
@@ -2527,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2527 | */ | 2493 | */ |
2528 | flags |= CFLGS_OFF_SLAB; | 2494 | flags |= CFLGS_OFF_SLAB; |
2529 | 2495 | ||
2530 | size = ALIGN(size, align); | 2496 | size = ALIGN(size, cachep->align); |
2531 | 2497 | ||
2532 | left_over = calculate_slab_order(cachep, size, align, flags); | 2498 | left_over = calculate_slab_order(cachep, size, cachep->align, flags); |
2499 | |||
2500 | if (!cachep->num) | ||
2501 | return -E2BIG; | ||
2533 | 2502 | ||
2534 | if (!cachep->num) { | ||
2535 | printk(KERN_ERR | ||
2536 | "kmem_cache_create: couldn't create cache %s.\n", name); | ||
2537 | kmem_cache_free(&cache_cache, cachep); | ||
2538 | return NULL; | ||
2539 | } | ||
2540 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) | 2503 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
2541 | + sizeof(struct slab), align); | 2504 | + sizeof(struct slab), cachep->align); |
2542 | 2505 | ||
2543 | /* | 2506 | /* |
2544 | * If the slab has been placed off-slab, and we have enough space then | 2507 | * If the slab has been placed off-slab, and we have enough space then |
@@ -2566,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2566 | 2529 | ||
2567 | cachep->colour_off = cache_line_size(); | 2530 | cachep->colour_off = cache_line_size(); |
2568 | /* Offset must be a multiple of the alignment. */ | 2531 | /* Offset must be a multiple of the alignment. */ |
2569 | if (cachep->colour_off < align) | 2532 | if (cachep->colour_off < cachep->align) |
2570 | cachep->colour_off = align; | 2533 | cachep->colour_off = cachep->align; |
2571 | cachep->colour = left_over / cachep->colour_off; | 2534 | cachep->colour = left_over / cachep->colour_off; |
2572 | cachep->slab_size = slab_size; | 2535 | cachep->slab_size = slab_size; |
2573 | cachep->flags = flags; | 2536 | cachep->flags = flags; |
@@ -2588,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2588 | */ | 2551 | */ |
2589 | BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); | 2552 | BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); |
2590 | } | 2553 | } |
2591 | cachep->ctor = ctor; | ||
2592 | cachep->name = name; | ||
2593 | 2554 | ||
2594 | if (setup_cpu_cache(cachep, gfp)) { | 2555 | err = setup_cpu_cache(cachep, gfp); |
2595 | __kmem_cache_destroy(cachep); | 2556 | if (err) { |
2596 | return NULL; | 2557 | __kmem_cache_shutdown(cachep); |
2558 | return err; | ||
2597 | } | 2559 | } |
2598 | 2560 | ||
2599 | if (flags & SLAB_DEBUG_OBJECTS) { | 2561 | if (flags & SLAB_DEBUG_OBJECTS) { |
@@ -2606,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2606 | slab_set_debugobj_lock_classes(cachep); | 2568 | slab_set_debugobj_lock_classes(cachep); |
2607 | } | 2569 | } |
2608 | 2570 | ||
2609 | /* cache setup completed, link it into the list */ | 2571 | return 0; |
2610 | list_add(&cachep->list, &slab_caches); | ||
2611 | return cachep; | ||
2612 | } | 2572 | } |
2613 | 2573 | ||
2614 | #if DEBUG | 2574 | #if DEBUG |
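Taken together, the hunks above change the allocator-facing constructor contract: the descriptor is allocated and pre-filled by common code, and __kmem_cache_create() only finishes it, returning 0 or a negative errno (for example -E2BIG when not even one object fits in a slab). The old signature, kept here only as a comment for comparison, versus the new declaration:

struct kmem_cache;

/* Previously each allocator exported:
 *   struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
 *                                          size_t align, unsigned long flags,
 *                                          void (*ctor)(void *));
 * and returned the new cache or NULL. Now the common layer pre-fills
 * *cachep (name, size, object_size, align, ctor) and expects an error
 * code back instead of a pointer. */
int __kmem_cache_create(struct kmem_cache *cachep, unsigned long flags);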
@@ -2767,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
2767 | } | 2727 | } |
2768 | EXPORT_SYMBOL(kmem_cache_shrink); | 2728 | EXPORT_SYMBOL(kmem_cache_shrink); |
2769 | 2729 | ||
2770 | /** | 2730 | int __kmem_cache_shutdown(struct kmem_cache *cachep) |
2771 | * kmem_cache_destroy - delete a cache | ||
2772 | * @cachep: the cache to destroy | ||
2773 | * | ||
2774 | * Remove a &struct kmem_cache object from the slab cache. | ||
2775 | * | ||
2776 | * It is expected this function will be called by a module when it is | ||
2777 | * unloaded. This will remove the cache completely, and avoid a duplicate | ||
2778 | * cache being allocated each time a module is loaded and unloaded, if the | ||
2779 | * module doesn't have persistent in-kernel storage across loads and unloads. | ||
2780 | * | ||
2781 | * The cache must be empty before calling this function. | ||
2782 | * | ||
2783 | * The caller must guarantee that no one will allocate memory from the cache | ||
2784 | * during the kmem_cache_destroy(). | ||
2785 | */ | ||
2786 | void kmem_cache_destroy(struct kmem_cache *cachep) | ||
2787 | { | 2731 | { |
2788 | BUG_ON(!cachep || in_interrupt()); | 2732 | int i; |
2733 | struct kmem_list3 *l3; | ||
2734 | int rc = __cache_shrink(cachep); | ||
2789 | 2735 | ||
2790 | /* Find the cache in the chain of caches. */ | 2736 | if (rc) |
2791 | get_online_cpus(); | 2737 | return rc; |
2792 | mutex_lock(&slab_mutex); | ||
2793 | /* | ||
2794 | * the chain is never empty, cache_cache is never destroyed | ||
2795 | */ | ||
2796 | list_del(&cachep->list); | ||
2797 | if (__cache_shrink(cachep)) { | ||
2798 | slab_error(cachep, "Can't free all objects"); | ||
2799 | list_add(&cachep->list, &slab_caches); | ||
2800 | mutex_unlock(&slab_mutex); | ||
2801 | put_online_cpus(); | ||
2802 | return; | ||
2803 | } | ||
2804 | 2738 | ||
2805 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 2739 | for_each_online_cpu(i) |
2806 | rcu_barrier(); | 2740 | kfree(cachep->array[i]); |
2807 | 2741 | ||
2808 | __kmem_cache_destroy(cachep); | 2742 | /* NUMA: free the list3 structures */ |
2809 | mutex_unlock(&slab_mutex); | 2743 | for_each_online_node(i) { |
2810 | put_online_cpus(); | 2744 | l3 = cachep->nodelists[i]; |
2745 | if (l3) { | ||
2746 | kfree(l3->shared); | ||
2747 | free_alien_cache(l3->alien); | ||
2748 | kfree(l3); | ||
2749 | } | ||
2750 | } | ||
2751 | return 0; | ||
2811 | } | 2752 | } |
2812 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
2813 | 2753 | ||
2814 | /* | 2754 | /* |
2815 | * Get the memory for a slab management obj. | 2755 | * Get the memory for a slab management obj. |
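kmem_cache_destroy() itself moves to the common slab code; what remains here is __kmem_cache_shutdown(), which shrinks the cache and, if it is empty, frees the per-CPU arrays and per-node list3 structures, returning non-zero when live objects remain. A hypothetical, simplified sketch of how a generic destroy path is expected to use it (not the actual mm/slab_common.c code; slab_mutex, slab_caches and kmem_cache are assumed in scope):

#include <linux/cpu.h>
#include <linux/rcupdate.h>

void example_cache_destroy(struct kmem_cache *s)
{
	get_online_cpus();
	mutex_lock(&slab_mutex);
	list_del(&s->list);
	if (__kmem_cache_shutdown(s)) {
		/* live objects remain: re-link the cache and complain */
		list_add(&s->list, &slab_caches);
		printk(KERN_ERR "kmem_cache_destroy %s: cache still has objects\n",
		       s->name);
	} else {
		if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
			rcu_barrier();
		kmem_cache_free(kmem_cache, s);
	}
	mutex_unlock(&slab_mutex);
	put_online_cpus();
}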
@@ -3098,7 +3038,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | |||
3098 | } | 3038 | } |
3099 | 3039 | ||
3100 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | 3040 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, |
3101 | void *caller) | 3041 | unsigned long caller) |
3102 | { | 3042 | { |
3103 | struct page *page; | 3043 | struct page *page; |
3104 | unsigned int objnr; | 3044 | unsigned int objnr; |
@@ -3118,7 +3058,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3118 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 3058 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
3119 | } | 3059 | } |
3120 | if (cachep->flags & SLAB_STORE_USER) | 3060 | if (cachep->flags & SLAB_STORE_USER) |
3121 | *dbg_userword(cachep, objp) = caller; | 3061 | *dbg_userword(cachep, objp) = (void *)caller; |
3122 | 3062 | ||
3123 | objnr = obj_to_index(cachep, slabp, objp); | 3063 | objnr = obj_to_index(cachep, slabp, objp); |
3124 | 3064 | ||
@@ -3131,7 +3071,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3131 | if (cachep->flags & SLAB_POISON) { | 3071 | if (cachep->flags & SLAB_POISON) { |
3132 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3072 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3133 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 3073 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
3134 | store_stackinfo(cachep, objp, (unsigned long)caller); | 3074 | store_stackinfo(cachep, objp, caller); |
3135 | kernel_map_pages(virt_to_page(objp), | 3075 | kernel_map_pages(virt_to_page(objp), |
3136 | cachep->size / PAGE_SIZE, 0); | 3076 | cachep->size / PAGE_SIZE, 0); |
3137 | } else { | 3077 | } else { |
@@ -3260,6 +3200,7 @@ force_grow: | |||
3260 | 3200 | ||
3261 | /* cache_grow can reenable interrupts, then ac could change. */ | 3201 | /* cache_grow can reenable interrupts, then ac could change. */ |
3262 | ac = cpu_cache_get(cachep); | 3202 | ac = cpu_cache_get(cachep); |
3203 | node = numa_mem_id(); | ||
3263 | 3204 | ||
3264 | /* no objects in sight? abort */ | 3205 | /* no objects in sight? abort */ |
3265 | if (!x && (ac->avail == 0 || force_refill)) | 3206 | if (!x && (ac->avail == 0 || force_refill)) |
@@ -3284,7 +3225,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | |||
3284 | 3225 | ||
3285 | #if DEBUG | 3226 | #if DEBUG |
3286 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | 3227 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
3287 | gfp_t flags, void *objp, void *caller) | 3228 | gfp_t flags, void *objp, unsigned long caller) |
3288 | { | 3229 | { |
3289 | if (!objp) | 3230 | if (!objp) |
3290 | return objp; | 3231 | return objp; |
@@ -3301,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3301 | poison_obj(cachep, objp, POISON_INUSE); | 3242 | poison_obj(cachep, objp, POISON_INUSE); |
3302 | } | 3243 | } |
3303 | if (cachep->flags & SLAB_STORE_USER) | 3244 | if (cachep->flags & SLAB_STORE_USER) |
3304 | *dbg_userword(cachep, objp) = caller; | 3245 | *dbg_userword(cachep, objp) = (void *)caller; |
3305 | 3246 | ||
3306 | if (cachep->flags & SLAB_RED_ZONE) { | 3247 | if (cachep->flags & SLAB_RED_ZONE) { |
3307 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || | 3248 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || |
@@ -3342,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3342 | 3283 | ||
3343 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | 3284 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) |
3344 | { | 3285 | { |
3345 | if (cachep == &cache_cache) | 3286 | if (cachep == kmem_cache) |
3346 | return false; | 3287 | return false; |
3347 | 3288 | ||
3348 | return should_failslab(cachep->object_size, flags, cachep->flags); | 3289 | return should_failslab(cachep->object_size, flags, cachep->flags); |
@@ -3575,8 +3516,8 @@ done: | |||
3575 | * Fallback to other node is possible if __GFP_THISNODE is not set. | 3516 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3576 | */ | 3517 | */ |
3577 | static __always_inline void * | 3518 | static __always_inline void * |
3578 | __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | 3519 | slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, |
3579 | void *caller) | 3520 | unsigned long caller) |
3580 | { | 3521 | { |
3581 | unsigned long save_flags; | 3522 | unsigned long save_flags; |
3582 | void *ptr; | 3523 | void *ptr; |
@@ -3662,7 +3603,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3662 | #endif /* CONFIG_NUMA */ | 3603 | #endif /* CONFIG_NUMA */ |
3663 | 3604 | ||
3664 | static __always_inline void * | 3605 | static __always_inline void * |
3665 | __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | 3606 | slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) |
3666 | { | 3607 | { |
3667 | unsigned long save_flags; | 3608 | unsigned long save_flags; |
3668 | void *objp; | 3609 | void *objp; |
@@ -3798,7 +3739,7 @@ free_done: | |||
3798 | * be in this state _before_ it is released. Called with disabled ints. | 3739 | * be in this state _before_ it is released. Called with disabled ints. |
3799 | */ | 3740 | */ |
3800 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, | 3741 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, |
3801 | void *caller) | 3742 | unsigned long caller) |
3802 | { | 3743 | { |
3803 | struct array_cache *ac = cpu_cache_get(cachep); | 3744 | struct array_cache *ac = cpu_cache_get(cachep); |
3804 | 3745 | ||
@@ -3838,7 +3779,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3838 | */ | 3779 | */ |
3839 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3780 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3840 | { | 3781 | { |
3841 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3782 | void *ret = slab_alloc(cachep, flags, _RET_IP_); |
3842 | 3783 | ||
3843 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3784 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3844 | cachep->object_size, cachep->size, flags); | 3785 | cachep->object_size, cachep->size, flags); |
@@ -3849,14 +3790,14 @@ EXPORT_SYMBOL(kmem_cache_alloc); | |||
3849 | 3790 | ||
3850 | #ifdef CONFIG_TRACING | 3791 | #ifdef CONFIG_TRACING |
3851 | void * | 3792 | void * |
3852 | kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) | 3793 | kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) |
3853 | { | 3794 | { |
3854 | void *ret; | 3795 | void *ret; |
3855 | 3796 | ||
3856 | ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3797 | ret = slab_alloc(cachep, flags, _RET_IP_); |
3857 | 3798 | ||
3858 | trace_kmalloc(_RET_IP_, ret, | 3799 | trace_kmalloc(_RET_IP_, ret, |
3859 | size, slab_buffer_size(cachep), flags); | 3800 | size, cachep->size, flags); |
3860 | return ret; | 3801 | return ret; |
3861 | } | 3802 | } |
3862 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 3803 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
@@ -3865,8 +3806,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); | |||
3865 | #ifdef CONFIG_NUMA | 3806 | #ifdef CONFIG_NUMA |
3866 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3807 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
3867 | { | 3808 | { |
3868 | void *ret = __cache_alloc_node(cachep, flags, nodeid, | 3809 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3869 | __builtin_return_address(0)); | ||
3870 | 3810 | ||
3871 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3811 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3872 | cachep->object_size, cachep->size, | 3812 | cachep->object_size, cachep->size, |
@@ -3877,17 +3817,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3877 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3817 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3878 | 3818 | ||
3879 | #ifdef CONFIG_TRACING | 3819 | #ifdef CONFIG_TRACING |
3880 | void *kmem_cache_alloc_node_trace(size_t size, | 3820 | void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, |
3881 | struct kmem_cache *cachep, | ||
3882 | gfp_t flags, | 3821 | gfp_t flags, |
3883 | int nodeid) | 3822 | int nodeid, |
3823 | size_t size) | ||
3884 | { | 3824 | { |
3885 | void *ret; | 3825 | void *ret; |
3886 | 3826 | ||
3887 | ret = __cache_alloc_node(cachep, flags, nodeid, | 3827 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3888 | __builtin_return_address(0)); | 3828 | |
3889 | trace_kmalloc_node(_RET_IP_, ret, | 3829 | trace_kmalloc_node(_RET_IP_, ret, |
3890 | size, slab_buffer_size(cachep), | 3830 | size, cachep->size, |
3891 | flags, nodeid); | 3831 | flags, nodeid); |
3892 | return ret; | 3832 | return ret; |
3893 | } | 3833 | } |
@@ -3895,34 +3835,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | |||
3895 | #endif | 3835 | #endif |
3896 | 3836 | ||
3897 | static __always_inline void * | 3837 | static __always_inline void * |
3898 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | 3838 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) |
3899 | { | 3839 | { |
3900 | struct kmem_cache *cachep; | 3840 | struct kmem_cache *cachep; |
3901 | 3841 | ||
3902 | cachep = kmem_find_general_cachep(size, flags); | 3842 | cachep = kmem_find_general_cachep(size, flags); |
3903 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3843 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3904 | return cachep; | 3844 | return cachep; |
3905 | return kmem_cache_alloc_node_trace(size, cachep, flags, node); | 3845 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3906 | } | 3846 | } |
3907 | 3847 | ||
3908 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3848 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3909 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3849 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3910 | { | 3850 | { |
3911 | return __do_kmalloc_node(size, flags, node, | 3851 | return __do_kmalloc_node(size, flags, node, _RET_IP_); |
3912 | __builtin_return_address(0)); | ||
3913 | } | 3852 | } |
3914 | EXPORT_SYMBOL(__kmalloc_node); | 3853 | EXPORT_SYMBOL(__kmalloc_node); |
3915 | 3854 | ||
3916 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | 3855 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, |
3917 | int node, unsigned long caller) | 3856 | int node, unsigned long caller) |
3918 | { | 3857 | { |
3919 | return __do_kmalloc_node(size, flags, node, (void *)caller); | 3858 | return __do_kmalloc_node(size, flags, node, caller); |
3920 | } | 3859 | } |
3921 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | 3860 | EXPORT_SYMBOL(__kmalloc_node_track_caller); |
3922 | #else | 3861 | #else |
3923 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3862 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3924 | { | 3863 | { |
3925 | return __do_kmalloc_node(size, flags, node, NULL); | 3864 | return __do_kmalloc_node(size, flags, node, 0); |
3926 | } | 3865 | } |
3927 | EXPORT_SYMBOL(__kmalloc_node); | 3866 | EXPORT_SYMBOL(__kmalloc_node); |
3928 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ | 3867 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
@@ -3935,7 +3874,7 @@ EXPORT_SYMBOL(__kmalloc_node); | |||
3935 | * @caller: function caller for debug tracking of the caller | 3874 | * @caller: function caller for debug tracking of the caller |
3936 | */ | 3875 | */ |
3937 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3876 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3938 | void *caller) | 3877 | unsigned long caller) |
3939 | { | 3878 | { |
3940 | struct kmem_cache *cachep; | 3879 | struct kmem_cache *cachep; |
3941 | void *ret; | 3880 | void *ret; |
@@ -3948,9 +3887,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3948 | cachep = __find_general_cachep(size, flags); | 3887 | cachep = __find_general_cachep(size, flags); |
3949 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3888 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3950 | return cachep; | 3889 | return cachep; |
3951 | ret = __cache_alloc(cachep, flags, caller); | 3890 | ret = slab_alloc(cachep, flags, caller); |
3952 | 3891 | ||
3953 | trace_kmalloc((unsigned long) caller, ret, | 3892 | trace_kmalloc(caller, ret, |
3954 | size, cachep->size, flags); | 3893 | size, cachep->size, flags); |
3955 | 3894 | ||
3956 | return ret; | 3895 | return ret; |
@@ -3960,20 +3899,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3960 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3899 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3961 | void *__kmalloc(size_t size, gfp_t flags) | 3900 | void *__kmalloc(size_t size, gfp_t flags) |
3962 | { | 3901 | { |
3963 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3902 | return __do_kmalloc(size, flags, _RET_IP_); |
3964 | } | 3903 | } |
3965 | EXPORT_SYMBOL(__kmalloc); | 3904 | EXPORT_SYMBOL(__kmalloc); |
3966 | 3905 | ||
3967 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) | 3906 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) |
3968 | { | 3907 | { |
3969 | return __do_kmalloc(size, flags, (void *)caller); | 3908 | return __do_kmalloc(size, flags, caller); |
3970 | } | 3909 | } |
3971 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3910 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3972 | 3911 | ||
3973 | #else | 3912 | #else |
3974 | void *__kmalloc(size_t size, gfp_t flags) | 3913 | void *__kmalloc(size_t size, gfp_t flags) |
3975 | { | 3914 | { |
3976 | return __do_kmalloc(size, flags, NULL); | 3915 | return __do_kmalloc(size, flags, 0); |
3977 | } | 3916 | } |
3978 | EXPORT_SYMBOL(__kmalloc); | 3917 | EXPORT_SYMBOL(__kmalloc); |
3979 | #endif | 3918 | #endif |
@@ -3994,7 +3933,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3994 | debug_check_no_locks_freed(objp, cachep->object_size); | 3933 | debug_check_no_locks_freed(objp, cachep->object_size); |
3995 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3934 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3996 | debug_check_no_obj_freed(objp, cachep->object_size); | 3935 | debug_check_no_obj_freed(objp, cachep->object_size); |
3997 | __cache_free(cachep, objp, __builtin_return_address(0)); | 3936 | __cache_free(cachep, objp, _RET_IP_); |
3998 | local_irq_restore(flags); | 3937 | local_irq_restore(flags); |
3999 | 3938 | ||
4000 | trace_kmem_cache_free(_RET_IP_, objp); | 3939 | trace_kmem_cache_free(_RET_IP_, objp); |
@@ -4025,7 +3964,7 @@ void kfree(const void *objp) | |||
4025 | debug_check_no_locks_freed(objp, c->object_size); | 3964 | debug_check_no_locks_freed(objp, c->object_size); |
4026 | 3965 | ||
4027 | debug_check_no_obj_freed(objp, c->object_size); | 3966 | debug_check_no_obj_freed(objp, c->object_size); |
4028 | __cache_free(c, (void *)objp, __builtin_return_address(0)); | 3967 | __cache_free(c, (void *)objp, _RET_IP_); |
4029 | local_irq_restore(flags); | 3968 | local_irq_restore(flags); |
4030 | } | 3969 | } |
4031 | EXPORT_SYMBOL(kfree); | 3970 | EXPORT_SYMBOL(kfree); |
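A rough sketch of the convention the mm/slab.c hunks above converge on: the debug call site is now carried as an unsigned long taken from _RET_IP_ rather than a void * from __builtin_return_address(0). The helper traced_kmalloc() below is hypothetical and exists only to show the pattern; _RET_IP_ expands to (unsigned long)__builtin_return_address(0), so the call site travels as a plain integer down to trace_kmalloc() and the SLAB_STORE_USER bookkeeping.

#include <linux/kernel.h>
#include <linux/slab.h>

/* hypothetical wrapper, only to illustrate the unsigned long caller convention */
static void *traced_kmalloc(size_t size, gfp_t flags)
{
	/* records the call site of traced_kmalloc()'s caller as the allocation site */
	return __kmalloc_track_caller(size, flags, _RET_IP_);
}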
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -25,9 +25,26 @@ extern enum slab_state slab_state; | |||

25 | 25 | ||
26 | /* The slab cache mutex protects the management structures during changes */ | 26 | /* The slab cache mutex protects the management structures during changes */ |
27 | extern struct mutex slab_mutex; | 27 | extern struct mutex slab_mutex; |
28 | |||
29 | /* The list of all slab caches on the system */ | ||
28 | extern struct list_head slab_caches; | 30 | extern struct list_head slab_caches; |
29 | 31 | ||
30 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | 32 | /* The slab cache that manages slab cache information */ |
33 | extern struct kmem_cache *kmem_cache; | ||
34 | |||
35 | /* Functions provided by the slab allocators */ | ||
36 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); | ||
37 | |||
38 | #ifdef CONFIG_SLUB | ||
39 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | ||
31 | size_t align, unsigned long flags, void (*ctor)(void *)); | 40 | size_t align, unsigned long flags, void (*ctor)(void *)); |
41 | #else | ||
42 | static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | ||
43 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
44 | { return NULL; } | ||
45 | #endif | ||
46 | |||
47 | |||
48 | int __kmem_cache_shutdown(struct kmem_cache *); | ||
32 | 49 | ||
33 | #endif | 50 | #endif |
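The reworked mm/slab.h boils the per-allocator interface down to two hooks plus the SLUB-only alias lookup. The sketch below shows roughly what a minimal backend has to supply under this contract; the bodies are illustrative (essentially what the slob conversion further below does), not taken verbatim from any allocator, and the common code in slab_common.c now owns allocation of the struct kmem_cache and its name.

#include <linux/slab.h>
#include "slab.h"	/* the mm-internal header patched above */

int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
{
	s->flags = flags;
	/* backend-specific setup: per-cpu caches, per-node lists, ... */
	return 0;			/* 0 on success, -errno on error */
}

int __kmem_cache_shutdown(struct kmem_cache *s)
{
	/*
	 * Non-zero means objects are still live; the common
	 * kmem_cache_destroy() then re-adds the cache to slab_caches
	 * and warns instead of freeing it.
	 */
	return 0;
}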
diff --git a/mm/slab_common.c b/mm/slab_common.c index aa3ca5bb01b..9c217255ac4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -22,6 +22,53 @@ | |||
22 | enum slab_state slab_state; | 22 | enum slab_state slab_state; |
23 | LIST_HEAD(slab_caches); | 23 | LIST_HEAD(slab_caches); |
24 | DEFINE_MUTEX(slab_mutex); | 24 | DEFINE_MUTEX(slab_mutex); |
25 | struct kmem_cache *kmem_cache; | ||
26 | |||
27 | #ifdef CONFIG_DEBUG_VM | ||
28 | static int kmem_cache_sanity_check(const char *name, size_t size) | ||
29 | { | ||
30 | struct kmem_cache *s = NULL; | ||
31 | |||
32 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
33 | size > KMALLOC_MAX_SIZE) { | ||
34 | pr_err("kmem_cache_create(%s) integrity check failed\n", name); | ||
35 | return -EINVAL; | ||
36 | } | ||
37 | |||
38 | list_for_each_entry(s, &slab_caches, list) { | ||
39 | char tmp; | ||
40 | int res; | ||
41 | |||
42 | /* | ||
43 | * This happens when the module gets unloaded and doesn't | ||
44 | * destroy its slab cache and no-one else reuses the vmalloc | ||
45 | * area of the module. Print a warning. | ||
46 | */ | ||
47 | res = probe_kernel_address(s->name, tmp); | ||
48 | if (res) { | ||
49 | pr_err("Slab cache with size %d has lost its name\n", | ||
50 | s->object_size); | ||
51 | continue; | ||
52 | } | ||
53 | |||
54 | if (!strcmp(s->name, name)) { | ||
55 | pr_err("%s (%s): Cache name already exists.\n", | ||
56 | __func__, name); | ||
57 | dump_stack(); | ||
58 | s = NULL; | ||
59 | return -EINVAL; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
64 | return 0; | ||
65 | } | ||
66 | #else | ||
67 | static inline int kmem_cache_sanity_check(const char *name, size_t size) | ||
68 | { | ||
69 | return 0; | ||
70 | } | ||
71 | #endif | ||
25 | 72 | ||
26 | /* | 73 | /* |
27 | * kmem_cache_create - Create a cache. | 74 | * kmem_cache_create - Create a cache. |
@@ -52,68 +99,92 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
52 | unsigned long flags, void (*ctor)(void *)) | 99 | unsigned long flags, void (*ctor)(void *)) |
53 | { | 100 | { |
54 | struct kmem_cache *s = NULL; | 101 | struct kmem_cache *s = NULL; |
55 | 102 | int err = 0; | |
56 | #ifdef CONFIG_DEBUG_VM | ||
57 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
58 | size > KMALLOC_MAX_SIZE) { | ||
59 | printk(KERN_ERR "kmem_cache_create(%s) integrity check" | ||
60 | " failed\n", name); | ||
61 | goto out; | ||
62 | } | ||
63 | #endif | ||
64 | 103 | ||
65 | get_online_cpus(); | 104 | get_online_cpus(); |
66 | mutex_lock(&slab_mutex); | 105 | mutex_lock(&slab_mutex); |
67 | 106 | ||
68 | #ifdef CONFIG_DEBUG_VM | 107 | if (!kmem_cache_sanity_check(name, size) == 0) |
69 | list_for_each_entry(s, &slab_caches, list) { | 108 | goto out_locked; |
70 | char tmp; | ||
71 | int res; | ||
72 | 109 | ||
73 | /* | ||
74 | * This happens when the module gets unloaded and doesn't | ||
75 | * destroy its slab cache and no-one else reuses the vmalloc | ||
76 | * area of the module. Print a warning. | ||
77 | */ | ||
78 | res = probe_kernel_address(s->name, tmp); | ||
79 | if (res) { | ||
80 | printk(KERN_ERR | ||
81 | "Slab cache with size %d has lost its name\n", | ||
82 | s->object_size); | ||
83 | continue; | ||
84 | } | ||
85 | 110 | ||
86 | if (!strcmp(s->name, name)) { | 111 | s = __kmem_cache_alias(name, size, align, flags, ctor); |
87 | printk(KERN_ERR "kmem_cache_create(%s): Cache name" | 112 | if (s) |
88 | " already exists.\n", | 113 | goto out_locked; |
89 | name); | 114 | |
90 | dump_stack(); | 115 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); |
91 | s = NULL; | 116 | if (s) { |
92 | goto oops; | 117 | s->object_size = s->size = size; |
118 | s->align = align; | ||
119 | s->ctor = ctor; | ||
120 | s->name = kstrdup(name, GFP_KERNEL); | ||
121 | if (!s->name) { | ||
122 | kmem_cache_free(kmem_cache, s); | ||
123 | err = -ENOMEM; | ||
124 | goto out_locked; | ||
93 | } | 125 | } |
94 | } | ||
95 | 126 | ||
96 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | 127 | err = __kmem_cache_create(s, flags); |
97 | #endif | 128 | if (!err) { |
98 | 129 | ||
99 | s = __kmem_cache_create(name, size, align, flags, ctor); | 130 | s->refcount = 1; |
131 | list_add(&s->list, &slab_caches); | ||
100 | 132 | ||
101 | #ifdef CONFIG_DEBUG_VM | 133 | } else { |
102 | oops: | 134 | kfree(s->name); |
103 | #endif | 135 | kmem_cache_free(kmem_cache, s); |
136 | } | ||
137 | } else | ||
138 | err = -ENOMEM; | ||
139 | |||
140 | out_locked: | ||
104 | mutex_unlock(&slab_mutex); | 141 | mutex_unlock(&slab_mutex); |
105 | put_online_cpus(); | 142 | put_online_cpus(); |
106 | 143 | ||
107 | #ifdef CONFIG_DEBUG_VM | 144 | if (err) { |
108 | out: | 145 | |
109 | #endif | 146 | if (flags & SLAB_PANIC) |
110 | if (!s && (flags & SLAB_PANIC)) | 147 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", |
111 | panic("kmem_cache_create: Failed to create slab '%s'\n", name); | 148 | name, err); |
149 | else { | ||
150 | printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", | ||
151 | name, err); | ||
152 | dump_stack(); | ||
153 | } | ||
154 | |||
155 | return NULL; | ||
156 | } | ||
112 | 157 | ||
113 | return s; | 158 | return s; |
114 | } | 159 | } |
115 | EXPORT_SYMBOL(kmem_cache_create); | 160 | EXPORT_SYMBOL(kmem_cache_create); |
116 | 161 | ||
162 | void kmem_cache_destroy(struct kmem_cache *s) | ||
163 | { | ||
164 | get_online_cpus(); | ||
165 | mutex_lock(&slab_mutex); | ||
166 | s->refcount--; | ||
167 | if (!s->refcount) { | ||
168 | list_del(&s->list); | ||
169 | |||
170 | if (!__kmem_cache_shutdown(s)) { | ||
171 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
172 | rcu_barrier(); | ||
173 | |||
174 | kfree(s->name); | ||
175 | kmem_cache_free(kmem_cache, s); | ||
176 | } else { | ||
177 | list_add(&s->list, &slab_caches); | ||
178 | printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", | ||
179 | s->name); | ||
180 | dump_stack(); | ||
181 | } | ||
182 | } | ||
183 | mutex_unlock(&slab_mutex); | ||
184 | put_online_cpus(); | ||
185 | } | ||
186 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
187 | |||
117 | int slab_is_available(void) | 188 | int slab_is_available(void) |
118 | { | 189 | { |
119 | return slab_state >= UP; | 190 | return slab_state >= UP; |
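From the caller's side the API is unchanged, but error handling is now centralised in the common code: a failed create returns NULL (or panics under SLAB_PANIC) after logging the reason, and destroy warns and keeps the cache if objects remain. A usage sketch with made-up names (struct foo, foo_cache, foo_init/foo_exit):

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int a;
	int b;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	foo_cache = kmem_cache_create("foo", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)		/* the common code already logged why */
		return -ENOMEM;
	return 0;
}

static void __exit foo_exit(void)
{
	/* warns (and keeps the cache) if objects are still allocated */
	kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");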
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
194 | void *page; | 194 | void *page; |
195 | 195 | ||
196 | #ifdef CONFIG_NUMA | 196 | #ifdef CONFIG_NUMA |
197 | if (node != -1) | 197 | if (node != NUMA_NO_NODE) |
198 | page = alloc_pages_exact_node(node, gfp, order); | 198 | page = alloc_pages_exact_node(node, gfp, order); |
199 | else | 199 | else |
200 | #endif | 200 | #endif |
@@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
290 | * If there's a node specification, search for a partial | 290 | * If there's a node specification, search for a partial |
291 | * page with a matching node id in the freelist. | 291 | * page with a matching node id in the freelist. |
292 | */ | 292 | */ |
293 | if (node != -1 && page_to_nid(sp) != node) | 293 | if (node != NUMA_NO_NODE && page_to_nid(sp) != node) |
294 | continue; | 294 | continue; |
295 | #endif | 295 | #endif |
296 | /* Enough room on this page? */ | 296 | /* Enough room on this page? */ |
@@ -425,7 +425,8 @@ out: | |||
425 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. | 425 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. |
426 | */ | 426 | */ |
427 | 427 | ||
428 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) | 428 | static __always_inline void * |
429 | __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) | ||
429 | { | 430 | { |
430 | unsigned int *m; | 431 | unsigned int *m; |
431 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 432 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
@@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
446 | *m = size; | 447 | *m = size; |
447 | ret = (void *)m + align; | 448 | ret = (void *)m + align; |
448 | 449 | ||
449 | trace_kmalloc_node(_RET_IP_, ret, | 450 | trace_kmalloc_node(caller, ret, |
450 | size, size + align, gfp, node); | 451 | size, size + align, gfp, node); |
451 | } else { | 452 | } else { |
452 | unsigned int order = get_order(size); | 453 | unsigned int order = get_order(size); |
@@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
460 | page->private = size; | 461 | page->private = size; |
461 | } | 462 | } |
462 | 463 | ||
463 | trace_kmalloc_node(_RET_IP_, ret, | 464 | trace_kmalloc_node(caller, ret, |
464 | size, PAGE_SIZE << order, gfp, node); | 465 | size, PAGE_SIZE << order, gfp, node); |
465 | } | 466 | } |
466 | 467 | ||
467 | kmemleak_alloc(ret, size, 1, gfp); | 468 | kmemleak_alloc(ret, size, 1, gfp); |
468 | return ret; | 469 | return ret; |
469 | } | 470 | } |
471 | |||
472 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) | ||
473 | { | ||
474 | return __do_kmalloc_node(size, gfp, node, _RET_IP_); | ||
475 | } | ||
470 | EXPORT_SYMBOL(__kmalloc_node); | 476 | EXPORT_SYMBOL(__kmalloc_node); |
471 | 477 | ||
478 | #ifdef CONFIG_TRACING | ||
479 | void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) | ||
480 | { | ||
481 | return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); | ||
482 | } | ||
483 | |||
484 | #ifdef CONFIG_NUMA | ||
485 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, | ||
486 | int node, unsigned long caller) | ||
487 | { | ||
488 | return __do_kmalloc_node(size, gfp, node, caller); | ||
489 | } | ||
490 | #endif | ||
491 | #endif | ||
492 | |||
472 | void kfree(const void *block) | 493 | void kfree(const void *block) |
473 | { | 494 | { |
474 | struct page *sp; | 495 | struct page *sp; |
@@ -508,44 +529,24 @@ size_t ksize(const void *block) | |||
508 | } | 529 | } |
509 | EXPORT_SYMBOL(ksize); | 530 | EXPORT_SYMBOL(ksize); |
510 | 531 | ||
511 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | 532 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) |
512 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
513 | { | 533 | { |
514 | struct kmem_cache *c; | 534 | size_t align = c->size; |
515 | |||
516 | c = slob_alloc(sizeof(struct kmem_cache), | ||
517 | GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); | ||
518 | 535 | ||
519 | if (c) { | 536 | if (flags & SLAB_DESTROY_BY_RCU) { |
520 | c->name = name; | 537 | /* leave room for rcu footer at the end of object */ |
521 | c->size = size; | 538 | c->size += sizeof(struct slob_rcu); |
522 | if (flags & SLAB_DESTROY_BY_RCU) { | ||
523 | /* leave room for rcu footer at the end of object */ | ||
524 | c->size += sizeof(struct slob_rcu); | ||
525 | } | ||
526 | c->flags = flags; | ||
527 | c->ctor = ctor; | ||
528 | /* ignore alignment unless it's forced */ | ||
529 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
530 | if (c->align < ARCH_SLAB_MINALIGN) | ||
531 | c->align = ARCH_SLAB_MINALIGN; | ||
532 | if (c->align < align) | ||
533 | c->align = align; | ||
534 | |||
535 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | ||
536 | c->refcount = 1; | ||
537 | } | 539 | } |
538 | return c; | 540 | c->flags = flags; |
539 | } | 541 | /* ignore alignment unless it's forced */ |
542 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
543 | if (c->align < ARCH_SLAB_MINALIGN) | ||
544 | c->align = ARCH_SLAB_MINALIGN; | ||
545 | if (c->align < align) | ||
546 | c->align = align; | ||
540 | 547 | ||
541 | void kmem_cache_destroy(struct kmem_cache *c) | 548 | return 0; |
542 | { | ||
543 | kmemleak_free(c); | ||
544 | if (c->flags & SLAB_DESTROY_BY_RCU) | ||
545 | rcu_barrier(); | ||
546 | slob_free(c, sizeof(struct kmem_cache)); | ||
547 | } | 549 | } |
548 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
549 | 550 | ||
550 | void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | 551 | void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) |
551 | { | 552 | { |
@@ -613,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c) | |||
613 | } | 614 | } |
614 | EXPORT_SYMBOL(kmem_cache_size); | 615 | EXPORT_SYMBOL(kmem_cache_size); |
615 | 616 | ||
617 | int __kmem_cache_shutdown(struct kmem_cache *c) | ||
618 | { | ||
619 | /* No way to check for remaining objects */ | ||
620 | return 0; | ||
621 | } | ||
622 | |||
616 | int kmem_cache_shrink(struct kmem_cache *d) | 623 | int kmem_cache_shrink(struct kmem_cache *d) |
617 | { | 624 | { |
618 | return 0; | 625 | return 0; |
619 | } | 626 | } |
620 | EXPORT_SYMBOL(kmem_cache_shrink); | 627 | EXPORT_SYMBOL(kmem_cache_shrink); |
621 | 628 | ||
629 | struct kmem_cache kmem_cache_boot = { | ||
630 | .name = "kmem_cache", | ||
631 | .size = sizeof(struct kmem_cache), | ||
632 | .flags = SLAB_PANIC, | ||
633 | .align = ARCH_KMALLOC_MINALIGN, | ||
634 | }; | ||
635 | |||
622 | void __init kmem_cache_init(void) | 636 | void __init kmem_cache_init(void) |
623 | { | 637 | { |
638 | kmem_cache = &kmem_cache_boot; | ||
624 | slab_state = UP; | 639 | slab_state = UP; |
625 | } | 640 | } |
626 | 641 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *); | |||
210 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } | 210 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } |
211 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) | 211 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) |
212 | { return 0; } | 212 | { return 0; } |
213 | static inline void sysfs_slab_remove(struct kmem_cache *s) | 213 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } |
214 | { | ||
215 | kfree(s->name); | ||
216 | kfree(s); | ||
217 | } | ||
218 | 214 | ||
219 | #endif | 215 | #endif |
220 | 216 | ||
@@ -568,6 +564,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) | |||
568 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); | 564 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); |
569 | printk(KERN_ERR "----------------------------------------" | 565 | printk(KERN_ERR "----------------------------------------" |
570 | "-------------------------------------\n\n"); | 566 | "-------------------------------------\n\n"); |
567 | |||
568 | add_taint(TAINT_BAD_PAGE); | ||
571 | } | 569 | } |
572 | 570 | ||
573 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | 571 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) |
@@ -624,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page, | |||
624 | print_trailer(s, page, object); | 622 | print_trailer(s, page, object); |
625 | } | 623 | } |
626 | 624 | ||
627 | static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) | 625 | static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) |
628 | { | 626 | { |
629 | va_list args; | 627 | va_list args; |
630 | char buf[100]; | 628 | char buf[100]; |
@@ -1069,13 +1067,13 @@ bad: | |||
1069 | return 0; | 1067 | return 0; |
1070 | } | 1068 | } |
1071 | 1069 | ||
1072 | static noinline int free_debug_processing(struct kmem_cache *s, | 1070 | static noinline struct kmem_cache_node *free_debug_processing( |
1073 | struct page *page, void *object, unsigned long addr) | 1071 | struct kmem_cache *s, struct page *page, void *object, |
1072 | unsigned long addr, unsigned long *flags) | ||
1074 | { | 1073 | { |
1075 | unsigned long flags; | 1074 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1076 | int rc = 0; | ||
1077 | 1075 | ||
1078 | local_irq_save(flags); | 1076 | spin_lock_irqsave(&n->list_lock, *flags); |
1079 | slab_lock(page); | 1077 | slab_lock(page); |
1080 | 1078 | ||
1081 | if (!check_slab(s, page)) | 1079 | if (!check_slab(s, page)) |
@@ -1113,15 +1111,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1113 | set_track(s, object, TRACK_FREE, addr); | 1111 | set_track(s, object, TRACK_FREE, addr); |
1114 | trace(s, page, object, 0); | 1112 | trace(s, page, object, 0); |
1115 | init_object(s, object, SLUB_RED_INACTIVE); | 1113 | init_object(s, object, SLUB_RED_INACTIVE); |
1116 | rc = 1; | ||
1117 | out: | 1114 | out: |
1118 | slab_unlock(page); | 1115 | slab_unlock(page); |
1119 | local_irq_restore(flags); | 1116 | /* |
1120 | return rc; | 1117 | * Keep node_lock to preserve integrity |
1118 | * until the object is actually freed | ||
1119 | */ | ||
1120 | return n; | ||
1121 | 1121 | ||
1122 | fail: | 1122 | fail: |
1123 | slab_unlock(page); | ||
1124 | spin_unlock_irqrestore(&n->list_lock, *flags); | ||
1123 | slab_fix(s, "Object at 0x%p not freed", object); | 1125 | slab_fix(s, "Object at 0x%p not freed", object); |
1124 | goto out; | 1126 | return NULL; |
1125 | } | 1127 | } |
1126 | 1128 | ||
1127 | static int __init setup_slub_debug(char *str) | 1129 | static int __init setup_slub_debug(char *str) |
@@ -1214,8 +1216,9 @@ static inline void setup_object_debug(struct kmem_cache *s, | |||
1214 | static inline int alloc_debug_processing(struct kmem_cache *s, | 1216 | static inline int alloc_debug_processing(struct kmem_cache *s, |
1215 | struct page *page, void *object, unsigned long addr) { return 0; } | 1217 | struct page *page, void *object, unsigned long addr) { return 0; } |
1216 | 1218 | ||
1217 | static inline int free_debug_processing(struct kmem_cache *s, | 1219 | static inline struct kmem_cache_node *free_debug_processing( |
1218 | struct page *page, void *object, unsigned long addr) { return 0; } | 1220 | struct kmem_cache *s, struct page *page, void *object, |
1221 | unsigned long addr, unsigned long *flags) { return NULL; } | ||
1219 | 1222 | ||
1220 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | 1223 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) |
1221 | { return 1; } | 1224 | { return 1; } |
@@ -1524,12 +1527,13 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
1524 | } | 1527 | } |
1525 | 1528 | ||
1526 | static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); | 1529 | static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); |
1530 | static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); | ||
1527 | 1531 | ||
1528 | /* | 1532 | /* |
1529 | * Try to allocate a partial slab from a specific node. | 1533 | * Try to allocate a partial slab from a specific node. |
1530 | */ | 1534 | */ |
1531 | static void *get_partial_node(struct kmem_cache *s, | 1535 | static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, |
1532 | struct kmem_cache_node *n, struct kmem_cache_cpu *c) | 1536 | struct kmem_cache_cpu *c, gfp_t flags) |
1533 | { | 1537 | { |
1534 | struct page *page, *page2; | 1538 | struct page *page, *page2; |
1535 | void *object = NULL; | 1539 | void *object = NULL; |
@@ -1545,9 +1549,13 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1545 | 1549 | ||
1546 | spin_lock(&n->list_lock); | 1550 | spin_lock(&n->list_lock); |
1547 | list_for_each_entry_safe(page, page2, &n->partial, lru) { | 1551 | list_for_each_entry_safe(page, page2, &n->partial, lru) { |
1548 | void *t = acquire_slab(s, n, page, object == NULL); | 1552 | void *t; |
1549 | int available; | 1553 | int available; |
1550 | 1554 | ||
1555 | if (!pfmemalloc_match(page, flags)) | ||
1556 | continue; | ||
1557 | |||
1558 | t = acquire_slab(s, n, page, object == NULL); | ||
1551 | if (!t) | 1559 | if (!t) |
1552 | break; | 1560 | break; |
1553 | 1561 | ||
@@ -1614,7 +1622,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1614 | 1622 | ||
1615 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1623 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1616 | n->nr_partial > s->min_partial) { | 1624 | n->nr_partial > s->min_partial) { |
1617 | object = get_partial_node(s, n, c); | 1625 | object = get_partial_node(s, n, c, flags); |
1618 | if (object) { | 1626 | if (object) { |
1619 | /* | 1627 | /* |
1620 | * Return the object even if | 1628 | * Return the object even if |
@@ -1643,7 +1651,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, | |||
1643 | void *object; | 1651 | void *object; |
1644 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1652 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1645 | 1653 | ||
1646 | object = get_partial_node(s, get_node(s, searchnode), c); | 1654 | object = get_partial_node(s, get_node(s, searchnode), c, flags); |
1647 | if (object || node != NUMA_NO_NODE) | 1655 | if (object || node != NUMA_NO_NODE) |
1648 | return object; | 1656 | return object; |
1649 | 1657 | ||
@@ -1709,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1709 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | 1717 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); |
1710 | } | 1718 | } |
1711 | 1719 | ||
1712 | void init_kmem_cache_cpus(struct kmem_cache *s) | 1720 | static void init_kmem_cache_cpus(struct kmem_cache *s) |
1713 | { | 1721 | { |
1714 | int cpu; | 1722 | int cpu; |
1715 | 1723 | ||
@@ -1934,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s) | |||
1934 | * If we did not find a slot then simply move all the partials to the | 1942 | * If we did not find a slot then simply move all the partials to the |
1935 | * per node partial list. | 1943 | * per node partial list. |
1936 | */ | 1944 | */ |
1937 | int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | 1945 | static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) |
1938 | { | 1946 | { |
1939 | struct page *oldpage; | 1947 | struct page *oldpage; |
1940 | int pages; | 1948 | int pages; |
@@ -1957,6 +1965,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
1957 | local_irq_save(flags); | 1965 | local_irq_save(flags); |
1958 | unfreeze_partials(s); | 1966 | unfreeze_partials(s); |
1959 | local_irq_restore(flags); | 1967 | local_irq_restore(flags); |
1968 | oldpage = NULL; | ||
1960 | pobjects = 0; | 1969 | pobjects = 0; |
1961 | pages = 0; | 1970 | pages = 0; |
1962 | stat(s, CPU_PARTIAL_DRAIN); | 1971 | stat(s, CPU_PARTIAL_DRAIN); |
@@ -2305,7 +2314,7 @@ new_slab: | |||
2305 | * | 2314 | * |
2306 | * Otherwise we can simply pick the next object from the lockless free list. | 2315 | * Otherwise we can simply pick the next object from the lockless free list. |
2307 | */ | 2316 | */ |
2308 | static __always_inline void *slab_alloc(struct kmem_cache *s, | 2317 | static __always_inline void *slab_alloc_node(struct kmem_cache *s, |
2309 | gfp_t gfpflags, int node, unsigned long addr) | 2318 | gfp_t gfpflags, int node, unsigned long addr) |
2310 | { | 2319 | { |
2311 | void **object; | 2320 | void **object; |
@@ -2375,9 +2384,15 @@ redo: | |||
2375 | return object; | 2384 | return object; |
2376 | } | 2385 | } |
2377 | 2386 | ||
2387 | static __always_inline void *slab_alloc(struct kmem_cache *s, | ||
2388 | gfp_t gfpflags, unsigned long addr) | ||
2389 | { | ||
2390 | return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); | ||
2391 | } | ||
2392 | |||
2378 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | 2393 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) |
2379 | { | 2394 | { |
2380 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2395 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2381 | 2396 | ||
2382 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); | 2397 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); |
2383 | 2398 | ||
@@ -2388,7 +2403,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); | |||
2388 | #ifdef CONFIG_TRACING | 2403 | #ifdef CONFIG_TRACING |
2389 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | 2404 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) |
2390 | { | 2405 | { |
2391 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2406 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2392 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2407 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
2393 | return ret; | 2408 | return ret; |
2394 | } | 2409 | } |
@@ -2406,7 +2421,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); | |||
2406 | #ifdef CONFIG_NUMA | 2421 | #ifdef CONFIG_NUMA |
2407 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | 2422 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) |
2408 | { | 2423 | { |
2409 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2424 | void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); |
2410 | 2425 | ||
2411 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 2426 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
2412 | s->object_size, s->size, gfpflags, node); | 2427 | s->object_size, s->size, gfpflags, node); |
@@ -2420,7 +2435,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
2420 | gfp_t gfpflags, | 2435 | gfp_t gfpflags, |
2421 | int node, size_t size) | 2436 | int node, size_t size) |
2422 | { | 2437 | { |
2423 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2438 | void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); |
2424 | 2439 | ||
2425 | trace_kmalloc_node(_RET_IP_, ret, | 2440 | trace_kmalloc_node(_RET_IP_, ret, |
2426 | size, s->size, gfpflags, node); | 2441 | size, s->size, gfpflags, node); |
@@ -2452,7 +2467,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2452 | 2467 | ||
2453 | stat(s, FREE_SLOWPATH); | 2468 | stat(s, FREE_SLOWPATH); |
2454 | 2469 | ||
2455 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2470 | if (kmem_cache_debug(s) && |
2471 | !(n = free_debug_processing(s, page, x, addr, &flags))) | ||
2456 | return; | 2472 | return; |
2457 | 2473 | ||
2458 | do { | 2474 | do { |
@@ -2607,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
2607 | 2623 | ||
2608 | page = virt_to_head_page(x); | 2624 | page = virt_to_head_page(x); |
2609 | 2625 | ||
2626 | if (kmem_cache_debug(s) && page->slab != s) { | ||
2627 | pr_err("kmem_cache_free: Wrong slab cache. %s but object" | ||
2628 | " is from %s\n", page->slab->name, s->name); | ||
2629 | WARN_ON_ONCE(1); | ||
2630 | return; | ||
2631 | } | ||
2632 | |||
2610 | slab_free(s, page, x, _RET_IP_); | 2633 | slab_free(s, page, x, _RET_IP_); |
2611 | 2634 | ||
2612 | trace_kmem_cache_free(_RET_IP_, x); | 2635 | trace_kmem_cache_free(_RET_IP_, x); |
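A short illustration of the misuse the new check catches; cache_a and cache_b are hypothetical caches created elsewhere.

#include <linux/slab.h>

static void wrong_cache_free(struct kmem_cache *cache_a,
			     struct kmem_cache *cache_b)
{
	void *obj = kmem_cache_alloc(cache_a, GFP_KERNEL);

	if (!obj)
		return;
	/*
	 * Bug: obj belongs to cache_a.  With a debug-enabled cache the
	 * hunk above now prints which cache the object really came from
	 * and refuses the free, instead of silently corrupting cache_b.
	 */
	kmem_cache_free(cache_b, obj);
}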
@@ -3021,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3021 | 3044 | ||
3022 | } | 3045 | } |
3023 | 3046 | ||
3024 | static int kmem_cache_open(struct kmem_cache *s, | 3047 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) |
3025 | const char *name, size_t size, | ||
3026 | size_t align, unsigned long flags, | ||
3027 | void (*ctor)(void *)) | ||
3028 | { | 3048 | { |
3029 | memset(s, 0, kmem_size); | 3049 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); |
3030 | s->name = name; | ||
3031 | s->ctor = ctor; | ||
3032 | s->object_size = size; | ||
3033 | s->align = align; | ||
3034 | s->flags = kmem_cache_flags(size, flags, name, ctor); | ||
3035 | s->reserved = 0; | 3050 | s->reserved = 0; |
3036 | 3051 | ||
3037 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) | 3052 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) |
@@ -3093,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3093 | else | 3108 | else |
3094 | s->cpu_partial = 30; | 3109 | s->cpu_partial = 30; |
3095 | 3110 | ||
3096 | s->refcount = 1; | ||
3097 | #ifdef CONFIG_NUMA | 3111 | #ifdef CONFIG_NUMA |
3098 | s->remote_node_defrag_ratio = 1000; | 3112 | s->remote_node_defrag_ratio = 1000; |
3099 | #endif | 3113 | #endif |
@@ -3101,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3101 | goto error; | 3115 | goto error; |
3102 | 3116 | ||
3103 | if (alloc_kmem_cache_cpus(s)) | 3117 | if (alloc_kmem_cache_cpus(s)) |
3104 | return 1; | 3118 | return 0; |
3105 | 3119 | ||
3106 | free_kmem_cache_nodes(s); | 3120 | free_kmem_cache_nodes(s); |
3107 | error: | 3121 | error: |
3108 | if (flags & SLAB_PANIC) | 3122 | if (flags & SLAB_PANIC) |
3109 | panic("Cannot create slab %s size=%lu realsize=%u " | 3123 | panic("Cannot create slab %s size=%lu realsize=%u " |
3110 | "order=%u offset=%u flags=%lx\n", | 3124 | "order=%u offset=%u flags=%lx\n", |
3111 | s->name, (unsigned long)size, s->size, oo_order(s->oo), | 3125 | s->name, (unsigned long)s->size, s->size, oo_order(s->oo), |
3112 | s->offset, flags); | 3126 | s->offset, flags); |
3113 | return 0; | 3127 | return -EINVAL; |
3114 | } | 3128 | } |
3115 | 3129 | ||
3116 | /* | 3130 | /* |
@@ -3132,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3132 | sizeof(long), GFP_ATOMIC); | 3146 | sizeof(long), GFP_ATOMIC); |
3133 | if (!map) | 3147 | if (!map) |
3134 | return; | 3148 | return; |
3135 | slab_err(s, page, "%s", text); | 3149 | slab_err(s, page, text, s->name); |
3136 | slab_lock(page); | 3150 | slab_lock(page); |
3137 | 3151 | ||
3138 | get_map(s, page, map); | 3152 | get_map(s, page, map); |
@@ -3164,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
3164 | discard_slab(s, page); | 3178 | discard_slab(s, page); |
3165 | } else { | 3179 | } else { |
3166 | list_slab_objects(s, page, | 3180 | list_slab_objects(s, page, |
3167 | "Objects remaining on kmem_cache_close()"); | 3181 | "Objects remaining in %s on kmem_cache_close()"); |
3168 | } | 3182 | } |
3169 | } | 3183 | } |
3170 | } | 3184 | } |
@@ -3177,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3177 | int node; | 3191 | int node; |
3178 | 3192 | ||
3179 | flush_all(s); | 3193 | flush_all(s); |
3180 | free_percpu(s->cpu_slab); | ||
3181 | /* Attempt to free all objects */ | 3194 | /* Attempt to free all objects */ |
3182 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3195 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3183 | struct kmem_cache_node *n = get_node(s, node); | 3196 | struct kmem_cache_node *n = get_node(s, node); |
@@ -3186,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3186 | if (n->nr_partial || slabs_node(s, node)) | 3199 | if (n->nr_partial || slabs_node(s, node)) |
3187 | return 1; | 3200 | return 1; |
3188 | } | 3201 | } |
3202 | free_percpu(s->cpu_slab); | ||
3189 | free_kmem_cache_nodes(s); | 3203 | free_kmem_cache_nodes(s); |
3190 | return 0; | 3204 | return 0; |
3191 | } | 3205 | } |
3192 | 3206 | ||
3193 | /* | 3207 | int __kmem_cache_shutdown(struct kmem_cache *s) |
3194 | * Close a cache and release the kmem_cache structure | ||
3195 | * (must be used for caches created using kmem_cache_create) | ||
3196 | */ | ||
3197 | void kmem_cache_destroy(struct kmem_cache *s) | ||
3198 | { | 3208 | { |
3199 | mutex_lock(&slab_mutex); | 3209 | int rc = kmem_cache_close(s); |
3200 | s->refcount--; | 3210 | |
3201 | if (!s->refcount) { | 3211 | if (!rc) |
3202 | list_del(&s->list); | ||
3203 | mutex_unlock(&slab_mutex); | ||
3204 | if (kmem_cache_close(s)) { | ||
3205 | printk(KERN_ERR "SLUB %s: %s called for cache that " | ||
3206 | "still has objects.\n", s->name, __func__); | ||
3207 | dump_stack(); | ||
3208 | } | ||
3209 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
3210 | rcu_barrier(); | ||
3211 | sysfs_slab_remove(s); | 3212 | sysfs_slab_remove(s); |
3212 | } else | 3213 | |
3213 | mutex_unlock(&slab_mutex); | 3214 | return rc; |
3214 | } | 3215 | } |
3215 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
3216 | 3216 | ||
3217 | /******************************************************************** | 3217 | /******************************************************************** |
3218 | * Kmalloc subsystem | 3218 | * Kmalloc subsystem |
@@ -3221,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
3221 | struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; | 3221 | struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; |
3222 | EXPORT_SYMBOL(kmalloc_caches); | 3222 | EXPORT_SYMBOL(kmalloc_caches); |
3223 | 3223 | ||
3224 | static struct kmem_cache *kmem_cache; | ||
3225 | |||
3226 | #ifdef CONFIG_ZONE_DMA | 3224 | #ifdef CONFIG_ZONE_DMA |
3227 | static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; | 3225 | static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; |
3228 | #endif | 3226 | #endif |
@@ -3268,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, | |||
3268 | { | 3266 | { |
3269 | struct kmem_cache *s; | 3267 | struct kmem_cache *s; |
3270 | 3268 | ||
3271 | s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | 3269 | s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
3270 | |||
3271 | s->name = name; | ||
3272 | s->size = s->object_size = size; | ||
3273 | s->align = ARCH_KMALLOC_MINALIGN; | ||
3272 | 3274 | ||
3273 | /* | 3275 | /* |
3274 | * This function is called with IRQs disabled during early-boot on | 3276 | * This function is called with IRQs disabled during early-boot on |
3275 | * single CPU so there's no need to take slab_mutex here. | 3277 | * single CPU so there's no need to take slab_mutex here. |
3276 | */ | 3278 | */ |
3277 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, | 3279 | if (kmem_cache_open(s, flags)) |
3278 | flags, NULL)) | ||
3279 | goto panic; | 3280 | goto panic; |
3280 | 3281 | ||
3281 | list_add(&s->list, &slab_caches); | 3282 | list_add(&s->list, &slab_caches); |
@@ -3357,7 +3358,7 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
3357 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 3358 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
3358 | return s; | 3359 | return s; |
3359 | 3360 | ||
3360 | ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); | 3361 | ret = slab_alloc(s, flags, _RET_IP_); |
3361 | 3362 | ||
3362 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3363 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
3363 | 3364 | ||
@@ -3400,7 +3401,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3400 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 3401 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
3401 | return s; | 3402 | return s; |
3402 | 3403 | ||
3403 | ret = slab_alloc(s, flags, node, _RET_IP_); | 3404 | ret = slab_alloc_node(s, flags, node, _RET_IP_); |
3404 | 3405 | ||
3405 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3406 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
3406 | 3407 | ||
@@ -3477,7 +3478,7 @@ void kfree(const void *x) | |||
3477 | if (unlikely(!PageSlab(page))) { | 3478 | if (unlikely(!PageSlab(page))) { |
3478 | BUG_ON(!PageCompound(page)); | 3479 | BUG_ON(!PageCompound(page)); |
3479 | kmemleak_free(x); | 3480 | kmemleak_free(x); |
3480 | put_page(page); | 3481 | __free_pages(page, compound_order(page)); |
3481 | return; | 3482 | return; |
3482 | } | 3483 | } |
3483 | slab_free(page->slab, page, object, _RET_IP_); | 3484 | slab_free(page->slab, page, object, _RET_IP_); |
@@ -3714,12 +3715,12 @@ void __init kmem_cache_init(void) | |||
3714 | slub_max_order = 0; | 3715 | slub_max_order = 0; |
3715 | 3716 | ||
3716 | kmem_size = offsetof(struct kmem_cache, node) + | 3717 | kmem_size = offsetof(struct kmem_cache, node) + |
3717 | nr_node_ids * sizeof(struct kmem_cache_node *); | 3718 | nr_node_ids * sizeof(struct kmem_cache_node *); |
3718 | 3719 | ||
3719 | /* Allocate two kmem_caches from the page allocator */ | 3720 | /* Allocate two kmem_caches from the page allocator */ |
3720 | kmalloc_size = ALIGN(kmem_size, cache_line_size()); | 3721 | kmalloc_size = ALIGN(kmem_size, cache_line_size()); |
3721 | order = get_order(2 * kmalloc_size); | 3722 | order = get_order(2 * kmalloc_size); |
3722 | kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); | 3723 | kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); |
3723 | 3724 | ||
3724 | /* | 3725 | /* |
3725 | * Must first have the slab cache available for the allocations of the | 3726 | * Must first have the slab cache available for the allocations of the |
@@ -3728,9 +3729,10 @@ void __init kmem_cache_init(void) | |||
3728 | */ | 3729 | */ |
3729 | kmem_cache_node = (void *)kmem_cache + kmalloc_size; | 3730 | kmem_cache_node = (void *)kmem_cache + kmalloc_size; |
3730 | 3731 | ||
3731 | kmem_cache_open(kmem_cache_node, "kmem_cache_node", | 3732 | kmem_cache_node->name = "kmem_cache_node"; |
3732 | sizeof(struct kmem_cache_node), | 3733 | kmem_cache_node->size = kmem_cache_node->object_size = |
3733 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); | 3734 | sizeof(struct kmem_cache_node); |
3735 | kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||
3734 | 3736 | ||
3735 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | 3737 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); |
3736 | 3738 | ||
@@ -3738,8 +3740,10 @@ void __init kmem_cache_init(void) | |||
3738 | slab_state = PARTIAL; | 3740 | slab_state = PARTIAL; |
3739 | 3741 | ||
3740 | temp_kmem_cache = kmem_cache; | 3742 | temp_kmem_cache = kmem_cache; |
3741 | kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, | 3743 | kmem_cache->name = "kmem_cache"; |
3742 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); | 3744 | kmem_cache->size = kmem_cache->object_size = kmem_size; |
3745 | kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||
3746 | |||
3743 | kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | 3747 | kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); |
3744 | memcpy(kmem_cache, temp_kmem_cache, kmem_size); | 3748 | memcpy(kmem_cache, temp_kmem_cache, kmem_size); |
3745 | 3749 | ||
@@ -3928,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3928 | return NULL; | 3932 | return NULL; |
3929 | } | 3933 | } |
3930 | 3934 | ||
3931 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | 3935 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, |
3932 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3936 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3933 | { | 3937 | { |
3934 | struct kmem_cache *s; | 3938 | struct kmem_cache *s; |
3935 | char *n; | ||
3936 | 3939 | ||
3937 | s = find_mergeable(size, align, flags, name, ctor); | 3940 | s = find_mergeable(size, align, flags, name, ctor); |
3938 | if (s) { | 3941 | if (s) { |
@@ -3946,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | |||
3946 | 3949 | ||
3947 | if (sysfs_slab_alias(s, name)) { | 3950 | if (sysfs_slab_alias(s, name)) { |
3948 | s->refcount--; | 3951 | s->refcount--; |
3949 | return NULL; | 3952 | s = NULL; |
3950 | } | 3953 | } |
3951 | return s; | ||
3952 | } | 3954 | } |
3953 | 3955 | ||
3954 | n = kstrdup(name, GFP_KERNEL); | 3956 | return s; |
3955 | if (!n) | 3957 | } |
3956 | return NULL; | ||
3957 | 3958 | ||
3958 | s = kmalloc(kmem_size, GFP_KERNEL); | 3959 | int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) |
3959 | if (s) { | 3960 | { |
3960 | if (kmem_cache_open(s, n, | 3961 | int err; |
3961 | size, align, flags, ctor)) { | ||
3962 | int r; | ||
3963 | 3962 | ||
3964 | list_add(&s->list, &slab_caches); | 3963 | err = kmem_cache_open(s, flags); |
3965 | mutex_unlock(&slab_mutex); | 3964 | if (err) |
3966 | r = sysfs_slab_add(s); | 3965 | return err; |
3967 | mutex_lock(&slab_mutex); | ||
3968 | 3966 | ||
3969 | if (!r) | 3967 | mutex_unlock(&slab_mutex); |
3970 | return s; | 3968 | err = sysfs_slab_add(s); |
3969 | mutex_lock(&slab_mutex); | ||
3971 | 3970 | ||
3972 | list_del(&s->list); | 3971 | if (err) |
3973 | kmem_cache_close(s); | 3972 | kmem_cache_close(s); |
3974 | } | 3973 | |
3975 | kfree(s); | 3974 | return err; |
3976 | } | ||
3977 | kfree(n); | ||
3978 | return NULL; | ||
3979 | } | 3975 | } |
3980 | 3976 | ||
3981 | #ifdef CONFIG_SMP | 3977 | #ifdef CONFIG_SMP |
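What the __kmem_cache_alias() path means for callers, as a sketch with made-up cache names: under SLUB two compatible kmem_cache_create() calls may be satisfied by one merged cache, with the second name kept only as a sysfs alias, while SLAB and SLOB hit the NULL stub in mm/slab.h and always build a fresh cache.

#include <linux/printk.h>
#include <linux/slab.h>

static void alias_demo(void)
{
	struct kmem_cache *a, *b;

	a = kmem_cache_create("demo_a", 64, 0, 0, NULL);
	b = kmem_cache_create("demo_b", 64, 0, 0, NULL);
	/*
	 * Under SLUB, b can be the very same struct kmem_cache as a:
	 * find_mergeable() matches, the refcount is bumped, and "demo_b"
	 * survives only as a sysfs alias.
	 */
	pr_info("merged: %s\n", a == b ? "yes" : "no");

	kmem_cache_destroy(b);	/* if merged, this only drops the refcount */
	kmem_cache_destroy(a);
}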
@@ -4028,7 +4024,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) | |||
4028 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 4024 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
4029 | return s; | 4025 | return s; |
4030 | 4026 | ||
4031 | ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); | 4027 | ret = slab_alloc(s, gfpflags, caller); |
4032 | 4028 | ||
4033 | /* Honor the call site pointer we received. */ | 4029 | /* Honor the call site pointer we received. */ |
4034 | trace_kmalloc(caller, ret, size, s->size, gfpflags); | 4030 | trace_kmalloc(caller, ret, size, s->size, gfpflags); |
@@ -4058,7 +4054,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
4058 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 4054 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
4059 | return s; | 4055 | return s; |
4060 | 4056 | ||
4061 | ret = slab_alloc(s, gfpflags, node, caller); | 4057 | ret = slab_alloc_node(s, gfpflags, node, caller); |
4062 | 4058 | ||
4063 | /* Honor the call site pointer we received. */ | 4059 | /* Honor the call site pointer we received. */ |
4064 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); | 4060 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); |
@@ -5205,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
5205 | return err; | 5201 | return err; |
5206 | } | 5202 | } |
5207 | 5203 | ||
5208 | static void kmem_cache_release(struct kobject *kobj) | ||
5209 | { | ||
5210 | struct kmem_cache *s = to_slab(kobj); | ||
5211 | |||
5212 | kfree(s->name); | ||
5213 | kfree(s); | ||
5214 | } | ||
5215 | |||
5216 | static const struct sysfs_ops slab_sysfs_ops = { | 5204 | static const struct sysfs_ops slab_sysfs_ops = { |
5217 | .show = slab_attr_show, | 5205 | .show = slab_attr_show, |
5218 | .store = slab_attr_store, | 5206 | .store = slab_attr_store, |
@@ -5220,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = { | |||
5220 | 5208 | ||
5221 | static struct kobj_type slab_ktype = { | 5209 | static struct kobj_type slab_ktype = { |
5222 | .sysfs_ops = &slab_sysfs_ops, | 5210 | .sysfs_ops = &slab_sysfs_ops, |
5223 | .release = kmem_cache_release | ||
5224 | }; | 5211 | }; |
5225 | 5212 | ||
5226 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | 5213 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) | |||
446 | } | 446 | } |
447 | EXPORT_SYMBOL(mark_page_accessed); | 447 | EXPORT_SYMBOL(mark_page_accessed); |
448 | 448 | ||
449 | /* | ||
450 | * Order of operations is important: flush the pagevec when it's already | ||
451 | * full, not when adding the last page, to make sure that last page is | ||
452 | * not added to the LRU directly when passed to this function. Because | ||
453 | * mark_page_accessed() (called after this when writing) only activates | ||
454 | * pages that are on the LRU, linear writes in subpage chunks would see | ||
455 | * every PAGEVEC_SIZE page activated, which is unexpected. | ||
456 | */ | ||
449 | void __lru_cache_add(struct page *page, enum lru_list lru) | 457 | void __lru_cache_add(struct page *page, enum lru_list lru) |
450 | { | 458 | { |
451 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; | 459 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; |
452 | 460 | ||
453 | page_cache_get(page); | 461 | page_cache_get(page); |
454 | if (!pagevec_add(pvec, page)) | 462 | if (!pagevec_space(pvec)) |
455 | __pagevec_lru_add(pvec, lru); | 463 | __pagevec_lru_add(pvec, lru); |
464 | pagevec_add(pvec, page); | ||
456 | put_cpu_var(lru_add_pvecs); | 465 | put_cpu_var(lru_add_pvecs); |
457 | } | 466 | } |
458 | EXPORT_SYMBOL(__lru_cache_add); | 467 | EXPORT_SYMBOL(__lru_cache_add); |
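A compact restatement of the ordering argument from the comment in the hunk above, with pv standing in for the per-CPU pagevec:

/*
 * old: pagevec_add(pv, page);            new: if (!pagevec_space(pv))
 *      if (!pagevec_space(pv))                        __pagevec_lru_add(pv, lru);
 *              __pagevec_lru_add(pv, lru);     pagevec_add(pv, page);
 *
 * Old order: the page that fills the pagevec is flushed to the LRU in
 * the same call, so a mark_page_accessed() immediately afterwards can
 * already activate it.  New order: the page just added always stays in
 * the pagevec until a later add needs the slot, so it is never on the
 * LRU when mark_page_accessed() runs right after this function.
 */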
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
742 | 751 | ||
743 | SetPageLRU(page_tail); | 752 | SetPageLRU(page_tail); |
744 | 753 | ||
745 | if (page_evictable(page_tail, NULL)) { | 754 | if (page_evictable(page_tail)) { |
746 | if (PageActive(page)) { | 755 | if (PageActive(page)) { |
747 | SetPageActive(page_tail); | 756 | SetPageActive(page_tail); |
748 | active = 1; | 757 | active = 1; |
diff --git a/mm/truncate.c b/mm/truncate.c index 75801acdaac..d51ce92d6e8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
107 | 107 | ||
108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
109 | 109 | ||
110 | clear_page_mlock(page); | ||
111 | ClearPageMappedToDisk(page); | 110 | ClearPageMappedToDisk(page); |
112 | delete_from_page_cache(page); | 111 | delete_from_page_cache(page); |
113 | return 0; | 112 | return 0; |
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
132 | if (page_has_private(page) && !try_to_release_page(page, 0)) | 131 | if (page_has_private(page) && !try_to_release_page(page, 0)) |
133 | return 0; | 132 | return 0; |
134 | 133 | ||
135 | clear_page_mlock(page); | ||
136 | ret = remove_mapping(mapping, page); | 134 | ret = remove_mapping(mapping, page); |
137 | 135 | ||
138 | return ret; | 136 | return ret; |
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
398 | if (PageDirty(page)) | 396 | if (PageDirty(page)) |
399 | goto failed; | 397 | goto failed; |
400 | 398 | ||
401 | clear_page_mlock(page); | ||
402 | BUG_ON(page_has_private(page)); | 399 | BUG_ON(page_has_private(page)); |
403 | __delete_from_page_cache(page); | 400 | __delete_from_page_cache(page); |
404 | spin_unlock_irq(&mapping->tree_lock); | 401 | spin_unlock_irq(&mapping->tree_lock); |
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len) | |||
105 | } | 105 | } |
106 | EXPORT_SYMBOL(memdup_user); | 106 | EXPORT_SYMBOL(memdup_user); |
107 | 107 | ||
108 | static __always_inline void *__do_krealloc(const void *p, size_t new_size, | ||
109 | gfp_t flags) | ||
110 | { | ||
111 | void *ret; | ||
112 | size_t ks = 0; | ||
113 | |||
114 | if (p) | ||
115 | ks = ksize(p); | ||
116 | |||
117 | if (ks >= new_size) | ||
118 | return (void *)p; | ||
119 | |||
120 | ret = kmalloc_track_caller(new_size, flags); | ||
121 | if (ret && p) | ||
122 | memcpy(ret, p, ks); | ||
123 | |||
124 | return ret; | ||
125 | } | ||
126 | |||
108 | /** | 127 | /** |
109 | * __krealloc - like krealloc() but don't free @p. | 128 | * __krealloc - like krealloc() but don't free @p. |
110 | * @p: object to reallocate memory for. | 129 | * @p: object to reallocate memory for. |
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user); | |||
117 | */ | 136 | */ |
118 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) | 137 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) |
119 | { | 138 | { |
120 | void *ret; | ||
121 | size_t ks = 0; | ||
122 | |||
123 | if (unlikely(!new_size)) | 139 | if (unlikely(!new_size)) |
124 | return ZERO_SIZE_PTR; | 140 | return ZERO_SIZE_PTR; |
125 | 141 | ||
126 | if (p) | 142 | return __do_krealloc(p, new_size, flags); |
127 | ks = ksize(p); | ||
128 | 143 | ||
129 | if (ks >= new_size) | ||
130 | return (void *)p; | ||
131 | |||
132 | ret = kmalloc_track_caller(new_size, flags); | ||
133 | if (ret && p) | ||
134 | memcpy(ret, p, ks); | ||
135 | |||
136 | return ret; | ||
137 | } | 144 | } |
138 | EXPORT_SYMBOL(__krealloc); | 145 | EXPORT_SYMBOL(__krealloc); |
139 | 146 | ||
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
157 | return ZERO_SIZE_PTR; | 164 | return ZERO_SIZE_PTR; |
158 | } | 165 | } |
159 | 166 | ||
160 | ret = __krealloc(p, new_size, flags); | 167 | ret = __do_krealloc(p, new_size, flags); |
161 | if (ret && p != ret) | 168 | if (ret && p != ret) |
162 | kfree(p); | 169 | kfree(p); |
163 | 170 | ||
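The mm/util.c hunks factor the grow-and-copy step out into __do_krealloc(), so krealloc() no longer goes through the exported __krealloc(); the only behavioural difference between the two entry points is that krealloc() frees the old block when a fresh one had to be allocated. A rough userspace analogue of that split, assuming glibc's malloc_usable_size() in place of ksize() and with the *_model names invented for the example (the ZERO_SIZE_PTR handling for new_size == 0 is omitted):

#include <malloc.h>	/* malloc_usable_size() on glibc */
#include <stdlib.h>
#include <string.h>

static void *do_krealloc_model(const void *p, size_t new_size)
{
	size_t ks = p ? malloc_usable_size((void *)p) : 0;
	void *ret;

	if (ks >= new_size)		/* current block is already big enough */
		return (void *)p;

	ret = malloc(new_size);
	if (ret && p)
		memcpy(ret, p, ks);	/* copy old contents; old block is kept */
	return ret;
}

/* krealloc() additionally releases the old block once the data has moved. */
static void *krealloc_model(void *p, size_t new_size)
{
	void *ret = do_krealloc_model(p, new_size);

	if (ret && p != ret)
		free(p);
	return ret;
}

int main(void)
{
	char *s = krealloc_model(NULL, 16);

	if (!s)
		return 1;
	strcpy(s, "hello");
	s = krealloc_model(s, 4096);	/* may move; old block freed if it did */
	free(s);
	return 0;
}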
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241..78e08300db2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
2163 | usize -= PAGE_SIZE; | 2163 | usize -= PAGE_SIZE; |
2164 | } while (usize > 0); | 2164 | } while (usize > 0); |
2165 | 2165 | ||
2166 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 2166 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
2167 | vma->vm_flags |= VM_RESERVED; | ||
2168 | 2167 | ||
2169 | return 0; | 2168 | return 0; |
2170 | } | 2169 | } |
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) | |||
2572 | { | 2571 | { |
2573 | struct vm_struct *v = p; | 2572 | struct vm_struct *v = p; |
2574 | 2573 | ||
2575 | seq_printf(m, "0x%p-0x%p %7ld", | 2574 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2576 | v->addr, v->addr + v->size, v->size); | 2575 | v->addr, v->addr + v->size, v->size); |
2577 | 2576 | ||
2578 | if (v->caller) | 2577 | if (v->caller) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8d01243d956..2624edcfb42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) | |||
553 | redo: | 553 | redo: |
554 | ClearPageUnevictable(page); | 554 | ClearPageUnevictable(page); |
555 | 555 | ||
556 | if (page_evictable(page, NULL)) { | 556 | if (page_evictable(page)) { |
557 | /* | 557 | /* |
558 | * For evictable pages, we can use the cache. | 558 | * For evictable pages, we can use the cache. |
559 | * In event of a race, worst case is we end up with an | 559 | * In event of a race, worst case is we end up with an |
@@ -587,7 +587,7 @@ redo: | |||
587 | * page is on the unevictable list, it will never be freed. To avoid that, | 587 | * page is on the unevictable list, it will never be freed. To avoid that, |
588 | * check after we added it to the list, again. | 588 | * check after we added it to the list, again. |
589 | */ | 589 | */ |
590 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | 590 | if (lru == LRU_UNEVICTABLE && page_evictable(page)) { |
591 | if (!isolate_lru_page(page)) { | 591 | if (!isolate_lru_page(page)) { |
592 | put_page(page); | 592 | put_page(page); |
593 | goto redo; | 593 | goto redo; |
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, | |||
674 | static unsigned long shrink_page_list(struct list_head *page_list, | 674 | static unsigned long shrink_page_list(struct list_head *page_list, |
675 | struct zone *zone, | 675 | struct zone *zone, |
676 | struct scan_control *sc, | 676 | struct scan_control *sc, |
677 | enum ttu_flags ttu_flags, | ||
677 | unsigned long *ret_nr_dirty, | 678 | unsigned long *ret_nr_dirty, |
678 | unsigned long *ret_nr_writeback) | 679 | unsigned long *ret_nr_writeback, |
680 | bool force_reclaim) | ||
679 | { | 681 | { |
680 | LIST_HEAD(ret_pages); | 682 | LIST_HEAD(ret_pages); |
681 | LIST_HEAD(free_pages); | 683 | LIST_HEAD(free_pages); |
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
689 | 691 | ||
690 | mem_cgroup_uncharge_start(); | 692 | mem_cgroup_uncharge_start(); |
691 | while (!list_empty(page_list)) { | 693 | while (!list_empty(page_list)) { |
692 | enum page_references references; | ||
693 | struct address_space *mapping; | 694 | struct address_space *mapping; |
694 | struct page *page; | 695 | struct page *page; |
695 | int may_enter_fs; | 696 | int may_enter_fs; |
697 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | ||
696 | 698 | ||
697 | cond_resched(); | 699 | cond_resched(); |
698 | 700 | ||
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
707 | 709 | ||
708 | sc->nr_scanned++; | 710 | sc->nr_scanned++; |
709 | 711 | ||
710 | if (unlikely(!page_evictable(page, NULL))) | 712 | if (unlikely(!page_evictable(page))) |
711 | goto cull_mlocked; | 713 | goto cull_mlocked; |
712 | 714 | ||
713 | if (!sc->may_unmap && page_mapped(page)) | 715 | if (!sc->may_unmap && page_mapped(page)) |
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
758 | wait_on_page_writeback(page); | 760 | wait_on_page_writeback(page); |
759 | } | 761 | } |
760 | 762 | ||
761 | references = page_check_references(page, sc); | 763 | if (!force_reclaim) |
764 | references = page_check_references(page, sc); | ||
765 | |||
762 | switch (references) { | 766 | switch (references) { |
763 | case PAGEREF_ACTIVATE: | 767 | case PAGEREF_ACTIVATE: |
764 | goto activate_locked; | 768 | goto activate_locked; |
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
788 | * processes. Try to unmap it here. | 792 | * processes. Try to unmap it here. |
789 | */ | 793 | */ |
790 | if (page_mapped(page) && mapping) { | 794 | if (page_mapped(page) && mapping) { |
791 | switch (try_to_unmap(page, TTU_UNMAP)) { | 795 | switch (try_to_unmap(page, ttu_flags)) { |
792 | case SWAP_FAIL: | 796 | case SWAP_FAIL: |
793 | goto activate_locked; | 797 | goto activate_locked; |
794 | case SWAP_AGAIN: | 798 | case SWAP_AGAIN: |
@@ -960,6 +964,33 @@ keep: | |||
960 | return nr_reclaimed; | 964 | return nr_reclaimed; |
961 | } | 965 | } |
962 | 966 | ||
967 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
968 | struct list_head *page_list) | ||
969 | { | ||
970 | struct scan_control sc = { | ||
971 | .gfp_mask = GFP_KERNEL, | ||
972 | .priority = DEF_PRIORITY, | ||
973 | .may_unmap = 1, | ||
974 | }; | ||
975 | unsigned long ret, dummy1, dummy2; | ||
976 | struct page *page, *next; | ||
977 | LIST_HEAD(clean_pages); | ||
978 | |||
979 | list_for_each_entry_safe(page, next, page_list, lru) { | ||
980 | if (page_is_file_cache(page) && !PageDirty(page)) { | ||
981 | ClearPageActive(page); | ||
982 | list_move(&page->lru, &clean_pages); | ||
983 | } | ||
984 | } | ||
985 | |||
986 | ret = shrink_page_list(&clean_pages, zone, &sc, | ||
987 | TTU_UNMAP|TTU_IGNORE_ACCESS, | ||
988 | &dummy1, &dummy2, true); | ||
989 | list_splice(&clean_pages, page_list); | ||
990 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | ||
991 | return ret; | ||
992 | } | ||
993 | |||
963 | /* | 994 | /* |
964 | * Attempt to remove the specified page from its LRU. Only take this page | 995 | * Attempt to remove the specified page from its LRU. Only take this page |
965 | * if it is of the appropriate PageActive status. Pages which are being | 996 | * if it is of the appropriate PageActive status. Pages which are being |
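The new reclaim_clean_pages_from_list() above pulls the clean, file-backed pages off an isolated page list, runs only those through shrink_page_list() with TTU_IGNORE_ACCESS and force_reclaim set, and splices whatever could not be freed back onto the caller's list. A simplified userspace sketch of that filter / reclaim / splice-back shape, assuming an int-carrying linked list stands in for the page list, even values play the clean file pages, and every tenth value refuses to be freed:

#include <stdio.h>
#include <stdlib.h>

struct node { int page; struct node *next; };

static struct node *push(struct node *head, int page)
{
	struct node *n = malloc(sizeof(*n));

	n->page = page;
	n->next = head;
	return n;
}

static int clean_file_page(int page) { return page % 2 == 0; }
static int can_free(int page)        { return page % 10 != 0; }

static int reclaim_clean_pages_model(struct node **page_list)
{
	struct node *clean = NULL;
	struct node **pp = page_list;
	int freed = 0;

	/* 1. Move the clean file pages onto a private list. */
	while (*pp) {
		if (clean_file_page((*pp)->page)) {
			struct node *n = *pp;

			*pp = n->next;
			n->next = clean;
			clean = n;
		} else {
			pp = &(*pp)->next;
		}
	}

	/* 2. Try to reclaim them; pages that cannot be freed survive. */
	for (struct node **cp = &clean; *cp; ) {
		if (can_free((*cp)->page)) {
			struct node *n = *cp;

			*cp = n->next;
			free(n);
			freed++;
		} else {
			cp = &(*cp)->next;
		}
	}

	/* 3. Splice the survivors back for the caller to put on the LRU. */
	while (clean) {
		struct node *n = clean;

		clean = n->next;
		n->next = *page_list;
		*page_list = n;
	}
	return freed;
}

int main(void)
{
	struct node *list = NULL;

	for (int i = 1; i <= 12; i++)
		list = push(list, i);
	printf("freed %d of 12 pages\n", reclaim_clean_pages_model(&list));
	return 0;
}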
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) | |||
978 | if (!PageLRU(page)) | 1009 | if (!PageLRU(page)) |
979 | return ret; | 1010 | return ret; |
980 | 1011 | ||
981 | /* Do not give back unevictable pages for compaction */ | 1012 | /* Compaction should not handle unevictable pages but CMA can do so */ |
982 | if (PageUnevictable(page)) | 1013 | if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) |
983 | return ret; | 1014 | return ret; |
984 | 1015 | ||
985 | ret = -EBUSY; | 1016 | ret = -EBUSY; |
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1186 | 1217 | ||
1187 | VM_BUG_ON(PageLRU(page)); | 1218 | VM_BUG_ON(PageLRU(page)); |
1188 | list_del(&page->lru); | 1219 | list_del(&page->lru); |
1189 | if (unlikely(!page_evictable(page, NULL))) { | 1220 | if (unlikely(!page_evictable(page))) { |
1190 | spin_unlock_irq(&zone->lru_lock); | 1221 | spin_unlock_irq(&zone->lru_lock); |
1191 | putback_lru_page(page); | 1222 | putback_lru_page(page); |
1192 | spin_lock_irq(&zone->lru_lock); | 1223 | spin_lock_irq(&zone->lru_lock); |
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1278 | if (nr_taken == 0) | 1309 | if (nr_taken == 0) |
1279 | return 0; | 1310 | return 0; |
1280 | 1311 | ||
1281 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, | 1312 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, |
1282 | &nr_dirty, &nr_writeback); | 1313 | &nr_dirty, &nr_writeback, false); |
1283 | 1314 | ||
1284 | spin_lock_irq(&zone->lru_lock); | 1315 | spin_lock_irq(&zone->lru_lock); |
1285 | 1316 | ||
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1439 | page = lru_to_page(&l_hold); | 1470 | page = lru_to_page(&l_hold); |
1440 | list_del(&page->lru); | 1471 | list_del(&page->lru); |
1441 | 1472 | ||
1442 | if (unlikely(!page_evictable(page, NULL))) { | 1473 | if (unlikely(!page_evictable(page))) { |
1443 | putback_lru_page(page); | 1474 | putback_lru_page(page); |
1444 | continue; | 1475 | continue; |
1445 | } | 1476 | } |
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
1729 | return false; | 1760 | return false; |
1730 | } | 1761 | } |
1731 | 1762 | ||
1763 | #ifdef CONFIG_COMPACTION | ||
1764 | /* | ||
1765 | * If compaction is deferred for sc->order then scale the number of pages | ||
1766 | * reclaimed based on the number of consecutive allocation failures | ||
1767 | */ | ||
1768 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1769 | struct lruvec *lruvec, struct scan_control *sc) | ||
1770 | { | ||
1771 | struct zone *zone = lruvec_zone(lruvec); | ||
1772 | |||
1773 | if (zone->compact_order_failed <= sc->order) | ||
1774 | pages_for_compaction <<= zone->compact_defer_shift; | ||
1775 | return pages_for_compaction; | ||
1776 | } | ||
1777 | #else | ||
1778 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1779 | struct lruvec *lruvec, struct scan_control *sc) | ||
1780 | { | ||
1781 | return pages_for_compaction; | ||
1782 | } | ||
1783 | #endif | ||
1784 | |||
1732 | /* | 1785 | /* |
1733 | * Reclaim/compaction is used for high-order allocation requests. It reclaims | 1786 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
1734 | * order-0 pages before compacting the zone. should_continue_reclaim() returns | 1787 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1776 | * inactive lists are large enough, continue reclaiming | 1829 | * inactive lists are large enough, continue reclaiming |
1777 | */ | 1830 | */ |
1778 | pages_for_compaction = (2UL << sc->order); | 1831 | pages_for_compaction = (2UL << sc->order); |
1832 | |||
1833 | pages_for_compaction = scale_for_compaction(pages_for_compaction, | ||
1834 | lruvec, sc); | ||
1779 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1835 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1780 | if (nr_swap_pages > 0) | 1836 | if (nr_swap_pages > 0) |
1781 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1837 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); |
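The scale_for_compaction() helper introduced above boosts the reclaim target of should_continue_reclaim() when compaction has already been deferred at this allocation order: the base of 2UL << sc->order is shifted left by zone->compact_defer_shift, i.e. doubled once per recorded deferral round. A small standalone illustration with made-up field values (none of these numbers come from a real zone):

#include <stdio.h>

int main(void)
{
	int order = 3;			/* sc->order: an order-3 allocation */
	int compact_order_failed = 2;	/* compaction already failed at order >= 2 */
	int compact_defer_shift = 4;	/* four consecutive deferral rounds so far */

	unsigned long pages = 2UL << order;	/* base target: 16 pages */

	/* Compaction failed at this order or below, so reclaim more. */
	if (compact_order_failed <= order)
		pages <<= compact_defer_shift;	/* 16 << 4 = 256 pages */

	printf("reclaim target before compaction: %lu pages\n", pages);
	return 0;
}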
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2839 | */ | 2895 | */ |
2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2896 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2841 | 2897 | ||
2898 | /* | ||
2899 | * Compaction records what page blocks it recently failed to | ||
2900 | * isolate pages from and skips them in the future scanning. | ||
2901 | * When kswapd is going to sleep, it is reasonable to assume | ||
2902 | * that pages and compaction may succeed so reset the cache. | ||
2903 | */ | ||
2904 | reset_isolation_suitable(pgdat); | ||
2905 | |||
2842 | if (!kthread_should_stop()) | 2906 | if (!kthread_should_stop()) |
2843 | schedule(); | 2907 | schedule(); |
2844 | 2908 | ||
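The comment added to kswapd_try_to_sleep() explains why the per-pageblock skip hints are cleared before kswapd sleeps: the recorded isolation failures are probably stale once reclaim has made progress, so compaction should be allowed to look at every pageblock again. A toy model of that skip cache, assuming a plain bool array in place of the pageblock skip bits and with all function names invented for the example:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_PAGEBLOCKS 8

static bool skip[NR_PAGEBLOCKS];

static void isolation_failed(int pb)
{
	skip[pb] = true;	/* remember that scanning this block was futile */
}

static bool worth_scanning(int pb)
{
	return !skip[pb];	/* cached failures are skipped on later passes */
}

/* kswapd is about to sleep: earlier failures are stale, rescan everything. */
static void reset_isolation_suitable_model(void)
{
	memset(skip, 0, sizeof(skip));
}

int main(void)
{
	isolation_failed(3);
	printf("scan block 3? %d\n", worth_scanning(3));	/* 0 */
	reset_isolation_suitable_model();
	printf("scan block 3? %d\n", worth_scanning(3));	/* 1 */
	return 0;
}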
@@ -3101,8 +3165,9 @@ int kswapd_run(int nid) | |||
3101 | if (IS_ERR(pgdat->kswapd)) { | 3165 | if (IS_ERR(pgdat->kswapd)) { |
3102 | /* failure at boot is fatal */ | 3166 | /* failure at boot is fatal */ |
3103 | BUG_ON(system_state == SYSTEM_BOOTING); | 3167 | BUG_ON(system_state == SYSTEM_BOOTING); |
3104 | printk("Failed to start kswapd on node %d\n",nid); | 3168 | pgdat->kswapd = NULL; |
3105 | ret = -1; | 3169 | pr_err("Failed to start kswapd on node %d\n", nid); |
3170 | ret = PTR_ERR(pgdat->kswapd); | ||
3106 | } | 3171 | } |
3107 | return ret; | 3172 | return ret; |
3108 | } | 3173 | } |
@@ -3349,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3349 | /* | 3414 | /* |
3350 | * page_evictable - test whether a page is evictable | 3415 | * page_evictable - test whether a page is evictable |
3351 | * @page: the page to test | 3416 | * @page: the page to test |
3352 | * @vma: the VMA in which the page is or will be mapped, may be NULL | ||
3353 | * | 3417 | * |
3354 | * Test whether page is evictable--i.e., should be placed on active/inactive | 3418 | * Test whether page is evictable--i.e., should be placed on active/inactive |
3355 | * lists vs unevictable list. The vma argument is !NULL when called from the | 3419 | * lists vs unevictable list. |
3356 | * fault path to determine how to instantiate a new page. | ||
3357 | * | 3420 | * |
3358 | * Reasons page might not be evictable: | 3421 | * Reasons page might not be evictable: |
3359 | * (1) page's mapping marked unevictable | 3422 | * (1) page's mapping marked unevictable |
3360 | * (2) page is part of an mlocked VMA | 3423 | * (2) page is part of an mlocked VMA |
3361 | * | 3424 | * |
3362 | */ | 3425 | */ |
3363 | int page_evictable(struct page *page, struct vm_area_struct *vma) | 3426 | int page_evictable(struct page *page) |
3364 | { | 3427 | { |
3365 | 3428 | return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); | |
3366 | if (mapping_unevictable(page_mapping(page))) | ||
3367 | return 0; | ||
3368 | |||
3369 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) | ||
3370 | return 0; | ||
3371 | |||
3372 | return 1; | ||
3373 | } | 3429 | } |
3374 | 3430 | ||
3375 | #ifdef CONFIG_SHMEM | 3431 | #ifdef CONFIG_SHMEM |
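With the vma argument gone, page_evictable() collapses from a chain of early returns into a single boolean expression; both forms encode the same two reasons a page must stay off the active/inactive lists. A userspace illustration of that equivalence, where the struct fields below merely stand in for mapping_unevictable() and PageMlocked():

#include <stdbool.h>
#include <stdio.h>

struct page_model { bool mapping_unevictable; bool mlocked; };

/* Old shape: one early return per reason the page cannot be evicted. */
static int page_evictable_old(const struct page_model *p)
{
	if (p->mapping_unevictable)
		return 0;
	if (p->mlocked)
		return 0;
	return 1;
}

/* New shape: the same truth table as a single expression. */
static int page_evictable_new(const struct page_model *p)
{
	return !p->mapping_unevictable && !p->mlocked;
}

int main(void)
{
	struct page_model p = { .mapping_unevictable = false, .mlocked = true };

	printf("%d %d\n", page_evictable_old(&p), page_evictable_new(&p));
	return 0;
}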
@@ -3407,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3407 | if (!PageLRU(page) || !PageUnevictable(page)) | 3463 | if (!PageLRU(page) || !PageUnevictable(page)) |
3408 | continue; | 3464 | continue; |
3409 | 3465 | ||
3410 | if (page_evictable(page, NULL)) { | 3466 | if (page_evictable(page)) { |
3411 | enum lru_list lru = page_lru_base_type(page); | 3467 | enum lru_list lru = page_lru_base_type(page); |
3412 | 3468 | ||
3413 | VM_BUG_ON(PageActive(page)); | 3469 | VM_BUG_ON(PageActive(page)); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index df7a6748231..c7370579111 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) | |||
495 | atomic_long_add(global_diff[i], &vm_stat[i]); | 495 | atomic_long_add(global_diff[i], &vm_stat[i]); |
496 | } | 496 | } |
497 | 497 | ||
498 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) | ||
499 | { | ||
500 | int i; | ||
501 | |||
502 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
503 | if (pset->vm_stat_diff[i]) { | ||
504 | int v = pset->vm_stat_diff[i]; | ||
505 | pset->vm_stat_diff[i] = 0; | ||
506 | atomic_long_add(v, &zone->vm_stat[i]); | ||
507 | atomic_long_add(v, &vm_stat[i]); | ||
508 | } | ||
509 | } | ||
498 | #endif | 510 | #endif |
499 | 511 | ||
500 | #ifdef CONFIG_NUMA | 512 | #ifdef CONFIG_NUMA |
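The new drain_zonestat() folds a per-cpu pageset's outstanding vm_stat_diff deltas into both the zone counters and the global vm_stat array, zeroing each delta as it is consumed. A simplified userspace model of that fold, using plain longs instead of atomic_long_t, one fake zone, and a small item count:

#include <stdio.h>

#define NR_ITEMS 3

struct pageset_model { int vm_stat_diff[NR_ITEMS]; };

static long zone_stat[NR_ITEMS];
static long global_stat[NR_ITEMS];

static void drain_zonestat_model(struct pageset_model *pset)
{
	for (int i = 0; i < NR_ITEMS; i++) {
		if (pset->vm_stat_diff[i]) {
			int v = pset->vm_stat_diff[i];

			pset->vm_stat_diff[i] = 0;	/* delta is consumed */
			zone_stat[i] += v;		/* fold into the zone counter */
			global_stat[i] += v;		/* and into the global total */
		}
	}
}

int main(void)
{
	struct pageset_model pset = { .vm_stat_diff = { 5, -2, 0 } };

	drain_zonestat_model(&pset);
	printf("zone: %ld %ld %ld\n", zone_stat[0], zone_stat[1], zone_stat[2]);
	return 0;
}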
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = { | |||
722 | "numa_other", | 734 | "numa_other", |
723 | #endif | 735 | #endif |
724 | "nr_anon_transparent_hugepages", | 736 | "nr_anon_transparent_hugepages", |
737 | "nr_free_cma", | ||
725 | "nr_dirty_threshold", | 738 | "nr_dirty_threshold", |
726 | "nr_dirty_background_threshold", | 739 | "nr_dirty_background_threshold", |
727 | 740 | ||
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = { | |||
781 | "unevictable_pgs_munlocked", | 794 | "unevictable_pgs_munlocked", |
782 | "unevictable_pgs_cleared", | 795 | "unevictable_pgs_cleared", |
783 | "unevictable_pgs_stranded", | 796 | "unevictable_pgs_stranded", |
784 | "unevictable_pgs_mlockfreed", | ||
785 | 797 | ||
786 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 798 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
787 | "thp_fault_alloc", | 799 | "thp_fault_alloc", |
@@ -1157,7 +1169,7 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
1157 | { | 1169 | { |
1158 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); | 1170 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); |
1159 | 1171 | ||
1160 | INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); | 1172 | INIT_DEFERRABLE_WORK(work, vmstat_update); |
1161 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); | 1173 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
1162 | } | 1174 | } |
1163 | 1175 | ||