author     Jiri Kosina <jkosina@suse.cz>   2012-10-28 14:28:52 -0400
committer  Jiri Kosina <jkosina@suse.cz>   2012-10-28 14:29:19 -0400
commit     3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a (patch)
tree       0058693cc9e70b7461dae551f8a19aff2efd13ca /mm
parent     f16f84937d769c893492160b1a8c3672e3992beb (diff)
parent     e657e078d3dfa9f96976db7a2b5fd7d7c9f1f1a6 (diff)
Merge branch 'master' into for-next
Sync up with Linus' tree to be able to apply Cesar's patch against newer version of the code.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 3
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/backing-dev.c | 50
-rw-r--r--  mm/bootmem.c | 10
-rw-r--r--  mm/compaction.c | 562
-rw-r--r--  mm/fadvise.c | 34
-rw-r--r--  mm/filemap.c | 6
-rw-r--r--  mm/filemap_xip.c | 10
-rw-r--r--  mm/fremap.c | 19
-rw-r--r--  mm/frontswap.c | 34
-rw-r--r--  mm/huge_memory.c | 441
-rw-r--r--  mm/hugetlb.c | 34
-rw-r--r--  mm/internal.h | 52
-rw-r--r--  mm/interval_tree.c | 112
-rw-r--r--  mm/kmemleak.c | 106
-rw-r--r--  mm/ksm.c | 40
-rw-r--r--  mm/madvise.c | 8
-rw-r--r--  mm/memblock.c | 29
-rw-r--r--  mm/memcontrol.c | 29
-rw-r--r--  mm/memory-failure.c | 8
-rw-r--r--  mm/memory.c | 115
-rw-r--r--  mm/memory_hotplug.c | 77
-rw-r--r--  mm/mempolicy.c | 153
-rw-r--r--  mm/mlock.c | 27
-rw-r--r--  mm/mmap.c | 210
-rw-r--r--  mm/mmu_notifier.c | 89
-rw-r--r--  mm/mremap.c | 73
-rw-r--r--  mm/nobootmem.c | 5
-rw-r--r--  mm/nommu.c | 39
-rw-r--r--  mm/oom_kill.c | 4
-rw-r--r--  mm/page-writeback.c | 14
-rw-r--r--  mm/page_alloc.c | 319
-rw-r--r--  mm/page_isolation.c | 43
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgtable-generic.c | 50
-rw-r--r--  mm/prio_tree.c | 208
-rw-r--r--  mm/readahead.c | 14
-rw-r--r--  mm/rmap.c | 179
-rw-r--r--  mm/shmem.c | 180
-rw-r--r--  mm/slab.c | 350
-rw-r--r--  mm/slab.h | 19
-rw-r--r--  mm/slab_common.c | 162
-rw-r--r--  mm/slob.c | 97
-rw-r--r--  mm/slub.c | 208
-rw-r--r--  mm/swap.c | 13
-rw-r--r--  mm/swapfile.c | 11
-rw-r--r--  mm/truncate.c | 3
-rw-r--r--  mm/util.c | 35
-rw-r--r--  mm/vmalloc.c | 5
-rw-r--r--  mm/vmscan.c | 111
-rw-r--r--  mm/vmstat.c | 16
51 files changed, 2446 insertions, 1976 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
191# support for memory compaction 191# support for memory compaction
192config COMPACTION 192config COMPACTION
193 bool "Allow for memory compaction" 193 bool "Allow for memory compaction"
194 def_bool y
194 select MIGRATION 195 select MIGRATION
195 depends on MMU 196 depends on MMU
196 help 197 help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
318 319
319config TRANSPARENT_HUGEPAGE 320config TRANSPARENT_HUGEPAGE
320 bool "Transparent Hugepage Support" 321 bool "Transparent Hugepage Support"
321 depends on X86 && MMU 322 depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
322 select COMPACTION 323 select COMPACTION
323 help 324 help
324 Transparent Hugepages allows the kernel to use huge pages and 325 Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o $(mmu-y) 19 compaction.o interval_tree.o $(mmu-y)
20 20
21obj-y += init-mm.o 21obj-y += init-mm.o
22 22
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b41823cc05e6..d3ca2b3ee176 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev,
158 const char *buf, size_t count) 158 const char *buf, size_t count)
159{ 159{
160 struct backing_dev_info *bdi = dev_get_drvdata(dev); 160 struct backing_dev_info *bdi = dev_get_drvdata(dev);
161 char *end;
162 unsigned long read_ahead_kb; 161 unsigned long read_ahead_kb;
163 ssize_t ret = -EINVAL; 162 ssize_t ret;
164 163
165 read_ahead_kb = simple_strtoul(buf, &end, 10); 164 ret = kstrtoul(buf, 10, &read_ahead_kb);
166 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 165 if (ret < 0)
167 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); 166 return ret;
168 ret = count; 167
169 } 168 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
170 return ret; 169
170 return count;
171} 171}
172 172
173#define K(pages) ((pages) << (PAGE_SHIFT - 10)) 173#define K(pages) ((pages) << (PAGE_SHIFT - 10))
@@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev,
187 struct device_attribute *attr, const char *buf, size_t count) 187 struct device_attribute *attr, const char *buf, size_t count)
188{ 188{
189 struct backing_dev_info *bdi = dev_get_drvdata(dev); 189 struct backing_dev_info *bdi = dev_get_drvdata(dev);
190 char *end;
191 unsigned int ratio; 190 unsigned int ratio;
192 ssize_t ret = -EINVAL; 191 ssize_t ret;
192
193 ret = kstrtouint(buf, 10, &ratio);
194 if (ret < 0)
195 return ret;
196
197 ret = bdi_set_min_ratio(bdi, ratio);
198 if (!ret)
199 ret = count;
193 200
194 ratio = simple_strtoul(buf, &end, 10);
195 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
196 ret = bdi_set_min_ratio(bdi, ratio);
197 if (!ret)
198 ret = count;
199 }
200 return ret; 201 return ret;
201} 202}
202BDI_SHOW(min_ratio, bdi->min_ratio) 203BDI_SHOW(min_ratio, bdi->min_ratio)
@@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev,
205 struct device_attribute *attr, const char *buf, size_t count) 206 struct device_attribute *attr, const char *buf, size_t count)
206{ 207{
207 struct backing_dev_info *bdi = dev_get_drvdata(dev); 208 struct backing_dev_info *bdi = dev_get_drvdata(dev);
208 char *end;
209 unsigned int ratio; 209 unsigned int ratio;
210 ssize_t ret = -EINVAL; 210 ssize_t ret;
211
212 ret = kstrtouint(buf, 10, &ratio);
213 if (ret < 0)
214 return ret;
215
216 ret = bdi_set_max_ratio(bdi, ratio);
217 if (!ret)
218 ret = count;
211 219
212 ratio = simple_strtoul(buf, &end, 10);
213 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
214 ret = bdi_set_max_ratio(bdi, ratio);
215 if (!ret)
216 ret = count;
217 }
218 return ret; 220 return ret;
219} 221}
220BDI_SHOW(max_ratio, bdi->max_ratio) 222BDI_SHOW(max_ratio, bdi->max_ratio)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
198 int order = ilog2(BITS_PER_LONG); 198 int order = ilog2(BITS_PER_LONG);
199 199
200 __free_pages_bootmem(pfn_to_page(start), order); 200 __free_pages_bootmem(pfn_to_page(start), order);
201 fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
202 start, start + BITS_PER_LONG);
201 count += BITS_PER_LONG; 203 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 204 start += BITS_PER_LONG;
203 } else { 205 } else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
208 if (vec & 1) { 210 if (vec & 1) {
209 page = pfn_to_page(start + off); 211 page = pfn_to_page(start + off);
210 __free_pages_bootmem(page, 0); 212 __free_pages_bootmem(page, 0);
213 fixup_zone_present_pages(
214 page_to_nid(page),
215 start + off, start + off + 1);
211 count++; 216 count++;
212 } 217 }
213 vec >>= 1; 218 vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
221 pages = bdata->node_low_pfn - bdata->node_min_pfn; 226 pages = bdata->node_low_pfn - bdata->node_min_pfn;
222 pages = bootmem_bootmap_pages(pages); 227 pages = bootmem_bootmap_pages(pages);
223 count += pages; 228 count += pages;
224 while (pages--) 229 while (pages--) {
230 fixup_zone_present_pages(page_to_nid(page),
231 page_to_pfn(page), page_to_pfn(page) + 1);
225 __free_pages_bootmem(page++, 0); 232 __free_pages_bootmem(page++, 0);
233 }
226 234
227 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); 235 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
228 236
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..9eef55838fca 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51} 51}
52 52
53#ifdef CONFIG_COMPACTION
54/* Returns true if the pageblock should be scanned for pages to isolate. */
55static inline bool isolation_suitable(struct compact_control *cc,
56 struct page *page)
57{
58 if (cc->ignore_skip_hint)
59 return true;
60
61 return !get_pageblock_skip(page);
62}
63
64/*
65 * This function is called to clear all cached information on pageblocks that
66 * should be skipped for page isolation when the migrate and free page scanner
67 * meet.
68 */
69static void __reset_isolation_suitable(struct zone *zone)
70{
71 unsigned long start_pfn = zone->zone_start_pfn;
72 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
73 unsigned long pfn;
74
75 zone->compact_cached_migrate_pfn = start_pfn;
76 zone->compact_cached_free_pfn = end_pfn;
77 zone->compact_blockskip_flush = false;
78
79 /* Walk the zone and mark every pageblock as suitable for isolation */
80 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
81 struct page *page;
82
83 cond_resched();
84
85 if (!pfn_valid(pfn))
86 continue;
87
88 page = pfn_to_page(pfn);
89 if (zone != page_zone(page))
90 continue;
91
92 clear_pageblock_skip(page);
93 }
94}
95
96void reset_isolation_suitable(pg_data_t *pgdat)
97{
98 int zoneid;
99
100 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
101 struct zone *zone = &pgdat->node_zones[zoneid];
102 if (!populated_zone(zone))
103 continue;
104
105 /* Only flush if a full compaction finished recently */
106 if (zone->compact_blockskip_flush)
107 __reset_isolation_suitable(zone);
108 }
109}
110
111/*
112 * If no pages were isolated then mark this pageblock to be skipped in the
113 * future. The information is later cleared by __reset_isolation_suitable().
114 */
115static void update_pageblock_skip(struct compact_control *cc,
116 struct page *page, unsigned long nr_isolated,
117 bool migrate_scanner)
118{
119 struct zone *zone = cc->zone;
120 if (!page)
121 return;
122
123 if (!nr_isolated) {
124 unsigned long pfn = page_to_pfn(page);
125 set_pageblock_skip(page);
126
127 /* Update where compaction should restart */
128 if (migrate_scanner) {
129 if (!cc->finished_update_migrate &&
130 pfn > zone->compact_cached_migrate_pfn)
131 zone->compact_cached_migrate_pfn = pfn;
132 } else {
133 if (!cc->finished_update_free &&
134 pfn < zone->compact_cached_free_pfn)
135 zone->compact_cached_free_pfn = pfn;
136 }
137 }
138}
139#else
140static inline bool isolation_suitable(struct compact_control *cc,
141 struct page *page)
142{
143 return true;
144}
145
146static void update_pageblock_skip(struct compact_control *cc,
147 struct page *page, unsigned long nr_isolated,
148 bool migrate_scanner)
149{
150}
151#endif /* CONFIG_COMPACTION */
152
153static inline bool should_release_lock(spinlock_t *lock)
154{
155 return need_resched() || spin_is_contended(lock);
156}
157
53/* 158/*
54 * Compaction requires the taking of some coarse locks that are potentially 159 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or 160 * very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, 167static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc) 168 bool locked, struct compact_control *cc)
64{ 169{
65 if (need_resched() || spin_is_contended(lock)) { 170 if (should_release_lock(lock)) {
66 if (locked) { 171 if (locked) {
67 spin_unlock_irqrestore(lock, *flags); 172 spin_unlock_irqrestore(lock, *flags);
68 locked = false; 173 locked = false;
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
70 175
71 /* async aborts if taking too long or contended */ 176 /* async aborts if taking too long or contended */
72 if (!cc->sync) { 177 if (!cc->sync) {
73 if (cc->contended) 178 cc->contended = true;
74 *cc->contended = true;
75 return false; 179 return false;
76 } 180 }
77 181
78 cond_resched(); 182 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
81 } 183 }
82 184
83 if (!locked) 185 if (!locked)
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
91 return compact_checklock_irqsave(lock, flags, false, cc); 193 return compact_checklock_irqsave(lock, flags, false, cc);
92} 194}
93 195
196/* Returns true if the page is within a block suitable for migration to */
197static bool suitable_migration_target(struct page *page)
198{
199 int migratetype = get_pageblock_migratetype(page);
200
201 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
202 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
203 return false;
204
205 /* If the page is a large free page, then allow migration */
206 if (PageBuddy(page) && page_order(page) >= pageblock_order)
207 return true;
208
209 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
210 if (migrate_async_suitable(migratetype))
211 return true;
212
213 /* Otherwise skip the block */
214 return false;
215}
216
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
227 * regardless of the migratetype of the freelist is is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 * difficult to move pages and making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
94/* 271/*
95 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 272 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
97 * pages inside of the pageblock (even though it may still end up isolating 274 * pages inside of the pageblock (even though it may still end up isolating
98 * some pages). 275 * some pages).
99 */ 276 */
100static unsigned long isolate_freepages_block(unsigned long blockpfn, 277static unsigned long isolate_freepages_block(struct compact_control *cc,
278 unsigned long blockpfn,
101 unsigned long end_pfn, 279 unsigned long end_pfn,
102 struct list_head *freelist, 280 struct list_head *freelist,
103 bool strict) 281 bool strict)
104{ 282{
105 int nr_scanned = 0, total_isolated = 0; 283 int nr_scanned = 0, total_isolated = 0;
106 struct page *cursor; 284 struct page *cursor, *valid_page = NULL;
285 unsigned long nr_strict_required = end_pfn - blockpfn;
286 unsigned long flags;
287 bool locked = false;
107 288
108 cursor = pfn_to_page(blockpfn); 289 cursor = pfn_to_page(blockpfn);
109 290
110 /* Isolate free pages. This assumes the block is valid */ 291 /* Isolate free pages. */
111 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 292 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
112 int isolated, i; 293 int isolated, i;
113 struct page *page = cursor; 294 struct page *page = cursor;
114 295
115 if (!pfn_valid_within(blockpfn)) {
116 if (strict)
117 return 0;
118 continue;
119 }
120 nr_scanned++; 296 nr_scanned++;
297 if (!pfn_valid_within(blockpfn))
298 continue;
299 if (!valid_page)
300 valid_page = page;
301 if (!PageBuddy(page))
302 continue;
121 303
122 if (!PageBuddy(page)) { 304 /*
123 if (strict) 305 * The zone lock must be held to isolate freepages.
124 return 0; 306 * Unfortunately this is a very coarse lock and can be
307 * heavily contended if there are parallel allocations
308 * or parallel compactions. For async compaction do not
309 * spin on the lock and we acquire the lock as late as
310 * possible.
311 */
312 locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
313 locked, cc);
314 if (!locked)
315 break;
316
317 /* Recheck this is a suitable migration target under lock */
318 if (!strict && !suitable_migration_target(page))
319 break;
320
321 /* Recheck this is a buddy page under lock */
322 if (!PageBuddy(page))
125 continue; 323 continue;
126 }
127 324
128 /* Found a free page, break it into order-0 pages */ 325 /* Found a free page, break it into order-0 pages */
129 isolated = split_free_page(page); 326 isolated = split_free_page(page);
130 if (!isolated && strict) 327 if (!isolated && strict)
131 return 0; 328 break;
132 total_isolated += isolated; 329 total_isolated += isolated;
133 for (i = 0; i < isolated; i++) { 330 for (i = 0; i < isolated; i++) {
134 list_add(&page->lru, freelist); 331 list_add(&page->lru, freelist);
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
143 } 340 }
144 341
145 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 342 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
343
344 /*
345 * If strict isolation is requested by CMA then check that all the
346 * pages requested were isolated. If there were any failures, 0 is
347 * returned and CMA will fail.
348 */
349 if (strict && nr_strict_required > total_isolated)
350 total_isolated = 0;
351
352 if (locked)
353 spin_unlock_irqrestore(&cc->zone->lock, flags);
354
355 /* Update the pageblock-skip if the whole pageblock was scanned */
356 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false);
358
146 return total_isolated; 359 return total_isolated;
147} 360}
148 361
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
160 * a free page). 373 * a free page).
161 */ 374 */
162unsigned long 375unsigned long
163isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) 376isolate_freepages_range(struct compact_control *cc,
377 unsigned long start_pfn, unsigned long end_pfn)
164{ 378{
165 unsigned long isolated, pfn, block_end_pfn, flags; 379 unsigned long isolated, pfn, block_end_pfn;
166 struct zone *zone = NULL;
167 LIST_HEAD(freelist); 380 LIST_HEAD(freelist);
168 381
169 if (pfn_valid(start_pfn))
170 zone = page_zone(pfn_to_page(start_pfn));
171
172 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 382 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
173 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) 383 if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
174 break; 384 break;
175 385
176 /* 386 /*
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
180 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 390 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
181 block_end_pfn = min(block_end_pfn, end_pfn); 391 block_end_pfn = min(block_end_pfn, end_pfn);
182 392
183 spin_lock_irqsave(&zone->lock, flags); 393 isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
184 isolated = isolate_freepages_block(pfn, block_end_pfn,
185 &freelist, true); 394 &freelist, true);
186 spin_unlock_irqrestore(&zone->lock, flags);
187 395
188 /* 396 /*
189 * In strict mode, isolate_freepages_block() returns 0 if 397 * In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
253 * @cc: Compaction control structure. 461 * @cc: Compaction control structure.
254 * @low_pfn: The first PFN of the range. 462 * @low_pfn: The first PFN of the range.
255 * @end_pfn: The one-past-the-last PFN of the range. 463 * @end_pfn: The one-past-the-last PFN of the range.
464 * @unevictable: true if it allows to isolate unevictable pages
256 * 465 *
257 * Isolate all pages that can be migrated from the range specified by 466 * Isolate all pages that can be migrated from the range specified by
258 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 467 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
268 */ 477 */
269unsigned long 478unsigned long
270isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 479isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
271 unsigned long low_pfn, unsigned long end_pfn) 480 unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
272{ 481{
273 unsigned long last_pageblock_nr = 0, pageblock_nr; 482 unsigned long last_pageblock_nr = 0, pageblock_nr;
274 unsigned long nr_scanned = 0, nr_isolated = 0; 483 unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
276 isolate_mode_t mode = 0; 485 isolate_mode_t mode = 0;
277 struct lruvec *lruvec; 486 struct lruvec *lruvec;
278 unsigned long flags; 487 unsigned long flags;
279 bool locked; 488 bool locked = false;
489 struct page *page = NULL, *valid_page = NULL;
280 490
281 /* 491 /*
282 * Ensure that there are not too many pages isolated from the LRU 492 * Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
296 506
297 /* Time to isolate some pages for migration */ 507 /* Time to isolate some pages for migration */
298 cond_resched(); 508 cond_resched();
299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
301 for (; low_pfn < end_pfn; low_pfn++) { 509 for (; low_pfn < end_pfn; low_pfn++) {
302 struct page *page;
303
304 /* give a chance to irqs before checking need_resched() */ 510 /* give a chance to irqs before checking need_resched() */
305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 511 if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
306 spin_unlock_irqrestore(&zone->lru_lock, flags); 512 if (should_release_lock(&zone->lru_lock)) {
307 locked = false; 513 spin_unlock_irqrestore(&zone->lru_lock, flags);
514 locked = false;
515 }
308 } 516 }
309 517
310 /* Check if it is ok to still hold the lock */
311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
312 locked, cc);
313 if (!locked)
314 break;
315
316 /* 518 /*
317 * migrate_pfn does not necessarily start aligned to a 519 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving 520 * pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
340 if (page_zone(page) != zone) 542 if (page_zone(page) != zone)
341 continue; 543 continue;
342 544
545 if (!valid_page)
546 valid_page = page;
547
548 /* If isolation recently failed, do not retry */
549 pageblock_nr = low_pfn >> pageblock_order;
550 if (!isolation_suitable(cc, page))
551 goto next_pageblock;
552
343 /* Skip if free */ 553 /* Skip if free */
344 if (PageBuddy(page)) 554 if (PageBuddy(page))
345 continue; 555 continue;
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 * migration is optimistic to see if the minimum amount of work 559 * migration is optimistic to see if the minimum amount of work
350 * satisfies the allocation 560 * satisfies the allocation
351 */ 561 */
352 pageblock_nr = low_pfn >> pageblock_order;
353 if (!cc->sync && last_pageblock_nr != pageblock_nr && 562 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
354 !migrate_async_suitable(get_pageblock_migratetype(page))) { 563 !migrate_async_suitable(get_pageblock_migratetype(page))) {
355 low_pfn += pageblock_nr_pages; 564 cc->finished_update_migrate = true;
356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 565 goto next_pageblock;
357 last_pageblock_nr = pageblock_nr;
358 continue;
359 } 566 }
360 567
568 /* Check may be lockless but that's ok as we recheck later */
361 if (!PageLRU(page)) 569 if (!PageLRU(page))
362 continue; 570 continue;
363 571
364 /* 572 /*
365 * PageLRU is set, and lru_lock excludes isolation, 573 * PageLRU is set. lru_lock normally excludes isolation
366 * splitting and collapsing (collapsing has already 574 * splitting and collapsing (collapsing has already happened
367 * happened if PageLRU is set). 575 * if PageLRU is set) but the lock is not necessarily taken
576 * here and it is wasteful to take it just to check transhuge.
577 * Check TransHuge without lock and skip the whole pageblock if
578 * it's either a transhuge or hugetlbfs page, as calling
579 * compound_order() without preventing THP from splitting the
580 * page underneath us may return surprising results.
368 */ 581 */
369 if (PageTransHuge(page)) { 582 if (PageTransHuge(page)) {
583 if (!locked)
584 goto next_pageblock;
585 low_pfn += (1 << compound_order(page)) - 1;
586 continue;
587 }
588
589 /* Check if it is ok to still hold the lock */
590 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
591 locked, cc);
592 if (!locked || fatal_signal_pending(current))
593 break;
594
595 /* Recheck PageLRU and PageTransHuge under lock */
596 if (!PageLRU(page))
597 continue;
598 if (PageTransHuge(page)) {
370 low_pfn += (1 << compound_order(page)) - 1; 599 low_pfn += (1 << compound_order(page)) - 1;
371 continue; 600 continue;
372 } 601 }
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
374 if (!cc->sync) 603 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE; 604 mode |= ISOLATE_ASYNC_MIGRATE;
376 605
606 if (unevictable)
607 mode |= ISOLATE_UNEVICTABLE;
608
377 lruvec = mem_cgroup_page_lruvec(page, zone); 609 lruvec = mem_cgroup_page_lruvec(page, zone);
378 610
379 /* Try isolate the page */ 611 /* Try isolate the page */
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
383 VM_BUG_ON(PageTransCompound(page)); 615 VM_BUG_ON(PageTransCompound(page));
384 616
385 /* Successfully isolated */ 617 /* Successfully isolated */
618 cc->finished_update_migrate = true;
386 del_page_from_lru_list(page, lruvec, page_lru(page)); 619 del_page_from_lru_list(page, lruvec, page_lru(page));
387 list_add(&page->lru, migratelist); 620 list_add(&page->lru, migratelist);
388 cc->nr_migratepages++; 621 cc->nr_migratepages++;
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
393 ++low_pfn; 626 ++low_pfn;
394 break; 627 break;
395 } 628 }
629
630 continue;
631
632next_pageblock:
633 low_pfn += pageblock_nr_pages;
634 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
635 last_pageblock_nr = pageblock_nr;
396 } 636 }
397 637
398 acct_isolated(zone, locked, cc); 638 acct_isolated(zone, locked, cc);
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
400 if (locked) 640 if (locked)
401 spin_unlock_irqrestore(&zone->lru_lock, flags); 641 spin_unlock_irqrestore(&zone->lru_lock, flags);
402 642
643 /* Update the pageblock-skip if the whole pageblock was scanned */
644 if (low_pfn == end_pfn)
645 update_pageblock_skip(cc, valid_page, nr_isolated, true);
646
403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
404 648
405 return low_pfn; 649 return low_pfn;
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
407 651
408#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 652#endif /* CONFIG_COMPACTION || CONFIG_CMA */
409#ifdef CONFIG_COMPACTION 653#ifdef CONFIG_COMPACTION
410
411/* Returns true if the page is within a block suitable for migration to */
412static bool suitable_migration_target(struct page *page)
413{
414
415 int migratetype = get_pageblock_migratetype(page);
416
417 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
418 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
419 return false;
420
421 /* If the page is a large free page, then allow migration */
422 if (PageBuddy(page) && page_order(page) >= pageblock_order)
423 return true;
424
425 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
426 if (migrate_async_suitable(migratetype))
427 return true;
428
429 /* Otherwise skip the block */
430 return false;
431}
432
433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/* 654/*
448 * Based on information in the current compact_control, find blocks 655 * Based on information in the current compact_control, find blocks
449 * suitable for isolating free pages from and then isolate them. 656 * suitable for isolating free pages from and then isolate them.
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
453{ 660{
454 struct page *page; 661 struct page *page;
455 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 662 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
456 unsigned long flags;
457 int nr_freepages = cc->nr_freepages; 663 int nr_freepages = cc->nr_freepages;
458 struct list_head *freelist = &cc->freepages; 664 struct list_head *freelist = &cc->freepages;
459 665
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
501 if (!suitable_migration_target(page)) 707 if (!suitable_migration_target(page))
502 continue; 708 continue;
503 709
504 /* 710 /* If isolation recently failed, do not retry */
505 * Found a block suitable for isolating free pages from. Now 711 if (!isolation_suitable(cc, page))
506 * we disabled interrupts, double check things are ok and 712 continue;
507 * isolate the pages. This is to minimise the time IRQs
508 * are disabled
509 */
510 isolated = 0;
511 713
512 /* 714 /* Found a block suitable for isolating free pages from */
513 * The zone lock must be held to isolate freepages. This 715 isolated = 0;
514 * unfortunately this is a very coarse lock and can be 716 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
515 * heavily contended if there are parallel allocations 717 isolated = isolate_freepages_block(cc, pfn, end_pfn,
516 * or parallel compactions. For async compaction do not 718 freelist, false);
517 * spin on the lock 719 nr_freepages += isolated;
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
521 if (suitable_migration_target(page)) {
522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
523 isolated = isolate_freepages_block(pfn, end_pfn,
524 freelist, false);
525 nr_freepages += isolated;
526 }
527 spin_unlock_irqrestore(&zone->lock, flags);
528 720
529 /* 721 /*
530 * Record the highest PFN we isolated pages from. When next 722 * Record the highest PFN we isolated pages from. When next
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
532 * page migration may have returned some pages to the allocator 724 * page migration may have returned some pages to the allocator
533 */ 725 */
534 if (isolated) { 726 if (isolated) {
727 cc->finished_update_free = true;
535 high_pfn = max(high_pfn, pfn); 728 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 } 729 }
547 } 730 }
548 731
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
551 734
552 cc->free_pfn = high_pfn; 735 cc->free_pfn = high_pfn;
553 cc->nr_freepages = nr_freepages; 736 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
559} 737}
560 738
561/* 739/*
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
633 } 811 }
634 812
635 /* Perform the isolation */ 813 /* Perform the isolation */
636 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); 814 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
637 if (!low_pfn) 815 if (!low_pfn || cc->contended)
638 return ISOLATE_ABORT; 816 return ISOLATE_ABORT;
639 817
640 cc->migrate_pfn = low_pfn; 818 cc->migrate_pfn = low_pfn;
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
645static int compact_finished(struct zone *zone, 823static int compact_finished(struct zone *zone,
646 struct compact_control *cc) 824 struct compact_control *cc)
647{ 825{
648 unsigned int order;
649 unsigned long watermark; 826 unsigned long watermark;
650 827
651 if (fatal_signal_pending(current)) 828 if (fatal_signal_pending(current))
652 return COMPACT_PARTIAL; 829 return COMPACT_PARTIAL;
653 830
654 /* 831 /* Compaction run completes if the migrate and free scanner meet */
655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) { 832 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) { 833 /*
662 /* We started partway through; restart at the end. */ 834 * Mark that the PG_migrate_skip information should be cleared
663 unsigned long free_pfn = start_free_pfn(zone); 835 * by kswapd when it goes to sleep. kswapd does not set the
664 zone->compact_cached_free_pfn = free_pfn; 836 * flag itself as the decision to be clear should be directly
665 cc->free_pfn = free_pfn; 837 * based on an allocation request.
666 cc->wrapped = 1; 838 */
667 return COMPACT_CONTINUE; 839 if (!current_is_kswapd())
668 } 840 zone->compact_blockskip_flush = true;
669 return COMPACT_COMPLETE;
670 }
671 841
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
674 return COMPACT_COMPLETE; 842 return COMPACT_COMPLETE;
843 }
675 844
676 /* 845 /*
677 * order == -1 is expected when compacting via 846 * order == -1 is expected when compacting via
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
688 return COMPACT_CONTINUE; 857 return COMPACT_CONTINUE;
689 858
690 /* Direct compactor: Is a suitable page free? */ 859 /* Direct compactor: Is a suitable page free? */
691 for (order = cc->order; order < MAX_ORDER; order++) { 860 if (cc->page) {
692 /* Job done if page is free of the right migratetype */ 861 /* Was a suitable page captured? */
693 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) 862 if (*cc->page)
694 return COMPACT_PARTIAL;
695
696 /* Job done if allocation would set block type */
697 if (order >= pageblock_order && zone->free_area[order].nr_free)
698 return COMPACT_PARTIAL; 863 return COMPACT_PARTIAL;
864 } else {
865 unsigned int order;
866 for (order = cc->order; order < MAX_ORDER; order++) {
867 struct free_area *area = &zone->free_area[cc->order];
868 /* Job done if page is free of the right migratetype */
869 if (!list_empty(&area->free_list[cc->migratetype]))
870 return COMPACT_PARTIAL;
871
872 /* Job done if allocation would set block type */
873 if (cc->order >= pageblock_order && area->nr_free)
874 return COMPACT_PARTIAL;
875 }
699 } 876 }
700 877
701 return COMPACT_CONTINUE; 878 return COMPACT_CONTINUE;
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
754static int compact_zone(struct zone *zone, struct compact_control *cc) 931static int compact_zone(struct zone *zone, struct compact_control *cc)
755{ 932{
756 int ret; 933 int ret;
934 unsigned long start_pfn = zone->zone_start_pfn;
935 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
757 936
758 ret = compaction_suitable(zone, cc->order); 937 ret = compaction_suitable(zone, cc->order);
759 switch (ret) { 938 switch (ret) {
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
766 ; 945 ;
767 } 946 }
768 947
769 /* Setup to move all movable pages to the end of the zone */ 948 /*
770 cc->migrate_pfn = zone->zone_start_pfn; 949 * Setup to move all movable pages to the end of the zone. Used cached
771 950 * information on where the scanners should start but check that it
772 if (cc->order > 0) { 951 * is initialised by ensuring the values are within zone boundaries.
773 /* Incremental compaction. Start where the last one stopped. */ 952 */
774 cc->free_pfn = zone->compact_cached_free_pfn; 953 cc->migrate_pfn = zone->compact_cached_migrate_pfn;
775 cc->start_free_pfn = cc->free_pfn; 954 cc->free_pfn = zone->compact_cached_free_pfn;
776 } else { 955 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
777 /* Order == -1 starts at the end of the zone. */ 956 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
778 cc->free_pfn = start_free_pfn(zone); 957 zone->compact_cached_free_pfn = cc->free_pfn;
958 }
959 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
960 cc->migrate_pfn = start_pfn;
961 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
779 } 962 }
780 963
964 /*
965 * Clear pageblock skip if there were failures recently and compaction
966 * is about to be retried after being deferred. kswapd does not do
967 * this reset as it'll reset the cached information when going to sleep.
968 */
969 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
970 __reset_isolation_suitable(zone);
971
781 migrate_prep_local(); 972 migrate_prep_local();
782 973
783 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 974 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
787 switch (isolate_migratepages(zone, cc)) { 978 switch (isolate_migratepages(zone, cc)) {
788 case ISOLATE_ABORT: 979 case ISOLATE_ABORT:
789 ret = COMPACT_PARTIAL; 980 ret = COMPACT_PARTIAL;
981 putback_lru_pages(&cc->migratepages);
982 cc->nr_migratepages = 0;
790 goto out; 983 goto out;
791 case ISOLATE_NONE: 984 case ISOLATE_NONE:
792 continue; 985 continue;
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
817 goto out; 1010 goto out;
818 } 1011 }
819 } 1012 }
1013
1014 /* Capture a page now if it is a suitable size */
1015 compact_capture_page(cc);
820 } 1016 }
821 1017
822out: 1018out:
@@ -829,8 +1025,10 @@ out:
829 1025
830static unsigned long compact_zone_order(struct zone *zone, 1026static unsigned long compact_zone_order(struct zone *zone,
831 int order, gfp_t gfp_mask, 1027 int order, gfp_t gfp_mask,
832 bool sync, bool *contended) 1028 bool sync, bool *contended,
1029 struct page **page)
833{ 1030{
1031 unsigned long ret;
834 struct compact_control cc = { 1032 struct compact_control cc = {
835 .nr_freepages = 0, 1033 .nr_freepages = 0,
836 .nr_migratepages = 0, 1034 .nr_migratepages = 0,
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
838 .migratetype = allocflags_to_migratetype(gfp_mask), 1036 .migratetype = allocflags_to_migratetype(gfp_mask),
839 .zone = zone, 1037 .zone = zone,
840 .sync = sync, 1038 .sync = sync,
841 .contended = contended, 1039 .page = page,
842 }; 1040 };
843 INIT_LIST_HEAD(&cc.freepages); 1041 INIT_LIST_HEAD(&cc.freepages);
844 INIT_LIST_HEAD(&cc.migratepages); 1042 INIT_LIST_HEAD(&cc.migratepages);
845 1043
846 return compact_zone(zone, &cc); 1044 ret = compact_zone(zone, &cc);
1045
1046 VM_BUG_ON(!list_empty(&cc.freepages));
1047 VM_BUG_ON(!list_empty(&cc.migratepages));
1048
1049 *contended = cc.contended;
1050 return ret;
847} 1051}
848 1052
849int sysctl_extfrag_threshold = 500; 1053int sysctl_extfrag_threshold = 500;
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
855 * @gfp_mask: The GFP mask of the current allocation 1059 * @gfp_mask: The GFP mask of the current allocation
856 * @nodemask: The allowed nodes to allocate from 1060 * @nodemask: The allowed nodes to allocate from
857 * @sync: Whether migration is synchronous or not 1061 * @sync: Whether migration is synchronous or not
1062 * @contended: Return value that is true if compaction was aborted due to lock contention
1063 * @page: Optionally capture a free page of the requested order during compaction
858 * 1064 *
859 * This is the main entry point for direct page compaction. 1065 * This is the main entry point for direct page compaction.
860 */ 1066 */
861unsigned long try_to_compact_pages(struct zonelist *zonelist, 1067unsigned long try_to_compact_pages(struct zonelist *zonelist,
862 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1068 int order, gfp_t gfp_mask, nodemask_t *nodemask,
863 bool sync, bool *contended) 1069 bool sync, bool *contended, struct page **page)
864{ 1070{
865 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1071 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
866 int may_enter_fs = gfp_mask & __GFP_FS; 1072 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
868 struct zoneref *z; 1074 struct zoneref *z;
869 struct zone *zone; 1075 struct zone *zone;
870 int rc = COMPACT_SKIPPED; 1076 int rc = COMPACT_SKIPPED;
1077 int alloc_flags = 0;
871 1078
872 /* 1079 /* Check if the GFP flags allow compaction */
873 * Check whether it is worth even starting compaction. The order check is
874 * made because an assumption is made that the page allocator can satisfy
875 * the "cheaper" orders without taking special steps
876 */
877 if (!order || !may_enter_fs || !may_perform_io) 1080 if (!order || !may_enter_fs || !may_perform_io)
878 return rc; 1081 return rc;
879 1082
880 count_vm_event(COMPACTSTALL); 1083 count_vm_event(COMPACTSTALL);
881 1084
1085#ifdef CONFIG_CMA
1086 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1087 alloc_flags |= ALLOC_CMA;
1088#endif
882 /* Compact each zone in the list */ 1089 /* Compact each zone in the list */
883 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1090 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
884 nodemask) { 1091 nodemask) {
885 int status; 1092 int status;
886 1093
887 status = compact_zone_order(zone, order, gfp_mask, sync, 1094 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended); 1095 contended, page);
889 rc = max(status, rc); 1096 rc = max(status, rc);
890 1097
891 /* If a normal allocation would succeed, stop compacting */ 1098 /* If a normal allocation would succeed, stop compacting */
892 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 1099 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1100 alloc_flags))
893 break; 1101 break;
894 } 1102 }
895 1103
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
940 struct compact_control cc = { 1148 struct compact_control cc = {
941 .order = order, 1149 .order = order,
942 .sync = false, 1150 .sync = false,
1151 .page = NULL,
943 }; 1152 };
944 1153
945 return __compact_pgdat(pgdat, &cc); 1154 return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
950 struct compact_control cc = { 1159 struct compact_control cc = {
951 .order = -1, 1160 .order = -1,
952 .sync = true, 1161 .sync = true,
1162 .page = NULL,
953 }; 1163 };
954 1164
955 return __compact_pgdat(NODE_DATA(nid), &cc); 1165 return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf4..a47f0f50c89f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,7 @@
26 */ 26 */
27SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) 27SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
28{ 28{
29 struct file *file = fget(fd); 29 struct fd f = fdget(fd);
30 struct address_space *mapping; 30 struct address_space *mapping;
31 struct backing_dev_info *bdi; 31 struct backing_dev_info *bdi;
32 loff_t endbyte; /* inclusive */ 32 loff_t endbyte; /* inclusive */
@@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
35 unsigned long nrpages; 35 unsigned long nrpages;
36 int ret = 0; 36 int ret = 0;
37 37
38 if (!file) 38 if (!f.file)
39 return -EBADF; 39 return -EBADF;
40 40
41 if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { 41 if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
42 ret = -ESPIPE; 42 ret = -ESPIPE;
43 goto out; 43 goto out;
44 } 44 }
45 45
46 mapping = file->f_mapping; 46 mapping = f.file->f_mapping;
47 if (!mapping || len < 0) { 47 if (!mapping || len < 0) {
48 ret = -EINVAL; 48 ret = -EINVAL;
49 goto out; 49 goto out;
@@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
76 76
77 switch (advice) { 77 switch (advice) {
78 case POSIX_FADV_NORMAL: 78 case POSIX_FADV_NORMAL:
79 file->f_ra.ra_pages = bdi->ra_pages; 79 f.file->f_ra.ra_pages = bdi->ra_pages;
80 spin_lock(&file->f_lock); 80 spin_lock(&f.file->f_lock);
81 file->f_mode &= ~FMODE_RANDOM; 81 f.file->f_mode &= ~FMODE_RANDOM;
82 spin_unlock(&file->f_lock); 82 spin_unlock(&f.file->f_lock);
83 break; 83 break;
84 case POSIX_FADV_RANDOM: 84 case POSIX_FADV_RANDOM:
85 spin_lock(&file->f_lock); 85 spin_lock(&f.file->f_lock);
86 file->f_mode |= FMODE_RANDOM; 86 f.file->f_mode |= FMODE_RANDOM;
87 spin_unlock(&file->f_lock); 87 spin_unlock(&f.file->f_lock);
88 break; 88 break;
89 case POSIX_FADV_SEQUENTIAL: 89 case POSIX_FADV_SEQUENTIAL:
90 file->f_ra.ra_pages = bdi->ra_pages * 2; 90 f.file->f_ra.ra_pages = bdi->ra_pages * 2;
91 spin_lock(&file->f_lock); 91 spin_lock(&f.file->f_lock);
92 file->f_mode &= ~FMODE_RANDOM; 92 f.file->f_mode &= ~FMODE_RANDOM;
93 spin_unlock(&file->f_lock); 93 spin_unlock(&f.file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
@@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 * Ignore return value because fadvise() shall return 106 * Ignore return value because fadvise() shall return
107 * success even if filesystem can't retrieve a hint, 107 * success even if filesystem can't retrieve a hint,
108 */ 108 */
109 force_page_cache_readahead(mapping, file, start_index, 109 force_page_cache_readahead(mapping, f.file, start_index,
110 nrpages); 110 nrpages);
111 break; 111 break;
112 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
@@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
128 ret = -EINVAL; 128 ret = -EINVAL;
129 } 129 }
130out: 130out:
131 fput(file); 131 fdput(f);
132 return ret; 132 return ret;
133} 133}
134#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 134#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1607 * Do we have something in the page cache already? 1607 * Do we have something in the page cache already?
1608 */ 1608 */
1609 page = find_get_page(mapping, offset); 1609 page = find_get_page(mapping, offset);
1610 if (likely(page)) { 1610 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
1611 /* 1611 /*
1612 * We found the page, so try async readahead before 1612 * We found the page, so try async readahead before
1613 * waiting for the lock. 1613 * waiting for the lock.
1614 */ 1614 */
1615 do_async_mmap_readahead(vma, ra, file, page, offset); 1615 do_async_mmap_readahead(vma, ra, file, page, offset);
1616 } else { 1616 } else if (!page) {
1617 /* No page in the page cache at all */ 1617 /* No page in the page cache at all */
1618 do_sync_mmap_readahead(vma, ra, file, offset); 1618 do_sync_mmap_readahead(vma, ra, file, offset);
1619 count_vm_event(PGMAJFAULT); 1619 count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
1737const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1738 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite, 1739 .page_mkwrite = filemap_page_mkwrite,
1740 .remap_pages = generic_file_remap_pages,
1740}; 1741};
1741 1742
1742/* This is used for a general mmap of a disk file */ 1743/* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1749 return -ENOEXEC; 1750 return -ENOEXEC;
1750 file_accessed(file); 1751 file_accessed(file);
1751 vma->vm_ops = &generic_file_vm_ops; 1752 vma->vm_ops = &generic_file_vm_ops;
1752 vma->vm_flags |= VM_CAN_NONLINEAR;
1753 return 0; 1753 return 0;
1754} 1754}
1755 1755
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
167{ 167{
168 struct vm_area_struct *vma; 168 struct vm_area_struct *vma;
169 struct mm_struct *mm; 169 struct mm_struct *mm;
170 struct prio_tree_iter iter;
171 unsigned long address; 170 unsigned long address;
172 pte_t *pte; 171 pte_t *pte;
173 pte_t pteval; 172 pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
184 183
185retry: 184retry:
186 mutex_lock(&mapping->i_mmap_mutex); 185 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 186 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 187 mm = vma->vm_mm;
189 address = vma->vm_start + 188 address = vma->vm_start +
190 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
193 if (pte) { 192 if (pte) {
194 /* Nuke the page table entry. */ 193 /* Nuke the page table entry. */
195 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
196 pteval = ptep_clear_flush_notify(vma, address, pte); 195 pteval = ptep_clear_flush(vma, address, pte);
197 page_remove_rmap(page); 196 page_remove_rmap(page);
198 dec_mm_counter(mm, MM_FILEPAGES); 197 dec_mm_counter(mm, MM_FILEPAGES);
199 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
200 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200 /* must invalidate_page _before_ freeing the page */
201 mmu_notifier_invalidate_page(mm, address);
201 page_cache_release(page); 202 page_cache_release(page);
202 } 203 }
203 } 204 }
@@ -305,6 +306,7 @@ out:
305static const struct vm_operations_struct xip_file_vm_ops = { 306static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 307 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite, 308 .page_mkwrite = filemap_page_mkwrite,
309 .remap_pages = generic_file_remap_pages,
308}; 310};
309 311
310int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 312int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
313 315
314 file_accessed(file); 316 file_accessed(file);
315 vma->vm_ops = &xip_file_vm_ops; 317 vma->vm_ops = &xip_file_vm_ops;
316 vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; 318 vma->vm_flags |= VM_MIXEDMAP;
317 return 0; 319 return 0;
318} 320}
319EXPORT_SYMBOL_GPL(xip_file_mmap); 321EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd432467..a0aaf0e56800 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003 6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */ 7 */
8#include <linux/export.h>
8#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/swap.h> 11#include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
80 return err; 81 return err;
81} 82}
82 83
83static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, 84int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
84 unsigned long addr, unsigned long size, pgoff_t pgoff) 85 unsigned long size, pgoff_t pgoff)
85{ 86{
87 struct mm_struct *mm = vma->vm_mm;
86 int err; 88 int err;
87 89
88 do { 90 do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
95 pgoff++; 97 pgoff++;
96 } while (size); 98 } while (size);
97 99
98 return 0; 100 return 0;
99
100} 101}
102EXPORT_SYMBOL(generic_file_remap_pages);
101 103
102/** 104/**
103 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma 105 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
167 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) 169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
168 goto out; 170 goto out;
169 171
170 if (!(vma->vm_flags & VM_CAN_NONLINEAR)) 172 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
171 goto out; 173 goto out;
172 174
173 if (start < vma->vm_start || start + size > vma->vm_end) 175 if (start < vma->vm_start || start + size > vma->vm_end)
@@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
195 */ 197 */
196 if (mapping_cap_account_dirty(mapping)) { 198 if (mapping_cap_account_dirty(mapping)) {
197 unsigned long addr; 199 unsigned long addr;
198 struct file *file = vma->vm_file; 200 struct file *file = get_file(vma->vm_file);
199 201
200 flags &= MAP_NONBLOCK; 202 flags &= MAP_NONBLOCK;
201 get_file(file);
202 addr = mmap_region(file, start, size, 203 addr = mmap_region(file, start, size,
203 flags, vma->vm_flags, pgoff); 204 flags, vma->vm_flags, pgoff);
204 fput(file); 205 fput(file);
@@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
213 mutex_lock(&mapping->i_mmap_mutex); 214 mutex_lock(&mapping->i_mmap_mutex);
214 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
215 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
216 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_interval_tree_remove(vma, &mapping->i_mmap);
217 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
218 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
219 mutex_unlock(&mapping->i_mmap_mutex); 220 mutex_unlock(&mapping->i_mmap_mutex);
@@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
229 } 230 }
230 231
231 mmu_notifier_invalidate_range_start(mm, start, start + size); 232 mmu_notifier_invalidate_range_start(mm, start, start + size);
232 err = populate_range(mm, vma, start, size, pgoff); 233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
233 mmu_notifier_invalidate_range_end(mm, start, start + size); 234 mmu_notifier_invalidate_range_end(mm, start, start + size);
234 if (!err && !(flags & MAP_NONBLOCK)) { 235 if (!err && !(flags & MAP_NONBLOCK)) {
235 if (vma->vm_flags & VM_LOCKED) { 236 if (vma->vm_flags & VM_LOCKED) {
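
The fremap hunks above replace the VM_CAN_NONLINEAR flag test with a check for a ->remap_pages method and then dispatch through that method. A minimal user-space C sketch of the same dispatch pattern; every name here is an illustrative stand-in, not the kernel's structures:

#include <stdio.h>
#include <errno.h>

struct vma;                                     /* hypothetical stand-ins */
struct vm_ops {
        int (*remap_pages)(struct vma *vma, unsigned long addr,
                           unsigned long size, unsigned long pgoff);
};
struct vma {
        const struct vm_ops *vm_ops;
};

static int demo_remap_pages(struct vma *vma, unsigned long addr,
                            unsigned long size, unsigned long pgoff)
{
        printf("remap addr=%#lx size=%#lx pgoff=%lu\n", addr, size, pgoff);
        return 0;
}

/* Mirrors the new capability test: no flag bit, just "does the op exist?". */
static int do_remap(struct vma *vma, unsigned long addr,
                    unsigned long size, unsigned long pgoff)
{
        if (!vma->vm_ops || !vma->vm_ops->remap_pages)
                return -EINVAL;                 /* replaces the VM_CAN_NONLINEAR check */
        return vma->vm_ops->remap_pages(vma, addr, size, pgoff);
}

int main(void)
{
        const struct vm_ops ops = { .remap_pages = demo_remap_pages };
        struct vma supported = { .vm_ops = &ops };
        struct vma plain = { .vm_ops = NULL };

        printf("supported: %d\n", do_remap(&supported, 0x1000, 0x2000, 3));
        printf("plain:     %d\n", do_remap(&plain, 0x1000, 0x2000, 3));
        return 0;
}
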
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6b3e71a2cd48..2890e67d6026 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled);
44 */ 44 */
45static bool frontswap_writethrough_enabled __read_mostly; 45static bool frontswap_writethrough_enabled __read_mostly;
46 46
47/*
48 * If enabled, the underlying tmem implementation is capable of doing
49 * exclusive gets, so frontswap_load, on a successful tmem_get must
50 * mark the page as no longer in frontswap AND mark it dirty.
51 */
52static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
53
47#ifdef CONFIG_DEBUG_FS 54#ifdef CONFIG_DEBUG_FS
48/* 55/*
49 * Counters available via /sys/kernel/debug/frontswap (if debugfs is 56 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
@@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable)
97EXPORT_SYMBOL(frontswap_writethrough); 104EXPORT_SYMBOL(frontswap_writethrough);
98 105
99/* 106/*
107 * Enable/disable frontswap exclusive gets (see above).
108 */
109void frontswap_tmem_exclusive_gets(bool enable)
110{
111 frontswap_tmem_exclusive_gets_enabled = enable;
112}
113EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
114
115/*
100 * Called when a swap device is swapon'd. 116 * Called when a swap device is swapon'd.
101 */ 117 */
102void __frontswap_init(unsigned type) 118void __frontswap_init(unsigned type)
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page)
174 BUG_ON(sis == NULL); 190 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset)) 191 if (frontswap_test(sis, offset))
176 ret = frontswap_ops.load(type, offset, page); 192 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0) 193 if (ret == 0) {
178 inc_frontswap_loads(); 194 inc_frontswap_loads();
195 if (frontswap_tmem_exclusive_gets_enabled) {
196 SetPageDirty(page);
197 frontswap_clear(sis, offset);
198 }
199 }
179 return ret; 200 return ret;
180} 201}
181EXPORT_SYMBOL(__frontswap_load); 202EXPORT_SYMBOL(__frontswap_load);
@@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
263 return ret; 284 return ret;
264} 285}
265 286
287/*
 288 * Used to check if it's necessary and feasible to unuse pages.
 289 * Return 1 when nothing to do, 0 when we need to shrink pages,
290 * error code when there is an error.
291 */
266static int __frontswap_shrink(unsigned long target_pages, 292static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse, 293 unsigned long *pages_to_unuse,
268 int *type) 294 int *type)
@@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages,
275 if (total_pages <= target_pages) { 301 if (total_pages <= target_pages) {
276 /* Nothing to do */ 302 /* Nothing to do */
277 *pages_to_unuse = 0; 303 *pages_to_unuse = 0;
278 return 0; 304 return 1;
279 } 305 }
280 total_pages_to_unuse = total_pages - target_pages; 306 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); 307 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
@@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages,
292void frontswap_shrink(unsigned long target_pages) 318void frontswap_shrink(unsigned long target_pages)
293{ 319{
294 unsigned long pages_to_unuse = 0; 320 unsigned long pages_to_unuse = 0;
295 int type, ret; 321 int uninitialized_var(type), ret;
296 322
297 /* 323 /*
298 * we don't want to hold swap_lock while doing a very 324 * we don't want to hold swap_lock while doing a very
@@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages)
302 spin_lock(&swap_lock); 328 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); 329 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
304 spin_unlock(&swap_lock); 330 spin_unlock(&swap_lock);
305 if (ret == 0 && pages_to_unuse) 331 if (ret == 0)
306 try_to_unuse(type, true, pages_to_unuse); 332 try_to_unuse(type, true, pages_to_unuse);
307 return; 333 return;
308} 334}
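
The frontswap hunks above add "exclusive gets": on a successful load the page is marked dirty and the slot is cleared from frontswap, so the read consumes the backend copy. A small user-space sketch of that load path, under the simplifying assumption that the backing store is a flat array of present/dirty flags:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 8

static bool exclusive_gets_enabled = true;      /* frontswap_tmem_exclusive_gets(true) */
static bool in_frontswap[NPAGES];
static bool page_dirty[NPAGES];

static int demo_load(unsigned long offset)
{
        if (!in_frontswap[offset])
                return -1;                      /* miss: fall back to the real swap device */
        /* successful "tmem_get" */
        if (exclusive_gets_enabled) {
                page_dirty[offset] = true;      /* SetPageDirty(): the data now lives only
                                                 * in the page and must be written back
                                                 * before the page can be reclaimed */
                in_frontswap[offset] = false;   /* frontswap_clear(): slot is consumed */
        }
        return 0;
}

int main(void)
{
        in_frontswap[3] = true;
        printf("first load : %d\n", demo_load(3));   /* hit, slot is consumed */
        printf("second load: %d\n", demo_load(3));   /* miss, the get was exclusive */
        printf("dirty      : %d\n", page_dirty[3]);
        return 0;
}
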
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..40f17c34b415 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
17#include <linux/khugepaged.h> 17#include <linux/khugepaged.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/mman.h> 19#include <linux/mman.h>
20#include <linux/pagemap.h>
20#include <asm/tlb.h> 21#include <asm/tlb.h>
21#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
22#include "internal.h" 23#include "internal.h"
@@ -102,10 +103,7 @@ static int set_recommended_min_free_kbytes(void)
102 unsigned long recommended_min; 103 unsigned long recommended_min;
103 extern int min_free_kbytes; 104 extern int min_free_kbytes;
104 105
105 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, 106 if (!khugepaged_enabled())
106 &transparent_hugepage_flags) &&
107 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
108 &transparent_hugepage_flags))
109 return 0; 107 return 0;
110 108
111 for_each_populated_zone(zone) 109 for_each_populated_zone(zone)
@@ -139,12 +137,6 @@ static int start_khugepaged(void)
139{ 137{
140 int err = 0; 138 int err = 0;
141 if (khugepaged_enabled()) { 139 if (khugepaged_enabled()) {
142 int wakeup;
143 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
144 err = -ENOMEM;
145 goto out;
146 }
147 mutex_lock(&khugepaged_mutex);
148 if (!khugepaged_thread) 140 if (!khugepaged_thread)
149 khugepaged_thread = kthread_run(khugepaged, NULL, 141 khugepaged_thread = kthread_run(khugepaged, NULL,
150 "khugepaged"); 142 "khugepaged");
@@ -154,16 +146,16 @@ static int start_khugepaged(void)
154 err = PTR_ERR(khugepaged_thread); 146 err = PTR_ERR(khugepaged_thread);
155 khugepaged_thread = NULL; 147 khugepaged_thread = NULL;
156 } 148 }
157 wakeup = !list_empty(&khugepaged_scan.mm_head); 149
158 mutex_unlock(&khugepaged_mutex); 150 if (!list_empty(&khugepaged_scan.mm_head))
159 if (wakeup)
160 wake_up_interruptible(&khugepaged_wait); 151 wake_up_interruptible(&khugepaged_wait);
161 152
162 set_recommended_min_free_kbytes(); 153 set_recommended_min_free_kbytes();
163 } else 154 } else if (khugepaged_thread) {
164 /* wakeup to exit */ 155 kthread_stop(khugepaged_thread);
165 wake_up_interruptible(&khugepaged_wait); 156 khugepaged_thread = NULL;
166out: 157 }
158
167 return err; 159 return err;
168} 160}
169 161
@@ -224,18 +216,16 @@ static ssize_t enabled_store(struct kobject *kobj,
224 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 216 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
225 217
226 if (ret > 0) { 218 if (ret > 0) {
227 int err = start_khugepaged(); 219 int err;
220
221 mutex_lock(&khugepaged_mutex);
222 err = start_khugepaged();
223 mutex_unlock(&khugepaged_mutex);
224
228 if (err) 225 if (err)
229 ret = err; 226 ret = err;
230 } 227 }
231 228
232 if (ret > 0 &&
233 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
234 &transparent_hugepage_flags) ||
235 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
236 &transparent_hugepage_flags)))
237 set_recommended_min_free_kbytes();
238
239 return ret; 229 return ret;
240} 230}
241static struct kobj_attribute enabled_attr = 231static struct kobj_attribute enabled_attr =
@@ -570,8 +560,6 @@ static int __init hugepage_init(void)
570 560
571 start_khugepaged(); 561 start_khugepaged();
572 562
573 set_recommended_min_free_kbytes();
574
575 return 0; 563 return 0;
576out: 564out:
577 hugepage_exit_sysfs(hugepage_kobj); 565 hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +599,6 @@ out:
611} 599}
612__setup("transparent_hugepage=", setup_transparent_hugepage); 600__setup("transparent_hugepage=", setup_transparent_hugepage);
613 601
614static void prepare_pmd_huge_pte(pgtable_t pgtable,
615 struct mm_struct *mm)
616{
617 assert_spin_locked(&mm->page_table_lock);
618
619 /* FIFO */
620 if (!mm->pmd_huge_pte)
621 INIT_LIST_HEAD(&pgtable->lru);
622 else
623 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
624 mm->pmd_huge_pte = pgtable;
625}
626
627static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 602static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
628{ 603{
629 if (likely(vma->vm_flags & VM_WRITE)) 604 if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +640,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
665 */ 640 */
666 page_add_new_anon_rmap(page, vma, haddr); 641 page_add_new_anon_rmap(page, vma, haddr);
667 set_pmd_at(mm, haddr, pmd, entry); 642 set_pmd_at(mm, haddr, pmd, entry);
668 prepare_pmd_huge_pte(pgtable, mm); 643 pgtable_trans_huge_deposit(mm, pgtable);
669 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 644 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
670 mm->nr_ptes++; 645 mm->nr_ptes++;
671 spin_unlock(&mm->page_table_lock); 646 spin_unlock(&mm->page_table_lock);
@@ -791,7 +766,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
791 pmdp_set_wrprotect(src_mm, addr, src_pmd); 766 pmdp_set_wrprotect(src_mm, addr, src_pmd);
792 pmd = pmd_mkold(pmd_wrprotect(pmd)); 767 pmd = pmd_mkold(pmd_wrprotect(pmd));
793 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 768 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
794 prepare_pmd_huge_pte(pgtable, dst_mm); 769 pgtable_trans_huge_deposit(dst_mm, pgtable);
795 dst_mm->nr_ptes++; 770 dst_mm->nr_ptes++;
796 771
797 ret = 0; 772 ret = 0;
@@ -802,25 +777,6 @@ out:
802 return ret; 777 return ret;
803} 778}
804 779
805/* no "address" argument so destroys page coloring of some arch */
806pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
807{
808 pgtable_t pgtable;
809
810 assert_spin_locked(&mm->page_table_lock);
811
812 /* FIFO */
813 pgtable = mm->pmd_huge_pte;
814 if (list_empty(&pgtable->lru))
815 mm->pmd_huge_pte = NULL;
816 else {
817 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
818 struct page, lru);
819 list_del(&pgtable->lru);
820 }
821 return pgtable;
822}
823
824static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
825 struct vm_area_struct *vma, 781 struct vm_area_struct *vma,
826 unsigned long address, 782 unsigned long address,
@@ -832,6 +788,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
832 pmd_t _pmd; 788 pmd_t _pmd;
833 int ret = 0, i; 789 int ret = 0, i;
834 struct page **pages; 790 struct page **pages;
791 unsigned long mmun_start; /* For mmu_notifiers */
792 unsigned long mmun_end; /* For mmu_notifiers */
835 793
836 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
837 GFP_KERNEL); 795 GFP_KERNEL);
@@ -868,15 +826,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
868 cond_resched(); 826 cond_resched();
869 } 827 }
870 828
829 mmun_start = haddr;
830 mmun_end = haddr + HPAGE_PMD_SIZE;
831 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
832
871 spin_lock(&mm->page_table_lock); 833 spin_lock(&mm->page_table_lock);
872 if (unlikely(!pmd_same(*pmd, orig_pmd))) 834 if (unlikely(!pmd_same(*pmd, orig_pmd)))
873 goto out_free_pages; 835 goto out_free_pages;
874 VM_BUG_ON(!PageHead(page)); 836 VM_BUG_ON(!PageHead(page));
875 837
876 pmdp_clear_flush_notify(vma, haddr, pmd); 838 pmdp_clear_flush(vma, haddr, pmd);
877 /* leave pmd empty until pte is filled */ 839 /* leave pmd empty until pte is filled */
878 840
879 pgtable = get_pmd_huge_pte(mm); 841 pgtable = pgtable_trans_huge_withdraw(mm);
880 pmd_populate(mm, &_pmd, pgtable); 842 pmd_populate(mm, &_pmd, pgtable);
881 843
882 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 844 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +858,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
896 page_remove_rmap(page); 858 page_remove_rmap(page);
897 spin_unlock(&mm->page_table_lock); 859 spin_unlock(&mm->page_table_lock);
898 860
861 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
862
899 ret |= VM_FAULT_WRITE; 863 ret |= VM_FAULT_WRITE;
900 put_page(page); 864 put_page(page);
901 865
@@ -904,6 +868,7 @@ out:
904 868
905out_free_pages: 869out_free_pages:
906 spin_unlock(&mm->page_table_lock); 870 spin_unlock(&mm->page_table_lock);
871 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
907 mem_cgroup_uncharge_start(); 872 mem_cgroup_uncharge_start();
908 for (i = 0; i < HPAGE_PMD_NR; i++) { 873 for (i = 0; i < HPAGE_PMD_NR; i++) {
909 mem_cgroup_uncharge_page(pages[i]); 874 mem_cgroup_uncharge_page(pages[i]);
@@ -920,6 +885,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
920 int ret = 0; 885 int ret = 0;
921 struct page *page, *new_page; 886 struct page *page, *new_page;
922 unsigned long haddr; 887 unsigned long haddr;
888 unsigned long mmun_start; /* For mmu_notifiers */
889 unsigned long mmun_end; /* For mmu_notifiers */
923 890
924 VM_BUG_ON(!vma->anon_vma); 891 VM_BUG_ON(!vma->anon_vma);
925 spin_lock(&mm->page_table_lock); 892 spin_lock(&mm->page_table_lock);
@@ -934,7 +901,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
934 entry = pmd_mkyoung(orig_pmd); 901 entry = pmd_mkyoung(orig_pmd);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 902 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 903 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
937 update_mmu_cache(vma, address, entry); 904 update_mmu_cache_pmd(vma, address, pmd);
938 ret |= VM_FAULT_WRITE; 905 ret |= VM_FAULT_WRITE;
939 goto out_unlock; 906 goto out_unlock;
940 } 907 }
@@ -970,38 +937,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
970 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 937 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
971 __SetPageUptodate(new_page); 938 __SetPageUptodate(new_page);
972 939
940 mmun_start = haddr;
941 mmun_end = haddr + HPAGE_PMD_SIZE;
942 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
943
973 spin_lock(&mm->page_table_lock); 944 spin_lock(&mm->page_table_lock);
974 put_page(page); 945 put_page(page);
975 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 946 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock); 947 spin_unlock(&mm->page_table_lock);
977 mem_cgroup_uncharge_page(new_page); 948 mem_cgroup_uncharge_page(new_page);
978 put_page(new_page); 949 put_page(new_page);
979 goto out; 950 goto out_mn;
980 } else { 951 } else {
981 pmd_t entry; 952 pmd_t entry;
982 VM_BUG_ON(!PageHead(page)); 953 VM_BUG_ON(!PageHead(page));
983 entry = mk_pmd(new_page, vma->vm_page_prot); 954 entry = mk_pmd(new_page, vma->vm_page_prot);
984 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
985 entry = pmd_mkhuge(entry); 956 entry = pmd_mkhuge(entry);
986 pmdp_clear_flush_notify(vma, haddr, pmd); 957 pmdp_clear_flush(vma, haddr, pmd);
987 page_add_new_anon_rmap(new_page, vma, haddr); 958 page_add_new_anon_rmap(new_page, vma, haddr);
988 set_pmd_at(mm, haddr, pmd, entry); 959 set_pmd_at(mm, haddr, pmd, entry);
989 update_mmu_cache(vma, address, entry); 960 update_mmu_cache_pmd(vma, address, pmd);
990 page_remove_rmap(page); 961 page_remove_rmap(page);
991 put_page(page); 962 put_page(page);
992 ret |= VM_FAULT_WRITE; 963 ret |= VM_FAULT_WRITE;
993 } 964 }
994out_unlock:
995 spin_unlock(&mm->page_table_lock); 965 spin_unlock(&mm->page_table_lock);
966out_mn:
967 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
996out: 968out:
997 return ret; 969 return ret;
970out_unlock:
971 spin_unlock(&mm->page_table_lock);
972 return ret;
998} 973}
999 974
1000struct page *follow_trans_huge_pmd(struct mm_struct *mm, 975struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1001 unsigned long addr, 976 unsigned long addr,
1002 pmd_t *pmd, 977 pmd_t *pmd,
1003 unsigned int flags) 978 unsigned int flags)
1004{ 979{
980 struct mm_struct *mm = vma->vm_mm;
1005 struct page *page = NULL; 981 struct page *page = NULL;
1006 982
1007 assert_spin_locked(&mm->page_table_lock); 983 assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +1000,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
1024 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1000 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1025 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1001 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1026 } 1002 }
1003 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1004 if (page->mapping && trylock_page(page)) {
1005 lru_add_drain();
1006 if (page->mapping)
1007 mlock_vma_page(page);
1008 unlock_page(page);
1009 }
1010 }
1027 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1011 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1028 VM_BUG_ON(!PageCompound(page)); 1012 VM_BUG_ON(!PageCompound(page));
1029 if (flags & FOLL_GET) 1013 if (flags & FOLL_GET)
@@ -1041,9 +1025,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1041 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1025 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1042 struct page *page; 1026 struct page *page;
1043 pgtable_t pgtable; 1027 pgtable_t pgtable;
1044 pgtable = get_pmd_huge_pte(tlb->mm); 1028 pmd_t orig_pmd;
1045 page = pmd_page(*pmd); 1029 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1046 pmd_clear(pmd); 1030 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1031 page = pmd_page(orig_pmd);
1047 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1032 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1048 page_remove_rmap(page); 1033 page_remove_rmap(page);
1049 VM_BUG_ON(page_mapcount(page) < 0); 1034 VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1192,11 @@ static int __split_huge_page_splitting(struct page *page,
1207 struct mm_struct *mm = vma->vm_mm; 1192 struct mm_struct *mm = vma->vm_mm;
1208 pmd_t *pmd; 1193 pmd_t *pmd;
1209 int ret = 0; 1194 int ret = 0;
1195 /* For mmu_notifiers */
1196 const unsigned long mmun_start = address;
1197 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1210 1198
1199 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1211 spin_lock(&mm->page_table_lock); 1200 spin_lock(&mm->page_table_lock);
1212 pmd = page_check_address_pmd(page, mm, address, 1201 pmd = page_check_address_pmd(page, mm, address,
1213 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1202 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1208,11 @@ static int __split_huge_page_splitting(struct page *page,
1219 * and it won't wait on the anon_vma->root->mutex to 1208 * and it won't wait on the anon_vma->root->mutex to
1220 * serialize against split_huge_page*. 1209 * serialize against split_huge_page*.
1221 */ 1210 */
1222 pmdp_splitting_flush_notify(vma, address, pmd); 1211 pmdp_splitting_flush(vma, address, pmd);
1223 ret = 1; 1212 ret = 1;
1224 } 1213 }
1225 spin_unlock(&mm->page_table_lock); 1214 spin_unlock(&mm->page_table_lock);
1215 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1226 1216
1227 return ret; 1217 return ret;
1228} 1218}
@@ -1358,11 +1348,11 @@ static int __split_huge_page_map(struct page *page,
1358 pmd = page_check_address_pmd(page, mm, address, 1348 pmd = page_check_address_pmd(page, mm, address,
1359 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1349 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1360 if (pmd) { 1350 if (pmd) {
1361 pgtable = get_pmd_huge_pte(mm); 1351 pgtable = pgtable_trans_huge_withdraw(mm);
1362 pmd_populate(mm, &_pmd, pgtable); 1352 pmd_populate(mm, &_pmd, pgtable);
1363 1353
1364 for (i = 0, haddr = address; i < HPAGE_PMD_NR; 1354 haddr = address;
1365 i++, haddr += PAGE_SIZE) { 1355 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1366 pte_t *pte, entry; 1356 pte_t *pte, entry;
1367 BUG_ON(PageCompound(page+i)); 1357 BUG_ON(PageCompound(page+i));
1368 entry = mk_pte(page + i, vma->vm_page_prot); 1358 entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1396,7 @@ static int __split_huge_page_map(struct page *page,
1406 * SMP TLB and finally we write the non-huge version 1396 * SMP TLB and finally we write the non-huge version
1407 * of the pmd entry with pmd_populate. 1397 * of the pmd entry with pmd_populate.
1408 */ 1398 */
1409 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); 1399 pmdp_invalidate(vma, address, pmd);
1410 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1411 pmd_populate(mm, pmd, pgtable); 1400 pmd_populate(mm, pmd, pgtable);
1412 ret = 1; 1401 ret = 1;
1413 } 1402 }
@@ -1421,18 +1410,17 @@ static void __split_huge_page(struct page *page,
1421 struct anon_vma *anon_vma) 1410 struct anon_vma *anon_vma)
1422{ 1411{
1423 int mapcount, mapcount2; 1412 int mapcount, mapcount2;
1413 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1424 struct anon_vma_chain *avc; 1414 struct anon_vma_chain *avc;
1425 1415
1426 BUG_ON(!PageHead(page)); 1416 BUG_ON(!PageHead(page));
1427 BUG_ON(PageTail(page)); 1417 BUG_ON(PageTail(page));
1428 1418
1429 mapcount = 0; 1419 mapcount = 0;
1430 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1420 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1431 struct vm_area_struct *vma = avc->vma; 1421 struct vm_area_struct *vma = avc->vma;
1432 unsigned long addr = vma_address(page, vma); 1422 unsigned long addr = vma_address(page, vma);
1433 BUG_ON(is_vma_temporary_stack(vma)); 1423 BUG_ON(is_vma_temporary_stack(vma));
1434 if (addr == -EFAULT)
1435 continue;
1436 mapcount += __split_huge_page_splitting(page, vma, addr); 1424 mapcount += __split_huge_page_splitting(page, vma, addr);
1437 } 1425 }
1438 /* 1426 /*
@@ -1453,12 +1441,10 @@ static void __split_huge_page(struct page *page,
1453 __split_huge_page_refcount(page); 1441 __split_huge_page_refcount(page);
1454 1442
1455 mapcount2 = 0; 1443 mapcount2 = 0;
1456 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1444 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1457 struct vm_area_struct *vma = avc->vma; 1445 struct vm_area_struct *vma = avc->vma;
1458 unsigned long addr = vma_address(page, vma); 1446 unsigned long addr = vma_address(page, vma);
1459 BUG_ON(is_vma_temporary_stack(vma)); 1447 BUG_ON(is_vma_temporary_stack(vma));
1460 if (addr == -EFAULT)
1461 continue;
1462 mapcount2 += __split_huge_page_map(page, vma, addr); 1448 mapcount2 += __split_huge_page_map(page, vma, addr);
1463 } 1449 }
1464 if (mapcount != mapcount2) 1450 if (mapcount != mapcount2)
@@ -1491,12 +1477,13 @@ out:
1491 return ret; 1477 return ret;
1492} 1478}
1493 1479
1494#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ 1480#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1495 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1496 1481
1497int hugepage_madvise(struct vm_area_struct *vma, 1482int hugepage_madvise(struct vm_area_struct *vma,
1498 unsigned long *vm_flags, int advice) 1483 unsigned long *vm_flags, int advice)
1499{ 1484{
1485 struct mm_struct *mm = vma->vm_mm;
1486
1500 switch (advice) { 1487 switch (advice) {
1501 case MADV_HUGEPAGE: 1488 case MADV_HUGEPAGE:
1502 /* 1489 /*
@@ -1504,6 +1491,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
1504 */ 1491 */
1505 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1492 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1506 return -EINVAL; 1493 return -EINVAL;
1494 if (mm->def_flags & VM_NOHUGEPAGE)
1495 return -EINVAL;
1507 *vm_flags &= ~VM_NOHUGEPAGE; 1496 *vm_flags &= ~VM_NOHUGEPAGE;
1508 *vm_flags |= VM_HUGEPAGE; 1497 *vm_flags |= VM_HUGEPAGE;
1509 /* 1498 /*
@@ -1655,11 +1644,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1655 if (vma->vm_ops) 1644 if (vma->vm_ops)
1656 /* khugepaged not yet working on file or special mappings */ 1645 /* khugepaged not yet working on file or special mappings */
1657 return 0; 1646 return 0;
1658 /* 1647 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1659 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1660 * true too, verify it here.
1661 */
1662 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1663 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1648 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1664 hend = vma->vm_end & HPAGE_PMD_MASK; 1649 hend = vma->vm_end & HPAGE_PMD_MASK;
1665 if (hstart < hend) 1650 if (hstart < hend)
@@ -1833,28 +1818,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1833 } 1818 }
1834} 1819}
1835 1820
1836static void collapse_huge_page(struct mm_struct *mm, 1821static void khugepaged_alloc_sleep(void)
1837 unsigned long address,
1838 struct page **hpage,
1839 struct vm_area_struct *vma,
1840 int node)
1841{ 1822{
1842 pgd_t *pgd; 1823 wait_event_freezable_timeout(khugepaged_wait, false,
1843 pud_t *pud; 1824 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
1844 pmd_t *pmd, _pmd; 1825}
1845 pte_t *pte;
1846 pgtable_t pgtable;
1847 struct page *new_page;
1848 spinlock_t *ptl;
1849 int isolated;
1850 unsigned long hstart, hend;
1851 1826
1852 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1827#ifdef CONFIG_NUMA
1853#ifndef CONFIG_NUMA 1828static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1854 up_read(&mm->mmap_sem); 1829{
1855 VM_BUG_ON(!*hpage); 1830 if (IS_ERR(*hpage)) {
1856 new_page = *hpage; 1831 if (!*wait)
1857#else 1832 return false;
1833
1834 *wait = false;
1835 *hpage = NULL;
1836 khugepaged_alloc_sleep();
1837 } else if (*hpage) {
1838 put_page(*hpage);
1839 *hpage = NULL;
1840 }
1841
1842 return true;
1843}
1844
1845static struct page
1846*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1847 struct vm_area_struct *vma, unsigned long address,
1848 int node)
1849{
1858 VM_BUG_ON(*hpage); 1850 VM_BUG_ON(*hpage);
1859 /* 1851 /*
1860 * Allocate the page while the vma is still valid and under 1852 * Allocate the page while the vma is still valid and under
@@ -1866,7 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1866 * mmap_sem in read mode is good idea also to allow greater 1858 * mmap_sem in read mode is good idea also to allow greater
1867 * scalability. 1859 * scalability.
1868 */ 1860 */
1869 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1861 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1870 node, __GFP_OTHER_NODE); 1862 node, __GFP_OTHER_NODE);
1871 1863
1872 /* 1864 /*
@@ -1874,20 +1866,85 @@ static void collapse_huge_page(struct mm_struct *mm,
1874 * preparation for taking it in write mode. 1866 * preparation for taking it in write mode.
1875 */ 1867 */
1876 up_read(&mm->mmap_sem); 1868 up_read(&mm->mmap_sem);
1877 if (unlikely(!new_page)) { 1869 if (unlikely(!*hpage)) {
1878 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1870 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1879 *hpage = ERR_PTR(-ENOMEM); 1871 *hpage = ERR_PTR(-ENOMEM);
1880 return; 1872 return NULL;
1881 } 1873 }
1882#endif
1883 1874
1884 count_vm_event(THP_COLLAPSE_ALLOC); 1875 count_vm_event(THP_COLLAPSE_ALLOC);
1885 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1876 return *hpage;
1886#ifdef CONFIG_NUMA 1877}
1887 put_page(new_page); 1878#else
1879static struct page *khugepaged_alloc_hugepage(bool *wait)
1880{
1881 struct page *hpage;
1882
1883 do {
1884 hpage = alloc_hugepage(khugepaged_defrag());
1885 if (!hpage) {
1886 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1887 if (!*wait)
1888 return NULL;
1889
1890 *wait = false;
1891 khugepaged_alloc_sleep();
1892 } else
1893 count_vm_event(THP_COLLAPSE_ALLOC);
1894 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
1895
1896 return hpage;
1897}
1898
1899static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1900{
1901 if (!*hpage)
1902 *hpage = khugepaged_alloc_hugepage(wait);
1903
1904 if (unlikely(!*hpage))
1905 return false;
1906
1907 return true;
1908}
1909
1910static struct page
1911*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1912 struct vm_area_struct *vma, unsigned long address,
1913 int node)
1914{
1915 up_read(&mm->mmap_sem);
1916 VM_BUG_ON(!*hpage);
1917 return *hpage;
1918}
1888#endif 1919#endif
1920
1921static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address,
1923 struct page **hpage,
1924 struct vm_area_struct *vma,
1925 int node)
1926{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd;
1930 pte_t *pte;
1931 pgtable_t pgtable;
1932 struct page *new_page;
1933 spinlock_t *ptl;
1934 int isolated;
1935 unsigned long hstart, hend;
1936 unsigned long mmun_start; /* For mmu_notifiers */
1937 unsigned long mmun_end; /* For mmu_notifiers */
1938
1939 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1940
1941 /* release the mmap_sem read lock. */
1942 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
1943 if (!new_page)
1944 return;
1945
1946 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
1889 return; 1947 return;
1890 }
1891 1948
1892 /* 1949 /*
1893 * Prevent all access to pagetables with the exception of 1950 * Prevent all access to pagetables with the exception of
@@ -1912,11 +1969,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1912 goto out; 1969 goto out;
1913 if (is_vma_temporary_stack(vma)) 1970 if (is_vma_temporary_stack(vma))
1914 goto out; 1971 goto out;
1915 /* 1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1916 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1917 * true too, verify it here.
1918 */
1919 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1920 1973
1921 pgd = pgd_offset(mm, address); 1974 pgd = pgd_offset(mm, address);
1922 if (!pgd_present(*pgd)) 1975 if (!pgd_present(*pgd))
@@ -1936,6 +1989,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1936 pte = pte_offset_map(pmd, address); 1989 pte = pte_offset_map(pmd, address);
1937 ptl = pte_lockptr(mm, pmd); 1990 ptl = pte_lockptr(mm, pmd);
1938 1991
1992 mmun_start = address;
1993 mmun_end = address + HPAGE_PMD_SIZE;
1994 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1939 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1995 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1940 /* 1996 /*
1941 * After this gup_fast can't run anymore. This also removes 1997 * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1999,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1943 * huge and small TLB entries for the same virtual address 1999 * huge and small TLB entries for the same virtual address
1944 * to avoid the risk of CPU bugs in that area. 2000 * to avoid the risk of CPU bugs in that area.
1945 */ 2001 */
1946 _pmd = pmdp_clear_flush_notify(vma, address, pmd); 2002 _pmd = pmdp_clear_flush(vma, address, pmd);
1947 spin_unlock(&mm->page_table_lock); 2003 spin_unlock(&mm->page_table_lock);
2004 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1948 2005
1949 spin_lock(ptl); 2006 spin_lock(ptl);
1950 isolated = __collapse_huge_page_isolate(vma, address, pte); 2007 isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2027,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1970 pte_unmap(pte); 2027 pte_unmap(pte);
1971 __SetPageUptodate(new_page); 2028 __SetPageUptodate(new_page);
1972 pgtable = pmd_pgtable(_pmd); 2029 pgtable = pmd_pgtable(_pmd);
1973 VM_BUG_ON(page_count(pgtable) != 1);
1974 VM_BUG_ON(page_mapcount(pgtable) != 0);
1975 2030
1976 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2031 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1977 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2043,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 BUG_ON(!pmd_none(*pmd)); 2043 BUG_ON(!pmd_none(*pmd));
1989 page_add_new_anon_rmap(new_page, vma, address); 2044 page_add_new_anon_rmap(new_page, vma, address);
1990 set_pmd_at(mm, address, pmd, _pmd); 2045 set_pmd_at(mm, address, pmd, _pmd);
1991 update_mmu_cache(vma, address, _pmd); 2046 update_mmu_cache_pmd(vma, address, pmd);
1992 prepare_pmd_huge_pte(pgtable, mm); 2047 pgtable_trans_huge_deposit(mm, pgtable);
1993 spin_unlock(&mm->page_table_lock); 2048 spin_unlock(&mm->page_table_lock);
1994 2049
1995#ifndef CONFIG_NUMA
1996 *hpage = NULL; 2050 *hpage = NULL;
1997#endif 2051
1998 khugepaged_pages_collapsed++; 2052 khugepaged_pages_collapsed++;
1999out_up_write: 2053out_up_write:
2000 up_write(&mm->mmap_sem); 2054 up_write(&mm->mmap_sem);
@@ -2002,9 +2056,6 @@ out_up_write:
2002 2056
2003out: 2057out:
2004 mem_cgroup_uncharge_page(new_page); 2058 mem_cgroup_uncharge_page(new_page);
2005#ifdef CONFIG_NUMA
2006 put_page(new_page);
2007#endif
2008 goto out_up_write; 2059 goto out_up_write;
2009} 2060}
2010 2061
@@ -2154,12 +2205,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2154 goto skip; 2205 goto skip;
2155 if (is_vma_temporary_stack(vma)) 2206 if (is_vma_temporary_stack(vma))
2156 goto skip; 2207 goto skip;
2157 /* 2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2158 * If is_pfn_mapping() is true is_learn_pfn_mapping()
2159 * must be true too, verify it here.
2160 */
2161 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2162 vma->vm_flags & VM_NO_THP);
2163 2209
2164 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2165 hend = vma->vm_end & HPAGE_PMD_MASK; 2211 hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2280,23 @@ static int khugepaged_has_work(void)
2234static int khugepaged_wait_event(void) 2280static int khugepaged_wait_event(void)
2235{ 2281{
2236 return !list_empty(&khugepaged_scan.mm_head) || 2282 return !list_empty(&khugepaged_scan.mm_head) ||
2237 !khugepaged_enabled(); 2283 kthread_should_stop();
2238} 2284}
2239 2285
2240static void khugepaged_do_scan(struct page **hpage) 2286static void khugepaged_do_scan(void)
2241{ 2287{
2288 struct page *hpage = NULL;
2242 unsigned int progress = 0, pass_through_head = 0; 2289 unsigned int progress = 0, pass_through_head = 0;
2243 unsigned int pages = khugepaged_pages_to_scan; 2290 unsigned int pages = khugepaged_pages_to_scan;
2291 bool wait = true;
2244 2292
2245 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2293 barrier(); /* write khugepaged_pages_to_scan to local stack */
2246 2294
2247 while (progress < pages) { 2295 while (progress < pages) {
2248 cond_resched(); 2296 if (!khugepaged_prealloc_page(&hpage, &wait))
2249
2250#ifndef CONFIG_NUMA
2251 if (!*hpage) {
2252 *hpage = alloc_hugepage(khugepaged_defrag());
2253 if (unlikely(!*hpage)) {
2254 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2255 break;
2256 }
2257 count_vm_event(THP_COLLAPSE_ALLOC);
2258 }
2259#else
2260 if (IS_ERR(*hpage))
2261 break; 2297 break;
2262#endif 2298
2299 cond_resched();
2263 2300
2264 if (unlikely(kthread_should_stop() || freezing(current))) 2301 if (unlikely(kthread_should_stop() || freezing(current)))
2265 break; 2302 break;
@@ -2270,73 +2307,32 @@ static void khugepaged_do_scan(struct page **hpage)
2270 if (khugepaged_has_work() && 2307 if (khugepaged_has_work() &&
2271 pass_through_head < 2) 2308 pass_through_head < 2)
2272 progress += khugepaged_scan_mm_slot(pages - progress, 2309 progress += khugepaged_scan_mm_slot(pages - progress,
2273 hpage); 2310 &hpage);
2274 else 2311 else
2275 progress = pages; 2312 progress = pages;
2276 spin_unlock(&khugepaged_mm_lock); 2313 spin_unlock(&khugepaged_mm_lock);
2277 } 2314 }
2278}
2279 2315
2280static void khugepaged_alloc_sleep(void) 2316 if (!IS_ERR_OR_NULL(hpage))
2281{ 2317 put_page(hpage);
2282 wait_event_freezable_timeout(khugepaged_wait, false,
2283 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2284} 2318}
2285 2319
2286#ifndef CONFIG_NUMA 2320static void khugepaged_wait_work(void)
2287static struct page *khugepaged_alloc_hugepage(void)
2288{ 2321{
2289 struct page *hpage; 2322 try_to_freeze();
2290
2291 do {
2292 hpage = alloc_hugepage(khugepaged_defrag());
2293 if (!hpage) {
2294 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2295 khugepaged_alloc_sleep();
2296 } else
2297 count_vm_event(THP_COLLAPSE_ALLOC);
2298 } while (unlikely(!hpage) &&
2299 likely(khugepaged_enabled()));
2300 return hpage;
2301}
2302#endif
2303 2323
2304static void khugepaged_loop(void) 2324 if (khugepaged_has_work()) {
2305{ 2325 if (!khugepaged_scan_sleep_millisecs)
2306 struct page *hpage; 2326 return;
2307 2327
2308#ifdef CONFIG_NUMA 2328 wait_event_freezable_timeout(khugepaged_wait,
2309 hpage = NULL; 2329 kthread_should_stop(),
2310#endif 2330 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2311 while (likely(khugepaged_enabled())) { 2331 return;
2312#ifndef CONFIG_NUMA
2313 hpage = khugepaged_alloc_hugepage();
2314 if (unlikely(!hpage))
2315 break;
2316#else
2317 if (IS_ERR(hpage)) {
2318 khugepaged_alloc_sleep();
2319 hpage = NULL;
2320 }
2321#endif
2322
2323 khugepaged_do_scan(&hpage);
2324#ifndef CONFIG_NUMA
2325 if (hpage)
2326 put_page(hpage);
2327#endif
2328 try_to_freeze();
2329 if (unlikely(kthread_should_stop()))
2330 break;
2331 if (khugepaged_has_work()) {
2332 if (!khugepaged_scan_sleep_millisecs)
2333 continue;
2334 wait_event_freezable_timeout(khugepaged_wait, false,
2335 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2336 } else if (khugepaged_enabled())
2337 wait_event_freezable(khugepaged_wait,
2338 khugepaged_wait_event());
2339 } 2332 }
2333
2334 if (khugepaged_enabled())
2335 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2340} 2336}
2341 2337
2342static int khugepaged(void *none) 2338static int khugepaged(void *none)
@@ -2346,20 +2342,9 @@ static int khugepaged(void *none)
2346 set_freezable(); 2342 set_freezable();
2347 set_user_nice(current, 19); 2343 set_user_nice(current, 19);
2348 2344
2349 /* serialize with start_khugepaged() */ 2345 while (!kthread_should_stop()) {
2350 mutex_lock(&khugepaged_mutex); 2346 khugepaged_do_scan();
2351 2347 khugepaged_wait_work();
2352 for (;;) {
2353 mutex_unlock(&khugepaged_mutex);
2354 VM_BUG_ON(khugepaged_thread != current);
2355 khugepaged_loop();
2356 VM_BUG_ON(khugepaged_thread != current);
2357
2358 mutex_lock(&khugepaged_mutex);
2359 if (!khugepaged_enabled())
2360 break;
2361 if (unlikely(kthread_should_stop()))
2362 break;
2363 } 2348 }
2364 2349
2365 spin_lock(&khugepaged_mm_lock); 2350 spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2353,6 @@ static int khugepaged(void *none)
2368 if (mm_slot) 2353 if (mm_slot)
2369 collect_mm_slot(mm_slot); 2354 collect_mm_slot(mm_slot);
2370 spin_unlock(&khugepaged_mm_lock); 2355 spin_unlock(&khugepaged_mm_lock);
2371
2372 khugepaged_thread = NULL;
2373 mutex_unlock(&khugepaged_mutex);
2374
2375 return 0; 2356 return 0;
2376} 2357}
2377 2358
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..59a0059b39e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
30#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h> 31#include <linux/hugetlb_cgroup.h>
32#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
34#include "internal.h" 33#include "internal.h"
35 34
36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
637 h->surplus_huge_pages--; 636 h->surplus_huge_pages--;
638 h->surplus_huge_pages_node[nid]--; 637 h->surplus_huge_pages_node[nid]--;
639 } else { 638 } else {
639 arch_clear_hugepage_flags(page);
640 enqueue_huge_page(h, page); 640 enqueue_huge_page(h, page);
641 } 641 }
642 spin_unlock(&hugetlb_lock); 642 spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
671 } 671 }
672} 672}
673 673
674/*
675 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
676 * transparent huge pages. See the PageTransHuge() documentation for more
677 * details.
678 */
674int PageHuge(struct page *page) 679int PageHuge(struct page *page)
675{ 680{
676 compound_page_dtor *dtor; 681 compound_page_dtor *dtor;
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2355 struct page *page; 2360 struct page *page;
2356 struct hstate *h = hstate_vma(vma); 2361 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2362 unsigned long sz = huge_page_size(h);
2363 const unsigned long mmun_start = start; /* For mmu_notifiers */
2364 const unsigned long mmun_end = end; /* For mmu_notifiers */
2358 2365
2359 WARN_ON(!is_vm_hugetlb_page(vma)); 2366 WARN_ON(!is_vm_hugetlb_page(vma));
2360 BUG_ON(start & ~huge_page_mask(h)); 2367 BUG_ON(start & ~huge_page_mask(h));
2361 BUG_ON(end & ~huge_page_mask(h)); 2368 BUG_ON(end & ~huge_page_mask(h));
2362 2369
2363 tlb_start_vma(tlb, vma); 2370 tlb_start_vma(tlb, vma);
2364 mmu_notifier_invalidate_range_start(mm, start, end); 2371 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2365again: 2372again:
2366 spin_lock(&mm->page_table_lock); 2373 spin_lock(&mm->page_table_lock);
2367 for (address = start; address < end; address += sz) { 2374 for (address = start; address < end; address += sz) {
@@ -2425,7 +2432,7 @@ again:
2425 if (address < end && !ref_page) 2432 if (address < end && !ref_page)
2426 goto again; 2433 goto again;
2427 } 2434 }
2428 mmu_notifier_invalidate_range_end(mm, start, end); 2435 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2429 tlb_end_vma(tlb, vma); 2436 tlb_end_vma(tlb, vma);
2430} 2437}
2431 2438
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2473 struct hstate *h = hstate_vma(vma); 2480 struct hstate *h = hstate_vma(vma);
2474 struct vm_area_struct *iter_vma; 2481 struct vm_area_struct *iter_vma;
2475 struct address_space *mapping; 2482 struct address_space *mapping;
2476 struct prio_tree_iter iter;
2477 pgoff_t pgoff; 2483 pgoff_t pgoff;
2478 2484
2479 /* 2485 /*
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2481 * from page cache lookup which is in HPAGE_SIZE units. 2487 * from page cache lookup which is in HPAGE_SIZE units.
2482 */ 2488 */
2483 address = address & huge_page_mask(h); 2489 address = address & huge_page_mask(h);
2484 pgoff = vma_hugecache_offset(h, vma, address); 2490 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2491 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2492 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2486 2493
2487 /* 2494 /*
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2490 * __unmap_hugepage_range() is called as the lock is already held 2497 * __unmap_hugepage_range() is called as the lock is already held
2491 */ 2498 */
2492 mutex_lock(&mapping->i_mmap_mutex); 2499 mutex_lock(&mapping->i_mmap_mutex);
2493 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2500 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2494 /* Do not unmap the current VMA */ 2501 /* Do not unmap the current VMA */
2495 if (iter_vma == vma) 2502 if (iter_vma == vma)
2496 continue; 2503 continue;
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2525 struct page *old_page, *new_page; 2532 struct page *old_page, *new_page;
2526 int avoidcopy; 2533 int avoidcopy;
2527 int outside_reserve = 0; 2534 int outside_reserve = 0;
2535 unsigned long mmun_start; /* For mmu_notifiers */
2536 unsigned long mmun_end; /* For mmu_notifiers */
2528 2537
2529 old_page = pte_page(pte); 2538 old_page = pte_page(pte);
2530 2539
@@ -2611,6 +2620,9 @@ retry_avoidcopy:
2611 pages_per_huge_page(h)); 2620 pages_per_huge_page(h));
2612 __SetPageUptodate(new_page); 2621 __SetPageUptodate(new_page);
2613 2622
2623 mmun_start = address & huge_page_mask(h);
2624 mmun_end = mmun_start + huge_page_size(h);
2625 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2614 /* 2626 /*
2615 * Retake the page_table_lock to check for racing updates 2627 * Retake the page_table_lock to check for racing updates
2616 * before the page tables are altered 2628 * before the page tables are altered
@@ -2619,9 +2631,6 @@ retry_avoidcopy:
2619 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2631 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2620 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2632 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2621 /* Break COW */ 2633 /* Break COW */
2622 mmu_notifier_invalidate_range_start(mm,
2623 address & huge_page_mask(h),
2624 (address & huge_page_mask(h)) + huge_page_size(h));
2625 huge_ptep_clear_flush(vma, address, ptep); 2634 huge_ptep_clear_flush(vma, address, ptep);
2626 set_huge_pte_at(mm, address, ptep, 2635 set_huge_pte_at(mm, address, ptep,
2627 make_huge_pte(vma, new_page, 1)); 2636 make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2638,11 @@ retry_avoidcopy:
2629 hugepage_add_new_anon_rmap(new_page, vma, address); 2638 hugepage_add_new_anon_rmap(new_page, vma, address);
2630 /* Make the old page be freed below */ 2639 /* Make the old page be freed below */
2631 new_page = old_page; 2640 new_page = old_page;
2632 mmu_notifier_invalidate_range_end(mm,
2633 address & huge_page_mask(h),
2634 (address & huge_page_mask(h)) + huge_page_size(h));
2635 } 2641 }
2642 spin_unlock(&mm->page_table_lock);
2643 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2644 /* Caller expects lock to be held */
2645 spin_lock(&mm->page_table_lock);
2636 page_cache_release(new_page); 2646 page_cache_release(new_page);
2637 page_cache_release(old_page); 2647 page_cache_release(old_page);
2638 return 0; 2648 return 0;
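
The hugetlb_cow() hunk above hoists the mmu_notifier calls outside the page_table_lock and computes the invalidation window once, by rounding the faulting address down to its huge page. A quick model of that window computation, assuming a 2 MB huge page size for the demo:

#include <stdio.h>

#define HUGE_PAGE_SIZE (2UL << 20)
#define HUGE_PAGE_MASK (~(HUGE_PAGE_SIZE - 1))

int main(void)
{
        unsigned long address = 0x7f1234567890UL;
        unsigned long mmun_start = address & HUGE_PAGE_MASK;   /* round down */
        unsigned long mmun_end = mmun_start + HUGE_PAGE_SIZE;  /* one huge page */

        printf("fault at   %#lx\n", address);
        printf("invalidate [%#lx, %#lx)\n", mmun_start, mmun_end);
        return 0;
}
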
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..a4fa284f6bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,26 +118,27 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */ 121 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */ 122 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are 123 bool ignore_skip_hint; /* Scan blocks even if marked skip */
125 incremental, once free_pfn 124 bool finished_update_free; /* True when the zone cached pfns are
126 and migrate_pfn meet, we restart 125 * no longer being updated
127 from the top of the zone; 126 */
128 remember we wrapped around. */ 127 bool finished_update_migrate;
129 128
130 int order; /* order a direct compactor needs */ 129 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 130 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone; 131 struct zone *zone;
133 bool *contended; /* True if a lock was contended */ 132 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 134};
135 135
136unsigned long 136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 137isolate_freepages_range(struct compact_control *cc,
138 unsigned long start_pfn, unsigned long end_pfn);
138unsigned long 139unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 140isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn); 141 unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
141 142
142#endif 143#endif
143 144
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
167} 168}
168 169
169/* 170/*
170 * Called only in fault path via page_evictable() for a new page 171 * Called only in fault path, to determine if a new page is being
171 * to determine if it's being mapped into a LOCKED vma. 172 * mapped into a LOCKED vma. If it is, mark page as mlocked.
172 * If so, mark page as mlocked.
173 */ 173 */
174static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page) 175 struct page *page)
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
180 return 0; 180 return 0;
181 181
182 if (!TestSetPageMlocked(page)) { 182 if (!TestSetPageMlocked(page)) {
183 inc_zone_page_state(page, NR_MLOCK); 183 mod_zone_page_state(page_zone(page), NR_MLOCK,
184 hpage_nr_pages(page));
184 count_vm_event(UNEVICTABLE_PGMLOCKED); 185 count_vm_event(UNEVICTABLE_PGMLOCKED);
185 } 186 }
186 return 1; 187 return 1;
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page);
201 * If called for a page that is still mapped by mlocked vmas, all we do 202 * If called for a page that is still mapped by mlocked vmas, all we do
202 * is revert to lazy LRU behaviour -- semantics are not broken. 203 * is revert to lazy LRU behaviour -- semantics are not broken.
203 */ 204 */
204extern void __clear_page_mlock(struct page *page); 205extern void clear_page_mlock(struct page *page);
205static inline void clear_page_mlock(struct page *page)
206{
207 if (unlikely(TestClearPageMlocked(page)))
208 __clear_page_mlock(page);
209}
210 206
211/* 207/*
212 * mlock_migrate_page - called only from migrate_page_copy() to 208 * mlock_migrate_page - called only from migrate_page_copy() to
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
340#define ZONE_RECLAIM_FULL -1 336#define ZONE_RECLAIM_FULL -1
341#define ZONE_RECLAIM_SOME 0 337#define ZONE_RECLAIM_SOME 0
342#define ZONE_RECLAIM_SUCCESS 1 338#define ZONE_RECLAIM_SUCCESS 1
343#endif
344 339
345extern int hwpoison_filter(struct page *p); 340extern int hwpoison_filter(struct page *p);
346 341
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
356 unsigned long, unsigned long); 351 unsigned long, unsigned long);
357 352
358extern void set_pageblock_order(void); 353extern void set_pageblock_order(void);
354unsigned long reclaim_clean_pages_from_list(struct zone *zone,
355 struct list_head *page_list);
356/* The ALLOC_WMARK bits are used as an index to zone->watermark */
357#define ALLOC_WMARK_MIN WMARK_MIN
358#define ALLOC_WMARK_LOW WMARK_LOW
359#define ALLOC_WMARK_HIGH WMARK_HIGH
360#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
361
362/* Mask to get the watermark bits */
363#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
364
365#define ALLOC_HARDER 0x10 /* try to alloc harder */
366#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
367#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
368#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
369
370#endif /* __MM_INTERNAL_H */
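
The internal.h hunk above moves the ALLOC_* flags where compaction can also see them; the low bits double as an index into the zone watermarks, and ALLOC_WMARK_MASK strips off the behaviour-modifier bits. A small sketch of how the bits compose, assuming WMARK_MIN/LOW/HIGH are 0/1/2 as in the kernel's watermark enum:

#include <stdio.h>

#define WMARK_MIN           0
#define WMARK_LOW           1
#define WMARK_HIGH          2

#define ALLOC_WMARK_MIN     WMARK_MIN
#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_WMARK_HIGH    WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)   /* 0x03 */
#define ALLOC_HARDER        0x10
#define ALLOC_HIGH          0x20
#define ALLOC_CPUSET        0x40
#define ALLOC_CMA           0x80

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

        /* the watermark index lives in the low two bits ... */
        printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
        /* ... and the behaviour modifiers in the higher bits */
        printf("skip watermarks: %d\n", !!(alloc_flags & ALLOC_NO_WATERMARKS));
        printf("cpuset check   : %d\n", !!(alloc_flags & ALLOC_CPUSET));
        return 0;
}
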
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
1/*
2 * mm/interval_tree.c - interval tree for mapping->i_mmap
3 *
4 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
5 *
6 * This file is released under the GPL v2.
7 */
8
9#include <linux/mm.h>
10#include <linux/fs.h>
11#include <linux/rmap.h>
12#include <linux/interval_tree_generic.h>
13
14static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
15{
16 return v->vm_pgoff;
17}
18
19static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
20{
21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
22}
23
24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
25 unsigned long, shared.linear.rb_subtree_last,
26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
27
28/* Insert node immediately after prev in the interval tree */
29void vma_interval_tree_insert_after(struct vm_area_struct *node,
30 struct vm_area_struct *prev,
31 struct rb_root *root)
32{
33 struct rb_node **link;
34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node);
36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
38
39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev;
41 link = &prev->shared.linear.rb.rb_right;
42 } else {
43 parent = rb_entry(prev->shared.linear.rb.rb_right,
44 struct vm_area_struct, shared.linear.rb);
45 if (parent->shared.linear.rb_subtree_last < last)
46 parent->shared.linear.rb_subtree_last = last;
47 while (parent->shared.linear.rb.rb_left) {
48 parent = rb_entry(parent->shared.linear.rb.rb_left,
49 struct vm_area_struct, shared.linear.rb);
50 if (parent->shared.linear.rb_subtree_last < last)
51 parent->shared.linear.rb_subtree_last = last;
52 }
53 link = &parent->shared.linear.rb.rb_left;
54 }
55
56 node->shared.linear.rb_subtree_last = last;
57 rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
58 rb_insert_augmented(&node->shared.linear.rb, root,
59 &vma_interval_tree_augment);
60}
61
62static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
63{
64 return vma_start_pgoff(avc->vma);
65}
66
67static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
68{
69 return vma_last_pgoff(avc->vma);
70}
71
72INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
73 avc_start_pgoff, avc_last_pgoff,
74 static inline, __anon_vma_interval_tree)
75
76void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
77 struct rb_root *root)
78{
79#ifdef CONFIG_DEBUG_VM_RB
80 node->cached_vma_start = avc_start_pgoff(node);
81 node->cached_vma_last = avc_last_pgoff(node);
82#endif
83 __anon_vma_interval_tree_insert(node, root);
84}
85
86void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
87 struct rb_root *root)
88{
89 __anon_vma_interval_tree_remove(node, root);
90}
91
92struct anon_vma_chain *
93anon_vma_interval_tree_iter_first(struct rb_root *root,
94 unsigned long first, unsigned long last)
95{
96 return __anon_vma_interval_tree_iter_first(root, first, last);
97}
98
99struct anon_vma_chain *
100anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
101 unsigned long first, unsigned long last)
102{
103 return __anon_vma_interval_tree_iter_next(node, first, last);
104}
105
106#ifdef CONFIG_DEBUG_VM_RB
107void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
108{
109 WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
110 WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
111}
112#endif
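
The new interval tree keys each mapping by the closed page-offset range [vm_pgoff, vm_pgoff + npages - 1], as vma_start_pgoff()/vma_last_pgoff() above compute. A stand-alone model of those keys and of the overlap question the tree answers, with 4 KB pages assumed for the demo and no rbtree involved:

#include <stdio.h>

#define PAGE_SHIFT 12

struct demo_vma {
        unsigned long vm_start, vm_end;         /* byte addresses */
        unsigned long vm_pgoff;                 /* file offset in pages */
};

static unsigned long start_pgoff(const struct demo_vma *v)
{
        return v->vm_pgoff;
}

static unsigned long last_pgoff(const struct demo_vma *v)
{
        return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* the query vma_interval_tree_foreach() answers, minus the tree itself */
static int overlaps(const struct demo_vma *v,
                    unsigned long first, unsigned long last)
{
        return start_pgoff(v) <= last && last_pgoff(v) >= first;
}

int main(void)
{
        /* 16 pages of the file, mapped starting at file page 100 */
        struct demo_vma v = { 0x400000, 0x410000, 100 };

        printf("keys: [%lu, %lu]\n", start_pgoff(&v), last_pgoff(&v));
        printf("overlaps page 115? %d\n", overlaps(&v, 115, 115));
        printf("overlaps page 116? %d\n", overlaps(&v, 116, 116));
        return 0;
}
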
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..a217cc544060 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
29 * - kmemleak_lock (rwlock): protects the object_list modifications and 29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list 30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory 31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up 32 * blocks. The object_tree_root is a red black tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The 33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and 34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the 35 * object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
71#include <linux/delay.h> 71#include <linux/delay.h>
72#include <linux/export.h> 72#include <linux/export.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/rbtree.h>
75#include <linux/fs.h> 75#include <linux/fs.h>
76#include <linux/debugfs.h> 76#include <linux/debugfs.h>
77#include <linux/seq_file.h> 77#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
132 * Structure holding the metadata for each allocated memory block. 132 * Structure holding the metadata for each allocated memory block.
133 * Modifications to such objects should be made while holding the 133 * Modifications to such objects should be made while holding the
134 * object->lock. Insertions or deletions from object_list, gray_list or 134 * object->lock. Insertions or deletions from object_list, gray_list or
135 * tree_node are already protected by the corresponding locks or mutex (see 135 * rb_node are already protected by the corresponding locks or mutex (see
136 * the notes on locking above). These objects are reference-counted 136 * the notes on locking above). These objects are reference-counted
137 * (use_count) and freed using the RCU mechanism. 137 * (use_count) and freed using the RCU mechanism.
138 */ 138 */
@@ -141,7 +141,7 @@ struct kmemleak_object {
141 unsigned long flags; /* object status flags */ 141 unsigned long flags; /* object status flags */
142 struct list_head object_list; 142 struct list_head object_list;
143 struct list_head gray_list; 143 struct list_head gray_list;
144 struct prio_tree_node tree_node; 144 struct rb_node rb_node;
145 struct rcu_head rcu; /* object_list lockless traversal */ 145 struct rcu_head rcu; /* object_list lockless traversal */
146 /* object usage count; object freed when use_count == 0 */ 146 /* object usage count; object freed when use_count == 0 */
147 atomic_t use_count; 147 atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
182static LIST_HEAD(object_list); 182static LIST_HEAD(object_list);
183/* the list of gray-colored objects (see color_gray comment below) */ 183/* the list of gray-colored objects (see color_gray comment below) */
184static LIST_HEAD(gray_list); 184static LIST_HEAD(gray_list);
185/* prio search tree for object boundaries */ 185/* search tree for object boundaries */
186static struct prio_tree_root object_tree_root; 186static struct rb_root object_tree_root = RB_ROOT;
187/* rw_lock protecting the access to object_list and prio_tree_root */ 187/* rw_lock protecting the access to object_list and object_tree_root */
188static DEFINE_RWLOCK(kmemleak_lock); 188static DEFINE_RWLOCK(kmemleak_lock);
189 189
190/* allocation caches for kmemleak internal data */ 190/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
380 trace.entries = object->trace; 380 trace.entries = object->trace;
381 381
382 pr_notice("Object 0x%08lx (size %zu):\n", 382 pr_notice("Object 0x%08lx (size %zu):\n",
383 object->tree_node.start, object->size); 383 object->pointer, object->size);
384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", 384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
385 object->comm, object->pid, object->jiffies); 385 object->comm, object->pid, object->jiffies);
386 pr_notice(" min_count = %d\n", object->min_count); 386 pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
392} 392}
393 393
394/* 394/*
395 * Look-up a memory block metadata (kmemleak_object) in the priority search 395 * Look-up a memory block metadata (kmemleak_object) in the object search
396 * tree based on a pointer value. If alias is 0, only values pointing to the 396 * tree based on a pointer value. If alias is 0, only values pointing to the
397 * beginning of the memory block are allowed. The kmemleak_lock must be held 397 * beginning of the memory block are allowed. The kmemleak_lock must be held
398 * when calling this function. 398 * when calling this function.
399 */ 399 */
400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) 400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
401{ 401{
402 struct prio_tree_node *node; 402 struct rb_node *rb = object_tree_root.rb_node;
403 struct prio_tree_iter iter; 403
404 struct kmemleak_object *object; 404 while (rb) {
405 405 struct kmemleak_object *object =
406 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); 406 rb_entry(rb, struct kmemleak_object, rb_node);
407 node = prio_tree_next(&iter); 407 if (ptr < object->pointer)
408 if (node) { 408 rb = object->rb_node.rb_left;
409 object = prio_tree_entry(node, struct kmemleak_object, 409 else if (object->pointer + object->size <= ptr)
410 tree_node); 410 rb = object->rb_node.rb_right;
411 if (!alias && object->pointer != ptr) { 411 else if (object->pointer == ptr || alias)
412 return object;
413 else {
412 kmemleak_warn("Found object by alias at 0x%08lx\n", 414 kmemleak_warn("Found object by alias at 0x%08lx\n",
413 ptr); 415 ptr);
414 dump_object_info(object); 416 dump_object_info(object);
415 object = NULL; 417 break;
416 } 418 }
417 } else 419 }
418 object = NULL; 420 return NULL;
419
420 return object;
421} 421}
422 422
423/* 423/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
471} 471}
472 472
473/* 473/*
474 * Look up an object in the prio search tree and increase its use_count. 474 * Look up an object in the object search tree and increase its use_count.
475 */ 475 */
476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) 476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
477{ 477{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
516 int min_count, gfp_t gfp) 516 int min_count, gfp_t gfp)
517{ 517{
518 unsigned long flags; 518 unsigned long flags;
519 struct kmemleak_object *object; 519 struct kmemleak_object *object, *parent;
520 struct prio_tree_node *node; 520 struct rb_node **link, *rb_parent;
521 521
522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
523 if (!object) { 523 if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
560 /* kernel backtrace */ 560 /* kernel backtrace */
561 object->trace_len = __save_stack_trace(object->trace); 561 object->trace_len = __save_stack_trace(object->trace);
562 562
563 INIT_PRIO_TREE_NODE(&object->tree_node);
564 object->tree_node.start = ptr;
565 object->tree_node.last = ptr + size - 1;
566
567 write_lock_irqsave(&kmemleak_lock, flags); 563 write_lock_irqsave(&kmemleak_lock, flags);
568 564
569 min_addr = min(min_addr, ptr); 565 min_addr = min(min_addr, ptr);
570 max_addr = max(max_addr, ptr + size); 566 max_addr = max(max_addr, ptr + size);
571 node = prio_tree_insert(&object_tree_root, &object->tree_node); 567 link = &object_tree_root.rb_node;
572 /* 568 rb_parent = NULL;
573 * The code calling the kernel does not yet have the pointer to the 569 while (*link) {
574 * memory block to be able to free it. However, we still hold the 570 rb_parent = *link;
575 * kmemleak_lock here in case parts of the kernel started freeing 571 parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
576 * random memory blocks. 572 if (ptr + size <= parent->pointer)
577 */ 573 link = &parent->rb_node.rb_left;
578 if (node != &object->tree_node) { 574 else if (parent->pointer + parent->size <= ptr)
579 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 575 link = &parent->rb_node.rb_right;
580 "(already existing)\n", ptr); 576 else {
581 object = lookup_object(ptr, 1); 577 kmemleak_stop("Cannot insert 0x%lx into the object "
582 spin_lock(&object->lock); 578 "search tree (overlaps existing)\n",
583 dump_object_info(object); 579 ptr);
584 spin_unlock(&object->lock); 580 kmem_cache_free(object_cache, object);
585 581 object = parent;
586 goto out; 582 spin_lock(&object->lock);
583 dump_object_info(object);
584 spin_unlock(&object->lock);
585 goto out;
586 }
587 } 587 }
588 rb_link_node(&object->rb_node, rb_parent, link);
589 rb_insert_color(&object->rb_node, &object_tree_root);
590
588 list_add_tail_rcu(&object->object_list, &object_list); 591 list_add_tail_rcu(&object->object_list, &object_list);
589out: 592out:
590 write_unlock_irqrestore(&kmemleak_lock, flags); 593 write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
600 unsigned long flags; 603 unsigned long flags;
601 604
602 write_lock_irqsave(&kmemleak_lock, flags); 605 write_lock_irqsave(&kmemleak_lock, flags);
603 prio_tree_remove(&object_tree_root, &object->tree_node); 606 rb_erase(&object->rb_node, &object_tree_root);
604 list_del_rcu(&object->object_list); 607 list_del_rcu(&object->object_list);
605 write_unlock_irqrestore(&kmemleak_lock, flags); 608 write_unlock_irqrestore(&kmemleak_lock, flags);
606 609
@@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1483{ 1486{
1484 struct kmemleak_object *prev_obj = v; 1487 struct kmemleak_object *prev_obj = v;
1485 struct kmemleak_object *next_obj = NULL; 1488 struct kmemleak_object *next_obj = NULL;
1486 struct list_head *n = &prev_obj->object_list; 1489 struct kmemleak_object *obj = prev_obj;
1487 1490
1488 ++(*pos); 1491 ++(*pos);
1489 1492
1490 list_for_each_continue_rcu(n, &object_list) { 1493 list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
1491 struct kmemleak_object *obj =
1492 list_entry(n, struct kmemleak_object, object_list);
1493 if (get_object(obj)) { 1494 if (get_object(obj)) {
1494 next_obj = obj; 1495 next_obj = obj;
1495 break; 1496 break;
@@ -1768,7 +1769,6 @@ void __init kmemleak_init(void)
1768 1769
1769 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1770 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1770 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1771 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1771 INIT_PRIO_TREE_ROOT(&object_tree_root);
1772 1772
1773 if (crt_early_log >= ARRAY_SIZE(early_log)) 1773 if (crt_early_log >= ARRAY_SIZE(early_log))
1774 pr_warning("Early log buffer exceeded (%d), please increase " 1774 pr_warning("Early log buffer exceeded (%d), please increase "
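
With the prio tree gone, kmemleak's lookup_object() is a plain binary-search-tree descent over non-overlapping [pointer, pointer + size) ranges: go left when the address is below a node's start, right when it is at or past its end, otherwise that node covers it. A minimal userspace sketch of the same descent, using an invented range_node type in place of struct rb_node / struct kmemleak_object:

#include <stddef.h>
#include <stdio.h>

/* Invented stand-in for a kmemleak_object: a node covering
 * [pointer, pointer + size) in a plain binary search tree. */
struct range_node {
        unsigned long pointer;
        size_t size;
        struct range_node *left, *right;
};

/* Same descent as the new lookup_object(): ranges never overlap, so at
 * each node ptr is either below the range (go left), at or past its end
 * (go right), or inside it (found). */
static struct range_node *lookup(struct range_node *root, unsigned long ptr)
{
        while (root) {
                if (ptr < root->pointer)
                        root = root->left;
                else if (root->pointer + root->size <= ptr)
                        root = root->right;
                else
                        return root;
        }
        return NULL;
}

int main(void)
{
        struct range_node a = { 0x1000, 0x100, NULL, NULL };
        struct range_node c = { 0x3000, 0x200, NULL, NULL };
        struct range_node b = { 0x2000, 0x080, &a, &c };        /* root */
        struct range_node *hit = lookup(&b, 0x3080);

        if (hit)
                printf("0x3080 falls inside [0x%lx, 0x%lx)\n",
                       hit->pointer, hit->pointer + hit->size);
        return 0;
}
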
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..ae539f0b8aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
709 spinlock_t *ptl; 709 spinlock_t *ptl;
710 int swapped; 710 int swapped;
711 int err = -EFAULT; 711 int err = -EFAULT;
712 unsigned long mmun_start; /* For mmu_notifiers */
713 unsigned long mmun_end; /* For mmu_notifiers */
712 714
713 addr = page_address_in_vma(page, vma); 715 addr = page_address_in_vma(page, vma);
714 if (addr == -EFAULT) 716 if (addr == -EFAULT)
715 goto out; 717 goto out;
716 718
717 BUG_ON(PageTransCompound(page)); 719 BUG_ON(PageTransCompound(page));
720
721 mmun_start = addr;
722 mmun_end = addr + PAGE_SIZE;
723 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
724
718 ptep = page_check_address(page, mm, addr, &ptl, 0); 725 ptep = page_check_address(page, mm, addr, &ptl, 0);
719 if (!ptep) 726 if (!ptep)
720 goto out; 727 goto out_mn;
721 728
722 if (pte_write(*ptep) || pte_dirty(*ptep)) { 729 if (pte_write(*ptep) || pte_dirty(*ptep)) {
723 pte_t entry; 730 pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
752 759
753out_unlock: 760out_unlock:
754 pte_unmap_unlock(ptep, ptl); 761 pte_unmap_unlock(ptep, ptl);
762out_mn:
763 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
755out: 764out:
756 return err; 765 return err;
757} 766}
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
776 spinlock_t *ptl; 785 spinlock_t *ptl;
777 unsigned long addr; 786 unsigned long addr;
778 int err = -EFAULT; 787 int err = -EFAULT;
788 unsigned long mmun_start; /* For mmu_notifiers */
789 unsigned long mmun_end; /* For mmu_notifiers */
779 790
780 addr = page_address_in_vma(page, vma); 791 addr = page_address_in_vma(page, vma);
781 if (addr == -EFAULT) 792 if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
794 if (!pmd_present(*pmd)) 805 if (!pmd_present(*pmd))
795 goto out; 806 goto out;
796 807
808 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE;
810 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
811
797 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 812 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
798 if (!pte_same(*ptep, orig_pte)) { 813 if (!pte_same(*ptep, orig_pte)) {
799 pte_unmap_unlock(ptep, ptl); 814 pte_unmap_unlock(ptep, ptl);
800 goto out; 815 goto out_mn;
801 } 816 }
802 817
803 get_page(kpage); 818 get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
814 829
815 pte_unmap_unlock(ptep, ptl); 830 pte_unmap_unlock(ptep, ptl);
816 err = 0; 831 err = 0;
832out_mn:
833 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
817out: 834out:
818 return err; 835 return err;
819} 836}
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1469 */ 1486 */
1470 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1487 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1471 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1488 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1472 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1489 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
1473 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1474 return 0; /* just ignore the advice */ 1490 return 0; /* just ignore the advice */
1475 1491
1492#ifdef VM_SAO
1493 if (*vm_flags & VM_SAO)
1494 return 0;
1495#endif
1496
1476 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1497 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1477 err = __ksm_enter(mm); 1498 err = __ksm_enter(mm);
1478 if (err) 1499 if (err)
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
1582 SetPageSwapBacked(new_page); 1603 SetPageSwapBacked(new_page);
1583 __set_page_locked(new_page); 1604 __set_page_locked(new_page);
1584 1605
1585 if (page_evictable(new_page, vma)) 1606 if (!mlocked_vma_newpage(vma, new_page))
1586 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1607 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1587 else 1608 else
1588 add_page_to_unevictable_list(new_page); 1609 add_page_to_unevictable_list(new_page);
@@ -1614,7 +1635,8 @@ again:
1614 struct vm_area_struct *vma; 1635 struct vm_area_struct *vma;
1615 1636
1616 anon_vma_lock(anon_vma); 1637 anon_vma_lock(anon_vma);
1617 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) {
1618 vma = vmac->vma; 1640 vma = vmac->vma;
1619 if (rmap_item->address < vma->vm_start || 1641 if (rmap_item->address < vma->vm_start ||
1620 rmap_item->address >= vma->vm_end) 1642 rmap_item->address >= vma->vm_end)
@@ -1667,7 +1689,8 @@ again:
1667 struct vm_area_struct *vma; 1689 struct vm_area_struct *vma;
1668 1690
1669 anon_vma_lock(anon_vma); 1691 anon_vma_lock(anon_vma);
1670 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) {
1671 vma = vmac->vma; 1694 vma = vmac->vma;
1672 if (rmap_item->address < vma->vm_start || 1695 if (rmap_item->address < vma->vm_start ||
1673 rmap_item->address >= vma->vm_end) 1696 rmap_item->address >= vma->vm_end)
@@ -1719,7 +1742,8 @@ again:
1719 struct vm_area_struct *vma; 1742 struct vm_area_struct *vma;
1720 1743
1721 anon_vma_lock(anon_vma); 1744 anon_vma_lock(anon_vma);
1722 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) {
1723 vma = vmac->vma; 1747 vma = vmac->vma;
1724 if (rmap_item->address < vma->vm_start || 1748 if (rmap_item->address < vma->vm_start ||
1725 rmap_item->address >= vma->vm_end) 1749 rmap_item->address >= vma->vm_end)
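
The ksm hunks above bracket the PTE updates in write_protect_page() and replace_page() with an mmu_notifier invalidate_range_start/_end pair, and reroute the early-exit paths through out_mn so the end call is never skipped. A small standalone sketch of that bracketing discipline, with stub functions standing in for the real mmu_notifier calls and an invented update_pte() helper:

#include <stdio.h>

/* Stubs standing in for mmu_notifier_invalidate_range_{start,end}();
 * the real calls also take the mm_struct being invalidated. */
static void range_start(unsigned long start, unsigned long end)
{
        printf("invalidate start [%#lx, %#lx)\n", start, end);
}

static void range_end(unsigned long start, unsigned long end)
{
        printf("invalidate end   [%#lx, %#lx)\n", start, end);
}

/* The pattern the patch introduces: once range_start() has run, every
 * exit path - including the failure path - must reach range_end(). */
static int update_pte(unsigned long addr, int pte_present)
{
        int err = -1;
        unsigned long mmun_start = addr;
        unsigned long mmun_end = addr + 4096;   /* one PAGE_SIZE range */

        range_start(mmun_start, mmun_end);

        if (!pte_present)
                goto out_mn;    /* mirrors "goto out_mn" instead of "goto out" */

        /* ... the PTE would be modified under its lock here ... */
        err = 0;

out_mn:
        range_end(mmun_start, mmun_end);
        return err;
}

int main(void)
{
        update_pte(0x7f0000000000UL, 1);
        update_pte(0x7f0000001000UL, 0);        /* failure path still ends the range */
        return 0;
}
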
diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
69 new_flags &= ~VM_DONTCOPY; 69 new_flags &= ~VM_DONTCOPY;
70 break; 70 break;
71 case MADV_DONTDUMP: 71 case MADV_DONTDUMP:
72 new_flags |= VM_NODUMP; 72 new_flags |= VM_DONTDUMP;
73 break; 73 break;
74 case MADV_DODUMP: 74 case MADV_DODUMP:
75 new_flags &= ~VM_NODUMP; 75 if (new_flags & VM_SPECIAL) {
76 error = -EINVAL;
77 goto out;
78 }
79 new_flags &= ~VM_DONTDUMP;
76 break; 80 break;
77 case MADV_MERGEABLE: 81 case MADV_MERGEABLE:
78 case MADV_UNMERGEABLE: 82 case MADV_UNMERGEABLE:
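
The madvise change renames VM_NODUMP to VM_DONTDUMP and makes MADV_DODUMP fail on special mappings instead of silently clearing the bit. A toy sketch of that set/clear-with-guard logic, using invented flag values rather than the real VM_* definitions:

#include <stdio.h>

/* Invented flag values for the illustration; the real VM_* flags live in
 * include/linux/mm.h and VM_SPECIAL is a mask of several of them. */
#define X_DONTDUMP      0x1UL
#define X_SPECIAL       0x2UL

/* Mirrors the new MADV_DONTDUMP/MADV_DODUMP handling: DONTDUMP just sets
 * the bit, DODUMP clears it but is refused on "special" vmas. */
static int set_dump_policy(unsigned long *flags, int dodump)
{
        if (!dodump) {
                *flags |= X_DONTDUMP;
                return 0;
        }
        if (*flags & X_SPECIAL)
                return -1;              /* -EINVAL in the kernel */
        *flags &= ~X_DONTDUMP;
        return 0;
}

int main(void)
{
        unsigned long normal = 0, special = X_SPECIAL | X_DONTDUMP;

        set_dump_policy(&normal, 0);
        printf("normal vma flags:  %#lx\n", normal);
        printf("dodump on special: %d\n", set_dump_policy(&special, 1));
        return 0;
}
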
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..625905523c2a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0; 41static int memblock_reserved_in_slab __initdata_memblock = 0;
42 42
43/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
44static inline const char *memblock_type_name(struct memblock_type *type) 44static __init_memblock const char *
45memblock_type_name(struct memblock_type *type)
45{ 46{
46 if (type == &memblock.memory) 47 if (type == &memblock.memory)
47 return "memory"; 48 return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
756 return ret; 757 return ret;
757 758
758 for (i = start_rgn; i < end_rgn; i++) 759 for (i = start_rgn; i < end_rgn; i++)
759 type->regions[i].nid = nid; 760 memblock_set_region_node(&type->regions[i], nid);
760 761
761 memblock_merge_regions(type); 762 memblock_merge_regions(type);
762 return 0; 763 return 0;
@@ -929,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
929 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 930 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
930} 931}
931 932
933void __init_memblock memblock_trim_memory(phys_addr_t align)
934{
935 int i;
936 phys_addr_t start, end, orig_start, orig_end;
937 struct memblock_type *mem = &memblock.memory;
938
939 for (i = 0; i < mem->cnt; i++) {
940 orig_start = mem->regions[i].base;
941 orig_end = mem->regions[i].base + mem->regions[i].size;
942 start = round_up(orig_start, align);
943 end = round_down(orig_end, align);
944
945 if (start == orig_start && end == orig_end)
946 continue;
947
948 if (start < end) {
949 mem->regions[i].base = start;
950 mem->regions[i].size = end - start;
951 } else {
952 memblock_remove_region(mem, i);
953 i--;
954 }
955 }
956}
932 957
933void __init_memblock memblock_set_current_limit(phys_addr_t limit) 958void __init_memblock memblock_set_current_limit(phys_addr_t limit)
934{ 959{
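
memblock_trim_memory() shrinks every memory region to the requested alignment: the base is rounded up, the end rounded down, and regions that collapse to nothing are removed. The same trimming over a plain array, with local round_up/round_down helpers standing in for the kernel macros (alignment assumed to be a power of two):

#include <stdio.h>

struct region {
        unsigned long base;
        unsigned long size;
};

static unsigned long round_up_to(unsigned long x, unsigned long a)
{
        return (x + a - 1) & ~(a - 1);  /* a must be a power of two */
}

static unsigned long round_down_to(unsigned long x, unsigned long a)
{
        return x & ~(a - 1);
}

/* Same shape as memblock_trim_memory(): shrink each region to aligned
 * boundaries, drop it (by compacting the array) if nothing is left. */
static int trim(struct region *r, int cnt, unsigned long align)
{
        int i, j = 0;

        for (i = 0; i < cnt; i++) {
                unsigned long start = round_up_to(r[i].base, align);
                unsigned long end = round_down_to(r[i].base + r[i].size, align);

                if (start < end) {
                        r[j].base = start;
                        r[j].size = end - start;
                        j++;
                }
                /* else: region smaller than one aligned chunk, dropped */
        }
        return j;       /* new region count */
}

int main(void)
{
        struct region r[] = { { 0x0fff, 0x3000 }, { 0x8100, 0x0800 } };
        int i, cnt = trim(r, 2, 0x1000);

        for (i = 0; i < cnt; i++)
                printf("region %d: [%#lx, %#lx)\n", i, r[i].base,
                       r[i].base + r[i].size);
        return 0;
}
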
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 795e525afaba..7acf43bf04a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
51#include <linux/oom.h> 51#include <linux/oom.h>
52#include "internal.h" 52#include "internal.h"
53#include <net/sock.h> 53#include <net/sock.h>
54#include <net/ip.h>
54#include <net/tcp_memcontrol.h> 55#include <net/tcp_memcontrol.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
326 struct mem_cgroup_stat_cpu nocpu_base; 327 struct mem_cgroup_stat_cpu nocpu_base;
327 spinlock_t pcp_counter_lock; 328 spinlock_t pcp_counter_lock;
328 329
329#ifdef CONFIG_INET 330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct tcp_memcontrol tcp_mem; 331 struct tcp_memcontrol tcp_mem;
331#endif 332#endif
332}; 333};
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
411 return container_of(s, struct mem_cgroup, css); 412 return container_of(s, struct mem_cgroup, css);
412} 413}
413 414
415static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
416{
417 return (memcg == root_mem_cgroup);
418}
419
414/* Writing them here to avoid exposing memcg's inner layout */ 420/* Writing them here to avoid exposing memcg's inner layout */
415#ifdef CONFIG_MEMCG_KMEM 421#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
416#include <net/sock.h>
417#include <net/ip.h>
418 422
419static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
420void sock_update_memcg(struct sock *sk) 423void sock_update_memcg(struct sock *sk)
421{ 424{
422 if (mem_cgroup_sockets_enabled) { 425 if (mem_cgroup_sockets_enabled) {
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk)
461 } 464 }
462} 465}
463 466
464#ifdef CONFIG_INET
465struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 467struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
466{ 468{
467 if (!memcg || mem_cgroup_is_root(memcg)) 469 if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
470 return &memcg->tcp_mem.cg_proto; 472 return &memcg->tcp_mem.cg_proto;
471} 473}
472EXPORT_SYMBOL(tcp_proto_cgroup); 474EXPORT_SYMBOL(tcp_proto_cgroup);
473#endif /* CONFIG_INET */
474#endif /* CONFIG_MEMCG_KMEM */
475 475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg) 476static void disarm_sock_keys(struct mem_cgroup *memcg)
478{ 477{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 478 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1016 iter != NULL; \ 1015 iter != NULL; \
1017 iter = mem_cgroup_iter(NULL, iter, NULL)) 1016 iter = mem_cgroup_iter(NULL, iter, NULL))
1018 1017
1019static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1020{
1021 return (memcg == root_mem_cgroup);
1022}
1023
1024void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1025{ 1019{
1026 struct mem_cgroup *memcg; 1020 struct mem_cgroup *memcg;
@@ -4973,6 +4967,13 @@ mem_cgroup_create(struct cgroup *cont)
4973 } else { 4967 } else {
4974 res_counter_init(&memcg->res, NULL); 4968 res_counter_init(&memcg->res, NULL);
4975 res_counter_init(&memcg->memsw, NULL); 4969 res_counter_init(&memcg->memsw, NULL);
4970 /*
 4971 * Deeper hierarchy with use_hierarchy == false doesn't make
4972 * much sense so let cgroup subsystem know about this
4973 * unfortunate state in our controller.
4974 */
4975 if (parent && parent != root_mem_cgroup)
4976 mem_cgroup_subsys.broken_hierarchy = true;
4976 } 4977 }
4977 memcg->last_scanned_node = MAX_NUMNODES; 4978 memcg->last_scanned_node = MAX_NUMNODES;
4978 INIT_LIST_HEAD(&memcg->oom_notify); 4979 INIT_LIST_HEAD(&memcg->oom_notify);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..6c5899b9034a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400 struct vm_area_struct *vma; 400 struct vm_area_struct *vma;
401 struct task_struct *tsk; 401 struct task_struct *tsk;
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff;
403 404
404 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma(page);
405 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
406 return; 407 return;
407 408
409 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
408 read_lock(&tasklist_lock); 410 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 411 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 412 struct anon_vma_chain *vmac;
411 413
412 if (!task_early_kill(tsk)) 414 if (!task_early_kill(tsk))
413 continue; 415 continue;
414 list_for_each_entry(vmac, &av->head, same_anon_vma) { 416 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
417 pgoff, pgoff) {
415 vma = vmac->vma; 418 vma = vmac->vma;
416 if (!page_mapped_in_vma(page, vma)) 419 if (!page_mapped_in_vma(page, vma))
417 continue; 420 continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
431{ 434{
432 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
433 struct task_struct *tsk; 436 struct task_struct *tsk;
434 struct prio_tree_iter iter;
435 struct address_space *mapping = page->mapping; 437 struct address_space *mapping = page->mapping;
436 438
437 mutex_lock(&mapping->i_mmap_mutex); 439 mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
442 if (!task_early_kill(tsk)) 444 if (!task_early_kill(tsk))
443 continue; 445 continue;
444 446
445 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, 447 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 448 pgoff) {
447 /* 449 /*
448 * Send early kill signal to tasks where a vma covers 450 * Send early kill signal to tasks where a vma covers
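
collect_procs_anon() now derives the page's offset (pgoff) and walks only the anon_vma interval-tree entries whose [vm_pgoff, vm_pgoff + pages) range contains it, rather than every vma on the old same_anon_vma list. The containment test behind that query, shown over a plain array with an invented mini_vma type:

#include <stdio.h>

#define PAGE_SHIFT      12

/* Invented mini-vma carrying just the fields the test needs. */
struct mini_vma {
        unsigned long vm_start, vm_end; /* virtual addresses */
        unsigned long vm_pgoff;         /* page offset of vm_start */
};

/* A vma maps page offset pgoff iff pgoff lies in
 * [vm_pgoff, vm_pgoff + number_of_pages - 1]; this is the interval that
 * an anon_vma_interval_tree_foreach(..., pgoff, pgoff) query matches. */
static int vma_covers_pgoff(const struct mini_vma *vma, unsigned long pgoff)
{
        unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

        return pgoff >= vma->vm_pgoff && pgoff < vma->vm_pgoff + pages;
}

int main(void)
{
        struct mini_vma vmas[] = {
                { 0x400000, 0x404000, 0 },      /* pgoffs 0..3 */
                { 0x600000, 0x602000, 16 },     /* pgoffs 16..17 */
        };
        unsigned long pgoff = 17;
        size_t i;

        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
                printf("vma %zu covers pgoff %lu: %d\n", i, pgoff,
                       vma_covers_pgoff(&vmas[i], pgoff));
        return 0;
}
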
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..fb135ba4aba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
712 add_taint(TAINT_BAD_PAGE); 712 add_taint(TAINT_BAD_PAGE);
713} 713}
714 714
715static inline int is_cow_mapping(vm_flags_t flags) 715static inline bool is_cow_mapping(vm_flags_t flags)
716{ 716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 718}
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1039 unsigned long next; 1039 unsigned long next;
1040 unsigned long addr = vma->vm_start; 1040 unsigned long addr = vma->vm_start;
1041 unsigned long end = vma->vm_end; 1041 unsigned long end = vma->vm_end;
1042 unsigned long mmun_start; /* For mmu_notifiers */
1043 unsigned long mmun_end; /* For mmu_notifiers */
1044 bool is_cow;
1042 int ret; 1045 int ret;
1043 1046
1044 /* 1047 /*
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1047 * readonly mappings. The tradeoff is that copy_page_range is more 1050 * readonly mappings. The tradeoff is that copy_page_range is more
1048 * efficient than faulting. 1051 * efficient than faulting.
1049 */ 1052 */
1050 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 1053 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1054 VM_PFNMAP | VM_MIXEDMAP))) {
1051 if (!vma->anon_vma) 1055 if (!vma->anon_vma)
1052 return 0; 1056 return 0;
1053 } 1057 }
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1055 if (is_vm_hugetlb_page(vma)) 1059 if (is_vm_hugetlb_page(vma))
1056 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1060 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1057 1061
1058 if (unlikely(is_pfn_mapping(vma))) { 1062 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1059 /* 1063 /*
1060 * We do not free on error cases below as remove_vma 1064 * We do not free on error cases below as remove_vma
1061 * gets called on error from higher level routine 1065 * gets called on error from higher level routine
1062 */ 1066 */
1063 ret = track_pfn_vma_copy(vma); 1067 ret = track_pfn_copy(vma);
1064 if (ret) 1068 if (ret)
1065 return ret; 1069 return ret;
1066 } 1070 }
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1071 * parent mm. And a permission downgrade will only happen if 1075 * parent mm. And a permission downgrade will only happen if
1072 * is_cow_mapping() returns true. 1076 * is_cow_mapping() returns true.
1073 */ 1077 */
1074 if (is_cow_mapping(vma->vm_flags)) 1078 is_cow = is_cow_mapping(vma->vm_flags);
1075 mmu_notifier_invalidate_range_start(src_mm, addr, end); 1079 mmun_start = addr;
1080 mmun_end = end;
1081 if (is_cow)
1082 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1083 mmun_end);
1076 1084
1077 ret = 0; 1085 ret = 0;
1078 dst_pgd = pgd_offset(dst_mm, addr); 1086 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1088 } 1096 }
1089 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1097 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1090 1098
1091 if (is_cow_mapping(vma->vm_flags)) 1099 if (is_cow)
1092 mmu_notifier_invalidate_range_end(src_mm, 1100 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1093 vma->vm_start, end);
1094 return ret; 1101 return ret;
1095} 1102}
1096 1103
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1327 if (vma->vm_file) 1334 if (vma->vm_file)
1328 uprobe_munmap(vma, start, end); 1335 uprobe_munmap(vma, start, end);
1329 1336
1330 if (unlikely(is_pfn_mapping(vma))) 1337 if (unlikely(vma->vm_flags & VM_PFNMAP))
1331 untrack_pfn_vma(vma, 0, 0); 1338 untrack_pfn(vma, 0, 0);
1332 1339
1333 if (start != end) { 1340 if (start != end) {
1334 if (unlikely(is_vm_hugetlb_page(vma))) { 1341 if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1521 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(&mm->page_table_lock);
1522 wait_split_huge_page(vma->anon_vma, pmd); 1529 wait_split_huge_page(vma->anon_vma, pmd);
1523 } else { 1530 } else {
1524 page = follow_trans_huge_pmd(mm, address, 1531 page = follow_trans_huge_pmd(vma, address,
1525 pmd, flags); 1532 pmd, flags);
1526 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(&mm->page_table_lock);
1527 goto out; 1534 goto out;
@@ -1576,12 +1583,12 @@ split_fallthrough:
1576 if (page->mapping && trylock_page(page)) { 1583 if (page->mapping && trylock_page(page)) {
1577 lru_add_drain(); /* push cached pages to LRU */ 1584 lru_add_drain(); /* push cached pages to LRU */
1578 /* 1585 /*
1579 * Because we lock page here and migration is 1586 * Because we lock page here, and migration is
1580 * blocked by the pte's page reference, we need 1587 * blocked by the pte's page reference, and we
1581 * only check for file-cache page truncation. 1588 * know the page is still mapped, we don't even
1589 * need to check for file-cache page truncation.
1582 */ 1590 */
1583 if (page->mapping) 1591 mlock_vma_page(page);
1584 mlock_vma_page(page);
1585 unlock_page(page); 1592 unlock_page(page);
1586 } 1593 }
1587 } 1594 }
@@ -2085,6 +2092,11 @@ out:
2085 * ask for a shared writable mapping! 2092 * ask for a shared writable mapping!
2086 * 2093 *
2087 * The page does not need to be reserved. 2094 * The page does not need to be reserved.
2095 *
2096 * Usually this function is called from f_op->mmap() handler
2097 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2098 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2099 * function from other places, for example from page-fault handler.
2088 */ 2100 */
2089int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2101int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2090 struct page *page) 2102 struct page *page)
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2093 return -EFAULT; 2105 return -EFAULT;
2094 if (!page_count(page)) 2106 if (!page_count(page))
2095 return -EINVAL; 2107 return -EINVAL;
2096 vma->vm_flags |= VM_INSERTPAGE; 2108 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2109 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2110 BUG_ON(vma->vm_flags & VM_PFNMAP);
2111 vma->vm_flags |= VM_MIXEDMAP;
2112 }
2097 return insert_page(vma, addr, page, vma->vm_page_prot); 2113 return insert_page(vma, addr, page, vma->vm_page_prot);
2098} 2114}
2099EXPORT_SYMBOL(vm_insert_page); 2115EXPORT_SYMBOL(vm_insert_page);
@@ -2132,7 +2148,7 @@ out:
2132 * @addr: target user address of this page 2148 * @addr: target user address of this page
2133 * @pfn: source kernel pfn 2149 * @pfn: source kernel pfn
2134 * 2150 *
2135 * Similar to vm_inert_page, this allows drivers to insert individual pages 2151 * Similar to vm_insert_page, this allows drivers to insert individual pages
2136 * they've allocated into a user vma. Same comments apply. 2152 * they've allocated into a user vma. Same comments apply.
2137 * 2153 *
2138 * This function should only be called from a vm_ops->fault handler, and 2154 * This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2162 2178
2163 if (addr < vma->vm_start || addr >= vma->vm_end) 2179 if (addr < vma->vm_start || addr >= vma->vm_end)
2164 return -EFAULT; 2180 return -EFAULT;
2165 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 2181 if (track_pfn_insert(vma, &pgprot, pfn))
2166 return -EINVAL; 2182 return -EINVAL;
2167 2183
2168 ret = insert_pfn(vma, addr, pfn, pgprot); 2184 ret = insert_pfn(vma, addr, pfn, pgprot);
2169 2185
2170 if (ret)
2171 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2172
2173 return ret; 2186 return ret;
2174} 2187}
2175EXPORT_SYMBOL(vm_insert_pfn); 2188EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2290 * rest of the world about it: 2303 * rest of the world about it:
2291 * VM_IO tells people not to look at these pages 2304 * VM_IO tells people not to look at these pages
2292 * (accesses can have side effects). 2305 * (accesses can have side effects).
2293 * VM_RESERVED is specified all over the place, because
2294 * in 2.4 it kept swapout's vma scan off this vma; but
2295 * in 2.6 the LRU scan won't even find its pages, so this
2296 * flag means no more than count its pages in reserved_vm,
2297 * and omit it from core dump, even when VM_IO turned off.
2298 * VM_PFNMAP tells the core MM that the base pages are just 2306 * VM_PFNMAP tells the core MM that the base pages are just
2299 * raw PFN mappings, and do not have a "struct page" associated 2307 * raw PFN mappings, and do not have a "struct page" associated
2300 * with them. 2308 * with them.
2309 * VM_DONTEXPAND
2310 * Disable vma merging and expanding with mremap().
2311 * VM_DONTDUMP
2312 * Omit vma from core dump, even when VM_IO turned off.
2301 * 2313 *
2302 * There's a horrible special case to handle copy-on-write 2314 * There's a horrible special case to handle copy-on-write
2303 * behaviour that some programs depend on. We mark the "original" 2315 * behaviour that some programs depend on. We mark the "original"
2304 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2316 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2317 * See vm_normal_page() for details.
2305 */ 2318 */
2306 if (addr == vma->vm_start && end == vma->vm_end) { 2319 if (is_cow_mapping(vma->vm_flags)) {
2320 if (addr != vma->vm_start || end != vma->vm_end)
2321 return -EINVAL;
2307 vma->vm_pgoff = pfn; 2322 vma->vm_pgoff = pfn;
2308 vma->vm_flags |= VM_PFN_AT_MMAP; 2323 }
2309 } else if (is_cow_mapping(vma->vm_flags))
2310 return -EINVAL;
2311
2312 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2313 2324
2314 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 2325 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2315 if (err) { 2326 if (err)
2316 /*
2317 * To indicate that track_pfn related cleanup is not
2318 * needed from higher level routine calling unmap_vmas
2319 */
2320 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2321 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2322 return -EINVAL; 2327 return -EINVAL;
2323 } 2328
2329 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2324 2330
2325 BUG_ON(addr >= end); 2331 BUG_ON(addr >= end);
2326 pfn -= addr >> PAGE_SHIFT; 2332 pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2335 } while (pgd++, addr = next, addr != end); 2341 } while (pgd++, addr = next, addr != end);
2336 2342
2337 if (err) 2343 if (err)
2338 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 2344 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2339 2345
2340 return err; 2346 return err;
2341} 2347}
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2516 spinlock_t *ptl, pte_t orig_pte) 2522 spinlock_t *ptl, pte_t orig_pte)
2517 __releases(ptl) 2523 __releases(ptl)
2518{ 2524{
2519 struct page *old_page, *new_page; 2525 struct page *old_page, *new_page = NULL;
2520 pte_t entry; 2526 pte_t entry;
2521 int ret = 0; 2527 int ret = 0;
2522 int page_mkwrite = 0; 2528 int page_mkwrite = 0;
2523 struct page *dirty_page = NULL; 2529 struct page *dirty_page = NULL;
2530 unsigned long mmun_start; /* For mmu_notifiers */
2531 unsigned long mmun_end; /* For mmu_notifiers */
2532 bool mmun_called = false; /* For mmu_notifiers */
2524 2533
2525 old_page = vm_normal_page(vma, address, orig_pte); 2534 old_page = vm_normal_page(vma, address, orig_pte);
2526 if (!old_page) { 2535 if (!old_page) {
@@ -2698,6 +2707,11 @@ gotten:
2698 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2707 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2699 goto oom_free_new; 2708 goto oom_free_new;
2700 2709
2710 mmun_start = address & PAGE_MASK;
2711 mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
2712 mmun_called = true;
2713 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2714
2701 /* 2715 /*
2702 * Re-check the pte - we dropped the lock 2716 * Re-check the pte - we dropped the lock
2703 */ 2717 */
@@ -2764,6 +2778,8 @@ gotten:
2764 page_cache_release(new_page); 2778 page_cache_release(new_page);
2765unlock: 2779unlock:
2766 pte_unmap_unlock(page_table, ptl); 2780 pte_unmap_unlock(page_table, ptl);
2781 if (mmun_called)
2782 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2767 if (old_page) { 2783 if (old_page) {
2768 /* 2784 /*
2769 * Don't let another task, with possibly unlocked vma, 2785 * Don't let another task, with possibly unlocked vma,
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2801 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2817 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2802} 2818}
2803 2819
2804static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2820static inline void unmap_mapping_range_tree(struct rb_root *root,
2805 struct zap_details *details) 2821 struct zap_details *details)
2806{ 2822{
2807 struct vm_area_struct *vma; 2823 struct vm_area_struct *vma;
2808 struct prio_tree_iter iter;
2809 pgoff_t vba, vea, zba, zea; 2824 pgoff_t vba, vea, zba, zea;
2810 2825
2811 vma_prio_tree_foreach(vma, &iter, root, 2826 vma_interval_tree_foreach(vma, root,
2812 details->first_index, details->last_index) { 2827 details->first_index, details->last_index) {
2813 2828
2814 vba = vma->vm_pgoff; 2829 vba = vma->vm_pgoff;
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2839 * across *all* the pages in each nonlinear VMA, not just the pages 2854 * across *all* the pages in each nonlinear VMA, not just the pages
2840 * whose virtual address lies outside the file truncation point. 2855 * whose virtual address lies outside the file truncation point.
2841 */ 2856 */
2842 list_for_each_entry(vma, head, shared.vm_set.list) { 2857 list_for_each_entry(vma, head, shared.nonlinear) {
2843 details->nonlinear_vma = vma; 2858 details->nonlinear_vma = vma;
2844 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2859 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2845 } 2860 }
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping,
2883 2898
2884 2899
2885 mutex_lock(&mapping->i_mmap_mutex); 2900 mutex_lock(&mapping->i_mmap_mutex);
2886 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2901 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2887 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2902 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2888 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2903 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2889 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2904 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
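
Several of the memory.c hunks hinge on is_cow_mapping(): a mapping is copy-on-write when it may be written (VM_MAYWRITE) but is not shared (no VM_SHARED), which is why remap_pfn_range() now applies its vm_pgoff special case only to such vmas. A tiny sketch of that bit test with invented flag values:

#include <stdbool.h>
#include <stdio.h>

/* Invented values; the real VM_SHARED/VM_MAYWRITE are defined in
 * include/linux/mm.h.  Only the relationship between the bits matters. */
#define F_SHARED        0x1UL
#define F_MAYWRITE      0x2UL

/* Same test as is_cow_mapping(): a private mapping that may be written
 * is copy-on-write; a shared one is not, even if writable. */
static bool is_cow(unsigned long flags)
{
        return (flags & (F_SHARED | F_MAYWRITE)) == F_MAYWRITE;
}

int main(void)
{
        printf("private writable: %d\n", is_cow(F_MAYWRITE));              /* 1 */
        printf("shared writable:  %d\n", is_cow(F_SHARED | F_MAYWRITE));   /* 0 */
        printf("private readonly: %d\n", is_cow(0));                       /* 0 */
        return 0;
}
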
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..56b758ae57d2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 struct zone *zone;
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
118 __free_pages_bootmem(page, 0); 119 __free_pages_bootmem(page, 0);
120
121 zone = page_zone(page);
122 zone_span_writelock(zone);
123 zone->present_pages++;
124 zone_span_writeunlock(zone);
125 totalram_pages++;
119 } 126 }
120 127
121} 128}
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
362 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 369 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
363 BUG_ON(nr_pages % PAGES_PER_SECTION); 370 BUG_ON(nr_pages % PAGES_PER_SECTION);
364 371
372 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
373
365 sections_to_remove = nr_pages / PAGES_PER_SECTION; 374 sections_to_remove = nr_pages / PAGES_PER_SECTION;
366 for (i = 0; i < sections_to_remove; i++) { 375 for (i = 0; i < sections_to_remove; i++) {
367 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 376 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
368 release_mem_region(pfn << PAGE_SHIFT,
369 PAGES_PER_SECTION << PAGE_SHIFT);
370 ret = __remove_section(zone, __pfn_to_section(pfn)); 377 ret = __remove_section(zone, __pfn_to_section(pfn));
371 if (ret) 378 if (ret)
372 break; 379 break;
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
756 return 0; 763 return 0;
757} 764}
758 765
759static struct page *
760hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
761{
762 /* This should be improooooved!! */
763 return alloc_page(GFP_HIGHUSER_MOVABLE);
764}
765
766#define NR_OFFLINE_AT_ONCE_PAGES (256) 766#define NR_OFFLINE_AT_ONCE_PAGES (256)
767static int 767static int
768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
813 putback_lru_pages(&source); 813 putback_lru_pages(&source);
814 goto out; 814 goto out;
815 } 815 }
816 /* this function returns # of failed pages */ 816
817 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 817 /*
818 * alloc_migrate_target should be improooooved!!
819 * migrate_pages returns # of failed pages.
820 */
821 ret = migrate_pages(&source, alloc_migrate_target, 0,
818 true, MIGRATE_SYNC); 822 true, MIGRATE_SYNC);
819 if (ret) 823 if (ret)
820 putback_lru_pages(&source); 824 putback_lru_pages(&source);
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
870 return offlined; 874 return offlined;
871} 875}
872 876
873static int __ref offline_pages(unsigned long start_pfn, 877static int __ref __offline_pages(unsigned long start_pfn,
874 unsigned long end_pfn, unsigned long timeout) 878 unsigned long end_pfn, unsigned long timeout)
875{ 879{
876 unsigned long pfn, nr_pages, expire; 880 unsigned long pfn, nr_pages, expire;
@@ -970,8 +974,13 @@ repeat:
970 974
971 init_per_zone_wmark_min(); 975 init_per_zone_wmark_min();
972 976
973 if (!populated_zone(zone)) 977 if (!populated_zone(zone)) {
974 zone_pcp_reset(zone); 978 zone_pcp_reset(zone);
979 mutex_lock(&zonelists_mutex);
980 build_all_zonelists(NULL, NULL);
981 mutex_unlock(&zonelists_mutex);
982 } else
983 zone_pcp_update(zone);
975 984
976 if (!node_present_pages(node)) { 985 if (!node_present_pages(node)) {
977 node_clear_state(node, N_HIGH_MEMORY); 986 node_clear_state(node, N_HIGH_MEMORY);
@@ -998,15 +1007,55 @@ out:
998 return ret; 1007 return ret;
999} 1008}
1000 1009
1010int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1011{
1012 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1013}
1014
1001int remove_memory(u64 start, u64 size) 1015int remove_memory(u64 start, u64 size)
1002{ 1016{
1017 struct memory_block *mem = NULL;
1018 struct mem_section *section;
1003 unsigned long start_pfn, end_pfn; 1019 unsigned long start_pfn, end_pfn;
1020 unsigned long pfn, section_nr;
1021 int ret;
1004 1022
1005 start_pfn = PFN_DOWN(start); 1023 start_pfn = PFN_DOWN(start);
1006 end_pfn = start_pfn + PFN_DOWN(size); 1024 end_pfn = start_pfn + PFN_DOWN(size);
1007 return offline_pages(start_pfn, end_pfn, 120 * HZ); 1025
1026 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1027 section_nr = pfn_to_section_nr(pfn);
1028 if (!present_section_nr(section_nr))
1029 continue;
1030
1031 section = __nr_to_section(section_nr);
1032 /* same memblock? */
1033 if (mem)
1034 if ((section_nr >= mem->start_section_nr) &&
1035 (section_nr <= mem->end_section_nr))
1036 continue;
1037
1038 mem = find_memory_block_hinted(section, mem);
1039 if (!mem)
1040 continue;
1041
1042 ret = offline_memory_block(mem);
1043 if (ret) {
1044 kobject_put(&mem->dev.kobj);
1045 return ret;
1046 }
1047 }
1048
1049 if (mem)
1050 kobject_put(&mem->dev.kobj);
1051
1052 return 0;
1008} 1053}
1009#else 1054#else
1055int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1056{
1057 return -EINVAL;
1058}
1010int remove_memory(u64 start, u64 size) 1059int remove_memory(u64 start, u64 size)
1011{ 1060{
1012 return -EINVAL; 1061 return -EINVAL;
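
The new remove_memory() walks the range one memory section at a time but offlines each memory block only once, skipping sections that fall inside the block it has already handled. A standalone sketch of that walk, with an invented block type and an illustrative PAGES_PER_SECTION value:

#include <stdio.h>

#define PAGES_PER_SECTION       (1UL << 15)     /* illustrative value only */

/* Invented stand-in for a memory_block spanning a run of sections. */
struct block {
        unsigned long start_section_nr, end_section_nr;
};

int main(void)
{
        /* Two blocks of four sections each, covering sections 0..7. */
        struct block blocks[] = { { 0, 3 }, { 4, 7 } };
        struct block *mem = NULL;
        unsigned long start_pfn = 0, end_pfn = 8 * PAGES_PER_SECTION, pfn;

        /* Same walk as the new remove_memory(): advance one section at a
         * time, but act only once per memory block. */
        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                unsigned long section_nr = pfn / PAGES_PER_SECTION;
                struct block *found = NULL;
                size_t i;

                /* still inside the block handled on a previous iteration */
                if (mem && section_nr >= mem->start_section_nr &&
                    section_nr <= mem->end_section_nr)
                        continue;

                for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
                        if (section_nr >= blocks[i].start_section_nr &&
                            section_nr <= blocks[i].end_section_nr)
                                found = &blocks[i];

                if (!found)
                        continue;       /* hole: no block backs this section */

                mem = found;
                printf("offlining block covering sections %lu..%lu\n",
                       mem->start_section_nr, mem->end_section_nr);
        }
        return 0;
}
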
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..d04a8a54c294 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/*
611 * Apply policy to a single VMA
612 * This must be called with the mmap_sem held for writing.
613 */
614static int vma_replace_policy(struct vm_area_struct *vma,
615 struct mempolicy *pol)
616{
617 int err;
618 struct mempolicy *old;
619 struct mempolicy *new;
620
621 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
622 vma->vm_start, vma->vm_end, vma->vm_pgoff,
623 vma->vm_ops, vma->vm_file,
624 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
625
626 new = mpol_dup(pol);
627 if (IS_ERR(new))
628 return PTR_ERR(new);
629
630 if (vma->vm_ops && vma->vm_ops->set_policy) {
631 err = vma->vm_ops->set_policy(vma, new);
632 if (err)
633 goto err_out;
634 }
635
636 old = vma->vm_policy;
637 vma->vm_policy = new; /* protected by mmap_sem */
638 mpol_put(old);
639
640 return 0;
641 err_out:
642 mpol_put(new);
643 return err;
644}
645
610/* Step 2: apply policy to a range and do splits. */ 646/* Step 2: apply policy to a range and do splits. */
611static int mbind_range(struct mm_struct *mm, unsigned long start, 647static int mbind_range(struct mm_struct *mm, unsigned long start,
612 unsigned long end, struct mempolicy *new_pol) 648 unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
655 if (err) 691 if (err)
656 goto out; 692 goto out;
657 } 693 }
658 694 err = vma_replace_policy(vma, new_pol);
659 /* 695 if (err)
660 * Apply policy to a single VMA. The reference counting of 696 goto out;
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
675 } 697 }
676 698
677 out: 699 out:
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 946 nodemask_t nmask;
925 LIST_HEAD(pagelist); 947 LIST_HEAD(pagelist);
926 int err = 0; 948 int err = 0;
927 struct vm_area_struct *vma;
928 949
929 nodes_clear(nmask); 950 nodes_clear(nmask);
930 node_set(source, nmask); 951 node_set(source, nmask);
931 952
932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 953 /*
954 * This does not "check" the range but isolates all pages that
955 * need migration. Between passing in the full user address
956 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
957 */
958 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
959 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
933 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 960 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
936 961
937 if (!list_empty(&pagelist)) { 962 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 963 err = migrate_pages(&pagelist, new_node_page, dest,
@@ -1511,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1511 * 1536 *
1512 * Returns effective policy for a VMA at specified address. 1537 * Returns effective policy for a VMA at specified address.
1513 * Falls back to @task or system default policy, as necessary. 1538 * Falls back to @task or system default policy, as necessary.
1514 * Current or other task's task mempolicy and non-shared vma policies 1539 * Current or other task's task mempolicy and non-shared vma policies must be
1515 * are protected by the task's mmap_sem, which must be held for read by 1540 * protected by task_lock(task) by the caller.
1516 * the caller.
1517 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1541 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1518 * count--added by the get_policy() vm_op, as appropriate--to protect against 1542 * count--added by the get_policy() vm_op, as appropriate--to protect against
1519 * freeing by another task. It is the caller's responsibility to free the 1543 * freeing by another task. It is the caller's responsibility to free the
@@ -1530,8 +1554,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1530 addr); 1554 addr);
1531 if (vpol) 1555 if (vpol)
1532 pol = vpol; 1556 pol = vpol;
1533 } else if (vma->vm_policy) 1557 } else if (vma->vm_policy) {
1534 pol = vma->vm_policy; 1558 pol = vma->vm_policy;
1559
1560 /*
1561 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1562 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1563 * count on these policies which will be dropped by
1564 * mpol_cond_put() later
1565 */
1566 if (mpol_needs_cond_ref(pol))
1567 mpol_get(pol);
1568 }
1535 } 1569 }
1536 if (!pol) 1570 if (!pol)
1537 pol = &default_policy; 1571 pol = &default_policy;
@@ -2061,7 +2095,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2061 */ 2095 */
2062 2096
2063/* lookup first element intersecting start-end */ 2097/* lookup first element intersecting start-end */
2064/* Caller holds sp->lock */ 2098/* Caller holds sp->mutex */
2065static struct sp_node * 2099static struct sp_node *
2066sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2100sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2067{ 2101{
@@ -2125,36 +2159,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2125 2159
2126 if (!sp->root.rb_node) 2160 if (!sp->root.rb_node)
2127 return NULL; 2161 return NULL;
2128 spin_lock(&sp->lock); 2162 mutex_lock(&sp->mutex);
2129 sn = sp_lookup(sp, idx, idx+1); 2163 sn = sp_lookup(sp, idx, idx+1);
2130 if (sn) { 2164 if (sn) {
2131 mpol_get(sn->policy); 2165 mpol_get(sn->policy);
2132 pol = sn->policy; 2166 pol = sn->policy;
2133 } 2167 }
2134 spin_unlock(&sp->lock); 2168 mutex_unlock(&sp->mutex);
2135 return pol; 2169 return pol;
2136} 2170}
2137 2171
2172static void sp_free(struct sp_node *n)
2173{
2174 mpol_put(n->policy);
2175 kmem_cache_free(sn_cache, n);
2176}
2177
2138static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2178static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2139{ 2179{
2140 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2180 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2141 rb_erase(&n->nd, &sp->root); 2181 rb_erase(&n->nd, &sp->root);
2142 mpol_put(n->policy); 2182 sp_free(n);
2143 kmem_cache_free(sn_cache, n);
2144} 2183}
2145 2184
2146static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2185static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2147 struct mempolicy *pol) 2186 struct mempolicy *pol)
2148{ 2187{
2149 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2188 struct sp_node *n;
2189 struct mempolicy *newpol;
2150 2190
2191 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2151 if (!n) 2192 if (!n)
2152 return NULL; 2193 return NULL;
2194
2195 newpol = mpol_dup(pol);
2196 if (IS_ERR(newpol)) {
2197 kmem_cache_free(sn_cache, n);
2198 return NULL;
2199 }
2200 newpol->flags |= MPOL_F_SHARED;
2201
2153 n->start = start; 2202 n->start = start;
2154 n->end = end; 2203 n->end = end;
2155 mpol_get(pol); 2204 n->policy = newpol;
2156 pol->flags |= MPOL_F_SHARED; /* for unref */ 2205
2157 n->policy = pol;
2158 return n; 2206 return n;
2159} 2207}
2160 2208
@@ -2162,10 +2210,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2162static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2210static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2163 unsigned long end, struct sp_node *new) 2211 unsigned long end, struct sp_node *new)
2164{ 2212{
2165 struct sp_node *n, *new2 = NULL; 2213 struct sp_node *n;
2214 int ret = 0;
2166 2215
2167restart: 2216 mutex_lock(&sp->mutex);
2168 spin_lock(&sp->lock);
2169 n = sp_lookup(sp, start, end); 2217 n = sp_lookup(sp, start, end);
2170 /* Take care of old policies in the same range. */ 2218 /* Take care of old policies in the same range. */
2171 while (n && n->start < end) { 2219 while (n && n->start < end) {
@@ -2178,16 +2226,14 @@ restart:
2178 } else { 2226 } else {
2179 /* Old policy spanning whole new range. */ 2227 /* Old policy spanning whole new range. */
2180 if (n->end > end) { 2228 if (n->end > end) {
2229 struct sp_node *new2;
2230 new2 = sp_alloc(end, n->end, n->policy);
2181 if (!new2) { 2231 if (!new2) {
2182 spin_unlock(&sp->lock); 2232 ret = -ENOMEM;
2183 new2 = sp_alloc(end, n->end, n->policy); 2233 goto out;
2184 if (!new2)
2185 return -ENOMEM;
2186 goto restart;
2187 } 2234 }
2188 n->end = start; 2235 n->end = start;
2189 sp_insert(sp, new2); 2236 sp_insert(sp, new2);
2190 new2 = NULL;
2191 break; 2237 break;
2192 } else 2238 } else
2193 n->end = start; 2239 n->end = start;
@@ -2198,12 +2244,9 @@ restart:
2198 } 2244 }
2199 if (new) 2245 if (new)
2200 sp_insert(sp, new); 2246 sp_insert(sp, new);
2201 spin_unlock(&sp->lock); 2247out:
2202 if (new2) { 2248 mutex_unlock(&sp->mutex);
2203 mpol_put(new2->policy); 2249 return ret;
2204 kmem_cache_free(sn_cache, new2);
2205 }
2206 return 0;
2207} 2250}
2208 2251
2209/** 2252/**
@@ -2221,7 +2264,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2221 int ret; 2264 int ret;
2222 2265
2223 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2266 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2224 spin_lock_init(&sp->lock); 2267 mutex_init(&sp->mutex);
2225 2268
2226 if (mpol) { 2269 if (mpol) {
2227 struct vm_area_struct pvma; 2270 struct vm_area_struct pvma;
@@ -2275,7 +2318,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2275 } 2318 }
2276 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2319 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2277 if (err && new) 2320 if (err && new)
2278 kmem_cache_free(sn_cache, new); 2321 sp_free(new);
2279 return err; 2322 return err;
2280} 2323}
2281 2324
@@ -2287,16 +2330,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2287 2330
2288 if (!p->root.rb_node) 2331 if (!p->root.rb_node)
2289 return; 2332 return;
2290 spin_lock(&p->lock); 2333 mutex_lock(&p->mutex);
2291 next = rb_first(&p->root); 2334 next = rb_first(&p->root);
2292 while (next) { 2335 while (next) {
2293 n = rb_entry(next, struct sp_node, nd); 2336 n = rb_entry(next, struct sp_node, nd);
2294 next = rb_next(&n->nd); 2337 next = rb_next(&n->nd);
2295 rb_erase(&n->nd, &p->root); 2338 sp_delete(p, n);
2296 mpol_put(n->policy);
2297 kmem_cache_free(sn_cache, n);
2298 } 2339 }
2299 spin_unlock(&p->lock); 2340 mutex_unlock(&p->mutex);
2300} 2341}
2301 2342
2302/* assumes fs == KERNEL_DS */ 2343/* assumes fs == KERNEL_DS */
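Reassembled from the hunks above, with the diff markup stripped and nothing added beyond what the patch shows: sp_alloc() now gives every sp_node its own mpol_dup() copy of the policy, and because sp->lock became sp->mutex, shared_policy_replace() may call this sleeping GFP_KERNEL allocator while holding the lock, which is what removes the old unlock/allocate/goto-restart loop.

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *n;
        struct mempolicy *newpol;

        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n)
                return NULL;

        /* each node owns a private, shared-marked copy of the policy */
        newpol = mpol_dup(pol);
        if (IS_ERR(newpol)) {
                kmem_cache_free(sn_cache, n);
                return NULL;
        }
        newpol->flags |= MPOL_F_SHARED;

        n->start = start;
        n->end = end;
        n->policy = newpol;

        return n;
}

On the -ENOMEM path shared_policy_replace() now simply sets ret and unwinds through the single mutex_unlock() at the out: label.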
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
51/* 51/*
52 * LRU accounting for clear_page_mlock() 52 * LRU accounting for clear_page_mlock()
53 */ 53 */
54void __clear_page_mlock(struct page *page) 54void clear_page_mlock(struct page *page)
55{ 55{
56 VM_BUG_ON(!PageLocked(page)); 56 if (!TestClearPageMlocked(page))
57
58 if (!page->mapping) { /* truncated ? */
59 return; 57 return;
60 }
61 58
62 dec_zone_page_state(page, NR_MLOCK); 59 mod_zone_page_state(page_zone(page), NR_MLOCK,
60 -hpage_nr_pages(page));
63 count_vm_event(UNEVICTABLE_PGCLEARED); 61 count_vm_event(UNEVICTABLE_PGCLEARED);
64 if (!isolate_lru_page(page)) { 62 if (!isolate_lru_page(page)) {
65 putback_lru_page(page); 63 putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
81 BUG_ON(!PageLocked(page)); 79 BUG_ON(!PageLocked(page));
82 80
83 if (!TestSetPageMlocked(page)) { 81 if (!TestSetPageMlocked(page)) {
84 inc_zone_page_state(page, NR_MLOCK); 82 mod_zone_page_state(page_zone(page), NR_MLOCK,
83 hpage_nr_pages(page));
85 count_vm_event(UNEVICTABLE_PGMLOCKED); 84 count_vm_event(UNEVICTABLE_PGMLOCKED);
86 if (!isolate_lru_page(page)) 85 if (!isolate_lru_page(page))
87 putback_lru_page(page); 86 putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
108 BUG_ON(!PageLocked(page)); 107 BUG_ON(!PageLocked(page));
109 108
110 if (TestClearPageMlocked(page)) { 109 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 110 mod_zone_page_state(page_zone(page), NR_MLOCK,
111 -hpage_nr_pages(page));
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 113 int ret = SWAP_AGAIN;
114 114
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock; 228 goto no_mlock;
229 229
230 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) || 231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) { 232 vma == get_gate_vma(current->mm))) {
233 233
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
291 if (page && !IS_ERR(page)) { 291 if (page && !IS_ERR(page)) {
292 lock_page(page); 292 lock_page(page);
293 /* 293 munlock_vma_page(page);
294 * Like in __mlock_vma_pages_range(),
295 * because we lock page here and migration is
296 * blocked by the elevated reference, we need
297 * only check for file-cache page truncation.
298 */
299 if (page->mapping)
300 munlock_vma_page(page);
301 unlock_page(page); 294 unlock_page(page);
302 put_page(page); 295 put_page(page);
303 } 296 }
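Putting the mlock.c hunks back together: the truncation check in the old __clear_page_mlock() is folded into a single TestClearPageMlocked() test, and the NR_MLOCK counter is adjusted by hpage_nr_pages(), so a transparent huge page accounts for HPAGE_PMD_NR base pages rather than one. Sketch of the resulting clear_page_mlock(), with the unchanged LRU tail elided:

void clear_page_mlock(struct page *page)
{
        if (!TestClearPageMlocked(page))
                return;

        /* -1 for a base page, -HPAGE_PMD_NR for a transparent huge page */
        mod_zone_page_state(page_zone(page), NR_MLOCK,
                            -hpage_nr_pages(page));
        count_vm_event(UNEVICTABLE_PGCLEARED);

        /* ... isolate_lru_page()/putback_lru_page() as in the hunk above ... */
}

mlock_vma_page() and munlock_vma_page() follow the same pattern with +hpage_nr_pages() and -hpage_nr_pages() respectively.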
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4e..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
51 struct vm_area_struct *vma, struct vm_area_struct *prev, 51 struct vm_area_struct *vma, struct vm_area_struct *prev,
52 unsigned long start, unsigned long end); 52 unsigned long start, unsigned long end);
53 53
54/*
55 * WARNING: the debugging will use recursive algorithms so never enable this
56 * unless you know what you are doing.
57 */
58#undef DEBUG_MM_RB
59
60/* description of effects of mapping type and prot in current implementation. 54/* description of effects of mapping type and prot in current implementation.
61 * this is due to the limited x86 page protection hardware. The expected 55 * this is due to the limited x86 page protection hardware. The expected
62 * behavior is in parens: 56 * behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
199 193
200 flush_dcache_mmap_lock(mapping); 194 flush_dcache_mmap_lock(mapping);
201 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 195 if (unlikely(vma->vm_flags & VM_NONLINEAR))
202 list_del_init(&vma->shared.vm_set.list); 196 list_del_init(&vma->shared.nonlinear);
203 else 197 else
204 vma_prio_tree_remove(vma, &mapping->i_mmap); 198 vma_interval_tree_remove(vma, &mapping->i_mmap);
205 flush_dcache_mmap_unlock(mapping); 199 flush_dcache_mmap_unlock(mapping);
206} 200}
207 201
208/* 202/*
209 * Unlink a file-based vm structure from its prio_tree, to hide 203 * Unlink a file-based vm structure from its interval tree, to hide
210 * vma from rmap and vmtruncate before freeing its page tables. 204 * vma from rmap and vmtruncate before freeing its page tables.
211 */ 205 */
212void unlink_file_vma(struct vm_area_struct *vma) 206void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
231 might_sleep(); 225 might_sleep();
232 if (vma->vm_ops && vma->vm_ops->close) 226 if (vma->vm_ops && vma->vm_ops->close)
233 vma->vm_ops->close(vma); 227 vma->vm_ops->close(vma);
234 if (vma->vm_file) { 228 if (vma->vm_file)
235 fput(vma->vm_file); 229 fput(vma->vm_file);
236 if (vma->vm_flags & VM_EXECUTABLE)
237 removed_exe_file_vma(vma->vm_mm);
238 }
239 mpol_put(vma_policy(vma)); 230 mpol_put(vma_policy(vma));
240 kmem_cache_free(vm_area_cachep, vma); 231 kmem_cache_free(vm_area_cachep, vma);
241 return next; 232 return next;
@@ -306,7 +297,7 @@ out:
306 return retval; 297 return retval;
307} 298}
308 299
309#ifdef DEBUG_MM_RB 300#ifdef CONFIG_DEBUG_VM_RB
310static int browse_rb(struct rb_root *root) 301static int browse_rb(struct rb_root *root)
311{ 302{
312 int i = 0, j; 303 int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
340{ 331{
341 int bug = 0; 332 int bug = 0;
342 int i = 0; 333 int i = 0;
343 struct vm_area_struct *tmp = mm->mmap; 334 struct vm_area_struct *vma = mm->mmap;
344 while (tmp) { 335 while (vma) {
345 tmp = tmp->vm_next; 336 struct anon_vma_chain *avc;
337 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
338 anon_vma_interval_tree_verify(avc);
339 vma = vma->vm_next;
346 i++; 340 i++;
347 } 341 }
348 if (i != mm->map_count) 342 if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
356#define validate_mm(mm) do { } while (0) 350#define validate_mm(mm) do { } while (0)
357#endif 351#endif
358 352
359static struct vm_area_struct * 353/*
360find_vma_prepare(struct mm_struct *mm, unsigned long addr, 354 * vma has some anon_vma assigned, and is already inserted on that
361 struct vm_area_struct **pprev, struct rb_node ***rb_link, 355 * anon_vma's interval trees.
362 struct rb_node ** rb_parent) 356 *
357 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
358 * vma must be removed from the anon_vma's interval trees using
359 * anon_vma_interval_tree_pre_update_vma().
360 *
361 * After the update, the vma will be reinserted using
362 * anon_vma_interval_tree_post_update_vma().
363 *
364 * The entire update must be protected by exclusive mmap_sem and by
365 * the root anon_vma's mutex.
366 */
367static inline void
368anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
363{ 369{
364 struct vm_area_struct * vma; 370 struct anon_vma_chain *avc;
365 struct rb_node ** __rb_link, * __rb_parent, * rb_prev; 371
372 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
373 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
374}
375
376static inline void
377anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
378{
379 struct anon_vma_chain *avc;
380
381 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
382 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
383}
384
385static int find_vma_links(struct mm_struct *mm, unsigned long addr,
386 unsigned long end, struct vm_area_struct **pprev,
387 struct rb_node ***rb_link, struct rb_node **rb_parent)
388{
389 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
366 390
367 __rb_link = &mm->mm_rb.rb_node; 391 __rb_link = &mm->mm_rb.rb_node;
368 rb_prev = __rb_parent = NULL; 392 rb_prev = __rb_parent = NULL;
369 vma = NULL;
370 393
371 while (*__rb_link) { 394 while (*__rb_link) {
372 struct vm_area_struct *vma_tmp; 395 struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
375 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 398 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
376 399
377 if (vma_tmp->vm_end > addr) { 400 if (vma_tmp->vm_end > addr) {
378 vma = vma_tmp; 401 /* Fail if an existing vma overlaps the area */
379 if (vma_tmp->vm_start <= addr) 402 if (vma_tmp->vm_start < end)
380 break; 403 return -ENOMEM;
381 __rb_link = &__rb_parent->rb_left; 404 __rb_link = &__rb_parent->rb_left;
382 } else { 405 } else {
383 rb_prev = __rb_parent; 406 rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
390 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 413 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
391 *rb_link = __rb_link; 414 *rb_link = __rb_link;
392 *rb_parent = __rb_parent; 415 *rb_parent = __rb_parent;
393 return vma; 416 return 0;
394} 417}
395 418
396void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 419void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
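The comment above amounts to a bracket discipline, and the later vma_adjust() and stack-growth hunks in this file follow it: while holding the anon_vma lock (anon_vma_lock() in vma_adjust(), vma_lock_anon_vma() in expand_upwards()/expand_downwards()), pull the vma out of the interval trees, edit its range, then reinsert it. Condensed sketch, with error paths and accounting elided and new_start/new_end as placeholder values:

        /* exclusive mmap_sem is already held */
        anon_vma_lock(vma->anon_vma);

        anon_vma_interval_tree_pre_update_vma(vma);     /* remove from the trees */
        vma->vm_start = new_start;                      /* placeholder bounds */
        vma->vm_end   = new_end;
        anon_vma_interval_tree_post_update_vma(vma);    /* reinsert with new range */

        anon_vma_unlock(vma->anon_vma);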
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
417 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 440 if (unlikely(vma->vm_flags & VM_NONLINEAR))
418 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 441 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
419 else 442 else
420 vma_prio_tree_insert(vma, &mapping->i_mmap); 443 vma_interval_tree_insert(vma, &mapping->i_mmap);
421 flush_dcache_mmap_unlock(mapping); 444 flush_dcache_mmap_unlock(mapping);
422 } 445 }
423} 446}
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
455 478
456/* 479/*
457 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 480 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
458 * mm's list and rbtree. It has already been inserted into the prio_tree. 481 * mm's list and rbtree. It has already been inserted into the interval tree.
459 */ 482 */
460static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 483static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
461{ 484{
462 struct vm_area_struct *__vma, *prev; 485 struct vm_area_struct *prev;
463 struct rb_node **rb_link, *rb_parent; 486 struct rb_node **rb_link, *rb_parent;
464 487
465 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 488 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
466 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 489 &prev, &rb_link, &rb_parent))
490 BUG();
467 __vma_link(mm, vma, prev, rb_link, rb_parent); 491 __vma_link(mm, vma, prev, rb_link, rb_parent);
468 mm->map_count++; 492 mm->map_count++;
469} 493}
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
496 struct vm_area_struct *next = vma->vm_next; 520 struct vm_area_struct *next = vma->vm_next;
497 struct vm_area_struct *importer = NULL; 521 struct vm_area_struct *importer = NULL;
498 struct address_space *mapping = NULL; 522 struct address_space *mapping = NULL;
499 struct prio_tree_root *root = NULL; 523 struct rb_root *root = NULL;
500 struct anon_vma *anon_vma = NULL; 524 struct anon_vma *anon_vma = NULL;
501 struct file *file = vma->vm_file; 525 struct file *file = vma->vm_file;
502 long adjust_next = 0; 526 long adjust_next = 0;
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end);
559 mutex_lock(&mapping->i_mmap_mutex); 583 mutex_lock(&mapping->i_mmap_mutex);
560 if (insert) { 584 if (insert) {
561 /* 585 /*
562 * Put into prio_tree now, so instantiated pages 586 * Put into interval tree now, so instantiated pages
563 * are visible to arm/parisc __flush_dcache_page 587 * are visible to arm/parisc __flush_dcache_page
564 * throughout; but we cannot insert into address 588 * throughout; but we cannot insert into address
565 * space until vma start or end is updated. 589 * space until vma start or end is updated.
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end);
570 594
571 vma_adjust_trans_huge(vma, start, end, adjust_next); 595 vma_adjust_trans_huge(vma, start, end, adjust_next);
572 596
573 /* 597 anon_vma = vma->anon_vma;
574 * When changing only vma->vm_end, we don't really need anon_vma 598 if (!anon_vma && adjust_next)
575 * lock. This is a fairly rare case by itself, but the anon_vma 599 anon_vma = next->anon_vma;
576 * lock may be shared between many sibling processes. Skipping 600 if (anon_vma) {
577 * the lock for brk adjustments makes a difference sometimes. 601 VM_BUG_ON(adjust_next && next->anon_vma &&
578 */ 602 anon_vma != next->anon_vma);
579 if (vma->anon_vma && (importer || start != vma->vm_start)) {
580 anon_vma = vma->anon_vma;
581 anon_vma_lock(anon_vma); 603 anon_vma_lock(anon_vma);
604 anon_vma_interval_tree_pre_update_vma(vma);
605 if (adjust_next)
606 anon_vma_interval_tree_pre_update_vma(next);
582 } 607 }
583 608
584 if (root) { 609 if (root) {
585 flush_dcache_mmap_lock(mapping); 610 flush_dcache_mmap_lock(mapping);
586 vma_prio_tree_remove(vma, root); 611 vma_interval_tree_remove(vma, root);
587 if (adjust_next) 612 if (adjust_next)
588 vma_prio_tree_remove(next, root); 613 vma_interval_tree_remove(next, root);
589 } 614 }
590 615
591 vma->vm_start = start; 616 vma->vm_start = start;
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end);
598 623
599 if (root) { 624 if (root) {
600 if (adjust_next) 625 if (adjust_next)
601 vma_prio_tree_insert(next, root); 626 vma_interval_tree_insert(next, root);
602 vma_prio_tree_insert(vma, root); 627 vma_interval_tree_insert(vma, root);
603 flush_dcache_mmap_unlock(mapping); 628 flush_dcache_mmap_unlock(mapping);
604 } 629 }
605 630
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end);
620 __insert_vm_struct(mm, insert); 645 __insert_vm_struct(mm, insert);
621 } 646 }
622 647
623 if (anon_vma) 648 if (anon_vma) {
649 anon_vma_interval_tree_post_update_vma(vma);
650 if (adjust_next)
651 anon_vma_interval_tree_post_update_vma(next);
624 anon_vma_unlock(anon_vma); 652 anon_vma_unlock(anon_vma);
653 }
625 if (mapping) 654 if (mapping)
626 mutex_unlock(&mapping->i_mmap_mutex); 655 mutex_unlock(&mapping->i_mmap_mutex);
627 656
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end);
636 if (file) { 665 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end); 666 uprobe_munmap(next, next->vm_start, next->vm_end);
638 fput(file); 667 fput(file);
639 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm);
641 } 668 }
642 if (next->anon_vma) 669 if (next->anon_vma)
643 anon_vma_merge(vma, next); 670 anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 696static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 697 struct file *file, unsigned long vm_flags)
671{ 698{
672 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ 699 if (vma->vm_flags ^ vm_flags)
673 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
674 return 0; 700 return 0;
675 if (vma->vm_file != file) 701 if (vma->vm_file != file)
676 return 0; 702 return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
951 mm->exec_vm += pages; 977 mm->exec_vm += pages;
952 } else if (flags & stack_flags) 978 } else if (flags & stack_flags)
953 mm->stack_vm += pages; 979 mm->stack_vm += pages;
954 if (flags & (VM_RESERVED|VM_IO))
955 mm->reserved_vm += pages;
956} 980}
957#endif /* CONFIG_PROC_FS */ 981#endif /* CONFIG_PROC_FS */
958 982
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1190 return 0; 1214 return 0;
1191 1215
1192 /* Specialty mapping? */ 1216 /* Specialty mapping? */
1193 if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) 1217 if (vm_flags & VM_PFNMAP)
1194 return 0; 1218 return 0;
1195 1219
1196 /* Can the mapping track the dirty pages? */ 1220 /* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1229 /* Clear old maps */ 1253 /* Clear old maps */
1230 error = -ENOMEM; 1254 error = -ENOMEM;
1231munmap_back: 1255munmap_back:
1232 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 1256 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1233 if (vma && vma->vm_start < addr + len) {
1234 if (do_munmap(mm, addr, len)) 1257 if (do_munmap(mm, addr, len))
1235 return -ENOMEM; 1258 return -ENOMEM;
1236 goto munmap_back; 1259 goto munmap_back;
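find_vma_links() folds the old "did find_vma_prepare() hand back an overlapping vma?" check into its return value, so mmap_region() above and do_brk()/insert_vm_struct() further down all reduce to the same shape:

munmap_back:
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
                /* [addr, addr + len) overlaps an existing vma: clear it, retry */
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;
        }
        /* prev/rb_link/rb_parent now describe a guaranteed-free slot */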
@@ -1301,13 +1324,10 @@ munmap_back:
1301 goto free_vma; 1324 goto free_vma;
1302 correct_wcount = 1; 1325 correct_wcount = 1;
1303 } 1326 }
1304 vma->vm_file = file; 1327 vma->vm_file = get_file(file);
1305 get_file(file);
1306 error = file->f_op->mmap(file, vma); 1328 error = file->f_op->mmap(file, vma);
1307 if (error) 1329 if (error)
1308 goto unmap_and_free_vma; 1330 goto unmap_and_free_vma;
1309 if (vm_flags & VM_EXECUTABLE)
1310 added_exe_file_vma(mm);
1311 1331
1312 /* Can addr have changed?? 1332 /* Can addr have changed??
1313 * 1333 *
@@ -1758,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1758 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 1778 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1759 error = acct_stack_growth(vma, size, grow); 1779 error = acct_stack_growth(vma, size, grow);
1760 if (!error) { 1780 if (!error) {
1781 anon_vma_interval_tree_pre_update_vma(vma);
1761 vma->vm_end = address; 1782 vma->vm_end = address;
1783 anon_vma_interval_tree_post_update_vma(vma);
1762 perf_event_mmap(vma); 1784 perf_event_mmap(vma);
1763 } 1785 }
1764 } 1786 }
1765 } 1787 }
1766 vma_unlock_anon_vma(vma); 1788 vma_unlock_anon_vma(vma);
1767 khugepaged_enter_vma_merge(vma); 1789 khugepaged_enter_vma_merge(vma);
1790 validate_mm(vma->vm_mm);
1768 return error; 1791 return error;
1769} 1792}
1770#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1793#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
1808 if (grow <= vma->vm_pgoff) { 1831 if (grow <= vma->vm_pgoff) {
1809 error = acct_stack_growth(vma, size, grow); 1832 error = acct_stack_growth(vma, size, grow);
1810 if (!error) { 1833 if (!error) {
1834 anon_vma_interval_tree_pre_update_vma(vma);
1811 vma->vm_start = address; 1835 vma->vm_start = address;
1812 vma->vm_pgoff -= grow; 1836 vma->vm_pgoff -= grow;
1837 anon_vma_interval_tree_post_update_vma(vma);
1813 perf_event_mmap(vma); 1838 perf_event_mmap(vma);
1814 } 1839 }
1815 } 1840 }
1816 } 1841 }
1817 vma_unlock_anon_vma(vma); 1842 vma_unlock_anon_vma(vma);
1818 khugepaged_enter_vma_merge(vma); 1843 khugepaged_enter_vma_merge(vma);
1844 validate_mm(vma->vm_mm);
1819 return error; 1845 return error;
1820} 1846}
1821 1847
@@ -1989,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1989 if (anon_vma_clone(new, vma)) 2015 if (anon_vma_clone(new, vma))
1990 goto out_free_mpol; 2016 goto out_free_mpol;
1991 2017
1992 if (new->vm_file) { 2018 if (new->vm_file)
1993 get_file(new->vm_file); 2019 get_file(new->vm_file);
1994 if (vma->vm_flags & VM_EXECUTABLE)
1995 added_exe_file_vma(mm);
1996 }
1997 2020
1998 if (new->vm_ops && new->vm_ops->open) 2021 if (new->vm_ops && new->vm_ops->open)
1999 new->vm_ops->open(new); 2022 new->vm_ops->open(new);
@@ -2011,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2011 /* Clean everything up if vma_adjust failed. */ 2034 /* Clean everything up if vma_adjust failed. */
2012 if (new->vm_ops && new->vm_ops->close) 2035 if (new->vm_ops && new->vm_ops->close)
2013 new->vm_ops->close(new); 2036 new->vm_ops->close(new);
2014 if (new->vm_file) { 2037 if (new->vm_file)
2015 if (vma->vm_flags & VM_EXECUTABLE)
2016 removed_exe_file_vma(mm);
2017 fput(new->vm_file); 2038 fput(new->vm_file);
2018 }
2019 unlink_anon_vmas(new); 2039 unlink_anon_vmas(new);
2020 out_free_mpol: 2040 out_free_mpol:
2021 mpol_put(pol); 2041 mpol_put(pol);
@@ -2200,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2200 * Clear old maps. this also does some error checking for us 2220 * Clear old maps. this also does some error checking for us
2201 */ 2221 */
2202 munmap_back: 2222 munmap_back:
2203 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2223 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2204 if (vma && vma->vm_start < addr + len) {
2205 if (do_munmap(mm, addr, len)) 2224 if (do_munmap(mm, addr, len))
2206 return -ENOMEM; 2225 return -ENOMEM;
2207 goto munmap_back; 2226 goto munmap_back;
@@ -2315,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm)
2315 * and into the inode's i_mmap tree. If vm_file is non-NULL 2334 * and into the inode's i_mmap tree. If vm_file is non-NULL
2316 * then i_mmap_mutex is taken here. 2335 * then i_mmap_mutex is taken here.
2317 */ 2336 */
2318int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2337int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2319{ 2338{
2320 struct vm_area_struct * __vma, * prev; 2339 struct vm_area_struct *prev;
2321 struct rb_node ** rb_link, * rb_parent; 2340 struct rb_node **rb_link, *rb_parent;
2322 2341
2323 /* 2342 /*
2324 * The vm_pgoff of a purely anonymous vma should be irrelevant 2343 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2336,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2336 BUG_ON(vma->anon_vma); 2355 BUG_ON(vma->anon_vma);
2337 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2356 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2338 } 2357 }
2339 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); 2358 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2340 if (__vma && __vma->vm_start < vma->vm_end) 2359 &prev, &rb_link, &rb_parent))
2341 return -ENOMEM; 2360 return -ENOMEM;
2342 if ((vma->vm_flags & VM_ACCOUNT) && 2361 if ((vma->vm_flags & VM_ACCOUNT) &&
2343 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2362 security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2352,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2352 * prior to moving page table entries, to effect an mremap move. 2371 * prior to moving page table entries, to effect an mremap move.
2353 */ 2372 */
2354struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2355 unsigned long addr, unsigned long len, pgoff_t pgoff) 2374 unsigned long addr, unsigned long len, pgoff_t pgoff,
2375 bool *need_rmap_locks)
2356{ 2376{
2357 struct vm_area_struct *vma = *vmap; 2377 struct vm_area_struct *vma = *vmap;
2358 unsigned long vma_start = vma->vm_start; 2378 unsigned long vma_start = vma->vm_start;
@@ -2371,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2371 faulted_in_anon_vma = false; 2391 faulted_in_anon_vma = false;
2372 } 2392 }
2373 2393
2374 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2394 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2395 return NULL; /* should never get here */
2375 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2396 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2376 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2397 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2377 if (new_vma) { 2398 if (new_vma) {
@@ -2393,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2393 * linear if there are no pages mapped yet. 2414 * linear if there are no pages mapped yet.
2394 */ 2415 */
2395 VM_BUG_ON(faulted_in_anon_vma); 2416 VM_BUG_ON(faulted_in_anon_vma);
2396 *vmap = new_vma; 2417 *vmap = vma = new_vma;
2397 } else 2418 }
2398 anon_vma_moveto_tail(new_vma); 2419 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2399 } else { 2420 } else {
2400 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2421 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2401 if (new_vma) { 2422 if (new_vma) {
2402 *new_vma = *vma; 2423 *new_vma = *vma;
2424 new_vma->vm_start = addr;
2425 new_vma->vm_end = addr + len;
2426 new_vma->vm_pgoff = pgoff;
2403 pol = mpol_dup(vma_policy(vma)); 2427 pol = mpol_dup(vma_policy(vma));
2404 if (IS_ERR(pol)) 2428 if (IS_ERR(pol))
2405 goto out_free_vma; 2429 goto out_free_vma;
2430 vma_set_policy(new_vma, pol);
2406 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2431 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2407 if (anon_vma_clone(new_vma, vma)) 2432 if (anon_vma_clone(new_vma, vma))
2408 goto out_free_mempol; 2433 goto out_free_mempol;
2409 vma_set_policy(new_vma, pol); 2434 if (new_vma->vm_file)
2410 new_vma->vm_start = addr;
2411 new_vma->vm_end = addr + len;
2412 new_vma->vm_pgoff = pgoff;
2413 if (new_vma->vm_file) {
2414 get_file(new_vma->vm_file); 2435 get_file(new_vma->vm_file);
2415
2416 if (vma->vm_flags & VM_EXECUTABLE)
2417 added_exe_file_vma(mm);
2418 }
2419 if (new_vma->vm_ops && new_vma->vm_ops->open) 2436 if (new_vma->vm_ops && new_vma->vm_ops->open)
2420 new_vma->vm_ops->open(new_vma); 2437 new_vma->vm_ops->open(new_vma);
2421 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2438 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2439 *need_rmap_locks = false;
2422 } 2440 }
2423 } 2441 }
2424 return new_vma; 2442 return new_vma;
@@ -2536,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2536 2554
2537static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2555static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2538{ 2556{
2539 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2557 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2540 /* 2558 /*
2541 * The LSB of head.next can't change from under us 2559 * The LSB of head.next can't change from under us
2542 * because we hold the mm_all_locks_mutex. 2560 * because we hold the mm_all_locks_mutex.
@@ -2552,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2552 * anon_vma->root->mutex. 2570 * anon_vma->root->mutex.
2553 */ 2571 */
2554 if (__test_and_set_bit(0, (unsigned long *) 2572 if (__test_and_set_bit(0, (unsigned long *)
2555 &anon_vma->root->head.next)) 2573 &anon_vma->root->rb_root.rb_node))
2556 BUG(); 2574 BUG();
2557 } 2575 }
2558} 2576}
@@ -2593,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2593 * A single task can't take more than one mm_take_all_locks() in a row 2611 * A single task can't take more than one mm_take_all_locks() in a row
2594 * or it would deadlock. 2612 * or it would deadlock.
2595 * 2613 *
2596 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in 2614 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2597 * mapping->flags avoid to take the same lock twice, if more than one 2615 * mapping->flags avoid to take the same lock twice, if more than one
2598 * vma in this mm is backed by the same anon_vma or address_space. 2616 * vma in this mm is backed by the same anon_vma or address_space.
2599 * 2617 *
@@ -2640,13 +2658,13 @@ out_unlock:
2640 2658
2641static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2659static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2642{ 2660{
2643 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2661 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2644 /* 2662 /*
2645 * The LSB of head.next can't change to 0 from under 2663 * The LSB of head.next can't change to 0 from under
2646 * us because we hold the mm_all_locks_mutex. 2664 * us because we hold the mm_all_locks_mutex.
2647 * 2665 *
2648 * We must however clear the bitflag before unlocking 2666 * We must however clear the bitflag before unlocking
2649 * the vma so the users using the anon_vma->head will 2667 * the vma so the users using the anon_vma->rb_root will
2650 * never see our bitflag. 2668 * never see our bitflag.
2651 * 2669 *
2652 * No need of atomic instructions here, head.next 2670 * No need of atomic instructions here, head.next
@@ -2654,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2654 * anon_vma->root->mutex. 2672 * anon_vma->root->mutex.
2655 */ 2673 */
2656 if (!__test_and_clear_bit(0, (unsigned long *) 2674 if (!__test_and_clear_bit(0, (unsigned long *)
2657 &anon_vma->root->head.next)) 2675 &anon_vma->root->rb_root.rb_node))
2658 BUG(); 2676 BUG();
2659 anon_vma_unlock(anon_vma); 2677 anon_vma_unlock(anon_vma);
2660 } 2678 }
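These last mmap.c hunks only move the mm_take_all_locks() marker bit: with the anon_vma list replaced by an rb-tree, the LSB lives in anon_vma->root->rb_root.rb_node instead of anon_vma->root->head.next. The marking logic itself is unchanged; condensed, with the actual mutex acquisition (not shown in the hunk) left as a comment:

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                /* ... take anon_vma->root->mutex under mm->mmap_sem ... */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_node))
                        BUG();  /* mm_all_locks_mutex rules out any race here */
        }
}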
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..8a5ac8c686b0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/srcu.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20 21
22/* global SRCU for all MMs */
23static struct srcu_struct srcu;
24
21/* 25/*
22 * This function can't run concurrently against mmu_notifier_register 26 * This function can't run concurrently against mmu_notifier_register
23 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
25 * in parallel despite there being no task using this mm any more, 29 * in parallel despite there being no task using this mm any more,
26 * through the vmas outside of the exit_mmap context, such as with 30 * through the vmas outside of the exit_mmap context, such as with
27 * vmtruncate. This serializes against mmu_notifier_unregister with 31 * vmtruncate. This serializes against mmu_notifier_unregister with
28 * the mmu_notifier_mm->lock in addition to RCU and it serializes 32 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
29 * against the other mmu notifiers with RCU. struct mmu_notifier_mm 33 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
30 * can't go away from under us as exit_mmap holds an mm_count pin 34 * can't go away from under us as exit_mmap holds an mm_count pin
31 * itself. 35 * itself.
32 */ 36 */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
34{ 38{
35 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
36 struct hlist_node *n; 40 struct hlist_node *n;
41 int id;
37 42
38 /* 43 /*
39 * RCU here will block mmu_notifier_unregister until 44 * SRCU here will block mmu_notifier_unregister until
40 * ->release returns. 45 * ->release returns.
41 */ 46 */
42 rcu_read_lock(); 47 id = srcu_read_lock(&srcu);
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) 48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /* 49 /*
45 * if ->release runs before mmu_notifier_unregister it 50 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
50 */ 55 */
51 if (mn->ops->release) 56 if (mn->ops->release)
52 mn->ops->release(mn, mm); 57 mn->ops->release(mn, mm);
53 rcu_read_unlock(); 58 srcu_read_unlock(&srcu, id);
54 59
55 spin_lock(&mm->mmu_notifier_mm->lock); 60 spin_lock(&mm->mmu_notifier_mm->lock);
56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
68 spin_unlock(&mm->mmu_notifier_mm->lock); 73 spin_unlock(&mm->mmu_notifier_mm->lock);
69 74
70 /* 75 /*
71 * synchronize_rcu here prevents mmu_notifier_release to 76 * synchronize_srcu here prevents mmu_notifier_release to
72 * return to exit_mmap (which would proceed freeing all pages 77 * return to exit_mmap (which would proceed freeing all pages
73 * in the mm) until the ->release method returns, if it was 78 * in the mm) until the ->release method returns, if it was
74 * invoked by mmu_notifier_unregister. 79 * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
76 * The mmu_notifier_mm can't go away from under us because one 81 * The mmu_notifier_mm can't go away from under us because one
77 * mm_count is hold by exit_mmap. 82 * mm_count is hold by exit_mmap.
78 */ 83 */
79 synchronize_rcu(); 84 synchronize_srcu(&srcu);
80} 85}
81 86
82/* 87/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
89{ 94{
90 struct mmu_notifier *mn; 95 struct mmu_notifier *mn;
91 struct hlist_node *n; 96 struct hlist_node *n;
92 int young = 0; 97 int young = 0, id;
93 98
94 rcu_read_lock(); 99 id = srcu_read_lock(&srcu);
95 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
96 if (mn->ops->clear_flush_young) 101 if (mn->ops->clear_flush_young)
97 young |= mn->ops->clear_flush_young(mn, mm, address); 102 young |= mn->ops->clear_flush_young(mn, mm, address);
98 } 103 }
99 rcu_read_unlock(); 104 srcu_read_unlock(&srcu, id);
100 105
101 return young; 106 return young;
102} 107}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
106{ 111{
107 struct mmu_notifier *mn; 112 struct mmu_notifier *mn;
108 struct hlist_node *n; 113 struct hlist_node *n;
109 int young = 0; 114 int young = 0, id;
110 115
111 rcu_read_lock(); 116 id = srcu_read_lock(&srcu);
112 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
113 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
114 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
116 break; 121 break;
117 } 122 }
118 } 123 }
119 rcu_read_unlock(); 124 srcu_read_unlock(&srcu, id);
120 125
121 return young; 126 return young;
122} 127}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
126{ 131{
127 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
128 struct hlist_node *n; 133 struct hlist_node *n;
134 int id;
129 135
130 rcu_read_lock(); 136 id = srcu_read_lock(&srcu);
131 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
132 if (mn->ops->change_pte) 138 if (mn->ops->change_pte)
133 mn->ops->change_pte(mn, mm, address, pte); 139 mn->ops->change_pte(mn, mm, address, pte);
134 /*
135 * Some drivers don't have change_pte,
136 * so we must call invalidate_page in that case.
137 */
138 else if (mn->ops->invalidate_page)
139 mn->ops->invalidate_page(mn, mm, address);
140 } 140 }
141 rcu_read_unlock(); 141 srcu_read_unlock(&srcu, id);
142} 142}
143 143
144void __mmu_notifier_invalidate_page(struct mm_struct *mm, 144void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
146{ 146{
147 struct mmu_notifier *mn; 147 struct mmu_notifier *mn;
148 struct hlist_node *n; 148 struct hlist_node *n;
149 int id;
149 150
150 rcu_read_lock(); 151 id = srcu_read_lock(&srcu);
151 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
152 if (mn->ops->invalidate_page) 153 if (mn->ops->invalidate_page)
153 mn->ops->invalidate_page(mn, mm, address); 154 mn->ops->invalidate_page(mn, mm, address);
154 } 155 }
155 rcu_read_unlock(); 156 srcu_read_unlock(&srcu, id);
156} 157}
157 158
158void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 159void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160{ 161{
161 struct mmu_notifier *mn; 162 struct mmu_notifier *mn;
162 struct hlist_node *n; 163 struct hlist_node *n;
164 int id;
163 165
164 rcu_read_lock(); 166 id = srcu_read_lock(&srcu);
165 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
166 if (mn->ops->invalidate_range_start) 168 if (mn->ops->invalidate_range_start)
167 mn->ops->invalidate_range_start(mn, mm, start, end); 169 mn->ops->invalidate_range_start(mn, mm, start, end);
168 } 170 }
169 rcu_read_unlock(); 171 srcu_read_unlock(&srcu, id);
170} 172}
171 173
172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
174{ 176{
175 struct mmu_notifier *mn; 177 struct mmu_notifier *mn;
176 struct hlist_node *n; 178 struct hlist_node *n;
179 int id;
177 180
178 rcu_read_lock(); 181 id = srcu_read_lock(&srcu);
179 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
180 if (mn->ops->invalidate_range_end) 183 if (mn->ops->invalidate_range_end)
181 mn->ops->invalidate_range_end(mn, mm, start, end); 184 mn->ops->invalidate_range_end(mn, mm, start, end);
182 } 185 }
183 rcu_read_unlock(); 186 srcu_read_unlock(&srcu, id);
184} 187}
185 188
186static int do_mmu_notifier_register(struct mmu_notifier *mn, 189static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +195,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
192 195
193 BUG_ON(atomic_read(&mm->mm_users) <= 0); 196 BUG_ON(atomic_read(&mm->mm_users) <= 0);
194 197
198 /*
199 * Verify that mmu_notifier_init() already run and the global srcu is
200 * initialized.
201 */
202 BUG_ON(!srcu.per_cpu_ref);
203
195 ret = -ENOMEM; 204 ret = -ENOMEM;
196 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 205 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
197 if (unlikely(!mmu_notifier_mm)) 206 if (unlikely(!mmu_notifier_mm))
@@ -201,11 +210,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
201 down_write(&mm->mmap_sem); 210 down_write(&mm->mmap_sem);
202 ret = mm_take_all_locks(mm); 211 ret = mm_take_all_locks(mm);
203 if (unlikely(ret)) 212 if (unlikely(ret))
204 goto out_cleanup; 213 goto out_clean;
205 214
206 if (!mm_has_notifiers(mm)) { 215 if (!mm_has_notifiers(mm)) {
207 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 216 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
208 spin_lock_init(&mmu_notifier_mm->lock); 217 spin_lock_init(&mmu_notifier_mm->lock);
218
209 mm->mmu_notifier_mm = mmu_notifier_mm; 219 mm->mmu_notifier_mm = mmu_notifier_mm;
210 mmu_notifier_mm = NULL; 220 mmu_notifier_mm = NULL;
211 } 221 }
@@ -224,10 +234,9 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
224 spin_unlock(&mm->mmu_notifier_mm->lock); 234 spin_unlock(&mm->mmu_notifier_mm->lock);
225 235
226 mm_drop_all_locks(mm); 236 mm_drop_all_locks(mm);
227out_cleanup: 237out_clean:
228 if (take_mmap_sem) 238 if (take_mmap_sem)
229 up_write(&mm->mmap_sem); 239 up_write(&mm->mmap_sem);
230 /* kfree() does nothing if mmu_notifier_mm is NULL */
231 kfree(mmu_notifier_mm); 240 kfree(mmu_notifier_mm);
232out: 241out:
233 BUG_ON(atomic_read(&mm->mm_users) <= 0); 242 BUG_ON(atomic_read(&mm->mm_users) <= 0);
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
274/* 283/*
275 * This releases the mm_count pin automatically and frees the mm 284 * This releases the mm_count pin automatically and frees the mm
276 * structure if it was the last user of it. It serializes against 285 * structure if it was the last user of it. It serializes against
277 * running mmu notifiers with RCU and against mmu_notifier_unregister 286 * running mmu notifiers with SRCU and against mmu_notifier_unregister
278 * with the unregister lock + RCU. All sptes must be dropped before 287 * with the unregister lock + SRCU. All sptes must be dropped before
279 * calling mmu_notifier_unregister. ->release or any other notifier 288 * calling mmu_notifier_unregister. ->release or any other notifier
280 * method may be invoked concurrently with mmu_notifier_unregister, 289 * method may be invoked concurrently with mmu_notifier_unregister,
281 * and only after mmu_notifier_unregister returned we're guaranteed 290 * and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
287 296
288 if (!hlist_unhashed(&mn->hlist)) { 297 if (!hlist_unhashed(&mn->hlist)) {
289 /* 298 /*
290 * RCU here will force exit_mmap to wait ->release to finish 299 * SRCU here will force exit_mmap to wait ->release to finish
291 * before freeing the pages. 300 * before freeing the pages.
292 */ 301 */
293 rcu_read_lock(); 302 int id;
294 303
304 id = srcu_read_lock(&srcu);
295 /* 305 /*
296 * exit_mmap will block in mmu_notifier_release to 306 * exit_mmap will block in mmu_notifier_release to
297 * guarantee ->release is called before freeing the 307 * guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
299 */ 309 */
300 if (mn->ops->release) 310 if (mn->ops->release)
301 mn->ops->release(mn, mm); 311 mn->ops->release(mn, mm);
302 rcu_read_unlock(); 312 srcu_read_unlock(&srcu, id);
303 313
304 spin_lock(&mm->mmu_notifier_mm->lock); 314 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist); 315 hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
310 * Wait any running method to finish, of course including 320 * Wait any running method to finish, of course including
311 * ->release if it was run by mmu_notifier_relase instead of us. 321 * ->release if it was run by mmu_notifier_relase instead of us.
312 */ 322 */
313 synchronize_rcu(); 323 synchronize_srcu(&srcu);
314 324
315 BUG_ON(atomic_read(&mm->mm_count) <= 0); 325 BUG_ON(atomic_read(&mm->mm_count) <= 0);
316 326
317 mmdrop(mm); 327 mmdrop(mm);
318} 328}
319EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 329EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
330
331static int __init mmu_notifier_init(void)
332{
333 return init_srcu_struct(&srcu);
334}
335
336module_init(mmu_notifier_init);
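For context on the conversion above: SRCU provides the same publish/retire semantics as RCU but allows the read side to sleep, which is what lets ->release() and the other notifier callbacks run inside the protected section. A minimal self-contained sketch of the pattern, using hypothetical names (my_srcu, my_reader, my_retire) rather than the notifier code itself:

#include <linux/module.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;

static void my_reader(void)
{
        int idx;

        idx = srcu_read_lock(&my_srcu);         /* read side may sleep */
        /* ... walk an hlist_for_each_entry_rcu() list, invoke callbacks ... */
        srcu_read_unlock(&my_srcu, idx);
}

static void my_retire(void)
{
        /* after hlist_del_rcu() (or similar) unpublishes the element: */
        synchronize_srcu(&my_srcu);             /* wait out current readers */
        /* the element can now be freed safely */
}

static int __init my_init(void)
{
        return init_srcu_struct(&my_srcu);      /* mirrors mmu_notifier_init() */
}
module_init(my_init);

static void __exit my_exit(void)
{
        cleanup_srcu_struct(&my_srcu);
}
module_exit(my_exit);

MODULE_LICENSE("GPL");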
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..1b61c2d3307a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
72 unsigned long old_addr, unsigned long old_end, 72 unsigned long old_addr, unsigned long old_end,
73 struct vm_area_struct *new_vma, pmd_t *new_pmd, 73 struct vm_area_struct *new_vma, pmd_t *new_pmd,
74 unsigned long new_addr) 74 unsigned long new_addr, bool need_rmap_locks)
75{ 75{
76 struct address_space *mapping = NULL; 76 struct address_space *mapping = NULL;
77 struct anon_vma *anon_vma = NULL;
77 struct mm_struct *mm = vma->vm_mm; 78 struct mm_struct *mm = vma->vm_mm;
78 pte_t *old_pte, *new_pte, pte; 79 pte_t *old_pte, *new_pte, pte;
79 spinlock_t *old_ptl, *new_ptl; 80 spinlock_t *old_ptl, *new_ptl;
80 81
81 if (vma->vm_file) { 82 /*
82 /* 83 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
83 * Subtle point from Rajesh Venkatasubramanian: before 84 * locks to ensure that rmap will always observe either the old or the
84 * moving file-based ptes, we must lock truncate_pagecache 85 * new ptes. This is the easiest way to avoid races with
85 * out, since it might clean the dst vma before the src vma, 86 * truncate_pagecache(), page migration, etc...
86 * and we propagate stale pages into the dst afterward. 87 *
87 */ 88 * When need_rmap_locks is false, we use other ways to avoid
88 mapping = vma->vm_file->f_mapping; 89 * such races:
89 mutex_lock(&mapping->i_mmap_mutex); 90 *
91 * - During exec() shift_arg_pages(), we use a specially tagged vma
92 * which rmap call sites look for using is_vma_temporary_stack().
93 *
94 * - During mremap(), new_vma is often known to be placed after vma
95 * in rmap traversal order. This ensures rmap will always observe
96 * either the old pte, or the new pte, or both (the page table locks
97 * serialize access to individual ptes, but only rmap traversal
98 * order guarantees that we won't miss both the old and new ptes).
99 */
100 if (need_rmap_locks) {
101 if (vma->vm_file) {
102 mapping = vma->vm_file->f_mapping;
103 mutex_lock(&mapping->i_mmap_mutex);
104 }
105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma);
108 }
90 } 109 }
91 110
92 /* 111 /*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
114 spin_unlock(new_ptl); 133 spin_unlock(new_ptl);
115 pte_unmap(new_pte - 1); 134 pte_unmap(new_pte - 1);
116 pte_unmap_unlock(old_pte - 1, old_ptl); 135 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma)
137 anon_vma_unlock(anon_vma);
117 if (mapping) 138 if (mapping)
118 mutex_unlock(&mapping->i_mmap_mutex); 139 mutex_unlock(&mapping->i_mmap_mutex);
119} 140}
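The new comment boils down to: take the rmap locks only when the caller cannot otherwise guarantee that an rmap walk will observe either the old or the new ptes. Stripped to its locking skeleton, the reworked move_ptes() does:

        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;

        if (need_rmap_locks) {
                if (vma->vm_file) {
                        mapping = vma->vm_file->f_mapping;
                        mutex_lock(&mapping->i_mmap_mutex);
                }
                if (vma->anon_vma) {
                        anon_vma = vma->anon_vma;
                        anon_vma_lock(anon_vma);
                }
        }

        /* ... copy the ptes under the old/new page table locks ... */

        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);

copy_vma() decides the flag (true when new_vma->vm_pgoff <= vma->vm_pgoff, i.e. when rmap traversal order alone is not enough; false for a freshly allocated vma), and move_vma() passes true on its error-rollback call.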
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
122 143
123unsigned long move_page_tables(struct vm_area_struct *vma, 144unsigned long move_page_tables(struct vm_area_struct *vma,
124 unsigned long old_addr, struct vm_area_struct *new_vma, 145 unsigned long old_addr, struct vm_area_struct *new_vma,
125 unsigned long new_addr, unsigned long len) 146 unsigned long new_addr, unsigned long len,
147 bool need_rmap_locks)
126{ 148{
127 unsigned long extent, next, old_end; 149 unsigned long extent, next, old_end;
128 pmd_t *old_pmd, *new_pmd; 150 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false; 151 bool need_flush = false;
152 unsigned long mmun_start; /* For mmu_notifiers */
153 unsigned long mmun_end; /* For mmu_notifiers */
130 154
131 old_end = old_addr + len; 155 old_end = old_addr + len;
132 flush_cache_range(vma, old_addr, old_end); 156 flush_cache_range(vma, old_addr, old_end);
133 157
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); 158 mmun_start = old_addr;
159 mmun_end = old_end;
160 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
135 161
136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 162 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
137 cond_resched(); 163 cond_resched();
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
169 if (extent > LATENCY_LIMIT) 195 if (extent > LATENCY_LIMIT)
170 extent = LATENCY_LIMIT; 196 extent = LATENCY_LIMIT;
171 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 197 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
172 new_vma, new_pmd, new_addr); 198 new_vma, new_pmd, new_addr, need_rmap_locks);
173 need_flush = true; 199 need_flush = true;
174 } 200 }
175 if (likely(need_flush)) 201 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr); 202 flush_tlb_range(vma, old_end-len, old_addr);
177 203
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); 204 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
179 205
180 return len + old_addr - old_end; /* how much done */ 206 return len + old_addr - old_end; /* how much done */
181} 207}
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
193 unsigned long hiwater_vm; 219 unsigned long hiwater_vm;
194 int split = 0; 220 int split = 0;
195 int err; 221 int err;
222 bool need_rmap_locks;
196 223
197 /* 224 /*
198 * We'd prefer to avoid failure later on in do_munmap: 225 * We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
214 return err; 241 return err;
215 242
216 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 243 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
217 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 244 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
245 &need_rmap_locks);
218 if (!new_vma) 246 if (!new_vma)
219 return -ENOMEM; 247 return -ENOMEM;
220 248
221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 249 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
250 need_rmap_locks);
222 if (moved_len < old_len) { 251 if (moved_len < old_len) {
223 /* 252 /*
224 * Before moving the page tables from the new vma to
225 * the old vma, we need to be sure the old vma is
226 * queued after new vma in the same_anon_vma list to
227 * prevent SMP races with rmap_walk (that could lead
228 * rmap_walk to miss some page table).
229 */
230 anon_vma_moveto_tail(vma);
231
232 /*
233 * On error, move entries back from new area to old, 253 * On error, move entries back from new area to old,
234 * which will succeed since page tables still there, 254 * which will succeed since page tables still there,
235 * and then proceed to unmap new area instead of old. 255 * and then proceed to unmap new area instead of old.
236 */ 256 */
237 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); 257 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
258 true);
238 vma = new_vma; 259 vma = new_vma;
239 old_len = new_len; 260 old_len = new_len;
240 old_addr = new_addr; 261 old_addr = new_addr;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..714d5d650470 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
116 return 0; 116 return 0;
117 117
118 __free_pages_memory(start_pfn, end_pfn); 118 __free_pages_memory(start_pfn, end_pfn);
119 fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
120 start_pfn, end_pfn);
119 121
120 return end_pfn - start_pfn; 122 return end_pfn - start_pfn;
121} 123}
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
126 phys_addr_t start, end, size; 128 phys_addr_t start, end, size;
127 u64 i; 129 u64 i;
128 130
131 reset_zone_present_pages();
129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 132 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
130 count += __free_memory_core(start, end); 133 count += __free_memory_core(start, end);
131 134
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void)
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 165 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 166 * because in some case like Node0 doesn't have RAM installed
164 * low ram will be on Node1 167 * low ram will be on Node1
165 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
166 * will be used instead of only Node0 related
167 */ 168 */
168 return free_low_memory_core_early(MAX_NUMNODES); 169 return free_low_memory_core_early(MAX_NUMNODES);
169} 170}
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872de..45131b41bcdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
698 698
699 mutex_lock(&mapping->i_mmap_mutex); 699 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_interval_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex); 703 mutex_unlock(&mapping->i_mmap_mutex);
704 } 704 }
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
764 764
765 mutex_lock(&mapping->i_mmap_mutex); 765 mutex_lock(&mapping->i_mmap_mutex);
766 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
767 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_interval_tree_remove(vma, &mapping->i_mmap);
768 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex); 769 mutex_unlock(&mapping->i_mmap_mutex);
770 } 770 }
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
789 kenter("%p", vma); 789 kenter("%p", vma);
790 if (vma->vm_ops && vma->vm_ops->close) 790 if (vma->vm_ops && vma->vm_ops->close)
791 vma->vm_ops->close(vma); 791 vma->vm_ops->close(vma);
792 if (vma->vm_file) { 792 if (vma->vm_file)
793 fput(vma->vm_file); 793 fput(vma->vm_file);
794 if (vma->vm_flags & VM_EXECUTABLE)
795 removed_exe_file_vma(mm);
796 }
797 put_nommu_region(vma->vm_region); 794 put_nommu_region(vma->vm_region);
798 kmem_cache_free(vm_area_cachep, vma); 795 kmem_cache_free(vm_area_cachep, vma);
799} 796}
@@ -1282,14 +1279,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1282 vma->vm_pgoff = pgoff; 1279 vma->vm_pgoff = pgoff;
1283 1280
1284 if (file) { 1281 if (file) {
1285 region->vm_file = file; 1282 region->vm_file = get_file(file);
1286 get_file(file); 1283 vma->vm_file = get_file(file);
1287 vma->vm_file = file;
1288 get_file(file);
1289 if (vm_flags & VM_EXECUTABLE) {
1290 added_exe_file_vma(current->mm);
1291 vma->vm_mm = current->mm;
1292 }
1293 } 1284 }
1294 1285
1295 down_write(&nommu_region_sem); 1286 down_write(&nommu_region_sem);
@@ -1442,8 +1433,6 @@ error:
1442 kmem_cache_free(vm_region_jar, region); 1433 kmem_cache_free(vm_region_jar, region);
1443 if (vma->vm_file) 1434 if (vma->vm_file)
1444 fput(vma->vm_file); 1435 fput(vma->vm_file);
1445 if (vma->vm_flags & VM_EXECUTABLE)
1446 removed_exe_file_vma(vma->vm_mm);
1447 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1448 kleave(" = %d", ret); 1437 kleave(" = %d", ret);
1449 return ret; 1438 return ret;
@@ -1822,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1822 if (addr != (pfn << PAGE_SHIFT)) 1811 if (addr != (pfn << PAGE_SHIFT))
1823 return -EINVAL; 1812 return -EINVAL;
1824 1813
1825 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1814 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1826 return 0; 1815 return 0;
1827} 1816}
1828EXPORT_SYMBOL(remap_pfn_range); 1817EXPORT_SYMBOL(remap_pfn_range);
@@ -1963,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1963} 1952}
1964EXPORT_SYMBOL(filemap_fault); 1953EXPORT_SYMBOL(filemap_fault);
1965 1954
1955int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1956 unsigned long size, pgoff_t pgoff)
1957{
1958 BUG();
1959 return 0;
1960}
1961EXPORT_SYMBOL(generic_file_remap_pages);
1962
1966static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 1963static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1967 unsigned long addr, void *buf, int len, int write) 1964 unsigned long addr, void *buf, int len, int write)
1968{ 1965{
@@ -2047,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2047 size_t newsize) 2044 size_t newsize)
2048{ 2045{
2049 struct vm_area_struct *vma; 2046 struct vm_area_struct *vma;
2050 struct prio_tree_iter iter;
2051 struct vm_region *region; 2047 struct vm_region *region;
2052 pgoff_t low, high; 2048 pgoff_t low, high;
2053 size_t r_size, r_top; 2049 size_t r_size, r_top;
@@ -2059,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2059 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2055 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2060 2056
2061 /* search for VMAs that fall within the dead zone */ 2057 /* search for VMAs that fall within the dead zone */
2062 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2058 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2063 low, high) {
2064 /* found one - only interested if it's shared out of the page 2059 /* found one - only interested if it's shared out of the page
2065 * cache */ 2060 * cache */
2066 if (vma->vm_flags & VM_SHARED) { 2061 if (vma->vm_flags & VM_SHARED) {
@@ -2076,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2076 * we don't check for any regions that start beyond the EOF as there 2071 * we don't check for any regions that start beyond the EOF as there
2077 * shouldn't be any 2072 * shouldn't be any
2078 */ 2073 */
2079 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2074 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
2080 0, ULONG_MAX) { 2075 0, ULONG_MAX) {
2081 if (!(vma->vm_flags & VM_SHARED)) 2076 if (!(vma->vm_flags & VM_SHARED))
2082 continue; 2077 continue;
2083 2078
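For illustration only, the file-reference pattern in the nommu.c hunks above (get_file() takes a reference and returns the file, so the assignment and the get collapse into one expression) can be sketched in plain userspace C. Everything below is an invented analogue, not kernel code.

#include <stdio.h>

struct my_file {
    int refcount;
    const char *name;
};

/* Analogue of get_file(): take a reference and hand the object back, so a
 * caller can write "region->file = my_get(f);" in a single expression. */
static struct my_file *my_get(struct my_file *f)
{
    f->refcount++;
    return f;
}

static void my_put(struct my_file *f)
{
    f->refcount--;
}

int main(void)
{
    struct my_file f = { .refcount = 1, .name = "example" };
    struct my_file *region_file = my_get(&f);   /* reference held by the region */
    struct my_file *vma_file = my_get(&f);      /* reference held by the vma */

    printf("%s refcount = %d\n", f.name, f.refcount);   /* prints "example refcount = 3" */

    my_put(vma_file);
    my_put(region_file);
    my_put(&f);
    return 0;
}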
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..79e0f3e24831 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 428{
429 task_lock(current); 429 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_adj=%d, oom_score_adj=%d\n", 431 "oom_score_adj=%d\n",
432 current->comm, gfp_mask, order, current->signal->oom_adj, 432 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 433 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 434 cpuset_print_task_mems_allowed(current);
435 task_unlock(current); 435 task_unlock(current);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce23c1e0..830893b2b3c7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1602,10 +1602,18 @@ void writeback_set_ratelimit(void)
1602} 1602}
1603 1603
1604static int __cpuinit 1604static int __cpuinit
1605ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 1605ratelimit_handler(struct notifier_block *self, unsigned long action,
1606 void *hcpu)
1606{ 1607{
1607 writeback_set_ratelimit(); 1608
1608 return NOTIFY_DONE; 1609 switch (action & ~CPU_TASKS_FROZEN) {
1610 case CPU_ONLINE:
1611 case CPU_DEAD:
1612 writeback_set_ratelimit();
1613 return NOTIFY_OK;
1614 default:
1615 return NOTIFY_DONE;
1616 }
1609} 1617}
1610 1618
1611static struct notifier_block __cpuinitdata ratelimit_nb = { 1619static struct notifier_block __cpuinitdata ratelimit_nb = {
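The new ratelimit_handler() above recomputes the writeback ratelimit only on CPU online/dead events, stripping the CPU_TASKS_FROZEN modifier so the suspend/resume variants are treated the same way. A minimal standalone sketch of that filtering follows; the constants are stand-ins rather than the kernel's headers.

#include <stdio.h>

#define CPU_ONLINE        0x0002
#define CPU_DEAD          0x0007
#define CPU_TASKS_FROZEN  0x0010

#define NOTIFY_DONE 0
#define NOTIFY_OK   1

/* Strip the "tasks frozen" modifier, then react only to online/dead. */
static int ratelimit_handler(unsigned long action)
{
    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_ONLINE:
    case CPU_DEAD:
        printf("recomputing writeback ratelimit (action 0x%lx)\n", action);
        return NOTIFY_OK;
    default:
        return NOTIFY_DONE;     /* ignore CPU_UP_PREPARE and friends */
    }
}

int main(void)
{
    ratelimit_handler(CPU_ONLINE);                    /* handled */
    ratelimit_handler(CPU_DEAD | CPU_TASKS_FROZEN);   /* handled, frozen variant */
    ratelimit_handler(0x0003);                        /* some other action: ignored */
    return 0;
}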
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..5b74de6702e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_freepage_state(zone, 1 << order,
562 migratetype);
562 } else { 563 } else {
563 list_del(&buddy->lru); 564 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 565 zone->free_area[order].nr_free--;
@@ -597,17 +598,6 @@ out:
597 zone->free_area[order].nr_free++; 598 zone->free_area[order].nr_free++;
598} 599}
599 600
600/*
601 * free_page_mlock() -- clean up attempts to free and mlocked() page.
602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify...
604 */
605static inline void free_page_mlock(struct page *page)
606{
607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609}
610
611static inline int free_pages_check(struct page *page) 601static inline int free_pages_check(struct page *page)
612{ 602{
613 if (unlikely(page_mapcount(page) | 603 if (unlikely(page_mapcount(page) |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count,
668 batch_free = to_free; 658 batch_free = to_free;
669 659
670 do { 660 do {
661 int mt; /* migratetype of the to-be-freed page */
662
671 page = list_entry(list->prev, struct page, lru); 663 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 664 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 665 list_del(&page->lru);
666 mt = get_freepage_migratetype(page);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 668 __free_one_page(page, zone, 0, mt);
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
677 } while (--to_free && --batch_free && !list_empty(list)); 672 } while (--to_free && --batch_free && !list_empty(list));
678 } 673 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
688 zone->pages_scanned = 0; 683 zone->pages_scanned = 0;
689 684
690 __free_one_page(page, zone, order, migratetype); 685 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 686 if (unlikely(migratetype != MIGRATE_ISOLATE))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype);
692 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
693} 689}
694 690
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721static void __free_pages_ok(struct page *page, unsigned int order) 717static void __free_pages_ok(struct page *page, unsigned int order)
722{ 718{
723 unsigned long flags; 719 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 720 int migratetype;
725 721
726 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
727 return; 723 return;
728 724
729 local_irq_save(flags); 725 local_irq_save(flags);
730 if (unlikely(wasMlocked))
731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 726 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 727 migratetype = get_pageblock_migratetype(page);
734 get_pageblock_migratetype(page)); 728 set_freepage_migratetype(page, migratetype);
729 free_one_page(page_zone(page), page, order, migratetype);
735 local_irq_restore(flags); 730 local_irq_restore(flags);
736} 731}
737 732
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page,
811 set_page_guard_flag(&page[size]); 806 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 807 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 808 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 809 __mod_zone_freepage_state(zone, -(1 << high),
810 migratetype);
815 continue; 811 continue;
816 } 812 }
817#endif 813#endif
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
915 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
917 */ 913 */
918static int move_freepages(struct zone *zone, 914int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
920 int migratetype) 916 int migratetype)
921{ 917{
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone,
951 order = page_order(page); 947 order = page_order(page);
952 list_move(&page->lru, 948 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 set_freepage_migratetype(page, migratetype);
954 page += 1 << order; 951 page += 1 << order;
955 pages_moved += 1 << order; 952 pages_moved += 1 << order;
956 } 953 }
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1132 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1133 mt = migratetype;
1137 } 1134 }
1138 set_page_private(page, mt); 1135 set_freepage_migratetype(page, mt);
1139 list = &page->lru; 1136 list = &page->lru;
1137 if (is_migrate_cma(mt))
1138 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1139 -(1 << order));
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold)
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1299
1301 if (!free_pages_prepare(page, 0)) 1300 if (!free_pages_prepare(page, 0))
1302 return; 1301 return;
1303 1302
1304 migratetype = get_pageblock_migratetype(page); 1303 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1304 set_freepage_migratetype(page, migratetype);
1306 local_irq_save(flags); 1305 local_irq_save(flags);
1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1306 __count_vm_event(PGFREE);
1310 1307
1311 /* 1308 /*
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order)
1380} 1377}
1381 1378
1382/* 1379/*
1383 * Similar to split_page except the page is already free. As this is only 1380 * Similar to the split_page family of functions except that the page is
1384 * being used for migration, the migratetype of the block also changes. 1381 * required at the given order and is being isolated now to prevent races
1385 * As this is called with interrupts disabled, the caller is responsible 1382 * with parallel allocators.
1386 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1387 * are enabled.
1388 *
1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver.
1391 */ 1383 */
1392int split_free_page(struct page *page) 1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1385{
1394 unsigned int order; 1386 unsigned int order;
1395 unsigned long watermark; 1387 unsigned long watermark;
1396 struct zone *zone; 1388 struct zone *zone;
1389 int mt;
1397 1390
1398 BUG_ON(!PageBuddy(page)); 1391 BUG_ON(!PageBuddy(page));
1399 1392
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page)
1409 list_del(&page->lru); 1402 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1403 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1404 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1405
1414 /* Split into individual pages */ 1406 mt = get_pageblock_migratetype(page);
1415 set_page_refcounted(page); 1407 if (unlikely(mt != MIGRATE_ISOLATE))
1416 split_page(page, order); 1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1417 1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1415 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1416 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1417 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page)
1425 } 1422 }
1426 } 1423 }
1427 1424
1428 return 1 << order; 1425 return 1UL << order;
1426}
1427
1428/*
1429 * Similar to split_page except the page is already free. As this is only
1430 * being used for migration, the migratetype of the block also changes.
1431 * As this is called with interrupts disabled, the caller is responsible
1432 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1433 * are enabled.
1434 *
1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver.
1437 */
1438int split_free_page(struct page *page)
1439{
1440 unsigned int order;
1441 int nr_pages;
1442
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page);
1445
1446 nr_pages = capture_free_page(page, order, 0);
1447 if (!nr_pages)
1448 return 0;
1449
1450 /* Split into individual pages */
1451 set_page_refcounted(page);
1452 split_page(page, order);
1453 return nr_pages;
1429} 1454}
1430 1455
1431/* 1456/*
@@ -1484,7 +1509,8 @@ again:
1484 spin_unlock(&zone->lock); 1509 spin_unlock(&zone->lock);
1485 if (!page) 1510 if (!page)
1486 goto failed; 1511 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1512 __mod_zone_freepage_state(zone, -(1 << order),
1513 get_pageblock_migratetype(page));
1488 } 1514 }
1489 1515
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1516 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1527,6 @@ failed:
1501 return NULL; 1527 return NULL;
1502} 1528}
1503 1529
1504/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505#define ALLOC_WMARK_MIN WMARK_MIN
1506#define ALLOC_WMARK_LOW WMARK_LOW
1507#define ALLOC_WMARK_HIGH WMARK_HIGH
1508#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509
1510/* Mask to get the watermark bits */
1511#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512
1513#define ALLOC_HARDER 0x10 /* try to alloc harder */
1514#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516
1517#ifdef CONFIG_FAIL_PAGE_ALLOC 1530#ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1531
1519static struct { 1532static struct {
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1608 min -= min / 2; 1621 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1622 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1623 min -= min / 4;
1611 1624#ifdef CONFIG_CMA
1625 /* If allocation can't use CMA areas don't use free CMA pages */
1626 if (!(alloc_flags & ALLOC_CMA))
1627 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1628#endif
1612 if (free_pages <= min + lowmem_reserve) 1629 if (free_pages <= min + lowmem_reserve)
1613 return false; 1630 return false;
1614 for (o = 0; o < order; o++) { 1631 for (o = 0; o < order; o++) {
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1799 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783} 1800}
1784 1801
1802static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1803{
1804 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1805}
1806
1807static void __paginginit init_zone_allows_reclaim(int nid)
1808{
1809 int i;
1810
1811 for_each_online_node(i)
1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1813 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1814 else
1815 zone_reclaim_mode = 1;
1816}
1817
1785#else /* CONFIG_NUMA */ 1818#else /* CONFIG_NUMA */
1786 1819
1787static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1820static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1802static void zlc_clear_zones_full(struct zonelist *zonelist) 1835static void zlc_clear_zones_full(struct zonelist *zonelist)
1803{ 1836{
1804} 1837}
1838
1839static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1840{
1841 return true;
1842}
1843
1844static inline void init_zone_allows_reclaim(int nid)
1845{
1846}
1805#endif /* CONFIG_NUMA */ 1847#endif /* CONFIG_NUMA */
1806 1848
1807/* 1849/*
@@ -1886,7 +1928,8 @@ zonelist_scan:
1886 did_zlc_setup = 1; 1928 did_zlc_setup = 1;
1887 } 1929 }
1888 1930
1889 if (zone_reclaim_mode == 0) 1931 if (zone_reclaim_mode == 0 ||
1932 !zone_allows_reclaim(preferred_zone, zone))
1890 goto this_zone_full; 1933 goto this_zone_full;
1891 1934
1892 /* 1935 /*
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2105 bool *contended_compaction, bool *deferred_compaction, 2148 bool *contended_compaction, bool *deferred_compaction,
2106 unsigned long *did_some_progress) 2149 unsigned long *did_some_progress)
2107{ 2150{
2108 struct page *page; 2151 struct page *page = NULL;
2109 2152
2110 if (!order) 2153 if (!order)
2111 return NULL; 2154 return NULL;
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2118 current->flags |= PF_MEMALLOC; 2161 current->flags |= PF_MEMALLOC;
2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2120 nodemask, sync_migration, 2163 nodemask, sync_migration,
2121 contended_compaction); 2164 contended_compaction, &page);
2122 current->flags &= ~PF_MEMALLOC; 2165 current->flags &= ~PF_MEMALLOC;
2123 if (*did_some_progress != COMPACT_SKIPPED) {
2124 2166
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) {
2125 /* Page migration frees to the PCP lists but we want merging */ 2174 /* Page migration frees to the PCP lists but we want merging */
2126 drain_pages(get_cpu()); 2175 drain_pages(get_cpu());
2127 put_cpu(); 2176 put_cpu();
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2131 alloc_flags & ~ALLOC_NO_WATERMARKS, 2180 alloc_flags & ~ALLOC_NO_WATERMARKS,
2132 preferred_zone, migratetype); 2181 preferred_zone, migratetype);
2133 if (page) { 2182 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false;
2134 preferred_zone->compact_considered = 0; 2185 preferred_zone->compact_considered = 0;
2135 preferred_zone->compact_defer_shift = 0; 2186 preferred_zone->compact_defer_shift = 0;
2136 if (order >= preferred_zone->compact_order_failed) 2187 if (order >= preferred_zone->compact_order_failed)
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2315 unlikely(test_thread_flag(TIF_MEMDIE)))) 2366 unlikely(test_thread_flag(TIF_MEMDIE))))
2316 alloc_flags |= ALLOC_NO_WATERMARKS; 2367 alloc_flags |= ALLOC_NO_WATERMARKS;
2317 } 2368 }
2318 2369#ifdef CONFIG_CMA
2370 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2371 alloc_flags |= ALLOC_CMA;
2372#endif
2319 return alloc_flags; 2373 return alloc_flags;
2320} 2374}
2321 2375
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2362 goto nopage; 2416 goto nopage;
2363 2417
2364restart: 2418restart:
2365 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2419 wake_all_kswapd(order, zonelist, high_zoneidx,
2366 wake_all_kswapd(order, zonelist, high_zoneidx, 2420 zone_idx(preferred_zone));
2367 zone_idx(preferred_zone));
2368 2421
2369 /* 2422 /*
2370 * OK, we're below the kswapd watermark and have kicked background 2423 * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2494,7 @@ rebalance:
2441 * system then fail the allocation instead of entering direct reclaim. 2494 * system then fail the allocation instead of entering direct reclaim.
2442 */ 2495 */
2443 if ((deferred_compaction || contended_compaction) && 2496 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD)) 2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
2445 goto nopage; 2498 goto nopage;
2446 2499
2447 /* Try direct reclaim and then allocating */ 2500 /* Try direct reclaim and then allocating */
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2541 struct page *page = NULL; 2594 struct page *page = NULL;
2542 int migratetype = allocflags_to_migratetype(gfp_mask); 2595 int migratetype = allocflags_to_migratetype(gfp_mask);
2543 unsigned int cpuset_mems_cookie; 2596 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2544 2598
2545 gfp_mask &= gfp_allowed_mask; 2599 gfp_mask &= gfp_allowed_mask;
2546 2600
@@ -2569,9 +2623,13 @@ retry_cpuset:
2569 if (!preferred_zone) 2623 if (!preferred_zone)
2570 goto out; 2624 goto out;
2571 2625
2626#ifdef CONFIG_CMA
2627 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2628 alloc_flags |= ALLOC_CMA;
2629#endif
2572 /* First allocation attempt */ 2630 /* First allocation attempt */
2573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2574 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2632 zonelist, high_zoneidx, alloc_flags,
2575 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2576 if (unlikely(!page)) 2634 if (unlikely(!page))
2577 page = __alloc_pages_slowpath(gfp_mask, order, 2635 page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter)
2852 " unevictable:%lu" 2910 " unevictable:%lu"
2853 " dirty:%lu writeback:%lu unstable:%lu\n" 2911 " dirty:%lu writeback:%lu unstable:%lu\n"
2854 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2912 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2855 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2913 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2914 " free_cma:%lu\n",
2856 global_page_state(NR_ACTIVE_ANON), 2915 global_page_state(NR_ACTIVE_ANON),
2857 global_page_state(NR_INACTIVE_ANON), 2916 global_page_state(NR_INACTIVE_ANON),
2858 global_page_state(NR_ISOLATED_ANON), 2917 global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter)
2869 global_page_state(NR_FILE_MAPPED), 2928 global_page_state(NR_FILE_MAPPED),
2870 global_page_state(NR_SHMEM), 2929 global_page_state(NR_SHMEM),
2871 global_page_state(NR_PAGETABLE), 2930 global_page_state(NR_PAGETABLE),
2872 global_page_state(NR_BOUNCE)); 2931 global_page_state(NR_BOUNCE),
2932 global_page_state(NR_FREE_CMA_PAGES));
2873 2933
2874 for_each_populated_zone(zone) { 2934 for_each_populated_zone(zone) {
2875 int i; 2935 int i;
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter)
2901 " pagetables:%lukB" 2961 " pagetables:%lukB"
2902 " unstable:%lukB" 2962 " unstable:%lukB"
2903 " bounce:%lukB" 2963 " bounce:%lukB"
2964 " free_cma:%lukB"
2904 " writeback_tmp:%lukB" 2965 " writeback_tmp:%lukB"
2905 " pages_scanned:%lu" 2966 " pages_scanned:%lu"
2906 " all_unreclaimable? %s" 2967 " all_unreclaimable? %s"
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter)
2930 K(zone_page_state(zone, NR_PAGETABLE)), 2991 K(zone_page_state(zone, NR_PAGETABLE)),
2931 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2992 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2932 K(zone_page_state(zone, NR_BOUNCE)), 2993 K(zone_page_state(zone, NR_BOUNCE)),
2994 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
2933 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2995 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2934 zone->pages_scanned, 2996 zone->pages_scanned,
2935 (zone->all_unreclaimable ? "yes" : "no") 2997 (zone->all_unreclaimable ? "yes" : "no")
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
3328 j = 0; 3390 j = 0;
3329 3391
3330 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3392 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3331 int distance = node_distance(local_node, node);
3332
3333 /*
3334 * If another node is sufficiently far away then it is better
3335 * to reclaim pages in a zone before going off node.
3336 */
3337 if (distance > RECLAIM_DISTANCE)
3338 zone_reclaim_mode = 1;
3339
3340 /* 3393 /*
3341 * We don't want to pressure a particular node. 3394 * We don't want to pressure a particular node.
3342 * So adding penalty to the first node in same 3395 * So adding penalty to the first node in same
3343 * distance group to make it round-robin. 3396 * distance group to make it round-robin.
3344 */ 3397 */
3345 if (distance != node_distance(local_node, prev_node)) 3398 if (node_distance(local_node, node) !=
3399 node_distance(local_node, prev_node))
3346 node_load[node] = load; 3400 node_load[node] = load;
3347 3401
3348 prev_node = node; 3402 prev_node = node;
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4438 4492
4439 zone->spanned_pages = size; 4493 zone->spanned_pages = size;
4440 zone->present_pages = realsize; 4494 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4446#ifdef CONFIG_NUMA 4495#ifdef CONFIG_NUMA
4447 zone->node = nid; 4496 zone->node = nid;
4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4521 4570
4522 pgdat->node_id = nid; 4571 pgdat->node_id = nid;
4523 pgdat->node_start_pfn = node_start_pfn; 4572 pgdat->node_start_pfn = node_start_pfn;
4573 init_zone_allows_reclaim(nid);
4524 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4574 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4525 4575
4526 alloc_node_mem_map(pgdat); 4576 alloc_node_mem_map(pgdat);
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4879 zone_movable_pfn[i] << PAGE_SHIFT); 4929 zone_movable_pfn[i] << PAGE_SHIFT);
4880 } 4930 }
4881 4931
4882 /* Print out the early_node_map[] */ 4932 /* Print out the early node map */
4883 printk("Early memory node ranges\n"); 4933 printk("Early memory node ranges\n");
4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4934 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4935 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
5619 pageblock_nr_pages)); 5669 pageblock_nr_pages));
5620} 5670}
5621 5671
5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */ 5672/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5673static int __alloc_contig_migrate_range(struct compact_control *cc,
5674 unsigned long start, unsigned long end)
5636{ 5675{
5637 /* This function is based on compact_zone() from compaction.c. */ 5676 /* This function is based on compact_zone() from compaction.c. */
5638 5677 unsigned long nr_reclaimed;
5639 unsigned long pfn = start; 5678 unsigned long pfn = start;
5640 unsigned int tries = 0; 5679 unsigned int tries = 0;
5641 int ret = 0; 5680 int ret = 0;
5642 5681
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local(); 5682 migrate_prep_local();
5652 5683
5653 while (pfn < end || !list_empty(&cc.migratepages)) { 5684 while (pfn < end || !list_empty(&cc->migratepages)) {
5654 if (fatal_signal_pending(current)) { 5685 if (fatal_signal_pending(current)) {
5655 ret = -EINTR; 5686 ret = -EINTR;
5656 break; 5687 break;
5657 } 5688 }
5658 5689
5659 if (list_empty(&cc.migratepages)) { 5690 if (list_empty(&cc->migratepages)) {
5660 cc.nr_migratepages = 0; 5691 cc->nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc, 5692 pfn = isolate_migratepages_range(cc->zone, cc,
5662 pfn, end); 5693 pfn, end, true);
5663 if (!pfn) { 5694 if (!pfn) {
5664 ret = -EINTR; 5695 ret = -EINTR;
5665 break; 5696 break;
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5670 break; 5701 break;
5671 } 5702 }
5672 5703
5673 ret = migrate_pages(&cc.migratepages, 5704 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5674 __alloc_contig_migrate_alloc, 5705 &cc->migratepages);
5706 cc->nr_migratepages -= nr_reclaimed;
5707
5708 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target,
5675 0, false, MIGRATE_SYNC); 5710 0, false, MIGRATE_SYNC);
5676 } 5711 }
5677 5712
5678 putback_lru_pages(&cc.migratepages); 5713 putback_lru_pages(&cc->migratepages);
5679 return ret > 0 ? 0 : ret; 5714 return ret > 0 ? 0 : ret;
5680} 5715}
5681 5716
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5754 unsigned long outer_start, outer_end; 5789 unsigned long outer_start, outer_end;
5755 int ret = 0, order; 5790 int ret = 0, order;
5756 5791
5792 struct compact_control cc = {
5793 .nr_migratepages = 0,
5794 .order = -1,
5795 .zone = page_zone(pfn_to_page(start)),
5796 .sync = true,
5797 .ignore_skip_hint = true,
5798 };
5799 INIT_LIST_HEAD(&cc.migratepages);
5800
5757 /* 5801 /*
5758 * What we do here is we mark all pageblocks in range as 5802 * What we do here is we mark all pageblocks in range as
5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5803 * MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5781,9 +5825,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5781 ret = start_isolate_page_range(pfn_max_align_down(start), 5825 ret = start_isolate_page_range(pfn_max_align_down(start),
5782 pfn_max_align_up(end), migratetype); 5826 pfn_max_align_up(end), migratetype);
5783 if (ret) 5827 if (ret)
5784 goto done; 5828 return ret;
5785 5829
5786 ret = __alloc_contig_migrate_range(start, end); 5830 ret = __alloc_contig_migrate_range(&cc, start, end);
5787 if (ret) 5831 if (ret)
5788 goto done; 5832 goto done;
5789 5833
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833 5877
5834 /* Grab isolated pages from freelists. */ 5878 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end); 5879 outer_end = isolate_freepages_range(&cc, outer_start, end);
5836 if (!outer_end) { 5880 if (!outer_end) {
5837 ret = -EBUSY; 5881 ret = -EBUSY;
5838 goto done; 5882 goto done;
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data)
5874 local_irq_save(flags); 5918 local_irq_save(flags);
5875 if (pcp->count > 0) 5919 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp); 5920 free_pcppages_bulk(zone, pcp->count, pcp);
5921 drain_zonestat(zone, pset);
5877 setup_pageset(pset, batch); 5922 setup_pageset(pset, batch);
5878 local_irq_restore(flags); 5923 local_irq_restore(flags);
5879 } 5924 }
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone)
5890void zone_pcp_reset(struct zone *zone) 5935void zone_pcp_reset(struct zone *zone)
5891{ 5936{
5892 unsigned long flags; 5937 unsigned long flags;
5938 int cpu;
5939 struct per_cpu_pageset *pset;
5893 5940
5894 /* avoid races with drain_pages() */ 5941 /* avoid races with drain_pages() */
5895 local_irq_save(flags); 5942 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) { 5943 if (zone->pageset != &boot_pageset) {
5944 for_each_online_cpu(cpu) {
5945 pset = per_cpu_ptr(zone->pageset, cpu);
5946 drain_zonestat(zone, pset);
5947 }
5897 free_percpu(zone->pageset); 5948 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset; 5949 zone->pageset = &boot_pageset;
5899 } 5950 }
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page)
6047 dump_page_flags(page->flags); 6098 dump_page_flags(page->flags);
6048 mem_cgroup_print_bad_page(page); 6099 mem_cgroup_print_bad_page(page);
6049} 6100}
6101
6102/* reset zone->present_pages */
6103void reset_zone_present_pages(void)
6104{
6105 struct zone *z;
6106 int i, nid;
6107
6108 for_each_node_state(nid, N_HIGH_MEMORY) {
6109 for (i = 0; i < MAX_NR_ZONES; i++) {
6110 z = NODE_DATA(nid)->node_zones + i;
6111 z->present_pages = 0;
6112 }
6113 }
6114}
6115
6116/* calculate zone's present pages in buddy system */
6117void fixup_zone_present_pages(int nid, unsigned long start_pfn,
6118 unsigned long end_pfn)
6119{
6120 struct zone *z;
6121 unsigned long zone_start_pfn, zone_end_pfn;
6122 int i;
6123
6124 for (i = 0; i < MAX_NR_ZONES; i++) {
6125 z = NODE_DATA(nid)->node_zones + i;
6126 zone_start_pfn = z->zone_start_pfn;
6127 zone_end_pfn = zone_start_pfn + z->spanned_pages;
6128
6129 /* if the two regions intersect */
6130 if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
6131 z->present_pages += min(end_pfn, zone_end_pfn) -
6132 max(start_pfn, zone_start_pfn);
6133 }
6134}
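Among the page_alloc.c changes above, the watermark check now subtracts free CMA pages whenever the caller is not allowed to allocate from CMA pageblocks (no ALLOC_CMA). Below is a rough standalone sketch of just that first check, with invented numbers, an assumed ALLOC_CMA value, and without the per-order loop of the real __zone_watermark_ok().

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HIGH   0x20
#define ALLOC_HARDER 0x10
#define ALLOC_CMA    0x80    /* illustrative value, not taken from the kernel */

/* Callers without ALLOC_CMA must not count free CMA pages toward the
 * watermark, since they cannot allocate from those pageblocks. */
static bool watermark_ok(long free_pages, long free_cma, long min,
                         long lowmem_reserve, int alloc_flags)
{
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;
    if (!(alloc_flags & ALLOC_CMA))
        free_pages -= free_cma;

    return free_pages > min + lowmem_reserve;
}

int main(void)
{
    /* 1000 free pages in the zone, 600 of them inside CMA pageblocks. */
    printf("movable (ALLOC_CMA) passes:  %d\n",
           watermark_ok(1000, 600, 500, 0, ALLOC_CMA));
    printf("unmovable (no CMA) passes:   %d\n",
           watermark_ok(1000, 600, 500, 0, 0));
    return 0;
}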
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..f2f5b4818e94 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page)
76 76
77out: 77out:
78 if (!ret) { 78 if (!ret) {
79 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page);
81
79 set_pageblock_isolate(page); 82 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE); 83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
81 } 86 }
82 87
83 spin_unlock_irqrestore(&zone->lock, flags); 88 spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +94,14 @@ out:
89void unset_migratetype_isolate(struct page *page, unsigned migratetype) 94void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{ 95{
91 struct zone *zone; 96 struct zone *zone;
92 unsigned long flags; 97 unsigned long flags, nr_pages;
98
93 zone = page_zone(page); 99 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags); 100 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 101 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out; 102 goto out;
97 move_freepages_block(zone, page, migratetype); 103 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype);
98 restore_pageblock_isolate(page, migratetype); 105 restore_pageblock_isolate(page, migratetype);
99out: 106out:
100 spin_unlock_irqrestore(&zone->lock, flags); 107 spin_unlock_irqrestore(&zone->lock, flags);
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
193 continue; 200 continue;
194 } 201 }
195 page = pfn_to_page(pfn); 202 page = pfn_to_page(pfn);
196 if (PageBuddy(page)) 203 if (PageBuddy(page)) {
204 /*
 205 * If a race between isolation and allocation happens,
 206 * some free pages could be in the MIGRATE_MOVABLE list
 207 * although the pageblock's migrate type is
 208 * MIGRATE_ISOLATE. Catch it and move the page into the
 209 * MIGRATE_ISOLATE list.
210 */
211 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
212 struct page *end_page;
213
214 end_page = page + (1 << page_order(page)) - 1;
215 move_freepages(page_zone(page), page, end_page,
216 MIGRATE_ISOLATE);
217 }
197 pfn += 1 << page_order(page); 218 pfn += 1 << page_order(page);
219 }
198 else if (page_count(page) == 0 && 220 else if (page_count(page) == 0 &&
199 page_private(page) == MIGRATE_ISOLATE) 221 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
200 pfn += 1; 222 pfn += 1;
201 else 223 else
202 break; 224 break;
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
233 spin_unlock_irqrestore(&zone->lock, flags); 255 spin_unlock_irqrestore(&zone->lock, flags);
234 return ret ? 0 : -EBUSY; 256 return ret ? 0 : -EBUSY;
235} 257}
258
259struct page *alloc_migrate_target(struct page *page, unsigned long private,
260 int **resultp)
261{
262 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
263
264 if (PageHighMem(page))
265 gfp_mask |= __GFP_HIGHMEM;
266
267 return alloc_page(gfp_mask);
268}
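The page_isolation.c hunks above rely on move_freepages_block() now returning how many free pages it moved, so the caller can adjust the free-page accounting by exactly that amount when a pageblock enters or leaves MIGRATE_ISOLATE. A toy sketch of that bookkeeping, with invented counters and a fixed block size:

#include <stdio.h>

static long nr_free_for_alloc = 512;  /* pages the allocator may still hand out */
static long nr_isolated;              /* free pages parked in MIGRATE_ISOLATE blocks */

/* Stand-in for move_freepages_block(): report how many pages were moved. */
static long move_block(void)
{
    return 128;                       /* pretend the block held 128 free pages */
}

static void set_isolate(void)
{
    long nr_pages = move_block();
    nr_free_for_alloc -= nr_pages;    /* mirrors __mod_zone_freepage_state(-nr_pages) */
    nr_isolated += nr_pages;
}

static void unset_isolate(void)
{
    long nr_pages = move_block();     /* the same pages come back */
    nr_isolated -= nr_pages;
    nr_free_for_alloc += nr_pages;    /* mirrors __mod_zone_freepage_state(+nr_pages) */
}

int main(void)
{
    set_isolate();
    printf("allocatable=%ld isolated=%ld\n", nr_free_for_alloc, nr_isolated);
    unset_isolate();
    printf("allocatable=%ld isolated=%ld\n", nr_free_for_alloc, nr_isolated);
    return 0;
}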
diff --git a/mm/percpu.c b/mm/percpu.c
index bb4be7435ce3..ddc5efb9c5bb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1370 1370
1371#ifdef CONFIG_SMP 1371#ifdef CONFIG_SMP
1372 1372
1373const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1373const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
1374 [PCPU_FC_AUTO] = "auto", 1374 [PCPU_FC_AUTO] = "auto",
1375 [PCPU_FC_EMBED] = "embed", 1375 [PCPU_FC_EMBED] = "embed",
1376 [PCPU_FC_PAGE] = "page", 1376 [PCPU_FC_PAGE] = "page",
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..e642627da6b7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
120} 120}
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122#endif 122#endif
123
124#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
125#ifdef CONFIG_TRANSPARENT_HUGEPAGE
126void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
127{
128 assert_spin_locked(&mm->page_table_lock);
129
130 /* FIFO */
131 if (!mm->pmd_huge_pte)
132 INIT_LIST_HEAD(&pgtable->lru);
133 else
134 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
135 mm->pmd_huge_pte = pgtable;
136}
137#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
138#endif
139
140#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142/* no "address" argument so destroys page coloring of some arch */
143pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
144{
145 pgtable_t pgtable;
146
147 assert_spin_locked(&mm->page_table_lock);
148
149 /* FIFO */
150 pgtable = mm->pmd_huge_pte;
151 if (list_empty(&pgtable->lru))
152 mm->pmd_huge_pte = NULL;
153 else {
154 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
155 struct page, lru);
156 list_del(&pgtable->lru);
157 }
158 return pgtable;
159}
160#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
161#endif
162
163#ifndef __HAVE_ARCH_PMDP_INVALIDATE
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
166 pmd_t *pmdp)
167{
168 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
169 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
170}
171#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
172#endif
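The generic pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers added above park pre-allocated page tables on a per-mm list when a huge pmd is installed and hand them back in FIFO order when it is taken apart. A standalone sketch of that queue discipline using a plain linked list (the kernel threads the pages through their lru list_head instead):

#include <stdio.h>

struct pgtable {
    int id;
    struct pgtable *next;
};

struct mm {
    struct pgtable *head;   /* oldest deposited page table */
    struct pgtable *tail;   /* newest deposited page table */
};

/* Deposit: enqueue a pre-allocated page table at the tail. */
static void deposit(struct mm *mm, struct pgtable *p)
{
    p->next = NULL;
    if (mm->tail)
        mm->tail->next = p;
    else
        mm->head = p;
    mm->tail = p;
}

/* Withdraw: dequeue from the head, i.e. FIFO order. */
static struct pgtable *withdraw(struct mm *mm)
{
    struct pgtable *p = mm->head;

    if (p) {
        mm->head = p->next;
        if (!mm->head)
            mm->tail = NULL;
    }
    return p;
}

int main(void)
{
    struct mm mm = { 0 };
    struct pgtable a = { .id = 1 }, b = { .id = 2 };

    deposit(&mm, &a);
    deposit(&mm, &b);
    printf("withdrew %d\n", withdraw(&mm)->id);   /* 1 */
    printf("withdrew %d\n", withdraw(&mm)->id);   /* 2 */
    return 0;
}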
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/mm.h>
15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
17
18/*
19 * See lib/prio_tree.c for details on the general radix priority search tree
20 * code.
21 */
22
23/*
24 * The following #defines are mirrored from lib/prio_tree.c. They're only used
25 * for debugging, and should be removed (along with the debugging code using
26 * them) when switching also VMAs to the regular prio_tree code.
27 */
28
29#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
30#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
31/* avoid overflow */
32#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
33
34/*
35 * Radix priority search tree for address_space->i_mmap
36 *
37 * For each vma that map a unique set of file pages i.e., unique [radix_index,
38 * heap_index] value, we have a corresponding priority search tree node. If
39 * multiple vmas have identical [radix_index, heap_index] value, then one of
40 * them is used as a tree node and others are stored in a vm_set list. The tree
41 * node points to the first vma (head) of the list using vm_set.head.
42 *
43 * prio_tree_root
44 * |
45 * A vm_set.head
46 * / \ /
47 * L R -> H-I-J-K-M-N-O-P-Q-S
48 * ^ ^ <-- vm_set.list -->
49 * tree nodes
50 *
51 * We need some way to identify whether a vma is a tree node, head of a vm_set
52 * list, or just a member of a vm_set list. We cannot use vm_flags to store
53 * such information. The reason is, in the above figure, it is possible that
54 * vm_flags' of R and H are covered by the different mmap_sems. When R is
55 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
56 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
57 * That's why some trick involving shared.vm_set.parent is used for identifying
58 * tree nodes and list head nodes.
59 *
60 * vma radix priority search tree node rules:
61 *
62 * vma->shared.vm_set.parent != NULL ==> a tree node
63 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
64 * vma->shared.vm_set.head == NULL ==> no others map the same range
65 *
66 * vma->shared.vm_set.parent == NULL
67 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
68 * vma->shared.vm_set.head == NULL ==> a list node
69 */
70
71/*
72 * Add a new vma known to map the same set of pages as the old vma:
73 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
74 * Note that it just happens to work correctly on i_mmap_nonlinear too.
75 */
76void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
77{
78 /* Leave these BUG_ONs till prio_tree patch stabilizes */
79 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
80 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
81
82 vma->shared.vm_set.head = NULL;
83 vma->shared.vm_set.parent = NULL;
84
85 if (!old->shared.vm_set.parent)
86 list_add(&vma->shared.vm_set.list,
87 &old->shared.vm_set.list);
88 else if (old->shared.vm_set.head)
89 list_add_tail(&vma->shared.vm_set.list,
90 &old->shared.vm_set.head->shared.vm_set.list);
91 else {
92 INIT_LIST_HEAD(&vma->shared.vm_set.list);
93 vma->shared.vm_set.head = old;
94 old->shared.vm_set.head = vma;
95 }
96}
97
98void vma_prio_tree_insert(struct vm_area_struct *vma,
99 struct prio_tree_root *root)
100{
101 struct prio_tree_node *ptr;
102 struct vm_area_struct *old;
103
104 vma->shared.vm_set.head = NULL;
105
106 ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
107 if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
108 old = prio_tree_entry(ptr, struct vm_area_struct,
109 shared.prio_tree_node);
110 vma_prio_tree_add(vma, old);
111 }
112}
113
114void vma_prio_tree_remove(struct vm_area_struct *vma,
115 struct prio_tree_root *root)
116{
117 struct vm_area_struct *node, *head, *new_head;
118
119 if (!vma->shared.vm_set.head) {
120 if (!vma->shared.vm_set.parent)
121 list_del_init(&vma->shared.vm_set.list);
122 else
123 raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
124 } else {
125 /* Leave this BUG_ON till prio_tree patch stabilizes */
126 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
127 if (vma->shared.vm_set.parent) {
128 head = vma->shared.vm_set.head;
129 if (!list_empty(&head->shared.vm_set.list)) {
130 new_head = list_entry(
131 head->shared.vm_set.list.next,
132 struct vm_area_struct,
133 shared.vm_set.list);
134 list_del_init(&head->shared.vm_set.list);
135 } else
136 new_head = NULL;
137
138 raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
139 &head->shared.prio_tree_node);
140 head->shared.vm_set.head = new_head;
141 if (new_head)
142 new_head->shared.vm_set.head = head;
143
144 } else {
145 node = vma->shared.vm_set.head;
146 if (!list_empty(&vma->shared.vm_set.list)) {
147 new_head = list_entry(
148 vma->shared.vm_set.list.next,
149 struct vm_area_struct,
150 shared.vm_set.list);
151 list_del_init(&vma->shared.vm_set.list);
152 node->shared.vm_set.head = new_head;
153 new_head->shared.vm_set.head = node;
154 } else
155 node->shared.vm_set.head = NULL;
156 }
157 }
158}
159
160/*
161 * Helper function to enumerate vmas that map a given file page or a set of
162 * contiguous file pages. The function returns vmas that at least map a single
163 * page in the given range of contiguous file pages.
164 */
165struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
166 struct prio_tree_iter *iter)
167{
168 struct prio_tree_node *ptr;
169 struct vm_area_struct *next;
170
171 if (!vma) {
172 /*
173 * First call is with NULL vma
174 */
175 ptr = prio_tree_next(iter);
176 if (ptr) {
177 next = prio_tree_entry(ptr, struct vm_area_struct,
178 shared.prio_tree_node);
179 prefetch(next->shared.vm_set.head);
180 return next;
181 } else
182 return NULL;
183 }
184
185 if (vma->shared.vm_set.parent) {
186 if (vma->shared.vm_set.head) {
187 next = vma->shared.vm_set.head;
188 prefetch(next->shared.vm_set.list.next);
189 return next;
190 }
191 } else {
192 next = list_entry(vma->shared.vm_set.list.next,
193 struct vm_area_struct, shared.vm_set.list);
194 if (!next->shared.vm_set.head) {
195 prefetch(next->shared.vm_set.list.next);
196 return next;
197 }
198 }
199
200 ptr = prio_tree_next(iter);
201 if (ptr) {
202 next = prio_tree_entry(ptr, struct vm_area_struct,
203 shared.prio_tree_node);
204 prefetch(next->shared.vm_set.head);
205 return next;
206 } else
207 return NULL;
208}
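The deleted prio_tree.c above is superseded by the vma interval tree used throughout this diff (vma_interval_tree_foreach and anon_vma_interval_tree_foreach). What those iterators enumerate is simply every vma whose file-page interval overlaps the query range; the tree only makes the lookup logarithmic. A linear-scan sketch of the same overlap test, with invented vmas and offsets:

#include <stdio.h>

/* One file-mapping vma, reduced to the page-offset interval it covers. */
struct vma {
    const char *name;
    unsigned long pgoff_start;   /* first file page mapped */
    unsigned long pgoff_last;    /* last file page mapped (inclusive) */
};

int main(void)
{
    struct vma vmas[] = {
        { "A", 0,  15 },
        { "B", 8,  23 },
        { "C", 32, 47 },
    };
    unsigned long first = 10, last = 12;   /* file pages we are interested in */

    /* Report every vma whose interval overlaps [first, last]. */
    for (unsigned i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
        if (vmas[i].pgoff_start <= last && vmas[i].pgoff_last >= first)
            printf("vma %s overlaps\n", vmas[i].name);   /* prints A and B */
    return 0;
}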
diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa21649..7963f2391236 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) 579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{ 580{
581 ssize_t ret; 581 ssize_t ret;
582 struct file *file; 582 struct fd f;
583 583
584 ret = -EBADF; 584 ret = -EBADF;
585 file = fget(fd); 585 f = fdget(fd);
586 if (file) { 586 if (f.file) {
587 if (file->f_mode & FMODE_READ) { 587 if (f.file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping; 588 struct address_space *mapping = f.file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT; 589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1; 591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len); 592 ret = do_readahead(mapping, f.file, start, len);
593 } 593 }
594 fput(file); 594 fdput(f);
595 } 595 }
596 return ret; 596 return ret;
597} 597}
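The readahead() syscall body above converts a byte range into an inclusive range of page-cache indices before calling do_readahead(). The arithmetic, worked through with an assumed 4 KiB page size:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12   /* assume 4 KiB pages for this example */

int main(void)
{
    long long offset = 5000, count = 10000;

    unsigned long start = offset >> PAGE_CACHE_SHIFT;
    unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
    unsigned long len = end - start + 1;

    /* bytes [5000, 15000) touch pages 1..3, so three pages are read ahead */
    printf("start=%lu end=%lu len=%lu\n", start, end, len);
    return 0;
}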
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..2ee1ef0f317b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
57#include <linux/migrate.h> 57#include <linux/migrate.h>
58#include <linux/hugetlb.h> 58#include <linux/hugetlb.h>
59#include <linux/backing-dev.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -127,12 +128,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
127 avc->vma = vma; 128 avc->vma = vma;
128 avc->anon_vma = anon_vma; 129 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain); 130 list_add(&avc->same_vma, &vma->anon_vma_chain);
130 131 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136} 132}
137 133
138/** 134/**
@@ -269,51 +265,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
269} 265}
270 266
271/* 267/*
272 * Some rmap walk that needs to find all ptes/hugepmds without false
273 * negatives (like migrate and split_huge_page) running concurrent
274 * with operations that copy or move pagetables (like mremap() and
275 * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
276 * list to be in a certain order: the dst_vma must be placed after the
277 * src_vma in the list. This is always guaranteed by fork() but
278 * mremap() needs to call this function to enforce it in case the
279 * dst_vma isn't newly allocated and chained with the anon_vma_clone()
280 * function but just an extension of a pre-existing vma through
281 * vma_merge.
282 *
283 * NOTE: the same_anon_vma list can still be changed by other
284 * processes while mremap runs because mremap doesn't hold the
285 * anon_vma mutex to prevent modifications to the list while it
286 * runs. All we need to enforce is that the relative order of this
287 * process vmas isn't changing (we don't care about other vmas
288 * order). Each vma corresponds to an anon_vma_chain structure so
289 * there's no risk that other processes calling anon_vma_moveto_tail()
290 * and changing the same_anon_vma list under mremap() will screw with
291 * the relative order of this process vmas in the list, because we
292 * they can't alter the order of any vma that belongs to this
293 * process. And there can't be another anon_vma_moveto_tail() running
294 * concurrently with mremap() coming from this process because we hold
295 * the mmap_sem for the whole mremap(). fork() ordering dependency
296 * also shouldn't be affected because fork() only cares that the
297 * parent vmas are placed in the list before the child vmas and
298 * anon_vma_moveto_tail() won't reorder vmas from either the fork()
299 * parent or child.
300 */
301void anon_vma_moveto_tail(struct vm_area_struct *dst)
302{
303 struct anon_vma_chain *pavc;
304 struct anon_vma *root = NULL;
305
306 list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
307 struct anon_vma *anon_vma = pavc->anon_vma;
308 VM_BUG_ON(pavc->vma != dst);
309 root = lock_anon_vma_root(root, anon_vma);
310 list_del(&pavc->same_anon_vma);
311 list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
312 }
313 unlock_anon_vma_root(root);
314}
315
316/*
317 * Attach vma to its own anon_vma, as well as to the anon_vmas that 268 * Attach vma to its own anon_vma, as well as to the anon_vmas that
318 * the corresponding VMA in the parent process is attached to. 269 * the corresponding VMA in the parent process is attached to.
319 * Returns 0 on success, non-zero on failure. 270 * Returns 0 on success, non-zero on failure.
@@ -381,13 +332,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
381 struct anon_vma *anon_vma = avc->anon_vma; 332 struct anon_vma *anon_vma = avc->anon_vma;
382 333
383 root = lock_anon_vma_root(root, anon_vma); 334 root = lock_anon_vma_root(root, anon_vma);
384 list_del(&avc->same_anon_vma); 335 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
385 336
386 /* 337 /*
387 * Leave empty anon_vmas on the list - we'll need 338 * Leave empty anon_vmas on the list - we'll need
388 * to free them outside the lock. 339 * to free them outside the lock.
389 */ 340 */
390 if (list_empty(&anon_vma->head)) 341 if (RB_EMPTY_ROOT(&anon_vma->rb_root))
391 continue; 342 continue;
392 343
393 list_del(&avc->same_vma); 344 list_del(&avc->same_vma);
@@ -416,7 +367,7 @@ static void anon_vma_ctor(void *data)
416 367
417 mutex_init(&anon_vma->mutex); 368 mutex_init(&anon_vma->mutex);
418 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
419 INIT_LIST_HEAD(&anon_vma->head); 370 anon_vma->rb_root = RB_ROOT;
420} 371}
421 372
422void __init anon_vma_init(void) 373void __init anon_vma_init(void)
@@ -560,22 +511,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
560 511
561/* 512/*
562 * At what user virtual address is page expected in @vma? 513 * At what user virtual address is page expected in @vma?
563 * Returns virtual address or -EFAULT if page's index/offset is not
564 * within the range mapped the @vma.
565 */ 514 */
566inline unsigned long 515static inline unsigned long
567vma_address(struct page *page, struct vm_area_struct *vma) 516__vma_address(struct page *page, struct vm_area_struct *vma)
568{ 517{
569 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 518 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
570 unsigned long address;
571 519
572 if (unlikely(is_vm_hugetlb_page(vma))) 520 if (unlikely(is_vm_hugetlb_page(vma)))
573 pgoff = page->index << huge_page_order(page_hstate(page)); 521 pgoff = page->index << huge_page_order(page_hstate(page));
574 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 522
575 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 523 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
576 /* page should be within @vma mapping range */ 524}
577 return -EFAULT; 525
578 } 526inline unsigned long
527vma_address(struct page *page, struct vm_area_struct *vma)
528{
529 unsigned long address = __vma_address(page, vma);
530
531 /* page should be within @vma mapping range */
532 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
533
579 return address; 534 return address;
580} 535}
581 536
@@ -585,6 +540,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
585 */ 540 */
586unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 541unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
587{ 542{
543 unsigned long address;
588 if (PageAnon(page)) { 544 if (PageAnon(page)) {
589 struct anon_vma *page__anon_vma = page_anon_vma(page); 545 struct anon_vma *page__anon_vma = page_anon_vma(page);
590 /* 546 /*
@@ -600,7 +556,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
600 return -EFAULT; 556 return -EFAULT;
601 } else 557 } else
602 return -EFAULT; 558 return -EFAULT;
603 return vma_address(page, vma); 559 address = __vma_address(page, vma);
560 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
561 return -EFAULT;
562 return address;
604} 563}
605 564
606/* 565/*
@@ -674,8 +633,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
674 pte_t *pte; 633 pte_t *pte;
675 spinlock_t *ptl; 634 spinlock_t *ptl;
676 635
677 address = vma_address(page, vma); 636 address = __vma_address(page, vma);
678 if (address == -EFAULT) /* out of vma range */ 637 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
679 return 0; 638 return 0;
680 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 639 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
681 if (!pte) /* the page is not in this mm */ 640 if (!pte) /* the page is not in this mm */
@@ -769,6 +728,7 @@ static int page_referenced_anon(struct page *page,
769{ 728{
770 unsigned int mapcount; 729 unsigned int mapcount;
771 struct anon_vma *anon_vma; 730 struct anon_vma *anon_vma;
731 pgoff_t pgoff;
772 struct anon_vma_chain *avc; 732 struct anon_vma_chain *avc;
773 int referenced = 0; 733 int referenced = 0;
774 734
@@ -777,11 +737,10 @@ static int page_referenced_anon(struct page *page,
777 return referenced; 737 return referenced;
778 738
779 mapcount = page_mapcount(page); 739 mapcount = page_mapcount(page);
780 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 740 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
741 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
781 struct vm_area_struct *vma = avc->vma; 742 struct vm_area_struct *vma = avc->vma;
782 unsigned long address = vma_address(page, vma); 743 unsigned long address = vma_address(page, vma);
783 if (address == -EFAULT)
784 continue;
785 /* 744 /*
786 * If we are reclaiming on behalf of a cgroup, skip 745 * If we are reclaiming on behalf of a cgroup, skip
787 * counting on behalf of references from different 746 * counting on behalf of references from different
@@ -820,7 +779,6 @@ static int page_referenced_file(struct page *page,
820 struct address_space *mapping = page->mapping; 779 struct address_space *mapping = page->mapping;
821 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 780 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
822 struct vm_area_struct *vma; 781 struct vm_area_struct *vma;
823 struct prio_tree_iter iter;
824 int referenced = 0; 782 int referenced = 0;
825 783
826 /* 784 /*
@@ -846,10 +804,8 @@ static int page_referenced_file(struct page *page,
846 */ 804 */
847 mapcount = page_mapcount(page); 805 mapcount = page_mapcount(page);
848 806
849 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 807 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
850 unsigned long address = vma_address(page, vma); 808 unsigned long address = vma_address(page, vma);
851 if (address == -EFAULT)
852 continue;
853 /* 809 /*
854 * If we are reclaiming on behalf of a cgroup, skip 810 * If we are reclaiming on behalf of a cgroup, skip
855 * counting on behalf of references from different 811 * counting on behalf of references from different
@@ -929,7 +885,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
929 pte_t entry; 885 pte_t entry;
930 886
931 flush_cache_page(vma, address, pte_pfn(*pte)); 887 flush_cache_page(vma, address, pte_pfn(*pte));
932 entry = ptep_clear_flush_notify(vma, address, pte); 888 entry = ptep_clear_flush(vma, address, pte);
933 entry = pte_wrprotect(entry); 889 entry = pte_wrprotect(entry);
934 entry = pte_mkclean(entry); 890 entry = pte_mkclean(entry);
935 set_pte_at(mm, address, pte, entry); 891 set_pte_at(mm, address, pte, entry);
@@ -937,6 +893,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
937 } 893 }
938 894
939 pte_unmap_unlock(pte, ptl); 895 pte_unmap_unlock(pte, ptl);
896
897 if (ret)
898 mmu_notifier_invalidate_page(mm, address);
940out: 899out:
941 return ret; 900 return ret;
942} 901}
@@ -945,17 +904,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
945{ 904{
946 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 905 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
947 struct vm_area_struct *vma; 906 struct vm_area_struct *vma;
948 struct prio_tree_iter iter;
949 int ret = 0; 907 int ret = 0;
950 908
951 BUG_ON(PageAnon(page)); 909 BUG_ON(PageAnon(page));
952 910
953 mutex_lock(&mapping->i_mmap_mutex); 911 mutex_lock(&mapping->i_mmap_mutex);
954 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 912 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
955 if (vma->vm_flags & VM_SHARED) { 913 if (vma->vm_flags & VM_SHARED) {
956 unsigned long address = vma_address(page, vma); 914 unsigned long address = vma_address(page, vma);
957 if (address == -EFAULT)
958 continue;
959 ret += page_mkclean_one(page, vma, address); 915 ret += page_mkclean_one(page, vma, address);
960 } 916 }
961 } 917 }
@@ -971,11 +927,8 @@ int page_mkclean(struct page *page)
971 927
972 if (page_mapped(page)) { 928 if (page_mapped(page)) {
973 struct address_space *mapping = page_mapping(page); 929 struct address_space *mapping = page_mapping(page);
974 if (mapping) { 930 if (mapping)
975 ret = page_mkclean_file(mapping, page); 931 ret = page_mkclean_file(mapping, page);
976 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
977 ret = 1;
978 }
979 } 932 }
980 933
981 return ret; 934 return ret;
@@ -1128,7 +1081,7 @@ void page_add_new_anon_rmap(struct page *page,
1128 else 1081 else
1129 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1082 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1130 __page_set_anon_rmap(page, vma, address, 1); 1083 __page_set_anon_rmap(page, vma, address, 1);
1131 if (page_evictable(page, vma)) 1084 if (!mlocked_vma_newpage(vma, page))
1132 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1085 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1133 else 1086 else
1134 add_page_to_unevictable_list(page); 1087 add_page_to_unevictable_list(page);
@@ -1161,6 +1114,7 @@ void page_add_file_rmap(struct page *page)
1161 */ 1114 */
1162void page_remove_rmap(struct page *page) 1115void page_remove_rmap(struct page *page)
1163{ 1116{
1117 struct address_space *mapping = page_mapping(page);
1164 bool anon = PageAnon(page); 1118 bool anon = PageAnon(page);
1165 bool locked; 1119 bool locked;
1166 unsigned long flags; 1120 unsigned long flags;
@@ -1183,8 +1137,19 @@ void page_remove_rmap(struct page *page)
1183 * this if the page is anon, so about to be freed; but perhaps 1137 * this if the page is anon, so about to be freed; but perhaps
1184 * not if it's in swapcache - there might be another pte slot 1138 * not if it's in swapcache - there might be another pte slot
1185 * containing the swap entry, but page not yet written to swap. 1139 * containing the swap entry, but page not yet written to swap.
1140 *
1141 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs
1143 * and ramfs pages which have been modified since creation by read
1144 * fault.
1145 *
1146 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped,
1148 * it could be truncated and page->mapping reset to NULL at any moment.
1149 * Note also that we are relying on page_mapping(page) to set mapping
1150 * to &swapper_space when PageSwapCache(page).
1186 */ 1151 */
1187 if ((!anon || PageSwapCache(page)) && 1152 if (mapping && !mapping_cap_account_dirty(mapping) &&
1188 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1153 page_test_and_clear_dirty(page_to_pfn(page), 1))
1189 set_page_dirty(page); 1154 set_page_dirty(page);
1190 /* 1155 /*
@@ -1203,7 +1168,10 @@ void page_remove_rmap(struct page *page)
1203 } else { 1168 } else {
1204 __dec_zone_page_state(page, NR_FILE_MAPPED); 1169 __dec_zone_page_state(page, NR_FILE_MAPPED);
1205 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1170 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1171 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1206 } 1172 }
1173 if (unlikely(PageMlocked(page)))
1174 clear_page_mlock(page);
1207 /* 1175 /*
1208 * It would be tidy to reset the PageAnon mapping here, 1176 * It would be tidy to reset the PageAnon mapping here,
1209 * but that might overwrite a racing page_add_anon_rmap 1177 * but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1181,7 @@ void page_remove_rmap(struct page *page)
1213 * Leaving it set also helps swapoff to reinstate ptes 1181 * Leaving it set also helps swapoff to reinstate ptes
1214 * faster for those pages still in swapcache. 1182 * faster for those pages still in swapcache.
1215 */ 1183 */
1184 return;
1216out: 1185out:
1217 if (!anon) 1186 if (!anon)
1218 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1187 mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1225,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1256 1225
1257 /* Nuke the page table entry. */ 1226 /* Nuke the page table entry. */
1258 flush_cache_page(vma, address, page_to_pfn(page)); 1227 flush_cache_page(vma, address, page_to_pfn(page));
1259 pteval = ptep_clear_flush_notify(vma, address, pte); 1228 pteval = ptep_clear_flush(vma, address, pte);
1260 1229
1261 /* Move the dirty bit to the physical page now the pte is gone. */ 1230 /* Move the dirty bit to the physical page now the pte is gone. */
1262 if (pte_dirty(pteval)) 1231 if (pte_dirty(pteval))
@@ -1318,6 +1287,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1318 1287
1319out_unmap: 1288out_unmap:
1320 pte_unmap_unlock(pte, ptl); 1289 pte_unmap_unlock(pte, ptl);
1290 if (ret != SWAP_FAIL)
1291 mmu_notifier_invalidate_page(mm, address);
1321out: 1292out:
1322 return ret; 1293 return ret;
1323 1294
@@ -1382,6 +1353,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1382 spinlock_t *ptl; 1353 spinlock_t *ptl;
1383 struct page *page; 1354 struct page *page;
1384 unsigned long address; 1355 unsigned long address;
1356 unsigned long mmun_start; /* For mmu_notifiers */
1357 unsigned long mmun_end; /* For mmu_notifiers */
1385 unsigned long end; 1358 unsigned long end;
1386 int ret = SWAP_AGAIN; 1359 int ret = SWAP_AGAIN;
1387 int locked_vma = 0; 1360 int locked_vma = 0;
@@ -1405,6 +1378,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1405 if (!pmd_present(*pmd)) 1378 if (!pmd_present(*pmd))
1406 return ret; 1379 return ret;
1407 1380
1381 mmun_start = address;
1382 mmun_end = end;
1383 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1384
1408 /* 1385 /*
1409 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1386 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1410 * keep the sem while scanning the cluster for mlocking pages. 1387 * keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1415,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1438 1415
1439 /* Nuke the page table entry. */ 1416 /* Nuke the page table entry. */
1440 flush_cache_page(vma, address, pte_pfn(*pte)); 1417 flush_cache_page(vma, address, pte_pfn(*pte));
1441 pteval = ptep_clear_flush_notify(vma, address, pte); 1418 pteval = ptep_clear_flush(vma, address, pte);
1442 1419
1443 /* If nonlinear, store the file page offset in the pte. */ 1420 /* If nonlinear, store the file page offset in the pte. */
1444 if (page->index != linear_page_index(vma, address)) 1421 if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1431,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1454 (*mapcount)--; 1431 (*mapcount)--;
1455 } 1432 }
1456 pte_unmap_unlock(pte - 1, ptl); 1433 pte_unmap_unlock(pte - 1, ptl);
1434 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1457 if (locked_vma) 1435 if (locked_vma)
1458 up_read(&vma->vm_mm->mmap_sem); 1436 up_read(&vma->vm_mm->mmap_sem);
1459 return ret; 1437 return ret;
@@ -1492,6 +1470,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
1492static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1470static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1493{ 1471{
1494 struct anon_vma *anon_vma; 1472 struct anon_vma *anon_vma;
1473 pgoff_t pgoff;
1495 struct anon_vma_chain *avc; 1474 struct anon_vma_chain *avc;
1496 int ret = SWAP_AGAIN; 1475 int ret = SWAP_AGAIN;
1497 1476
@@ -1499,7 +1478,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 if (!anon_vma) 1478 if (!anon_vma)
1500 return ret; 1479 return ret;
1501 1480
1502 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1481 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1482 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1503 struct vm_area_struct *vma = avc->vma; 1483 struct vm_area_struct *vma = avc->vma;
1504 unsigned long address; 1484 unsigned long address;
1505 1485
@@ -1516,8 +1496,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1516 continue; 1496 continue;
1517 1497
1518 address = vma_address(page, vma); 1498 address = vma_address(page, vma);
1519 if (address == -EFAULT)
1520 continue;
1521 ret = try_to_unmap_one(page, vma, address, flags); 1499 ret = try_to_unmap_one(page, vma, address, flags);
1522 if (ret != SWAP_AGAIN || !page_mapped(page)) 1500 if (ret != SWAP_AGAIN || !page_mapped(page))
1523 break; 1501 break;
@@ -1547,7 +1525,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1547 struct address_space *mapping = page->mapping; 1525 struct address_space *mapping = page->mapping;
1548 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1526 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1549 struct vm_area_struct *vma; 1527 struct vm_area_struct *vma;
1550 struct prio_tree_iter iter;
1551 int ret = SWAP_AGAIN; 1528 int ret = SWAP_AGAIN;
1552 unsigned long cursor; 1529 unsigned long cursor;
1553 unsigned long max_nl_cursor = 0; 1530 unsigned long max_nl_cursor = 0;
@@ -1555,10 +1532,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1555 unsigned int mapcount; 1532 unsigned int mapcount;
1556 1533
1557 mutex_lock(&mapping->i_mmap_mutex); 1534 mutex_lock(&mapping->i_mmap_mutex);
1558 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1535 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1559 unsigned long address = vma_address(page, vma); 1536 unsigned long address = vma_address(page, vma);
1560 if (address == -EFAULT)
1561 continue;
1562 ret = try_to_unmap_one(page, vma, address, flags); 1537 ret = try_to_unmap_one(page, vma, address, flags);
1563 if (ret != SWAP_AGAIN || !page_mapped(page)) 1538 if (ret != SWAP_AGAIN || !page_mapped(page))
1564 goto out; 1539 goto out;
@@ -1576,7 +1551,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1576 goto out; 1551 goto out;
1577 1552
1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1553 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1579 shared.vm_set.list) { 1554 shared.nonlinear) {
1580 cursor = (unsigned long) vma->vm_private_data; 1555 cursor = (unsigned long) vma->vm_private_data;
1581 if (cursor > max_nl_cursor) 1556 if (cursor > max_nl_cursor)
1582 max_nl_cursor = cursor; 1557 max_nl_cursor = cursor;
@@ -1608,7 +1583,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1608 1583
1609 do { 1584 do {
1610 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1585 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1611 shared.vm_set.list) { 1586 shared.nonlinear) {
1612 cursor = (unsigned long) vma->vm_private_data; 1587 cursor = (unsigned long) vma->vm_private_data;
1613 while ( cursor < max_nl_cursor && 1588 while ( cursor < max_nl_cursor &&
1614 cursor < vma->vm_end - vma->vm_start) { 1589 cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1606,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1631 * in locked vmas). Reset cursor on all unreserved nonlinear 1606 * in locked vmas). Reset cursor on all unreserved nonlinear
1632 * vmas, now forgetting on which ones it had fallen behind. 1607 * vmas, now forgetting on which ones it had fallen behind.
1633 */ 1608 */
1634 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1609 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1635 vma->vm_private_data = NULL; 1610 vma->vm_private_data = NULL;
1636out: 1611out:
1637 mutex_unlock(&mapping->i_mmap_mutex); 1612 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1691,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1716 struct vm_area_struct *, unsigned long, void *), void *arg) 1691 struct vm_area_struct *, unsigned long, void *), void *arg)
1717{ 1692{
1718 struct anon_vma *anon_vma; 1693 struct anon_vma *anon_vma;
1694 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1719 struct anon_vma_chain *avc; 1695 struct anon_vma_chain *avc;
1720 int ret = SWAP_AGAIN; 1696 int ret = SWAP_AGAIN;
1721 1697
@@ -1729,11 +1705,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1729 if (!anon_vma) 1705 if (!anon_vma)
1730 return ret; 1706 return ret;
1731 anon_vma_lock(anon_vma); 1707 anon_vma_lock(anon_vma);
1732 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1708 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1733 struct vm_area_struct *vma = avc->vma; 1709 struct vm_area_struct *vma = avc->vma;
1734 unsigned long address = vma_address(page, vma); 1710 unsigned long address = vma_address(page, vma);
1735 if (address == -EFAULT)
1736 continue;
1737 ret = rmap_one(page, vma, address, arg); 1711 ret = rmap_one(page, vma, address, arg);
1738 if (ret != SWAP_AGAIN) 1712 if (ret != SWAP_AGAIN)
1739 break; 1713 break;
@@ -1748,16 +1722,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1748 struct address_space *mapping = page->mapping; 1722 struct address_space *mapping = page->mapping;
1749 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1723 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1750 struct vm_area_struct *vma; 1724 struct vm_area_struct *vma;
1751 struct prio_tree_iter iter;
1752 int ret = SWAP_AGAIN; 1725 int ret = SWAP_AGAIN;
1753 1726
1754 if (!mapping) 1727 if (!mapping)
1755 return ret; 1728 return ret;
1756 mutex_lock(&mapping->i_mmap_mutex); 1729 mutex_lock(&mapping->i_mmap_mutex);
1757 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1730 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1758 unsigned long address = vma_address(page, vma); 1731 unsigned long address = vma_address(page, vma);
1759 if (address == -EFAULT)
1760 continue;
1761 ret = rmap_one(page, vma, address, arg); 1732 ret = rmap_one(page, vma, address, arg);
1762 if (ret != SWAP_AGAIN) 1733 if (ret != SWAP_AGAIN)
1763 break; 1734 break;
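Across this file the prio_tree and anon_vma list walks, together with the old "address == -EFAULT" skip, are replaced by interval-tree queries keyed on the page's pgoff (vma_interval_tree_foreach() and anon_vma_interval_tree_foreach()), so only VMAs whose page-offset range covers the page are visited. A rough stand-alone illustration of that overlap test, using a plain linear scan over made-up ranges rather than the kernel's augmented rb-tree:

#include <stdio.h>

struct toy_vma {
        unsigned long vm_pgoff;         /* first file page this VMA maps */
        unsigned long nr_pages;         /* number of pages mapped */
};

/* does the VMA's [vm_pgoff, vm_pgoff + nr_pages) range contain pgoff? */
static int vma_covers(const struct toy_vma *vma, unsigned long pgoff)
{
        return pgoff >= vma->vm_pgoff &&
               pgoff <  vma->vm_pgoff + vma->nr_pages;
}

int main(void)
{
        const struct toy_vma vmas[] = {
                { .vm_pgoff = 0,   .nr_pages = 16 },
                { .vm_pgoff = 64,  .nr_pages = 8  },
                { .vm_pgoff = 256, .nr_pages = 4  },
        };
        unsigned long pgoff = 66;       /* page being reverse-mapped */
        size_t i;

        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
                if (vma_covers(&vmas[i], pgoff))
                        printf("vma %zu maps pgoff %lu\n", i, pgoff);
        return 0;
}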
diff --git a/mm/shmem.c b/mm/shmem.c
index d4e184e2a38e..67afba5117f2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt;
77/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 77/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
78#define SHORT_SYMLINK_LEN 128 78#define SHORT_SYMLINK_LEN 128
79 79
80struct shmem_xattr {
81 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
82 char *name; /* xattr name */
83 size_t size;
84 char value[0];
85};
86
87/* 80/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private 81 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time): 82 * (with i_mutex making sure that it has only one user at a time):
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
636static void shmem_evict_inode(struct inode *inode) 629static void shmem_evict_inode(struct inode *inode)
637{ 630{
638 struct shmem_inode_info *info = SHMEM_I(inode); 631 struct shmem_inode_info *info = SHMEM_I(inode);
639 struct shmem_xattr *xattr, *nxattr;
640 632
641 if (inode->i_mapping->a_ops == &shmem_aops) { 633 if (inode->i_mapping->a_ops == &shmem_aops) {
642 shmem_unacct_size(info->flags, inode->i_size); 634 shmem_unacct_size(info->flags, inode->i_size);
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode)
650 } else 642 } else
651 kfree(info->symlink); 643 kfree(info->symlink);
652 644
653 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 645 simple_xattrs_free(&info->xattrs);
654 kfree(xattr->name);
655 kfree(xattr);
656 }
657 BUG_ON(inode->i_blocks); 646 BUG_ON(inode->i_blocks);
658 shmem_free_inode(inode->i_sb); 647 shmem_free_inode(inode->i_sb);
659 clear_inode(inode); 648 clear_inode(inode);
@@ -1350,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1350{ 1339{
1351 file_accessed(file); 1340 file_accessed(file);
1352 vma->vm_ops = &shmem_vm_ops; 1341 vma->vm_ops = &shmem_vm_ops;
1353 vma->vm_flags |= VM_CAN_NONLINEAR;
1354 return 0; 1342 return 0;
1355} 1343}
1356 1344
@@ -1377,7 +1365,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1377 spin_lock_init(&info->lock); 1365 spin_lock_init(&info->lock);
1378 info->flags = flags & VM_NORESERVE; 1366 info->flags = flags & VM_NORESERVE;
1379 INIT_LIST_HEAD(&info->swaplist); 1367 INIT_LIST_HEAD(&info->swaplist);
1380 INIT_LIST_HEAD(&info->xattr_list); 1368 simple_xattrs_init(&info->xattrs);
1381 cache_no_acl(inode); 1369 cache_no_acl(inode);
1382 1370
1383 switch (mode & S_IFMT) { 1371 switch (mode & S_IFMT) {
@@ -2060,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2060 */ 2048 */
2061 2049
2062/* 2050/*
2063 * Allocate new xattr and copy in the value; but leave the name to callers.
2064 */
2065static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
2066{
2067 struct shmem_xattr *new_xattr;
2068 size_t len;
2069
2070 /* wrap around? */
2071 len = sizeof(*new_xattr) + size;
2072 if (len <= sizeof(*new_xattr))
2073 return NULL;
2074
2075 new_xattr = kmalloc(len, GFP_KERNEL);
2076 if (!new_xattr)
2077 return NULL;
2078
2079 new_xattr->size = size;
2080 memcpy(new_xattr->value, value, size);
2081 return new_xattr;
2082}
2083
2084/*
2085 * Callback for security_inode_init_security() for acquiring xattrs. 2051 * Callback for security_inode_init_security() for acquiring xattrs.
2086 */ 2052 */
2087static int shmem_initxattrs(struct inode *inode, 2053static int shmem_initxattrs(struct inode *inode,
@@ -2090,11 +2056,11 @@ static int shmem_initxattrs(struct inode *inode,
2090{ 2056{
2091 struct shmem_inode_info *info = SHMEM_I(inode); 2057 struct shmem_inode_info *info = SHMEM_I(inode);
2092 const struct xattr *xattr; 2058 const struct xattr *xattr;
2093 struct shmem_xattr *new_xattr; 2059 struct simple_xattr *new_xattr;
2094 size_t len; 2060 size_t len;
2095 2061
2096 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2062 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2097 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); 2063 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2098 if (!new_xattr) 2064 if (!new_xattr)
2099 return -ENOMEM; 2065 return -ENOMEM;
2100 2066
@@ -2111,91 +2077,12 @@ static int shmem_initxattrs(struct inode *inode,
2111 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2077 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2112 xattr->name, len); 2078 xattr->name, len);
2113 2079
2114 spin_lock(&info->lock); 2080 simple_xattr_list_add(&info->xattrs, new_xattr);
2115 list_add(&new_xattr->list, &info->xattr_list);
2116 spin_unlock(&info->lock);
2117 } 2081 }
2118 2082
2119 return 0; 2083 return 0;
2120} 2084}
2121 2085
2122static int shmem_xattr_get(struct dentry *dentry, const char *name,
2123 void *buffer, size_t size)
2124{
2125 struct shmem_inode_info *info;
2126 struct shmem_xattr *xattr;
2127 int ret = -ENODATA;
2128
2129 info = SHMEM_I(dentry->d_inode);
2130
2131 spin_lock(&info->lock);
2132 list_for_each_entry(xattr, &info->xattr_list, list) {
2133 if (strcmp(name, xattr->name))
2134 continue;
2135
2136 ret = xattr->size;
2137 if (buffer) {
2138 if (size < xattr->size)
2139 ret = -ERANGE;
2140 else
2141 memcpy(buffer, xattr->value, xattr->size);
2142 }
2143 break;
2144 }
2145 spin_unlock(&info->lock);
2146 return ret;
2147}
2148
2149static int shmem_xattr_set(struct inode *inode, const char *name,
2150 const void *value, size_t size, int flags)
2151{
2152 struct shmem_inode_info *info = SHMEM_I(inode);
2153 struct shmem_xattr *xattr;
2154 struct shmem_xattr *new_xattr = NULL;
2155 int err = 0;
2156
2157 /* value == NULL means remove */
2158 if (value) {
2159 new_xattr = shmem_xattr_alloc(value, size);
2160 if (!new_xattr)
2161 return -ENOMEM;
2162
2163 new_xattr->name = kstrdup(name, GFP_KERNEL);
2164 if (!new_xattr->name) {
2165 kfree(new_xattr);
2166 return -ENOMEM;
2167 }
2168 }
2169
2170 spin_lock(&info->lock);
2171 list_for_each_entry(xattr, &info->xattr_list, list) {
2172 if (!strcmp(name, xattr->name)) {
2173 if (flags & XATTR_CREATE) {
2174 xattr = new_xattr;
2175 err = -EEXIST;
2176 } else if (new_xattr) {
2177 list_replace(&xattr->list, &new_xattr->list);
2178 } else {
2179 list_del(&xattr->list);
2180 }
2181 goto out;
2182 }
2183 }
2184 if (flags & XATTR_REPLACE) {
2185 xattr = new_xattr;
2186 err = -ENODATA;
2187 } else {
2188 list_add(&new_xattr->list, &info->xattr_list);
2189 xattr = NULL;
2190 }
2191out:
2192 spin_unlock(&info->lock);
2193 if (xattr)
2194 kfree(xattr->name);
2195 kfree(xattr);
2196 return err;
2197}
2198
2199static const struct xattr_handler *shmem_xattr_handlers[] = { 2086static const struct xattr_handler *shmem_xattr_handlers[] = {
2200#ifdef CONFIG_TMPFS_POSIX_ACL 2087#ifdef CONFIG_TMPFS_POSIX_ACL
2201 &generic_acl_access_handler, 2088 &generic_acl_access_handler,
@@ -2226,6 +2113,7 @@ static int shmem_xattr_validate(const char *name)
2226static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2113static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2227 void *buffer, size_t size) 2114 void *buffer, size_t size)
2228{ 2115{
2116 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2229 int err; 2117 int err;
2230 2118
2231 /* 2119 /*
@@ -2240,12 +2128,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2240 if (err) 2128 if (err)
2241 return err; 2129 return err;
2242 2130
2243 return shmem_xattr_get(dentry, name, buffer, size); 2131 return simple_xattr_get(&info->xattrs, name, buffer, size);
2244} 2132}
2245 2133
2246static int shmem_setxattr(struct dentry *dentry, const char *name, 2134static int shmem_setxattr(struct dentry *dentry, const char *name,
2247 const void *value, size_t size, int flags) 2135 const void *value, size_t size, int flags)
2248{ 2136{
2137 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2249 int err; 2138 int err;
2250 2139
2251 /* 2140 /*
@@ -2260,15 +2149,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
2260 if (err) 2149 if (err)
2261 return err; 2150 return err;
2262 2151
2263 if (size == 0) 2152 return simple_xattr_set(&info->xattrs, name, value, size, flags);
2264 value = ""; /* empty EA, do not remove */
2265
2266 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
2267
2268} 2153}
2269 2154
2270static int shmem_removexattr(struct dentry *dentry, const char *name) 2155static int shmem_removexattr(struct dentry *dentry, const char *name)
2271{ 2156{
2157 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2272 int err; 2158 int err;
2273 2159
2274 /* 2160 /*
@@ -2283,45 +2169,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
2283 if (err) 2169 if (err)
2284 return err; 2170 return err;
2285 2171
2286 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 2172 return simple_xattr_remove(&info->xattrs, name);
2287}
2288
2289static bool xattr_is_trusted(const char *name)
2290{
2291 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2292} 2173}
2293 2174
2294static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2175static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2295{ 2176{
2296 bool trusted = capable(CAP_SYS_ADMIN); 2177 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2297 struct shmem_xattr *xattr; 2178 return simple_xattr_list(&info->xattrs, buffer, size);
2298 struct shmem_inode_info *info;
2299 size_t used = 0;
2300
2301 info = SHMEM_I(dentry->d_inode);
2302
2303 spin_lock(&info->lock);
2304 list_for_each_entry(xattr, &info->xattr_list, list) {
2305 size_t len;
2306
2307 /* skip "trusted." attributes for unprivileged callers */
2308 if (!trusted && xattr_is_trusted(xattr->name))
2309 continue;
2310
2311 len = strlen(xattr->name) + 1;
2312 used += len;
2313 if (buffer) {
2314 if (size < used) {
2315 used = -ERANGE;
2316 break;
2317 }
2318 memcpy(buffer, xattr->name, len);
2319 buffer += len;
2320 }
2321 }
2322 spin_unlock(&info->lock);
2323
2324 return used;
2325} 2179}
2326#endif /* CONFIG_TMPFS_XATTR */ 2180#endif /* CONFIG_TMPFS_XATTR */
2327 2181
@@ -2366,12 +2220,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2366{ 2220{
2367 struct inode *inode; 2221 struct inode *inode;
2368 struct dentry *dentry = NULL; 2222 struct dentry *dentry = NULL;
2369 u64 inum = fid->raw[2]; 2223 u64 inum;
2370 inum = (inum << 32) | fid->raw[1];
2371 2224
2372 if (fh_len < 3) 2225 if (fh_len < 3)
2373 return NULL; 2226 return NULL;
2374 2227
2228 inum = fid->raw[2];
2229 inum = (inum << 32) | fid->raw[1];
2230
2375 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2231 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2376 shmem_match, fid->raw); 2232 shmem_match, fid->raw);
2377 if (inode) { 2233 if (inode) {
@@ -2788,6 +2644,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
2788 .set_policy = shmem_set_policy, 2644 .set_policy = shmem_set_policy,
2789 .get_policy = shmem_get_policy, 2645 .get_policy = shmem_get_policy,
2790#endif 2646#endif
2647 .remap_pages = generic_file_remap_pages,
2791}; 2648};
2792 2649
2793static struct dentry *shmem_mount(struct file_system_type *fs_type, 2650static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2981,7 +2838,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2981 fput(vma->vm_file); 2838 fput(vma->vm_file);
2982 vma->vm_file = file; 2839 vma->vm_file = file;
2983 vma->vm_ops = &shmem_vm_ops; 2840 vma->vm_ops = &shmem_vm_ops;
2984 vma->vm_flags |= VM_CAN_NONLINEAR;
2985 return 0; 2841 return 0;
2986} 2842}
2987 2843
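In the shmem_fh_to_dentry() hunk above, the 64-bit inode number is now assembled from fid->raw[1] and fid->raw[2] only after the fh_len < 3 check rather than before it. A stand-alone sketch of that decode order with invented raw[] words (decode_inum() is illustrative, not a kernel function):

#include <stdio.h>
#include <stdint.h>

/* Reassemble a 64-bit inode number from two 32-bit file-handle words,
 * but only once the handle is known to be long enough.  Returning 0
 * as "reject" is a convention of this toy only. */
static uint64_t decode_inum(const uint32_t *raw, int fh_len)
{
        uint64_t inum;

        if (fh_len < 3)
                return 0;               /* short handle: do not read raw[1..2] */

        inum = raw[2];
        inum = (inum << 32) | raw[1];
        return inum;
}

int main(void)
{
        uint32_t raw[3] = { 0xdeadbeef, 0x00000042, 0x00000001 };

        printf("inum = %#llx\n", (unsigned long long)decode_inum(raw, 3));
        printf("short handle -> %#llx\n", (unsigned long long)decode_inum(raw, 2));
        return 0;
}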
diff --git a/mm/slab.c b/mm/slab.c
index c6854759bcf1..33d3363658df 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
498 498
499#endif 499#endif
500 500
501#ifdef CONFIG_TRACING
502size_t slab_buffer_size(struct kmem_cache *cachep)
503{
504 return cachep->size;
505}
506EXPORT_SYMBOL(slab_buffer_size);
507#endif
508
509/* 501/*
510 * Do not go above this order unless 0 objects fit into the slab or 502 * Do not go above this order unless 0 objects fit into the slab or
511 * overridden on the command line. 503 * overridden on the command line.
@@ -515,13 +507,6 @@ EXPORT_SYMBOL(slab_buffer_size);
515static int slab_max_order = SLAB_MAX_ORDER_LO; 507static int slab_max_order = SLAB_MAX_ORDER_LO;
516static bool slab_max_order_set __initdata; 508static bool slab_max_order_set __initdata;
517 509
518static inline struct kmem_cache *page_get_cache(struct page *page)
519{
520 page = compound_head(page);
521 BUG_ON(!PageSlab(page));
522 return page->slab_cache;
523}
524
525static inline struct kmem_cache *virt_to_cache(const void *obj) 510static inline struct kmem_cache *virt_to_cache(const void *obj)
526{ 511{
527 struct page *page = virt_to_head_page(obj); 512 struct page *page = virt_to_head_page(obj);
@@ -585,9 +570,9 @@ static struct arraycache_init initarray_generic =
585 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 570 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
586 571
587/* internal cache of cache description objs */ 572/* internal cache of cache description objs */
588static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; 573static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
589static struct kmem_cache cache_cache = { 574static struct kmem_cache kmem_cache_boot = {
590 .nodelists = cache_cache_nodelists, 575 .nodelists = kmem_cache_nodelists,
591 .batchcount = 1, 576 .batchcount = 1,
592 .limit = BOOT_CPUCACHE_ENTRIES, 577 .limit = BOOT_CPUCACHE_ENTRIES,
593 .shared = 1, 578 .shared = 1,
@@ -810,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
810 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 795 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
811} 796}
812 797
798#if DEBUG
813#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 799#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
814 800
815static void __slab_error(const char *function, struct kmem_cache *cachep, 801static void __slab_error(const char *function, struct kmem_cache *cachep,
@@ -818,7 +804,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
818 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 804 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
819 function, cachep->name, msg); 805 function, cachep->name, msg);
820 dump_stack(); 806 dump_stack();
807 add_taint(TAINT_BAD_PAGE);
821} 808}
809#endif
822 810
823/* 811/*
824 * By default on NUMA we use alien caches to stage the freeing of 812 * By default on NUMA we use alien caches to stage the freeing of
@@ -900,7 +888,7 @@ static void __cpuinit start_cpu_timer(int cpu)
900 */ 888 */
901 if (keventd_up() && reap_work->work.func == NULL) { 889 if (keventd_up() && reap_work->work.func == NULL) {
902 init_reap_node(cpu); 890 init_reap_node(cpu);
903 INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); 891 INIT_DEFERRABLE_WORK(reap_work, cache_reap);
904 schedule_delayed_work_on(cpu, reap_work, 892 schedule_delayed_work_on(cpu, reap_work,
905 __round_jiffies_relative(HZ, cpu)); 893 __round_jiffies_relative(HZ, cpu));
906 } 894 }
@@ -1601,15 +1589,17 @@ void __init kmem_cache_init(void)
1601 int order; 1589 int order;
1602 int node; 1590 int node;
1603 1591
1592 kmem_cache = &kmem_cache_boot;
1593
1604 if (num_possible_nodes() == 1) 1594 if (num_possible_nodes() == 1)
1605 use_alien_caches = 0; 1595 use_alien_caches = 0;
1606 1596
1607 for (i = 0; i < NUM_INIT_LISTS; i++) { 1597 for (i = 0; i < NUM_INIT_LISTS; i++) {
1608 kmem_list3_init(&initkmem_list3[i]); 1598 kmem_list3_init(&initkmem_list3[i]);
1609 if (i < MAX_NUMNODES) 1599 if (i < MAX_NUMNODES)
1610 cache_cache.nodelists[i] = NULL; 1600 kmem_cache->nodelists[i] = NULL;
1611 } 1601 }
1612 set_up_list3s(&cache_cache, CACHE_CACHE); 1602 set_up_list3s(kmem_cache, CACHE_CACHE);
1613 1603
1614 /* 1604 /*
1615 * Fragmentation resistance on low memory - only use bigger 1605 * Fragmentation resistance on low memory - only use bigger
@@ -1621,9 +1611,9 @@ void __init kmem_cache_init(void)
1621 1611
1622 /* Bootstrap is tricky, because several objects are allocated 1612 /* Bootstrap is tricky, because several objects are allocated
1623 * from caches that do not exist yet: 1613 * from caches that do not exist yet:
1624 * 1) initialize the cache_cache cache: it contains the struct 1614 * 1) initialize the kmem_cache cache: it contains the struct
1625 * kmem_cache structures of all caches, except cache_cache itself: 1615 * kmem_cache structures of all caches, except kmem_cache itself:
1626 * cache_cache is statically allocated. 1616 * kmem_cache is statically allocated.
1627 * Initially an __init data area is used for the head array and the 1617 * Initially an __init data area is used for the head array and the
1628 * kmem_list3 structures, it's replaced with a kmalloc allocated 1618 * kmem_list3 structures, it's replaced with a kmalloc allocated
1629 * array at the end of the bootstrap. 1619 * array at the end of the bootstrap.
@@ -1632,43 +1622,43 @@ void __init kmem_cache_init(void)
1632 * An __init data area is used for the head array. 1622 * An __init data area is used for the head array.
1633 * 3) Create the remaining kmalloc caches, with minimally sized 1623 * 3) Create the remaining kmalloc caches, with minimally sized
1634 * head arrays. 1624 * head arrays.
1635 * 4) Replace the __init data head arrays for cache_cache and the first 1625 * 4) Replace the __init data head arrays for kmem_cache and the first
1636 * kmalloc cache with kmalloc allocated arrays. 1626 * kmalloc cache with kmalloc allocated arrays.
1637 * 5) Replace the __init data for kmem_list3 for cache_cache and 1627 * 5) Replace the __init data for kmem_list3 for kmem_cache and
1638 * the other cache's with kmalloc allocated memory. 1628 * the other cache's with kmalloc allocated memory.
1639 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1629 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1640 */ 1630 */
1641 1631
1642 node = numa_mem_id(); 1632 node = numa_mem_id();
1643 1633
1644 /* 1) create the cache_cache */ 1634 /* 1) create the kmem_cache */
1645 INIT_LIST_HEAD(&slab_caches); 1635 INIT_LIST_HEAD(&slab_caches);
1646 list_add(&cache_cache.list, &slab_caches); 1636 list_add(&kmem_cache->list, &slab_caches);
1647 cache_cache.colour_off = cache_line_size(); 1637 kmem_cache->colour_off = cache_line_size();
1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1638 kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1639 kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1650 1640
1651 /* 1641 /*
1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1642 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1653 */ 1643 */
1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1644 kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1655 nr_node_ids * sizeof(struct kmem_list3 *); 1645 nr_node_ids * sizeof(struct kmem_list3 *);
1656 cache_cache.object_size = cache_cache.size; 1646 kmem_cache->object_size = kmem_cache->size;
1657 cache_cache.size = ALIGN(cache_cache.size, 1647 kmem_cache->size = ALIGN(kmem_cache->object_size,
1658 cache_line_size()); 1648 cache_line_size());
1659 cache_cache.reciprocal_buffer_size = 1649 kmem_cache->reciprocal_buffer_size =
1660 reciprocal_value(cache_cache.size); 1650 reciprocal_value(kmem_cache->size);
1661 1651
1662 for (order = 0; order < MAX_ORDER; order++) { 1652 for (order = 0; order < MAX_ORDER; order++) {
1663 cache_estimate(order, cache_cache.size, 1653 cache_estimate(order, kmem_cache->size,
1664 cache_line_size(), 0, &left_over, &cache_cache.num); 1654 cache_line_size(), 0, &left_over, &kmem_cache->num);
1665 if (cache_cache.num) 1655 if (kmem_cache->num)
1666 break; 1656 break;
1667 } 1657 }
1668 BUG_ON(!cache_cache.num); 1658 BUG_ON(!kmem_cache->num);
1669 cache_cache.gfporder = order; 1659 kmem_cache->gfporder = order;
1670 cache_cache.colour = left_over / cache_cache.colour_off; 1660 kmem_cache->colour = left_over / kmem_cache->colour_off;
1671 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1661 kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1672 sizeof(struct slab), cache_line_size()); 1662 sizeof(struct slab), cache_line_size());
1673 1663
1674 /* 2+3) create the kmalloc caches */ 1664 /* 2+3) create the kmalloc caches */
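kmem_cache_init() above sizes the boot-time kmem_cache as offsetof(struct kmem_cache, array[nr_cpu_ids]) plus nr_node_ids pointers, and __kmem_cache_create() then points nodelists just past the per-CPU array. A stand-alone model of that single-allocation layout, with a toy struct and invented CPU/node counts (struct toy_cache is a stand-in, not the kernel's struct kmem_cache):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

enum { NR_CPU_IDS = 4, NR_NODE_IDS = 2 };

struct toy_cache {
        unsigned int batchcount;
        void *array[];                  /* per-CPU entries */
};

int main(void)
{
        /* one allocation: the struct, its per-CPU array, then the node table */
        size_t size = offsetof(struct toy_cache, array[NR_CPU_IDS]) +
                      NR_NODE_IDS * sizeof(void *);
        struct toy_cache *c = calloc(1, size);
        void **nodelists;

        if (!c)
                return 1;
        nodelists = (void **)&c->array[NR_CPU_IDS];     /* just past the per-CPU array */
        printf("total %zu bytes, node table at offset %td\n",
               size, (char *)nodelists - (char *)c);
        free(c);
        return 0;
}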
@@ -1681,19 +1671,22 @@ void __init kmem_cache_init(void)
1681 * bug. 1671 * bug.
1682 */ 1672 */
1683 1673
1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, 1674 sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1685 sizes[INDEX_AC].cs_size, 1675 sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
1686 ARCH_KMALLOC_MINALIGN, 1676 sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1677 sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
1688 NULL); 1678 sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1679 __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1680 list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
1689 1681
1690 if (INDEX_AC != INDEX_L3) { 1682 if (INDEX_AC != INDEX_L3) {
1691 sizes[INDEX_L3].cs_cachep = 1683 sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1692 __kmem_cache_create(names[INDEX_L3].name, 1684 sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1693 sizes[INDEX_L3].cs_size, 1685 sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1694 ARCH_KMALLOC_MINALIGN, 1686 sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1696 NULL); 1688 __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1697 } 1690 }
1698 1691
1699 slab_early_init = 0; 1692 slab_early_init = 0;
@@ -1707,20 +1700,23 @@ void __init kmem_cache_init(void)
1707 * allow tighter packing of the smaller caches. 1700 * allow tighter packing of the smaller caches.
1708 */ 1701 */
1709 if (!sizes->cs_cachep) { 1702 if (!sizes->cs_cachep) {
1710 sizes->cs_cachep = __kmem_cache_create(names->name, 1703 sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1711 sizes->cs_size, 1704 sizes->cs_cachep->name = names->name;
1712 ARCH_KMALLOC_MINALIGN, 1705 sizes->cs_cachep->size = sizes->cs_size;
1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1706 sizes->cs_cachep->object_size = sizes->cs_size;
1714 NULL); 1707 sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 list_add(&sizes->cs_cachep->list, &slab_caches);
1715 } 1710 }
1716#ifdef CONFIG_ZONE_DMA 1711#ifdef CONFIG_ZONE_DMA
1717 sizes->cs_dmacachep = __kmem_cache_create( 1712 sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1718 names->name_dma, 1713 sizes->cs_dmacachep->name = names->name_dma;
1719 sizes->cs_size, 1714 sizes->cs_dmacachep->size = sizes->cs_size;
1720 ARCH_KMALLOC_MINALIGN, 1715 sizes->cs_dmacachep->object_size = sizes->cs_size;
1721 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1716 sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1722 SLAB_PANIC, 1717 __kmem_cache_create(sizes->cs_dmacachep,
1723 NULL); 1718 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 list_add(&sizes->cs_dmacachep->list, &slab_caches);
1724#endif 1720#endif
1725 sizes++; 1721 sizes++;
1726 names++; 1722 names++;
@@ -1731,15 +1727,15 @@ void __init kmem_cache_init(void)
1731 1727
1732 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1728 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1733 1729
1734 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1730 BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1735 memcpy(ptr, cpu_cache_get(&cache_cache), 1731 memcpy(ptr, cpu_cache_get(kmem_cache),
1736 sizeof(struct arraycache_init)); 1732 sizeof(struct arraycache_init));
1737 /* 1733 /*
1738 * Do not assume that spinlocks can be initialized via memcpy: 1734 * Do not assume that spinlocks can be initialized via memcpy:
1739 */ 1735 */
1740 spin_lock_init(&ptr->lock); 1736 spin_lock_init(&ptr->lock);
1741 1737
1742 cache_cache.array[smp_processor_id()] = ptr; 1738 kmem_cache->array[smp_processor_id()] = ptr;
1743 1739
1744 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1740 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1745 1741
@@ -1760,7 +1756,7 @@ void __init kmem_cache_init(void)
1760 int nid; 1756 int nid;
1761 1757
1762 for_each_online_node(nid) { 1758 for_each_online_node(nid) {
1763 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); 1759 init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1764 1760
1765 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1761 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1766 &initkmem_list3[SIZE_AC + nid], nid); 1762 &initkmem_list3[SIZE_AC + nid], nid);
@@ -1781,9 +1777,6 @@ void __init kmem_cache_init_late(void)
1781 1777
1782 slab_state = UP; 1778 slab_state = UP;
1783 1779
1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1785 init_lock_keys();
1786
1787 /* 6) resize the head arrays to their final sizes */ 1780 /* 6) resize the head arrays to their final sizes */
1788 mutex_lock(&slab_mutex); 1781 mutex_lock(&slab_mutex);
1789 list_for_each_entry(cachep, &slab_caches, list) 1782 list_for_each_entry(cachep, &slab_caches, list)
@@ -1791,6 +1784,9 @@ void __init kmem_cache_init_late(void)
1791 BUG(); 1784 BUG();
1792 mutex_unlock(&slab_mutex); 1785 mutex_unlock(&slab_mutex);
1793 1786
1787 /* Annotate slab for lockdep -- annotate the malloc caches */
1788 init_lock_keys();
1789
1794 /* Done! */ 1790 /* Done! */
1795 slab_state = FULL; 1791 slab_state = FULL;
1796 1792
@@ -2209,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
2209 } 2205 }
2210} 2206}
2211 2207
2212static void __kmem_cache_destroy(struct kmem_cache *cachep)
2213{
2214 int i;
2215 struct kmem_list3 *l3;
2216
2217 for_each_online_cpu(i)
2218 kfree(cachep->array[i]);
2219
2220 /* NUMA: free the list3 structures */
2221 for_each_online_node(i) {
2222 l3 = cachep->nodelists[i];
2223 if (l3) {
2224 kfree(l3->shared);
2225 free_alien_cache(l3->alien);
2226 kfree(l3);
2227 }
2228 }
2229 kmem_cache_free(&cache_cache, cachep);
2230}
2231
2232
2233/** 2208/**
2234 * calculate_slab_order - calculate size (page order) of slabs 2209 * calculate_slab_order - calculate size (page order) of slabs
2235 * @cachep: pointer to the cache that is being created 2210 * @cachep: pointer to the cache that is being created
@@ -2366,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2366 * Cannot be called within a int, but can be interrupted. 2341 * Cannot be called within a int, but can be interrupted.
2367 * The @ctor is run when new pages are allocated by the cache. 2342 * The @ctor is run when new pages are allocated by the cache.
2368 * 2343 *
2369 * @name must be valid until the cache is destroyed. This implies that
2370 * the module calling this has to destroy the cache before getting unloaded.
2371 *
2372 * The flags are 2344 * The flags are
2373 * 2345 *
2374 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2346 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -2381,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2381 * cacheline. This can be beneficial if you're counting cycles as closely 2353 * cacheline. This can be beneficial if you're counting cycles as closely
2382 * as davem. 2354 * as davem.
2383 */ 2355 */
2384struct kmem_cache * 2356int
2385__kmem_cache_create (const char *name, size_t size, size_t align, 2357__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2386 unsigned long flags, void (*ctor)(void *))
2387{ 2358{
2388 size_t left_over, slab_size, ralign; 2359 size_t left_over, slab_size, ralign;
2389 struct kmem_cache *cachep = NULL;
2390 gfp_t gfp; 2360 gfp_t gfp;
2361 int err;
2362 size_t size = cachep->size;
2391 2363
2392#if DEBUG 2364#if DEBUG
2393#if FORCED_DEBUG 2365#if FORCED_DEBUG
@@ -2459,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2459 ralign = ARCH_SLAB_MINALIGN; 2431 ralign = ARCH_SLAB_MINALIGN;
2460 } 2432 }
2461 /* 3) caller mandated alignment */ 2433 /* 3) caller mandated alignment */
2462 if (ralign < align) { 2434 if (ralign < cachep->align) {
2463 ralign = align; 2435 ralign = cachep->align;
2464 } 2436 }
2465 /* disable debug if necessary */ 2437 /* disable debug if necessary */
2466 if (ralign > __alignof__(unsigned long long)) 2438 if (ralign > __alignof__(unsigned long long))
@@ -2468,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2468 /* 2440 /*
2469 * 4) Store it. 2441 * 4) Store it.
2470 */ 2442 */
2471 align = ralign; 2443 cachep->align = ralign;
2472 2444
2473 if (slab_is_available()) 2445 if (slab_is_available())
2474 gfp = GFP_KERNEL; 2446 gfp = GFP_KERNEL;
2475 else 2447 else
2476 gfp = GFP_NOWAIT; 2448 gfp = GFP_NOWAIT;
2477 2449
2478 /* Get cache's description obj. */
2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2480 if (!cachep)
2481 return NULL;
2482
2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2486#if DEBUG 2451#if DEBUG
2487 2452
2488 /* 2453 /*
@@ -2506,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2506 } 2471 }
2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2472#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2473 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2474 && cachep->object_size > cache_line_size()
2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2475 && ALIGN(size, cachep->align) < PAGE_SIZE) {
2476 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2511 size = PAGE_SIZE; 2477 size = PAGE_SIZE;
2512 } 2478 }
2513#endif 2479#endif
@@ -2527,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2527 */ 2493 */
2528 flags |= CFLGS_OFF_SLAB; 2494 flags |= CFLGS_OFF_SLAB;
2529 2495
2530 size = ALIGN(size, align); 2496 size = ALIGN(size, cachep->align);
2531 2497
2532 left_over = calculate_slab_order(cachep, size, align, flags); 2498 left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2499
2500 if (!cachep->num)
2501 return -E2BIG;
2533 2502
2534 if (!cachep->num) {
2535 printk(KERN_ERR
2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2537 kmem_cache_free(&cache_cache, cachep);
2538 return NULL;
2539 }
2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2503 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2541 + sizeof(struct slab), align); 2504 + sizeof(struct slab), cachep->align);
2542 2505
2543 /* 2506 /*
2544 * If the slab has been placed off-slab, and we have enough space then 2507 * If the slab has been placed off-slab, and we have enough space then
@@ -2566,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2566 2529
2567 cachep->colour_off = cache_line_size(); 2530 cachep->colour_off = cache_line_size();
2568 /* Offset must be a multiple of the alignment. */ 2531 /* Offset must be a multiple of the alignment. */
2569 if (cachep->colour_off < align) 2532 if (cachep->colour_off < cachep->align)
2570 cachep->colour_off = align; 2533 cachep->colour_off = cachep->align;
2571 cachep->colour = left_over / cachep->colour_off; 2534 cachep->colour = left_over / cachep->colour_off;
2572 cachep->slab_size = slab_size; 2535 cachep->slab_size = slab_size;
2573 cachep->flags = flags; 2536 cachep->flags = flags;
@@ -2588,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2588 */ 2551 */
2589 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); 2552 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2590 } 2553 }
2591 cachep->ctor = ctor;
2592 cachep->name = name;
2593 2554
2594 if (setup_cpu_cache(cachep, gfp)) { 2555 err = setup_cpu_cache(cachep, gfp);
2595 __kmem_cache_destroy(cachep); 2556 if (err) {
2596 return NULL; 2557 __kmem_cache_shutdown(cachep);
2558 return err;
2597 } 2559 }
2598 2560
2599 if (flags & SLAB_DEBUG_OBJECTS) { 2561 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2606,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2606 slab_set_debugobj_lock_classes(cachep); 2568 slab_set_debugobj_lock_classes(cachep);
2607 } 2569 }
2608 2570
2609 /* cache setup completed, link it into the list */ 2571 return 0;
2610 list_add(&cachep->list, &slab_caches);
2611 return cachep;
2612} 2572}
2613 2573
2614#if DEBUG 2574#if DEBUG
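The rewritten __kmem_cache_create() above repeatedly rounds sizes with ALIGN(size, cachep->align). A stand-alone illustration of that power-of-two round-up, using a local ALIGN_UP macro and an assumed 64-byte cache line:

#include <stdio.h>

/* Round x up to the next multiple of a; valid only for power-of-two a,
 * which is what cache alignments are. */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long align = 64;       /* assumed cache_line_size() */
        unsigned long sizes[] = { 24, 64, 100, 129 };
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("%3lu -> %3lu\n", sizes[i], ALIGN_UP(sizes[i], align));
        return 0;
}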
@@ -2767,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2767} 2727}
2768EXPORT_SYMBOL(kmem_cache_shrink); 2728EXPORT_SYMBOL(kmem_cache_shrink);
2769 2729
2770/** 2730int __kmem_cache_shutdown(struct kmem_cache *cachep)
2771 * kmem_cache_destroy - delete a cache
2772 * @cachep: the cache to destroy
2773 *
2774 * Remove a &struct kmem_cache object from the slab cache.
2775 *
2776 * It is expected this function will be called by a module when it is
2777 * unloaded. This will remove the cache completely, and avoid a duplicate
2778 * cache being allocated each time a module is loaded and unloaded, if the
2779 * module doesn't have persistent in-kernel storage across loads and unloads.
2780 *
2781 * The cache must be empty before calling this function.
2782 *
2783 * The caller must guarantee that no one will allocate memory from the cache
2784 * during the kmem_cache_destroy().
2785 */
2786void kmem_cache_destroy(struct kmem_cache *cachep)
2787{ 2731{
2788 BUG_ON(!cachep || in_interrupt()); 2732 int i;
2733 struct kmem_list3 *l3;
2734 int rc = __cache_shrink(cachep);
2789 2735
2790 /* Find the cache in the chain of caches. */ 2736 if (rc)
2791 get_online_cpus(); 2737 return rc;
2792 mutex_lock(&slab_mutex);
2793 /*
2794 * the chain is never empty, cache_cache is never destroyed
2795 */
2796 list_del(&cachep->list);
2797 if (__cache_shrink(cachep)) {
2798 slab_error(cachep, "Can't free all objects");
2799 list_add(&cachep->list, &slab_caches);
2800 mutex_unlock(&slab_mutex);
2801 put_online_cpus();
2802 return;
2803 }
2804 2738
2805 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2739 for_each_online_cpu(i)
2806 rcu_barrier(); 2740 kfree(cachep->array[i]);
2807 2741
2808 __kmem_cache_destroy(cachep); 2742 /* NUMA: free the list3 structures */
2809 mutex_unlock(&slab_mutex); 2743 for_each_online_node(i) {
2810 put_online_cpus(); 2744 l3 = cachep->nodelists[i];
2745 if (l3) {
2746 kfree(l3->shared);
2747 free_alien_cache(l3->alien);
2748 kfree(l3);
2749 }
2750 }
2751 return 0;
2811} 2752}
2812EXPORT_SYMBOL(kmem_cache_destroy);
2813 2753
2814/* 2754/*
2815 * Get the memory for a slab management obj. 2755 * Get the memory for a slab management obj.
@@ -3098,7 +3038,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
3098} 3038}
3099 3039
3100static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 3040static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3101 void *caller) 3041 unsigned long caller)
3102{ 3042{
3103 struct page *page; 3043 struct page *page;
3104 unsigned int objnr; 3044 unsigned int objnr;
@@ -3118,7 +3058,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3118 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 3058 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
3119 } 3059 }
3120 if (cachep->flags & SLAB_STORE_USER) 3060 if (cachep->flags & SLAB_STORE_USER)
3121 *dbg_userword(cachep, objp) = caller; 3061 *dbg_userword(cachep, objp) = (void *)caller;
3122 3062
3123 objnr = obj_to_index(cachep, slabp, objp); 3063 objnr = obj_to_index(cachep, slabp, objp);
3124 3064
@@ -3131,7 +3071,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3131 if (cachep->flags & SLAB_POISON) { 3071 if (cachep->flags & SLAB_POISON) {
3132#ifdef CONFIG_DEBUG_PAGEALLOC 3072#ifdef CONFIG_DEBUG_PAGEALLOC
3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3073 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3134 store_stackinfo(cachep, objp, (unsigned long)caller); 3074 store_stackinfo(cachep, objp, caller);
3135 kernel_map_pages(virt_to_page(objp), 3075 kernel_map_pages(virt_to_page(objp),
3136 cachep->size / PAGE_SIZE, 0); 3076 cachep->size / PAGE_SIZE, 0);
3137 } else { 3077 } else {
@@ -3285,7 +3225,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3285 3225
3286#if DEBUG 3226#if DEBUG
3287static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3227static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3288 gfp_t flags, void *objp, void *caller) 3228 gfp_t flags, void *objp, unsigned long caller)
3289{ 3229{
3290 if (!objp) 3230 if (!objp)
3291 return objp; 3231 return objp;
@@ -3302,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3302 poison_obj(cachep, objp, POISON_INUSE); 3242 poison_obj(cachep, objp, POISON_INUSE);
3303 } 3243 }
3304 if (cachep->flags & SLAB_STORE_USER) 3244 if (cachep->flags & SLAB_STORE_USER)
3305 *dbg_userword(cachep, objp) = caller; 3245 *dbg_userword(cachep, objp) = (void *)caller;
3306 3246
3307 if (cachep->flags & SLAB_RED_ZONE) { 3247 if (cachep->flags & SLAB_RED_ZONE) {
3308 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3248 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
@@ -3343,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3343 3283
3344static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 3284static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3345{ 3285{
3346 if (cachep == &cache_cache) 3286 if (cachep == kmem_cache)
3347 return false; 3287 return false;
3348 3288
3349 return should_failslab(cachep->object_size, flags, cachep->flags); 3289 return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3576,8 +3516,8 @@ done:
3576 * Fallback to other node is possible if __GFP_THISNODE is not set. 3516 * Fallback to other node is possible if __GFP_THISNODE is not set.
3577 */ 3517 */
3578static __always_inline void * 3518static __always_inline void *
3579__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3519slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3580 void *caller) 3520 unsigned long caller)
3581{ 3521{
3582 unsigned long save_flags; 3522 unsigned long save_flags;
3583 void *ptr; 3523 void *ptr;
@@ -3663,7 +3603,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3663#endif /* CONFIG_NUMA */ 3603#endif /* CONFIG_NUMA */
3664 3604
3665static __always_inline void * 3605static __always_inline void *
3666__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 3606slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3667{ 3607{
3668 unsigned long save_flags; 3608 unsigned long save_flags;
3669 void *objp; 3609 void *objp;
@@ -3799,7 +3739,7 @@ free_done:
3799 * be in this state _before_ it is released. Called with disabled ints. 3739 * be in this state _before_ it is released. Called with disabled ints.
3800 */ 3740 */
3801static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3741static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3802 void *caller) 3742 unsigned long caller)
3803{ 3743{
3804 struct array_cache *ac = cpu_cache_get(cachep); 3744 struct array_cache *ac = cpu_cache_get(cachep);
3805 3745
@@ -3839,7 +3779,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3839 */ 3779 */
3840void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3780void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3841{ 3781{
3842 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3782 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3843 3783
3844 trace_kmem_cache_alloc(_RET_IP_, ret, 3784 trace_kmem_cache_alloc(_RET_IP_, ret,
3845 cachep->object_size, cachep->size, flags); 3785 cachep->object_size, cachep->size, flags);
@@ -3850,14 +3790,14 @@ EXPORT_SYMBOL(kmem_cache_alloc);
3850 3790
3851#ifdef CONFIG_TRACING 3791#ifdef CONFIG_TRACING
3852void * 3792void *
3853kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) 3793kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3854{ 3794{
3855 void *ret; 3795 void *ret;
3856 3796
3857 ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3797 ret = slab_alloc(cachep, flags, _RET_IP_);
3858 3798
3859 trace_kmalloc(_RET_IP_, ret, 3799 trace_kmalloc(_RET_IP_, ret,
3860 size, slab_buffer_size(cachep), flags); 3800 size, cachep->size, flags);
3861 return ret; 3801 return ret;
3862} 3802}
3863EXPORT_SYMBOL(kmem_cache_alloc_trace); 3803EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -3866,8 +3806,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
3866#ifdef CONFIG_NUMA 3806#ifdef CONFIG_NUMA
3867void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3807void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3868{ 3808{
3869 void *ret = __cache_alloc_node(cachep, flags, nodeid, 3809 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3870 __builtin_return_address(0));
3871 3810
3872 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3811 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3873 cachep->object_size, cachep->size, 3812 cachep->object_size, cachep->size,
@@ -3878,17 +3817,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3878EXPORT_SYMBOL(kmem_cache_alloc_node); 3817EXPORT_SYMBOL(kmem_cache_alloc_node);
3879 3818
3880#ifdef CONFIG_TRACING 3819#ifdef CONFIG_TRACING
3881void *kmem_cache_alloc_node_trace(size_t size, 3820void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3882 struct kmem_cache *cachep,
3883 gfp_t flags, 3821 gfp_t flags,
3884 int nodeid) 3822 int nodeid,
3823 size_t size)
3885{ 3824{
3886 void *ret; 3825 void *ret;
3887 3826
3888 ret = __cache_alloc_node(cachep, flags, nodeid, 3827 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3889 __builtin_return_address(0)); 3828
3890 trace_kmalloc_node(_RET_IP_, ret, 3829 trace_kmalloc_node(_RET_IP_, ret,
3891 size, slab_buffer_size(cachep), 3830 size, cachep->size,
3892 flags, nodeid); 3831 flags, nodeid);
3893 return ret; 3832 return ret;
3894} 3833}
@@ -3896,34 +3835,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3896#endif 3835#endif
3897 3836
3898static __always_inline void * 3837static __always_inline void *
3899__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3838__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3900{ 3839{
3901 struct kmem_cache *cachep; 3840 struct kmem_cache *cachep;
3902 3841
3903 cachep = kmem_find_general_cachep(size, flags); 3842 cachep = kmem_find_general_cachep(size, flags);
3904 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3843 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3905 return cachep; 3844 return cachep;
3906 return kmem_cache_alloc_node_trace(size, cachep, flags, node); 3845 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3907} 3846}
3908 3847
3909#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3848#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3910void *__kmalloc_node(size_t size, gfp_t flags, int node) 3849void *__kmalloc_node(size_t size, gfp_t flags, int node)
3911{ 3850{
3912 return __do_kmalloc_node(size, flags, node, 3851 return __do_kmalloc_node(size, flags, node, _RET_IP_);
3913 __builtin_return_address(0));
3914} 3852}
3915EXPORT_SYMBOL(__kmalloc_node); 3853EXPORT_SYMBOL(__kmalloc_node);
3916 3854
3917void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3855void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3918 int node, unsigned long caller) 3856 int node, unsigned long caller)
3919{ 3857{
3920 return __do_kmalloc_node(size, flags, node, (void *)caller); 3858 return __do_kmalloc_node(size, flags, node, caller);
3921} 3859}
3922EXPORT_SYMBOL(__kmalloc_node_track_caller); 3860EXPORT_SYMBOL(__kmalloc_node_track_caller);
3923#else 3861#else
3924void *__kmalloc_node(size_t size, gfp_t flags, int node) 3862void *__kmalloc_node(size_t size, gfp_t flags, int node)
3925{ 3863{
3926 return __do_kmalloc_node(size, flags, node, NULL); 3864 return __do_kmalloc_node(size, flags, node, 0);
3927} 3865}
3928EXPORT_SYMBOL(__kmalloc_node); 3866EXPORT_SYMBOL(__kmalloc_node);
3929#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3867#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
@@ -3936,7 +3874,7 @@ EXPORT_SYMBOL(__kmalloc_node);
3936 * @caller: function caller for debug tracking of the caller 3874 * @caller: function caller for debug tracking of the caller
3937 */ 3875 */
3938static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3876static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3939 void *caller) 3877 unsigned long caller)
3940{ 3878{
3941 struct kmem_cache *cachep; 3879 struct kmem_cache *cachep;
3942 void *ret; 3880 void *ret;
@@ -3949,9 +3887,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3949 cachep = __find_general_cachep(size, flags); 3887 cachep = __find_general_cachep(size, flags);
3950 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3888 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3951 return cachep; 3889 return cachep;
3952 ret = __cache_alloc(cachep, flags, caller); 3890 ret = slab_alloc(cachep, flags, caller);
3953 3891
3954 trace_kmalloc((unsigned long) caller, ret, 3892 trace_kmalloc(caller, ret,
3955 size, cachep->size, flags); 3893 size, cachep->size, flags);
3956 3894
3957 return ret; 3895 return ret;
@@ -3961,20 +3899,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3961#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3899#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3962void *__kmalloc(size_t size, gfp_t flags) 3900void *__kmalloc(size_t size, gfp_t flags)
3963{ 3901{
3964 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3902 return __do_kmalloc(size, flags, _RET_IP_);
3965} 3903}
3966EXPORT_SYMBOL(__kmalloc); 3904EXPORT_SYMBOL(__kmalloc);
3967 3905
3968void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3906void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3969{ 3907{
3970 return __do_kmalloc(size, flags, (void *)caller); 3908 return __do_kmalloc(size, flags, caller);
3971} 3909}
3972EXPORT_SYMBOL(__kmalloc_track_caller); 3910EXPORT_SYMBOL(__kmalloc_track_caller);
3973 3911
3974#else 3912#else
3975void *__kmalloc(size_t size, gfp_t flags) 3913void *__kmalloc(size_t size, gfp_t flags)
3976{ 3914{
3977 return __do_kmalloc(size, flags, NULL); 3915 return __do_kmalloc(size, flags, 0);
3978} 3916}
3979EXPORT_SYMBOL(__kmalloc); 3917EXPORT_SYMBOL(__kmalloc);
3980#endif 3918#endif
@@ -3995,7 +3933,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3995 debug_check_no_locks_freed(objp, cachep->object_size); 3933 debug_check_no_locks_freed(objp, cachep->object_size);
3996 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3934 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3997 debug_check_no_obj_freed(objp, cachep->object_size); 3935 debug_check_no_obj_freed(objp, cachep->object_size);
3998 __cache_free(cachep, objp, __builtin_return_address(0)); 3936 __cache_free(cachep, objp, _RET_IP_);
3999 local_irq_restore(flags); 3937 local_irq_restore(flags);
4000 3938
4001 trace_kmem_cache_free(_RET_IP_, objp); 3939 trace_kmem_cache_free(_RET_IP_, objp);
@@ -4026,7 +3964,7 @@ void kfree(const void *objp)
4026 debug_check_no_locks_freed(objp, c->object_size); 3964 debug_check_no_locks_freed(objp, c->object_size);
4027 3965
4028 debug_check_no_obj_freed(objp, c->object_size); 3966 debug_check_no_obj_freed(objp, c->object_size);
4029 __cache_free(c, (void *)objp, __builtin_return_address(0)); 3967 __cache_free(c, (void *)objp, _RET_IP_);
4030 local_irq_restore(flags); 3968 local_irq_restore(flags);
4031} 3969}
4032EXPORT_SYMBOL(kfree); 3970EXPORT_SYMBOL(kfree);
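Note on the mm/slab.c hunks above: the debug "caller" cookie changes from a void * taken with __builtin_return_address(0) to an unsigned long from _RET_IP_, and the internal entry points __cache_alloc()/__cache_alloc_node() become slab_alloc()/slab_alloc_node(). A condensed sketch of the resulting wrapper pattern, lifted from the hunks rather than a standalone build:

	/* sketch: exported allocators now pass _RET_IP_ straight through */
	void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
	{
		void *ret = slab_alloc(cachep, flags, _RET_IP_);

		trace_kmem_cache_alloc(_RET_IP_, ret,
				       cachep->object_size, cachep->size, flags);
		return ret;
	}

	void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
	{
		/* caller is already an unsigned long, so no (void *) cast is needed */
		return __do_kmalloc(size, flags, caller);
	}

Inside the debug paths the value is only cast once, when it is stored: *dbg_userword(cachep, objp) = (void *)caller.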
diff --git a/mm/slab.h b/mm/slab.h
index db7848caaa25..7deeb449a301 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -25,9 +25,26 @@ extern enum slab_state slab_state;
25 25
26/* The slab cache mutex protects the management structures during changes */ 26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex; 27extern struct mutex slab_mutex;
28
29/* The list of all slab caches on the system */
28extern struct list_head slab_caches; 30extern struct list_head slab_caches;
29 31
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size, 32/* The slab cache that manages slab cache information */
33extern struct kmem_cache *kmem_cache;
34
35/* Functions provided by the slab allocators */
36extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
37
38#ifdef CONFIG_SLUB
39struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *)); 40 size_t align, unsigned long flags, void (*ctor)(void *));
41#else
42static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
43 size_t align, unsigned long flags, void (*ctor)(void *))
44{ return NULL; }
45#endif
46
47
48int __kmem_cache_shutdown(struct kmem_cache *);
32 49
33#endif 50#endif
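Note on the mm/slab.h hunk above: it spells out the new contract between mm/slab_common.c and the individual allocators. slab_common owns the `kmem_cache` cache of struct kmem_cache, and each back end supplies __kmem_cache_create() on an already-populated descriptor plus __kmem_cache_shutdown(); only SLUB provides a real __kmem_cache_alias(). A minimal sketch of what a back end has to implement under this interface (the two helper names are placeholders for illustration, not functions from the patch):

	/* sketch of the per-allocator hooks declared above */
	int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
	{
		/*
		 * name, size, object_size, align and ctor have already been
		 * filled in by kmem_cache_create() in slab_common.c.
		 */
		return init_backend_state(s, flags);	/* placeholder helper */
	}

	int __kmem_cache_shutdown(struct kmem_cache *s)
	{
		/* non-zero means objects are still live and the cache must stay */
		return release_backend_state(s);	/* placeholder helper */
	}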
diff --git a/mm/slab_common.c b/mm/slab_common.c
index aa3ca5bb01b5..069a24e64403 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -22,6 +22,53 @@
22enum slab_state slab_state; 22enum slab_state slab_state;
23LIST_HEAD(slab_caches); 23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex); 24DEFINE_MUTEX(slab_mutex);
25struct kmem_cache *kmem_cache;
26
27#ifdef CONFIG_DEBUG_VM
28static int kmem_cache_sanity_check(const char *name, size_t size)
29{
30 struct kmem_cache *s = NULL;
31
32 if (!name || in_interrupt() || size < sizeof(void *) ||
33 size > KMALLOC_MAX_SIZE) {
34 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
35 return -EINVAL;
36 }
37
38 list_for_each_entry(s, &slab_caches, list) {
39 char tmp;
40 int res;
41
42 /*
43 * This happens when the module gets unloaded and doesn't
44 * destroy its slab cache and no-one else reuses the vmalloc
45 * area of the module. Print a warning.
46 */
47 res = probe_kernel_address(s->name, tmp);
48 if (res) {
49 pr_err("Slab cache with size %d has lost its name\n",
50 s->object_size);
51 continue;
52 }
53
54 if (!strcmp(s->name, name)) {
55 pr_err("%s (%s): Cache name already exists.\n",
56 __func__, name);
57 dump_stack();
58 s = NULL;
59 return -EINVAL;
60 }
61 }
62
63 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
64 return 0;
65}
66#else
67static inline int kmem_cache_sanity_check(const char *name, size_t size)
68{
69 return 0;
70}
71#endif
25 72
26/* 73/*
27 * kmem_cache_create - Create a cache. 74 * kmem_cache_create - Create a cache.
@@ -52,68 +99,95 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
52 unsigned long flags, void (*ctor)(void *)) 99 unsigned long flags, void (*ctor)(void *))
53{ 100{
54 struct kmem_cache *s = NULL; 101 struct kmem_cache *s = NULL;
55 102 int err = 0;
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64 103
65 get_online_cpus(); 104 get_online_cpus();
66 mutex_lock(&slab_mutex); 105 mutex_lock(&slab_mutex);
67 106
68#ifdef CONFIG_DEBUG_VM 107 if (!kmem_cache_sanity_check(name, size) == 0)
69 list_for_each_entry(s, &slab_caches, list) { 108 goto out_locked;
70 char tmp;
71 int res;
72 109
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85 110
86 if (!strcmp(s->name, name)) { 111 s = __kmem_cache_alias(name, size, align, flags, ctor);
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name" 112 if (s)
88 " already exists.\n", 113 goto out_locked;
89 name); 114
90 dump_stack(); 115 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
91 s = NULL; 116 if (s) {
92 goto oops; 117 s->object_size = s->size = size;
118 s->align = align;
119 s->ctor = ctor;
120 s->name = kstrdup(name, GFP_KERNEL);
121 if (!s->name) {
122 kmem_cache_free(kmem_cache, s);
123 err = -ENOMEM;
124 goto out_locked;
93 } 125 }
94 }
95 126
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 127 err = __kmem_cache_create(s, flags);
97#endif 128 if (!err) {
98 129
99 s = __kmem_cache_create(name, size, align, flags, ctor); 130 s->refcount = 1;
131 list_add(&s->list, &slab_caches);
100 132
101#ifdef CONFIG_DEBUG_VM 133 } else {
102oops: 134 kfree(s->name);
103#endif 135 kmem_cache_free(kmem_cache, s);
136 }
137 } else
138 err = -ENOMEM;
139
140out_locked:
104 mutex_unlock(&slab_mutex); 141 mutex_unlock(&slab_mutex);
105 put_online_cpus(); 142 put_online_cpus();
106 143
107#ifdef CONFIG_DEBUG_VM 144 if (err) {
108out: 145
109#endif 146 if (flags & SLAB_PANIC)
110 if (!s && (flags & SLAB_PANIC)) 147 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name); 148 name, err);
149 else {
150 printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
151 name, err);
152 dump_stack();
153 }
154
155 return NULL;
156 }
112 157
113 return s; 158 return s;
114} 159}
115EXPORT_SYMBOL(kmem_cache_create); 160EXPORT_SYMBOL(kmem_cache_create);
116 161
162void kmem_cache_destroy(struct kmem_cache *s)
163{
164 get_online_cpus();
165 mutex_lock(&slab_mutex);
166 s->refcount--;
167 if (!s->refcount) {
168 list_del(&s->list);
169
170 if (!__kmem_cache_shutdown(s)) {
171 mutex_unlock(&slab_mutex);
172 if (s->flags & SLAB_DESTROY_BY_RCU)
173 rcu_barrier();
174
175 kfree(s->name);
176 kmem_cache_free(kmem_cache, s);
177 } else {
178 list_add(&s->list, &slab_caches);
179 mutex_unlock(&slab_mutex);
180 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
181 s->name);
182 dump_stack();
183 }
184 } else {
185 mutex_unlock(&slab_mutex);
186 }
187 put_online_cpus();
188}
189EXPORT_SYMBOL(kmem_cache_destroy);
190
117int slab_is_available(void) 191int slab_is_available(void)
118{ 192{
119 return slab_state >= UP; 193 return slab_state >= UP;
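Note on the mm/slab_common.c hunks above: with the CONFIG_DEBUG_VM checks folded into kmem_cache_sanity_check() and the descriptor allocation moved here, the common create path becomes: sanity-check, try __kmem_cache_alias(), otherwise zalloc a descriptor from kmem_cache, fill in the generic fields, and hand it to the allocator's __kmem_cache_create(). Stripped of locking, cpu hotplug and error reporting, the hunk boils down to roughly:

	/* condensed control flow of the new common kmem_cache_create() */
	s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (!s) {
		s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
		s->object_size = s->size = size;
		s->align = align;
		s->ctor = ctor;
		s->name = kstrdup(name, GFP_KERNEL);
		if (!__kmem_cache_create(s, flags)) {
			s->refcount = 1;
			list_add(&s->list, &slab_caches);
		}
	}

The new kmem_cache_destroy() is the mirror image: drop the refcount, call __kmem_cache_shutdown(), and only then free the name and the descriptor, after an rcu_barrier() for SLAB_DESTROY_BY_RCU caches.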
diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933a..1e921c5e9576 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
194 void *page; 194 void *page;
195 195
196#ifdef CONFIG_NUMA 196#ifdef CONFIG_NUMA
197 if (node != -1) 197 if (node != NUMA_NO_NODE)
198 page = alloc_pages_exact_node(node, gfp, order); 198 page = alloc_pages_exact_node(node, gfp, order);
199 else 199 else
200#endif 200#endif
@@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
290 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
291 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
292 */ 292 */
293 if (node != -1 && page_to_nid(sp) != node) 293 if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
294 continue; 294 continue;
295#endif 295#endif
296 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -425,10 +425,11 @@ out:
425 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 425 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
426 */ 426 */
427 427
428void *__kmalloc_node(size_t size, gfp_t gfp, int node) 428static __always_inline void *
429__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
429{ 430{
430 unsigned int *m; 431 unsigned int *m;
431 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 432 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
432 void *ret; 433 void *ret;
433 434
434 gfp &= gfp_allowed_mask; 435 gfp &= gfp_allowed_mask;
@@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
446 *m = size; 447 *m = size;
447 ret = (void *)m + align; 448 ret = (void *)m + align;
448 449
449 trace_kmalloc_node(_RET_IP_, ret, 450 trace_kmalloc_node(caller, ret,
450 size, size + align, gfp, node); 451 size, size + align, gfp, node);
451 } else { 452 } else {
452 unsigned int order = get_order(size); 453 unsigned int order = get_order(size);
@@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
460 page->private = size; 461 page->private = size;
461 } 462 }
462 463
463 trace_kmalloc_node(_RET_IP_, ret, 464 trace_kmalloc_node(caller, ret,
464 size, PAGE_SIZE << order, gfp, node); 465 size, PAGE_SIZE << order, gfp, node);
465 } 466 }
466 467
467 kmemleak_alloc(ret, size, 1, gfp); 468 kmemleak_alloc(ret, size, 1, gfp);
468 return ret; 469 return ret;
469} 470}
471
472void *__kmalloc_node(size_t size, gfp_t gfp, int node)
473{
474 return __do_kmalloc_node(size, gfp, node, _RET_IP_);
475}
470EXPORT_SYMBOL(__kmalloc_node); 476EXPORT_SYMBOL(__kmalloc_node);
471 477
478#ifdef CONFIG_TRACING
479void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
480{
481 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
482}
483
484#ifdef CONFIG_NUMA
485void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
486 int node, unsigned long caller)
487{
488 return __do_kmalloc_node(size, gfp, node, caller);
489}
490#endif
491#endif
492
472void kfree(const void *block) 493void kfree(const void *block)
473{ 494{
474 struct page *sp; 495 struct page *sp;
@@ -481,7 +502,7 @@ void kfree(const void *block)
481 502
482 sp = virt_to_page(block); 503 sp = virt_to_page(block);
483 if (PageSlab(sp)) { 504 if (PageSlab(sp)) {
484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 505 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
485 unsigned int *m = (unsigned int *)(block - align); 506 unsigned int *m = (unsigned int *)(block - align);
486 slob_free(m, *m + align); 507 slob_free(m, *m + align);
487 } else 508 } else
@@ -500,7 +521,7 @@ size_t ksize(const void *block)
500 521
501 sp = virt_to_page(block); 522 sp = virt_to_page(block);
502 if (PageSlab(sp)) { 523 if (PageSlab(sp)) {
503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 524 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
504 unsigned int *m = (unsigned int *)(block - align); 525 unsigned int *m = (unsigned int *)(block - align);
505 return SLOB_UNITS(*m) * SLOB_UNIT; 526 return SLOB_UNITS(*m) * SLOB_UNIT;
506 } else 527 } else
@@ -508,44 +529,24 @@ size_t ksize(const void *block)
508} 529}
509EXPORT_SYMBOL(ksize); 530EXPORT_SYMBOL(ksize);
510 531
511struct kmem_cache *__kmem_cache_create(const char *name, size_t size, 532int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
512 size_t align, unsigned long flags, void (*ctor)(void *))
513{ 533{
514 struct kmem_cache *c; 534 size_t align = c->size;
515
516 c = slob_alloc(sizeof(struct kmem_cache),
517 GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
518 535
519 if (c) { 536 if (flags & SLAB_DESTROY_BY_RCU) {
520 c->name = name; 537 /* leave room for rcu footer at the end of object */
521 c->size = size; 538 c->size += sizeof(struct slob_rcu);
522 if (flags & SLAB_DESTROY_BY_RCU) {
523 /* leave room for rcu footer at the end of object */
524 c->size += sizeof(struct slob_rcu);
525 }
526 c->flags = flags;
527 c->ctor = ctor;
528 /* ignore alignment unless it's forced */
529 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
530 if (c->align < ARCH_SLAB_MINALIGN)
531 c->align = ARCH_SLAB_MINALIGN;
532 if (c->align < align)
533 c->align = align;
534
535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 } 539 }
538 return c; 540 c->flags = flags;
539} 541 /* ignore alignment unless it's forced */
542 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
543 if (c->align < ARCH_SLAB_MINALIGN)
544 c->align = ARCH_SLAB_MINALIGN;
545 if (c->align < align)
546 c->align = align;
540 547
541void kmem_cache_destroy(struct kmem_cache *c) 548 return 0;
542{
543 kmemleak_free(c);
544 if (c->flags & SLAB_DESTROY_BY_RCU)
545 rcu_barrier();
546 slob_free(c, sizeof(struct kmem_cache));
547} 549}
548EXPORT_SYMBOL(kmem_cache_destroy);
549 550
550void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) 551void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
551{ 552{
@@ -613,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
613} 614}
614EXPORT_SYMBOL(kmem_cache_size); 615EXPORT_SYMBOL(kmem_cache_size);
615 616
617int __kmem_cache_shutdown(struct kmem_cache *c)
618{
619 /* No way to check for remaining objects */
620 return 0;
621}
622
616int kmem_cache_shrink(struct kmem_cache *d) 623int kmem_cache_shrink(struct kmem_cache *d)
617{ 624{
618 return 0; 625 return 0;
619} 626}
620EXPORT_SYMBOL(kmem_cache_shrink); 627EXPORT_SYMBOL(kmem_cache_shrink);
621 628
629struct kmem_cache kmem_cache_boot = {
630 .name = "kmem_cache",
631 .size = sizeof(struct kmem_cache),
632 .flags = SLAB_PANIC,
633 .align = ARCH_KMALLOC_MINALIGN,
634};
635
622void __init kmem_cache_init(void) 636void __init kmem_cache_init(void)
623{ 637{
638 kmem_cache = &kmem_cache_boot;
624 slab_state = UP; 639 slab_state = UP;
625} 640}
626 641
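Note on the mm/slob.c hunks above: SLOB follows the same pattern. NUMA_NO_NODE replaces the bare -1, a statically allocated kmem_cache_boot is installed at init so kmem_cache_zalloc(kmem_cache, ...) works before any dynamic caches exist, and the kmalloc paths share one __do_kmalloc_node() helper that carries an explicit caller. The helper pattern, as in the hunks:

	/* sketch: every SLOB kmalloc entry point funnels into one helper */
	void *__kmalloc_node(size_t size, gfp_t gfp, int node)
	{
		return __do_kmalloc_node(size, gfp, node, _RET_IP_);
	}

	#ifdef CONFIG_TRACING
	void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
	{
		return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
	}
	#endif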
diff --git a/mm/slub.c b/mm/slub.c
index 2fdd96f9e998..a0d698467f70 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *);
210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
212 { return 0; } 212 { return 0; }
213static inline void sysfs_slab_remove(struct kmem_cache *s) 213static inline void sysfs_slab_remove(struct kmem_cache *s) { }
214{
215 kfree(s->name);
216 kfree(s);
217}
218 214
219#endif 215#endif
220 216
@@ -568,6 +564,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
568 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 564 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
569 printk(KERN_ERR "----------------------------------------" 565 printk(KERN_ERR "----------------------------------------"
570 "-------------------------------------\n\n"); 566 "-------------------------------------\n\n");
567
568 add_taint(TAINT_BAD_PAGE);
571} 569}
572 570
573static void slab_fix(struct kmem_cache *s, char *fmt, ...) 571static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -624,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
624 print_trailer(s, page, object); 622 print_trailer(s, page, object);
625} 623}
626 624
627static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 625static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
628{ 626{
629 va_list args; 627 va_list args;
630 char buf[100]; 628 char buf[100];
@@ -1069,13 +1067,13 @@ bad:
1069 return 0; 1067 return 0;
1070} 1068}
1071 1069
1072static noinline int free_debug_processing(struct kmem_cache *s, 1070static noinline struct kmem_cache_node *free_debug_processing(
1073 struct page *page, void *object, unsigned long addr) 1071 struct kmem_cache *s, struct page *page, void *object,
1072 unsigned long addr, unsigned long *flags)
1074{ 1073{
1075 unsigned long flags; 1074 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1076 int rc = 0;
1077 1075
1078 local_irq_save(flags); 1076 spin_lock_irqsave(&n->list_lock, *flags);
1079 slab_lock(page); 1077 slab_lock(page);
1080 1078
1081 if (!check_slab(s, page)) 1079 if (!check_slab(s, page))
@@ -1113,15 +1111,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1113 set_track(s, object, TRACK_FREE, addr); 1111 set_track(s, object, TRACK_FREE, addr);
1114 trace(s, page, object, 0); 1112 trace(s, page, object, 0);
1115 init_object(s, object, SLUB_RED_INACTIVE); 1113 init_object(s, object, SLUB_RED_INACTIVE);
1116 rc = 1;
1117out: 1114out:
1118 slab_unlock(page); 1115 slab_unlock(page);
1119 local_irq_restore(flags); 1116 /*
1120 return rc; 1117 * Keep node_lock to preserve integrity
1118 * until the object is actually freed
1119 */
1120 return n;
1121 1121
1122fail: 1122fail:
1123 slab_unlock(page);
1124 spin_unlock_irqrestore(&n->list_lock, *flags);
1123 slab_fix(s, "Object at 0x%p not freed", object); 1125 slab_fix(s, "Object at 0x%p not freed", object);
1124 goto out; 1126 return NULL;
1125} 1127}
1126 1128
1127static int __init setup_slub_debug(char *str) 1129static int __init setup_slub_debug(char *str)
@@ -1214,8 +1216,9 @@ static inline void setup_object_debug(struct kmem_cache *s,
1214static inline int alloc_debug_processing(struct kmem_cache *s, 1216static inline int alloc_debug_processing(struct kmem_cache *s,
1215 struct page *page, void *object, unsigned long addr) { return 0; } 1217 struct page *page, void *object, unsigned long addr) { return 0; }
1216 1218
1217static inline int free_debug_processing(struct kmem_cache *s, 1219static inline struct kmem_cache_node *free_debug_processing(
1218 struct page *page, void *object, unsigned long addr) { return 0; } 1220 struct kmem_cache *s, struct page *page, void *object,
1221 unsigned long addr, unsigned long *flags) { return NULL; }
1219 1222
1220static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1223static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1221 { return 1; } 1224 { return 1; }
@@ -1714,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n,
1714 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1717 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1715} 1718}
1716 1719
1717void init_kmem_cache_cpus(struct kmem_cache *s) 1720static void init_kmem_cache_cpus(struct kmem_cache *s)
1718{ 1721{
1719 int cpu; 1722 int cpu;
1720 1723
@@ -1939,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s)
1939 * If we did not find a slot then simply move all the partials to the 1942 * If we did not find a slot then simply move all the partials to the
1940 * per node partial list. 1943 * per node partial list.
1941 */ 1944 */
1942int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1945static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1943{ 1946{
1944 struct page *oldpage; 1947 struct page *oldpage;
1945 int pages; 1948 int pages;
@@ -1962,6 +1965,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1962 local_irq_save(flags); 1965 local_irq_save(flags);
1963 unfreeze_partials(s); 1966 unfreeze_partials(s);
1964 local_irq_restore(flags); 1967 local_irq_restore(flags);
1968 oldpage = NULL;
1965 pobjects = 0; 1969 pobjects = 0;
1966 pages = 0; 1970 pages = 0;
1967 stat(s, CPU_PARTIAL_DRAIN); 1971 stat(s, CPU_PARTIAL_DRAIN);
@@ -2310,7 +2314,7 @@ new_slab:
2310 * 2314 *
2311 * Otherwise we can simply pick the next object from the lockless free list. 2315 * Otherwise we can simply pick the next object from the lockless free list.
2312 */ 2316 */
2313static __always_inline void *slab_alloc(struct kmem_cache *s, 2317static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2314 gfp_t gfpflags, int node, unsigned long addr) 2318 gfp_t gfpflags, int node, unsigned long addr)
2315{ 2319{
2316 void **object; 2320 void **object;
@@ -2380,9 +2384,15 @@ redo:
2380 return object; 2384 return object;
2381} 2385}
2382 2386
2387static __always_inline void *slab_alloc(struct kmem_cache *s,
2388 gfp_t gfpflags, unsigned long addr)
2389{
2390 return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2391}
2392
2383void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2393void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2384{ 2394{
2385 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2395 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2386 2396
2387 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); 2397 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2388 2398
@@ -2393,7 +2403,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2393#ifdef CONFIG_TRACING 2403#ifdef CONFIG_TRACING
2394void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2404void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2395{ 2405{
2396 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2406 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2397 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2407 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2398 return ret; 2408 return ret;
2399} 2409}
@@ -2411,7 +2421,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
2411#ifdef CONFIG_NUMA 2421#ifdef CONFIG_NUMA
2412void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2422void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2413{ 2423{
2414 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2424 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2415 2425
2416 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2426 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2417 s->object_size, s->size, gfpflags, node); 2427 s->object_size, s->size, gfpflags, node);
@@ -2425,7 +2435,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2425 gfp_t gfpflags, 2435 gfp_t gfpflags,
2426 int node, size_t size) 2436 int node, size_t size)
2427{ 2437{
2428 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2438 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2429 2439
2430 trace_kmalloc_node(_RET_IP_, ret, 2440 trace_kmalloc_node(_RET_IP_, ret,
2431 size, s->size, gfpflags, node); 2441 size, s->size, gfpflags, node);
@@ -2457,7 +2467,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2457 2467
2458 stat(s, FREE_SLOWPATH); 2468 stat(s, FREE_SLOWPATH);
2459 2469
2460 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2470 if (kmem_cache_debug(s) &&
2471 !(n = free_debug_processing(s, page, x, addr, &flags)))
2461 return; 2472 return;
2462 2473
2463 do { 2474 do {
@@ -2612,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
2612 2623
2613 page = virt_to_head_page(x); 2624 page = virt_to_head_page(x);
2614 2625
2626 if (kmem_cache_debug(s) && page->slab != s) {
2627 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2628 " is from %s\n", page->slab->name, s->name);
2629 WARN_ON_ONCE(1);
2630 return;
2631 }
2632
2615 slab_free(s, page, x, _RET_IP_); 2633 slab_free(s, page, x, _RET_IP_);
2616 2634
2617 trace_kmem_cache_free(_RET_IP_, x); 2635 trace_kmem_cache_free(_RET_IP_, x);
@@ -3026,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3026 3044
3027} 3045}
3028 3046
3029static int kmem_cache_open(struct kmem_cache *s, 3047static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3030 const char *name, size_t size,
3031 size_t align, unsigned long flags,
3032 void (*ctor)(void *))
3033{ 3048{
3034 memset(s, 0, kmem_size); 3049 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3035 s->name = name;
3036 s->ctor = ctor;
3037 s->object_size = size;
3038 s->align = align;
3039 s->flags = kmem_cache_flags(size, flags, name, ctor);
3040 s->reserved = 0; 3050 s->reserved = 0;
3041 3051
3042 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3052 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
@@ -3098,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s,
3098 else 3108 else
3099 s->cpu_partial = 30; 3109 s->cpu_partial = 30;
3100 3110
3101 s->refcount = 1;
3102#ifdef CONFIG_NUMA 3111#ifdef CONFIG_NUMA
3103 s->remote_node_defrag_ratio = 1000; 3112 s->remote_node_defrag_ratio = 1000;
3104#endif 3113#endif
@@ -3106,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s,
3106 goto error; 3115 goto error;
3107 3116
3108 if (alloc_kmem_cache_cpus(s)) 3117 if (alloc_kmem_cache_cpus(s))
3109 return 1; 3118 return 0;
3110 3119
3111 free_kmem_cache_nodes(s); 3120 free_kmem_cache_nodes(s);
3112error: 3121error:
3113 if (flags & SLAB_PANIC) 3122 if (flags & SLAB_PANIC)
3114 panic("Cannot create slab %s size=%lu realsize=%u " 3123 panic("Cannot create slab %s size=%lu realsize=%u "
3115 "order=%u offset=%u flags=%lx\n", 3124 "order=%u offset=%u flags=%lx\n",
3116 s->name, (unsigned long)size, s->size, oo_order(s->oo), 3125 s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
3117 s->offset, flags); 3126 s->offset, flags);
3118 return 0; 3127 return -EINVAL;
3119} 3128}
3120 3129
3121/* 3130/*
@@ -3137,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3137 sizeof(long), GFP_ATOMIC); 3146 sizeof(long), GFP_ATOMIC);
3138 if (!map) 3147 if (!map)
3139 return; 3148 return;
3140 slab_err(s, page, "%s", text); 3149 slab_err(s, page, text, s->name);
3141 slab_lock(page); 3150 slab_lock(page);
3142 3151
3143 get_map(s, page, map); 3152 get_map(s, page, map);
@@ -3169,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3169 discard_slab(s, page); 3178 discard_slab(s, page);
3170 } else { 3179 } else {
3171 list_slab_objects(s, page, 3180 list_slab_objects(s, page,
3172 "Objects remaining on kmem_cache_close()"); 3181 "Objects remaining in %s on kmem_cache_close()");
3173 } 3182 }
3174 } 3183 }
3175} 3184}
@@ -3182,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3182 int node; 3191 int node;
3183 3192
3184 flush_all(s); 3193 flush_all(s);
3185 free_percpu(s->cpu_slab);
3186 /* Attempt to free all objects */ 3194 /* Attempt to free all objects */
3187 for_each_node_state(node, N_NORMAL_MEMORY) { 3195 for_each_node_state(node, N_NORMAL_MEMORY) {
3188 struct kmem_cache_node *n = get_node(s, node); 3196 struct kmem_cache_node *n = get_node(s, node);
@@ -3191,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3191 if (n->nr_partial || slabs_node(s, node)) 3199 if (n->nr_partial || slabs_node(s, node))
3192 return 1; 3200 return 1;
3193 } 3201 }
3202 free_percpu(s->cpu_slab);
3194 free_kmem_cache_nodes(s); 3203 free_kmem_cache_nodes(s);
3195 return 0; 3204 return 0;
3196} 3205}
3197 3206
3198/* 3207int __kmem_cache_shutdown(struct kmem_cache *s)
3199 * Close a cache and release the kmem_cache structure
3200 * (must be used for caches created using kmem_cache_create)
3201 */
3202void kmem_cache_destroy(struct kmem_cache *s)
3203{ 3208{
3204 mutex_lock(&slab_mutex); 3209 int rc = kmem_cache_close(s);
3205 s->refcount--; 3210
3206 if (!s->refcount) { 3211 if (!rc)
3207 list_del(&s->list);
3208 mutex_unlock(&slab_mutex);
3209 if (kmem_cache_close(s)) {
3210 printk(KERN_ERR "SLUB %s: %s called for cache that "
3211 "still has objects.\n", s->name, __func__);
3212 dump_stack();
3213 }
3214 if (s->flags & SLAB_DESTROY_BY_RCU)
3215 rcu_barrier();
3216 sysfs_slab_remove(s); 3212 sysfs_slab_remove(s);
3217 } else 3213
3218 mutex_unlock(&slab_mutex); 3214 return rc;
3219} 3215}
3220EXPORT_SYMBOL(kmem_cache_destroy);
3221 3216
3222/******************************************************************** 3217/********************************************************************
3223 * Kmalloc subsystem 3218 * Kmalloc subsystem
@@ -3226,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
3226struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3221struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
3227EXPORT_SYMBOL(kmalloc_caches); 3222EXPORT_SYMBOL(kmalloc_caches);
3228 3223
3229static struct kmem_cache *kmem_cache;
3230
3231#ifdef CONFIG_ZONE_DMA 3224#ifdef CONFIG_ZONE_DMA
3232static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3225static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
3233#endif 3226#endif
@@ -3273,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3273{ 3266{
3274 struct kmem_cache *s; 3267 struct kmem_cache *s;
3275 3268
3276 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3269 s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3270
3271 s->name = name;
3272 s->size = s->object_size = size;
3273 s->align = ARCH_KMALLOC_MINALIGN;
3277 3274
3278 /* 3275 /*
3279 * This function is called with IRQs disabled during early-boot on 3276 * This function is called with IRQs disabled during early-boot on
3280 * single CPU so there's no need to take slab_mutex here. 3277 * single CPU so there's no need to take slab_mutex here.
3281 */ 3278 */
3282 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3279 if (kmem_cache_open(s, flags))
3283 flags, NULL))
3284 goto panic; 3280 goto panic;
3285 3281
3286 list_add(&s->list, &slab_caches); 3282 list_add(&s->list, &slab_caches);
@@ -3362,7 +3358,7 @@ void *__kmalloc(size_t size, gfp_t flags)
3362 if (unlikely(ZERO_OR_NULL_PTR(s))) 3358 if (unlikely(ZERO_OR_NULL_PTR(s)))
3363 return s; 3359 return s;
3364 3360
3365 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); 3361 ret = slab_alloc(s, flags, _RET_IP_);
3366 3362
3367 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3363 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3368 3364
@@ -3405,7 +3401,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3405 if (unlikely(ZERO_OR_NULL_PTR(s))) 3401 if (unlikely(ZERO_OR_NULL_PTR(s)))
3406 return s; 3402 return s;
3407 3403
3408 ret = slab_alloc(s, flags, node, _RET_IP_); 3404 ret = slab_alloc_node(s, flags, node, _RET_IP_);
3409 3405
3410 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3406 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3411 3407
@@ -3482,7 +3478,7 @@ void kfree(const void *x)
3482 if (unlikely(!PageSlab(page))) { 3478 if (unlikely(!PageSlab(page))) {
3483 BUG_ON(!PageCompound(page)); 3479 BUG_ON(!PageCompound(page));
3484 kmemleak_free(x); 3480 kmemleak_free(x);
3485 put_page(page); 3481 __free_pages(page, compound_order(page));
3486 return; 3482 return;
3487 } 3483 }
3488 slab_free(page->slab, page, object, _RET_IP_); 3484 slab_free(page->slab, page, object, _RET_IP_);
@@ -3719,12 +3715,12 @@ void __init kmem_cache_init(void)
3719 slub_max_order = 0; 3715 slub_max_order = 0;
3720 3716
3721 kmem_size = offsetof(struct kmem_cache, node) + 3717 kmem_size = offsetof(struct kmem_cache, node) +
3722 nr_node_ids * sizeof(struct kmem_cache_node *); 3718 nr_node_ids * sizeof(struct kmem_cache_node *);
3723 3719
3724 /* Allocate two kmem_caches from the page allocator */ 3720 /* Allocate two kmem_caches from the page allocator */
3725 kmalloc_size = ALIGN(kmem_size, cache_line_size()); 3721 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3726 order = get_order(2 * kmalloc_size); 3722 order = get_order(2 * kmalloc_size);
3727 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); 3723 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
3728 3724
3729 /* 3725 /*
3730 * Must first have the slab cache available for the allocations of the 3726 * Must first have the slab cache available for the allocations of the
@@ -3733,9 +3729,10 @@ void __init kmem_cache_init(void)
3733 */ 3729 */
3734 kmem_cache_node = (void *)kmem_cache + kmalloc_size; 3730 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3735 3731
3736 kmem_cache_open(kmem_cache_node, "kmem_cache_node", 3732 kmem_cache_node->name = "kmem_cache_node";
3737 sizeof(struct kmem_cache_node), 3733 kmem_cache_node->size = kmem_cache_node->object_size =
3738 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3734 sizeof(struct kmem_cache_node);
3735 kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3739 3736
3740 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3737 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3741 3738
@@ -3743,8 +3740,10 @@ void __init kmem_cache_init(void)
3743 slab_state = PARTIAL; 3740 slab_state = PARTIAL;
3744 3741
3745 temp_kmem_cache = kmem_cache; 3742 temp_kmem_cache = kmem_cache;
3746 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, 3743 kmem_cache->name = "kmem_cache";
3747 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3744 kmem_cache->size = kmem_cache->object_size = kmem_size;
3745 kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3746
3748 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3747 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3749 memcpy(kmem_cache, temp_kmem_cache, kmem_size); 3748 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3750 3749
@@ -3933,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size,
3933 return NULL; 3932 return NULL;
3934} 3933}
3935 3934
3936struct kmem_cache *__kmem_cache_create(const char *name, size_t size, 3935struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
3937 size_t align, unsigned long flags, void (*ctor)(void *)) 3936 size_t align, unsigned long flags, void (*ctor)(void *))
3938{ 3937{
3939 struct kmem_cache *s; 3938 struct kmem_cache *s;
3940 char *n;
3941 3939
3942 s = find_mergeable(size, align, flags, name, ctor); 3940 s = find_mergeable(size, align, flags, name, ctor);
3943 if (s) { 3941 if (s) {
@@ -3951,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3951 3949
3952 if (sysfs_slab_alias(s, name)) { 3950 if (sysfs_slab_alias(s, name)) {
3953 s->refcount--; 3951 s->refcount--;
3954 return NULL; 3952 s = NULL;
3955 } 3953 }
3956 return s;
3957 } 3954 }
3958 3955
3959 n = kstrdup(name, GFP_KERNEL); 3956 return s;
3960 if (!n) 3957}
3961 return NULL;
3962 3958
3963 s = kmalloc(kmem_size, GFP_KERNEL); 3959int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3964 if (s) { 3960{
3965 if (kmem_cache_open(s, n, 3961 int err;
3966 size, align, flags, ctor)) {
3967 int r;
3968 3962
3969 list_add(&s->list, &slab_caches); 3963 err = kmem_cache_open(s, flags);
3970 mutex_unlock(&slab_mutex); 3964 if (err)
3971 r = sysfs_slab_add(s); 3965 return err;
3972 mutex_lock(&slab_mutex);
3973 3966
3974 if (!r) 3967 mutex_unlock(&slab_mutex);
3975 return s; 3968 err = sysfs_slab_add(s);
3969 mutex_lock(&slab_mutex);
3976 3970
3977 list_del(&s->list); 3971 if (err)
3978 kmem_cache_close(s); 3972 kmem_cache_close(s);
3979 } 3973
3980 kfree(s); 3974 return err;
3981 }
3982 kfree(n);
3983 return NULL;
3984} 3975}
3985 3976
3986#ifdef CONFIG_SMP 3977#ifdef CONFIG_SMP
@@ -4033,7 +4024,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
4033 if (unlikely(ZERO_OR_NULL_PTR(s))) 4024 if (unlikely(ZERO_OR_NULL_PTR(s)))
4034 return s; 4025 return s;
4035 4026
4036 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 4027 ret = slab_alloc(s, gfpflags, caller);
4037 4028
4038 /* Honor the call site pointer we received. */ 4029 /* Honor the call site pointer we received. */
4039 trace_kmalloc(caller, ret, size, s->size, gfpflags); 4030 trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4063,7 +4054,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4063 if (unlikely(ZERO_OR_NULL_PTR(s))) 4054 if (unlikely(ZERO_OR_NULL_PTR(s)))
4064 return s; 4055 return s;
4065 4056
4066 ret = slab_alloc(s, gfpflags, node, caller); 4057 ret = slab_alloc_node(s, gfpflags, node, caller);
4067 4058
4068 /* Honor the call site pointer we received. */ 4059 /* Honor the call site pointer we received. */
4069 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 4060 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -5210,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5210 return err; 5201 return err;
5211} 5202}
5212 5203
5213static void kmem_cache_release(struct kobject *kobj)
5214{
5215 struct kmem_cache *s = to_slab(kobj);
5216
5217 kfree(s->name);
5218 kfree(s);
5219}
5220
5221static const struct sysfs_ops slab_sysfs_ops = { 5204static const struct sysfs_ops slab_sysfs_ops = {
5222 .show = slab_attr_show, 5205 .show = slab_attr_show,
5223 .store = slab_attr_store, 5206 .store = slab_attr_store,
@@ -5225,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = {
5225 5208
5226static struct kobj_type slab_ktype = { 5209static struct kobj_type slab_ktype = {
5227 .sysfs_ops = &slab_sysfs_ops, 5210 .sysfs_ops = &slab_sysfs_ops,
5228 .release = kmem_cache_release
5229}; 5211};
5230 5212
5231static int uevent_filter(struct kset *kset, struct kobject *kobj) 5213static int uevent_filter(struct kset *kset, struct kobject *kobj)
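Note on the mm/slub.c hunks above: cache lifetime is no longer SLUB's business. kmem_cache_destroy(), the sysfs kmem_cache_release() hook and the file-private kmem_cache pointer are removed, kmem_cache_open() takes only flags because the generic fields are pre-filled by slab_common, and the shutdown hook reduces to closing the cache and dropping its sysfs entry while slab_common frees the name and descriptor. Condensed from the hunk:

	/* sketch: SLUB's shutdown hook after the rework */
	int __kmem_cache_shutdown(struct kmem_cache *s)
	{
		int rc = kmem_cache_close(s);	/* non-zero if objects remain */

		if (!rc)
			sysfs_slab_remove(s);
		return rc;
	}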
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
446} 446}
447EXPORT_SYMBOL(mark_page_accessed); 447EXPORT_SYMBOL(mark_page_accessed);
448 448
449/*
450 * Order of operations is important: flush the pagevec when it's already
451 * full, not when adding the last page, to make sure that last page is
452 * not added to the LRU directly when passed to this function. Because
453 * mark_page_accessed() (called after this when writing) only activates
454 * pages that are on the LRU, linear writes in subpage chunks would see
455 * every PAGEVEC_SIZE page activated, which is unexpected.
456 */
449void __lru_cache_add(struct page *page, enum lru_list lru) 457void __lru_cache_add(struct page *page, enum lru_list lru)
450{ 458{
451 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 459 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
452 460
453 page_cache_get(page); 461 page_cache_get(page);
454 if (!pagevec_add(pvec, page)) 462 if (!pagevec_space(pvec))
455 __pagevec_lru_add(pvec, lru); 463 __pagevec_lru_add(pvec, lru);
464 pagevec_add(pvec, page);
456 put_cpu_var(lru_add_pvecs); 465 put_cpu_var(lru_add_pvecs);
457} 466}
458EXPORT_SYMBOL(__lru_cache_add); 467EXPORT_SYMBOL(__lru_cache_add);
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
742 751
743 SetPageLRU(page_tail); 752 SetPageLRU(page_tail);
744 753
745 if (page_evictable(page_tail, NULL)) { 754 if (page_evictable(page_tail)) {
746 if (PageActive(page)) { 755 if (PageActive(page)) {
747 SetPageActive(page_tail); 756 SetPageActive(page_tail);
748 active = 1; 757 active = 1;
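Note on the mm/swap.c hunk above: the __lru_cache_add() change looks cosmetic but is not. The per-cpu pagevec is drained when it is already full, before the new page is stashed, so the page being added in this call never goes straight onto the LRU. As the new comment explains, mark_page_accessed() (called right after this on the write path) only activates pages already on the LRU, so the old ordering activated every PAGEVEC_SIZE-th page during linear sub-page writes. The resulting code:

	void __lru_cache_add(struct page *page, enum lru_list lru)
	{
		struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

		page_cache_get(page);
		if (!pagevec_space(pvec))	/* drain first, while the new page is not in it */
			__pagevec_lru_add(pvec, lru);
		pagevec_add(pvec, page);	/* then stash the page for a later drain */
		put_cpu_var(lru_add_pvecs);
	}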
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e254c768fc..71cd288b2001 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1483,7 +1483,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1483 struct file *swap_file, *victim; 1483 struct file *swap_file, *victim;
1484 struct address_space *mapping; 1484 struct address_space *mapping;
1485 struct inode *inode; 1485 struct inode *inode;
1486 char *pathname; 1486 struct filename *pathname;
1487 int oom_score_adj; 1487 int oom_score_adj;
1488 int i, type, prev; 1488 int i, type, prev;
1489 int err; 1489 int err;
@@ -1498,8 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1498 if (IS_ERR(pathname)) 1498 if (IS_ERR(pathname))
1499 goto out; 1499 goto out;
1500 1500
1501 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1501 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1502 putname(pathname);
1503 err = PTR_ERR(victim); 1502 err = PTR_ERR(victim);
1504 if (IS_ERR(victim)) 1503 if (IS_ERR(victim))
1505 goto out; 1504 goto out;
@@ -1936,7 +1935,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
1936SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1935SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1937{ 1936{
1938 struct swap_info_struct *p; 1937 struct swap_info_struct *p;
1939 char *name; 1938 struct filename *name;
1940 struct file *swap_file = NULL; 1939 struct file *swap_file = NULL;
1941 struct address_space *mapping; 1940 struct address_space *mapping;
1942 int i; 1941 int i;
@@ -1967,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1967 name = NULL; 1966 name = NULL;
1968 goto bad_swap; 1967 goto bad_swap;
1969 } 1968 }
1970 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 1969 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
1971 if (IS_ERR(swap_file)) { 1970 if (IS_ERR(swap_file)) {
1972 error = PTR_ERR(swap_file); 1971 error = PTR_ERR(swap_file);
1973 swap_file = NULL; 1972 swap_file = NULL;
@@ -2053,7 +2052,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2053 2052
2054 printk(KERN_INFO "Adding %uk swap on %s. " 2053 printk(KERN_INFO "Adding %uk swap on %s. "
2055 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2054 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2056 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2055 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2057 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2056 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2058 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2057 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2059 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2058 (p->flags & SWP_DISCARDABLE) ? "D" : "",
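Note on the mm/swapfile.c hunks above: sys_swapon()/sys_swapoff() now keep a struct filename instead of a char *, open it with file_open_name(), and use name->name wherever the string itself is printed. A rough sketch of the open path, assuming the name is obtained with getname() earlier in the syscall (that part is outside the hunks shown):

	/* sketch: opening the swapoff victim through struct filename */
	struct filename *pathname;	/* assumed: returned by getname(specialfile) earlier */
	struct file *victim;

	victim = file_open_name(pathname, O_RDWR | O_LARGEFILE, 0);
	if (IS_ERR(victim))
		goto out;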
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
107 107
108 cancel_dirty_page(page, PAGE_CACHE_SIZE); 108 cancel_dirty_page(page, PAGE_CACHE_SIZE);
109 109
110 clear_page_mlock(page);
111 ClearPageMappedToDisk(page); 110 ClearPageMappedToDisk(page);
112 delete_from_page_cache(page); 111 delete_from_page_cache(page);
113 return 0; 112 return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
132 if (page_has_private(page) && !try_to_release_page(page, 0)) 131 if (page_has_private(page) && !try_to_release_page(page, 0))
133 return 0; 132 return 0;
134 133
135 clear_page_mlock(page);
136 ret = remove_mapping(mapping, page); 134 ret = remove_mapping(mapping, page);
137 135
138 return ret; 136 return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
398 if (PageDirty(page)) 396 if (PageDirty(page))
399 goto failed; 397 goto failed;
400 398
401 clear_page_mlock(page);
402 BUG_ON(page_has_private(page)); 399 BUG_ON(page_has_private(page));
403 __delete_from_page_cache(page); 400 __delete_from_page_cache(page);
404 spin_unlock_irq(&mapping->tree_lock); 401 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/util.c b/mm/util.c
index 3a5278c08d76..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len)
105} 105}
106EXPORT_SYMBOL(memdup_user); 106EXPORT_SYMBOL(memdup_user);
107 107
108static __always_inline void *__do_krealloc(const void *p, size_t new_size,
109 gfp_t flags)
110{
111 void *ret;
112 size_t ks = 0;
113
114 if (p)
115 ks = ksize(p);
116
117 if (ks >= new_size)
118 return (void *)p;
119
120 ret = kmalloc_track_caller(new_size, flags);
121 if (ret && p)
122 memcpy(ret, p, ks);
123
124 return ret;
125}
126
108/** 127/**
109 * __krealloc - like krealloc() but don't free @p. 128 * __krealloc - like krealloc() but don't free @p.
110 * @p: object to reallocate memory for. 129 * @p: object to reallocate memory for.
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user);
117 */ 136 */
118void *__krealloc(const void *p, size_t new_size, gfp_t flags) 137void *__krealloc(const void *p, size_t new_size, gfp_t flags)
119{ 138{
120 void *ret;
121 size_t ks = 0;
122
123 if (unlikely(!new_size)) 139 if (unlikely(!new_size))
124 return ZERO_SIZE_PTR; 140 return ZERO_SIZE_PTR;
125 141
126 if (p) 142 return __do_krealloc(p, new_size, flags);
127 ks = ksize(p);
128 143
129 if (ks >= new_size)
130 return (void *)p;
131
132 ret = kmalloc_track_caller(new_size, flags);
133 if (ret && p)
134 memcpy(ret, p, ks);
135
136 return ret;
137} 144}
138EXPORT_SYMBOL(__krealloc); 145EXPORT_SYMBOL(__krealloc);
139 146
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
157 return ZERO_SIZE_PTR; 164 return ZERO_SIZE_PTR;
158 } 165 }
159 166
160 ret = __krealloc(p, new_size, flags); 167 ret = __do_krealloc(p, new_size, flags);
161 if (ret && p != ret) 168 if (ret && p != ret)
162 kfree(p); 169 kfree(p);
163 170
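Note on the mm/util.c hunks above: the body of __krealloc() is split out into an __always_inline __do_krealloc() that both __krealloc() and krealloc() call. Because the helper is inlined, the kmalloc_track_caller() inside it attributes allocations made through krealloc() to krealloc()'s caller instead of to an intermediate __krealloc() frame. The exported wrapper after the change:

	/* __krealloc() after the refactor: only the zero-size check remains here */
	void *__krealloc(const void *p, size_t new_size, gfp_t flags)
	{
		if (unlikely(!new_size))
			return ZERO_SIZE_PTR;

		return __do_krealloc(p, new_size, flags);
	}
	/*
	 * krealloc() likewise calls __do_krealloc() and then frees the old
	 * object if the allocation moved, as in the hunk above.
	 */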
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..78e08300db21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2163 usize -= PAGE_SIZE; 2163 usize -= PAGE_SIZE;
2164 } while (usize > 0); 2164 } while (usize > 0);
2165 2165
2166 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 2166 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2167 vma->vm_flags |= VM_RESERVED;
2168 2167
2169 return 0; 2168 return 0;
2170} 2169}
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
2572{ 2571{
2573 struct vm_struct *v = p; 2572 struct vm_struct *v = p;
2574 2573
2575 seq_printf(m, "0x%p-0x%p %7ld", 2574 seq_printf(m, "0x%pK-0x%pK %7ld",
2576 v->addr, v->addr + v->size, v->size); 2575 v->addr, v->addr + v->size, v->size);
2577 2576
2578 if (v->caller) 2577 if (v->caller)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..2624edcfb420 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
553redo: 553redo:
554 ClearPageUnevictable(page); 554 ClearPageUnevictable(page);
555 555
556 if (page_evictable(page, NULL)) { 556 if (page_evictable(page)) {
557 /* 557 /*
558 * For evictable pages, we can use the cache. 558 * For evictable pages, we can use the cache.
559 * In event of a race, worst case is we end up with an 559 * In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
587 * page is on unevictable list, it never be freed. To avoid that, 587 * page is on unevictable list, it never be freed. To avoid that,
588 * check after we added it to the list, again. 588 * check after we added it to the list, again.
589 */ 589 */
590 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { 590 if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
591 if (!isolate_lru_page(page)) { 591 if (!isolate_lru_page(page)) {
592 put_page(page); 592 put_page(page);
593 goto redo; 593 goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
674static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
675 struct zone *zone, 675 struct zone *zone,
676 struct scan_control *sc, 676 struct scan_control *sc,
677 enum ttu_flags ttu_flags,
677 unsigned long *ret_nr_dirty, 678 unsigned long *ret_nr_dirty,
678 unsigned long *ret_nr_writeback) 679 unsigned long *ret_nr_writeback,
680 bool force_reclaim)
679{ 681{
680 LIST_HEAD(ret_pages); 682 LIST_HEAD(ret_pages);
681 LIST_HEAD(free_pages); 683 LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
689 691
690 mem_cgroup_uncharge_start(); 692 mem_cgroup_uncharge_start();
691 while (!list_empty(page_list)) { 693 while (!list_empty(page_list)) {
692 enum page_references references;
693 struct address_space *mapping; 694 struct address_space *mapping;
694 struct page *page; 695 struct page *page;
695 int may_enter_fs; 696 int may_enter_fs;
697 enum page_references references = PAGEREF_RECLAIM_CLEAN;
696 698
697 cond_resched(); 699 cond_resched();
698 700
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
707 709
708 sc->nr_scanned++; 710 sc->nr_scanned++;
709 711
710 if (unlikely(!page_evictable(page, NULL))) 712 if (unlikely(!page_evictable(page)))
711 goto cull_mlocked; 713 goto cull_mlocked;
712 714
713 if (!sc->may_unmap && page_mapped(page)) 715 if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
758 wait_on_page_writeback(page); 760 wait_on_page_writeback(page);
759 } 761 }
760 762
761 references = page_check_references(page, sc); 763 if (!force_reclaim)
764 references = page_check_references(page, sc);
765
762 switch (references) { 766 switch (references) {
763 case PAGEREF_ACTIVATE: 767 case PAGEREF_ACTIVATE:
764 goto activate_locked; 768 goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
788 * processes. Try to unmap it here. 792 * processes. Try to unmap it here.
789 */ 793 */
790 if (page_mapped(page) && mapping) { 794 if (page_mapped(page) && mapping) {
791 switch (try_to_unmap(page, TTU_UNMAP)) { 795 switch (try_to_unmap(page, ttu_flags)) {
792 case SWAP_FAIL: 796 case SWAP_FAIL:
793 goto activate_locked; 797 goto activate_locked;
794 case SWAP_AGAIN: 798 case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
960 return nr_reclaimed; 964 return nr_reclaimed;
961} 965}
962 966
967unsigned long reclaim_clean_pages_from_list(struct zone *zone,
968 struct list_head *page_list)
969{
970 struct scan_control sc = {
971 .gfp_mask = GFP_KERNEL,
972 .priority = DEF_PRIORITY,
973 .may_unmap = 1,
974 };
975 unsigned long ret, dummy1, dummy2;
976 struct page *page, *next;
977 LIST_HEAD(clean_pages);
978
979 list_for_each_entry_safe(page, next, page_list, lru) {
980 if (page_is_file_cache(page) && !PageDirty(page)) {
981 ClearPageActive(page);
982 list_move(&page->lru, &clean_pages);
983 }
984 }
985
986 ret = shrink_page_list(&clean_pages, zone, &sc,
987 TTU_UNMAP|TTU_IGNORE_ACCESS,
988 &dummy1, &dummy2, true);
989 list_splice(&clean_pages, page_list);
990 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
991 return ret;
992}
993
963/* 994/*
964 * Attempt to remove the specified page from its LRU. Only take this page 995 * Attempt to remove the specified page from its LRU. Only take this page
965 * if it is of the appropriate PageActive status. Pages which are being 996 * if it is of the appropriate PageActive status. Pages which are being
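The new reclaim_clean_pages_from_list() lets a contiguous-allocation path drop clean, unmapped file pages from an isolated list instead of migrating them. A hedged sketch of a hypothetical caller (function and variable names are illustrative, not from this patch):

static void my_prepare_range_for_migration(struct zone *zone,
					   struct list_head *isolated)
{
	unsigned long dropped;

	/* Clean file-backed pages are cheaper to reclaim than to migrate;
	 * the helper returns how many it freed and already adjusts
	 * NR_ISOLATED_FILE for them. */
	dropped = reclaim_clean_pages_from_list(zone, isolated);
	pr_debug("dropped %lu clean pages before migration\n", dropped);
}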
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
978 if (!PageLRU(page)) 1009 if (!PageLRU(page))
979 return ret; 1010 return ret;
980 1011
981 /* Do not give back unevictable pages for compaction */ 1012 /* Compaction should not handle unevictable pages but CMA can do so */
982 if (PageUnevictable(page)) 1013 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
983 return ret; 1014 return ret;
984 1015
985 ret = -EBUSY; 1016 ret = -EBUSY;
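The new ISOLATE_UNEVICTABLE bit lets a CMA-style caller pull even unevictable pages off the LRU, while compaction keeps passing a mode without it and so still skips them. Illustrative fragment under that assumption (helper name hypothetical; LRU statistics handling is omitted and zone->lru_lock is assumed held):

static bool my_take_page_for_cma(struct page *page, struct list_head *dst)
{
	/* ISOLATE_UNEVICTABLE: acceptable here because the range is being
	 * emptied for a contiguous allocation, not compacted. */
	if (__isolate_lru_page(page, ISOLATE_UNEVICTABLE) != 0)
		return false;

	list_move(&page->lru, dst);
	return true;
}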
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1186 1217
1187 VM_BUG_ON(PageLRU(page)); 1218 VM_BUG_ON(PageLRU(page));
1188 list_del(&page->lru); 1219 list_del(&page->lru);
1189 if (unlikely(!page_evictable(page, NULL))) { 1220 if (unlikely(!page_evictable(page))) {
1190 spin_unlock_irq(&zone->lru_lock); 1221 spin_unlock_irq(&zone->lru_lock);
1191 putback_lru_page(page); 1222 putback_lru_page(page);
1192 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1278 if (nr_taken == 0) 1309 if (nr_taken == 0)
1279 return 0; 1310 return 0;
1280 1311
1281 nr_reclaimed = shrink_page_list(&page_list, zone, sc, 1312 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1282 &nr_dirty, &nr_writeback); 1313 &nr_dirty, &nr_writeback, false);
1283 1314
1284 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1285 1316
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1439 page = lru_to_page(&l_hold); 1470 page = lru_to_page(&l_hold);
1440 list_del(&page->lru); 1471 list_del(&page->lru);
1441 1472
1442 if (unlikely(!page_evictable(page, NULL))) { 1473 if (unlikely(!page_evictable(page))) {
1443 putback_lru_page(page); 1474 putback_lru_page(page);
1444 continue; 1475 continue;
1445 } 1476 }
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1729 return false; 1760 return false;
1730} 1761}
1731 1762
1763#ifdef CONFIG_COMPACTION
1764/*
1765 * If compaction is deferred for sc->order then scale the number of pages
1766 * reclaimed based on the number of consecutive allocation failures
1767 */
1768static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1769 struct lruvec *lruvec, struct scan_control *sc)
1770{
1771 struct zone *zone = lruvec_zone(lruvec);
1772
1773 if (zone->compact_order_failed <= sc->order)
1774 pages_for_compaction <<= zone->compact_defer_shift;
1775 return pages_for_compaction;
1776}
1777#else
1778static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1779 struct lruvec *lruvec, struct scan_control *sc)
1780{
1781 return pages_for_compaction;
1782}
1783#endif
1784
1732/* 1785/*
1733 * Reclaim/compaction is used for high-order allocation requests. It reclaims 1786 * Reclaim/compaction is used for high-order allocation requests. It reclaims
1734 * order-0 pages before compacting the zone. should_continue_reclaim() returns 1787 * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1776 * inactive lists are large enough, continue reclaiming 1829 * inactive lists are large enough, continue reclaiming
1777 */ 1830 */
1778 pages_for_compaction = (2UL << sc->order); 1831 pages_for_compaction = (2UL << sc->order);
1832
1833 pages_for_compaction = scale_for_compaction(pages_for_compaction,
1834 lruvec, sc);
1779 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1835 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1780 if (nr_swap_pages > 0) 1836 if (nr_swap_pages > 0)
1781 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1837 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
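The scaling above simply left-shifts the reclaim target by the zone's compact_defer_shift when compaction has been deferred at this order. A worked example of the arithmetic (plain C, not kernel code): an order-3 request normally asks for 2UL << 3 = 16 pages; after two deferrals (compact_defer_shift == 2) that becomes 64.

#include <stdio.h>

int main(void)
{
	int order = 3;
	int compact_defer_shift = 2;			/* two consecutive deferrals */
	unsigned long pages_for_compaction = 2UL << order;	/* 16 */

	/* mirrors scale_for_compaction() when compact_order_failed <= order */
	pages_for_compaction <<= compact_defer_shift;		/* 64 */

	printf("reclaim target: %lu pages\n", pages_for_compaction);
	return 0;
}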
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2839 */ 2895 */
2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2896 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2841 2897
2898 /*
2899 * Compaction records what page blocks it recently failed to
2900 * isolate pages from and skips them in the future scanning.
2901 * When kswapd is going to sleep, it is reasonable to assume
2902 * that pages and compaction may succeed so reset the cache.
2903 */
2904 reset_isolation_suitable(pgdat);
2905
2842 if (!kthread_should_stop()) 2906 if (!kthread_should_stop())
2843 schedule(); 2907 schedule();
2844 2908
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid)
3101 if (IS_ERR(pgdat->kswapd)) { 3165 if (IS_ERR(pgdat->kswapd)) {
3102 /* failure at boot is fatal */ 3166 /* failure at boot is fatal */
3103 BUG_ON(system_state == SYSTEM_BOOTING); 3167 BUG_ON(system_state == SYSTEM_BOOTING);
3104 printk("Failed to start kswapd on node %d\n",nid);
3105 pgdat->kswapd = NULL; 3168 pgdat->kswapd = NULL;
3106 ret = -1; 3169 pr_err("Failed to start kswapd on node %d\n", nid);
3170 ret = PTR_ERR(pgdat->kswapd);
3107 } 3171 }
3108 return ret; 3172 return ret;
3109} 3173}
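The kswapd_run() hunk switches from a bare printk() and a hard-coded -1 to pr_err() and the encoded error value. The usual kthread error pattern, as a sketch with hypothetical names (note that PTR_ERR() has to be taken from the ERR_PTR value itself):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

static int my_thread_fn(void *data);		/* hypothetical worker */

static int my_thread_start(int nid)
{
	struct task_struct *tsk;

	tsk = kthread_run(my_thread_fn, NULL, "mythread%d", nid);
	if (IS_ERR(tsk)) {
		pr_err("Failed to start mythread on node %d\n", nid);
		return PTR_ERR(tsk);	/* propagate the real errno */
	}
	return 0;
}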
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3350/* 3414/*
3351 * page_evictable - test whether a page is evictable 3415 * page_evictable - test whether a page is evictable
3352 * @page: the page to test 3416 * @page: the page to test
3353 * @vma: the VMA in which the page is or will be mapped, may be NULL
3354 * 3417 *
3355 * Test whether page is evictable--i.e., should be placed on active/inactive 3418 * Test whether page is evictable--i.e., should be placed on active/inactive
3356 * lists vs unevictable list. The vma argument is !NULL when called from the 3419 * lists vs unevictable list.
3357 * fault path to determine how to instantate a new page.
3358 * 3420 *
3359 * Reasons page might not be evictable: 3421 * Reasons page might not be evictable:
3360 * (1) page's mapping marked unevictable 3422 * (1) page's mapping marked unevictable
3361 * (2) page is part of an mlocked VMA 3423 * (2) page is part of an mlocked VMA
3362 * 3424 *
3363 */ 3425 */
3364int page_evictable(struct page *page, struct vm_area_struct *vma) 3426int page_evictable(struct page *page)
3365{ 3427{
3366 3428 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3367 if (mapping_unevictable(page_mapping(page)))
3368 return 0;
3369
3370 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3371 return 0;
3372
3373 return 1;
3374} 3429}
3375 3430
3376#ifdef CONFIG_SHMEM 3431#ifdef CONFIG_SHMEM
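page_evictable() now takes only the page and reduces to a single boolean expression. A sketch of an updated out-of-tree caller under that assumption (helper name hypothetical):

static bool my_page_is_reclaimable(struct page *page)
{
	/* old form: page_evictable(page, NULL) */
	return page_evictable(page) && !PageWriteback(page);
}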
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3408 if (!PageLRU(page) || !PageUnevictable(page)) 3463 if (!PageLRU(page) || !PageUnevictable(page))
3409 continue; 3464 continue;
3410 3465
3411 if (page_evictable(page, NULL)) { 3466 if (page_evictable(page)) {
3412 enum lru_list lru = page_lru_base_type(page); 3467 enum lru_list lru = page_lru_base_type(page);
3413 3468
3414 VM_BUG_ON(PageActive(page)); 3469 VM_BUG_ON(PageActive(page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a6748231d..c7370579111b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
495 atomic_long_add(global_diff[i], &vm_stat[i]); 495 atomic_long_add(global_diff[i], &vm_stat[i]);
496} 496}
497 497
498void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
499{
500 int i;
501
502 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
503 if (pset->vm_stat_diff[i]) {
504 int v = pset->vm_stat_diff[i];
505 pset->vm_stat_diff[i] = 0;
506 atomic_long_add(v, &zone->vm_stat[i]);
507 atomic_long_add(v, &vm_stat[i]);
508 }
509}
498#endif 510#endif
499 511
500#ifdef CONFIG_NUMA 512#ifdef CONFIG_NUMA
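drain_zonestat() folds a per-cpu pageset's vm_stat_diff deltas back into the zone and global counters. A hedged sketch of how a caller might drain every online CPU's pageset for one zone (caller name hypothetical):

static void my_drain_zone_stats(struct zone *zone)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct per_cpu_pageset *pset = per_cpu_ptr(zone->pageset, cpu);

		/* moves pset->vm_stat_diff[] into zone->vm_stat[] and vm_stat[] */
		drain_zonestat(zone, pset);
	}
}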
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
722 "numa_other", 734 "numa_other",
723#endif 735#endif
724 "nr_anon_transparent_hugepages", 736 "nr_anon_transparent_hugepages",
737 "nr_free_cma",
725 "nr_dirty_threshold", 738 "nr_dirty_threshold",
726 "nr_dirty_background_threshold", 739 "nr_dirty_background_threshold",
727 740
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = {
781 "unevictable_pgs_munlocked", 794 "unevictable_pgs_munlocked",
782 "unevictable_pgs_cleared", 795 "unevictable_pgs_cleared",
783 "unevictable_pgs_stranded", 796 "unevictable_pgs_stranded",
784 "unevictable_pgs_mlockfreed",
785 797
786#ifdef CONFIG_TRANSPARENT_HUGEPAGE 798#ifdef CONFIG_TRANSPARENT_HUGEPAGE
787 "thp_fault_alloc", 799 "thp_fault_alloc",
@@ -1157,7 +1169,7 @@ static void __cpuinit start_cpu_timer(int cpu)
1157{ 1169{
1158 struct delayed_work *work = &per_cpu(vmstat_work, cpu); 1170 struct delayed_work *work = &per_cpu(vmstat_work, cpu);
1159 1171
1160 INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); 1172 INIT_DEFERRABLE_WORK(work, vmstat_update);
1161 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1173 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1162} 1174}
1163 1175
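INIT_DELAYED_WORK_DEFERRABLE() was renamed to INIT_DEFERRABLE_WORK() in the workqueue API cleanup; the behaviour is unchanged. Minimal sketch with a hypothetical handler, mirroring the per-CPU timer setup above:

#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

static void my_update(struct work_struct *work);	/* hypothetical */
static DEFINE_PER_CPU(struct delayed_work, my_work);

static void my_start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(my_work, cpu);

	/* deferrable: an idle CPU is not woken up just to run this work */
	INIT_DEFERRABLE_WORK(work, my_update);
	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}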