author     Jiri Kosina <jkosina@suse.cz>   2012-06-29 08:45:58 -0400
committer  Jiri Kosina <jkosina@suse.cz>   2012-06-29 08:45:58 -0400
commit     59f91e5dd0504dc0ebfaa0b6f3a55e6931f96266
tree       b913718405d44a921905ac71044fbde410256865
parent     57bdfdd80077addf518a9b90c4a66890efc4f70e
parent     89abfab133ef1f5902abafb744df72793213ac19
Merge branch 'master' into for-next

Conflicts:
	include/linux/mmzone.h

Synced with Linus' tree so that trivial patch can be applied on top of
up-to-date code properly.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>

Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              10
-rw-r--r--  mm/Makefile              9
-rw-r--r--  mm/bootmem.c           134
-rw-r--r--  mm/compaction.c        142
-rw-r--r--  mm/filemap.c            39
-rw-r--r--  mm/huge_memory.c        21
-rw-r--r--  mm/hugetlb.c            32
-rw-r--r--  mm/internal.h           14
-rw-r--r--  mm/madvise.c            15
-rw-r--r--  mm/memblock.c           42
-rw-r--r--  mm/memcontrol.c        127
-rw-r--r--  mm/memory-failure.c      8
-rw-r--r--  mm/memory.c             20
-rw-r--r--  mm/memory_hotplug.c     14
-rw-r--r--  mm/mempolicy.c          36
-rw-r--r--  mm/mmap.c               53
-rw-r--r--  mm/nobootmem.c         112
-rw-r--r--  mm/oom_kill.c           44
-rw-r--r--  mm/page_alloc.c         78
-rw-r--r--  mm/readahead.c          40
-rw-r--r--  mm/rmap.c                6
-rw-r--r--  mm/shmem.c             513
-rw-r--r--  mm/sparse.c             25
-rw-r--r--  mm/swap.c               51
-rw-r--r--  mm/swapfile.c           33
-rw-r--r--  mm/thrash.c            155
-rw-r--r--  mm/truncate.c           25
-rw-r--r--  mm/vmalloc.c             7
-rw-r--r--  mm/vmscan.c            306
-rw-r--r--  mm/vmstat.c             10
30 files changed, 1137 insertions, 984 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 39220026c797..b2176374b98e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -349,6 +349,16 @@ choice
349 benefit. 349 benefit.
350endchoice 350endchoice
351 351
352config CROSS_MEMORY_ATTACH
353 bool "Cross Memory Support"
354 depends on MMU
355 default y
356 help
357 Enabling this option adds the system calls process_vm_readv and
358 process_vm_writev which allow a process with the correct privileges
 359 to directly read from or write to another process's address space.
360 See the man page for more details.
361
352# 362#
353# UP and nommu archs use km based percpu allocator 363# UP and nommu archs use km based percpu allocator
354# 364#
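
The new CROSS_MEMORY_ATTACH option gates the process_vm_readv()/process_vm_writev()
system calls described in the help text above. A minimal userspace sketch of what the
option enables follows; the target pid and remote address are placeholders, and the
call additionally requires CONFIG_CROSS_MEMORY_ATTACH=y plus ptrace-level permission
on the target process.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(void)
{
	char buf[64];
	struct iovec local  = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote = { .iov_base = (void *)0x400000,	/* hypothetical remote address */
				.iov_len  = sizeof(buf) };
	pid_t target = 1234;					/* hypothetical target pid */
	ssize_t n;

	n = process_vm_readv(target, &local, 1, &remote, 1, 0);	/* flags must be 0 */
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("copied %zd bytes from pid %d\n", n, (int)target);
	return 0;
}
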
diff --git a/mm/Makefile b/mm/Makefile
index 8aada89efbbb..a156285ce88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,8 +5,11 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o \ 8 vmalloc.o pagewalk.o pgtable-generic.o
9 process_vm_access.o 9
10ifdef CONFIG_CROSS_MEMORY_ATTACH
11mmu-$(CONFIG_MMU) += process_vm_access.o
12endif
10 13
11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
12 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
@@ -25,7 +28,7 @@ endif
25obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 28obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
26 29
27obj-$(CONFIG_BOUNCE) += bounce.o 30obj-$(CONFIG_BOUNCE) += bounce.o
28obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
29obj-$(CONFIG_HAS_DMA) += dmapool.o 32obj-$(CONFIG_HAS_DMA) += dmapool.o
30obj-$(CONFIG_HUGETLBFS) += hugetlb.o 33obj-$(CONFIG_HUGETLBFS) += hugetlb.o
31obj-$(CONFIG_NUMA) += mempolicy.o 34obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..ec4fcb7a56c8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
77 */ 77 */
78static void __init link_bootmem(bootmem_data_t *bdata) 78static void __init link_bootmem(bootmem_data_t *bdata)
79{ 79{
80 struct list_head *iter; 80 bootmem_data_t *ent;
81 81
82 list_for_each(iter, &bdata_list) { 82 list_for_each_entry(ent, &bdata_list, list) {
83 bootmem_data_t *ent; 83 if (bdata->node_min_pfn < ent->node_min_pfn) {
84 84 list_add_tail(&bdata->list, &ent->list);
85 ent = list_entry(iter, bootmem_data_t, list); 85 return;
86 if (bdata->node_min_pfn < ent->node_min_pfn) 86 }
87 break;
88 } 87 }
89 list_add_tail(&bdata->list, iter); 88
89 list_add_tail(&bdata->list, &bdata_list);
90} 90}
91 91
92/* 92/*
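
The rewritten link_bootmem() above is the standard way to keep a list_head list sorted
on insert: walk it with list_for_each_entry(), add the new node in front of the first
larger entry, and fall back to a tail insert. A minimal sketch of the same idiom with a
made-up struct (kernel list API, not part of this patch):

#include <linux/list.h>

struct item {
	unsigned long key;
	struct list_head list;
};

/* Keep 'head' sorted by ascending key: insert before the first entry
 * with a larger key, otherwise append at the tail. */
static void sorted_insert(struct list_head *head, struct item *new)
{
	struct item *ent;

	list_for_each_entry(ent, head, list) {
		if (new->key < ent->key) {
			list_add_tail(&new->list, &ent->list);
			return;
		}
	}
	list_add_tail(&new->list, head);
}
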
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
203 } else { 203 } else {
204 unsigned long off = 0; 204 unsigned long off = 0;
205 205
206 while (vec && off < BITS_PER_LONG) { 206 vec >>= start & (BITS_PER_LONG - 1);
207 while (vec) {
207 if (vec & 1) { 208 if (vec & 1) {
208 page = pfn_to_page(start + off); 209 page = pfn_to_page(start + off);
209 __free_pages_bootmem(page, 0); 210 __free_pages_bootmem(page, 0);
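
The free_all_bootmem_core() hunk above pre-shifts the bitmap word so the walk starts at
the bit corresponding to 'start', letting the loop stop as soon as no set bits remain
instead of always scanning BITS_PER_LONG positions. The same walk in isolation, plain C
with illustrative values (here 'start' is already a bit index within the word):

#include <stdio.h>

int main(void)
{
	unsigned long vec = 0xb4;          /* bits 2, 4, 5 and 7 set */
	unsigned long start = 2, off = 0;  /* begin the walk at bit 2 */

	vec >>= start;                     /* skip the bits below 'start' */
	while (vec) {                      /* stop once nothing is left */
		if (vec & 1)
			printf("bit %lu is set\n", start + off);
		vec >>= 1;
		off++;
	}
	return 0;
}
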
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
467 return ALIGN(base + off, align) - base; 468 return ALIGN(base + off, align) - base;
468} 469}
469 470
470static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 471static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
471 unsigned long size, unsigned long align, 472 unsigned long size, unsigned long align,
472 unsigned long goal, unsigned long limit) 473 unsigned long goal, unsigned long limit)
473{ 474{
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
588 p_bdata = bootmem_arch_preferred_node(bdata, size, align, 589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
589 goal, limit); 590 goal, limit);
590 if (p_bdata) 591 if (p_bdata)
591 return alloc_bootmem_core(p_bdata, size, align, 592 return alloc_bootmem_bdata(p_bdata, size, align,
592 goal, limit); 593 goal, limit);
593 } 594 }
594#endif 595#endif
595 return NULL; 596 return NULL;
596} 597}
597 598
598static void * __init ___alloc_bootmem_nopanic(unsigned long size, 599static void * __init alloc_bootmem_core(unsigned long size,
599 unsigned long align, 600 unsigned long align,
600 unsigned long goal, 601 unsigned long goal,
601 unsigned long limit) 602 unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
603 bootmem_data_t *bdata; 604 bootmem_data_t *bdata;
604 void *region; 605 void *region;
605 606
606restart:
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
608 if (region) 608 if (region)
609 return region; 609 return region;
@@ -614,11 +614,25 @@ restart:
614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
615 break; 615 break;
616 616
617 region = alloc_bootmem_core(bdata, size, align, goal, limit); 617 region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
618 if (region) 618 if (region)
619 return region; 619 return region;
620 } 620 }
621 621
622 return NULL;
623}
624
625static void * __init ___alloc_bootmem_nopanic(unsigned long size,
626 unsigned long align,
627 unsigned long goal,
628 unsigned long limit)
629{
630 void *ptr;
631
632restart:
633 ptr = alloc_bootmem_core(size, align, goal, limit);
634 if (ptr)
635 return ptr;
622 if (goal) { 636 if (goal) {
623 goal = 0; 637 goal = 0;
624 goto restart; 638 goto restart;
@@ -684,21 +698,56 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
684 return ___alloc_bootmem(size, align, goal, limit); 698 return ___alloc_bootmem(size, align, goal, limit);
685} 699}
686 700
687static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 701static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
688 unsigned long size, unsigned long align, 702 unsigned long size, unsigned long align,
689 unsigned long goal, unsigned long limit) 703 unsigned long goal, unsigned long limit)
690{ 704{
691 void *ptr; 705 void *ptr;
692 706
693 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); 707again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
694 if (ptr) 710 if (ptr)
695 return ptr; 711 return ptr;
696 712
697 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 713 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
698 if (ptr) 714 if (ptr)
699 return ptr; 715 return ptr;
700 716
701 return ___alloc_bootmem(size, align, goal, limit); 717 ptr = alloc_bootmem_core(size, align, goal, limit);
718 if (ptr)
719 return ptr;
720
721 if (goal) {
722 goal = 0;
723 goto again;
724 }
725
726 return NULL;
727}
728
729void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
730 unsigned long align, unsigned long goal)
731{
732 if (WARN_ON_ONCE(slab_is_available()))
733 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
734
735 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
736}
737
738void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
739 unsigned long align, unsigned long goal,
740 unsigned long limit)
741{
742 void *ptr;
743
744 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
745 if (ptr)
746 return ptr;
747
748 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
749 panic("Out of memory");
750 return NULL;
702} 751}
703 752
704/** 753/**
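
Both ___alloc_bootmem_nopanic() and the new ___alloc_bootmem_node_nopanic() above share
the same fallback shape: try the allocation with the caller's goal address and, if that
fails, retry once with the goal cleared before giving up. Stripped of the bootmem
details, the control flow looks like this; try_alloc() is a hypothetical stand-in for
alloc_bootmem_core(), so the snippet is a sketch rather than patch code.

static void *alloc_with_goal_fallback(unsigned long size, unsigned long align,
				      unsigned long goal, unsigned long limit)
{
	void *ptr;

restart:
	ptr = try_alloc(size, align, goal, limit);  /* stand-in for the real core allocator */
	if (ptr)
		return ptr;

	if (goal) {
		goal = 0;	/* drop the placement hint and retry once */
		goto restart;
	}

	return NULL;
}
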
@@ -722,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
722 if (WARN_ON_ONCE(slab_is_available())) 771 if (WARN_ON_ONCE(slab_is_available()))
723 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 772 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
724 773
725 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 774 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
726} 775}
727 776
728void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 777void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +792,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
743 unsigned long new_goal; 792 unsigned long new_goal;
744 793
745 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 794 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
746 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 795 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
747 new_goal, 0); 796 new_goal, 0);
748 if (ptr) 797 if (ptr)
749 return ptr; 798 return ptr;
@@ -754,47 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
754 803
755} 804}
756 805
757#ifdef CONFIG_SPARSEMEM
758/**
759 * alloc_bootmem_section - allocate boot memory from a specific section
760 * @size: size of the request in bytes
761 * @section_nr: sparse map section to allocate from
762 *
763 * Return NULL on failure.
764 */
765void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr)
767{
768 bootmem_data_t *bdata;
769 unsigned long pfn, goal;
770
771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT;
773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
774
775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
776}
777#endif
778
779void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
780 unsigned long align, unsigned long goal)
781{
782 void *ptr;
783
784 if (WARN_ON_ONCE(slab_is_available()))
785 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
786
787 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
788 if (ptr)
789 return ptr;
790
791 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
792 if (ptr)
793 return ptr;
794
795 return __alloc_bootmem_nopanic(size, align, goal);
796}
797
798#ifndef ARCH_LOW_ADDRESS_LIMIT 806#ifndef ARCH_LOW_ADDRESS_LIMIT
799#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 807#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
800#endif 808#endif
@@ -839,6 +847,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
839 if (WARN_ON_ONCE(slab_is_available())) 847 if (WARN_ON_ONCE(slab_is_available()))
840 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 848 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
841 849
842 return ___alloc_bootmem_node(pgdat->bdata, size, align, 850 return ___alloc_bootmem_node(pgdat, size, align,
843 goal, ARCH_LOW_ADDRESS_LIMIT); 851 goal, ARCH_LOW_ADDRESS_LIMIT);
844} 852}
diff --git a/mm/compaction.c b/mm/compaction.c
index da7d35ea5103..840ee288e296 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,7 +235,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
235 */ 235 */
236 while (unlikely(too_many_isolated(zone))) { 236 while (unlikely(too_many_isolated(zone))) {
237 /* async migration should just abort */ 237 /* async migration should just abort */
238 if (!cc->sync) 238 if (cc->mode != COMPACT_SYNC)
239 return 0; 239 return 0;
240 240
241 congestion_wait(BLK_RW_ASYNC, HZ/10); 241 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -303,7 +303,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
303 * satisfies the allocation 303 * satisfies the allocation
304 */ 304 */
305 pageblock_nr = low_pfn >> pageblock_order; 305 pageblock_nr = low_pfn >> pageblock_order;
306 if (!cc->sync && last_pageblock_nr != pageblock_nr && 306 if (cc->mode != COMPACT_SYNC &&
307 last_pageblock_nr != pageblock_nr &&
307 !migrate_async_suitable(get_pageblock_migratetype(page))) { 308 !migrate_async_suitable(get_pageblock_migratetype(page))) {
308 low_pfn += pageblock_nr_pages; 309 low_pfn += pageblock_nr_pages;
309 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
@@ -324,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
324 continue; 325 continue;
325 } 326 }
326 327
327 if (!cc->sync) 328 if (cc->mode != COMPACT_SYNC)
328 mode |= ISOLATE_ASYNC_MIGRATE; 329 mode |= ISOLATE_ASYNC_MIGRATE;
329 330
330 /* Try isolate the page */ 331 /* Try isolate the page */
@@ -357,27 +358,90 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
357 358
358#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 359#endif /* CONFIG_COMPACTION || CONFIG_CMA */
359#ifdef CONFIG_COMPACTION 360#ifdef CONFIG_COMPACTION
361/*
362 * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
363 * converted to MIGRATE_MOVABLE type, false otherwise.
364 */
365static bool rescue_unmovable_pageblock(struct page *page)
366{
367 unsigned long pfn, start_pfn, end_pfn;
368 struct page *start_page, *end_page;
369
370 pfn = page_to_pfn(page);
371 start_pfn = pfn & ~(pageblock_nr_pages - 1);
372 end_pfn = start_pfn + pageblock_nr_pages;
373
374 start_page = pfn_to_page(start_pfn);
375 end_page = pfn_to_page(end_pfn);
376
377 /* Do not deal with pageblocks that overlap zones */
378 if (page_zone(start_page) != page_zone(end_page))
379 return false;
380
381 for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
382 page++) {
383 if (!pfn_valid_within(pfn))
384 continue;
385
386 if (PageBuddy(page)) {
387 int order = page_order(page);
388
389 pfn += (1 << order) - 1;
390 page += (1 << order) - 1;
391
392 continue;
393 } else if (page_count(page) == 0 || PageLRU(page))
394 continue;
395
396 return false;
397 }
398
399 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
400 move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
401 return true;
402}
360 403
361/* Returns true if the page is within a block suitable for migration to */ 404enum smt_result {
362static bool suitable_migration_target(struct page *page) 405 GOOD_AS_MIGRATION_TARGET,
406 FAIL_UNMOVABLE_TARGET,
407 FAIL_BAD_TARGET,
408};
409
410/*
411 * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
412 * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
413 * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
414 */
415static enum smt_result suitable_migration_target(struct page *page,
416 struct compact_control *cc)
363{ 417{
364 418
365 int migratetype = get_pageblock_migratetype(page); 419 int migratetype = get_pageblock_migratetype(page);
366 420
367 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 421 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
368 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 422 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
369 return false; 423 return FAIL_BAD_TARGET;
370 424
371 /* If the page is a large free page, then allow migration */ 425 /* If the page is a large free page, then allow migration */
372 if (PageBuddy(page) && page_order(page) >= pageblock_order) 426 if (PageBuddy(page) && page_order(page) >= pageblock_order)
373 return true; 427 return GOOD_AS_MIGRATION_TARGET;
374 428
375 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 429 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
376 if (migrate_async_suitable(migratetype)) 430 if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
377 return true; 431 migrate_async_suitable(migratetype))
432 return GOOD_AS_MIGRATION_TARGET;
433
434 if (cc->mode == COMPACT_ASYNC_MOVABLE &&
435 migratetype == MIGRATE_UNMOVABLE)
436 return FAIL_UNMOVABLE_TARGET;
437
438 if (cc->mode != COMPACT_ASYNC_MOVABLE &&
439 migratetype == MIGRATE_UNMOVABLE &&
440 rescue_unmovable_pageblock(page))
441 return GOOD_AS_MIGRATION_TARGET;
378 442
379 /* Otherwise skip the block */ 443 /* Otherwise skip the block */
380 return false; 444 return FAIL_BAD_TARGET;
381} 445}
382 446
383/* 447/*
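
rescue_unmovable_pageblock() above derives the boundaries of the containing pageblock
from an arbitrary pfn with the usual power-of-two mask. In isolation, assuming a
hypothetical pageblock_nr_pages of 512 (2 MiB pageblocks of 4 KiB pages):

#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* assumed 2 MiB / 4 KiB; must be a power of two */
	unsigned long pfn = 123456;

	/* Round down to the first pfn of the containing pageblock; the
	 * end boundary is exactly one pageblock later (exclusive). */
	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages;

	printf("pfn %lu lives in pageblock [%lu, %lu)\n", pfn, start_pfn, end_pfn);
	return 0;
}
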
@@ -411,6 +475,13 @@ static void isolate_freepages(struct zone *zone,
411 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 475 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
412 476
413 /* 477 /*
478 * isolate_freepages() may be called more than once during
479 * compact_zone_order() run and we want only the most recent
480 * count.
481 */
482 cc->nr_pageblocks_skipped = 0;
483
484 /*
414 * Isolate free pages until enough are available to migrate the 485 * Isolate free pages until enough are available to migrate the
415 * pages on cc->migratepages. We stop searching if the migrate 486 * pages on cc->migratepages. We stop searching if the migrate
416 * and free page scanners meet or enough free pages are isolated. 487 * and free page scanners meet or enough free pages are isolated.
@@ -418,6 +489,7 @@ static void isolate_freepages(struct zone *zone,
418 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 489 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
419 pfn -= pageblock_nr_pages) { 490 pfn -= pageblock_nr_pages) {
420 unsigned long isolated; 491 unsigned long isolated;
492 enum smt_result ret;
421 493
422 if (!pfn_valid(pfn)) 494 if (!pfn_valid(pfn))
423 continue; 495 continue;
@@ -434,9 +506,12 @@ static void isolate_freepages(struct zone *zone,
434 continue; 506 continue;
435 507
436 /* Check the block is suitable for migration */ 508 /* Check the block is suitable for migration */
437 if (!suitable_migration_target(page)) 509 ret = suitable_migration_target(page, cc);
510 if (ret != GOOD_AS_MIGRATION_TARGET) {
511 if (ret == FAIL_UNMOVABLE_TARGET)
512 cc->nr_pageblocks_skipped++;
438 continue; 513 continue;
439 514 }
440 /* 515 /*
441 * Found a block suitable for isolating free pages from. Now 516 * Found a block suitable for isolating free pages from. Now
442 * we disabled interrupts, double check things are ok and 517 * we disabled interrupts, double check things are ok and
@@ -445,12 +520,14 @@ static void isolate_freepages(struct zone *zone,
445 */ 520 */
446 isolated = 0; 521 isolated = 0;
447 spin_lock_irqsave(&zone->lock, flags); 522 spin_lock_irqsave(&zone->lock, flags);
448 if (suitable_migration_target(page)) { 523 ret = suitable_migration_target(page, cc);
524 if (ret == GOOD_AS_MIGRATION_TARGET) {
449 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 525 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
450 isolated = isolate_freepages_block(pfn, end_pfn, 526 isolated = isolate_freepages_block(pfn, end_pfn,
451 freelist, false); 527 freelist, false);
452 nr_freepages += isolated; 528 nr_freepages += isolated;
453 } 529 } else if (ret == FAIL_UNMOVABLE_TARGET)
530 cc->nr_pageblocks_skipped++;
454 spin_unlock_irqrestore(&zone->lock, flags); 531 spin_unlock_irqrestore(&zone->lock, flags);
455 532
456 /* 533 /*
@@ -682,8 +759,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
682 759
683 nr_migrate = cc->nr_migratepages; 760 nr_migrate = cc->nr_migratepages;
684 err = migrate_pages(&cc->migratepages, compaction_alloc, 761 err = migrate_pages(&cc->migratepages, compaction_alloc,
685 (unsigned long)cc, false, 762 (unsigned long)&cc->freepages, false,
686 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 763 (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
764 : MIGRATE_ASYNC);
687 update_nr_listpages(cc); 765 update_nr_listpages(cc);
688 nr_remaining = cc->nr_migratepages; 766 nr_remaining = cc->nr_migratepages;
689 767
@@ -712,7 +790,8 @@ out:
712 790
713static unsigned long compact_zone_order(struct zone *zone, 791static unsigned long compact_zone_order(struct zone *zone,
714 int order, gfp_t gfp_mask, 792 int order, gfp_t gfp_mask,
715 bool sync) 793 enum compact_mode mode,
794 unsigned long *nr_pageblocks_skipped)
716{ 795{
717 struct compact_control cc = { 796 struct compact_control cc = {
718 .nr_freepages = 0, 797 .nr_freepages = 0,
@@ -720,12 +799,17 @@ static unsigned long compact_zone_order(struct zone *zone,
720 .order = order, 799 .order = order,
721 .migratetype = allocflags_to_migratetype(gfp_mask), 800 .migratetype = allocflags_to_migratetype(gfp_mask),
722 .zone = zone, 801 .zone = zone,
723 .sync = sync, 802 .mode = mode,
724 }; 803 };
804 unsigned long rc;
805
725 INIT_LIST_HEAD(&cc.freepages); 806 INIT_LIST_HEAD(&cc.freepages);
726 INIT_LIST_HEAD(&cc.migratepages); 807 INIT_LIST_HEAD(&cc.migratepages);
727 808
728 return compact_zone(zone, &cc); 809 rc = compact_zone(zone, &cc);
810 *nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
811
812 return rc;
729} 813}
730 814
731int sysctl_extfrag_threshold = 500; 815int sysctl_extfrag_threshold = 500;
@@ -750,6 +834,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
750 struct zoneref *z; 834 struct zoneref *z;
751 struct zone *zone; 835 struct zone *zone;
752 int rc = COMPACT_SKIPPED; 836 int rc = COMPACT_SKIPPED;
837 unsigned long nr_pageblocks_skipped;
838 enum compact_mode mode;
753 839
754 /* 840 /*
755 * Check whether it is worth even starting compaction. The order check is 841 * Check whether it is worth even starting compaction. The order check is
@@ -766,12 +852,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
766 nodemask) { 852 nodemask) {
767 int status; 853 int status;
768 854
769 status = compact_zone_order(zone, order, gfp_mask, sync); 855 mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
856retry:
857 status = compact_zone_order(zone, order, gfp_mask, mode,
858 &nr_pageblocks_skipped);
770 rc = max(status, rc); 859 rc = max(status, rc);
771 860
772 /* If a normal allocation would succeed, stop compacting */ 861 /* If a normal allocation would succeed, stop compacting */
773 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 862 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
774 break; 863 break;
864
865 if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
866 if (nr_pageblocks_skipped) {
867 mode = COMPACT_ASYNC_UNMOVABLE;
868 goto retry;
869 }
870 }
775 } 871 }
776 872
777 return rc; 873 return rc;
@@ -805,7 +901,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
805 if (ok && cc->order > zone->compact_order_failed) 901 if (ok && cc->order > zone->compact_order_failed)
806 zone->compact_order_failed = cc->order + 1; 902 zone->compact_order_failed = cc->order + 1;
807 /* Currently async compaction is never deferred. */ 903 /* Currently async compaction is never deferred. */
808 else if (!ok && cc->sync) 904 else if (!ok && cc->mode == COMPACT_SYNC)
809 defer_compaction(zone, cc->order); 905 defer_compaction(zone, cc->order);
810 } 906 }
811 907
@@ -820,7 +916,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
820{ 916{
821 struct compact_control cc = { 917 struct compact_control cc = {
822 .order = order, 918 .order = order,
823 .sync = false, 919 .mode = COMPACT_ASYNC_MOVABLE,
824 }; 920 };
825 921
826 return __compact_pgdat(pgdat, &cc); 922 return __compact_pgdat(pgdat, &cc);
@@ -830,7 +926,7 @@ static int compact_node(int nid)
830{ 926{
831 struct compact_control cc = { 927 struct compact_control cc = {
832 .order = -1, 928 .order = -1,
833 .sync = true, 929 .mode = COMPACT_SYNC,
834 }; 930 };
835 931
836 return __compact_pgdat(NODE_DATA(nid), &cc); 932 return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..64b48f934b89 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/syscalls.h>
33#include <linux/cpuset.h> 32#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 34#include <linux/memcontrol.h>
@@ -1478,44 +1477,6 @@ out:
1478} 1477}
1479EXPORT_SYMBOL(generic_file_aio_read); 1478EXPORT_SYMBOL(generic_file_aio_read);
1480 1479
1481static ssize_t
1482do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr)
1484{
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL;
1487
1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0;
1490}
1491
1492SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493{
1494 ssize_t ret;
1495 struct file *file;
1496
1497 ret = -EBADF;
1498 file = fget(fd);
1499 if (file) {
1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len);
1506 }
1507 fput(file);
1508 }
1509 return ret;
1510}
1511#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513{
1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515}
1516SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517#endif
1518
1519#ifdef CONFIG_MMU 1480#ifdef CONFIG_MMU
1520/** 1481/**
1521 * page_cache_read - adds requested page to the page cache if not already there 1482 * page_cache_read - adds requested page to the page cache if not already there
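
The block removed above is the kernel-side implementation of the readahead(2) system
call, which is taken out of filemap.c here (mm/readahead.c shows 40 changed lines in
the diffstat, consistent with the syscall moving there). From userspace the call is
unchanged; a minimal sketch, with a placeholder file name:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/example.dat", O_RDONLY);	/* hypothetical input file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Ask the kernel to start populating the page cache for the first 1 MiB. */
	if (readahead(fd, 0, 1 << 20) != 0)
		perror("readahead");
	close(fd);
	return 0;
}
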
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..d0def42c121b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
636 unsigned long haddr, pmd_t *pmd, 636 unsigned long haddr, pmd_t *pmd,
637 struct page *page) 637 struct page *page)
638{ 638{
639 int ret = 0;
640 pgtable_t pgtable; 639 pgtable_t pgtable;
641 640
642 VM_BUG_ON(!PageCompound(page)); 641 VM_BUG_ON(!PageCompound(page));
643 pgtable = pte_alloc_one(mm, haddr); 642 pgtable = pte_alloc_one(mm, haddr);
644 if (unlikely(!pgtable)) { 643 if (unlikely(!pgtable))
645 mem_cgroup_uncharge_page(page);
646 put_page(page);
647 return VM_FAULT_OOM; 644 return VM_FAULT_OOM;
648 }
649 645
650 clear_huge_page(page, haddr, HPAGE_PMD_NR); 646 clear_huge_page(page, haddr, HPAGE_PMD_NR);
651 __SetPageUptodate(page); 647 __SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
675 spin_unlock(&mm->page_table_lock); 671 spin_unlock(&mm->page_table_lock);
676 } 672 }
677 673
678 return ret; 674 return 0;
679} 675}
680 676
681static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 677static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
724 put_page(page); 720 put_page(page);
725 goto out; 721 goto out;
726 } 722 }
723 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
724 page))) {
725 mem_cgroup_uncharge_page(page);
726 put_page(page);
727 goto out;
728 }
727 729
728 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); 730 return 0;
729 } 731 }
730out: 732out:
731 /* 733 /*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 count_vm_event(THP_FAULT_FALLBACK); 952 count_vm_event(THP_FAULT_FALLBACK);
951 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 953 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
952 pmd, orig_pmd, page, haddr); 954 pmd, orig_pmd, page, haddr);
955 if (ret & VM_FAULT_OOM)
956 split_huge_page(page);
953 put_page(page); 957 put_page(page);
954 goto out; 958 goto out;
955 } 959 }
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
957 961
958 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 962 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
959 put_page(new_page); 963 put_page(new_page);
964 split_huge_page(page);
960 put_page(page); 965 put_page(page);
961 ret |= VM_FAULT_OOM; 966 ret |= VM_FAULT_OOM;
962 goto out; 967 goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
968 spin_lock(&mm->page_table_lock); 973 spin_lock(&mm->page_table_lock);
969 put_page(page); 974 put_page(page);
970 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 975 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock);
971 mem_cgroup_uncharge_page(new_page); 977 mem_cgroup_uncharge_page(new_page);
972 put_page(new_page); 978 put_page(new_page);
979 goto out;
973 } else { 980 } else {
974 pmd_t entry; 981 pmd_t entry;
975 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e28416c47fb..285a81e87ec8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
273 273
274 /* Locate each segment we overlap with, and count that overlap. */ 274 /* Locate each segment we overlap with, and count that overlap. */
275 list_for_each_entry(rg, head, link) { 275 list_for_each_entry(rg, head, link) {
276 int seg_from; 276 long seg_from;
277 int seg_to; 277 long seg_to;
278 278
279 if (rg->to <= f) 279 if (rg->to <= f)
280 continue; 280 continue;
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2157 kref_get(&reservations->refs); 2157 kref_get(&reservations->refs);
2158} 2158}
2159 2159
2160static void resv_map_put(struct vm_area_struct *vma)
2161{
2162 struct resv_map *reservations = vma_resv_map(vma);
2163
2164 if (!reservations)
2165 return;
2166 kref_put(&reservations->refs, resv_map_release);
2167}
2168
2160static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2169static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2161{ 2170{
2162 struct hstate *h = hstate_vma(vma); 2171 struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2173 reserve = (end - start) - 2182 reserve = (end - start) -
2174 region_count(&reservations->regions, start, end); 2183 region_count(&reservations->regions, start, end);
2175 2184
2176 kref_put(&reservations->refs, resv_map_release); 2185 resv_map_put(vma);
2177 2186
2178 if (reserve) { 2187 if (reserve) {
2179 hugetlb_acct_memory(h, -reserve); 2188 hugetlb_acct_memory(h, -reserve);
@@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2991 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3000 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2992 } 3001 }
2993 3002
2994 if (chg < 0) 3003 if (chg < 0) {
2995 return chg; 3004 ret = chg;
3005 goto out_err;
3006 }
2996 3007
2997 /* There must be enough pages in the subpool for the mapping */ 3008 /* There must be enough pages in the subpool for the mapping */
2998 if (hugepage_subpool_get_pages(spool, chg)) 3009 if (hugepage_subpool_get_pages(spool, chg)) {
2999 return -ENOSPC; 3010 ret = -ENOSPC;
3011 goto out_err;
3012 }
3000 3013
3001 /* 3014 /*
3002 * Check enough hugepages are available for the reservation. 3015 * Check enough hugepages are available for the reservation.
@@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3005 ret = hugetlb_acct_memory(h, chg); 3018 ret = hugetlb_acct_memory(h, chg);
3006 if (ret < 0) { 3019 if (ret < 0) {
3007 hugepage_subpool_put_pages(spool, chg); 3020 hugepage_subpool_put_pages(spool, chg);
3008 return ret; 3021 goto out_err;
3009 } 3022 }
3010 3023
3011 /* 3024 /*
@@ -3022,6 +3035,9 @@ int hugetlb_reserve_pages(struct inode *inode,
3022 if (!vma || vma->vm_flags & VM_MAYSHARE) 3035 if (!vma || vma->vm_flags & VM_MAYSHARE)
3023 region_add(&inode->i_mapping->private_list, from, to); 3036 region_add(&inode->i_mapping->private_list, from, to);
3024 return 0; 3037 return 0;
3038out_err:
3039 resv_map_put(vma);
3040 return ret;
3025} 3041}
3026 3042
3027void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3043void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
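
The hugetlb_reserve_pages() change above converts several early returns into a single
out_err label that drops the reservation map reference via the new resv_map_put()
helper. The same single-exit cleanup shape in a self-contained userspace form; the path
and buffer size below are arbitrary.

#include <stdio.h>
#include <stdlib.h>

static int process_file(const char *path)
{
	FILE *f;
	char *buf = NULL;
	int ret = -1;

	f = fopen(path, "r");			/* resource acquired up front */
	if (!f)
		return -1;

	buf = malloc(4096);
	if (!buf)
		goto out_err;			/* every failure funnels here */

	if (fread(buf, 1, 4096, f) == 0 && ferror(f))
		goto out_err;

	free(buf);
	fclose(f);
	return 0;

out_err:
	free(buf);				/* free(NULL) is a no-op */
	fclose(f);				/* single place to release what was taken */
	return ret;
}

int main(void)
{
	return process_file("/etc/hostname") ? 1 : 0;	/* hypothetical input */
}
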
diff --git a/mm/internal.h b/mm/internal.h
index aee4761cf9a9..4194ab9dc19b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,6 +94,9 @@ extern void putback_lru_page(struct page *page);
94/* 94/*
95 * in mm/page_alloc.c 95 * in mm/page_alloc.c
96 */ 96 */
97extern void set_pageblock_migratetype(struct page *page, int migratetype);
98extern int move_freepages_block(struct zone *zone, struct page *page,
99 int migratetype);
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 100extern void __free_pages_bootmem(struct page *page, unsigned int order);
98extern void prep_compound_page(struct page *page, unsigned long order); 101extern void prep_compound_page(struct page *page, unsigned long order);
99#ifdef CONFIG_MEMORY_FAILURE 102#ifdef CONFIG_MEMORY_FAILURE
@@ -101,6 +104,7 @@ extern bool is_free_buddy_page(struct page *page);
101#endif 104#endif
102 105
103#if defined CONFIG_COMPACTION || defined CONFIG_CMA 106#if defined CONFIG_COMPACTION || defined CONFIG_CMA
107#include <linux/compaction.h>
104 108
105/* 109/*
106 * in mm/compaction.c 110 * in mm/compaction.c
@@ -119,11 +123,14 @@ struct compact_control {
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 123 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 124 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 125 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 126 enum compact_mode mode; /* Compaction mode */
123 127
124 int order; /* order a direct compactor needs */ 128 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 129 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone; 130 struct zone *zone;
131
132 /* Number of UNMOVABLE destination pageblocks skipped during scan */
133 unsigned long nr_pageblocks_skipped;
127}; 134};
128 135
129unsigned long 136unsigned long
@@ -164,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
164 * to determine if it's being mapped into a LOCKED vma. 171 * to determine if it's being mapped into a LOCKED vma.
165 * If so, mark page as mlocked. 172 * If so, mark page as mlocked.
166 */ 173 */
167static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page)
168{ 176{
169 VM_BUG_ON(PageLRU(page)); 177 VM_BUG_ON(PageLRU(page));
170 178
@@ -222,7 +230,7 @@ extern unsigned long vma_address(struct page *page,
222 struct vm_area_struct *vma); 230 struct vm_area_struct *vma);
223#endif 231#endif
224#else /* !CONFIG_MMU */ 232#else /* !CONFIG_MMU */
225static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 233static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
226{ 234{
227 return 0; 235 return 0;
228} 236}
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..deff1b64a08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,10 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h> 12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h>
16 18
17/* 19/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 20 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma,
200 struct vm_area_struct **prev, 202 struct vm_area_struct **prev,
201 unsigned long start, unsigned long end) 203 unsigned long start, unsigned long end)
202{ 204{
203 struct address_space *mapping; 205 loff_t offset;
204 loff_t offset, endoff;
205 int error; 206 int error;
206 207
207 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 208 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
@@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma,
217 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 218 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
218 return -EACCES; 219 return -EACCES;
219 220
220 mapping = vma->vm_file->f_mapping;
221
222 offset = (loff_t)(start - vma->vm_start) 221 offset = (loff_t)(start - vma->vm_start)
223 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 222 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
224 endoff = (loff_t)(end - vma->vm_start - 1)
225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
226 223
227 /* vmtruncate_range needs to take i_mutex */ 224 /* filesystem's fallocate may need to take i_mutex */
228 up_read(&current->mm->mmap_sem); 225 up_read(&current->mm->mmap_sem);
229 error = vmtruncate_range(mapping->host, offset, endoff); 226 error = do_fallocate(vma->vm_file,
227 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
228 offset, end - start);
230 down_read(&current->mm->mmap_sem); 229 down_read(&current->mm->mmap_sem);
231 return error; 230 return error;
232} 231}
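
madvise_remove() now punches a hole through do_fallocate() instead of
vmtruncate_range(), so MADV_REMOVE issues roughly the same request a filesystem already
understands for FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE. The userspace counterpart,
as a small sketch with an arbitrary file name and offsets:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/sparse.dat", O_RDWR);	/* hypothetical file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Punch a 1 MiB hole at offset 4 MiB without changing the file size,
	 * the same request madvise_remove() now issues internally. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) != 0)
		perror("fallocate");
	close(fd);
	return 0;
}
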
diff --git a/mm/memblock.c b/mm/memblock.c
index a44eab3157f8..952123eba433 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
37 37
38int memblock_debug __initdata_memblock; 38int memblock_debug __initdata_memblock;
39static int memblock_can_resize __initdata_memblock; 39static int memblock_can_resize __initdata_memblock;
40static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0;
40 42
41/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
42static inline const char *memblock_type_name(struct memblock_type *type) 44static inline const char *memblock_type_name(struct memblock_type *type)
@@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
187 struct memblock_region *new_array, *old_array; 189 struct memblock_region *new_array, *old_array;
188 phys_addr_t old_size, new_size, addr; 190 phys_addr_t old_size, new_size, addr;
189 int use_slab = slab_is_available(); 191 int use_slab = slab_is_available();
192 int *in_slab;
190 193
191 /* We don't allow resizing until we know about the reserved regions 194 /* We don't allow resizing until we know about the reserved regions
192 * of memory that aren't suitable for allocation 195 * of memory that aren't suitable for allocation
@@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
198 old_size = type->max * sizeof(struct memblock_region); 201 old_size = type->max * sizeof(struct memblock_region);
199 new_size = old_size << 1; 202 new_size = old_size << 1;
200 203
204 /* Retrieve the slab flag */
205 if (type == &memblock.memory)
206 in_slab = &memblock_memory_in_slab;
207 else
208 in_slab = &memblock_reserved_in_slab;
209
201 /* Try to find some space for it. 210 /* Try to find some space for it.
202 * 211 *
203 * WARNING: We assume that either slab_is_available() and we use it or 212 * WARNING: We assume that either slab_is_available() and we use it or
@@ -212,14 +221,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
212 if (use_slab) { 221 if (use_slab) {
213 new_array = kmalloc(new_size, GFP_KERNEL); 222 new_array = kmalloc(new_size, GFP_KERNEL);
214 addr = new_array ? __pa(new_array) : 0; 223 addr = new_array ? __pa(new_array) : 0;
215 } else 224 } else {
216 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 225 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
226 new_array = addr ? __va(addr) : 0;
227 }
217 if (!addr) { 228 if (!addr) {
218 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 229 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
219 memblock_type_name(type), type->max, type->max * 2); 230 memblock_type_name(type), type->max, type->max * 2);
220 return -1; 231 return -1;
221 } 232 }
222 new_array = __va(addr);
223 233
224 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 234 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
225 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 235 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
@@ -234,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
234 type->regions = new_array; 244 type->regions = new_array;
235 type->max <<= 1; 245 type->max <<= 1;
236 246
237 /* If we use SLAB that's it, we are done */ 247 /* Free old array. We needn't free it if the array is the
238 if (use_slab) 248 * static one
239 return 0;
240
241 /* Add the new reserved region now. Should not fail ! */
242 BUG_ON(memblock_reserve(addr, new_size));
243
244 /* If the array wasn't our static init one, then free it. We only do
245 * that before SLAB is available as later on, we don't know whether
246 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
247 * anyways
248 */ 249 */
249 if (old_array != memblock_memory_init_regions && 250 if (*in_slab)
250 old_array != memblock_reserved_init_regions) 251 kfree(old_array);
252 else if (old_array != memblock_memory_init_regions &&
253 old_array != memblock_reserved_init_regions)
251 memblock_free(__pa(old_array), old_size); 254 memblock_free(__pa(old_array), old_size);
252 255
256 /* Reserve the new array if that comes from the memblock.
257 * Otherwise, we needn't do it
258 */
259 if (!use_slab)
260 BUG_ON(memblock_reserve(addr, new_size));
261
262 /* Update slab flag */
263 *in_slab = use_slab;
264
253 return 0; 265 return 0;
254} 266}
255 267
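
memblock_double_array() now remembers, per array, whether the current backing store
came from the slab allocator so the old copy can be released with the matching free
routine, and the very first, statically allocated array is never freed. A userspace
sketch of the same bookkeeping, with a static initial array standing in for
memblock_memory_init_regions:

#include <stdlib.h>
#include <string.h>

static long init_regions[4];		/* plays the role of the static init array */
static long *regions = init_regions;
static size_t max_regions = 4;
static int regions_on_heap;		/* analogous to memblock_*_in_slab */

static int double_array(void)
{
	long *new_array = malloc(2 * max_regions * sizeof(*regions));

	if (!new_array)
		return -1;

	memcpy(new_array, regions, max_regions * sizeof(*regions));
	if (regions_on_heap)
		free(regions);		/* safe: the old copy came from malloc() */
	regions = new_array;
	max_regions *= 2;
	regions_on_heap = 1;		/* from now on the array lives on the heap */
	return 0;
}

int main(void)
{
	return double_array() ? 1 : 0;
}
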
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f342778a0c0a..00c8898dbb81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -138,7 +138,6 @@ struct mem_cgroup_per_zone {
138 138
139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
140 140
141 struct zone_reclaim_stat reclaim_stat;
142 struct rb_node tree_node; /* RB tree node */ 141 struct rb_node tree_node; /* RB tree node */
143 unsigned long long usage_in_excess;/* Set to the value by which */ 142 unsigned long long usage_in_excess;/* Set to the value by which */
144 /* the soft limit is exceeded*/ 143 /* the soft limit is exceeded*/
@@ -1149,15 +1148,25 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1149 * Checks whether given mem is same or in the root_mem_cgroup's 1148 * Checks whether given mem is same or in the root_mem_cgroup's
1150 * hierarchy subtree 1149 * hierarchy subtree
1151 */ 1150 */
1151bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1152 struct mem_cgroup *memcg)
1153{
1154 if (root_memcg == memcg)
1155 return true;
1156 if (!root_memcg->use_hierarchy)
1157 return false;
1158 return css_is_ancestor(&memcg->css, &root_memcg->css);
1159}
1160
1152static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1161static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1153 struct mem_cgroup *memcg) 1162 struct mem_cgroup *memcg)
1154{ 1163{
1155 if (root_memcg != memcg) { 1164 bool ret;
1156 return (root_memcg->use_hierarchy &&
1157 css_is_ancestor(&memcg->css, &root_memcg->css));
1158 }
1159 1165
1160 return true; 1166 rcu_read_lock();
1167 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1168 rcu_read_unlock();
1169 return ret;
1161} 1170}
1162 1171
1163int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1172int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
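
The hunk above splits the hierarchy check into a lockless
__mem_cgroup_same_or_subtree(), for callers already inside an RCU read-side section,
and a wrapper that takes rcu_read_lock() around it. A minimal sketch of that
double-underscore convention with a made-up structure; the __rcu parent pointer is an
assumption of the example, not something taken from this patch.

#include <linux/rcupdate.h>
#include <linux/types.h>

struct node {
	struct node __rcu *parent;	/* published/updated under RCU */
};

/* Lockless variant: the caller must already hold the RCU read lock. */
static bool __node_is_descendant(struct node *root, struct node *n)
{
	for (; n; n = rcu_dereference(n->parent))
		if (n == root)
			return true;
	return false;
}

/* Convenience wrapper that supplies the read-side critical section. */
static bool node_is_descendant(struct node *root, struct node *n)
{
	bool ret;

	rcu_read_lock();
	ret = __node_is_descendant(root, n);
	rcu_read_unlock();
	return ret;
}
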
@@ -1233,16 +1242,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1233 return (active > inactive); 1242 return (active > inactive);
1234} 1243}
1235 1244
1236struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1237 struct zone *zone)
1238{
1239 int nid = zone_to_nid(zone);
1240 int zid = zone_idx(zone);
1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1242
1243 return &mz->reclaim_stat;
1244}
1245
1246struct zone_reclaim_stat * 1245struct zone_reclaim_stat *
1247mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1246mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1248{ 1247{
@@ -1258,7 +1257,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1257 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1259 smp_rmb(); 1258 smp_rmb();
1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1259 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1261 return &mz->reclaim_stat; 1260 return &mz->lruvec.reclaim_stat;
1262} 1261}
1263 1262
1264#define mem_cgroup_from_res_counter(counter, member) \ 1263#define mem_cgroup_from_res_counter(counter, member) \
@@ -2845,24 +2844,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2845 */ 2844 */
2846 if (do_swap_account && PageSwapCache(page)) { 2845 if (do_swap_account && PageSwapCache(page)) {
2847 swp_entry_t ent = {.val = page_private(page)}; 2846 swp_entry_t ent = {.val = page_private(page)};
2848 struct mem_cgroup *swap_memcg; 2847 mem_cgroup_uncharge_swap(ent);
2849 unsigned short id;
2850
2851 id = swap_cgroup_record(ent, 0);
2852 rcu_read_lock();
2853 swap_memcg = mem_cgroup_lookup(id);
2854 if (swap_memcg) {
2855 /*
2856 * This recorded memcg can be obsolete one. So, avoid
2857 * calling css_tryget
2858 */
2859 if (!mem_cgroup_is_root(swap_memcg))
2860 res_counter_uncharge(&swap_memcg->memsw,
2861 PAGE_SIZE);
2862 mem_cgroup_swap_statistics(swap_memcg, false);
2863 mem_cgroup_put(swap_memcg);
2864 }
2865 rcu_read_unlock();
2866 } 2848 }
2867 /* 2849 /*
2868 * At swapin, we may charge account against cgroup which has no tasks. 2850 * At swapin, we may charge account against cgroup which has no tasks.
@@ -3155,7 +3137,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3155 * @entry: swap entry to be moved 3137 * @entry: swap entry to be moved
3156 * @from: mem_cgroup which the entry is moved from 3138 * @from: mem_cgroup which the entry is moved from
3157 * @to: mem_cgroup which the entry is moved to 3139 * @to: mem_cgroup which the entry is moved to
3158 * @need_fixup: whether we should fixup res_counters and refcounts.
3159 * 3140 *
3160 * It succeeds only when the swap_cgroup's record for this entry is the same 3141 * It succeeds only when the swap_cgroup's record for this entry is the same
3161 * as the mem_cgroup's id of @from. 3142 * as the mem_cgroup's id of @from.
@@ -3166,7 +3147,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3166 * both res and memsw, and called css_get(). 3147 * both res and memsw, and called css_get().
3167 */ 3148 */
3168static int mem_cgroup_move_swap_account(swp_entry_t entry, 3149static int mem_cgroup_move_swap_account(swp_entry_t entry,
3169 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3150 struct mem_cgroup *from, struct mem_cgroup *to)
3170{ 3151{
3171 unsigned short old_id, new_id; 3152 unsigned short old_id, new_id;
3172 3153
@@ -3185,24 +3166,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3185 * swap-in, the refcount of @to might be decreased to 0. 3166 * swap-in, the refcount of @to might be decreased to 0.
3186 */ 3167 */
3187 mem_cgroup_get(to); 3168 mem_cgroup_get(to);
3188 if (need_fixup) {
3189 if (!mem_cgroup_is_root(from))
3190 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3191 mem_cgroup_put(from);
3192 /*
3193 * we charged both to->res and to->memsw, so we should
3194 * uncharge to->res.
3195 */
3196 if (!mem_cgroup_is_root(to))
3197 res_counter_uncharge(&to->res, PAGE_SIZE);
3198 }
3199 return 0; 3169 return 0;
3200 } 3170 }
3201 return -EINVAL; 3171 return -EINVAL;
3202} 3172}
3203#else 3173#else
3204static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3174static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3205 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3175 struct mem_cgroup *from, struct mem_cgroup *to)
3206{ 3176{
3207 return -EINVAL; 3177 return -EINVAL;
3208} 3178}
@@ -3363,7 +3333,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3363void mem_cgroup_replace_page_cache(struct page *oldpage, 3333void mem_cgroup_replace_page_cache(struct page *oldpage,
3364 struct page *newpage) 3334 struct page *newpage)
3365{ 3335{
3366 struct mem_cgroup *memcg; 3336 struct mem_cgroup *memcg = NULL;
3367 struct page_cgroup *pc; 3337 struct page_cgroup *pc;
3368 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3338 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3369 3339
@@ -3373,11 +3343,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3373 pc = lookup_page_cgroup(oldpage); 3343 pc = lookup_page_cgroup(oldpage);
3374 /* fix accounting on old pages */ 3344 /* fix accounting on old pages */
3375 lock_page_cgroup(pc); 3345 lock_page_cgroup(pc);
3376 memcg = pc->mem_cgroup; 3346 if (PageCgroupUsed(pc)) {
3377 mem_cgroup_charge_statistics(memcg, false, -1); 3347 memcg = pc->mem_cgroup;
3378 ClearPageCgroupUsed(pc); 3348 mem_cgroup_charge_statistics(memcg, false, -1);
3349 ClearPageCgroupUsed(pc);
3350 }
3379 unlock_page_cgroup(pc); 3351 unlock_page_cgroup(pc);
3380 3352
3353 /*
3354 * When called from shmem_replace_page(), in some cases the
3355 * oldpage has already been charged, and in some cases not.
3356 */
3357 if (!memcg)
3358 return;
3359
3381 if (PageSwapBacked(oldpage)) 3360 if (PageSwapBacked(oldpage))
3382 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3361 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3383 3362
@@ -4226,21 +4205,19 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4226 { 4205 {
4227 int nid, zid; 4206 int nid, zid;
4228 struct mem_cgroup_per_zone *mz; 4207 struct mem_cgroup_per_zone *mz;
4208 struct zone_reclaim_stat *rstat;
4229 unsigned long recent_rotated[2] = {0, 0}; 4209 unsigned long recent_rotated[2] = {0, 0};
4230 unsigned long recent_scanned[2] = {0, 0}; 4210 unsigned long recent_scanned[2] = {0, 0};
4231 4211
4232 for_each_online_node(nid) 4212 for_each_online_node(nid)
4233 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4213 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4234 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4214 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4215 rstat = &mz->lruvec.reclaim_stat;
4235 4216
4236 recent_rotated[0] += 4217 recent_rotated[0] += rstat->recent_rotated[0];
4237 mz->reclaim_stat.recent_rotated[0]; 4218 recent_rotated[1] += rstat->recent_rotated[1];
4238 recent_rotated[1] += 4219 recent_scanned[0] += rstat->recent_scanned[0];
4239 mz->reclaim_stat.recent_rotated[1]; 4220 recent_scanned[1] += rstat->recent_scanned[1];
4240 recent_scanned[0] +=
4241 mz->reclaim_stat.recent_scanned[0];
4242 recent_scanned[1] +=
4243 mz->reclaim_stat.recent_scanned[1];
4244 } 4221 }
4245 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4222 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4246 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4223 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
@@ -5135,7 +5112,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5135 return NULL; 5112 return NULL;
5136 if (PageAnon(page)) { 5113 if (PageAnon(page)) {
5137 /* we don't move shared anon */ 5114 /* we don't move shared anon */
5138 if (!move_anon() || page_mapcount(page) > 2) 5115 if (!move_anon())
5139 return NULL; 5116 return NULL;
5140 } else if (!move_file()) 5117 } else if (!move_file())
5141 /* we ignore mapcount for file pages */ 5118 /* we ignore mapcount for file pages */
@@ -5146,26 +5123,32 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5146 return page; 5123 return page;
5147} 5124}
5148 5125
5126#ifdef CONFIG_SWAP
5149static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5127static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5150 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5128 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5151{ 5129{
5152 int usage_count;
5153 struct page *page = NULL; 5130 struct page *page = NULL;
5154 swp_entry_t ent = pte_to_swp_entry(ptent); 5131 swp_entry_t ent = pte_to_swp_entry(ptent);
5155 5132
5156 if (!move_anon() || non_swap_entry(ent)) 5133 if (!move_anon() || non_swap_entry(ent))
5157 return NULL; 5134 return NULL;
5158 usage_count = mem_cgroup_count_swap_user(ent, &page); 5135 /*
5159 if (usage_count > 1) { /* we don't move shared anon */ 5136 * Because lookup_swap_cache() updates some statistics counter,
5160 if (page) 5137 * we call find_get_page() with swapper_space directly.
5161 put_page(page); 5138 */
5162 return NULL; 5139 page = find_get_page(&swapper_space, ent.val);
5163 }
5164 if (do_swap_account) 5140 if (do_swap_account)
5165 entry->val = ent.val; 5141 entry->val = ent.val;
5166 5142
5167 return page; 5143 return page;
5168} 5144}
5145#else
5146static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5147 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5148{
5149 return NULL;
5150}
5151#endif
5169 5152
5170static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5153static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5171 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5154 unsigned long addr, pte_t ptent, swp_entry_t *entry)
@@ -5521,8 +5504,7 @@ put: /* get_mctgt_type() gets the page */
5521 break; 5504 break;
5522 case MC_TARGET_SWAP: 5505 case MC_TARGET_SWAP:
5523 ent = target.ent; 5506 ent = target.ent;
5524 if (!mem_cgroup_move_swap_account(ent, 5507 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5525 mc.from, mc.to, false)) {
5526 mc.precharge--; 5508 mc.precharge--;
5527 /* we fixup refcnts and charges later. */ 5509 /* we fixup refcnts and charges later. */
5528 mc.moved_swap++; 5510 mc.moved_swap++;
@@ -5598,7 +5580,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5598 if (mm) { 5580 if (mm) {
5599 if (mc.to) 5581 if (mc.to)
5600 mem_cgroup_move_charge(mm); 5582 mem_cgroup_move_charge(mm);
5601 put_swap_token(mm);
5602 mmput(mm); 5583 mmput(mm);
5603 } 5584 }
5604 if (mc.to) 5585 if (mc.to)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c99ad4e6b88c..ab1e7145e290 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,16 +1388,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1388 */ 1388 */
1389 if (!get_page_unless_zero(compound_head(p))) { 1389 if (!get_page_unless_zero(compound_head(p))) {
1390 if (PageHuge(p)) { 1390 if (PageHuge(p)) {
1391 pr_info("get_any_page: %#lx free huge page\n", pfn); 1391 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1393 } else if (is_free_buddy_page(p)) { 1393 } else if (is_free_buddy_page(p)) {
1394 pr_info("get_any_page: %#lx free buddy page\n", pfn); 1394 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1395 /* Set hwpoison bit while page is still isolated */ 1395 /* Set hwpoison bit while page is still isolated */
1396 SetPageHWPoison(p); 1396 SetPageHWPoison(p);
1397 ret = 0; 1397 ret = 0;
1398 } else { 1398 } else {
1399 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1399 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1400 pfn, p->flags); 1400 __func__, pfn, p->flags);
1401 ret = -EIO; 1401 ret = -EIO;
1402 } 1402 }
1403 } else { 1403 } else {
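The memory-failure.c hunk above replaces hard-coded function names in the pr_info() messages with the compiler-provided __func__ identifier, so the log prefix can never drift out of sync with the function name. A minimal userspace model of the same idiom, with printf() standing in for pr_info() (which only exists in the kernel):

#include <stdio.h>

static int get_any_page_demo(unsigned long pfn)
{
        /* __func__ expands to "get_any_page_demo"; renaming the function
         * automatically updates every message that prints it. */
        printf("%s: %#lx free buddy page\n", __func__, pfn);
        return 0;
}

int main(void)
{
        return get_any_page_demo(0x12345UL);
}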
diff --git a/mm/memory.c b/mm/memory.c
index e40f6759ba98..1b7dc662bf9f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2908 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2908 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2909 page = lookup_swap_cache(entry); 2909 page = lookup_swap_cache(entry);
2910 if (!page) { 2910 if (!page) {
2911 grab_swap_token(mm); /* Contend for token _before_ read-in */
2912 page = swapin_readahead(entry, 2911 page = swapin_readahead(entry,
2913 GFP_HIGHUSER_MOVABLE, vma, address); 2912 GFP_HIGHUSER_MOVABLE, vma, address);
2914 if (!page) { 2913 if (!page) {
@@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2938 } 2937 }
2939 2938
2940 locked = lock_page_or_retry(page, mm, flags); 2939 locked = lock_page_or_retry(page, mm, flags);
2940
2941 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2941 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2942 if (!locked) { 2942 if (!locked) {
2943 ret |= VM_FAULT_RETRY; 2943 ret |= VM_FAULT_RETRY;
@@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3486 if (unlikely(is_vm_hugetlb_page(vma))) 3486 if (unlikely(is_vm_hugetlb_page(vma)))
3487 return hugetlb_fault(mm, vma, address, flags); 3487 return hugetlb_fault(mm, vma, address, flags);
3488 3488
3489retry:
3489 pgd = pgd_offset(mm, address); 3490 pgd = pgd_offset(mm, address);
3490 pud = pud_alloc(mm, pgd, address); 3491 pud = pud_alloc(mm, pgd, address);
3491 if (!pud) 3492 if (!pud)
@@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3499 pmd, flags); 3500 pmd, flags);
3500 } else { 3501 } else {
3501 pmd_t orig_pmd = *pmd; 3502 pmd_t orig_pmd = *pmd;
3503 int ret;
3504
3502 barrier(); 3505 barrier();
3503 if (pmd_trans_huge(orig_pmd)) { 3506 if (pmd_trans_huge(orig_pmd)) {
3504 if (flags & FAULT_FLAG_WRITE && 3507 if (flags & FAULT_FLAG_WRITE &&
3505 !pmd_write(orig_pmd) && 3508 !pmd_write(orig_pmd) &&
3506 !pmd_trans_splitting(orig_pmd)) 3509 !pmd_trans_splitting(orig_pmd)) {
3507 return do_huge_pmd_wp_page(mm, vma, address, 3510 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3508 pmd, orig_pmd); 3511 orig_pmd);
3512 /*
3513 * If COW results in an oom, the huge pmd will
3514 * have been split, so retry the fault on the
3515 * pte for a smaller charge.
3516 */
3517 if (unlikely(ret & VM_FAULT_OOM))
3518 goto retry;
3519 return ret;
3520 }
3509 return 0; 3521 return 0;
3510 } 3522 }
3511 } 3523 }
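The handle_mm_fault() change above adds a retry label so that a transparent-huge-page copy-on-write which fails with VM_FAULT_OOM is retried against the now-split ptes instead of being reported to the caller. A rough, self-contained model of that control flow; the names and return codes below are stand-ins, not the kernel's:

#include <stdio.h>

#define FAULT_OK        0
#define FAULT_OOM       1

/* Pretend the 2MB charge cannot be met: split the mapping, report OOM. */
static int huge_cow(int *split)
{
        *split = 1;
        return FAULT_OOM;
}

/* After the split, the per-page path only needs a 4KB charge. */
static int small_cow(void)
{
        return FAULT_OK;
}

static int handle_fault(void)
{
        int split = 0;
retry:
        if (!split) {
                if (huge_cow(&split) == FAULT_OOM)
                        goto retry;     /* pmd was split: retry at pte level */
                return FAULT_OK;
        }
        return small_cow();
}

int main(void)
{
        printf("fault result: %d\n", handle_fault());
        return 0;
}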
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc898cb4fe8f..0d7e3ec8e0f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
74 res->end = start + size - 1; 74 res->end = start + size - 1;
75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76 if (request_resource(&iomem_resource, res) < 0) { 76 if (request_resource(&iomem_resource, res) < 0) {
77 printk("System RAM resource %llx - %llx cannot be added\n", 77 printk("System RAM resource %pR cannot be added\n", res);
78 (unsigned long long)res->start, (unsigned long long)res->end);
79 kfree(res); 78 kfree(res);
80 res = NULL; 79 res = NULL;
81 } 80 }
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
502 online_pages_range); 501 online_pages_range);
503 if (ret) { 502 if (ret) {
504 mutex_unlock(&zonelists_mutex); 503 mutex_unlock(&zonelists_mutex);
505 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 504 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
506 nr_pages, pfn); 505 (unsigned long long) pfn << PAGE_SHIFT,
506 (((unsigned long long) pfn + nr_pages)
507 << PAGE_SHIFT) - 1);
507 memory_notify(MEM_CANCEL_ONLINE, &arg); 508 memory_notify(MEM_CANCEL_ONLINE, &arg);
508 unlock_memory_hotplug(); 509 unlock_memory_hotplug();
509 return ret; 510 return ret;
@@ -977,8 +978,9 @@ repeat:
977 return 0; 978 return 0;
978 979
979failed_removal: 980failed_removal:
980 printk(KERN_INFO "memory offlining %lx to %lx failed\n", 981 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
981 start_pfn, end_pfn); 982 (unsigned long long) start_pfn << PAGE_SHIFT,
983 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
982 memory_notify(MEM_CANCEL_OFFLINE, &arg); 984 memory_notify(MEM_CANCEL_OFFLINE, &arg);
983 /* pushback to free area */ 985 /* pushback to free area */
984 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 986 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
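Both memory_hotplug.c messages above switch from printing raw pfn values to the kernel's usual "[mem %#010llx-%#010llx]" physical-address notation: the start is pfn << PAGE_SHIFT and the end is the last byte of the range, hence the trailing "- 1". A standalone program showing the same conversion; the 4KB PAGE_SHIFT is an assumption for the demo:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4KB pages assumed */

static void print_range(unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long long start = (unsigned long long)start_pfn << PAGE_SHIFT;
        unsigned long long end = ((unsigned long long)(start_pfn + nr_pages)
                                  << PAGE_SHIFT) - 1;

        printf("[mem %#010llx-%#010llx]\n", start, end);
}

int main(void)
{
        /* pfn 0x100 for 256 pages -> [mem 0x00100000-0x001fffff] */
        print_range(0x100, 256);
        return 0;
}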
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 88f9422b92e7..f15c1b24ca18 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
390{ 390{
391 if (!pol) 391 if (!pol)
392 return; 392 return;
393 if (!mpol_store_user_nodemask(pol) && step == 0 && 393 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 return; 395 return;
396 396
@@ -950,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
950 * 950 *
951 * Returns the number of page that could not be moved. 951 * Returns the number of page that could not be moved.
952 */ 952 */
953int do_migrate_pages(struct mm_struct *mm, 953int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
954 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 954 const nodemask_t *to, int flags)
955{ 955{
956 int busy = 0; 956 int busy = 0;
957 int err; 957 int err;
@@ -963,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
963 963
964 down_read(&mm->mmap_sem); 964 down_read(&mm->mmap_sem);
965 965
966 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 966 err = migrate_vmas(mm, from, to, flags);
967 if (err) 967 if (err)
968 goto out; 968 goto out;
969 969
@@ -998,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
998 * moved to an empty node, then there is nothing left worth migrating. 998 * moved to an empty node, then there is nothing left worth migrating.
999 */ 999 */
1000 1000
1001 tmp = *from_nodes; 1001 tmp = *from;
1002 while (!nodes_empty(tmp)) { 1002 while (!nodes_empty(tmp)) {
1003 int s,d; 1003 int s,d;
1004 int source = -1; 1004 int source = -1;
1005 int dest = 0; 1005 int dest = 0;
1006 1006
1007 for_each_node_mask(s, tmp) { 1007 for_each_node_mask(s, tmp) {
1008 d = node_remap(s, *from_nodes, *to_nodes); 1008
1009 /*
1010 * do_migrate_pages() tries to maintain the relative
1011 * node relationship of the pages established between
1012 * threads and memory areas.
1013 *
1014 * However if the number of source nodes is not equal to
1015 * the number of destination nodes we can not preserve
1016 * this node relative relationship. In that case, skip
1017 * copying memory from a node that is in the destination
1018 * mask.
1019 *
1020 * Example: [2,3,4] -> [3,4,5] moves everything.
1021 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1022 */
1023
1024 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1025 (node_isset(s, *to)))
1026 continue;
1027
1028 d = node_remap(s, *from, *to);
1009 if (s == d) 1029 if (s == d)
1010 continue; 1030 continue;
1011 1031
@@ -1065,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
1065{ 1085{
1066} 1086}
1067 1087
1068int do_migrate_pages(struct mm_struct *mm, 1088int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1069 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 1089 const nodemask_t *to, int flags)
1070{ 1090{
1071 return -ENOSYS; 1091 return -ENOSYS;
1072} 1092}
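The new skip test in do_migrate_pages() above only fires when the source and destination masks have different weights; the comment's own example ([0-7] -> [3,4,5]) is easy to check with plain bit arithmetic. A hedged stand-alone version using unsigned long bitmasks in place of nodemask_t, with __builtin_popcountl() standing in for nodes_weight():

#include <stdio.h>

int main(void)
{
        unsigned long from = 0xffUL;                            /* nodes 0-7 */
        unsigned long to = (1UL << 3) | (1UL << 4) | (1UL << 5); /* 3, 4, 5  */
        int s;

        for (s = 0; s < 8; s++) {
                if (!(from & (1UL << s)))
                        continue;
                /* masks differ in weight and s is already a destination:
                 * skip it, exactly as the new check does */
                if (__builtin_popcountl(from) != __builtin_popcountl(to) &&
                    (to & (1UL << s)))
                        continue;
                printf("would migrate from node %d\n", s);
        }
        return 0;       /* prints nodes 0, 1, 2, 6 and 7 */
}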
diff --git a/mm/mmap.c b/mm/mmap.c
index e8dcfc7de866..4a9c2a391e28 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1639,33 +1639,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1639{ 1639{
1640 struct vm_area_struct *vma = NULL; 1640 struct vm_area_struct *vma = NULL;
1641 1641
1642 if (mm) { 1642 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1643 /* Check the cache first. */ 1643 return NULL;
1644 /* (Cache hit rate is typically around 35%.) */ 1644
1645 vma = mm->mmap_cache; 1645 /* Check the cache first. */
1646 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1646 /* (Cache hit rate is typically around 35%.) */
1647 struct rb_node * rb_node; 1647 vma = mm->mmap_cache;
1648 1648 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1649 rb_node = mm->mm_rb.rb_node; 1649 struct rb_node *rb_node;
1650 vma = NULL; 1650
1651 1651 rb_node = mm->mm_rb.rb_node;
1652 while (rb_node) { 1652 vma = NULL;
1653 struct vm_area_struct * vma_tmp; 1653
1654 1654 while (rb_node) {
1655 vma_tmp = rb_entry(rb_node, 1655 struct vm_area_struct *vma_tmp;
1656 struct vm_area_struct, vm_rb); 1656
1657 1657 vma_tmp = rb_entry(rb_node,
1658 if (vma_tmp->vm_end > addr) { 1658 struct vm_area_struct, vm_rb);
1659 vma = vma_tmp; 1659
1660 if (vma_tmp->vm_start <= addr) 1660 if (vma_tmp->vm_end > addr) {
1661 break; 1661 vma = vma_tmp;
1662 rb_node = rb_node->rb_left; 1662 if (vma_tmp->vm_start <= addr)
1663 } else 1663 break;
1664 rb_node = rb_node->rb_right; 1664 rb_node = rb_node->rb_left;
1665 } 1665 } else
1666 if (vma) 1666 rb_node = rb_node->rb_right;
1667 mm->mmap_cache = vma;
1668 } 1667 }
1668 if (vma)
1669 mm->mmap_cache = vma;
1669 } 1670 }
1670 return vma; 1671 return vma;
1671} 1672}
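The reindented find_vma() above still implements the same lookup: return the first VMA whose vm_end lies above the address, remember it in a one-slot cache, and leave it to the caller to check vm_start. A simplified model over a sorted array; the rbtree walk is replaced by a linear scan purely for brevity, and the names are illustrative:

#include <stdio.h>
#include <stddef.h>

struct vma { unsigned long start, end; };       /* [start, end) */

static const struct vma vmas[] = {
        { 0x1000, 0x2000 }, { 0x4000, 0x6000 }, { 0x9000, 0xa000 },
};
static const struct vma *cache;                 /* mmap_cache analogue */

static const struct vma *find_vma_demo(unsigned long addr)
{
        size_t i;

        if (cache && cache->end > addr && cache->start <= addr)
                return cache;                   /* cache hit */

        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
                if (vmas[i].end > addr) {       /* first vma ending above addr */
                        cache = &vmas[i];
                        return cache;           /* may still not contain addr */
                }
        }
        return NULL;
}

int main(void)
{
        const struct vma *v = find_vma_demo(0x3000);

        if (v)  /* returns [0x4000, 0x6000): the nearest vma above the gap */
                printf("found [%#lx, %#lx)\n", v->start, v->end);
        return 0;
}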
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 1983fb1c7026..d23415c001bc 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -274,86 +274,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
274 return ___alloc_bootmem(size, align, goal, limit); 274 return ___alloc_bootmem(size, align, goal, limit);
275} 275}
276 276
277/** 277static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
278 * __alloc_bootmem_node - allocate boot memory from a specific node 278 unsigned long size,
279 * @pgdat: node to allocate from 279 unsigned long align,
280 * @size: size of the request in bytes 280 unsigned long goal,
281 * @align: alignment of the region 281 unsigned long limit)
282 * @goal: preferred starting address of the region
283 *
284 * The goal is dropped if it can not be satisfied and the allocation will
285 * fall back to memory below @goal.
286 *
287 * Allocation may fall back to any node in the system if the specified node
288 * can not hold the requested memory.
289 *
290 * The function panics if the request can not be satisfied.
291 */
292void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
293 unsigned long align, unsigned long goal)
294{ 282{
295 void *ptr; 283 void *ptr;
296 284
297 if (WARN_ON_ONCE(slab_is_available()))
298 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
299
300again: 285again:
301 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 286 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
302 goal, -1ULL); 287 goal, limit);
303 if (ptr) 288 if (ptr)
304 return ptr; 289 return ptr;
305 290
306 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 291 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
307 goal, -1ULL); 292 goal, limit);
308 if (!ptr && goal) { 293 if (ptr)
294 return ptr;
295
296 if (goal) {
309 goal = 0; 297 goal = 0;
310 goto again; 298 goto again;
311 } 299 }
312 return ptr; 300
301 return NULL;
313} 302}
314 303
315void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 304void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
316 unsigned long align, unsigned long goal) 305 unsigned long align, unsigned long goal)
317{ 306{
318 return __alloc_bootmem_node(pgdat, size, align, goal); 307 if (WARN_ON_ONCE(slab_is_available()))
308 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
309
310 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
319} 311}
320 312
321#ifdef CONFIG_SPARSEMEM 313void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
322/** 314 unsigned long align, unsigned long goal,
323 * alloc_bootmem_section - allocate boot memory from a specific section 315 unsigned long limit)
324 * @size: size of the request in bytes
325 * @section_nr: sparse map section to allocate from
326 *
327 * Return NULL on failure.
328 */
329void * __init alloc_bootmem_section(unsigned long size,
330 unsigned long section_nr)
331{ 316{
332 unsigned long pfn, goal, limit; 317 void *ptr;
333 318
334 pfn = section_nr_to_pfn(section_nr); 319 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
335 goal = pfn << PAGE_SHIFT; 320 if (ptr)
336 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; 321 return ptr;
337 322
338 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, 323 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
339 SMP_CACHE_BYTES, goal, limit); 324 panic("Out of memory");
325 return NULL;
340} 326}
341#endif
342 327
343void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, 328/**
329 * __alloc_bootmem_node - allocate boot memory from a specific node
330 * @pgdat: node to allocate from
331 * @size: size of the request in bytes
332 * @align: alignment of the region
333 * @goal: preferred starting address of the region
334 *
335 * The goal is dropped if it can not be satisfied and the allocation will
336 * fall back to memory below @goal.
337 *
338 * Allocation may fall back to any node in the system if the specified node
339 * can not hold the requested memory.
340 *
341 * The function panics if the request can not be satisfied.
342 */
343void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
344 unsigned long align, unsigned long goal) 344 unsigned long align, unsigned long goal)
345{ 345{
346 void *ptr;
347
348 if (WARN_ON_ONCE(slab_is_available())) 346 if (WARN_ON_ONCE(slab_is_available()))
349 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 347 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
350 348
351 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 349 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
352 goal, -1ULL); 350}
353 if (ptr)
354 return ptr;
355 351
356 return __alloc_bootmem_nopanic(size, align, goal); 352void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
353 unsigned long align, unsigned long goal)
354{
355 return __alloc_bootmem_node(pgdat, size, align, goal);
357} 356}
358 357
359#ifndef ARCH_LOW_ADDRESS_LIMIT 358#ifndef ARCH_LOW_ADDRESS_LIMIT
@@ -397,16 +396,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
397void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 396void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
398 unsigned long align, unsigned long goal) 397 unsigned long align, unsigned long goal)
399{ 398{
400 void *ptr;
401
402 if (WARN_ON_ONCE(slab_is_available())) 399 if (WARN_ON_ONCE(slab_is_available()))
403 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 400 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
404 401
405 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 402 return ___alloc_bootmem_node(pgdat, size, align, goal,
406 goal, ARCH_LOW_ADDRESS_LIMIT); 403 ARCH_LOW_ADDRESS_LIMIT);
407 if (ptr)
408 return ptr;
409
410 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
411 goal, ARCH_LOW_ADDRESS_LIMIT);
412} 404}
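The nobootmem.c rework above folds the duplicated allocation paths into one helper with a fixed fallback order: preferred node at the requested goal, then any node, then both again with the goal dropped; only the panicking wrapper aborts when everything fails. A compact model of that ordering, where try_alloc() is a made-up stand-in for __alloc_memory_core_early():

#include <stdio.h>
#include <stddef.h>

#define ANY_NODE        -1

/* Stand-in allocator: pretend nothing satisfies the goal anywhere, and
 * memory only turns up once the goal is dropped and any node is allowed,
 * so the demo walks through every fallback step. */
static void *try_alloc(int nid, unsigned long goal)
{
        static char pool[64];

        if (nid == ANY_NODE && goal == 0)
                return pool;
        return NULL;
}

static void *alloc_nopanic(int nid, unsigned long goal)
{
        void *p;
again:
        p = try_alloc(nid, goal);               /* preferred node first */
        if (p)
                return p;
        p = try_alloc(ANY_NODE, goal);          /* then any node */
        if (p)
                return p;
        if (goal) {                             /* finally drop the goal */
                goal = 0;
                goto again;
        }
        return NULL;
}

int main(void)
{
        printf("got %p\n", alloc_nopanic(0, 0x1000000));
        return 0;
}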
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9f09a1fde9f9..ed0e19677360 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
180 * predictable as possible. The goal is to return the highest value for the 180 * predictable as possible. The goal is to return the highest value for the
181 * task consuming the most memory to avoid subsequent oom failures. 181 * task consuming the most memory to avoid subsequent oom failures.
182 */ 182 */
183unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 long points; 186 unsigned long points;
187 187
188 if (oom_unkillable_task(p, memcg, nodemask)) 188 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 189 return 0;
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
198 } 198 }
199 199
200 /* 200 /*
201 * The memory controller may have a limit of 0 bytes, so avoid a divide
202 * by zero, if necessary.
203 */
204 if (!totalpages)
205 totalpages = 1;
206
207 /*
208 * The baseline for the badness score is the proportion of RAM that each 201 * The baseline for the badness score is the proportion of RAM that each
209 * task's rss, pagetable and swap space use. 202 * task's rss, pagetable and swap space use.
210 */ 203 */
211 points = get_mm_rss(p->mm) + p->mm->nr_ptes; 204 points = get_mm_rss(p->mm) + p->mm->nr_ptes +
212 points += get_mm_counter(p->mm, MM_SWAPENTS); 205 get_mm_counter(p->mm, MM_SWAPENTS);
213
214 points *= 1000;
215 points /= totalpages;
216 task_unlock(p); 206 task_unlock(p);
217 207
218 /* 208 /*
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
220 * implementation used by LSMs. 210 * implementation used by LSMs.
221 */ 211 */
222 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 212 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223 points -= 30; 213 points -= 30 * totalpages / 1000;
224 214
225 /* 215 /*
226 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 216 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
227 * either completely disable oom killing or always prefer a certain 217 * either completely disable oom killing or always prefer a certain
228 * task. 218 * task.
229 */ 219 */
230 points += p->signal->oom_score_adj; 220 points += p->signal->oom_score_adj * totalpages / 1000;
231 221
232 /* 222 /*
233 * Never return 0 for an eligible task that may be killed since it's 223 * Never return 0 for an eligible task regardless of the root bonus and
234 * possible that no single user task uses more than 0.1% of memory and 224 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
235 * no single admin tasks uses more than 3.0%.
236 */ 225 */
237 if (points <= 0) 226 return points ? points : 1;
238 return 1;
239 return (points < 1000) ? points : 1000;
240} 227}
241 228
242/* 229/*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314{ 301{
315 struct task_struct *g, *p; 302 struct task_struct *g, *p;
316 struct task_struct *chosen = NULL; 303 struct task_struct *chosen = NULL;
317 *ppoints = 0; 304 unsigned long chosen_points = 0;
318 305
319 do_each_thread(g, p) { 306 do_each_thread(g, p) {
320 unsigned int points; 307 unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
354 */ 341 */
355 if (p == current) { 342 if (p == current) {
356 chosen = p; 343 chosen = p;
357 *ppoints = 1000; 344 chosen_points = ULONG_MAX;
358 } else if (!force_kill) { 345 } else if (!force_kill) {
359 /* 346 /*
360 * If this task is not being ptraced on exit, 347 * If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
367 } 354 }
368 355
369 points = oom_badness(p, memcg, nodemask, totalpages); 356 points = oom_badness(p, memcg, nodemask, totalpages);
370 if (points > *ppoints) { 357 if (points > chosen_points) {
371 chosen = p; 358 chosen = p;
372 *ppoints = points; 359 chosen_points = points;
373 } 360 }
374 } while_each_thread(g, p); 361 } while_each_thread(g, p);
375 362
363 *ppoints = chosen_points * 1000 / totalpages;
376 return chosen; 364 return chosen;
377} 365}
378 366
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
572 } 560 }
573 561
574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 562 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 563 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
576 read_lock(&tasklist_lock); 564 read_lock(&tasklist_lock);
577 p = select_bad_process(&points, limit, memcg, NULL, false); 565 p = select_bad_process(&points, limit, memcg, NULL, false);
578 if (p && PTR_ERR(p) != -1UL) 566 if (p && PTR_ERR(p) != -1UL)
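After the oom_kill.c change above, oom_badness() works directly in pages instead of a 0-1000 scale: the score starts as rss + page tables + swap entries, CAP_SYS_ADMIN tasks get a discount of 3% of totalpages, oom_score_adj contributes oom_score_adj * totalpages / 1000, and only select_bad_process() normalizes the winner back to 0-1000 for reporting. The arithmetic, worked for a hypothetical 4GB machine (totalpages = 1048576 with 4KB pages; all task numbers invented):

#include <stdio.h>

int main(void)
{
        unsigned long totalpages = 1048576;     /* 4GB of 4KB pages, assumed */
        unsigned long rss = 200000, nr_ptes = 500, swapents = 30000;
        long oom_score_adj = 100;               /* mildly preferred victim */
        int has_cap_sys_admin = 1;
        unsigned long points;

        points = rss + nr_ptes + swapents;
        if (has_cap_sys_admin)
                points -= 30 * totalpages / 1000;       /* 3% of RAM bonus */
        points += oom_score_adj * totalpages / 1000;

        /* the scale userspace is used to, restored at selection time */
        printf("raw points: %lu, normalized: %lu / 1000\n",
               points, points * 1000 / totalpages);
        return 0;
}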
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bab8e3bc4202..8cbfc38e68ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
219 219
220int page_group_by_mobility_disabled __read_mostly; 220int page_group_by_mobility_disabled __read_mostly;
221 221
222static void set_pageblock_migratetype(struct page *page, int migratetype) 222void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 223{
224 224
225 if (unlikely(page_group_by_mobility_disabled)) 225 if (unlikely(page_group_by_mobility_disabled))
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 954 return pages_moved;
955} 955}
956 956
957static int move_freepages_block(struct zone *zone, struct page *page, 957int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 958 int migratetype)
959{ 959{
960 unsigned long start_pfn, end_pfn; 960 unsigned long start_pfn, end_pfn;
961 struct page *start_page, *end_page; 961 struct page *start_page, *end_page;
@@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4300 4300
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4302
4303/* Return a sensible default order for the pageblock size. */
4304static inline int pageblock_default_order(void)
4305{
4306 if (HPAGE_SHIFT > PAGE_SHIFT)
4307 return HUGETLB_PAGE_ORDER;
4308
4309 return MAX_ORDER-1;
4310}
4311
4312/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4313static inline void __init set_pageblock_order(unsigned int order) 4304static inline void __init set_pageblock_order(void)
4314{ 4305{
4306 unsigned int order;
4307
4315 /* Check that pageblock_nr_pages has not already been setup */ 4308 /* Check that pageblock_nr_pages has not already been setup */
4316 if (pageblock_order) 4309 if (pageblock_order)
4317 return; 4310 return;
4318 4311
4312 if (HPAGE_SHIFT > PAGE_SHIFT)
4313 order = HUGETLB_PAGE_ORDER;
4314 else
4315 order = MAX_ORDER - 1;
4316
4319 /* 4317 /*
4320 * Assume the largest contiguous order of interest is a huge page. 4318 * Assume the largest contiguous order of interest is a huge page.
4321 * This value may be variable depending on boot parameters on IA64 4319 * This value may be variable depending on boot parameters on IA64 and
4320 * powerpc.
4322 */ 4321 */
4323 pageblock_order = order; 4322 pageblock_order = order;
4324} 4323}
@@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
4326 4325
4327/* 4326/*
4328 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4327 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4329 * and pageblock_default_order() are unused as pageblock_order is set 4328 * is unused as pageblock_order is set at compile-time. See
4330 * at compile-time. See include/linux/pageblock-flags.h for the values of 4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4331 * pageblock_order based on the kernel config 4330 * the kernel config
4332 */ 4331 */
4333static inline int pageblock_default_order(unsigned int order) 4332static inline void set_pageblock_order(void)
4334{ 4333{
4335 return MAX_ORDER-1;
4336} 4334}
4337#define set_pageblock_order(x) do {} while (0)
4338 4335
4339#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4336#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4340 4337
@@ -4413,16 +4410,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4413 zone_pcp_init(zone); 4410 zone_pcp_init(zone);
4414 for_each_lru(lru) 4411 for_each_lru(lru)
4415 INIT_LIST_HEAD(&zone->lruvec.lists[lru]); 4412 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4416 zone->reclaim_stat.recent_rotated[0] = 0; 4413 zone->lruvec.reclaim_stat.recent_rotated[0] = 0;
4417 zone->reclaim_stat.recent_rotated[1] = 0; 4414 zone->lruvec.reclaim_stat.recent_rotated[1] = 0;
4418 zone->reclaim_stat.recent_scanned[0] = 0; 4415 zone->lruvec.reclaim_stat.recent_scanned[0] = 0;
4419 zone->reclaim_stat.recent_scanned[1] = 0; 4416 zone->lruvec.reclaim_stat.recent_scanned[1] = 0;
4420 zap_zone_vm_stats(zone); 4417 zap_zone_vm_stats(zone);
4421 zone->flags = 0; 4418 zone->flags = 0;
4422 if (!size) 4419 if (!size)
4423 continue; 4420 continue;
4424 4421
4425 set_pageblock_order(pageblock_default_order()); 4422 set_pageblock_order();
4426 setup_usemap(pgdat, zone, size); 4423 setup_usemap(pgdat, zone, size);
4427 ret = init_currently_empty_zone(zone, zone_start_pfn, 4424 ret = init_currently_empty_zone(zone, zone_start_pfn,
4428 size, MEMMAP_EARLY); 4425 size, MEMMAP_EARLY);
@@ -4815,7 +4812,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4815 find_zone_movable_pfns_for_nodes(); 4812 find_zone_movable_pfns_for_nodes();
4816 4813
4817 /* Print out the zone ranges */ 4814 /* Print out the zone ranges */
4818 printk("Zone PFN ranges:\n"); 4815 printk("Zone ranges:\n");
4819 for (i = 0; i < MAX_NR_ZONES; i++) { 4816 for (i = 0; i < MAX_NR_ZONES; i++) {
4820 if (i == ZONE_MOVABLE) 4817 if (i == ZONE_MOVABLE)
4821 continue; 4818 continue;
@@ -4824,22 +4821,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4824 arch_zone_highest_possible_pfn[i]) 4821 arch_zone_highest_possible_pfn[i])
4825 printk(KERN_CONT "empty\n"); 4822 printk(KERN_CONT "empty\n");
4826 else 4823 else
4827 printk(KERN_CONT "%0#10lx -> %0#10lx\n", 4824 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4828 arch_zone_lowest_possible_pfn[i], 4825 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4829 arch_zone_highest_possible_pfn[i]); 4826 (arch_zone_highest_possible_pfn[i]
4827 << PAGE_SHIFT) - 1);
4830 } 4828 }
4831 4829
4832 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4830 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4833 printk("Movable zone start PFN for each node\n"); 4831 printk("Movable zone start for each node\n");
4834 for (i = 0; i < MAX_NUMNODES; i++) { 4832 for (i = 0; i < MAX_NUMNODES; i++) {
4835 if (zone_movable_pfn[i]) 4833 if (zone_movable_pfn[i])
4836 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 4834 printk(" Node %d: %#010lx\n", i,
4835 zone_movable_pfn[i] << PAGE_SHIFT);
4837 } 4836 }
4838 4837
4839 /* Print out the early_node_map[] */ 4838 /* Print out the early_node_map[] */
4840 printk("Early memory PFN ranges\n"); 4839 printk("Early memory node ranges\n");
4841 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4840 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4842 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); 4841 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4842 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4843 4843
4844 /* Initialise every node */ 4844 /* Initialise every node */
4845 mminit_verify_pageflags_layout(); 4845 mminit_verify_pageflags_layout();
@@ -5657,7 +5657,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5657 .nr_migratepages = 0, 5657 .nr_migratepages = 0,
5658 .order = -1, 5658 .order = -1,
5659 .zone = page_zone(pfn_to_page(start)), 5659 .zone = page_zone(pfn_to_page(start)),
5660 .sync = true, 5660 .mode = COMPACT_SYNC,
5661 }; 5661 };
5662 INIT_LIST_HEAD(&cc.migratepages); 5662 INIT_LIST_HEAD(&cc.migratepages);
5663 5663
@@ -5938,7 +5938,7 @@ bool is_free_buddy_page(struct page *page)
5938} 5938}
5939#endif 5939#endif
5940 5940
5941static struct trace_print_flags pageflag_names[] = { 5941static const struct trace_print_flags pageflag_names[] = {
5942 {1UL << PG_locked, "locked" }, 5942 {1UL << PG_locked, "locked" },
5943 {1UL << PG_error, "error" }, 5943 {1UL << PG_error, "error" },
5944 {1UL << PG_referenced, "referenced" }, 5944 {1UL << PG_referenced, "referenced" },
@@ -5973,7 +5973,9 @@ static struct trace_print_flags pageflag_names[] = {
5973#ifdef CONFIG_MEMORY_FAILURE 5973#ifdef CONFIG_MEMORY_FAILURE
5974 {1UL << PG_hwpoison, "hwpoison" }, 5974 {1UL << PG_hwpoison, "hwpoison" },
5975#endif 5975#endif
5976 {-1UL, NULL }, 5976#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5977 {1UL << PG_compound_lock, "compound_lock" },
5978#endif
5977}; 5979};
5978 5980
5979static void dump_page_flags(unsigned long flags) 5981static void dump_page_flags(unsigned long flags)
@@ -5982,12 +5984,14 @@ static void dump_page_flags(unsigned long flags)
5982 unsigned long mask; 5984 unsigned long mask;
5983 int i; 5985 int i;
5984 5986
5987 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
5988
5985 printk(KERN_ALERT "page flags: %#lx(", flags); 5989 printk(KERN_ALERT "page flags: %#lx(", flags);
5986 5990
5987 /* remove zone id */ 5991 /* remove zone id */
5988 flags &= (1UL << NR_PAGEFLAGS) - 1; 5992 flags &= (1UL << NR_PAGEFLAGS) - 1;
5989 5993
5990 for (i = 0; pageflag_names[i].name && flags; i++) { 5994 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
5991 5995
5992 mask = pageflag_names[i].mask; 5996 mask = pageflag_names[i].mask;
5993 if ((flags & mask) != mask) 5997 if ((flags & mask) != mask)
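Dropping the {-1UL, NULL} terminator from pageflag_names[] above works because the dump loop now bounds itself with ARRAY_SIZE(), and the new BUILD_BUG_ON() ties the table length to __NR_PAGEFLAGS at compile time. The same pattern in plain C11, with _Static_assert playing the role of BUILD_BUG_ON and an invented three-flag enum:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

enum { PG_locked, PG_error, PG_referenced, NR_FLAGS };

static const struct { unsigned long mask; const char *name; } flag_names[] = {
        { 1UL << PG_locked,     "locked"     },
        { 1UL << PG_error,      "error"      },
        { 1UL << PG_referenced, "referenced" },
        /* no terminator: the size is known at compile time */
};

/* Fails the build if a flag is added without a matching name. */
_Static_assert(ARRAY_SIZE(flag_names) == NR_FLAGS, "flag table out of sync");

int main(void)
{
        unsigned long flags = (1UL << PG_locked) | (1UL << PG_referenced);
        size_t i;

        for (i = 0; i < ARRAY_SIZE(flag_names) && flags; i++) {
                if ((flags & flag_names[i].mask) != flag_names[i].mask)
                        continue;
                flags &= ~flag_names[i].mask;
                printf("%s ", flag_names[i].name);
        }
        printf("\n");
        return 0;
}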
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e28..ea8f8fa21649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/syscalls.h>
21#include <linux/file.h>
20 22
21/* 23/*
22 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
562 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 564 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
563} 565}
564EXPORT_SYMBOL_GPL(page_cache_async_readahead); 566EXPORT_SYMBOL_GPL(page_cache_async_readahead);
567
568static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr)
571{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
573 return -EINVAL;
574
575 force_page_cache_readahead(mapping, filp, index, nr);
576 return 0;
577}
578
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{
581 ssize_t ret;
582 struct file *file;
583
584 ret = -EBADF;
585 file = fget(fd);
586 if (file) {
587 if (file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len);
593 }
594 fput(file);
595 }
596 return ret;
597}
598#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
599asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
600{
601 return SYSC_readahead((int) fd, offset, (size_t) count);
602}
603SYSCALL_ALIAS(sys_readahead, SyS_readahead);
604#endif
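With the hunk above, the readahead(2) syscall definition lives in mm/readahead.c; nothing changes for userspace, which keeps calling it through the glibc wrapper. A minimal caller (the wrapper needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* ask the kernel to populate the page cache for the first 1MB */
        if (readahead(fd, 0, 1 << 20) != 0)
                perror("readahead");
        close(fd);
        return 0;
}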
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
755 pte_unmap_unlock(pte, ptl); 755 pte_unmap_unlock(pte, ptl);
756 } 756 }
757 757
758 /* Pretend the page is referenced if the task has the
759 swap token and is in the middle of a page fault. */
760 if (mm != current->mm && has_swap_token(mm) &&
761 rwsem_is_locked(&mm->mmap_sem))
762 referenced++;
763
764 (*mapcount)--; 758 (*mapcount)--;
765 759
766 if (referenced) 760 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index be5af34a070d..d576b84d913c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h> 54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h> 55#include <linux/percpu_counter.h>
56#include <linux/falloc.h>
56#include <linux/splice.h> 57#include <linux/splice.h>
57#include <linux/security.h> 58#include <linux/security.h>
58#include <linux/swapops.h> 59#include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
83 char value[0]; 84 char value[0];
84}; 85};
85 86
87/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time):
90 * we would prefer not to enlarge the shmem inode just for that.
91 */
92struct shmem_falloc {
93 pgoff_t start; /* start of range currently being fallocated */
94 pgoff_t next; /* the next page offset to be fallocated */
95 pgoff_t nr_falloced; /* how many new pages have been fallocated */
96 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
97};
98
86/* Flag allocation requirements to shmem_getpage */ 99/* Flag allocation requirements to shmem_getpage */
87enum sgp_type { 100enum sgp_type {
88 SGP_READ, /* don't exceed i_size, don't allocate page */ 101 SGP_READ, /* don't exceed i_size, don't allocate page */
89 SGP_CACHE, /* don't exceed i_size, may allocate page */ 102 SGP_CACHE, /* don't exceed i_size, may allocate page */
90 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 103 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
91 SGP_WRITE, /* may exceed i_size, may allocate page */ 104 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
105 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
92}; 106};
93 107
94#ifdef CONFIG_TMPFS 108#ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
103} 117}
104#endif 118#endif
105 119
120static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
121static int shmem_replace_page(struct page **pagep, gfp_t gfp,
122 struct shmem_inode_info *info, pgoff_t index);
106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 123static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 124 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
108 125
@@ -423,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
423 440
424/* 441/*
425 * Remove range of pages and swap entries from radix tree, and free them. 442 * Remove range of pages and swap entries from radix tree, and free them.
443 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
426 */ 444 */
427void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 445static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
446 bool unfalloc)
428{ 447{
429 struct address_space *mapping = inode->i_mapping; 448 struct address_space *mapping = inode->i_mapping;
430 struct shmem_inode_info *info = SHMEM_I(inode); 449 struct shmem_inode_info *info = SHMEM_I(inode);
431 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 450 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
432 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 451 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
433 pgoff_t end = (lend >> PAGE_CACHE_SHIFT); 452 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
453 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
434 struct pagevec pvec; 454 struct pagevec pvec;
435 pgoff_t indices[PAGEVEC_SIZE]; 455 pgoff_t indices[PAGEVEC_SIZE];
436 long nr_swaps_freed = 0; 456 long nr_swaps_freed = 0;
437 pgoff_t index; 457 pgoff_t index;
438 int i; 458 int i;
439 459
440 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); 460 if (lend == -1)
461 end = -1; /* unsigned, so actually very big */
441 462
442 pagevec_init(&pvec, 0); 463 pagevec_init(&pvec, 0);
443 index = start; 464 index = start;
444 while (index <= end) { 465 while (index < end) {
445 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 466 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
446 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 467 min(end - index, (pgoff_t)PAGEVEC_SIZE),
447 pvec.pages, indices); 468 pvec.pages, indices);
448 if (!pvec.nr) 469 if (!pvec.nr)
449 break; 470 break;
@@ -452,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
452 struct page *page = pvec.pages[i]; 473 struct page *page = pvec.pages[i];
453 474
454 index = indices[i]; 475 index = indices[i];
455 if (index > end) 476 if (index >= end)
456 break; 477 break;
457 478
458 if (radix_tree_exceptional_entry(page)) { 479 if (radix_tree_exceptional_entry(page)) {
480 if (unfalloc)
481 continue;
459 nr_swaps_freed += !shmem_free_swap(mapping, 482 nr_swaps_freed += !shmem_free_swap(mapping,
460 index, page); 483 index, page);
461 continue; 484 continue;
@@ -463,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
463 486
464 if (!trylock_page(page)) 487 if (!trylock_page(page))
465 continue; 488 continue;
466 if (page->mapping == mapping) { 489 if (!unfalloc || !PageUptodate(page)) {
467 VM_BUG_ON(PageWriteback(page)); 490 if (page->mapping == mapping) {
468 truncate_inode_page(mapping, page); 491 VM_BUG_ON(PageWriteback(page));
492 truncate_inode_page(mapping, page);
493 }
469 } 494 }
470 unlock_page(page); 495 unlock_page(page);
471 } 496 }
@@ -476,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
476 index++; 501 index++;
477 } 502 }
478 503
479 if (partial) { 504 if (partial_start) {
480 struct page *page = NULL; 505 struct page *page = NULL;
481 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 506 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
482 if (page) { 507 if (page) {
483 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 508 unsigned int top = PAGE_CACHE_SIZE;
509 if (start > end) {
510 top = partial_end;
511 partial_end = 0;
512 }
513 zero_user_segment(page, partial_start, top);
514 set_page_dirty(page);
515 unlock_page(page);
516 page_cache_release(page);
517 }
518 }
519 if (partial_end) {
520 struct page *page = NULL;
521 shmem_getpage(inode, end, &page, SGP_READ, NULL);
522 if (page) {
523 zero_user_segment(page, 0, partial_end);
484 set_page_dirty(page); 524 set_page_dirty(page);
485 unlock_page(page); 525 unlock_page(page);
486 page_cache_release(page); 526 page_cache_release(page);
487 } 527 }
488 } 528 }
529 if (start >= end)
530 return;
489 531
490 index = start; 532 index = start;
491 for ( ; ; ) { 533 for ( ; ; ) {
492 cond_resched(); 534 cond_resched();
493 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 535 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
494 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 536 min(end - index, (pgoff_t)PAGEVEC_SIZE),
495 pvec.pages, indices); 537 pvec.pages, indices);
496 if (!pvec.nr) { 538 if (!pvec.nr) {
497 if (index == start) 539 if (index == start || unfalloc)
498 break; 540 break;
499 index = start; 541 index = start;
500 continue; 542 continue;
501 } 543 }
502 if (index == start && indices[0] > end) { 544 if ((index == start || unfalloc) && indices[0] >= end) {
503 shmem_deswap_pagevec(&pvec); 545 shmem_deswap_pagevec(&pvec);
504 pagevec_release(&pvec); 546 pagevec_release(&pvec);
505 break; 547 break;
@@ -509,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
509 struct page *page = pvec.pages[i]; 551 struct page *page = pvec.pages[i];
510 552
511 index = indices[i]; 553 index = indices[i];
512 if (index > end) 554 if (index >= end)
513 break; 555 break;
514 556
515 if (radix_tree_exceptional_entry(page)) { 557 if (radix_tree_exceptional_entry(page)) {
558 if (unfalloc)
559 continue;
516 nr_swaps_freed += !shmem_free_swap(mapping, 560 nr_swaps_freed += !shmem_free_swap(mapping,
517 index, page); 561 index, page);
518 continue; 562 continue;
519 } 563 }
520 564
521 lock_page(page); 565 lock_page(page);
522 if (page->mapping == mapping) { 566 if (!unfalloc || !PageUptodate(page)) {
523 VM_BUG_ON(PageWriteback(page)); 567 if (page->mapping == mapping) {
524 truncate_inode_page(mapping, page); 568 VM_BUG_ON(PageWriteback(page));
569 truncate_inode_page(mapping, page);
570 }
525 } 571 }
526 unlock_page(page); 572 unlock_page(page);
527 } 573 }
@@ -535,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
535 info->swapped -= nr_swaps_freed; 581 info->swapped -= nr_swaps_freed;
536 shmem_recalc_inode(inode); 582 shmem_recalc_inode(inode);
537 spin_unlock(&info->lock); 583 spin_unlock(&info->lock);
584}
538 585
586void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
587{
588 shmem_undo_range(inode, lstart, lend, false);
539 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 589 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
540} 590}
541EXPORT_SYMBOL_GPL(shmem_truncate_range); 591EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -604,12 +654,13 @@ static void shmem_evict_inode(struct inode *inode)
604 * If swap found in inode, free it and move page from swapcache to filecache. 654 * If swap found in inode, free it and move page from swapcache to filecache.
605 */ 655 */
606static int shmem_unuse_inode(struct shmem_inode_info *info, 656static int shmem_unuse_inode(struct shmem_inode_info *info,
607 swp_entry_t swap, struct page *page) 657 swp_entry_t swap, struct page **pagep)
608{ 658{
609 struct address_space *mapping = info->vfs_inode.i_mapping; 659 struct address_space *mapping = info->vfs_inode.i_mapping;
610 void *radswap; 660 void *radswap;
611 pgoff_t index; 661 pgoff_t index;
612 int error; 662 gfp_t gfp;
663 int error = 0;
613 664
614 radswap = swp_to_radix_entry(swap); 665 radswap = swp_to_radix_entry(swap);
615 index = radix_tree_locate_item(&mapping->page_tree, radswap); 666 index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +676,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
625 if (shmem_swaplist.next != &info->swaplist) 676 if (shmem_swaplist.next != &info->swaplist)
626 list_move_tail(&shmem_swaplist, &info->swaplist); 677 list_move_tail(&shmem_swaplist, &info->swaplist);
627 678
679 gfp = mapping_gfp_mask(mapping);
680 if (shmem_should_replace_page(*pagep, gfp)) {
681 mutex_unlock(&shmem_swaplist_mutex);
682 error = shmem_replace_page(pagep, gfp, info, index);
683 mutex_lock(&shmem_swaplist_mutex);
684 /*
685 * We needed to drop mutex to make that restrictive page
686 * allocation; but the inode might already be freed by now,
687 * and we cannot refer to inode or mapping or info to check.
688 * However, we do hold page lock on the PageSwapCache page,
689 * so can check if that still has our reference remaining.
690 */
691 if (!page_swapcount(*pagep))
692 error = -ENOENT;
693 }
694
628 /* 695 /*
629 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 696 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
630 * but also to hold up shmem_evict_inode(): so inode cannot be freed 697 * but also to hold up shmem_evict_inode(): so inode cannot be freed
631 * beneath us (pagelock doesn't help until the page is in pagecache). 698 * beneath us (pagelock doesn't help until the page is in pagecache).
632 */ 699 */
633 error = shmem_add_to_page_cache(page, mapping, index, 700 if (!error)
701 error = shmem_add_to_page_cache(*pagep, mapping, index,
634 GFP_NOWAIT, radswap); 702 GFP_NOWAIT, radswap);
635 /* which does mem_cgroup_uncharge_cache_page on error */
636
637 if (error != -ENOMEM) { 703 if (error != -ENOMEM) {
638 /* 704 /*
639 * Truncation and eviction use free_swap_and_cache(), which 705 * Truncation and eviction use free_swap_and_cache(), which
640 * only does trylock page: if we raced, best clean up here. 706 * only does trylock page: if we raced, best clean up here.
641 */ 707 */
642 delete_from_swap_cache(page); 708 delete_from_swap_cache(*pagep);
643 set_page_dirty(page); 709 set_page_dirty(*pagep);
644 if (!error) { 710 if (!error) {
645 spin_lock(&info->lock); 711 spin_lock(&info->lock);
646 info->swapped--; 712 info->swapped--;
@@ -660,7 +726,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
660 struct list_head *this, *next; 726 struct list_head *this, *next;
661 struct shmem_inode_info *info; 727 struct shmem_inode_info *info;
662 int found = 0; 728 int found = 0;
663 int error; 729 int error = 0;
730
731 /*
732 * There's a faint possibility that swap page was replaced before
733 * caller locked it: it will come back later with the right page.
734 */
735 if (unlikely(!PageSwapCache(page)))
736 goto out;
664 737
665 /* 738 /*
666 * Charge page using GFP_KERNEL while we can wait, before taking 739 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
676 list_for_each_safe(this, next, &shmem_swaplist) { 749 list_for_each_safe(this, next, &shmem_swaplist) {
677 info = list_entry(this, struct shmem_inode_info, swaplist); 750 info = list_entry(this, struct shmem_inode_info, swaplist);
678 if (info->swapped) 751 if (info->swapped)
679 found = shmem_unuse_inode(info, swap, page); 752 found = shmem_unuse_inode(info, swap, &page);
680 else 753 else
681 list_del_init(&info->swaplist); 754 list_del_init(&info->swaplist);
682 cond_resched(); 755 cond_resched();
@@ -685,8 +758,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
685 } 758 }
686 mutex_unlock(&shmem_swaplist_mutex); 759 mutex_unlock(&shmem_swaplist_mutex);
687 760
688 if (!found)
689 mem_cgroup_uncharge_cache_page(page);
690 if (found < 0) 761 if (found < 0)
691 error = found; 762 error = found;
692out: 763out:
@@ -727,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
727 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 798 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
728 goto redirty; 799 goto redirty;
729 } 800 }
801
802 /*
803 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
804 * value into swapfile.c, the only way we can correctly account for a
805 * fallocated page arriving here is now to initialize it and write it.
806 *
807 * That's okay for a page already fallocated earlier, but if we have
808 * not yet completed the fallocation, then (a) we want to keep track
809 * of this page in case we have to undo it, and (b) it may not be a
810 * good idea to continue anyway, once we're pushing into swap. So
811 * reactivate the page, and let shmem_fallocate() quit when too many.
812 */
813 if (!PageUptodate(page)) {
814 if (inode->i_private) {
815 struct shmem_falloc *shmem_falloc;
816 spin_lock(&inode->i_lock);
817 shmem_falloc = inode->i_private;
818 if (shmem_falloc &&
819 index >= shmem_falloc->start &&
820 index < shmem_falloc->next)
821 shmem_falloc->nr_unswapped++;
822 else
823 shmem_falloc = NULL;
824 spin_unlock(&inode->i_lock);
825 if (shmem_falloc)
826 goto redirty;
827 }
828 clear_highpage(page);
829 flush_dcache_page(page);
830 SetPageUptodate(page);
831 }
832
730 swap = get_swap_page(); 833 swap = get_swap_page();
731 if (!swap.val) 834 if (!swap.val)
732 goto redirty; 835 goto redirty;
@@ -856,6 +959,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
856#endif 959#endif
857 960
858/* 961/*
962 * When a page is moved from swapcache to shmem filecache (either by the
963 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
964 * shmem_unuse_inode()), it may have been read in earlier from swap, in
965 * ignorance of the mapping it belongs to. If that mapping has special
966 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
967 * we may need to copy to a suitable page before moving to filecache.
968 *
969 * In a future release, this may well be extended to respect cpuset and
970 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
971 * but for now it is a simple matter of zone.
972 */
973static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
974{
975 return page_zonenum(page) > gfp_zone(gfp);
976}
977
978static int shmem_replace_page(struct page **pagep, gfp_t gfp,
979 struct shmem_inode_info *info, pgoff_t index)
980{
981 struct page *oldpage, *newpage;
982 struct address_space *swap_mapping;
983 pgoff_t swap_index;
984 int error;
985
986 oldpage = *pagep;
987 swap_index = page_private(oldpage);
988 swap_mapping = page_mapping(oldpage);
989
990 /*
991 * We have arrived here because our zones are constrained, so don't
992 * limit chance of success by further cpuset and node constraints.
993 */
994 gfp &= ~GFP_CONSTRAINT_MASK;
995 newpage = shmem_alloc_page(gfp, info, index);
996 if (!newpage)
997 return -ENOMEM;
998 VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
999
1000 *pagep = newpage;
1001 page_cache_get(newpage);
1002 copy_highpage(newpage, oldpage);
1003
1004 VM_BUG_ON(!PageLocked(oldpage));
1005 __set_page_locked(newpage);
1006 VM_BUG_ON(!PageUptodate(oldpage));
1007 SetPageUptodate(newpage);
1008 VM_BUG_ON(!PageSwapBacked(oldpage));
1009 SetPageSwapBacked(newpage);
1010 VM_BUG_ON(!swap_index);
1011 set_page_private(newpage, swap_index);
1012 VM_BUG_ON(!PageSwapCache(oldpage));
1013 SetPageSwapCache(newpage);
1014
1015 /*
1016 * Our caller will very soon move newpage out of swapcache, but it's
1017 * a nice clean interface for us to replace oldpage by newpage there.
1018 */
1019 spin_lock_irq(&swap_mapping->tree_lock);
1020 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1021 newpage);
1022 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1023 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1024 spin_unlock_irq(&swap_mapping->tree_lock);
1025 BUG_ON(error);
1026
1027 mem_cgroup_replace_page_cache(oldpage, newpage);
1028 lru_cache_add_anon(newpage);
1029
1030 ClearPageSwapCache(oldpage);
1031 set_page_private(oldpage, 0);
1032
1033 unlock_page(oldpage);
1034 page_cache_release(oldpage);
1035 page_cache_release(oldpage);
1036 return 0;
1037}
1038
1039/*
859 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1040 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
860 * 1041 *
861 * If we allocate a new one we do not mark it dirty. That's up to the 1042 * If we allocate a new one we do not mark it dirty. That's up to the
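shmem_should_replace_page(), added in the hunk above, reduces "can this mapping keep the page it was swapped into?" to a zone comparison: if the page sits in a higher zone than the mapping's gfp mask permits (a highmem page for a mapping that needs memory below 4GB, as with the gma500 case mentioned in the comment), it must be copied before moving to the filecache. A toy model of that comparison; the zone ordering here is an assumption mirroring the kernel's low-to-high enum zone_type:

#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM };

static int should_replace(enum zone page_zone, enum zone highest_allowed)
{
        return page_zone > highest_allowed;
}

int main(void)
{
        /* mapping limited to DMA32, but the page was read into highmem */
        printf("replace? %d\n", should_replace(ZONE_HIGHMEM, ZONE_DMA32));
        /* same mapping, page already below the limit */
        printf("replace? %d\n", should_replace(ZONE_DMA32, ZONE_DMA32));
        return 0;
}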
@@ -872,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
872 swp_entry_t swap; 1053 swp_entry_t swap;
873 int error; 1054 int error;
874 int once = 0; 1055 int once = 0;
1056 int alloced = 0;
875 1057
876 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1058 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
877 return -EFBIG; 1059 return -EFBIG;
@@ -883,19 +1065,21 @@ repeat:
883 page = NULL; 1065 page = NULL;
884 } 1066 }
885 1067
886 if (sgp != SGP_WRITE && 1068 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
887 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1069 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
888 error = -EINVAL; 1070 error = -EINVAL;
889 goto failed; 1071 goto failed;
890 } 1072 }
891 1073
1074 /* fallocated page? */
1075 if (page && !PageUptodate(page)) {
1076 if (sgp != SGP_READ)
1077 goto clear;
1078 unlock_page(page);
1079 page_cache_release(page);
1080 page = NULL;
1081 }
892 if (page || (sgp == SGP_READ && !swap.val)) { 1082 if (page || (sgp == SGP_READ && !swap.val)) {
893 /*
894 * Once we can get the page lock, it must be uptodate:
895 * if there were an error in reading back from swap,
896 * the page would not be inserted into the filecache.
897 */
898 BUG_ON(page && !PageUptodate(page));
899 *pagep = page; 1083 *pagep = page;
900 return 0; 1084 return 0;
901 } 1085 }
@@ -923,19 +1107,20 @@ repeat:
923 1107
924 /* We have to do this with page locked to prevent races */ 1108 /* We have to do this with page locked to prevent races */
925 lock_page(page); 1109 lock_page(page);
1110 if (!PageSwapCache(page) || page->mapping) {
1111 error = -EEXIST; /* try again */
1112 goto failed;
1113 }
926 if (!PageUptodate(page)) { 1114 if (!PageUptodate(page)) {
927 error = -EIO; 1115 error = -EIO;
928 goto failed; 1116 goto failed;
929 } 1117 }
930 wait_on_page_writeback(page); 1118 wait_on_page_writeback(page);
931 1119
932 /* Someone may have already done it for us */ 1120 if (shmem_should_replace_page(page, gfp)) {
933 if (page->mapping) { 1121 error = shmem_replace_page(&page, gfp, info, index);
934 if (page->mapping == mapping && 1122 if (error)
935 page->index == index) 1123 goto failed;
936 goto done;
937 error = -EEXIST;
938 goto failed;
939 } 1124 }
940 1125
941 error = mem_cgroup_cache_charge(page, current->mm, 1126 error = mem_cgroup_cache_charge(page, current->mm,
@@ -991,19 +1176,36 @@ repeat:
991 inode->i_blocks += BLOCKS_PER_PAGE; 1176 inode->i_blocks += BLOCKS_PER_PAGE;
992 shmem_recalc_inode(inode); 1177 shmem_recalc_inode(inode);
993 spin_unlock(&info->lock); 1178 spin_unlock(&info->lock);
1179 alloced = true;
994 1180
995 clear_highpage(page); 1181 /*
996 flush_dcache_page(page); 1182 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
997 SetPageUptodate(page); 1183 */
1184 if (sgp == SGP_FALLOC)
1185 sgp = SGP_WRITE;
1186clear:
1187 /*
1188 * Let SGP_WRITE caller clear ends if write does not fill page;
1189 * but SGP_FALLOC on a page fallocated earlier must initialize
1190 * it now, lest undo on failure cancel our earlier guarantee.
1191 */
1192 if (sgp != SGP_WRITE) {
1193 clear_highpage(page);
1194 flush_dcache_page(page);
1195 SetPageUptodate(page);
1196 }
998 if (sgp == SGP_DIRTY) 1197 if (sgp == SGP_DIRTY)
999 set_page_dirty(page); 1198 set_page_dirty(page);
1000 } 1199 }
1001done: 1200
1002 /* Perhaps the file has been truncated since we checked */ 1201 /* Perhaps the file has been truncated since we checked */
1003 if (sgp != SGP_WRITE && 1202 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1004 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1203 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1005 error = -EINVAL; 1204 error = -EINVAL;
1006 goto trunc; 1205 if (alloced)
1206 goto trunc;
1207 else
1208 goto failed;
1007 } 1209 }
1008 *pagep = page; 1210 *pagep = page;
1009 return 0; 1211 return 0;
@@ -1012,6 +1214,7 @@ done:
1012 * Error recovery. 1214 * Error recovery.
1013 */ 1215 */
1014trunc: 1216trunc:
1217 info = SHMEM_I(inode);
1015 ClearPageDirty(page); 1218 ClearPageDirty(page);
1016 delete_from_page_cache(page); 1219 delete_from_page_cache(page);
1017 spin_lock(&info->lock); 1220 spin_lock(&info->lock);
@@ -1019,6 +1222,7 @@ trunc:
1019 inode->i_blocks -= BLOCKS_PER_PAGE; 1222 inode->i_blocks -= BLOCKS_PER_PAGE;
1020 spin_unlock(&info->lock); 1223 spin_unlock(&info->lock);
1021decused: 1224decused:
1225 sbinfo = SHMEM_SB(inode->i_sb);
1022 if (sbinfo->max_blocks) 1226 if (sbinfo->max_blocks)
1023 percpu_counter_add(&sbinfo->used_blocks, -1); 1227 percpu_counter_add(&sbinfo->used_blocks, -1);
1024unacct: 1228unacct:
@@ -1204,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1204 if (pos + copied > inode->i_size) 1408 if (pos + copied > inode->i_size)
1205 i_size_write(inode, pos + copied); 1409 i_size_write(inode, pos + copied);
1206 1410
1411 if (!PageUptodate(page)) {
1412 if (copied < PAGE_CACHE_SIZE) {
1413 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1414 zero_user_segments(page, 0, from,
1415 from + copied, PAGE_CACHE_SIZE);
1416 }
1417 SetPageUptodate(page);
1418 }
1207 set_page_dirty(page); 1419 set_page_dirty(page);
1208 unlock_page(page); 1420 unlock_page(page);
1209 page_cache_release(page); 1421 page_cache_release(page);
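The shmem_write_end() addition above only needs to zero the parts of a not-yet-Uptodate page that the write did not cover: [0, from) before the copied bytes and [from + copied, PAGE_CACHE_SIZE) after them. The index arithmetic, modeled on an ordinary buffer with an assumed 4KB page:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096          /* assumed page size for the demo */

int main(void)
{
        static unsigned char page[PAGE_SIZE];
        unsigned long pos = 100, copied = 50;
        unsigned int from = pos & (PAGE_SIZE - 1);      /* write offset in page */

        memset(page, 0xaa, sizeof(page));       /* pretend stale contents */
        memset(page + from, 0x55, copied);      /* bytes the write delivered */

        /* zero everything the write did not cover, as zero_user_segments()
         * does for [0, from) and [from + copied, PAGE_SIZE) */
        memset(page, 0, from);
        memset(page + from + copied, 0, PAGE_SIZE - (from + copied));

        printf("head=%#x data=%#x tail=%#x\n",
               page[0], page[from], page[from + copied]);
        return 0;
}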
@@ -1462,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1462 return error; 1674 return error;
1463} 1675}
1464 1676
1677/*
1678 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1679 */
1680static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1681 pgoff_t index, pgoff_t end, int origin)
1682{
1683 struct page *page;
1684 struct pagevec pvec;
1685 pgoff_t indices[PAGEVEC_SIZE];
1686 bool done = false;
1687 int i;
1688
1689 pagevec_init(&pvec, 0);
1690 pvec.nr = 1; /* start small: we may be there already */
1691 while (!done) {
1692 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1693 pvec.nr, pvec.pages, indices);
1694 if (!pvec.nr) {
1695 if (origin == SEEK_DATA)
1696 index = end;
1697 break;
1698 }
1699 for (i = 0; i < pvec.nr; i++, index++) {
1700 if (index < indices[i]) {
1701 if (origin == SEEK_HOLE) {
1702 done = true;
1703 break;
1704 }
1705 index = indices[i];
1706 }
1707 page = pvec.pages[i];
1708 if (page && !radix_tree_exceptional_entry(page)) {
1709 if (!PageUptodate(page))
1710 page = NULL;
1711 }
1712 if (index >= end ||
1713 (page && origin == SEEK_DATA) ||
1714 (!page && origin == SEEK_HOLE)) {
1715 done = true;
1716 break;
1717 }
1718 }
1719 shmem_deswap_pagevec(&pvec);
1720 pagevec_release(&pvec);
1721 pvec.nr = PAGEVEC_SIZE;
1722 cond_resched();
1723 }
1724 return index;
1725}
1726
1727static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
1728{
1729 struct address_space *mapping;
1730 struct inode *inode;
1731 pgoff_t start, end;
1732 loff_t new_offset;
1733
1734 if (origin != SEEK_DATA && origin != SEEK_HOLE)
1735 return generic_file_llseek_size(file, offset, origin,
1736 MAX_LFS_FILESIZE);
1737 mapping = file->f_mapping;
1738 inode = mapping->host;
1739 mutex_lock(&inode->i_mutex);
1740 /* We're holding i_mutex so we can access i_size directly */
1741
1742 if (offset < 0)
1743 offset = -EINVAL;
1744 else if (offset >= inode->i_size)
1745 offset = -ENXIO;
1746 else {
1747 start = offset >> PAGE_CACHE_SHIFT;
1748 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1749 new_offset = shmem_seek_hole_data(mapping, start, end, origin);
1750 new_offset <<= PAGE_CACHE_SHIFT;
1751 if (new_offset > offset) {
1752 if (new_offset < inode->i_size)
1753 offset = new_offset;
1754 else if (origin == SEEK_DATA)
1755 offset = -ENXIO;
1756 else
1757 offset = inode->i_size;
1758 }
1759 }
1760
1761 if (offset >= 0 && offset != file->f_pos) {
1762 file->f_pos = offset;
1763 file->f_version = 0;
1764 }
1765 mutex_unlock(&inode->i_mutex);
1766 return offset;
1767}
1768
1769static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1770 loff_t len)
1771{
1772 struct inode *inode = file->f_path.dentry->d_inode;
1773 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1774 struct shmem_falloc shmem_falloc;
1775 pgoff_t start, index, end;
1776 int error;
1777
1778 mutex_lock(&inode->i_mutex);
1779
1780 if (mode & FALLOC_FL_PUNCH_HOLE) {
1781 struct address_space *mapping = file->f_mapping;
1782 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1783 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1784
1785 if ((u64)unmap_end > (u64)unmap_start)
1786 unmap_mapping_range(mapping, unmap_start,
1787 1 + unmap_end - unmap_start, 0);
1788 shmem_truncate_range(inode, offset, offset + len - 1);
1789 /* No need to unmap again: hole-punching leaves COWed pages */
1790 error = 0;
1791 goto out;
1792 }
1793
1794 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1795 error = inode_newsize_ok(inode, offset + len);
1796 if (error)
1797 goto out;
1798
1799 start = offset >> PAGE_CACHE_SHIFT;
1800 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1801 /* Try to avoid a swapstorm if len is impossible to satisfy */
1802 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1803 error = -ENOSPC;
1804 goto out;
1805 }
1806
1807 shmem_falloc.start = start;
1808 shmem_falloc.next = start;
1809 shmem_falloc.nr_falloced = 0;
1810 shmem_falloc.nr_unswapped = 0;
1811 spin_lock(&inode->i_lock);
1812 inode->i_private = &shmem_falloc;
1813 spin_unlock(&inode->i_lock);
1814
1815 for (index = start; index < end; index++) {
1816 struct page *page;
1817
1818 /*
1819 * Good, the fallocate(2) manpage permits EINTR: we may have
1820 * been interrupted because we are using up too much memory.
1821 */
1822 if (signal_pending(current))
1823 error = -EINTR;
1824 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1825 error = -ENOMEM;
1826 else
1827 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1828 NULL);
1829 if (error) {
1830 /* Remove the !PageUptodate pages we added */
1831 shmem_undo_range(inode,
1832 (loff_t)start << PAGE_CACHE_SHIFT,
1833 (loff_t)index << PAGE_CACHE_SHIFT, true);
1834 goto undone;
1835 }
1836
1837 /*
1838 * Inform shmem_writepage() how far we have reached.
1839 * No need for lock or barrier: we have the page lock.
1840 */
1841 shmem_falloc.next++;
1842 if (!PageUptodate(page))
1843 shmem_falloc.nr_falloced++;
1844
1845 /*
1846 * If !PageUptodate, leave it that way so that freeable pages
1847 * can be recognized if we need to rollback on error later.
1848 * But set_page_dirty so that memory pressure will swap rather
1849 * than free the pages we are allocating (and SGP_CACHE pages
1850 * might still be clean: we now need to mark those dirty too).
1851 */
1852 set_page_dirty(page);
1853 unlock_page(page);
1854 page_cache_release(page);
1855 cond_resched();
1856 }
1857
1858 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1859 i_size_write(inode, offset + len);
1860 inode->i_ctime = CURRENT_TIME;
1861undone:
1862 spin_lock(&inode->i_lock);
1863 inode->i_private = NULL;
1864 spin_unlock(&inode->i_lock);
1865out:
1866 mutex_unlock(&inode->i_mutex);
1867 return error;
1868}
1869
1465static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1870static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1466{ 1871{
1467 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1872 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1665,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1665 kaddr = kmap_atomic(page); 2070 kaddr = kmap_atomic(page);
1666 memcpy(kaddr, symname, len); 2071 memcpy(kaddr, symname, len);
1667 kunmap_atomic(kaddr); 2072 kunmap_atomic(kaddr);
2073 SetPageUptodate(page);
1668 set_page_dirty(page); 2074 set_page_dirty(page);
1669 unlock_page(page); 2075 unlock_page(page);
1670 page_cache_release(page); 2076 page_cache_release(page);
@@ -2270,6 +2676,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2270 } 2676 }
2271 } 2677 }
2272 sb->s_export_op = &shmem_export_ops; 2678 sb->s_export_op = &shmem_export_ops;
2679 sb->s_flags |= MS_NOSEC;
2273#else 2680#else
2274 sb->s_flags |= MS_NOUSER; 2681 sb->s_flags |= MS_NOUSER;
2275#endif 2682#endif
@@ -2364,7 +2771,7 @@ static const struct address_space_operations shmem_aops = {
2364static const struct file_operations shmem_file_operations = { 2771static const struct file_operations shmem_file_operations = {
2365 .mmap = shmem_mmap, 2772 .mmap = shmem_mmap,
2366#ifdef CONFIG_TMPFS 2773#ifdef CONFIG_TMPFS
2367 .llseek = generic_file_llseek, 2774 .llseek = shmem_file_llseek,
2368 .read = do_sync_read, 2775 .read = do_sync_read,
2369 .write = do_sync_write, 2776 .write = do_sync_write,
2370 .aio_read = shmem_file_aio_read, 2777 .aio_read = shmem_file_aio_read,
@@ -2372,12 +2779,12 @@ static const struct file_operations shmem_file_operations = {
2372 .fsync = noop_fsync, 2779 .fsync = noop_fsync,
2373 .splice_read = shmem_file_splice_read, 2780 .splice_read = shmem_file_splice_read,
2374 .splice_write = generic_file_splice_write, 2781 .splice_write = generic_file_splice_write,
2782 .fallocate = shmem_fallocate,
2375#endif 2783#endif
2376}; 2784};
2377 2785
2378static const struct inode_operations shmem_inode_operations = { 2786static const struct inode_operations shmem_inode_operations = {
2379 .setattr = shmem_setattr, 2787 .setattr = shmem_setattr,
2380 .truncate_range = shmem_truncate_range,
2381#ifdef CONFIG_TMPFS_XATTR 2788#ifdef CONFIG_TMPFS_XATTR
2382 .setxattr = shmem_setxattr, 2789 .setxattr = shmem_setxattr,
2383 .getxattr = shmem_getxattr, 2790 .getxattr = shmem_getxattr,
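Note: the shmem.c hunks above wire up shmem_file_llseek() and shmem_fallocate(), so tmpfs files gain SEEK_DATA/SEEK_HOLE and FALLOC_FL_PUNCH_HOLE support. A minimal userspace sketch of what that enables follows; the file path, sizes and missing error handling are illustrative assumptions, not part of the patch.

/* Sketch: exercise the new tmpfs llseek and fallocate paths. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	off_t off;
	int fd = open("/tmp/shmem-seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;

	/* Preallocate 16 pages without changing i_size: the SGP_FALLOC path. */
	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 4096);

	/* Write one page of real data at page index 8, extending i_size. */
	memset(buf, 'x', sizeof(buf));
	pwrite(fd, buf, sizeof(buf), 8 * 4096L);

	/*
	 * shmem_seek_hole_data() walks the radix tree; pages that were
	 * fallocated but never written (!Uptodate) still count as holes.
	 */
	off = lseek(fd, 0, SEEK_DATA);
	printf("first data at %lld\n", (long long)off);
	off = lseek(fd, 8 * 4096L, SEEK_HOLE);
	printf("next hole at %lld\n", (long long)off);

	/* Punch the data back out via the new shmem_fallocate(). */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		  8 * 4096L, 4096);

	close(fd);
	unlink("/tmp/shmem-seek-demo");
	return 0;
}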
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..6a4bf9160e85 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void)
273#ifdef CONFIG_MEMORY_HOTREMOVE 273#ifdef CONFIG_MEMORY_HOTREMOVE
274static unsigned long * __init 274static unsigned long * __init
275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count) 276 unsigned long size)
277{ 277{
278 unsigned long section_nr; 278 pg_data_t *host_pgdat;
279 279 unsigned long goal;
280 /* 280 /*
281 * A page may contain usemaps for other sections preventing the 281 * A page may contain usemaps for other sections preventing the
282 * page being freed and making a section unremovable while 282 * page being freed and making a section unremovable while
@@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 287 * from the same section as the pgdat where possible to avoid
288 * this problem. 288 * this problem.
289 */ 289 */
290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 290 goal = __pa(pgdat) & PAGE_SECTION_MASK;
291 return alloc_bootmem_section(usemap_size() * count, section_nr); 291 host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
292 return __alloc_bootmem_node_nopanic(host_pgdat, size,
293 SMP_CACHE_BYTES, goal);
292} 294}
293 295
294static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 296static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
332#else 334#else
333static unsigned long * __init 335static unsigned long * __init
334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 336sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count) 337 unsigned long size)
336{ 338{
337 return NULL; 339 return alloc_bootmem_node_nopanic(pgdat, size);
338} 340}
339 341
340static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 342static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
352 int size = usemap_size(); 354 int size = usemap_size();
353 355
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 356 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 357 size * usemap_count);
356 if (!usemap) { 358 if (!usemap) {
357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
358 if (!usemap) { 360 return;
359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 return;
361 }
362 } 361 }
363 362
364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 363 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f1338972..0503ad705e7c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,6 +82,25 @@ static void put_compound_page(struct page *page)
82 if (likely(page != page_head && 82 if (likely(page != page_head &&
83 get_page_unless_zero(page_head))) { 83 get_page_unless_zero(page_head))) {
84 unsigned long flags; 84 unsigned long flags;
85
86 /*
 87 * THP cannot break up slab pages so avoid taking
88 * compound_lock(). Slab performs non-atomic bit ops
89 * on page->flags for better performance. In particular
90 * slab_unlock() in slub used to be a hot path. It is
91 * still hot on arches that do not support
92 * this_cpu_cmpxchg_double().
93 */
94 if (PageSlab(page_head)) {
95 if (PageTail(page)) {
96 if (put_page_testzero(page_head))
97 VM_BUG_ON(1);
98
99 atomic_dec(&page->_mapcount);
100 goto skip_lock_tail;
101 } else
102 goto skip_lock;
103 }
85 /* 104 /*
86 * page_head wasn't a dangling pointer but it 105 * page_head wasn't a dangling pointer but it
87 * may not be a head page anymore by the time 106 * may not be a head page anymore by the time
@@ -92,10 +111,10 @@ static void put_compound_page(struct page *page)
92 if (unlikely(!PageTail(page))) { 111 if (unlikely(!PageTail(page))) {
93 /* __split_huge_page_refcount run before us */ 112 /* __split_huge_page_refcount run before us */
94 compound_unlock_irqrestore(page_head, flags); 113 compound_unlock_irqrestore(page_head, flags);
95 VM_BUG_ON(PageHead(page_head)); 114skip_lock:
96 if (put_page_testzero(page_head)) 115 if (put_page_testzero(page_head))
97 __put_single_page(page_head); 116 __put_single_page(page_head);
98 out_put_single: 117out_put_single:
99 if (put_page_testzero(page)) 118 if (put_page_testzero(page))
100 __put_single_page(page); 119 __put_single_page(page);
101 return; 120 return;
@@ -115,6 +134,8 @@ static void put_compound_page(struct page *page)
115 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 134 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
116 VM_BUG_ON(atomic_read(&page->_count) != 0); 135 VM_BUG_ON(atomic_read(&page->_count) != 0);
117 compound_unlock_irqrestore(page_head, flags); 136 compound_unlock_irqrestore(page_head, flags);
137
138skip_lock_tail:
118 if (put_page_testzero(page_head)) { 139 if (put_page_testzero(page_head)) {
119 if (PageHead(page_head)) 140 if (PageHead(page_head))
120 __put_compound_page(page_head); 141 __put_compound_page(page_head);
@@ -162,6 +183,18 @@ bool __get_page_tail(struct page *page)
162 struct page *page_head = compound_trans_head(page); 183 struct page *page_head = compound_trans_head(page);
163 184
164 if (likely(page != page_head && get_page_unless_zero(page_head))) { 185 if (likely(page != page_head && get_page_unless_zero(page_head))) {
186
 187 /* See the comment in put_compound_page(). */
188 if (PageSlab(page_head)) {
189 if (likely(PageTail(page))) {
190 __get_page_tail_foll(page, false);
191 return true;
192 } else {
193 put_page(page_head);
194 return false;
195 }
196 }
197
165 /* 198 /*
166 * page_head wasn't a dangling pointer but it 199 * page_head wasn't a dangling pointer but it
167 * may not be a head page anymore by the time 200 * may not be a head page anymore by the time
@@ -279,21 +312,15 @@ void rotate_reclaimable_page(struct page *page)
279static void update_page_reclaim_stat(struct zone *zone, struct page *page, 312static void update_page_reclaim_stat(struct zone *zone, struct page *page,
280 int file, int rotated) 313 int file, int rotated)
281{ 314{
282 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; 315 struct zone_reclaim_stat *reclaim_stat;
283 struct zone_reclaim_stat *memcg_reclaim_stat;
284 316
285 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); 317 reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
318 if (!reclaim_stat)
319 reclaim_stat = &zone->lruvec.reclaim_stat;
286 320
287 reclaim_stat->recent_scanned[file]++; 321 reclaim_stat->recent_scanned[file]++;
288 if (rotated) 322 if (rotated)
289 reclaim_stat->recent_rotated[file]++; 323 reclaim_stat->recent_rotated[file]++;
290
291 if (!memcg_reclaim_stat)
292 return;
293
294 memcg_reclaim_stat->recent_scanned[file]++;
295 if (rotated)
296 memcg_reclaim_stat->recent_rotated[file]++;
297} 324}
298 325
299static void __activate_page(struct page *page, void *arg) 326static void __activate_page(struct page *page, void *arg)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..457b10baef59 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
601 * This does not give an exact answer when swap count is continued, 601 * This does not give an exact answer when swap count is continued,
602 * but does include the high COUNT_CONTINUED flag to allow for that. 602 * but does include the high COUNT_CONTINUED flag to allow for that.
603 */ 603 */
604static inline int page_swapcount(struct page *page) 604int page_swapcount(struct page *page)
605{ 605{
606 int count = 0; 606 int count = 0;
607 struct swap_info_struct *p; 607 struct swap_info_struct *p;
@@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry)
717 return p != NULL; 717 return p != NULL;
718} 718}
719 719
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721/**
722 * mem_cgroup_count_swap_user - count the user of a swap entry
723 * @ent: the swap entry to be checked
724 * @pagep: the pointer for the swap cache page of the entry to be stored
725 *
726 * Returns the number of the user of the swap entry. The number is valid only
727 * for swaps of anonymous pages.
728 * If the entry is found on swap cache, the page is stored to pagep with
729 * refcount of it being incremented.
730 */
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION 720#ifdef CONFIG_HIBERNATION
752/* 721/*
753 * Find the swap type that corresponds to given device (if any). 722 * Find the swap type that corresponds to given device (if any).
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
1/*
2 * mm/thrash.c
3 *
4 * Copyright (C) 2004, Red Hat, Inc.
5 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
6 * Released under the GPL, see the file COPYING for details.
7 *
8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
18 */
19
20#include <linux/jiffies.h>
21#include <linux/mm.h>
22#include <linux/sched.h>
23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
29
30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm;
32static struct mem_cgroup *swap_token_memcg;
33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
36{
37 struct mem_cgroup *memcg;
38
39 memcg = try_get_mem_cgroup_from_mm(mm);
40 if (memcg)
41 css_put(mem_cgroup_css(memcg));
42
43 return memcg;
44}
45#else
46static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
47{
48 return NULL;
49}
50#endif
51
52void grab_swap_token(struct mm_struct *mm)
53{
54 int current_interval;
55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58
59 global_faults++;
60
61 current_interval = global_faults - mm->faultstamp;
62
63 if (!spin_trylock(&swap_token_lock))
64 return;
65
66 /* First come first served */
67 if (!swap_token_mm)
68 goto replace_token;
69
70 /*
71 * Usually, we don't need priority aging because long interval faults
72 * makes priority decrease quickly. But there is one exception. If the
73 * token owner task is sleeping, it never make long interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * of priority aging are
76 * 1) An aging interval is reasonable enough long. Too short aging
77 * interval makes quick swap token lost and decrease performance.
78 * 2) The swap token owner task have to get priority aging even if
79 * it's under sleep.
80 */
81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
82 swap_token_mm->token_priority /= 2;
83 last_aging = global_faults;
84 }
85
86 if (mm == swap_token_mm) {
87 mm->token_priority += 2;
88 goto update_priority;
89 }
90
91 if (current_interval < mm->last_interval)
92 mm->token_priority++;
93 else {
94 if (likely(mm->token_priority > 0))
95 mm->token_priority--;
96 }
97
98 /* Check if we deserve the token */
99 if (mm->token_priority > swap_token_mm->token_priority)
100 goto replace_token;
101
102update_priority:
103 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
104
105out:
106 mm->faultstamp = global_faults;
107 mm->last_interval = current_interval;
108 spin_unlock(&swap_token_lock);
109 return;
110
111replace_token:
112 mm->token_priority += 2;
113 trace_replace_swap_token(swap_token_mm, mm);
114 swap_token_mm = mm;
115 swap_token_memcg = swap_token_memcg_from_mm(mm);
116 last_aging = global_faults;
117 goto out;
118}
119
120/* Called on process exit. */
121void __put_swap_token(struct mm_struct *mm)
122{
123 spin_lock(&swap_token_lock);
124 if (likely(mm == swap_token_mm)) {
125 trace_put_swap_token(swap_token_mm);
126 swap_token_mm = NULL;
127 swap_token_memcg = NULL;
128 }
129 spin_unlock(&swap_token_lock);
130}
131
132static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
133{
134 if (!a)
135 return true;
136 if (!b)
137 return true;
138 if (a == b)
139 return true;
140 return false;
141}
142
143void disable_swap_token(struct mem_cgroup *memcg)
144{
145 /* memcg reclaim don't disable unrelated mm token. */
146 if (match_memcg(memcg, swap_token_memcg)) {
147 spin_lock(&swap_token_lock);
148 if (match_memcg(memcg, swap_token_memcg)) {
149 trace_disable_swap_token(swap_token_mm);
150 swap_token_mm = NULL;
151 swap_token_memcg = NULL;
152 }
153 spin_unlock(&swap_token_lock);
154 }
155}
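Note: for readers tracking the removal, the hand-off logic mm/thrash.c implemented can be condensed as the standalone toy below. It is a simplified sketch of the deleted grab_swap_token() shown above, using invented userspace types; it is not a reimplementation of the kernel interface.

#include <stdbool.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct toy_mm {
	unsigned int token_priority;
	unsigned int faultstamp;
	unsigned int last_interval;
};

static struct toy_mm *token_holder;
static unsigned int global_faults, last_aging;

/* Called on a major fault by "mm"; returns true if mm holds the token. */
static bool toy_grab_swap_token(struct toy_mm *mm)
{
	unsigned int interval = ++global_faults - mm->faultstamp;
	bool take = false;

	if (!token_holder) {
		take = true;				/* first come, first served */
	} else {
		/* Age a sleeping holder so the token cannot be parked forever. */
		if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
			token_holder->token_priority /= 2;
			last_aging = global_faults;
		}
		if (mm == token_holder)
			mm->token_priority += 2;	/* keep boosting the holder */
		else if (interval < mm->last_interval)
			mm->token_priority++;		/* faulting harder than before */
		else if (mm->token_priority > 0)
			mm->token_priority--;

		if (mm != token_holder &&
		    mm->token_priority > token_holder->token_priority)
			take = true;
	}
	if (take) {
		mm->token_priority += 2;
		token_holder = mm;
		last_aging = global_faults;
	}
	mm->faultstamp = global_faults;
	mm->last_interval = interval;
	return mm == token_holder;
}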
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..75801acdaac7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
602} 602}
603EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
604 604
605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
606{
607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
610
611 /*
612 * If the underlying filesystem is not going to provide
613 * a way to truncate a range of blocks (punch a hole) -
614 * we should return failure right now.
615 */
616 if (!inode->i_op->truncate_range)
617 return -ENOSYS;
618
619 mutex_lock(&inode->i_mutex);
620 inode_dio_wait(inode);
621 unmap_mapping_range(mapping, holebegin, holelen, 1);
622 inode->i_op->truncate_range(inode, lstart, lend);
623 /* unmap again to remove racily COWed private pages */
624 unmap_mapping_range(mapping, holebegin, holelen, 1);
625 mutex_unlock(&inode->i_mutex);
626
627 return 0;
628}
629
630/** 605/**
631 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 606 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
632 * @inode: inode 607 * @inode: inode
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b449..2aad49981b57 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void)
1185 /* Import existing vmlist entries. */ 1185 /* Import existing vmlist entries. */
1186 for (tmp = vmlist; tmp; tmp = tmp->next) { 1186 for (tmp = vmlist; tmp; tmp = tmp->next) {
1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1188 va->flags = tmp->flags | VM_VM_AREA; 1188 va->flags = VM_VM_AREA;
1189 va->va_start = (unsigned long)tmp->addr; 1189 va->va_start = (unsigned long)tmp->addr;
1190 va->va_end = va->va_start + tmp->size; 1190 va->va_end = va->va_start + tmp->size;
1191 va->vm = tmp;
1191 __insert_vmap_area(va); 1192 __insert_vmap_area(va);
1192 } 1193 }
1193 1194
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2375 return NULL; 2376 return NULL;
2376 } 2377 }
2377 2378
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2379 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2380 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2380 if (!vas || !vms) 2381 if (!vas || !vms)
2381 goto err_free2; 2382 goto err_free2;
2382 2383
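Note: the kcalloc() conversion above is an overflow-hardening change: kcalloc(n, size, gfp) refuses allocations where n * size would wrap, which the open-coded kzalloc(sizeof(x) * n, gfp) did not. A userspace analogue of that check, for illustration only:

#include <stdlib.h>
#include <string.h>

/* Like calloc()/kcalloc(): reject a multiplication that would overflow. */
static void *checked_calloc(size_t n, size_t size)
{
	void *p;

	if (size && n > (size_t)-1 / size)
		return NULL;			/* n * size would wrap around */
	p = malloc(n * size);
	if (p)
		memset(p, 0, n * size);		/* zeroed, like kzalloc()/kcalloc() */
	return p;
}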
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3d1365c17868..8deb5f4da4d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
53#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
55 55
56/*
57 * reclaim_mode determines how the inactive list is shrunk
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control { 56struct scan_control {
75 /* Incremented by the number of inactive pages that were scanned */ 57 /* Incremented by the number of inactive pages that were scanned */
76 unsigned long nr_scanned; 58 unsigned long nr_scanned;
@@ -97,12 +79,6 @@ struct scan_control {
97 int order; 79 int order;
98 80
99 /* 81 /*
100 * Intend to reclaim enough continuous memory rather than reclaim
101 * enough amount of memory. i.e, mode for high order allocation.
102 */
103 reclaim_mode_t reclaim_mode;
104
105 /*
106 * The memory cgroup that hit its limit and as a result is the 82 * The memory cgroup that hit its limit and as a result is the
107 * primary target of this reclaim invocation. 83 * primary target of this reclaim invocation.
108 */ 84 */
@@ -164,35 +140,22 @@ static bool global_reclaim(struct scan_control *sc)
164{ 140{
165 return !sc->target_mem_cgroup; 141 return !sc->target_mem_cgroup;
166} 142}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else 143#else
173static bool global_reclaim(struct scan_control *sc) 144static bool global_reclaim(struct scan_control *sc)
174{ 145{
175 return true; 146 return true;
176} 147}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif 148#endif
183 149
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 150static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
185{ 151{
186 if (!scanning_global_lru(mz)) 152 return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat;
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
188
189 return &mz->zone->reclaim_stat;
190} 153}
191 154
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, 155static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru) 156 enum lru_list lru)
194{ 157{
195 if (!scanning_global_lru(mz)) 158 if (!mem_cgroup_disabled())
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, 159 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone), 160 zone_to_nid(mz->zone),
198 zone_idx(mz->zone), 161 zone_idx(mz->zone),
@@ -364,39 +327,6 @@ out:
364 return ret; 327 return ret;
365} 328}
366 329
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372 /*
373 * Initially assume we are entering either lumpy reclaim or
374 * reclaim/compaction.Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
384 * restricting when its set to either costly allocations or when
385 * under memory pressure
386 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
395static void reset_reclaim_mode(struct scan_control *sc)
396{
397 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
398}
399
400static inline int is_page_cache_freeable(struct page *page) 330static inline int is_page_cache_freeable(struct page *page)
401{ 331{
402 /* 332 /*
@@ -416,10 +346,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 346 return 1;
417 if (bdi == current->backing_dev_info) 347 if (bdi == current->backing_dev_info)
418 return 1; 348 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 349 return 0;
424} 350}
425 351
@@ -523,8 +449,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
523 /* synchronous write or broken a_ops? */ 449 /* synchronous write or broken a_ops? */
524 ClearPageReclaim(page); 450 ClearPageReclaim(page);
525 } 451 }
526 trace_mm_vmscan_writepage(page, 452 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE); 453 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS; 454 return PAGE_SUCCESS;
530 } 455 }
@@ -707,13 +632,10 @@ static enum page_references page_check_references(struct page *page,
707 int referenced_ptes, referenced_page; 632 int referenced_ptes, referenced_page;
708 unsigned long vm_flags; 633 unsigned long vm_flags;
709 634
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 635 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
636 &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 637 referenced_page = TestClearPageReferenced(page);
712 638
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 639 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 640 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 641 * move the page to the unevictable list.
@@ -722,7 +644,7 @@ static enum page_references page_check_references(struct page *page,
722 return PAGEREF_RECLAIM; 644 return PAGEREF_RECLAIM;
723 645
724 if (referenced_ptes) { 646 if (referenced_ptes) {
725 if (PageAnon(page)) 647 if (PageSwapBacked(page))
726 return PAGEREF_ACTIVATE; 648 return PAGEREF_ACTIVATE;
727 /* 649 /*
728 * All mapped pages start out with page table 650 * All mapped pages start out with page table
@@ -813,19 +735,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
813 735
814 if (PageWriteback(page)) { 736 if (PageWriteback(page)) {
815 nr_writeback++; 737 nr_writeback++;
816 /* 738 unlock_page(page);
817 * Synchronous reclaim cannot queue pages for 739 goto keep;
818 * writeback due to the possibility of stack overflow
819 * but if it encounters a page under writeback, wait
820 * for the IO to complete.
821 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
823 may_enter_fs)
824 wait_on_page_writeback(page);
825 else {
826 unlock_page(page);
827 goto keep_lumpy;
828 }
829 } 740 }
830 741
831 references = page_check_references(page, mz, sc); 742 references = page_check_references(page, mz, sc);
@@ -908,7 +819,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 819 goto activate_locked;
909 case PAGE_SUCCESS: 820 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 821 if (PageWriteback(page))
911 goto keep_lumpy; 822 goto keep;
912 if (PageDirty(page)) 823 if (PageDirty(page))
913 goto keep; 824 goto keep;
914 825
@@ -994,7 +905,6 @@ cull_mlocked:
994 try_to_free_swap(page); 905 try_to_free_swap(page);
995 unlock_page(page); 906 unlock_page(page);
996 putback_lru_page(page); 907 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue; 908 continue;
999 909
1000activate_locked: 910activate_locked:
@@ -1007,8 +917,6 @@ activate_locked:
1007keep_locked: 917keep_locked:
1008 unlock_page(page); 918 unlock_page(page);
1009keep: 919keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages); 920 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 921 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 922 }
@@ -1064,11 +972,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file) 972 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret; 973 return ret;
1066 974
1067 /* 975 /* Do not give back unevictable pages for compaction */
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 976 if (PageUnevictable(page))
1073 return ret; 977 return ret;
1074 978
@@ -1153,9 +1057,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1153 struct lruvec *lruvec; 1057 struct lruvec *lruvec;
1154 struct list_head *src; 1058 struct list_head *src;
1155 unsigned long nr_taken = 0; 1059 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1060 unsigned long scan;
1160 int lru = LRU_BASE; 1061 int lru = LRU_BASE;
1161 1062
@@ -1168,10 +1069,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1168 1069
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1070 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1071 struct page *page;
1171 unsigned long pfn;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1072
1176 page = lru_to_page(src); 1073 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1074 prefetchw_prev_lru_page(page, src, flags);
@@ -1193,84 +1090,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1090 default:
1194 BUG(); 1091 BUG();
1195 } 1092 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1093 }
1275 1094
1276 *nr_scanned = scan; 1095 *nr_scanned = scan;
@@ -1278,7 +1097,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1097 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1098 nr_to_scan, scan,
1280 nr_taken, 1099 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file); 1100 mode, file);
1283 return nr_taken; 1101 return nr_taken;
1284} 1102}
@@ -1454,47 +1272,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1454} 1272}
1455 1273
1456/* 1274/*
1457 * Returns true if a direct reclaim should wait on pages under writeback.
1458 *
1459 * If we are direct reclaiming for contiguous pages and we do not reclaim
1460 * everything in the list, try again and wait for writeback IO to complete.
1461 * This will stall high-order allocations noticeably. Only do that when really
1462 * need to free the pages under high memory pressure.
1463 */
1464static inline bool should_reclaim_stall(unsigned long nr_taken,
1465 unsigned long nr_freed,
1466 int priority,
1467 struct scan_control *sc)
1468{
1469 int lumpy_stall_priority;
1470
1471 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd())
1473 return false;
1474
1475 /* Only stall on lumpy reclaim */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false;
1478
1479 /* If we have reclaimed everything on the isolated list, no stall */
1480 if (nr_freed == nr_taken)
1481 return false;
1482
1483 /*
1484 * For high-order allocations, there are two stall thresholds.
1485 * High-cost allocations stall immediately where as lower
1486 * order allocations such as stacks require the scanning
1487 * priority to be much higher before stalling.
1488 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY;
1491 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3;
1493
1494 return priority <= lumpy_stall_priority;
1495}
1496
1497/*
1498 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1275 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1499 * of reclaimed pages 1276 * of reclaimed pages
1500 */ 1277 */
@@ -1522,10 +1299,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 return SWAP_CLUSTER_MAX; 1299 return SWAP_CLUSTER_MAX;
1523 } 1300 }
1524 1301
1525 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528
1529 lru_add_drain(); 1302 lru_add_drain();
1530 1303
1531 if (!sc->may_unmap) 1304 if (!sc->may_unmap)
@@ -1556,13 +1329,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1329 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1557 &nr_dirty, &nr_writeback); 1330 &nr_dirty, &nr_writeback);
1558 1331
1559 /* Check if we should syncronously wait for writeback */
1560 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1561 set_reclaim_mode(priority, sc, true);
1562 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1563 priority, &nr_dirty, &nr_writeback);
1564 }
1565
1566 spin_lock_irq(&zone->lru_lock); 1332 spin_lock_irq(&zone->lru_lock);
1567 1333
1568 reclaim_stat->recent_scanned[0] += nr_anon; 1334 reclaim_stat->recent_scanned[0] += nr_anon;
@@ -1616,7 +1382,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1616 zone_idx(zone), 1382 zone_idx(zone),
1617 nr_scanned, nr_reclaimed, 1383 nr_scanned, nr_reclaimed,
1618 priority, 1384 priority,
1619 trace_shrink_flags(file, sc->reclaim_mode)); 1385 trace_shrink_flags(file));
1620 return nr_reclaimed; 1386 return nr_reclaimed;
1621} 1387}
1622 1388
@@ -1695,8 +1461,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
1695 1461
1696 lru_add_drain(); 1462 lru_add_drain();
1697 1463
1698 reset_reclaim_mode(sc);
1699
1700 if (!sc->may_unmap) 1464 if (!sc->may_unmap)
1701 isolate_mode |= ISOLATE_UNMAPPED; 1465 isolate_mode |= ISOLATE_UNMAPPED;
1702 if (!sc->may_writepage) 1466 if (!sc->may_writepage)
@@ -1737,7 +1501,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 } 1501 }
1738 } 1502 }
1739 1503
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1504 if (page_referenced(page, 0, sc->target_mem_cgroup,
1505 &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1506 nr_rotated += hpage_nr_pages(page);
1742 /* 1507 /*
1743 * Identify referenced, file-backed active pages and 1508 * Identify referenced, file-backed active pages and
@@ -1811,7 +1576,7 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1811 if (!total_swap_pages) 1576 if (!total_swap_pages)
1812 return 0; 1577 return 0;
1813 1578
1814 if (!scanning_global_lru(mz)) 1579 if (!mem_cgroup_disabled())
1815 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1580 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
1816 mz->zone); 1581 mz->zone);
1817 1582
@@ -1850,7 +1615,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1850 */ 1615 */
1851static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1616static int inactive_file_is_low(struct mem_cgroup_zone *mz)
1852{ 1617{
1853 if (!scanning_global_lru(mz)) 1618 if (!mem_cgroup_disabled())
1854 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1619 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
1855 mz->zone); 1620 mz->zone);
1856 1621
@@ -1984,10 +1749,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1984 * proportional to the fraction of recently scanned pages on 1749 * proportional to the fraction of recently scanned pages on
1985 * each list that were recently referenced and in active use. 1750 * each list that were recently referenced and in active use.
1986 */ 1751 */
1987 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1752 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1988 ap /= reclaim_stat->recent_rotated[0] + 1; 1753 ap /= reclaim_stat->recent_rotated[0] + 1;
1989 1754
1990 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1755 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1991 fp /= reclaim_stat->recent_rotated[1] + 1; 1756 fp /= reclaim_stat->recent_rotated[1] + 1;
1992 spin_unlock_irq(&mz->zone->lru_lock); 1757 spin_unlock_irq(&mz->zone->lru_lock);
1993 1758
@@ -2000,7 +1765,7 @@ out:
2000 unsigned long scan; 1765 unsigned long scan;
2001 1766
2002 scan = zone_nr_lru_pages(mz, lru); 1767 scan = zone_nr_lru_pages(mz, lru);
2003 if (priority || noswap) { 1768 if (priority || noswap || !vmscan_swappiness(mz, sc)) {
2004 scan >>= priority; 1769 scan >>= priority;
2005 if (!scan && force_scan) 1770 if (!scan && force_scan)
2006 scan = SWAP_CLUSTER_MAX; 1771 scan = SWAP_CLUSTER_MAX;
@@ -2010,23 +1775,35 @@ out:
2010 } 1775 }
2011} 1776}
2012 1777
1778/* Use reclaim/compaction for costly allocs or under memory pressure */
1779static bool in_reclaim_compaction(int priority, struct scan_control *sc)
1780{
1781 if (COMPACTION_BUILD && sc->order &&
1782 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1783 priority < DEF_PRIORITY - 2))
1784 return true;
1785
1786 return false;
1787}
1788
2013/* 1789/*
2014 * Reclaim/compaction depends on a number of pages being freed. To avoid 1790 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2015 * disruption to the system, a small number of order-0 pages continue to be 1791 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2016 * rotated and reclaimed in the normal fashion. However, by the time we get 1792 * true if more pages should be reclaimed such that when the page allocator
2017 * back to the allocator and call try_to_compact_zone(), we ensure that 1793 * calls try_to_compact_zone() it will have enough free pages to succeed.
2018 * there are enough free pages for it to be likely successful 1794 * It will give up earlier than that if there is difficulty reclaiming pages.
2019 */ 1795 */
2020static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1796static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2021 unsigned long nr_reclaimed, 1797 unsigned long nr_reclaimed,
2022 unsigned long nr_scanned, 1798 unsigned long nr_scanned,
1799 int priority,
2023 struct scan_control *sc) 1800 struct scan_control *sc)
2024{ 1801{
2025 unsigned long pages_for_compaction; 1802 unsigned long pages_for_compaction;
2026 unsigned long inactive_lru_pages; 1803 unsigned long inactive_lru_pages;
2027 1804
2028 /* If not in reclaim/compaction mode, stop */ 1805 /* If not in reclaim/compaction mode, stop */
2029 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1806 if (!in_reclaim_compaction(priority, sc))
2030 return false; 1807 return false;
2031 1808
2032 /* Consider stopping depending on scan and reclaim activity */ 1809 /* Consider stopping depending on scan and reclaim activity */
@@ -2128,7 +1905,8 @@ restart:
2128 1905
2129 /* reclaim/compaction might need reclaim to continue */ 1906 /* reclaim/compaction might need reclaim to continue */
2130 if (should_continue_reclaim(mz, nr_reclaimed, 1907 if (should_continue_reclaim(mz, nr_reclaimed,
2131 sc->nr_scanned - nr_scanned, sc)) 1908 sc->nr_scanned - nr_scanned,
1909 priority, sc))
2132 goto restart; 1910 goto restart;
2133 1911
2134 throttle_vm_writeout(sc->gfp_mask); 1912 throttle_vm_writeout(sc->gfp_mask);
@@ -2353,8 +2131,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2353 2131
2354 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2132 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2355 sc->nr_scanned = 0; 2133 sc->nr_scanned = 0;
2356 if (!priority)
2357 disable_swap_token(sc->target_mem_cgroup);
2358 aborted_reclaim = shrink_zones(priority, zonelist, sc); 2134 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2359 2135
2360 /* 2136 /*
@@ -2705,10 +2481,6 @@ loop_again:
2705 unsigned long lru_pages = 0; 2481 unsigned long lru_pages = 0;
2706 int has_under_min_watermark_zone = 0; 2482 int has_under_min_watermark_zone = 0;
2707 2483
2708 /* The swap token gets in the way of swapout... */
2709 if (!priority)
2710 disable_swap_token(NULL);
2711
2712 all_zones_ok = 1; 2484 all_zones_ok = 1;
2713 balanced = 0; 2485 balanced = 0;
2714 2486
@@ -3537,7 +3309,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3537 if (mapping_unevictable(page_mapping(page))) 3309 if (mapping_unevictable(page_mapping(page)))
3538 return 0; 3310 return 0;
3539 3311
3540 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3312 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3541 return 0; 3313 return 0;
3542 3314
3543 return 1; 3315 return 1;
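Note: one subtle vmscan.c change above is in get_scan_count(): dropping the "+ 1" from anon_prio and file_prio, together with the new !vmscan_swappiness() test, lets a swappiness of 0 push the anon scan target all the way to zero. A standalone sketch of the proportional split, using made-up reclaim-stat numbers, is below.

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;			/* vm.swappiness */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - swappiness;
	unsigned long recent_scanned[2] = { 1000, 4000 };	/* [0]=anon [1]=file */
	unsigned long recent_rotated[2] = { 800, 500 };

	/* As in the patched code: no "+ 1" on the priorities themselves. */
	unsigned long ap = anon_prio * (recent_scanned[0] + 1) /
			   (recent_rotated[0] + 1);
	unsigned long fp = file_prio * (recent_scanned[1] + 1) /
			   (recent_rotated[1] + 1);

	/* Each LRU's scan target is scaled by its share of ap + fp + 1. */
	printf("anon share: %lu%%\n", 100 * ap / (ap + fp + 1));
	printf("file share: %lu%%\n", 100 * fp / (ap + fp + 1));
	return 0;
}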
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0dad31dc1618..1bbbbd9776ad 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1223,7 +1223,6 @@ module_init(setup_vmstat)
1223#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 1223#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1224#include <linux/debugfs.h> 1224#include <linux/debugfs.h>
1225 1225
1226static struct dentry *extfrag_debug_root;
1227 1226
1228/* 1227/*
1229 * Return an index indicating how much of the available free memory is 1228 * Return an index indicating how much of the available free memory is
@@ -1361,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = {
1361 1360
1362static int __init extfrag_debug_init(void) 1361static int __init extfrag_debug_init(void)
1363{ 1362{
1363 struct dentry *extfrag_debug_root;
1364
1364 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 1365 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1365 if (!extfrag_debug_root) 1366 if (!extfrag_debug_root)
1366 return -ENOMEM; 1367 return -ENOMEM;
1367 1368
1368 if (!debugfs_create_file("unusable_index", 0444, 1369 if (!debugfs_create_file("unusable_index", 0444,
1369 extfrag_debug_root, NULL, &unusable_file_ops)) 1370 extfrag_debug_root, NULL, &unusable_file_ops))
1370 return -ENOMEM; 1371 goto fail;
1371 1372
1372 if (!debugfs_create_file("extfrag_index", 0444, 1373 if (!debugfs_create_file("extfrag_index", 0444,
1373 extfrag_debug_root, NULL, &extfrag_file_ops)) 1374 extfrag_debug_root, NULL, &extfrag_file_ops))
1374 return -ENOMEM; 1375 goto fail;
1375 1376
1376 return 0; 1377 return 0;
1378fail:
1379 debugfs_remove_recursive(extfrag_debug_root);
1380 return -ENOMEM;
1377} 1381}
1378 1382
1379module_init(extfrag_debug_init); 1383module_init(extfrag_debug_init);
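Note: the extfrag_debug_init() hunk switches to a create-or-unwind pattern: the root dentry becomes a local and debugfs_remove_recursive() tears down whatever was created before a failure, instead of leaking a half-populated directory. The same pattern for a hypothetical module (names invented for illustration):

#include <linux/debugfs.h>
#include <linux/module.h>

static const struct file_operations demo_fops;	/* assume filled in elsewhere */

static int __init demo_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("demo", NULL);

	if (!root)
		return -ENOMEM;

	if (!debugfs_create_file("stats", 0444, root, NULL, &demo_fops))
		goto fail;

	return 0;
fail:
	/* Remove the directory and anything created before the failure. */
	debugfs_remove_recursive(root);
	return -ENOMEM;
}
module_init(demo_debugfs_init);
MODULE_LICENSE("GPL");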