Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       76
-rw-r--r--  mm/huge_memory.c       5
-rw-r--r--  mm/hugetlb.c           8
-rw-r--r--  mm/ksm.c               6
-rw-r--r--  mm/memcontrol.c      222
-rw-r--r--  mm/memory-failure.c   25
-rw-r--r--  mm/memory.c           33
-rw-r--r--  mm/memory_hotplug.c   10
-rw-r--r--  mm/migrate.c           2
-rw-r--r--  mm/mmap.c             12
-rw-r--r--  mm/nommu.c             9
-rw-r--r--  mm/page_cgroup.c      71
-rw-r--r--  mm/rmap.c            111
-rw-r--r--  mm/shmem.c            74
-rw-r--r--  mm/slab.c              9
-rw-r--r--  mm/slub.c             12
-rw-r--r--  mm/swapfile.c          2
-rw-r--r--  mm/thrash.c          105
-rw-r--r--  mm/truncate.c         29
-rw-r--r--  mm/vmscan.c          106
20 files changed, 634 insertions, 293 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9..6cc604bd564 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
+	/*
+	 * Initialise the free scanner. The starting point is where we last
+	 * scanned from (or the end of the zone if starting). The low point
+	 * is the end of the pageblock the migration scanner is using.
+	 */
 	pfn = cc->free_pfn;
 	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-	high_pfn = low_pfn;
+
+	/*
+	 * Take care that if the migration scanner is at the end of the zone
+	 * that the free scanner does not accidentally move to the next zone
+	 * in the next isolation cycle.
+	 */
+	high_pfn = min(low_pfn, pfn);
 
 	/*
 	 * Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
 	return isolated > (inactive + active) / 2;
 }
 
+/* possible outcome of isolate_migratepages */
+typedef enum {
+	ISOLATE_ABORT,		/* Abort compaction now */
+	ISOLATE_NONE,		/* No pages isolated, continue scanning */
+	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
+} isolate_migrate_t;
+
 /*
  * Isolate all pages that can be migrated from the block pointed to by
  * the migrate scanner within compact_control.
  */
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
 	unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	/* Do not cross the free scanner or scan within a memory hole */
 	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
 		cc->migrate_pfn = end_pfn;
-		return 0;
+		return ISOLATE_NONE;
 	}
 
 	/*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	 * delay for some time until fewer pages are isolated
 	 */
 	while (unlikely(too_many_isolated(zone))) {
+		/* async migration should just abort */
+		if (!cc->sync)
+			return ISOLATE_ABORT;
+
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		if (fatal_signal_pending(current))
-			return 0;
+			return ISOLATE_ABORT;
 	}
 
 	/* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
-	return cc->nr_migratepages;
+	return ISOLATE_SUCCESS;
 }
 
 /*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
 	if (cc->free_pfn <= cc->migrate_pfn)
 		return COMPACT_COMPLETE;
 
-	/* Compaction run is not finished if the watermark is not met */
-	watermark = low_wmark_pages(zone);
-	watermark += (1 << cc->order);
-
-	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
-		return COMPACT_CONTINUE;
-
 	/*
 	 * order == -1 is expected when compacting via
 	 * /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
+	/* Compaction run is not finished if the watermark is not met */
+	watermark = low_wmark_pages(zone);
+	watermark += (1 << cc->order);
+
+	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+		return COMPACT_CONTINUE;
+
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		/* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	unsigned long watermark;
 
 	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
 	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
 	 * This is because during migration, copies of pages need to be
 	 * allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 		return COMPACT_SKIPPED;
 
 	/*
-	 * order == -1 is expected when compacting via
-	 * /proc/sys/vm/compact_memory
-	 */
-	if (order == -1)
-		return COMPACT_CONTINUE;
-
-	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
 	 *
-	 * index of -1 implies allocations might succeed dependingon watermarks
+	 * index of -1000 implies allocations might succeed depending on
+	 * watermarks
 	 * index towards 0 implies failure is due to lack of memory
 	 * index towards 1000 implies failure is due to fragmentation
 	 *
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
 		return COMPACT_SKIPPED;
 
-	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+			0, 0))
 		return COMPACT_PARTIAL;
 
 	return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		unsigned long nr_migrate, nr_remaining;
 		int err;
 
-		if (!isolate_migratepages(zone, cc))
+		switch (isolate_migratepages(zone, cc)) {
+		case ISOLATE_ABORT:
+			ret = COMPACT_PARTIAL;
+			goto out;
+		case ISOLATE_NONE:
 			continue;
+		case ISOLATE_SUCCESS:
+			;
+		}
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	}
 
+out:
 	/* Release free pages and check accounting */
 	cc->nr_freepages -= release_freepages(&cc->freepages);
 	VM_BUG_ON(cc->nr_freepages != 0);
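The tri-state isolate_migrate_t above replaces a boolean-style return so that compact_zone() can tell "nothing isolated, keep scanning" apart from "abort the whole run". A minimal standalone sketch of that dispatch pattern, in plain C with illustrative names (not part of the patch):

#include <stdio.h>

/* Tri-state outcome, mirroring isolate_migrate_t above. */
enum isolate_result { ABORT, NONE, SUCCESS };

/* Hypothetical scanner: produces all three outcomes over its lifetime. */
static enum isolate_result scan_step(int step)
{
	if (step == 3)
		return ABORT;	/* e.g. contention detected */
	return (step % 2) ? SUCCESS : NONE;
}

int main(void)
{
	for (int step = 0; step < 5; step++) {
		switch (scan_step(step)) {
		case ABORT:
			printf("step %d: abort early\n", step);
			return 1;	/* analogous to the "goto out" above */
		case NONE:
			continue;	/* nothing isolated, keep scanning */
		case SUCCESS:
			printf("step %d: migrate pages\n", step);
		}
	}
	return 0;
}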
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3c..81532f297fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void)
 	while (likely(khugepaged_enabled())) {
 #ifndef CONFIG_NUMA
 		hpage = khugepaged_alloc_hugepage();
-		if (unlikely(!hpage)) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		if (unlikely(!hpage))
 			break;
-		}
-		count_vm_event(THP_COLLAPSE_ALLOC);
 #else
 		if (IS_ERR(hpage)) {
 			khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6402458fee3..bfcf153bc82 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
 		WARN_ON(page_count(page) != 1);
 		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
+		/*
+		 * If we had gigantic hugepages allocated at boot time, we need
+		 * to restore the 'stolen' pages to totalram_pages in order to
+		 * fix confusing memory reports from free(1) and another
+		 * side-effects, like CommitLimit going negative.
+		 */
+		if (h->order > (MAX_ORDER - 1))
+			totalram_pages += 1 << h->order;
 	}
 }
 
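As a worked example of the accounting restored above (figures assumed for illustration): with 4 KiB base pages and MAX_ORDER = 11, a boot-allocated 1 GiB gigantic page has order 18, so the branch returns 1 << 18 = 262144 pages (1 GiB) to totalram_pages for each such page.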
diff --git a/mm/ksm.c b/mm/ksm.c
index d708b3ef226..9a68b0cf0a1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
 		ksm_scan.mm_slot = slot;
 		spin_unlock(&ksm_mmlist_lock);
+		/*
+		 * Although we tested list_empty() above, a racing __ksm_exit
+		 * of the last mm on the list may have removed it since then.
+		 */
+		if (slot == &ksm_mm_head)
+			return NULL;
 next_mm:
 	ksm_scan.address = 0;
 	ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3a..e013b8e57d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t scan_nodes;
-	unsigned long next_scan_node_update;
+	atomic_t numainfo_events;
+	atomic_t numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
 				MEM_CGROUP_TARGET_SOFTLIMIT);
 		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
+		}
+#endif
 	}
 }
 
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
 
-	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory and
+			 * charges will not give any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
 	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
 	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
 	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
 	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging. It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * getcpu()/putcpu().
+	 */
+	curcpu = raw_smp_processor_id();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
 	}
 	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 	/* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
 	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
 	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "numa_stat",
 		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
 	},
 #endif
 };
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
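The reworked drain path above pairs mutex_trylock() (only one system-wide drain at a time) with a per-stock FLUSHING_CACHED_CHARGE bit so that a CPU's work item is never queued twice before it runs. A rough userspace analogue of that per-object guard, using C11 atomics in place of the kernel's bitops (illustrative only, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

struct stock {
	atomic_flag flushing;	/* plays the role of FLUSHING_CACHED_CHARGE */
};

/* Queue a drain only if one is not already pending for this stock. */
static void maybe_queue_drain(struct stock *s, int cpu)
{
	if (!atomic_flag_test_and_set(&s->flushing))
		printf("cpu %d: drain scheduled\n", cpu);
	else
		printf("cpu %d: drain already pending, skip\n", cpu);
}

/* The worker clears the flag when done, re-arming the guard. */
static void drain_done(struct stock *s)
{
	atomic_flag_clear(&s->flushing);
}

int main(void)
{
	struct stock s = { .flushing = ATOMIC_FLAG_INIT };
	maybe_queue_drain(&s, 0);
	maybe_queue_drain(&s, 0);	/* skipped: still pending */
	drain_done(&s);
	maybe_queue_drain(&s, 0);	/* queued again */
	return 0;
}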
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928..740c4f52059 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -390,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
-		goto out;
+		return;
+
+	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
 
@@ -407,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			add_to_kill(tsk, page, vma, to_kill, tkc);
 		}
 	}
-	page_unlock_anon_vma(av);
-out:
 	read_unlock(&tasklist_lock);
+	page_unlock_anon_vma(av);
 }
 
 /*
@@ -423,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct prio_tree_iter iter;
 	struct address_space *mapping = page->mapping;
 
-	/*
-	 * A note on the locking order between the two locks.
-	 * We don't rely on this particular order.
-	 * If you have some other code that needs a different order
-	 * feel free to switch them around. Or add a reverse link
-	 * from mm_struct to task_struct, then this could be all
-	 * done without taking tasklist_lock and looping over all tasks.
-	 */
-
-	read_lock(&tasklist_lock);
 	mutex_lock(&mapping->i_mmap_mutex);
+	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
@@ -453,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			add_to_kill(tsk, page, vma, to_kill, tkc);
 		}
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
 	read_unlock(&tasklist_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 }
 
 /*
@@ -1468,7 +1460,8 @@ int soft_offline_page(struct page *page, int flags)
 	put_page(page);
 	if (!ret) {
 		LIST_HEAD(pagelist);
-
+		inc_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 								0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e0..9b8a01d941c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return 0;
+		batch = tlb->active;
 	}
 	VM_BUG_ON(batch->nr > batch->max);
 
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
+	pte_t *start_pte;
 	pte_t *pte;
 
 again:
 	init_rss_vec(rss);
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = start_pte;
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
@@ -1196,7 +1199,7 @@ again:
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap_unlock(start_pte, ptl);
 
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
@@ -1296,7 +1299,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
@@ -2796,30 +2799,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
-	struct address_space *mapping = inode->i_mapping;
-
-	/*
-	 * If the underlying filesystem is not going to provide
-	 * a way to truncate a range of blocks (punch a hole) -
-	 * we should return failure right now.
-	 */
-	if (!inode->i_op->truncate_range)
-		return -ENOSYS;
-
-	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	truncate_inode_pages_range(mapping, offset, end);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	inode->i_op->truncate_range(inode, offset, end);
-	up_write(&inode->i_alloc_sem);
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
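The start_pte change in zap_pte_range() above matters because the loop can now break out early (force_flush): pte no longer necessarily points one past the last entry, so unlocking via pte - 1 would use the wrong address. Remembering the start pointer is the general fix; a shape-only sketch with no kernel APIs (names and values are illustrative):

#include <stdio.h>

int entries[8];

void walk(int from, int to)
{
	int *start = &entries[from];	/* remember where the walk began */
	int *p = start;

	for (int i = from; i < to; i++, p++) {
		if (*p < 0)
			break;		/* early exit: p != &entries[to] */
	}
	/* release with the saved start, not p - 1 */
	printf("unmap from index %td, not %td\n",
	       start - entries, (p - 1) - entries);
}

int main(void)
{
	entries[3] = -1;	/* force an early break */
	walk(0, 8);
	return 0;
}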
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32..c46887b5a11 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
 
+	/*
+	 * The node we allocated has no zone fallback lists. For avoiding
+	 * to access not-initialized zonelist, build here.
+	 */
+	mutex_lock(&zonelists_mutex);
+	build_all_zonelists(NULL);
+	mutex_unlock(&zonelists_mutex);
+
 	return pgdat;
 }
 
@@ -515,7 +523,7 @@ int mem_online_node(int nid)
 
 	lock_memory_hotplug();
 	pgdat = hotadd_new_pgdat(nid, 0);
-	if (pgdat) {
+	if (!pgdat) {
 		ret = -ENOMEM;
 		goto out;
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983..666e4e67741 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	if (PageSwapBacked(page)) {
+	if (!PageSwapCache(page) && PageSwapBacked(page)) {
 		__dec_zone_page_state(page, NR_SHMEM);
 		__inc_zone_page_state(newpage, NR_SHMEM);
 	}
diff --git a/mm/mmap.c b/mm/mmap.c
index bbdc9af5e11..d49736ff8a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
 	if (anon_vma)
 		return anon_vma;
 try_prev:
-	/*
-	 * It is potentially slow to have to call find_vma_prev here.
-	 * But it's only on the first write fault on the vma, not
-	 * every time, and we could devise a way to avoid it later
-	 * (e.g. stash info in next's anon_vma_node when assigning
-	 * an anon_vma, or when trying vma_merge). Another time.
-	 */
-	BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+	near = vma->vm_prev;
 	if (!near)
 		goto none;
 
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 		return -EINVAL;
 
 	/* Find the first overlapping VMA */
-	vma = find_vma_prev(mm, start, &prev);
+	vma = find_vma(mm, start);
 	if (!vma)
 		return 0;
+	prev = vma->vm_prev;
 	/* we have start < vma->vm_end  */
 
 	/* if it doesn't overlap, we have nothing.. */
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a..9edc897a397 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1813,10 +1813,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	return NULL;
 }
 
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
-		unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
-	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+	if (addr != (pfn << PAGE_SHIFT))
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 	return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1b..53bffc6c293 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
 	struct page_cgroup *base, *pc;
 	struct mem_section *section;
 	unsigned long table_size;
 	unsigned long nr;
-	int nid, index;
+	int index;
 
 	nr = pfn_to_section_nr(pfn);
 	section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 	if (section->page_cgroup)
 		return 0;
 
-	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	base = alloc_page_cgroup(table_size, nid);
 
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 		pc = base + index;
 		init_page_cgroup(pc, nr);
 	}
-
+	/*
+	 * The passed "pfn" may not be aligned to SECTION. For the calculation
+	 * we need to apply a mask.
+	 */
+	pfn &= PAGE_SECTION_MASK;
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
 	return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
+	if (nid == -1) {
+		/*
+		 * In this case, "nid" already exists and contains valid memory.
+		 * "start_pfn" passed to us is a pfn which is an arg for
+		 * online__pages(), and start_pfn should exist.
+		 */
+		nid = pfn_to_nid(start_pfn);
+		VM_BUG_ON(!node_state(nid, N_ONLINE));
+	}
+
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;
-		fail = init_section_page_cgroup(pfn);
+		fail = init_section_page_cgroup(pfn, nid);
 	}
 	if (!fail)
 		return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
-	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
-	}
-	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-		panic("Out of memory");
-	} else {
-		hotplug_memory_notifier(page_cgroup_callback, 0);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = node_start_pfn(nid);
+		end_pfn = node_end_pfn(nid);
+		/*
+		 * start_pfn and end_pfn may not be aligned to SECTION and the
+		 * page->flags of out of node pages are not initialized. So we
+		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+		 */
+		for (pfn = start_pfn;
+		     pfn < end_pfn;
+		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes's pfns can be overlapping.
+			 * We know some arch can have a nodes layout such as
+			 * -------------pfn-------------->
+			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			if (init_section_page_cgroup(pfn, nid))
+				goto oom;
+		}
 	}
+	hotplug_memory_notifier(page_cgroup_callback, 0);
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-	" want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+	       "don't want memory cgroups\n");
+	return;
+oom:
+	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+	panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
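The new per-node loop in page_cgroup_init() steps with ALIGN(pfn + 1, PAGES_PER_SECTION), which jumps to the next section boundary even when the node's start_pfn is unaligned. A tiny demonstration of that stepping (values assumed for illustration; PAGES_PER_SECTION is typically a power of two such as 32768):

#include <stdio.h>

/* Round x up to the next multiple of the power-of-two a (kernel ALIGN()). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long section = 32768;			/* assumed PAGES_PER_SECTION */
	unsigned long start = 40000, end = 140000;	/* unaligned node range */

	/* Visits 40000, 65536, 98304, 131072: one pfn per section. */
	for (unsigned long pfn = start; pfn < end;
	     pfn = ALIGN(pfn + 1, section))
		printf("visit section containing pfn %lu\n", pfn);
	return 0;
}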
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88d..23295f65ae4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -38,9 +38,8 @@
  *               in arch-dependent flush_dcache_mmap_lock,
  *               within inode_wb_list_lock in __sync_single_inode)
  *
- * (code doesn't rely on that order so it could be switched around)
+ * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
- *     anon_vma->mutex      (memory_failure, collect_procs_anon)
  *       pte map lock
  */
 
@@ -112,9 +111,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
 {
-	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
 }
 
 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		struct mm_struct *mm = vma->vm_mm;
 		struct anon_vma *allocated;
 
-		avc = anon_vma_chain_alloc();
+		avc = anon_vma_chain_alloc(GFP_KERNEL);
 		if (!avc)
 			goto out_enomem;
 
@@ -200,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 	return -ENOMEM;
 }
 
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+	struct anon_vma *new_root = anon_vma->root;
+	if (new_root != root) {
+		if (WARN_ON_ONCE(root))
+			mutex_unlock(&root->mutex);
+		root = new_root;
+		mutex_lock(&root->mutex);
+	}
+	return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+	if (root)
+		mutex_unlock(&root->mutex);
+}
+
 static void anon_vma_chain_link(struct vm_area_struct *vma,
 				struct anon_vma_chain *avc,
 				struct anon_vma *anon_vma)
@@ -208,13 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-	anon_vma_lock(anon_vma);
 	/*
 	 * It's critical to add new vmas to the tail of the anon_vma,
 	 * see comment in huge_memory.c:__split_huge_page().
 	 */
 	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-	anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -224,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
 	struct anon_vma_chain *avc, *pavc;
+	struct anon_vma *root = NULL;
 
 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
-		avc = anon_vma_chain_alloc();
-		if (!avc)
-			goto enomem_failure;
-		anon_vma_chain_link(dst, avc, pavc->anon_vma);
+		struct anon_vma *anon_vma;
+
+		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+		if (unlikely(!avc)) {
+			unlock_anon_vma_root(root);
+			root = NULL;
+			avc = anon_vma_chain_alloc(GFP_KERNEL);
+			if (!avc)
+				goto enomem_failure;
+		}
+		anon_vma = pavc->anon_vma;
+		root = lock_anon_vma_root(root, anon_vma);
+		anon_vma_chain_link(dst, avc, anon_vma);
 	}
+	unlock_anon_vma_root(root);
 	return 0;
 
  enomem_failure:
@@ -263,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	anon_vma = anon_vma_alloc();
 	if (!anon_vma)
 		goto out_error;
-	avc = anon_vma_chain_alloc();
+	avc = anon_vma_chain_alloc(GFP_KERNEL);
 	if (!avc)
 		goto out_error_free_anon_vma;
 
@@ -280,7 +314,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
+	anon_vma_lock(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
+	anon_vma_unlock(anon_vma);
 
 	return 0;
 
@@ -291,36 +327,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	return -ENOMEM;
 }
 
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
-	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
-	int empty;
-
-	/* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
-	if (!anon_vma)
-		return;
-
-	anon_vma_lock(anon_vma);
-	list_del(&anon_vma_chain->same_anon_vma);
-
-	/* We must garbage collect the anon_vma if it's empty */
-	empty = list_empty(&anon_vma->head);
-	anon_vma_unlock(anon_vma);
-
-	if (empty)
-		put_anon_vma(anon_vma);
-}
-
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
 	struct anon_vma_chain *avc, *next;
+	struct anon_vma *root = NULL;
 
 	/*
 	 * Unlink each anon_vma chained to the VMA. This list is ordered
 	 * from newest to oldest, ensuring the root anon_vma gets freed last.
 	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
-		anon_vma_unlink(avc);
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		root = lock_anon_vma_root(root, anon_vma);
+		list_del(&avc->same_anon_vma);
+
+		/*
+		 * Leave empty anon_vmas on the list - we'll need
+		 * to free them outside the lock.
+		 */
+		if (list_empty(&anon_vma->head))
+			continue;
+
+		list_del(&avc->same_vma);
+		anon_vma_chain_free(avc);
+	}
+	unlock_anon_vma_root(root);
+
+	/*
+	 * Iterate the list once more, it now only contains empty and unlinked
+	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+	 * needing to acquire the anon_vma->root->mutex.
+	 */
+	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		put_anon_vma(anon_vma);
+
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
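lock_anon_vma_root() above turns what used to be one lock/unlock per chain entry into a single hold per run of entries sharing a root, re-locking only when the root changes. A minimal sketch of the same batching idea over a plain array, with pthread mutexes standing in for anon_vma->root->mutex (illustrative, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct item {
	pthread_mutex_t *root;	/* stands in for anon_vma->root->mutex */
};

/* Re-lock only when the root actually changes. */
static pthread_mutex_t *lock_root(pthread_mutex_t *held, pthread_mutex_t *want)
{
	if (want != held) {
		if (held)
			pthread_mutex_unlock(held);
		pthread_mutex_lock(want);
		held = want;
	}
	return held;
}

static void process_all(struct item *items, size_t n)
{
	pthread_mutex_t *held = NULL;
	for (size_t i = 0; i < n; i++) {
		held = lock_root(held, items[i].root);
		/* ... work on items[i] under its root lock ... */
	}
	if (held)
		pthread_mutex_unlock(held);	/* unlock_anon_vma_root analogue */
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	struct item items[] = { {&a}, {&a}, {&b}, {&b}, {&a} };
	process_all(items, 5);	/* three lock switches instead of five pairs */
	return 0;
}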
diff --git a/mm/shmem.c b/mm/shmem.c
index d221a1cfd7b..fcedf5464eb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -539,7 +539,7 @@ static void shmem_free_pages(struct list_head *next)
 	} while (next);
 }
 
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -562,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 	spinlock_t *punch_lock;
 	unsigned long upper_limit;
 
+	truncate_inode_pages_range(inode->i_mapping, start, end);
+
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
@@ -738,16 +740,8 @@ done2:
 		 * lowered next_index. Also, though shmem_getpage checks
 		 * i_size before adding to cache, no recheck after: so fix the
 		 * narrow window there too.
-		 *
-		 * Recalling truncate_inode_pages_range and unmap_mapping_range
-		 * every time for punch_hole (which never got a chance to clear
-		 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
-		 * yet hardly ever necessary: try to optimize them out later.
 		 */
 		truncate_inode_pages_range(inode->i_mapping, start, end);
-		if (punch_hole)
-			unmap_mapping_range(inode->i_mapping, start,
-							end - start, 1);
 	}
 
 	spin_lock(&info->lock);
@@ -766,22 +760,23 @@ done2:
 		shmem_free_pages(pages_to_free.next);
 	}
 }
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	loff_t newsize = attr->ia_size;
 	int error;
 
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 
-	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
-	    && newsize != inode->i_size) {
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		loff_t oldsize = inode->i_size;
+		loff_t newsize = attr->ia_size;
 		struct page *page = NULL;
 
-		if (newsize < inode->i_size) {
+		if (newsize < oldsize) {
 			/*
 			 * If truncating down to a partial page, then
 			 * if that page is already allocated, hold it
@@ -810,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 				spin_unlock(&info->lock);
 			}
 		}
-
-		/* XXX(truncate): truncate_setsize should be called last */
-		truncate_setsize(inode, newsize);
+		if (newsize != oldsize) {
+			i_size_write(inode, newsize);
+			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		}
+		if (newsize < oldsize) {
+			loff_t holebegin = round_up(newsize, PAGE_SIZE);
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+			shmem_truncate_range(inode, newsize, (loff_t)-1);
+			/* unmap again to remove racily COWed private pages */
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+		}
 		if (page)
 			page_cache_release(page);
-		shmem_truncate_range(inode, newsize, (loff_t)-1);
 	}
 
 	setattr_copy(inode, attr);
@@ -832,7 +834,6 @@ static void shmem_evict_inode(struct inode *inode)
832 struct shmem_xattr *xattr, *nxattr; 834 struct shmem_xattr *xattr, *nxattr;
833 835
834 if (inode->i_mapping->a_ops == &shmem_aops) { 836 if (inode->i_mapping->a_ops == &shmem_aops) {
835 truncate_inode_pages(inode->i_mapping, 0);
836 shmem_unacct_size(info->flags, inode->i_size); 837 shmem_unacct_size(info->flags, inode->i_size);
837 inode->i_size = 0; 838 inode->i_size = 0;
838 shmem_truncate_range(inode, 0, (loff_t)-1); 839 shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -2706,7 +2707,7 @@ static const struct file_operations shmem_file_operations = {
2706}; 2707};
2707 2708
2708static const struct inode_operations shmem_inode_operations = { 2709static const struct inode_operations shmem_inode_operations = {
2709 .setattr = shmem_notify_change, 2710 .setattr = shmem_setattr,
2710 .truncate_range = shmem_truncate_range, 2711 .truncate_range = shmem_truncate_range,
2711#ifdef CONFIG_TMPFS_XATTR 2712#ifdef CONFIG_TMPFS_XATTR
2712 .setxattr = shmem_setxattr, 2713 .setxattr = shmem_setxattr,
@@ -2739,7 +2740,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2739 .removexattr = shmem_removexattr, 2740 .removexattr = shmem_removexattr,
2740#endif 2741#endif
2741#ifdef CONFIG_TMPFS_POSIX_ACL 2742#ifdef CONFIG_TMPFS_POSIX_ACL
2742 .setattr = shmem_notify_change, 2743 .setattr = shmem_setattr,
2743 .check_acl = generic_check_acl, 2744 .check_acl = generic_check_acl,
2744#endif 2745#endif
2745}; 2746};
@@ -2752,7 +2753,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2752 .removexattr = shmem_removexattr, 2753 .removexattr = shmem_removexattr,
2753#endif 2754#endif
2754#ifdef CONFIG_TMPFS_POSIX_ACL 2755#ifdef CONFIG_TMPFS_POSIX_ACL
2755 .setattr = shmem_notify_change, 2756 .setattr = shmem_setattr,
2756 .check_acl = generic_check_acl, 2757 .check_acl = generic_check_acl,
2757#endif 2758#endif
2758}; 2759};
@@ -2908,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2908 return 0; 2909 return 0;
2909} 2910}
2910 2911
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2913{
2914 truncate_inode_pages_range(inode->i_mapping, start, end);
2915}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917
2911#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2912/** 2919/**
2913 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -3028,3 +3035,26 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 vma->vm_flags |= VM_CAN_NONLINEAR; 3035 vma->vm_flags |= VM_CAN_NONLINEAR;
3029 return 0; 3036 return 0;
3030} 3037}
3038
3039/**
3040 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3041 * @mapping: the page's address_space
3042 * @index: the page index
3043 * @gfp: the page allocator flags to use if allocating
3044 *
3045 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3046 * with any new page allocations done using the specified allocation flags.
3047 * But read_cache_page_gfp() uses the ->readpage() method: which does not
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 *
3051 * Provide a stub for those callers to start using now, then later
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp)
3057{
3058 return read_cache_page_gfp(mapping, index, gfp);
3059}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
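
For context, a minimal sketch of how a driver caller might use this new export. shmem_read_mapping_page_gfp() and page_cache_release() are the interfaces shown above; pin_object_pages(), its parameters, and the unwind logic are illustrative only, not taken from any in-tree driver:

    #include <linux/err.h>
    #include <linux/pagemap.h>
    #include <linux/shmem_fs.h>

    /* Pin every page backing a tmpfs file, one reference per page. */
    static int pin_object_pages(struct file *filp, struct page **pages,
                                pgoff_t num_pages)
    {
            struct address_space *mapping = filp->f_mapping;
            gfp_t gfp = mapping_gfp_mask(mapping);
            pgoff_t i;

            for (i = 0; i < num_pages; i++) {
                    struct page *page;

                    /* currently a read_cache_page_gfp() stub, see above */
                    page = shmem_read_mapping_page_gfp(mapping, i, gfp);
                    if (IS_ERR(page)) {
                            while (i--)
                                    page_cache_release(pages[i]);
                            return PTR_ERR(page);
                    }
                    pages[i] = page;    /* returned with a reference held */
            }
            return 0;
    }
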
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8a..d96e223de77 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ free_done:
3604 * Release an obj back to its cache. If the obj has a constructed state, it must 3604 * Release an obj back to its cache. If the obj has a constructed state, it must
3605 * be in this state _before_ it is released. Called with disabled ints. 3605 * be in this state _before_ it is released. Called with disabled ints.
3606 */ 3606 */
3607static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3607static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3608 void *caller)
3608{ 3609{
3609 struct array_cache *ac = cpu_cache_get(cachep); 3610 struct array_cache *ac = cpu_cache_get(cachep);
3610 3611
3611 check_irq_off(); 3612 check_irq_off();
3612 kmemleak_free_recursive(objp, cachep->flags); 3613 kmemleak_free_recursive(objp, cachep->flags);
3613 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3614 objp = cache_free_debugcheck(cachep, objp, caller);
3614 3615
3615 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3616 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3616 3617
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3801 debug_check_no_locks_freed(objp, obj_size(cachep)); 3802 debug_check_no_locks_freed(objp, obj_size(cachep));
3802 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3803 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3803 debug_check_no_obj_freed(objp, obj_size(cachep)); 3804 debug_check_no_obj_freed(objp, obj_size(cachep));
3804 __cache_free(cachep, objp); 3805 __cache_free(cachep, objp, __builtin_return_address(0));
3805 local_irq_restore(flags); 3806 local_irq_restore(flags);
3806 3807
3807 trace_kmem_cache_free(_RET_IP_, objp); 3808 trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
3831 c = virt_to_cache(objp); 3832 c = virt_to_cache(objp);
3832 debug_check_no_locks_freed(objp, obj_size(c)); 3833 debug_check_no_locks_freed(objp, obj_size(c));
3833 debug_check_no_obj_freed(objp, obj_size(c)); 3834 debug_check_no_obj_freed(objp, obj_size(c));
3834 __cache_free(c, (void *)objp); 3835 __cache_free(c, (void *)objp, __builtin_return_address(0));
3835 local_irq_restore(flags); 3836 local_irq_restore(flags);
3836} 3837}
3837EXPORT_SYMBOL(kfree); 3838EXPORT_SYMBOL(kfree);
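
The point of threading caller through __cache_free() is that kmem_cache_free() and kfree() each capture their own return address at the API boundary, so the debug checks blame the real call site instead of the internal helper. A stand-alone user-space sketch of the same pattern (record_free(), my_free() and friends are illustrative stand-ins, not kernel interfaces):

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-in for cache_free_debugcheck(): log who freed what */
    static void record_free(void *obj, void *caller)
    {
            printf("free of %p called from %p\n", obj, caller);
    }

    static inline void cache_free_internal(void *obj, void *caller)
    {
            record_free(obj, caller);   /* use the address we were handed */
            free(obj);
    }

    void my_free(void *obj)
    {
            /* capture the caller once, at the exported entry point */
            cache_free_internal(obj, __builtin_return_address(0));
    }

    int main(void)
    {
            my_free(malloc(32));
            return 0;
    }
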
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b..35f351f2619 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2322 2322
2323#ifdef CONFIG_CMPXCHG_LOCAL
2324 /* 2323 /*
2325 * Must align to double word boundary for the double cmpxchg instructions 2324 * Must align to double word boundary for the double cmpxchg
2326 * to work. 2325 * instructions to work; see __pcpu_double_call_return_bool().
2327 */ 2326 */
2328 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); 2327 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2329#else 2328 2 * sizeof(void *));
2330 /* Regular alignment is sufficient */
2331 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2332#endif
2333 2329
2334 if (!s->cpu_slab) 2330 if (!s->cpu_slab)
2335 return 0; 2331 return 0;
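
With the #ifdef gone, the per-cpu slab state is always allocated at 2 * sizeof(void *) alignment, which the double-width cmpxchg used by the lockless fastpath requires. A user-space sketch of that alignment contract (struct slab_cpu and cas_double() are illustrative; build with gcc -mcx16, possibly linking -latomic):

    #include <stdalign.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* mirrors the freelist/tid pair loosely; the alignment is the point */
    struct slab_cpu {
            void *freelist;
            unsigned long tid;
    } __attribute__((aligned(2 * sizeof(void *))));

    static bool cas_double(struct slab_cpu *s, struct slab_cpu old,
                           struct slab_cpu new)
    {
            /* compiles to cmpxchg16b on x86-64 when the operand is aligned */
            return __atomic_compare_exchange(s, &old, &new, false,
                                             __ATOMIC_SEQ_CST,
                                             __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
            struct slab_cpu cur = { NULL, 0 };
            struct slab_cpu old = { NULL, 0 };
            struct slab_cpu new = { (void *)0x1000, 1 };

            printf("alignment %zu, cas %s\n", alignof(struct slab_cpu),
                   cas_double(&cur, old, new) ? "ok" : "failed");
            return 0;
    }
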
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7..ff8dc1a18cb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd..fabf2d0f516 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
24 29
25static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg;
27static unsigned int global_faults; 33static unsigned int global_faults;
34static unsigned int last_aging;
35
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
38{
39 struct mem_cgroup *memcg;
40
41 memcg = try_get_mem_cgroup_from_mm(mm);
42 if (memcg)
43 css_put(mem_cgroup_css(memcg));
44
45 return memcg;
46}
47#else
48static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
49{
50 return NULL;
51}
52#endif
28 53
29void grab_swap_token(struct mm_struct *mm) 54void grab_swap_token(struct mm_struct *mm)
30{ 55{
31 int current_interval; 56 int current_interval;
57 unsigned int old_prio = mm->token_priority;
32 58
33 global_faults++; 59 global_faults++;
34 60
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
38 return; 64 return;
39 65
40 /* First come first served */ 66 /* First come first served */
41 if (swap_token_mm == NULL) { 67 if (!swap_token_mm)
42 mm->token_priority = mm->token_priority + 2; 68 goto replace_token;
43 swap_token_mm = mm; 69
44 goto out; 70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults;
45 } 73 }
46 74
47 if (mm != swap_token_mm) { 75 if (mm == swap_token_mm) {
48 if (current_interval < mm->last_interval)
49 mm->token_priority++;
50 else {
51 if (likely(mm->token_priority > 0))
52 mm->token_priority--;
53 }
54 /* Check if we deserve the token */
55 if (mm->token_priority > swap_token_mm->token_priority) {
56 mm->token_priority += 2;
57 swap_token_mm = mm;
58 }
59 } else {
60 /* Token holder came in again! */
61 mm->token_priority += 2; 76 mm->token_priority += 2;
77 goto update_priority;
78 }
79
80 if (current_interval < mm->last_interval)
81 mm->token_priority++;
82 else {
83 if (likely(mm->token_priority > 0))
84 mm->token_priority--;
62 } 85 }
63 86
87 /* Check if we deserve the token */
88 if (mm->token_priority > swap_token_mm->token_priority)
89 goto replace_token;
90
91update_priority:
92 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
93
64out: 94out:
65 mm->faultstamp = global_faults; 95 mm->faultstamp = global_faults;
66 mm->last_interval = current_interval; 96 mm->last_interval = current_interval;
67 spin_unlock(&swap_token_lock); 97 spin_unlock(&swap_token_lock);
98 return;
99
100replace_token:
101 mm->token_priority += 2;
102 trace_replace_swap_token(swap_token_mm, mm);
103 swap_token_mm = mm;
104 swap_token_memcg = swap_token_memcg_from_mm(mm);
105 last_aging = global_faults;
106 goto out;
68} 107}
69 108
70/* Called on process exit. */ 109/* Called on process exit. */
71void __put_swap_token(struct mm_struct *mm) 110void __put_swap_token(struct mm_struct *mm)
72{ 111{
73 spin_lock(&swap_token_lock); 112 spin_lock(&swap_token_lock);
74 if (likely(mm == swap_token_mm)) 113 if (likely(mm == swap_token_mm)) {
114 trace_put_swap_token(swap_token_mm);
75 swap_token_mm = NULL; 115 swap_token_mm = NULL;
116 swap_token_memcg = NULL;
117 }
76 spin_unlock(&swap_token_lock); 118 spin_unlock(&swap_token_lock);
77} 119}
120
121static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
122{
123 if (!a)
124 return true;
125 if (!b)
126 return true;
127 if (a == b)
128 return true;
129 return false;
130}
131
132void disable_swap_token(struct mem_cgroup *memcg)
133{
134 /* memcg reclaim doesn't disable an unrelated mm's token. */
135 if (match_memcg(memcg, swap_token_memcg)) {
136 spin_lock(&swap_token_lock);
137 if (match_memcg(memcg, swap_token_memcg)) {
138 trace_disable_swap_token(swap_token_mm);
139 swap_token_mm = NULL;
140 swap_token_memcg = NULL;
141 }
142 spin_unlock(&swap_token_lock);
143 }
144}
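
The aging added above halves the holder's priority once per TOKEN_AGING_INTERVAL global faults, so a stale holder is eventually outbid. A toy user-space model of just that decay (the starting priorities and the fixed contender are made-up numbers, and real contenders also gain priority as in grab_swap_token() above):

    #include <stdio.h>

    #define TOKEN_AGING_INTERVAL 0xFF

    int main(void)
    {
            unsigned int holder_prio = 16, contender_prio = 5;
            unsigned int global_faults = 0, last_aging = 0;

            /* each loop pass stands in for one global page fault */
            while (contender_prio <= holder_prio) {
                    global_faults++;
                    if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
                            holder_prio /= 2;   /* the aging step */
                            last_aging = global_faults;
                    }
            }
            printf("token would move after %u faults (holder aged to %u)\n",
                   global_faults, holder_prio);
            return 0;
    }
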
diff --git a/mm/truncate.c b/mm/truncate.c
index 3a29a618021..e13f22efaad 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
304 * @lstart: offset from which to truncate 304 * @lstart: offset from which to truncate
305 * 305 *
306 * Called under (and serialised by) inode->i_mutex. 306 * Called under (and serialised by) inode->i_mutex.
307 *
308 * Note: When this function returns, there can be a page in the process of
309 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
310 * mapping->nrpages can be non-zero when this function returns even after
311 * truncation of the whole mapping.
307 */ 312 */
308void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 313void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
309{ 314{
@@ -603,3 +608,27 @@ int vmtruncate(struct inode *inode, loff_t offset)
603 return 0; 608 return 0;
604} 609}
605EXPORT_SYMBOL(vmtruncate); 610EXPORT_SYMBOL(vmtruncate);
611
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
613{
614 struct address_space *mapping = inode->i_mapping;
615
616 /*
617 * If the underlying filesystem is not going to provide
618 * a way to truncate a range of blocks (punch a hole) -
619 * we should return failure right now.
620 */
621 if (!inode->i_op->truncate_range)
622 return -ENOSYS;
623
624 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem);
626 unmap_mapping_range(mapping, offset, (end - offset), 1);
627 inode->i_op->truncate_range(inode, offset, end);
628 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex);
632
633 return 0;
634}
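
vmtruncate_range() is the kernel half of madvise(MADV_REMOVE): unmap the range, punch the hole via ->truncate_range() (shmem_truncate_range() on tmpfs), then unmap again to catch racily COWed private pages. A small user-space demonstration against tmpfs (the /dev/shm path is just a convenient tmpfs mount; error handling is minimal):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            const size_t len = 4 * 4096;
            int fd = open("/dev/shm/punch-demo",
                          O_RDWR | O_CREAT | O_TRUNC, 0600);

            if (fd < 0 || ftruncate(fd, len) < 0)
                    return 1;

            char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
            if (map == MAP_FAILED)
                    return 1;

            memset(map, 'x', len);          /* allocate all four pages */

            /* punch out the middle two pages; they read back as zeroes */
            if (madvise(map + 4096, 2 * 4096, MADV_REMOVE) < 0)
                    perror("MADV_REMOVE");
            printf("page 1 now starts with %d (0 == hole)\n", map[4096]);

            munmap(map, len);
            close(fd);
            unlink("/dev/shm/punch-demo");
            return 0;
    }
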
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9c..5ed24b94c5e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1124 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1125 scan++; 1125 scan++;
1126 } else { 1126 } else {
1127 /* the page is freed already. */ 1127 /*
1128 if (!page_count(cursor_page)) 1128 * Check if the page is freed already.
1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1129 continue; 1141 continue;
1130 break; 1142 break;
1131 } 1143 }
@@ -1983,14 +1995,13 @@ restart:
1983 * If a zone is deemed to be full of pinned pages then just give it a light 1995 * If a zone is deemed to be full of pinned pages then just give it a light
1984 * scan then give up on it. 1996 * scan then give up on it.
1985 */ 1997 */
1986static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1998static void shrink_zones(int priority, struct zonelist *zonelist,
1987 struct scan_control *sc) 1999 struct scan_control *sc)
1988{ 2000{
1989 struct zoneref *z; 2001 struct zoneref *z;
1990 struct zone *zone; 2002 struct zone *zone;
1991 unsigned long nr_soft_reclaimed; 2003 unsigned long nr_soft_reclaimed;
1992 unsigned long nr_soft_scanned; 2004 unsigned long nr_soft_scanned;
1993 unsigned long total_scanned = 0;
1994 2005
1995 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2006 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1996 gfp_zone(sc->gfp_mask), sc->nodemask) { 2007 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2005,19 +2016,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
2005 continue; 2016 continue;
2006 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2019 /*
2020 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and
2022 * scanned pages. This works for global memory pressure
2023 * and balancing, not for a memcg's limit.
2024 */
2025 nr_soft_scanned = 0;
2026 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2027 sc->order, sc->gfp_mask,
2028 &nr_soft_scanned);
2029 sc->nr_reclaimed += nr_soft_reclaimed;
2030 sc->nr_scanned += nr_soft_scanned;
2031 /* need some check to avoid more shrink_zone() calls */
2008 } 2032 }
2009 2033
2010 nr_soft_scanned = 0;
2011 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2012 sc->order, sc->gfp_mask,
2013 &nr_soft_scanned);
2014 sc->nr_reclaimed += nr_soft_reclaimed;
2015 total_scanned += nr_soft_scanned;
2016
2017 shrink_zone(priority, zone, sc); 2034 shrink_zone(priority, zone, sc);
2018 } 2035 }
2019
2020 return total_scanned;
2021} 2036}
2022 2037
2023static bool zone_reclaimable(struct zone *zone) 2038static bool zone_reclaimable(struct zone *zone)
@@ -2081,8 +2096,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2081 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2096 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2082 sc->nr_scanned = 0; 2097 sc->nr_scanned = 0;
2083 if (!priority) 2098 if (!priority)
2084 disable_swap_token(); 2099 disable_swap_token(sc->mem_cgroup);
2085 total_scanned += shrink_zones(priority, zonelist, sc); 2100 shrink_zones(priority, zonelist, sc);
2086 /* 2101 /*
2087 * Don't shrink slabs when reclaiming memory from 2102 * Don't shrink slabs when reclaiming memory from
2088 * over limit cgroups 2103 * over limit cgroups
@@ -2311,7 +2326,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2311 return true; 2326 return true;
2312 2327
2313 /* Check the watermark levels */ 2328 /* Check the watermark levels */
2314 for (i = 0; i < pgdat->nr_zones; i++) { 2329 for (i = 0; i <= classzone_idx; i++) {
2315 struct zone *zone = pgdat->node_zones + i; 2330 struct zone *zone = pgdat->node_zones + i;
2316 2331
2317 if (!populated_zone(zone)) 2332 if (!populated_zone(zone))
@@ -2329,7 +2344,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2329 } 2344 }
2330 2345
2331 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 2346 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2332 classzone_idx, 0)) 2347 i, 0))
2333 all_zones_ok = false; 2348 all_zones_ok = false;
2334 else 2349 else
2335 balanced += zone->present_pages; 2350 balanced += zone->present_pages;
@@ -2407,7 +2422,7 @@ loop_again:
2407 2422
2408 /* The swap token gets in the way of swapout... */ 2423 /* The swap token gets in the way of swapout... */
2409 if (!priority) 2424 if (!priority)
2410 disable_swap_token(); 2425 disable_swap_token(NULL);
2411 2426
2412 all_zones_ok = 1; 2427 all_zones_ok = 1;
2413 balanced = 0; 2428 balanced = 0;
@@ -2436,7 +2451,6 @@ loop_again:
2436 if (!zone_watermark_ok_safe(zone, order, 2451 if (!zone_watermark_ok_safe(zone, order,
2437 high_wmark_pages(zone), 0, 0)) { 2452 high_wmark_pages(zone), 0, 0)) {
2438 end_zone = i; 2453 end_zone = i;
2439 *classzone_idx = i;
2440 break; 2454 break;
2441 } 2455 }
2442 } 2456 }
@@ -2495,18 +2509,18 @@ loop_again:
2495 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2509 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2496 if (!zone_watermark_ok_safe(zone, order, 2510 if (!zone_watermark_ok_safe(zone, order,
2497 high_wmark_pages(zone) + balance_gap, 2511 high_wmark_pages(zone) + balance_gap,
2498 end_zone, 0)) 2512 end_zone, 0)) {
2499 shrink_zone(priority, zone, &sc); 2513 shrink_zone(priority, zone, &sc);
2500 reclaim_state->reclaimed_slab = 0;
2501 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2502 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2503 total_scanned += sc.nr_scanned;
2504 2514
2505 if (zone->all_unreclaimable) 2515 reclaim_state->reclaimed_slab = 0;
2506 continue; 2516 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2507 if (nr_slab == 0 && 2517 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2508 !zone_reclaimable(zone)) 2518 total_scanned += sc.nr_scanned;
2509 zone->all_unreclaimable = 1; 2519
2520 if (nr_slab == 0 && !zone_reclaimable(zone))
2521 zone->all_unreclaimable = 1;
2522 }
2523
2510 /* 2524 /*
2511 * If we've done a decent amount of scanning and 2525 * If we've done a decent amount of scanning and
2512 * the reclaim ratio is low, start doing writepage 2526 * the reclaim ratio is low, start doing writepage
@@ -2516,6 +2530,12 @@ loop_again:
2516 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2530 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2517 sc.may_writepage = 1; 2531 sc.may_writepage = 1;
2518 2532
2533 if (zone->all_unreclaimable) {
2534 if (end_zone && end_zone == i)
2535 end_zone--;
2536 continue;
2537 }
2538
2519 if (!zone_watermark_ok_safe(zone, order, 2539 if (!zone_watermark_ok_safe(zone, order,
2520 high_wmark_pages(zone), end_zone, 0)) { 2540 high_wmark_pages(zone), end_zone, 0)) {
2521 all_zones_ok = 0; 2541 all_zones_ok = 0;
@@ -2694,8 +2714,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2694 */ 2714 */
2695static int kswapd(void *p) 2715static int kswapd(void *p)
2696{ 2716{
2697 unsigned long order; 2717 unsigned long order, new_order;
2698 int classzone_idx; 2718 int classzone_idx, new_classzone_idx;
2699 pg_data_t *pgdat = (pg_data_t*)p; 2719 pg_data_t *pgdat = (pg_data_t*)p;
2700 struct task_struct *tsk = current; 2720 struct task_struct *tsk = current;
2701 2721
@@ -2725,17 +2745,23 @@ static int kswapd(void *p)
2725 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2745 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2726 set_freezable(); 2746 set_freezable();
2727 2747
2728 order = 0; 2748 order = new_order = 0;
2729 classzone_idx = MAX_NR_ZONES - 1; 2749 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2730 for ( ; ; ) { 2750 for ( ; ; ) {
2731 unsigned long new_order;
2732 int new_classzone_idx;
2733 int ret; 2751 int ret;
2734 2752
2735 new_order = pgdat->kswapd_max_order; 2753 /*
2736 new_classzone_idx = pgdat->classzone_idx; 2754 * If the last balance_pgdat was unsuccessful it's unlikely a
2737 pgdat->kswapd_max_order = 0; 2755 * new request of a similar or harder type will succeed soon
2738 pgdat->classzone_idx = MAX_NR_ZONES - 1; 2756 * so consider going to sleep on the basis we reclaimed at
2757 */
2758 if (classzone_idx >= new_classzone_idx && order == new_order) {
2759 new_order = pgdat->kswapd_max_order;
2760 new_classzone_idx = pgdat->classzone_idx;
2761 pgdat->kswapd_max_order = 0;
2762 pgdat->classzone_idx = pgdat->nr_zones - 1;
2763 }
2764
2739 if (order < new_order || classzone_idx > new_classzone_idx) { 2765 if (order < new_order || classzone_idx > new_classzone_idx) {
2740 /* 2766 /*
2741 * Don't sleep if someone wants a larger 'order' 2767 * Don't sleep if someone wants a larger 'order'
@@ -2748,7 +2774,7 @@ static int kswapd(void *p)
2748 order = pgdat->kswapd_max_order; 2774 order = pgdat->kswapd_max_order;
2749 classzone_idx = pgdat->classzone_idx; 2775 classzone_idx = pgdat->classzone_idx;
2750 pgdat->kswapd_max_order = 0; 2776 pgdat->kswapd_max_order = 0;
2751 pgdat->classzone_idx = MAX_NR_ZONES - 1; 2777 pgdat->classzone_idx = pgdat->nr_zones - 1;
2752 } 2778 }
2753 2779
2754 ret = try_to_freeze(); 2780 ret = try_to_freeze();