Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c  |  23
-rw-r--r--  mm/bootmem.c  |  5
-rw-r--r--  mm/bounce.c  |  4
-rw-r--r--  mm/cleancache.c  |  98
-rw-r--r--  mm/compaction.c  |  101
-rw-r--r--  mm/filemap.c  |  38
-rw-r--r--  mm/filemap_xip.c  |  7
-rw-r--r--  mm/huge_memory.c  |  133
-rw-r--r--  mm/hugetlb.c  |  211
-rw-r--r--  mm/hwpoison-inject.c  |  4
-rw-r--r--  mm/kmemleak.c  |  3
-rw-r--r--  mm/ksm.c  |  57
-rw-r--r--  mm/madvise.c  |  10
-rw-r--r--  mm/memblock.c  |  6
-rw-r--r--  mm/memcontrol.c  |  680
-rw-r--r--  mm/memory-failure.c  |  98
-rw-r--r--  mm/memory.c  |  198
-rw-r--r--  mm/mempolicy.c  |  65
-rw-r--r--  mm/migrate.c  |  40
-rw-r--r--  mm/mincore.c  |  2
-rw-r--r--  mm/mlock.c  |  3
-rw-r--r--  mm/mmap.c  |  90
-rw-r--r--  mm/mmu_context.c  |  2
-rw-r--r--  mm/mprotect.c  |  7
-rw-r--r--  mm/mremap.c  |  2
-rw-r--r--  mm/nommu.c  |  9
-rw-r--r--  mm/oom_kill.c  |  166
-rw-r--r--  mm/page-writeback.c  |  1
-rw-r--r--  mm/page_alloc.c  |  59
-rw-r--r--  mm/page_cgroup.c  |  4
-rw-r--r--  mm/pagewalk.c  |  2
-rw-r--r--  mm/percpu-vm.c  |  3
-rw-r--r--  mm/pgtable-generic.c  |  5
-rw-r--r--  mm/process_vm_access.c  |  23
-rw-r--r--  mm/rmap.c  |  70
-rw-r--r--  mm/shmem.c  |  106
-rw-r--r--  mm/slab.c  |  13
-rw-r--r--  mm/slub.c  |  40
-rw-r--r--  mm/sparse.c  |  30
-rw-r--r--  mm/swap.c  |  14
-rw-r--r--  mm/swap_state.c  |  34
-rw-r--r--  mm/swapfile.c  |  92
-rw-r--r--  mm/truncate.c  |  12
-rw-r--r--  mm/util.c  |  41
-rw-r--r--  mm/vmalloc.c  |  8
-rw-r--r--  mm/vmscan.c  |  152
46 files changed, 1598 insertions, 1173 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7ba8feae11b..dd8e2aafb07 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -318,7 +318,7 @@ static void wakeup_timer_fn(unsigned long data)
318 if (bdi->wb.task) { 318 if (bdi->wb.task) {
319 trace_writeback_wake_thread(bdi); 319 trace_writeback_wake_thread(bdi);
320 wake_up_process(bdi->wb.task); 320 wake_up_process(bdi->wb.task);
321 } else { 321 } else if (bdi->dev) {
322 /* 322 /*
323 * When bdi tasks are inactive for long time, they are killed. 323 * When bdi tasks are inactive for long time, they are killed.
324 * In this case we have to wake-up the forker thread which 324 * In this case we have to wake-up the forker thread which
@@ -584,6 +584,8 @@ EXPORT_SYMBOL(bdi_register_dev);
584 */ 584 */
585static void bdi_wb_shutdown(struct backing_dev_info *bdi) 585static void bdi_wb_shutdown(struct backing_dev_info *bdi)
586{ 586{
587 struct task_struct *task;
588
587 if (!bdi_cap_writeback_dirty(bdi)) 589 if (!bdi_cap_writeback_dirty(bdi))
588 return; 590 return;
589 591
@@ -602,8 +604,13 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
602 * Finally, kill the kernel thread. We don't need to be RCU 604 * Finally, kill the kernel thread. We don't need to be RCU
603 * safe anymore, since the bdi is gone from visibility. 605 * safe anymore, since the bdi is gone from visibility.
604 */ 606 */
605 if (bdi->wb.task) 607 spin_lock_bh(&bdi->wb_lock);
606 kthread_stop(bdi->wb.task); 608 task = bdi->wb.task;
609 bdi->wb.task = NULL;
610 spin_unlock_bh(&bdi->wb_lock);
611
612 if (task)
613 kthread_stop(task);
607} 614}
608 615
609/* 616/*
@@ -623,7 +630,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
623 630
624void bdi_unregister(struct backing_dev_info *bdi) 631void bdi_unregister(struct backing_dev_info *bdi)
625{ 632{
626 if (bdi->dev) { 633 struct device *dev = bdi->dev;
634
635 if (dev) {
627 bdi_set_min_ratio(bdi, 0); 636 bdi_set_min_ratio(bdi, 0);
628 trace_writeback_bdi_unregister(bdi); 637 trace_writeback_bdi_unregister(bdi);
629 bdi_prune_sb(bdi); 638 bdi_prune_sb(bdi);
@@ -632,8 +641,12 @@ void bdi_unregister(struct backing_dev_info *bdi)
632 if (!bdi_cap_flush_forker(bdi)) 641 if (!bdi_cap_flush_forker(bdi))
633 bdi_wb_shutdown(bdi); 642 bdi_wb_shutdown(bdi);
634 bdi_debug_unregister(bdi); 643 bdi_debug_unregister(bdi);
635 device_unregister(bdi->dev); 644
645 spin_lock_bh(&bdi->wb_lock);
636 bdi->dev = NULL; 646 bdi->dev = NULL;
647 spin_unlock_bh(&bdi->wb_lock);
648
649 device_unregister(dev);
637 } 650 }
638} 651}
639EXPORT_SYMBOL(bdi_unregister); 652EXPORT_SYMBOL(bdi_unregister);
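
A minimal sketch of the shutdown pattern the bdi_wb_shutdown()/bdi_unregister() hunks above adopt: detach the shared pointer under wb_lock so concurrent wakers back off, then do the blocking kthread_stop() outside the lock. The structure and field names below are hypothetical stand-ins, not the real bdi types:

#include <linux/kthread.h>
#include <linux/spinlock.h>

struct flusher {
	spinlock_t lock;		/* plays the role of bdi->wb_lock */
	struct task_struct *task;	/* plays the role of bdi->wb.task */
};

static void flusher_shutdown(struct flusher *f)
{
	struct task_struct *task;

	/* Publish the teardown first: concurrent wakers (cf. the
	 * wakeup_timer_fn() hunk) now see task == NULL and back off. */
	spin_lock_bh(&f->lock);
	task = f->task;
	f->task = NULL;
	spin_unlock_bh(&f->lock);

	/* The blocking call happens outside the spinlock. */
	if (task)
		kthread_stop(task);
}
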
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf..0131170c9d5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr) 766 unsigned long section_nr)
767{ 767{
768 bootmem_data_t *bdata; 768 bootmem_data_t *bdata;
769 unsigned long pfn, goal, limit; 769 unsigned long pfn, goal;
770 770
771 pfn = section_nr_to_pfn(section_nr); 771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT; 772 goal = pfn << PAGE_SHIFT;
773 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
774 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
775 774
776 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
777} 776}
778#endif 777#endif
779 778
diff --git a/mm/bounce.c b/mm/bounce.c
index 4e9ae722af8..d1be02ca188 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -50,9 +50,9 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
50 unsigned char *vto; 50 unsigned char *vto;
51 51
52 local_irq_save(flags); 52 local_irq_save(flags);
53 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); 53 vto = kmap_atomic(to->bv_page);
54 memcpy(vto + to->bv_offset, vfrom, to->bv_len); 54 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
55 kunmap_atomic(vto, KM_BOUNCE_READ); 55 kunmap_atomic(vto);
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
diff --git a/mm/cleancache.c b/mm/cleancache.c
index bcaae4c2a77..5646c740f61 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -15,29 +15,34 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/exportfs.h> 16#include <linux/exportfs.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/debugfs.h>
18#include <linux/cleancache.h> 19#include <linux/cleancache.h>
19 20
20/* 21/*
21 * This global enablement flag may be read thousands of times per second 22 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops 23 * by cleancache_get/put/invalidate even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains 24 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function 25 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global. 26 * call that checks a non-global.
26 */ 27 */
27int cleancache_enabled; 28int cleancache_enabled __read_mostly;
28EXPORT_SYMBOL(cleancache_enabled); 29EXPORT_SYMBOL(cleancache_enabled);
29 30
30/* 31/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers 32 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions. 33 * to the cleancache "backend" implementation functions.
33 */ 34 */
34static struct cleancache_ops cleancache_ops; 35static struct cleancache_ops cleancache_ops __read_mostly;
35 36
36/* useful stats available in /sys/kernel/mm/cleancache */ 37/*
37static unsigned long cleancache_succ_gets; 38 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
38static unsigned long cleancache_failed_gets; 39 * properly configured. These are for information only so are not protected
39static unsigned long cleancache_puts; 40 * against increment races.
40static unsigned long cleancache_flushes; 41 */
42static u64 cleancache_succ_gets;
43static u64 cleancache_failed_gets;
44static u64 cleancache_puts;
45static u64 cleancache_invalidates;
41 46
42/* 47/*
43 * register operations for cleancache, returning previous thus allowing 48 * register operations for cleancache, returning previous thus allowing
@@ -148,10 +153,11 @@ void __cleancache_put_page(struct page *page)
148EXPORT_SYMBOL(__cleancache_put_page); 153EXPORT_SYMBOL(__cleancache_put_page);
149 154
150/* 155/*
151 * Flush any data from cleancache associated with the poolid and the 156 * Invalidate any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail. 157 * page's inode and page index so that a subsequent "get" will fail.
153 */ 158 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page) 159void __cleancache_invalidate_page(struct address_space *mapping,
160 struct page *page)
155{ 161{
156 /* careful... page->mapping is NULL sometimes when this is called */ 162 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid; 163 int pool_id = mapping->host->i_sb->cleancache_poolid;
@@ -160,85 +166,57 @@ void __cleancache_flush_page(struct address_space *mapping, struct page *page)
160 if (pool_id >= 0) { 166 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page)); 167 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) { 168 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index); 169 (*cleancache_ops.invalidate_page)(pool_id,
164 cleancache_flushes++; 170 key, page->index);
171 cleancache_invalidates++;
165 } 172 }
166 } 173 }
167} 174}
168EXPORT_SYMBOL(__cleancache_flush_page); 175EXPORT_SYMBOL(__cleancache_invalidate_page);
169 176
170/* 177/*
171 * Flush all data from cleancache associated with the poolid and the 178 * Invalidate all data from cleancache associated with the poolid and the
172 * mappings's inode so that all subsequent gets to this poolid/inode 179 * mappings's inode so that all subsequent gets to this poolid/inode
173 * will fail. 180 * will fail.
174 */ 181 */
175void __cleancache_flush_inode(struct address_space *mapping) 182void __cleancache_invalidate_inode(struct address_space *mapping)
176{ 183{
177 int pool_id = mapping->host->i_sb->cleancache_poolid; 184 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } }; 185 struct cleancache_filekey key = { .u.key = { 0 } };
179 186
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) 187 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key); 188 (*cleancache_ops.invalidate_inode)(pool_id, key);
182} 189}
183EXPORT_SYMBOL(__cleancache_flush_inode); 190EXPORT_SYMBOL(__cleancache_invalidate_inode);
184 191
185/* 192/*
186 * Called by any cleancache-enabled filesystem at time of unmount; 193 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be reutrned by a subsequent 194 * note that pool_id is surrendered and may be reutrned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs 195 * cleancache_init_fs or cleancache_init_shared_fs
189 */ 196 */
190void __cleancache_flush_fs(struct super_block *sb) 197void __cleancache_invalidate_fs(struct super_block *sb)
191{ 198{
192 if (sb->cleancache_poolid >= 0) { 199 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid; 200 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1; 201 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid); 202 (*cleancache_ops.invalidate_fs)(old_poolid);
196 } 203 }
197} 204}
198EXPORT_SYMBOL(__cleancache_flush_fs); 205EXPORT_SYMBOL(__cleancache_invalidate_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234 206
235static int __init init_cleancache(void) 207static int __init init_cleancache(void)
236{ 208{
237#ifdef CONFIG_SYSFS 209#ifdef CONFIG_DEBUG_FS
238 int err; 210 struct dentry *root = debugfs_create_dir("cleancache", NULL);
239 211 if (root == NULL)
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group); 212 return -ENXIO;
241#endif /* CONFIG_SYSFS */ 213 debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets);
214 debugfs_create_u64("failed_gets", S_IRUGO,
215 root, &cleancache_failed_gets);
216 debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts);
217 debugfs_create_u64("invalidates", S_IRUGO,
218 root, &cleancache_invalidates);
219#endif
242 return 0; 220 return 0;
243} 221}
244module_init(init_cleancache) 222module_init(init_cleancache)
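
The statistics above move from a hand-rolled sysfs attribute group to debugfs_create_u64(), which exports a plain u64 as a read-only file with no show/store boilerplate. A minimal sketch of that pattern, reusing only the calls visible in the hunk (the "example" directory and counter name are made up):

#include <linux/debugfs.h>
#include <linux/module.h>

static u64 example_hits;	/* incremented elsewhere, racily, like the counters above */

static int __init example_stats_init(void)
{
	struct dentry *root = debugfs_create_dir("example", NULL);

	if (root == NULL)
		return -ENXIO;
	/* Reads of this file return the current value of example_hits. */
	debugfs_create_u64("hits", S_IRUGO, root, &example_hits);
	return 0;
}
module_init(example_stats_init);
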
diff --git a/mm/compaction.c b/mm/compaction.c
index 71a58f67f48..74a8c825ff2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 unsigned int order; /* order a direct compactor needs */ 38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 40 struct zone *zone;
41}; 41};
@@ -313,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
313 } else if (!locked) 313 } else if (!locked)
314 spin_lock_irq(&zone->lru_lock); 314 spin_lock_irq(&zone->lru_lock);
315 315
316 /*
317 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving
319 * into a new MAX_ORDER_NR_PAGES range in case of large
320 * memory holes within the zone
321 */
322 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
323 if (!pfn_valid(low_pfn)) {
324 low_pfn += MAX_ORDER_NR_PAGES - 1;
325 continue;
326 }
327 }
328
316 if (!pfn_valid_within(low_pfn)) 329 if (!pfn_valid_within(low_pfn))
317 continue; 330 continue;
318 nr_scanned++; 331 nr_scanned++;
319 332
320 /* Get the page and skip if free */ 333 /*
334 * Get the page and ensure the page is within the same zone.
335 * See the comment in isolate_freepages about overlapping
336 * nodes. It is deliberate that the new zone lock is not taken
337 * as memory compaction should not move pages between nodes.
338 */
321 page = pfn_to_page(low_pfn); 339 page = pfn_to_page(low_pfn);
340 if (page_zone(page) != zone)
341 continue;
342
343 /* Skip if free */
322 if (PageBuddy(page)) 344 if (PageBuddy(page))
323 continue; 345 continue;
324 346
@@ -653,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
653 675
654 676
655/* Compact all zones within a node */ 677/* Compact all zones within a node */
656static int compact_node(int nid) 678static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
657{ 679{
658 int zoneid; 680 int zoneid;
659 pg_data_t *pgdat;
660 struct zone *zone; 681 struct zone *zone;
661 682
662 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
663 return -EINVAL;
664 pgdat = NODE_DATA(nid);
665
666 /* Flush pending updates to the LRU lists */
667 lru_add_drain_all();
668
669 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 683 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
670 struct compact_control cc = {
671 .nr_freepages = 0,
672 .nr_migratepages = 0,
673 .order = -1,
674 .sync = true,
675 };
676 684
677 zone = &pgdat->node_zones[zoneid]; 685 zone = &pgdat->node_zones[zoneid];
678 if (!populated_zone(zone)) 686 if (!populated_zone(zone))
679 continue; 687 continue;
680 688
681 cc.zone = zone; 689 cc->nr_freepages = 0;
682 INIT_LIST_HEAD(&cc.freepages); 690 cc->nr_migratepages = 0;
683 INIT_LIST_HEAD(&cc.migratepages); 691 cc->zone = zone;
684 692 INIT_LIST_HEAD(&cc->freepages);
685 compact_zone(zone, &cc); 693 INIT_LIST_HEAD(&cc->migratepages);
694
695 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
696 compact_zone(zone, cc);
697
698 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync)
705 defer_compaction(zone, cc->order);
706 }
686 707
687 VM_BUG_ON(!list_empty(&cc.freepages)); 708 VM_BUG_ON(!list_empty(&cc->freepages));
688 VM_BUG_ON(!list_empty(&cc.migratepages)); 709 VM_BUG_ON(!list_empty(&cc->migratepages));
689 } 710 }
690 711
691 return 0; 712 return 0;
692} 713}
693 714
715int compact_pgdat(pg_data_t *pgdat, int order)
716{
717 struct compact_control cc = {
718 .order = order,
719 .sync = false,
720 };
721
722 return __compact_pgdat(pgdat, &cc);
723}
724
725static int compact_node(int nid)
726{
727 struct compact_control cc = {
728 .order = -1,
729 .sync = true,
730 };
731
732 return __compact_pgdat(NODE_DATA(nid), &cc);
733}
734
694/* Compact all nodes in the system */ 735/* Compact all nodes in the system */
695static int compact_nodes(void) 736static int compact_nodes(void)
696{ 737{
697 int nid; 738 int nid;
698 739
740 /* Flush pending updates to the LRU lists */
741 lru_add_drain_all();
742
699 for_each_online_node(nid) 743 for_each_online_node(nid)
700 compact_node(nid); 744 compact_node(nid);
701 745
@@ -728,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
728 struct device_attribute *attr, 772 struct device_attribute *attr,
729 const char *buf, size_t count) 773 const char *buf, size_t count)
730{ 774{
731 compact_node(dev->id); 775 int nid = dev->id;
776
777 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
778 /* Flush pending updates to the LRU lists */
779 lru_add_drain_all();
780
781 compact_node(nid);
782 }
732 783
733 return count; 784 return count;
734} 785}
diff --git a/mm/filemap.c b/mm/filemap.c
index 97f49ed35bd..c3811bc6b9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@
101 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
103 * 103 *
104 * (code doesn't rely on that order, so you could switch it around) 104 * ->i_mmap_mutex
105 * ->tasklist_lock (memory_failure, collect_procs_ao) 105 * ->tasklist_lock (memory_failure, collect_procs_ao)
106 * ->i_mmap_mutex
107 */ 106 */
108 107
109/* 108/*
@@ -123,7 +122,7 @@ void __delete_from_page_cache(struct page *page)
123 if (PageUptodate(page) && PageMappedToDisk(page)) 122 if (PageUptodate(page) && PageMappedToDisk(page))
124 cleancache_put_page(page); 123 cleancache_put_page(page);
125 else 124 else
126 cleancache_flush_page(mapping, page); 125 cleancache_invalidate_page(mapping, page);
127 126
128 radix_tree_delete(&mapping->page_tree, page->index); 127 radix_tree_delete(&mapping->page_tree, page->index);
129 page->mapping = NULL; 128 page->mapping = NULL;
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
500 struct page *page; 499 struct page *page;
501 500
502 if (cpuset_do_page_mem_spread()) { 501 if (cpuset_do_page_mem_spread()) {
503 get_mems_allowed(); 502 unsigned int cpuset_mems_cookie;
504 n = cpuset_mem_spread_node(); 503 do {
505 page = alloc_pages_exact_node(n, gfp, 0); 504 cpuset_mems_cookie = get_mems_allowed();
506 put_mems_allowed(); 505 n = cpuset_mem_spread_node();
506 page = alloc_pages_exact_node(n, gfp, 0);
507 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
508
507 return page; 509 return page;
508 } 510 }
509 return alloc_pages(gfp, 0); 511 return alloc_pages(gfp, 0);
@@ -1318,10 +1320,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
1318 * taking the kmap. 1320 * taking the kmap.
1319 */ 1321 */
1320 if (!fault_in_pages_writeable(desc->arg.buf, size)) { 1322 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1321 kaddr = kmap_atomic(page, KM_USER0); 1323 kaddr = kmap_atomic(page);
1322 left = __copy_to_user_inatomic(desc->arg.buf, 1324 left = __copy_to_user_inatomic(desc->arg.buf,
1323 kaddr + offset, size); 1325 kaddr + offset, size);
1324 kunmap_atomic(kaddr, KM_USER0); 1326 kunmap_atomic(kaddr);
1325 if (left == 0) 1327 if (left == 0)
1326 goto success; 1328 goto success;
1327 } 1329 }
@@ -1400,15 +1402,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1400 unsigned long seg = 0; 1402 unsigned long seg = 0;
1401 size_t count; 1403 size_t count;
1402 loff_t *ppos = &iocb->ki_pos; 1404 loff_t *ppos = &iocb->ki_pos;
1403 struct blk_plug plug;
1404 1405
1405 count = 0; 1406 count = 0;
1406 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1407 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1407 if (retval) 1408 if (retval)
1408 return retval; 1409 return retval;
1409 1410
1410 blk_start_plug(&plug);
1411
1412 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1411 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1413 if (filp->f_flags & O_DIRECT) { 1412 if (filp->f_flags & O_DIRECT) {
1414 loff_t size; 1413 loff_t size;
@@ -1424,8 +1423,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1424 retval = filemap_write_and_wait_range(mapping, pos, 1423 retval = filemap_write_and_wait_range(mapping, pos,
1425 pos + iov_length(iov, nr_segs) - 1); 1424 pos + iov_length(iov, nr_segs) - 1);
1426 if (!retval) { 1425 if (!retval) {
1426 struct blk_plug plug;
1427
1428 blk_start_plug(&plug);
1427 retval = mapping->a_ops->direct_IO(READ, iocb, 1429 retval = mapping->a_ops->direct_IO(READ, iocb,
1428 iov, pos, nr_segs); 1430 iov, pos, nr_segs);
1431 blk_finish_plug(&plug);
1429 } 1432 }
1430 if (retval > 0) { 1433 if (retval > 0) {
1431 *ppos = pos + retval; 1434 *ppos = pos + retval;
@@ -1481,7 +1484,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1481 break; 1484 break;
1482 } 1485 }
1483out: 1486out:
1484 blk_finish_plug(&plug);
1485 return retval; 1487 return retval;
1486} 1488}
1487EXPORT_SYMBOL(generic_file_aio_read); 1489EXPORT_SYMBOL(generic_file_aio_read);
@@ -2045,7 +2047,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
2045 size_t copied; 2047 size_t copied;
2046 2048
2047 BUG_ON(!in_atomic()); 2049 BUG_ON(!in_atomic());
2048 kaddr = kmap_atomic(page, KM_USER0); 2050 kaddr = kmap_atomic(page);
2049 if (likely(i->nr_segs == 1)) { 2051 if (likely(i->nr_segs == 1)) {
2050 int left; 2052 int left;
2051 char __user *buf = i->iov->iov_base + i->iov_offset; 2053 char __user *buf = i->iov->iov_base + i->iov_offset;
@@ -2055,7 +2057,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
2055 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2057 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2056 i->iov, i->iov_offset, bytes); 2058 i->iov, i->iov_offset, bytes);
2057 } 2059 }
2058 kunmap_atomic(kaddr, KM_USER0); 2060 kunmap_atomic(kaddr);
2059 2061
2060 return copied; 2062 return copied;
2061} 2063}
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2341 struct page *page; 2343 struct page *page;
2342 gfp_t gfp_notmask = 0; 2344 gfp_t gfp_notmask = 0;
2343 2345
2344 gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; 2346 gfp_mask = mapping_gfp_mask(mapping);
2347 if (mapping_cap_account_dirty(mapping))
2348 gfp_mask |= __GFP_WRITE;
2345 if (flags & AOP_FLAG_NOFS) 2349 if (flags & AOP_FLAG_NOFS)
2346 gfp_notmask = __GFP_FS; 2350 gfp_notmask = __GFP_FS;
2347repeat: 2351repeat:
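
The __page_cache_alloc() hunk above (and dequeue_huge_page_vma() in the hugetlb.c hunks further down) replaces the plain get_mems_allowed()/put_mems_allowed() pair with a cookie-based retry loop: put_mems_allowed() now reports whether the cpuset's allowed-node mask changed while the allocation ran, and the caller retries only if it did and the allocation failed. A sketch of the loop shape, using only the interfaces that appear in this diff:

static struct page *spread_alloc(gfp_t gfp)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;
	int n;

	do {
		/* Snapshot the mems_allowed generation. */
		cpuset_mems_cookie = get_mems_allowed();
		n = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(n, gfp, 0);
		/* Retry only when the mask changed under us and the
		 * allocation came back empty. */
	} while (!put_mems_allowed(cpuset_mems_cookie) && !page);

	return page;
}
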
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f91b2f68734..a4eb3113222 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -263,7 +263,12 @@ found:
263 xip_pfn); 263 xip_pfn);
264 if (err == -ENOMEM) 264 if (err == -ENOMEM)
265 return VM_FAULT_OOM; 265 return VM_FAULT_OOM;
266 BUG_ON(err); 266 /*
267 * err == -EBUSY is fine, we've raced against another thread
268 * that faulted-in the same page
269 */
270 if (err != -EBUSY)
271 BUG_ON(err);
267 return VM_FAULT_NOPAGE; 272 return VM_FAULT_NOPAGE;
268 } else { 273 } else {
269 int err, ret = VM_FAULT_OOM; 274 int err, ret = VM_FAULT_OOM;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b3ffc21ce80..f0e5306eeb5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -671,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
671 set_pmd_at(mm, haddr, pmd, entry); 671 set_pmd_at(mm, haddr, pmd, entry);
672 prepare_pmd_huge_pte(pgtable, mm); 672 prepare_pmd_huge_pte(pgtable, mm);
673 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 673 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
674 mm->nr_ptes++;
674 spin_unlock(&mm->page_table_lock); 675 spin_unlock(&mm->page_table_lock);
675 } 676 }
676 677
@@ -789,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
789 pmd = pmd_mkold(pmd_wrprotect(pmd)); 790 pmd = pmd_mkold(pmd_wrprotect(pmd));
790 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 791 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
791 prepare_pmd_huge_pte(pgtable, dst_mm); 792 prepare_pmd_huge_pte(pgtable, dst_mm);
793 dst_mm->nr_ptes++;
792 794
793 ret = 0; 795 ret = 0;
794out_unlock: 796out_unlock:
@@ -887,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
887 } 889 }
888 kfree(pages); 890 kfree(pages);
889 891
890 mm->nr_ptes++;
891 smp_wmb(); /* make pte visible before pmd */ 892 smp_wmb(); /* make pte visible before pmd */
892 pmd_populate(mm, pmd, pgtable); 893 pmd_populate(mm, pmd, pgtable);
893 page_remove_rmap(page); 894 page_remove_rmap(page);
@@ -1030,31 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1030{ 1031{
1031 int ret = 0; 1032 int ret = 0;
1032 1033
1033 spin_lock(&tlb->mm->page_table_lock); 1034 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1034 if (likely(pmd_trans_huge(*pmd))) { 1035 struct page *page;
1035 if (unlikely(pmd_trans_splitting(*pmd))) { 1036 pgtable_t pgtable;
1036 spin_unlock(&tlb->mm->page_table_lock); 1037 pgtable = get_pmd_huge_pte(tlb->mm);
1037 wait_split_huge_page(vma->anon_vma, 1038 page = pmd_page(*pmd);
1038 pmd); 1039 pmd_clear(pmd);
1039 } else { 1040 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1040 struct page *page; 1041 page_remove_rmap(page);
1041 pgtable_t pgtable; 1042 VM_BUG_ON(page_mapcount(page) < 0);
1042 pgtable = get_pmd_huge_pte(tlb->mm); 1043 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1043 page = pmd_page(*pmd); 1044 VM_BUG_ON(!PageHead(page));
1044 pmd_clear(pmd); 1045 tlb->mm->nr_ptes--;
1045 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1046 page_remove_rmap(page);
1047 VM_BUG_ON(page_mapcount(page) < 0);
1048 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1049 VM_BUG_ON(!PageHead(page));
1050 spin_unlock(&tlb->mm->page_table_lock);
1051 tlb_remove_page(tlb, page);
1052 pte_free(tlb->mm, pgtable);
1053 ret = 1;
1054 }
1055 } else
1056 spin_unlock(&tlb->mm->page_table_lock); 1046 spin_unlock(&tlb->mm->page_table_lock);
1057 1047 tlb_remove_page(tlb, page);
1048 pte_free(tlb->mm, pgtable);
1049 ret = 1;
1050 }
1058 return ret; 1051 return ret;
1059} 1052}
1060 1053
@@ -1064,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1064{ 1057{
1065 int ret = 0; 1058 int ret = 0;
1066 1059
1067 spin_lock(&vma->vm_mm->page_table_lock); 1060 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1068 if (likely(pmd_trans_huge(*pmd))) { 1061 /*
1069 ret = !pmd_trans_splitting(*pmd); 1062 * All logical pages in the range are present
1070 spin_unlock(&vma->vm_mm->page_table_lock); 1063 * if backed by a huge page.
1071 if (unlikely(!ret)) 1064 */
1072 wait_split_huge_page(vma->anon_vma, pmd);
1073 else {
1074 /*
1075 * All logical pages in the range are present
1076 * if backed by a huge page.
1077 */
1078 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1079 }
1080 } else
1081 spin_unlock(&vma->vm_mm->page_table_lock); 1065 spin_unlock(&vma->vm_mm->page_table_lock);
1066 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1067 ret = 1;
1068 }
1082 1069
1083 return ret; 1070 return ret;
1084} 1071}
@@ -1108,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1108 goto out; 1095 goto out;
1109 } 1096 }
1110 1097
1111 spin_lock(&mm->page_table_lock); 1098 ret = __pmd_trans_huge_lock(old_pmd, vma);
1112 if (likely(pmd_trans_huge(*old_pmd))) { 1099 if (ret == 1) {
1113 if (pmd_trans_splitting(*old_pmd)) { 1100 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1114 spin_unlock(&mm->page_table_lock); 1101 VM_BUG_ON(!pmd_none(*new_pmd));
1115 wait_split_huge_page(vma->anon_vma, old_pmd); 1102 set_pmd_at(mm, new_addr, new_pmd, pmd);
1116 ret = -1;
1117 } else {
1118 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1119 VM_BUG_ON(!pmd_none(*new_pmd));
1120 set_pmd_at(mm, new_addr, new_pmd, pmd);
1121 spin_unlock(&mm->page_table_lock);
1122 ret = 1;
1123 }
1124 } else {
1125 spin_unlock(&mm->page_table_lock); 1103 spin_unlock(&mm->page_table_lock);
1126 } 1104 }
1127out: 1105out:
@@ -1134,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1134 struct mm_struct *mm = vma->vm_mm; 1112 struct mm_struct *mm = vma->vm_mm;
1135 int ret = 0; 1113 int ret = 0;
1136 1114
1137 spin_lock(&mm->page_table_lock); 1115 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1116 pmd_t entry;
1117 entry = pmdp_get_and_clear(mm, addr, pmd);
1118 entry = pmd_modify(entry, newprot);
1119 set_pmd_at(mm, addr, pmd, entry);
1120 spin_unlock(&vma->vm_mm->page_table_lock);
1121 ret = 1;
1122 }
1123
1124 return ret;
1125}
1126
1127/*
1128 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1129 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1130 *
1131 * Note that if it returns 1, this routine returns without unlocking page
1132 * table locks. So callers must unlock them.
1133 */
1134int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1135{
1136 spin_lock(&vma->vm_mm->page_table_lock);
1138 if (likely(pmd_trans_huge(*pmd))) { 1137 if (likely(pmd_trans_huge(*pmd))) {
1139 if (unlikely(pmd_trans_splitting(*pmd))) { 1138 if (unlikely(pmd_trans_splitting(*pmd))) {
1140 spin_unlock(&mm->page_table_lock); 1139 spin_unlock(&vma->vm_mm->page_table_lock);
1141 wait_split_huge_page(vma->anon_vma, pmd); 1140 wait_split_huge_page(vma->anon_vma, pmd);
1141 return -1;
1142 } else { 1142 } else {
1143 pmd_t entry; 1143 /* Thp mapped by 'pmd' is stable, so we can
1144 1144 * handle it as it is. */
1145 entry = pmdp_get_and_clear(mm, addr, pmd); 1145 return 1;
1146 entry = pmd_modify(entry, newprot);
1147 set_pmd_at(mm, addr, pmd, entry);
1148 spin_unlock(&vma->vm_mm->page_table_lock);
1149 ret = 1;
1150 } 1146 }
1151 } else 1147 }
1152 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1153 1149 return 0;
1154 return ret;
1155} 1150}
1156 1151
1157pmd_t *page_check_address_pmd(struct page *page, 1152pmd_t *page_check_address_pmd(struct page *page,
@@ -1375,7 +1370,6 @@ static int __split_huge_page_map(struct page *page,
1375 pte_unmap(pte); 1370 pte_unmap(pte);
1376 } 1371 }
1377 1372
1378 mm->nr_ptes++;
1379 smp_wmb(); /* make pte visible before pmd */ 1373 smp_wmb(); /* make pte visible before pmd */
1380 /* 1374 /*
1381 * Up to this point the pmd is present and huge and 1375 * Up to this point the pmd is present and huge and
@@ -1988,7 +1982,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 set_pmd_at(mm, address, pmd, _pmd); 1982 set_pmd_at(mm, address, pmd, _pmd);
1989 update_mmu_cache(vma, address, _pmd); 1983 update_mmu_cache(vma, address, _pmd);
1990 prepare_pmd_huge_pte(pgtable, mm); 1984 prepare_pmd_huge_pte(pgtable, mm);
1991 mm->nr_ptes--;
1992 spin_unlock(&mm->page_table_lock); 1985 spin_unlock(&mm->page_table_lock);
1993 1986
1994#ifndef CONFIG_NUMA 1987#ifndef CONFIG_NUMA
@@ -2083,7 +2076,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2083{ 2076{
2084 struct mm_struct *mm = mm_slot->mm; 2077 struct mm_struct *mm = mm_slot->mm;
2085 2078
2086 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2079 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2087 2080
2088 if (khugepaged_test_exit(mm)) { 2081 if (khugepaged_test_exit(mm)) {
2089 /* free mm_slot */ 2082 /* free mm_slot */
@@ -2113,7 +2106,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2113 int progress = 0; 2106 int progress = 0;
2114 2107
2115 VM_BUG_ON(!pages); 2108 VM_BUG_ON(!pages);
2116 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2109 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2117 2110
2118 if (khugepaged_scan.mm_slot) 2111 if (khugepaged_scan.mm_slot)
2119 mm_slot = khugepaged_scan.mm_slot; 2112 mm_slot = khugepaged_scan.mm_slot;
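
Each of the rewritten helpers above (zap_huge_pmd, mincore_huge_pmd, move_huge_pmd, change_huge_pmd) now funnels through __pmd_trans_huge_lock(), whose contract is spelled out in the new comment: 1 means a stable THP with page_table_lock still held (the caller must unlock), -1 means the helper already dropped the lock and waited out a split, 0 means no THP. A hedged sketch of a caller written to that contract (handle_stable_thp() is a placeholder, not a real function):

static int with_stable_thp(struct vm_area_struct *vma, pmd_t *pmd)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		/* Stable THP; page_table_lock is held for us. */
		handle_stable_thp(vma->vm_mm, pmd);	/* placeholder */
		spin_unlock(&vma->vm_mm->page_table_lock);
		ret = 1;
	}
	/* 0 (no THP) and -1 (split raced, lock already dropped)
	 * both fall through with ret == 0. */
	return ret;
}
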
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f34bd8dda3..b8ce6f45095 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54static DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{
58 bool free = (spool->count == 0) && (spool->used_hpages == 0);
59
60 spin_unlock(&spool->lock);
61
62 /* If no pages are used, and no other handles to the subpool
63 * remain, free the subpool the subpool remain */
64 if (free)
65 kfree(spool);
66}
67
68struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69{
70 struct hugepage_subpool *spool;
71
72 spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73 if (!spool)
74 return NULL;
75
76 spin_lock_init(&spool->lock);
77 spool->count = 1;
78 spool->max_hpages = nr_blocks;
79 spool->used_hpages = 0;
80
81 return spool;
82}
83
84void hugepage_put_subpool(struct hugepage_subpool *spool)
85{
86 spin_lock(&spool->lock);
87 BUG_ON(!spool->count);
88 spool->count--;
89 unlock_or_release_subpool(spool);
90}
91
92static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93 long delta)
94{
95 int ret = 0;
96
97 if (!spool)
98 return 0;
99
100 spin_lock(&spool->lock);
101 if ((spool->used_hpages + delta) <= spool->max_hpages) {
102 spool->used_hpages += delta;
103 } else {
104 ret = -ENOMEM;
105 }
106 spin_unlock(&spool->lock);
107
108 return ret;
109}
110
111static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112 long delta)
113{
114 if (!spool)
115 return;
116
117 spin_lock(&spool->lock);
118 spool->used_hpages -= delta;
119 /* If hugetlbfs_put_super couldn't free spool due to
120 * an outstanding quota reference, free it now. */
121 unlock_or_release_subpool(spool);
122}
123
124static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125{
126 return HUGETLBFS_SB(inode->i_sb)->spool;
127}
128
129static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130{
131 return subpool_inode(vma->vm_file->f_dentry->d_inode);
132}
133
56/* 134/*
57 * Region tracking -- allows tracking of reservations and instantiated pages 135 * Region tracking -- allows tracking of reservations and instantiated pages
58 * across the pages in a mapping. 136 * across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
454 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
455 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
456{ 534{
457 struct page *page = NULL; 535 struct page *page;
458 struct mempolicy *mpol; 536 struct mempolicy *mpol;
459 nodemask_t *nodemask; 537 nodemask_t *nodemask;
460 struct zonelist *zonelist; 538 struct zonelist *zonelist;
461 struct zone *zone; 539 struct zone *zone;
462 struct zoneref *z; 540 struct zoneref *z;
541 unsigned int cpuset_mems_cookie;
463 542
464 get_mems_allowed(); 543retry_cpuset:
544 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 545 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 546 htlb_alloc_mask, &mpol, &nodemask);
467 /* 547 /*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 568 }
489 } 569 }
490 } 570 }
491err: 571
492 mpol_cond_put(mpol); 572 mpol_cond_put(mpol);
493 put_mems_allowed(); 573 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
574 goto retry_cpuset;
494 return page; 575 return page;
576
577err:
578 mpol_cond_put(mpol);
579 return NULL;
495} 580}
496 581
497static void update_and_free_page(struct hstate *h, struct page *page) 582static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
533 */ 618 */
534 struct hstate *h = page_hstate(page); 619 struct hstate *h = page_hstate(page);
535 int nid = page_to_nid(page); 620 int nid = page_to_nid(page);
536 struct address_space *mapping; 621 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page);
537 623
538 mapping = (struct address_space *) page_private(page);
539 set_page_private(page, 0); 624 set_page_private(page, 0);
540 page->mapping = NULL; 625 page->mapping = NULL;
541 BUG_ON(page_count(page)); 626 BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
551 enqueue_huge_page(h, page); 636 enqueue_huge_page(h, page);
552 } 637 }
553 spin_unlock(&hugetlb_lock); 638 spin_unlock(&hugetlb_lock);
554 if (mapping) 639 hugepage_subpool_put_pages(spool, 1);
555 hugetlb_put_quota(mapping, 1);
556} 640}
557 641
558static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
852 struct page *page, *tmp; 936 struct page *page, *tmp;
853 int ret, i; 937 int ret, i;
854 int needed, allocated; 938 int needed, allocated;
939 bool alloc_ok = true;
855 940
856 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 941 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
857 if (needed <= 0) { 942 if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
867 spin_unlock(&hugetlb_lock); 952 spin_unlock(&hugetlb_lock);
868 for (i = 0; i < needed; i++) { 953 for (i = 0; i < needed; i++) {
869 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 954 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
870 if (!page) 955 if (!page) {
871 /* 956 alloc_ok = false;
872 * We were not able to allocate enough pages to 957 break;
873 * satisfy the entire reservation so we free what 958 }
874 * we've allocated so far.
875 */
876 goto free;
877
878 list_add(&page->lru, &surplus_list); 959 list_add(&page->lru, &surplus_list);
879 } 960 }
880 allocated += needed; 961 allocated += i;
881 962
882 /* 963 /*
883 * After retaking hugetlb_lock, we need to recalculate 'needed' 964 * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
886 spin_lock(&hugetlb_lock); 967 spin_lock(&hugetlb_lock);
887 needed = (h->resv_huge_pages + delta) - 968 needed = (h->resv_huge_pages + delta) -
888 (h->free_huge_pages + allocated); 969 (h->free_huge_pages + allocated);
889 if (needed > 0) 970 if (needed > 0) {
890 goto retry; 971 if (alloc_ok)
891 972 goto retry;
973 /*
974 * We were not able to allocate enough pages to
975 * satisfy the entire reservation so we free what
976 * we've allocated so far.
977 */
978 goto free;
979 }
892 /* 980 /*
893 * The surplus_list now contains _at_least_ the number of extra pages 981 * The surplus_list now contains _at_least_ the number of extra pages
894 * needed to accommodate the reservation. Add the appropriate number 982 * needed to accommodate the reservation. Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
914 VM_BUG_ON(page_count(page)); 1002 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 1003 enqueue_huge_page(h, page);
916 } 1004 }
1005free:
917 spin_unlock(&hugetlb_lock); 1006 spin_unlock(&hugetlb_lock);
918 1007
919 /* Free unnecessary surplus pages to the buddy allocator */ 1008 /* Free unnecessary surplus pages to the buddy allocator */
920free:
921 if (!list_empty(&surplus_list)) { 1009 if (!list_empty(&surplus_list)) {
922 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
923 list_del(&page->lru); 1011 list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
966/* 1054/*
967 * Determine if the huge page at addr within the vma has an associated 1055 * Determine if the huge page at addr within the vma has an associated
968 * reservation. Where it does not we will need to logically increase 1056 * reservation. Where it does not we will need to logically increase
969 * reservation and actually increase quota before an allocation can occur. 1057 * reservation and actually increase subpool usage before an allocation
970 * Where any new reservation would be required the reservation change is 1058 * can occur. Where any new reservation would be required the
971 * prepared, but not committed. Once the page has been quota'd allocated 1059 * reservation change is prepared, but not committed. Once the page
972 * an instantiated the change should be committed via vma_commit_reservation. 1060 * has been allocated from the subpool and instantiated the change should
973 * No action is required on failure. 1061 * be committed via vma_commit_reservation. No action is required on
1062 * failure.
974 */ 1063 */
975static long vma_needs_reservation(struct hstate *h, 1064static long vma_needs_reservation(struct hstate *h,
976 struct vm_area_struct *vma, unsigned long addr) 1065 struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
1019static struct page *alloc_huge_page(struct vm_area_struct *vma, 1108static struct page *alloc_huge_page(struct vm_area_struct *vma,
1020 unsigned long addr, int avoid_reserve) 1109 unsigned long addr, int avoid_reserve)
1021{ 1110{
1111 struct hugepage_subpool *spool = subpool_vma(vma);
1022 struct hstate *h = hstate_vma(vma); 1112 struct hstate *h = hstate_vma(vma);
1023 struct page *page; 1113 struct page *page;
1024 struct address_space *mapping = vma->vm_file->f_mapping;
1025 struct inode *inode = mapping->host;
1026 long chg; 1114 long chg;
1027 1115
1028 /* 1116 /*
1029 * Processes that did not create the mapping will have no reserves and 1117 * Processes that did not create the mapping will have no
1030 * will not have accounted against quota. Check that the quota can be 1118 * reserves and will not have accounted against subpool
1031 * made before satisfying the allocation 1119 * limit. Check that the subpool limit can be made before
1032 * MAP_NORESERVE mappings may also need pages and quota allocated 1120 * satisfying the allocation MAP_NORESERVE mappings may also
1033 * if no reserve mapping overlaps. 1121 * need pages and subpool limit allocated allocated if no reserve
1122 * mapping overlaps.
1034 */ 1123 */
1035 chg = vma_needs_reservation(h, vma, addr); 1124 chg = vma_needs_reservation(h, vma, addr);
1036 if (chg < 0) 1125 if (chg < 0)
1037 return ERR_PTR(-VM_FAULT_OOM); 1126 return ERR_PTR(-VM_FAULT_OOM);
1038 if (chg) 1127 if (chg)
1039 if (hugetlb_get_quota(inode->i_mapping, chg)) 1128 if (hugepage_subpool_get_pages(spool, chg))
1040 return ERR_PTR(-VM_FAULT_SIGBUS); 1129 return ERR_PTR(-VM_FAULT_SIGBUS);
1041 1130
1042 spin_lock(&hugetlb_lock); 1131 spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1046 if (!page) { 1135 if (!page) {
1047 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1048 if (!page) { 1137 if (!page) {
1049 hugetlb_put_quota(inode->i_mapping, chg); 1138 hugepage_subpool_put_pages(spool, chg);
1050 return ERR_PTR(-VM_FAULT_SIGBUS); 1139 return ERR_PTR(-VM_FAULT_SIGBUS);
1051 } 1140 }
1052 } 1141 }
1053 1142
1054 set_page_private(page, (unsigned long) mapping); 1143 set_page_private(page, (unsigned long)spool);
1055 1144
1056 vma_commit_reservation(h, vma, addr); 1145 vma_commit_reservation(h, vma, addr);
1057 1146
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2072{ 2161{
2073 struct hstate *h = hstate_vma(vma); 2162 struct hstate *h = hstate_vma(vma);
2074 struct resv_map *reservations = vma_resv_map(vma); 2163 struct resv_map *reservations = vma_resv_map(vma);
2164 struct hugepage_subpool *spool = subpool_vma(vma);
2075 unsigned long reserve; 2165 unsigned long reserve;
2076 unsigned long start; 2166 unsigned long start;
2077 unsigned long end; 2167 unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2087 2177
2088 if (reserve) { 2178 if (reserve) {
2089 hugetlb_acct_memory(h, -reserve); 2179 hugetlb_acct_memory(h, -reserve);
2090 hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 2180 hugepage_subpool_put_pages(spool, reserve);
2091 } 2181 }
2092 } 2182 }
2093} 2183}
@@ -2241,16 +2331,23 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2241 if (huge_pmd_unshare(mm, &address, ptep)) 2331 if (huge_pmd_unshare(mm, &address, ptep))
2242 continue; 2332 continue;
2243 2333
2334 pte = huge_ptep_get(ptep);
2335 if (huge_pte_none(pte))
2336 continue;
2337
2338 /*
2339 * HWPoisoned hugepage is already unmapped and dropped reference
2340 */
2341 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2342 continue;
2343
2344 page = pte_page(pte);
2244 /* 2345 /*
2245 * If a reference page is supplied, it is because a specific 2346 * If a reference page is supplied, it is because a specific
2246 * page is being unmapped, not a range. Ensure the page we 2347 * page is being unmapped, not a range. Ensure the page we
2247 * are about to unmap is the actual page of interest. 2348 * are about to unmap is the actual page of interest.
2248 */ 2349 */
2249 if (ref_page) { 2350 if (ref_page) {
2250 pte = huge_ptep_get(ptep);
2251 if (huge_pte_none(pte))
2252 continue;
2253 page = pte_page(pte);
2254 if (page != ref_page) 2351 if (page != ref_page)
2255 continue; 2352 continue;
2256 2353
@@ -2263,22 +2360,16 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2263 } 2360 }
2264 2361
2265 pte = huge_ptep_get_and_clear(mm, address, ptep); 2362 pte = huge_ptep_get_and_clear(mm, address, ptep);
2266 if (huge_pte_none(pte))
2267 continue;
2268
2269 /*
2270 * HWPoisoned hugepage is already unmapped and dropped reference
2271 */
2272 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2273 continue;
2274
2275 page = pte_page(pte);
2276 if (pte_dirty(pte)) 2363 if (pte_dirty(pte))
2277 set_page_dirty(page); 2364 set_page_dirty(page);
2278 list_add(&page->lru, &page_list); 2365 list_add(&page->lru, &page_list);
2366
2367 /* Bail out after unmapping reference page if supplied */
2368 if (ref_page)
2369 break;
2279 } 2370 }
2280 spin_unlock(&mm->page_table_lock);
2281 flush_tlb_range(vma, start, end); 2371 flush_tlb_range(vma, start, end);
2372 spin_unlock(&mm->page_table_lock);
2282 mmu_notifier_invalidate_range_end(mm, start, end); 2373 mmu_notifier_invalidate_range_end(mm, start, end);
2283 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2374 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2284 page_remove_rmap(page); 2375 page_remove_rmap(page);
@@ -2316,7 +2407,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2316 */ 2407 */
2317 address = address & huge_page_mask(h); 2408 address = address & huge_page_mask(h);
2318 pgoff = vma_hugecache_offset(h, vma, address); 2409 pgoff = vma_hugecache_offset(h, vma, address);
2319 mapping = (struct address_space *)page_private(page); 2410 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2320 2411
2321 /* 2412 /*
2322 * Take the mapping lock for the duration of the table walk. As 2413 * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2960,12 @@ int hugetlb_reserve_pages(struct inode *inode,
2869{ 2960{
2870 long ret, chg; 2961 long ret, chg;
2871 struct hstate *h = hstate_inode(inode); 2962 struct hstate *h = hstate_inode(inode);
2963 struct hugepage_subpool *spool = subpool_inode(inode);
2872 2964
2873 /* 2965 /*
2874 * Only apply hugepage reservation if asked. At fault time, an 2966 * Only apply hugepage reservation if asked. At fault time, an
2875 * attempt will be made for VM_NORESERVE to allocate a page 2967 * attempt will be made for VM_NORESERVE to allocate a page
2876 * and filesystem quota without using reserves 2968 * without using reserves
2877 */ 2969 */
2878 if (vm_flags & VM_NORESERVE) 2970 if (vm_flags & VM_NORESERVE)
2879 return 0; 2971 return 0;
@@ -2900,17 +2992,17 @@ int hugetlb_reserve_pages(struct inode *inode,
2900 if (chg < 0) 2992 if (chg < 0)
2901 return chg; 2993 return chg;
2902 2994
2903 /* There must be enough filesystem quota for the mapping */ 2995 /* There must be enough pages in the subpool for the mapping */
2904 if (hugetlb_get_quota(inode->i_mapping, chg)) 2996 if (hugepage_subpool_get_pages(spool, chg))
2905 return -ENOSPC; 2997 return -ENOSPC;
2906 2998
2907 /* 2999 /*
2908 * Check enough hugepages are available for the reservation. 3000 * Check enough hugepages are available for the reservation.
2909 * Hand back the quota if there are not 3001 * Hand the pages back to the subpool if there are not
2910 */ 3002 */
2911 ret = hugetlb_acct_memory(h, chg); 3003 ret = hugetlb_acct_memory(h, chg);
2912 if (ret < 0) { 3004 if (ret < 0) {
2913 hugetlb_put_quota(inode->i_mapping, chg); 3005 hugepage_subpool_put_pages(spool, chg);
2914 return ret; 3006 return ret;
2915 } 3007 }
2916 3008
@@ -2934,12 +3026,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2934{ 3026{
2935 struct hstate *h = hstate_inode(inode); 3027 struct hstate *h = hstate_inode(inode);
2936 long chg = region_truncate(&inode->i_mapping->private_list, offset); 3028 long chg = region_truncate(&inode->i_mapping->private_list, offset);
3029 struct hugepage_subpool *spool = subpool_inode(inode);
2937 3030
2938 spin_lock(&inode->i_lock); 3031 spin_lock(&inode->i_lock);
2939 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3032 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2940 spin_unlock(&inode->i_lock); 3033 spin_unlock(&inode->i_lock);
2941 3034
2942 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 3035 hugepage_subpool_put_pages(spool, (chg - freed));
2943 hugetlb_acct_memory(h, -(chg - freed)); 3036 hugetlb_acct_memory(h, -(chg - freed));
2944} 3037}
2945 3038
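
The hugetlb.c changes above replace hugetlb_get_quota()/hugetlb_put_quota() with a per-superblock, reference-counted struct hugepage_subpool. The discipline is charge before allocating, uncharge on failure, and stash the subpool in page_private() so free_huge_page() can return the page later. A condensed sketch of that flow, mirroring alloc_huge_page() (dequeue_or_alloc() stands in for the real dequeue/alloc-buddy pair):

static struct page *charged_huge_page(struct hugepage_subpool *spool, long chg)
{
	struct page *page;

	if (chg && hugepage_subpool_get_pages(spool, chg))
		return ERR_PTR(-VM_FAULT_SIGBUS);	/* subpool limit hit */

	page = dequeue_or_alloc();			/* placeholder */
	if (!page) {
		hugepage_subpool_put_pages(spool, chg);	/* undo the charge */
		return ERR_PTR(-VM_FAULT_SIGBUS);
	}

	/* free_huge_page() reads this back and calls
	 * hugepage_subpool_put_pages(spool, 1). */
	set_page_private(page, (unsigned long)spool);
	return page;
}
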
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c7fc7fd00e3..cc448bb983b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val)
45 * do a racy check with elevated page count, to make sure PG_hwpoison 45 * do a racy check with elevated page count, to make sure PG_hwpoison
46 * will only be set for the targeted owner (or on a free page). 46 * will only be set for the targeted owner (or on a free page).
47 * We temporarily take page lock for try_get_mem_cgroup_from_page(). 47 * We temporarily take page lock for try_get_mem_cgroup_from_page().
48 * __memory_failure() will redo the check reliably inside page lock. 48 * memory_failure() will redo the check reliably inside page lock.
49 */ 49 */
50 lock_page(hpage); 50 lock_page(hpage);
51 err = hwpoison_filter(hpage); 51 err = hwpoison_filter(hpage);
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 55
56inject: 56inject:
57 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 57 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
58 return __memory_failure(pfn, 18, MF_COUNT_INCREASED); 58 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
59} 59}
60 60
61static int hwpoison_unpoison(void *data, u64 val) 61static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c833addd94d..45eb6217bf3 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1036,7 +1036,7 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
1036{ 1036{
1037 pr_debug("%s(0x%p)\n", __func__, ptr); 1037 pr_debug("%s(0x%p)\n", __func__, ptr);
1038 1038
1039 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 1039 if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr))
1040 add_scan_area((unsigned long)ptr, size, gfp); 1040 add_scan_area((unsigned long)ptr, size, gfp);
1041 else if (atomic_read(&kmemleak_early_log)) 1041 else if (atomic_read(&kmemleak_early_log))
1042 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); 1042 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
@@ -1757,6 +1757,7 @@ void __init kmemleak_init(void)
1757 1757
1758#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF 1758#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
1759 if (!kmemleak_skip_disable) { 1759 if (!kmemleak_skip_disable) {
1760 atomic_set(&kmemleak_early_log, 0);
1760 kmemleak_disable(); 1761 kmemleak_disable();
1761 return; 1762 return;
1762 } 1763 }
diff --git a/mm/ksm.c b/mm/ksm.c
index 1925ffbfb27..47c88536889 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,7 +28,6 @@
28#include <linux/kthread.h> 28#include <linux/kthread.h>
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/memcontrol.h>
32#include <linux/rbtree.h> 31#include <linux/rbtree.h>
33#include <linux/memory.h> 32#include <linux/memory.h>
34#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
@@ -375,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
375 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
376} 375}
377 376
377static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
378 unsigned long addr)
379{
380 struct vm_area_struct *vma;
381 if (ksm_test_exit(mm))
382 return NULL;
383 vma = find_vma(mm, addr);
384 if (!vma || vma->vm_start > addr)
385 return NULL;
386 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
387 return NULL;
388 return vma;
389}
390
378static void break_cow(struct rmap_item *rmap_item) 391static void break_cow(struct rmap_item *rmap_item)
379{ 392{
380 struct mm_struct *mm = rmap_item->mm; 393 struct mm_struct *mm = rmap_item->mm;
@@ -388,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
388 put_anon_vma(rmap_item->anon_vma); 401 put_anon_vma(rmap_item->anon_vma);
389 402
390 down_read(&mm->mmap_sem); 403 down_read(&mm->mmap_sem);
391 if (ksm_test_exit(mm)) 404 vma = find_mergeable_vma(mm, addr);
392 goto out; 405 if (vma)
393 vma = find_vma(mm, addr); 406 break_ksm(vma, addr);
394 if (!vma || vma->vm_start > addr)
395 goto out;
396 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
397 goto out;
398 break_ksm(vma, addr);
399out:
400 up_read(&mm->mmap_sem); 407 up_read(&mm->mmap_sem);
401} 408}
402 409
@@ -422,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
422 struct page *page; 429 struct page *page;
423 430
424 down_read(&mm->mmap_sem); 431 down_read(&mm->mmap_sem);
425 if (ksm_test_exit(mm)) 432 vma = find_mergeable_vma(mm, addr);
426 goto out; 433 if (!vma)
427 vma = find_vma(mm, addr);
428 if (!vma || vma->vm_start > addr)
429 goto out;
430 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
431 goto out; 434 goto out;
432 435
433 page = follow_page(vma, addr, FOLL_GET); 436 page = follow_page(vma, addr, FOLL_GET);
@@ -673,9 +676,9 @@ error:
673static u32 calc_checksum(struct page *page) 676static u32 calc_checksum(struct page *page)
674{ 677{
675 u32 checksum; 678 u32 checksum;
676 void *addr = kmap_atomic(page, KM_USER0); 679 void *addr = kmap_atomic(page);
677 checksum = jhash2(addr, PAGE_SIZE / 4, 17); 680 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
678 kunmap_atomic(addr, KM_USER0); 681 kunmap_atomic(addr);
679 return checksum; 682 return checksum;
680} 683}
681 684
@@ -684,11 +687,11 @@ static int memcmp_pages(struct page *page1, struct page *page2)
684 char *addr1, *addr2; 687 char *addr1, *addr2;
685 int ret; 688 int ret;
686 689
687 addr1 = kmap_atomic(page1, KM_USER0); 690 addr1 = kmap_atomic(page1);
688 addr2 = kmap_atomic(page2, KM_USER1); 691 addr2 = kmap_atomic(page2);
689 ret = memcmp(addr1, addr2, PAGE_SIZE); 692 ret = memcmp(addr1, addr2, PAGE_SIZE);
690 kunmap_atomic(addr2, KM_USER1); 693 kunmap_atomic(addr2);
691 kunmap_atomic(addr1, KM_USER0); 694 kunmap_atomic(addr1);
692 return ret; 695 return ret;
693} 696}
694 697
@@ -1572,16 +1575,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1572 1575
1573 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1576 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1574 if (new_page) { 1577 if (new_page) {
1575 /*
1576 * The memcg-specific accounting when moving
1577 * pages around the LRU lists relies on the
1578 * page's owner (memcg) to be valid. Usually,
1579 * pages are assigned to a new owner before
1580 * being put on the LRU list, but since this
1581 * is not the case here, the stale owner from
1582 * a previous allocation cycle must be reset.
1583 */
1584 mem_cgroup_reset_owner(new_page);
1585 copy_user_highpage(new_page, page, address, vma); 1578 copy_user_highpage(new_page, page, address, vma);
1586 1579
1587 SetPageDirty(new_page); 1580 SetPageDirty(new_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index 74bf193eff0..1ccbba5b667 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,6 +65,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
65 } 65 }
66 new_flags &= ~VM_DONTCOPY; 66 new_flags &= ~VM_DONTCOPY;
67 break; 67 break;
68 case MADV_DONTDUMP:
69 new_flags |= VM_NODUMP;
70 break;
71 case MADV_DODUMP:
72 new_flags &= ~VM_NODUMP;
73 break;
68 case MADV_MERGEABLE: 74 case MADV_MERGEABLE:
69 case MADV_UNMERGEABLE: 75 case MADV_UNMERGEABLE:
70 error = ksm_madvise(vma, start, end, behavior, &new_flags); 76 error = ksm_madvise(vma, start, end, behavior, &new_flags);
@@ -251,7 +257,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
251 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 257 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
252 page_to_pfn(p), start); 258 page_to_pfn(p), start);
253 /* Ignore return value for now */ 259 /* Ignore return value for now */
254 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 260 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
255 } 261 }
256 return ret; 262 return ret;
257} 263}
@@ -293,6 +299,8 @@ madvise_behavior_valid(int behavior)
293 case MADV_HUGEPAGE: 299 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE: 300 case MADV_NOHUGEPAGE:
295#endif 301#endif
302 case MADV_DONTDUMP:
303 case MADV_DODUMP:
296 return 1; 304 return 1;
297 305
298 default: 306 default:
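The new pair is driven from userspace through the ordinary madvise(2) call. A minimal sketch (the helper name is hypothetical, and it assumes a libc whose headers already expose MADV_DONTDUMP and MADV_DODUMP; on older headers the numeric values from the kernel uapi would have to be used):

#include <stddef.h>
#include <sys/mman.h>

/* Hypothetical helper: keep a large scratch buffer out of core dumps.
 * MADV_DONTDUMP sets VM_NODUMP on the VMAs covering [buf, buf + len);
 * a later MADV_DODUMP on the same range clears it again. */
static int exclude_from_coredump(void *buf, size_t len)
{
        return madvise(buf, len, MADV_DONTDUMP);
}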
diff --git a/mm/memblock.c b/mm/memblock.c
index 77b5f227e1d..99f28559950 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -99,9 +99,6 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t this_start, this_end, cand; 99 phys_addr_t this_start, this_end, cand;
100 u64 i; 100 u64 i;
101 101
102 /* align @size to avoid excessive fragmentation on reserved array */
103 size = round_up(size, align);
104
105 /* pump up @end */ 102 /* pump up @end */
106 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 103 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
107 end = memblock.current_limit; 104 end = memblock.current_limit;
@@ -731,6 +728,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
731{ 728{
732 phys_addr_t found; 729 phys_addr_t found;
733 730
731 /* align @size to avoid excessive fragmentation on reserved array */
732 size = round_up(size, align);
733
734 found = memblock_find_in_range_node(0, max_addr, size, align, nid); 734 found = memblock_find_in_range_node(0, max_addr, size, align, nid);
735 if (found && !memblock_reserve(found, size)) 735 if (found && !memblock_reserve(found, size))
736 return found; 736 return found;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 556859fec4e..b2ee6df0e9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 MEM_CGROUP_STAT_NSTATS, 92 MEM_CGROUP_STAT_NSTATS,
94}; 93};
95 94
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
135 */ 134 */
136struct mem_cgroup_per_zone { 135struct mem_cgroup_per_zone {
137 struct lruvec lruvec; 136 struct lruvec lruvec;
138 unsigned long count[NR_LRU_LISTS]; 137 unsigned long lru_size[NR_LRU_LISTS];
139 138
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141 140
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
144 unsigned long long usage_in_excess;/* Set to the value by which */ 143 unsigned long long usage_in_excess;/* Set to the value by which */
145 /* the soft limit is exceeded*/ 144 /* the soft limit is exceeded*/
146 bool on_tree; 145 bool on_tree;
147 struct mem_cgroup *mem; /* Back pointer, we cannot */ 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */
148 /* use container_of */ 147 /* use container_of */
149}; 148};
150/* Macro for accessing counter */
151#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
152 149
153struct mem_cgroup_per_node { 150struct mem_cgroup_per_node {
154 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -230,10 +227,30 @@ struct mem_cgroup {
230 * the counter to account for memory usage 227 * the counter to account for memory usage
231 */ 228 */
232 struct res_counter res; 229 struct res_counter res;
233 /* 230
234 * the counter to account for mem+swap usage. 231 union {
235 */ 232 /*
236 struct res_counter memsw; 233 * the counter to account for mem+swap usage.
234 */
235 struct res_counter memsw;
236
237 /*
238 * rcu_freeing is used only when freeing struct mem_cgroup,
239 * so put it into a union to avoid wasting more memory.
240 * It must be disjoint from the css field. It could be
241 * in a union with the res field, but res plays a much
242 * larger part in mem_cgroup life than memsw, and might
243 * be of interest, even at time of free, when debugging.
244 * So share rcu_head with the less interesting memsw.
245 */
246 struct rcu_head rcu_freeing;
247 /*
248 * But when using vfree(), that cannot be done at
249 * interrupt time, so we must then queue the work.
250 */
251 struct work_struct work_freeing;
252 };
253
237 /* 254 /*
238 * Per cgroup active and inactive list, similar to the 255 * Per cgroup active and inactive list, similar to the
239 * per zone LRU lists. 256 * per zone LRU lists.
@@ -280,6 +297,12 @@ struct mem_cgroup {
280 */ 297 */
281 unsigned long move_charge_at_immigrate; 298 unsigned long move_charge_at_immigrate;
282 /* 299 /*
300 * set > 0 if pages under this cgroup are moving to other cgroup.
301 */
302 atomic_t moving_account;
303 /* taken only while moving_account > 0 */
304 spinlock_t move_lock;
305 /*
283 * percpu counter. 306 * percpu counter.
284 */ 307 */
285 struct mem_cgroup_stat_cpu *stat; 308 struct mem_cgroup_stat_cpu *stat;
@@ -592,9 +615,9 @@ retry:
592 * we will add it back at the end of reclaim to its correct 615 * we will add it back at the end of reclaim to its correct
593 * position in the tree. 616 * position in the tree.
594 */ 617 */
595 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
596 if (!res_counter_soft_limit_excess(&mz->mem->res) || 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
597 !css_tryget(&mz->mem->css)) 620 !css_tryget(&mz->memcg->css))
598 goto retry; 621 goto retry;
599done: 622done:
600 return mz; 623 return mz;
@@ -672,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
672} 695}
673 696
674static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 697static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
675 bool file, int nr_pages) 698 bool anon, int nr_pages)
676{ 699{
677 preempt_disable(); 700 preempt_disable();
678 701
679 if (file) 702 /*
680 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
704 * counted as CACHE even if it's on ANON LRU.
705 */
706 if (anon)
707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
681 nr_pages); 708 nr_pages);
682 else 709 else
683 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
684 nr_pages); 711 nr_pages);
685 712
686 /* pagein of a big page is an event. So, ignore page size */ 713 /* pagein of a big page is an event. So, ignore page size */
@@ -701,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
701 unsigned int lru_mask) 728 unsigned int lru_mask)
702{ 729{
703 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_per_zone *mz;
704 enum lru_list l; 731 enum lru_list lru;
705 unsigned long ret = 0; 732 unsigned long ret = 0;
706 733
707 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
708 735
709 for_each_lru(l) { 736 for_each_lru(lru) {
710 if (BIT(l) & lru_mask) 737 if (BIT(lru) & lru_mask)
711 ret += MEM_CGROUP_ZSTAT(mz, l); 738 ret += mz->lru_size[lru];
712 } 739 }
713 return ret; 740 return ret;
714} 741}
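As a concrete example of the renamed counter in use: with lru_mask == LRU_ALL_FILE (conventionally BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)), the loop above returns mz->lru_size[LRU_INACTIVE_FILE] + mz->lru_size[LRU_ACTIVE_FILE] for the requested node and zone.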
@@ -776,7 +803,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
776 /* threshold event is triggered in finer grain than soft limit */ 803 /* threshold event is triggered in finer grain than soft limit */
777 if (unlikely(mem_cgroup_event_ratelimit(memcg, 804 if (unlikely(mem_cgroup_event_ratelimit(memcg,
778 MEM_CGROUP_TARGET_THRESH))) { 805 MEM_CGROUP_TARGET_THRESH))) {
779 bool do_softlimit, do_numainfo; 806 bool do_softlimit;
807 bool do_numainfo __maybe_unused;
780 808
781 do_softlimit = mem_cgroup_event_ratelimit(memcg, 809 do_softlimit = mem_cgroup_event_ratelimit(memcg,
782 MEM_CGROUP_TARGET_SOFTLIMIT); 810 MEM_CGROUP_TARGET_SOFTLIMIT);
@@ -1041,9 +1069,22 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1041 1069
1042 pc = lookup_page_cgroup(page); 1070 pc = lookup_page_cgroup(page);
1043 memcg = pc->mem_cgroup; 1071 memcg = pc->mem_cgroup;
1072
1073 /*
1074 * Surreptitiously switch any uncharged page to root:
1075 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal.
1077 *
1078 * Our caller holds lru_lock, and PageCgroupUsed is updated
1079 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe.
1081 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup;
1084
1044 mz = page_cgroup_zoneinfo(memcg, page); 1085 mz = page_cgroup_zoneinfo(memcg, page);
1045 /* compound_order() is stabilized through lru_lock */ 1086 /* compound_order() is stabilized through lru_lock */
1046 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1087 mz->lru_size[lru] += 1 << compound_order(page);
1047 return &mz->lruvec; 1088 return &mz->lruvec;
1048} 1089}
1049 1090
@@ -1071,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1071 VM_BUG_ON(!memcg); 1112 VM_BUG_ON(!memcg);
1072 mz = page_cgroup_zoneinfo(memcg, page); 1113 mz = page_cgroup_zoneinfo(memcg, page);
1073 /* huge page split is done under lru_lock. so, we have no races. */ 1114 /* huge page split is done under lru_lock. so, we have no races. */
1074 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1075 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 1116 mz->lru_size[lru] -= 1 << compound_order(page);
1076} 1117}
1077 1118
1078void mem_cgroup_lru_del(struct page *page) 1119void mem_cgroup_lru_del(struct page *page)
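For scale: a 2 MiB transparent huge page on a system with 4 KiB base pages has compound_order(page) == 9, so linking it onto an LRU adds 1 << 9 = 512 to mz->lru_size[lru] and unlinking subtracts the same 512, which is exactly the underflow the VM_BUG_ON above guards against.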
@@ -1251,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1251 return memcg->swappiness; 1292 return memcg->swappiness;
1252} 1293}
1253 1294
1254static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1295/*
1255{ 1296 * memcg->moving_account is used for checking possibility that some thread is
1256 int cpu; 1297 * calling move_account(). When a thread on CPU-A starts moving pages under
1298 * a memcg, other threads should check memcg->moving_account under
1299 * rcu_read_lock(), like this:
1300 *
1301 * CPU-A CPU-B
1302 * rcu_read_lock()
1303 * memcg->moving_account+1 if (memcg->moving_account)
1304 * take heavy locks.
1305 * synchronize_rcu() update something.
1306 * rcu_read_unlock()
1307 * start move here.
1308 */
1257 1309
1258 get_online_cpus(); 1310/* for quick checking without looking up memcg */
1259 spin_lock(&memcg->pcp_counter_lock); 1311atomic_t memcg_moving __read_mostly;
1260 for_each_online_cpu(cpu)
1261 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1262 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1263 spin_unlock(&memcg->pcp_counter_lock);
1264 put_online_cpus();
1265 1312
1313static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1314{
1315 atomic_inc(&memcg_moving);
1316 atomic_inc(&memcg->moving_account);
1266 synchronize_rcu(); 1317 synchronize_rcu();
1267} 1318}
1268 1319
1269static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1320static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1270{ 1321{
1271 int cpu; 1322 /*
1272 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1273 if (!memcg) 1324 * We check NULL in callee rather than caller.
1274 return; 1325 */
1275 get_online_cpus(); 1326 if (memcg) {
1276 spin_lock(&memcg->pcp_counter_lock); 1327 atomic_dec(&memcg_moving);
1277 for_each_online_cpu(cpu) 1328 atomic_dec(&memcg->moving_account);
1278 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1329 }
1279 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1280 spin_unlock(&memcg->pcp_counter_lock);
1281 put_online_cpus();
1282} 1330}
1331
1283/* 1332/*
1284 * 2 routines for checking "mem" is under move_account() or not. 1333 * 2 routines for checking "mem" is under move_account() or not.
1285 * 1334 *
1286 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1287 * for avoiding race in accounting. If true, 1336 * is used for avoiding races in accounting. If true,
1288 * pc->mem_cgroup may be overwritten. 1337 * pc->mem_cgroup may be overwritten.
1289 * 1338 *
1290 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or 1339 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
@@ -1292,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1292 * waiting at high memory pressure caused by "move". 1341 * waiting at high memory pressure caused by "move".
1293 */ 1342 */
1294 1343
1295static bool mem_cgroup_stealed(struct mem_cgroup *memcg) 1344static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1296{ 1345{
1297 VM_BUG_ON(!rcu_read_lock_held()); 1346 VM_BUG_ON(!rcu_read_lock_held());
1298 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1347 return atomic_read(&memcg->moving_account) > 0;
1299} 1348}
1300 1349
1301static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1350static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
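A minimal sketch of the writer ("mover") side of this protocol, with a hypothetical do_move_pages() standing in for the real move-charge work, to show how the two atomic counters and synchronize_rcu() bracket the heavy part:

/* Sketch only: expected usage of start/end_move by the mover.
 * do_move_pages() is a stand-in, not a real memcontrol.c function. */
static void move_pages_between_memcgs(struct mem_cgroup *from,
                                      struct mem_cgroup *to)
{
        /* bump memcg_moving and from->moving_account, then
         * synchronize_rcu() so every reader observes the flags */
        mem_cgroup_start_move(from);

        /* readers that see the flags fall back to the per-memcg
         * move_lock, so the actual page moves are now safe */
        do_move_pages(from, to);

        mem_cgroup_end_move(from);      /* drop the counters again */
}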
@@ -1336,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1336 return false; 1385 return false;
1337} 1386}
1338 1387
1388/*
1389 * Take this lock when
1390 * - a code tries to modify page's memcg while it's USED.
1391 * - a code tries to modify page state accounting in a memcg.
1392 * see mem_cgroup_stolen(), too.
1393 */
1394static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1395 unsigned long *flags)
1396{
1397 spin_lock_irqsave(&memcg->move_lock, *flags);
1398}
1399
1400static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1401 unsigned long *flags)
1402{
1403 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1404}
1405
1339/** 1406/**
1340 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1341 * @memcg: The memory cgroup that went over limit 1408 * @memcg: The memory cgroup that went over limit
@@ -1359,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1359 if (!memcg || !p) 1426 if (!memcg || !p)
1360 return; 1427 return;
1361 1428
1362
1363 rcu_read_lock(); 1429 rcu_read_lock();
1364 1430
1365 mem_cgrp = memcg->css.cgroup; 1431 mem_cgrp = memcg->css.cgroup;
@@ -1738,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
1738static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1804static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1739 1805
1740struct oom_wait_info { 1806struct oom_wait_info {
1741 struct mem_cgroup *mem; 1807 struct mem_cgroup *memcg;
1742 wait_queue_t wait; 1808 wait_queue_t wait;
1743}; 1809};
1744 1810
1745static int memcg_oom_wake_function(wait_queue_t *wait, 1811static int memcg_oom_wake_function(wait_queue_t *wait,
1746 unsigned mode, int sync, void *arg) 1812 unsigned mode, int sync, void *arg)
1747{ 1813{
1748 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1749 *oom_wait_memcg; 1815 struct mem_cgroup *oom_wait_memcg;
1750 struct oom_wait_info *oom_wait_info; 1816 struct oom_wait_info *oom_wait_info;
1751 1817
1752 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1753 oom_wait_memcg = oom_wait_info->mem; 1819 oom_wait_memcg = oom_wait_info->memcg;
1754 1820
1755 /* 1821 /*
1756 * Both of oom_wait_info->mem and wake_mem are stable under us. 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
1757 * Then we can use css_is_ancestor without taking care of RCU. 1823 * Then we can use css_is_ancestor without taking care of RCU.
1758 */ 1824 */
1759 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1777,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1777/* 1843/*
1778 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1779 */ 1845 */
1780bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) 1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1781{ 1847{
1782 struct oom_wait_info owait; 1848 struct oom_wait_info owait;
1783 bool locked, need_to_kill; 1849 bool locked, need_to_kill;
1784 1850
1785 owait.mem = memcg; 1851 owait.memcg = memcg;
1786 owait.wait.flags = 0; 1852 owait.wait.flags = 0;
1787 owait.wait.func = memcg_oom_wake_function; 1853 owait.wait.func = memcg_oom_wake_function;
1788 owait.wait.private = current; 1854 owait.wait.private = current;
@@ -1807,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1807 1873
1808 if (need_to_kill) { 1874 if (need_to_kill) {
1809 finish_wait(&memcg_oom_waitq, &owait.wait); 1875 finish_wait(&memcg_oom_waitq, &owait.wait);
1810 mem_cgroup_out_of_memory(memcg, mask); 1876 mem_cgroup_out_of_memory(memcg, mask, order);
1811 } else { 1877 } else {
1812 schedule(); 1878 schedule();
1813 finish_wait(&memcg_oom_waitq, &owait.wait); 1879 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1847,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1847 * by flags. 1913 * by flags.
1848 * 1914 *
1849 * Considering "move", this is the only case we see a race. To make the race 1915 * Considering "move", this is the only case we see a race. To make the race
1850 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1916 * small, we check memcg->moving_account and detect a possibility of race.
1851 * possibility of race condition. If there is, we take a lock. 1917 * If there is, we take a lock.
1852 */ 1918 */
1853 1919
1920void __mem_cgroup_begin_update_page_stat(struct page *page,
1921 bool *locked, unsigned long *flags)
1922{
1923 struct mem_cgroup *memcg;
1924 struct page_cgroup *pc;
1925
1926 pc = lookup_page_cgroup(page);
1927again:
1928 memcg = pc->mem_cgroup;
1929 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1930 return;
1931 /*
1932 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_mem_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */
1937 if (!mem_cgroup_stolen(memcg))
1938 return;
1939
1940 move_lock_mem_cgroup(memcg, flags);
1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1942 move_unlock_mem_cgroup(memcg, flags);
1943 goto again;
1944 }
1945 *locked = true;
1946}
1947
1948void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1949{
1950 struct page_cgroup *pc = lookup_page_cgroup(page);
1951
1952 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while
1954 * lock is held because a routine modifying pc->mem_cgroup
1955 * should take move_lock_mem_cgroup().
1956 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958}
1959
1854void mem_cgroup_update_page_stat(struct page *page, 1960void mem_cgroup_update_page_stat(struct page *page,
1855 enum mem_cgroup_page_stat_item idx, int val) 1961 enum mem_cgroup_page_stat_item idx, int val)
1856{ 1962{
1857 struct mem_cgroup *memcg; 1963 struct mem_cgroup *memcg;
1858 struct page_cgroup *pc = lookup_page_cgroup(page); 1964 struct page_cgroup *pc = lookup_page_cgroup(page);
1859 bool need_unlock = false;
1860 unsigned long uninitialized_var(flags); 1965 unsigned long uninitialized_var(flags);
1861 1966
1862 if (mem_cgroup_disabled()) 1967 if (mem_cgroup_disabled())
1863 return; 1968 return;
1864 1969
1865 rcu_read_lock();
1866 memcg = pc->mem_cgroup; 1970 memcg = pc->mem_cgroup;
1867 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1971 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1868 goto out; 1972 return;
1869 /* pc->mem_cgroup is unstable ? */
1870 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1871 /* take a lock against to access pc->mem_cgroup */
1872 move_lock_page_cgroup(pc, &flags);
1873 need_unlock = true;
1874 memcg = pc->mem_cgroup;
1875 if (!memcg || !PageCgroupUsed(pc))
1876 goto out;
1877 }
1878 1973
1879 switch (idx) { 1974 switch (idx) {
1880 case MEMCG_NR_FILE_MAPPED: 1975 case MEMCG_NR_FILE_MAPPED:
1881 if (val > 0)
1882 SetPageCgroupFileMapped(pc);
1883 else if (!page_mapped(page))
1884 ClearPageCgroupFileMapped(pc);
1885 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1886 break; 1977 break;
1887 default: 1978 default:
@@ -1889,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
1889 } 1980 }
1890 1981
1891 this_cpu_add(memcg->stat->count[idx], val); 1982 this_cpu_add(memcg->stat->count[idx], val);
1892
1893out:
1894 if (unlikely(need_unlock))
1895 move_unlock_page_cgroup(pc, &flags);
1896 rcu_read_unlock();
1897 return;
1898} 1983}
1899EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1900 1984
1901/* 1985/*
1902 * size of first charge trial. "32" comes from vmscan.c's magic value. 1986 * size of first charge trial. "32" comes from vmscan.c's magic value.
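A minimal sketch of the calling pattern the two helpers above expect from statistics updaters such as the file-rmap paths (the function name is hypothetical; rcu_read_lock() is written out explicitly here, though in practice a header wrapper may supply it):

/* Sketch: bump the FILE_MAPPED statistic race-free against move_account. */
static void account_file_mapped(struct page *page, int val)
{
        bool locked = false;
        unsigned long flags = 0;

        rcu_read_lock();
        /* takes memcg->move_lock only if an account move is in flight */
        __mem_cgroup_begin_update_page_stat(page, &locked, &flags);
        mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, val);
        if (locked)
                __mem_cgroup_end_update_page_stat(page, &flags);
        rcu_read_unlock();
}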
@@ -2067,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2067 per_cpu(memcg->stat->events[i], cpu) = 0; 2151 per_cpu(memcg->stat->events[i], cpu) = 0;
2068 memcg->nocpu_base.events[i] += x; 2152 memcg->nocpu_base.events[i] += x;
2069 } 2153 }
2070 /* need to clear ON_MOVE value, works as a kind of lock. */
2071 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2072 spin_unlock(&memcg->pcp_counter_lock);
2073}
2074
2075static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2076{
2077 int idx = MEM_CGROUP_ON_MOVE;
2078
2079 spin_lock(&memcg->pcp_counter_lock);
2080 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2081 spin_unlock(&memcg->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2082} 2155}
2083 2156
@@ -2089,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2089 struct memcg_stock_pcp *stock; 2162 struct memcg_stock_pcp *stock;
2090 struct mem_cgroup *iter; 2163 struct mem_cgroup *iter;
2091 2164
2092 if ((action == CPU_ONLINE)) { 2165 if (action == CPU_ONLINE)
2093 for_each_mem_cgroup(iter)
2094 synchronize_mem_cgroup_on_move(iter, cpu);
2095 return NOTIFY_OK; 2166 return NOTIFY_OK;
2096 }
2097 2167
2098 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2099 return NOTIFY_OK; 2169 return NOTIFY_OK;
@@ -2178,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2178 if (!oom_check) 2248 if (!oom_check)
2179 return CHARGE_NOMEM; 2249 return CHARGE_NOMEM;
2180 /* check OOM */ 2250 /* check OOM */
2181 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2182 return CHARGE_OOM_DIE; 2252 return CHARGE_OOM_DIE;
2183 2253
2184 return CHARGE_RETRY; 2254 return CHARGE_RETRY;
@@ -2407,8 +2477,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2407 struct page *page, 2477 struct page *page,
2408 unsigned int nr_pages, 2478 unsigned int nr_pages,
2409 struct page_cgroup *pc, 2479 struct page_cgroup *pc,
2410 enum charge_type ctype) 2480 enum charge_type ctype,
2481 bool lrucare)
2411{ 2482{
2483 struct zone *uninitialized_var(zone);
2484 bool was_on_lru = false;
2485 bool anon;
2486
2412 lock_page_cgroup(pc); 2487 lock_page_cgroup(pc);
2413 if (unlikely(PageCgroupUsed(pc))) { 2488 if (unlikely(PageCgroupUsed(pc))) {
2414 unlock_page_cgroup(pc); 2489 unlock_page_cgroup(pc);
@@ -2419,6 +2494,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2419 * we don't need page_cgroup_lock about tail pages, because they are not 2494 * we don't need page_cgroup_lock about tail pages, because they are not
2420 * accessed by any other context at this point. 2495 * accessed by any other context at this point.
2421 */ 2496 */
2497
2498 /*
2499 * In some cases, such as SwapCache and FUSE (splice_buf->radixtree), the page
2500 * may already be on some other mem_cgroup's LRU. Take care of it.
2501 */
2502 if (lrucare) {
2503 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) {
2506 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page));
2508 was_on_lru = true;
2509 }
2510 }
2511
2422 pc->mem_cgroup = memcg; 2512 pc->mem_cgroup = memcg;
2423 /* 2513 /*
2424 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2514 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2428,23 +2518,25 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2428 * See mem_cgroup_add_lru_list(), etc. 2518 * See mem_cgroup_add_lru_list(), etc.
2429 */ 2519 */
2430 smp_wmb(); 2520 smp_wmb();
2431 switch (ctype) { 2521 SetPageCgroupUsed(pc);
2432 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2522
2433 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2523 if (lrucare) {
2434 SetPageCgroupCache(pc); 2524 if (was_on_lru) {
2435 SetPageCgroupUsed(pc); 2525 VM_BUG_ON(PageLRU(page));
2436 break; 2526 SetPageLRU(page);
2437 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2527 add_page_to_lru_list(zone, page, page_lru(page));
2438 ClearPageCgroupCache(pc); 2528 }
2439 SetPageCgroupUsed(pc); 2529 spin_unlock_irq(&zone->lru_lock);
2440 break;
2441 default:
2442 break;
2443 } 2530 }
2444 2531
2445 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2533 anon = true;
2534 else
2535 anon = false;
2536
2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2446 unlock_page_cgroup(pc); 2538 unlock_page_cgroup(pc);
2447 WARN_ON_ONCE(PageLRU(page)); 2539
2448 /* 2540 /*
2449 * "charge_statistics" updated event counter. Then, check it. 2541 * "charge_statistics" updated event counter. Then, check it.
2450 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2542 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2455,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2455 2547
2456#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2457 2549
2458#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
2459 (1 << PCG_MIGRATION))
2460/* 2551/*
2461 * Because tail pages are not marked as "used", set it. We're under 2552 * Because tail pages are not marked as "used", set it. We're under
2462 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2507,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
2507{ 2598{
2508 unsigned long flags; 2599 unsigned long flags;
2509 int ret; 2600 int ret;
2601 bool anon = PageAnon(page);
2510 2602
2511 VM_BUG_ON(from == to); 2603 VM_BUG_ON(from == to);
2512 VM_BUG_ON(PageLRU(page)); 2604 VM_BUG_ON(PageLRU(page));
@@ -2526,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
2526 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2527 goto unlock; 2619 goto unlock;
2528 2620
2529 move_lock_page_cgroup(pc, &flags); 2621 move_lock_mem_cgroup(from, &flags);
2530 2622
2531 if (PageCgroupFileMapped(pc)) { 2623 if (!anon && page_mapped(page)) {
2532 /* Update mapped_file data for mem_cgroup */ 2624 /* Update mapped_file data for mem_cgroup */
2533 preempt_disable(); 2625 preempt_disable();
2534 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2535 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2536 preempt_enable(); 2628 preempt_enable();
2537 } 2629 }
2538 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2539 if (uncharge) 2631 if (uncharge)
2540 /* This is not "cancel", but cancel_charge does all we need. */ 2632 /* This is not "cancel", but cancel_charge does all we need. */
2541 __mem_cgroup_cancel_charge(from, nr_pages); 2633 __mem_cgroup_cancel_charge(from, nr_pages);
2542 2634
2543 /* caller should have done css_get */ 2635 /* caller should have done css_get */
2544 pc->mem_cgroup = to; 2636 pc->mem_cgroup = to;
2545 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2637 mem_cgroup_charge_statistics(to, anon, nr_pages);
2546 /* 2638 /*
2547 * We charge against "to" which may not have any tasks. Then, "to" 2639 * We charge against "to" which may not have any tasks. Then, "to"
2548 * can be under rmdir(). But in current implementation, caller of 2640 * can be under rmdir(). But in current implementation, caller of
@@ -2550,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
2550 * guaranteed that "to" is never removed. So, we don't check rmdir 2642 * guaranteed that "to" is never removed. So, we don't check rmdir
2551 * status here. 2643 * status here.
2552 */ 2644 */
2553 move_unlock_page_cgroup(pc, &flags); 2645 move_unlock_mem_cgroup(from, &flags);
2554 ret = 0; 2646 ret = 0;
2555unlock: 2647unlock:
2556 unlock_page_cgroup(pc); 2648 unlock_page_cgroup(pc);
@@ -2642,7 +2734,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2642 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2734 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2643 if (ret == -ENOMEM) 2735 if (ret == -ENOMEM)
2644 return ret; 2736 return ret;
2645 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2737 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
2646 return 0; 2738 return 0;
2647} 2739}
2648 2740
@@ -2662,35 +2754,6 @@ static void
2662__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2754__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2663 enum charge_type ctype); 2755 enum charge_type ctype);
2664 2756
2665static void
2666__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2667 enum charge_type ctype)
2668{
2669 struct page_cgroup *pc = lookup_page_cgroup(page);
2670 struct zone *zone = page_zone(page);
2671 unsigned long flags;
2672 bool removed = false;
2673
2674 /*
2675 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2676 * is already on LRU. It means the page may on some other page_cgroup's
2677 * LRU. Take care of it.
2678 */
2679 spin_lock_irqsave(&zone->lru_lock, flags);
2680 if (PageLRU(page)) {
2681 del_page_from_lru_list(zone, page, page_lru(page));
2682 ClearPageLRU(page);
2683 removed = true;
2684 }
2685 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2686 if (removed) {
2687 add_page_to_lru_list(zone, page, page_lru(page));
2688 SetPageLRU(page);
2689 }
2690 spin_unlock_irqrestore(&zone->lru_lock, flags);
2691 return;
2692}
2693
2694int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2757int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2695 gfp_t gfp_mask) 2758 gfp_t gfp_mask)
2696{ 2759{
@@ -2768,13 +2831,16 @@ static void
2768__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2831__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2769 enum charge_type ctype) 2832 enum charge_type ctype)
2770{ 2833{
2834 struct page_cgroup *pc;
2835
2771 if (mem_cgroup_disabled()) 2836 if (mem_cgroup_disabled())
2772 return; 2837 return;
2773 if (!memcg) 2838 if (!memcg)
2774 return; 2839 return;
2775 cgroup_exclude_rmdir(&memcg->css); 2840 cgroup_exclude_rmdir(&memcg->css);
2776 2841
2777 __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); 2842 pc = lookup_page_cgroup(page);
2843 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
2778 /* 2844 /*
2779 * Now swap is on-memory. This means this page may be 2845 * Now swap is on-memory. This means this page may be
2780 * counted both as mem and swap....double count. 2846 * counted both as mem and swap....double count.
@@ -2878,7 +2944,6 @@ direct_uncharge:
2878 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2944 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2879 if (unlikely(batch->memcg != memcg)) 2945 if (unlikely(batch->memcg != memcg))
2880 memcg_oom_recover(memcg); 2946 memcg_oom_recover(memcg);
2881 return;
2882} 2947}
2883 2948
2884/* 2949/*
@@ -2890,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2890 struct mem_cgroup *memcg = NULL; 2955 struct mem_cgroup *memcg = NULL;
2891 unsigned int nr_pages = 1; 2956 unsigned int nr_pages = 1;
2892 struct page_cgroup *pc; 2957 struct page_cgroup *pc;
2958 bool anon;
2893 2959
2894 if (mem_cgroup_disabled()) 2960 if (mem_cgroup_disabled())
2895 return NULL; 2961 return NULL;
@@ -2915,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2915 if (!PageCgroupUsed(pc)) 2981 if (!PageCgroupUsed(pc))
2916 goto unlock_out; 2982 goto unlock_out;
2917 2983
2984 anon = PageAnon(page);
2985
2918 switch (ctype) { 2986 switch (ctype) {
2919 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2988 /*
2989 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
2991 * used before the page has reached the stage of being marked PageAnon.
2992 */
2993 anon = true;
2994 /* fallthrough */
2920 case MEM_CGROUP_CHARGE_TYPE_DROP: 2995 case MEM_CGROUP_CHARGE_TYPE_DROP:
2921 /* See mem_cgroup_prepare_migration() */ 2996 /* See mem_cgroup_prepare_migration() */
2922 if (page_mapped(page) || PageCgroupMigration(pc)) 2997 if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2933,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2933 break; 3008 break;
2934 } 3009 }
2935 3010
2936 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); 3011 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
2937 3012
2938 ClearPageCgroupUsed(pc); 3013 ClearPageCgroupUsed(pc);
2939 /* 3014 /*
@@ -3026,23 +3101,6 @@ void mem_cgroup_uncharge_end(void)
3026 batch->memcg = NULL; 3101 batch->memcg = NULL;
3027} 3102}
3028 3103
3029/*
3030 * A function for resetting pc->mem_cgroup for newly allocated pages.
3031 * This function should be called if the newpage will be added to LRU
3032 * before start accounting.
3033 */
3034void mem_cgroup_reset_owner(struct page *newpage)
3035{
3036 struct page_cgroup *pc;
3037
3038 if (mem_cgroup_disabled())
3039 return;
3040
3041 pc = lookup_page_cgroup(newpage);
3042 VM_BUG_ON(PageCgroupUsed(pc));
3043 pc->mem_cgroup = root_mem_cgroup;
3044}
3045
3046#ifdef CONFIG_SWAP 3104#ifdef CONFIG_SWAP
3047/* 3105/*
3048 * called after __delete_from_swap_cache() and drop "page" account. 3106 * called after __delete_from_swap_cache() and drop "page" account.
@@ -3247,7 +3305,7 @@ int mem_cgroup_prepare_migration(struct page *page,
3247 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3305 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3248 else 3306 else
3249 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3307 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3250 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); 3308 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
3251 return ret; 3309 return ret;
3252} 3310}
3253 3311
@@ -3257,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3257{ 3315{
3258 struct page *used, *unused; 3316 struct page *used, *unused;
3259 struct page_cgroup *pc; 3317 struct page_cgroup *pc;
3318 bool anon;
3260 3319
3261 if (!memcg) 3320 if (!memcg)
3262 return; 3321 return;
@@ -3278,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3278 lock_page_cgroup(pc); 3337 lock_page_cgroup(pc);
3279 ClearPageCgroupMigration(pc); 3338 ClearPageCgroupMigration(pc);
3280 unlock_page_cgroup(pc); 3339 unlock_page_cgroup(pc);
3281 3340 anon = PageAnon(used);
3282 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3283 3344
3284 /* 3345 /*
3285 * If a page is a file cache, radix-tree replacement is very atomic 3346 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3289,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3289 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3350 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3290 * check. (see prepare_charge() also) 3351 * check. (see prepare_charge() also)
3291 */ 3352 */
3292 if (PageAnon(used)) 3353 if (anon)
3293 mem_cgroup_uncharge_page(used); 3354 mem_cgroup_uncharge_page(used);
3294 /* 3355 /*
3295 * At migration, we may charge account against cgroup which has no 3356 * At migration, we may charge account against cgroup which has no
@@ -3319,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3319 /* fix accounting on old pages */ 3380 /* fix accounting on old pages */
3320 lock_page_cgroup(pc); 3381 lock_page_cgroup(pc);
3321 memcg = pc->mem_cgroup; 3382 memcg = pc->mem_cgroup;
3322 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); 3383 mem_cgroup_charge_statistics(memcg, false, -1);
3323 ClearPageCgroupUsed(pc); 3384 ClearPageCgroupUsed(pc);
3324 unlock_page_cgroup(pc); 3385 unlock_page_cgroup(pc);
3325 3386
@@ -3331,7 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3331 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3392 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3332 * LRU while we overwrite pc->mem_cgroup. 3393 * LRU while we overwrite pc->mem_cgroup.
3333 */ 3394 */
3334 __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); 3395 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
3335} 3396}
3336 3397
3337#ifdef CONFIG_DEBUG_VM 3398#ifdef CONFIG_DEBUG_VM
@@ -3530,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3530 break; 3591 break;
3531 3592
3532 nr_scanned = 0; 3593 nr_scanned = 0;
3533 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, 3594 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3534 gfp_mask, &nr_scanned); 3595 gfp_mask, &nr_scanned);
3535 nr_reclaimed += reclaimed; 3596 nr_reclaimed += reclaimed;
3536 *total_scanned += nr_scanned; 3597 *total_scanned += nr_scanned;
@@ -3557,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3557 next_mz = 3618 next_mz =
3558 __mem_cgroup_largest_soft_limit_node(mctz); 3619 __mem_cgroup_largest_soft_limit_node(mctz);
3559 if (next_mz == mz) 3620 if (next_mz == mz)
3560 css_put(&next_mz->mem->css); 3621 css_put(&next_mz->memcg->css);
3561 else /* next_mz == NULL or other memcg */ 3622 else /* next_mz == NULL or other memcg */
3562 break; 3623 break;
3563 } while (1); 3624 } while (1);
3564 } 3625 }
3565 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3626 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3566 excess = res_counter_soft_limit_excess(&mz->mem->res); 3627 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3567 /* 3628 /*
3568 * One school of thought says that we should not add 3629 * One school of thought says that we should not add
3569 * back the node to the tree if reclaim returns 0. 3630 * back the node to the tree if reclaim returns 0.
@@ -3573,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3573 * term TODO. 3634 * term TODO.
3574 */ 3635 */
3575 /* If excess == 0, no tree ops */ 3636 /* If excess == 0, no tree ops */
3576 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3637 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3577 spin_unlock(&mctz->lock); 3638 spin_unlock(&mctz->lock);
3578 css_put(&mz->mem->css); 3639 css_put(&mz->memcg->css);
3579 loop++; 3640 loop++;
3580 /* 3641 /*
3581 * Could not reclaim anything and there are no more 3642 * Could not reclaim anything and there are no more
@@ -3588,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3588 break; 3649 break;
3589 } while (!nr_reclaimed); 3650 } while (!nr_reclaimed);
3590 if (next_mz) 3651 if (next_mz)
3591 css_put(&next_mz->mem->css); 3652 css_put(&next_mz->memcg->css);
3592 return nr_reclaimed; 3653 return nr_reclaimed;
3593} 3654}
3594 3655
@@ -3610,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3610 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3671 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3611 list = &mz->lruvec.lists[lru]; 3672 list = &mz->lruvec.lists[lru];
3612 3673
3613 loop = MEM_CGROUP_ZSTAT(mz, lru); 3674 loop = mz->lru_size[lru];
3614 /* give some margin against EBUSY etc...*/ 3675 /* give some margin against EBUSY etc...*/
3615 loop += 256; 3676 loop += 256;
3616 busy = NULL; 3677 busy = NULL;
@@ -3684,10 +3745,10 @@ move_account:
3684 mem_cgroup_start_move(memcg); 3745 mem_cgroup_start_move(memcg);
3685 for_each_node_state(node, N_HIGH_MEMORY) { 3746 for_each_node_state(node, N_HIGH_MEMORY) {
3686 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3747 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3687 enum lru_list l; 3748 enum lru_list lru;
3688 for_each_lru(l) { 3749 for_each_lru(lru) {
3689 ret = mem_cgroup_force_empty_list(memcg, 3750 ret = mem_cgroup_force_empty_list(memcg,
3690 node, zid, l); 3751 node, zid, lru);
3691 if (ret) 3752 if (ret)
3692 break; 3753 break;
3693 } 3754 }
@@ -3841,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3841 break; 3902 break;
3842 default: 3903 default:
3843 BUG(); 3904 BUG();
3844 break;
3845 } 3905 }
3846 return val; 3906 return val;
3847} 3907}
@@ -3920,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3920out: 3980out:
3921 *mem_limit = min_limit; 3981 *mem_limit = min_limit;
3922 *memsw_limit = min_memsw_limit; 3982 *memsw_limit = min_memsw_limit;
3923 return;
3924} 3983}
3925 3984
3926static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4079,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4079 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4080 unsigned long node_nr; 4139 unsigned long node_nr;
4081 struct cgroup *cont = m->private; 4140 struct cgroup *cont = m->private;
4082 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4083 4142
4084 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4085 seq_printf(m, "total=%lu", total_nr); 4144 seq_printf(m, "total=%lu", total_nr);
4086 for_each_node_state(nid, N_HIGH_MEMORY) { 4145 for_each_node_state(nid, N_HIGH_MEMORY) {
4087 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4146 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4088 seq_printf(m, " N%d=%lu", nid, node_nr); 4147 seq_printf(m, " N%d=%lu", nid, node_nr);
4089 } 4148 }
4090 seq_putc(m, '\n'); 4149 seq_putc(m, '\n');
4091 4150
4092 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4151 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4093 seq_printf(m, "file=%lu", file_nr); 4152 seq_printf(m, "file=%lu", file_nr);
4094 for_each_node_state(nid, N_HIGH_MEMORY) { 4153 for_each_node_state(nid, N_HIGH_MEMORY) {
4095 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4154 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4096 LRU_ALL_FILE); 4155 LRU_ALL_FILE);
4097 seq_printf(m, " N%d=%lu", nid, node_nr); 4156 seq_printf(m, " N%d=%lu", nid, node_nr);
4098 } 4157 }
4099 seq_putc(m, '\n'); 4158 seq_putc(m, '\n');
4100 4159
4101 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4160 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4102 seq_printf(m, "anon=%lu", anon_nr); 4161 seq_printf(m, "anon=%lu", anon_nr);
4103 for_each_node_state(nid, N_HIGH_MEMORY) { 4162 for_each_node_state(nid, N_HIGH_MEMORY) {
4104 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4163 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4105 LRU_ALL_ANON); 4164 LRU_ALL_ANON);
4106 seq_printf(m, " N%d=%lu", nid, node_nr); 4165 seq_printf(m, " N%d=%lu", nid, node_nr);
4107 } 4166 }
4108 seq_putc(m, '\n'); 4167 seq_putc(m, '\n');
4109 4168
4110 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4169 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4111 seq_printf(m, "unevictable=%lu", unevictable_nr); 4170 seq_printf(m, "unevictable=%lu", unevictable_nr);
4112 for_each_node_state(nid, N_HIGH_MEMORY) { 4171 for_each_node_state(nid, N_HIGH_MEMORY) {
4113 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4172 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4114 BIT(LRU_UNEVICTABLE)); 4173 BIT(LRU_UNEVICTABLE));
4115 seq_printf(m, " N%d=%lu", nid, node_nr); 4174 seq_printf(m, " N%d=%lu", nid, node_nr);
4116 } 4175 }
@@ -4122,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4122static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4123 struct cgroup_map_cb *cb) 4182 struct cgroup_map_cb *cb)
4124{ 4183{
4125 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4126 struct mcs_total_stat mystat; 4185 struct mcs_total_stat mystat;
4127 int i; 4186 int i;
4128 4187
4129 memset(&mystat, 0, sizeof(mystat)); 4188 memset(&mystat, 0, sizeof(mystat));
4130 mem_cgroup_get_local_stat(mem_cont, &mystat); 4189 mem_cgroup_get_local_stat(memcg, &mystat);
4131 4190
4132 4191
4133 for (i = 0; i < NR_MCS_STAT; i++) { 4192 for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4139,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4139 /* Hierarchical information */ 4198 /* Hierarchical information */
4140 { 4199 {
4141 unsigned long long limit, memsw_limit; 4200 unsigned long long limit, memsw_limit;
4142 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4143 cb->fill(cb, "hierarchical_memory_limit", limit); 4202 cb->fill(cb, "hierarchical_memory_limit", limit);
4144 if (do_swap_account) 4203 if (do_swap_account)
4145 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4146 } 4205 }
4147 4206
4148 memset(&mystat, 0, sizeof(mystat)); 4207 memset(&mystat, 0, sizeof(mystat));
4149 mem_cgroup_get_total_stat(mem_cont, &mystat); 4208 mem_cgroup_get_total_stat(memcg, &mystat);
4150 for (i = 0; i < NR_MCS_STAT; i++) { 4209 for (i = 0; i < NR_MCS_STAT; i++) {
4151 if (i == MCS_SWAP && !do_swap_account) 4210 if (i == MCS_SWAP && !do_swap_account)
4152 continue; 4211 continue;
@@ -4162,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4162 4221
4163 for_each_online_node(nid) 4222 for_each_online_node(nid)
4164 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4165 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4166 4225
4167 recent_rotated[0] += 4226 recent_rotated[0] +=
4168 mz->reclaim_stat.recent_rotated[0]; 4227 mz->reclaim_stat.recent_rotated[0];
@@ -4407,11 +4466,8 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4407 else 4466 else
4408 BUG(); 4467 BUG();
4409 4468
4410 /* 4469 if (!thresholds->primary)
4411 * Something went wrong if we trying to unregister a threshold 4470 goto unlock;
4412 * if we don't have thresholds
4413 */
4414 BUG_ON(!thresholds);
4415 4471
4416 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4472 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4417 4473
@@ -4461,7 +4517,7 @@ swap_buffers:
4461 4517
4462 /* To be sure that nobody uses thresholds */ 4518 /* To be sure that nobody uses thresholds */
4463 synchronize_rcu(); 4519 synchronize_rcu();
4464 4520unlock:
4465 mutex_unlock(&memcg->thresholds_lock); 4521 mutex_unlock(&memcg->thresholds_lock);
4466} 4522}
4467 4523
@@ -4580,10 +4636,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4580 return mem_cgroup_sockets_init(cont, ss); 4636 return mem_cgroup_sockets_init(cont, ss);
4581}; 4637};
4582 4638
4583static void kmem_cgroup_destroy(struct cgroup_subsys *ss, 4639static void kmem_cgroup_destroy(struct cgroup *cont)
4584 struct cgroup *cont)
4585{ 4640{
4586 mem_cgroup_sockets_destroy(cont, ss); 4641 mem_cgroup_sockets_destroy(cont);
4587} 4642}
4588#else 4643#else
4589static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
@@ -4591,8 +4646,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4591 return 0; 4646 return 0;
4592} 4647}
4593 4648
4594static void kmem_cgroup_destroy(struct cgroup_subsys *ss, 4649static void kmem_cgroup_destroy(struct cgroup *cont)
4595 struct cgroup *cont)
4596{ 4650{
4597} 4651}
4598#endif 4652#endif
@@ -4716,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4716{ 4770{
4717 struct mem_cgroup_per_node *pn; 4771 struct mem_cgroup_per_node *pn;
4718 struct mem_cgroup_per_zone *mz; 4772 struct mem_cgroup_per_zone *mz;
4719 enum lru_list l; 4773 enum lru_list lru;
4720 int zone, tmp = node; 4774 int zone, tmp = node;
4721 /* 4775 /*
4722 * This routine is called against possible nodes. 4776 * This routine is called against possible nodes.
@@ -4734,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4734 4788
4735 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4736 mz = &pn->zoneinfo[zone]; 4790 mz = &pn->zoneinfo[zone];
4737 for_each_lru(l) 4791 for_each_lru(lru)
4738 INIT_LIST_HEAD(&mz->lruvec.lists[l]); 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4739 mz->usage_in_excess = 0; 4793 mz->usage_in_excess = 0;
4740 mz->on_tree = false; 4794 mz->on_tree = false;
4741 mz->mem = memcg; 4795 mz->memcg = memcg;
4742 } 4796 }
4743 memcg->info.nodeinfo[node] = pn; 4797 memcg->info.nodeinfo[node] = pn;
4744 return 0; 4798 return 0;
@@ -4751,33 +4805,54 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4751 4805
4752static struct mem_cgroup *mem_cgroup_alloc(void) 4806static struct mem_cgroup *mem_cgroup_alloc(void)
4753{ 4807{
4754 struct mem_cgroup *mem; 4808 struct mem_cgroup *memcg;
4755 int size = sizeof(struct mem_cgroup); 4809 int size = sizeof(struct mem_cgroup);
4756 4810
4757 /* Can be very big if MAX_NUMNODES is very big */ 4811 /* Can be very big if MAX_NUMNODES is very big */
4758 if (size < PAGE_SIZE) 4812 if (size < PAGE_SIZE)
4759 mem = kzalloc(size, GFP_KERNEL); 4813 memcg = kzalloc(size, GFP_KERNEL);
4760 else 4814 else
4761 mem = vzalloc(size); 4815 memcg = vzalloc(size);
4762 4816
4763 if (!mem) 4817 if (!memcg)
4764 return NULL; 4818 return NULL;
4765 4819
4766 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4767 if (!mem->stat) 4821 if (!memcg->stat)
4768 goto out_free; 4822 goto out_free;
4769 spin_lock_init(&mem->pcp_counter_lock); 4823 spin_lock_init(&memcg->pcp_counter_lock);
4770 return mem; 4824 return memcg;
4771 4825
4772out_free: 4826out_free:
4773 if (size < PAGE_SIZE) 4827 if (size < PAGE_SIZE)
4774 kfree(mem); 4828 kfree(memcg);
4775 else 4829 else
4776 vfree(mem); 4830 vfree(memcg);
4777 return NULL; 4831 return NULL;
4778} 4832}
4779 4833
4780/* 4834/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */
4839static void vfree_work(struct work_struct *work)
4840{
4841 struct mem_cgroup *memcg;
4842
4843 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg);
4845}
4846static void vfree_rcu(struct rcu_head *rcu_head)
4847{
4848 struct mem_cgroup *memcg;
4849
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work);
4852 schedule_work(&memcg->work_freeing);
4853}
4854
4855/*
4781 * At destroying mem_cgroup, references from swap_cgroup can remain. 4856 * At destroying mem_cgroup, references from swap_cgroup can remain.
4782 * (scanning all at force_empty is too costly...) 4857 * (scanning all at force_empty is too costly...)
4783 * 4858 *
@@ -4800,9 +4875,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4800 4875
4801 free_percpu(memcg->stat); 4876 free_percpu(memcg->stat);
4802 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4803 kfree(memcg); 4878 kfree_rcu(memcg, rcu_freeing);
4804 else 4879 else
4805 vfree(memcg); 4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4806} 4881}
4807 4882
4808static void mem_cgroup_get(struct mem_cgroup *memcg) 4883static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4884,7 +4959,7 @@ err_cleanup:
4884} 4959}
4885 4960
4886static struct cgroup_subsys_state * __ref 4961static struct cgroup_subsys_state * __ref
4887mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4962mem_cgroup_create(struct cgroup *cont)
4888{ 4963{
4889 struct mem_cgroup *memcg, *parent; 4964 struct mem_cgroup *memcg, *parent;
4890 long error = -ENOMEM; 4965 long error = -ENOMEM;
@@ -4940,26 +5015,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4940 atomic_set(&memcg->refcnt, 1); 5015 atomic_set(&memcg->refcnt, 1);
4941 memcg->move_charge_at_immigrate = 0; 5016 memcg->move_charge_at_immigrate = 0;
4942 mutex_init(&memcg->thresholds_lock); 5017 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock);
4943 return &memcg->css; 5019 return &memcg->css;
4944free_out: 5020free_out:
4945 __mem_cgroup_free(memcg); 5021 __mem_cgroup_free(memcg);
4946 return ERR_PTR(error); 5022 return ERR_PTR(error);
4947} 5023}
4948 5024
4949static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 5025static int mem_cgroup_pre_destroy(struct cgroup *cont)
4950 struct cgroup *cont)
4951{ 5026{
4952 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5027 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4953 5028
4954 return mem_cgroup_force_empty(memcg, false); 5029 return mem_cgroup_force_empty(memcg, false);
4955} 5030}
4956 5031
4957static void mem_cgroup_destroy(struct cgroup_subsys *ss, 5032static void mem_cgroup_destroy(struct cgroup *cont)
4958 struct cgroup *cont)
4959{ 5033{
4960 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4961 5035
4962 kmem_cgroup_destroy(ss, cont); 5036 kmem_cgroup_destroy(cont);
4963 5037
4964 mem_cgroup_put(memcg); 5038 mem_cgroup_put(memcg);
4965} 5039}
@@ -5036,7 +5110,7 @@ one_by_one:
5036} 5110}
5037 5111
5038/** 5112/**
5039 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5113 * get_mctgt_type - get target type of moving charge
5040 * @vma: the vma the pte to be checked belongs 5114 * @vma: the vma the pte to be checked belongs
5041 * @addr: the address corresponding to the pte to be checked 5115 * @addr: the address corresponding to the pte to be checked
5042 * @ptent: the pte to be checked 5116 * @ptent: the pte to be checked
@@ -5059,7 +5133,7 @@ union mc_target {
5059}; 5133};
5060 5134
5061enum mc_target_type { 5135enum mc_target_type {
5062 MC_TARGET_NONE, /* not used */ 5136 MC_TARGET_NONE = 0,
5063 MC_TARGET_PAGE, 5137 MC_TARGET_PAGE,
5064 MC_TARGET_SWAP, 5138 MC_TARGET_SWAP,
5065}; 5139};
@@ -5140,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5140 return page; 5214 return page;
5141} 5215}
5142 5216
5143static int is_target_pte_for_mc(struct vm_area_struct *vma, 5217static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5144 unsigned long addr, pte_t ptent, union mc_target *target) 5218 unsigned long addr, pte_t ptent, union mc_target *target)
5145{ 5219{
5146 struct page *page = NULL; 5220 struct page *page = NULL;
5147 struct page_cgroup *pc; 5221 struct page_cgroup *pc;
5148 int ret = 0; 5222 enum mc_target_type ret = MC_TARGET_NONE;
5149 swp_entry_t ent = { .val = 0 }; 5223 swp_entry_t ent = { .val = 0 };
5150 5224
5151 if (pte_present(ptent)) 5225 if (pte_present(ptent))
@@ -5156,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5156 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5157 5231
5158 if (!page && !ent.val) 5232 if (!page && !ent.val)
5159 return 0; 5233 return ret;
5160 if (page) { 5234 if (page) {
5161 pc = lookup_page_cgroup(page); 5235 pc = lookup_page_cgroup(page);
5162 /* 5236 /*
@@ -5182,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5182 return ret; 5256 return ret;
5183} 5257}
5184 5258
5259#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5260/*
5261 * We don't consider swapping or file mapped pages because THP does not
5262 * support them for now.
5263 * Caller should make sure that pmd_trans_huge(pmd) is true.
5264 */
5265static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5266 unsigned long addr, pmd_t pmd, union mc_target *target)
5267{
5268 struct page *page = NULL;
5269 struct page_cgroup *pc;
5270 enum mc_target_type ret = MC_TARGET_NONE;
5271
5272 page = pmd_page(pmd);
5273 VM_BUG_ON(!page || !PageHead(page));
5274 if (!move_anon())
5275 return ret;
5276 pc = lookup_page_cgroup(page);
5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5278 ret = MC_TARGET_PAGE;
5279 if (target) {
5280 get_page(page);
5281 target->page = page;
5282 }
5283 }
5284 return ret;
5285}
5286#else
5287static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5288 unsigned long addr, pmd_t pmd, union mc_target *target)
5289{
5290 return MC_TARGET_NONE;
5291}
5292#endif
5293
5185static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5294static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5186 unsigned long addr, unsigned long end, 5295 unsigned long addr, unsigned long end,
5187 struct mm_walk *walk) 5296 struct mm_walk *walk)
@@ -5190,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5190 pte_t *pte; 5299 pte_t *pte;
5191 spinlock_t *ptl; 5300 spinlock_t *ptl;
5192 5301
5193 split_huge_page_pmd(walk->mm, pmd); 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5304 mc.precharge += HPAGE_PMD_NR;
5305 spin_unlock(&vma->vm_mm->page_table_lock);
5306 return 0;
5307 }
5194 5308
5195 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5309 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5196 for (; addr != end; pte++, addr += PAGE_SIZE) 5310 for (; addr != end; pte++, addr += PAGE_SIZE)
5197 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5311 if (get_mctgt_type(vma, addr, *pte, NULL))
5198 mc.precharge++; /* increment precharge temporarily */ 5312 mc.precharge++; /* increment precharge temporarily */
5199 pte_unmap_unlock(pte - 1, ptl); 5313 pte_unmap_unlock(pte - 1, ptl);
5200 cond_resched(); 5314 cond_resched();
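Both page-table walkers in this file now follow the same shape: try pmd_trans_huge_lock() first, and only fall back to the per-pte loop when the pmd is not a stable transparent huge page. An illustrative skeleton of that shape (hypothetical walker name; the real callbacks above and below carry the memcg-specific logic):

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/sched.h>

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->private;
        pte_t *pte;
        spinlock_t *ptl;

        if (pmd_trans_huge_lock(pmd, vma) == 1) {
                /* Stable huge pmd; page_table_lock is held by the helper. */
                /* ... act on the whole huge page, e.g. via pmd_page(*pmd) ... */
                spin_unlock(&vma->vm_mm->page_table_lock);
                return 0;
        }

        /* Not (or no longer) a stable huge pmd: walk the ptes instead. */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                ;       /* ... per-pte work on *pte goes here ... */
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        return 0;
}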
@@ -5296,9 +5410,8 @@ static void mem_cgroup_clear_mc(void)
5296 mem_cgroup_end_move(from); 5410 mem_cgroup_end_move(from);
5297} 5411}
5298 5412
5299static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5413static int mem_cgroup_can_attach(struct cgroup *cgroup,
5300 struct cgroup *cgroup, 5414 struct cgroup_taskset *tset)
5301 struct cgroup_taskset *tset)
5302{ 5415{
5303 struct task_struct *p = cgroup_taskset_first(tset); 5416 struct task_struct *p = cgroup_taskset_first(tset);
5304 int ret = 0; 5417 int ret = 0;
@@ -5336,9 +5449,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5336 return ret; 5449 return ret;
5337} 5450}
5338 5451
5339static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5452static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
5340 struct cgroup *cgroup, 5453 struct cgroup_taskset *tset)
5341 struct cgroup_taskset *tset)
5342{ 5454{
5343 mem_cgroup_clear_mc(); 5455 mem_cgroup_clear_mc();
5344} 5456}
@@ -5351,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5351 struct vm_area_struct *vma = walk->private; 5463 struct vm_area_struct *vma = walk->private;
5352 pte_t *pte; 5464 pte_t *pte;
5353 spinlock_t *ptl; 5465 spinlock_t *ptl;
5466 enum mc_target_type target_type;
5467 union mc_target target;
5468 struct page *page;
5469 struct page_cgroup *pc;
5470
5471 /*
5472 * We don't take compound_lock() here but no race with splitting thp
5473 * happens because:
5474 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5475 * under splitting, which means there's no concurrent thp split,
5476 * - if another thread runs into split_huge_page() just after we
5477 * entered this if-block, the thread must wait for page table lock
5478 * to be unlocked in __split_huge_page_splitting(), where the main
5479 * part of thp split is not executed yet.
5480 */
5481 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5482 if (!mc.precharge) {
5483 spin_unlock(&vma->vm_mm->page_table_lock);
5484 return 0;
5485 }
5486 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5487 if (target_type == MC_TARGET_PAGE) {
5488 page = target.page;
5489 if (!isolate_lru_page(page)) {
5490 pc = lookup_page_cgroup(page);
5491 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5492 pc, mc.from, mc.to,
5493 false)) {
5494 mc.precharge -= HPAGE_PMD_NR;
5495 mc.moved_charge += HPAGE_PMD_NR;
5496 }
5497 putback_lru_page(page);
5498 }
5499 put_page(page);
5500 }
5501 spin_unlock(&vma->vm_mm->page_table_lock);
5502 return 0;
5503 }
5354 5504
5355 split_huge_page_pmd(walk->mm, pmd);
5356retry: 5505retry:
5357 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5358 for (; addr != end; addr += PAGE_SIZE) { 5507 for (; addr != end; addr += PAGE_SIZE) {
5359 pte_t ptent = *(pte++); 5508 pte_t ptent = *(pte++);
5360 union mc_target target;
5361 int type;
5362 struct page *page;
5363 struct page_cgroup *pc;
5364 swp_entry_t ent; 5509 swp_entry_t ent;
5365 5510
5366 if (!mc.precharge) 5511 if (!mc.precharge)
5367 break; 5512 break;
5368 5513
5369 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5514 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5370 switch (type) {
5371 case MC_TARGET_PAGE: 5515 case MC_TARGET_PAGE:
5372 page = target.page; 5516 page = target.page;
5373 if (isolate_lru_page(page)) 5517 if (isolate_lru_page(page))
@@ -5380,7 +5524,7 @@ retry:
5380 mc.moved_charge++; 5524 mc.moved_charge++;
5381 } 5525 }
5382 putback_lru_page(page); 5526 putback_lru_page(page);
5383put: /* is_target_pte_for_mc() gets the page */ 5527put: /* get_mctgt_type() gets the page */
5384 put_page(page); 5528 put_page(page);
5385 break; 5529 break;
5386 case MC_TARGET_SWAP: 5530 case MC_TARGET_SWAP:
@@ -5453,9 +5597,8 @@ retry:
5453 up_read(&mm->mmap_sem); 5597 up_read(&mm->mmap_sem);
5454} 5598}
5455 5599
5456static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5600static void mem_cgroup_move_task(struct cgroup *cont,
5457 struct cgroup *cont, 5601 struct cgroup_taskset *tset)
5458 struct cgroup_taskset *tset)
5459{ 5602{
5460 struct task_struct *p = cgroup_taskset_first(tset); 5603 struct task_struct *p = cgroup_taskset_first(tset);
5461 struct mm_struct *mm = get_task_mm(p); 5604 struct mm_struct *mm = get_task_mm(p);
@@ -5470,20 +5613,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5470 mem_cgroup_clear_mc(); 5613 mem_cgroup_clear_mc();
5471} 5614}
5472#else /* !CONFIG_MMU */ 5615#else /* !CONFIG_MMU */
5473static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5616static int mem_cgroup_can_attach(struct cgroup *cgroup,
5474 struct cgroup *cgroup, 5617 struct cgroup_taskset *tset)
5475 struct cgroup_taskset *tset)
5476{ 5618{
5477 return 0; 5619 return 0;
5478} 5620}
5479static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5621static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
5480 struct cgroup *cgroup, 5622 struct cgroup_taskset *tset)
5481 struct cgroup_taskset *tset)
5482{ 5623{
5483} 5624}
5484static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5625static void mem_cgroup_move_task(struct cgroup *cont,
5485 struct cgroup *cont, 5626 struct cgroup_taskset *tset)
5486 struct cgroup_taskset *tset)
5487{ 5627{
5488} 5628}
5489#endif 5629#endif
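One small but deliberate detail above: the old helper returned a plain int and marked MC_TARGET_NONE as "not used", while get_mctgt_type() now returns the enum itself with MC_TARGET_NONE pinned to 0, so callers that only care whether there is anything to move can keep treating the result as a boolean. A self-contained toy showing the same convention (the names here are invented, not the kernel's):

enum target_type { TARGET_NONE = 0, TARGET_PAGE, TARGET_SWAP };

static enum target_type classify(int has_page, int has_swap)
{
        if (has_page)
                return TARGET_PAGE;
        if (has_swap)
                return TARGET_SWAP;
        return TARGET_NONE;             /* == 0, so "if (classify(...))" still works */
}

static int demo(void)
{
        int hits = 0;

        if (classify(1, 0))             /* boolean use: the precharge-style caller */
                hits++;

        switch (classify(0, 1)) {       /* switch use: the move-charge-style caller */
        case TARGET_PAGE:
        case TARGET_SWAP:
                hits++;
                break;
        case TARGET_NONE:
                break;
        }
        return hits;                    /* 2 */
}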
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56080ea3614..97cc2733551 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -187,33 +187,40 @@ int hwpoison_filter(struct page *p)
187EXPORT_SYMBOL_GPL(hwpoison_filter); 187EXPORT_SYMBOL_GPL(hwpoison_filter);
188 188
189/* 189/*
190 * Send all the processes who have the page mapped an ``action optional'' 190 * Send all the processes who have the page mapped a signal.
191 * signal. 191 * ``action optional'' if they are not immediately affected by the error
192 * ``action required'' if error happened in current execution context
192 */ 193 */
193static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, 194static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
194 unsigned long pfn, struct page *page) 195 unsigned long pfn, struct page *page, int flags)
195{ 196{
196 struct siginfo si; 197 struct siginfo si;
197 int ret; 198 int ret;
198 199
199 printk(KERN_ERR 200 printk(KERN_ERR
200 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", 201 "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
201 pfn, t->comm, t->pid); 202 pfn, t->comm, t->pid);
202 si.si_signo = SIGBUS; 203 si.si_signo = SIGBUS;
203 si.si_errno = 0; 204 si.si_errno = 0;
204 si.si_code = BUS_MCEERR_AO;
205 si.si_addr = (void *)addr; 205 si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO 206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno; 207 si.si_trapno = trapno;
208#endif 208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; 209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
210 /* 210
211 * Don't use force here, it's convenient if the signal 211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 * can be temporarily blocked. 212 si.si_code = BUS_MCEERR_AR;
213 * This could cause a loop when the user sets SIGBUS 213 ret = force_sig_info(SIGBUS, &si, t);
214 * to SIG_IGN, but hopefully no one will do that? 214 } else {
215 */ 215 /*
216 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 216 * Don't use force here, it's convenient if the signal
217 * can be temporarily blocked.
218 * This could cause a loop when the user sets SIGBUS
219 * to SIG_IGN, but hopefully no one will do that?
220 */
221 si.si_code = BUS_MCEERR_AO;
222 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
223 }
217 if (ret < 0) 224 if (ret < 0)
218 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", 225 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
219 t->comm, t->pid, ret); 226 t->comm, t->pid, ret);
@@ -338,8 +345,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
338 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
339 * wrong earlier. 346 * wrong earlier.
340 */ 347 */
341static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int doit, int trapno,
342 int fail, struct page *page, unsigned long pfn) 349 int fail, struct page *page, unsigned long pfn,
350 int flags)
343{ 351{
344 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
345 353
@@ -363,8 +371,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
363 * check for that, but we need to tell the 371 * check for that, but we need to tell the
364 * process anyways. 372 * process anyways.
365 */ 373 */
366 else if (kill_proc_ao(tk->tsk, tk->addr, trapno, 374 else if (kill_proc(tk->tsk, tk->addr, trapno,
367 pfn, page) < 0) 375 pfn, page, flags) < 0)
368 printk(KERN_ERR 376 printk(KERN_ERR
369 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", 377 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
370 pfn, tk->tsk->comm, tk->tsk->pid); 378 pfn, tk->tsk->comm, tk->tsk->pid);
@@ -844,7 +852,7 @@ static int page_action(struct page_state *ps, struct page *p,
844 * the pages and send SIGBUS to the processes if the data was dirty. 852 * the pages and send SIGBUS to the processes if the data was dirty.
845 */ 853 */
846static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 854static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
847 int trapno) 855 int trapno, int flags)
848{ 856{
849 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 857 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
850 struct address_space *mapping; 858 struct address_space *mapping;
@@ -962,8 +970,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
962 * use a more force-full uncatchable kill to prevent 970 * use a more force-full uncatchable kill to prevent
963 * any accesses to the poisoned memory. 971 * any accesses to the poisoned memory.
964 */ 972 */
965 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, 973 kill_procs(&tokill, !!PageDirty(ppage), trapno,
966 ret != SWAP_SUCCESS, p, pfn); 974 ret != SWAP_SUCCESS, p, pfn, flags);
967 975
968 return ret; 976 return ret;
969} 977}
@@ -984,7 +992,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
984 ClearPageHWPoison(hpage + i); 992 ClearPageHWPoison(hpage + i);
985} 993}
986 994
987int __memory_failure(unsigned long pfn, int trapno, int flags) 995/**
996 * memory_failure - Handle memory failure of a page.
997 * @pfn: Page Number of the corrupted page
998 * @trapno: Trap number reported in the signal to user space.
999 * @flags: fine tune action taken
1000 *
1001 * This function is called by the low level machine check code
1002 * of an architecture when it detects hardware memory corruption
1003 * of a page. It tries its best to recover, which includes
1004 * dropping pages, killing processes etc.
1005 *
1006 * The function is primarily of use for corruptions that
1007 * happen outside the current execution context (e.g. when
1008 * detected by a background scrubber)
1009 *
1010 * Must run in process context (e.g. a work queue) with interrupts
 1011 * enabled and no spinlocks held.
1012 */
1013int memory_failure(unsigned long pfn, int trapno, int flags)
988{ 1014{
989 struct page_state *ps; 1015 struct page_state *ps;
990 struct page *p; 1016 struct page *p;
@@ -1063,7 +1089,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1063 * The check (unnecessarily) ignores LRU pages being isolated and 1089 * The check (unnecessarily) ignores LRU pages being isolated and
1064 * walked by the page reclaim code, however that's not a big loss. 1090 * walked by the page reclaim code, however that's not a big loss.
1065 */ 1091 */
1066 if (!PageHuge(p) && !PageTransCompound(p)) { 1092 if (!PageHuge(p) && !PageTransTail(p)) {
1067 if (!PageLRU(p)) 1093 if (!PageLRU(p))
1068 shake_page(p, 0); 1094 shake_page(p, 0);
1069 if (!PageLRU(p)) { 1095 if (!PageLRU(p)) {
@@ -1130,7 +1156,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1130 * Now take care of user space mappings. 1156 * Now take care of user space mappings.
1131 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1157 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1132 */ 1158 */
1133 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1159 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
1134 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1160 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1135 res = -EBUSY; 1161 res = -EBUSY;
1136 goto out; 1162 goto out;
@@ -1156,29 +1182,7 @@ out:
1156 unlock_page(hpage); 1182 unlock_page(hpage);
1157 return res; 1183 return res;
1158} 1184}
1159EXPORT_SYMBOL_GPL(__memory_failure); 1185EXPORT_SYMBOL_GPL(memory_failure);
1160
1161/**
1162 * memory_failure - Handle memory failure of a page.
1163 * @pfn: Page Number of the corrupted page
1164 * @trapno: Trap number reported in the signal to user space.
1165 *
1166 * This function is called by the low level machine check code
1167 * of an architecture when it detects hardware memory corruption
1168 * of a page. It tries its best to recover, which includes
1169 * dropping pages, killing processes etc.
1170 *
1171 * The function is primarily of use for corruptions that
1172 * happen outside the current execution context (e.g. when
1173 * detected by a background scrubber)
1174 *
1175 * Must run in process context (e.g. a work queue) with interrupts
1176 * enabled and no spinlocks hold.
1177 */
1178void memory_failure(unsigned long pfn, int trapno)
1179{
1180 __memory_failure(pfn, trapno, 0);
1181}
1182 1186
1183#define MEMORY_FAILURE_FIFO_ORDER 4 1187#define MEMORY_FAILURE_FIFO_ORDER 4
1184#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) 1188#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
@@ -1251,7 +1255,7 @@ static void memory_failure_work_func(struct work_struct *work)
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1255 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten) 1256 if (!gotten)
1253 break; 1257 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags); 1258 memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 } 1259 }
1256} 1260}
1257 1261
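The core of the kill_proc() change is the choice of siginfo: BUS_MCEERR_AR delivered with force_sig_info() when the corruption was hit by the current execution context (MF_ACTION_REQUIRED), and BUS_MCEERR_AO delivered with send_sig_info() otherwise, so the task may block or handle the signal at its leisure. A condensed restatement of that decision as a hypothetical helper (not the patch's exact code; the trap number and error reporting are omitted):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int deliver_mce_sigbus(struct task_struct *t, void *addr,
                              int addr_lsb, int flags)
{
        struct siginfo si;

        si.si_signo    = SIGBUS;
        si.si_errno    = 0;
        si.si_addr     = addr;
        si.si_addr_lsb = addr_lsb;      /* log2 granularity of the poisoned area */

        if ((flags & MF_ACTION_REQUIRED) && t == current) {
                /* The faulting context itself: the signal must not be blocked. */
                si.si_code = BUS_MCEERR_AR;
                return force_sig_info(SIGBUS, &si, t);
        }

        /* Advisory case: a blockable, ignorable heads-up is good enough. */
        si.si_code = BUS_MCEERR_AO;
        return send_sig_info(SIGBUS, &si, t);
}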
diff --git a/mm/memory.c b/mm/memory.c
index fa2f04e0337..6105f475fa8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
125 125
126#if defined(SPLIT_RSS_COUNTING) 126#if defined(SPLIT_RSS_COUNTING)
127 127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128void sync_mm_rss(struct mm_struct *mm)
129{ 129{
130 int i; 130 int i;
131 131
132 for (i = 0; i < NR_MM_COUNTERS; i++) { 132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) { 133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0; 135 current->rss_stat.count[i] = 0;
136 } 136 }
137 } 137 }
138 task->rss_stat.events = 0; 138 current->rss_stat.events = 0;
139} 139}
140 140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
157 if (unlikely(task != current)) 157 if (unlikely(task != current))
158 return; 158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm); 160 sync_mm_rss(task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here...for avoiding to use task_get_mm()..
169 * The caller must guarantee task->mm is not invalid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * counter is updated in asynchronous manner and may go to minus.
174 * But it's never be expected number for users.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184} 161}
185#else /* SPLIT_RSS_COUNTING */ 162#else /* SPLIT_RSS_COUNTING */
186 163
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
661 int i; 638 int i;
662 639
663 if (current->mm == mm) 640 if (current->mm == mm)
664 sync_mm_rss(current, mm); 641 sync_mm_rss(mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++) 642 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i]) 643 if (rss[i])
667 add_mm_counter(mm, i, rss[i]); 644 add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1247 do { 1224 do {
1248 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1249 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1250 if (next-addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1251 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1252 split_huge_page_pmd(vma->vm_mm, pmd); 1229 split_huge_page_pmd(vma->vm_mm, pmd);
1253 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1254 continue; 1231 goto next;
1255 /* fall through */ 1232 /* fall through */
1256 } 1233 }
1257 if (pmd_none_or_clear_bad(pmd)) 1234 /*
1258 continue; 1235 * Here there can be other concurrent MADV_DONTNEED or
1236 * trans huge page faults running, and if the pmd is
1237 * none or trans huge it can change under us. This is
1238 * because MADV_DONTNEED holds the mmap_sem in read
1239 * mode.
1240 */
1241 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1242 goto next;
1259 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1243 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1244next:
1260 cond_resched(); 1245 cond_resched();
1261 } while (pmd++, addr = next, addr != end); 1246 } while (pmd++, addr = next, addr != end);
1262 1247
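The new pmd_none_or_trans_huge_or_clear_bad() check (also adopted below in mempolicy.c and mincore.c) exists for exactly the race the comment describes: with only mmap_sem held for read, a pmd can turn none or trans-huge underneath the walker, and the old pmd_none_or_clear_bad() would have reported such a pmd as corrupt. A generic sketch of a pmd-level loop using it (illustrative function, not from the patch):

#include <linux/mm.h>
#include <asm/pgtable.h>

static void example_walk_pmds(pud_t *pud, unsigned long addr, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long next;

        do {
                next = pmd_addr_end(addr, end);
                /*
                 * Under mmap_sem held for read, concurrent MADV_DONTNEED or
                 * THP faults may flip this pmd to none or trans huge; skip
                 * it quietly instead of flagging it as a bad pmd.
                 */
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
                /* ... descend to the pte level for [addr, next) ... */
        } while (pmd++, addr = next, addr != end);
}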
@@ -1282,10 +1267,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1282 return addr; 1267 return addr;
1283} 1268}
1284 1269
1285static unsigned long unmap_page_range(struct mmu_gather *tlb, 1270static void unmap_page_range(struct mmu_gather *tlb,
1286 struct vm_area_struct *vma, 1271 struct vm_area_struct *vma,
1287 unsigned long addr, unsigned long end, 1272 unsigned long addr, unsigned long end,
1288 struct zap_details *details) 1273 struct zap_details *details)
1289{ 1274{
1290 pgd_t *pgd; 1275 pgd_t *pgd;
1291 unsigned long next; 1276 unsigned long next;
@@ -1305,8 +1290,47 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1305 } while (pgd++, addr = next, addr != end); 1290 } while (pgd++, addr = next, addr != end);
1306 tlb_end_vma(tlb, vma); 1291 tlb_end_vma(tlb, vma);
1307 mem_cgroup_uncharge_end(); 1292 mem_cgroup_uncharge_end();
1293}
1308 1294
1309 return addr; 1295
1296static void unmap_single_vma(struct mmu_gather *tlb,
1297 struct vm_area_struct *vma, unsigned long start_addr,
1298 unsigned long end_addr, unsigned long *nr_accounted,
1299 struct zap_details *details)
1300{
1301 unsigned long start = max(vma->vm_start, start_addr);
1302 unsigned long end;
1303
1304 if (start >= vma->vm_end)
1305 return;
1306 end = min(vma->vm_end, end_addr);
1307 if (end <= vma->vm_start)
1308 return;
1309
1310 if (vma->vm_flags & VM_ACCOUNT)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT;
1312
1313 if (unlikely(is_pfn_mapping(vma)))
1314 untrack_pfn_vma(vma, 0, 0);
1315
1316 if (start != end) {
1317 if (unlikely(is_vm_hugetlb_page(vma))) {
1318 /*
1319 * It is undesirable to test vma->vm_file as it
1320 * should be non-null for valid hugetlb area.
1321 * However, vm_file will be NULL in the error
1322 * cleanup path of do_mmap_pgoff. When
1323 * hugetlbfs ->mmap method fails,
1324 * do_mmap_pgoff() nullifies vma->vm_file
1325 * before calling this function to clean up.
1326 * Since no pte has actually been setup, it is
1327 * safe to do nothing in this case.
1328 */
1329 if (vma->vm_file)
1330 unmap_hugepage_range(vma, start, end, NULL);
1331 } else
1332 unmap_page_range(tlb, vma, start, end, details);
1333 }
1310} 1334}
1311 1335
1312/** 1336/**
@@ -1318,8 +1342,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1318 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 1342 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1319 * @details: details of nonlinear truncation or shared cache invalidation 1343 * @details: details of nonlinear truncation or shared cache invalidation
1320 * 1344 *
1321 * Returns the end address of the unmapping (restart addr if interrupted).
1322 *
1323 * Unmap all pages in the vma list. 1345 * Unmap all pages in the vma list.
1324 * 1346 *
1325 * Only addresses between `start' and `end' will be unmapped. 1347 * Only addresses between `start' and `end' will be unmapped.
@@ -1331,55 +1353,18 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1331 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1353 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1332 * drops the lock and schedules. 1354 * drops the lock and schedules.
1333 */ 1355 */
1334unsigned long unmap_vmas(struct mmu_gather *tlb, 1356void unmap_vmas(struct mmu_gather *tlb,
1335 struct vm_area_struct *vma, unsigned long start_addr, 1357 struct vm_area_struct *vma, unsigned long start_addr,
1336 unsigned long end_addr, unsigned long *nr_accounted, 1358 unsigned long end_addr, unsigned long *nr_accounted,
1337 struct zap_details *details) 1359 struct zap_details *details)
1338{ 1360{
1339 unsigned long start = start_addr;
1340 struct mm_struct *mm = vma->vm_mm; 1361 struct mm_struct *mm = vma->vm_mm;
1341 1362
1342 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1363 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1343 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 1364 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1344 unsigned long end; 1365 unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
1345 1366 details);
1346 start = max(vma->vm_start, start_addr);
1347 if (start >= vma->vm_end)
1348 continue;
1349 end = min(vma->vm_end, end_addr);
1350 if (end <= vma->vm_start)
1351 continue;
1352
1353 if (vma->vm_flags & VM_ACCOUNT)
1354 *nr_accounted += (end - start) >> PAGE_SHIFT;
1355
1356 if (unlikely(is_pfn_mapping(vma)))
1357 untrack_pfn_vma(vma, 0, 0);
1358
1359 while (start != end) {
1360 if (unlikely(is_vm_hugetlb_page(vma))) {
1361 /*
1362 * It is undesirable to test vma->vm_file as it
1363 * should be non-null for valid hugetlb area.
1364 * However, vm_file will be NULL in the error
1365 * cleanup path of do_mmap_pgoff. When
1366 * hugetlbfs ->mmap method fails,
1367 * do_mmap_pgoff() nullifies vma->vm_file
1368 * before calling this function to clean up.
1369 * Since no pte has actually been setup, it is
1370 * safe to do nothing in this case.
1371 */
1372 if (vma->vm_file)
1373 unmap_hugepage_range(vma, start, end, NULL);
1374
1375 start = end;
1376 } else
1377 start = unmap_page_range(tlb, vma, start, end, details);
1378 }
1379 }
1380
1381 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1367 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1382 return start; /* which is now the end (or restart) address */
1383} 1368}
1384 1369
1385/** 1370/**
@@ -1388,8 +1373,10 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1388 * @address: starting address of pages to zap 1373 * @address: starting address of pages to zap
1389 * @size: number of bytes to zap 1374 * @size: number of bytes to zap
1390 * @details: details of nonlinear truncation or shared cache invalidation 1375 * @details: details of nonlinear truncation or shared cache invalidation
1376 *
1377 * Caller must protect the VMA list
1391 */ 1378 */
1392unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 1379void zap_page_range(struct vm_area_struct *vma, unsigned long address,
1393 unsigned long size, struct zap_details *details) 1380 unsigned long size, struct zap_details *details)
1394{ 1381{
1395 struct mm_struct *mm = vma->vm_mm; 1382 struct mm_struct *mm = vma->vm_mm;
@@ -1400,9 +1387,34 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1400 lru_add_drain(); 1387 lru_add_drain();
1401 tlb_gather_mmu(&tlb, mm, 0); 1388 tlb_gather_mmu(&tlb, mm, 0);
1402 update_hiwater_rss(mm); 1389 update_hiwater_rss(mm);
1403 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1390 unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1391 tlb_finish_mmu(&tlb, address, end);
1392}
1393
1394/**
1395 * zap_page_range_single - remove user pages in a given range
1396 * @vma: vm_area_struct holding the applicable pages
1397 * @address: starting address of pages to zap
1398 * @size: number of bytes to zap
1399 * @details: details of nonlinear truncation or shared cache invalidation
1400 *
1401 * The range must fit into one VMA.
1402 */
1403static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1404 unsigned long size, struct zap_details *details)
1405{
1406 struct mm_struct *mm = vma->vm_mm;
1407 struct mmu_gather tlb;
1408 unsigned long end = address + size;
1409 unsigned long nr_accounted = 0;
1410
1411 lru_add_drain();
1412 tlb_gather_mmu(&tlb, mm, 0);
1413 update_hiwater_rss(mm);
1414 mmu_notifier_invalidate_range_start(mm, address, end);
1415 unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
1416 mmu_notifier_invalidate_range_end(mm, address, end);
1404 tlb_finish_mmu(&tlb, address, end); 1417 tlb_finish_mmu(&tlb, address, end);
1405 return end;
1406} 1418}
1407 1419
1408/** 1420/**
@@ -1423,7 +1435,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1423 if (address < vma->vm_start || address + size > vma->vm_end || 1435 if (address < vma->vm_start || address + size > vma->vm_end ||
1424 !(vma->vm_flags & VM_PFNMAP)) 1436 !(vma->vm_flags & VM_PFNMAP))
1425 return -1; 1437 return -1;
1426 zap_page_range(vma, address, size, NULL); 1438 zap_page_range_single(vma, address, size, NULL);
1427 return 0; 1439 return 0;
1428} 1440}
1429EXPORT_SYMBOL_GPL(zap_vma_ptes); 1441EXPORT_SYMBOL_GPL(zap_vma_ptes);
@@ -2447,7 +2459,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2447 * fails, we just zero-fill it. Live with it. 2459 * fails, we just zero-fill it. Live with it.
2448 */ 2460 */
2449 if (unlikely(!src)) { 2461 if (unlikely(!src)) {
2450 void *kaddr = kmap_atomic(dst, KM_USER0); 2462 void *kaddr = kmap_atomic(dst);
2451 void __user *uaddr = (void __user *)(va & PAGE_MASK); 2463 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2452 2464
2453 /* 2465 /*
@@ -2458,7 +2470,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2458 */ 2470 */
2459 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2471 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2460 clear_page(kaddr); 2472 clear_page(kaddr);
2461 kunmap_atomic(kaddr, KM_USER0); 2473 kunmap_atomic(kaddr);
2462 flush_dcache_page(dst); 2474 flush_dcache_page(dst);
2463 } else 2475 } else
2464 copy_user_highpage(dst, src, va, vma); 2476 copy_user_highpage(dst, src, va, vma);
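The kmap_atomic()/kunmap_atomic() calls above reflect the tree-wide interface change this series rides on: the KM_USER0-style slot argument is gone, and the unmap takes the kernel address returned by the map. A small example of the new convention (hypothetical helper; it assumes the copy fits within one page):

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_into_page(struct page *dst, const void *src, size_t len)
{
        void *kaddr = kmap_atomic(dst);         /* no KM_USER0 slot any more */

        memcpy(kaddr, src, len);                /* caller guarantees len <= PAGE_SIZE */
        kunmap_atomic(kaddr);                   /* takes the mapped address, not a slot */
        flush_dcache_page(dst);
}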
@@ -2770,7 +2782,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2770 unsigned long start_addr, unsigned long end_addr, 2782 unsigned long start_addr, unsigned long end_addr,
2771 struct zap_details *details) 2783 struct zap_details *details)
2772{ 2784{
2773 zap_page_range(vma, start_addr, end_addr - start_addr, details); 2785 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2774} 2786}
2775 2787
2776static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2788static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -3611,13 +3623,7 @@ static int __init gate_vma_init(void)
3611 gate_vma.vm_end = FIXADDR_USER_END; 3623 gate_vma.vm_end = FIXADDR_USER_END;
3612 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3624 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3613 gate_vma.vm_page_prot = __P101; 3625 gate_vma.vm_page_prot = __P101;
3614 /* 3626
3615 * Make sure the vDSO gets into every core dump.
3616 * Dumping its contents makes post-mortem fully interpretable later
3617 * without matching up the same kernel and hardware config to see
3618 * what PC values meant.
3619 */
3620 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3621 return 0; 3627 return 0;
3622} 3628}
3623__initcall(gate_vma_init); 3629__initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 06b145fb64a..cfb6c867875 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
512 do { 512 do {
513 next = pmd_addr_end(addr, end); 513 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 514 split_huge_page_pmd(vma->vm_mm, pmd);
515 if (pmd_none_or_clear_bad(pmd)) 515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 516 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 517 if (check_pte_range(vma, pmd, addr, next, nodes,
518 flags, private)) 518 flags, private))
@@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
640 unsigned long vmstart; 640 unsigned long vmstart;
641 unsigned long vmend; 641 unsigned long vmend;
642 642
643 vma = find_vma_prev(mm, start, &prev); 643 vma = find_vma(mm, start);
644 if (!vma || vma->vm_start > start) 644 if (!vma || vma->vm_start > start)
645 return -EFAULT; 645 return -EFAULT;
646 646
647 prev = vma->vm_prev;
647 if (start > vma->vm_start) 648 if (start > vma->vm_start)
648 prev = vma; 649 prev = vma;
649 650
@@ -1322,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 err = -ESRCH; 1323 err = -ESRCH;
1323 goto out; 1324 goto out;
1324 } 1325 }
1325 mm = get_task_mm(task); 1326 get_task_struct(task);
1326 rcu_read_unlock();
1327 1327
1328 err = -EINVAL; 1328 err = -EINVAL;
1329 if (!mm)
1330 goto out;
1331 1329
1332 /* 1330 /*
1333 * Check if this process has the right to modify the specified 1331 * Check if this process has the right to modify the specified
@@ -1335,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1335 * capabilities, superuser privileges or the same 1333 * capabilities, superuser privileges or the same
1336 * userid as the target process. 1334 * userid as the target process.
1337 */ 1335 */
1338 rcu_read_lock();
1339 tcred = __task_cred(task); 1336 tcred = __task_cred(task);
1340 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1341 cred->uid != tcred->suid && cred->uid != tcred->uid && 1338 cred->uid != tcred->suid && cred->uid != tcred->uid &&
1342 !capable(CAP_SYS_NICE)) { 1339 !capable(CAP_SYS_NICE)) {
1343 rcu_read_unlock(); 1340 rcu_read_unlock();
1344 err = -EPERM; 1341 err = -EPERM;
1345 goto out; 1342 goto out_put;
1346 } 1343 }
1347 rcu_read_unlock(); 1344 rcu_read_unlock();
1348 1345
@@ -1350,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1350 /* Is the user allowed to access the target nodes? */ 1347 /* Is the user allowed to access the target nodes? */
1351 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1348 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1352 err = -EPERM; 1349 err = -EPERM;
1353 goto out; 1350 goto out_put;
1354 } 1351 }
1355 1352
1356 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1353 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1357 err = -EINVAL; 1354 err = -EINVAL;
1358 goto out; 1355 goto out_put;
1359 } 1356 }
1360 1357
1361 err = security_task_movememory(task); 1358 err = security_task_movememory(task);
1362 if (err) 1359 if (err)
1363 goto out; 1360 goto out_put;
1364 1361
1365 err = do_migrate_pages(mm, old, new, 1362 mm = get_task_mm(task);
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1363 put_task_struct(task);
1367out:
1368 if (mm) 1364 if (mm)
1369 mmput(mm); 1365 err = do_migrate_pages(mm, old, new,
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL;
1369
1370 mmput(mm);
1371out:
1370 NODEMASK_SCRATCH_FREE(scratch); 1372 NODEMASK_SCRATCH_FREE(scratch);
1371 1373
1372 return err; 1374 return err;
1375
1376out_put:
1377 put_task_struct(task);
1378 goto out;
1379
1373} 1380}
1374 1381
1375 1382
@@ -1843,18 +1850,24 @@ struct page *
1843alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1850alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1844 unsigned long addr, int node) 1851 unsigned long addr, int node)
1845{ 1852{
1846 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1853 struct mempolicy *pol;
1847 struct zonelist *zl; 1854 struct zonelist *zl;
1848 struct page *page; 1855 struct page *page;
1856 unsigned int cpuset_mems_cookie;
1857
1858retry_cpuset:
1859 pol = get_vma_policy(current, vma, addr);
1860 cpuset_mems_cookie = get_mems_allowed();
1849 1861
1850 get_mems_allowed();
1851 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1862 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1852 unsigned nid; 1863 unsigned nid;
1853 1864
1854 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1865 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1855 mpol_cond_put(pol); 1866 mpol_cond_put(pol);
1856 page = alloc_page_interleave(gfp, order, nid); 1867 page = alloc_page_interleave(gfp, order, nid);
1857 put_mems_allowed(); 1868 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1869 goto retry_cpuset;
1870
1858 return page; 1871 return page;
1859 } 1872 }
1860 zl = policy_zonelist(gfp, pol, node); 1873 zl = policy_zonelist(gfp, pol, node);
@@ -1865,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1865 struct page *page = __alloc_pages_nodemask(gfp, order, 1878 struct page *page = __alloc_pages_nodemask(gfp, order,
1866 zl, policy_nodemask(gfp, pol)); 1879 zl, policy_nodemask(gfp, pol));
1867 __mpol_put(pol); 1880 __mpol_put(pol);
1868 put_mems_allowed(); 1881 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1882 goto retry_cpuset;
1869 return page; 1883 return page;
1870 } 1884 }
1871 /* 1885 /*
@@ -1873,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1873 */ 1887 */
1874 page = __alloc_pages_nodemask(gfp, order, zl, 1888 page = __alloc_pages_nodemask(gfp, order, zl,
1875 policy_nodemask(gfp, pol)); 1889 policy_nodemask(gfp, pol));
1876 put_mems_allowed(); 1890 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1891 goto retry_cpuset;
1877 return page; 1892 return page;
1878} 1893}
1879 1894
@@ -1900,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1900{ 1915{
1901 struct mempolicy *pol = current->mempolicy; 1916 struct mempolicy *pol = current->mempolicy;
1902 struct page *page; 1917 struct page *page;
1918 unsigned int cpuset_mems_cookie;
1903 1919
1904 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1920 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1905 pol = &default_policy; 1921 pol = &default_policy;
1906 1922
1907 get_mems_allowed(); 1923retry_cpuset:
1924 cpuset_mems_cookie = get_mems_allowed();
1925
1908 /* 1926 /*
1909 * No reference counting needed for current->mempolicy 1927 * No reference counting needed for current->mempolicy
1910 * nor system default_policy 1928 * nor system default_policy
@@ -1915,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1915 page = __alloc_pages_nodemask(gfp, order, 1933 page = __alloc_pages_nodemask(gfp, order,
1916 policy_zonelist(gfp, pol, numa_node_id()), 1934 policy_zonelist(gfp, pol, numa_node_id()),
1917 policy_nodemask(gfp, pol)); 1935 policy_nodemask(gfp, pol));
1918 put_mems_allowed(); 1936
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939
1919 return page; 1940 return page;
1920} 1941}
1921EXPORT_SYMBOL(alloc_pages_current); 1942EXPORT_SYMBOL(alloc_pages_current);
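All of the allocation paths in this file now use the same seqcount-style idiom: sample a cookie with get_mems_allowed(), allocate, and retry only if put_mems_allowed() reports that the cpuset's allowed nodes changed mid-allocation and the allocation also failed, so a failure caused purely by a stale nodemask is never returned to the caller. A minimal sketch of the idiom with an invented wrapper name:

#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *alloc_pages_stable_mems(gfp_t gfp, unsigned int order)
{
        struct page *page;
        unsigned int cpuset_mems_cookie;

retry_cpuset:
        cpuset_mems_cookie = get_mems_allowed();        /* begin read section */

        page = alloc_pages(gfp, order);

        /*
         * put_mems_allowed() returns false if mems_allowed was rewritten
         * while we were allocating; only then can a NULL page be blamed on
         * the stale nodemask rather than on genuine memory pressure.
         */
        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;

        return page;
}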
diff --git a/mm/migrate.c b/mm/migrate.c
index 9871a56d82c..51c08a0c6f6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,7 +445,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
445 ClearPageSwapCache(page); 445 ClearPageSwapCache(page);
446 ClearPagePrivate(page); 446 ClearPagePrivate(page);
447 set_page_private(page, 0); 447 set_page_private(page, 0);
448 page->mapping = NULL;
449 448
450 /* 449 /*
451 * If any waiters have accumulated on the new page then 450 * If any waiters have accumulated on the new page then
@@ -667,6 +666,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
667 } else { 666 } else {
668 if (remap_swapcache) 667 if (remap_swapcache)
669 remove_migration_ptes(page, newpage); 668 remove_migration_ptes(page, newpage);
669 page->mapping = NULL;
670 } 670 }
671 671
672 unlock_page(newpage); 672 unlock_page(newpage);
@@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
839 if (!newpage) 839 if (!newpage)
840 return -ENOMEM; 840 return -ENOMEM;
841 841
842 mem_cgroup_reset_owner(newpage);
843
844 if (page_count(page) == 1) { 842 if (page_count(page) == 1) {
845 /* page was freed from under us. So we are done. */ 843 /* page was freed from under us. So we are done. */
846 goto out; 844 goto out;
@@ -1176,20 +1174,17 @@ set_status:
1176 * Migrate an array of page address onto an array of nodes and fill 1174 * Migrate an array of page address onto an array of nodes and fill
1177 * the corresponding array of status. 1175 * the corresponding array of status.
1178 */ 1176 */
1179static int do_pages_move(struct mm_struct *mm, struct task_struct *task, 1177static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1180 unsigned long nr_pages, 1178 unsigned long nr_pages,
1181 const void __user * __user *pages, 1179 const void __user * __user *pages,
1182 const int __user *nodes, 1180 const int __user *nodes,
1183 int __user *status, int flags) 1181 int __user *status, int flags)
1184{ 1182{
1185 struct page_to_node *pm; 1183 struct page_to_node *pm;
1186 nodemask_t task_nodes;
1187 unsigned long chunk_nr_pages; 1184 unsigned long chunk_nr_pages;
1188 unsigned long chunk_start; 1185 unsigned long chunk_start;
1189 int err; 1186 int err;
1190 1187
1191 task_nodes = cpuset_mems_allowed(task);
1192
1193 err = -ENOMEM; 1188 err = -ENOMEM;
1194 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 1189 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1195 if (!pm) 1190 if (!pm)
@@ -1351,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1351 struct task_struct *task; 1346 struct task_struct *task;
1352 struct mm_struct *mm; 1347 struct mm_struct *mm;
1353 int err; 1348 int err;
1349 nodemask_t task_nodes;
1354 1350
1355 /* Check flags */ 1351 /* Check flags */
1356 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1352 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1366,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1366 rcu_read_unlock(); 1362 rcu_read_unlock();
1367 return -ESRCH; 1363 return -ESRCH;
1368 } 1364 }
1369 mm = get_task_mm(task); 1365 get_task_struct(task);
1370 rcu_read_unlock();
1371
1372 if (!mm)
1373 return -EINVAL;
1374 1366
1375 /* 1367 /*
1376 * Check if this process has the right to modify the specified 1368 * Check if this process has the right to modify the specified
@@ -1378,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1378 * capabilities, superuser privileges or the same 1370 * capabilities, superuser privileges or the same
1379 * userid as the target process. 1371 * userid as the target process.
1380 */ 1372 */
1381 rcu_read_lock();
1382 tcred = __task_cred(task); 1373 tcred = __task_cred(task);
1383 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1384 cred->uid != tcred->suid && cred->uid != tcred->uid && 1375 cred->uid != tcred->suid && cred->uid != tcred->uid &&
@@ -1393,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1393 if (err) 1384 if (err)
1394 goto out; 1385 goto out;
1395 1386
1396 if (nodes) { 1387 task_nodes = cpuset_mems_allowed(task);
1397 err = do_pages_move(mm, task, nr_pages, pages, nodes, status, 1388 mm = get_task_mm(task);
1398 flags); 1389 put_task_struct(task);
1399 } else { 1390
1400 err = do_pages_stat(mm, nr_pages, pages, status); 1391 if (mm) {
1401 } 1392 if (nodes)
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1394 nodes, status, flags);
1395 else
1396 err = do_pages_stat(mm, nr_pages, pages, status);
1397 } else
1398 err = -EINVAL;
1402 1399
1403out:
1404 mmput(mm); 1400 mmput(mm);
1405 return err; 1401 return err;
1402
1403out:
1404 put_task_struct(task);
1405 return err;
1406} 1406}
1407 1407
1408/* 1408/*
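sys_move_pages() here and sys_migrate_pages() in mempolicy.c above now share the same reference discipline: pin the task with get_task_struct() so the RCU read section can end before any sleeping work, read cpuset_mems_allowed() while the task reference is held, and convert the task reference into an mm reference as late as possible. A condensed sketch of that ordering (hypothetical function; the credential and security checks are elided to comments):

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

static int act_on_target_mm(pid_t pid)
{
        struct task_struct *task;
        struct mm_struct *mm;
        int err = 0;

        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                return -ESRCH;
        }
        get_task_struct(task);          /* pin the task beyond the RCU section */

        /* ... __task_cred()-based permission check, still under RCU ... */
        rcu_read_unlock();

        /* ... capability / security_task_movememory()-style checks ... */

        mm = get_task_mm(task);         /* NULL if the task already lost its mm */
        put_task_struct(task);          /* from here on only the mm is needed */
        if (!mm)
                return -EINVAL;

        /* ... operate on mm ... */

        mmput(mm);
        return err;
}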
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff..936b4cee8cb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
164 } 164 }
165 /* fall through */ 165 /* fall through */
166 } 166 }
167 if (pmd_none_or_clear_bad(pmd)) 167 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
168 mincore_unmapped_range(vma, addr, next, vec); 168 mincore_unmapped_range(vma, addr, next, vec);
169 else 169 else
170 mincore_pte_range(vma, pmd, addr, next, vec); 170 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4f4f53bdc65..ef726e8aa8e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 return -EINVAL; 385 return -EINVAL;
386 if (end == start) 386 if (end == start)
387 return 0; 387 return 0;
388 vma = find_vma_prev(current->mm, start, &prev); 388 vma = find_vma(current->mm, start);
389 if (!vma || vma->vm_start > start) 389 if (!vma || vma->vm_start > start)
390 return -ENOMEM; 390 return -ENOMEM;
391 391
392 prev = vma->vm_prev;
392 if (start > vma->vm_start) 393 if (start > vma->vm_start)
393 prev = vma; 394 prev = vma;
394 395
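do_mlock() above, mbind_range() and sys_mprotect() elsewhere in this series all switch from find_vma_prev() to plain find_vma() plus the vma's own vm_prev link, since the predecessor comes for free from the VMA list once a covering vma has been found. A sketch of the idiom wrapped in a hypothetical helper:

#include <linux/mm.h>

static struct vm_area_struct *vma_covering(struct mm_struct *mm,
                                           unsigned long start,
                                           struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma = find_vma(mm, start);

        if (!vma || vma->vm_start > start)
                return NULL;            /* 'start' lies in a hole or above all vmas */

        *pprev = vma->vm_prev;          /* no second lookup needed */
        return vma;
}

find_vma_prev() itself survives for callers that still want a predecessor even when no vma covers the address; as the mmap.c hunk below shows, it now walks the rbtree rightwards to find that predecessor instead of returning a NULL *pprev.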
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f758c7f4c8..a7bf6a31c9f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
451} 451}
452 452
453/* 453/*
454 * Helper for vma_adjust in the split_vma insert case: 454 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
455 * insert vm structure into list and rbtree and anon_vma, 455 * mm's list and rbtree. It has already been inserted into the prio_tree.
456 * but it has already been inserted into prio_tree earlier.
457 */ 456 */
458static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 457static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
459{ 458{
@@ -936,6 +935,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
936#endif /* CONFIG_PROC_FS */ 935#endif /* CONFIG_PROC_FS */
937 936
938/* 937/*
 938 * If a hint addr is less than mmap_min_addr, change the hint to be as
 939 * low as possible but still greater than mmap_min_addr.
940 */
941static inline unsigned long round_hint_to_min(unsigned long hint)
942{
943 hint &= PAGE_MASK;
944 if (((void *)hint != NULL) &&
945 (hint < mmap_min_addr))
946 return PAGE_ALIGN(mmap_min_addr);
947 return hint;
948}
949
950/*
939 * The caller must hold down_write(&current->mm->mmap_sem). 951 * The caller must hold down_write(&current->mm->mmap_sem).
940 */ 952 */
941 953
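round_hint_to_min(), added just above, keeps non-NULL mmap() hints out of the protected low address range. Worked examples of its effect, assuming PAGE_SIZE == 4096 and mmap_min_addr == 0x10000 (both values are assumptions made for the illustration):

/*
 * round_hint_to_min(0)       -> 0         a NULL hint is passed through untouched
 * round_hint_to_min(0x1234)  -> 0x10000   page-masked to 0x1000, which is below
 *                                         mmap_min_addr, so bumped up to
 *                                         PAGE_ALIGN(mmap_min_addr)
 * round_hint_to_min(0x20001) -> 0x20000   only page-masked; already acceptable
 */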
@@ -1099,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1099 * A dummy user value is used because we are not locking 1111 * A dummy user value is used because we are not locking
1100 * memory so no accounting is necessary 1112 * memory so no accounting is necessary
1101 */ 1113 */
1102 len = ALIGN(len, huge_page_size(&default_hstate)); 1114 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1103 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, 1115 VM_NORESERVE, &user,
1104 &user, HUGETLB_ANONHUGE_INODE); 1116 HUGETLB_ANONHUGE_INODE);
1105 if (IS_ERR(file)) 1117 if (IS_ERR(file))
1106 return PTR_ERR(file); 1118 return PTR_ERR(file);
1107 } 1119 }
@@ -1235,7 +1247,7 @@ munmap_back:
1235 */ 1247 */
1236 if (accountable_mapping(file, vm_flags)) { 1248 if (accountable_mapping(file, vm_flags)) {
1237 charged = len >> PAGE_SHIFT; 1249 charged = len >> PAGE_SHIFT;
1238 if (security_vm_enough_memory(charged)) 1250 if (security_vm_enough_memory_mm(mm, charged))
1239 return -ENOMEM; 1251 return -ENOMEM;
1240 vm_flags |= VM_ACCOUNT; 1252 vm_flags |= VM_ACCOUNT;
1241 } 1253 }
@@ -1266,8 +1278,9 @@ munmap_back:
1266 vma->vm_pgoff = pgoff; 1278 vma->vm_pgoff = pgoff;
1267 INIT_LIST_HEAD(&vma->anon_vma_chain); 1279 INIT_LIST_HEAD(&vma->anon_vma_chain);
1268 1280
1281 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1282
1269 if (file) { 1283 if (file) {
1270 error = -EINVAL;
1271 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1284 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1272 goto free_vma; 1285 goto free_vma;
1273 if (vm_flags & VM_DENYWRITE) { 1286 if (vm_flags & VM_DENYWRITE) {
@@ -1293,6 +1306,8 @@ munmap_back:
1293 pgoff = vma->vm_pgoff; 1306 pgoff = vma->vm_pgoff;
1294 vm_flags = vma->vm_flags; 1307 vm_flags = vma->vm_flags;
1295 } else if (vm_flags & VM_SHARED) { 1308 } else if (vm_flags & VM_SHARED) {
1309 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1310 goto free_vma;
1296 error = shmem_zero_setup(vma); 1311 error = shmem_zero_setup(vma);
1297 if (error) 1312 if (error)
1298 goto free_vma; 1313 goto free_vma;
@@ -1423,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1423 /* 1438 /*
1424 * Is this a new hole at the lowest possible address? 1439 * Is this a new hole at the lowest possible address?
1425 */ 1440 */
1426 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { 1441 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1427 mm->free_area_cache = addr; 1442 mm->free_area_cache = addr;
1428 mm->cached_hole_size = ~0UL;
1429 }
1430} 1443}
1431 1444
1432/* 1445/*
@@ -1441,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1441{ 1454{
1442 struct vm_area_struct *vma; 1455 struct vm_area_struct *vma;
1443 struct mm_struct *mm = current->mm; 1456 struct mm_struct *mm = current->mm;
1444 unsigned long addr = addr0; 1457 unsigned long addr = addr0, start_addr;
1445 1458
1446 /* requested length too big for entire address space */ 1459 /* requested length too big for entire address space */
1447 if (len > TASK_SIZE) 1460 if (len > TASK_SIZE)
@@ -1465,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1465 mm->free_area_cache = mm->mmap_base; 1478 mm->free_area_cache = mm->mmap_base;
1466 } 1479 }
1467 1480
1481try_again:
1468 /* either no address requested or can't fit in requested address hole */ 1482 /* either no address requested or can't fit in requested address hole */
1469 addr = mm->free_area_cache; 1483 start_addr = addr = mm->free_area_cache;
1470
1471 /* make sure it can fit in the remaining address space */
1472 if (addr > len) {
1473 vma = find_vma(mm, addr-len);
1474 if (!vma || addr <= vma->vm_start)
1475 /* remember the address as a hint for next time */
1476 return (mm->free_area_cache = addr-len);
1477 }
1478 1484
1479 if (mm->mmap_base < len) 1485 if (addr < len)
1480 goto bottomup; 1486 goto fail;
1481
1482 addr = mm->mmap_base-len;
1483 1487
1488 addr -= len;
1484 do { 1489 do {
1485 /* 1490 /*
1486 * Lookup failure means no vma is above this address, 1491 * Lookup failure means no vma is above this address,
@@ -1500,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1500 addr = vma->vm_start-len; 1505 addr = vma->vm_start-len;
1501 } while (len < vma->vm_start); 1506 } while (len < vma->vm_start);
1502 1507
1503bottomup: 1508fail:
1509 /*
1510 * if hint left us with no space for the requested
1511 * mapping then try again:
1512 *
 1513 * Note: this differs from the bottom-up case, which does a full
 1514 * linear search; here we use find_vma(), which may skip over
 1515 * some holes.
1516 */
1517 if (start_addr != mm->mmap_base) {
1518 mm->free_area_cache = mm->mmap_base;
1519 mm->cached_hole_size = 0;
1520 goto try_again;
1521 }
1522
1504 /* 1523 /*
1505 * A failed mmap() very likely causes application failure, 1524 * A failed mmap() very likely causes application failure,
1506 * so fall back to the bottom-up function here. This scenario 1525 * so fall back to the bottom-up function here. This scenario
@@ -1605,7 +1624,6 @@ EXPORT_SYMBOL(find_vma);
1605 1624
1606/* 1625/*
1607 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 1626 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
1608 * Note: pprev is set to NULL when return value is NULL.
1609 */ 1627 */
1610struct vm_area_struct * 1628struct vm_area_struct *
1611find_vma_prev(struct mm_struct *mm, unsigned long addr, 1629find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1614,7 +1632,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
1614 struct vm_area_struct *vma; 1632 struct vm_area_struct *vma;
1615 1633
1616 vma = find_vma(mm, addr); 1634 vma = find_vma(mm, addr);
1617 *pprev = vma ? vma->vm_prev : NULL; 1635 if (vma) {
1636 *pprev = vma->vm_prev;
1637 } else {
1638 struct rb_node *rb_node = mm->mm_rb.rb_node;
1639 *pprev = NULL;
1640 while (rb_node) {
1641 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1642 rb_node = rb_node->rb_right;
1643 }
1644 }
1618 return vma; 1645 return vma;
1619} 1646}
1620 1647
@@ -2169,7 +2196,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2169 if (mm->map_count > sysctl_max_map_count) 2196 if (mm->map_count > sysctl_max_map_count)
2170 return -ENOMEM; 2197 return -ENOMEM;
2171 2198
2172 if (security_vm_enough_memory(len >> PAGE_SHIFT)) 2199 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2173 return -ENOMEM; 2200 return -ENOMEM;
2174 2201
2175 /* Can we just expand an old private anonymous mapping? */ 2202 /* Can we just expand an old private anonymous mapping? */
@@ -2213,7 +2240,6 @@ void exit_mmap(struct mm_struct *mm)
2213 struct mmu_gather tlb; 2240 struct mmu_gather tlb;
2214 struct vm_area_struct *vma; 2241 struct vm_area_struct *vma;
2215 unsigned long nr_accounted = 0; 2242 unsigned long nr_accounted = 0;
2216 unsigned long end;
2217 2243
2218 /* mm's last user has gone, and its about to be pulled down */ 2244 /* mm's last user has gone, and its about to be pulled down */
2219 mmu_notifier_release(mm); 2245 mmu_notifier_release(mm);
@@ -2238,11 +2264,11 @@ void exit_mmap(struct mm_struct *mm)
2238 tlb_gather_mmu(&tlb, mm, 1); 2264 tlb_gather_mmu(&tlb, mm, 1);
2239 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2265 /* update_hiwater_rss(mm) here? but nobody should be looking */
2240 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2266 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2241 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2267 unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2242 vm_unacct_memory(nr_accounted); 2268 vm_unacct_memory(nr_accounted);
2243 2269
2244 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2270 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2245 tlb_finish_mmu(&tlb, 0, end); 2271 tlb_finish_mmu(&tlb, 0, -1);
2246 2272
2247 /* 2273 /*
2248 * Walk the list again, actually closing and freeing it, 2274 * Walk the list again, actually closing and freeing it,
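Two behavioural changes in the mm/mmap.c hunks above are easy to miss in the side-by-side view: the top-down unmapped-area search now widens back to mmap_base once before falling back to bottom-up, and find_vma_prev() now returns the highest VMA in *pprev when the address lies above every mapping. The latter is just a walk down right children to the rightmost rb-tree node; a stand-alone illustration (editorial sketch, not kernel code, with a plain binary tree standing in for struct rb_node):

#include <stdio.h>

/*
 * When find_vma() returns NULL the address is above every VMA, so the
 * "previous" VMA is the rightmost node of the tree, reached by
 * following right children (the kernel walks rb_node->rb_right the
 * same way in the hunk above).
 */
struct node {
	unsigned long start;
	struct node *left, *right;
};

static struct node *rightmost(struct node *n)
{
	struct node *last = NULL;

	while (n) {
		last = n;
		n = n->right;
	}
	return last;	/* NULL only when the tree is empty */
}

int main(void)
{
	struct node low  = { 0x1000, NULL, NULL };
	struct node high = { 0x3000, NULL, NULL };
	struct node root = { 0x2000, &low, &high };

	printf("*pprev would start at 0x%lx\n", rightmost(&root)->start);
	return 0;
}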
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080..3dcfaf4ed35 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm); 56 sync_mm_rss(mm);
57 tsk->mm = NULL; 57 tsk->mm = NULL;
58 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
59 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5a688a2756b..a40992610ab 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
60 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
61 61
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) { 63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
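This hunk (and the matching ones in mm/rmap.c further down) replaces the PAGE_MIGRATION constant with IS_ENABLED(CONFIG_MIGRATION), so the migration branch is always parsed and type-checked and merely dead-code-eliminated when the option is off. A simplified, self-contained sketch of the preprocessor trick behind IS_ENABLED (condensed from include/linux/kconfig.h; macro names shortened, module handling omitted):

#include <stdio.h>

/*
 * When CONFIG_X is #defined to 1 the placeholder expands to "0," and
 * the selector picks the 1; when CONFIG_X is undefined the junk token
 * stays in the first slot and 0 is picked instead.
 */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second(ignored, val, ...) val
#define __is_defined(arg_or_junk) __take_second(arg_or_junk 1, 0)
#define _IS_ENABLED(val) __is_defined(__ARG_PLACEHOLDER_##val)
#define IS_ENABLED(cfg) _IS_ENABLED(cfg)

#define CONFIG_MIGRATION 1	/* comment out to get the disabled case */

int main(void)
{
	if (IS_ENABLED(CONFIG_MIGRATION))	/* folds to a constant 1 or 0 */
		printf("CONFIG_MIGRATION enabled\n");
	else
		printf("CONFIG_MIGRATION disabled\n");
	return 0;
}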
@@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
168 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| 168 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
169 VM_SHARED|VM_NORESERVE))) { 169 VM_SHARED|VM_NORESERVE))) {
170 charged = nrpages; 170 charged = nrpages;
171 if (security_vm_enough_memory(charged)) 171 if (security_vm_enough_memory_mm(mm, charged))
172 return -ENOMEM; 172 return -ENOMEM;
173 newflags |= VM_ACCOUNT; 173 newflags |= VM_ACCOUNT;
174 } 174 }
@@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
262 262
263 down_write(&current->mm->mmap_sem); 263 down_write(&current->mm->mmap_sem);
264 264
265 vma = find_vma_prev(current->mm, start, &prev); 265 vma = find_vma(current->mm, start);
266 error = -ENOMEM; 266 error = -ENOMEM;
267 if (!vma) 267 if (!vma)
268 goto out; 268 goto out;
269 prev = vma->vm_prev;
269 if (unlikely(grows & PROT_GROWSDOWN)) { 270 if (unlikely(grows & PROT_GROWSDOWN)) {
270 if (vma->vm_start >= end) 271 if (vma->vm_start >= end)
271 goto out; 272 goto out;
diff --git a/mm/mremap.c b/mm/mremap.c
index 87bb8393e7d..db8d983b5a7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
329 329
330 if (vma->vm_flags & VM_ACCOUNT) { 330 if (vma->vm_flags & VM_ACCOUNT) {
331 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 331 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
332 if (security_vm_enough_memory(charged)) 332 if (security_vm_enough_memory_mm(mm, charged))
333 goto Efault; 333 goto Efault;
334 *p = charged; 334 *p = charged;
335 } 335 }
diff --git a/mm/nommu.c b/mm/nommu.c
index b982290fd96..f59e170fceb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -696,9 +696,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
696 if (vma->vm_file) { 696 if (vma->vm_file) {
697 mapping = vma->vm_file->f_mapping; 697 mapping = vma->vm_file->f_mapping;
698 698
699 mutex_lock(&mapping->i_mmap_mutex);
699 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
700 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_prio_tree_insert(vma, &mapping->i_mmap);
701 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex);
702 } 704 }
703 705
704 /* add the VMA to the tree */ 706 /* add the VMA to the tree */
@@ -760,9 +762,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
760 if (vma->vm_file) { 762 if (vma->vm_file) {
761 mapping = vma->vm_file->f_mapping; 763 mapping = vma->vm_file->f_mapping;
762 764
765 mutex_lock(&mapping->i_mmap_mutex);
763 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
764 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_prio_tree_remove(vma, &mapping->i_mmap);
765 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex);
766 } 770 }
767 771
768 /* remove from the MM's tree and list */ 772 /* remove from the MM's tree and list */
@@ -775,8 +779,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
775 779
776 if (vma->vm_next) 780 if (vma->vm_next)
777 vma->vm_next->vm_prev = vma->vm_prev; 781 vma->vm_next->vm_prev = vma->vm_prev;
778
779 vma->vm_mm = NULL;
780} 782}
781 783
782/* 784/*
@@ -2052,6 +2054,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2052 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2054 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2053 2055
2054 down_write(&nommu_region_sem); 2056 down_write(&nommu_region_sem);
2057 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2055 2058
2056 /* search for VMAs that fall within the dead zone */ 2059 /* search for VMAs that fall within the dead zone */
2057 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2060 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
@@ -2059,6 +2062,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2059 /* found one - only interested if it's shared out of the page 2062 /* found one - only interested if it's shared out of the page
2060 * cache */ 2063 * cache */
2061 if (vma->vm_flags & VM_SHARED) { 2064 if (vma->vm_flags & VM_SHARED) {
2065 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2062 up_write(&nommu_region_sem); 2066 up_write(&nommu_region_sem);
2063 return -ETXTBSY; /* not quite true, but near enough */ 2067 return -ETXTBSY; /* not quite true, but near enough */
2064 } 2068 }
@@ -2086,6 +2090,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2086 } 2090 }
2087 } 2091 }
2088 2092
2093 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2089 up_write(&nommu_region_sem); 2094 up_write(&nommu_region_sem);
2090 return 0; 2095 return 0;
2091} 2096}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2958fd8e7c9..46bf2ed5594 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,7 @@
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h>
37 38
38#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
39#include <trace/events/oom.h> 40#include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
309 */ 310 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 311static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *memcg, 312 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 313 const nodemask_t *nodemask, bool force_kill)
313{ 314{
314 struct task_struct *g, *p; 315 struct task_struct *g, *p;
315 struct task_struct *chosen = NULL; 316 struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
335 if (test_tsk_thread_flag(p, TIF_MEMDIE)) { 336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
336 if (unlikely(frozen(p))) 337 if (unlikely(frozen(p)))
337 __thaw_task(p); 338 __thaw_task(p);
338 return ERR_PTR(-1UL); 339 if (!force_kill)
340 return ERR_PTR(-1UL);
339 } 341 }
340 if (!p->mm) 342 if (!p->mm)
341 continue; 343 continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 if (p == current) { 355 if (p == current) {
354 chosen = p; 356 chosen = p;
355 *ppoints = 1000; 357 *ppoints = 1000;
356 } else { 358 } else if (!force_kill) {
357 /* 359 /*
358 * If this task is not being ptraced on exit, 360 * If this task is not being ptraced on exit,
359 * then wait for it to finish before killing 361 * then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
434} 436}
435 437
436#define K(x) ((x) << (PAGE_SHIFT-10)) 438#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p) 439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438{ 440 unsigned int points, unsigned long totalpages,
439 struct task_struct *q; 441 struct mem_cgroup *memcg, nodemask_t *nodemask,
440 struct mm_struct *mm; 442 const char *message)
441
442 p = find_lock_task_mm(p);
443 if (!p)
444 return 1;
445
446 /* mm cannot be safely dereferenced after task_unlock(p) */
447 mm = p->mm;
448
449 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
450 task_pid_nr(p), p->comm, K(p->mm->total_vm),
451 K(get_mm_counter(p->mm, MM_ANONPAGES)),
452 K(get_mm_counter(p->mm, MM_FILEPAGES)));
453 task_unlock(p);
454
455 /*
456 * Kill all user processes sharing p->mm in other thread groups, if any.
457 * They don't get access to memory reserves or a higher scheduler
458 * priority, though, to avoid depletion of all memory or task
459 * starvation. This prevents mm->mmap_sem livelock when an oom killed
460 * task cannot exit because it requires the semaphore and its contended
461 * by another thread trying to allocate memory itself. That thread will
462 * now get access to memory reserves since it has a pending fatal
463 * signal.
464 */
465 for_each_process(q)
466 if (q->mm == mm && !same_thread_group(q, p) &&
467 !(q->flags & PF_KTHREAD)) {
468 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
469 continue;
470
471 task_lock(q); /* Protect ->comm from prctl() */
472 pr_err("Kill process %d (%s) sharing same memory\n",
473 task_pid_nr(q), q->comm);
474 task_unlock(q);
475 force_sig(SIGKILL, q);
476 }
477
478 set_tsk_thread_flag(p, TIF_MEMDIE);
479 force_sig(SIGKILL, p);
480
481 return 0;
482}
483#undef K
484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message)
489{ 443{
490 struct task_struct *victim = p; 444 struct task_struct *victim = p;
491 struct task_struct *child; 445 struct task_struct *child;
492 struct task_struct *t = p; 446 struct task_struct *t = p;
447 struct mm_struct *mm;
493 unsigned int victim_points = 0; 448 unsigned int victim_points = 0;
494 449 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
495 if (printk_ratelimit()) 450 DEFAULT_RATELIMIT_BURST);
496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 451
498 /* 452 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 453 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
501 */ 455 */
502 if (p->flags & PF_EXITING) { 456 if (p->flags & PF_EXITING) {
503 set_tsk_thread_flag(p, TIF_MEMDIE); 457 set_tsk_thread_flag(p, TIF_MEMDIE);
504 return 0; 458 return;
505 } 459 }
506 460
461 if (__ratelimit(&oom_rs))
462 dump_header(p, gfp_mask, order, memcg, nodemask);
463
507 task_lock(p); 464 task_lock(p);
508 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 465 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
509 message, task_pid_nr(p), p->comm, points); 466 message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 490 }
534 } while_each_thread(p, t); 491 } while_each_thread(p, t);
535 492
536 return oom_kill_task(victim); 493 victim = find_lock_task_mm(victim);
494 if (!victim)
495 return;
496
497 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm;
499 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
500 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
501 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
502 K(get_mm_counter(victim->mm, MM_FILEPAGES)));
503 task_unlock(victim);
504
505 /*
506 * Kill all user processes sharing victim->mm in other thread groups, if
507 * any. They don't get access to memory reserves, though, to avoid
508 * depletion of all memory. This prevents mm->mmap_sem livelock when an
509 * oom killed thread cannot exit because it requires the semaphore and
510 * it's contended by another thread trying to allocate memory itself.
511 * That thread will now get access to memory reserves since it has a
512 * pending fatal signal.
513 */
514 for_each_process(p)
515 if (p->mm == mm && !same_thread_group(p, victim) &&
516 !(p->flags & PF_KTHREAD)) {
517 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
518 continue;
519
520 task_lock(p); /* Protect ->comm from prctl() */
521 pr_err("Kill process %d (%s) sharing same memory\n",
522 task_pid_nr(p), p->comm);
523 task_unlock(p);
524 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
525 }
526
527 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
537} 529}
530#undef K
538 531
539/* 532/*
540 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 533 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 554}
562 555
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) 557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
565{ 559{
566 unsigned long limit; 560 unsigned long limit;
567 unsigned int points = 0; 561 unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
577 return; 571 return;
578 } 572 }
579 573
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 576 read_lock(&tasklist_lock);
583retry: 577 p = select_bad_process(&points, limit, memcg, NULL, false);
584 p = select_bad_process(&points, limit, memcg, NULL); 578 if (p && PTR_ERR(p) != -1UL)
585 if (!p || PTR_ERR(p) == -1UL) 579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
586 goto out; 580 "Memory cgroup out of memory");
587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory"))
590 goto retry;
591out:
592 read_unlock(&tasklist_lock); 581 read_unlock(&tasklist_lock);
593} 582}
594#endif 583#endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
700 * @gfp_mask: memory allocation flags 689 * @gfp_mask: memory allocation flags
701 * @order: amount of memory being requested as a power of 2 690 * @order: amount of memory being requested as a power of 2
702 * @nodemask: nodemask passed to page allocator 691 * @nodemask: nodemask passed to page allocator
692 * @force_kill: true if a task must be killed, even if others are exiting
703 * 693 *
704 * If we run out of memory, we have the choice between either 694 * If we run out of memory, we have the choice between either
705 * killing a random task (bad), letting the system crash (worse) 695 * killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
707 * don't have to be perfect here, we just have to be good. 697 * don't have to be perfect here, we just have to be good.
708 */ 698 */
709void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 699void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
710 int order, nodemask_t *nodemask) 700 int order, nodemask_t *nodemask, bool force_kill)
711{ 701{
712 const nodemask_t *mpol_mask; 702 const nodemask_t *mpol_mask;
713 struct task_struct *p; 703 struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
745 if (sysctl_oom_kill_allocating_task && 735 if (sysctl_oom_kill_allocating_task &&
746 !oom_unkillable_task(current, NULL, nodemask) && 736 !oom_unkillable_task(current, NULL, nodemask) &&
747 current->mm) { 737 current->mm) {
748 /* 738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
749 * oom_kill_process() needs tasklist_lock held. If it returns 739 nodemask,
750 * non-zero, current could not be killed so we must fallback to 740 "Out of memory (oom_kill_allocating_task)");
751 * the tasklist scan.
752 */
753 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
754 NULL, nodemask,
755 "Out of memory (oom_kill_allocating_task)"))
756 goto out;
757 }
758
759retry:
760 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
761 if (PTR_ERR(p) == -1UL)
762 goto out; 741 goto out;
742 }
763 743
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask,
745 force_kill);
764 /* Found nothing?!?! Either we hang forever, or we panic. */ 746 /* Found nothing?!?! Either we hang forever, or we panic. */
765 if (!p) { 747 if (!p) {
766 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
767 read_unlock(&tasklist_lock); 749 read_unlock(&tasklist_lock);
768 panic("Out of memory and no killable processes...\n"); 750 panic("Out of memory and no killable processes...\n");
769 } 751 }
770 752 if (PTR_ERR(p) != -1UL) {
771 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 753 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
772 nodemask, "Out of memory")) 754 nodemask, "Out of memory");
773 goto retry; 755 killed = 1;
774 killed = 1; 756 }
775out: 757out:
776 read_unlock(&tasklist_lock); 758 read_unlock(&tasklist_lock);
777 759
@@ -792,7 +774,7 @@ out:
792void pagefault_out_of_memory(void) 774void pagefault_out_of_memory(void)
793{ 775{
794 if (try_set_system_oom()) { 776 if (try_set_system_oom()) {
795 out_of_memory(NULL, 0, 0, NULL); 777 out_of_memory(NULL, 0, 0, NULL, false);
796 clear_system_oom(); 778 clear_system_oom();
797 } 779 }
798 if (!test_thread_flag(TIF_MEMDIE)) 780 if (!test_thread_flag(TIF_MEMDIE))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5e39858880f..26adea8ca2e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1474,6 +1474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
1474 1474
1475 for ( ; ; ) { 1475 for ( ; ; ) {
1476 global_dirty_limits(&background_thresh, &dirty_thresh); 1476 global_dirty_limits(&background_thresh, &dirty_thresh);
1477 dirty_thresh = hard_dirty_limit(dirty_thresh);
1477 1478
1478 /* 1479 /*
1479 * Boost the allowable dirty threshold a bit for page 1480 * Boost the allowable dirty threshold a bit for page
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d2186ecb36f..caea788628e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1968 goto out; 1968 goto out;
1969 } 1969 }
1970 /* Exhausted what can be done so it's blamo time */ 1970 /* Exhausted what can be done so it's blamo time */
1971 out_of_memory(zonelist, gfp_mask, order, nodemask); 1971 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
1972 1972
1973out: 1973out:
1974 clear_zonelist_oom(zonelist, gfp_mask); 1974 clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1990 if (!order) 1990 if (!order)
1991 return NULL; 1991 return NULL;
1992 1992
1993 if (compaction_deferred(preferred_zone)) { 1993 if (compaction_deferred(preferred_zone, order)) {
1994 *deferred_compaction = true; 1994 *deferred_compaction = true;
1995 return NULL; 1995 return NULL;
1996 } 1996 }
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2012 if (page) { 2012 if (page) {
2013 preferred_zone->compact_considered = 0; 2013 preferred_zone->compact_considered = 0;
2014 preferred_zone->compact_defer_shift = 0; 2014 preferred_zone->compact_defer_shift = 0;
2015 if (order >= preferred_zone->compact_order_failed)
2016 preferred_zone->compact_order_failed = order + 1;
2015 count_vm_event(COMPACTSUCCESS); 2017 count_vm_event(COMPACTSUCCESS);
2016 return page; 2018 return page;
2017 } 2019 }
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2028 * defer if the failure was a sync compaction failure. 2030 * defer if the failure was a sync compaction failure.
2029 */ 2031 */
2030 if (sync_migration) 2032 if (sync_migration)
2031 defer_compaction(preferred_zone); 2033 defer_compaction(preferred_zone, order);
2032 2034
2033 cond_resched(); 2035 cond_resched();
2034 } 2036 }
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2378{ 2380{
2379 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2381 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2380 struct zone *preferred_zone; 2382 struct zone *preferred_zone;
2381 struct page *page; 2383 struct page *page = NULL;
2382 int migratetype = allocflags_to_migratetype(gfp_mask); 2384 int migratetype = allocflags_to_migratetype(gfp_mask);
2385 unsigned int cpuset_mems_cookie;
2383 2386
2384 gfp_mask &= gfp_allowed_mask; 2387 gfp_mask &= gfp_allowed_mask;
2385 2388
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2398 if (unlikely(!zonelist->_zonerefs->zone)) 2401 if (unlikely(!zonelist->_zonerefs->zone))
2399 return NULL; 2402 return NULL;
2400 2403
2401 get_mems_allowed(); 2404retry_cpuset:
2405 cpuset_mems_cookie = get_mems_allowed();
2406
2402 /* The preferred zone is used for statistics later */ 2407 /* The preferred zone is used for statistics later */
2403 first_zones_zonelist(zonelist, high_zoneidx, 2408 first_zones_zonelist(zonelist, high_zoneidx,
2404 nodemask ? : &cpuset_current_mems_allowed, 2409 nodemask ? : &cpuset_current_mems_allowed,
2405 &preferred_zone); 2410 &preferred_zone);
2406 if (!preferred_zone) { 2411 if (!preferred_zone)
2407 put_mems_allowed(); 2412 goto out;
2408 return NULL;
2409 }
2410 2413
2411 /* First allocation attempt */ 2414 /* First allocation attempt */
2412 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2415 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2416 page = __alloc_pages_slowpath(gfp_mask, order, 2419 page = __alloc_pages_slowpath(gfp_mask, order,
2417 zonelist, high_zoneidx, nodemask, 2420 zonelist, high_zoneidx, nodemask,
2418 preferred_zone, migratetype); 2421 preferred_zone, migratetype);
2419 put_mems_allowed();
2420 2422
2421 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2423 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2424
2425out:
2426 /*
2427 * When updating a task's mems_allowed, it is possible to race with
2428 * parallel threads in such a way that an allocation can fail while
2429 * the mask is being updated. If a page allocation is about to fail,
2430 * check if the cpuset changed during allocation and if so, retry.
2431 */
2432 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2433 goto retry_cpuset;
2434
2422 return page; 2435 return page;
2423} 2436}
2424EXPORT_SYMBOL(__alloc_pages_nodemask); 2437EXPORT_SYMBOL(__alloc_pages_nodemask);
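The cpuset_mems_cookie changes above (and the analogous ones in mm/slab.c and mm/slub.c further down) turn get_mems_allowed()/put_mems_allowed() into a seqcount-style retry: read a cookie, attempt the allocation, and retry only if the attempt failed while the cpuset's mems_allowed was being rewritten. A condensed caller-side sketch, assuming the API exactly as used in this patch (fragment, not compilable on its own; try_to_allocate() is a hypothetical stand-in for the real allocation step):

	unsigned int cookie;
	struct page *page;

	do {
		cookie = get_mems_allowed();	/* open the read-side section */
		page = try_to_allocate();	/* may fail spuriously while
						 * mems_allowed is rewritten */
		/*
		 * put_mems_allowed() returns true when mems_allowed stayed
		 * stable across the section; retry only a failed attempt
		 * that raced with an update, otherwise hand the result back
		 * to the caller unchanged.
		 */
	} while (!put_mems_allowed(cookie) && !page);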
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2632bool skip_free_areas_node(unsigned int flags, int nid) 2645bool skip_free_areas_node(unsigned int flags, int nid)
2633{ 2646{
2634 bool ret = false; 2647 bool ret = false;
2648 unsigned int cpuset_mems_cookie;
2635 2649
2636 if (!(flags & SHOW_MEM_FILTER_NODES)) 2650 if (!(flags & SHOW_MEM_FILTER_NODES))
2637 goto out; 2651 goto out;
2638 2652
2639 get_mems_allowed(); 2653 do {
2640 ret = !node_isset(nid, cpuset_current_mems_allowed); 2654 cpuset_mems_cookie = get_mems_allowed();
2641 put_mems_allowed(); 2655 ret = !node_isset(nid, cpuset_current_mems_allowed);
2656 } while (!put_mems_allowed(cpuset_mems_cookie));
2642out: 2657out:
2643 return ret; 2658 return ret;
2644} 2659}
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3925 } 3940 }
3926} 3941}
3927 3942
3928int __init add_from_early_node_map(struct range *range, int az,
3929 int nr_range, int nid)
3930{
3931 unsigned long start_pfn, end_pfn;
3932 int i;
3933
3934 /* need to go over early_node_map to find out good range for node */
3935 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3936 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3937 return nr_range;
3938}
3939
3940/** 3943/**
3941 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3944 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3942 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3945 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
4521 * memory. When they don't, some nodes will have more kernelcore than 4524 * memory. When they don't, some nodes will have more kernelcore than
4522 * others 4525 * others
4523 */ 4526 */
4524static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 4527static void __init find_zone_movable_pfns_for_nodes(void)
4525{ 4528{
4526 int i, nid; 4529 int i, nid;
4527 unsigned long usable_startpfn; 4530 unsigned long usable_startpfn;
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4713 4716
4714 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4717 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4715 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4718 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4716 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 4719 find_zone_movable_pfns_for_nodes();
4717 4720
4718 /* Print out the zone ranges */ 4721 /* Print out the zone ranges */
4719 printk("Zone PFN ranges:\n"); 4722 printk("Zone PFN ranges:\n");
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
4823 int cpu = (unsigned long)hcpu; 4826 int cpu = (unsigned long)hcpu;
4824 4827
4825 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4828 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4829 lru_add_drain_cpu(cpu);
4826 drain_pages(cpu); 4830 drain_pages(cpu);
4827 4831
4828 /* 4832 /*
@@ -5236,6 +5240,7 @@ void *__init alloc_large_system_hash(const char *tablename,
5236 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5240 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5237 do_div(max, bucketsize); 5241 do_div(max, bucketsize);
5238 } 5242 }
5243 max = min(max, 0x80000000ULL);
5239 5244
5240 if (numentries > max) 5245 if (numentries > max)
5241 numentries = max; 5246 numentries = max;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index de1616aa9b1..1ccbd714059 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -379,13 +379,15 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
379 pgoff_t offset = swp_offset(ent); 379 pgoff_t offset = swp_offset(ent);
380 struct swap_cgroup_ctrl *ctrl; 380 struct swap_cgroup_ctrl *ctrl;
381 struct page *mappage; 381 struct page *mappage;
382 struct swap_cgroup *sc;
382 383
383 ctrl = &swap_cgroup_ctrl[swp_type(ent)]; 384 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
384 if (ctrlp) 385 if (ctrlp)
385 *ctrlp = ctrl; 386 *ctrlp = ctrl;
386 387
387 mappage = ctrl->map[offset / SC_PER_PAGE]; 388 mappage = ctrl->map[offset / SC_PER_PAGE];
388 return page_address(mappage) + offset % SC_PER_PAGE; 389 sc = page_address(mappage);
390 return sc + offset % SC_PER_PAGE;
389} 391}
390 392
391/** 393/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff66..aa9701e1271 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 12a48a88c0d..405d331804c 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
184 page_end - page_start); 184 page_end - page_start);
185 } 185 }
186 186
187 for (i = page_start; i < page_end; i++) 187 bitmap_clear(populated, page_start, page_end - page_start);
188 __clear_bit(i, populated);
189} 188}
190 189
191/** 190/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb663fb533e..5a74fea182f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp) 70 unsigned long address, pmd_t *pmdp)
71{ 71{
72 int young; 72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE 73#ifdef CONFIG_TRANSPARENT_HUGEPAGE
74 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
75#else
74 BUG(); 76 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 77#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp); 78 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young) 79 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 80 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index e920aa3ce10..c20ff48994c 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
298 goto free_proc_pages; 298 goto free_proc_pages;
299 } 299 }
300 300
301 task_lock(task); 301 mm = mm_access(task, PTRACE_MODE_ATTACH);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { 302 if (!mm || IS_ERR(mm)) {
303 task_unlock(task); 303 rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
304 rc = -EPERM; 304 /*
305 goto put_task_struct; 305 * Explicitly map EACCES to EPERM as EPERM is a more
306 } 306 * appropriate error code for process_vm_readv/writev
307 mm = task->mm; 307 */
308 308 if (rc == -EACCES)
309 if (!mm || (task->flags & PF_KTHREAD)) { 309 rc = -EPERM;
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct; 310 goto put_task_struct;
313 } 311 }
314 312
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { 313 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec( 314 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len, 315 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
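The rewrite above leans on the mm_access() helper: it performs the ptrace permission check and, on success, returns the target mm with a reference held. A condensed sketch of the calling convention as relied on here, with the balancing mmput() that falls outside the quoted context (assumed, not shown in this hunk):

	struct mm_struct *mm;

	mm = mm_access(task, PTRACE_MODE_ATTACH);
	if (!mm || IS_ERR(mm)) {
		/* NULL: no mm to attach to; ERR_PTR: permission denied */
		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto out;
	}

	/* ... read from / write to the remote mm ... */

	mmput(mm);	/* drop the reference mm_access() took for us */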
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e06b6c..5b5ad584ffb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121} 121}
122 122
123static void anon_vma_chain_link(struct vm_area_struct *vma,
124 struct anon_vma_chain *avc,
125 struct anon_vma *anon_vma)
126{
127 avc->vma = vma;
128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136}
137
123/** 138/**
124 * anon_vma_prepare - attach an anon_vma to a memory region 139 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 140 * @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
175 spin_lock(&mm->page_table_lock); 190 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 191 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 192 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 193 anon_vma_chain_link(vma, avc, anon_vma);
179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 194 allocated = NULL;
183 avc = NULL; 195 avc = NULL;
184 } 196 }
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
224 mutex_unlock(&root->mutex); 236 mutex_unlock(&root->mutex);
225} 237}
226 238
227static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma)
230{
231 avc->vma = vma;
232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234
235 /*
236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page().
238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240}
241
242/* 239/*
243 * Attach the anon_vmas from src to dst. 240 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 241 * Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
1151 */ 1148 */
1152void page_add_file_rmap(struct page *page) 1149void page_add_file_rmap(struct page *page)
1153{ 1150{
1151 bool locked;
1152 unsigned long flags;
1153
1154 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1154 if (atomic_inc_and_test(&page->_mapcount)) { 1155 if (atomic_inc_and_test(&page->_mapcount)) {
1155 __inc_zone_page_state(page, NR_FILE_MAPPED); 1156 __inc_zone_page_state(page, NR_FILE_MAPPED);
1156 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1157 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1157 } 1158 }
1159 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1158} 1160}
1159 1161
1160/** 1162/**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
1165 */ 1167 */
1166void page_remove_rmap(struct page *page) 1168void page_remove_rmap(struct page *page)
1167{ 1169{
1170 bool anon = PageAnon(page);
1171 bool locked;
1172 unsigned long flags;
1173
1174 /*
1175 * The anon case has no mem_cgroup page_stat to update; but may
1176 * uncharge_page() below, where the lock ordering can deadlock if
1177 * we hold the lock against page_stat move: so avoid it on anon.
1178 */
1179 if (!anon)
1180 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1181
1168 /* page still mapped by someone else? */ 1182 /* page still mapped by someone else? */
1169 if (!atomic_add_negative(-1, &page->_mapcount)) 1183 if (!atomic_add_negative(-1, &page->_mapcount))
1170 return; 1184 goto out;
1171 1185
1172 /* 1186 /*
1173 * Now that the last pte has gone, s390 must transfer dirty 1187 * Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
1176 * not if it's in swapcache - there might be another pte slot 1190 * not if it's in swapcache - there might be another pte slot
1177 * containing the swap entry, but page not yet written to swap. 1191 * containing the swap entry, but page not yet written to swap.
1178 */ 1192 */
1179 if ((!PageAnon(page) || PageSwapCache(page)) && 1193 if ((!anon || PageSwapCache(page)) &&
1180 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1194 page_test_and_clear_dirty(page_to_pfn(page), 1))
1181 set_page_dirty(page); 1195 set_page_dirty(page);
1182 /* 1196 /*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
1184 * and not charged by memcg for now. 1198 * and not charged by memcg for now.
1185 */ 1199 */
1186 if (unlikely(PageHuge(page))) 1200 if (unlikely(PageHuge(page)))
1187 return; 1201 goto out;
1188 if (PageAnon(page)) { 1202 if (anon) {
1189 mem_cgroup_uncharge_page(page); 1203 mem_cgroup_uncharge_page(page);
1190 if (!PageTransHuge(page)) 1204 if (!PageTransHuge(page))
1191 __dec_zone_page_state(page, NR_ANON_PAGES); 1205 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
1205 * Leaving it set also helps swapoff to reinstate ptes 1219 * Leaving it set also helps swapoff to reinstate ptes
1206 * faster for those pages still in swapcache. 1220 * faster for those pages still in swapcache.
1207 */ 1221 */
1222out:
1223 if (!anon)
1224 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1208} 1225}
1209 1226
1210/* 1227/*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1282 } 1299 }
1283 dec_mm_counter(mm, MM_ANONPAGES); 1300 dec_mm_counter(mm, MM_ANONPAGES);
1284 inc_mm_counter(mm, MM_SWAPENTS); 1301 inc_mm_counter(mm, MM_SWAPENTS);
1285 } else if (PAGE_MIGRATION) { 1302 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
1286 /* 1303 /*
1287 * Store the pfn of the page in a special migration 1304 * Store the pfn of the page in a special migration
1288 * pte. do_swap_page() will wait until the migration 1305 * pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1293 } 1310 }
1294 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1311 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1295 BUG_ON(pte_file(*pte)); 1312 BUG_ON(pte_file(*pte));
1296 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1313 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1314 (TTU_ACTION(flags) == TTU_MIGRATION)) {
1297 /* Establish migration entry for a file page */ 1315 /* Establish migration entry for a file page */
1298 swp_entry_t entry; 1316 swp_entry_t entry;
1299 entry = make_migration_entry(page, pte_write(pteval)); 1317 entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 * locking requirements of exec(), migration skips 1517 * locking requirements of exec(), migration skips
1500 * temporary VMAs until after exec() completes. 1518 * temporary VMAs until after exec() completes.
1501 */ 1519 */
1502 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1520 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1503 is_vma_temporary_stack(vma)) 1521 is_vma_temporary_stack(vma))
1504 continue; 1522 continue;
1505 1523
diff --git a/mm/shmem.c b/mm/shmem.c
index 269d049294a..f99ff3e50bd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
127static inline int shmem_acct_size(unsigned long flags, loff_t size) 127static inline int shmem_acct_size(unsigned long flags, loff_t size)
128{ 128{
129 return (flags & VM_NORESERVE) ? 129 return (flags & VM_NORESERVE) ?
130 0 : security_vm_enough_memory_kern(VM_ACCT(size)); 130 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
131} 131}
132 132
133static inline void shmem_unacct_size(unsigned long flags, loff_t size) 133static inline void shmem_unacct_size(unsigned long flags, loff_t size)
@@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
145static inline int shmem_acct_block(unsigned long flags) 145static inline int shmem_acct_block(unsigned long flags)
146{ 146{
147 return (flags & VM_NORESERVE) ? 147 return (flags & VM_NORESERVE) ?
148 security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; 148 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
149} 149}
150 150
151static inline void shmem_unacct_blocks(unsigned long flags, long pages) 151static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1178static const struct inode_operations shmem_symlink_inode_operations; 1178static const struct inode_operations shmem_symlink_inode_operations;
1179static const struct inode_operations shmem_short_symlink_operations; 1179static const struct inode_operations shmem_short_symlink_operations;
1180 1180
1181#ifdef CONFIG_TMPFS_XATTR
1182static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1183#else
1184#define shmem_initxattrs NULL
1185#endif
1186
1181static int 1187static int
1182shmem_write_begin(struct file *file, struct address_space *mapping, 1188shmem_write_begin(struct file *file, struct address_space *mapping,
1183 loff_t pos, unsigned len, unsigned flags, 1189 loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1490 if (inode) { 1496 if (inode) {
1491 error = security_inode_init_security(inode, dir, 1497 error = security_inode_init_security(inode, dir,
1492 &dentry->d_name, 1498 &dentry->d_name,
1493 NULL, NULL); 1499 shmem_initxattrs, NULL);
1494 if (error) { 1500 if (error) {
1495 if (error != -EOPNOTSUPP) { 1501 if (error != -EOPNOTSUPP) {
1496 iput(inode); 1502 iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1630 return -ENOSPC; 1636 return -ENOSPC;
1631 1637
1632 error = security_inode_init_security(inode, dir, &dentry->d_name, 1638 error = security_inode_init_security(inode, dir, &dentry->d_name,
1633 NULL, NULL); 1639 shmem_initxattrs, NULL);
1634 if (error) { 1640 if (error) {
1635 if (error != -EOPNOTSUPP) { 1641 if (error != -EOPNOTSUPP) {
1636 iput(inode); 1642 iput(inode);
@@ -1656,9 +1662,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1656 } 1662 }
1657 inode->i_mapping->a_ops = &shmem_aops; 1663 inode->i_mapping->a_ops = &shmem_aops;
1658 inode->i_op = &shmem_symlink_inode_operations; 1664 inode->i_op = &shmem_symlink_inode_operations;
1659 kaddr = kmap_atomic(page, KM_USER0); 1665 kaddr = kmap_atomic(page);
1660 memcpy(kaddr, symname, len); 1666 memcpy(kaddr, symname, len);
1661 kunmap_atomic(kaddr, KM_USER0); 1667 kunmap_atomic(kaddr);
1662 set_page_dirty(page); 1668 set_page_dirty(page);
1663 unlock_page(page); 1669 unlock_page(page);
1664 page_cache_release(page); 1670 page_cache_release(page);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
1704 * filesystem level, though. 1710 * filesystem level, though.
1705 */ 1711 */
1706 1712
1713/*
1714 * Allocate a new xattr and copy in the value; leave the name to the callers.
1715 */
1716static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
1717{
1718 struct shmem_xattr *new_xattr;
1719 size_t len;
1720
1721 /* wrap around? */
1722 len = sizeof(*new_xattr) + size;
1723 if (len <= sizeof(*new_xattr))
1724 return NULL;
1725
1726 new_xattr = kmalloc(len, GFP_KERNEL);
1727 if (!new_xattr)
1728 return NULL;
1729
1730 new_xattr->size = size;
1731 memcpy(new_xattr->value, value, size);
1732 return new_xattr;
1733}
1734
1735/*
1736 * Callback for security_inode_init_security() for acquiring xattrs.
1737 */
1738static int shmem_initxattrs(struct inode *inode,
1739 const struct xattr *xattr_array,
1740 void *fs_info)
1741{
1742 struct shmem_inode_info *info = SHMEM_I(inode);
1743 const struct xattr *xattr;
1744 struct shmem_xattr *new_xattr;
1745 size_t len;
1746
1747 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
1748 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
1749 if (!new_xattr)
1750 return -ENOMEM;
1751
1752 len = strlen(xattr->name) + 1;
1753 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
1754 GFP_KERNEL);
1755 if (!new_xattr->name) {
1756 kfree(new_xattr);
1757 return -ENOMEM;
1758 }
1759
1760 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
1761 XATTR_SECURITY_PREFIX_LEN);
1762 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
1763 xattr->name, len);
1764
1765 spin_lock(&info->lock);
1766 list_add(&new_xattr->list, &info->xattr_list);
1767 spin_unlock(&info->lock);
1768 }
1769
1770 return 0;
1771}
1772
1707static int shmem_xattr_get(struct dentry *dentry, const char *name, 1773static int shmem_xattr_get(struct dentry *dentry, const char *name,
1708 void *buffer, size_t size) 1774 void *buffer, size_t size)
1709{ 1775{
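The new shmem_xattr_alloc() keeps the old "wrap around?" guard: if sizeof(header) + size overflows, the sum wraps and ends up no larger than the header alone, so the allocation is refused rather than undersized. A stand-alone illustration (editorial example, not kernel code; struct hdr stands in for struct shmem_xattr):

#include <stdio.h>
#include <stddef.h>

/* header plus flexible payload, like struct shmem_xattr */
struct hdr {
	size_t size;
	char value[];
};

int main(void)
{
	size_t huge = (size_t)-1;		/* hostile payload length */
	size_t len = sizeof(struct hdr) + huge;	/* unsigned add wraps around */

	if (len <= sizeof(struct hdr))
		printf("overflow detected: refuse the allocation\n");
	else
		printf("len = %zu is safe to allocate\n", len);
	return 0;
}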
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
1731 return ret; 1797 return ret;
1732} 1798}
1733 1799
1734static int shmem_xattr_set(struct dentry *dentry, const char *name, 1800static int shmem_xattr_set(struct inode *inode, const char *name,
1735 const void *value, size_t size, int flags) 1801 const void *value, size_t size, int flags)
1736{ 1802{
1737 struct inode *inode = dentry->d_inode;
1738 struct shmem_inode_info *info = SHMEM_I(inode); 1803 struct shmem_inode_info *info = SHMEM_I(inode);
1739 struct shmem_xattr *xattr; 1804 struct shmem_xattr *xattr;
1740 struct shmem_xattr *new_xattr = NULL; 1805 struct shmem_xattr *new_xattr = NULL;
1741 size_t len;
1742 int err = 0; 1806 int err = 0;
1743 1807
1744 /* value == NULL means remove */ 1808 /* value == NULL means remove */
1745 if (value) { 1809 if (value) {
1746 /* wrap around? */ 1810 new_xattr = shmem_xattr_alloc(value, size);
1747 len = sizeof(*new_xattr) + size;
1748 if (len <= sizeof(*new_xattr))
1749 return -ENOMEM;
1750
1751 new_xattr = kmalloc(len, GFP_KERNEL);
1752 if (!new_xattr) 1811 if (!new_xattr)
1753 return -ENOMEM; 1812 return -ENOMEM;
1754 1813
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
1757 kfree(new_xattr); 1816 kfree(new_xattr);
1758 return -ENOMEM; 1817 return -ENOMEM;
1759 } 1818 }
1760
1761 new_xattr->size = size;
1762 memcpy(new_xattr->value, value, size);
1763 } 1819 }
1764 1820
1765 spin_lock(&info->lock); 1821 spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
1858 if (size == 0) 1914 if (size == 0)
1859 value = ""; /* empty EA, do not remove */ 1915 value = ""; /* empty EA, do not remove */
1860 1916
1861 return shmem_xattr_set(dentry, name, value, size, flags); 1917 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
1862 1918
1863} 1919}
1864 1920
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
1878 if (err) 1934 if (err)
1879 return err; 1935 return err;
1880 1936
1881 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 1937 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
1882} 1938}
1883 1939
1884static bool xattr_is_trusted(const char *name) 1940static bool xattr_is_trusted(const char *name)
@@ -2175,7 +2231,6 @@ static void shmem_put_super(struct super_block *sb)
2175int shmem_fill_super(struct super_block *sb, void *data, int silent) 2231int shmem_fill_super(struct super_block *sb, void *data, int silent)
2176{ 2232{
2177 struct inode *inode; 2233 struct inode *inode;
2178 struct dentry *root;
2179 struct shmem_sb_info *sbinfo; 2234 struct shmem_sb_info *sbinfo;
2180 int err = -ENOMEM; 2235 int err = -ENOMEM;
2181 2236
@@ -2232,14 +2287,11 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2232 goto failed; 2287 goto failed;
2233 inode->i_uid = sbinfo->uid; 2288 inode->i_uid = sbinfo->uid;
2234 inode->i_gid = sbinfo->gid; 2289 inode->i_gid = sbinfo->gid;
2235 root = d_alloc_root(inode); 2290 sb->s_root = d_make_root(inode);
2236 if (!root) 2291 if (!sb->s_root)
2237 goto failed_iput; 2292 goto failed;
2238 sb->s_root = root;
2239 return 0; 2293 return 0;
2240 2294
2241failed_iput:
2242 iput(inode);
2243failed: 2295failed:
2244 shmem_put_super(sb); 2296 shmem_put_super(sb);
2245 return err; 2297 return err;
diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3..29c8716eb7a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3284 if (in_interrupt() || (flags & __GFP_THISNODE)) 3284 if (in_interrupt() || (flags & __GFP_THISNODE))
3285 return NULL; 3285 return NULL;
3286 nid_alloc = nid_here = numa_mem_id(); 3286 nid_alloc = nid_here = numa_mem_id();
3287 get_mems_allowed();
3288 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3287 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3289 nid_alloc = cpuset_slab_spread_node(); 3288 nid_alloc = cpuset_slab_spread_node();
3290 else if (current->mempolicy) 3289 else if (current->mempolicy)
3291 nid_alloc = slab_node(current->mempolicy); 3290 nid_alloc = slab_node(current->mempolicy);
3292 put_mems_allowed();
3293 if (nid_alloc != nid_here) 3291 if (nid_alloc != nid_here)
3294 return ____cache_alloc_node(cachep, flags, nid_alloc); 3292 return ____cache_alloc_node(cachep, flags, nid_alloc);
3295 return NULL; 3293 return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3312 enum zone_type high_zoneidx = gfp_zone(flags); 3310 enum zone_type high_zoneidx = gfp_zone(flags);
3313 void *obj = NULL; 3311 void *obj = NULL;
3314 int nid; 3312 int nid;
3313 unsigned int cpuset_mems_cookie;
3315 3314
3316 if (flags & __GFP_THISNODE) 3315 if (flags & __GFP_THISNODE)
3317 return NULL; 3316 return NULL;
3318 3317
3319 get_mems_allowed();
3320 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3321 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3318 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3322 3319
3320retry_cpuset:
3321 cpuset_mems_cookie = get_mems_allowed();
3322 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3323
3323retry: 3324retry:
3324 /* 3325 /*
3325 * Look through allowed nodes for objects available 3326 * Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
3372 } 3373 }
3373 } 3374 }
3374 } 3375 }
3375 put_mems_allowed(); 3376
3377 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3378 goto retry_cpuset;
3376 return obj; 3379 return obj;
3377} 3380}
3378 3381
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7f..f4a6229848f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1581 struct zone *zone; 1581 struct zone *zone;
1582 enum zone_type high_zoneidx = gfp_zone(flags); 1582 enum zone_type high_zoneidx = gfp_zone(flags);
1583 void *object; 1583 void *object;
1584 unsigned int cpuset_mems_cookie;
1584 1585
1585 /* 1586 /*
1586 * The defrag ratio allows a configuration of the tradeoffs between 1587 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1604 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1605 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1605 return NULL; 1606 return NULL;
1606 1607
1607 get_mems_allowed(); 1608 do {
1608 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 cpuset_mems_cookie = get_mems_allowed();
1609 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1610 struct kmem_cache_node *n; 1611 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1611 1612 struct kmem_cache_node *n;
1612 n = get_node(s, zone_to_nid(zone)); 1613
1613 1614 n = get_node(s, zone_to_nid(zone));
1614 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1615
1615 n->nr_partial > s->min_partial) { 1616 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1616 object = get_partial_node(s, n, c); 1617 n->nr_partial > s->min_partial) {
1617 if (object) { 1618 object = get_partial_node(s, n, c);
1618 put_mems_allowed(); 1619 if (object) {
1619 return object; 1620 /*
1621 * Return the object even if
1622 * put_mems_allowed indicated that
1623 * the cpuset mems_allowed was
1624 * updated in parallel. It's a
1625 * harmless race between the alloc
1626 * and the cpuset update.
1627 */
1628 put_mems_allowed(cpuset_mems_cookie);
1629 return object;
1630 }
1620 } 1631 }
1621 } 1632 }
1622 } 1633 } while (!put_mems_allowed(cpuset_mems_cookie));
1623 put_mems_allowed();
1624#endif 1634#endif
1625 return NULL; 1635 return NULL;
1626} 1636}
diff --git a/mm/sparse.c b/mm/sparse.c
index 61d7cde2311..a8bc7d364de 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index b0f529b3897..5c13f133897 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
496 * Either "cpu" is the current CPU, and preemption has already been 496 * Either "cpu" is the current CPU, and preemption has already been
497 * disabled; or "cpu" is being hot-unplugged, and is already dead. 497 * disabled; or "cpu" is being hot-unplugged, and is already dead.
498 */ 498 */
499static void drain_cpu_pagevecs(int cpu) 499void lru_add_drain_cpu(int cpu)
500{ 500{
501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
502 struct pagevec *pvec; 502 struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
553 553
554void lru_add_drain(void) 554void lru_add_drain(void)
555{ 555{
556 drain_cpu_pagevecs(get_cpu()); 556 lru_add_drain_cpu(get_cpu());
557 put_cpu(); 557 put_cpu();
558} 558}
559 559
@@ -652,14 +652,14 @@ EXPORT_SYMBOL(__pagevec_release);
652void lru_add_page_tail(struct zone* zone, 652void lru_add_page_tail(struct zone* zone,
653 struct page *page, struct page *page_tail) 653 struct page *page, struct page *page_tail)
654{ 654{
655 int active; 655 int uninitialized_var(active);
656 enum lru_list lru; 656 enum lru_list lru;
657 const int file = 0; 657 const int file = 0;
658 658
659 VM_BUG_ON(!PageHead(page)); 659 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 660 VM_BUG_ON(PageCompound(page_tail));
661 VM_BUG_ON(PageLRU(page_tail)); 661 VM_BUG_ON(PageLRU(page_tail));
662 VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); 662 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
663 663
664 SetPageLRU(page_tail); 664 SetPageLRU(page_tail);
665 665
@@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone,
672 active = 0; 672 active = 0;
673 lru = LRU_INACTIVE_ANON; 673 lru = LRU_INACTIVE_ANON;
674 } 674 }
675 update_page_reclaim_stat(zone, page_tail, file, active);
676 } else { 675 } else {
677 SetPageUnevictable(page_tail); 676 SetPageUnevictable(page_tail);
678 lru = LRU_UNEVICTABLE; 677 lru = LRU_UNEVICTABLE;
@@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone,
693 list_head = page_tail->lru.prev; 692 list_head = page_tail->lru.prev;
694 list_move_tail(&page_tail->lru, list_head); 693 list_move_tail(&page_tail->lru, list_head);
695 } 694 }
695
696 if (!PageUnevictable(page))
697 update_page_reclaim_stat(zone, page_tail, file, active);
696} 698}
697#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 699#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
698 700
@@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
710 SetPageLRU(page); 712 SetPageLRU(page);
711 if (active) 713 if (active)
712 SetPageActive(page); 714 SetPageActive(page);
713 update_page_reclaim_stat(zone, page, file, active);
714 add_page_to_lru_list(zone, page, lru); 715 add_page_to_lru_list(zone, page, lru);
716 update_page_reclaim_stat(zone, page, file, active);
715} 717}
716 718
717/* 719/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 470038a9187..9d3dd3763cf 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
300 new_page = alloc_page_vma(gfp_mask, vma, addr); 300 new_page = alloc_page_vma(gfp_mask, vma, addr);
301 if (!new_page) 301 if (!new_page)
302 break; /* Out of memory */ 302 break; /* Out of memory */
303 /*
304 * The memcg-specific accounting when moving
305 * pages around the LRU lists relies on the
306 * page's owner (memcg) to be valid. Usually,
307 * pages are assigned to a new owner before
308 * being put on the LRU list, but since this
309 * is not the case here, the stale owner from
310 * a previous allocation cycle must be reset.
311 */
312 mem_cgroup_reset_owner(new_page);
313 } 303 }
314 304
315 /* 305 /*
@@ -382,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
382struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 struct vm_area_struct *vma, unsigned long addr) 373 struct vm_area_struct *vma, unsigned long addr)
384{ 374{
385 int nr_pages;
386 struct page *page; 375 struct page *page;
387 unsigned long offset; 376 unsigned long offset = swp_offset(entry);
388 unsigned long end_offset; 377 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1;
389 379
390 /* 380 /* Read a page_cluster sized and aligned cluster around offset. */
391 * Get starting offset for readaround, and number of pages to read. 381 start_offset = offset & ~mask;
392 * Adjust starting address by readbehind (for NUMA interleave case)? 382 end_offset = offset | mask;
393 * No, it's very unlikely that swap layout would follow vma layout, 383 if (!start_offset) /* First page is swap header. */
394 * more likely that neighbouring swap pages came from the same node: 384 start_offset++;
395 * so use the same "addr" to choose the same node for each swap read. 385
396 */ 386 for (offset = start_offset; offset <= end_offset ; offset++) {
397 nr_pages = valid_swaphandles(entry, &offset);
398 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
399 /* Ok, do the async read-ahead now */ 387 /* Ok, do the async read-ahead now */
400 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
401 gfp_mask, vma, addr); 389 gfp_mask, vma, addr);
402 if (!page) 390 if (!page)
403 break; 391 continue;
404 page_cache_release(page); 392 page_cache_release(page);
405 } 393 }
406 lru_add_drain(); /* Push any new pages onto the LRU now */ 394 lru_add_drain(); /* Push any new pages onto the LRU now */
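The rewritten swapin_readahead() above replaces valid_swaphandles() with simple mask arithmetic: it reads a 2^page_cluster aligned window around the faulting offset and skips offset 0, which holds the swap header. A small stand-alone sketch of that arithmetic (plain C, with page_cluster passed in as a parameter rather than read from the kernel tunable, and a hypothetical helper name):

#include <stdio.h>

/* Compute the readahead window [start, end] around a swap offset,
 * aligned to a 2^page_cluster sized cluster, skipping offset 0
 * (the swap header) -- mirroring the arithmetic in the hunk above. */
static void readahead_window(unsigned long offset, unsigned int page_cluster,
			     unsigned long *start, unsigned long *end)
{
	unsigned long mask = (1UL << page_cluster) - 1;

	*start = offset & ~mask;
	*end = offset | mask;
	if (*start == 0)	/* first page is the swap header */
		*start = 1;
}

int main(void)
{
	unsigned long start, end;

	readahead_window(37, 3, &start, &end);	/* 8-page cluster */
	printf("read offsets %lu..%lu\n", start, end);	/* prints 32..39 */
	return 0;
}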
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d999f090dfd..dae42f380d6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
@@ -1563,6 +1561,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1563 if (!capable(CAP_SYS_ADMIN)) 1561 if (!capable(CAP_SYS_ADMIN))
1564 return -EPERM; 1562 return -EPERM;
1565 1563
1564 BUG_ON(!current->mm);
1565
1566 pathname = getname(specialfile); 1566 pathname = getname(specialfile);
1567 err = PTR_ERR(pathname); 1567 err = PTR_ERR(pathname);
1568 if (IS_ERR(pathname)) 1568 if (IS_ERR(pathname))
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1590 spin_unlock(&swap_lock); 1590 spin_unlock(&swap_lock);
1591 goto out_dput; 1591 goto out_dput;
1592 } 1592 }
1593 if (!security_vm_enough_memory(p->pages)) 1593 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1594 vm_unacct_memory(p->pages); 1594 vm_unacct_memory(p->pages);
1595 else { 1595 else {
1596 err = -ENOMEM; 1596 err = -ENOMEM;
@@ -2105,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2105 p->flags |= SWP_SOLIDSTATE; 2105 p->flags |= SWP_SOLIDSTATE;
2106 p->cluster_next = 1 + (random32() % p->highest_bit); 2106 p->cluster_next = 1 + (random32() % p->highest_bit);
2107 } 2107 }
2108 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) 2108 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2109 p->flags |= SWP_DISCARDABLE; 2109 p->flags |= SWP_DISCARDABLE;
2110 } 2110 }
2111 2111
@@ -2290,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry)
2290} 2290}
2291 2291
2292/* 2292/*
2293 * swap_lock prevents swap_map being freed. Don't grab an extra
2294 * reference on the swaphandle, it doesn't matter if it becomes unused.
2295 */
2296int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2297{
2298 struct swap_info_struct *si;
2299 int our_page_cluster = page_cluster;
2300 pgoff_t target, toff;
2301 pgoff_t base, end;
2302 int nr_pages = 0;
2303
2304 if (!our_page_cluster) /* no readahead */
2305 return 0;
2306
2307 si = swap_info[swp_type(entry)];
2308 target = swp_offset(entry);
2309 base = (target >> our_page_cluster) << our_page_cluster;
2310 end = base + (1 << our_page_cluster);
2311 if (!base) /* first page is swap header */
2312 base++;
2313
2314 spin_lock(&swap_lock);
2315 if (end > si->max) /* don't go beyond end of map */
2316 end = si->max;
2317
2318 /* Count contiguous allocated slots above our target */
2319 for (toff = target; ++toff < end; nr_pages++) {
2320 /* Don't read in free or bad pages */
2321 if (!si->swap_map[toff])
2322 break;
2323 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2324 break;
2325 }
2326 /* Count contiguous allocated slots below our target */
2327 for (toff = target; --toff >= base; nr_pages++) {
2328 /* Don't read in free or bad pages */
2329 if (!si->swap_map[toff])
2330 break;
2331 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2332 break;
2333 }
2334 spin_unlock(&swap_lock);
2335
2336 /*
2337 * Indicate starting offset, and return number of pages to get:
2338 * if only 1, say 0, since there's then no readahead to be done.
2339 */
2340 *offset = ++toff;
2341 return nr_pages? ++nr_pages: 0;
2342}
2343
2344/*
2345 * add_swap_count_continuation - called when a swap count is duplicated 2293 * add_swap_count_continuation - called when a swap count is duplicated
2346 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2294 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2347 * page of the original vmalloc'ed swap_map, to hold the continuation count 2295 * page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -2427,9 +2375,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2427 if (!(count & COUNT_CONTINUED)) 2375 if (!(count & COUNT_CONTINUED))
2428 goto out; 2376 goto out;
2429 2377
2430 map = kmap_atomic(list_page, KM_USER0) + offset; 2378 map = kmap_atomic(list_page) + offset;
2431 count = *map; 2379 count = *map;
2432 kunmap_atomic(map, KM_USER0); 2380 kunmap_atomic(map);
2433 2381
2434 /* 2382 /*
2435 * If this continuation count now has some space in it, 2383 * If this continuation count now has some space in it,
@@ -2472,7 +2420,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
2472 2420
2473 offset &= ~PAGE_MASK; 2421 offset &= ~PAGE_MASK;
2474 page = list_entry(head->lru.next, struct page, lru); 2422 page = list_entry(head->lru.next, struct page, lru);
2475 map = kmap_atomic(page, KM_USER0) + offset; 2423 map = kmap_atomic(page) + offset;
2476 2424
2477 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 2425 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2478 goto init_map; /* jump over SWAP_CONT_MAX checks */ 2426 goto init_map; /* jump over SWAP_CONT_MAX checks */
@@ -2482,26 +2430,26 @@ static bool swap_count_continued(struct swap_info_struct *si,
2482 * Think of how you add 1 to 999 2430 * Think of how you add 1 to 999
2483 */ 2431 */
2484 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 2432 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2485 kunmap_atomic(map, KM_USER0); 2433 kunmap_atomic(map);
2486 page = list_entry(page->lru.next, struct page, lru); 2434 page = list_entry(page->lru.next, struct page, lru);
2487 BUG_ON(page == head); 2435 BUG_ON(page == head);
2488 map = kmap_atomic(page, KM_USER0) + offset; 2436 map = kmap_atomic(page) + offset;
2489 } 2437 }
2490 if (*map == SWAP_CONT_MAX) { 2438 if (*map == SWAP_CONT_MAX) {
2491 kunmap_atomic(map, KM_USER0); 2439 kunmap_atomic(map);
2492 page = list_entry(page->lru.next, struct page, lru); 2440 page = list_entry(page->lru.next, struct page, lru);
2493 if (page == head) 2441 if (page == head)
2494 return false; /* add count continuation */ 2442 return false; /* add count continuation */
2495 map = kmap_atomic(page, KM_USER0) + offset; 2443 map = kmap_atomic(page) + offset;
2496init_map: *map = 0; /* we didn't zero the page */ 2444init_map: *map = 0; /* we didn't zero the page */
2497 } 2445 }
2498 *map += 1; 2446 *map += 1;
2499 kunmap_atomic(map, KM_USER0); 2447 kunmap_atomic(map);
2500 page = list_entry(page->lru.prev, struct page, lru); 2448 page = list_entry(page->lru.prev, struct page, lru);
2501 while (page != head) { 2449 while (page != head) {
2502 map = kmap_atomic(page, KM_USER0) + offset; 2450 map = kmap_atomic(page) + offset;
2503 *map = COUNT_CONTINUED; 2451 *map = COUNT_CONTINUED;
2504 kunmap_atomic(map, KM_USER0); 2452 kunmap_atomic(map);
2505 page = list_entry(page->lru.prev, struct page, lru); 2453 page = list_entry(page->lru.prev, struct page, lru);
2506 } 2454 }
2507 return true; /* incremented */ 2455 return true; /* incremented */
@@ -2512,22 +2460,22 @@ init_map: *map = 0; /* we didn't zero the page */
2512 */ 2460 */
2513 BUG_ON(count != COUNT_CONTINUED); 2461 BUG_ON(count != COUNT_CONTINUED);
2514 while (*map == COUNT_CONTINUED) { 2462 while (*map == COUNT_CONTINUED) {
2515 kunmap_atomic(map, KM_USER0); 2463 kunmap_atomic(map);
2516 page = list_entry(page->lru.next, struct page, lru); 2464 page = list_entry(page->lru.next, struct page, lru);
2517 BUG_ON(page == head); 2465 BUG_ON(page == head);
2518 map = kmap_atomic(page, KM_USER0) + offset; 2466 map = kmap_atomic(page) + offset;
2519 } 2467 }
2520 BUG_ON(*map == 0); 2468 BUG_ON(*map == 0);
2521 *map -= 1; 2469 *map -= 1;
2522 if (*map == 0) 2470 if (*map == 0)
2523 count = 0; 2471 count = 0;
2524 kunmap_atomic(map, KM_USER0); 2472 kunmap_atomic(map);
2525 page = list_entry(page->lru.prev, struct page, lru); 2473 page = list_entry(page->lru.prev, struct page, lru);
2526 while (page != head) { 2474 while (page != head) {
2527 map = kmap_atomic(page, KM_USER0) + offset; 2475 map = kmap_atomic(page) + offset;
2528 *map = SWAP_CONT_MAX | count; 2476 *map = SWAP_CONT_MAX | count;
2529 count = COUNT_CONTINUED; 2477 count = COUNT_CONTINUED;
2530 kunmap_atomic(map, KM_USER0); 2478 kunmap_atomic(map);
2531 page = list_entry(page->lru.prev, struct page, lru); 2479 page = list_entry(page->lru.prev, struct page, lru);
2532 } 2480 }
2533 return count == COUNT_CONTINUED; 2481 return count == COUNT_CONTINUED;
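The swap-continuation changes in this hunk are mostly mechanical: the series converts callers to the atomic kmap API without the KM_USER0 slot argument. A minimal sketch of the updated call pattern, using a hypothetical helper (kernel C, not part of the patch):

#include <linux/highmem.h>

/* Hypothetical helper: bump one byte in a possibly-highmem page using
 * the slot-less atomic kmap API that this series converts callers to. */
static void bump_count_byte(struct page *page, unsigned int offset)
{
	unsigned char *map;

	map = kmap_atomic(page);	/* was: kmap_atomic(page, KM_USER0) */
	map[offset] += 1;
	kunmap_atomic(map);		/* was: kunmap_atomic(map, KM_USER0) */
}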
diff --git a/mm/truncate.c b/mm/truncate.c
index 632b15e29f7..18aded3a89f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -52,7 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
52static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
53{ 53{
54 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page); 55 cleancache_invalidate_page(page->mapping, page);
56 if (page_has_private(page)) 56 if (page_has_private(page))
57 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
58} 58}
@@ -184,7 +184,7 @@ int invalidate_inode_page(struct page *page)
184} 184}
185 185
186/** 186/**
187 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 187 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
188 * @mapping: mapping to truncate 188 * @mapping: mapping to truncate
189 * @lstart: offset from which to truncate 189 * @lstart: offset from which to truncate
190 * @lend: offset to which to truncate 190 * @lend: offset to which to truncate
@@ -213,7 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
213 pgoff_t end; 213 pgoff_t end;
214 int i; 214 int i;
215 215
216 cleancache_flush_inode(mapping); 216 cleancache_invalidate_inode(mapping);
217 if (mapping->nrpages == 0) 217 if (mapping->nrpages == 0)
218 return; 218 return;
219 219
@@ -292,7 +292,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
292 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++; 293 index++;
294 } 294 }
295 cleancache_flush_inode(mapping); 295 cleancache_invalidate_inode(mapping);
296} 296}
297EXPORT_SYMBOL(truncate_inode_pages_range); 297EXPORT_SYMBOL(truncate_inode_pages_range);
298 298
@@ -444,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
444 int ret2 = 0; 444 int ret2 = 0;
445 int did_range_unmap = 0; 445 int did_range_unmap = 0;
446 446
447 cleancache_flush_inode(mapping); 447 cleancache_invalidate_inode(mapping);
448 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
449 index = start; 449 index = start;
450 while (index <= end && pagevec_lookup(&pvec, mapping, index, 450 while (index <= end && pagevec_lookup(&pvec, mapping, index,
@@ -500,7 +500,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
500 cond_resched(); 500 cond_resched();
501 index++; 501 index++;
502 } 502 }
503 cleancache_flush_inode(mapping); 503 cleancache_invalidate_inode(mapping);
504 return ret; 504 return ret;
505} 505}
506EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 506EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/util.c b/mm/util.c
index 136ac4f322b..ae962b31de8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
239 next->vm_prev = vma; 239 next->vm_prev = vma;
240} 240}
241 241
242/* Check if the vma is being used as a stack by this task */
243static int vm_is_stack_for_task(struct task_struct *t,
244 struct vm_area_struct *vma)
245{
246 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
247}
248
249/*
250 * Check if the vma is being used as a stack.
251 * If is_group is non-zero, check in the entire thread group or else
252 * just check in the current task. Returns the pid of the task that
253 * the vma is stack for.
254 */
255pid_t vm_is_stack(struct task_struct *task,
256 struct vm_area_struct *vma, int in_group)
257{
258 pid_t ret = 0;
259
260 if (vm_is_stack_for_task(task, vma))
261 return task->pid;
262
263 if (in_group) {
264 struct task_struct *t;
265 rcu_read_lock();
266 if (!pid_alive(task))
267 goto done;
268
269 t = task;
270 do {
271 if (vm_is_stack_for_task(t, vma)) {
272 ret = t->pid;
273 goto done;
274 }
275 } while_each_thread(task, t);
276done:
277 rcu_read_unlock();
278 }
279
280 return ret;
281}
282
242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 283#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
243void arch_pick_mmap_layout(struct mm_struct *mm) 284void arch_pick_mmap_layout(struct mm_struct *mm)
244{ 285{
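The vm_is_stack() helper added to mm/util.c above returns the pid of the task (or, when in_group is non-zero, of any thread in the group) whose stack the vma backs, and 0 otherwise. A hypothetical caller, assuming the declaration is exported via linux/mm.h, might wrap it like this when labelling a task's mappings:

#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical wrapper: decide whether to tag a vma as a thread stack
 * when reporting a task's mappings; *tid is only valid on true. */
static bool vma_is_thread_stack(struct task_struct *task,
				struct vm_area_struct *vma, pid_t *tid)
{
	*tid = vm_is_stack(task, vma, 1);	/* search the whole thread group */
	return *tid != 0;
}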
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 86ce9a526c1..94dff883b44 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1906,9 +1906,9 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
1906 * we can expect USER0 is not used (see vread/vwrite's 1906 * we can expect USER0 is not used (see vread/vwrite's
1907 * function description) 1907 * function description)
1908 */ 1908 */
1909 void *map = kmap_atomic(p, KM_USER0); 1909 void *map = kmap_atomic(p);
1910 memcpy(buf, map + offset, length); 1910 memcpy(buf, map + offset, length);
1911 kunmap_atomic(map, KM_USER0); 1911 kunmap_atomic(map);
1912 } else 1912 } else
1913 memset(buf, 0, length); 1913 memset(buf, 0, length);
1914 1914
@@ -1945,9 +1945,9 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1945 * we can expect USER0 is not used (see vread/vwrite's 1945 * we can expect USER0 is not used (see vread/vwrite's
1946 * function description) 1946 * function description)
1947 */ 1947 */
1948 void *map = kmap_atomic(p, KM_USER0); 1948 void *map = kmap_atomic(p);
1949 memcpy(map + offset, buf, length); 1949 memcpy(map + offset, buf, length);
1950 kunmap_atomic(map, KM_USER0); 1950 kunmap_atomic(map);
1951 } 1951 }
1952 addr += length; 1952 addr += length;
1953 buf += length; 1953 buf += length;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b2355265..33c332bbab7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1138 * @mz: The mem_cgroup_zone to pull pages from. 1138 * @mz: The mem_cgroup_zone to pull pages from.
1139 * @dst: The temp list to put pages on to. 1139 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1140 * @nr_scanned: The number of pages that were scanned.
1141 * @order: The caller's attempted allocation order 1141 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1142 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1143 * @active: True [1] if isolating active pages
1144 * @file: True [1] if isolating file [!anon] pages 1144 * @file: True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1147 */ 1147 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1149 struct mem_cgroup_zone *mz, struct list_head *dst,
1150 unsigned long *nr_scanned, int order, isolate_mode_t mode, 1150 unsigned long *nr_scanned, struct scan_control *sc,
1151 int active, int file) 1151 isolate_mode_t mode, int active, int file)
1152{ 1152{
1153 struct lruvec *lruvec; 1153 struct lruvec *lruvec;
1154 struct list_head *src; 1154 struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1194 BUG(); 1194 BUG();
1195 } 1195 }
1196 1196
1197 if (!order) 1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue; 1198 continue;
1199 1199
1200 /* 1200 /*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1208 */ 1208 */
1209 zone_id = page_zone_id(page); 1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page); 1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << order) - 1); 1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << order); 1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) { 1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page; 1214 struct page *cursor_page;
1215 1215
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1275 1275
1276 *nr_scanned = scan; 1276 *nr_scanned = scan;
1277 1277
1278 trace_mm_vmscan_lru_isolate(order, 1278 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1279 nr_to_scan, scan,
1280 nr_taken, 1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1413 unsigned long *nr_anon, 1413 unsigned long *nr_anon,
1414 unsigned long *nr_file) 1414 unsigned long *nr_file)
1415{ 1415{
1416 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1417 struct zone *zone = mz->zone; 1416 struct zone *zone = mz->zone;
1418 unsigned int count[NR_LRU_LISTS] = { 0, }; 1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1419 unsigned long nr_active = 0; 1418 unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1434 count[lru] += numpages; 1433 count[lru] += numpages;
1435 } 1434 }
1436 1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active); 1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438 1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450 1450
1451 reclaim_stat->recent_scanned[0] += *nr_anon; 1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 reclaim_stat->recent_scanned[1] += *nr_file; 1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1453} 1454}
1454 1455
1455/* 1456/*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1509 unsigned long nr_file; 1510 unsigned long nr_file;
1510 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1511 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1512 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
1513 struct zone *zone = mz->zone; 1514 struct zone *zone = mz->zone;
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1514 1516
1515 while (unlikely(too_many_isolated(zone, file, sc))) { 1517 while (unlikely(too_many_isolated(zone, file, sc))) {
1516 congestion_wait(BLK_RW_ASYNC, HZ/10); 1518 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 1524
1523 set_reclaim_mode(priority, sc, false); 1525 set_reclaim_mode(priority, sc, false);
1524 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1525 reclaim_mode |= ISOLATE_ACTIVE; 1527 isolate_mode |= ISOLATE_ACTIVE;
1526 1528
1527 lru_add_drain(); 1529 lru_add_drain();
1528 1530
1529 if (!sc->may_unmap) 1531 if (!sc->may_unmap)
1530 reclaim_mode |= ISOLATE_UNMAPPED; 1532 isolate_mode |= ISOLATE_UNMAPPED;
1531 if (!sc->may_writepage) 1533 if (!sc->may_writepage)
1532 reclaim_mode |= ISOLATE_CLEAN; 1534 isolate_mode |= ISOLATE_CLEAN;
1533 1535
1534 spin_lock_irq(&zone->lru_lock); 1536 spin_lock_irq(&zone->lru_lock);
1535 1537
1536 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, 1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
1537 &nr_scanned, sc->order, 1539 sc, isolate_mode, 0, file);
1538 reclaim_mode, 0, file);
1539 if (global_reclaim(sc)) { 1540 if (global_reclaim(sc)) {
1540 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1541 if (current_is_kswapd()) 1542 if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1545 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1546 nr_scanned); 1547 nr_scanned);
1547 } 1548 }
1549 spin_unlock_irq(&zone->lru_lock);
1548 1550
1549 if (nr_taken == 0) { 1551 if (nr_taken == 0)
1550 spin_unlock_irq(&zone->lru_lock);
1551 return 0; 1552 return 0;
1552 }
1553 1553
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1555 1555
1556 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1557 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1558
1559 spin_unlock_irq(&zone->lru_lock);
1560
1561 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1562 &nr_dirty, &nr_writeback); 1557 &nr_dirty, &nr_writeback);
1563 1558
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1570 1565
1571 spin_lock_irq(&zone->lru_lock); 1566 spin_lock_irq(&zone->lru_lock);
1572 1567
1568 reclaim_stat->recent_scanned[0] += nr_anon;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570
1573 if (current_is_kswapd()) 1571 if (current_is_kswapd())
1574 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1575 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
1643 unsigned long pgmoved = 0; 1641 unsigned long pgmoved = 0;
1644 struct page *page; 1642 struct page *page;
1645 1643
1646 if (buffer_heads_over_limit) {
1647 spin_unlock_irq(&zone->lru_lock);
1648 list_for_each_entry(page, list, lru) {
1649 if (page_has_private(page) && trylock_page(page)) {
1650 if (page_has_private(page))
1651 try_to_release_page(page, 0);
1652 unlock_page(page);
1653 }
1654 }
1655 spin_lock_irq(&zone->lru_lock);
1656 }
1657
1658 while (!list_empty(list)) { 1644 while (!list_empty(list)) {
1659 struct lruvec *lruvec; 1645 struct lruvec *lruvec;
1660 1646
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 struct page *page; 1685 struct page *page;
1700 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1701 unsigned long nr_rotated = 0; 1687 unsigned long nr_rotated = 0;
1702 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
1703 struct zone *zone = mz->zone; 1689 struct zone *zone = mz->zone;
1704 1690
1705 lru_add_drain(); 1691 lru_add_drain();
1706 1692
1693 reset_reclaim_mode(sc);
1694
1707 if (!sc->may_unmap) 1695 if (!sc->may_unmap)
1708 reclaim_mode |= ISOLATE_UNMAPPED; 1696 isolate_mode |= ISOLATE_UNMAPPED;
1709 if (!sc->may_writepage) 1697 if (!sc->may_writepage)
1710 reclaim_mode |= ISOLATE_CLEAN; 1698 isolate_mode |= ISOLATE_CLEAN;
1711 1699
1712 spin_lock_irq(&zone->lru_lock); 1700 spin_lock_irq(&zone->lru_lock);
1713 1701
1714 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, 1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
1715 &nr_scanned, sc->order, 1703 isolate_mode, 1, file);
1716 reclaim_mode, 1, file);
1717 if (global_reclaim(sc)) 1704 if (global_reclaim(sc))
1718 zone->pages_scanned += nr_scanned; 1705 zone->pages_scanned += nr_scanned;
1719 1706
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 continue; 1724 continue;
1738 } 1725 }
1739 1726
1727 if (unlikely(buffer_heads_over_limit)) {
1728 if (page_has_private(page) && trylock_page(page)) {
1729 if (page_has_private(page))
1730 try_to_release_page(page, 0);
1731 unlock_page(page);
1732 }
1733 }
1734
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1736 nr_rotated += hpage_nr_pages(page);
1742 /* 1737 /*
@@ -2112,7 +2107,12 @@ restart:
2112 * with multiple processes reclaiming pages, the total 2107 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 2108 * freeing target can get unreasonably large.
2114 */ 2109 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2110 if (nr_reclaimed >= nr_to_reclaim)
2111 nr_to_reclaim = 0;
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 2116 break;
2117 } 2117 }
2118 blk_finish_plug(&plug); 2118 blk_finish_plug(&plug);
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2195 * If compaction is deferred, reclaim up to a point where 2195 * If compaction is deferred, reclaim up to a point where
2196 * compaction will have a chance of success when re-enabled 2196 * compaction will have a chance of success when re-enabled
2197 */ 2197 */
2198 if (compaction_deferred(zone)) 2198 if (compaction_deferred(zone, sc->order))
2199 return watermark_ok; 2199 return watermark_ok;
2200 2200
2201 /* If compaction is not ready to start, keep reclaiming */ 2201 /* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2235 unsigned long nr_soft_scanned; 2235 unsigned long nr_soft_scanned;
2236 bool aborted_reclaim = false; 2236 bool aborted_reclaim = false;
2237 2237
2238 /*
2239 * If the number of buffer_heads in the machine exceeds the maximum
2240 * allowed level, force direct reclaim to scan the highmem zone as
2241 * highmem pages could be pinning lowmem pages storing buffer_heads
2242 */
2243 if (buffer_heads_over_limit)
2244 sc->gfp_mask |= __GFP_HIGHMEM;
2245
2238 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2246 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2239 gfp_zone(sc->gfp_mask), sc->nodemask) { 2247 gfp_zone(sc->gfp_mask), sc->nodemask) {
2240 if (!populated_zone(zone)) 2248 if (!populated_zone(zone))
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2255 * Even though compaction is invoked for any 2263 * Even though compaction is invoked for any
2256 * non-zero order, only frequent costly order 2264 * non-zero order, only frequent costly order
2257 * reclamation is disruptive enough to become a 2265 * reclamation is disruptive enough to become a
2258 * noticable problem, like transparent huge page 2266 * noticeable problem, like transparent huge
2259 * allocations. 2267 * page allocations.
2260 */ 2268 */
2261 if (compaction_ready(zone, sc)) { 2269 if (compaction_ready(zone, sc)) {
2262 aborted_reclaim = true; 2270 aborted_reclaim = true;
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 unsigned long writeback_threshold; 2345 unsigned long writeback_threshold;
2338 bool aborted_reclaim; 2346 bool aborted_reclaim;
2339 2347
2340 get_mems_allowed();
2341 delayacct_freepages_start(); 2348 delayacct_freepages_start();
2342 2349
2343 if (global_reclaim(sc)) 2350 if (global_reclaim(sc))
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2401 2408
2402out: 2409out:
2403 delayacct_freepages_end(); 2410 delayacct_freepages_end();
2404 put_mems_allowed();
2405 2411
2406 if (sc->nr_reclaimed) 2412 if (sc->nr_reclaimed)
2407 return sc->nr_reclaimed; 2413 return sc->nr_reclaimed;
@@ -2724,6 +2730,17 @@ loop_again:
2724 */ 2730 */
2725 age_active_anon(zone, &sc, priority); 2731 age_active_anon(zone, &sc, priority);
2726 2732
2733 /*
2734 * If the number of buffer_heads in the machine
2735 * exceeds the maximum allowed level and this node
2736 * has a highmem zone, force kswapd to reclaim from
2737 * it to relieve lowmem pressure.
2738 */
2739 if (buffer_heads_over_limit && is_highmem_idx(i)) {
2740 end_zone = i;
2741 break;
2742 }
2743
2727 if (!zone_watermark_ok_safe(zone, order, 2744 if (!zone_watermark_ok_safe(zone, order,
2728 high_wmark_pages(zone), 0, 0)) { 2745 high_wmark_pages(zone), 0, 0)) {
2729 end_zone = i; 2746 end_zone = i;
@@ -2753,7 +2770,7 @@ loop_again:
2753 */ 2770 */
2754 for (i = 0; i <= end_zone; i++) { 2771 for (i = 0; i <= end_zone; i++) {
2755 struct zone *zone = pgdat->node_zones + i; 2772 struct zone *zone = pgdat->node_zones + i;
2756 int nr_slab; 2773 int nr_slab, testorder;
2757 unsigned long balance_gap; 2774 unsigned long balance_gap;
2758 2775
2759 if (!populated_zone(zone)) 2776 if (!populated_zone(zone))
@@ -2786,7 +2803,21 @@ loop_again:
2786 (zone->present_pages + 2803 (zone->present_pages +
2787 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2804 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2788 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2805 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2789 if (!zone_watermark_ok_safe(zone, order, 2806 /*
2807 * Kswapd reclaims only single pages with compaction
2808 * enabled. Trying too hard to reclaim until contiguous
2809 * free pages have become available can hurt performance
2810 * by evicting too much useful data from memory.
2811 * Do not reclaim more than needed for compaction.
2812 */
2813 testorder = order;
2814 if (COMPACTION_BUILD && order &&
2815 compaction_suitable(zone, order) !=
2816 COMPACT_SKIPPED)
2817 testorder = 0;
2818
2819 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2820 !zone_watermark_ok_safe(zone, testorder,
2790 high_wmark_pages(zone) + balance_gap, 2821 high_wmark_pages(zone) + balance_gap,
2791 end_zone, 0)) { 2822 end_zone, 0)) {
2792 shrink_zone(priority, zone, &sc); 2823 shrink_zone(priority, zone, &sc);
@@ -2815,7 +2846,7 @@ loop_again:
2815 continue; 2846 continue;
2816 } 2847 }
2817 2848
2818 if (!zone_watermark_ok_safe(zone, order, 2849 if (!zone_watermark_ok_safe(zone, testorder,
2819 high_wmark_pages(zone), end_zone, 0)) { 2850 high_wmark_pages(zone), end_zone, 0)) {
2820 all_zones_ok = 0; 2851 all_zones_ok = 0;
2821 /* 2852 /*
@@ -2903,6 +2934,8 @@ out:
2903 * and it is potentially going to sleep here. 2934 * and it is potentially going to sleep here.
2904 */ 2935 */
2905 if (order) { 2936 if (order) {
2937 int zones_need_compaction = 1;
2938
2906 for (i = 0; i <= end_zone; i++) { 2939 for (i = 0; i <= end_zone; i++) {
2907 struct zone *zone = pgdat->node_zones + i; 2940 struct zone *zone = pgdat->node_zones + i;
2908 2941
@@ -2912,6 +2945,11 @@ out:
2912 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2913 continue; 2946 continue;
2914 2947
2948 /* Would compaction fail due to lack of free memory? */
2949 if (COMPACTION_BUILD &&
2950 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2951 goto loop_again;
2952
2915 /* Confirm the zone is balanced for order-0 */ 2953 /* Confirm the zone is balanced for order-0 */
2916 if (!zone_watermark_ok(zone, 0, 2954 if (!zone_watermark_ok(zone, 0,
2917 high_wmark_pages(zone), 0, 0)) { 2955 high_wmark_pages(zone), 0, 0)) {
@@ -2919,11 +2957,17 @@ out:
2919 goto loop_again; 2957 goto loop_again;
2920 } 2958 }
2921 2959
2960 /* Check if the memory needs to be defragmented. */
2961 if (zone_watermark_ok(zone, order,
2962 low_wmark_pages(zone), *classzone_idx, 0))
2963 zones_need_compaction = 0;
2964
2922 /* If balanced, clear the congested flag */ 2965 /* If balanced, clear the congested flag */
2923 zone_clear_flag(zone, ZONE_CONGESTED); 2966 zone_clear_flag(zone, ZONE_CONGESTED);
2924 if (i <= *classzone_idx)
2925 balanced += zone->present_pages;
2926 } 2967 }
2968
2969 if (zones_need_compaction)
2970 compact_pgdat(pgdat, order);
2927 } 2971 }
2928 2972
2929 /* 2973 /*