path: root/mm
author		Linus Torvalds <torvalds@linux-foundation.org>	2012-07-31 22:25:39 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-31 22:25:39 -0400
commit		ac694dbdbc403c00e2c14d10bc7b8412cc378259 (patch)
tree		e37328cfbeaf43716dd5914cad9179e57e84df76 /mm
parent		a40a1d3d0a2fd613fdec6d89d3c053268ced76ed (diff)
parent		437ea90cc3afdca5229b41c6b1d38c4842756cb9 (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge Andrew's second set of patches:
 - MM
 - a few random fixes
 - a couple of RTC leftovers

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
  rtc/rtc-88pm80x: remove unneed devm_kfree
  rtc/rtc-88pm80x: assign ret only when rtc_register_driver fails
  mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables
  tmpfs: distribute interleave better across nodes
  mm: remove redundant initialization
  mm: warn if pg_data_t isn't initialized with zero
  mips: zero out pg_data_t when it's allocated
  memcg: gix memory accounting scalability in shrink_page_list
  mm/sparse: remove index_init_lock
  mm/sparse: more checks on mem_section number
  mm/sparse: optimize sparse_index_alloc
  memcg: add mem_cgroup_from_css() helper
  memcg: further prevent OOM with too many dirty pages
  memcg: prevent OOM with too many dirty pages
  mm: mmu_notifier: fix freed page still mapped in secondary MMU
  mm: memcg: only check anon swapin page charges for swap cache
  mm: memcg: only check swap cache pages for repeated charging
  mm: memcg: split swapin charge function into private and public part
  mm: memcg: remove needless !mm fixup to init_mm when charging
  mm: memcg: remove unneeded shmem charge type
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   5
-rw-r--r--  mm/Makefile           |   8
-rw-r--r--  mm/backing-dev.c      |  20
-rw-r--r--  mm/compaction.c       |  63
-rw-r--r--  mm/fadvise.c          |  18
-rw-r--r--  mm/highmem.c          |  12
-rw-r--r--  mm/hugetlb.c          | 195
-rw-r--r--  mm/hugetlb_cgroup.c   | 418
-rw-r--r--  mm/hwpoison-inject.c  |   2
-rw-r--r--  mm/internal.h         |   8
-rw-r--r--  mm/memblock.c         |  35
-rw-r--r--  mm/memcontrol.c       | 390
-rw-r--r--  mm/memory-failure.c   |  17
-rw-r--r--  mm/memory.c           |   9
-rw-r--r--  mm/memory_hotplug.c   |  20
-rw-r--r--  mm/migrate.c          |  81
-rw-r--r--  mm/mmap.c             |   5
-rw-r--r--  mm/mmu_notifier.c     |  45
-rw-r--r--  mm/mmzone.c           |   2
-rw-r--r--  mm/mremap.c           |   2
-rw-r--r--  mm/oom_kill.c         | 223
-rw-r--r--  mm/page_alloc.c       | 318
-rw-r--r--  mm/page_cgroup.c      |   2
-rw-r--r--  mm/page_io.c          | 145
-rw-r--r--  mm/page_isolation.c   |  93
-rw-r--r--  mm/shmem.c            |   6
-rw-r--r--  mm/slab.c             | 216
-rw-r--r--  mm/slub.c             |  30
-rw-r--r--  mm/sparse.c           |  29
-rw-r--r--  mm/swap.c             |  52
-rw-r--r--  mm/swap_state.c       |   7
-rw-r--r--  mm/swapfile.c         | 145
-rw-r--r--  mm/vmalloc.c          |  16
-rw-r--r--  mm/vmscan.c           | 175
-rw-r--r--  mm/vmstat.c           |   1
35 files changed, 2043 insertions(+), 770 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6f..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
 config NO_BOOTMEM
 	boolean
 
+config MEMORY_ISOLATION
+	boolean
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
+	select MEMORY_ISOLATION
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
 	depends on MMU
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
+	select MEMORY_ISOLATION
 	help
 	  Enables code to recover from some memory failures on systems
 	  with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 8e81fe263c94..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o mmu_context.o percpu.o \
-			   compaction.o slab_common.o $(mmu-y)
+			   mm_init.o mmu_context.o percpu.o slab_common.o \
+			   compaction.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3387aea11209..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -886,3 +886,23 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(wait_iff_congested);
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char kbuf[] = "0\n";
+
+	if (*ppos) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+		return -EFAULT;
+	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+			table->procname);
+
+	*lenp = 2;
+	*ppos += *lenp;
+	return 2;
+}
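The new pdflush_proc_obsolete() handler keeps the retired pdflush sysctls readable (they always report "0") while warning once that they will be removed. A minimal user-space check of that behaviour might look like the sketch below; it assumes the historical /proc/sys/vm/nr_pdflush_threads knob is the one wired to this handler, which is an assumption for illustration and not part of the patch.

/* Illustrative only: read a legacy pdflush knob kept alive by the stub above. */
#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/proc/sys/vm/nr_pdflush_threads", "r");	/* assumed path */

	if (!f) {
		perror("fopen");	/* the knob may already be gone on newer kernels */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("nr_pdflush_threads reads as: %s", buf);	/* expected: "0" */
	fclose(f);
	return 0;
}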
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d9528539..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
 			pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
 
+		/*
+		 * Skip ahead if another thread is compacting in the area
+		 * simultaneously. If we wrapped around, we can only skip
+		 * ahead if zone->compact_cached_free_pfn also wrapped to
+		 * above our starting point.
+		 */
+		if (cc->order > 0 && (!cc->wrapped ||
+				      zone->compact_cached_free_pfn >
+				      cc->start_free_pfn))
+			pfn = min(pfn, zone->compact_cached_free_pfn);
+
 		if (!pfn_valid(pfn))
 			continue;
 
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
 		 * looking for free pages, the search will restart here as
 		 * page migration may have returned some pages to the allocator
 		 */
-		if (isolated)
+		if (isolated) {
 			high_pfn = max(high_pfn, pfn);
+			if (cc->order > 0)
+				zone->compact_cached_free_pfn = high_pfn;
+		}
 	}
 
 	/* split_free_page does not map the pages */
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	return ISOLATE_SUCCESS;
 }
 
+/*
+ * Returns the start pfn of the last page block in a zone. This is the starting
+ * point for full compaction of a zone. Compaction searches for free pages from
+ * the end of each zone, while isolate_freepages_block scans forward inside each
+ * page block.
+ */
+static unsigned long start_free_pfn(struct zone *zone)
+{
+	unsigned long free_pfn;
+	free_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	free_pfn &= ~(pageblock_nr_pages-1);
+	return free_pfn;
+}
+
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
 	if (fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
-	/* Compaction run completes if the migrate and free scanner meet */
-	if (cc->free_pfn <= cc->migrate_pfn)
+	/*
+	 * A full (order == -1) compaction run starts at the beginning and
+	 * end of a zone; it completes when the migrate and free scanner meet.
+	 * A partial (order > 0) compaction can start with the free scanner
+	 * at a random point in the zone, and may have to restart.
+	 */
+	if (cc->free_pfn <= cc->migrate_pfn) {
+		if (cc->order > 0 && !cc->wrapped) {
+			/* We started partway through; restart at the end. */
+			unsigned long free_pfn = start_free_pfn(zone);
+			zone->compact_cached_free_pfn = free_pfn;
+			cc->free_pfn = free_pfn;
+			cc->wrapped = 1;
+			return COMPACT_CONTINUE;
+		}
+		return COMPACT_COMPLETE;
+	}
+
+	/* We wrapped around and ended up where we started. */
+	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
 		return COMPACT_COMPLETE;
 
 	/*
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	/* Setup to move all movable pages to the end of the zone */
 	cc->migrate_pfn = zone->zone_start_pfn;
-	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
-	cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+	if (cc->order > 0) {
+		/* Incremental compaction. Start where the last one stopped. */
+		cc->free_pfn = zone->compact_cached_free_pfn;
+		cc->start_free_pfn = cc->free_pfn;
+	} else {
+		/* Order == -1 starts at the end of the zone. */
+		cc->free_pfn = start_free_pfn(zone);
+	}
 
 	migrate_prep_local();
 
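The compaction hunks above cache where the free-page scanner stopped (zone->compact_cached_free_pfn), resume partial compactions from that point, and wrap around once before declaring the run complete. The following stand-alone C sketch models only that bookkeeping; the toy_zone and toy_cc types are invented stand-ins for the kernel structures, and pageblock rounding is omitted, so this is an illustration under those assumptions rather than kernel code.

/* User-space model of the wrap-around bookkeeping added to compaction. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long start_pfn;
	unsigned long spanned;
	unsigned long compact_cached_free_pfn;	/* where the last run stopped */
};

struct toy_cc {
	unsigned long free_pfn;
	unsigned long start_free_pfn;
	unsigned long migrate_pfn;
	int order;
	bool wrapped;
};

static unsigned long toy_start_free_pfn(struct toy_zone *z)
{
	return z->start_pfn + z->spanned;	/* pageblock rounding omitted */
}

/* Mirrors the compact_finished() logic: wrap once, then stop where we started. */
static bool scanners_done(struct toy_zone *z, struct toy_cc *cc)
{
	if (cc->free_pfn <= cc->migrate_pfn) {
		if (cc->order > 0 && !cc->wrapped) {
			cc->free_pfn = toy_start_free_pfn(z);
			z->compact_cached_free_pfn = cc->free_pfn;
			cc->wrapped = true;
			return false;		/* keep scanning from the top of the zone */
		}
		return true;
	}
	return cc->wrapped && cc->free_pfn <= cc->start_free_pfn;
}

int main(void)
{
	struct toy_zone z = { 0, 1024, 512 };
	struct toy_cc cc = { .order = 3, .migrate_pfn = 0 };

	cc.free_pfn = z.compact_cached_free_pfn;	/* resume mid-zone, like cc->order > 0 */
	cc.start_free_pfn = cc.free_pfn;

	cc.free_pfn = 0;				/* pretend the scanners met */
	printf("done after first pass? %d\n", scanners_done(&z, &cc));	/* 0: we wrapped */
	cc.free_pfn = cc.start_free_pfn;		/* pretend we came back to the start */
	printf("done after wrapping?   %d\n", scanners_done(&z, &cc));	/* 1 */
	return 0;
}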
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
-		if (!mapping->a_ops->readpage) {
-			ret = -EINVAL;
-			break;
-		}
-
 		/* First and last PARTIAL page! */
 		start_index = offset >> PAGE_CACHE_SHIFT;
 		end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		nrpages = end_index - start_index + 1;
 		if (!nrpages)
 			nrpages = ~0UL;
 
-		ret = force_page_cache_readahead(mapping, file,
-						start_index,
-						nrpages);
-		if (ret > 0)
-			ret = 0;
+		/*
+		 * Ignore return value because fadvise() shall return
+		 * success even if filesystem can't retrieve a hint,
+		 */
+		force_page_cache_readahead(mapping, file, start_index,
+					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
 		break;
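The fadvise change above makes POSIX_FADV_WILLNEED report success even when the filesystem cannot act on the hint. A minimal user-space caller of the affected syscall, included here purely as an illustration of the interface and not as part of the patch, is:

/* Illustrative caller of the fadvise path changed above. */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	int fd, ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * With this patch the kernel no longer fails the hint just because the
	 * backing filesystem cannot do readahead; 0 is the expected result.
	 */
	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	printf("posix_fadvise(WILLNEED) -> %d\n", ret);
	return ret ? 1 : 0;
}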
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 	do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
 #endif
 
+struct page *kmap_to_page(void *vaddr)
+{
+	unsigned long addr = (unsigned long)vaddr;
+
+	if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+		int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+		return pte_page(pkmap_page_table[i]);
+	}
+
+	return virt_to_page(addr);
+}
+
 static void flush_all_zero_pkmaps(void)
 {
 	int i;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
 
+#include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
 
-#define for_each_hstate(h) \
-	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
 
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
 }
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 	if (list_empty(&h->hugepage_freelists[nid]))
 		return NULL;
 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
-	list_del(&page->lru);
+	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
 	h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
+	VM_BUG_ON(hugetlb_cgroup_from_page(page));
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
-	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
+	hugetlb_cgroup_uncharge_page(hstate_index(h),
+				     pages_per_huge_page(h), page);
 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+		/* remove the page from active list */
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
+	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
+		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
+		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
 		 */
@@ -993,7 +1001,6 @@ retry:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
-		list_del(&page->lru);
 		/*
 		 * This page is now managed by the hugetlb allocator and has
 		 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-			list_del(&page->lru);
 			put_page(page);
 		}
 	}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	long chg;
+	int ret, idx;
+	struct hugetlb_cgroup *h_cg;
 
+	idx = hstate_index(h);
 	/*
 	 * Processes that did not create the mapping will have no
 	 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(-VM_FAULT_OOM);
+		return ERR_PTR(-ENOMEM);
 	if (chg)
 		if (hugepage_subpool_get_pages(spool, chg))
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	if (ret) {
+		hugepage_subpool_put_pages(spool, chg);
+		return ERR_PTR(-ENOSPC);
+	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-	spin_unlock(&hugetlb_lock);
-
-	if (!page) {
+	if (page) {
+		/* update page cgroup details */
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		spin_unlock(&hugetlb_lock);
+	} else {
+		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
+			hugetlb_cgroup_uncharge_cgroup(idx,
+						       pages_per_huge_page(h),
+						       h_cg);
 			hugepage_subpool_put_pages(spool, chg);
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 		}
+		spin_lock(&hugetlb_lock);
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		list_move(&page->lru, &h->hugepage_activelist);
+		spin_unlock(&hugetlb_lock);
 	}
 
 	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
-
 	return page;
 }
 
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
-	int hi = h - hstates;
+	int hi = hstate_index(h);
 
 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 	if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
 	if (!nhs->hugepages_kobj)
 		return;		/* no hstate attributes */
 
-	for_each_hstate(h)
-		if (nhs->hstate_kobjs[h - hstates]) {
-			kobject_put(nhs->hstate_kobjs[h - hstates]);
-			nhs->hstate_kobjs[h - hstates] = NULL;
+	for_each_hstate(h) {
+		int idx = hstate_index(h);
+		if (nhs->hstate_kobjs[idx]) {
+			kobject_put(nhs->hstate_kobjs[idx]);
+			nhs->hstate_kobjs[idx] = NULL;
 		}
+	}
 
 	kobject_put(nhs->hugepages_kobj);
 	nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
 	hugetlb_unregister_all_nodes();
 
 	for_each_hstate(h) {
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hstate_index(h)]);
 	}
 
 	kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
-	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 	if (default_hstate_max_huge_pages)
 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 		return;
 	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
-	h = &hstates[max_hstate++];
+	h = &hstates[hugetlb_max_hstate++];
 	h->order = order;
 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 	h->nr_huge_pages = 0;
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+	INIT_LIST_HEAD(&h->hugepage_activelist);
 	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	/*
+	 * Add cgroup control files only if the huge page consists
+	 * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgoup details.
+	 */
+	if (order >= HUGETLB_CGROUP_MIN_ORDER)
+		hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
 
 	parsed_hstate = h;
 }
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
 	static unsigned long *last_mhp;
 
 	/*
-	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!max_hstate)
+	if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
 	 * But we need to allocate >= MAX_ORDER hstates here early to still
 	 * use the bootmem allocator.
 	 */
-	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 		hugetlb_hstate_alloc_pages(parsed_hstate);
 
 	last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 	return 0;
 }
 
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page)
 {
+	int force_flush = 0;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
-	struct page *tmp;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 
-	/*
-	 * A page gathering list, protected by per file i_mmap_mutex. The
-	 * lock is used to avoid list corruption from multiple unmapping
-	 * of the same page since we are using page->lru.
-	 */
-	LIST_HEAD(page_list);
-
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, start, end);
+again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+		tlb_remove_tlb_entry(tlb, ptep, address);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		list_add(&page->lru, &page_list);
 
+		page_remove_rmap(page);
+		force_flush = !__tlb_remove_page(tlb, page);
+		if (force_flush)
+			break;
 		/* Bail out after unmapping reference page if supplied */
 		if (ref_page)
 			break;
 	}
-	flush_tlb_range(vma, start, end);
 	spin_unlock(&mm->page_table_lock);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	list_for_each_entry_safe(page, tmp, &page_list, lru) {
-		page_remove_rmap(page);
-		list_del(&page->lru);
-		put_page(page);
+	/*
+	 * mmu_gather ran out of room to batch pages, we break out of
+	 * the PTE lock to avoid doing the potential expensive TLB invalidate
+	 * and page-free while holding it.
+	 */
+	if (force_flush) {
+		force_flush = 0;
+		tlb_flush_mmu(tlb);
+		if (address < end && !ref_page)
+			goto again;
 	}
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+			  struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out. We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	__unmap_hugepage_range(vma, start, end, ref_page);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	struct mm_struct *mm;
+	struct mmu_gather tlb;
+
+	mm = vma->vm_mm;
+
+	tlb_gather_mmu(&tlb, mm, 0);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			__unmap_hugepage_range(iter_vma,
-				address, address + huge_page_size(h),
-				page);
+			unmap_hugepage_range(iter_vma, address,
+					     address + huge_page_size(h), page);
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
+		long err = PTR_ERR(new_page);
 		page_cache_release(old_page);
 
 		/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
 
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
-		return -PTR_ERR(new_page);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		else
+			return VM_FAULT_SIGBUS;
 	}
 
 	/*
@@ -2642,7 +2710,11 @@ retry:
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
-			ret = -PTR_ERR(page);
+			ret = PTR_ERR(page);
+			if (ret == -ENOMEM)
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
 		 */
 		if (unlikely(PageHWPoison(page))) {
 			ret = VM_FAULT_HWPOISON |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
 	}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 		return VM_FAULT_HWPOISON_LARGE |
-			VM_FAULT_SET_HINDEX(hstate_index(h));
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
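One thread running through the hugetlb hunks above is the error-passing change: alloc_huge_page() now returns ERR_PTR(-ENOMEM) or ERR_PTR(-ENOSPC), and the fault paths translate those errnos into VM_FAULT_OOM or VM_FAULT_SIGBUS. The following stand-alone C sketch models only that convention; ERR_PTR, PTR_ERR, IS_ERR and the FAULT_* values are re-implemented here purely for illustration and are not the kernel definitions.

/* User-space model of the ERR_PTR error-code convention used above. */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define FAULT_OOM	1	/* illustrative stand-in for VM_FAULT_OOM */
#define FAULT_SIGBUS	2	/* illustrative stand-in for VM_FAULT_SIGBUS */

static void *ERR_PTR(long error)      { return (void *)error; }
static long  PTR_ERR(const void *ptr) { return (long)ptr; }
static int   IS_ERR(const void *ptr)  { return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO; }

/* Plays the role of alloc_huge_page(): report *why* allocation failed. */
static void *toy_alloc(int out_of_memory, int over_quota)
{
	static char page[4096];

	if (out_of_memory)
		return ERR_PTR(-ENOMEM);
	if (over_quota)
		return ERR_PTR(-ENOSPC);	/* reservation or cgroup limit hit */
	return page;
}

/* Plays the role of the fault handler: map the errno to a fault result. */
static int toy_fault(int out_of_memory, int over_quota)
{
	void *page = toy_alloc(out_of_memory, over_quota);

	if (IS_ERR(page))
		return PTR_ERR(page) == -ENOMEM ? FAULT_OOM : FAULT_SIGBUS;
	return 0;
}

int main(void)
{
	printf("success   -> %d\n", toy_fault(0, 0));	/* 0 */
	printf("no memory -> %d\n", toy_fault(1, 0));	/* FAULT_OOM */
	printf("no quota  -> %d\n", toy_fault(0, 1));	/* FAULT_SIGBUS */
	return 0;
}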
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+struct hugetlb_cgroup {
+	struct cgroup_subsys_state css;
+	/*
+	 * the counter to account for hugepages from hugetlb.
+	 */
+	struct res_counter hugepage[HUGE_MAX_HSTATE];
+};
+
+#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)	((val) & 0xffff)
+
+struct cgroup_subsys hugetlb_subsys __read_mostly;
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+	return container_of(s, struct hugetlb_cgroup, css);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+	return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
+							   hugetlb_subsys_id));
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
+{
+	return hugetlb_cgroup_from_css(task_subsys_state(task,
+							 hugetlb_subsys_id));
+}
+
+static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
+{
+	return (h_cg == root_h_cgroup);
+}
+
+static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
+{
+	if (!cg->parent)
+		return NULL;
+	return hugetlb_cgroup_from_cgroup(cg->parent);
+}
+
+static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
+{
+	int idx;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
+
+	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+		if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
+			return true;
+	}
+	return false;
+}
+
+static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
+{
+	int idx;
+	struct cgroup *parent_cgroup;
+	struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
+
+	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
+	if (!h_cgroup)
+		return ERR_PTR(-ENOMEM);
+
+	parent_cgroup = cgroup->parent;
+	if (parent_cgroup) {
+		parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
+		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+			res_counter_init(&h_cgroup->hugepage[idx],
+					 &parent_h_cgroup->hugepage[idx]);
+	} else {
+		root_h_cgroup = h_cgroup;
+		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+			res_counter_init(&h_cgroup->hugepage[idx], NULL);
+	}
+	return &h_cgroup->css;
+}
+
+static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
+{
+	struct hugetlb_cgroup *h_cgroup;
+
+	h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
+	kfree(h_cgroup);
+}
+
+
+/*
+ * Should be called with hugetlb_lock held.
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, So no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
+				       struct page *page)
+{
+	int csize;
+	struct res_counter *counter;
+	struct res_counter *fail_res;
+	struct hugetlb_cgroup *page_hcg;
+	struct hugetlb_cgroup *h_cg   = hugetlb_cgroup_from_cgroup(cgroup);
+	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
+
+	page_hcg = hugetlb_cgroup_from_page(page);
+	/*
+	 * We can have pages in active list without any cgroup
+	 * ie, hugepage with less than 3 pages. We can safely
+	 * ignore those pages.
+	 */
+	if (!page_hcg || page_hcg != h_cg)
+		goto out;
+
+	csize = PAGE_SIZE << compound_order(page);
+	if (!parent) {
+		parent = root_h_cgroup;
+		/* root has no limit */
+		res_counter_charge_nofail(&parent->hugepage[idx],
+					  csize, &fail_res);
+	}
+	counter = &h_cg->hugepage[idx];
+	res_counter_uncharge_until(counter, counter->parent, csize);
+
+	set_hugetlb_cgroup(page, parent);
+out:
+	return;
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ */
+static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+{
+	struct hstate *h;
+	struct page *page;
+	int ret = 0, idx = 0;
+
+	do {
+		if (cgroup_task_count(cgroup) ||
+		    !list_empty(&cgroup->children)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		for_each_hstate(h) {
+			spin_lock(&hugetlb_lock);
+			list_for_each_entry(page, &h->hugepage_activelist, lru)
+				hugetlb_cgroup_move_parent(idx, cgroup, page);
+
+			spin_unlock(&hugetlb_lock);
+			idx++;
+		}
+		cond_resched();
+	} while (hugetlb_cgroup_have_usage(cgroup));
+out:
+	return ret;
+}
+
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+				 struct hugetlb_cgroup **ptr)
+{
+	int ret = 0;
+	struct res_counter *fail_res;
+	struct hugetlb_cgroup *h_cg = NULL;
+	unsigned long csize = nr_pages * PAGE_SIZE;
+
+	if (hugetlb_cgroup_disabled())
+		goto done;
+	/*
+	 * We don't charge any cgroup if the compound page have less
+	 * than 3 pages.
+	 */
+	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+		goto done;
+again:
+	rcu_read_lock();
+	h_cg = hugetlb_cgroup_from_task(current);
+	if (!css_tryget(&h_cg->css)) {
+		rcu_read_unlock();
+		goto again;
+	}
+	rcu_read_unlock();
+
+	ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
+	css_put(&h_cg->css);
+done:
+	*ptr = h_cg;
+	return ret;
+}
+
+/* Should be called with hugetlb_lock held */
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+				  struct hugetlb_cgroup *h_cg,
+				  struct page *page)
+{
+	if (hugetlb_cgroup_disabled() || !h_cg)
+		return;
+
+	set_hugetlb_cgroup(page, h_cg);
+	return;
+}
+
+/*
+ * Should be called with hugetlb_lock held
+ */
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+				  struct page *page)
+{
+	struct hugetlb_cgroup *h_cg;
+	unsigned long csize = nr_pages * PAGE_SIZE;
+
+	if (hugetlb_cgroup_disabled())
+		return;
+	VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+	h_cg = hugetlb_cgroup_from_page(page);
+	if (unlikely(!h_cg))
+		return;
+	set_hugetlb_cgroup(page, NULL);
+	res_counter_uncharge(&h_cg->hugepage[idx], csize);
+	return;
+}
+
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+				    struct hugetlb_cgroup *h_cg)
+{
+	unsigned long csize = nr_pages * PAGE_SIZE;
+
+	if (hugetlb_cgroup_disabled() || !h_cg)
+		return;
+
+	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+		return;
+
+	res_counter_uncharge(&h_cg->hugepage[idx], csize);
+	return;
+}
+
+static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct file *file, char __user *buf,
+				   size_t nbytes, loff_t *ppos)
+{
+	u64 val;
+	char str[64];
+	int idx, name, len;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+	idx = MEMFILE_IDX(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+
+	val = res_counter_read_u64(&h_cg->hugepage[idx], name);
+	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
+				const char *buffer)
+{
+	int idx, name, ret;
+	unsigned long long val;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+	idx = MEMFILE_IDX(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+
+	switch (name) {
+	case RES_LIMIT:
+		if (hugetlb_cgroup_is_root(h_cg)) {
+			/* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
+		/* This function does all necessary parse...reuse it */
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
+{
+	int idx, name, ret = 0;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+	idx = MEMFILE_IDX(event);
+	name = MEMFILE_ATTR(event);
+
+	switch (name) {
+	case RES_MAX_USAGE:
+		res_counter_reset_max(&h_cg->hugepage[idx]);
+		break;
+	case RES_FAILCNT:
+		res_counter_reset_failcnt(&h_cg->hugepage[idx]);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static char *mem_fmt(char *buf, int size, unsigned long hsize)
+{
+	if (hsize >= (1UL << 30))
+		snprintf(buf, size, "%luGB", hsize >> 30);
+	else if (hsize >= (1UL << 20))
+		snprintf(buf, size, "%luMB", hsize >> 20);
+	else
+		snprintf(buf, size, "%luKB", hsize >> 10);
+	return buf;
+}
+
+int __init hugetlb_cgroup_file_init(int idx)
+{
+	char buf[32];
+	struct cftype *cft;
+	struct hstate *h = &hstates[idx];
+
+	/* format the size */
+	mem_fmt(buf, 32, huge_page_size(h));
+
+	/* Add the limit file */
+	cft = &h->cgroup_files[0];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+	cft->read = hugetlb_cgroup_read;
+	cft->write_string = hugetlb_cgroup_write;
+
+	/* Add the usage file */
+	cft = &h->cgroup_files[1];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+	cft->read = hugetlb_cgroup_read;
+
+	/* Add the MAX usage file */
+	cft = &h->cgroup_files[2];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
+	cft->trigger = hugetlb_cgroup_reset;
+	cft->read = hugetlb_cgroup_read;
+
+	/* Add the failcntfile */
+	cft = &h->cgroup_files[3];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
+	cft->private  = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+	cft->trigger  = hugetlb_cgroup_reset;
+	cft->read = hugetlb_cgroup_read;
+
+	/* NULL terminate the last cft */
+	cft = &h->cgroup_files[4];
+	memset(cft, 0, sizeof(*cft));
+
+	WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+
+	return 0;
+}
+
+/*
+ * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
+ * when we migrate hugepages
+ */
+void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+{
+	struct hugetlb_cgroup *h_cg;
+	struct hstate *h = page_hstate(oldhpage);
+
+	if (hugetlb_cgroup_disabled())
+		return;
+
+	VM_BUG_ON(!PageHuge(oldhpage));
+	spin_lock(&hugetlb_lock);
+	h_cg = hugetlb_cgroup_from_page(oldhpage);
+	set_hugetlb_cgroup(oldhpage, NULL);
+
+	/* move the h_cg details to new cgroup */
+	set_hugetlb_cgroup(newhpage, h_cg);
+	list_move(&newhpage->lru, &h->hugepage_activelist);
+	spin_unlock(&hugetlb_lock);
+	return;
+}
+
+struct cgroup_subsys hugetlb_subsys = {
+	.name = "hugetlb",
+	.create     = hugetlb_cgroup_create,
+	.pre_destroy = hugetlb_cgroup_pre_destroy,
+	.destroy    = hugetlb_cgroup_destroy,
+	.subsys_id  = hugetlb_subsys_id,
+};
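The new file stores both an hstate index and a res_counter member id in the single cft->private integer via the MEMFILE_* macros. The short stand-alone check below copies those three macros from the hunk above to show the packing; the RES_LIMIT value used here is only a placeholder for the illustration, not the kernel's enum value.

/* Stand-alone check of the MEMFILE_* packing used by hugetlb_cgroup.c. */
#include <assert.h>
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

#define RES_LIMIT 2	/* placeholder id for the illustration */

int main(void)
{
	int idx = 1;					/* e.g. the second hstate */
	int priv = MEMFILE_PRIVATE(idx, RES_LIMIT);	/* what gets stored in cft->private */

	assert(MEMFILE_IDX(priv) == idx);		/* upper 16 bits: hstate index */
	assert(MEMFILE_ATTR(priv) == RES_LIMIT);	/* lower 16 bits: resource id */
	printf("private=%#x -> idx=%d attr=%d\n",
	       priv, MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
	return 0;
}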
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
 	if (!dentry)
 		goto fail;
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
 	dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
 				    hwpoison_dir, &hwpoison_filter_memcg);
 	if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75b..3314f79d775a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,8 +118,14 @@ struct compact_control {
 	unsigned long nr_freepages;	/* Number of isolated free pages */
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
+	unsigned long start_free_pfn;	/* where we started the search */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
+	bool wrapped;			/* Order > 0 compactions are
+					   incremental, once free_pfn
+					   and migrate_pfn meet, we restart
+					   from the top of the zone;
+					   remember we wrapped around. */
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable;
 extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 5cc6731b00cc..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 	/* Try to find some space for it.
 	 *
 	 * WARNING: We assume that either slab_is_available() and we use it or
-	 * we use MEMBLOCK for allocations. That means that this is unsafe to use
-	 * when bootmem is currently active (unless bootmem itself is implemented
-	 * on top of MEMBLOCK which isn't the case yet)
+	 * we use MEMBLOCK for allocations. That means that this is unsafe to
+	 * use when bootmem is currently active (unless bootmem itself is
+	 * implemented on top of MEMBLOCK which isn't the case yet)
 	 *
 	 * This should however not be an issue for now, as we currently only
-	 * call into MEMBLOCK while it's still active, or much later when slab is
-	 * active for memory hotplug operations
+	 * call into MEMBLOCK while it's still active, or much later when slab
+	 * is active for memory hotplug operations
 	 */
 	if (use_slab) {
 		new_array = kmalloc(new_size, GFP_KERNEL);
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 						new_alloc_size, PAGE_SIZE);
 		if (!addr && new_area_size)
 			addr = memblock_find_in_range(0,
-					min(new_area_start, memblock.current_limit),
-					new_alloc_size, PAGE_SIZE);
+				min(new_area_start, memblock.current_limit),
+				new_alloc_size, PAGE_SIZE);
 
 		new_array = addr ? __va(addr) : 0;
 	}
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 		return -1;
 	}
 
-	memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
-			memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
+	memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
+			memblock_type_name(type), type->max * 2, (u64)addr,
+			(u64)addr + new_size - 1);
 
-	/* Found space, we now need to move the array over before
-	 * we add the reserved region since it may be our reserved
-	 * array itself that is full.
+	/*
+	 * Found space, we now need to move the array over before we add the
+	 * reserved region since it may be our reserved array itself that is
+	 * full.
 	 */
 	memcpy(new_array, type->regions, old_size);
 	memset(new_array + type->max, 0, old_size);
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 	type->regions = new_array;
 	type->max <<= 1;
 
-	/* Free old array. We needn't free it if the array is the
-	 * static one
-	 */
+	/* Free old array. We needn't free it if the array is the static one */
 	if (*in_slab)
 		kfree(old_array);
 	else if (old_array != memblock_memory_init_regions &&
 		 old_array != memblock_reserved_init_regions)
 		memblock_free(__pa(old_array), old_alloc_size);
 
-	/* Reserve the new array if that comes from the memblock.
-	 * Otherwise, we needn't do it
+	/*
+	 * Reserve the new array if that comes from the memblock. Otherwise, we
+	 * needn't do it
 	 */
 	if (!use_slab)
 		BUG_ON(memblock_reserve(addr, new_alloc_size));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f72b5e52451a..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
 
 /* for remember boot option*/
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
 static int really_do_swap_account __initdata = 1;
 #else
 static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
-	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+	MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
@@ -378,9 +378,7 @@ static bool move_file(void)
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-	MEM_CGROUP_CHARGE_TYPE_MAPPED,
-	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
-	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	MEM_CGROUP_CHARGE_TYPE_ANON,
 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 	NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+	return container_of(s, struct mem_cgroup, css);
+}
+
 /* Writing them here to avoid exposing memcg's inner layout */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
 #include <net/sock.h>
 #include <net/ip.h>
 
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
-#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#endif /* CONFIG_MEMCG_KMEM */
 
-#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
+	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
 }
 
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
-	return container_of(cgroup_subsys_state(cont,
-				mem_cgroup_subsys_id), struct mem_cgroup,
-				css);
+	return mem_cgroup_from_css(
+		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 	if (unlikely(!p))
 		return NULL;
 
-	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
-			    struct mem_cgroup, css);
+	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
 		if (css) {
 			if (css == &root->css || css_tryget(css))
-				memcg = container_of(css,
-						struct mem_cgroup, css);
+				memcg = mem_cgroup_from_css(css);
 		} else
 			id = 0;
 		rcu_read_unlock();
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{ 1459{
1459 u64 limit; 1460 u64 limit;
1460 u64 memsw; 1461 u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1470 return min(limit, memsw); 1471 return min(limit, memsw);
1471} 1472}
1472 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1475 unsigned long flags) 1543 unsigned long flags)
@@ -1899,7 +1967,7 @@ again:
1899 return; 1967 return;
1900 /* 1968 /*
1901 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1902 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1903 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1904 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1905 */ 1973 */
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1921 /* 1989 /*
1922 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1923 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1924 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1925 */ 1993 */
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927} 1995}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2268 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2269 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2270 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2271 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2272 */ 2340 */
2273 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2429 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css) 2498 if (!css)
2431 return NULL; 2499 return NULL;
2432 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2433} 2501}
2434 2502
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2473 bool anon; 2541 bool anon;
2474 2542
2475 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481 /* 2545 /*
2482 * we don't need page_cgroup_lock about tail pages, because they are not 2546 * we don't need page_cgroup_lock about tail pages, because they are not
2483 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2519 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2520 } 2584 }
2521 2585
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2523 anon = true; 2587 anon = true;
2524 else 2588 else
2525 anon = false; 2589 anon = false;
@@ -2644,8 +2708,7 @@ out:
2644 2708
2645static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2647 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2648 gfp_t gfp_mask)
2649{ 2712{
2650 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2651 unsigned int nr_pages; 2714 unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2728 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else { /* page is swapcache/shmem */
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763} 2795}
2764 2796
2765/* 2797/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2768 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2769 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2770 */ 2802 */
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page, 2804 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2774{ 2807{
2775 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2776 int ret; 2810 int ret;
2777 2811
2778 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785 /* 2813 /*
2786 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2787 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2788 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2789 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2790 */ 2819 */
2791 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2792 goto charge_cur_mm; 2823 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg) 2825 if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2800 ret = 0; 2831 ret = 0;
2801 return ret; 2832 return ret;
2802charge_cur_mm: 2833charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR) 2835 if (ret == -EINTR)
2807 ret = 0; 2836 ret = 0;
2808 return ret; 2837 return ret;
2809} 2838}
2810 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
2811static void 2872static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype) 2874 enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2843{ 2904{
2844 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2846} 2907}
2847 2908
2848void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2849{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2850 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2851 return; 2917 return 0;
2852 if (!memcg) 2918 if (PageCompound(page))
2853 return; 2919 return 0;
2854 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2855} 2930}
2856 2931
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
2911 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2912 */ 2987 */
2913static struct mem_cgroup * 2988static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2915{ 2991{
2916 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2921 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2922 return NULL; 2998 return NULL;
2923 2999
2924 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2925 return NULL;
2926 3001
2927 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2945 anon = PageAnon(page); 3020 anon = PageAnon(page);
2946 3021
2947 switch (ctype) { 3022 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2949 /* 3024 /*
2950 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2951 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2955 /* fallthrough */ 3030 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2959 goto unlock_out; 3043 goto unlock_out;
2960 break; 3044 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2989 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
2991 } 3075 }
2992 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994 3083
2995 return memcg; 3084 return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3005 if (page_mapped(page)) 3094 if (page_mapped(page))
3006 return; 3095 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3009} 3100}
3010 3101
3011void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3012{ 3103{
3013 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3016} 3107}
3017 3108
3018/* 3109/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3076 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078 3169
3079 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3080 3171
3081 /* 3172 /*
3082 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3087} 3178}
3088#endif 3179#endif
3089 3180
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3091/* 3182/*
3092 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3093 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3166 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3167 * page belongs to. 3258 * page belongs to.
3168 */ 3259 */
3169int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3170 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3171{ 3262{
3172 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3173 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3174 enum charge_type ctype; 3265 enum charge_type ctype;
3175 int ret = 0;
3176 3266
3177 *memcgp = NULL; 3267 *memcgp = NULL;
3178 3268
3179 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3180 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3181 return 0; 3271 return;
3182 3272
3183 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3184 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
3223 * we return here. 3313 * we return here.
3224 */ 3314 */
3225 if (!memcg) 3315 if (!memcg)
3226 return 0; 3316 return;
3227 3317
3228 *memcgp = memcg; 3318 *memcgp = memcg;
3229 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3230 css_put(&memcg->css);/* drop extra refcnt */
3231 if (ret) {
3232 if (PageAnon(page)) {
3233 lock_page_cgroup(pc);
3234 ClearPageCgroupMigration(pc);
3235 unlock_page_cgroup(pc);
3236 /*
3237 * The old page may be fully unmapped while we kept it.
3238 */
3239 mem_cgroup_uncharge_page(page);
3240 }
3241 /* we'll need to revisit this error code (we have -EINTR) */
3242 return -ENOMEM;
3243 }
3244 /* 3319 /*
3245 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3246 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3249 */ 3324 */
3250 if (PageAnon(page)) 3325 if (PageAnon(page))
3251 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3252 else if (page_is_file_cache(page))
3253 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3254 else 3327 else
3255 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329 /*
3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3256 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3257 return ret;
3258} 3335}
3259 3336
3260/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276 used = newpage; 3353 used = newpage;
3277 unused = oldpage; 3354 unused = oldpage;
3278 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3279 /* 3362 /*
3280 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3281 * of the page goes down to zero, temporarily. 3364 * of the page goes down to zero, temporarily.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3285 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3286 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3287 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3288 anon = PageAnon(used);
3289 __mem_cgroup_uncharge_common(unused,
3290 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3291 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3292 3371
3293 /* 3372 /*
3294 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3340 */ 3419 */
3341 if (!memcg) 3420 if (!memcg)
3342 return; 3421 return;
3343
3344 if (PageSwapBacked(oldpage))
3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3346
3347 /* 3422 /*
3348 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3349 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3418 /* 3493 /*
3419 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3420 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3421 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3422 */ 3497 */
3423 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3479 /* 3554 /*
3480 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3481 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3482 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3483 */ 3558 */
3484 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3611} 3686}
3612 3687
3613/* 3688/*
3614 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3615 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages page themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3616 */ 3693 */
3617static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3618 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3619{ 3696{
3620 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3622 struct list_head *list; 3699 struct list_head *list;
3623 struct page *busy; 3700 struct page *busy;
3624 struct zone *zone; 3701 struct zone *zone;
3625 int ret = 0;
3626 3702
3627 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3628 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3636 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3637 struct page *page; 3713 struct page *page;
3638 3714
3639 ret = 0;
3640 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3641 if (list_empty(list)) { 3716 if (list_empty(list)) {
3642 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3653 3728
3654 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3655 3730
3656 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3657 if (ret == -ENOMEM || ret == -EINTR)
3658 break;
3659
3660 if (ret == -EBUSY || ret == -EINVAL) {
3661 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3662 busy = page; 3733 busy = page;
3663 cond_resched(); 3734 cond_resched();
3664 } else 3735 } else
3665 busy = NULL; 3736 busy = NULL;
3666 } 3737 }
3667 3738 return !list_empty(list);
3668 if (!ret && !list_empty(list))
3669 return -EBUSY;
3670 return ret;
3671} 3739}
3672 3740
3673/* 3741/*
@@ -3692,9 +3760,6 @@ move_account:
3692 ret = -EBUSY; 3760 ret = -EBUSY;
3693 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3694 goto out; 3762 goto out;
3695 ret = -EINTR;
3696 if (signal_pending(current))
3697 goto out;
3698 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3699 lru_add_drain_all(); 3764 lru_add_drain_all();
3700 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
3715 } 3780 }
3716 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3717 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3718 /* it seems parent cgroup doesn't have enough mem */
3719 if (ret == -ENOMEM)
3720 goto try_to_free;
3721 cond_resched(); 3783 cond_resched();
3722 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3723 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3779 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3780 3842
3781 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3782 /* 3848 /*
3783 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3784 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3795 retval = -EBUSY; 3861 retval = -EBUSY;
3796 } else 3862 } else
3797 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3798 cgroup_unlock(); 3866 cgroup_unlock();
3799 3867
3800 return retval; 3868 return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832 3900
3833 if (swap) 3901 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3835 3903
3836 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3837} 3905}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4015#endif 4083#endif
4016 4084
4017#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m) 4087 struct seq_file *m)
4020{ 4088{
4021 int nid; 4089 int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075} 4143}
4076 4144
4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4078 struct seq_file *m) 4146 struct seq_file *m)
4079{ 4147{
4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4082 unsigned int i; 4150 unsigned int i;
4083 4151
4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4086 continue; 4154 continue;
4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4110 long long val = 0; 4178 long long val = 0;
4111 4179
4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4113 continue; 4181 continue;
4114 for_each_mem_cgroup_tree(mi, memcg) 4182 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4533 return 0; 4601 return 0;
4534} 4602}
4535 4603
4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4604#ifdef CONFIG_MEMCG_KMEM
4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4538{ 4606{
4539 return mem_cgroup_sockets_init(memcg, ss); 4607 return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
4588 }, 4656 },
4589 { 4657 {
4590 .name = "stat", 4658 .name = "stat",
4591 .read_seq_string = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4592 }, 4660 },
4593 { 4661 {
4594 .name = "force_empty", 4662 .name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
4620#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4621 { 4689 {
4622 .name = "numa_stat", 4690 .name = "numa_stat",
4623 .read_seq_string = mem_control_numa_stat_show, 4691 .read_seq_string = memcg_numa_stat_show,
4624 }, 4692 },
4625#endif 4693#endif
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4694#ifdef CONFIG_MEMCG_SWAP
4627 { 4695 {
4628 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4810} 4878}
4811EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4812 4880
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4814static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4815{ 4883{
4816 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5541 .__DEPRECATED_clear_css_refs = true, 5609 .__DEPRECATED_clear_css_refs = true,
5542}; 5610};
5543 5611
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5545static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5546{ 5614{
5547 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
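An editor-added caller sketch (hypothetical function, not in this patch) of the try/commit/cancel protocol that the swap-in charge split above keeps intact for the public entry points: the try step may pre-charge a memcg, and the caller must then either commit the page to it or cancel the pre-charge.

/* demo_charge_swapped_page() is illustrative only */
static int demo_charge_swapped_page(struct mm_struct *mm, struct page *page,
				    bool still_wanted)
{
	struct mem_cgroup *memcg;
	int ret;

	ret = mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg);
	if (ret)
		return ret;			/* charge refused */

	if (!still_wanted) {			/* e.g. a pte_same() re-check failed */
		mem_cgroup_cancel_charge_swapin(memcg);	/* undo the pre-charge */
		return -EAGAIN;
	}

	mem_cgroup_commit_charge_swapin(page, memcg);	/* page now accounted */
	return 0;
}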
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6de0d613bbe6..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1416 int ret; 1416 int ret;
1417 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1418 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1419 LIST_HEAD(pagelist);
1420 1419
1421 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1422 if (ret < 0) 1421 if (ret < 0)
@@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1431 } 1430 }
1432 1431
1433 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1434 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1435 list_add(&hpage->lru, &pagelist);
1436 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
1437 MIGRATE_SYNC); 1434 MIGRATE_SYNC);
1435 put_page(hpage);
1438 if (ret) { 1436 if (ret) {
1439 struct page *page1, *page2;
1440 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1441 put_page(page1);
1442
1443 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1444 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1445 if (ret > 0)
1446 ret = -EIO;
1447 return ret; 1439 return ret;
1448 } 1440 }
1449done: 1441done:
1450 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1451 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1452 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1453 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1454 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
diff --git a/mm/memory.c b/mm/memory.c
index 91f69459d3e8..482f089765ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1343 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1344 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1345 */ 1345 */
1346 if (vma->vm_file) 1346 if (vma->vm_file) {
1347 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1348 } else 1351 } else
1349 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1350 } 1353 }
@@ -3938,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3938 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3939 } 3942 }
3940 } 3943 }
3941 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3942} 3945}
3943 3946
3944#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
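An editor-added sketch (hypothetical helper, not in this patch) of the ordering the memory.c hunk above establishes for hugetlb VMAs: the final range teardown now runs under the mapping's i_mmap_mutex, so it cannot race with other users of page-table pages shared through the same hugetlbfs file.

static void demo_unmap_hugetlb_vma(struct mmu_gather *tlb,
				   struct vm_area_struct *vma,
				   unsigned long start, unsigned long end)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
	mutex_unlock(&mapping->i_mmap_mutex);
}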
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 427bb291dd0f..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
512 512
513 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
514 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
515 if (need_zonelists_rebuild) 515 if (onlined_pages) {
516 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
517 else 517 if (need_zonelists_rebuild)
518 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
519 522
520 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
521 524
522 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
523 526
524 if (onlined_pages) { 527 if (onlined_pages)
525 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
526 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
527 }
528 529
529 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
530 531
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
562 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
563 */ 564 */
564 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
565 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
566 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
567 568
568 return pgdat; 569 return pgdat;
@@ -965,6 +966,9 @@ repeat:
965 966
966 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
967 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
968 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
969 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
970 kswapd_stop(node); 974 kswapd_stop(node);
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56b..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
682{ 683{
683 int rc = -EAGAIN; 684 int rc = -EAGAIN;
684 int remap_swapcache = 1; 685 int remap_swapcache = 1;
685 int charge = 0;
686 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
687 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
688 688
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
724 } 724 }
725 725
726 /* charge against new page */ 726 /* charge against new page */
727 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
728 if (charge == -ENOMEM) {
729 rc = -ENOMEM;
730 goto unlock;
731 }
732 BUG_ON(charge);
733 728
734 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
735 /* 730 /*
@@ -819,8 +814,7 @@ skip_unmap:
819 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
820 815
821uncharge: 816uncharge:
822 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
823 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
824unlock: 818unlock:
825 unlock_page(page); 819 unlock_page(page);
826out: 820out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
931 925
932 if (anon_vma) 926 if (anon_vma)
933 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
934 unlock_page(hpage);
935 928
936out: 929 if (!rc)
937 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
938 list_del(&hpage->lru);
939 put_page(hpage);
940 }
941 931
932 unlock_page(hpage);
933out:
942 put_page(new_hpage); 934 put_page(new_hpage);
943
944 if (result) { 935 if (result) {
945 if (rc) 936 if (rc)
946 *result = rc; 937 *result = rc;
@@ -1016,48 +1007,32 @@ out:
1016 return nr_failed + retry; 1007 return nr_failed + retry;
1017} 1008}
1018 1009
1019int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1020 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1021 enum migrate_mode mode) 1012 enum migrate_mode mode)
1022{ 1013{
1023 int retry = 1; 1014 int pass, rc;
1024 int nr_failed = 0; 1015
1025 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1026 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1027 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1028 int rc; 1019 mode);
1029 1020 switch (rc) {
1030 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1031 retry = 0; 1022 goto out;
1032 1023 case -EAGAIN:
1033 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1034 cond_resched(); 1025 cond_resched();
1035 1026 break;
1036 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1037 private, page, pass > 2, offlining, 1028 goto out;
1038 mode); 1029 default:
1039 1030 rc = -EIO;
1040 switch(rc) { 1031 goto out;
1041 case -ENOMEM:
1042 goto out;
1043 case -EAGAIN:
1044 retry++;
1045 break;
1046 case 0:
1047 break;
1048 default:
1049 /* Permanent failure */
1050 nr_failed++;
1051 break;
1052 }
1053 } 1032 }
1054 } 1033 }
1055 rc = 0;
1056out: 1034out:
1057 if (rc) 1035 return rc;
1058 return rc;
1059
1060 return nr_failed + retry;
1061} 1036}
1062 1037
1063#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
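An editor-added caller sketch of the new single-page interface that replaces migrate_huge_pages(); it mirrors the soft-offline call site in the memory-failure.c hunk earlier. demo_migrate_one_hugepage() and its zero private argument are illustrative only.

static int demo_migrate_one_hugepage(struct page *hpage, new_page_t get_new_page)
{
	int ret;

	/* retries up to ten passes internally; 0 on success, negative on failure */
	ret = migrate_huge_page(hpage, get_new_page, 0, false, MIGRATE_SYNC);

	/* the caller keeps, and must drop, its own reference either way */
	put_page(hpage);
	return ret;
}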
diff --git a/mm/mmap.c b/mm/mmap.c
index 4fe2697339ed..e3e86914f11a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
943 const unsigned long stack_flags 943 const unsigned long stack_flags
944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
945 945
946 mm->total_vm += pages;
947
946 if (file) { 948 if (file) {
947 mm->shared_vm += pages; 949 mm->shared_vm += pages;
948 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
1347out: 1349out:
1348 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1349 1351
1350 mm->total_vm += len >> PAGE_SHIFT;
1351 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1352 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1353 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1707 return -ENOMEM; 1708 return -ENOMEM;
1708 1709
1709 /* Ok, everything looks good - let it rip */ 1710 /* Ok, everything looks good - let it rip */
1710 mm->total_vm += grow;
1711 if (vma->vm_flags & VM_LOCKED) 1711 if (vma->vm_flags & VM_LOCKED)
1712 mm->locked_vm += grow; 1712 mm->locked_vm += grow;
1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1889 1889
1890 if (vma->vm_flags & VM_ACCOUNT) 1890 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages; 1891 nr_accounted += nrpages;
1892 mm->total_vm -= nrpages;
1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1892 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1894 vma = remove_vma(vma); 1893 vma = remove_vma(vma);
1895 } while (vma); 1894 } while (vma);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
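An editor-added sketch of a minimal notifier user (not part of this patch) showing the contract the reordering above preserves: ->release may be invoked from exit_mmap() before the driver ever calls mmu_notifier_unregister(), so all secondary-MMU mappings must be torn down in ->release, and unregister remains safe afterwards. The demo_* names are illustrative.

#include <linux/mmu_notifier.h>

static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* invalidate every secondary-MMU mapping for @mm here; once this
	 * returns, the pages of @mm may be freed. */
}

static const struct mmu_notifier_ops demo_ops = {
	.release = demo_release,
};

static struct mmu_notifier demo_notifier = { .ops = &demo_ops };

static int demo_attach(struct mm_struct *mm)
{
	return mmu_notifier_register(&demo_notifier, mm);
}

static void demo_detach(struct mm_struct *mm)
{
	/* safe even if ->release already ran: unregister only unhashes the
	 * notifier and waits for running callbacks to finish. */
	mmu_notifier_unregister(&demo_notifier, mm);
}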
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf09..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98 98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR 99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone; 100 lruvec->zone = zone;
101#endif 101#endif
102} 102}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202ddad..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
497 goto out; 496 goto out;
498 } 497 }
499 498
500 mm->total_vm += pages;
501 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
502 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
503 mm->locked_vm += pages; 501 mm->locked_vm += pages;
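Editor's illustration of the accounting theme shared by the mmap.c and mremap.c hunks above: mm->total_vm maintenance moves into vm_stat_account() itself, so call sites no longer adjust it separately. Before/after of a representative call site (excerpt-style, not literal patch content):

	/* before this series: two updates per call site */
	mm->total_vm += len >> PAGE_SHIFT;
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);

	/* after: one call maintains total_vm and the per-type counters */
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);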
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac300c99baf6..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
288} 288}
289#endif 289#endif
290 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
291/* 337/*
292 * Simple selection loop. We choose the process with the highest 338 * Simple selection loop. We choose the process with the highest
293 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
294 * 340 *
295 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
296 */ 342 */
297static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
298 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
299 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
300{ 346{
301 struct task_struct *g, *p; 347 struct task_struct *g, *p;
302 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
303 unsigned long chosen_points = 0; 349 unsigned long chosen_points = 0;
304 350
351 rcu_read_lock();
305 do_each_thread(g, p) { 352 do_each_thread(g, p) {
306 unsigned int points; 353 unsigned int points;
307 354
308 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
309 continue; 356 force_kill)) {
310 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
311 continue; 358 chosen = p;
312 359 chosen_points = ULONG_MAX;
313 /* 360 /* fall through */
314 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
315 * being killed. Don't allow any other task access to the
316 * memory reserve.
317 *
318 * Note: this may have a chance of deadlock if it gets
319 * blocked waiting for another task which itself is waiting
320 * for memory. Is there a better alternative?
321 */
322 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
323 if (unlikely(frozen(p)))
324 __thaw_task(p);
325 if (!force_kill)
326 return ERR_PTR(-1UL);
327 }
328 if (!p->mm)
329 continue; 362 continue;
330 363 case OOM_SCAN_ABORT:
331 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
332 /* 365 return ERR_PTR(-1UL);
333 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
334 * releasing memory, we allow the "kill" to set 367 break;
335 * TIF_MEMDIE, which will allow it to gain access to 368 };
336 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
337 *
338 * The loop isn't broken here, however, in case other
339 * threads are found to have already been oom killed.
340 */
341 if (p == current) {
342 chosen = p;
343 chosen_points = ULONG_MAX;
344 } else if (!force_kill) {
345 /*
346 * If this task is not being ptraced on exit,
347 * then wait for it to finish before killing
348 * some other task unnecessarily.
349 */
350 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
351 return ERR_PTR(-1UL);
352 }
353 }
354
355 points = oom_badness(p, memcg, nodemask, totalpages);
356 if (points > chosen_points) { 370 if (points > chosen_points) {
357 chosen = p; 371 chosen = p;
358 chosen_points = points; 372 chosen_points = points;
359 } 373 }
360 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
361 378
362 *ppoints = chosen_points * 1000 / totalpages; 379 *ppoints = chosen_points * 1000 / totalpages;
363 return chosen; 380 return chosen;
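The selection loop above now runs under rcu_read_lock() instead of tasklist_lock, which is why the chosen task has to be pinned with get_task_struct() before the read-side section ends. A minimal sketch of that pattern, with a hypothetical some_predicate() standing in for the real oom_badness() scoring:

	/* Sketch only: some_predicate() is a made-up stand-in for oom_badness(). */
	static struct task_struct *pick_some_task(void)
	{
		struct task_struct *p, *found = NULL;

		rcu_read_lock();
		for_each_process(p) {
			if (some_predicate(p)) {
				found = p;
				get_task_struct(found);	/* pin before rcu_read_unlock() */
				break;
			}
		}
		rcu_read_unlock();

		return found;	/* caller must drop the reference with put_task_struct() */
	}

Without the get_task_struct() the task_struct could be freed as soon as rcu_read_unlock() returns.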
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
371 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
372 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
373 * are not shown. 390 * are not shown.
374 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
375 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
376 *
377 * Call with tasklist_lock read-locked.
378 */ 393 */
379static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
380{ 395{
381 struct task_struct *p; 396 struct task_struct *p;
382 struct task_struct *task; 397 struct task_struct *task;
383 398
384 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
385 for_each_process(p) { 401 for_each_process(p) {
386 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
387 continue; 403 continue;
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
396 continue; 412 continue;
397 } 413 }
398 414
399 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
400 task->pid, from_kuid(&init_user_ns, task_uid(task)), 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
401 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
402 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
403 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
404 task_unlock(task); 421 task_unlock(task);
405 } 422 }
423 rcu_read_unlock();
406} 424}
407 425
408static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
423} 441}
424 442
425#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
426static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
427 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
428 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
429 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
430{ 452{
431 struct task_struct *victim = p; 453 struct task_struct *victim = p;
432 struct task_struct *child; 454 struct task_struct *child;
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
442 */ 464 */
443 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
444 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
445 return; 468 return;
446 } 469 }
447 470
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
460 * still freeing memory. 483 * still freeing memory.
461 */ 484 */
485 read_lock(&tasklist_lock);
462 do { 486 do {
463 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points; 488 unsigned int child_points;
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
471 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
472 totalpages); 496 totalpages);
473 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
474 victim = child; 499 victim = child;
475 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
476 } 502 }
477 } 503 }
478 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
479 506
480 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
481 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
482 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
483 518
484 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
485 mm = victim->mm; 520 mm = victim->mm;
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
510 task_unlock(p); 545 task_unlock(p);
511 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
512 } 547 }
548 rcu_read_unlock();
513 549
514 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
515 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
516} 553}
517#undef K 554#undef K
518 555
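With the reference counting added above, the pairing between selection and kill becomes: select_bad_process() returns a task with a reference held (or ERR_PTR(-1UL) to abort), and oom_kill_process() releases that reference on every exit path. A condensed sketch of a caller, modelled on out_of_memory() further down rather than copied from it:

	static void example_oom(gfp_t gfp_mask, int order, unsigned long totalpages,
				const nodemask_t *mpol_mask, bool force_kill)
	{
		unsigned int points;
		struct task_struct *p;

		p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
		if (!p)				/* nothing killable at all */
			panic("Out of memory and no killable processes...\n");
		if (PTR_ERR(p) != -1UL)		/* -1UL means another kill is already in flight */
			oom_kill_process(p, gfp_mask, order, points, totalpages,
					 NULL, mpol_mask, "Out of memory");
		/* no put_task_struct() here: oom_kill_process() drops the reference */
	}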
519/* 556/*
520 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
521 */ 558 */
522static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
523 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
524{ 561{
525 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
526 return; 563 return;
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
533 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
534 return; 571 return;
535 } 572 }
536 read_lock(&tasklist_lock);
537 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
538 read_unlock(&tasklist_lock);
539 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
540 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
541} 576}
542 577
543#ifdef CONFIG_CGROUP_MEM_RES_CTLR
544void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
545 int order)
546{
547 unsigned long limit;
548 unsigned int points = 0;
549 struct task_struct *p;
550
551 /*
552 * If current has a pending SIGKILL, then automatically select it. The
553 * goal is to allow it to allocate so that it may quickly exit and free
554 * its memory.
555 */
556 if (fatal_signal_pending(current)) {
557 set_thread_flag(TIF_MEMDIE);
558 return;
559 }
560
561 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
562 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
563 read_lock(&tasklist_lock);
564 p = select_bad_process(&points, limit, memcg, NULL, false);
565 if (p && PTR_ERR(p) != -1UL)
566 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
567 "Memory cgroup out of memory");
568 read_unlock(&tasklist_lock);
569}
570#endif
571
572static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
573 579
574int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
690 struct task_struct *p; 696 struct task_struct *p;
691 unsigned long totalpages; 697 unsigned long totalpages;
692 unsigned long freed = 0; 698 unsigned long freed = 0;
693 unsigned int points; 699 unsigned int uninitialized_var(points);
694 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
695 int killed = 0; 701 int killed = 0;
696 702
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
718 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
719 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
720 726
721 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
722 if (sysctl_oom_kill_allocating_task &&
723 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
724 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
725 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
726 nodemask, 732 nodemask,
727 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
728 goto out; 734 goto out;
729 } 735 }
730 736
731 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
732 force_kill);
733 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
734 if (!p) { 739 if (!p) {
735 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
736 read_unlock(&tasklist_lock);
737 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
738 } 742 }
739 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
742 killed = 1; 746 killed = 1;
743 } 747 }
744out: 748out:
745 read_unlock(&tasklist_lock);
746
747 /* 749 /*
748 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
749 * retry to allocate memory unless "p" is current 751 * allocate memory again.
750 */ 752 */
751 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
752 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
753} 755}
754 756
755/* 757/*
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void)
764 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
765 clear_system_oom(); 767 clear_system_oom();
766 } 768 }
767 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
768 schedule_timeout_uninterruptible(1);
769} 770}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683f..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
219 218
220int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
221 220
222static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 227{
224 228
225 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 958 return pages_moved;
955} 959}
956 960
957static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 962 int migratetype)
959{ 963{
960 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1158 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1159 else 1163 else
1160 to_drain = pcp->count; 1164 to_drain = pcp->count;
1161 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1162 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1163 local_irq_restore(flags); 1169 local_irq_restore(flags);
1164} 1170}
1165#endif 1171#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1529} 1535}
1530__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1531 1537
1532static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1533{ 1539{
1534 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1535 return 0; 1541 return false;
1536 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1537 return 0; 1543 return false;
1538 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1539 return 0; 1545 return false;
1540 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1541 return 0; 1547 return false;
1542 1548
1543 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1544} 1550}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1578 1584
1579#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1580 1586
1581static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1582{ 1588{
1583 return 0; 1589 return false;
1584} 1590}
1585 1591
1586#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1594{ 1600{
1595 /* free_pages may go negative - that's OK */ 1601 /* free_pages may go negative - that's OK */
1596 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1597 int o; 1604 int o;
1598 1605
1599 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1602 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1603 min -= min / 4; 1610 min -= min / 4;
1604 1611
1605 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1606 return false; 1613 return false;
1607 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1608 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1617 return true; 1624 return true;
1618} 1625}
1619 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1620bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1621 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1622{ 1643{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1632 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1633 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1634 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE-type free pages, take them into
1658 * account. nr_zone_isolate_freepages is never accurate, so kswapd
1659 * might stay awake when it could sleep; for memory hotplug that is
1660 * preferable to sleeping, which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1635 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1636 free_pages); 1665 free_pages);
1637} 1666}
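The effect of the subtraction above can be illustrated with invented numbers: free pages that sit in isolated pageblocks cannot satisfy allocations, so they are removed from the count before the watermark comparison. A standalone C illustration (all values made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long free_pages = 5000;		/* NR_FREE_PAGES snapshot */
		unsigned long nr_pageblock_isolate = 2;		/* isolated pageblocks in the zone */
		unsigned long pageblock_nr_pages = 1024;	/* pages per pageblock */
		unsigned long mark = 4096;			/* watermark to test against */

		free_pages -= nr_pageblock_isolate * pageblock_nr_pages;
		printf("usable free pages: %lu (%s the %lu-page watermark)\n",
		       free_pages, free_pages > mark ? "above" : "below", mark);
		return 0;
	}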
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2087 2116
2088 page = get_page_from_freelist(gfp_mask, nodemask, 2117 page = get_page_from_freelist(gfp_mask, nodemask,
2089 order, zonelist, high_zoneidx, 2118 order, zonelist, high_zoneidx,
2090 alloc_flags, preferred_zone, 2119 alloc_flags & ~ALLOC_NO_WATERMARKS,
2091 migratetype); 2120 preferred_zone, migratetype);
2092 if (page) { 2121 if (page) {
2093 preferred_zone->compact_considered = 0; 2122 preferred_zone->compact_considered = 0;
2094 preferred_zone->compact_defer_shift = 0; 2123 preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2180retry: 2209retry:
2181 page = get_page_from_freelist(gfp_mask, nodemask, order, 2210 page = get_page_from_freelist(gfp_mask, nodemask, order,
2182 zonelist, high_zoneidx, 2211 zonelist, high_zoneidx,
2183 alloc_flags, preferred_zone, 2212 alloc_flags & ~ALLOC_NO_WATERMARKS,
2184 migratetype); 2213 preferred_zone, migratetype);
2185 2214
2186 /* 2215 /*
2187 * If an allocation failed after direct reclaim, it could be because 2216 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2265 alloc_flags |= ALLOC_HARDER; 2294 alloc_flags |= ALLOC_HARDER;
2266 2295
2267 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2268 if (!in_interrupt() && 2297 if (gfp_mask & __GFP_MEMALLOC)
2269 ((current->flags & PF_MEMALLOC) || 2298 alloc_flags |= ALLOC_NO_WATERMARKS;
2270 unlikely(test_thread_flag(TIF_MEMDIE)))) 2299 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2300 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 else if (!in_interrupt() &&
2302 ((current->flags & PF_MEMALLOC) ||
2303 unlikely(test_thread_flag(TIF_MEMDIE))))
2271 alloc_flags |= ALLOC_NO_WATERMARKS; 2304 alloc_flags |= ALLOC_NO_WATERMARKS;
2272 } 2305 }
2273 2306
2274 return alloc_flags; 2307 return alloc_flags;
2275} 2308}
2276 2309
2310bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2311{
2312 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2313}
2314
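gfp_pfmemalloc_allowed() gives other subsystems a way to ask whether a given allocation context would be entitled to the watermark-free reserves; the slab changes later in this patch use it to keep reserve-backed objects away from ordinary callers. A minimal, hypothetical consumer (not part of the patch):

	static void *hand_out_object(void *obj, bool obj_from_reserves, gfp_t flags)
	{
		/* Reserve-backed objects only go to contexts allowed to use the reserves. */
		if (obj_from_reserves && !gfp_pfmemalloc_allowed(flags))
			return NULL;	/* caller should fall back to a fresh allocation */
		return obj;
	}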
2277static inline struct page * 2315static inline struct page *
2278__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2316__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2279 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
2340 2378
2341 /* Allocate without watermarks if the context allows */ 2379 /* Allocate without watermarks if the context allows */
2342 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2380 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2381 /*
2382 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2383 * the allocation is high priority and these types of
2384 * allocations are system rather than user oriented
2385 */
2386 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2387
2343 page = __alloc_pages_high_priority(gfp_mask, order, 2388 page = __alloc_pages_high_priority(gfp_mask, order,
2344 zonelist, high_zoneidx, nodemask, 2389 zonelist, high_zoneidx, nodemask,
2345 preferred_zone, migratetype); 2390 preferred_zone, migratetype);
2346 if (page) 2391 if (page) {
2392 /*
2393 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2394 * necessary to allocate the page. The expectation is
2395 * that the caller is taking steps that will free more
2396 * memory. The caller should avoid the page being used
2397 * for !PFMEMALLOC purposes.
2398 */
2399 page->pfmemalloc = true;
2347 goto got_pg; 2400 goto got_pg;
2401 }
2348 } 2402 }
2349 2403
2350 /* Atomic allocations - we can't balance anything */ 2404 /* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
2463got_pg: 2517got_pg:
2464 if (kmemcheck_enabled) 2518 if (kmemcheck_enabled)
2465 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2519 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2466 return page;
2467 2520
2521 return page;
2468} 2522}
2469 2523
2470/* 2524/*
@@ -2515,6 +2569,8 @@ retry_cpuset:
2515 page = __alloc_pages_slowpath(gfp_mask, order, 2569 page = __alloc_pages_slowpath(gfp_mask, order,
2516 zonelist, high_zoneidx, nodemask, 2570 zonelist, high_zoneidx, nodemask,
2517 preferred_zone, migratetype); 2571 preferred_zone, migratetype);
2572 else
2573 page->pfmemalloc = false;
2518 2574
2519 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2575 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2520 2576
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3030 user_zonelist_order = oldval; 3086 user_zonelist_order = oldval;
3031 } else if (oldval != user_zonelist_order) { 3087 } else if (oldval != user_zonelist_order) {
3032 mutex_lock(&zonelists_mutex); 3088 mutex_lock(&zonelists_mutex);
3033 build_all_zonelists(NULL); 3089 build_all_zonelists(NULL, NULL);
3034 mutex_unlock(&zonelists_mutex); 3090 mutex_unlock(&zonelists_mutex);
3035 } 3091 }
3036 } 3092 }
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
3409DEFINE_MUTEX(zonelists_mutex); 3465DEFINE_MUTEX(zonelists_mutex);
3410 3466
3411/* return values int ....just for stop_machine() */ 3467/* return values int ....just for stop_machine() */
3412static __init_refok int __build_all_zonelists(void *data) 3468static int __build_all_zonelists(void *data)
3413{ 3469{
3414 int nid; 3470 int nid;
3415 int cpu; 3471 int cpu;
3472 pg_data_t *self = data;
3416 3473
3417#ifdef CONFIG_NUMA 3474#ifdef CONFIG_NUMA
3418 memset(node_load, 0, sizeof(node_load)); 3475 memset(node_load, 0, sizeof(node_load));
3419#endif 3476#endif
3477
3478 if (self && !node_online(self->node_id)) {
3479 build_zonelists(self);
3480 build_zonelist_cache(self);
3481 }
3482
3420 for_each_online_node(nid) { 3483 for_each_online_node(nid) {
3421 pg_data_t *pgdat = NODE_DATA(nid); 3484 pg_data_t *pgdat = NODE_DATA(nid);
3422 3485
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
3461 * Called with zonelists_mutex held always 3524 * Called with zonelists_mutex held always
3462 * unless system_state == SYSTEM_BOOTING. 3525 * unless system_state == SYSTEM_BOOTING.
3463 */ 3526 */
3464void __ref build_all_zonelists(void *data) 3527void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3465{ 3528{
3466 set_zonelist_order(); 3529 set_zonelist_order();
3467 3530
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
3473 /* we have to stop all cpus to guarantee there is no user 3536 /* we have to stop all cpus to guarantee there is no user
3474 of zonelist */ 3537 of zonelist */
3475#ifdef CONFIG_MEMORY_HOTPLUG 3538#ifdef CONFIG_MEMORY_HOTPLUG
3476 if (data) 3539 if (zone)
3477 setup_zone_pageset((struct zone *)data); 3540 setup_zone_pageset(zone);
3478#endif 3541#endif
3479 stop_machine(__build_all_zonelists, NULL, NULL); 3542 stop_machine(__build_all_zonelists, pgdat, NULL);
3480 /* cpuset refresh routine should be here */ 3543 /* cpuset refresh routine should be here */
3481 } 3544 }
3482 vm_total_pages = nr_free_pagecache_pages(); 3545 vm_total_pages = nr_free_pagecache_pages();
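The signature change replaces the old void * cookie with an explicit (pgdat, zone) pair: at boot both are NULL, while a memory hot-add path can pass the node being brought online, so its zonelists are built before the node is marked online, plus the zone whose per-cpu pagesets need setting up. A rough sketch of the hot-add call shape; the real call sites live elsewhere (e.g. memory_hotplug.c):

	static void example_bring_node_online(pg_data_t *pgdat, struct zone *zone)
	{
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(pgdat, zone);	/* builds zonelists for the not-yet-online node */
		mutex_unlock(&zonelists_mutex);
	}

Boot-time callers keep the old behaviour by passing build_all_zonelists(NULL, NULL).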
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3746 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3809 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3747#endif 3810#endif
3748 3811
3749static int zone_batchsize(struct zone *zone) 3812static int __meminit zone_batchsize(struct zone *zone)
3750{ 3813{
3751#ifdef CONFIG_MMU 3814#ifdef CONFIG_MMU
3752 int batch; 3815 int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3828 pcp->batch = PAGE_SHIFT * 8; 3891 pcp->batch = PAGE_SHIFT * 8;
3829} 3892}
3830 3893
3831static void setup_zone_pageset(struct zone *zone) 3894static void __meminit setup_zone_pageset(struct zone *zone)
3832{ 3895{
3833 int cpu; 3896 int cpu;
3834 3897
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3901 return 0; 3964 return 0;
3902} 3965}
3903 3966
3904static int __zone_pcp_update(void *data)
3905{
3906 struct zone *zone = data;
3907 int cpu;
3908 unsigned long batch = zone_batchsize(zone), flags;
3909
3910 for_each_possible_cpu(cpu) {
3911 struct per_cpu_pageset *pset;
3912 struct per_cpu_pages *pcp;
3913
3914 pset = per_cpu_ptr(zone->pageset, cpu);
3915 pcp = &pset->pcp;
3916
3917 local_irq_save(flags);
3918 free_pcppages_bulk(zone, pcp->count, pcp);
3919 setup_pageset(pset, batch);
3920 local_irq_restore(flags);
3921 }
3922 return 0;
3923}
3924
3925void zone_pcp_update(struct zone *zone)
3926{
3927 stop_machine(__zone_pcp_update, zone, NULL);
3928}
3929
3930static __meminit void zone_pcp_init(struct zone *zone) 3967static __meminit void zone_pcp_init(struct zone *zone)
3931{ 3968{
3932 /* 3969 /*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3942 zone_batchsize(zone)); 3979 zone_batchsize(zone));
3943} 3980}
3944 3981
3945__meminit int init_currently_empty_zone(struct zone *zone, 3982int __meminit init_currently_empty_zone(struct zone *zone,
3946 unsigned long zone_start_pfn, 3983 unsigned long zone_start_pfn,
3947 unsigned long size, 3984 unsigned long size,
3948 enum memmap_context context) 3985 enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4338#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4339
4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4340/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4304static inline void __init set_pageblock_order(void) 4341void __init set_pageblock_order(void)
4305{ 4342{
4306 unsigned int order; 4343 unsigned int order;
4307 4344
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4366 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4330 * the kernel config 4367 * the kernel config
4331 */ 4368 */
4332static inline void set_pageblock_order(void) 4369void __init set_pageblock_order(void)
4333{ 4370{
4334} 4371}
4335 4372
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
4340 * - mark all pages reserved 4377 * - mark all pages reserved
4341 * - mark all memory queues empty 4378 * - mark all memory queues empty
4342 * - clear the memory bitmaps 4379 * - clear the memory bitmaps
4380 *
4381 * NOTE: pgdat should get zeroed by caller.
4343 */ 4382 */
4344static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4383static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4345 unsigned long *zones_size, unsigned long *zholes_size) 4384 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4350 int ret; 4389 int ret;
4351 4390
4352 pgdat_resize_init(pgdat); 4391 pgdat_resize_init(pgdat);
4353 pgdat->nr_zones = 0;
4354 init_waitqueue_head(&pgdat->kswapd_wait); 4392 init_waitqueue_head(&pgdat->kswapd_wait);
4355 pgdat->kswapd_max_order = 0; 4393 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4356 pgdat_page_cgroup_init(pgdat); 4394 pgdat_page_cgroup_init(pgdat);
4357 4395
4358 for (j = 0; j < MAX_NR_ZONES; j++) { 4396 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4394 4432
4395 zone->spanned_pages = size; 4433 zone->spanned_pages = size;
4396 zone->present_pages = realsize; 4434 zone->present_pages = realsize;
4435#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4436 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4437 zone->spanned_pages;
4438 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4439#endif
4397#ifdef CONFIG_NUMA 4440#ifdef CONFIG_NUMA
4398 zone->node = nid; 4441 zone->node = nid;
4399 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4442 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4408 4451
4409 zone_pcp_init(zone); 4452 zone_pcp_init(zone);
4410 lruvec_init(&zone->lruvec, zone); 4453 lruvec_init(&zone->lruvec, zone);
4411 zap_zone_vm_stats(zone);
4412 zone->flags = 0;
4413 if (!size) 4454 if (!size)
4414 continue; 4455 continue;
4415 4456
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4469{ 4510{
4470 pg_data_t *pgdat = NODE_DATA(nid); 4511 pg_data_t *pgdat = NODE_DATA(nid);
4471 4512
4513 /* pg_data_t should be reset to zero when it's allocated */
4514 WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
4515
4472 pgdat->node_id = nid; 4516 pgdat->node_id = nid;
4473 pgdat->node_start_pfn = node_start_pfn; 4517 pgdat->node_start_pfn = node_start_pfn;
4474 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4518 calculate_node_totalpages(pgdat, zones_size, zholes_size);
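The WARN_ON documents a new contract: since free_area_init_core() no longer clears individual fields, the architecture must hand over a fully zeroed pg_data_t. A sketch of what a node-setup path is now expected to look like; the allocation itself is architecture-specific and only hinted at here:

	void __init example_setup_node(int nid, unsigned long *zones_size,
				       unsigned long node_start_pfn,
				       unsigned long *zholes_size)
	{
		/* however NODE_DATA(nid) was allocated, it must arrive zeroed */
		memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

		free_area_init_node(nid, zones_size, node_start_pfn, zholes_size);
	}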
@@ -4750,7 +4794,7 @@ out:
4750} 4794}
4751 4795
4752/* Any regular memory on that node ? */ 4796/* Any regular memory on that node ? */
4753static void check_for_regular_memory(pg_data_t *pgdat) 4797static void __init check_for_regular_memory(pg_data_t *pgdat)
4754{ 4798{
4755#ifdef CONFIG_HIGHMEM 4799#ifdef CONFIG_HIGHMEM
4756 enum zone_type zone_type; 4800 enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5468} 5512}
5469 5513
5470/* 5514/*
5471 * This is designed as sub function...plz see page_isolation.c also. 5515 * This function checks whether the pageblock includes unmovable pages or not.
5472 * set/clear page block's type to be ISOLATE. 5516 * If @count is not zero, it is okay to include fewer than @count unmovable pages.
5473 * page allocater never alloc memory from ISOLATE block. 5517 *
5518 * A PageLRU check without isolation or the lru_lock could race, so a
5519 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5520 * expect this function to be exact.
5474 */ 5521 */
5475 5522bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5476static int
5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5478{ 5523{
5479 unsigned long pfn, iter, found; 5524 unsigned long pfn, iter, found;
5480 int mt; 5525 int mt;
5481 5526
5482 /* 5527 /*
5483 * To avoid noisy data, lru_add_drain_all() should be called first. 5528 * To avoid noisy data, lru_add_drain_all() should be called first.
5484 * If ZONE_MOVABLE, the zone never contains immobile pages 5529 * If ZONE_MOVABLE, the zone never contains unmovable pages
5485 */ 5530 */
5486 if (zone_idx(zone) == ZONE_MOVABLE) 5531 if (zone_idx(zone) == ZONE_MOVABLE)
5487 return true; 5532 return false;
5488 mt = get_pageblock_migratetype(page); 5533 mt = get_pageblock_migratetype(page);
5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5534 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5490 return true; 5535 return false;
5491 5536
5492 pfn = page_to_pfn(page); 5537 pfn = page_to_pfn(page);
5493 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5538 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5497 continue; 5542 continue;
5498 5543
5499 page = pfn_to_page(check); 5544 page = pfn_to_page(check);
5500 if (!page_count(page)) { 5545 /*
5546 * We can't use page_count() without pinning the page
5547 * because another CPU can free the compound page.
5548 * This check already skips compound tails of THP
5549 * because their page->_count is zero at all times.
5550 */
5551 if (!atomic_read(&page->_count)) {
5501 if (PageBuddy(page)) 5552 if (PageBuddy(page))
5502 iter += (1 << page_order(page)) - 1; 5553 iter += (1 << page_order(page)) - 1;
5503 continue; 5554 continue;
5504 } 5555 }
5556
5505 if (!PageLRU(page)) 5557 if (!PageLRU(page))
5506 found++; 5558 found++;
5507 /* 5559 /*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5518 * page at boot. 5570 * page at boot.
5519 */ 5571 */
5520 if (found > count) 5572 if (found > count)
5521 return false; 5573 return true;
5522 } 5574 }
5523 return true; 5575 return false;
5524} 5576}
5525 5577
5526bool is_pageblock_removable_nolock(struct page *page) 5578bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5544 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5596 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5545 return false; 5597 return false;
5546 5598
5547 return __count_immobile_pages(zone, page, 0); 5599 return !has_unmovable_pages(zone, page, 0);
5548}
5549
5550int set_migratetype_isolate(struct page *page)
5551{
5552 struct zone *zone;
5553 unsigned long flags, pfn;
5554 struct memory_isolate_notify arg;
5555 int notifier_ret;
5556 int ret = -EBUSY;
5557
5558 zone = page_zone(page);
5559
5560 spin_lock_irqsave(&zone->lock, flags);
5561
5562 pfn = page_to_pfn(page);
5563 arg.start_pfn = pfn;
5564 arg.nr_pages = pageblock_nr_pages;
5565 arg.pages_found = 0;
5566
5567 /*
5568 * It may be possible to isolate a pageblock even if the
5569 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5570 * notifier chain is used by balloon drivers to return the
5571 * number of pages in a range that are held by the balloon
5572 * driver to shrink memory. If all the pages are accounted for
5573 * by balloons, are free, or on the LRU, isolation can continue.
5574 * Later, for example, when memory hotplug notifier runs, these
5575 * pages reported as "can be isolated" should be isolated(freed)
5576 * by the balloon driver through the memory notifier chain.
5577 */
5578 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5579 notifier_ret = notifier_to_errno(notifier_ret);
5580 if (notifier_ret)
5581 goto out;
5582 /*
5583 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5584 * We just check MOVABLE pages.
5585 */
5586 if (__count_immobile_pages(zone, page, arg.pages_found))
5587 ret = 0;
5588
5589 /*
5590 * immobile means "not-on-lru" paes. If immobile is larger than
5591 * removable-by-driver pages reported by notifier, we'll fail.
5592 */
5593
5594out:
5595 if (!ret) {
5596 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5597 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5598 }
5599
5600 spin_unlock_irqrestore(&zone->lock, flags);
5601 if (!ret)
5602 drain_all_pages();
5603 return ret;
5604}
5605
5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5607{
5608 struct zone *zone;
5609 unsigned long flags;
5610 zone = page_zone(page);
5611 spin_lock_irqsave(&zone->lock, flags);
5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5613 goto out;
5614 set_pageblock_migratetype(page, migratetype);
5615 move_freepages_block(zone, page, migratetype);
5616out:
5617 spin_unlock_irqrestore(&zone->lock, flags);
5618} 5600}
5619 5601
5620#ifdef CONFIG_CMA 5602#ifdef CONFIG_CMA
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
5869} 5851}
5870#endif 5852#endif
5871 5853
5854#ifdef CONFIG_MEMORY_HOTPLUG
5855static int __meminit __zone_pcp_update(void *data)
5856{
5857 struct zone *zone = data;
5858 int cpu;
5859 unsigned long batch = zone_batchsize(zone), flags;
5860
5861 for_each_possible_cpu(cpu) {
5862 struct per_cpu_pageset *pset;
5863 struct per_cpu_pages *pcp;
5864
5865 pset = per_cpu_ptr(zone->pageset, cpu);
5866 pcp = &pset->pcp;
5867
5868 local_irq_save(flags);
5869 if (pcp->count > 0)
5870 free_pcppages_bulk(zone, pcp->count, pcp);
5871 setup_pageset(pset, batch);
5872 local_irq_restore(flags);
5873 }
5874 return 0;
5875}
5876
5877void __meminit zone_pcp_update(struct zone *zone)
5878{
5879 stop_machine(__zone_pcp_update, zone, NULL);
5880}
5881#endif
5882
5872#ifdef CONFIG_MEMORY_HOTREMOVE 5883#ifdef CONFIG_MEMORY_HOTREMOVE
5884void zone_pcp_reset(struct zone *zone)
5885{
5886 unsigned long flags;
5887
5888 /* avoid races with drain_pages() */
5889 local_irq_save(flags);
5890 if (zone->pageset != &boot_pageset) {
5891 free_percpu(zone->pageset);
5892 zone->pageset = &boot_pageset;
5893 }
5894 local_irq_restore(flags);
5895}
5896
5873/* 5897/*
5874 * All pages in the range must be isolated before calling this. 5898 * All pages in the range must be isolated before calling this.
5875 */ 5899 */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index eb750f851395..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
diff --git a/mm/page_io.c b/mm/page_io.c
index 34f02923744c..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,6 +17,7 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
21#include <linux/frontswap.h> 22#include <linux/frontswap.h>
22#include <asm/pgtable.h> 23#include <asm/pgtable.h>
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
86 bio_put(bio); 87 bio_put(bio);
87} 88}
88 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
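generic_swapfile_activate() packages the old bmap-based extent walk so that filesystems keeping swapfiles as ordinary block-mapped files can reuse it. Assuming the ->swap_activate address_space operation introduced elsewhere in this series, a filesystem with no special requirements could simply delegate to it (hypothetical example, not from the patch):

	static int examplefs_swap_activate(struct swap_info_struct *sis,
					   struct file *file, sector_t *span)
	{
		/* no special setup needed: map extents straight from the block map */
		return generic_swapfile_activate(sis, file, span);
	}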
89/* 182/*
90 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
91 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
94{ 187{
95 struct bio *bio; 188 struct bio *bio;
96 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
97 191
98 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
99 unlock_page(page); 193 unlock_page(page);
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
105 end_page_writeback(page); 199 end_page_writeback(page);
106 goto out; 200 goto out;
107 } 201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
109 if (bio == NULL) { 230 if (bio == NULL) {
110 set_page_dirty(page); 231 set_page_dirty(page);
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page)
126{ 247{
127 struct bio *bio; 248 struct bio *bio;
128 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
129 251
130 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
131 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page)
134 unlock_page(page); 256 unlock_page(page);
135 goto out; 257 goto out;
136 } 258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
138 if (bio == NULL) { 271 if (bio == NULL) {
139 unlock_page(page); 272 unlock_page(page);
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page)
145out: 278out:
146 return ret; 279 return ret;
147} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b8..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
73 * "Immobile" means not-on-LRU pages. If there are more immobile pages
74 * than removable-by-driver pages reported by the notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
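The two static helpers above keep zone->nr_pageblock_isolate in step with the migratetype, which is what nr_zone_isolate_freepages() in page_alloc.c relies on. A condensed sketch of the intended round trip for a single pageblock; the real users operate on pfn ranges:

	static int example_take_pageblock_offline(struct page *page)
	{
		if (set_migratetype_isolate(page))
			return -EBUSY;		/* unmovable pages or notifier veto */

		/* ... migrate or offline the pages in this block ... */

		unset_migratetype_isolate(page, MIGRATE_MOVABLE);
		return 0;
	}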
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
diff --git a/mm/shmem.c b/mm/shmem.c
index c15b998e5a86..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
929 929
930 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
931 pvma.vm_start = 0; 931 pvma.vm_start = 0;
932 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
933 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
934 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
935 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
942 943
943 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
944 pvma.vm_start = 0; 945 pvma.vm_start = 0;
945 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
946 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
947 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
948 950
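The interleave node for a tmpfs page is derived, roughly, from the pseudo-vma's vm_pgoff modulo the number of allowed nodes (see offset_il_node() in mempolicy.c), so without the bias every file starts allocating on the same node. With invented inode numbers and a 4-node policy, the bias rotates each file's starting node:

	#include <stdio.h>

	int main(void)
	{
		/* Invented values: 4 allowed nodes, two tmpfs files. */
		unsigned long nr_nodes = 4;
		unsigned long ino_a = 100, ino_b = 101;
		unsigned long index;

		for (index = 0; index < 4; index++)
			printf("page %lu: file A -> node %lu, file B -> node %lu\n",
			       index,
			       (index + ino_a) % nr_nodes,	/* biased vm_pgoff */
			       (index + ino_b) % nr_nodes);
		return 0;
	}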
diff --git a/mm/slab.c b/mm/slab.c
index 1fcf3ac94b6c..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -118,12 +118,16 @@
118#include <linux/memory.h> 118#include <linux/memory.h>
119#include <linux/prefetch.h> 119#include <linux/prefetch.h>
120 120
121#include <net/sock.h>
122
121#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
122#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
123#include <asm/page.h> 125#include <asm/page.h>
124 126
125#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
126 128
129#include "internal.h"
130
127/* 131/*
128 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
129 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -152,6 +156,12 @@
152#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
153#endif 157#endif
154 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
155/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
156#if DEBUG 166#if DEBUG
157# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -257,9 +267,30 @@ struct array_cache {
257 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
258 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
259 * the entries. 269 * the entries.
270 *
271 * Entries should not be dereferenced directly, as
272 * entries belonging to slabs marked pfmemalloc will
273 * have the low bit SLAB_OBJ_PFMEMALLOC set.
260 */ 274 */
261}; 275};
262 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
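The helpers above borrow the low bit of the object pointer as a flag, which works because slab objects are always aligned to more than one byte. A standalone userspace analogue of the same trick (names invented):

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define OBJ_TAG 1UL	/* plays the role of SLAB_OBJ_PFMEMALLOC */

	static int   is_tagged(void *p) { return (uintptr_t)p & OBJ_TAG; }
	static void *tag(void *p)       { return (void *)((uintptr_t)p | OBJ_TAG); }
	static void *untag(void *p)     { return (void *)((uintptr_t)p & ~OBJ_TAG); }

	int main(void)
	{
		void *obj = malloc(64);		/* malloc returns suitably aligned memory */
		void *stored = tag(obj);	/* remember "came from reserves" in bit 0 */

		assert(is_tagged(stored));
		assert(untag(stored) == obj);	/* original pointer recovered intact */

		free(obj);
		return 0;
	}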
263/* 294/*
264 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
265 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
@@ -900,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
900 return nc; 931 return nc;
901} 932}
902 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
941/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
903/* 1052/*
904 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
905 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
@@ -1076,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1076 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1077 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1078 } 1227 }
1079 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1080 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1081 } else { 1230 } else {
1082 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1759,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1759 return NULL; 1908 return NULL;
1760 } 1909 }
1761 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1762 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1763 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1764 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1766,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1766 else 1919 else
1767 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1768 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1769 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1770 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1771 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1772 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1773 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1774 1931
@@ -1800,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1800 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1801 while (i--) { 1958 while (i--) {
1802 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1803 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1804 page++; 1962 page++;
1805 } 1963 }
@@ -3015,16 +3173,19 @@ bad:
3015#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3016#endif 3174#endif
3017 3175
3018static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3019{ 3178{
3020 int batchcount; 3179 int batchcount;
3021 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3022 struct array_cache *ac; 3181 struct array_cache *ac;
3023 int node; 3182 int node;
3024 3183
3025retry:
3026 check_irq_off(); 3184 check_irq_off();
3027 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3028 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3029 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3030 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3074,8 +3235,8 @@ retry:
3074 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3075 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3076 3237
3077 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3078 node); 3239 node));
3079 } 3240 }
3080 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3081 3242
@@ -3094,18 +3255,22 @@ alloc_done:
3094 3255
3095 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3096 int x; 3257 int x;
3258force_grow:
3097 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3098 3260
3099 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3100 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3101 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263
3264 /* no objects in sight? abort */
3265 if (!x && (ac->avail == 0 || force_refill))
3102 return NULL; 3266 return NULL;
3103 3267
3104 if (!ac->avail) /* objects refilled by interrupt? */ 3268 if (!ac->avail) /* objects refilled by interrupt? */
3105 goto retry; 3269 goto retry;
3106 } 3270 }
3107 ac->touched = 1; 3271 ac->touched = 1;
3108 return ac->entry[--ac->avail]; 3272
3273 return ac_get_obj(cachep, ac, flags, force_refill);
3109} 3274}
3110 3275
3111static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3276static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3187,23 +3352,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3187{ 3352{
3188 void *objp; 3353 void *objp;
3189 struct array_cache *ac; 3354 struct array_cache *ac;
3355 bool force_refill = false;
3190 3356
3191 check_irq_off(); 3357 check_irq_off();
3192 3358
3193 ac = cpu_cache_get(cachep); 3359 ac = cpu_cache_get(cachep);
3194 if (likely(ac->avail)) { 3360 if (likely(ac->avail)) {
3195 STATS_INC_ALLOCHIT(cachep);
3196 ac->touched = 1; 3361 ac->touched = 1;
3197 objp = ac->entry[--ac->avail]; 3362 objp = ac_get_obj(cachep, ac, flags, false);
3198 } else { 3363
3199 STATS_INC_ALLOCMISS(cachep);
3200 objp = cache_alloc_refill(cachep, flags);
3201 /* 3364 /*
3202 * the 'ac' may be updated by cache_alloc_refill(), 3365 * Allow for the possibility all avail objects are not allowed
3203 * and kmemleak_erase() requires its correct value. 3366 * by the current flags
3204 */ 3367 */
3205 ac = cpu_cache_get(cachep); 3368 if (objp) {
3369 STATS_INC_ALLOCHIT(cachep);
3370 goto out;
3371 }
3372 force_refill = true;
3206 } 3373 }
3374
3375 STATS_INC_ALLOCMISS(cachep);
3376 objp = cache_alloc_refill(cachep, flags, force_refill);
3377 /*
3378 * the 'ac' may be updated by cache_alloc_refill(),
3379 * and kmemleak_erase() requires its correct value.
3380 */
3381 ac = cpu_cache_get(cachep);
3382
3383out:
3207 /* 3384 /*
3208 * To avoid a false negative, if an object that is in one of the 3385 * To avoid a false negative, if an object that is in one of the
3209 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3386 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3525,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3525 struct kmem_list3 *l3; 3702 struct kmem_list3 *l3;
3526 3703
3527 for (i = 0; i < nr_objects; i++) { 3704 for (i = 0; i < nr_objects; i++) {
3528 void *objp = objpp[i]; 3705 void *objp;
3529 struct slab *slabp; 3706 struct slab *slabp;
3530 3707
3708 clear_obj_pfmemalloc(&objpp[i]);
3709 objp = objpp[i];
3710
3531 slabp = virt_to_slab(objp); 3711 slabp = virt_to_slab(objp);
3532 l3 = cachep->nodelists[node]; 3712 l3 = cachep->nodelists[node];
3533 list_del(&slabp->list); 3713 list_del(&slabp->list);
@@ -3645,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3645 cache_flusharray(cachep, ac); 3825 cache_flusharray(cachep, ac);
3646 } 3826 }
3647 3827
3648 ac->entry[ac->avail++] = objp; 3828 ac_put_obj(cachep, ac, objp);
3649} 3829}
3650 3830
3651/** 3831/**
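
The slab.c hunks above track objects taken from PFMEMALLOC (emergency-reserve) slabs by tagging bit 0 of the object pointer in set_obj_pfmemalloc()/clear_obj_pfmemalloc(), which is safe because slab objects are at least word-aligned. A minimal userspace sketch of that pointer-tagging round trip, not part of the patch and using a stand-in flag value:

#include <assert.h>
#include <stdio.h>

/* Stand-in for SLAB_OBJ_PFMEMALLOC: bit 0 of the pointer is free
 * to carry a flag because slab objects are at least word-aligned. */
#define OBJ_PFMEMALLOC 0x1UL

static void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | OBJ_PFMEMALLOC);
}

static void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~OBJ_PFMEMALLOC);
}

static int is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & OBJ_PFMEMALLOC;
}

int main(void)
{
	long storage;			/* stands in for a slab object */
	void *obj = &storage;

	set_obj_pfmemalloc(&obj);
	assert(is_obj_pfmemalloc(obj));
	clear_obj_pfmemalloc(&obj);
	assert(obj == (void *)&storage);	/* original pointer restored */
	printf("pointer-tag round trip ok\n");
	return 0;
}
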
diff --git a/mm/slub.c b/mm/slub.c
index e517d435e5dc..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,8 @@
34 34
35#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
36 36
37#include "internal.h"
38
37/* 39/*
38 * Lock order: 40 * Lock order:
39 * 1. slab_mutex (Global Mutex) 41 * 1. slab_mutex (Global Mutex)
@@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1354 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1355 page->slab = s; 1357 page->slab = s;
1356 __SetPageSlab(page); 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1357 1361
1358 start = page_address(page); 1362 start = page_address(page);
1359 1363
@@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1397 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1398 -pages); 1402 -pages);
1399 1403
1404 __ClearPageSlabPfmemalloc(page);
1400 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1401 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1402 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2126 return freelist; 2131 return freelist;
2127} 2132}
2128 2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2140}
2141
2129/* 2142/*
2130 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist 2143 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
2131 * or deactivate the page. 2144 * or deactivate the page.
@@ -2206,6 +2219,18 @@ redo:
2206 goto new_slab; 2219 goto new_slab;
2207 } 2220 }
2208 2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2231 goto new_slab;
2232 }
2233
2209 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2210 freelist = c->freelist; 2235 freelist = c->freelist;
2211 if (freelist) 2236 if (freelist)
@@ -2256,11 +2281,11 @@ new_slab:
2256 } 2281 }
2257 2282
2258 page = c->page; 2283 page = c->page;
2259 if (likely(!kmem_cache_debug(s))) 2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2260 goto load_freelist; 2285 goto load_freelist;
2261 2286
2262 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2263 if (!alloc_debug_processing(s, page, freelist, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2264 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2265 2290
2266 deactivate_slab(s, page, get_freepointer(s, freelist)); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
@@ -2313,7 +2338,6 @@ redo:
2313 object = c->freelist; 2338 object = c->freelist;
2314 page = c->page; 2339 page = c->page;
2315 if (unlikely(!object || !node_match(page, node))) 2340 if (unlikely(!object || !node_match(page, node)))
2316
2317 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2318 2342
2319 else { 2343 else {
diff --git a/mm/sparse.c b/mm/sparse.c
index c7bb952400c8..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
@@ -493,6 +481,9 @@ void __init sparse_init(void)
493 struct page **map_map; 481 struct page **map_map;
494#endif 482#endif
495 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
496 /* 487 /*
497 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
498 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec67078..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
236} 236}
237EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
238 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
248 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
274 * Must have space for one struct page pointer.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
239static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
241 void *arg) 293 void *arg)
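
A hedged sketch of how a caller might use the get_kernel_page() helper introduced above to pin the page behind a kernel address for I/O. do_io_on_kernel_buffer() and its error handling are illustrative only, and the <linux/mm.h> declaration is assumed from this series; only get_kernel_page() and put_page() are real kernel APIs here.

#include <linux/errno.h>
#include <linux/mm.h>

/* Illustrative only: pin the page backing a kernel address, use it for
 * I/O, then drop the reference taken by get_kernel_page(). */
static int do_io_on_kernel_buffer(unsigned long addr)
{
	struct page *page;
	int ret;

	ret = get_kernel_page(addr, 0 /* write: currently ignored */, &page);
	if (ret < 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... submit I/O against 'page' here ... */

	put_page(page);		/* release the pin */
	return 0;
}
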
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d9..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_no_writeback, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
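
The swapin_readahead() change above brackets the read-ahead loop with a block plug so the queued reads can be merged before they are dispatched. A minimal sketch of that plugging pattern; submit_cluster_read() is a hypothetical stand-in for read_swap_cache_async():

#include <linux/blkdev.h>

/* Hypothetical helper standing in for read_swap_cache_async(). */
static void submit_cluster_read(unsigned long offset)
{
	/* would kick off one async swap-cache read here */
}

static void read_cluster_plugged(unsigned long start, unsigned long end)
{
	struct blk_plug plug;
	unsigned long offset;

	blk_start_plug(&plug);		/* start batching block requests */
	for (offset = start; offset <= end; offset++)
		submit_cluster_read(offset);
	blk_finish_plug(&plug);		/* flush the whole batch at once */
}
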
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71373d03fcee..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h> 34#include <linux/frontswap.h>
35#include <linux/swapfile.h> 35#include <linux/swapfile.h>
36#include <linux/export.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
548 549
549 /* free if no reference */ 550 /* free if no reference */
550 if (!usage) { 551 if (!usage) {
551 struct gendisk *disk = p->bdev->bd_disk;
552 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset; 553 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
559 nr_swap_pages++; 559 nr_swap_pages++;
560 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 561 frontswap_invalidate_page(p->type, offset);
562 if ((p->flags & SWP_BLKDEV) && 562 if (p->flags & SWP_BLKDEV) {
563 disk->fops->swap_slot_free_notify) 563 struct gendisk *disk = p->bdev->bd_disk;
564 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
565 } 568 }
566 569
567 return usage; 570 return usage;
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
832 835
833 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
834 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
835 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
836 mem_cgroup_cancel_charge_swapin(memcg);
837 ret = 0; 839 ret = 0;
838 goto out; 840 goto out;
839 } 841 }
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1328 list_del(&se->list); 1330 list_del(&se->list);
1329 kfree(se); 1331 kfree(se);
1330 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1331} 1341}
1332 1342
1333/* 1343/*
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1336 * 1346 *
1337 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1338 */ 1348 */
1339static int 1349int
1340add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1341 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1342{ 1352{
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1409 */ 1419 */
1410static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1411{ 1421{
1412 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1413 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1414 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1415 unsigned blkbits;
1416 sector_t probe_block;
1417 sector_t last_block;
1418 sector_t lowest_block = -1;
1419 sector_t highest_block = 0;
1420 int nr_extents = 0;
1421 int ret; 1425 int ret;
1422 1426
1423 inode = sis->swap_file->f_mapping->host;
1424 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1425 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1426 *span = sis->pages; 1429 *span = sis->pages;
1427 goto out; 1430 return ret;
1428 } 1431 }
1429 1432
1430 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1431 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1432 1435 if (!ret) {
1433 /* 1436 sis->flags |= SWP_FILE;
1434 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1435 * to be very smart. 1438 *span = sis->pages;
1436 */
1437 probe_block = 0;
1438 page_no = 0;
1439 last_block = i_size_read(inode) >> blkbits;
1440 while ((probe_block + blocks_per_page) <= last_block &&
1441 page_no < sis->max) {
1442 unsigned block_in_page;
1443 sector_t first_block;
1444
1445 first_block = bmap(inode, probe_block);
1446 if (first_block == 0)
1447 goto bad_bmap;
1448
1449 /*
1450 * It must be PAGE_SIZE aligned on-disk
1451 */
1452 if (first_block & (blocks_per_page - 1)) {
1453 probe_block++;
1454 goto reprobe;
1455 }
1456
1457 for (block_in_page = 1; block_in_page < blocks_per_page;
1458 block_in_page++) {
1459 sector_t block;
1460
1461 block = bmap(inode, probe_block + block_in_page);
1462 if (block == 0)
1463 goto bad_bmap;
1464 if (block != first_block + block_in_page) {
1465 /* Discontiguity */
1466 probe_block++;
1467 goto reprobe;
1468 }
1469 }
1470
1471 first_block >>= (PAGE_SHIFT - blkbits);
1472 if (page_no) { /* exclude the header page */
1473 if (first_block < lowest_block)
1474 lowest_block = first_block;
1475 if (first_block > highest_block)
1476 highest_block = first_block;
1477 } 1439 }
1440 return ret;
1441 }
1478 1442
1479 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1480 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1481 */
1482 ret = add_swap_extent(sis, page_no, 1, first_block);
1483 if (ret < 0)
1484 goto out;
1485 nr_extents += ret;
1486 page_no++;
1487 probe_block += blocks_per_page;
1488reprobe:
1489 continue;
1490 }
1491 ret = nr_extents;
1492 *span = 1 + highest_block - lowest_block;
1493 if (page_no == 0)
1494 page_no = 1; /* force Empty message */
1495 sis->max = page_no;
1496 sis->pages = page_no - 1;
1497 sis->highest_bit = page_no - 1;
1498out:
1499 return ret;
1500bad_bmap:
1501 printk(KERN_ERR "swapon: swapfile has holes\n");
1502 ret = -EINVAL;
1503 goto out;
1504} 1444}
1505 1445
1506static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2285 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2286} 2226}
2287 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2288/* 2253/*
2289 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2290 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
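
With the rewritten setup_swap_extents() above, a swap file on a filesystem that provides the new swap_activate/swap_deactivate address_space operations bypasses the old bmap() walk entirely. A hedged sketch of how a filesystem might wire up those hooks; the example_* functions are hypothetical stubs, and only the two operation names and their signatures come from this series.

#include <linux/fs.h>
#include <linux/swap.h>

static int example_swap_activate(struct swap_info_struct *sis,
				 struct file *swap_file, sector_t *span)
{
	/* Map the file's blocks (e.g. via add_swap_extent()), set *span,
	 * and return 0 so swapfile.c marks the area SWP_FILE. */
	return 0;
}

static void example_swap_deactivate(struct file *swap_file)
{
	/* Undo whatever example_swap_activate() set up. */
}

static const struct address_space_operations example_swap_aops = {
	.swap_activate	 = example_swap_activate,
	.swap_deactivate = example_swap_deactivate,
};
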
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e03f4c7307a5..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what the caller wants, since
910 * get_order(0) returns a funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 347b3ff2a478..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
135 135
136#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
138{ 138{
139 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
687 687
688 cond_resched(); 688 cond_resched();
689 689
690 mem_cgroup_uncharge_start();
690 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
691 enum page_references references; 692 enum page_references references;
692 struct address_space *mapping; 693 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
721 722
722 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
723 nr_writeback++; 724 /*
724 unlock_page(page); 725 * memcg doesn't have any dirty pages throttling so we
725 goto keep; 726 * could easily OOM just because too many pages are in
727 * writeback and there is nothing else to reclaim.
728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
737 * Worryingly, ext4 gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
740 */
741 if (global_reclaim(sc) ||
742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
743 /*
744 * This is slightly racy - end_page_writeback()
745 * might have just cleared PageReclaim, then
746 * setting PageReclaim here ends up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
757 }
758 wait_on_page_writeback(page);
726 } 759 }
727 760
728 references = page_check_references(page, sc); 761 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
921 954
922 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
923 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
924 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
925 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
926 return nr_reclaimed; 960 return nr_reclaimed;
@@ -2112,6 +2146,83 @@ out:
2112 return 0; 2146 return 0;
2113} 2147}
2114 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
2115unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2116 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2117{ 2228{
@@ -2131,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2131 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2132 }; 2243 };
2133 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2134 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2135 sc.may_writepage, 2255 sc.may_writepage,
2136 gfp_mask); 2256 gfp_mask);
@@ -2142,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2142 return nr_reclaimed; 2262 return nr_reclaimed;
2143} 2263}
2144 2264
2145#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2146 2266
2147unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2148 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2275,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2275 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2276} 2396}
2277 2397
2278/* is kswapd sleeping prematurely? */ 2398/*
2279static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2280 int classzone_idx) 2405 int classzone_idx)
2281{ 2406{
2282 int i; 2407 int i;
@@ -2285,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2285 2410
2286 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2287 if (remaining) 2412 if (remaining)
2288 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
2418 * processes get throttled, kswapd wakes, and a large process exits, thereby
2419 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2289 2428
2290 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2291 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2318 * must be balanced 2457 * must be balanced
2319 */ 2458 */
2320 if (order) 2459 if (order)
2321 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2322 else 2461 else
2323 return !all_zones_ok; 2462 return all_zones_ok;
2324} 2463}
2325 2464
2326/* 2465/*
@@ -2546,6 +2685,16 @@ loop_again:
2546 } 2685 }
2547 2686
2548 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
2690 * to be throttled on pfmemalloc_wait as they should be able
2691 * to safely make forward progress. Wake them.
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2549 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2550 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2551 /* 2700 /*
@@ -2647,7 +2796,7 @@ out:
2647 } 2796 }
2648 2797
2649 /* 2798 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2651 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2667,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668 2817
2669 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2677 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
2679 */ 2828 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682 2831
2683 /* 2832 /*
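
pfmemalloc_watermark_ok() above throttles direct reclaimers once the free pages in the zones up to ZONE_NORMAL drop below half of their summed min watermarks. A standalone sketch of just that arithmetic, with made-up numbers:

#include <stdbool.h>
#include <stdio.h>

struct zone_sample {
	unsigned long min_wmark;	/* min watermark, in pages */
	unsigned long free_pages;	/* NR_FREE_PAGES, in pages */
};

/* Same test as pfmemalloc_watermark_ok(): free pages must exceed half
 * of the summed min watermarks of the lower zones. */
static bool watermark_ok(const struct zone_sample *zones, int nr)
{
	unsigned long reserve = 0, nr_free = 0;
	int i;

	for (i = 0; i < nr; i++) {	/* zones up to ZONE_NORMAL */
		reserve += zones[i].min_wmark;
		nr_free += zones[i].free_pages;
	}
	return nr_free > reserve / 2;
}

int main(void)
{
	struct zone_sample zones[] = { { 1024, 700 }, { 2048, 400 } };

	printf("throttle direct reclaim: %s\n",
	       watermark_ok(zones, 2) ? "no" : "yes");
	return 0;
}
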
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776ad..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
745 TEXTS_FOR_ZONES("pgsteal_direct") 745 TEXTS_FOR_ZONES("pgsteal_direct")
746 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
747 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
748 749
749#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
750 "zone_reclaim_failed", 751 "zone_reclaim_failed",