Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      |   2
-rw-r--r--  mm/compaction.c       |   3
-rw-r--r--  mm/filemap.c          |   2
-rw-r--r--  mm/huge_memory.c      |  10
-rw-r--r--  mm/hugetlb.c          | 447
-rw-r--r--  mm/hwpoison-inject.c  |   4
-rw-r--r--  mm/internal.h         |   2
-rw-r--r--  mm/kmemleak.c         |   2
-rw-r--r--  mm/ksm.c              |   6
-rw-r--r--  mm/madvise.c          |  33
-rw-r--r--  mm/memblock.c         |  18
-rw-r--r--  mm/memcontrol.c       |  17
-rw-r--r--  mm/memory-failure.c   | 174
-rw-r--r--  mm/memory.c           |  41
-rw-r--r--  mm/memory_hotplug.c   | 116
-rw-r--r--  mm/mempolicy.c        | 116
-rw-r--r--  mm/mempool.c          |   2
-rw-r--r--  mm/migrate.c          |  63
-rw-r--r--  mm/mlock.c            | 316
-rw-r--r--  mm/mmap.c             |  59
-rw-r--r--  mm/mremap.c           |   5
-rw-r--r--  mm/page-writeback.c   | 269
-rw-r--r--  mm/page_alloc.c       | 308
-rw-r--r--  mm/page_isolation.c   |  14
-rw-r--r--  mm/pgtable-generic.c  |  24
-rw-r--r--  mm/readahead.c        |   8
-rw-r--r--  mm/shmem.c            |   6
-rw-r--r--  mm/slub.c             |   8
-rw-r--r--  mm/sparse.c           | 133
-rw-r--r--  mm/swap.c             |  77
-rw-r--r--  mm/swap_state.c       |   4
-rw-r--r--  mm/swapfile.c         | 596
-rw-r--r--  mm/util.c             |   5
-rw-r--r--  mm/vmalloc.c          |  29
-rw-r--r--  mm/vmscan.c           |  80
-rw-r--r--  mm/vmstat.c           |  95
-rw-r--r--  mm/zbud.c             |   4
-rw-r--r--  mm/zswap.c            |  18
38 files changed, 2116 insertions(+), 1000 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 37d9edcd14cf..ce682f7a4f29 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
652{ 652{
653 char kbuf[] = "0\n"; 653 char kbuf[] = "0\n";
654 654
655 if (*ppos) { 655 if (*ppos || *lenp < sizeof(kbuf)) {
656 *lenp = 0; 656 *lenp = 0;
657 return 0; 657 return 0;
658 } 658 }
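
The new check above refuses to emit the canned "0\n" reply when the caller's buffer cannot hold it. A standalone sketch of that guard follows (emit_fixed_reply and its buffer handling are illustrative only, not the procfs interface):

#include <stdio.h>
#include <string.h>

/*
 * Illustrative stand-in for a proc handler's output step: copy a fixed
 * reply only when the destination buffer is large enough, otherwise
 * report zero bytes written.
 */
static size_t emit_fixed_reply(char *dst, size_t dstlen)
{
        static const char kbuf[] = "0\n";

        if (dstlen < sizeof(kbuf))
                return 0;               /* too small: write nothing */
        memcpy(dst, kbuf, sizeof(kbuf));
        return sizeof(kbuf) - 1;        /* payload bytes, excluding the NUL */
}

int main(void)
{
        char small[2], big[8];

        printf("small buffer: %zu bytes written\n",
               emit_fixed_reply(small, sizeof(small)));
        printf("big buffer:   %zu bytes written\n",
               emit_fixed_reply(big, sizeof(big)));
        return 0;
}
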
diff --git a/mm/compaction.c b/mm/compaction.c
index 05ccb4cc0bdb..c43789388cd8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order)
1131 .sync = false, 1131 .sync = false,
1132 }; 1132 };
1133 1133
1134 if (!order)
1135 return;
1136
1134 __compact_pgdat(pgdat, &cc); 1137 __compact_pgdat(pgdat, &cc);
1135} 1138}
1136 1139
diff --git a/mm/filemap.c b/mm/filemap.c
index 731a2c24532d..e607728db4a8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
469 if (error) 469 if (error)
470 goto out; 470 goto out;
471 471
472 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 472 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
473 if (error == 0) { 473 if (error == 0) {
474 page_cache_get(page); 474 page_cache_get(page);
475 page->mapping = mapping; 475 page->mapping = mapping;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d94f7dee3997..d66010e0049d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -422,7 +422,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
422 unsigned long msecs; 422 unsigned long msecs;
423 int err; 423 int err;
424 424
425 err = strict_strtoul(buf, 10, &msecs); 425 err = kstrtoul(buf, 10, &msecs);
426 if (err || msecs > UINT_MAX) 426 if (err || msecs > UINT_MAX)
427 return -EINVAL; 427 return -EINVAL;
428 428
@@ -449,7 +449,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
449 unsigned long msecs; 449 unsigned long msecs;
450 int err; 450 int err;
451 451
452 err = strict_strtoul(buf, 10, &msecs); 452 err = kstrtoul(buf, 10, &msecs);
453 if (err || msecs > UINT_MAX) 453 if (err || msecs > UINT_MAX)
454 return -EINVAL; 454 return -EINVAL;
455 455
@@ -475,7 +475,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
475 int err; 475 int err;
476 unsigned long pages; 476 unsigned long pages;
477 477
478 err = strict_strtoul(buf, 10, &pages); 478 err = kstrtoul(buf, 10, &pages);
479 if (err || !pages || pages > UINT_MAX) 479 if (err || !pages || pages > UINT_MAX)
480 return -EINVAL; 480 return -EINVAL;
481 481
@@ -543,7 +543,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
543 int err; 543 int err;
544 unsigned long max_ptes_none; 544 unsigned long max_ptes_none;
545 545
546 err = strict_strtoul(buf, 10, &max_ptes_none); 546 err = kstrtoul(buf, 10, &max_ptes_none);
547 if (err || max_ptes_none > HPAGE_PMD_NR-1) 547 if (err || max_ptes_none > HPAGE_PMD_NR-1)
548 return -EINVAL; 548 return -EINVAL;
549 549
@@ -2301,6 +2301,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2301 goto out; 2301 goto out;
2302 2302
2303 vma = find_vma(mm, address); 2303 vma = find_vma(mm, address);
2304 if (!vma)
2305 goto out;
2304 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2306 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2305 hend = vma->vm_end & HPAGE_PMD_MASK; 2307 hend = vma->vm_end & HPAGE_PMD_MASK;
2306 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2308 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
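
The strict_strtoul() calls in this file are switched to kstrtoul(). A userspace approximation of the semantics being relied on, built on strtoul(), is sketched below; parse_ulong is an invented name, and kstrtoul itself is stricter still about leading signs and whitespace:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Userspace approximation of kstrtoul(): parse the whole string as an
 * unsigned long, tolerate one trailing newline, reject anything else.
 * Returns 0 on success or a negative errno-style value.
 */
static int parse_ulong(const char *s, unsigned int base, unsigned long *res)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (errno == ERANGE)
                return -ERANGE;
        if (end == s)
                return -EINVAL;         /* no digits at all */
        if (*end == '\n')
                end++;                  /* sysfs writes usually end in '\n' */
        if (*end != '\0')
                return -EINVAL;         /* trailing junk */
        *res = val;
        return 0;
}

int main(void)
{
        unsigned long msecs;

        if (parse_ulong("10000\n", 10, &msecs) == 0 && msecs <= UINT_MAX)
                printf("parsed %lu\n", msecs);
        return 0;
}
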
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b60f33080a28..b49579c7f2a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
21#include <linux/rmap.h> 21#include <linux/rmap.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/page-isolation.h>
24 25
25#include <asm/page.h> 26#include <asm/page.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -33,7 +34,6 @@
33#include "internal.h" 34#include "internal.h"
34 35
35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
36static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
37unsigned long hugepages_treat_as_movable; 37unsigned long hugepages_treat_as_movable;
38 38
39int hugetlb_max_hstate __read_mostly; 39int hugetlb_max_hstate __read_mostly;
@@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages;
48static unsigned long __initdata default_hstate_size; 48static unsigned long __initdata default_hstate_size;
49 49
50/* 50/*
51 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 51 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
52 * free_huge_pages, and surplus_huge_pages.
52 */ 53 */
53DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
54 55
@@ -135,9 +136,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
135 * across the pages in a mapping. 136 * across the pages in a mapping.
136 * 137 *
137 * The region data structures are protected by a combination of the mmap_sem 138 * The region data structures are protected by a combination of the mmap_sem
138 * and the hugetlb_instantion_mutex. To access or modify a region the caller 139 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
139 * must either hold the mmap_sem for write, or the mmap_sem for read and 140 * must either hold the mmap_sem for write, or the mmap_sem for read and
140 * the hugetlb_instantiation mutex: 141 * the hugetlb_instantiation_mutex:
141 * 142 *
142 * down_write(&mm->mmap_sem); 143 * down_write(&mm->mmap_sem);
143 * or 144 * or
@@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
434 return (get_vma_private_data(vma) & flag) != 0; 435 return (get_vma_private_data(vma) & flag) != 0;
435} 436}
436 437
437/* Decrement the reserved pages in the hugepage pool by one */
438static void decrement_hugepage_resv_vma(struct hstate *h,
439 struct vm_area_struct *vma)
440{
441 if (vma->vm_flags & VM_NORESERVE)
442 return;
443
444 if (vma->vm_flags & VM_MAYSHARE) {
445 /* Shared mappings always use reserves */
446 h->resv_huge_pages--;
447 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
448 /*
449 * Only the process that called mmap() has reserves for
450 * private mappings.
451 */
452 h->resv_huge_pages--;
453 }
454}
455
456/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 438/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
457void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 439void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
458{ 440{
@@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
462} 444}
463 445
464/* Returns true if the VMA has associated reserve pages */ 446/* Returns true if the VMA has associated reserve pages */
465static int vma_has_reserves(struct vm_area_struct *vma) 447static int vma_has_reserves(struct vm_area_struct *vma, long chg)
466{ 448{
449 if (vma->vm_flags & VM_NORESERVE) {
450 /*
451 * This address is already reserved by other process(chg == 0),
452 * so, we should decrement reserved count. Without decrementing,
453 * reserve count remains after releasing inode, because this
454 * allocated page will go into page cache and is regarded as
455 * coming from reserved pool in releasing step. Currently, we
456 * don't have any other solution to deal with this situation
457 * properly, so add work-around here.
458 */
459 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
460 return 1;
461 else
462 return 0;
463 }
464
465 /* Shared mappings always use reserves */
467 if (vma->vm_flags & VM_MAYSHARE) 466 if (vma->vm_flags & VM_MAYSHARE)
468 return 1; 467 return 1;
468
469 /*
470 * Only the process that called mmap() has reserves for
471 * private mappings.
472 */
469 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 473 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
470 return 1; 474 return 1;
475
471 return 0; 476 return 0;
472} 477}
473 478
@@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
517{ 522{
518 struct page *page; 523 struct page *page;
519 524
520 if (list_empty(&h->hugepage_freelists[nid])) 525 list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
526 if (!is_migrate_isolate_page(page))
527 break;
528 /*
529 * if 'non-isolated free hugepage' not found on the list,
530 * the allocation fails.
531 */
532 if (&h->hugepage_freelists[nid] == &page->lru)
521 return NULL; 533 return NULL;
522 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
523 list_move(&page->lru, &h->hugepage_activelist); 534 list_move(&page->lru, &h->hugepage_activelist);
524 set_page_refcounted(page); 535 set_page_refcounted(page);
525 h->free_huge_pages--; 536 h->free_huge_pages--;
@@ -527,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
527 return page; 538 return page;
528} 539}
529 540
541/* Movability of hugepages depends on migration support. */
542static inline gfp_t htlb_alloc_mask(struct hstate *h)
543{
544 if (hugepages_treat_as_movable || hugepage_migration_support(h))
545 return GFP_HIGHUSER_MOVABLE;
546 else
547 return GFP_HIGHUSER;
548}
549
530static struct page *dequeue_huge_page_vma(struct hstate *h, 550static struct page *dequeue_huge_page_vma(struct hstate *h,
531 struct vm_area_struct *vma, 551 struct vm_area_struct *vma,
532 unsigned long address, int avoid_reserve) 552 unsigned long address, int avoid_reserve,
553 long chg)
533{ 554{
534 struct page *page = NULL; 555 struct page *page = NULL;
535 struct mempolicy *mpol; 556 struct mempolicy *mpol;
@@ -539,16 +560,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
539 struct zoneref *z; 560 struct zoneref *z;
540 unsigned int cpuset_mems_cookie; 561 unsigned int cpuset_mems_cookie;
541 562
542retry_cpuset:
543 cpuset_mems_cookie = get_mems_allowed();
544 zonelist = huge_zonelist(vma, address,
545 htlb_alloc_mask, &mpol, &nodemask);
546 /* 563 /*
547 * A child process with MAP_PRIVATE mappings created by their parent 564 * A child process with MAP_PRIVATE mappings created by their parent
548 * have no page reserves. This check ensures that reservations are 565 * have no page reserves. This check ensures that reservations are
549 * not "stolen". The child may still get SIGKILLed 566 * not "stolen". The child may still get SIGKILLed
550 */ 567 */
551 if (!vma_has_reserves(vma) && 568 if (!vma_has_reserves(vma, chg) &&
552 h->free_huge_pages - h->resv_huge_pages == 0) 569 h->free_huge_pages - h->resv_huge_pages == 0)
553 goto err; 570 goto err;
554 571
@@ -556,13 +573,23 @@ retry_cpuset:
556 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 573 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
557 goto err; 574 goto err;
558 575
576retry_cpuset:
577 cpuset_mems_cookie = get_mems_allowed();
578 zonelist = huge_zonelist(vma, address,
579 htlb_alloc_mask(h), &mpol, &nodemask);
580
559 for_each_zone_zonelist_nodemask(zone, z, zonelist, 581 for_each_zone_zonelist_nodemask(zone, z, zonelist,
560 MAX_NR_ZONES - 1, nodemask) { 582 MAX_NR_ZONES - 1, nodemask) {
561 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { 583 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
562 page = dequeue_huge_page_node(h, zone_to_nid(zone)); 584 page = dequeue_huge_page_node(h, zone_to_nid(zone));
563 if (page) { 585 if (page) {
564 if (!avoid_reserve) 586 if (avoid_reserve)
565 decrement_hugepage_resv_vma(h, vma); 587 break;
588 if (!vma_has_reserves(vma, chg))
589 break;
590
591 SetPagePrivate(page);
592 h->resv_huge_pages--;
566 break; 593 break;
567 } 594 }
568 } 595 }
@@ -574,7 +601,6 @@ retry_cpuset:
574 return page; 601 return page;
575 602
576err: 603err:
577 mpol_cond_put(mpol);
578 return NULL; 604 return NULL;
579} 605}
580 606
@@ -620,15 +646,20 @@ static void free_huge_page(struct page *page)
620 int nid = page_to_nid(page); 646 int nid = page_to_nid(page);
621 struct hugepage_subpool *spool = 647 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page); 648 (struct hugepage_subpool *)page_private(page);
649 bool restore_reserve;
623 650
624 set_page_private(page, 0); 651 set_page_private(page, 0);
625 page->mapping = NULL; 652 page->mapping = NULL;
626 BUG_ON(page_count(page)); 653 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 654 BUG_ON(page_mapcount(page));
655 restore_reserve = PagePrivate(page);
628 656
629 spin_lock(&hugetlb_lock); 657 spin_lock(&hugetlb_lock);
630 hugetlb_cgroup_uncharge_page(hstate_index(h), 658 hugetlb_cgroup_uncharge_page(hstate_index(h),
631 pages_per_huge_page(h), page); 659 pages_per_huge_page(h), page);
660 if (restore_reserve)
661 h->resv_huge_pages++;
662
632 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 663 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
633 /* remove the page from active list */ 664 /* remove the page from active list */
634 list_del(&page->lru); 665 list_del(&page->lru);
@@ -715,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
715 return NULL; 746 return NULL;
716 747
717 page = alloc_pages_exact_node(nid, 748 page = alloc_pages_exact_node(nid,
718 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 749 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
719 __GFP_REPEAT|__GFP_NOWARN, 750 __GFP_REPEAT|__GFP_NOWARN,
720 huge_page_order(h)); 751 huge_page_order(h));
721 if (page) { 752 if (page) {
@@ -772,33 +803,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
772 return nid; 803 return nid;
773} 804}
774 805
775static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
776{
777 struct page *page;
778 int start_nid;
779 int next_nid;
780 int ret = 0;
781
782 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
783 next_nid = start_nid;
784
785 do {
786 page = alloc_fresh_huge_page_node(h, next_nid);
787 if (page) {
788 ret = 1;
789 break;
790 }
791 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
792 } while (next_nid != start_nid);
793
794 if (ret)
795 count_vm_event(HTLB_BUDDY_PGALLOC);
796 else
797 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
798
799 return ret;
800}
801
802/* 806/*
803 * helper for free_pool_huge_page() - return the previously saved 807 * helper for free_pool_huge_page() - return the previously saved
804 * node ["this node"] from which to free a huge page. Advance the 808 * node ["this node"] from which to free a huge page. Advance the
@@ -817,6 +821,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
817 return nid; 821 return nid;
818} 822}
819 823
824#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
825 for (nr_nodes = nodes_weight(*mask); \
826 nr_nodes > 0 && \
827 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
828 nr_nodes--)
829
830#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
831 for (nr_nodes = nodes_weight(*mask); \
832 nr_nodes > 0 && \
833 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
834 nr_nodes--)
835
836static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
837{
838 struct page *page;
839 int nr_nodes, node;
840 int ret = 0;
841
842 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
843 page = alloc_fresh_huge_page_node(h, node);
844 if (page) {
845 ret = 1;
846 break;
847 }
848 }
849
850 if (ret)
851 count_vm_event(HTLB_BUDDY_PGALLOC);
852 else
853 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
854
855 return ret;
856}
857
820/* 858/*
821 * Free huge page from pool from next node to free. 859 * Free huge page from pool from next node to free.
822 * Attempt to keep persistent huge pages more or less 860 * Attempt to keep persistent huge pages more or less
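
The for_each_node_mask_to_alloc()/for_each_node_mask_to_free() helpers above hide the round-robin walk that the old start_nid/next_nid loops spelled out. A minimal standalone sketch of the same pattern, with an invented rr_cursor/rr_next in place of the hstate bookkeeping:

#include <stdio.h>

/*
 * Minimal userspace analogue of the round-robin node walk: a persistent
 * cursor advances through a fixed set of "nodes", each walk visits every
 * node at most once, and the next walk resumes where the previous one
 * stopped.
 */
struct rr_cursor {
        int next;                       /* index handed out next */
};

static int rr_next(struct rr_cursor *c, const int *nodes, int nr)
{
        int node = nodes[c->next];

        c->next = (c->next + 1) % nr;   /* wrap around */
        return node;
}

#define for_each_node_round_robin(node, cur, nodes, nr, left)          \
        for ((left) = (nr);                                            \
             (left) > 0 && (((node) = rr_next((cur), (nodes), (nr))) || 1); \
             (left)--)

int main(void)
{
        const int nodes[] = { 0, 1, 3 };
        struct rr_cursor cur = { .next = 0 };
        int node, left;

        for_each_node_round_robin(node, &cur, nodes, 3, left) {
                printf("try node %d\n", node);
                if (node == 1)          /* pretend the allocation succeeded */
                        break;
        }
        /* The next walk resumes at the node after the last one tried. */
        for_each_node_round_robin(node, &cur, nodes, 3, left)
                printf("second pass: node %d\n", node);
        return 0;
}

The "|| 1" in the loop condition mirrors the kernel macros: it keeps the condition true even when the node id just assigned happens to be 0.
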
@@ -826,40 +864,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
826static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 864static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
827 bool acct_surplus) 865 bool acct_surplus)
828{ 866{
829 int start_nid; 867 int nr_nodes, node;
830 int next_nid;
831 int ret = 0; 868 int ret = 0;
832 869
833 start_nid = hstate_next_node_to_free(h, nodes_allowed); 870 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
834 next_nid = start_nid;
835
836 do {
837 /* 871 /*
838 * If we're returning unused surplus pages, only examine 872 * If we're returning unused surplus pages, only examine
839 * nodes with surplus pages. 873 * nodes with surplus pages.
840 */ 874 */
841 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && 875 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
842 !list_empty(&h->hugepage_freelists[next_nid])) { 876 !list_empty(&h->hugepage_freelists[node])) {
843 struct page *page = 877 struct page *page =
844 list_entry(h->hugepage_freelists[next_nid].next, 878 list_entry(h->hugepage_freelists[node].next,
845 struct page, lru); 879 struct page, lru);
846 list_del(&page->lru); 880 list_del(&page->lru);
847 h->free_huge_pages--; 881 h->free_huge_pages--;
848 h->free_huge_pages_node[next_nid]--; 882 h->free_huge_pages_node[node]--;
849 if (acct_surplus) { 883 if (acct_surplus) {
850 h->surplus_huge_pages--; 884 h->surplus_huge_pages--;
851 h->surplus_huge_pages_node[next_nid]--; 885 h->surplus_huge_pages_node[node]--;
852 } 886 }
853 update_and_free_page(h, page); 887 update_and_free_page(h, page);
854 ret = 1; 888 ret = 1;
855 break; 889 break;
856 } 890 }
857 next_nid = hstate_next_node_to_free(h, nodes_allowed); 891 }
858 } while (next_nid != start_nid);
859 892
860 return ret; 893 return ret;
861} 894}
862 895
896/*
897 * Dissolve a given free hugepage into free buddy pages. This function does
898 * nothing for in-use (including surplus) hugepages.
899 */
900static void dissolve_free_huge_page(struct page *page)
901{
902 spin_lock(&hugetlb_lock);
903 if (PageHuge(page) && !page_count(page)) {
904 struct hstate *h = page_hstate(page);
905 int nid = page_to_nid(page);
906 list_del(&page->lru);
907 h->free_huge_pages--;
908 h->free_huge_pages_node[nid]--;
909 update_and_free_page(h, page);
910 }
911 spin_unlock(&hugetlb_lock);
912}
913
914/*
915 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
916 * make specified memory blocks removable from the system.
917 * Note that start_pfn should aligned with (minimum) hugepage size.
918 */
919void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
920{
921 unsigned int order = 8 * sizeof(void *);
922 unsigned long pfn;
923 struct hstate *h;
924
925 /* Set scan step to minimum hugepage size */
926 for_each_hstate(h)
927 if (order > huge_page_order(h))
928 order = huge_page_order(h);
929 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
930 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
931 dissolve_free_huge_page(pfn_to_page(pfn));
932}
933
863static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 934static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
864{ 935{
865 struct page *page; 936 struct page *page;
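
dissolve_free_huge_pages() above walks the pfn range at the granularity of the smallest configured huge page. A standalone sketch of that step and alignment computation, using example orders for 4 KB base pages (2 MB and 1 GB huge pages) rather than real hstate data:

#include <assert.h>
#include <stdio.h>

#define IS_ALIGNED(x, a)        (((x) & ((a) - 1)) == 0)

int main(void)
{
        const unsigned int orders[] = { 9, 18 };        /* 2 MB, 1 GB in 4 KB pages */
        unsigned int order = 8 * sizeof(void *);        /* oversized sentinel, as above */
        unsigned long start_pfn = 0x40000, end_pfn = 0x40000 + 4096;
        unsigned long pfn;

        /* Pick the smallest order as the scan step. */
        for (unsigned int i = 0; i < 2; i++)
                if (order > orders[i])
                        order = orders[i];

        assert(IS_ALIGNED(start_pfn, 1UL << order));
        for (pfn = start_pfn; pfn < end_pfn; pfn += 1UL << order)
                printf("would dissolve free huge page at pfn %#lx\n", pfn);
        return 0;
}
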
@@ -902,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
902 spin_unlock(&hugetlb_lock); 973 spin_unlock(&hugetlb_lock);
903 974
904 if (nid == NUMA_NO_NODE) 975 if (nid == NUMA_NO_NODE)
905 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 976 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
906 __GFP_REPEAT|__GFP_NOWARN, 977 __GFP_REPEAT|__GFP_NOWARN,
907 huge_page_order(h)); 978 huge_page_order(h));
908 else 979 else
909 page = alloc_pages_exact_node(nid, 980 page = alloc_pages_exact_node(nid,
910 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 981 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
911 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 982 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
912 983
913 if (page && arch_prepare_hugepage(page)) { 984 if (page && arch_prepare_hugepage(page)) {
@@ -944,10 +1015,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
944 */ 1015 */
945struct page *alloc_huge_page_node(struct hstate *h, int nid) 1016struct page *alloc_huge_page_node(struct hstate *h, int nid)
946{ 1017{
947 struct page *page; 1018 struct page *page = NULL;
948 1019
949 spin_lock(&hugetlb_lock); 1020 spin_lock(&hugetlb_lock);
950 page = dequeue_huge_page_node(h, nid); 1021 if (h->free_huge_pages - h->resv_huge_pages > 0)
1022 page = dequeue_huge_page_node(h, nid);
951 spin_unlock(&hugetlb_lock); 1023 spin_unlock(&hugetlb_lock);
952 1024
953 if (!page) 1025 if (!page)
@@ -1035,11 +1107,8 @@ free:
1035 spin_unlock(&hugetlb_lock); 1107 spin_unlock(&hugetlb_lock);
1036 1108
1037 /* Free unnecessary surplus pages to the buddy allocator */ 1109 /* Free unnecessary surplus pages to the buddy allocator */
1038 if (!list_empty(&surplus_list)) { 1110 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1039 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1111 put_page(page);
1040 put_page(page);
1041 }
1042 }
1043 spin_lock(&hugetlb_lock); 1112 spin_lock(&hugetlb_lock);
1044 1113
1045 return ret; 1114 return ret;
@@ -1106,9 +1175,9 @@ static long vma_needs_reservation(struct hstate *h,
1106 } else { 1175 } else {
1107 long err; 1176 long err;
1108 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1177 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1109 struct resv_map *reservations = vma_resv_map(vma); 1178 struct resv_map *resv = vma_resv_map(vma);
1110 1179
1111 err = region_chg(&reservations->regions, idx, idx + 1); 1180 err = region_chg(&resv->regions, idx, idx + 1);
1112 if (err < 0) 1181 if (err < 0)
1113 return err; 1182 return err;
1114 return 0; 1183 return 0;
@@ -1126,10 +1195,10 @@ static void vma_commit_reservation(struct hstate *h,
1126 1195
1127 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1196 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1128 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1197 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1129 struct resv_map *reservations = vma_resv_map(vma); 1198 struct resv_map *resv = vma_resv_map(vma);
1130 1199
1131 /* Mark this page used in the map. */ 1200 /* Mark this page used in the map. */
1132 region_add(&reservations->regions, idx, idx + 1); 1201 region_add(&resv->regions, idx, idx + 1);
1133 } 1202 }
1134} 1203}
1135 1204
@@ -1155,38 +1224,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1155 chg = vma_needs_reservation(h, vma, addr); 1224 chg = vma_needs_reservation(h, vma, addr);
1156 if (chg < 0) 1225 if (chg < 0)
1157 return ERR_PTR(-ENOMEM); 1226 return ERR_PTR(-ENOMEM);
1158 if (chg) 1227 if (chg || avoid_reserve)
1159 if (hugepage_subpool_get_pages(spool, chg)) 1228 if (hugepage_subpool_get_pages(spool, 1))
1160 return ERR_PTR(-ENOSPC); 1229 return ERR_PTR(-ENOSPC);
1161 1230
1162 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1231 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1163 if (ret) { 1232 if (ret) {
1164 hugepage_subpool_put_pages(spool, chg); 1233 if (chg || avoid_reserve)
1234 hugepage_subpool_put_pages(spool, 1);
1165 return ERR_PTR(-ENOSPC); 1235 return ERR_PTR(-ENOSPC);
1166 } 1236 }
1167 spin_lock(&hugetlb_lock); 1237 spin_lock(&hugetlb_lock);
1168 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1238 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1169 if (page) { 1239 if (!page) {
1170 /* update page cgroup details */
1171 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1172 h_cg, page);
1173 spin_unlock(&hugetlb_lock);
1174 } else {
1175 spin_unlock(&hugetlb_lock); 1240 spin_unlock(&hugetlb_lock);
1176 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1241 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1177 if (!page) { 1242 if (!page) {
1178 hugetlb_cgroup_uncharge_cgroup(idx, 1243 hugetlb_cgroup_uncharge_cgroup(idx,
1179 pages_per_huge_page(h), 1244 pages_per_huge_page(h),
1180 h_cg); 1245 h_cg);
1181 hugepage_subpool_put_pages(spool, chg); 1246 if (chg || avoid_reserve)
1247 hugepage_subpool_put_pages(spool, 1);
1182 return ERR_PTR(-ENOSPC); 1248 return ERR_PTR(-ENOSPC);
1183 } 1249 }
1184 spin_lock(&hugetlb_lock); 1250 spin_lock(&hugetlb_lock);
1185 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1186 h_cg, page);
1187 list_move(&page->lru, &h->hugepage_activelist); 1251 list_move(&page->lru, &h->hugepage_activelist);
1188 spin_unlock(&hugetlb_lock); 1252 /* Fall through */
1189 } 1253 }
1254 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
1255 spin_unlock(&hugetlb_lock);
1190 1256
1191 set_page_private(page, (unsigned long)spool); 1257 set_page_private(page, (unsigned long)spool);
1192 1258
@@ -1194,17 +1260,29 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1194 return page; 1260 return page;
1195} 1261}
1196 1262
1263/*
1264 * alloc_huge_page()'s wrapper which simply returns the page if allocation
1265 * succeeds, otherwise NULL. This function is called from new_vma_page(),
1266 * where no ERR_VALUE is expected to be returned.
1267 */
1268struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
1269 unsigned long addr, int avoid_reserve)
1270{
1271 struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
1272 if (IS_ERR(page))
1273 page = NULL;
1274 return page;
1275}
1276
1197int __weak alloc_bootmem_huge_page(struct hstate *h) 1277int __weak alloc_bootmem_huge_page(struct hstate *h)
1198{ 1278{
1199 struct huge_bootmem_page *m; 1279 struct huge_bootmem_page *m;
1200 int nr_nodes = nodes_weight(node_states[N_MEMORY]); 1280 int nr_nodes, node;
1201 1281
1202 while (nr_nodes) { 1282 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1203 void *addr; 1283 void *addr;
1204 1284
1205 addr = __alloc_bootmem_node_nopanic( 1285 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
1206 NODE_DATA(hstate_next_node_to_alloc(h,
1207 &node_states[N_MEMORY])),
1208 huge_page_size(h), huge_page_size(h), 0); 1286 huge_page_size(h), huge_page_size(h), 0);
1209 1287
1210 if (addr) { 1288 if (addr) {
@@ -1216,7 +1294,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1216 m = addr; 1294 m = addr;
1217 goto found; 1295 goto found;
1218 } 1296 }
1219 nr_nodes--;
1220 } 1297 }
1221 return 0; 1298 return 0;
1222 1299
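
alloc_huge_page_noerr() converts the ERR_PTR-style return of alloc_huge_page() into a plain pointer-or-NULL for callers such as new_vma_page(). A userspace re-creation of that convention, with alloc_thing/alloc_thing_noerr as invented stand-ins:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * The kernel encodes errors as pointers in the last page of the address
 * space, so one return value can carry either a valid pointer or a
 * -errno code.
 */
#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(intptr_t)(err))
#define PTR_ERR(ptr)    ((long)(intptr_t)(ptr))
#define IS_ERR(ptr)     ((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

static void *alloc_thing(int fail)
{
        if (fail)
                return ERR_PTR(-ENOSPC);
        return malloc(64);
}

/*
 * Wrapper in the style of alloc_huge_page_noerr(): callers that only
 * care about success or failure get a plain pointer or NULL.
 */
static void *alloc_thing_noerr(int fail)
{
        void *p = alloc_thing(fail);

        return IS_ERR(p) ? NULL : p;
}

int main(void)
{
        void *p = alloc_thing(1);

        if (IS_ERR(p))
                printf("alloc_thing failed: %ld\n", PTR_ERR(p));
        printf("noerr wrapper returned %p\n", alloc_thing_noerr(1));
        free(alloc_thing_noerr(0));
        return 0;
}
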
@@ -1355,48 +1432,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
1355static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 1432static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1356 int delta) 1433 int delta)
1357{ 1434{
1358 int start_nid, next_nid; 1435 int nr_nodes, node;
1359 int ret = 0;
1360 1436
1361 VM_BUG_ON(delta != -1 && delta != 1); 1437 VM_BUG_ON(delta != -1 && delta != 1);
1362 1438
1363 if (delta < 0) 1439 if (delta < 0) {
1364 start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 1440 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1365 else 1441 if (h->surplus_huge_pages_node[node])
1366 start_nid = hstate_next_node_to_free(h, nodes_allowed); 1442 goto found;
1367 next_nid = start_nid;
1368
1369 do {
1370 int nid = next_nid;
1371 if (delta < 0) {
1372 /*
1373 * To shrink on this node, there must be a surplus page
1374 */
1375 if (!h->surplus_huge_pages_node[nid]) {
1376 next_nid = hstate_next_node_to_alloc(h,
1377 nodes_allowed);
1378 continue;
1379 }
1380 } 1443 }
1381 if (delta > 0) { 1444 } else {
1382 /* 1445 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1383 * Surplus cannot exceed the total number of pages 1446 if (h->surplus_huge_pages_node[node] <
1384 */ 1447 h->nr_huge_pages_node[node])
1385 if (h->surplus_huge_pages_node[nid] >= 1448 goto found;
1386 h->nr_huge_pages_node[nid]) {
1387 next_nid = hstate_next_node_to_free(h,
1388 nodes_allowed);
1389 continue;
1390 }
1391 } 1449 }
1450 }
1451 return 0;
1392 1452
1393 h->surplus_huge_pages += delta; 1453found:
1394 h->surplus_huge_pages_node[nid] += delta; 1454 h->surplus_huge_pages += delta;
1395 ret = 1; 1455 h->surplus_huge_pages_node[node] += delta;
1396 break; 1456 return 1;
1397 } while (next_nid != start_nid);
1398
1399 return ret;
1400} 1457}
1401 1458
1402#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1459#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
@@ -1526,7 +1583,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1526 struct hstate *h; 1583 struct hstate *h;
1527 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1584 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1528 1585
1529 err = strict_strtoul(buf, 10, &count); 1586 err = kstrtoul(buf, 10, &count);
1530 if (err) 1587 if (err)
1531 goto out; 1588 goto out;
1532 1589
@@ -1617,7 +1674,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1617 if (h->order >= MAX_ORDER) 1674 if (h->order >= MAX_ORDER)
1618 return -EINVAL; 1675 return -EINVAL;
1619 1676
1620 err = strict_strtoul(buf, 10, &input); 1677 err = kstrtoul(buf, 10, &input);
1621 if (err) 1678 if (err)
1622 return err; 1679 return err;
1623 1680
@@ -2068,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
2068} 2125}
2069#endif /* CONFIG_NUMA */ 2126#endif /* CONFIG_NUMA */
2070 2127
2071int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
2072 void __user *buffer,
2073 size_t *length, loff_t *ppos)
2074{
2075 proc_dointvec(table, write, buffer, length, ppos);
2076 if (hugepages_treat_as_movable)
2077 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
2078 else
2079 htlb_alloc_mask = GFP_HIGHUSER;
2080 return 0;
2081}
2082
2083int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2128int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2084 void __user *buffer, 2129 void __user *buffer,
2085 size_t *length, loff_t *ppos) 2130 size_t *length, loff_t *ppos)
@@ -2207,7 +2252,7 @@ out:
2207 2252
2208static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2253static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2209{ 2254{
2210 struct resv_map *reservations = vma_resv_map(vma); 2255 struct resv_map *resv = vma_resv_map(vma);
2211 2256
2212 /* 2257 /*
2213 * This new VMA should share its siblings reservation map if present. 2258 * This new VMA should share its siblings reservation map if present.
@@ -2217,34 +2262,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2217 * after this open call completes. It is therefore safe to take a 2262 * after this open call completes. It is therefore safe to take a
2218 * new reference here without additional locking. 2263 * new reference here without additional locking.
2219 */ 2264 */
2220 if (reservations) 2265 if (resv)
2221 kref_get(&reservations->refs); 2266 kref_get(&resv->refs);
2222} 2267}
2223 2268
2224static void resv_map_put(struct vm_area_struct *vma) 2269static void resv_map_put(struct vm_area_struct *vma)
2225{ 2270{
2226 struct resv_map *reservations = vma_resv_map(vma); 2271 struct resv_map *resv = vma_resv_map(vma);
2227 2272
2228 if (!reservations) 2273 if (!resv)
2229 return; 2274 return;
2230 kref_put(&reservations->refs, resv_map_release); 2275 kref_put(&resv->refs, resv_map_release);
2231} 2276}
2232 2277
2233static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2278static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2234{ 2279{
2235 struct hstate *h = hstate_vma(vma); 2280 struct hstate *h = hstate_vma(vma);
2236 struct resv_map *reservations = vma_resv_map(vma); 2281 struct resv_map *resv = vma_resv_map(vma);
2237 struct hugepage_subpool *spool = subpool_vma(vma); 2282 struct hugepage_subpool *spool = subpool_vma(vma);
2238 unsigned long reserve; 2283 unsigned long reserve;
2239 unsigned long start; 2284 unsigned long start;
2240 unsigned long end; 2285 unsigned long end;
2241 2286
2242 if (reservations) { 2287 if (resv) {
2243 start = vma_hugecache_offset(h, vma, vma->vm_start); 2288 start = vma_hugecache_offset(h, vma, vma->vm_start);
2244 end = vma_hugecache_offset(h, vma, vma->vm_end); 2289 end = vma_hugecache_offset(h, vma, vma->vm_end);
2245 2290
2246 reserve = (end - start) - 2291 reserve = (end - start) -
2247 region_count(&reservations->regions, start, end); 2292 region_count(&resv->regions, start, end);
2248 2293
2249 resv_map_put(vma); 2294 resv_map_put(vma);
2250 2295
@@ -2557,7 +2602,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2557{ 2602{
2558 struct hstate *h = hstate_vma(vma); 2603 struct hstate *h = hstate_vma(vma);
2559 struct page *old_page, *new_page; 2604 struct page *old_page, *new_page;
2560 int avoidcopy;
2561 int outside_reserve = 0; 2605 int outside_reserve = 0;
2562 unsigned long mmun_start; /* For mmu_notifiers */ 2606 unsigned long mmun_start; /* For mmu_notifiers */
2563 unsigned long mmun_end; /* For mmu_notifiers */ 2607 unsigned long mmun_end; /* For mmu_notifiers */
@@ -2567,10 +2611,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2567retry_avoidcopy: 2611retry_avoidcopy:
2568 /* If no-one else is actually using this page, avoid the copy 2612 /* If no-one else is actually using this page, avoid the copy
2569 * and just make the page writable */ 2613 * and just make the page writable */
2570 avoidcopy = (page_mapcount(old_page) == 1); 2614 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
2571 if (avoidcopy) { 2615 page_move_anon_rmap(old_page, vma, address);
2572 if (PageAnon(old_page))
2573 page_move_anon_rmap(old_page, vma, address);
2574 set_huge_ptep_writable(vma, address, ptep); 2616 set_huge_ptep_writable(vma, address, ptep);
2575 return 0; 2617 return 0;
2576 } 2618 }
@@ -2584,8 +2626,7 @@ retry_avoidcopy:
2584 * at the time of fork() could consume its reserves on COW instead 2626 * at the time of fork() could consume its reserves on COW instead
2585 * of the full address range. 2627 * of the full address range.
2586 */ 2628 */
2587 if (!(vma->vm_flags & VM_MAYSHARE) && 2629 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2588 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2589 old_page != pagecache_page) 2630 old_page != pagecache_page)
2590 outside_reserve = 1; 2631 outside_reserve = 1;
2591 2632
@@ -2657,6 +2698,8 @@ retry_avoidcopy:
2657 spin_lock(&mm->page_table_lock); 2698 spin_lock(&mm->page_table_lock);
2658 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2699 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2659 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2700 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2701 ClearPagePrivate(new_page);
2702
2660 /* Break COW */ 2703 /* Break COW */
2661 huge_ptep_clear_flush(vma, address, ptep); 2704 huge_ptep_clear_flush(vma, address, ptep);
2662 set_huge_pte_at(mm, address, ptep, 2705 set_huge_pte_at(mm, address, ptep,
@@ -2668,10 +2711,11 @@ retry_avoidcopy:
2668 } 2711 }
2669 spin_unlock(&mm->page_table_lock); 2712 spin_unlock(&mm->page_table_lock);
2670 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2713 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2671 /* Caller expects lock to be held */
2672 spin_lock(&mm->page_table_lock);
2673 page_cache_release(new_page); 2714 page_cache_release(new_page);
2674 page_cache_release(old_page); 2715 page_cache_release(old_page);
2716
2717 /* Caller expects lock to be held */
2718 spin_lock(&mm->page_table_lock);
2675 return 0; 2719 return 0;
2676} 2720}
2677 2721
@@ -2767,6 +2811,7 @@ retry:
2767 goto retry; 2811 goto retry;
2768 goto out; 2812 goto out;
2769 } 2813 }
2814 ClearPagePrivate(page);
2770 2815
2771 spin_lock(&inode->i_lock); 2816 spin_lock(&inode->i_lock);
2772 inode->i_blocks += blocks_per_huge_page(h); 2817 inode->i_blocks += blocks_per_huge_page(h);
@@ -2813,8 +2858,10 @@ retry:
2813 if (!huge_pte_none(huge_ptep_get(ptep))) 2858 if (!huge_pte_none(huge_ptep_get(ptep)))
2814 goto backout; 2859 goto backout;
2815 2860
2816 if (anon_rmap) 2861 if (anon_rmap) {
2862 ClearPagePrivate(page);
2817 hugepage_add_new_anon_rmap(page, vma, address); 2863 hugepage_add_new_anon_rmap(page, vma, address);
2864 }
2818 else 2865 else
2819 page_dup_rmap(page); 2866 page_dup_rmap(page);
2820 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 2867 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -3431,3 +3478,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3431 return ret; 3478 return ret;
3432} 3479}
3433#endif 3480#endif
3481
3482bool isolate_huge_page(struct page *page, struct list_head *list)
3483{
3484 VM_BUG_ON(!PageHead(page));
3485 if (!get_page_unless_zero(page))
3486 return false;
3487 spin_lock(&hugetlb_lock);
3488 list_move_tail(&page->lru, list);
3489 spin_unlock(&hugetlb_lock);
3490 return true;
3491}
3492
3493void putback_active_hugepage(struct page *page)
3494{
3495 VM_BUG_ON(!PageHead(page));
3496 spin_lock(&hugetlb_lock);
3497 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3498 spin_unlock(&hugetlb_lock);
3499 put_page(page);
3500}
3501
3502bool is_hugepage_active(struct page *page)
3503{
3504 VM_BUG_ON(!PageHuge(page));
3505 /*
3506 * This function can be called for a tail page because the caller,
3507 * scan_movable_pages, scans through a given pfn-range which typically
3508 * covers one memory block. In systems using gigantic hugepage (1GB
3509 * for x86_64,) a hugepage is larger than a memory block, and we don't
3510 * support migrating such large hugepages for now, so return false
3511 * when called for tail pages.
3512 */
3513 if (PageTail(page))
3514 return false;
3515 /*
3516 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3517 * so we should return false for them.
3518 */
3519 if (unlikely(PageHWPoison(page)))
3520 return false;
3521 return page_count(page) > 0;
3522}
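
isolate_huge_page() above only moves a hugepage onto the caller's list if get_page_unless_zero() manages to pin it first. A userspace sketch of that "take a reference only while the refcount is still non-zero" idiom, using C11 atomics instead of the page refcount:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_ref_unless_zero(atomic_int *refcount)
{
        int old = atomic_load(refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                        return true;
                /* old was reloaded by the failed CAS; retry */
        }
        return false;   /* object already dying: do not touch it */
}

int main(void)
{
        atomic_int live = 1, dead = 0;

        printf("live: %d (refcount now %d)\n",
               get_ref_unless_zero(&live), atomic_load(&live));
        printf("dead: %d (refcount now %d)\n",
               get_ref_unless_zero(&dead), atomic_load(&dead));
        return 0;
}
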
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 3a61efc518d5..afc2daa91c60 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -88,12 +88,12 @@ static int pfn_inject_init(void)
88 * hardware status change, hence do not require hardware support. 88 * hardware status change, hence do not require hardware support.
89 * They are mainly for testing hwpoison in software level. 89 * They are mainly for testing hwpoison in software level.
90 */ 90 */
91 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 91 dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
92 NULL, &hwpoison_fops); 92 NULL, &hwpoison_fops);
93 if (!dentry) 93 if (!dentry)
94 goto fail; 94 goto fail;
95 95
96 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, 96 dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
97 NULL, &unpoison_fops); 97 NULL, &unpoison_fops);
98 if (!dentry) 98 if (!dentry)
99 goto fail; 99 goto fail;
diff --git a/mm/internal.h b/mm/internal.h
index 4390ac6c106e..684f7aa9692a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn;
85 */ 85 */
86extern int isolate_lru_page(struct page *page); 86extern int isolate_lru_page(struct page *page);
87extern void putback_lru_page(struct page *page); 87extern void putback_lru_page(struct page *page);
88extern unsigned long zone_reclaimable_pages(struct zone *zone);
89extern bool zone_reclaimable(struct zone *zone);
88 90
89/* 91/*
90 * in mm/rmap.c: 92 * in mm/rmap.c:
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c8d7f3110fd0..e126b0ef9ad2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1639 else if (strncmp(buf, "scan=", 5) == 0) { 1639 else if (strncmp(buf, "scan=", 5) == 0) {
1640 unsigned long secs; 1640 unsigned long secs;
1641 1641
1642 ret = strict_strtoul(buf + 5, 0, &secs); 1642 ret = kstrtoul(buf + 5, 0, &secs);
1643 if (ret < 0) 1643 if (ret < 0)
1644 goto out; 1644 goto out;
1645 stop_scan_thread(); 1645 stop_scan_thread();
diff --git a/mm/ksm.c b/mm/ksm.c
index b6afe0c440d8..0bea2b262a47 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
2194 unsigned long msecs; 2194 unsigned long msecs;
2195 int err; 2195 int err;
2196 2196
2197 err = strict_strtoul(buf, 10, &msecs); 2197 err = kstrtoul(buf, 10, &msecs);
2198 if (err || msecs > UINT_MAX) 2198 if (err || msecs > UINT_MAX)
2199 return -EINVAL; 2199 return -EINVAL;
2200 2200
@@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
2217 int err; 2217 int err;
2218 unsigned long nr_pages; 2218 unsigned long nr_pages;
2219 2219
2220 err = strict_strtoul(buf, 10, &nr_pages); 2220 err = kstrtoul(buf, 10, &nr_pages);
2221 if (err || nr_pages > UINT_MAX) 2221 if (err || nr_pages > UINT_MAX)
2222 return -EINVAL; 2222 return -EINVAL;
2223 2223
@@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2239 int err; 2239 int err;
2240 unsigned long flags; 2240 unsigned long flags;
2241 2241
2242 err = strict_strtoul(buf, 10, &flags); 2242 err = kstrtoul(buf, 10, &flags);
2243 if (err || flags > UINT_MAX) 2243 if (err || flags > UINT_MAX)
2244 return -EINVAL; 2244 return -EINVAL;
2245 if (flags > KSM_RUN_UNMERGE) 2245 if (flags > KSM_RUN_UNMERGE)
diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883e6e25..6975bc812542 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior)
42 * We can potentially split a vm area into separate 42 * We can potentially split a vm area into separate
43 * areas, each area with its own behavior. 43 * areas, each area with its own behavior.
44 */ 44 */
45static long madvise_behavior(struct vm_area_struct * vma, 45static long madvise_behavior(struct vm_area_struct *vma,
46 struct vm_area_struct **prev, 46 struct vm_area_struct **prev,
47 unsigned long start, unsigned long end, int behavior) 47 unsigned long start, unsigned long end, int behavior)
48{ 48{
49 struct mm_struct * mm = vma->vm_mm; 49 struct mm_struct *mm = vma->vm_mm;
50 int error = 0; 50 int error = 0;
51 pgoff_t pgoff; 51 pgoff_t pgoff;
52 unsigned long new_flags = vma->vm_flags; 52 unsigned long new_flags = vma->vm_flags;
@@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
215/* 215/*
216 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
217 */ 217 */
218static long madvise_willneed(struct vm_area_struct * vma, 218static long madvise_willneed(struct vm_area_struct *vma,
219 struct vm_area_struct ** prev, 219 struct vm_area_struct **prev,
220 unsigned long start, unsigned long end) 220 unsigned long start, unsigned long end)
221{ 221{
222 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
@@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma,
270 * An interface that causes the system to free clean pages and flush 270 * An interface that causes the system to free clean pages and flush
271 * dirty pages is already available as msync(MS_INVALIDATE). 271 * dirty pages is already available as msync(MS_INVALIDATE).
272 */ 272 */
273static long madvise_dontneed(struct vm_area_struct * vma, 273static long madvise_dontneed(struct vm_area_struct *vma,
274 struct vm_area_struct ** prev, 274 struct vm_area_struct **prev,
275 unsigned long start, unsigned long end) 275 unsigned long start, unsigned long end)
276{ 276{
277 *prev = vma; 277 *prev = vma;
@@ -343,29 +343,34 @@ static long madvise_remove(struct vm_area_struct *vma,
343 */ 343 */
344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) 344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
345{ 345{
346 int ret = 0;
347
348 if (!capable(CAP_SYS_ADMIN)) 346 if (!capable(CAP_SYS_ADMIN))
349 return -EPERM; 347 return -EPERM;
350 for (; start < end; start += PAGE_SIZE) { 348 for (; start < end; start += PAGE_SIZE) {
351 struct page *p; 349 struct page *p;
352 int ret = get_user_pages_fast(start, 1, 0, &p); 350 int ret;
351
352 ret = get_user_pages_fast(start, 1, 0, &p);
353 if (ret != 1) 353 if (ret != 1)
354 return ret; 354 return ret;
355
356 if (PageHWPoison(p)) {
357 put_page(p);
358 continue;
359 }
355 if (bhv == MADV_SOFT_OFFLINE) { 360 if (bhv == MADV_SOFT_OFFLINE) {
356 printk(KERN_INFO "Soft offlining page %lx at %lx\n", 361 pr_info("Soft offlining page %#lx at %#lx\n",
357 page_to_pfn(p), start); 362 page_to_pfn(p), start);
358 ret = soft_offline_page(p, MF_COUNT_INCREASED); 363 ret = soft_offline_page(p, MF_COUNT_INCREASED);
359 if (ret) 364 if (ret)
360 break; 365 return ret;
361 continue; 366 continue;
362 } 367 }
363 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 368 pr_info("Injecting memory failure for page %#lx at %#lx\n",
364 page_to_pfn(p), start); 369 page_to_pfn(p), start);
365 /* Ignore return value for now */ 370 /* Ignore return value for now */
366 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 371 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
367 } 372 }
368 return ret; 373 return 0;
369} 374}
370#endif 375#endif
371 376
@@ -459,7 +464,7 @@ madvise_behavior_valid(int behavior)
459SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 464SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
460{ 465{
461 unsigned long end, tmp; 466 unsigned long end, tmp;
462 struct vm_area_struct * vma, *prev; 467 struct vm_area_struct *vma, *prev;
463 int unmapped_error = 0; 468 int unmapped_error = 0;
464 int error = -EINVAL; 469 int error = -EINVAL;
465 int write; 470 int write;
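
The handler above backs MADV_HWPOISON and MADV_SOFT_OFFLINE in the madvise() system call. A hedged userspace sketch of driving it for testing; the MADV_HWPOISON value is taken from asm-generic/mman-common.h and guarded in case the libc headers do not define it:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON   100             /* from asm-generic/mman-common.h */
#endif

/*
 * Poison one of our own pages for testing.  Needs CAP_SYS_ADMIN and a
 * kernel built with CONFIG_MEMORY_FAILURE; touching the page afterwards
 * is expected to raise SIGBUS.
 */
int main(void)
{
        long pagesize = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                       /* fault the page in first */
        if (madvise(p, pagesize, MADV_HWPOISON) != 0)
                perror("madvise(MADV_HWPOISON)");
        else
                printf("page at %p marked hwpoisoned\n", (void *)p);
        return 0;
}
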
diff --git a/mm/memblock.c b/mm/memblock.c
index a847bfe6f3ba..0ac412a0a7ee 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
914 return memblock_search(&memblock.memory, addr) != -1; 914 return memblock_search(&memblock.memory, addr) != -1;
915} 915}
916 916
917#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
918int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
919 unsigned long *start_pfn, unsigned long *end_pfn)
920{
921 struct memblock_type *type = &memblock.memory;
922 int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
923
924 if (mid == -1)
925 return -1;
926
927 *start_pfn = type->regions[mid].base >> PAGE_SHIFT;
928 *end_pfn = (type->regions[mid].base + type->regions[mid].size)
929 >> PAGE_SHIFT;
930
931 return type->regions[mid].nid;
932}
933#endif
934
917/** 935/**
918 * memblock_is_region_memory - check if a region is a subset of memory 936 * memblock_is_region_memory - check if a region is a subset of memory
919 * @base: base of region to check 937 * @base: base of region to check
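
memblock_search_pfn_nid() above resolves a pfn to its memblock region via binary search and reports the region's node id and pfn range. A simplified standalone analogue over sorted, non-overlapping regions; struct region and region_search are invented for illustration:

#include <stdio.h>

struct region {
        unsigned long base;     /* first pfn of the region */
        unsigned long size;     /* number of pfns */
        int nid;
};

static int region_search(const struct region *r, int nr, unsigned long pfn)
{
        int lo = 0, hi = nr;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (pfn < r[mid].base)
                        hi = mid;
                else if (pfn >= r[mid].base + r[mid].size)
                        lo = mid + 1;
                else
                        return mid;     /* pfn falls inside this region */
        }
        return -1;                      /* not covered by any region */
}

int main(void)
{
        const struct region mem[] = {
                { .base = 0x00000,  .size = 0x80000, .nid = 0 },
                { .base = 0x100000, .size = 0x80000, .nid = 1 },
        };
        int idx = region_search(mem, 2, 0x120000);

        if (idx >= 0)
                printf("pfn 0x120000: node %d, range [%#lx, %#lx)\n",
                       mem[idx].nid, mem[idx].base,
                       mem[idx].base + mem[idx].size);
        return 0;
}
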
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3b83957b6439..c6bd28edd533 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3121 ssize_t size = memcg_caches_array_size(num_groups); 3121 ssize_t size = memcg_caches_array_size(num_groups);
3122 3122
3123 size *= sizeof(void *); 3123 size *= sizeof(void *);
3124 size += sizeof(struct memcg_cache_params); 3124 size += offsetof(struct memcg_cache_params, memcg_caches);
3125 3125
3126 s->memcg_params = kzalloc(size, GFP_KERNEL); 3126 s->memcg_params = kzalloc(size, GFP_KERNEL);
3127 if (!s->memcg_params) { 3127 if (!s->memcg_params) {
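
The sizing change above counts only the fixed head of memcg_cache_params plus the runtime-sized pointer array, instead of taking sizeof() of the whole struct. A standalone illustration of the offsetof() pattern; struct params is invented and the real layout differs:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct params {
        void *owner;
        int flags;
        void *caches[];         /* per-group pointers, sized at runtime */
};

int main(void)
{
        size_t groups = 8;
        /*
         * offsetof() counts only the fixed header; sizeof(struct params)
         * may include trailing padding, so it is not the right base when
         * appending a runtime-sized tail.
         */
        size_t size = offsetof(struct params, caches) + groups * sizeof(void *);
        struct params *p = calloc(1, size);

        if (!p)
                return 1;
        printf("sizeof=%zu offsetof=%zu alloc=%zu\n",
               sizeof(struct params), offsetof(struct params, caches), size);
        free(p);
        return 0;
}
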
@@ -3164,13 +3164,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3164int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3164int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3165 struct kmem_cache *root_cache) 3165 struct kmem_cache *root_cache)
3166{ 3166{
3167 size_t size = sizeof(struct memcg_cache_params); 3167 size_t size;
3168 3168
3169 if (!memcg_kmem_enabled()) 3169 if (!memcg_kmem_enabled())
3170 return 0; 3170 return 0;
3171 3171
3172 if (!memcg) 3172 if (!memcg) {
3173 size = offsetof(struct memcg_cache_params, memcg_caches);
3173 size += memcg_limited_groups_array_size * sizeof(void *); 3174 size += memcg_limited_groups_array_size * sizeof(void *);
3175 } else
3176 size = sizeof(struct memcg_cache_params);
3174 3177
3175 s->memcg_params = kzalloc(size, GFP_KERNEL); 3178 s->memcg_params = kzalloc(size, GFP_KERNEL);
3176 if (!s->memcg_params) 3179 if (!s->memcg_params)
@@ -5588,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b)
5588 const struct mem_cgroup_threshold *_a = a; 5591 const struct mem_cgroup_threshold *_a = a;
5589 const struct mem_cgroup_threshold *_b = b; 5592 const struct mem_cgroup_threshold *_b = b;
5590 5593
5591 return _a->threshold - _b->threshold; 5594 if (_a->threshold > _b->threshold)
5595 return 1;
5596
5597 if (_a->threshold < _b->threshold)
5598 return -1;
5599
5600 return 0;
5592} 5601}
5593 5602
5594static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5603static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
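
compare_thresholds() above stops returning the raw difference of two u64 values, which can truncate or wrap when narrowed to int and corrupt the sort order. The safe three-way comparison, shown standalone with qsort():

#include <stdio.h>
#include <stdlib.h>

struct threshold {
        unsigned long long threshold;
};

static int compare_thresholds(const void *a, const void *b)
{
        const struct threshold *ta = a, *tb = b;

        /* (1ULL << 33) - 1 narrowed to int would be -1: wrong order. */
        if (ta->threshold > tb->threshold)
                return 1;
        if (ta->threshold < tb->threshold)
                return -1;
        return 0;
}

int main(void)
{
        struct threshold t[] = { { 1ULL << 33 }, { 1 }, { 1ULL << 32 } };

        qsort(t, 3, sizeof(t[0]), compare_thresholds);
        for (int i = 0; i < 3; i++)
                printf("%llu\n", t[i].threshold);
        return 0;
}
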
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index baa4e0a45dec..947ed5413279 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -206,7 +206,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
206#ifdef __ARCH_SI_TRAPNO 206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno; 207 si.si_trapno = trapno;
208#endif 208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; 209 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
210 210
211 if ((flags & MF_ACTION_REQUIRED) && t == current) { 211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 si.si_code = BUS_MCEERR_AR; 212 si.si_code = BUS_MCEERR_AR;
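
si_addr_lsb tells the SIGBUS recipient the least significant valid bit of the reported address, i.e. log2 of the size of the poisoned region, and compound_order() now supplies that for any compound page rather than only THP. A small standalone calculation of the values involved, assuming PAGE_SHIFT 12 as on x86_64 with 4 KB pages:

#include <stdio.h>

#define PAGE_SHIFT      12

int main(void)
{
        const unsigned int orders[] = { 0, 9, 18 };     /* 4 KB, 2 MB, 1 GB */

        for (unsigned int i = 0; i < 3; i++) {
                unsigned int lsb = orders[i] + PAGE_SHIFT;

                printf("order %2u -> si_addr_lsb %2u (%lu bytes)\n",
                       orders[i], lsb, 1UL << lsb);
        }
        return 0;
}
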
@@ -985,7 +985,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
985static void set_page_hwpoison_huge_page(struct page *hpage) 985static void set_page_hwpoison_huge_page(struct page *hpage)
986{ 986{
987 int i; 987 int i;
988 int nr_pages = 1 << compound_trans_order(hpage); 988 int nr_pages = 1 << compound_order(hpage);
989 for (i = 0; i < nr_pages; i++) 989 for (i = 0; i < nr_pages; i++)
990 SetPageHWPoison(hpage + i); 990 SetPageHWPoison(hpage + i);
991} 991}
@@ -993,7 +993,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
993static void clear_page_hwpoison_huge_page(struct page *hpage) 993static void clear_page_hwpoison_huge_page(struct page *hpage)
994{ 994{
995 int i; 995 int i;
996 int nr_pages = 1 << compound_trans_order(hpage); 996 int nr_pages = 1 << compound_order(hpage);
997 for (i = 0; i < nr_pages; i++) 997 for (i = 0; i < nr_pages; i++)
998 ClearPageHWPoison(hpage + i); 998 ClearPageHWPoison(hpage + i);
999} 999}
@@ -1206,6 +1206,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1206 for (ps = error_states;; ps++) 1206 for (ps = error_states;; ps++)
1207 if ((p->flags & ps->mask) == ps->res) 1207 if ((p->flags & ps->mask) == ps->res)
1208 break; 1208 break;
1209
1210 page_flags |= (p->flags & (1UL << PG_dirty));
1211
1209 if (!ps->mask) 1212 if (!ps->mask)
1210 for (ps = error_states;; ps++) 1213 for (ps = error_states;; ps++)
1211 if ((page_flags & ps->mask) == ps->res) 1214 if ((page_flags & ps->mask) == ps->res)
@@ -1341,7 +1344,17 @@ int unpoison_memory(unsigned long pfn)
1341 return 0; 1344 return 0;
1342 } 1345 }
1343 1346
1344 nr_pages = 1 << compound_trans_order(page); 1347 /*
1348 * unpoison_memory() can encounter thp only when the thp is being
1349 * worked by memory_failure() and the page lock is not held yet.
1350 * In such case, we yield to memory_failure() and make unpoison fail.
1351 */
1352 if (PageTransHuge(page)) {
1353 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1354 return 0;
1355 }
1356
1357 nr_pages = 1 << compound_order(page);
1345 1358
1346 if (!get_page_unless_zero(page)) { 1359 if (!get_page_unless_zero(page)) {
1347 /* 1360 /*
@@ -1355,7 +1368,7 @@ int unpoison_memory(unsigned long pfn)
1355 return 0; 1368 return 0;
1356 } 1369 }
1357 if (TestClearPageHWPoison(p)) 1370 if (TestClearPageHWPoison(p))
1358 atomic_long_sub(nr_pages, &num_poisoned_pages); 1371 atomic_long_dec(&num_poisoned_pages);
1359 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1372 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1360 return 0; 1373 return 0;
1361 } 1374 }
@@ -1377,7 +1390,7 @@ int unpoison_memory(unsigned long pfn)
1377 unlock_page(page); 1390 unlock_page(page);
1378 1391
1379 put_page(page); 1392 put_page(page);
1380 if (freeit) 1393 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1381 put_page(page); 1394 put_page(page);
1382 1395
1383 return 0; 1396 return 0;
@@ -1418,7 +1431,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1418 * was free. This flag should be kept set until the source page 1431 * was free. This flag should be kept set until the source page
1419 * is freed and PG_hwpoison on it is set. 1432 * is freed and PG_hwpoison on it is set.
1420 */ 1433 */
1421 set_migratetype_isolate(p, true); 1434 if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
1435 set_migratetype_isolate(p, true);
1422 /* 1436 /*
1423 * When the target page is a free hugepage, just remove it 1437 * When the target page is a free hugepage, just remove it
1424 * from free hugepage list. 1438 * from free hugepage list.
@@ -1472,6 +1486,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1472 int ret; 1486 int ret;
1473 unsigned long pfn = page_to_pfn(page); 1487 unsigned long pfn = page_to_pfn(page);
1474 struct page *hpage = compound_head(page); 1488 struct page *hpage = compound_head(page);
1489 LIST_HEAD(pagelist);
1475 1490
1476 /* 1491 /*
1477 * This double-check of PageHWPoison is to avoid the race with 1492 * This double-check of PageHWPoison is to avoid the race with
@@ -1487,86 +1502,29 @@ static int soft_offline_huge_page(struct page *page, int flags)
1487 unlock_page(hpage); 1502 unlock_page(hpage);
1488 1503
1489 /* Keep page count to indicate a given hugepage is isolated. */ 1504 /* Keep page count to indicate a given hugepage is isolated. */
1490 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, 1505 list_move(&hpage->lru, &pagelist);
1491 MIGRATE_SYNC); 1506 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1492 put_page(hpage); 1507 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1493 if (ret) { 1508 if (ret) {
1494 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1509 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1495 pfn, ret, page->flags); 1510 pfn, ret, page->flags);
1511 /*
1512 * We know that soft_offline_huge_page() tries to migrate
1513 * only one hugepage pointed to by hpage, so we need not
1514 * run through the pagelist here.
1515 */
1516 putback_active_hugepage(hpage);
1517 if (ret > 0)
1518 ret = -EIO;
1496 } else { 1519 } else {
1497 set_page_hwpoison_huge_page(hpage); 1520 set_page_hwpoison_huge_page(hpage);
1498 dequeue_hwpoisoned_huge_page(hpage); 1521 dequeue_hwpoisoned_huge_page(hpage);
1499 atomic_long_add(1 << compound_trans_order(hpage), 1522 atomic_long_add(1 << compound_order(hpage),
1500 &num_poisoned_pages); 1523 &num_poisoned_pages);
1501 } 1524 }
1502 return ret; 1525 return ret;
1503} 1526}
1504 1527
1505static int __soft_offline_page(struct page *page, int flags);
1506
1507/**
1508 * soft_offline_page - Soft offline a page.
1509 * @page: page to offline
1510 * @flags: flags. Same as memory_failure().
1511 *
1512 * Returns 0 on success, otherwise negated errno.
1513 *
1514 * Soft offline a page, by migration or invalidation,
1515 * without killing anything. This is for the case when
1516 * a page is not corrupted yet (so it's still valid to access),
1517 * but has had a number of corrected errors and is better taken
1518 * out.
1519 *
1520 * The actual policy on when to do that is maintained by
1521 * user space.
1522 *
1523 * This should never impact any application or cause data loss,
1524 * however it might take some time.
1525 *
1526 * This is not a 100% solution for all memory, but tries to be
1527 * ``good enough'' for the majority of memory.
1528 */
1529int soft_offline_page(struct page *page, int flags)
1530{
1531 int ret;
1532 unsigned long pfn = page_to_pfn(page);
1533 struct page *hpage = compound_trans_head(page);
1534
1535 if (PageHWPoison(page)) {
1536 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1537 return -EBUSY;
1538 }
1539 if (!PageHuge(page) && PageTransHuge(hpage)) {
1540 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1541 pr_info("soft offline: %#lx: failed to split THP\n",
1542 pfn);
1543 return -EBUSY;
1544 }
1545 }
1546
1547 ret = get_any_page(page, pfn, flags);
1548 if (ret < 0)
1549 return ret;
1550 if (ret) { /* for in-use pages */
1551 if (PageHuge(page))
1552 ret = soft_offline_huge_page(page, flags);
1553 else
1554 ret = __soft_offline_page(page, flags);
1555 } else { /* for free pages */
1556 if (PageHuge(page)) {
1557 set_page_hwpoison_huge_page(hpage);
1558 dequeue_hwpoisoned_huge_page(hpage);
1559 atomic_long_add(1 << compound_trans_order(hpage),
1560 &num_poisoned_pages);
1561 } else {
1562 SetPageHWPoison(page);
1563 atomic_long_inc(&num_poisoned_pages);
1564 }
1565 }
1566 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1567 return ret;
1568}
1569
1570static int __soft_offline_page(struct page *page, int flags) 1528static int __soft_offline_page(struct page *page, int flags)
1571{ 1529{
1572 int ret; 1530 int ret;
@@ -1653,3 +1611,67 @@ static int __soft_offline_page(struct page *page, int flags)
1653 } 1611 }
1654 return ret; 1612 return ret;
1655} 1613}
1614
1615/**
1616 * soft_offline_page - Soft offline a page.
1617 * @page: page to offline
1618 * @flags: flags. Same as memory_failure().
1619 *
1620 * Returns 0 on success, otherwise negated errno.
1621 *
1622 * Soft offline a page, by migration or invalidation,
1623 * without killing anything. This is for the case when
1624 * a page is not corrupted yet (so it's still valid to access),
1625 * but has had a number of corrected errors and is better taken
1626 * out.
1627 *
1628 * The actual policy on when to do that is maintained by
1629 * user space.
1630 *
1631 * This should never impact any application or cause data loss,
1632 * however it might take some time.
1633 *
1634 * This is not a 100% solution for all memory, but tries to be
1635 * ``good enough'' for the majority of memory.
1636 */
1637int soft_offline_page(struct page *page, int flags)
1638{
1639 int ret;
1640 unsigned long pfn = page_to_pfn(page);
1641 struct page *hpage = compound_trans_head(page);
1642
1643 if (PageHWPoison(page)) {
1644 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1645 return -EBUSY;
1646 }
1647 if (!PageHuge(page) && PageTransHuge(hpage)) {
1648 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1649 pr_info("soft offline: %#lx: failed to split THP\n",
1650 pfn);
1651 return -EBUSY;
1652 }
1653 }
1654
1655 ret = get_any_page(page, pfn, flags);
1656 if (ret < 0)
1657 goto unset;
1658 if (ret) { /* for in-use pages */
1659 if (PageHuge(page))
1660 ret = soft_offline_huge_page(page, flags);
1661 else
1662 ret = __soft_offline_page(page, flags);
1663 } else { /* for free pages */
1664 if (PageHuge(page)) {
1665 set_page_hwpoison_huge_page(hpage);
1666 dequeue_hwpoisoned_huge_page(hpage);
1667 atomic_long_add(1 << compound_order(hpage),
1668 &num_poisoned_pages);
1669 } else {
1670 SetPageHWPoison(page);
1671 atomic_long_inc(&num_poisoned_pages);
1672 }
1673 }
1674unset:
1675 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1676 return ret;
1677}
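The memory-failure changes above route hugepage soft offlining through the generic migrate_pages() path and move soft_offline_page() below its helpers. For context only, soft offlining can also be requested from user space with madvise(MADV_SOFT_OFFLINE); the program below is a minimal sketch, not part of this patch. The hand-defined MADV_SOFT_OFFLINE value and the requirement for CAP_SYS_ADMIN plus CONFIG_MEMORY_FAILURE are assumptions about the running kernel and libc.

/*
 * Hedged sketch: ask the kernel to soft-offline one of our own pages via
 * madvise(MADV_SOFT_OFFLINE), which lands in soft_offline_page() above.
 * Needs root (CAP_SYS_ADMIN) and a kernel built with CONFIG_MEMORY_FAILURE.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from asm-generic/mman-common.h */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* make sure the page is actually present */

	/* The kernel migrates the contents and retires the old page. */
	if (madvise(p, psz, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("page at %p soft-offlined\n", (void *)p);

	munmap(p, psz);
	return 0;
}

A failing madvise() with EINVAL typically means the kernel was built without memory-failure support.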
diff --git a/mm/memory.c b/mm/memory.c
index b3c6bf9a398e..2b73dbde2274 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -373,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
373#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 373#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
374 374
375/* 375/*
376 * If a p?d_bad entry is found while walking page tables, report
377 * the error, before resetting entry to p?d_none. Usually (but
378 * very seldom) called out from the p?d_none_or_clear_bad macros.
379 */
380
381void pgd_clear_bad(pgd_t *pgd)
382{
383 pgd_ERROR(*pgd);
384 pgd_clear(pgd);
385}
386
387void pud_clear_bad(pud_t *pud)
388{
389 pud_ERROR(*pud);
390 pud_clear(pud);
391}
392
393void pmd_clear_bad(pmd_t *pmd)
394{
395 pmd_ERROR(*pmd);
396 pmd_clear(pmd);
397}
398
399/*
400 * Note: this doesn't free the actual pages themselves. That 376 * Note: this doesn't free the actual pages themselves. That
401 * has been handled earlier when unmapping all the memory regions. 377 * has been handled earlier when unmapping all the memory regions.
402 */ 378 */
@@ -1505,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1505 if (pud_none(*pud)) 1481 if (pud_none(*pud))
1506 goto no_page_table; 1482 goto no_page_table;
1507 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 1483 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1508 BUG_ON(flags & FOLL_GET); 1484 if (flags & FOLL_GET)
1485 goto out;
1509 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1486 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1510 goto out; 1487 goto out;
1511 } 1488 }
@@ -1516,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1516 if (pmd_none(*pmd)) 1493 if (pmd_none(*pmd))
1517 goto no_page_table; 1494 goto no_page_table;
1518 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 1495 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1519 BUG_ON(flags & FOLL_GET);
1520 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1496 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1497 if (flags & FOLL_GET) {
1498 /*
1499 * The refcount on tail pages is not well-defined and
1500 * shouldn't be taken. The caller should handle a NULL
1501 * return when trying to follow tail pages.
1502 */
1503 if (PageHead(page))
1504 get_page(page);
1505 else {
1506 page = NULL;
1507 goto out;
1508 }
1509 }
1521 goto out; 1510 goto out;
1522 } 1511 }
1523 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1512 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ca1dd3aa5eee..ed85fe3870e2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -30,6 +30,7 @@
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/hugetlb.h>
33 34
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
35 36
@@ -51,14 +52,10 @@ DEFINE_MUTEX(mem_hotplug_mutex);
51void lock_memory_hotplug(void) 52void lock_memory_hotplug(void)
52{ 53{
53 mutex_lock(&mem_hotplug_mutex); 54 mutex_lock(&mem_hotplug_mutex);
54
55 /* for exclusive hibernation if CONFIG_HIBERNATION=y */
56 lock_system_sleep();
57} 55}
58 56
59void unlock_memory_hotplug(void) 57void unlock_memory_hotplug(void)
60{ 58{
61 unlock_system_sleep();
62 mutex_unlock(&mem_hotplug_mutex); 59 mutex_unlock(&mem_hotplug_mutex);
63} 60}
64 61
@@ -194,7 +191,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
194 191
195 zone = &pgdat->node_zones[0]; 192 zone = &pgdat->node_zones[0];
196 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 193 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
197 if (zone->wait_table) { 194 if (zone_is_initialized(zone)) {
198 nr_pages = zone->wait_table_hash_nr_entries 195 nr_pages = zone->wait_table_hash_nr_entries
199 * sizeof(wait_queue_head_t); 196 * sizeof(wait_queue_head_t);
200 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 197 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
@@ -229,8 +226,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
229 226
230 zone_span_writelock(zone); 227 zone_span_writelock(zone);
231 228
232 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 229 old_zone_end_pfn = zone_end_pfn(zone);
233 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) 230 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
234 zone->zone_start_pfn = start_pfn; 231 zone->zone_start_pfn = start_pfn;
235 232
236 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 233 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -305,7 +302,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
305 goto out_fail; 302 goto out_fail;
306 303
307 /* use start_pfn for z1's start_pfn if z1 is empty */ 304 /* use start_pfn for z1's start_pfn if z1 is empty */
308 if (z1->spanned_pages) 305 if (!zone_is_empty(z1))
309 z1_start_pfn = z1->zone_start_pfn; 306 z1_start_pfn = z1->zone_start_pfn;
310 else 307 else
311 z1_start_pfn = start_pfn; 308 z1_start_pfn = start_pfn;
@@ -347,7 +344,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
347 goto out_fail; 344 goto out_fail;
348 345
349 /* use end_pfn for z2's end_pfn if z2 is empty */ 346 /* use end_pfn for z2's end_pfn if z2 is empty */
350 if (z2->spanned_pages) 347 if (!zone_is_empty(z2))
351 z2_end_pfn = zone_end_pfn(z2); 348 z2_end_pfn = zone_end_pfn(z2);
352 else 349 else
353 z2_end_pfn = end_pfn; 350 z2_end_pfn = end_pfn;
@@ -514,8 +511,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
514static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 511static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
515 unsigned long end_pfn) 512 unsigned long end_pfn)
516{ 513{
517 unsigned long zone_start_pfn = zone->zone_start_pfn; 514 unsigned long zone_start_pfn = zone->zone_start_pfn;
518 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 515 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
516 unsigned long zone_end_pfn = z;
519 unsigned long pfn; 517 unsigned long pfn;
520 struct mem_section *ms; 518 struct mem_section *ms;
521 int nid = zone_to_nid(zone); 519 int nid = zone_to_nid(zone);
@@ -1069,6 +1067,23 @@ out:
1069 return ret; 1067 return ret;
1070} 1068}
1071 1069
1070static int check_hotplug_memory_range(u64 start, u64 size)
1071{
1072 u64 start_pfn = start >> PAGE_SHIFT;
1073 u64 nr_pages = size >> PAGE_SHIFT;
1074
1075 /* Memory range must be aligned with section */
1076 if ((start_pfn & ~PAGE_SECTION_MASK) ||
1077 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
1078 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
1079 (unsigned long long)start,
1080 (unsigned long long)size);
1081 return -EINVAL;
1082 }
1083
1084 return 0;
1085}
1086
1072/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1087/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1073int __ref add_memory(int nid, u64 start, u64 size) 1088int __ref add_memory(int nid, u64 start, u64 size)
1074{ 1089{
@@ -1078,6 +1093,10 @@ int __ref add_memory(int nid, u64 start, u64 size)
1078 struct resource *res; 1093 struct resource *res;
1079 int ret; 1094 int ret;
1080 1095
1096 ret = check_hotplug_memory_range(start, size);
1097 if (ret)
1098 return ret;
1099
1081 lock_memory_hotplug(); 1100 lock_memory_hotplug();
1082 1101
1083 res = register_memory_resource(start, size); 1102 res = register_memory_resource(start, size);
@@ -1208,10 +1227,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
1208} 1227}
1209 1228
1210/* 1229/*
1211 * Scanning pfn is much easier than scanning lru list. 1230 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
1212 * Scan pfn from start to end and Find LRU page. 1231 * and hugepages). We scan pfns because it's much easier than walking a
 1232 * linked list. This function returns the pfn of the first movable page
 1233 * found, otherwise 0.
1213 */ 1234 */
1214static unsigned long scan_lru_pages(unsigned long start, unsigned long end) 1235static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1215{ 1236{
1216 unsigned long pfn; 1237 unsigned long pfn;
1217 struct page *page; 1238 struct page *page;
@@ -1220,6 +1241,13 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
1220 page = pfn_to_page(pfn); 1241 page = pfn_to_page(pfn);
1221 if (PageLRU(page)) 1242 if (PageLRU(page))
1222 return pfn; 1243 return pfn;
1244 if (PageHuge(page)) {
1245 if (is_hugepage_active(page))
1246 return pfn;
1247 else
1248 pfn = round_up(pfn + 1,
1249 1 << compound_order(page)) - 1;
1250 }
1223 } 1251 }
1224 } 1252 }
1225 return 0; 1253 return 0;
@@ -1240,6 +1268,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1240 if (!pfn_valid(pfn)) 1268 if (!pfn_valid(pfn))
1241 continue; 1269 continue;
1242 page = pfn_to_page(pfn); 1270 page = pfn_to_page(pfn);
1271
1272 if (PageHuge(page)) {
1273 struct page *head = compound_head(page);
1274 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1275 if (compound_order(head) > PFN_SECTION_SHIFT) {
1276 ret = -EBUSY;
1277 break;
1278 }
1279 if (isolate_huge_page(page, &source))
1280 move_pages -= 1 << compound_order(head);
1281 continue;
1282 }
1283
1243 if (!get_page_unless_zero(page)) 1284 if (!get_page_unless_zero(page))
1244 continue; 1285 continue;
1245 /* 1286 /*
@@ -1272,7 +1313,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1272 } 1313 }
1273 if (!list_empty(&source)) { 1314 if (!list_empty(&source)) {
1274 if (not_managed) { 1315 if (not_managed) {
1275 putback_lru_pages(&source); 1316 putback_movable_pages(&source);
1276 goto out; 1317 goto out;
1277 } 1318 }
1278 1319
@@ -1283,7 +1324,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1283 ret = migrate_pages(&source, alloc_migrate_target, 0, 1324 ret = migrate_pages(&source, alloc_migrate_target, 0,
1284 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1325 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1285 if (ret) 1326 if (ret)
1286 putback_lru_pages(&source); 1327 putback_movable_pages(&source);
1287 } 1328 }
1288out: 1329out:
1289 return ret; 1330 return ret;
@@ -1472,7 +1513,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
1472 struct zone *zone; 1513 struct zone *zone;
1473 struct memory_notify arg; 1514 struct memory_notify arg;
1474 1515
1475 BUG_ON(start_pfn >= end_pfn);
1476 /* at least, alignment against pageblock is necessary */ 1516 /* at least, alignment against pageblock is necessary */
1477 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1517 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1478 return -EINVAL; 1518 return -EINVAL;
@@ -1527,8 +1567,8 @@ repeat:
1527 drain_all_pages(); 1567 drain_all_pages();
1528 } 1568 }
1529 1569
1530 pfn = scan_lru_pages(start_pfn, end_pfn); 1570 pfn = scan_movable_pages(start_pfn, end_pfn);
1531 if (pfn) { /* We have page on LRU */ 1571 if (pfn) { /* We have movable pages */
1532 ret = do_migrate_range(pfn, end_pfn); 1572 ret = do_migrate_range(pfn, end_pfn);
1533 if (!ret) { 1573 if (!ret) {
1534 drain = 1; 1574 drain = 1;
@@ -1547,6 +1587,11 @@ repeat:
1547 yield(); 1587 yield();
1548 /* drain pcp pages, this is synchronous. */ 1588 /* drain pcp pages, this is synchronous. */
1549 drain_all_pages(); 1589 drain_all_pages();
1590 /*
1591 * dissolve free hugepages in the memory block before doing offlining
1592 * actually in order to make hugetlbfs's object counting consistent.
1593 */
1594 dissolve_free_huge_pages(start_pfn, end_pfn);
1550 /* check again */ 1595 /* check again */
1551 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1596 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1552 if (offlined_pages < 0) { 1597 if (offlined_pages < 0) {
@@ -1674,9 +1719,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1674 return ret; 1719 return ret;
1675} 1720}
1676 1721
1677static int check_cpu_on_node(void *data) 1722static int check_cpu_on_node(pg_data_t *pgdat)
1678{ 1723{
1679 struct pglist_data *pgdat = data;
1680 int cpu; 1724 int cpu;
1681 1725
1682 for_each_present_cpu(cpu) { 1726 for_each_present_cpu(cpu) {
@@ -1691,10 +1735,9 @@ static int check_cpu_on_node(void *data)
1691 return 0; 1735 return 0;
1692} 1736}
1693 1737
1694static void unmap_cpu_on_node(void *data) 1738static void unmap_cpu_on_node(pg_data_t *pgdat)
1695{ 1739{
1696#ifdef CONFIG_ACPI_NUMA 1740#ifdef CONFIG_ACPI_NUMA
1697 struct pglist_data *pgdat = data;
1698 int cpu; 1741 int cpu;
1699 1742
1700 for_each_possible_cpu(cpu) 1743 for_each_possible_cpu(cpu)
@@ -1703,10 +1746,11 @@ static void unmap_cpu_on_node(void *data)
1703#endif 1746#endif
1704} 1747}
1705 1748
1706static int check_and_unmap_cpu_on_node(void *data) 1749static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1707{ 1750{
1708 int ret = check_cpu_on_node(data); 1751 int ret;
1709 1752
1753 ret = check_cpu_on_node(pgdat);
1710 if (ret) 1754 if (ret)
1711 return ret; 1755 return ret;
1712 1756
@@ -1715,11 +1759,18 @@ static int check_and_unmap_cpu_on_node(void *data)
1715 * the cpu_to_node() now. 1759 * the cpu_to_node() now.
1716 */ 1760 */
1717 1761
1718 unmap_cpu_on_node(data); 1762 unmap_cpu_on_node(pgdat);
1719 return 0; 1763 return 0;
1720} 1764}
1721 1765
1722/* offline the node if all memory sections of this node are removed */ 1766/**
1767 * try_offline_node
1768 *
1769 * Offline a node if all memory sections and cpus of the node are removed.
1770 *
1771 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1772 * and online/offline operations before this call.
1773 */
1723void try_offline_node(int nid) 1774void try_offline_node(int nid)
1724{ 1775{
1725 pg_data_t *pgdat = NODE_DATA(nid); 1776 pg_data_t *pgdat = NODE_DATA(nid);
@@ -1745,7 +1796,7 @@ void try_offline_node(int nid)
1745 return; 1796 return;
1746 } 1797 }
1747 1798
1748 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) 1799 if (check_and_unmap_cpu_on_node(pgdat))
1749 return; 1800 return;
1750 1801
1751 /* 1802 /*
@@ -1782,10 +1833,19 @@ void try_offline_node(int nid)
1782} 1833}
1783EXPORT_SYMBOL(try_offline_node); 1834EXPORT_SYMBOL(try_offline_node);
1784 1835
1836/**
1837 * remove_memory
1838 *
1839 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1840 * and online/offline operations before this call, as required by
1841 * try_offline_node().
1842 */
1785void __ref remove_memory(int nid, u64 start, u64 size) 1843void __ref remove_memory(int nid, u64 start, u64 size)
1786{ 1844{
1787 int ret; 1845 int ret;
1788 1846
1847 BUG_ON(check_hotplug_memory_range(start, size));
1848
1789 lock_memory_hotplug(); 1849 lock_memory_hotplug();
1790 1850
1791 /* 1851 /*
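The hotplug hunks above add a section-alignment check for add_memory()/remove_memory() and teach the offline path about hugepages (scan_movable_pages(), isolate_huge_page(), dissolve_free_huge_pages()). Offlining is normally driven through the memory-block sysfs interface; the sketch below only illustrates that interface. The block name "memory32" is an arbitrary example and root privileges are assumed.

/*
 * Hedged sketch: request offlining of one memory block via sysfs, which
 * eventually reaches __offline_pages() for that block's pfn range.
 * "memory32" is an example; real code would enumerate
 * /sys/devices/system/memory/ first.
 */
#include <stdio.h>

int main(void)
{
	const char *state = "/sys/devices/system/memory/memory32/state";
	char buf[64];
	FILE *f;

	f = fopen("/sys/devices/system/memory/block_size_bytes", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("memory block size: 0x%s", buf);	/* hex string */
		fclose(f);
	}

	f = fopen(state, "w");
	if (!f) {
		perror(state);
		return 1;
	}
	if (fputs("offline", f) == EOF || fflush(f) == EOF)
		perror("offline request");	/* commonly EBUSY when pages are pinned */
	fclose(f);
	return 0;
}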
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4baf12e534d1..04729647f359 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
123static struct mempolicy *get_task_policy(struct task_struct *p) 123static struct mempolicy *get_task_policy(struct task_struct *p)
124{ 124{
125 struct mempolicy *pol = p->mempolicy; 125 struct mempolicy *pol = p->mempolicy;
126 int node;
127 126
128 if (!pol) { 127 if (!pol) {
129 node = numa_node_id(); 128 int node = numa_node_id();
130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node];
132 129
133 /* preferred_node_policy is not initialised early in boot */ 130 if (node != NUMA_NO_NODE) {
134 if (!pol->mode) 131 pol = &preferred_node_policy[node];
135 pol = NULL; 132 /*
133 * preferred_node_policy is not initialised early in
134 * boot
135 */
136 if (!pol->mode)
137 pol = NULL;
138 }
136 } 139 }
137 140
138 return pol; 141 return pol;
@@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
473static void migrate_page_add(struct page *page, struct list_head *pagelist, 476static void migrate_page_add(struct page *page, struct list_head *pagelist,
474 unsigned long flags); 477 unsigned long flags);
475 478
476/* Scan through pages checking if pages follow certain conditions. */ 479/*
477static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 480 * Scan through pages checking if pages follow certain conditions,
481 * and move them to the pagelist if they do.
482 */
483static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
478 unsigned long addr, unsigned long end, 484 unsigned long addr, unsigned long end,
479 const nodemask_t *nodes, unsigned long flags, 485 const nodemask_t *nodes, unsigned long flags,
480 void *private) 486 void *private)
@@ -512,7 +518,31 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
512 return addr != end; 518 return addr != end;
513} 519}
514 520
515static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 521static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 void *private)
524{
525#ifdef CONFIG_HUGETLB_PAGE
526 int nid;
527 struct page *page;
528
529 spin_lock(&vma->vm_mm->page_table_lock);
530 page = pte_page(huge_ptep_get((pte_t *)pmd));
531 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
533 goto unlock;
534 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535 if (flags & (MPOL_MF_MOVE_ALL) ||
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private);
538unlock:
539 spin_unlock(&vma->vm_mm->page_table_lock);
540#else
541 BUG();
542#endif
543}
544
545static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
516 unsigned long addr, unsigned long end, 546 unsigned long addr, unsigned long end,
517 const nodemask_t *nodes, unsigned long flags, 547 const nodemask_t *nodes, unsigned long flags,
518 void *private) 548 void *private)
@@ -523,17 +553,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
523 pmd = pmd_offset(pud, addr); 553 pmd = pmd_offset(pud, addr);
524 do { 554 do {
525 next = pmd_addr_end(addr, end); 555 next = pmd_addr_end(addr, end);
556 if (!pmd_present(*pmd))
557 continue;
558 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 flags, private);
561 continue;
562 }
526 split_huge_page_pmd(vma, addr, pmd); 563 split_huge_page_pmd(vma, addr, pmd);
527 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 564 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
528 continue; 565 continue;
529 if (check_pte_range(vma, pmd, addr, next, nodes, 566 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
530 flags, private)) 567 flags, private))
531 return -EIO; 568 return -EIO;
532 } while (pmd++, addr = next, addr != end); 569 } while (pmd++, addr = next, addr != end);
533 return 0; 570 return 0;
534} 571}
535 572
536static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 573static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
537 unsigned long addr, unsigned long end, 574 unsigned long addr, unsigned long end,
538 const nodemask_t *nodes, unsigned long flags, 575 const nodemask_t *nodes, unsigned long flags,
539 void *private) 576 void *private)
@@ -544,16 +581,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
544 pud = pud_offset(pgd, addr); 581 pud = pud_offset(pgd, addr);
545 do { 582 do {
546 next = pud_addr_end(addr, end); 583 next = pud_addr_end(addr, end);
584 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 continue;
547 if (pud_none_or_clear_bad(pud)) 586 if (pud_none_or_clear_bad(pud))
548 continue; 587 continue;
549 if (check_pmd_range(vma, pud, addr, next, nodes, 588 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
550 flags, private)) 589 flags, private))
551 return -EIO; 590 return -EIO;
552 } while (pud++, addr = next, addr != end); 591 } while (pud++, addr = next, addr != end);
553 return 0; 592 return 0;
554} 593}
555 594
556static inline int check_pgd_range(struct vm_area_struct *vma, 595static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
557 unsigned long addr, unsigned long end, 596 unsigned long addr, unsigned long end,
558 const nodemask_t *nodes, unsigned long flags, 597 const nodemask_t *nodes, unsigned long flags,
559 void *private) 598 void *private)
@@ -566,7 +605,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
566 next = pgd_addr_end(addr, end); 605 next = pgd_addr_end(addr, end);
567 if (pgd_none_or_clear_bad(pgd)) 606 if (pgd_none_or_clear_bad(pgd))
568 continue; 607 continue;
569 if (check_pud_range(vma, pgd, addr, next, nodes, 608 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
570 flags, private)) 609 flags, private))
571 return -EIO; 610 return -EIO;
572 } while (pgd++, addr = next, addr != end); 611 } while (pgd++, addr = next, addr != end);
@@ -604,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
604#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ 643#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
605 644
606/* 645/*
607 * Check if all pages in a range are on a set of nodes. 646 * Walk through page tables and collect pages to be migrated.
608 * If pagelist != NULL then isolate pages from the LRU and 647 *
609 * put them on the pagelist. 648 * If pages found in a given range are on a set of nodes (determined by
649 * @nodes and @flags), they are isolated and queued to the pagelist
650 * passed via @private.
610 */ 651 */
611static struct vm_area_struct * 652static struct vm_area_struct *
612check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 653queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
613 const nodemask_t *nodes, unsigned long flags, void *private) 654 const nodemask_t *nodes, unsigned long flags, void *private)
614{ 655{
615 int err; 656 int err;
@@ -635,9 +676,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
635 return ERR_PTR(-EFAULT); 676 return ERR_PTR(-EFAULT);
636 } 677 }
637 678
638 if (is_vm_hugetlb_page(vma))
639 goto next;
640
641 if (flags & MPOL_MF_LAZY) { 679 if (flags & MPOL_MF_LAZY) {
642 change_prot_numa(vma, start, endvma); 680 change_prot_numa(vma, start, endvma);
643 goto next; 681 goto next;
@@ -647,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
647 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
648 vma_migratable(vma))) { 686 vma_migratable(vma))) {
649 687
650 err = check_pgd_range(vma, start, endvma, nodes, 688 err = queue_pages_pgd_range(vma, start, endvma, nodes,
651 flags, private); 689 flags, private);
652 if (err) { 690 if (err) {
653 first = ERR_PTR(err); 691 first = ERR_PTR(err);
@@ -990,7 +1028,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
990 1028
991static struct page *new_node_page(struct page *page, unsigned long node, int **x) 1029static struct page *new_node_page(struct page *page, unsigned long node, int **x)
992{ 1030{
993 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 1031 if (PageHuge(page))
1032 return alloc_huge_page_node(page_hstate(compound_head(page)),
1033 node);
1034 else
1035 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
994} 1036}
995 1037
996/* 1038/*
@@ -1013,14 +1055,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1013 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 1055 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1014 */ 1056 */
1015 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1057 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1016 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 1058 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1017 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1059 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1018 1060
1019 if (!list_empty(&pagelist)) { 1061 if (!list_empty(&pagelist)) {
1020 err = migrate_pages(&pagelist, new_node_page, dest, 1062 err = migrate_pages(&pagelist, new_node_page, dest,
1021 MIGRATE_SYNC, MR_SYSCALL); 1063 MIGRATE_SYNC, MR_SYSCALL);
1022 if (err) 1064 if (err)
1023 putback_lru_pages(&pagelist); 1065 putback_movable_pages(&pagelist);
1024 } 1066 }
1025 1067
1026 return err; 1068 return err;
@@ -1154,10 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
1154 break; 1196 break;
1155 vma = vma->vm_next; 1197 vma = vma->vm_next;
1156 } 1198 }
1157
1158 /* 1199 /*
1159 * if !vma, alloc_page_vma() will use task or system default policy 1200 * queue_pages_range() confirms that @page belongs to some vma,
1201 * so vma shouldn't be NULL.
1160 */ 1202 */
1203 BUG_ON(!vma);
1204
1205 if (PageHuge(page))
1206 return alloc_huge_page_noerr(vma, address, 1);
1161 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1207 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1162} 1208}
1163#else 1209#else
@@ -1249,7 +1295,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1249 if (err) 1295 if (err)
1250 goto mpol_out; 1296 goto mpol_out;
1251 1297
1252 vma = check_range(mm, start, end, nmask, 1298 vma = queue_pages_range(mm, start, end, nmask,
1253 flags | MPOL_MF_INVERT, &pagelist); 1299 flags | MPOL_MF_INVERT, &pagelist);
1254 1300
1255 err = PTR_ERR(vma); /* maybe ... */ 1301 err = PTR_ERR(vma); /* maybe ... */
@@ -1265,7 +1311,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1265 (unsigned long)vma, 1311 (unsigned long)vma,
1266 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1312 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1267 if (nr_failed) 1313 if (nr_failed)
1268 putback_lru_pages(&pagelist); 1314 putback_movable_pages(&pagelist);
1269 } 1315 }
1270 1316
1271 if (nr_failed && (flags & MPOL_MF_STRICT)) 1317 if (nr_failed && (flags & MPOL_MF_STRICT))
@@ -2065,6 +2111,16 @@ retry_cpuset:
2065} 2111}
2066EXPORT_SYMBOL(alloc_pages_current); 2112EXPORT_SYMBOL(alloc_pages_current);
2067 2113
2114int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2115{
2116 struct mempolicy *pol = mpol_dup(vma_policy(src));
2117
2118 if (IS_ERR(pol))
2119 return PTR_ERR(pol);
2120 dst->vm_policy = pol;
2121 return 0;
2122}
2123
2068/* 2124/*
2069 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2125 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2070 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2126 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
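With check_range() renamed to queue_pages_range() and given a hugetlb leg, the apparent intent is that mbind() and the migration syscalls can queue hugetlb pages as well as normal LRU pages. The fragment below is a user-space sketch of the plain entry point only; it assumes libnuma's <numaif.h> (link with -lnuma), a NUMA-capable kernel, and uses node 0 purely as an example target.

/*
 * Hedged sketch: bind an anonymous mapping to node 0 with MPOL_MF_MOVE,
 * which walks the queue_pages_*() helpers above and migrates any
 * misplaced pages.  Node 0 is just an example.
 */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 16 * psz;
	unsigned long nodemask = 1UL;		/* bit 0 == node 0 */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);			/* fault the pages in first */

	/* MPOL_MF_MOVE asks the kernel to migrate pages off other nodes. */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	munmap(p, len);
	return 0;
}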
diff --git a/mm/mempool.c b/mm/mempool.c
index 54990476c049..659aa42bad16 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
73 gfp_t gfp_mask, int node_id) 73 gfp_t gfp_mask, int node_id)
74{ 74{
75 mempool_t *pool; 75 mempool_t *pool;
76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); 76 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
77 if (!pool) 77 if (!pool)
78 return NULL; 78 return NULL;
79 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c24438bba..b7ded7eafe3a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l)
100 struct page *page2; 100 struct page *page2;
101 101
102 list_for_each_entry_safe(page, page2, l, lru) { 102 list_for_each_entry_safe(page, page2, l, lru) {
103 if (unlikely(PageHuge(page))) {
104 putback_active_hugepage(page);
105 continue;
106 }
103 list_del(&page->lru); 107 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON + 108 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page)); 109 page_is_file_cache(page));
@@ -945,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
945 struct page *new_hpage = get_new_page(hpage, private, &result); 949 struct page *new_hpage = get_new_page(hpage, private, &result);
946 struct anon_vma *anon_vma = NULL; 950 struct anon_vma *anon_vma = NULL;
947 951
952 /*
953 * Movability of hugepages depends on the architecture and the hugepage size.
954 * This check is necessary because some callers of hugepage migration,
955 * such as soft offline and memory hot-remove, don't walk through page
956 * tables or check whether the hugepage is pmd-based before
957 * kicking off migration.
958 */
959 if (!hugepage_migration_support(page_hstate(hpage)))
960 return -ENOSYS;
961
948 if (!new_hpage) 962 if (!new_hpage)
949 return -ENOMEM; 963 return -ENOMEM;
950 964
@@ -975,6 +989,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
975 989
976 unlock_page(hpage); 990 unlock_page(hpage);
977out: 991out:
992 if (rc != -EAGAIN)
993 putback_active_hugepage(hpage);
978 put_page(new_hpage); 994 put_page(new_hpage);
979 if (result) { 995 if (result) {
980 if (rc) 996 if (rc)
@@ -1025,7 +1041,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1025 list_for_each_entry_safe(page, page2, from, lru) { 1041 list_for_each_entry_safe(page, page2, from, lru) {
1026 cond_resched(); 1042 cond_resched();
1027 1043
1028 rc = unmap_and_move(get_new_page, private, 1044 if (PageHuge(page))
1045 rc = unmap_and_move_huge_page(get_new_page,
1046 private, page, pass > 2, mode);
1047 else
1048 rc = unmap_and_move(get_new_page, private,
1029 page, pass > 2, mode); 1049 page, pass > 2, mode);
1030 1050
1031 switch(rc) { 1051 switch(rc) {
@@ -1058,32 +1078,6 @@ out:
1058 return rc; 1078 return rc;
1059} 1079}
1060 1080
1061int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1062 unsigned long private, enum migrate_mode mode)
1063{
1064 int pass, rc;
1065
1066 for (pass = 0; pass < 10; pass++) {
1067 rc = unmap_and_move_huge_page(get_new_page, private,
1068 hpage, pass > 2, mode);
1069 switch (rc) {
1070 case -ENOMEM:
1071 goto out;
1072 case -EAGAIN:
1073 /* try again */
1074 cond_resched();
1075 break;
1076 case MIGRATEPAGE_SUCCESS:
1077 goto out;
1078 default:
1079 rc = -EIO;
1080 goto out;
1081 }
1082 }
1083out:
1084 return rc;
1085}
1086
1087#ifdef CONFIG_NUMA 1081#ifdef CONFIG_NUMA
1088/* 1082/*
1089 * Move a list of individual pages 1083 * Move a list of individual pages
@@ -1108,7 +1102,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
1108 1102
1109 *result = &pm->status; 1103 *result = &pm->status;
1110 1104
1111 return alloc_pages_exact_node(pm->node, 1105 if (PageHuge(p))
1106 return alloc_huge_page_node(page_hstate(compound_head(p)),
1107 pm->node);
1108 else
1109 return alloc_pages_exact_node(pm->node,
1112 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 1110 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1113} 1111}
1114 1112
@@ -1168,6 +1166,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1168 !migrate_all) 1166 !migrate_all)
1169 goto put_and_set; 1167 goto put_and_set;
1170 1168
1169 if (PageHuge(page)) {
1170 isolate_huge_page(page, &pagelist);
1171 goto put_and_set;
1172 }
1173
1171 err = isolate_lru_page(page); 1174 err = isolate_lru_page(page);
1172 if (!err) { 1175 if (!err) {
1173 list_add_tail(&page->lru, &pagelist); 1176 list_add_tail(&page->lru, &pagelist);
@@ -1190,7 +1193,7 @@ set_status:
1190 err = migrate_pages(&pagelist, new_page_node, 1193 err = migrate_pages(&pagelist, new_page_node,
1191 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); 1194 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1192 if (err) 1195 if (err)
1193 putback_lru_pages(&pagelist); 1196 putback_movable_pages(&pagelist);
1194 } 1197 }
1195 1198
1196 up_read(&mm->mmap_sem); 1199 up_read(&mm->mmap_sem);
@@ -1468,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1468 if (!populated_zone(zone)) 1471 if (!populated_zone(zone))
1469 continue; 1472 continue;
1470 1473
1471 if (zone->all_unreclaimable) 1474 if (!zone_reclaimable(zone))
1472 continue; 1475 continue;
1473 1476
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 1477 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
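unmap_and_move_huge_page() is now driven from the common migrate_pages() loop and do_move_page_to_node_array() isolates hugepages, so the same paths serve move_pages(2). The sketch below shows that syscall for a single ordinary page; it assumes libnuma's <numaif.h> (link with -lnuma) and that node 0 exists.

/*
 * Hedged sketch: ask the kernel to move one of our pages to node 0 using
 * move_pages(2), which funnels into migrate_pages() as modified above.
 */
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *pages[1];
	int nodes[1] = { 0 };		/* example destination node */
	int status[1] = { -1 };

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;			/* fault the page in */
	pages[0] = p;

	if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page now on node %d (negative means error)\n", status[0]);

	munmap(p, psz);
	return 0;
}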
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7d1bca..d63802663242 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -11,6 +11,7 @@
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/swapops.h> 12#include <linux/swapops.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/pagevec.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
15#include <linux/syscalls.h> 16#include <linux/syscalls.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -18,6 +19,8 @@
18#include <linux/rmap.h> 19#include <linux/rmap.h>
19#include <linux/mmzone.h> 20#include <linux/mmzone.h>
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include <linux/memcontrol.h>
23#include <linux/mm_inline.h>
21 24
22#include "internal.h" 25#include "internal.h"
23 26
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
87 } 90 }
88} 91}
89 92
93/*
94 * Finish munlock after successful page isolation
95 *
96 * Page must be locked. This is a wrapper for try_to_munlock()
97 * and putback_lru_page() with munlock accounting.
98 */
99static void __munlock_isolated_page(struct page *page)
100{
101 int ret = SWAP_AGAIN;
102
103 /*
104 * Optimization: if the page was mapped just once, that's our mapping
105 * and we don't need to check all the other vmas.
106 */
107 if (page_mapcount(page) > 1)
108 ret = try_to_munlock(page);
109
110 /* Did try_to_munlock() succeed or punt? */
111 if (ret != SWAP_MLOCK)
112 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
113
114 putback_lru_page(page);
115}
116
117/*
118 * Accounting for page isolation fail during munlock
119 *
120 * Performs accounting when page isolation fails in munlock. There is nothing
121 * else to do because it means some other task has already removed the page
122 * from the LRU. putback_lru_page() will take care of removing the page from
123 * the unevictable list, if necessary. vmscan [page_referenced()] will move
124 * the page back to the unevictable list if some other vma has it mlocked.
125 */
126static void __munlock_isolation_failed(struct page *page)
127{
128 if (PageUnevictable(page))
129 count_vm_event(UNEVICTABLE_PGSTRANDED);
130 else
131 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
132}
133
90/** 134/**
91 * munlock_vma_page - munlock a vma page 135 * munlock_vma_page - munlock a vma page
92 * @page - page to be unlocked 136 * @page - page to be unlocked
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
112 unsigned int nr_pages = hpage_nr_pages(page); 156 unsigned int nr_pages = hpage_nr_pages(page);
113 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 157 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
114 page_mask = nr_pages - 1; 158 page_mask = nr_pages - 1;
115 if (!isolate_lru_page(page)) { 159 if (!isolate_lru_page(page))
116 int ret = SWAP_AGAIN; 160 __munlock_isolated_page(page);
117 161 else
118 /* 162 __munlock_isolation_failed(page);
119 * Optimization: if the page was mapped just once,
120 * that's our mapping and we don't need to check all the
121 * other vmas.
122 */
123 if (page_mapcount(page) > 1)
124 ret = try_to_munlock(page);
125 /*
126 * did try_to_unlock() succeed or punt?
127 */
128 if (ret != SWAP_MLOCK)
129 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
130
131 putback_lru_page(page);
132 } else {
133 /*
134 * Some other task has removed the page from the LRU.
135 * putback_lru_page() will take care of removing the
136 * page from the unevictable list, if necessary.
137 * vmscan [page_referenced()] will move the page back
138 * to the unevictable list if some other vma has it
139 * mlocked.
140 */
141 if (PageUnevictable(page))
142 count_vm_event(UNEVICTABLE_PGSTRANDED);
143 else
144 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
145 }
146 } 163 }
147 164
148 return page_mask; 165 return page_mask;
@@ -210,6 +227,191 @@ static int __mlock_posix_error_return(long retval)
210} 227}
211 228
212/* 229/*
230 * Prepare page for fast batched LRU putback via __putback_lru_fast()
231 *
232 * The fast path is available only for evictable pages with a single mapping.
233 * Then we can bypass the per-cpu pvec and get better performance.
234 * When mapcount > 1 we need try_to_munlock(), which can fail.
235 * When !page_evictable(), we need the full redo logic of putback_lru_page to
236 * avoid leaving an evictable page on the unevictable list.
237 *
238 * On success, @page is added to @pvec and @pgrescued is incremented
239 * if the page was previously unevictable. @page is also unlocked.
240 */
241static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
242 int *pgrescued)
243{
244 VM_BUG_ON(PageLRU(page));
245 VM_BUG_ON(!PageLocked(page));
246
247 if (page_mapcount(page) <= 1 && page_evictable(page)) {
248 pagevec_add(pvec, page);
249 if (TestClearPageUnevictable(page))
250 (*pgrescued)++;
251 unlock_page(page);
252 return true;
253 }
254
255 return false;
256}
257
258/*
259 * Putback multiple evictable pages to the LRU
260 *
261 * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
262 * the pages might have meanwhile become unevictable but that is OK.
263 */
264static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
265{
266 count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
267 /*
268 *__pagevec_lru_add() calls release_pages() so we don't call
269 * put_page() explicitly
270 */
271 __pagevec_lru_add(pvec);
272 count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
273}
274
275/*
276 * Munlock a batch of pages from the same zone
277 *
278 * The work is split into two main phases. The first phase clears the Mlocked flag
279 * and attempts to isolate the pages, all under a single zone lru lock.
280 * The second phase finishes the munlock only for pages where isolation
281 * succeeded.
282 *
283 * Note that the pagevec may be modified during the process.
284 */
285static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
286{
287 int i;
288 int nr = pagevec_count(pvec);
289 int delta_munlocked = -nr;
290 struct pagevec pvec_putback;
291 int pgrescued = 0;
292
293 /* Phase 1: page isolation */
294 spin_lock_irq(&zone->lru_lock);
295 for (i = 0; i < nr; i++) {
296 struct page *page = pvec->pages[i];
297
298 if (TestClearPageMlocked(page)) {
299 struct lruvec *lruvec;
300 int lru;
301
302 if (PageLRU(page)) {
303 lruvec = mem_cgroup_page_lruvec(page, zone);
304 lru = page_lru(page);
305 /*
306 * We already have pin from follow_page_mask()
307 * so we can spare the get_page() here.
308 */
309 ClearPageLRU(page);
310 del_page_from_lru_list(page, lruvec, lru);
311 } else {
312 __munlock_isolation_failed(page);
313 goto skip_munlock;
314 }
315
316 } else {
317skip_munlock:
318 /*
319 * We won't be munlocking this page in the next phase
320 * but we still need to release the follow_page_mask()
321 * pin.
322 */
323 pvec->pages[i] = NULL;
324 put_page(page);
325 delta_munlocked++;
326 }
327 }
328 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
329 spin_unlock_irq(&zone->lru_lock);
330
331 /* Phase 2: page munlock */
332 pagevec_init(&pvec_putback, 0);
333 for (i = 0; i < nr; i++) {
334 struct page *page = pvec->pages[i];
335
336 if (page) {
337 lock_page(page);
338 if (!__putback_lru_fast_prepare(page, &pvec_putback,
339 &pgrescued)) {
340 /*
341 * Slow path. We don't want to lose the last
342 * pin before unlock_page()
343 */
344 get_page(page); /* for putback_lru_page() */
345 __munlock_isolated_page(page);
346 unlock_page(page);
347 put_page(page); /* from follow_page_mask() */
348 }
349 }
350 }
351
352 /*
353 * Phase 3: page putback for pages that qualified for the fast path
354 * This will also call put_page() to return pin from follow_page_mask()
355 */
356 if (pagevec_count(&pvec_putback))
357 __putback_lru_fast(&pvec_putback, pgrescued);
358}
359
360/*
361 * Fill up pagevec for __munlock_pagevec using pte walk
362 *
363 * The function expects that the struct page corresponding to @start address is
364 * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
365 *
366 * The rest of @pvec is filled by subsequent pages within the same pmd and same
367 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
368 * pages also get pinned.
369 *
370 * Returns the address of the next page that should be scanned. This equals
371 * @start + PAGE_SIZE when no page could be added by the pte walk.
372 */
373static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
374 struct vm_area_struct *vma, int zoneid, unsigned long start,
375 unsigned long end)
376{
377 pte_t *pte;
378 spinlock_t *ptl;
379
380 /*
381 * Initialize pte walk starting at the already pinned page where we
382 * are sure that there is a pte.
383 */
384 pte = get_locked_pte(vma->vm_mm, start, &ptl);
385 end = min(end, pmd_addr_end(start, end));
386
387 /* The page next to the pinned page is the first we will try to get */
388 start += PAGE_SIZE;
389 while (start < end) {
390 struct page *page = NULL;
391 pte++;
392 if (pte_present(*pte))
393 page = vm_normal_page(vma, start, *pte);
394 /*
395 * Break if page could not be obtained or the page's node+zone does not
396 * match
397 */
398 if (!page || page_zone_id(page) != zoneid)
399 break;
400
401 get_page(page);
402 /*
403 * Increase the address that will be returned *before* the
404 * eventual break due to pvec becoming full by adding the page
405 */
406 start += PAGE_SIZE;
407 if (pagevec_add(pvec, page) == 0)
408 break;
409 }
410 pte_unmap_unlock(pte, ptl);
411 return start;
412}
413
414/*
213 * munlock_vma_pages_range() - munlock all pages in the vma range.' 415 * munlock_vma_pages_range() - munlock all pages in the vma range.'
214 * @vma - vma containing range to be munlock()ed. 416 * @vma - vma containing range to be munlock()ed.
215 * @start - start address in @vma of the range 417 * @start - start address in @vma of the range
@@ -233,9 +435,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
233 vma->vm_flags &= ~VM_LOCKED; 435 vma->vm_flags &= ~VM_LOCKED;
234 436
235 while (start < end) { 437 while (start < end) {
236 struct page *page; 438 struct page *page = NULL;
237 unsigned int page_mask, page_increm; 439 unsigned int page_mask, page_increm;
440 struct pagevec pvec;
441 struct zone *zone;
442 int zoneid;
238 443
444 pagevec_init(&pvec, 0);
239 /* 445 /*
240 * Although FOLL_DUMP is intended for get_dump_page(), 446 * Although FOLL_DUMP is intended for get_dump_page(),
241 * it just so happens that its special treatment of the 447 * it just so happens that its special treatment of the
@@ -244,21 +450,45 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
244 * has sneaked into the range, we won't oops here: great). 450 * has sneaked into the range, we won't oops here: great).
245 */ 451 */
246 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, 452 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
247 &page_mask); 453 &page_mask);
454
248 if (page && !IS_ERR(page)) { 455 if (page && !IS_ERR(page)) {
249 lock_page(page); 456 if (PageTransHuge(page)) {
250 lru_add_drain(); 457 lock_page(page);
251 /* 458 /*
252 * Any THP page found by follow_page_mask() may have 459 * Any THP page found by follow_page_mask() may
253 * gotten split before reaching munlock_vma_page(), 460 * have gotten split before reaching
254 * so we need to recompute the page_mask here. 461 * munlock_vma_page(), so we need to recompute
255 */ 462 * the page_mask here.
256 page_mask = munlock_vma_page(page); 463 */
257 unlock_page(page); 464 page_mask = munlock_vma_page(page);
258 put_page(page); 465 unlock_page(page);
466 put_page(page); /* follow_page_mask() */
467 } else {
468 /*
469 * Non-huge pages are handled in batches via
470 * pagevec. The pin from follow_page_mask()
471 * prevents them from collapsing by THP.
472 */
473 pagevec_add(&pvec, page);
474 zone = page_zone(page);
475 zoneid = page_zone_id(page);
476
477 /*
478 * Try to fill the rest of pagevec using fast
479 * pte walk. This will also update start to
480 * the next page to process. Then munlock the
481 * pagevec.
482 */
483 start = __munlock_pagevec_fill(&pvec, vma,
484 zoneid, start, end);
485 __munlock_pagevec(&pvec, zone);
486 goto next;
487 }
259 } 488 }
260 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 489 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
261 start += page_increm * PAGE_SIZE; 490 start += page_increm * PAGE_SIZE;
491next:
262 cond_resched(); 492 cond_resched();
263 } 493 }
264} 494}
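The munlock side above is now batched: follow_page_mask() pins one page, __munlock_pagevec_fill() collects its neighbours with a pte walk, and __munlock_pagevec() clears PG_mlocked and isolates the whole batch under a single lru_lock acquisition. Nothing changes for user space except speed; the sketch below just exercises the path with a range larger than one pagevec, assuming RLIMIT_MEMLOCK allows it.

/*
 * Hedged sketch: lock and then unlock a multi-page range; the munlock()
 * call is what ends up in the batched __munlock_pagevec() path above.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 64 * psz;		/* several pagevecs' worth of pages */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	if (mlock(p, len)) {		/* populates and mlocks every page */
		perror("mlock");
		return 1;
	}
	memset(p, 0, len);

	if (munlock(p, len))		/* clears the mlock state in batches */
		perror("munlock");

	munmap(p, len);
	return 0;
}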
diff --git a/mm/mmap.c b/mm/mmap.c
index f9c97d10b873..9d548512ff8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1202 unsigned long *populate) 1202 unsigned long *populate)
1203{ 1203{
1204 struct mm_struct * mm = current->mm; 1204 struct mm_struct * mm = current->mm;
1205 struct inode *inode;
1206 vm_flags_t vm_flags; 1205 vm_flags_t vm_flags;
1207 1206
1208 *populate = 0; 1207 *populate = 0;
@@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1265 return -EAGAIN; 1264 return -EAGAIN;
1266 } 1265 }
1267 1266
1268 inode = file ? file_inode(file) : NULL;
1269
1270 if (file) { 1267 if (file) {
1268 struct inode *inode = file_inode(file);
1269
1271 switch (flags & MAP_TYPE) { 1270 switch (flags & MAP_TYPE) {
1272 case MAP_SHARED: 1271 case MAP_SHARED:
1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1272 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -1302,6 +1301,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1302 1301
1303 if (!file->f_op || !file->f_op->mmap) 1302 if (!file->f_op || !file->f_op->mmap)
1304 return -ENODEV; 1303 return -ENODEV;
1304 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1305 return -EINVAL;
1305 break; 1306 break;
1306 1307
1307 default: 1308 default:
@@ -1310,6 +1311,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1310 } else { 1311 } else {
1311 switch (flags & MAP_TYPE) { 1312 switch (flags & MAP_TYPE) {
1312 case MAP_SHARED: 1313 case MAP_SHARED:
1314 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1315 return -EINVAL;
1313 /* 1316 /*
1314 * Ignore pgoff. 1317 * Ignore pgoff.
1315 */ 1318 */
@@ -1476,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1476{ 1479{
1477 struct mm_struct *mm = current->mm; 1480 struct mm_struct *mm = current->mm;
1478 struct vm_area_struct *vma, *prev; 1481 struct vm_area_struct *vma, *prev;
1479 int correct_wcount = 0;
1480 int error; 1482 int error;
1481 struct rb_node **rb_link, *rb_parent; 1483 struct rb_node **rb_link, *rb_parent;
1482 unsigned long charged = 0; 1484 unsigned long charged = 0;
1483 struct inode *inode = file ? file_inode(file) : NULL;
1484 1485
1485 /* Check against address space limit. */ 1486 /* Check against address space limit. */
1486 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1487 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
@@ -1544,16 +1545,11 @@ munmap_back:
1544 vma->vm_pgoff = pgoff; 1545 vma->vm_pgoff = pgoff;
1545 INIT_LIST_HEAD(&vma->anon_vma_chain); 1546 INIT_LIST_HEAD(&vma->anon_vma_chain);
1546 1547
1547 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1548
1549 if (file) { 1548 if (file) {
1550 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1551 goto free_vma;
1552 if (vm_flags & VM_DENYWRITE) { 1549 if (vm_flags & VM_DENYWRITE) {
1553 error = deny_write_access(file); 1550 error = deny_write_access(file);
1554 if (error) 1551 if (error)
1555 goto free_vma; 1552 goto free_vma;
1556 correct_wcount = 1;
1557 } 1553 }
1558 vma->vm_file = get_file(file); 1554 vma->vm_file = get_file(file);
1559 error = file->f_op->mmap(file, vma); 1555 error = file->f_op->mmap(file, vma);
@@ -1570,11 +1566,8 @@ munmap_back:
1570 WARN_ON_ONCE(addr != vma->vm_start); 1566 WARN_ON_ONCE(addr != vma->vm_start);
1571 1567
1572 addr = vma->vm_start; 1568 addr = vma->vm_start;
1573 pgoff = vma->vm_pgoff;
1574 vm_flags = vma->vm_flags; 1569 vm_flags = vma->vm_flags;
1575 } else if (vm_flags & VM_SHARED) { 1570 } else if (vm_flags & VM_SHARED) {
1576 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1577 goto free_vma;
1578 error = shmem_zero_setup(vma); 1571 error = shmem_zero_setup(vma);
1579 if (error) 1572 if (error)
1580 goto free_vma; 1573 goto free_vma;
@@ -1596,11 +1589,10 @@ munmap_back:
1596 } 1589 }
1597 1590
1598 vma_link(mm, vma, prev, rb_link, rb_parent); 1591 vma_link(mm, vma, prev, rb_link, rb_parent);
1599 file = vma->vm_file;
1600
1601 /* Once vma denies write, undo our temporary denial count */ 1592 /* Once vma denies write, undo our temporary denial count */
1602 if (correct_wcount) 1593 if (vm_flags & VM_DENYWRITE)
1603 atomic_inc(&inode->i_writecount); 1594 allow_write_access(file);
1595 file = vma->vm_file;
1604out: 1596out:
1605 perf_event_mmap(vma); 1597 perf_event_mmap(vma);
1606 1598
@@ -1616,11 +1608,20 @@ out:
1616 if (file) 1608 if (file)
1617 uprobe_mmap(vma); 1609 uprobe_mmap(vma);
1618 1610
1611 /*
1612 * A new (or expanded) vma always gets soft-dirty status.
1613 * Otherwise the user-space soft-dirty page tracker would not
1614 * be able to tell the case where a vma area was unmapped
1615 * and then a new one mapped in its place (which must be
1616 * treated as a completely new data area).
1617 */
1618 vma->vm_flags |= VM_SOFTDIRTY;
1619
1619 return addr; 1620 return addr;
1620 1621
1621unmap_and_free_vma: 1622unmap_and_free_vma:
1622 if (correct_wcount) 1623 if (vm_flags & VM_DENYWRITE)
1623 atomic_inc(&inode->i_writecount); 1624 allow_write_access(file);
1624 vma->vm_file = NULL; 1625 vma->vm_file = NULL;
1625 fput(file); 1626 fput(file);
1626 1627
@@ -2380,7 +2381,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2380static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2381static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2381 unsigned long addr, int new_below) 2382 unsigned long addr, int new_below)
2382{ 2383{
2383 struct mempolicy *pol;
2384 struct vm_area_struct *new; 2384 struct vm_area_struct *new;
2385 int err = -ENOMEM; 2385 int err = -ENOMEM;
2386 2386
@@ -2404,12 +2404,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2404 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2404 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2405 } 2405 }
2406 2406
2407 pol = mpol_dup(vma_policy(vma)); 2407 err = vma_dup_policy(vma, new);
2408 if (IS_ERR(pol)) { 2408 if (err)
2409 err = PTR_ERR(pol);
2410 goto out_free_vma; 2409 goto out_free_vma;
2411 }
2412 vma_set_policy(new, pol);
2413 2410
2414 if (anon_vma_clone(new, vma)) 2411 if (anon_vma_clone(new, vma))
2415 goto out_free_mpol; 2412 goto out_free_mpol;
@@ -2437,7 +2434,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2437 fput(new->vm_file); 2434 fput(new->vm_file);
2438 unlink_anon_vmas(new); 2435 unlink_anon_vmas(new);
2439 out_free_mpol: 2436 out_free_mpol:
2440 mpol_put(pol); 2437 mpol_put(vma_policy(new));
2441 out_free_vma: 2438 out_free_vma:
2442 kmem_cache_free(vm_area_cachep, new); 2439 kmem_cache_free(vm_area_cachep, new);
2443 out_err: 2440 out_err:
@@ -2663,6 +2660,7 @@ out:
2663 mm->total_vm += len >> PAGE_SHIFT; 2660 mm->total_vm += len >> PAGE_SHIFT;
2664 if (flags & VM_LOCKED) 2661 if (flags & VM_LOCKED)
2665 mm->locked_vm += (len >> PAGE_SHIFT); 2662 mm->locked_vm += (len >> PAGE_SHIFT);
2663 vma->vm_flags |= VM_SOFTDIRTY;
2666 return addr; 2664 return addr;
2667} 2665}
2668 2666
@@ -2780,7 +2778,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2780 struct mm_struct *mm = vma->vm_mm; 2778 struct mm_struct *mm = vma->vm_mm;
2781 struct vm_area_struct *new_vma, *prev; 2779 struct vm_area_struct *new_vma, *prev;
2782 struct rb_node **rb_link, *rb_parent; 2780 struct rb_node **rb_link, *rb_parent;
2783 struct mempolicy *pol;
2784 bool faulted_in_anon_vma = true; 2781 bool faulted_in_anon_vma = true;
2785 2782
2786 /* 2783 /*
@@ -2825,10 +2822,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2825 new_vma->vm_start = addr; 2822 new_vma->vm_start = addr;
2826 new_vma->vm_end = addr + len; 2823 new_vma->vm_end = addr + len;
2827 new_vma->vm_pgoff = pgoff; 2824 new_vma->vm_pgoff = pgoff;
2828 pol = mpol_dup(vma_policy(vma)); 2825 if (vma_dup_policy(vma, new_vma))
2829 if (IS_ERR(pol))
2830 goto out_free_vma; 2826 goto out_free_vma;
2831 vma_set_policy(new_vma, pol);
2832 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2827 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2833 if (anon_vma_clone(new_vma, vma)) 2828 if (anon_vma_clone(new_vma, vma))
2834 goto out_free_mempol; 2829 goto out_free_mempol;
@@ -2843,7 +2838,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2843 return new_vma; 2838 return new_vma;
2844 2839
2845 out_free_mempol: 2840 out_free_mempol:
2846 mpol_put(pol); 2841 mpol_put(vma_policy(new_vma));
2847 out_free_vma: 2842 out_free_vma:
2848 kmem_cache_free(vm_area_cachep, new_vma); 2843 kmem_cache_free(vm_area_cachep, new_vma);
2849 return NULL; 2844 return NULL;
@@ -2930,7 +2925,7 @@ int install_special_mapping(struct mm_struct *mm,
2930 vma->vm_start = addr; 2925 vma->vm_start = addr;
2931 vma->vm_end = addr + len; 2926 vma->vm_end = addr + len;
2932 2927
2933 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; 2928 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
2934 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2929 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2935 2930
2936 vma->vm_ops = &special_mapping_vmops; 2931 vma->vm_ops = &special_mapping_vmops;
diff --git a/mm/mremap.c b/mm/mremap.c
index 0843feb66f3d..91b13d6a16d4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,6 +25,7 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28#include <asm/pgalloc.h>
28 29
29#include "internal.h" 30#include "internal.h"
30 31
@@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
62 return NULL; 63 return NULL;
63 64
64 pmd = pmd_alloc(mm, pud, addr); 65 pmd = pmd_alloc(mm, pud, addr);
65 if (!pmd) 66 if (!pmd) {
67 pud_free(mm, pud);
66 return NULL; 68 return NULL;
69 }
67 70
68 VM_BUG_ON(pmd_trans_huge(*pmd)); 71 VM_BUG_ON(pmd_trans_huge(*pmd));
69 72
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f0c895c71fe..6c7b0187be8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,8 +36,11 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/sched/rt.h> 38#include <linux/sched/rt.h>
39#include <linux/mm_inline.h>
39#include <trace/events/writeback.h> 40#include <trace/events/writeback.h>
40 41
42#include "internal.h"
43
41/* 44/*
42 * Sleep at most 200ms at a time in balance_dirty_pages(). 45 * Sleep at most 200ms at a time in balance_dirty_pages().
43 */ 46 */
@@ -241,9 +244,6 @@ static unsigned long global_dirtyable_memory(void)
241 if (!vm_highmem_is_dirtyable) 244 if (!vm_highmem_is_dirtyable)
242 x -= highmem_dirtyable_memory(x); 245 x -= highmem_dirtyable_memory(x);
243 246
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
247 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
248} 248}
249 249
@@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
585} 585}
586 586
587/* 587/*
588 * setpoint - dirty 3
589 * f(dirty) := 1.0 + (----------------)
590 * limit - setpoint
591 *
592 * it's a 3rd order polynomial that subjects to
593 *
594 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
595 * (2) f(setpoint) = 1.0 => the balance point
596 * (3) f(limit) = 0 => the hard limit
597 * (4) df/dx <= 0 => negative feedback control
598 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
599 * => fast response on large errors; small oscillation near setpoint
600 */
601static inline long long pos_ratio_polynom(unsigned long setpoint,
602 unsigned long dirty,
603 unsigned long limit)
604{
605 long long pos_ratio;
606 long x;
607
608 x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
609 limit - setpoint + 1);
610 pos_ratio = x;
611 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
612 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
613 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
614
615 return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
616}
617
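In plain notation the function above computes f(dirty) = 1 + ((setpoint - dirty) / (limit - setpoint))^3, clamped to [0, 2] and carried in RATELIMIT_CALC_SHIFT fixed point. A throwaway floating-point sketch (the freerun/limit values are invented) confirms the three anchor points listed in the comment:

#include <stdio.h>

/* Floating-point sketch of pos_ratio_polynom(); the kernel's fixed-point
 * shift is left out, only the shape of the curve is shown. */
static double pos_ratio(double setpoint, double dirty, double limit)
{
        double x = (setpoint - dirty) / (limit - setpoint);
        double r = 1.0 + x * x * x;

        if (r < 0.0)
                r = 0.0;
        if (r > 2.0)
                r = 2.0;
        return r;
}

int main(void)
{
        double freerun = 100, limit = 300;
        double setpoint = (freerun + limit) / 2;        /* 200 */

        printf("f(freerun)  = %.2f\n", pos_ratio(setpoint, freerun, limit));  /* 2.00 */
        printf("f(setpoint) = %.2f\n", pos_ratio(setpoint, setpoint, limit)); /* 1.00 */
        printf("f(limit)    = %.2f\n", pos_ratio(setpoint, limit, limit));    /* 0.00 */
        return 0;
}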
618/*
588 * Dirty position control. 619 * Dirty position control.
589 * 620 *
590 * (o) global/bdi setpoints 621 * (o) global/bdi setpoints
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
682 /* 713 /*
683 * global setpoint 714 * global setpoint
684 * 715 *
685 * setpoint - dirty 3 716 * See comment for pos_ratio_polynom().
686 * f(dirty) := 1.0 + (----------------) 717 */
687 * limit - setpoint 718 setpoint = (freerun + limit) / 2;
719 pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
720
721 /*
722 * The strictlimit feature is a tool preventing mistrusted filesystems
723 * from growing a large number of dirty pages before throttling. For
724 * such filesystems balance_dirty_pages always checks bdi counters
725 * against bdi limits. Even if global "nr_dirty" is under "freerun".
726 * This is especially important for fuse which sets bdi->max_ratio to
727 * 1% by default. Without strictlimit feature, fuse writeback may
728 * consume arbitrary amount of RAM because it is accounted in
729 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
688 * 730 *
689 * it's a 3rd order polynomial that subjects to 731 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
732 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
733 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
734 * limits are set by default to 10% and 20% (background and throttle).
735 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
736 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
737 * about ~6K pages (as the average of background and throttle bdi
738 * limits). The 3rd order polynomial will provide positive feedback if
739 * bdi_dirty is under bdi_setpoint and vice versa.
690 * 740 *
691 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast 741 * Note, that we cannot use global counters in these calculations
692 * (2) f(setpoint) = 1.0 => the balance point 742 * because we want to throttle process writing to a strictlimit BDI
693 * (3) f(limit) = 0 => the hard limit 743 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
694 * (4) df/dx <= 0 => negative feedback control 744 * in the example above).
695 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
696 * => fast response on large errors; small oscillation near setpoint
697 */ 745 */
698 setpoint = (freerun + limit) / 2; 746 if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
699 x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, 747 long long bdi_pos_ratio;
700 limit - setpoint + 1); 748 unsigned long bdi_bg_thresh;
701 pos_ratio = x; 749
702 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 750 if (bdi_dirty < 8)
703 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 751 return min_t(long long, pos_ratio * 2,
704 pos_ratio += 1 << RATELIMIT_CALC_SHIFT; 752 2 << RATELIMIT_CALC_SHIFT);
753
754 if (bdi_dirty >= bdi_thresh)
755 return 0;
756
757 bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
758 bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
759 bdi_bg_thresh);
760
761 if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
762 return 0;
763
764 bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
765 bdi_thresh);
766
767 /*
768 * Typically, for strictlimit case, bdi_setpoint << setpoint
 769	 * and pos_ratio >> bdi_pos_ratio. In other words, the global
 770	 * state ("dirty") is not the limiting factor and we have to
 771	 * make the decision based on bdi counters. But there is an
772 * important case when global pos_ratio should get precedence:
773 * global limits are exceeded (e.g. due to activities on other
774 * BDIs) while given strictlimit BDI is below limit.
775 *
776 * "pos_ratio * bdi_pos_ratio" would work for the case above,
777 * but it would look too non-natural for the case of all
778 * activity in the system coming from a single strictlimit BDI
779 * with bdi->max_ratio == 100%.
780 *
781 * Note that min() below somewhat changes the dynamics of the
782 * control system. Normally, pos_ratio value can be well over 3
783 * (when globally we are at freerun and bdi is well below bdi
784 * setpoint). Now the maximum pos_ratio in the same situation
785 * is 2. We might want to tweak this if we observe the control
786 * system is too slow to adapt.
787 */
788 return min(pos_ratio, bdi_pos_ratio);
789 }
705 790
706 /* 791 /*
707 * We have computed basic pos_ratio above based on global situation. If 792 * We have computed basic pos_ratio above based on global situation. If
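Working the example from the comment through with assumed 4 KiB pages: 16 GiB is 4,194,304 pages, a 20% throttle limit is ~838,860 pages, so a 1% bdi->max_ratio gives a bdi_thresh of ~8,388 pages (~32 MiB); the 10% background limit scales the same way to ~4,194 pages, putting bdi_setpoint near 6,291 pages. A quick sketch of that arithmetic (the percentages are the defaults the comment assumes, not values read from a live system):

#include <stdio.h>

/* Back-of-the-envelope version of the example in the comment above. */
int main(void)
{
        unsigned long long page_size = 4096;
        unsigned long long ram_pages = 16ULL * 1024 * 1024 * 1024 / page_size;
        unsigned long long dirty_thresh = ram_pages * 20 / 100;  /* global throttle */
        unsigned long long bg_thresh = ram_pages * 10 / 100;     /* global background */
        unsigned long long bdi_thresh = dirty_thresh * 1 / 100;  /* bdi->max_ratio = 1% */
        unsigned long long bdi_bg_thresh = bg_thresh * 1 / 100;
        unsigned long long bdi_setpoint = (bdi_thresh + bdi_bg_thresh) / 2;

        printf("bdi_thresh   = %llu pages (~%llu MiB)\n",
               bdi_thresh, bdi_thresh * page_size >> 20);
        printf("bdi_setpoint = %llu pages\n", bdi_setpoint);
        return 0;
}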
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
994 * keep that period small to reduce time lags). 1079 * keep that period small to reduce time lags).
995 */ 1080 */
996 step = 0; 1081 step = 0;
1082
1083 /*
1084 * For strictlimit case, calculations above were based on bdi counters
1085 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
1086 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
1087 * Hence, to calculate "step" properly, we have to use bdi_dirty as
1088 * "dirty" and bdi_setpoint as "setpoint".
1089 *
 1090	 * We ramp up dirty_ratelimit forcibly if bdi_dirty is low because
1091 * it's possible that bdi_thresh is close to zero due to inactivity
1092 * of backing device (see the implementation of bdi_dirty_limit()).
1093 */
1094 if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1095 dirty = bdi_dirty;
1096 if (bdi_dirty < 8)
1097 setpoint = bdi_dirty + 1;
1098 else
1099 setpoint = (bdi_thresh +
1100 bdi_dirty_limit(bdi, bg_thresh)) / 2;
1101 }
1102
997 if (dirty < setpoint) { 1103 if (dirty < setpoint) {
998 x = min(bdi->balanced_dirty_ratelimit, 1104 x = min(bdi->balanced_dirty_ratelimit,
999 min(balanced_dirty_ratelimit, task_ratelimit)); 1105 min(balanced_dirty_ratelimit, task_ratelimit));
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
1198 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; 1304 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1199} 1305}
1200 1306
1307static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
1308 unsigned long dirty_thresh,
1309 unsigned long background_thresh,
1310 unsigned long *bdi_dirty,
1311 unsigned long *bdi_thresh,
1312 unsigned long *bdi_bg_thresh)
1313{
1314 unsigned long bdi_reclaimable;
1315
1316 /*
1317 * bdi_thresh is not treated as some limiting factor as
1318 * dirty_thresh, due to reasons
1319 * - in JBOD setup, bdi_thresh can fluctuate a lot
1320 * - in a system with HDD and USB key, the USB key may somehow
1321 * go into state (bdi_dirty >> bdi_thresh) either because
1322 * bdi_dirty starts high, or because bdi_thresh drops low.
1323 * In this case we don't want to hard throttle the USB key
1324 * dirtiers for 100 seconds until bdi_dirty drops under
1325 * bdi_thresh. Instead the auxiliary bdi control line in
1326 * bdi_position_ratio() will let the dirtier task progress
1327 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1328 */
1329 *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1330
1331 if (bdi_bg_thresh)
1332 *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
1333 background_thresh,
1334 dirty_thresh);
1335
1336 /*
1337 * In order to avoid the stacked BDI deadlock we need
1338 * to ensure we accurately count the 'dirty' pages when
1339 * the threshold is low.
1340 *
1341 * Otherwise it would be possible to get thresh+n pages
1342 * reported dirty, even though there are thresh-m pages
1343 * actually dirty; with m+n sitting in the percpu
1344 * deltas.
1345 */
1346 if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
1347 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1348 *bdi_dirty = bdi_reclaimable +
1349 bdi_stat_sum(bdi, BDI_WRITEBACK);
1350 } else {
1351 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1352 *bdi_dirty = bdi_reclaimable +
1353 bdi_stat(bdi, BDI_WRITEBACK);
1354 }
1355}
1356
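The 2 * bdi_stat_error() test in the helper above matters because BDI statistics are percpu counters: the cheap bdi_stat() read ignores the per-CPU deltas and can lag the true value by up to the batch size times the CPU count, which only becomes significant when bdi_thresh itself is that small. A hedged user-space model of the effect (the batch of 32 and the CPU count are invented for illustration):

#include <stdio.h>

#define NR_CPUS         8
#define PCP_BATCH       32      /* invented; stands in for the percpu batch */

/* Each CPU keeps a private delta and only folds it into the global
 * counter once the delta exceeds the batch, so a cheap read of 'global'
 * can lag the true value by up to NR_CPUS * PCP_BATCH. */
struct pcp_counter {
        long global;
        long delta[NR_CPUS];
};

static void pcp_add(struct pcp_counter *c, int cpu, long v)
{
        c->delta[cpu] += v;
        if (c->delta[cpu] >= PCP_BATCH || c->delta[cpu] <= -PCP_BATCH) {
                c->global += c->delta[cpu];
                c->delta[cpu] = 0;
        }
}

static long pcp_read_fast(struct pcp_counter *c)        /* ~bdi_stat() */
{
        return c->global;
}

static long pcp_read_exact(struct pcp_counter *c)       /* ~bdi_stat_sum() */
{
        long sum = c->global;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += c->delta[cpu];
        return sum;
}

int main(void)
{
        struct pcp_counter c = { 0 };

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                pcp_add(&c, cpu, PCP_BATCH - 1);        /* stays in the deltas */

        printf("fast read  = %ld\n", pcp_read_fast(&c));   /* 0   */
        printf("exact read = %ld\n", pcp_read_exact(&c));  /* 248 */
        return 0;
}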
1201/* 1357/*
1202 * balance_dirty_pages() must be called by processes which are generating dirty 1358 * balance_dirty_pages() must be called by processes which are generating dirty
1203 * data. It looks at the number of dirty pages in the machine and will force 1359 * data. It looks at the number of dirty pages in the machine and will force
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping,
1209 unsigned long pages_dirtied) 1365 unsigned long pages_dirtied)
1210{ 1366{
1211 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ 1367 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
1212 unsigned long bdi_reclaimable;
1213 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 1368 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
1214 unsigned long bdi_dirty;
1215 unsigned long freerun;
1216 unsigned long background_thresh; 1369 unsigned long background_thresh;
1217 unsigned long dirty_thresh; 1370 unsigned long dirty_thresh;
1218 unsigned long bdi_thresh;
1219 long period; 1371 long period;
1220 long pause; 1372 long pause;
1221 long max_pause; 1373 long max_pause;
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping,
1226 unsigned long dirty_ratelimit; 1378 unsigned long dirty_ratelimit;
1227 unsigned long pos_ratio; 1379 unsigned long pos_ratio;
1228 struct backing_dev_info *bdi = mapping->backing_dev_info; 1380 struct backing_dev_info *bdi = mapping->backing_dev_info;
1381 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1229 unsigned long start_time = jiffies; 1382 unsigned long start_time = jiffies;
1230 1383
1231 for (;;) { 1384 for (;;) {
1232 unsigned long now = jiffies; 1385 unsigned long now = jiffies;
1386 unsigned long uninitialized_var(bdi_thresh);
1387 unsigned long thresh;
1388 unsigned long uninitialized_var(bdi_dirty);
1389 unsigned long dirty;
1390 unsigned long bg_thresh;
1233 1391
1234 /* 1392 /*
1235 * Unstable writes are a feature of certain networked 1393 * Unstable writes are a feature of certain networked
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping,
1243 1401
1244 global_dirty_limits(&background_thresh, &dirty_thresh); 1402 global_dirty_limits(&background_thresh, &dirty_thresh);
1245 1403
1404 if (unlikely(strictlimit)) {
1405 bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
1406 &bdi_dirty, &bdi_thresh, &bg_thresh);
1407
1408 dirty = bdi_dirty;
1409 thresh = bdi_thresh;
1410 } else {
1411 dirty = nr_dirty;
1412 thresh = dirty_thresh;
1413 bg_thresh = background_thresh;
1414 }
1415
1246 /* 1416 /*
1247 * Throttle it only when the background writeback cannot 1417 * Throttle it only when the background writeback cannot
1248 * catch-up. This avoids (excessively) small writeouts 1418 * catch-up. This avoids (excessively) small writeouts
1249 * when the bdi limits are ramping up. 1419 * when the bdi limits are ramping up in case of !strictlimit.
1420 *
1421 * In strictlimit case make decision based on the bdi counters
1422 * and limits. Small writeouts when the bdi limits are ramping
1423 * up are the price we consciously pay for strictlimit-ing.
1250 */ 1424 */
1251 freerun = dirty_freerun_ceiling(dirty_thresh, 1425 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
1252 background_thresh);
1253 if (nr_dirty <= freerun) {
1254 current->dirty_paused_when = now; 1426 current->dirty_paused_when = now;
1255 current->nr_dirtied = 0; 1427 current->nr_dirtied = 0;
1256 current->nr_dirtied_pause = 1428 current->nr_dirtied_pause =
1257 dirty_poll_interval(nr_dirty, dirty_thresh); 1429 dirty_poll_interval(dirty, thresh);
1258 break; 1430 break;
1259 } 1431 }
1260 1432
1261 if (unlikely(!writeback_in_progress(bdi))) 1433 if (unlikely(!writeback_in_progress(bdi)))
1262 bdi_start_background_writeback(bdi); 1434 bdi_start_background_writeback(bdi);
1263 1435
1264 /* 1436 if (!strictlimit)
1265 * bdi_thresh is not treated as some limiting factor as 1437 bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
1266 * dirty_thresh, due to reasons 1438 &bdi_dirty, &bdi_thresh, NULL);
1267 * - in JBOD setup, bdi_thresh can fluctuate a lot
1268 * - in a system with HDD and USB key, the USB key may somehow
1269 * go into state (bdi_dirty >> bdi_thresh) either because
1270 * bdi_dirty starts high, or because bdi_thresh drops low.
1271 * In this case we don't want to hard throttle the USB key
1272 * dirtiers for 100 seconds until bdi_dirty drops under
1273 * bdi_thresh. Instead the auxiliary bdi control line in
1274 * bdi_position_ratio() will let the dirtier task progress
1275 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1276 */
1277 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1278
1279 /*
1280 * In order to avoid the stacked BDI deadlock we need
1281 * to ensure we accurately count the 'dirty' pages when
1282 * the threshold is low.
1283 *
1284 * Otherwise it would be possible to get thresh+n pages
1285 * reported dirty, even though there are thresh-m pages
1286 * actually dirty; with m+n sitting in the percpu
1287 * deltas.
1288 */
1289 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
1290 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1291 bdi_dirty = bdi_reclaimable +
1292 bdi_stat_sum(bdi, BDI_WRITEBACK);
1293 } else {
1294 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1295 bdi_dirty = bdi_reclaimable +
1296 bdi_stat(bdi, BDI_WRITEBACK);
1297 }
1298 1439
1299 dirty_exceeded = (bdi_dirty > bdi_thresh) && 1440 dirty_exceeded = (bdi_dirty > bdi_thresh) &&
1300 (nr_dirty > dirty_thresh); 1441 ((nr_dirty > dirty_thresh) || strictlimit);
1301 if (dirty_exceeded && !bdi->dirty_exceeded) 1442 if (dirty_exceeded && !bdi->dirty_exceeded)
1302 bdi->dirty_exceeded = 1; 1443 bdi->dirty_exceeded = 1;
1303 1444
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2b59dbda196..0ee638f76ebe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@
56#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h> 57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/mm_inline.h>
59#include <linux/migrate.h> 60#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 61#include <linux/page-debug-flags.h>
61#include <linux/hugetlb.h> 62#include <linux/hugetlb.h>
@@ -488,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
488 * (c) a page and its buddy have the same order && 489 * (c) a page and its buddy have the same order &&
489 * (d) a page and its buddy are in the same zone. 490 * (d) a page and its buddy are in the same zone.
490 * 491 *
491 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 492 * For recording whether a page is in the buddy system, we set ->_mapcount
492 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 493 * PAGE_BUDDY_MAPCOUNT_VALUE.
494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
495 * serialized by zone->lock.
493 * 496 *
494 * For recording page's order, we use page_private(page). 497 * For recording page's order, we use page_private(page).
495 */ 498 */
@@ -527,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
527 * as necessary, plus some accounting needed to play nicely with other 530 * as necessary, plus some accounting needed to play nicely with other
528 * parts of the VM system. 531 * parts of the VM system.
529 * At each level, we keep a list of pages, which are heads of continuous 532 * At each level, we keep a list of pages, which are heads of continuous
530 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 533 * free pages of length of (1 << order) and marked with _mapcount
531 * order is recorded in page_private(page) field. 534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
535 * field.
532 * So when we are allocating or freeing one, we can derive the state of the 536 * So when we are allocating or freeing one, we can derive the state of the
533 * other. That is, if we allocate a small block, and both were 537 * other. That is, if we allocate a small block, and both were
534 * free, the remainder of the region must be split into blocks. 538 * free, the remainder of the region must be split into blocks.
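The buddy bookkeeping described in these comments rests on one identity: at order o, the buddy of the free block at index i is at index i ^ (1 << o), and when both halves are free they merge into the block starting at i & ~(1 << o). A tiny stand-alone check of that arithmetic (indices are arbitrary, no real pages involved):

#include <stdio.h>

/* Buddy index arithmetic only, in the style of __find_buddy_index(). */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long idx = 12;         /* arbitrary block index */
        unsigned int order = 2;         /* block of 4 pages */
        unsigned long buddy = find_buddy_index(idx, order);

        /* idx & buddy equals idx & ~(1 << order): the merged parent block */
        printf("block %lu, order %u -> buddy %lu, merged parent %lu\n",
               idx, order, buddy, idx & buddy);
        return 0;
}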
@@ -647,7 +651,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
647 int to_free = count; 651 int to_free = count;
648 652
649 spin_lock(&zone->lock); 653 spin_lock(&zone->lock);
650 zone->all_unreclaimable = 0;
651 zone->pages_scanned = 0; 654 zone->pages_scanned = 0;
652 655
653 while (to_free) { 656 while (to_free) {
@@ -696,7 +699,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
696 int migratetype) 699 int migratetype)
697{ 700{
698 spin_lock(&zone->lock); 701 spin_lock(&zone->lock);
699 zone->all_unreclaimable = 0;
700 zone->pages_scanned = 0; 702 zone->pages_scanned = 0;
701 703
702 __free_one_page(page, zone, order, migratetype); 704 __free_one_page(page, zone, order, migratetype);
@@ -721,7 +723,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721 return false; 723 return false;
722 724
723 if (!PageHighMem(page)) { 725 if (!PageHighMem(page)) {
724 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 726 debug_check_no_locks_freed(page_address(page),
727 PAGE_SIZE << order);
725 debug_check_no_obj_freed(page_address(page), 728 debug_check_no_obj_freed(page_address(page),
726 PAGE_SIZE << order); 729 PAGE_SIZE << order);
727 } 730 }
@@ -750,19 +753,19 @@ static void __free_pages_ok(struct page *page, unsigned int order)
750void __init __free_pages_bootmem(struct page *page, unsigned int order) 753void __init __free_pages_bootmem(struct page *page, unsigned int order)
751{ 754{
752 unsigned int nr_pages = 1 << order; 755 unsigned int nr_pages = 1 << order;
756 struct page *p = page;
753 unsigned int loop; 757 unsigned int loop;
754 758
755 prefetchw(page); 759 prefetchw(p);
756 for (loop = 0; loop < nr_pages; loop++) { 760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
757 struct page *p = &page[loop]; 761 prefetchw(p + 1);
758
759 if (loop + 1 < nr_pages)
760 prefetchw(p + 1);
761 __ClearPageReserved(p); 762 __ClearPageReserved(p);
762 set_page_count(p, 0); 763 set_page_count(p, 0);
763 } 764 }
765 __ClearPageReserved(p);
766 set_page_count(p, 0);
764 767
765 page_zone(page)->managed_pages += 1 << order; 768 page_zone(page)->managed_pages += nr_pages;
766 set_page_refcounted(page); 769 set_page_refcounted(page);
767 __free_pages(page, order); 770 __free_pages(page, order);
768} 771}
@@ -885,7 +888,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
885 int migratetype) 888 int migratetype)
886{ 889{
887 unsigned int current_order; 890 unsigned int current_order;
888 struct free_area * area; 891 struct free_area *area;
889 struct page *page; 892 struct page *page;
890 893
891 /* Find a page of the appropriate size in the preferred list */ 894 /* Find a page of the appropriate size in the preferred list */
@@ -1007,14 +1010,60 @@ static void change_pageblock_range(struct page *pageblock_page,
1007 } 1010 }
1008} 1011}
1009 1012
1013/*
1014 * If breaking a large block of pages, move all free pages to the preferred
1015 * allocation list. If falling back for a reclaimable kernel allocation, be
1016 * more aggressive about taking ownership of free pages.
1017 *
1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1019 * nor move CMA pages to different free lists. We don't want unmovable pages
1020 * to be allocated from MIGRATE_CMA areas.
1021 *
1022 * Returns the new migratetype of the pageblock (or the same old migratetype
1023 * if it was unchanged).
1024 */
1025static int try_to_steal_freepages(struct zone *zone, struct page *page,
1026 int start_type, int fallback_type)
1027{
1028 int current_order = page_order(page);
1029
1030 if (is_migrate_cma(fallback_type))
1031 return fallback_type;
1032
1033 /* Take ownership for orders >= pageblock_order */
1034 if (current_order >= pageblock_order) {
1035 change_pageblock_range(page, current_order, start_type);
1036 return start_type;
1037 }
1038
1039 if (current_order >= pageblock_order / 2 ||
1040 start_type == MIGRATE_RECLAIMABLE ||
1041 page_group_by_mobility_disabled) {
1042 int pages;
1043
1044 pages = move_freepages_block(zone, page, start_type);
1045
1046 /* Claim the whole block if over half of it is free */
1047 if (pages >= (1 << (pageblock_order-1)) ||
1048 page_group_by_mobility_disabled) {
1049
1050 set_pageblock_migratetype(page, start_type);
1051 return start_type;
1052 }
1053
1054 }
1055
1056 return fallback_type;
1057}
1058
1010/* Remove an element from the buddy allocator from the fallback list */ 1059/* Remove an element from the buddy allocator from the fallback list */
1011static inline struct page * 1060static inline struct page *
1012__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1061__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1013{ 1062{
1014 struct free_area * area; 1063 struct free_area *area;
1015 int current_order; 1064 int current_order;
1016 struct page *page; 1065 struct page *page;
1017 int migratetype, i; 1066 int migratetype, new_type, i;
1018 1067
1019 /* Find the largest possible block of pages in the other list */ 1068 /* Find the largest possible block of pages in the other list */
1020 for (current_order = MAX_ORDER-1; current_order >= order; 1069 for (current_order = MAX_ORDER-1; current_order >= order;
@@ -1034,51 +1083,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1034 struct page, lru); 1083 struct page, lru);
1035 area->nr_free--; 1084 area->nr_free--;
1036 1085
1037 /* 1086 new_type = try_to_steal_freepages(zone, page,
1038 * If breaking a large block of pages, move all free 1087 start_migratetype,
1039 * pages to the preferred allocation list. If falling 1088 migratetype);
1040 * back for a reclaimable kernel allocation, be more
1041 * aggressive about taking ownership of free pages
1042 *
1043 * On the other hand, never change migration
1044 * type of MIGRATE_CMA pageblocks nor move CMA
1045 * pages on different free lists. We don't
1046 * want unmovable pages to be allocated from
1047 * MIGRATE_CMA areas.
1048 */
1049 if (!is_migrate_cma(migratetype) &&
1050 (current_order >= pageblock_order / 2 ||
1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1052 page_group_by_mobility_disabled)) {
1053 int pages;
1054 pages = move_freepages_block(zone, page,
1055 start_migratetype);
1056
1057 /* Claim the whole block if over half of it is free */
1058 if (pages >= (1 << (pageblock_order-1)) ||
1059 page_group_by_mobility_disabled)
1060 set_pageblock_migratetype(page,
1061 start_migratetype);
1062
1063 migratetype = start_migratetype;
1064 }
1065 1089
1066 /* Remove the page from the freelists */ 1090 /* Remove the page from the freelists */
1067 list_del(&page->lru); 1091 list_del(&page->lru);
1068 rmv_page_order(page); 1092 rmv_page_order(page);
1069 1093
1070 /* Take ownership for orders >= pageblock_order */ 1094 /*
1071 if (current_order >= pageblock_order && 1095 * Borrow the excess buddy pages as well, irrespective
1072 !is_migrate_cma(migratetype)) 1096 * of whether we stole freepages, or took ownership of
1073 change_pageblock_range(page, current_order, 1097 * the pageblock or not.
1074 start_migratetype); 1098 *
1075 1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1076 expand(zone, page, order, current_order, area, 1102 expand(zone, page, order, current_order, area,
1077 is_migrate_cma(migratetype) 1103 is_migrate_cma(migratetype)
1078 ? migratetype : start_migratetype); 1104 ? migratetype : start_migratetype);
1079 1105
1080 trace_mm_page_alloc_extfrag(page, order, current_order, 1106 trace_mm_page_alloc_extfrag(page, order,
1081 start_migratetype, migratetype); 1107 current_order, start_migratetype, migratetype,
1108 new_type == start_migratetype);
1082 1109
1083 return page; 1110 return page;
1084 } 1111 }
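The policy factored out into try_to_steal_freepages() above can be restated as a small pure decision function: never re-type CMA, always take whole blocks at or above pageblock_order, and otherwise only attempt to claim the pageblock for large-ish fallbacks, reclaimable allocations, or when mobility grouping is disabled. The real function additionally re-types the block only if move_freepages_block() finds over half of it free, which this sketch leaves out; the pageblock order of 9 below is a stand-in:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9       /* stand-in for pageblock_order */

enum mt { MOVABLE, RECLAIMABLE, UNMOVABLE, CMA };

/* Decision half of try_to_steal_freepages(): should the allocator try to
 * claim the whole pageblock for start_type when falling back? */
static bool should_try_steal(int current_order, enum mt start_type,
                             enum mt fallback_type, bool mobility_disabled)
{
        if (fallback_type == CMA)               /* never re-type CMA blocks */
                return false;
        if (current_order >= PAGEBLOCK_ORDER)   /* whole block(s) anyway */
                return true;
        return current_order >= PAGEBLOCK_ORDER / 2 ||
               start_type == RECLAIMABLE ||
               mobility_disabled;
}

int main(void)
{
        printf("order 5, MOVABLE from UNMOVABLE: %d\n",
               should_try_steal(5, MOVABLE, UNMOVABLE, false));     /* 1 */
        printf("order 3, MOVABLE from UNMOVABLE: %d\n",
               should_try_steal(3, MOVABLE, UNMOVABLE, false));     /* 0 */
        printf("order 3, RECLAIMABLE fallback  : %d\n",
               should_try_steal(3, RECLAIMABLE, UNMOVABLE, false)); /* 1 */
        return 0;
}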
@@ -1281,7 +1308,7 @@ void mark_free_pages(struct zone *zone)
1281 int order, t; 1308 int order, t;
1282 struct list_head *curr; 1309 struct list_head *curr;
1283 1310
1284 if (!zone->spanned_pages) 1311 if (zone_is_empty(zone))
1285 return; 1312 return;
1286 1313
1287 spin_lock_irqsave(&zone->lock, flags); 1314 spin_lock_irqsave(&zone->lock, flags);
@@ -1526,6 +1553,7 @@ again:
1526 get_pageblock_migratetype(page)); 1553 get_pageblock_migratetype(page));
1527 } 1554 }
1528 1555
1556 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1529 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1557 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1530 zone_statistics(preferred_zone, zone, gfp_flags); 1558 zone_statistics(preferred_zone, zone, gfp_flags);
1531 local_irq_restore(flags); 1559 local_irq_restore(flags);
@@ -1792,6 +1820,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1792 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1820 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1793} 1821}
1794 1822
1823static bool zone_local(struct zone *local_zone, struct zone *zone)
1824{
1825 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
1826}
1827
1795static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1828static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1796{ 1829{
1797 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1830 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
@@ -1829,6 +1862,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1829{ 1862{
1830} 1863}
1831 1864
1865static bool zone_local(struct zone *local_zone, struct zone *zone)
1866{
1867 return true;
1868}
1869
1832static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1870static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1833{ 1871{
1834 return true; 1872 return true;
@@ -1860,16 +1898,41 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1860zonelist_scan: 1898zonelist_scan:
1861 /* 1899 /*
1862 * Scan zonelist, looking for a zone with enough free. 1900 * Scan zonelist, looking for a zone with enough free.
1863 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1901 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1864 */ 1902 */
1865 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1903 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1866 high_zoneidx, nodemask) { 1904 high_zoneidx, nodemask) {
1905 unsigned long mark;
1906
1867 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1907 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1868 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1908 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1869 continue; 1909 continue;
1870 if ((alloc_flags & ALLOC_CPUSET) && 1910 if ((alloc_flags & ALLOC_CPUSET) &&
1871 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1911 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1872 continue; 1912 continue;
1913 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1914 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1915 goto try_this_zone;
1916 /*
1917 * Distribute pages in proportion to the individual
1918 * zone size to ensure fair page aging. The zone a
1919 * page was allocated in should have no effect on the
1920 * time the page has in memory before being reclaimed.
1921 *
1922 * When zone_reclaim_mode is enabled, try to stay in
1923 * local zones in the fastpath. If that fails, the
1924 * slowpath is entered, which will do another pass
1925 * starting with the local zones, but ultimately fall
1926 * back to remote zones that do not partake in the
1927 * fairness round-robin cycle of this zonelist.
1928 */
1929 if (alloc_flags & ALLOC_WMARK_LOW) {
1930 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1931 continue;
1932 if (zone_reclaim_mode &&
1933 !zone_local(preferred_zone, zone))
1934 continue;
1935 }
1873 /* 1936 /*
1874 * When allocating a page cache page for writing, we 1937 * When allocating a page cache page for writing, we
1875 * want to get it from a zone that is within its dirty 1938 * want to get it from a zone that is within its dirty
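The fairness pass above can be modelled with a toy allocator: every zone carries an allocation batch of high_wmark - low_wmark pages (NR_ALLOC_BATCH), the fast path skips zones whose batch has run out, and the slow path (prepare_slowpath(), later in this patch) refills the local batches, so over time each zone serves allocations roughly in proportion to its size. A rough sketch with invented watermarks:

#include <stdio.h>

struct zone {
        const char *name;
        long high_wmark;
        long low_wmark;
        long alloc_batch;       /* NR_ALLOC_BATCH stand-in */
};

static void reset_batch(struct zone *z)         /* roughly prepare_slowpath() */
{
        z->alloc_batch = z->high_wmark - z->low_wmark;
}

/* Pick the first zone that still has batch left; NULL means the fast
 * path failed and the batches need refilling. */
static struct zone *pick_zone(struct zone *zones, int nr, int order)
{
        for (int i = 0; i < nr; i++)
                if (zones[i].alloc_batch > 0) {
                        zones[i].alloc_batch -= 1L << order;
                        return &zones[i];
                }
        return NULL;
}

int main(void)
{
        struct zone zones[] = {
                { "Normal", 3000, 2000, 0 },
                { "DMA32",  1500, 1000, 0 },
        };
        long counts[2] = { 0, 0 };

        for (int i = 0; i < 2; i++)
                reset_batch(&zones[i]);

        for (int n = 0; n < 2000; n++) {
                struct zone *z = pick_zone(zones, 2, 0);

                if (!z) {                       /* slow path: refill batches */
                        for (int i = 0; i < 2; i++)
                                reset_batch(&zones[i]);
                        z = pick_zone(zones, 2, 0);
                }
                counts[z - zones]++;
        }
        /* prints "Normal: 1500 pages, DMA32: 500 pages" with these numbers */
        printf("Normal: %ld pages, DMA32: %ld pages\n", counts[0], counts[1]);
        return 0;
}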
@@ -1900,16 +1963,11 @@ zonelist_scan:
1900 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1963 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1901 goto this_zone_full; 1964 goto this_zone_full;
1902 1965
1903 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1966 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1904 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1967 if (!zone_watermark_ok(zone, order, mark,
1905 unsigned long mark; 1968 classzone_idx, alloc_flags)) {
1906 int ret; 1969 int ret;
1907 1970
1908 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1909 if (zone_watermark_ok(zone, order, mark,
1910 classzone_idx, alloc_flags))
1911 goto try_this_zone;
1912
1913 if (IS_ENABLED(CONFIG_NUMA) && 1971 if (IS_ENABLED(CONFIG_NUMA) &&
1914 !did_zlc_setup && nr_online_nodes > 1) { 1972 !did_zlc_setup && nr_online_nodes > 1) {
1915 /* 1973 /*
@@ -2321,16 +2379,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2321 return page; 2379 return page;
2322} 2380}
2323 2381
2324static inline 2382static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
2325void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2383 struct zonelist *zonelist,
2326 enum zone_type high_zoneidx, 2384 enum zone_type high_zoneidx,
2327 enum zone_type classzone_idx) 2385 struct zone *preferred_zone)
2328{ 2386{
2329 struct zoneref *z; 2387 struct zoneref *z;
2330 struct zone *zone; 2388 struct zone *zone;
2331 2389
2332 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2390 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2333 wakeup_kswapd(zone, order, classzone_idx); 2391 if (!(gfp_mask & __GFP_NO_KSWAPD))
2392 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2393 /*
2394 * Only reset the batches of zones that were actually
2395 * considered in the fast path, we don't want to
2396 * thrash fairness information for zones that are not
2397 * actually part of this zonelist's round-robin cycle.
2398 */
2399 if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
2400 continue;
2401 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2402 high_wmark_pages(zone) -
2403 low_wmark_pages(zone) -
2404 zone_page_state(zone, NR_ALLOC_BATCH));
2405 }
2334} 2406}
2335 2407
2336static inline int 2408static inline int
@@ -2426,9 +2498,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2426 goto nopage; 2498 goto nopage;
2427 2499
2428restart: 2500restart:
2429 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2501 prepare_slowpath(gfp_mask, order, zonelist,
2430 wake_all_kswapd(order, zonelist, high_zoneidx, 2502 high_zoneidx, preferred_zone);
2431 zone_idx(preferred_zone));
2432 2503
2433 /* 2504 /*
2434 * OK, we're below the kswapd watermark and have kicked background 2505 * OK, we're below the kswapd watermark and have kicked background
@@ -3095,7 +3166,7 @@ void show_free_areas(unsigned int filter)
3095 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3166 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3096 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3167 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3097 zone->pages_scanned, 3168 zone->pages_scanned,
3098 (zone->all_unreclaimable ? "yes" : "no") 3169 (!zone_reclaimable(zone) ? "yes" : "no")
3099 ); 3170 );
3100 printk("lowmem_reserve[]:"); 3171 printk("lowmem_reserve[]:");
3101 for (i = 0; i < MAX_NR_ZONES; i++) 3172 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3104,7 +3175,7 @@ void show_free_areas(unsigned int filter)
3104 } 3175 }
3105 3176
3106 for_each_populated_zone(zone) { 3177 for_each_populated_zone(zone) {
3107 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3178 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3108 unsigned char types[MAX_ORDER]; 3179 unsigned char types[MAX_ORDER];
3109 3180
3110 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3181 if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -3416,11 +3487,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3416static int default_zonelist_order(void) 3487static int default_zonelist_order(void)
3417{ 3488{
3418 int nid, zone_type; 3489 int nid, zone_type;
3419 unsigned long low_kmem_size,total_size; 3490 unsigned long low_kmem_size, total_size;
3420 struct zone *z; 3491 struct zone *z;
3421 int average_size; 3492 int average_size;
3422 /* 3493 /*
3423 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3494 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3424 * If they are really small and used heavily, the system can fall 3495 * If they are really small and used heavily, the system can fall
3425 * into OOM very easily. 3496 * into OOM very easily.
3426 * This function detect ZONE_DMA/DMA32 size and configures zone order. 3497 * This function detect ZONE_DMA/DMA32 size and configures zone order.
@@ -3452,9 +3523,9 @@ static int default_zonelist_order(void)
3452 return ZONELIST_ORDER_NODE; 3523 return ZONELIST_ORDER_NODE;
3453 /* 3524 /*
3454 * look into each node's config. 3525 * look into each node's config.
3455 * If there is a node whose DMA/DMA32 memory is very big area on 3526 * If there is a node whose DMA/DMA32 memory is very big area on
3456 * local memory, NODE_ORDER may be suitable. 3527 * local memory, NODE_ORDER may be suitable.
3457 */ 3528 */
3458 average_size = total_size / 3529 average_size = total_size /
3459 (nodes_weight(node_states[N_MEMORY]) + 1); 3530 (nodes_weight(node_states[N_MEMORY]) + 1);
3460 for_each_online_node(nid) { 3531 for_each_online_node(nid) {
@@ -4180,7 +4251,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4180 if (!zone->wait_table) 4251 if (!zone->wait_table)
4181 return -ENOMEM; 4252 return -ENOMEM;
4182 4253
4183 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4254 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4184 init_waitqueue_head(zone->wait_table + i); 4255 init_waitqueue_head(zone->wait_table + i);
4185 4256
4186 return 0; 4257 return 0;
@@ -4237,7 +4308,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
4237int __meminit __early_pfn_to_nid(unsigned long pfn) 4308int __meminit __early_pfn_to_nid(unsigned long pfn)
4238{ 4309{
4239 unsigned long start_pfn, end_pfn; 4310 unsigned long start_pfn, end_pfn;
4240 int i, nid; 4311 int nid;
4241 /* 4312 /*
4242 * NOTE: The following SMP-unsafe globals are only used early in boot 4313 * NOTE: The following SMP-unsafe globals are only used early in boot
4243 * when the kernel is running single-threaded. 4314 * when the kernel is running single-threaded.
@@ -4248,15 +4319,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
4248 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4319 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4249 return last_nid; 4320 return last_nid;
4250 4321
4251 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4322 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4252 if (start_pfn <= pfn && pfn < end_pfn) { 4323 if (nid != -1) {
4253 last_start_pfn = start_pfn; 4324 last_start_pfn = start_pfn;
4254 last_end_pfn = end_pfn; 4325 last_end_pfn = end_pfn;
4255 last_nid = nid; 4326 last_nid = nid;
4256 return nid; 4327 }
4257 } 4328
4258 /* This is a memory hole */ 4329 return nid;
4259 return -1;
4260} 4330}
4261#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4331#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4262 4332
@@ -4586,7 +4656,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4586#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4656#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4587 4657
4588/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4658/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4589void __init set_pageblock_order(void) 4659void __paginginit set_pageblock_order(void)
4590{ 4660{
4591 unsigned int order; 4661 unsigned int order;
4592 4662
@@ -4614,7 +4684,7 @@ void __init set_pageblock_order(void)
4614 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4684 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4615 * the kernel config 4685 * the kernel config
4616 */ 4686 */
4617void __init set_pageblock_order(void) 4687void __paginginit set_pageblock_order(void)
4618{ 4688{
4619} 4689}
4620 4690
@@ -4728,8 +4798,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4728 spin_lock_init(&zone->lru_lock); 4798 spin_lock_init(&zone->lru_lock);
4729 zone_seqlock_init(zone); 4799 zone_seqlock_init(zone);
4730 zone->zone_pgdat = pgdat; 4800 zone->zone_pgdat = pgdat;
4731
4732 zone_pcp_init(zone); 4801 zone_pcp_init(zone);
4802
4803 /* For bootup, initialized properly in watermark setup */
4804 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4805
4733 lruvec_init(&zone->lruvec); 4806 lruvec_init(&zone->lruvec);
4734 if (!size) 4807 if (!size)
4735 continue; 4808 continue;
@@ -4930,7 +5003,7 @@ static unsigned long __init early_calculate_totalpages(void)
4930 if (pages) 5003 if (pages)
4931 node_set_state(nid, N_MEMORY); 5004 node_set_state(nid, N_MEMORY);
4932 } 5005 }
4933 return totalpages; 5006 return totalpages;
4934} 5007}
4935 5008
4936/* 5009/*
@@ -5047,7 +5120,7 @@ restart:
5047 /* 5120 /*
5048 * Some kernelcore has been met, update counts and 5121 * Some kernelcore has been met, update counts and
5049 * break if the kernelcore for this node has been 5122 * break if the kernelcore for this node has been
5050 * satisified 5123 * satisfied
5051 */ 5124 */
5052 required_kernelcore -= min(required_kernelcore, 5125 required_kernelcore -= min(required_kernelcore,
5053 size_pages); 5126 size_pages);
@@ -5061,7 +5134,7 @@ restart:
5061 * If there is still required_kernelcore, we do another pass with one 5134 * If there is still required_kernelcore, we do another pass with one
5062 * less node in the count. This will push zone_movable_pfn[nid] further 5135 * less node in the count. This will push zone_movable_pfn[nid] further
5063 * along on the nodes that still have memory until kernelcore is 5136 * along on the nodes that still have memory until kernelcore is
5064 * satisified 5137 * satisfied
5065 */ 5138 */
5066 usable_nodes--; 5139 usable_nodes--;
5067 if (usable_nodes && required_kernelcore > usable_nodes) 5140 if (usable_nodes && required_kernelcore > usable_nodes)
@@ -5286,8 +5359,10 @@ void __init mem_init_print_info(const char *str)
5286 * 3) .rodata.* may be embedded into .text or .data sections. 5359 * 3) .rodata.* may be embedded into .text or .data sections.
5287 */ 5360 */
5288#define adj_init_size(start, end, size, pos, adj) \ 5361#define adj_init_size(start, end, size, pos, adj) \
5289 if (start <= pos && pos < end && size > adj) \ 5362 do { \
5290 size -= adj; 5363 if (start <= pos && pos < end && size > adj) \
5364 size -= adj; \
5365 } while (0)
5291 5366
5292 adj_init_size(__init_begin, __init_end, init_data_size, 5367 adj_init_size(__init_begin, __init_end, init_data_size,
5293 _sinittext, init_code_size); 5368 _sinittext, init_code_size);
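The do { } while (0) wrapper added to adj_init_size() is the usual cure for macros whose body is a bare if: without it, an else following the macro invocation silently binds to the macro's own if. A minimal illustration with a generic stand-in macro (not the kernel one; the unsafe variant drops the trailing semicolon so the failure compiles at all):

#include <stdio.h>

/* Unsafe form: expands to a bare if, so a following else binds to it. */
#define ADJ_UNSAFE(size, adj)                   \
        if ((size) > (adj))                     \
                (size) -= (adj)

/* Safe form: the whole body is a single statement. */
#define ADJ_SAFE(size, adj)                     \
        do {                                    \
                if ((size) > (adj))             \
                        (size) -= (adj);        \
        } while (0)

int main(void)
{
        int a = 10, b = 10;

        if (0)
                ADJ_UNSAFE(a, 3);       /* the macro's if swallows this else... */
        else
                a = -1;                 /* ...so this never runs: a stays 10 */

        if (0)
                ADJ_SAFE(b, 3);
        else
                b = -1;                 /* runs as intended: b becomes -1 */

        printf("unsafe: %d, safe: %d\n", a, b);     /* unsafe: 10, safe: -1 */
        return 0;
}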
@@ -5361,7 +5436,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
5361 * This is only okay since the processor is dead and cannot 5436 * This is only okay since the processor is dead and cannot
5362 * race with what we are doing. 5437 * race with what we are doing.
5363 */ 5438 */
5364 refresh_cpu_vm_stats(cpu); 5439 cpu_vm_stats_fold(cpu);
5365 } 5440 }
5366 return NOTIFY_OK; 5441 return NOTIFY_OK;
5367} 5442}
@@ -5498,6 +5573,11 @@ static void __setup_per_zone_wmarks(void)
5498 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5573 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5499 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5574 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5500 5575
5576 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5577 high_wmark_pages(zone) -
5578 low_wmark_pages(zone) -
5579 zone_page_state(zone, NR_ALLOC_BATCH));
5580
5501 setup_zone_migrate_reserve(zone); 5581 setup_zone_migrate_reserve(zone);
5502 spin_unlock_irqrestore(&zone->lock, flags); 5582 spin_unlock_irqrestore(&zone->lock, flags);
5503 } 5583 }
@@ -5570,7 +5650,7 @@ static void __meminit setup_per_zone_inactive_ratio(void)
5570 * we want it large (64MB max). But it is not linear, because network 5650 * we want it large (64MB max). But it is not linear, because network
5571 * bandwidth does not increase linearly with machine size. We use 5651 * bandwidth does not increase linearly with machine size. We use
5572 * 5652 *
5573 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5653 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5574 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5654 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5575 * 5655 *
5576 * which yields 5656 * which yields
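As a concrete data point for the formula above: with 16 GiB of lowmem, lowmem_kbytes is 16,777,216 and sqrt(16,777,216 * 16) = 16,384, i.e. min_free_kbytes comes out at 16 MiB before the clamping the surrounding code applies. A quick check (link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double lowmem_kbytes = 16.0 * 1024 * 1024;      /* 16 GiB of lowmem */
        double min_free = sqrt(lowmem_kbytes * 16);     /* = 4 * sqrt(lowmem_kbytes) */

        printf("min_free_kbytes ~= %.0f kB (~%.0f MiB)\n",
               min_free, min_free / 1024);
        return 0;
}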
@@ -5614,11 +5694,11 @@ int __meminit init_per_zone_wmark_min(void)
5614module_init(init_per_zone_wmark_min) 5694module_init(init_per_zone_wmark_min)
5615 5695
5616/* 5696/*
5617 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5697 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5618 * that we can call two helper functions whenever min_free_kbytes 5698 * that we can call two helper functions whenever min_free_kbytes
5619 * changes. 5699 * changes.
5620 */ 5700 */
5621int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5701int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5622 void __user *buffer, size_t *length, loff_t *ppos) 5702 void __user *buffer, size_t *length, loff_t *ppos)
5623{ 5703{
5624 proc_dointvec(table, write, buffer, length, ppos); 5704 proc_dointvec(table, write, buffer, length, ppos);
@@ -5682,8 +5762,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5682 5762
5683/* 5763/*
5684 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5764 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5685 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5765 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5686 * can have before it gets flushed back to buddy allocator. 5766 * pagelist can have before it gets flushed back to buddy allocator.
5687 */ 5767 */
5688int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5768int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5689 void __user *buffer, size_t *length, loff_t *ppos) 5769 void __user *buffer, size_t *length, loff_t *ppos)
@@ -5745,9 +5825,10 @@ void *__init alloc_large_system_hash(const char *tablename,
5745 if (!numentries) { 5825 if (!numentries) {
5746 /* round applicable memory size up to nearest megabyte */ 5826 /* round applicable memory size up to nearest megabyte */
5747 numentries = nr_kernel_pages; 5827 numentries = nr_kernel_pages;
5748 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5828
5749 numentries >>= 20 - PAGE_SHIFT; 5829 /* It isn't necessary when PAGE_SIZE >= 1MB */
5750 numentries <<= 20 - PAGE_SHIFT; 5830 if (PAGE_SHIFT < 20)
5831 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5751 5832
5752 /* limit to 1 bucket per 2^scale bytes of low memory */ 5833 /* limit to 1 bucket per 2^scale bytes of low memory */
5753 if (scale > PAGE_SHIFT) 5834 if (scale > PAGE_SHIFT)
@@ -5900,7 +5981,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5900 * This function checks whether pageblock includes unmovable pages or not. 5981 * This function checks whether pageblock includes unmovable pages or not.
5901 * If @count is not zero, it is okay to include less @count unmovable pages 5982 * If @count is not zero, it is okay to include less @count unmovable pages
5902 * 5983 *
5903 * PageLRU check wihtout isolation or lru_lock could race so that 5984 * PageLRU check without isolation or lru_lock could race so that
5904 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5985 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5905 * expect this function should be exact. 5986 * expect this function should be exact.
5906 */ 5987 */
@@ -5928,6 +6009,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5928 continue; 6009 continue;
5929 6010
5930 page = pfn_to_page(check); 6011 page = pfn_to_page(check);
6012
6013 /*
6014 * Hugepages are not in LRU lists, but they're movable.
 6015	 * We need not scan over tail pages because we don't
6016 * handle each tail page individually in migration.
6017 */
6018 if (PageHuge(page)) {
6019 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6020 continue;
6021 }
6022
5931 /* 6023 /*
5932 * We can't use page_count without pin a page 6024 * We can't use page_count without pin a page
5933 * because another CPU can free compound page. 6025 * because another CPU can free compound page.
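The round_up() in the hugepage branch above simply jumps the scan to the hugepage's last tail page so the loop's own iter++ resumes right after it: for a 2 MiB hugepage (compound order 9, 512 pages) whose head is found at iter = 0, round_up(0 + 1, 512) - 1 = 511 and the next page examined is 512. The arithmetic, spelled out (round_up here is the plain divide-and-multiply form, not the kernel macro):

#include <stdio.h>

#define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long iter = 0;         /* head page of the hugepage */
        unsigned int order = 9;         /* 2 MiB / 4 KiB = 512 pages */

        iter = round_up(iter + 1, 1UL << order) - 1;
        printf("skip to %lu, next iteration scans %lu\n", iter, iter + 1);
        return 0;
}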
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 0cee10ffb98d..d1473b2e9481 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -6,6 +6,7 @@
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/hugetlb.h>
9#include "internal.h" 10#include "internal.h"
10 11
11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) 12int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
@@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
252{ 253{
253 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 254 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
254 255
256 /*
257 * TODO: allocate a destination hugepage from a nearest neighbor node,
 258	 * in accordance with the memory policy of the user process if possible. For
259 * now as a simple work-around, we use the next node for destination.
260 */
261 if (PageHuge(page)) {
262 nodemask_t src = nodemask_of_node(page_to_nid(page));
263 nodemask_t dst;
264 nodes_complement(dst, src);
265 return alloc_huge_page_node(page_hstate(compound_head(page)),
266 next_node(page_to_nid(page), dst));
267 }
268
255 if (PageHighMem(page)) 269 if (PageHighMem(page))
256 gfp_mask |= __GFP_HIGHMEM; 270 gfp_mask |= __GFP_HIGHMEM;
257 271
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e1a6e4fab016..3929a40bd6c0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,30 @@
10#include <asm/tlb.h> 10#include <asm/tlb.h>
11#include <asm-generic/pgtable.h> 11#include <asm-generic/pgtable.h>
12 12
13/*
14 * If a p?d_bad entry is found while walking page tables, report
15 * the error, before resetting entry to p?d_none. Usually (but
16 * very seldom) called out from the p?d_none_or_clear_bad macros.
17 */
18
19void pgd_clear_bad(pgd_t *pgd)
20{
21 pgd_ERROR(*pgd);
22 pgd_clear(pgd);
23}
24
25void pud_clear_bad(pud_t *pud)
26{
27 pud_ERROR(*pud);
28 pud_clear(pud);
29}
30
31void pmd_clear_bad(pmd_t *pmd)
32{
33 pmd_ERROR(*pmd);
34 pmd_clear(pmd);
35}
36
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 37#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 38/*
15 * Only sets the access flags (dirty, accessed), as well as write 39 * Only sets the access flags (dirty, accessed), as well as write
diff --git a/mm/readahead.c b/mm/readahead.c
index 829a77c62834..e4ed04149785 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping,
371 size = count_history_pages(mapping, ra, offset, max); 371 size = count_history_pages(mapping, ra, offset, max);
372 372
373 /* 373 /*
374 * no history pages: 374 * not enough history pages:
375 * it could be a random read 375 * it could be a random read
376 */ 376 */
377 if (!size) 377 if (size <= req_size)
378 return 0; 378 return 0;
379 379
380 /* 380 /*
@@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping,
385 size *= 2; 385 size *= 2;
386 386
387 ra->start = offset; 387 ra->start = offset;
388 ra->size = get_init_ra_size(size + req_size, max); 388 ra->size = min(size + req_size, max);
389 ra->async_size = ra->size; 389 ra->async_size = 1;
390 390
391 return 1; 391 return 1;
392} 392}
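With this change the history check becomes size <= req_size instead of !size, and the readahead window is sized as min(size + req_size, max) with a one-page async tail. A toy version of just that sizing decision (it leaves out the whole-file-read doubling the real function also performs):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Toy sizing decision after this patch; 'size' is the number of history
 * pages found in the page cache before the current offset. */
static unsigned long context_readahead_size(unsigned long size,
                                            unsigned long req_size,
                                            unsigned long max)
{
        if (size <= req_size)           /* not enough history: random read */
                return 0;
        return min_ul(size + req_size, max);    /* async_size would be 1 */
}

int main(void)
{
        printf("10 history, 16 requested -> %lu pages\n",
               context_readahead_size(10, 16, 64));     /* 0: treat as random */
        printf("40 history, 16 requested -> %lu pages\n",
               context_readahead_size(40, 16, 64));     /* 56 */
        return 0;
}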
diff --git a/mm/shmem.c b/mm/shmem.c
index 526149846d0a..8297623fcaed 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1205,7 +1205,7 @@ repeat:
1205 gfp & GFP_RECLAIM_MASK); 1205 gfp & GFP_RECLAIM_MASK);
1206 if (error) 1206 if (error)
1207 goto decused; 1207 goto decused;
1208 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 1208 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1209 if (!error) { 1209 if (!error) {
1210 error = shmem_add_to_page_cache(page, mapping, index, 1210 error = shmem_add_to_page_cache(page, mapping, index,
1211 gfp, NULL); 1211 gfp, NULL);
@@ -2819,6 +2819,10 @@ int __init shmem_init(void)
2819{ 2819{
2820 int error; 2820 int error;
2821 2821
2822 /* If rootfs called this, don't re-init */
2823 if (shmem_inode_cachep)
2824 return 0;
2825
2822 error = bdi_init(&shmem_backing_dev_info); 2826 error = bdi_init(&shmem_backing_dev_info);
2823 if (error) 2827 if (error)
2824 goto out4; 2828 goto out4;
diff --git a/mm/slub.c b/mm/slub.c
index e3ba1f2cf60c..51df8272cfaf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s,
4420 unsigned long order; 4420 unsigned long order;
4421 int err; 4421 int err;
4422 4422
4423 err = strict_strtoul(buf, 10, &order); 4423 err = kstrtoul(buf, 10, &order);
4424 if (err) 4424 if (err)
4425 return err; 4425 return err;
4426 4426
@@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4448 unsigned long min; 4448 unsigned long min;
4449 int err; 4449 int err;
4450 4450
4451 err = strict_strtoul(buf, 10, &min); 4451 err = kstrtoul(buf, 10, &min);
4452 if (err) 4452 if (err)
4453 return err; 4453 return err;
4454 4454
@@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4468 unsigned long objects; 4468 unsigned long objects;
4469 int err; 4469 int err;
4470 4470
4471 err = strict_strtoul(buf, 10, &objects); 4471 err = kstrtoul(buf, 10, &objects);
4472 if (err) 4472 if (err)
4473 return err; 4473 return err;
4474 if (objects && !kmem_cache_has_cpu_partial(s)) 4474 if (objects && !kmem_cache_has_cpu_partial(s))
@@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4784 unsigned long ratio; 4784 unsigned long ratio;
4785 int err; 4785 int err;
4786 4786
4787 err = strict_strtoul(buf, 10, &ratio); 4787 err = kstrtoul(buf, 10, &ratio);
4788 if (err) 4788 if (err)
4789 return err; 4789 return err;
4790 4790
diff --git a/mm/sparse.c b/mm/sparse.c
index 308d50331bc3..4ac1d7ef548f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 339}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 340#endif /* CONFIG_MEMORY_HOTREMOVE */
341 341
342static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, 342static void __init sparse_early_usemaps_alloc_node(void *data,
343 unsigned long pnum_begin, 343 unsigned long pnum_begin,
344 unsigned long pnum_end, 344 unsigned long pnum_end,
345 unsigned long usemap_count, int nodeid) 345 unsigned long usemap_count, int nodeid)
346{ 346{
347 void *usemap; 347 void *usemap;
348 unsigned long pnum; 348 unsigned long pnum;
349 unsigned long **usemap_map = (unsigned long **)data;
349 int size = usemap_size(); 350 int size = usemap_size();
350 351
351 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 352 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
@@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
430#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 431#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
431 432
432#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 433#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
433static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, 434static void __init sparse_early_mem_maps_alloc_node(void *data,
434 unsigned long pnum_begin, 435 unsigned long pnum_begin,
435 unsigned long pnum_end, 436 unsigned long pnum_end,
436 unsigned long map_count, int nodeid) 437 unsigned long map_count, int nodeid)
437{ 438{
439 struct page **map_map = (struct page **)data;
438 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, 440 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
439 map_count, nodeid); 441 map_count, nodeid);
440} 442}
@@ -460,6 +462,55 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
460{ 462{
461} 463}
462 464
465/**
466 * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap
467 * @data: usemap_map for pageblock flags or map_map for vmemmap
468 */
469static void __init alloc_usemap_and_memmap(void (*alloc_func)
470 (void *, unsigned long, unsigned long,
471 unsigned long, int), void *data)
472{
473 unsigned long pnum;
474 unsigned long map_count;
475 int nodeid_begin = 0;
476 unsigned long pnum_begin = 0;
477
478 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
479 struct mem_section *ms;
480
481 if (!present_section_nr(pnum))
482 continue;
483 ms = __nr_to_section(pnum);
484 nodeid_begin = sparse_early_nid(ms);
485 pnum_begin = pnum;
486 break;
487 }
488 map_count = 1;
489 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
490 struct mem_section *ms;
491 int nodeid;
492
493 if (!present_section_nr(pnum))
494 continue;
495 ms = __nr_to_section(pnum);
496 nodeid = sparse_early_nid(ms);
497 if (nodeid == nodeid_begin) {
498 map_count++;
499 continue;
500 }
501 /* ok, we need to take care of pnum_begin to pnum - 1 */
502 alloc_func(data, pnum_begin, pnum,
503 map_count, nodeid_begin);
504 /* new start, update count etc*/
505 nodeid_begin = nodeid;
506 pnum_begin = pnum;
507 map_count = 1;
508 }
509 /* ok, last chunk */
510 alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
511 map_count, nodeid_begin);
512}
513
463/* 514/*
464 * Allocate the accumulated non-linear sections, allocate a mem_map 515 * Allocate the accumulated non-linear sections, allocate a mem_map
465 * for each and record the physical to section mapping. 516 * for each and record the physical to section mapping.
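The new helper replaces two nearly identical loops in sparse_init(): walk the memory sections, batch up consecutive sections that live on the same node, and hand each batch to an allocation callback. The same grouping pattern, reduced to a runnable userspace sketch with made-up data (the present_section_nr() filtering is omitted for brevity):

        #include <stdio.h>

        /* Callback invoked once per run of items that share a node id. */
        static void alloc_for_node(void *data, unsigned long begin,
                                   unsigned long end, unsigned long count, int node)
        {
                printf("node %d: sections [%lu, %lu), count %lu\n",
                       node, begin, end, count);
        }

        int main(void)
        {
                int node_of[] = { 0, 0, 1, 1, 1, 2 };   /* per-section node id */
                unsigned long n = sizeof(node_of) / sizeof(node_of[0]);
                unsigned long begin = 0, count = 1;

                for (unsigned long i = 1; i < n; i++) {
                        if (node_of[i] == node_of[begin]) {
                                count++;
                                continue;
                        }
                        alloc_for_node(NULL, begin, i, count, node_of[begin]);
                        begin = i;              /* new run starts here */
                        count = 1;
                }
                alloc_for_node(NULL, begin, n, count, node_of[begin]); /* last chunk */
                return 0;
        }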
@@ -471,11 +522,7 @@ void __init sparse_init(void)
471 unsigned long *usemap; 522 unsigned long *usemap;
472 unsigned long **usemap_map; 523 unsigned long **usemap_map;
473 int size; 524 int size;
474 int nodeid_begin = 0;
475 unsigned long pnum_begin = 0;
476 unsigned long usemap_count;
477#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 525#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
478 unsigned long map_count;
479 int size2; 526 int size2;
480 struct page **map_map; 527 struct page **map_map;
481#endif 528#endif
@@ -501,82 +548,16 @@ void __init sparse_init(void)
501 usemap_map = alloc_bootmem(size); 548 usemap_map = alloc_bootmem(size);
502 if (!usemap_map) 549 if (!usemap_map)
503 panic("can not allocate usemap_map\n"); 550 panic("can not allocate usemap_map\n");
504 551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
505 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 552 (void *)usemap_map);
506 struct mem_section *ms;
507
508 if (!present_section_nr(pnum))
509 continue;
510 ms = __nr_to_section(pnum);
511 nodeid_begin = sparse_early_nid(ms);
512 pnum_begin = pnum;
513 break;
514 }
515 usemap_count = 1;
516 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
517 struct mem_section *ms;
518 int nodeid;
519
520 if (!present_section_nr(pnum))
521 continue;
522 ms = __nr_to_section(pnum);
523 nodeid = sparse_early_nid(ms);
524 if (nodeid == nodeid_begin) {
525 usemap_count++;
526 continue;
527 }
528 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
529 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
530 usemap_count, nodeid_begin);
531 /* new start, update count etc*/
532 nodeid_begin = nodeid;
533 pnum_begin = pnum;
534 usemap_count = 1;
535 }
536 /* ok, last chunk */
537 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
538 usemap_count, nodeid_begin);
539 553
540#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
541 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
542 map_map = alloc_bootmem(size2); 556 map_map = alloc_bootmem(size2);
543 if (!map_map) 557 if (!map_map)
544 panic("can not allocate map_map\n"); 558 panic("can not allocate map_map\n");
545 559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
546 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 560 (void *)map_map);
547 struct mem_section *ms;
548
549 if (!present_section_nr(pnum))
550 continue;
551 ms = __nr_to_section(pnum);
552 nodeid_begin = sparse_early_nid(ms);
553 pnum_begin = pnum;
554 break;
555 }
556 map_count = 1;
557 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
558 struct mem_section *ms;
559 int nodeid;
560
561 if (!present_section_nr(pnum))
562 continue;
563 ms = __nr_to_section(pnum);
564 nodeid = sparse_early_nid(ms);
565 if (nodeid == nodeid_begin) {
566 map_count++;
567 continue;
568 }
569 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
570 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
571 map_count, nodeid_begin);
572 /* new start, update count etc*/
573 nodeid_begin = nodeid;
574 pnum_begin = pnum;
575 map_count = 1;
576 }
577 /* ok, last chunk */
578 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
579 map_count, nodeid_begin);
580#endif 561#endif
581 562
582 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 563 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
diff --git a/mm/swap.c b/mm/swap.c
index 62b78a6e224f..c899502d3e36 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page)
81 82
82static void put_compound_page(struct page *page) 83static void put_compound_page(struct page *page)
83{ 84{
85 /*
86 * hugetlbfs pages cannot be split from under us. If this is a
87 * hugetlbfs page, check refcount on head page and release the page if
88 * the refcount becomes zero.
89 */
90 if (PageHuge(page)) {
91 page = compound_head(page);
92 if (put_page_testzero(page))
93 __put_compound_page(page);
94
95 return;
96 }
97
84 if (unlikely(PageTail(page))) { 98 if (unlikely(PageTail(page))) {
85 /* __split_huge_page_refcount can run under us */ 99 /* __split_huge_page_refcount can run under us */
86 struct page *page_head = compound_trans_head(page); 100 struct page *page_head = compound_trans_head(page);
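The new PageHuge() branch works because hugetlbfs pages are never split by __split_huge_page_refcount(), so a tail-page reference can simply be redirected to the head page without the compound-lock dance that THP tails require. For context, the caller that dispatches here looks roughly like the following in this era's mm/swap.c (paraphrased for orientation, not copied from the patch):

        void put_page(struct page *page)
        {
                if (unlikely(PageCompound(page)))
                        put_compound_page(page);        /* compound/huge pages */
                else if (put_page_testzero(page))
                        __put_single_page(page);        /* last ref on a normal page */
        }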
@@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page)
184 * proper PT lock that already serializes against 198 * proper PT lock that already serializes against
185 * split_huge_page(). 199 * split_huge_page().
186 */ 200 */
187 unsigned long flags;
188 bool got = false; 201 bool got = false;
189 struct page *page_head = compound_trans_head(page); 202 struct page *page_head;
190 203
191 if (likely(page != page_head && get_page_unless_zero(page_head))) { 204 /*
205 * If this is a hugetlbfs page it cannot be split under us. Simply
206 * increment refcount for the head page.
207 */
208 if (PageHuge(page)) {
209 page_head = compound_head(page);
210 atomic_inc(&page_head->_count);
211 got = true;
212 } else {
213 unsigned long flags;
214
215 page_head = compound_trans_head(page);
216 if (likely(page != page_head &&
217 get_page_unless_zero(page_head))) {
218
219 /* Ref to put_compound_page() comment. */
220 if (PageSlab(page_head)) {
221 if (likely(PageTail(page))) {
222 __get_page_tail_foll(page, false);
223 return true;
224 } else {
225 put_page(page_head);
226 return false;
227 }
228 }
192 229
193 /* Ref to put_compound_page() comment. */ 230 /*
194 if (PageSlab(page_head)) { 231 * page_head wasn't a dangling pointer but it
232 * may not be a head page anymore by the time
233 * we obtain the lock. That is ok as long as it
234 * can't be freed from under us.
235 */
236 flags = compound_lock_irqsave(page_head);
237 /* here __split_huge_page_refcount won't run anymore */
195 if (likely(PageTail(page))) { 238 if (likely(PageTail(page))) {
196 __get_page_tail_foll(page, false); 239 __get_page_tail_foll(page, false);
197 return true; 240 got = true;
198 } else {
199 put_page(page_head);
200 return false;
201 } 241 }
242 compound_unlock_irqrestore(page_head, flags);
243 if (unlikely(!got))
244 put_page(page_head);
202 } 245 }
203
204 /*
205 * page_head wasn't a dangling pointer but it
206 * may not be a head page anymore by the time
207 * we obtain the lock. That is ok as long as it
208 * can't be freed from under us.
209 */
210 flags = compound_lock_irqsave(page_head);
211 /* here __split_huge_page_refcount won't run anymore */
212 if (likely(PageTail(page))) {
213 __get_page_tail_foll(page, false);
214 got = true;
215 }
216 compound_unlock_irqrestore(page_head, flags);
217 if (unlikely(!got))
218 put_page(page_head);
219 } 246 }
220 return got; 247 return got;
221} 248}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f24ab0dff554..e6f15f8ca2af 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122{ 122{
123 int error; 123 int error;
124 124
125 error = radix_tree_preload(gfp_mask); 125 error = radix_tree_maybe_preload(gfp_mask);
126 if (!error) { 126 if (!error) {
127 error = __add_to_swap_cache(page, entry); 127 error = __add_to_swap_cache(page, entry);
128 radix_tree_preload_end(); 128 radix_tree_preload_end();
@@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
328 /* 328 /*
329 * call radix_tree_preload() while we can wait. 329 * call radix_tree_preload() while we can wait.
330 */ 330 */
331 err = radix_tree_preload(gfp_mask & GFP_KERNEL); 331 err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
332 if (err) 332 if (err)
333 break; 333 break;
334 334
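radix_tree_maybe_preload() is introduced alongside these call sites to handle gfp masks that may not allow sleeping: it only performs the preload when blocking is permitted, and otherwise just disables preemption so the matching radix_tree_preload_end() stays balanced. The expected shape of the helper, inferred from its name and these call sites (the authoritative body lives in lib/radix-tree.c):

        int radix_tree_maybe_preload(gfp_t gfp_mask)
        {
                if (gfp_mask & __GFP_WAIT)      /* caller may sleep: really preload */
                        return radix_tree_preload(gfp_mask);
                /* atomic caller: skip preloading but keep preload_end() balanced */
                preempt_disable();
                return 0;
        }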
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cf2e60983b7..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si,
175 } 175 }
176} 176}
177 177
178static int wait_for_discard(void *word) 178#define SWAPFILE_CLUSTER 256
179#define LATENCY_LIMIT 256
180
181static inline void cluster_set_flag(struct swap_cluster_info *info,
182 unsigned int flag)
179{ 183{
180 schedule(); 184 info->flags = flag;
181 return 0;
182} 185}
183 186
184#define SWAPFILE_CLUSTER 256 187static inline unsigned int cluster_count(struct swap_cluster_info *info)
185#define LATENCY_LIMIT 256 188{
189 return info->data;
190}
191
192static inline void cluster_set_count(struct swap_cluster_info *info,
193 unsigned int c)
194{
195 info->data = c;
196}
197
198static inline void cluster_set_count_flag(struct swap_cluster_info *info,
199 unsigned int c, unsigned int f)
200{
201 info->flags = f;
202 info->data = c;
203}
204
205static inline unsigned int cluster_next(struct swap_cluster_info *info)
206{
207 return info->data;
208}
209
210static inline void cluster_set_next(struct swap_cluster_info *info,
211 unsigned int n)
212{
213 info->data = n;
214}
215
216static inline void cluster_set_next_flag(struct swap_cluster_info *info,
217 unsigned int n, unsigned int f)
218{
219 info->flags = f;
220 info->data = n;
221}
222
223static inline bool cluster_is_free(struct swap_cluster_info *info)
224{
225 return info->flags & CLUSTER_FLAG_FREE;
226}
227
228static inline bool cluster_is_null(struct swap_cluster_info *info)
229{
230 return info->flags & CLUSTER_FLAG_NEXT_NULL;
231}
232
233static inline void cluster_set_null(struct swap_cluster_info *info)
234{
235 info->flags = CLUSTER_FLAG_NEXT_NULL;
236 info->data = 0;
237}
238
239/* Add a cluster to discard list and schedule it to do discard */
240static void swap_cluster_schedule_discard(struct swap_info_struct *si,
241 unsigned int idx)
242{
243 /*
244 * If scan_swap_map() can't find a free cluster, it will check
245 * si->swap_map directly. To make sure the cluster being discarded isn't
246 * taken by scan_swap_map(), mark the swap entries bad (occupied). They
247 * will be cleared after the discard completes.
248 */
249 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
250 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
251
252 if (cluster_is_null(&si->discard_cluster_head)) {
253 cluster_set_next_flag(&si->discard_cluster_head,
254 idx, 0);
255 cluster_set_next_flag(&si->discard_cluster_tail,
256 idx, 0);
257 } else {
258 unsigned int tail = cluster_next(&si->discard_cluster_tail);
259 cluster_set_next(&si->cluster_info[tail], idx);
260 cluster_set_next_flag(&si->discard_cluster_tail,
261 idx, 0);
262 }
263
264 schedule_work(&si->discard_work);
265}
266
267/*
268 * Do the scheduled discards. After a cluster discard is finished, the cluster
269 * will be added to the free cluster list. The caller should hold si->lock.
270 */
271static void swap_do_scheduled_discard(struct swap_info_struct *si)
272{
273 struct swap_cluster_info *info;
274 unsigned int idx;
275
276 info = si->cluster_info;
277
278 while (!cluster_is_null(&si->discard_cluster_head)) {
279 idx = cluster_next(&si->discard_cluster_head);
280
281 cluster_set_next_flag(&si->discard_cluster_head,
282 cluster_next(&info[idx]), 0);
283 if (cluster_next(&si->discard_cluster_tail) == idx) {
284 cluster_set_null(&si->discard_cluster_head);
285 cluster_set_null(&si->discard_cluster_tail);
286 }
287 spin_unlock(&si->lock);
288
289 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
290 SWAPFILE_CLUSTER);
291
292 spin_lock(&si->lock);
293 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
294 if (cluster_is_null(&si->free_cluster_head)) {
295 cluster_set_next_flag(&si->free_cluster_head,
296 idx, 0);
297 cluster_set_next_flag(&si->free_cluster_tail,
298 idx, 0);
299 } else {
300 unsigned int tail;
301
302 tail = cluster_next(&si->free_cluster_tail);
303 cluster_set_next(&info[tail], idx);
304 cluster_set_next_flag(&si->free_cluster_tail,
305 idx, 0);
306 }
307 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
308 0, SWAPFILE_CLUSTER);
309 }
310}
311
312static void swap_discard_work(struct work_struct *work)
313{
314 struct swap_info_struct *si;
315
316 si = container_of(work, struct swap_info_struct, discard_work);
317
318 spin_lock(&si->lock);
319 swap_do_scheduled_discard(si);
320 spin_unlock(&si->lock);
321}
322
323/*
324 * The cluster corresponding to page_nr will be used. The cluster will be
325 * removed from the free cluster list and its usage counter will be increased.
326 */
327static void inc_cluster_info_page(struct swap_info_struct *p,
328 struct swap_cluster_info *cluster_info, unsigned long page_nr)
329{
330 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
331
332 if (!cluster_info)
333 return;
334 if (cluster_is_free(&cluster_info[idx])) {
335 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
336 cluster_set_next_flag(&p->free_cluster_head,
337 cluster_next(&cluster_info[idx]), 0);
338 if (cluster_next(&p->free_cluster_tail) == idx) {
339 cluster_set_null(&p->free_cluster_tail);
340 cluster_set_null(&p->free_cluster_head);
341 }
342 cluster_set_count_flag(&cluster_info[idx], 0, 0);
343 }
344
345 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
346 cluster_set_count(&cluster_info[idx],
347 cluster_count(&cluster_info[idx]) + 1);
348}
349
350/*
351 * The cluster corresponding to page_nr decreases its usage count by one. If the
352 * usage counter becomes 0, which means no page in the cluster is in use, we can
353 * optionally discard the cluster and add it to the free cluster list.
354 */
355static void dec_cluster_info_page(struct swap_info_struct *p,
356 struct swap_cluster_info *cluster_info, unsigned long page_nr)
357{
358 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
359
360 if (!cluster_info)
361 return;
362
363 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
364 cluster_set_count(&cluster_info[idx],
365 cluster_count(&cluster_info[idx]) - 1);
366
367 if (cluster_count(&cluster_info[idx]) == 0) {
368 /*
369 * If the swap is discardable, prepare discard the cluster
370 * instead of free it immediately. The cluster will be freed
371 * after discard.
372 */
373 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
374 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
375 swap_cluster_schedule_discard(p, idx);
376 return;
377 }
378
379 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
380 if (cluster_is_null(&p->free_cluster_head)) {
381 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
382 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
383 } else {
384 unsigned int tail = cluster_next(&p->free_cluster_tail);
385 cluster_set_next(&cluster_info[tail], idx);
386 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
387 }
388 }
389}
390
391/*
392 * It's possible for scan_swap_map() to use a free cluster from the middle of the
393 * free cluster list. Detect that case and drop the per-cpu cluster to avoid list corruption.
394 */
395static bool
396scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
397 unsigned long offset)
398{
399 struct percpu_cluster *percpu_cluster;
400 bool conflict;
401
402 offset /= SWAPFILE_CLUSTER;
403 conflict = !cluster_is_null(&si->free_cluster_head) &&
404 offset != cluster_next(&si->free_cluster_head) &&
405 cluster_is_free(&si->cluster_info[offset]);
406
407 if (!conflict)
408 return false;
409
410 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
411 cluster_set_null(&percpu_cluster->index);
412 return true;
413}
414
415/*
416 * Try to get a swap entry from the current cpu's swap entry pool (a cluster). This
417 * might involve allocating a new cluster for the current CPU too.
418 */
419static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
420 unsigned long *offset, unsigned long *scan_base)
421{
422 struct percpu_cluster *cluster;
423 bool found_free;
424 unsigned long tmp;
425
426new_cluster:
427 cluster = this_cpu_ptr(si->percpu_cluster);
428 if (cluster_is_null(&cluster->index)) {
429 if (!cluster_is_null(&si->free_cluster_head)) {
430 cluster->index = si->free_cluster_head;
431 cluster->next = cluster_next(&cluster->index) *
432 SWAPFILE_CLUSTER;
433 } else if (!cluster_is_null(&si->discard_cluster_head)) {
434 /*
435 * we don't have a free cluster, but some clusters are still
436 * being discarded; do the discard now and reclaim them
437 */
438 swap_do_scheduled_discard(si);
439 *scan_base = *offset = si->cluster_next;
440 goto new_cluster;
441 } else
442 return;
443 }
444
445 found_free = false;
446
447 /*
448 * Other CPUs can use our cluster if they can't find a free cluster,
449 * so check if there is still a free entry in the cluster
450 */
451 tmp = cluster->next;
452 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
453 SWAPFILE_CLUSTER) {
454 if (!si->swap_map[tmp]) {
455 found_free = true;
456 break;
457 }
458 tmp++;
459 }
460 if (!found_free) {
461 cluster_set_null(&cluster->index);
462 goto new_cluster;
463 }
464 cluster->next = tmp + 1;
465 *offset = tmp;
466 *scan_base = tmp;
467}
186 468
187static unsigned long scan_swap_map(struct swap_info_struct *si, 469static unsigned long scan_swap_map(struct swap_info_struct *si,
188 unsigned char usage) 470 unsigned char usage)
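All of the cluster_*() accessors above multiplex one small per-cluster descriptor: data carries either the number of in-use slots in the cluster or, while the cluster sits on the free/discard list, the index of the next cluster; flags records which interpretation applies. The layout this assumes (from include/linux/swap.h in the same series, shown here as a sketch rather than quoted):

        struct swap_cluster_info {
                unsigned int data:24;   /* in-use count, or next free/discard index */
                unsigned int flags:8;   /* CLUSTER_FLAG_* below */
        };
        #define CLUSTER_FLAG_FREE       1  /* cluster is on the free list */
        #define CLUSTER_FLAG_NEXT_NULL  2  /* head/tail marker: no next cluster */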
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
191 unsigned long scan_base; 473 unsigned long scan_base;
192 unsigned long last_in_cluster = 0; 474 unsigned long last_in_cluster = 0;
193 int latency_ration = LATENCY_LIMIT; 475 int latency_ration = LATENCY_LIMIT;
194 int found_free_cluster = 0;
195 476
196 /* 477 /*
197 * We try to cluster swap pages by allocating them sequentially 478 * We try to cluster swap pages by allocating them sequentially
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
207 si->flags += SWP_SCANNING; 488 si->flags += SWP_SCANNING;
208 scan_base = offset = si->cluster_next; 489 scan_base = offset = si->cluster_next;
209 490
491 /* SSD algorithm */
492 if (si->cluster_info) {
493 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
494 goto checks;
495 }
496
210 if (unlikely(!si->cluster_nr--)) { 497 if (unlikely(!si->cluster_nr--)) {
211 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 498 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 499 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 500 goto checks;
214 } 501 }
215 if (si->flags & SWP_PAGE_DISCARD) { 502
216 /*
217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on
219 * (we scan without swap_lock to allow preemption).
220 * It's hardly conceivable that cluster_nr could be
221 * wrapped during our scan, but don't depend on it.
222 */
223 if (si->lowest_alloc)
224 goto checks;
225 si->lowest_alloc = si->max;
226 si->highest_alloc = 0;
227 }
228 spin_unlock(&si->lock); 503 spin_unlock(&si->lock);
229 504
230 /* 505 /*
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
248 offset -= SWAPFILE_CLUSTER - 1; 523 offset -= SWAPFILE_CLUSTER - 1;
249 si->cluster_next = offset; 524 si->cluster_next = offset;
250 si->cluster_nr = SWAPFILE_CLUSTER - 1; 525 si->cluster_nr = SWAPFILE_CLUSTER - 1;
251 found_free_cluster = 1;
252 goto checks; 526 goto checks;
253 } 527 }
254 if (unlikely(--latency_ration < 0)) { 528 if (unlikely(--latency_ration < 0)) {
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
269 offset -= SWAPFILE_CLUSTER - 1; 543 offset -= SWAPFILE_CLUSTER - 1;
270 si->cluster_next = offset; 544 si->cluster_next = offset;
271 si->cluster_nr = SWAPFILE_CLUSTER - 1; 545 si->cluster_nr = SWAPFILE_CLUSTER - 1;
272 found_free_cluster = 1;
273 goto checks; 546 goto checks;
274 } 547 }
275 if (unlikely(--latency_ration < 0)) { 548 if (unlikely(--latency_ration < 0)) {
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
281 offset = scan_base; 554 offset = scan_base;
282 spin_lock(&si->lock); 555 spin_lock(&si->lock);
283 si->cluster_nr = SWAPFILE_CLUSTER - 1; 556 si->cluster_nr = SWAPFILE_CLUSTER - 1;
284 si->lowest_alloc = 0;
285 } 557 }
286 558
287checks: 559checks:
560 if (si->cluster_info) {
561 while (scan_swap_map_ssd_cluster_conflict(si, offset))
562 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
563 }
288 if (!(si->flags & SWP_WRITEOK)) 564 if (!(si->flags & SWP_WRITEOK))
289 goto no_page; 565 goto no_page;
290 if (!si->highest_bit) 566 if (!si->highest_bit)
@@ -317,62 +593,10 @@ checks:
317 si->highest_bit = 0; 593 si->highest_bit = 0;
318 } 594 }
319 si->swap_map[offset] = usage; 595 si->swap_map[offset] = usage;
596 inc_cluster_info_page(si, si->cluster_info, offset);
320 si->cluster_next = offset + 1; 597 si->cluster_next = offset + 1;
321 si->flags -= SWP_SCANNING; 598 si->flags -= SWP_SCANNING;
322 599
323 if (si->lowest_alloc) {
324 /*
325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed.
327 */
328 if (found_free_cluster) {
329 /*
330 * To optimize wear-levelling, discard the
331 * old data of the cluster, taking care not to
332 * discard any of its pages that have already
333 * been allocated by racing tasks (offset has
334 * already stepped over any at the beginning).
335 */
336 if (offset < si->highest_alloc &&
337 si->lowest_alloc <= last_in_cluster)
338 last_in_cluster = si->lowest_alloc - 1;
339 si->flags |= SWP_DISCARDING;
340 spin_unlock(&si->lock);
341
342 if (offset < last_in_cluster)
343 discard_swap_cluster(si, offset,
344 last_in_cluster - offset + 1);
345
346 spin_lock(&si->lock);
347 si->lowest_alloc = 0;
348 si->flags &= ~SWP_DISCARDING;
349
350 smp_mb(); /* wake_up_bit advises this */
351 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
352
353 } else if (si->flags & SWP_DISCARDING) {
354 /*
355 * Delay using pages allocated by racing tasks
356 * until the whole discard has been issued. We
357 * could defer that delay until swap_writepage,
358 * but it's easier to keep this self-contained.
359 */
360 spin_unlock(&si->lock);
361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
362 wait_for_discard, TASK_UNINTERRUPTIBLE);
363 spin_lock(&si->lock);
364 } else {
365 /*
366 * Note pages allocated by racing tasks while
367 * scan for a free cluster is in progress, so
368 * that its final discard can exclude them.
369 */
370 if (offset < si->lowest_alloc)
371 si->lowest_alloc = offset;
372 if (offset > si->highest_alloc)
373 si->highest_alloc = offset;
374 }
375 }
376 return offset; 600 return offset;
377 601
378scan: 602scan:
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
527 return p; 751 return p;
528 752
529bad_free: 753bad_free:
530 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 754 pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
531 goto out; 755 goto out;
532bad_offset: 756bad_offset:
533 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 757 pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
534 goto out; 758 goto out;
535bad_device: 759bad_device:
536 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 760 pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
537 goto out; 761 goto out;
538bad_nofile: 762bad_nofile:
539 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 763 pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
540out: 764out:
541 return NULL; 765 return NULL;
542} 766}
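The swap_info_get() error paths switch from open-coded printk(KERN_ERR ...) to pr_err(), which is only a convenience wrapper; approximately:

        /* From include/linux/printk.h (approximate; pr_fmt() defaults to empty). */
        #define pr_err(fmt, ...)  printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
        #define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)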
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
600 824
601 /* free if no reference */ 825 /* free if no reference */
602 if (!usage) { 826 if (!usage) {
827 dec_cluster_info_page(p, p->cluster_info, offset);
603 if (offset < p->lowest_bit) 828 if (offset < p->lowest_bit)
604 p->lowest_bit = offset; 829 p->lowest_bit = offset;
605 if (offset > p->highest_bit) 830 if (offset > p->highest_bit)
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1107 else 1332 else
1108 continue; 1333 continue;
1109 } 1334 }
1110 count = si->swap_map[i]; 1335 count = ACCESS_ONCE(si->swap_map[i]);
1111 if (count && swap_count(count) != SWAP_MAP_BAD) 1336 if (count && swap_count(count) != SWAP_MAP_BAD)
1112 break; 1337 break;
1113 } 1338 }
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
1127{ 1352{
1128 struct swap_info_struct *si = swap_info[type]; 1353 struct swap_info_struct *si = swap_info[type];
1129 struct mm_struct *start_mm; 1354 struct mm_struct *start_mm;
1130 unsigned char *swap_map; 1355 volatile unsigned char *swap_map; /* swap_map is accessed without
1356 * locking. Mark it as volatile
1357 * to prevent compiler doing
1358 * something odd.
1359 */
1131 unsigned char swcount; 1360 unsigned char swcount;
1132 struct page *page; 1361 struct page *page;
1133 swp_entry_t entry; 1362 swp_entry_t entry;
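Both hunks above harden lockless readers of swap_map[]: find_next_to_unuse() reads each count through ACCESS_ONCE() and try_to_unuse() types its local pointer volatile, so the compiler cannot cache or silently re-read the value around the checks. For reference, ACCESS_ONCE() is just a volatile cast:

        /* include/linux/compiler.h in this era */
        #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))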
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
1178 * reused since sys_swapoff() already disabled 1407 * reused since sys_swapoff() already disabled
1179 * allocation from here, or alloc_page() failed. 1408 * allocation from here, or alloc_page() failed.
1180 */ 1409 */
1181 if (!*swap_map) 1410 swcount = *swap_map;
1411 /*
1412 * We don't hold the lock here, so the swap entry could be
1413 * SWAP_MAP_BAD (when the cluster is being discarded).
1414 * Instead of failing out, we can just skip the swap
1415 * entry because swapoff will wait for the discard to
1416 * finish anyway.
1417 */
1418 if (!swcount || swcount == SWAP_MAP_BAD)
1182 continue; 1419 continue;
1183 retval = -ENOMEM; 1420 retval = -ENOMEM;
1184 break; 1421 break;
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1524} 1761}
1525 1762
1526static void _enable_swap_info(struct swap_info_struct *p, int prio, 1763static void _enable_swap_info(struct swap_info_struct *p, int prio,
1527 unsigned char *swap_map) 1764 unsigned char *swap_map,
1765 struct swap_cluster_info *cluster_info)
1528{ 1766{
1529 int i, prev; 1767 int i, prev;
1530 1768
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1533 else 1771 else
1534 p->prio = --least_priority; 1772 p->prio = --least_priority;
1535 p->swap_map = swap_map; 1773 p->swap_map = swap_map;
1774 p->cluster_info = cluster_info;
1536 p->flags |= SWP_WRITEOK; 1775 p->flags |= SWP_WRITEOK;
1537 atomic_long_add(p->pages, &nr_swap_pages); 1776 atomic_long_add(p->pages, &nr_swap_pages);
1538 total_swap_pages += p->pages; 1777 total_swap_pages += p->pages;
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1553 1792
1554static void enable_swap_info(struct swap_info_struct *p, int prio, 1793static void enable_swap_info(struct swap_info_struct *p, int prio,
1555 unsigned char *swap_map, 1794 unsigned char *swap_map,
1795 struct swap_cluster_info *cluster_info,
1556 unsigned long *frontswap_map) 1796 unsigned long *frontswap_map)
1557{ 1797{
1558 frontswap_init(p->type, frontswap_map); 1798 frontswap_init(p->type, frontswap_map);
1559 spin_lock(&swap_lock); 1799 spin_lock(&swap_lock);
1560 spin_lock(&p->lock); 1800 spin_lock(&p->lock);
1561 _enable_swap_info(p, prio, swap_map); 1801 _enable_swap_info(p, prio, swap_map, cluster_info);
1562 spin_unlock(&p->lock); 1802 spin_unlock(&p->lock);
1563 spin_unlock(&swap_lock); 1803 spin_unlock(&swap_lock);
1564} 1804}
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p)
1567{ 1807{
1568 spin_lock(&swap_lock); 1808 spin_lock(&swap_lock);
1569 spin_lock(&p->lock); 1809 spin_lock(&p->lock);
1570 _enable_swap_info(p, p->prio, p->swap_map); 1810 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
1571 spin_unlock(&p->lock); 1811 spin_unlock(&p->lock);
1572 spin_unlock(&swap_lock); 1812 spin_unlock(&swap_lock);
1573} 1813}
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1576{ 1816{
1577 struct swap_info_struct *p = NULL; 1817 struct swap_info_struct *p = NULL;
1578 unsigned char *swap_map; 1818 unsigned char *swap_map;
1819 struct swap_cluster_info *cluster_info;
1579 unsigned long *frontswap_map; 1820 unsigned long *frontswap_map;
1580 struct file *swap_file, *victim; 1821 struct file *swap_file, *victim;
1581 struct address_space *mapping; 1822 struct address_space *mapping;
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1651 goto out_dput; 1892 goto out_dput;
1652 } 1893 }
1653 1894
1895 flush_work(&p->discard_work);
1896
1654 destroy_swap_extents(p); 1897 destroy_swap_extents(p);
1655 if (p->flags & SWP_CONTINUED) 1898 if (p->flags & SWP_CONTINUED)
1656 free_swap_count_continuations(p); 1899 free_swap_count_continuations(p);
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1675 p->max = 0; 1918 p->max = 0;
1676 swap_map = p->swap_map; 1919 swap_map = p->swap_map;
1677 p->swap_map = NULL; 1920 p->swap_map = NULL;
1921 cluster_info = p->cluster_info;
1922 p->cluster_info = NULL;
1678 p->flags = 0; 1923 p->flags = 0;
1679 frontswap_map = frontswap_map_get(p); 1924 frontswap_map = frontswap_map_get(p);
1680 frontswap_map_set(p, NULL); 1925 frontswap_map_set(p, NULL);
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1682 spin_unlock(&swap_lock); 1927 spin_unlock(&swap_lock);
1683 frontswap_invalidate_area(type); 1928 frontswap_invalidate_area(type);
1684 mutex_unlock(&swapon_mutex); 1929 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL;
1685 vfree(swap_map); 1932 vfree(swap_map);
1933 vfree(cluster_info);
1686 vfree(frontswap_map); 1934 vfree(frontswap_map);
1687 /* Destroy swap account informatin */ 1935 /* Destroy swap account informatin */
1688 swap_cgroup_swapoff(type); 1936 swap_cgroup_swapoff(type);
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1926 int i; 2174 int i;
1927 unsigned long maxpages; 2175 unsigned long maxpages;
1928 unsigned long swapfilepages; 2176 unsigned long swapfilepages;
2177 unsigned long last_page;
1929 2178
1930 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 2179 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1931 printk(KERN_ERR "Unable to find swap-space signature\n"); 2180 pr_err("Unable to find swap-space signature\n");
1932 return 0; 2181 return 0;
1933 } 2182 }
1934 2183
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1942 } 2191 }
1943 /* Check the swap header's sub-version */ 2192 /* Check the swap header's sub-version */
1944 if (swap_header->info.version != 1) { 2193 if (swap_header->info.version != 1) {
1945 printk(KERN_WARNING 2194 pr_warn("Unable to handle swap header version %d\n",
1946 "Unable to handle swap header version %d\n", 2195 swap_header->info.version);
1947 swap_header->info.version);
1948 return 0; 2196 return 0;
1949 } 2197 }
1950 2198
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1968 */ 2216 */
1969 maxpages = swp_offset(pte_to_swp_entry( 2217 maxpages = swp_offset(pte_to_swp_entry(
1970 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 2218 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1971 if (maxpages > swap_header->info.last_page) { 2219 last_page = swap_header->info.last_page;
1972 maxpages = swap_header->info.last_page + 1; 2220 if (last_page > maxpages) {
2221 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2222 maxpages << (PAGE_SHIFT - 10),
2223 last_page << (PAGE_SHIFT - 10));
2224 }
2225 if (maxpages > last_page) {
2226 maxpages = last_page + 1;
1973 /* p->max is an unsigned int: don't overflow it */ 2227 /* p->max is an unsigned int: don't overflow it */
1974 if ((unsigned int)maxpages == 0) 2228 if ((unsigned int)maxpages == 0)
1975 maxpages = UINT_MAX; 2229 maxpages = UINT_MAX;
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1980 return 0; 2234 return 0;
1981 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 2235 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1982 if (swapfilepages && maxpages > swapfilepages) { 2236 if (swapfilepages && maxpages > swapfilepages) {
1983 printk(KERN_WARNING 2237 pr_warn("Swap area shorter than signature indicates\n");
1984 "Swap area shorter than signature indicates\n");
1985 return 0; 2238 return 0;
1986 } 2239 }
1987 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 2240 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1995static int setup_swap_map_and_extents(struct swap_info_struct *p, 2248static int setup_swap_map_and_extents(struct swap_info_struct *p,
1996 union swap_header *swap_header, 2249 union swap_header *swap_header,
1997 unsigned char *swap_map, 2250 unsigned char *swap_map,
2251 struct swap_cluster_info *cluster_info,
1998 unsigned long maxpages, 2252 unsigned long maxpages,
1999 sector_t *span) 2253 sector_t *span)
2000{ 2254{
2001 int i; 2255 int i;
2002 unsigned int nr_good_pages; 2256 unsigned int nr_good_pages;
2003 int nr_extents; 2257 int nr_extents;
2258 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2259 unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
2004 2260
2005 nr_good_pages = maxpages - 1; /* omit header page */ 2261 nr_good_pages = maxpages - 1; /* omit header page */
2006 2262
2263 cluster_set_null(&p->free_cluster_head);
2264 cluster_set_null(&p->free_cluster_tail);
2265 cluster_set_null(&p->discard_cluster_head);
2266 cluster_set_null(&p->discard_cluster_tail);
2267
2007 for (i = 0; i < swap_header->info.nr_badpages; i++) { 2268 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2008 unsigned int page_nr = swap_header->info.badpages[i]; 2269 unsigned int page_nr = swap_header->info.badpages[i];
2009 if (page_nr == 0 || page_nr > swap_header->info.last_page) 2270 if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2011 if (page_nr < maxpages) { 2272 if (page_nr < maxpages) {
2012 swap_map[page_nr] = SWAP_MAP_BAD; 2273 swap_map[page_nr] = SWAP_MAP_BAD;
2013 nr_good_pages--; 2274 nr_good_pages--;
2275 /*
2276 * Haven't marked the cluster free yet, no list
2277 * operation involved
2278 */
2279 inc_cluster_info_page(p, cluster_info, page_nr);
2014 } 2280 }
2015 } 2281 }
2016 2282
2283 /* Haven't marked the cluster free yet, no list operation involved */
2284 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2285 inc_cluster_info_page(p, cluster_info, i);
2286
2017 if (nr_good_pages) { 2287 if (nr_good_pages) {
2018 swap_map[0] = SWAP_MAP_BAD; 2288 swap_map[0] = SWAP_MAP_BAD;
2289 /*
2290 * Haven't marked the cluster free yet, no list
2291 * operation involved
2292 */
2293 inc_cluster_info_page(p, cluster_info, 0);
2019 p->max = maxpages; 2294 p->max = maxpages;
2020 p->pages = nr_good_pages; 2295 p->pages = nr_good_pages;
2021 nr_extents = setup_swap_extents(p, span); 2296 nr_extents = setup_swap_extents(p, span);
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2024 nr_good_pages = p->pages; 2299 nr_good_pages = p->pages;
2025 } 2300 }
2026 if (!nr_good_pages) { 2301 if (!nr_good_pages) {
2027 printk(KERN_WARNING "Empty swap-file\n"); 2302 pr_warn("Empty swap-file\n");
2028 return -EINVAL; 2303 return -EINVAL;
2029 } 2304 }
2030 2305
2306 if (!cluster_info)
2307 return nr_extents;
2308
2309 for (i = 0; i < nr_clusters; i++) {
2310 if (!cluster_count(&cluster_info[idx])) {
2311 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2312 if (cluster_is_null(&p->free_cluster_head)) {
2313 cluster_set_next_flag(&p->free_cluster_head,
2314 idx, 0);
2315 cluster_set_next_flag(&p->free_cluster_tail,
2316 idx, 0);
2317 } else {
2318 unsigned int tail;
2319
2320 tail = cluster_next(&p->free_cluster_tail);
2321 cluster_set_next(&cluster_info[tail], idx);
2322 cluster_set_next_flag(&p->free_cluster_tail,
2323 idx, 0);
2324 }
2325 }
2326 idx++;
2327 if (idx == nr_clusters)
2328 idx = 0;
2329 }
2031 return nr_extents; 2330 return nr_extents;
2032} 2331}
2033 2332
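The loop above threads every still-empty cluster onto the free list, starting at the cluster that contains cluster_next so allocation resumes near the randomly chosen start and wraps around. The underlying idea of a linked list stored as indices into an array, as a runnable userspace sketch with made-up counts:

        #include <stdio.h>

        #define NCLUSTERS 8
        #define NIL       -1

        int main(void)
        {
                int count[NCLUSTERS] = { 0, 3, 0, 0, 2, 0, 0, 0 }; /* in-use counts */
                int next[NCLUSTERS];                               /* "pointer" = index */
                int head = NIL, tail = NIL;

                for (int idx = 0; idx < NCLUSTERS; idx++) {
                        if (count[idx])                 /* cluster has used slots: skip */
                                continue;
                        next[idx] = NIL;
                        if (head == NIL) {
                                head = tail = idx;      /* first free cluster */
                        } else {
                                next[tail] = idx;       /* append at the tail */
                                tail = idx;
                        }
                }
                for (int i = head; i != NIL; i = next[i])
                        printf("free cluster %d\n", i);
                return 0;
        }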
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2059 sector_t span; 2358 sector_t span;
2060 unsigned long maxpages; 2359 unsigned long maxpages;
2061 unsigned char *swap_map = NULL; 2360 unsigned char *swap_map = NULL;
2361 struct swap_cluster_info *cluster_info = NULL;
2062 unsigned long *frontswap_map = NULL; 2362 unsigned long *frontswap_map = NULL;
2063 struct page *page = NULL; 2363 struct page *page = NULL;
2064 struct inode *inode = NULL; 2364 struct inode *inode = NULL;
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2073 if (IS_ERR(p)) 2373 if (IS_ERR(p))
2074 return PTR_ERR(p); 2374 return PTR_ERR(p);
2075 2375
2376 INIT_WORK(&p->discard_work, swap_discard_work);
2377
2076 name = getname(specialfile); 2378 name = getname(specialfile);
2077 if (IS_ERR(name)) { 2379 if (IS_ERR(name)) {
2078 error = PTR_ERR(name); 2380 error = PTR_ERR(name);
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2132 error = -ENOMEM; 2434 error = -ENOMEM;
2133 goto bad_swap; 2435 goto bad_swap;
2134 } 2436 }
2437 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2438 p->flags |= SWP_SOLIDSTATE;
2439 /*
2440 * select a random position to start with, to help SSD wear
2441 * leveling
2442 */
2443 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2444
2445 cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
2446 SWAPFILE_CLUSTER) * sizeof(*cluster_info));
2447 if (!cluster_info) {
2448 error = -ENOMEM;
2449 goto bad_swap;
2450 }
2451 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2452 if (!p->percpu_cluster) {
2453 error = -ENOMEM;
2454 goto bad_swap;
2455 }
2456 for_each_possible_cpu(i) {
2457 struct percpu_cluster *cluster;
2458 cluster = per_cpu_ptr(p->percpu_cluster, i);
2459 cluster_set_null(&cluster->index);
2460 }
2461 }
2135 2462
2136 error = swap_cgroup_swapon(p->type, maxpages); 2463 error = swap_cgroup_swapon(p->type, maxpages);
2137 if (error) 2464 if (error)
2138 goto bad_swap; 2465 goto bad_swap;
2139 2466
2140 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 2467 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2141 maxpages, &span); 2468 cluster_info, maxpages, &span);
2142 if (unlikely(nr_extents < 0)) { 2469 if (unlikely(nr_extents < 0)) {
2143 error = nr_extents; 2470 error = nr_extents;
2144 goto bad_swap; 2471 goto bad_swap;
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2147 if (frontswap_enabled) 2474 if (frontswap_enabled)
2148 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); 2475 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2149 2476
2150 if (p->bdev) { 2477 if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2151 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2478 /*
2152 p->flags |= SWP_SOLIDSTATE; 2479 * When discard is enabled for swap with no particular
2153 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2480 * policy flagged, we set all swap discard flags here in
2154 } 2481 * order to sustain backward compatibility with older
2155 2482 * swapon(8) releases.
2156 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 2483 */
2157 /* 2484 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2158 * When discard is enabled for swap with no particular 2485 SWP_PAGE_DISCARD);
2159 * policy flagged, we set all swap discard flags here in
2160 * order to sustain backward compatibility with older
2161 * swapon(8) releases.
2162 */
2163 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2164 SWP_PAGE_DISCARD);
2165 2486
2166 /* 2487 /*
2167 * By flagging sys_swapon, a sysadmin can tell us to 2488 * By flagging sys_swapon, a sysadmin can tell us to
2168 * either do single-time area discards only, or to just 2489 * either do single-time area discards only, or to just
2169 * perform discards for released swap page-clusters. 2490 * perform discards for released swap page-clusters.
2170 * Now it's time to adjust the p->flags accordingly. 2491 * Now it's time to adjust the p->flags accordingly.
2171 */ 2492 */
2172 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 2493 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2173 p->flags &= ~SWP_PAGE_DISCARD; 2494 p->flags &= ~SWP_PAGE_DISCARD;
2174 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 2495 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2175 p->flags &= ~SWP_AREA_DISCARD; 2496 p->flags &= ~SWP_AREA_DISCARD;
2176 2497
2177 /* issue a swapon-time discard if it's still required */ 2498 /* issue a swapon-time discard if it's still required */
2178 if (p->flags & SWP_AREA_DISCARD) { 2499 if (p->flags & SWP_AREA_DISCARD) {
2179 int err = discard_swap(p); 2500 int err = discard_swap(p);
2180 if (unlikely(err)) 2501 if (unlikely(err))
2181 printk(KERN_ERR 2502 pr_err("swapon: discard_swap(%p): %d\n",
2182 "swapon: discard_swap(%p): %d\n", 2503 p, err);
2183 p, err);
2184 }
2185 } 2504 }
2186 } 2505 }
2187 2506
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2190 if (swap_flags & SWAP_FLAG_PREFER) 2509 if (swap_flags & SWAP_FLAG_PREFER)
2191 prio = 2510 prio =
2192 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2511 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2193 enable_swap_info(p, prio, swap_map, frontswap_map); 2512 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
2194 2513
2195 printk(KERN_INFO "Adding %uk swap on %s. " 2514 pr_info("Adding %uk swap on %s. "
2196 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 2515 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2197 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2516 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2198 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2517 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2211 error = 0; 2530 error = 0;
2212 goto out; 2531 goto out;
2213bad_swap: 2532bad_swap:
2533 free_percpu(p->percpu_cluster);
2534 p->percpu_cluster = NULL;
2214 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2535 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2215 set_blocksize(p->bdev, p->old_block_size); 2536 set_blocksize(p->bdev, p->old_block_size);
2216 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2537 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2222,6 +2543,7 @@ bad_swap:
2222 p->flags = 0; 2543 p->flags = 0;
2223 spin_unlock(&swap_lock); 2544 spin_unlock(&swap_lock);
2224 vfree(swap_map); 2545 vfree(swap_map);
2546 vfree(cluster_info);
2225 if (swap_file) { 2547 if (swap_file) {
2226 if (inode && S_ISREG(inode->i_mode)) { 2548 if (inode && S_ISREG(inode->i_mode)) {
2227 mutex_unlock(&inode->i_mutex); 2549 mutex_unlock(&inode->i_mutex);
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2291 goto unlock_out; 2613 goto unlock_out;
2292 2614
2293 count = p->swap_map[offset]; 2615 count = p->swap_map[offset];
2616
2617 /*
2618 * swapin_readahead() doesn't check if a swap entry is valid, so the
2619 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
2620 */
2621 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2622 err = -ENOENT;
2623 goto unlock_out;
2624 }
2625
2294 has_cache = count & SWAP_HAS_CACHE; 2626 has_cache = count & SWAP_HAS_CACHE;
2295 count &= ~SWAP_HAS_CACHE; 2627 count &= ~SWAP_HAS_CACHE;
2296 err = 0; 2628 err = 0;
@@ -2326,7 +2658,7 @@ out:
2326 return err; 2658 return err;
2327 2659
2328bad_file: 2660bad_file:
2329 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2661 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
2330 goto out; 2662 goto out;
2331} 2663}
2332 2664
diff --git a/mm/util.c b/mm/util.c
index 7441c41d00f6..eaf63fc2c92f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page)
388 struct address_space *mapping = page->mapping; 388 struct address_space *mapping = page->mapping;
389 389
390 VM_BUG_ON(PageSlab(page)); 390 VM_BUG_ON(PageSlab(page));
391#ifdef CONFIG_SWAP
392 if (unlikely(PageSwapCache(page))) { 391 if (unlikely(PageSwapCache(page))) {
393 swp_entry_t entry; 392 swp_entry_t entry;
394 393
395 entry.val = page_private(page); 394 entry.val = page_private(page);
396 mapping = swap_address_space(entry); 395 mapping = swap_address_space(entry);
397 } else 396 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
398#endif
399 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
400 mapping = NULL; 397 mapping = NULL;
401 return mapping; 398 return mapping;
402} 399}
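Dropping the CONFIG_SWAP ifdef is safe presumably because PageSwapCache() evaluates to a constant 0 without CONFIG_SWAP, letting the compiler discard the swap branch. The anon test still works because anonymous pages store an anon_vma pointer in page->mapping with the low bit set; the convention being tested (see include/linux/mm.h, shown as a sketch):

        #define PAGE_MAPPING_ANON 1
        /*
         * For an anonymous page, page->mapping holds
         *     (void *)anon_vma + PAGE_MAPPING_ANON,
         * so testing the low bit distinguishes anon pages from file pages, and
         * such a "mapping" must never be dereferenced as an address_space.
         */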
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13a54953a273..107454312d5e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -752,7 +752,6 @@ struct vmap_block_queue {
752struct vmap_block { 752struct vmap_block {
753 spinlock_t lock; 753 spinlock_t lock;
754 struct vmap_area *va; 754 struct vmap_area *va;
755 struct vmap_block_queue *vbq;
756 unsigned long free, dirty; 755 unsigned long free, dirty;
757 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 756 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
758 struct list_head free_list; 757 struct list_head free_list;
@@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
830 radix_tree_preload_end(); 829 radix_tree_preload_end();
831 830
832 vbq = &get_cpu_var(vmap_block_queue); 831 vbq = &get_cpu_var(vmap_block_queue);
833 vb->vbq = vbq;
834 spin_lock(&vbq->lock); 832 spin_lock(&vbq->lock);
835 list_add_rcu(&vb->free_list, &vbq->free); 833 list_add_rcu(&vb->free_list, &vbq->free);
836 spin_unlock(&vbq->lock); 834 spin_unlock(&vbq->lock);
@@ -1018,15 +1016,16 @@ void vm_unmap_aliases(void)
1018 1016
1019 rcu_read_lock(); 1017 rcu_read_lock();
1020 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1018 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1021 int i; 1019 int i, j;
1022 1020
1023 spin_lock(&vb->lock); 1021 spin_lock(&vb->lock);
1024 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1022 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
1025 while (i < VMAP_BBMAP_BITS) { 1023 if (i < VMAP_BBMAP_BITS) {
1026 unsigned long s, e; 1024 unsigned long s, e;
1027 int j; 1025
1028 j = find_next_zero_bit(vb->dirty_map, 1026 j = find_last_bit(vb->dirty_map,
1029 VMAP_BBMAP_BITS, i); 1027 VMAP_BBMAP_BITS);
1028 j = j + 1; /* need exclusive index */
1030 1029
1031 s = vb->va->va_start + (i << PAGE_SHIFT); 1030 s = vb->va->va_start + (i << PAGE_SHIFT);
1032 e = vb->va->va_start + (j << PAGE_SHIFT); 1031 e = vb->va->va_start + (j << PAGE_SHIFT);
@@ -1036,10 +1035,6 @@ void vm_unmap_aliases(void)
1036 start = s; 1035 start = s;
1037 if (e > end) 1036 if (e > end)
1038 end = e; 1037 end = e;
1039
1040 i = j;
1041 i = find_next_bit(vb->dirty_map,
1042 VMAP_BBMAP_BITS, i);
1043 } 1038 }
1044 spin_unlock(&vb->lock); 1039 spin_unlock(&vb->lock);
1045 } 1040 }
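Instead of walking every dirty sub-range of a vmap block, the new code computes one conservative range from the first to the last dirty bit: it may cover clean pages in between, but it needs only two bitmap scans (find_first_bit()/find_last_bit()). The same idea as a runnable userspace sketch with a toy bitmap:

        #include <stdio.h>

        int main(void)
        {
                unsigned long dirty = 0;
                int nbits = 16;

                dirty |= 1UL << 3;      /* pages 3 and 9 are dirty */
                dirty |= 1UL << 9;

                int first = -1, last = -1;
                for (int i = 0; i < nbits; i++)
                        if (dirty & (1UL << i)) {
                                if (first < 0)
                                        first = i;
                                last = i;
                        }
                if (first >= 0)         /* one conservative flush range */
                        printf("flush pages [%d, %d)\n", first, last + 1);
                return 0;
        }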
@@ -1263,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1263int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1258int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1264{ 1259{
1265 unsigned long addr = (unsigned long)area->addr; 1260 unsigned long addr = (unsigned long)area->addr;
1266 unsigned long end = addr + area->size - PAGE_SIZE; 1261 unsigned long end = addr + get_vm_area_size(area);
1267 int err; 1262 int err;
1268 1263
1269 err = vmap_page_range(addr, end, prot, *pages); 1264 err = vmap_page_range(addr, end, prot, *pages);
@@ -1558,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1558 unsigned int nr_pages, array_size, i; 1553 unsigned int nr_pages, array_size, i;
1559 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1554 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1560 1555
1561 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1556 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1562 array_size = (nr_pages * sizeof(struct page *)); 1557 array_size = (nr_pages * sizeof(struct page *));
1563 1558
1564 area->nr_pages = nr_pages; 1559 area->nr_pages = nr_pages;
@@ -1990,7 +1985,7 @@ long vread(char *buf, char *addr, unsigned long count)
1990 1985
1991 vm = va->vm; 1986 vm = va->vm;
1992 vaddr = (char *) vm->addr; 1987 vaddr = (char *) vm->addr;
1993 if (addr >= vaddr + vm->size - PAGE_SIZE) 1988 if (addr >= vaddr + get_vm_area_size(vm))
1994 continue; 1989 continue;
1995 while (addr < vaddr) { 1990 while (addr < vaddr) {
1996 if (count == 0) 1991 if (count == 0)
@@ -2000,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count)
2000 addr++; 1995 addr++;
2001 count--; 1996 count--;
2002 } 1997 }
2003 n = vaddr + vm->size - PAGE_SIZE - addr; 1998 n = vaddr + get_vm_area_size(vm) - addr;
2004 if (n > count) 1999 if (n > count)
2005 n = count; 2000 n = count;
2006 if (!(vm->flags & VM_IOREMAP)) 2001 if (!(vm->flags & VM_IOREMAP))
@@ -2072,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2072 2067
2073 vm = va->vm; 2068 vm = va->vm;
2074 vaddr = (char *) vm->addr; 2069 vaddr = (char *) vm->addr;
2075 if (addr >= vaddr + vm->size - PAGE_SIZE) 2070 if (addr >= vaddr + get_vm_area_size(vm))
2076 continue; 2071 continue;
2077 while (addr < vaddr) { 2072 while (addr < vaddr) {
2078 if (count == 0) 2073 if (count == 0)
@@ -2081,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2081 addr++; 2076 addr++;
2082 count--; 2077 count--;
2083 } 2078 }
2084 n = vaddr + vm->size - PAGE_SIZE - addr; 2079 n = vaddr + get_vm_area_size(vm) - addr;
2085 if (n > count) 2080 if (n > count)
2086 n = count; 2081 n = count;
2087 if (!(vm->flags & VM_IOREMAP)) { 2082 if (!(vm->flags & VM_IOREMAP)) {
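map_vm_area(), __vmalloc_area_node(), vread() and vwrite() all open-coded "area->size - PAGE_SIZE" to skip the guard page; they now share a helper. Its presumed definition (from include/linux/vmalloc.h, shown as a sketch):

        static inline size_t get_vm_area_size(const struct vm_struct *area)
        {
                /* the size field includes the trailing guard page */
                return area->size - PAGE_SIZE;
        }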
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e36454220614..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
146} 146}
147#endif 147#endif
148 148
149unsigned long zone_reclaimable_pages(struct zone *zone)
150{
151 int nr;
152
153 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
154 zone_page_state(zone, NR_INACTIVE_FILE);
155
156 if (get_nr_swap_pages() > 0)
157 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
158 zone_page_state(zone, NR_INACTIVE_ANON);
159
160 return nr;
161}
162
163bool zone_reclaimable(struct zone *zone)
164{
165 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
166}
167
149static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 168static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
150{ 169{
151 if (!mem_cgroup_disabled()) 170 if (!mem_cgroup_disabled())
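The helpers above let callers test reclaimability directly instead of consulting the zone->all_unreclaimable flag: a zone counts as reclaimable while fewer than six times its potentially reclaimable pages (file LRUs always, anon LRUs only while swap space remains) have been scanned. A runnable toy version of the check with made-up numbers:

        #include <stdbool.h>
        #include <stdio.h>

        /* Same heuristic as zone_reclaimable() in the hunk above. */
        static bool zone_reclaimable(unsigned long pages_scanned,
                                     unsigned long reclaimable_pages)
        {
                return pages_scanned < reclaimable_pages * 6;
        }

        int main(void)
        {
                /* e.g. 10000 reclaimable pages: give up only after ~60000 scans */
                printf("%d\n", zone_reclaimable(59000, 10000)); /* 1: keep trying */
                printf("%d\n", zone_reclaimable(60000, 10000)); /* 0: treat as unreclaimable */
                return 0;
        }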
@@ -579,7 +598,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
579 */ 598 */
580void putback_lru_page(struct page *page) 599void putback_lru_page(struct page *page)
581{ 600{
582 int lru; 601 bool is_unevictable;
583 int was_unevictable = PageUnevictable(page); 602 int was_unevictable = PageUnevictable(page);
584 603
585 VM_BUG_ON(PageLRU(page)); 604 VM_BUG_ON(PageLRU(page));
@@ -594,14 +613,14 @@ redo:
594 * unevictable page on [in]active list. 613 * unevictable page on [in]active list.
595 * We know how to handle that. 614 * We know how to handle that.
596 */ 615 */
597 lru = page_lru_base_type(page); 616 is_unevictable = false;
598 lru_cache_add(page); 617 lru_cache_add(page);
599 } else { 618 } else {
600 /* 619 /*
601 * Put unevictable pages directly on zone's unevictable 620 * Put unevictable pages directly on zone's unevictable
602 * list. 621 * list.
603 */ 622 */
604 lru = LRU_UNEVICTABLE; 623 is_unevictable = true;
605 add_page_to_unevictable_list(page); 624 add_page_to_unevictable_list(page);
606 /* 625 /*
607 * When racing with an mlock or AS_UNEVICTABLE clearing 626 * When racing with an mlock or AS_UNEVICTABLE clearing
@@ -621,7 +640,7 @@ redo:
621 * page is on unevictable list, it never be freed. To avoid that, 640 * page is on unevictable list, it never be freed. To avoid that,
622 * check after we added it to the list, again. 641 * check after we added it to the list, again.
623 */ 642 */
624 if (lru == LRU_UNEVICTABLE && page_evictable(page)) { 643 if (is_unevictable && page_evictable(page)) {
625 if (!isolate_lru_page(page)) { 644 if (!isolate_lru_page(page)) {
626 put_page(page); 645 put_page(page);
627 goto redo; 646 goto redo;
@@ -632,9 +651,9 @@ redo:
632 */ 651 */
633 } 652 }
634 653
635 if (was_unevictable && lru != LRU_UNEVICTABLE) 654 if (was_unevictable && !is_unevictable)
636 count_vm_event(UNEVICTABLE_PGRESCUED); 655 count_vm_event(UNEVICTABLE_PGRESCUED);
637 else if (!was_unevictable && lru == LRU_UNEVICTABLE) 656 else if (!was_unevictable && is_unevictable)
638 count_vm_event(UNEVICTABLE_PGCULLED); 657 count_vm_event(UNEVICTABLE_PGCULLED);
639 658
640 put_page(page); /* drop ref from isolate */ 659 put_page(page); /* drop ref from isolate */
@@ -1823,7 +1842,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1823 * latencies, so it's better to scan a minimum amount there as 1842 * latencies, so it's better to scan a minimum amount there as
1824 * well. 1843 * well.
1825 */ 1844 */
1826 if (current_is_kswapd() && zone->all_unreclaimable) 1845 if (current_is_kswapd() && !zone_reclaimable(zone))
1827 force_scan = true; 1846 force_scan = true;
1828 if (!global_reclaim(sc)) 1847 if (!global_reclaim(sc))
1829 force_scan = true; 1848 force_scan = true;
@@ -2278,8 +2297,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2278 if (global_reclaim(sc)) { 2297 if (global_reclaim(sc)) {
2279 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2298 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2280 continue; 2299 continue;
2281 if (zone->all_unreclaimable && 2300 if (sc->priority != DEF_PRIORITY &&
2282 sc->priority != DEF_PRIORITY) 2301 !zone_reclaimable(zone))
2283 continue; /* Let kswapd poll it */ 2302 continue; /* Let kswapd poll it */
2284 if (IS_ENABLED(CONFIG_COMPACTION)) { 2303 if (IS_ENABLED(CONFIG_COMPACTION)) {
2285 /* 2304 /*
@@ -2317,11 +2336,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2317 return aborted_reclaim; 2336 return aborted_reclaim;
2318} 2337}
2319 2338
2320static bool zone_reclaimable(struct zone *zone)
2321{
2322 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2323}
2324
2325/* All zones in zonelist are unreclaimable? */ 2339/* All zones in zonelist are unreclaimable? */
2326static bool all_unreclaimable(struct zonelist *zonelist, 2340static bool all_unreclaimable(struct zonelist *zonelist,
2327 struct scan_control *sc) 2341 struct scan_control *sc)
@@ -2335,7 +2349,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
2335 continue; 2349 continue;
2336 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2350 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2337 continue; 2351 continue;
2338 if (!zone->all_unreclaimable) 2352 if (zone_reclaimable(zone))
2339 return false; 2353 return false;
2340 } 2354 }
2341 2355
@@ -2750,7 +2764,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2750 * DEF_PRIORITY. Effectively, it considers them balanced so 2764 * DEF_PRIORITY. Effectively, it considers them balanced so
2751 * they must be considered balanced here as well! 2765 * they must be considered balanced here as well!
2752 */ 2766 */
2753 if (zone->all_unreclaimable) { 2767 if (!zone_reclaimable(zone)) {
2754 balanced_pages += zone->managed_pages; 2768 balanced_pages += zone->managed_pages;
2755 continue; 2769 continue;
2756 } 2770 }
@@ -2811,7 +2825,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
2811 unsigned long lru_pages, 2825 unsigned long lru_pages,
2812 unsigned long *nr_attempted) 2826 unsigned long *nr_attempted)
2813{ 2827{
2814 unsigned long nr_slab;
2815 int testorder = sc->order; 2828 int testorder = sc->order;
2816 unsigned long balance_gap; 2829 unsigned long balance_gap;
2817 struct reclaim_state *reclaim_state = current->reclaim_state; 2830 struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2858,15 +2871,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
2858 node_set(zone_to_nid(zone), shrink.nodes_to_scan); 2871 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2859 2872
2860 reclaim_state->reclaimed_slab = 0; 2873 reclaim_state->reclaimed_slab = 0;
2861 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2874 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2862 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2875 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2863 2876
2864 /* Account for the number of pages attempted to reclaim */ 2877 /* Account for the number of pages attempted to reclaim */
2865 *nr_attempted += sc->nr_to_reclaim; 2878 *nr_attempted += sc->nr_to_reclaim;
2866 2879
2867 if (nr_slab == 0 && !zone_reclaimable(zone))
2868 zone->all_unreclaimable = 1;
2869
2870 zone_clear_flag(zone, ZONE_WRITEBACK); 2880 zone_clear_flag(zone, ZONE_WRITEBACK);
2871 2881
2872 /* 2882 /*
@@ -2875,7 +2885,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2875 * BDIs but as pressure is relieved, speculatively avoid congestion 2885 * BDIs but as pressure is relieved, speculatively avoid congestion
2876 * waits. 2886 * waits.
2877 */ 2887 */
2878 if (!zone->all_unreclaimable && 2888 if (zone_reclaimable(zone) &&
2879 zone_balanced(zone, testorder, 0, classzone_idx)) { 2889 zone_balanced(zone, testorder, 0, classzone_idx)) {
2880 zone_clear_flag(zone, ZONE_CONGESTED); 2890 zone_clear_flag(zone, ZONE_CONGESTED);
2881 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2891 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2941,8 +2951,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2941 if (!populated_zone(zone)) 2951 if (!populated_zone(zone))
2942 continue; 2952 continue;
2943 2953
2944 if (zone->all_unreclaimable && 2954 if (sc.priority != DEF_PRIORITY &&
2945 sc.priority != DEF_PRIORITY) 2955 !zone_reclaimable(zone))
2946 continue; 2956 continue;
2947 2957
2948 /* 2958 /*
@@ -3020,8 +3030,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3020 if (!populated_zone(zone)) 3030 if (!populated_zone(zone))
3021 continue; 3031 continue;
3022 3032
3023 if (zone->all_unreclaimable && 3033 if (sc.priority != DEF_PRIORITY &&
3024 sc.priority != DEF_PRIORITY) 3034 !zone_reclaimable(zone))
3025 continue; 3035 continue;
3026 3036
3027 sc.nr_scanned = 0; 3037 sc.nr_scanned = 0;
@@ -3277,7 +3287,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3277 } 3287 }
3278 if (!waitqueue_active(&pgdat->kswapd_wait)) 3288 if (!waitqueue_active(&pgdat->kswapd_wait))
3279 return; 3289 return;
3280 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) 3290 if (zone_balanced(zone, order, 0, 0))
3281 return; 3291 return;
3282 3292
3283 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 3293 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -3305,20 +3315,6 @@ unsigned long global_reclaimable_pages(void)
3305 return nr; 3315 return nr;
3306} 3316}
3307 3317
3308unsigned long zone_reclaimable_pages(struct zone *zone)
3309{
3310 int nr;
3311
3312 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3313 zone_page_state(zone, NR_INACTIVE_FILE);
3314
3315 if (get_nr_swap_pages() > 0)
3316 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3317 zone_page_state(zone, NR_INACTIVE_ANON);
3318
3319 return nr;
3320}
3321
3322#ifdef CONFIG_HIBERNATION 3318#ifdef CONFIG_HIBERNATION
3323/* 3319/*
3324 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3320 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3615,7 +3611,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3615 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3611 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3616 return ZONE_RECLAIM_FULL; 3612 return ZONE_RECLAIM_FULL;
3617 3613
3618 if (zone->all_unreclaimable) 3614 if (!zone_reclaimable(zone))
3619 return ZONE_RECLAIM_FULL; 3615 return ZONE_RECLAIM_FULL;
3620 3616
3621 /* 3617 /*
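The two helpers removed from vmscan.c above are not simply deleted: the vmstat.c hunk below starts including <linux/mm_inline.h> and "internal.h" and keeps calling zone_reclaimable(), so their bodies evidently move to a shared location. As a reading aid, a minimal sketch of the relocated helpers, reconstructed from the removed lines (the exact new home and linkage are an assumption, not shown in this diff):

/*
 * Sketch only: bodies copied from the lines removed above. Assumes the
 * usual mm headers (linux/mm.h, linux/swap.h) and prototypes in the
 * shared "internal.h".
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	int nr;

	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
	     zone_page_state(zone, NR_INACTIVE_FILE);

	if (get_nr_swap_pages() > 0)
		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
		      zone_page_state(zone, NR_INACTIVE_ANON);

	return nr;
}

bool zone_reclaimable(struct zone *zone)
{
	/* Treat a zone as unreclaimable once six times its reclaimable
	 * pages have been scanned without freeing anything. */
	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}

With these helpers shared, every zone->all_unreclaimable test above collapses into a direct !zone_reclaimable(zone) check, which is why the nr_slab bookkeeping that only existed to set the flag in kswapd_shrink_zone() is dropped.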
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c2ef4458fa..9bb314577911 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,6 +19,9 @@
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/compaction.h> 21#include <linux/compaction.h>
22#include <linux/mm_inline.h>
23
24#include "internal.h"
22 25
23#ifdef CONFIG_VM_EVENT_COUNTERS 26#ifdef CONFIG_VM_EVENT_COUNTERS
24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 27DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -414,12 +417,17 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
414EXPORT_SYMBOL(dec_zone_page_state); 417EXPORT_SYMBOL(dec_zone_page_state);
415#endif 418#endif
416 419
420static inline void fold_diff(int *diff)
421{
422 int i;
423
424 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
425 if (diff[i])
426 atomic_long_add(diff[i], &vm_stat[i]);
427}
428
417/* 429/*
418 * Update the zone counters for one cpu. 430 * Update the zone counters for the current cpu.
419 *
420 * The cpu specified must be either the current cpu or a processor that
421 * is not online. If it is the current cpu then the execution thread must
422 * be pinned to the current cpu.
423 * 431 *
424 * Note that refresh_cpu_vm_stats strives to only access 432 * Note that refresh_cpu_vm_stats strives to only access
425 * node local memory. The per cpu pagesets on remote zones are placed 433 * node local memory. The per cpu pagesets on remote zones are placed
@@ -432,33 +440,29 @@ EXPORT_SYMBOL(dec_zone_page_state);
432 * with the global counters. These could cause remote node cache line 440 * with the global counters. These could cause remote node cache line
433 * bouncing and will have to be only done when necessary. 441 * bouncing and will have to be only done when necessary.
434 */ 442 */
435void refresh_cpu_vm_stats(int cpu) 443static void refresh_cpu_vm_stats(void)
436{ 444{
437 struct zone *zone; 445 struct zone *zone;
438 int i; 446 int i;
439 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 447 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
440 448
441 for_each_populated_zone(zone) { 449 for_each_populated_zone(zone) {
442 struct per_cpu_pageset *p; 450 struct per_cpu_pageset __percpu *p = zone->pageset;
443 451
444 p = per_cpu_ptr(zone->pageset, cpu); 452 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
453 int v;
445 454
446 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 455 v = this_cpu_xchg(p->vm_stat_diff[i], 0);
447 if (p->vm_stat_diff[i]) { 456 if (v) {
448 unsigned long flags;
449 int v;
450 457
451 local_irq_save(flags);
452 v = p->vm_stat_diff[i];
453 p->vm_stat_diff[i] = 0;
454 local_irq_restore(flags);
455 atomic_long_add(v, &zone->vm_stat[i]); 458 atomic_long_add(v, &zone->vm_stat[i]);
456 global_diff[i] += v; 459 global_diff[i] += v;
457#ifdef CONFIG_NUMA 460#ifdef CONFIG_NUMA
458 /* 3 seconds idle till flush */ 461 /* 3 seconds idle till flush */
459 p->expire = 3; 462 __this_cpu_write(p->expire, 3);
460#endif 463#endif
461 } 464 }
465 }
462 cond_resched(); 466 cond_resched();
463#ifdef CONFIG_NUMA 467#ifdef CONFIG_NUMA
464 /* 468 /*
@@ -468,29 +472,57 @@ void refresh_cpu_vm_stats(int cpu)
468 * Check if there are pages remaining in this pageset 472 * Check if there are pages remaining in this pageset
469 * if not then there is nothing to expire. 473 * if not then there is nothing to expire.
470 */ 474 */
471 if (!p->expire || !p->pcp.count) 475 if (!__this_cpu_read(p->expire) ||
476 !__this_cpu_read(p->pcp.count))
472 continue; 477 continue;
473 478
474 /* 479 /*
475 * We never drain zones local to this processor. 480 * We never drain zones local to this processor.
476 */ 481 */
477 if (zone_to_nid(zone) == numa_node_id()) { 482 if (zone_to_nid(zone) == numa_node_id()) {
478 p->expire = 0; 483 __this_cpu_write(p->expire, 0);
479 continue; 484 continue;
480 } 485 }
481 486
482 p->expire--; 487
483 if (p->expire) 488 if (__this_cpu_dec_return(p->expire))
484 continue; 489 continue;
485 490
486 if (p->pcp.count) 491 if (__this_cpu_read(p->pcp.count))
487 drain_zone_pages(zone, &p->pcp); 492 drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
488#endif 493#endif
489 } 494 }
495 fold_diff(global_diff);
496}
490 497
491 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 498/*
492 if (global_diff[i]) 499 * Fold the data for an offline cpu into the global array.
493 atomic_long_add(global_diff[i], &vm_stat[i]); 500 * There cannot be any access by the offline cpu and therefore
501 * synchronization is simplified.
502 */
503void cpu_vm_stats_fold(int cpu)
504{
505 struct zone *zone;
506 int i;
507 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
508
509 for_each_populated_zone(zone) {
510 struct per_cpu_pageset *p;
511
512 p = per_cpu_ptr(zone->pageset, cpu);
513
514 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
515 if (p->vm_stat_diff[i]) {
516 int v;
517
518 v = p->vm_stat_diff[i];
519 p->vm_stat_diff[i] = 0;
520 atomic_long_add(v, &zone->vm_stat[i]);
521 global_diff[i] += v;
522 }
523 }
524
525 fold_diff(global_diff);
494} 526}
495 527
496/* 528/*
@@ -703,6 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
703const char * const vmstat_text[] = { 735const char * const vmstat_text[] = {
704 /* Zoned VM counters */ 736 /* Zoned VM counters */
705 "nr_free_pages", 737 "nr_free_pages",
738 "nr_alloc_batch",
706 "nr_inactive_anon", 739 "nr_inactive_anon",
707 "nr_active_anon", 740 "nr_active_anon",
708 "nr_inactive_file", 741 "nr_inactive_file",
@@ -817,6 +850,12 @@ const char * const vmstat_text[] = {
817 "thp_zero_page_alloc", 850 "thp_zero_page_alloc",
818 "thp_zero_page_alloc_failed", 851 "thp_zero_page_alloc_failed",
819#endif 852#endif
853#ifdef CONFIG_SMP
854 "nr_tlb_remote_flush",
855 "nr_tlb_remote_flush_received",
856#endif
857 "nr_tlb_local_flush_all",
858 "nr_tlb_local_flush_one",
820 859
821#endif /* CONFIG_VM_EVENTS_COUNTERS */ 860#endif /* CONFIG_VM_EVENTS_COUNTERS */
822}; 861};
@@ -1052,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1052 "\n all_unreclaimable: %u" 1091 "\n all_unreclaimable: %u"
1053 "\n start_pfn: %lu" 1092 "\n start_pfn: %lu"
1054 "\n inactive_ratio: %u", 1093 "\n inactive_ratio: %u",
1055 zone->all_unreclaimable, 1094 !zone_reclaimable(zone),
1056 zone->zone_start_pfn, 1095 zone->zone_start_pfn,
1057 zone->inactive_ratio); 1096 zone->inactive_ratio);
1058 seq_putc(m, '\n'); 1097 seq_putc(m, '\n');
@@ -1177,7 +1216,7 @@ int sysctl_stat_interval __read_mostly = HZ;
1177 1216
1178static void vmstat_update(struct work_struct *w) 1217static void vmstat_update(struct work_struct *w)
1179{ 1218{
1180 refresh_cpu_vm_stats(smp_processor_id()); 1219 refresh_cpu_vm_stats();
1181 schedule_delayed_work(&__get_cpu_var(vmstat_work), 1220 schedule_delayed_work(&__get_cpu_var(vmstat_work),
1182 round_jiffies_relative(sysctl_stat_interval)); 1221 round_jiffies_relative(sysctl_stat_interval));
1183} 1222}
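Two folding paths now exist in vmstat.c: refresh_cpu_vm_stats() runs on the live CPU, where the per-cpu diffs may still be updated concurrently, so each diff is grabbed and cleared in one step with this_cpu_xchg() before being accumulated and handed to fold_diff(); cpu_vm_stats_fold() handles a CPU that is already offline and can read and zero the diffs with plain accesses. A compact user-space analogue of the grab-and-fold pattern (illustrative only; the names and the C11 atomics are stand-ins for the kernel's per-cpu ops and vm_stat[]):

#include <stdatomic.h>
#include <stdio.h>

#define NR_ITEMS 3
#define NR_CPUS  4

static atomic_long global_stat[NR_ITEMS];        /* plays the role of vm_stat[] */
static _Atomic int cpu_diff[NR_CPUS][NR_ITEMS];  /* plays the role of vm_stat_diff[] */

/* Live-CPU path: grab-and-clear each diff atomically, then fold once. */
static void refresh_cpu_stats(int cpu)
{
	long pending[NR_ITEMS] = { 0 };
	int i;

	for (i = 0; i < NR_ITEMS; i++) {
		int v = atomic_exchange(&cpu_diff[cpu][i], 0);  /* like this_cpu_xchg() */
		if (v)
			pending[i] += v;
	}
	for (i = 0; i < NR_ITEMS; i++)                          /* like fold_diff() */
		if (pending[i])
			atomic_fetch_add(&global_stat[i], pending[i]);
}

int main(void)
{
	atomic_fetch_add(&cpu_diff[1][0], 5);    /* simulate per-cpu counter drift */
	atomic_fetch_add(&cpu_diff[1][2], -2);
	refresh_cpu_stats(1);
	printf("item0=%ld item2=%ld\n",
	       (long)atomic_load(&global_stat[0]),
	       (long)atomic_load(&global_stat[2]));
	return 0;
}

The offline-CPU variant would be the same loop minus the atomic exchange, which is exactly the difference between the two kernel functions above.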
diff --git a/mm/zbud.c b/mm/zbud.c
index ad1e781284fd..9451361e6aa7 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -16,7 +16,7 @@
16 * 16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a 17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left 18 * single memory page called a "zbud page". The first buddy is "left
19 * justifed" at the beginning of the zbud page, and the last buddy is "right 19 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either 20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space 21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region 22 * that existed between the buddies, results in the largest possible free region
@@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used 243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
244 * as zbud pool pages. 244 * as zbud pool pages.
245 * 245 *
246 * Return: 0 if success and handle is set, otherwise -EINVAL is the size or 246 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 248 * a new page.
249 */ 249 */
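The zswap hunk that follows replaces the old rb_first()/rb_erase() loop, which rebalanced the tree after every removal, with a post-order walk that simply frees each entry, since the whole tree is being discarded anyway. A user-space sketch of why post-order is the right order for bulk teardown (illustrative only; a plain binary tree, not the kernel rbtree API):

#include <stdlib.h>

struct node {
	struct node *left, *right;
	void *payload;
};

/* Free children before their parent so no freed node is dereferenced. */
static void free_tree_postorder(struct node *n)
{
	if (!n)
		return;
	free_tree_postorder(n->left);
	free_tree_postorder(n->right);
	free(n->payload);
	free(n);
}

rbtree_postorder_for_each_entry_safe() gives the same visiting order over an rbtree while remembering the next node before the loop body runs (the "safe" part), so each entry can be freed in place without re-walking or rebalancing the tree between removals.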
diff --git a/mm/zswap.c b/mm/zswap.c
index deda2b671e12..841e35f1db22 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage) 409 struct page **retpage)
410{ 410{
411 struct page *found_page, *new_page = NULL; 411 struct page *found_page, *new_page = NULL;
412 struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; 412 struct address_space *swapper_space = swap_address_space(entry);
413 int err; 413 int err;
414 414
415 *retpage = NULL; 415 *retpage = NULL;
@@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
790static void zswap_frontswap_invalidate_area(unsigned type) 790static void zswap_frontswap_invalidate_area(unsigned type)
791{ 791{
792 struct zswap_tree *tree = zswap_trees[type]; 792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node; 793 struct zswap_entry *entry, *n;
794 struct zswap_entry *entry;
795 794
796 if (!tree) 795 if (!tree)
797 return; 796 return;
798 797
799 /* walk the tree and free everything */ 798 /* walk the tree and free everything */
800 spin_lock(&tree->lock); 799 spin_lock(&tree->lock);
801 /* 800 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle); 801 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry); 802 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages); 803 atomic_dec(&zswap_stored_pages);