author		Linus Torvalds <torvalds@linux-foundation.org>	2015-09-08 20:52:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-09-08 20:52:23 -0400
commit		f6f7a6369203fa3e07efb7f35cfd81efe9f25b07 (patch)
tree		97bec9ddd999040822acf314647eaf4208213589 /mm
parent		839fe9156fbe89c3157aa6146d22090f8cffddd8 (diff)
parent		df69f52d990bd85159727bd26e819d3a6e49c666 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
 "Almost all of the rest of MM. There was an unusually large amount of
  MM material this time"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits)
  zpool: remove no-op module init/exit
  mm: zbud: constify the zbud_ops
  mm: zpool: constify the zpool_ops
  mm: swap: zswap: maybe_preload & refactoring
  zram: unify error reporting
  zsmalloc: remove null check from destroy_handle_cache()
  zsmalloc: do not take class lock in zs_shrinker_count()
  zsmalloc: use class->pages_per_zspage
  zsmalloc: consider ZS_ALMOST_FULL as migrate source
  zsmalloc: partial page ordering within a fullness_list
  zsmalloc: use shrinker to trigger auto-compaction
  zsmalloc: account the number of compacted pages
  zsmalloc/zram: introduce zs_pool_stats api
  zsmalloc: cosmetic compaction code adjustments
  zsmalloc: introduce zs_can_compact() function
  zsmalloc: always keep per-class stats
  zsmalloc: drop unused variable `nr_to_migrate'
  mm/memblock.c: fix comment in __next_mem_range()
  mm/page_alloc.c: fix type information of memoryless node
  memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node()
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c  7
-rw-r--r--  mm/compaction.c  175
-rw-r--r--  mm/dmapool.c  12
-rw-r--r--  mm/early_ioremap.c  22
-rw-r--r--  mm/filemap.c  36
-rw-r--r--  mm/huge_memory.c  163
-rw-r--r--  mm/hugetlb.c  432
-rw-r--r--  mm/hwpoison-inject.c  2
-rw-r--r--  mm/internal.h  1
-rw-r--r--  mm/kmemleak.c  3
-rw-r--r--  mm/list_lru.c  4
-rw-r--r--  mm/madvise.c  2
-rw-r--r--  mm/memblock.c  31
-rw-r--r--  mm/memcontrol.c  394
-rw-r--r--  mm/memory-failure.c  103
-rw-r--r--  mm/memory.c  48
-rw-r--r--  mm/mempolicy.c  7
-rw-r--r--  mm/mempool.c  3
-rw-r--r--  mm/memtest.c  27
-rw-r--r--  mm/migrate.c  13
-rw-r--r--  mm/mmap.c  71
-rw-r--r--  mm/oom_kill.c  142
-rw-r--r--  mm/page_alloc.c  80
-rw-r--r--  mm/page_isolation.c  35
-rw-r--r--  mm/shmem.c  16
-rw-r--r--  mm/slab.c  2
-rw-r--r--  mm/slab_common.c  5
-rw-r--r--  mm/slob.c  4
-rw-r--r--  mm/slub.c  2
-rw-r--r--  mm/swap_state.c  37
-rw-r--r--  mm/swapfile.c  42
-rw-r--r--  mm/vmscan.c  14
-rw-r--r--  mm/zbud.c  10
-rw-r--r--  mm/zpool.c  18
-rw-r--r--  mm/zsmalloc.c  235
-rw-r--r--  mm/zswap.c  75
36 files changed, 1243 insertions, 1030 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a23dd1934654..3b6380784c28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	count += pages;
 	while (pages--)
 		__free_pages_bootmem(page++, cur++, 0);
+	bdata->node_bootmem_map = NULL;
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
@@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
 		sidx + bdata->node_min_pfn,
 		eidx + bdata->node_min_pfn);
 
+	if (WARN_ON(bdata->node_bootmem_map == NULL))
+		return;
+
 	if (bdata->hint_idx > sidx)
 		bdata->hint_idx = sidx;
 
@@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
 		eidx + bdata->node_min_pfn,
 		flags);
 
+	if (WARN_ON(bdata->node_bootmem_map == NULL))
+		return 0;
+
 	for (idx = sidx; idx < eidx; idx++)
 		if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
 			if (exclusive) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2..c5c627aae996 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
207 return !get_pageblock_skip(page); 207 return !get_pageblock_skip(page);
208} 208}
209 209
210static void reset_cached_positions(struct zone *zone)
211{
212 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
213 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
214 zone->compact_cached_free_pfn = zone_end_pfn(zone);
215}
216
210/* 217/*
211 * This function is called to clear all cached information on pageblocks that 218 * This function is called to clear all cached information on pageblocks that
212 * should be skipped for page isolation when the migrate and free page scanner 219 * should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
218 unsigned long end_pfn = zone_end_pfn(zone); 225 unsigned long end_pfn = zone_end_pfn(zone);
219 unsigned long pfn; 226 unsigned long pfn;
220 227
221 zone->compact_cached_migrate_pfn[0] = start_pfn;
222 zone->compact_cached_migrate_pfn[1] = start_pfn;
223 zone->compact_cached_free_pfn = end_pfn;
224 zone->compact_blockskip_flush = false; 228 zone->compact_blockskip_flush = false;
225 229
226 /* Walk the zone and mark every pageblock as suitable for isolation */ 230 /* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
238 242
239 clear_pageblock_skip(page); 243 clear_pageblock_skip(page);
240 } 244 }
245
246 reset_cached_positions(zone);
241} 247}
242 248
243void reset_isolation_suitable(pg_data_t *pgdat) 249void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
431 437
432 if (!valid_page) 438 if (!valid_page)
433 valid_page = page; 439 valid_page = page;
440
441 /*
442 * For compound pages such as THP and hugetlbfs, we can save
443 * potentially a lot of iterations if we skip them at once.
444 * The check is racy, but we can consider only valid values
445 * and the only danger is skipping too much.
446 */
447 if (PageCompound(page)) {
448 unsigned int comp_order = compound_order(page);
449
450 if (likely(comp_order < MAX_ORDER)) {
451 blockpfn += (1UL << comp_order) - 1;
452 cursor += (1UL << comp_order) - 1;
453 }
454
455 goto isolate_fail;
456 }
457
434 if (!PageBuddy(page)) 458 if (!PageBuddy(page))
435 goto isolate_fail; 459 goto isolate_fail;
436 460
@@ -490,6 +514,13 @@ isolate_fail:
490 514
491 } 515 }
492 516
517 /*
518 * There is a tiny chance that we have read bogus compound_order(),
519 * so be careful to not go outside of the pageblock.
520 */
521 if (unlikely(blockpfn > end_pfn))
522 blockpfn = end_pfn;
523
493 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, 524 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
494 nr_scanned, total_isolated); 525 nr_scanned, total_isolated);
495 526
@@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
674 705
675 /* Time to isolate some pages for migration */ 706 /* Time to isolate some pages for migration */
676 for (; low_pfn < end_pfn; low_pfn++) { 707 for (; low_pfn < end_pfn; low_pfn++) {
708 bool is_lru;
709
677 /* 710 /*
678 * Periodically drop the lock (if held) regardless of its 711 * Periodically drop the lock (if held) regardless of its
679 * contention, to give chance to IRQs. Abort async compaction 712 * contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
717 * It's possible to migrate LRU pages and balloon pages 750 * It's possible to migrate LRU pages and balloon pages
718 * Skip any other type of page 751 * Skip any other type of page
719 */ 752 */
720 if (!PageLRU(page)) { 753 is_lru = PageLRU(page);
754 if (!is_lru) {
721 if (unlikely(balloon_page_movable(page))) { 755 if (unlikely(balloon_page_movable(page))) {
722 if (balloon_page_isolate(page)) { 756 if (balloon_page_isolate(page)) {
723 /* Successfully isolated */ 757 /* Successfully isolated */
724 goto isolate_success; 758 goto isolate_success;
725 } 759 }
726 } 760 }
727 continue;
728 } 761 }
729 762
730 /* 763 /*
731 * PageLRU is set. lru_lock normally excludes isolation 764 * Regardless of being on LRU, compound pages such as THP and
732 * splitting and collapsing (collapsing has already happened 765 * hugetlbfs are not to be compacted. We can potentially save
733 * if PageLRU is set) but the lock is not necessarily taken 766 * a lot of iterations if we skip them at once. The check is
734 * here and it is wasteful to take it just to check transhuge. 767 * racy, but we can consider only valid values and the only
735 * Check TransHuge without lock and skip the whole pageblock if 768 * danger is skipping too much.
736 * it's either a transhuge or hugetlbfs page, as calling
737 * compound_order() without preventing THP from splitting the
738 * page underneath us may return surprising results.
739 */ 769 */
740 if (PageTransHuge(page)) { 770 if (PageCompound(page)) {
741 if (!locked) 771 unsigned int comp_order = compound_order(page);
742 low_pfn = ALIGN(low_pfn + 1, 772
743 pageblock_nr_pages) - 1; 773 if (likely(comp_order < MAX_ORDER))
744 else 774 low_pfn += (1UL << comp_order) - 1;
745 low_pfn += (1 << compound_order(page)) - 1;
746 775
747 continue; 776 continue;
748 } 777 }
749 778
779 if (!is_lru)
780 continue;
781
750 /* 782 /*
751 * Migration will fail if an anonymous page is pinned in memory, 783 * Migration will fail if an anonymous page is pinned in memory,
752 * so avoid taking lru_lock and isolating it unnecessarily in an 784 * so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
763 if (!locked) 795 if (!locked)
764 break; 796 break;
765 797
766 /* Recheck PageLRU and PageTransHuge under lock */ 798 /* Recheck PageLRU and PageCompound under lock */
767 if (!PageLRU(page)) 799 if (!PageLRU(page))
768 continue; 800 continue;
769 if (PageTransHuge(page)) { 801
770 low_pfn += (1 << compound_order(page)) - 1; 802 /*
803 * Page become compound since the non-locked check,
804 * and it's on LRU. It can only be a THP so the order
805 * is safe to read and it's 0 for tail pages.
806 */
807 if (unlikely(PageCompound(page))) {
808 low_pfn += (1UL << compound_order(page)) - 1;
771 continue; 809 continue;
772 } 810 }
773 } 811 }
@@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
778 if (__isolate_lru_page(page, isolate_mode) != 0) 816 if (__isolate_lru_page(page, isolate_mode) != 0)
779 continue; 817 continue;
780 818
781 VM_BUG_ON_PAGE(PageTransCompound(page), page); 819 VM_BUG_ON_PAGE(PageCompound(page), page);
782 820
783 /* Successfully isolated */ 821 /* Successfully isolated */
784 del_page_from_lru_list(page, lruvec, page_lru(page)); 822 del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -898,6 +936,16 @@ static bool suitable_migration_target(struct page *page)
898} 936}
899 937
900/* 938/*
939 * Test whether the free scanner has reached the same or lower pageblock than
940 * the migration scanner, and compaction should thus terminate.
941 */
942static inline bool compact_scanners_met(struct compact_control *cc)
943{
944 return (cc->free_pfn >> pageblock_order)
945 <= (cc->migrate_pfn >> pageblock_order);
946}
947
948/*
901 * Based on information in the current compact_control, find blocks 949 * Based on information in the current compact_control, find blocks
902 * suitable for isolating free pages from and then isolate them. 950 * suitable for isolating free pages from and then isolate them.
903 */ 951 */
@@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
933 * pages on cc->migratepages. We stop searching if the migrate 981 * pages on cc->migratepages. We stop searching if the migrate
934 * and free page scanners meet or enough free pages are isolated. 982 * and free page scanners meet or enough free pages are isolated.
935 */ 983 */
936 for (; block_start_pfn >= low_pfn && 984 for (; block_start_pfn >= low_pfn;
937 cc->nr_migratepages > cc->nr_freepages;
938 block_end_pfn = block_start_pfn, 985 block_end_pfn = block_start_pfn,
939 block_start_pfn -= pageblock_nr_pages, 986 block_start_pfn -= pageblock_nr_pages,
940 isolate_start_pfn = block_start_pfn) { 987 isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
966 block_end_pfn, freelist, false); 1013 block_end_pfn, freelist, false);
967 1014
968 /* 1015 /*
1016 * If we isolated enough freepages, or aborted due to async
1017 * compaction being contended, terminate the loop.
969 * Remember where the free scanner should restart next time, 1018 * Remember where the free scanner should restart next time,
970 * which is where isolate_freepages_block() left off. 1019 * which is where isolate_freepages_block() left off.
971 * But if it scanned the whole pageblock, isolate_start_pfn 1020 * But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
974 * In that case we will however want to restart at the start 1023 * In that case we will however want to restart at the start
975 * of the previous pageblock. 1024 * of the previous pageblock.
976 */ 1025 */
977 cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? 1026 if ((cc->nr_freepages >= cc->nr_migratepages)
978 isolate_start_pfn : 1027 || cc->contended) {
979 block_start_pfn - pageblock_nr_pages; 1028 if (isolate_start_pfn >= block_end_pfn)
980 1029 isolate_start_pfn =
981 /* 1030 block_start_pfn - pageblock_nr_pages;
982 * isolate_freepages_block() might have aborted due to async
983 * compaction being contended
984 */
985 if (cc->contended)
986 break; 1031 break;
1032 } else {
1033 /*
1034 * isolate_freepages_block() should not terminate
1035 * prematurely unless contended, or isolated enough
1036 */
1037 VM_BUG_ON(isolate_start_pfn < block_end_pfn);
1038 }
987 } 1039 }
988 1040
989 /* split_free_page does not map the pages */ 1041 /* split_free_page does not map the pages */
990 map_pages(freelist); 1042 map_pages(freelist);
991 1043
992 /* 1044 /*
993 * If we crossed the migrate scanner, we want to keep it that way 1045 * Record where the free scanner will restart next time. Either we
994 * so that compact_finished() may detect this 1046 * broke from the loop and set isolate_start_pfn based on the last
1047 * call to isolate_freepages_block(), or we met the migration scanner
1048 * and the loop terminated due to isolate_start_pfn < low_pfn
995 */ 1049 */
996 if (block_start_pfn < low_pfn) 1050 cc->free_pfn = isolate_start_pfn;
997 cc->free_pfn = cc->migrate_pfn;
998} 1051}
999 1052
1000/* 1053/*
@@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1062 struct compact_control *cc) 1115 struct compact_control *cc)
1063{ 1116{
1064 unsigned long low_pfn, end_pfn; 1117 unsigned long low_pfn, end_pfn;
1118 unsigned long isolate_start_pfn;
1065 struct page *page; 1119 struct page *page;
1066 const isolate_mode_t isolate_mode = 1120 const isolate_mode_t isolate_mode =
1067 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | 1121 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1110 continue; 1164 continue;
1111 1165
1112 /* Perform the isolation */ 1166 /* Perform the isolation */
1167 isolate_start_pfn = low_pfn;
1113 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1168 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1114 isolate_mode); 1169 isolate_mode);
1115 1170
@@ -1119,6 +1174,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1119 } 1174 }
1120 1175
1121 /* 1176 /*
1177 * Record where we could have freed pages by migration and not
1178 * yet flushed them to buddy allocator.
1179 * - this is the lowest page that could have been isolated and
1180 * then freed by migration.
1181 */
1182 if (cc->nr_migratepages && !cc->last_migrated_pfn)
1183 cc->last_migrated_pfn = isolate_start_pfn;
1184
1185 /*
1122 * Either we isolated something and proceed with migration. Or 1186 * Either we isolated something and proceed with migration. Or
1123 * we failed and compact_zone should decide if we should 1187 * we failed and compact_zone should decide if we should
1124 * continue or not. 1188 * continue or not.
@@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1127 } 1191 }
1128 1192
1129 acct_isolated(zone, cc); 1193 acct_isolated(zone, cc);
1130 /* 1194 /* Record where migration scanner will be restarted. */
1131 * Record where migration scanner will be restarted. If we end up in 1195 cc->migrate_pfn = low_pfn;
1132 * the same pageblock as the free scanner, make the scanners fully
1133 * meet so that compact_finished() terminates compaction.
1134 */
1135 cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
1136 1196
1137 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; 1197 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1138} 1198}
@@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1147 return COMPACT_PARTIAL; 1207 return COMPACT_PARTIAL;
1148 1208
1149 /* Compaction run completes if the migrate and free scanner meet */ 1209 /* Compaction run completes if the migrate and free scanner meet */
1150 if (cc->free_pfn <= cc->migrate_pfn) { 1210 if (compact_scanners_met(cc)) {
1151 /* Let the next compaction start anew. */ 1211 /* Let the next compaction start anew. */
1152 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; 1212 reset_cached_positions(zone);
1153 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
1154 zone->compact_cached_free_pfn = zone_end_pfn(zone);
1155 1213
1156 /* 1214 /*
1157 * Mark that the PG_migrate_skip information should be cleared 1215 * Mark that the PG_migrate_skip information should be cleared
@@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1295 unsigned long end_pfn = zone_end_pfn(zone); 1353 unsigned long end_pfn = zone_end_pfn(zone);
1296 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); 1354 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1297 const bool sync = cc->mode != MIGRATE_ASYNC; 1355 const bool sync = cc->mode != MIGRATE_ASYNC;
1298 unsigned long last_migrated_pfn = 0;
1299 1356
1300 ret = compaction_suitable(zone, cc->order, cc->alloc_flags, 1357 ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1301 cc->classzone_idx); 1358 cc->classzone_idx);
@@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1333 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 1390 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1334 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1391 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1335 } 1392 }
1393 cc->last_migrated_pfn = 0;
1336 1394
1337 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, 1395 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1338 cc->free_pfn, end_pfn, sync); 1396 cc->free_pfn, end_pfn, sync);
@@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1342 while ((ret = compact_finished(zone, cc, migratetype)) == 1400 while ((ret = compact_finished(zone, cc, migratetype)) ==
1343 COMPACT_CONTINUE) { 1401 COMPACT_CONTINUE) {
1344 int err; 1402 int err;
1345 unsigned long isolate_start_pfn = cc->migrate_pfn;
1346 1403
1347 switch (isolate_migratepages(zone, cc)) { 1404 switch (isolate_migratepages(zone, cc)) {
1348 case ISOLATE_ABORT: 1405 case ISOLATE_ABORT:
@@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1376 * migrate_pages() may return -ENOMEM when scanners meet 1433 * migrate_pages() may return -ENOMEM when scanners meet
1377 * and we want compact_finished() to detect it 1434 * and we want compact_finished() to detect it
1378 */ 1435 */
1379 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { 1436 if (err == -ENOMEM && !compact_scanners_met(cc)) {
1380 ret = COMPACT_PARTIAL; 1437 ret = COMPACT_PARTIAL;
1381 goto out; 1438 goto out;
1382 } 1439 }
1383 } 1440 }
1384 1441
1385 /*
1386 * Record where we could have freed pages by migration and not
1387 * yet flushed them to buddy allocator. We use the pfn that
1388 * isolate_migratepages() started from in this loop iteration
1389 * - this is the lowest page that could have been isolated and
1390 * then freed by migration.
1391 */
1392 if (!last_migrated_pfn)
1393 last_migrated_pfn = isolate_start_pfn;
1394
1395check_drain: 1442check_drain:
1396 /* 1443 /*
1397 * Has the migration scanner moved away from the previous 1444 * Has the migration scanner moved away from the previous
@@ -1400,18 +1447,18 @@ check_drain:
1400 * compact_finished() can detect immediately if allocation 1447 * compact_finished() can detect immediately if allocation
1401 * would succeed. 1448 * would succeed.
1402 */ 1449 */
1403 if (cc->order > 0 && last_migrated_pfn) { 1450 if (cc->order > 0 && cc->last_migrated_pfn) {
1404 int cpu; 1451 int cpu;
1405 unsigned long current_block_start = 1452 unsigned long current_block_start =
1406 cc->migrate_pfn & ~((1UL << cc->order) - 1); 1453 cc->migrate_pfn & ~((1UL << cc->order) - 1);
1407 1454
1408 if (last_migrated_pfn < current_block_start) { 1455 if (cc->last_migrated_pfn < current_block_start) {
1409 cpu = get_cpu(); 1456 cpu = get_cpu();
1410 lru_add_drain_cpu(cpu); 1457 lru_add_drain_cpu(cpu);
1411 drain_local_pages(zone); 1458 drain_local_pages(zone);
1412 put_cpu(); 1459 put_cpu();
1413 /* No more flushing until we migrate again */ 1460 /* No more flushing until we migrate again */
1414 last_migrated_pfn = 0; 1461 cc->last_migrated_pfn = 0;
1415 } 1462 }
1416 } 1463 }
1417 1464
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 59d10d16f0a5..71a8998cd03a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
 {
 	bool empty = false;
 
+	if (unlikely(!pool))
+		return;
+
 	mutex_lock(&pools_reg_lock);
 	mutex_lock(&pools_lock);
 	list_del(&pool->pools);
@@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	page = pool_alloc_page(pool, mem_flags);
+	page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
 	if (!page)
 		return NULL;
 
@@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 			break;
 		}
 	}
-	memset(retval, POOL_POISON_ALLOCATED, pool->size);
+	if (!(mem_flags & __GFP_ZERO))
+		memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	if (mem_flags & __GFP_ZERO)
+		memset(retval, 0, pool->size);
+
 	return retval;
 }
 EXPORT_SYMBOL(dma_pool_alloc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 0cfadafb3fb0..23f744d77ce0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 	return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
 }
 #endif
+
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+	unsigned long slop, clen;
+	char *p;
+
+	while (size) {
+		slop = src & ~PAGE_MASK;
+		clen = size;
+		if (clen > MAX_MAP_CHUNK - slop)
+			clen = MAX_MAP_CHUNK - slop;
+		p = early_memremap(src & PAGE_MASK, clen + slop);
+		memcpy(dest, p + slop, clen);
+		early_memunmap(p, clen + slop);
+		dest += clen;
+		src += clen;
+		size -= clen;
+	}
+}
+
 #else /* CONFIG_MMU */
 
 void __init __iomem *
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc825458..72940fb38666 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
 		do {
 			cpuset_mems_cookie = read_mems_allowed_begin();
 			n = cpuset_mem_spread_node();
-			page = alloc_pages_exact_node(n, gfp, 0);
+			page = __alloc_pages_node(n, gfp, 0);
 		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
 		return page;
@@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
 						iov_iter_count(i));
 
 again:
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 						&page, &fsdata);
 		if (unlikely(status < 0))
@@ -2495,8 +2480,17 @@ again:
 
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
-
+		/*
+		 * 'page' is now locked. If we are trying to copy from a
+		 * mapping of 'page' in userspace, the copy might fault and
+		 * would need PageUptodate() to complete. But, page can not be
+		 * made Uptodate without acquiring the page lock, which we hold.
+		 * Deadlock. Avoid with pagefault_disable(). Fix up below with
+		 * iov_iter_fault_in_readable().
+		 */
+		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
 		flush_dcache_page(page);
 
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2513,14 @@ again:
 			 */
 			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_single_seg_count(i));
+			/*
+			 * This is the fallback to recover if the copy from
+			 * userspace above faults.
+			 */
+			if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+				status = -EFAULT;
+				break;
+			}
 			goto again;
 		}
 		pos += copied;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 279a818a39b1..b16279cbd91d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/shrinker.h> 17#include <linux/shrinker.h>
18#include <linux/mm_inline.h> 18#include <linux/mm_inline.h>
19#include <linux/dax.h>
19#include <linux/kthread.h> 20#include <linux/kthread.h>
20#include <linux/khugepaged.h> 21#include <linux/khugepaged.h>
21#include <linux/freezer.h> 22#include <linux/freezer.h>
@@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
105}; 106};
106 107
107 108
108static int set_recommended_min_free_kbytes(void) 109static void set_recommended_min_free_kbytes(void)
109{ 110{
110 struct zone *zone; 111 struct zone *zone;
111 int nr_zones = 0; 112 int nr_zones = 0;
@@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
140 min_free_kbytes = recommended_min; 141 min_free_kbytes = recommended_min;
141 } 142 }
142 setup_per_zone_wmarks(); 143 setup_per_zone_wmarks();
143 return 0;
144} 144}
145 145
146static int start_stop_khugepaged(void) 146static int start_stop_khugepaged(void)
@@ -172,12 +172,7 @@ fail:
172static atomic_t huge_zero_refcount; 172static atomic_t huge_zero_refcount;
173struct page *huge_zero_page __read_mostly; 173struct page *huge_zero_page __read_mostly;
174 174
175static inline bool is_huge_zero_pmd(pmd_t pmd) 175struct page *get_huge_zero_page(void)
176{
177 return is_huge_zero_page(pmd_page(pmd));
178}
179
180static struct page *get_huge_zero_page(void)
181{ 176{
182 struct page *zero_page; 177 struct page *zero_page;
183retry: 178retry:
@@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
794} 789}
795 790
796/* Caller must hold page table lock. */ 791/* Caller must hold page table lock. */
797static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 792static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
798 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 793 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
799 struct page *zero_page) 794 struct page *zero_page)
800{ 795{
801 pmd_t entry; 796 pmd_t entry;
797 if (!pmd_none(*pmd))
798 return false;
802 entry = mk_pmd(zero_page, vma->vm_page_prot); 799 entry = mk_pmd(zero_page, vma->vm_page_prot);
803 entry = pmd_mkhuge(entry); 800 entry = pmd_mkhuge(entry);
804 pgtable_trans_huge_deposit(mm, pmd, pgtable); 801 pgtable_trans_huge_deposit(mm, pmd, pgtable);
805 set_pmd_at(mm, haddr, pmd, entry); 802 set_pmd_at(mm, haddr, pmd, entry);
806 atomic_long_inc(&mm->nr_ptes); 803 atomic_long_inc(&mm->nr_ptes);
804 return true;
807} 805}
808 806
809int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 807int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
870 flags); 868 flags);
871} 869}
872 870
871static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
872 pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
873{
874 struct mm_struct *mm = vma->vm_mm;
875 pmd_t entry;
876 spinlock_t *ptl;
877
878 ptl = pmd_lock(mm, pmd);
879 if (pmd_none(*pmd)) {
880 entry = pmd_mkhuge(pfn_pmd(pfn, prot));
881 if (write) {
882 entry = pmd_mkyoung(pmd_mkdirty(entry));
883 entry = maybe_pmd_mkwrite(entry, vma);
884 }
885 set_pmd_at(mm, addr, pmd, entry);
886 update_mmu_cache_pmd(vma, addr, pmd);
887 }
888 spin_unlock(ptl);
889}
890
891int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
892 pmd_t *pmd, unsigned long pfn, bool write)
893{
894 pgprot_t pgprot = vma->vm_page_prot;
895 /*
896 * If we had pmd_special, we could avoid all these restrictions,
897 * but we need to be consistent with PTEs and architectures that
898 * can't support a 'special' bit.
899 */
900 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
901 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
902 (VM_PFNMAP|VM_MIXEDMAP));
903 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
904 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
905
906 if (addr < vma->vm_start || addr >= vma->vm_end)
907 return VM_FAULT_SIGBUS;
908 if (track_pfn_insert(vma, &pgprot, pfn))
909 return VM_FAULT_SIGBUS;
910 insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
911 return VM_FAULT_NOPAGE;
912}
913
873int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 914int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
874 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 915 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
875 struct vm_area_struct *vma) 916 struct vm_area_struct *vma)
@@ -1414,41 +1455,41 @@ out:
1414int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1455int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1415 pmd_t *pmd, unsigned long addr) 1456 pmd_t *pmd, unsigned long addr)
1416{ 1457{
1458 pmd_t orig_pmd;
1417 spinlock_t *ptl; 1459 spinlock_t *ptl;
1418 int ret = 0;
1419 1460
1420 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1461 if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
1421 struct page *page; 1462 return 0;
1422 pgtable_t pgtable; 1463 /*
1423 pmd_t orig_pmd; 1464 * For architectures like ppc64 we look at deposited pgtable
1424 /* 1465 * when calling pmdp_huge_get_and_clear. So do the
1425 * For architectures like ppc64 we look at deposited pgtable 1466 * pgtable_trans_huge_withdraw after finishing pmdp related
1426 * when calling pmdp_huge_get_and_clear. So do the 1467 * operations.
1427 * pgtable_trans_huge_withdraw after finishing pmdp related 1468 */
1428 * operations. 1469 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1429 */ 1470 tlb->fullmm);
1430 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, 1471 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1431 tlb->fullmm); 1472 if (vma_is_dax(vma)) {
1432 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1473 spin_unlock(ptl);
1433 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); 1474 if (is_huge_zero_pmd(orig_pmd))
1434 if (is_huge_zero_pmd(orig_pmd)) {
1435 atomic_long_dec(&tlb->mm->nr_ptes);
1436 spin_unlock(ptl);
1437 put_huge_zero_page(); 1475 put_huge_zero_page();
1438 } else { 1476 } else if (is_huge_zero_pmd(orig_pmd)) {
1439 page = pmd_page(orig_pmd); 1477 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1440 page_remove_rmap(page); 1478 atomic_long_dec(&tlb->mm->nr_ptes);
1441 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1479 spin_unlock(ptl);
1442 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1480 put_huge_zero_page();
1443 VM_BUG_ON_PAGE(!PageHead(page), page); 1481 } else {
1444 atomic_long_dec(&tlb->mm->nr_ptes); 1482 struct page *page = pmd_page(orig_pmd);
1445 spin_unlock(ptl); 1483 page_remove_rmap(page);
1446 tlb_remove_page(tlb, page); 1484 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1447 } 1485 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1448 pte_free(tlb->mm, pgtable); 1486 VM_BUG_ON_PAGE(!PageHead(page), page);
1449 ret = 1; 1487 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1488 atomic_long_dec(&tlb->mm->nr_ptes);
1489 spin_unlock(ptl);
1490 tlb_remove_page(tlb, page);
1450 } 1491 }
1451 return ret; 1492 return 1;
1452} 1493}
1453 1494
1454int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1495int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
@@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2285 2326
2286static void khugepaged_alloc_sleep(void) 2327static void khugepaged_alloc_sleep(void)
2287{ 2328{
2288 wait_event_freezable_timeout(khugepaged_wait, false, 2329 DEFINE_WAIT(wait);
2289 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2330
2331 add_wait_queue(&khugepaged_wait, &wait);
2332 freezable_schedule_timeout_interruptible(
2333 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2334 remove_wait_queue(&khugepaged_wait, &wait);
2290} 2335}
2291 2336
2292static int khugepaged_node_load[MAX_NUMNODES]; 2337static int khugepaged_node_load[MAX_NUMNODES];
@@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2373 */ 2418 */
2374 up_read(&mm->mmap_sem); 2419 up_read(&mm->mmap_sem);
2375 2420
2376 *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); 2421 *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
2377 if (unlikely(!*hpage)) { 2422 if (unlikely(!*hpage)) {
2378 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2423 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2379 *hpage = ERR_PTR(-ENOMEM); 2424 *hpage = ERR_PTR(-ENOMEM);
@@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2911 pmd_t *pmd) 2956 pmd_t *pmd)
2912{ 2957{
2913 spinlock_t *ptl; 2958 spinlock_t *ptl;
2914 struct page *page; 2959 struct page *page = NULL;
2915 struct mm_struct *mm = vma->vm_mm; 2960 struct mm_struct *mm = vma->vm_mm;
2916 unsigned long haddr = address & HPAGE_PMD_MASK; 2961 unsigned long haddr = address & HPAGE_PMD_MASK;
2917 unsigned long mmun_start; /* For mmu_notifiers */ 2962 unsigned long mmun_start; /* For mmu_notifiers */
@@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2924again: 2969again:
2925 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2970 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2926 ptl = pmd_lock(mm, pmd); 2971 ptl = pmd_lock(mm, pmd);
2927 if (unlikely(!pmd_trans_huge(*pmd))) { 2972 if (unlikely(!pmd_trans_huge(*pmd)))
2928 spin_unlock(ptl); 2973 goto unlock;
2929 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2974 if (vma_is_dax(vma)) {
2930 return; 2975 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2931 } 2976 if (is_huge_zero_pmd(_pmd))
2932 if (is_huge_zero_pmd(*pmd)) { 2977 put_huge_zero_page();
2978 } else if (is_huge_zero_pmd(*pmd)) {
2933 __split_huge_zero_page_pmd(vma, haddr, pmd); 2979 __split_huge_zero_page_pmd(vma, haddr, pmd);
2934 spin_unlock(ptl); 2980 } else {
2935 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2981 page = pmd_page(*pmd);
2936 return; 2982 VM_BUG_ON_PAGE(!page_count(page), page);
2983 get_page(page);
2937 } 2984 }
2938 page = pmd_page(*pmd); 2985 unlock:
2939 VM_BUG_ON_PAGE(!page_count(page), page);
2940 get_page(page);
2941 spin_unlock(ptl); 2986 spin_unlock(ptl);
2942 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2987 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2943 2988
2944 split_huge_page(page); 2989 if (!page)
2990 return;
2945 2991
2992 split_huge_page(page);
2946 put_page(page); 2993 put_page(page);
2947 2994
2948 /* 2995 /*
@@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm,
2991 split_huge_page_pmd_mm(mm, address, pmd); 3038 split_huge_page_pmd_mm(mm, address, pmd);
2992} 3039}
2993 3040
2994void __vma_adjust_trans_huge(struct vm_area_struct *vma, 3041void vma_adjust_trans_huge(struct vm_area_struct *vma,
2995 unsigned long start, 3042 unsigned long start,
2996 unsigned long end, 3043 unsigned long end,
2997 long adjust_next) 3044 long adjust_next)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51ae41d0fbc0..999fb0aef8f1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
64 * prevent spurious OOMs when the hugepage pool is fully utilized. 64 * prevent spurious OOMs when the hugepage pool is fully utilized.
65 */ 65 */
66static int num_fault_mutexes; 66static int num_fault_mutexes;
67static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 67struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
68 68
69/* Forward declaration */ 69/* Forward declaration */
70static int hugetlb_acct_memory(struct hstate *h, long delta); 70static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
240 240
241/* 241/*
242 * Add the huge page range represented by [f, t) to the reserve 242 * Add the huge page range represented by [f, t) to the reserve
243 * map. Existing regions will be expanded to accommodate the 243 * map. In the normal case, existing regions will be expanded
244 * specified range. We know only existing regions need to be 244 * to accommodate the specified range. Sufficient regions should
245 * expanded, because region_add is only called after region_chg 245 * exist for expansion due to the previous call to region_chg
246 * with the same range. If a new file_region structure must 246 * with the same range. However, it is possible that region_del
247 * be allocated, it is done in region_chg. 247 * could have been called after region_chg and modifed the map
248 * in such a way that no region exists to be expanded. In this
249 * case, pull a region descriptor from the cache associated with
250 * the map and use that for the new range.
248 * 251 *
249 * Return the number of new huge pages added to the map. This 252 * Return the number of new huge pages added to the map. This
250 * number is greater than or equal to zero. 253 * number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
261 if (f <= rg->to) 264 if (f <= rg->to)
262 break; 265 break;
263 266
267 /*
268 * If no region exists which can be expanded to include the
269 * specified range, the list must have been modified by an
270 * interleving call to region_del(). Pull a region descriptor
271 * from the cache and use it for this range.
272 */
273 if (&rg->link == head || t < rg->from) {
274 VM_BUG_ON(resv->region_cache_count <= 0);
275
276 resv->region_cache_count--;
277 nrg = list_first_entry(&resv->region_cache, struct file_region,
278 link);
279 list_del(&nrg->link);
280
281 nrg->from = f;
282 nrg->to = t;
283 list_add(&nrg->link, rg->link.prev);
284
285 add += t - f;
286 goto out_locked;
287 }
288
264 /* Round our left edge to the current segment if it encloses us. */ 289 /* Round our left edge to the current segment if it encloses us. */
265 if (f > rg->from) 290 if (f > rg->from)
266 f = rg->from; 291 f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
294 add += t - nrg->to; /* Added to end of region */ 319 add += t - nrg->to; /* Added to end of region */
295 nrg->to = t; 320 nrg->to = t;
296 321
322out_locked:
323 resv->adds_in_progress--;
297 spin_unlock(&resv->lock); 324 spin_unlock(&resv->lock);
298 VM_BUG_ON(add < 0); 325 VM_BUG_ON(add < 0);
299 return add; 326 return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
312 * so that the subsequent region_add call will have all the 339 * so that the subsequent region_add call will have all the
313 * regions it needs and will not fail. 340 * regions it needs and will not fail.
314 * 341 *
315 * Returns the number of huge pages that need to be added 342 * Upon entry, region_chg will also examine the cache of region descriptors
316 * to the existing reservation map for the range [f, t). 343 * associated with the map. If there are not enough descriptors cached, one
317 * This number is greater or equal to zero. -ENOMEM is 344 * will be allocated for the in progress add operation.
318 * returned if a new file_region structure is needed and can 345 *
319 * not be allocated. 346 * Returns the number of huge pages that need to be added to the existing
347 * reservation map for the range [f, t). This number is greater or equal to
348 * zero. -ENOMEM is returned if a new file_region structure or cache entry
349 * is needed and can not be allocated.
320 */ 350 */
321static long region_chg(struct resv_map *resv, long f, long t) 351static long region_chg(struct resv_map *resv, long f, long t)
322{ 352{
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
326 356
327retry: 357retry:
328 spin_lock(&resv->lock); 358 spin_lock(&resv->lock);
359retry_locked:
360 resv->adds_in_progress++;
361
362 /*
363 * Check for sufficient descriptors in the cache to accommodate
364 * the number of in progress add operations.
365 */
366 if (resv->adds_in_progress > resv->region_cache_count) {
367 struct file_region *trg;
368
369 VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
370 /* Must drop lock to allocate a new descriptor. */
371 resv->adds_in_progress--;
372 spin_unlock(&resv->lock);
373
374 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
375 if (!trg)
376 return -ENOMEM;
377
378 spin_lock(&resv->lock);
379 list_add(&trg->link, &resv->region_cache);
380 resv->region_cache_count++;
381 goto retry_locked;
382 }
383
329 /* Locate the region we are before or in. */ 384 /* Locate the region we are before or in. */
330 list_for_each_entry(rg, head, link) 385 list_for_each_entry(rg, head, link)
331 if (f <= rg->to) 386 if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
336 * size such that we can guarantee to record the reservation. */ 391 * size such that we can guarantee to record the reservation. */
337 if (&rg->link == head || t < rg->from) { 392 if (&rg->link == head || t < rg->from) {
338 if (!nrg) { 393 if (!nrg) {
394 resv->adds_in_progress--;
339 spin_unlock(&resv->lock); 395 spin_unlock(&resv->lock);
340 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 396 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
341 if (!nrg) 397 if (!nrg)
@@ -385,43 +441,131 @@ out_nrg:
385} 441}
386 442
387/* 443/*
388 * Truncate the reserve map at index 'end'. Modify/truncate any 444 * Abort the in progress add operation. The adds_in_progress field
389 * region which contains end. Delete any regions past end. 445 * of the resv_map keeps track of the operations in progress between
390 * Return the number of huge pages removed from the map. 446 * calls to region_chg and region_add. Operations are sometimes
447 * aborted after the call to region_chg. In such cases, region_abort
448 * is called to decrement the adds_in_progress counter.
449 *
450 * NOTE: The range arguments [f, t) are not needed or used in this
451 * routine. They are kept to make reading the calling code easier as
452 * arguments will match the associated region_chg call.
391 */ 453 */
392static long region_truncate(struct resv_map *resv, long end) 454static void region_abort(struct resv_map *resv, long f, long t)
455{
456 spin_lock(&resv->lock);
457 VM_BUG_ON(!resv->region_cache_count);
458 resv->adds_in_progress--;
459 spin_unlock(&resv->lock);
460}
461
462/*
463 * Delete the specified range [f, t) from the reserve map. If the
464 * t parameter is LONG_MAX, this indicates that ALL regions after f
465 * should be deleted. Locate the regions which intersect [f, t)
466 * and either trim, delete or split the existing regions.
467 *
468 * Returns the number of huge pages deleted from the reserve map.
469 * In the normal case, the return value is zero or more. In the
470 * case where a region must be split, a new region descriptor must
471 * be allocated. If the allocation fails, -ENOMEM will be returned.
472 * NOTE: If the parameter t == LONG_MAX, then we will never split
473 * a region and possibly return -ENOMEM. Callers specifying
474 * t == LONG_MAX do not need to check for -ENOMEM error.
475 */
476static long region_del(struct resv_map *resv, long f, long t)
393{ 477{
394 struct list_head *head = &resv->regions; 478 struct list_head *head = &resv->regions;
395 struct file_region *rg, *trg; 479 struct file_region *rg, *trg;
396 long chg = 0; 480 struct file_region *nrg = NULL;
481 long del = 0;
397 482
483retry:
398 spin_lock(&resv->lock); 484 spin_lock(&resv->lock);
399 /* Locate the region we are either in or before. */ 485 list_for_each_entry_safe(rg, trg, head, link) {
400 list_for_each_entry(rg, head, link) 486 if (rg->to <= f)
401 if (end <= rg->to) 487 continue;
488 if (rg->from >= t)
402 break; 489 break;
403 if (&rg->link == head)
404 goto out;
405 490
406 /* If we are in the middle of a region then adjust it. */ 491 if (f > rg->from && t < rg->to) { /* Must split region */
407 if (end > rg->from) { 492 /*
408 chg = rg->to - end; 493 * Check for an entry in the cache before dropping
409 rg->to = end; 494 * lock and attempting allocation.
410 rg = list_entry(rg->link.next, typeof(*rg), link); 495 */
411 } 496 if (!nrg &&
497 resv->region_cache_count > resv->adds_in_progress) {
498 nrg = list_first_entry(&resv->region_cache,
499 struct file_region,
500 link);
501 list_del(&nrg->link);
502 resv->region_cache_count--;
503 }
412 504
413 /* Drop any remaining regions. */ 505 if (!nrg) {
414 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 506 spin_unlock(&resv->lock);
415 if (&rg->link == head) 507 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
508 if (!nrg)
509 return -ENOMEM;
510 goto retry;
511 }
512
513 del += t - f;
514
515 /* New entry for end of split region */
516 nrg->from = t;
517 nrg->to = rg->to;
518 INIT_LIST_HEAD(&nrg->link);
519
520 /* Original entry is trimmed */
521 rg->to = f;
522
523 list_add(&nrg->link, &rg->link);
524 nrg = NULL;
416 break; 525 break;
417 chg += rg->to - rg->from; 526 }
418 list_del(&rg->link); 527
419 kfree(rg); 528 if (f <= rg->from && t >= rg->to) { /* Remove entire region */
529 del += rg->to - rg->from;
530 list_del(&rg->link);
531 kfree(rg);
532 continue;
533 }
534
535 if (f <= rg->from) { /* Trim beginning of region */
536 del += t - rg->from;
537 rg->from = t;
538 } else { /* Trim end of region */
539 del += rg->to - f;
540 rg->to = f;
541 }
420 } 542 }
421 543
422out:
423 spin_unlock(&resv->lock); 544 spin_unlock(&resv->lock);
424 return chg; 545 kfree(nrg);
546 return del;
547}
548
549/*
550 * A rare out of memory error was encountered which prevented removal of
551 * the reserve map region for a page. The huge page itself was free'ed
552 * and removed from the page cache. This routine will adjust the subpool
553 * usage count, and the global reserve count if needed. By incrementing
554 * these counts, the reserve map entry which could not be deleted will
555 * appear as a "reserved" entry instead of simply dangling with incorrect
556 * counts.
557 */
558void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
559{
560 struct hugepage_subpool *spool = subpool_inode(inode);
561 long rsv_adjust;
562
563 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
564 if (restore_reserve && rsv_adjust) {
565 struct hstate *h = hstate_inode(inode);
566
567 hugetlb_acct_memory(h, 1);
568 }
425} 569}
426 570
427/* 571/*
@@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
544struct resv_map *resv_map_alloc(void) 688struct resv_map *resv_map_alloc(void)
545{ 689{
546 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 690 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
547 if (!resv_map) 691 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
692
693 if (!resv_map || !rg) {
694 kfree(resv_map);
695 kfree(rg);
548 return NULL; 696 return NULL;
697 }
549 698
550 kref_init(&resv_map->refs); 699 kref_init(&resv_map->refs);
551 spin_lock_init(&resv_map->lock); 700 spin_lock_init(&resv_map->lock);
552 INIT_LIST_HEAD(&resv_map->regions); 701 INIT_LIST_HEAD(&resv_map->regions);
553 702
703 resv_map->adds_in_progress = 0;
704
705 INIT_LIST_HEAD(&resv_map->region_cache);
706 list_add(&rg->link, &resv_map->region_cache);
707 resv_map->region_cache_count = 1;
708
554 return resv_map; 709 return resv_map;
555} 710}
556 711
557void resv_map_release(struct kref *ref) 712void resv_map_release(struct kref *ref)
558{ 713{
559 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 714 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
715 struct list_head *head = &resv_map->region_cache;
716 struct file_region *rg, *trg;
560 717
561 /* Clear out any active regions before we release the map. */ 718 /* Clear out any active regions before we release the map. */
562 region_truncate(resv_map, 0); 719 region_del(resv_map, 0, LONG_MAX);
720
721 /* ... and any entries left in the cache */
722 list_for_each_entry_safe(rg, trg, head, link) {
723 list_del(&rg->link);
724 kfree(rg);
725 }
726
727 VM_BUG_ON(resv_map->adds_in_progress);
728
563 kfree(resv_map); 729 kfree(resv_map);
564} 730}
565 731
@@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
635 } 801 }
636 802
637 /* Shared mappings always use reserves */ 803 /* Shared mappings always use reserves */
638 if (vma->vm_flags & VM_MAYSHARE) 804 if (vma->vm_flags & VM_MAYSHARE) {
639 return true; 805 /*
806 * We know VM_NORESERVE is not set. Therefore, there SHOULD
807 * be a region map for all pages. The only situation where
808 * there is no region map is if a hole was punched via
809 * fallocate. In this case, there really are no reverves to
810 * use. This situation is indicated if chg != 0.
811 */
812 if (chg)
813 return false;
814 else
815 return true;
816 }
640 817
641 /* 818 /*
642 * Only the process that called mmap() has reserves for 819 * Only the process that called mmap() has reserves for
@@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
1154{ 1331{
1155 struct page *page; 1332 struct page *page;
1156 1333
1157 page = alloc_pages_exact_node(nid, 1334 page = __alloc_pages_node(nid,
1158 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1335 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1159 __GFP_REPEAT|__GFP_NOWARN, 1336 __GFP_REPEAT|__GFP_NOWARN,
1160 huge_page_order(h)); 1337 huge_page_order(h));
@@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
1306 __GFP_REPEAT|__GFP_NOWARN, 1483 __GFP_REPEAT|__GFP_NOWARN,
1307 huge_page_order(h)); 1484 huge_page_order(h));
1308 else 1485 else
1309 page = alloc_pages_exact_node(nid, 1486 page = __alloc_pages_node(nid,
1310 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1487 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1311 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 1488 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1312 1489
@@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
1473 } 1650 }
1474} 1651}
1475 1652
1653
1476/* 1654/*
1477 * vma_needs_reservation and vma_commit_reservation are used by the huge 1655 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
1478 * page allocation routines to manage reservations. 1656 * are used by the huge page allocation routines to manage reservations.
1479 * 1657 *
1480 * vma_needs_reservation is called to determine if the huge page at addr 1658 * vma_needs_reservation is called to determine if the huge page at addr
1481 * within the vma has an associated reservation. If a reservation is 1659 * within the vma has an associated reservation. If a reservation is
1482 * needed, the value 1 is returned. The caller is then responsible for 1660 * needed, the value 1 is returned. The caller is then responsible for
1483 * managing the global reservation and subpool usage counts. After 1661 * managing the global reservation and subpool usage counts. After
1484 * the huge page has been allocated, vma_commit_reservation is called 1662 * the huge page has been allocated, vma_commit_reservation is called
1485 * to add the page to the reservation map. 1663 * to add the page to the reservation map. If the page allocation fails,
1664 * the reservation must be ended instead of committed. vma_end_reservation
1665 * is called in such cases.
1486 * 1666 *
1487 * In the normal case, vma_commit_reservation returns the same value 1667 * In the normal case, vma_commit_reservation returns the same value
1488 * as the preceding vma_needs_reservation call. The only time this 1668 * as the preceding vma_needs_reservation call. The only time this
@@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
1490 * is the responsibility of the caller to notice the difference and 1670 * is the responsibility of the caller to notice the difference and
1491 * take appropriate action. 1671 * take appropriate action.
1492 */ 1672 */
1673enum vma_resv_mode {
1674 VMA_NEEDS_RESV,
1675 VMA_COMMIT_RESV,
1676 VMA_END_RESV,
1677};
1493static long __vma_reservation_common(struct hstate *h, 1678static long __vma_reservation_common(struct hstate *h,
1494 struct vm_area_struct *vma, unsigned long addr, 1679 struct vm_area_struct *vma, unsigned long addr,
1495 bool commit) 1680 enum vma_resv_mode mode)
1496{ 1681{
1497 struct resv_map *resv; 1682 struct resv_map *resv;
1498 pgoff_t idx; 1683 pgoff_t idx;
@@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
1503 return 1; 1688 return 1;
1504 1689
1505 idx = vma_hugecache_offset(h, vma, addr); 1690 idx = vma_hugecache_offset(h, vma, addr);
1506 if (commit) 1691 switch (mode) {
1507 ret = region_add(resv, idx, idx + 1); 1692 case VMA_NEEDS_RESV:
1508 else
1509 ret = region_chg(resv, idx, idx + 1); 1693 ret = region_chg(resv, idx, idx + 1);
1694 break;
1695 case VMA_COMMIT_RESV:
1696 ret = region_add(resv, idx, idx + 1);
1697 break;
1698 case VMA_END_RESV:
1699 region_abort(resv, idx, idx + 1);
1700 ret = 0;
1701 break;
1702 default:
1703 BUG();
1704 }
1510 1705
1511 if (vma->vm_flags & VM_MAYSHARE) 1706 if (vma->vm_flags & VM_MAYSHARE)
1512 return ret; 1707 return ret;
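
The enum above replaces the old boolean `commit` argument with an explicit mode so a caller can also abort a pending region_chg(). As a sketch only (a hypothetical function written as if it lived in mm/hugetlb.c next to these helpers; it is not part of the patch), the three wrappers defined in the next hunk are meant to be paired like this:

/*
 * Sketch only, not part of the patch: the region_chg() done by
 * vma_needs_reservation() is either committed (region_add) once a page
 * is in hand, or backed out (region_abort) via vma_end_reservation()
 * so the reserve map is not left unbalanced.
 */
static struct page *example_alloc_with_resv(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        long chg;

        chg = vma_needs_reservation(h, vma, addr);      /* VMA_NEEDS_RESV */
        if (chg < 0)
                return ERR_PTR(-ENOMEM);

        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page_vma(h, vma, addr, 0, chg);
        spin_unlock(&hugetlb_lock);
        if (!page) {
                vma_end_reservation(h, vma, addr);      /* VMA_END_RESV */
                return ERR_PTR(-ENOSPC);
        }

        vma_commit_reservation(h, vma, addr);           /* VMA_COMMIT_RESV */
        return page;
}
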
@@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
1517static long vma_needs_reservation(struct hstate *h, 1712static long vma_needs_reservation(struct hstate *h,
1518 struct vm_area_struct *vma, unsigned long addr) 1713 struct vm_area_struct *vma, unsigned long addr)
1519{ 1714{
1520 return __vma_reservation_common(h, vma, addr, false); 1715 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
1521} 1716}
1522 1717
1523static long vma_commit_reservation(struct hstate *h, 1718static long vma_commit_reservation(struct hstate *h,
1524 struct vm_area_struct *vma, unsigned long addr) 1719 struct vm_area_struct *vma, unsigned long addr)
1525{ 1720{
1526 return __vma_reservation_common(h, vma, addr, true); 1721 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
1722}
1723
1724static void vma_end_reservation(struct hstate *h,
1725 struct vm_area_struct *vma, unsigned long addr)
1726{
1727 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
1527} 1728}
1528 1729
1529static struct page *alloc_huge_page(struct vm_area_struct *vma, 1730struct page *alloc_huge_page(struct vm_area_struct *vma,
1530 unsigned long addr, int avoid_reserve) 1731 unsigned long addr, int avoid_reserve)
1531{ 1732{
1532 struct hugepage_subpool *spool = subpool_vma(vma); 1733 struct hugepage_subpool *spool = subpool_vma(vma);
1533 struct hstate *h = hstate_vma(vma); 1734 struct hstate *h = hstate_vma(vma);
1534 struct page *page; 1735 struct page *page;
1535 long chg, commit; 1736 long map_chg, map_commit;
1737 long gbl_chg;
1536 int ret, idx; 1738 int ret, idx;
1537 struct hugetlb_cgroup *h_cg; 1739 struct hugetlb_cgroup *h_cg;
1538 1740
1539 idx = hstate_index(h); 1741 idx = hstate_index(h);
1540 /* 1742 /*
1541 * Processes that did not create the mapping will have no 1743 * Examine the region/reserve map to determine if the process
1542 * reserves and will not have accounted against subpool 1744 * has a reservation for the page to be allocated. A return
1543 * limit. Check that the subpool limit can be made before 1745 * code of zero indicates a reservation exists (no change).
1544 * satisfying the allocation MAP_NORESERVE mappings may also
1545 * need pages and subpool limit allocated allocated if no reserve
1546 * mapping overlaps.
1547 */ 1746 */
1548 chg = vma_needs_reservation(h, vma, addr); 1747 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
1549 if (chg < 0) 1748 if (map_chg < 0)
1550 return ERR_PTR(-ENOMEM); 1749 return ERR_PTR(-ENOMEM);
1551 if (chg || avoid_reserve) 1750
1552 if (hugepage_subpool_get_pages(spool, 1) < 0) 1751 /*
1752 * Processes that did not create the mapping will have no
1753 * reserves as indicated by the region/reserve map. Check
1754 * that the allocation will not exceed the subpool limit.
1755 * Allocations for MAP_NORESERVE mappings also need to be
1756 * checked against any subpool limit.
1757 */
1758 if (map_chg || avoid_reserve) {
1759 gbl_chg = hugepage_subpool_get_pages(spool, 1);
1760 if (gbl_chg < 0) {
1761 vma_end_reservation(h, vma, addr);
1553 return ERR_PTR(-ENOSPC); 1762 return ERR_PTR(-ENOSPC);
1763 }
1764
1765 /*
1766 * Even though there was no reservation in the region/reserve
1767 * map, there could be reservations associated with the
1768 * subpool that can be used. This would be indicated if the
1769 * return value of hugepage_subpool_get_pages() is zero.
1770 * However, if avoid_reserve is specified we still avoid even
1771 * the subpool reservations.
1772 */
1773 if (avoid_reserve)
1774 gbl_chg = 1;
1775 }
1554 1776
1555 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1777 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1556 if (ret) 1778 if (ret)
1557 goto out_subpool_put; 1779 goto out_subpool_put;
1558 1780
1559 spin_lock(&hugetlb_lock); 1781 spin_lock(&hugetlb_lock);
1560 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); 1782 /*
 1783 * gbl_chg is passed to indicate whether or not a page must be taken
1784 * from the global free pool (global change). gbl_chg == 0 indicates
1785 * a reservation exists for the allocation.
1786 */
1787 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
1561 if (!page) { 1788 if (!page) {
1562 spin_unlock(&hugetlb_lock); 1789 spin_unlock(&hugetlb_lock);
1563 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1790 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
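
The comments above distinguish two counters with similar names: map_chg says whether the region/reserve map already holds an entry for this offset, while gbl_chg says whether a page must ultimately be charged to the global free pool once the subpool has been consulted. For illustration only, a hypothetical helper (written as if it sat in mm/hugetlb.c; the real logic stays inline in alloc_huge_page()) condenses the decision:

/*
 * Hypothetical helper, illustration only: returns -ENOSPC if the
 * subpool limit would be exceeded, otherwise 0 when an existing
 * reservation covers the page and 1 when it must come from the
 * global free pool.
 */
static long example_global_charge(struct hugepage_subpool *spool,
                                  long map_chg, int avoid_reserve)
{
        long gbl_chg = map_chg;

        if (map_chg || avoid_reserve) {
                gbl_chg = hugepage_subpool_get_pages(spool, 1);
                if (gbl_chg < 0)
                        return -ENOSPC;
                /*
                 * gbl_chg == 0 here means the subpool itself still holds
                 * reserves; with avoid_reserve even those are skipped.
                 */
                if (avoid_reserve)
                        gbl_chg = 1;
        }
        return gbl_chg;
}
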
@@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1573 1800
1574 set_page_private(page, (unsigned long)spool); 1801 set_page_private(page, (unsigned long)spool);
1575 1802
1576 commit = vma_commit_reservation(h, vma, addr); 1803 map_commit = vma_commit_reservation(h, vma, addr);
1577 if (unlikely(chg > commit)) { 1804 if (unlikely(map_chg > map_commit)) {
1578 /* 1805 /*
1579 * The page was added to the reservation map between 1806 * The page was added to the reservation map between
1580 * vma_needs_reservation and vma_commit_reservation. 1807 * vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1594out_uncharge_cgroup: 1821out_uncharge_cgroup:
1595 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 1822 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
1596out_subpool_put: 1823out_subpool_put:
1597 if (chg || avoid_reserve) 1824 if (map_chg || avoid_reserve)
1598 hugepage_subpool_put_pages(spool, 1); 1825 hugepage_subpool_put_pages(spool, 1);
1826 vma_end_reservation(h, vma, addr);
1599 return ERR_PTR(-ENOSPC); 1827 return ERR_PTR(-ENOSPC);
1600} 1828}
1601 1829
@@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
2311 } 2539 }
2312 2540
2313 kobject_put(hugepages_kobj); 2541 kobject_put(hugepages_kobj);
2314 kfree(htlb_fault_mutex_table); 2542 kfree(hugetlb_fault_mutex_table);
2315} 2543}
2316module_exit(hugetlb_exit); 2544module_exit(hugetlb_exit);
2317 2545
@@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
2344#else 2572#else
2345 num_fault_mutexes = 1; 2573 num_fault_mutexes = 1;
2346#endif 2574#endif
2347 htlb_fault_mutex_table = 2575 hugetlb_fault_mutex_table =
2348 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); 2576 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
2349 BUG_ON(!htlb_fault_mutex_table); 2577 BUG_ON(!hugetlb_fault_mutex_table);
2350 2578
2351 for (i = 0; i < num_fault_mutexes; i++) 2579 for (i = 0; i < num_fault_mutexes; i++)
2352 mutex_init(&htlb_fault_mutex_table[i]); 2580 mutex_init(&hugetlb_fault_mutex_table[i]);
2353 return 0; 2581 return 0;
2354} 2582}
2355module_init(hugetlb_init); 2583module_init(hugetlb_init);
@@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
3147 return page != NULL; 3375 return page != NULL;
3148} 3376}
3149 3377
3378int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
3379 pgoff_t idx)
3380{
3381 struct inode *inode = mapping->host;
3382 struct hstate *h = hstate_inode(inode);
3383 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3384
3385 if (err)
3386 return err;
3387 ClearPagePrivate(page);
3388
3389 spin_lock(&inode->i_lock);
3390 inode->i_blocks += blocks_per_huge_page(h);
3391 spin_unlock(&inode->i_lock);
3392 return 0;
3393}
3394
3150static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 3395static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
3151 struct address_space *mapping, pgoff_t idx, 3396 struct address_space *mapping, pgoff_t idx,
3152 unsigned long address, pte_t *ptep, unsigned int flags) 3397 unsigned long address, pte_t *ptep, unsigned int flags)
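
huge_add_to_page_cache() factors the shared-mapping insertion (page cache add, ClearPagePrivate, i_blocks accounting) out of hugetlb_no_page() and drops the static qualifier, presumably so other hugetlbfs code in this series can populate the page cache directly. A hedged sketch of such a caller (the function name is hypothetical; the helpers used are the ones visible in this patch):

/*
 * Illustration only: allocate a huge page for @idx in a shared mapping
 * and insert it into the page cache with the new helper.
 */
static int example_populate_index(struct vm_area_struct *vma,
                struct address_space *mapping, pgoff_t idx,
                unsigned long addr)
{
        struct page *page;
        int err;

        page = alloc_huge_page(vma, addr, 0);
        if (IS_ERR(page))
                return PTR_ERR(page);

        err = huge_add_to_page_cache(page, mapping, idx);
        if (err) {
                /* -EEXIST means someone else instantiated idx first */
                put_page(page);
                return err;
        }
        return 0;
}
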
@@ -3194,21 +3439,13 @@ retry:
3194 set_page_huge_active(page); 3439 set_page_huge_active(page);
3195 3440
3196 if (vma->vm_flags & VM_MAYSHARE) { 3441 if (vma->vm_flags & VM_MAYSHARE) {
3197 int err; 3442 int err = huge_add_to_page_cache(page, mapping, idx);
3198 struct inode *inode = mapping->host;
3199
3200 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3201 if (err) { 3443 if (err) {
3202 put_page(page); 3444 put_page(page);
3203 if (err == -EEXIST) 3445 if (err == -EEXIST)
3204 goto retry; 3446 goto retry;
3205 goto out; 3447 goto out;
3206 } 3448 }
3207 ClearPagePrivate(page);
3208
3209 spin_lock(&inode->i_lock);
3210 inode->i_blocks += blocks_per_huge_page(h);
3211 spin_unlock(&inode->i_lock);
3212 } else { 3449 } else {
3213 lock_page(page); 3450 lock_page(page);
3214 if (unlikely(anon_vma_prepare(vma))) { 3451 if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3473,14 @@ retry:
3236 * any allocations necessary to record that reservation occur outside 3473 * any allocations necessary to record that reservation occur outside
3237 * the spinlock. 3474 * the spinlock.
3238 */ 3475 */
3239 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 3476 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3240 if (vma_needs_reservation(h, vma, address) < 0) { 3477 if (vma_needs_reservation(h, vma, address) < 0) {
3241 ret = VM_FAULT_OOM; 3478 ret = VM_FAULT_OOM;
3242 goto backout_unlocked; 3479 goto backout_unlocked;
3243 } 3480 }
3481 /* Just decrements count, does not deallocate */
3482 vma_end_reservation(h, vma, address);
3483 }
3244 3484
3245 ptl = huge_pte_lockptr(h, mm, ptep); 3485 ptl = huge_pte_lockptr(h, mm, ptep);
3246 spin_lock(ptl); 3486 spin_lock(ptl);
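
The needs/end pair added here is deliberate: vma_needs_reservation() forces region_chg() to allocate and cache any region descriptor it might need before the page-table spinlock is taken, and vma_end_reservation() immediately drops the outstanding count again without freeing that cache. A sketch of the pattern in isolation (hypothetical wrapper, not part of the patch):

/*
 * Hypothetical wrapper, illustration only: pre-populate the reserve map
 * cache for @addr so a later region_add() cannot need to allocate while
 * a spinlock is held.
 */
static int example_prepare_reservation(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
{
        if (vma_needs_reservation(h, vma, addr) < 0)
                return -ENOMEM;
        /* Just decrements the count, does not deallocate the cache. */
        vma_end_reservation(h, vma, addr);
        return 0;
}
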
@@ -3280,7 +3520,7 @@ backout_unlocked:
3280} 3520}
3281 3521
3282#ifdef CONFIG_SMP 3522#ifdef CONFIG_SMP
3283static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3523u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3284 struct vm_area_struct *vma, 3524 struct vm_area_struct *vma,
3285 struct address_space *mapping, 3525 struct address_space *mapping,
3286 pgoff_t idx, unsigned long address) 3526 pgoff_t idx, unsigned long address)
@@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 3305 * For uniprocessor systems we always use a single mutex, so just 3545 * For uniprocessor systems we always use a single mutex, so just
3306 * return 0 and avoid the hashing overhead. 3546 * return 0 and avoid the hashing overhead.
3307 */ 3547 */
3308static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3548u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3309 struct vm_area_struct *vma, 3549 struct vm_area_struct *vma,
3310 struct address_space *mapping, 3550 struct address_space *mapping,
3311 pgoff_t idx, unsigned long address) 3551 pgoff_t idx, unsigned long address)
@@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3353 * get spurious allocation failures if two CPUs race to instantiate 3593 * get spurious allocation failures if two CPUs race to instantiate
3354 * the same page in the page cache. 3594 * the same page in the page cache.
3355 */ 3595 */
3356 hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); 3596 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
3357 mutex_lock(&htlb_fault_mutex_table[hash]); 3597 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3358 3598
3359 entry = huge_ptep_get(ptep); 3599 entry = huge_ptep_get(ptep);
3360 if (huge_pte_none(entry)) { 3600 if (huge_pte_none(entry)) {
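
With the htlb_ prefix replaced by hugetlb_ and the static qualifier dropped from the hash function, the fault serialisation scheme becomes usable outside mm/hugetlb.c. The comment above gives the reason for the lock: two CPUs racing to instantiate the same page in the page cache would otherwise cause spurious allocation failures. A sketch of the locking pattern a caller follows (hypothetical function; the work under the mutex is left as a placeholder):

/*
 * Illustration only: serialise instantiation of (mapping, idx) the same
 * way hugetlb_fault() does above.
 */
static void example_serialised_instantiate(struct hstate *h,
                struct mm_struct *mm, struct vm_area_struct *vma,
                struct address_space *mapping, pgoff_t idx,
                unsigned long address)
{
        u32 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);

        mutex_lock(&hugetlb_fault_mutex_table[hash]);
        /* ... allocate the page and add it to the page cache for idx ... */
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
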
@@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3387 ret = VM_FAULT_OOM; 3627 ret = VM_FAULT_OOM;
3388 goto out_mutex; 3628 goto out_mutex;
3389 } 3629 }
3630 /* Just decrements count, does not deallocate */
3631 vma_end_reservation(h, vma, address);
3390 3632
3391 if (!(vma->vm_flags & VM_MAYSHARE)) 3633 if (!(vma->vm_flags & VM_MAYSHARE))
3392 pagecache_page = hugetlbfs_pagecache_page(h, 3634 pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3679,7 @@ out_ptl:
3437 put_page(pagecache_page); 3679 put_page(pagecache_page);
3438 } 3680 }
3439out_mutex: 3681out_mutex:
3440 mutex_unlock(&htlb_fault_mutex_table[hash]); 3682 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3441 /* 3683 /*
3442 * Generally it's safe to hold refcount during waiting page lock. But 3684 * Generally it's safe to hold refcount during waiting page lock. But
3443 * here we just wait to defer the next page fault to avoid busy loop and 3685 * here we just wait to defer the next page fault to avoid busy loop and
@@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
3726 } 3968 }
3727 return 0; 3969 return 0;
3728out_err: 3970out_err:
3971 if (!vma || vma->vm_flags & VM_MAYSHARE)
3972 region_abort(resv_map, from, to);
3729 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3973 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3730 kref_put(&resv_map->refs, resv_map_release); 3974 kref_put(&resv_map->refs, resv_map_release);
3731 return ret; 3975 return ret;
3732} 3976}
3733 3977
3734void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3978long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
3979 long freed)
3735{ 3980{
3736 struct hstate *h = hstate_inode(inode); 3981 struct hstate *h = hstate_inode(inode);
3737 struct resv_map *resv_map = inode_resv_map(inode); 3982 struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3739 struct hugepage_subpool *spool = subpool_inode(inode); 3984 struct hugepage_subpool *spool = subpool_inode(inode);
3740 long gbl_reserve; 3985 long gbl_reserve;
3741 3986
3742 if (resv_map) 3987 if (resv_map) {
3743 chg = region_truncate(resv_map, offset); 3988 chg = region_del(resv_map, start, end);
3989 /*
3990 * region_del() can fail in the rare case where a region
3991 * must be split and another region descriptor can not be
3992 * allocated. If end == LONG_MAX, it will not fail.
3993 */
3994 if (chg < 0)
3995 return chg;
3996 }
3997
3744 spin_lock(&inode->i_lock); 3998 spin_lock(&inode->i_lock);
3745 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3999 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3746 spin_unlock(&inode->i_lock); 4000 spin_unlock(&inode->i_lock);
@@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3751 */ 4005 */
3752 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 4006 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
3753 hugetlb_acct_memory(h, -gbl_reserve); 4007 hugetlb_acct_memory(h, -gbl_reserve);
4008
4009 return 0;
3754} 4010}
3755 4011
3756#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 4012#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
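
hugetlb_unreserve_pages() switches from a single truncate offset to an explicit [start, end) range and now reports failure, because region_del() (unlike the old region_truncate()) may have to split a region and can run out of descriptors; per the comment above, end == LONG_MAX cannot fail, which preserves the old truncate semantics. Two hedged caller sketches under that assumption (hypothetical function names):

/*
 * Illustration only: truncation removes everything from 'start' onward
 * and is documented above not to fail, so the return value can be
 * ignored; a hole punch covers a finite range and must be prepared for
 * a negative return from the region-split case.
 */
static void example_truncate(struct inode *inode, pgoff_t start, long freed)
{
        (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}

static long example_hole_punch(struct inode *inode, pgoff_t start,
                               pgoff_t end, long freed)
{
        return hugetlb_unreserve_pages(inode, start, end, freed);
}
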
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bf73ac17dad4..aeba0edd6e44 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -58,7 +58,7 @@ inject:
58 pr_info("Injecting memory failure at pfn %#lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60put_out: 60put_out:
61 put_page(p); 61 put_hwpoison_page(p);
62 return 0; 62 return 0;
63} 63}
64 64
diff --git a/mm/internal.h b/mm/internal.h
index 1195dd2d6a2b..bc0fa9a69e46 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -182,6 +182,7 @@ struct compact_control {
182 unsigned long nr_migratepages; /* Number of pages to migrate */ 182 unsigned long nr_migratepages; /* Number of pages to migrate */
183 unsigned long free_pfn; /* isolate_freepages search base */ 183 unsigned long free_pfn; /* isolate_freepages search base */
184 unsigned long migrate_pfn; /* isolate_migratepages search base */ 184 unsigned long migrate_pfn; /* isolate_migratepages search base */
185 unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
185 enum migrate_mode mode; /* Async or sync migration mode */ 186 enum migrate_mode mode; /* Async or sync migration mode */
186 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 187 bool ignore_skip_hint; /* Scan blocks even if marked skip */
187 int order; /* order a direct compactor needs */ 188 int order; /* order a direct compactor needs */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index cf79f110157c..f532f6a37b55 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
838 } 838 }
839 839
840 if (crt_early_log >= ARRAY_SIZE(early_log)) { 840 if (crt_early_log >= ARRAY_SIZE(early_log)) {
841 crt_early_log++;
841 kmemleak_disable(); 842 kmemleak_disable();
842 return; 843 return;
843 } 844 }
@@ -1882,7 +1883,7 @@ void __init kmemleak_init(void)
1882 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1883 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1883 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1884 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1884 1885
1885 if (crt_early_log >= ARRAY_SIZE(early_log)) 1886 if (crt_early_log > ARRAY_SIZE(early_log))
1886 pr_warning("Early log buffer exceeded (%d), please increase " 1887 pr_warning("Early log buffer exceeded (%d), please increase "
1887 "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); 1888 "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
1888 1889
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 909eca2c820e..e1da19fac1b3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
99 struct list_lru_one *l; 99 struct list_lru_one *l;
100 100
101 spin_lock(&nlru->lock); 101 spin_lock(&nlru->lock);
102 l = list_lru_from_kmem(nlru, item);
103 if (list_empty(item)) { 102 if (list_empty(item)) {
103 l = list_lru_from_kmem(nlru, item);
104 list_add_tail(item, &l->list); 104 list_add_tail(item, &l->list);
105 l->nr_items++; 105 l->nr_items++;
106 spin_unlock(&nlru->lock); 106 spin_unlock(&nlru->lock);
@@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
118 struct list_lru_one *l; 118 struct list_lru_one *l;
119 119
120 spin_lock(&nlru->lock); 120 spin_lock(&nlru->lock);
121 l = list_lru_from_kmem(nlru, item);
122 if (!list_empty(item)) { 121 if (!list_empty(item)) {
122 l = list_lru_from_kmem(nlru, item);
123 list_del_init(item); 123 list_del_init(item);
124 l->nr_items--; 124 l->nr_items--;
125 spin_unlock(&nlru->lock); 125 spin_unlock(&nlru->lock);
diff --git a/mm/madvise.c b/mm/madvise.c
index ce3a4222c7e7..c889fcbb530e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma,
301 301
302 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 302 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
303 303
304 if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) 304 if (vma->vm_flags & VM_LOCKED)
305 return -EINVAL; 305 return -EINVAL;
306 306
307 f = vma->vm_file; 307 f = vma->vm_file;
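
Dropping VM_HUGETLB from this check means madvise(MADV_REMOVE) on a hugetlbfs mapping is no longer rejected outright with -EINVAL; whether the hole punch then succeeds depends on the filesystem-side fallocate support the rest of this series adds. A user-space sketch, with a hypothetical file on a hugetlbfs mount and a 2 MB huge page size assumed:

/* Illustration only: punch a hole in a shared hugetlbfs mapping.
 * "/dev/hugepages/example" and the 2 MB page size are assumptions for
 * the example; length and offset must be multiples of the huge page
 * size actually in use.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)

int main(void)
{
        int fd = open("/dev/hugepages/example", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0 || ftruncate(fd, LEN) < 0) {
                perror("open/ftruncate");
                return EXIT_FAILURE;
        }
        p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }
        p[0] = 1;       /* fault the huge page in */

        /* Rejected with EINVAL before this change; attempted afterwards. */
        if (madvise(p, LEN, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)");

        munmap(p, LEN);
        close(fd);
        return EXIT_SUCCESS;
}
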
diff --git a/mm/memblock.c b/mm/memblock.c
index 95ce68c6da8a..1c7b647e5897 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
91 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 91 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
92} 92}
93 93
94static long __init_memblock memblock_overlaps_region(struct memblock_type *type, 94bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
95 phys_addr_t base, phys_addr_t size) 95 phys_addr_t base, phys_addr_t size)
96{ 96{
97 unsigned long i; 97 unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
103 break; 103 break;
104 } 104 }
105 105
106 return (i < type->cnt) ? i : -1; 106 return i < type->cnt;
107} 107}
108 108
109/* 109/*
@@ -569,6 +569,7 @@ repeat:
569#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 569#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
570 WARN_ON(nid != memblock_get_region_node(rgn)); 570 WARN_ON(nid != memblock_get_region_node(rgn));
571#endif 571#endif
572 WARN_ON(flags != rgn->flags);
572 nr_new++; 573 nr_new++;
573 if (insert) 574 if (insert)
574 memblock_insert_region(type, i++, base, 575 memblock_insert_region(type, i++, base,
@@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
614 int nid, 615 int nid,
615 unsigned long flags) 616 unsigned long flags)
616{ 617{
617 struct memblock_type *_rgn = &memblock.memory; 618 struct memblock_type *type = &memblock.memory;
618 619
619 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", 620 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
620 (unsigned long long)base, 621 (unsigned long long)base,
621 (unsigned long long)base + size - 1, 622 (unsigned long long)base + size - 1,
622 flags, (void *)_RET_IP_); 623 flags, (void *)_RET_IP_);
623 624
624 return memblock_add_range(_rgn, base, size, nid, flags); 625 return memblock_add_range(type, base, size, nid, flags);
625} 626}
626 627
627int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 628int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
761 * 762 *
762 * This function isolates region [@base, @base + @size), and sets/clears flag 763 * This function isolates region [@base, @base + @size), and sets/clears flag
763 * 764 *
764 * Return 0 on succees, -errno on failure. 765 * Return 0 on success, -errno on failure.
765 */ 766 */
766static int __init_memblock memblock_setclr_flag(phys_addr_t base, 767static int __init_memblock memblock_setclr_flag(phys_addr_t base,
767 phys_addr_t size, int set, int flag) 768 phys_addr_t size, int set, int flag)
@@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
788 * @base: the base phys addr of the region 789 * @base: the base phys addr of the region
789 * @size: the size of the region 790 * @size: the size of the region
790 * 791 *
791 * Return 0 on succees, -errno on failure. 792 * Return 0 on success, -errno on failure.
792 */ 793 */
793int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) 794int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
794{ 795{
@@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
800 * @base: the base phys addr of the region 801 * @base: the base phys addr of the region
801 * @size: the size of the region 802 * @size: the size of the region
802 * 803 *
803 * Return 0 on succees, -errno on failure. 804 * Return 0 on success, -errno on failure.
804 */ 805 */
805int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) 806int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
806{ 807{
@@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
812 * @base: the base phys addr of the region 813 * @base: the base phys addr of the region
813 * @size: the size of the region 814 * @size: the size of the region
814 * 815 *
815 * Return 0 on succees, -errno on failure. 816 * Return 0 on success, -errno on failure.
816 */ 817 */
817int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) 818int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
818{ 819{
@@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
834 phys_addr_t *out_start, 835 phys_addr_t *out_start,
835 phys_addr_t *out_end) 836 phys_addr_t *out_end)
836{ 837{
837 struct memblock_type *rsv = &memblock.reserved; 838 struct memblock_type *type = &memblock.reserved;
838 839
839 if (*idx >= 0 && *idx < rsv->cnt) { 840 if (*idx >= 0 && *idx < type->cnt) {
840 struct memblock_region *r = &rsv->regions[*idx]; 841 struct memblock_region *r = &type->regions[*idx];
841 phys_addr_t base = r->base; 842 phys_addr_t base = r->base;
842 phys_addr_t size = r->size; 843 phys_addr_t size = r->size;
843 844
@@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
975 * in type_b. 976 * in type_b.
976 * 977 *
977 * @idx: pointer to u64 loop variable 978 * @idx: pointer to u64 loop variable
978 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes 979 * @nid: node selector, %NUMA_NO_NODE for all nodes
979 * @flags: pick from blocks based on memory attributes 980 * @flags: pick from blocks based on memory attributes
980 * @type_a: pointer to memblock_type from where the range is taken 981 * @type_a: pointer to memblock_type from where the range is taken
981 * @type_b: pointer to memblock_type which excludes memory from being taken 982 * @type_b: pointer to memblock_type which excludes memory from being taken
@@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
1565 * Check if the region [@base, @base+@size) intersects a reserved memory block. 1566 * Check if the region [@base, @base+@size) intersects a reserved memory block.
1566 * 1567 *
1567 * RETURNS: 1568 * RETURNS:
1568 * 0 if false, non-zero if true 1569 * True if they intersect, false if not.
1569 */ 1570 */
1570int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 1571bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
1571{ 1572{
1572 memblock_cap_size(base, &size); 1573 memblock_cap_size(base, &size);
1573 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 1574 return memblock_overlaps_region(&memblock.reserved, base, size);
1574} 1575}
1575 1576
1576void __init_memblock memblock_trim_memory(phys_addr_t align) 1577void __init_memblock memblock_trim_memory(phys_addr_t align)
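
With memblock_overlaps_region() and memblock_is_region_reserved() converted from index/int returns to bool, callers simply test the returned value rather than comparing against -1 or >= 0. A minimal caller sketch under the new interface:

/* Illustration only: the post-patch boolean interface. */
static void __init example_check_range(phys_addr_t base, phys_addr_t size)
{
        if (memblock_is_region_reserved(base, size))
                pr_warn("range %pa..%pa overlaps reserved memory\n",
                        &base, &size);
}
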
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1af057575ce9..1742a2db89c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
111 "unevictable", 111 "unevictable",
112}; 112};
113 113
114/*
115 * Per memcg event counter is incremented at every pagein/pageout. With THP,
116 * it will be incremated by the number of pages. This counter is used for
117 * for trigger some periodic events. This is straightforward and better
118 * than using jiffies etc. to handle periodic memcg event.
119 */
120enum mem_cgroup_events_target {
121 MEM_CGROUP_TARGET_THRESH,
122 MEM_CGROUP_TARGET_SOFTLIMIT,
123 MEM_CGROUP_TARGET_NUMAINFO,
124 MEM_CGROUP_NTARGETS,
125};
126#define THRESHOLDS_EVENTS_TARGET 128 114#define THRESHOLDS_EVENTS_TARGET 128
127#define SOFTLIMIT_EVENTS_TARGET 1024 115#define SOFTLIMIT_EVENTS_TARGET 1024
128#define NUMAINFO_EVENTS_TARGET 1024 116#define NUMAINFO_EVENTS_TARGET 1024
129 117
130struct mem_cgroup_stat_cpu {
131 long count[MEM_CGROUP_STAT_NSTATS];
132 unsigned long events[MEMCG_NR_EVENTS];
133 unsigned long nr_page_events;
134 unsigned long targets[MEM_CGROUP_NTARGETS];
135};
136
137struct reclaim_iter {
138 struct mem_cgroup *position;
139 /* scan generation, increased every round-trip */
140 unsigned int generation;
141};
142
143/*
144 * per-zone information in memory controller.
145 */
146struct mem_cgroup_per_zone {
147 struct lruvec lruvec;
148 unsigned long lru_size[NR_LRU_LISTS];
149
150 struct reclaim_iter iter[DEF_PRIORITY + 1];
151
152 struct rb_node tree_node; /* RB tree node */
153 unsigned long usage_in_excess;/* Set to the value by which */
154 /* the soft limit is exceeded*/
155 bool on_tree;
156 struct mem_cgroup *memcg; /* Back pointer, we cannot */
157 /* use container_of */
158};
159
160struct mem_cgroup_per_node {
161 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
162};
163
164/* 118/*
165 * Cgroups above their limits are maintained in a RB-Tree, independent of 119 * Cgroups above their limits are maintained in a RB-Tree, independent of
166 * their hierarchy representation 120 * their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
181 135
182static struct mem_cgroup_tree soft_limit_tree __read_mostly; 136static struct mem_cgroup_tree soft_limit_tree __read_mostly;
183 137
184struct mem_cgroup_threshold {
185 struct eventfd_ctx *eventfd;
186 unsigned long threshold;
187};
188
189/* For threshold */
190struct mem_cgroup_threshold_ary {
191 /* An array index points to threshold just below or equal to usage. */
192 int current_threshold;
193 /* Size of entries[] */
194 unsigned int size;
195 /* Array of thresholds */
196 struct mem_cgroup_threshold entries[0];
197};
198
199struct mem_cgroup_thresholds {
200 /* Primary thresholds array */
201 struct mem_cgroup_threshold_ary *primary;
202 /*
203 * Spare threshold array.
204 * This is needed to make mem_cgroup_unregister_event() "never fail".
205 * It must be able to store at least primary->size - 1 entries.
206 */
207 struct mem_cgroup_threshold_ary *spare;
208};
209
210/* for OOM */ 138/* for OOM */
211struct mem_cgroup_eventfd_list { 139struct mem_cgroup_eventfd_list {
212 struct list_head list; 140 struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
256static void mem_cgroup_threshold(struct mem_cgroup *memcg); 184static void mem_cgroup_threshold(struct mem_cgroup *memcg);
257static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 185static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
258 186
259/*
260 * The memory controller data structure. The memory controller controls both
261 * page cache and RSS per cgroup. We would eventually like to provide
262 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
263 * to help the administrator determine what knobs to tune.
264 */
265struct mem_cgroup {
266 struct cgroup_subsys_state css;
267
268 /* Accounted resources */
269 struct page_counter memory;
270 struct page_counter memsw;
271 struct page_counter kmem;
272
273 /* Normal memory consumption range */
274 unsigned long low;
275 unsigned long high;
276
277 unsigned long soft_limit;
278
279 /* vmpressure notifications */
280 struct vmpressure vmpressure;
281
282 /* css_online() has been completed */
283 int initialized;
284
285 /*
286 * Should the accounting and control be hierarchical, per subtree?
287 */
288 bool use_hierarchy;
289
290 /* protected by memcg_oom_lock */
291 bool oom_lock;
292 int under_oom;
293
294 int swappiness;
295 /* OOM-Killer disable */
296 int oom_kill_disable;
297
298 /* protect arrays of thresholds */
299 struct mutex thresholds_lock;
300
301 /* thresholds for memory usage. RCU-protected */
302 struct mem_cgroup_thresholds thresholds;
303
304 /* thresholds for mem+swap usage. RCU-protected */
305 struct mem_cgroup_thresholds memsw_thresholds;
306
307 /* For oom notifier event fd */
308 struct list_head oom_notify;
309
310 /*
311 * Should we move charges of a task when a task is moved into this
312 * mem_cgroup ? And what type of charges should we move ?
313 */
314 unsigned long move_charge_at_immigrate;
315 /*
316 * set > 0 if pages under this cgroup are moving to other cgroup.
317 */
318 atomic_t moving_account;
319 /* taken only while moving_account > 0 */
320 spinlock_t move_lock;
321 struct task_struct *move_lock_task;
322 unsigned long move_lock_flags;
323 /*
324 * percpu counter.
325 */
326 struct mem_cgroup_stat_cpu __percpu *stat;
327 spinlock_t pcp_counter_lock;
328
329#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct cg_proto tcp_mem;
331#endif
332#if defined(CONFIG_MEMCG_KMEM)
333 /* Index in the kmem_cache->memcg_params.memcg_caches array */
334 int kmemcg_id;
335 bool kmem_acct_activated;
336 bool kmem_acct_active;
337#endif
338
339 int last_scanned_node;
340#if MAX_NUMNODES > 1
341 nodemask_t scan_nodes;
342 atomic_t numainfo_events;
343 atomic_t numainfo_updating;
344#endif
345
346#ifdef CONFIG_CGROUP_WRITEBACK
347 struct list_head cgwb_list;
348 struct wb_domain cgwb_domain;
349#endif
350
351 /* List of events which userspace want to receive */
352 struct list_head event_list;
353 spinlock_t event_list_lock;
354
355 struct mem_cgroup_per_node *nodeinfo[0];
356 /* WARNING: nodeinfo must be the last member here */
357};
358
359#ifdef CONFIG_MEMCG_KMEM
360bool memcg_kmem_is_active(struct mem_cgroup *memcg)
361{
362 return memcg->kmem_acct_active;
363}
364#endif
365
366/* Stuffs for move charges at task migration. */ 187/* Stuffs for move charges at task migration. */
367/* 188/*
368 * Types of charges to be moved. 189 * Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
423 */ 244 */
424static DEFINE_MUTEX(memcg_create_mutex); 245static DEFINE_MUTEX(memcg_create_mutex);
425 246
426struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
427{
428 return s ? container_of(s, struct mem_cgroup, css) : NULL;
429}
430
431/* Some nice accessors for the vmpressure. */ 247/* Some nice accessors for the vmpressure. */
432struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 248struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
433{ 249{
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
499 rcu_read_lock(); 315 rcu_read_lock();
500 memcg = mem_cgroup_from_task(current); 316 memcg = mem_cgroup_from_task(current);
501 cg_proto = sk->sk_prot->proto_cgroup(memcg); 317 cg_proto = sk->sk_prot->proto_cgroup(memcg);
502 if (!mem_cgroup_is_root(memcg) && 318 if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
503 memcg_proto_active(cg_proto) &&
504 css_tryget_online(&memcg->css)) { 319 css_tryget_online(&memcg->css)) {
505 sk->sk_cgrp = cg_proto; 320 sk->sk_cgrp = cg_proto;
506 } 321 }
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
593 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 408 return &memcg->nodeinfo[nid]->zoneinfo[zid];
594} 409}
595 410
596struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
597{
598 return &memcg->css;
599}
600
601/** 411/**
602 * mem_cgroup_css_from_page - css of the memcg associated with a page 412 * mem_cgroup_css_from_page - css of the memcg associated with a page
603 * @page: page of interest 413 * @page: page of interest
@@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
876 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 686 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
877} 687}
878 688
879unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
880{
881 struct mem_cgroup_per_zone *mz;
882
883 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
884 return mz->lru_size[lru];
885}
886
887static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 689static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
888 int nid, 690 int nid,
889 unsigned int lru_mask) 691 unsigned int lru_mask)
@@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
986 788
987 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 789 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
988} 790}
791EXPORT_SYMBOL(mem_cgroup_from_task);
989 792
990static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 793static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
991{ 794{
@@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1031 struct mem_cgroup *prev, 834 struct mem_cgroup *prev,
1032 struct mem_cgroup_reclaim_cookie *reclaim) 835 struct mem_cgroup_reclaim_cookie *reclaim)
1033{ 836{
1034 struct reclaim_iter *uninitialized_var(iter); 837 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1035 struct cgroup_subsys_state *css = NULL; 838 struct cgroup_subsys_state *css = NULL;
1036 struct mem_cgroup *memcg = NULL; 839 struct mem_cgroup *memcg = NULL;
1037 struct mem_cgroup *pos = NULL; 840 struct mem_cgroup *pos = NULL;
@@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1173 iter != NULL; \ 976 iter != NULL; \
1174 iter = mem_cgroup_iter(NULL, iter, NULL)) 977 iter = mem_cgroup_iter(NULL, iter, NULL))
1175 978
1176void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1177{
1178 struct mem_cgroup *memcg;
1179
1180 rcu_read_lock();
1181 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1182 if (unlikely(!memcg))
1183 goto out;
1184
1185 switch (idx) {
1186 case PGFAULT:
1187 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1188 break;
1189 case PGMAJFAULT:
1190 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1191 break;
1192 default:
1193 BUG();
1194 }
1195out:
1196 rcu_read_unlock();
1197}
1198EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1199
1200/** 979/**
1201 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 980 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1202 * @zone: zone of the wanted lruvec 981 * @zone: zone of the wanted lruvec
@@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1295 VM_BUG_ON((long)(*lru_size) < 0); 1074 VM_BUG_ON((long)(*lru_size) < 0);
1296} 1075}
1297 1076
1298bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
1299{
1300 if (root == memcg)
1301 return true;
1302 if (!root->use_hierarchy)
1303 return false;
1304 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
1305}
1306
1307bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1077bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1308{ 1078{
1309 struct mem_cgroup *task_memcg; 1079 struct mem_cgroup *task_memcg;
@@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1330 return ret; 1100 return ret;
1331} 1101}
1332 1102
1333int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1334{
1335 unsigned long inactive_ratio;
1336 unsigned long inactive;
1337 unsigned long active;
1338 unsigned long gb;
1339
1340 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1341 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1342
1343 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1344 if (gb)
1345 inactive_ratio = int_sqrt(10 * gb);
1346 else
1347 inactive_ratio = 1;
1348
1349 return inactive * inactive_ratio < active;
1350}
1351
1352bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1353{
1354 struct mem_cgroup_per_zone *mz;
1355 struct mem_cgroup *memcg;
1356
1357 if (mem_cgroup_disabled())
1358 return true;
1359
1360 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1361 memcg = mz->memcg;
1362
1363 return !!(memcg->css.flags & CSS_ONLINE);
1364}
1365
1366#define mem_cgroup_from_counter(counter, member) \ 1103#define mem_cgroup_from_counter(counter, member) \
1367 container_of(counter, struct mem_cgroup, member) 1104 container_of(counter, struct mem_cgroup, member)
1368 1105
@@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1394 return margin; 1131 return margin;
1395} 1132}
1396 1133
1397int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1398{
1399 /* root ? */
1400 if (mem_cgroup_disabled() || !memcg->css.parent)
1401 return vm_swappiness;
1402
1403 return memcg->swappiness;
1404}
1405
1406/* 1134/*
1407 * A routine for checking "mem" is under move_account() or not. 1135 * A routine for checking "mem" is under move_account() or not.
1408 * 1136 *
@@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1545static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1273static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1546 int order) 1274 int order)
1547{ 1275{
1276 struct oom_control oc = {
1277 .zonelist = NULL,
1278 .nodemask = NULL,
1279 .gfp_mask = gfp_mask,
1280 .order = order,
1281 };
1548 struct mem_cgroup *iter; 1282 struct mem_cgroup *iter;
1549 unsigned long chosen_points = 0; 1283 unsigned long chosen_points = 0;
1550 unsigned long totalpages; 1284 unsigned long totalpages;
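
struct oom_control gathers the allocation context (zonelist, nodemask, gfp_mask, order) that the OOM helpers previously took as separate arguments; the memcg path leaves zonelist and nodemask NULL because the constraint is the cgroup rather than a zone. A condensed sketch of the post-patch calling convention, mirroring this hunk and the two below (hypothetical wrapper name; the real logic stays in mem_cgroup_out_of_memory()):

/* Illustration only: how the OOM helpers are invoked once they take a
 * struct oom_control instead of loose gfp/order/nodemask arguments.
 */
static void example_memcg_oom(struct mem_cgroup *memcg, gfp_t gfp_mask,
                              int order, struct task_struct *chosen,
                              unsigned int points, unsigned long totalpages)
{
        struct oom_control oc = {
                .zonelist = NULL,       /* constrained by the memcg, not a zone */
                .nodemask = NULL,
                .gfp_mask = gfp_mask,
                .order = order,
        };

        check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
        oom_kill_process(&oc, chosen, points, totalpages, memcg,
                         "Memory cgroup out of memory");
}
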
@@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1563 goto unlock; 1297 goto unlock;
1564 } 1298 }
1565 1299
1566 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); 1300 check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
1567 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1301 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1568 for_each_mem_cgroup_tree(iter, memcg) { 1302 for_each_mem_cgroup_tree(iter, memcg) {
1569 struct css_task_iter it; 1303 struct css_task_iter it;
@@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1571 1305
1572 css_task_iter_start(&iter->css, &it); 1306 css_task_iter_start(&iter->css, &it);
1573 while ((task = css_task_iter_next(&it))) { 1307 while ((task = css_task_iter_next(&it))) {
1574 switch (oom_scan_process_thread(task, totalpages, NULL, 1308 switch (oom_scan_process_thread(&oc, task, totalpages)) {
1575 false)) {
1576 case OOM_SCAN_SELECT: 1309 case OOM_SCAN_SELECT:
1577 if (chosen) 1310 if (chosen)
1578 put_task_struct(chosen); 1311 put_task_struct(chosen);
@@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1610 1343
1611 if (chosen) { 1344 if (chosen) {
1612 points = chosen_points * 1000 / totalpages; 1345 points = chosen_points * 1000 / totalpages;
1613 oom_kill_process(chosen, gfp_mask, order, points, totalpages, 1346 oom_kill_process(&oc, chosen, points, totalpages, memcg,
1614 memcg, NULL, "Memory cgroup out of memory"); 1347 "Memory cgroup out of memory");
1615 } 1348 }
1616unlock: 1349unlock:
1617 mutex_unlock(&oom_lock); 1350 mutex_unlock(&oom_lock);
@@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2062} 1795}
2063EXPORT_SYMBOL(mem_cgroup_end_page_stat); 1796EXPORT_SYMBOL(mem_cgroup_end_page_stat);
2064 1797
2065/**
2066 * mem_cgroup_update_page_stat - update page state statistics
2067 * @memcg: memcg to account against
2068 * @idx: page state item to account
2069 * @val: number of pages (positive or negative)
2070 *
2071 * See mem_cgroup_begin_page_stat() for locking requirements.
2072 */
2073void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2074 enum mem_cgroup_stat_index idx, int val)
2075{
2076 VM_BUG_ON(!rcu_read_lock_held());
2077
2078 if (memcg)
2079 this_cpu_add(memcg->stat->count[idx], val);
2080}
2081
2082/* 1798/*
2083 * size of first charge trial. "32" comes from vmscan.c's magic value. 1799 * size of first charge trial. "32" comes from vmscan.c's magic value.
2084 * TODO: maybe necessary to use big numbers in big irons. 1800 * TODO: maybe necessary to use big numbers in big irons.
@@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2504 css_put_many(&memcg->css, nr_pages); 2220 css_put_many(&memcg->css, nr_pages);
2505} 2221}
2506 2222
2507/*
2508 * helper for acessing a memcg's index. It will be used as an index in the
2509 * child cache array in kmem_cache, and also to derive its name. This function
2510 * will return -1 when this is not a kmem-limited memcg.
2511 */
2512int memcg_cache_id(struct mem_cgroup *memcg)
2513{
2514 return memcg ? memcg->kmemcg_id : -1;
2515}
2516
2517static int memcg_alloc_cache_id(void) 2223static int memcg_alloc_cache_id(void)
2518{ 2224{
2519 int id, size; 2225 int id, size;
@@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void)
5127static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 4833static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5128 struct cgroup_taskset *tset) 4834 struct cgroup_taskset *tset)
5129{ 4835{
5130 struct task_struct *p = cgroup_taskset_first(tset);
5131 int ret = 0;
5132 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4836 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4837 struct mem_cgroup *from;
4838 struct task_struct *p;
4839 struct mm_struct *mm;
5133 unsigned long move_flags; 4840 unsigned long move_flags;
4841 int ret = 0;
5134 4842
5135 /* 4843 /*
5136 * We are now commited to this value whatever it is. Changes in this 4844 * We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5138 * So we need to save it, and keep it going. 4846 * So we need to save it, and keep it going.
5139 */ 4847 */
5140 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 4848 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5141 if (move_flags) { 4849 if (!move_flags)
5142 struct mm_struct *mm; 4850 return 0;
5143 struct mem_cgroup *from = mem_cgroup_from_task(p);
5144 4851
5145 VM_BUG_ON(from == memcg); 4852 p = cgroup_taskset_first(tset);
4853 from = mem_cgroup_from_task(p);
5146 4854
5147 mm = get_task_mm(p); 4855 VM_BUG_ON(from == memcg);
5148 if (!mm) 4856
5149 return 0; 4857 mm = get_task_mm(p);
5150 /* We move charges only when we move a owner of the mm */ 4858 if (!mm)
5151 if (mm->owner == p) { 4859 return 0;
5152 VM_BUG_ON(mc.from); 4860 /* We move charges only when we move a owner of the mm */
5153 VM_BUG_ON(mc.to); 4861 if (mm->owner == p) {
5154 VM_BUG_ON(mc.precharge); 4862 VM_BUG_ON(mc.from);
5155 VM_BUG_ON(mc.moved_charge); 4863 VM_BUG_ON(mc.to);
5156 VM_BUG_ON(mc.moved_swap); 4864 VM_BUG_ON(mc.precharge);
5157 4865 VM_BUG_ON(mc.moved_charge);
5158 spin_lock(&mc.lock); 4866 VM_BUG_ON(mc.moved_swap);
5159 mc.from = from; 4867
5160 mc.to = memcg; 4868 spin_lock(&mc.lock);
5161 mc.flags = move_flags; 4869 mc.from = from;
5162 spin_unlock(&mc.lock); 4870 mc.to = memcg;
5163 /* We set mc.moving_task later */ 4871 mc.flags = move_flags;
5164 4872 spin_unlock(&mc.lock);
5165 ret = mem_cgroup_precharge_mc(mm); 4873 /* We set mc.moving_task later */
5166 if (ret) 4874
5167 mem_cgroup_clear_mc(); 4875 ret = mem_cgroup_precharge_mc(mm);
5168 } 4876 if (ret)
5169 mmput(mm); 4877 mem_cgroup_clear_mc();
5170 } 4878 }
4879 mmput(mm);
5171 return ret; 4880 return ret;
5172} 4881}
5173 4882
@@ -5521,19 +5230,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
5521}; 5230};
5522 5231
5523/** 5232/**
5524 * mem_cgroup_events - count memory events against a cgroup
5525 * @memcg: the memory cgroup
5526 * @idx: the event index
5527 * @nr: the number of events to account for
5528 */
5529void mem_cgroup_events(struct mem_cgroup *memcg,
5530 enum mem_cgroup_events_index idx,
5531 unsigned int nr)
5532{
5533 this_cpu_add(memcg->stat->events[idx], nr);
5534}
5535
5536/**
5537 * mem_cgroup_low - check if memory consumption is below the normal range 5233 * mem_cgroup_low - check if memory consumption is below the normal range
5538 * @root: the highest ancestor to consider 5234 * @root: the highest ancestor to consider
5539 * @memcg: the memory cgroup to check 5235 * @memcg: the memory cgroup to check
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a90cef..eeda6485e76c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
146 if (!mem) 146 if (!mem)
147 return -EINVAL; 147 return -EINVAL;
148 148
149 css = mem_cgroup_css(mem); 149 css = &mem->css;
150 ino = cgroup_ino(css->cgroup); 150 ino = cgroup_ino(css->cgroup);
151 css_put(css); 151 css_put(css);
152 152
@@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page)
934} 934}
935EXPORT_SYMBOL_GPL(get_hwpoison_page); 935EXPORT_SYMBOL_GPL(get_hwpoison_page);
936 936
937/**
938 * put_hwpoison_page() - Put refcount for memory error handling:
939 * @page: raw error page (hit by memory error)
940 */
941void put_hwpoison_page(struct page *page)
942{
943 struct page *head = compound_head(page);
944
945 if (PageHuge(head)) {
946 put_page(head);
947 return;
948 }
949
950 if (PageTransHuge(head))
951 if (page != head)
952 put_page(head);
953
954 put_page(page);
955}
956EXPORT_SYMBOL_GPL(put_hwpoison_page);
957
937/* 958/*
938 * Do all that is necessary to remove user space mappings. Unmap 959 * Do all that is necessary to remove user space mappings. Unmap
939 * the pages and send SIGBUS to the processes if the data was dirty. 960 * the pages and send SIGBUS to the processes if the data was dirty.
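
put_hwpoison_page() gives every caller one place that knows how the reference obtained by get_hwpoison_page() must be dropped: on the head page for hugetlb, with an extra head-page put for THP tail pages, and as a plain put_page() otherwise. A sketch of the intended pairing (hypothetical caller; get_hwpoison_page() is assumed to return non-zero when it pins the page):

/*
 * Illustration only: pair get_hwpoison_page()/put_hwpoison_page() around
 * whatever inspection or isolation the memory-failure code performs.
 */
static int example_inspect_page(struct page *p)
{
        if (!get_hwpoison_page(p))
                return -EIO;            /* could not pin the page */

        /* ... examine or isolate the page ... */

        put_hwpoison_page(p);           /* drops the reference taken above */
        return 0;
}
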
@@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1100 nr_pages = 1 << compound_order(hpage); 1121 nr_pages = 1 << compound_order(hpage);
1101 else /* normal page or thp */ 1122 else /* normal page or thp */
1102 nr_pages = 1; 1123 nr_pages = 1;
1103 atomic_long_add(nr_pages, &num_poisoned_pages); 1124 num_poisoned_pages_add(nr_pages);
1104 1125
1105 /* 1126 /*
1106 * We need/can do nothing about count=0 pages. 1127 * We need/can do nothing about count=0 pages.
@@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1128 if (PageHWPoison(hpage)) { 1149 if (PageHWPoison(hpage)) {
1129 if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) 1150 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1130 || (p != hpage && TestSetPageHWPoison(hpage))) { 1151 || (p != hpage && TestSetPageHWPoison(hpage))) {
1131 atomic_long_sub(nr_pages, &num_poisoned_pages); 1152 num_poisoned_pages_sub(nr_pages);
1132 unlock_page(hpage); 1153 unlock_page(hpage);
1133 return 0; 1154 return 0;
1134 } 1155 }
@@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1152 else 1173 else
1153 pr_err("MCE: %#lx: thp split failed\n", pfn); 1174 pr_err("MCE: %#lx: thp split failed\n", pfn);
1154 if (TestClearPageHWPoison(p)) 1175 if (TestClearPageHWPoison(p))
1155 atomic_long_sub(nr_pages, &num_poisoned_pages); 1176 num_poisoned_pages_sub(nr_pages);
1156 put_page(p); 1177 put_hwpoison_page(p);
1157 if (p != hpage)
1158 put_page(hpage);
1159 return -EBUSY; 1178 return -EBUSY;
1160 } 1179 }
1161 VM_BUG_ON_PAGE(!page_count(p), p); 1180 VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1214 */ 1233 */
1215 if (!PageHWPoison(p)) { 1234 if (!PageHWPoison(p)) {
1216 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 1235 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1217 atomic_long_sub(nr_pages, &num_poisoned_pages); 1236 num_poisoned_pages_sub(nr_pages);
1218 unlock_page(hpage); 1237 unlock_page(hpage);
1219 put_page(hpage); 1238 put_hwpoison_page(hpage);
1220 return 0; 1239 return 0;
1221 } 1240 }
1222 if (hwpoison_filter(p)) { 1241 if (hwpoison_filter(p)) {
1223 if (TestClearPageHWPoison(p)) 1242 if (TestClearPageHWPoison(p))
1224 atomic_long_sub(nr_pages, &num_poisoned_pages); 1243 num_poisoned_pages_sub(nr_pages);
1225 unlock_page(hpage); 1244 unlock_page(hpage);
1226 put_page(hpage); 1245 put_hwpoison_page(hpage);
1227 return 0; 1246 return 0;
1228 } 1247 }
1229 1248
@@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1237 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1256 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1238 action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); 1257 action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
1239 unlock_page(hpage); 1258 unlock_page(hpage);
1240 put_page(hpage); 1259 put_hwpoison_page(hpage);
1241 return 0; 1260 return 0;
1242 } 1261 }
1243 /* 1262 /*
@@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn)
1426 return 0; 1445 return 0;
1427 } 1446 }
1428 1447
1448 if (page_count(page) > 1) {
1449 pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
1450 return 0;
1451 }
1452
1453 if (page_mapped(page)) {
1454 pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
1455 return 0;
1456 }
1457
1458 if (page_mapping(page)) {
1459 pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
1460 pfn);
1461 return 0;
1462 }
1463
1429 /* 1464 /*
1430 * unpoison_memory() can encounter thp only when the thp is being 1465 * unpoison_memory() can encounter thp only when the thp is being
1431 * worked by memory_failure() and the page lock is not held yet. 1466 * worked by memory_failure() and the page lock is not held yet.
@@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn)
1450 return 0; 1485 return 0;
1451 } 1486 }
1452 if (TestClearPageHWPoison(p)) 1487 if (TestClearPageHWPoison(p))
1453 atomic_long_dec(&num_poisoned_pages); 1488 num_poisoned_pages_dec();
1454 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1489 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1455 return 0; 1490 return 0;
1456 } 1491 }
@@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn)
1464 */ 1499 */
1465 if (TestClearPageHWPoison(page)) { 1500 if (TestClearPageHWPoison(page)) {
1466 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1501 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1467 atomic_long_sub(nr_pages, &num_poisoned_pages); 1502 num_poisoned_pages_sub(nr_pages);
1468 freeit = 1; 1503 freeit = 1;
1469 if (PageHuge(page)) 1504 if (PageHuge(page))
1470 clear_page_hwpoison_huge_page(page); 1505 clear_page_hwpoison_huge_page(page);
1471 } 1506 }
1472 unlock_page(page); 1507 unlock_page(page);
1473 1508
1474 put_page(page); 1509 put_hwpoison_page(page);
1475 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) 1510 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1476 put_page(page); 1511 put_hwpoison_page(page);
1477 1512
1478 return 0; 1513 return 0;
1479} 1514}
@@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
1486 return alloc_huge_page_node(page_hstate(compound_head(p)), 1521 return alloc_huge_page_node(page_hstate(compound_head(p)),
1487 nid); 1522 nid);
1488 else 1523 else
1489 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1524 return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1490} 1525}
1491 1526
1492/* 1527/*
@@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
1533 /* 1568 /*
1534 * Try to free it. 1569 * Try to free it.
1535 */ 1570 */
1536 put_page(page); 1571 put_hwpoison_page(page);
1537 shake_page(page, 1); 1572 shake_page(page, 1);
1538 1573
1539 /* 1574 /*
@@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
1542 ret = __get_any_page(page, pfn, 0); 1577 ret = __get_any_page(page, pfn, 0);
1543 if (!PageLRU(page)) { 1578 if (!PageLRU(page)) {
1544 /* Drop page reference which is from __get_any_page() */ 1579 /* Drop page reference which is from __get_any_page() */
1545 put_page(page); 1580 put_hwpoison_page(page);
1546 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1581 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1547 pfn, page->flags); 1582 pfn, page->flags);
1548 return -EIO; 1583 return -EIO;
@@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1565 lock_page(hpage); 1600 lock_page(hpage);
1566 if (PageHWPoison(hpage)) { 1601 if (PageHWPoison(hpage)) {
1567 unlock_page(hpage); 1602 unlock_page(hpage);
1568 put_page(hpage); 1603 put_hwpoison_page(hpage);
1569 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1604 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1570 return -EBUSY; 1605 return -EBUSY;
1571 } 1606 }
@@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1576 * get_any_page() and isolate_huge_page() takes a refcount each, 1611 * get_any_page() and isolate_huge_page() takes a refcount each,
1577 * so need to drop one here. 1612 * so need to drop one here.
1578 */ 1613 */
1579 put_page(hpage); 1614 put_hwpoison_page(hpage);
1580 if (!ret) { 1615 if (!ret) {
1581 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); 1616 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1582 return -EBUSY; 1617 return -EBUSY;
@@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1600 if (PageHuge(page)) { 1635 if (PageHuge(page)) {
1601 set_page_hwpoison_huge_page(hpage); 1636 set_page_hwpoison_huge_page(hpage);
1602 dequeue_hwpoisoned_huge_page(hpage); 1637 dequeue_hwpoisoned_huge_page(hpage);
1603 atomic_long_add(1 << compound_order(hpage), 1638 num_poisoned_pages_add(1 << compound_order(hpage));
1604 &num_poisoned_pages);
1605 } else { 1639 } else {
1606 SetPageHWPoison(page); 1640 SetPageHWPoison(page);
1607 atomic_long_inc(&num_poisoned_pages); 1641 num_poisoned_pages_inc();
1608 } 1642 }
1609 } 1643 }
1610 return ret; 1644 return ret;
@@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags)
1625 wait_on_page_writeback(page); 1659 wait_on_page_writeback(page);
1626 if (PageHWPoison(page)) { 1660 if (PageHWPoison(page)) {
1627 unlock_page(page); 1661 unlock_page(page);
1628 put_page(page); 1662 put_hwpoison_page(page);
1629 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1663 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1630 return -EBUSY; 1664 return -EBUSY;
1631 } 1665 }
@@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags)
1640 * would need to fix isolation locking first. 1674 * would need to fix isolation locking first.
1641 */ 1675 */
1642 if (ret == 1) { 1676 if (ret == 1) {
1643 put_page(page); 1677 put_hwpoison_page(page);
1644 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1678 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1645 SetPageHWPoison(page); 1679 SetPageHWPoison(page);
1646 atomic_long_inc(&num_poisoned_pages); 1680 num_poisoned_pages_inc();
1647 return 0; 1681 return 0;
1648 } 1682 }
1649 1683
@@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags)
1657 * Drop page reference which is came from get_any_page() 1691 * Drop page reference which is came from get_any_page()
1658 * successful isolate_lru_page() already took another one. 1692 * successful isolate_lru_page() already took another one.
1659 */ 1693 */
1660 put_page(page); 1694 put_hwpoison_page(page);
1661 if (!ret) { 1695 if (!ret) {
1662 LIST_HEAD(pagelist); 1696 LIST_HEAD(pagelist);
1663 inc_zone_page_state(page, NR_ISOLATED_ANON + 1697 inc_zone_page_state(page, NR_ISOLATED_ANON +
1664 page_is_file_cache(page)); 1698 page_is_file_cache(page));
1665 list_add(&page->lru, &pagelist); 1699 list_add(&page->lru, &pagelist);
1666 if (!TestSetPageHWPoison(page))
1667 atomic_long_inc(&num_poisoned_pages);
1668 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1700 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1669 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1701 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1670 if (ret) { 1702 if (ret) {
@@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags)
1679 pfn, ret, page->flags); 1711 pfn, ret, page->flags);
1680 if (ret > 0) 1712 if (ret > 0)
1681 ret = -EIO; 1713 ret = -EIO;
1682 if (TestClearPageHWPoison(page))
1683 atomic_long_dec(&num_poisoned_pages);
1684 } 1714 }
1685 } else { 1715 } else {
1686 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1716 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags)
1719 1749
1720 if (PageHWPoison(page)) { 1750 if (PageHWPoison(page)) {
1721 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1751 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1752 if (flags & MF_COUNT_INCREASED)
1753 put_hwpoison_page(page);
1722 return -EBUSY; 1754 return -EBUSY;
1723 } 1755 }
1724 if (!PageHuge(page) && PageTransHuge(hpage)) { 1756 if (!PageHuge(page) && PageTransHuge(hpage)) {
1725 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1757 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1726 pr_info("soft offline: %#lx: failed to split THP\n", 1758 pr_info("soft offline: %#lx: failed to split THP\n",
1727 pfn); 1759 pfn);
1760 if (flags & MF_COUNT_INCREASED)
1761 put_hwpoison_page(page);
1728 return -EBUSY; 1762 return -EBUSY;
1729 } 1763 }
1730 } 1764 }
@@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags)
1742 if (PageHuge(page)) { 1776 if (PageHuge(page)) {
1743 set_page_hwpoison_huge_page(hpage); 1777 set_page_hwpoison_huge_page(hpage);
1744 if (!dequeue_hwpoisoned_huge_page(hpage)) 1778 if (!dequeue_hwpoisoned_huge_page(hpage))
1745 atomic_long_add(1 << compound_order(hpage), 1779 num_poisoned_pages_add(1 << compound_order(hpage));
1746 &num_poisoned_pages);
1747 } else { 1780 } else {
1748 if (!TestSetPageHWPoison(page)) 1781 if (!TestSetPageHWPoison(page))
1749 atomic_long_inc(&num_poisoned_pages); 1782 num_poisoned_pages_inc();
1750 } 1783 }
1751 } 1784 }
1752 return ret; 1785 return ret;
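For reference, put_hwpoison_page() and the num_poisoned_pages_*() accounting helpers used in the hunks above come from elsewhere in this series. A minimal sketch of the counter wrappers, assuming they do nothing beyond centralising the existing atomic updates:

/* Sketch only; the real definitions live in include/linux/mm.h. */
extern atomic_long_t num_poisoned_pages __read_mostly;

static inline void num_poisoned_pages_inc(void)
{
	atomic_long_inc(&num_poisoned_pages);
}

static inline void num_poisoned_pages_add(long num)
{
	atomic_long_add(num, &num_poisoned_pages);
}

Centralising the updates is what lets the hunks above drop the open-coded atomic_long_inc()/atomic_long_add() calls on num_poisoned_pages.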
diff --git a/mm/memory.c b/mm/memory.c
index bb04d8f2f86c..6cd0b2160401 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
2426 if (details.last_index < details.first_index) 2426 if (details.last_index < details.first_index)
2427 details.last_index = ULONG_MAX; 2427 details.last_index = ULONG_MAX;
2428 2428
2429
2430 /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
2431 i_mmap_lock_write(mapping); 2429 i_mmap_lock_write(mapping);
2432 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2430 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2433 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2431 unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3015 } else { 3013 } else {
3016 /* 3014 /*
3017 * The fault handler has no page to lock, so it holds 3015 * The fault handler has no page to lock, so it holds
3018 * i_mmap_lock for read to protect against truncate. 3016 * i_mmap_lock for write to protect against truncate.
3019 */ 3017 */
3020 i_mmap_unlock_read(vma->vm_file->f_mapping); 3018 i_mmap_unlock_write(vma->vm_file->f_mapping);
3021 } 3019 }
3022 goto uncharge_out; 3020 goto uncharge_out;
3023 } 3021 }
@@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3031 } else { 3029 } else {
3032 /* 3030 /*
3033 * The fault handler has no page to lock, so it holds 3031 * The fault handler has no page to lock, so it holds
3034 * i_mmap_lock for read to protect against truncate. 3032 * i_mmap_lock for write to protect against truncate.
3035 */ 3033 */
3036 i_mmap_unlock_read(vma->vm_file->f_mapping); 3034 i_mmap_unlock_write(vma->vm_file->f_mapping);
3037 } 3035 }
3038 return ret; 3036 return ret;
3039uncharge_out: 3037uncharge_out:
@@ -3232,6 +3230,27 @@ out:
3232 return 0; 3230 return 0;
3233} 3231}
3234 3232
3233static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3234 unsigned long address, pmd_t *pmd, unsigned int flags)
3235{
3236 if (!vma->vm_ops)
3237 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
3238 if (vma->vm_ops->pmd_fault)
3239 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
3240 return VM_FAULT_FALLBACK;
3241}
3242
3243static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3244 unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
3245 unsigned int flags)
3246{
3247 if (!vma->vm_ops)
3248 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
3249 if (vma->vm_ops->pmd_fault)
3250 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
3251 return VM_FAULT_FALLBACK;
3252}
3253
3235/* 3254/*
3236 * These routines also need to handle stuff like marking pages dirty 3255 * These routines also need to handle stuff like marking pages dirty
3237 * and/or accessed for architectures that don't do it in hardware (most 3256 * and/or accessed for architectures that don't do it in hardware (most
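The new create_huge_pmd()/wp_huge_pmd() helpers route huge-PMD faults either to the anonymous-THP code or to a ->pmd_fault handler supplied by the backing file or driver. A hypothetical handler wired up for that hook could look like the following (illustrative only; example_pmd_fault and example_vm_ops are not part of this patch):

/* Hypothetical hook matching the ->pmd_fault signature dispatched above. */
static int example_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			     pmd_t *pmd, unsigned int flags)
{
	/*
	 * Try to install a PMD-sized mapping here; returning
	 * VM_FAULT_FALLBACK tells the core to retry with PTE-sized faults.
	 */
	return VM_FAULT_FALLBACK;
}

static const struct vm_operations_struct example_vm_ops = {
	/* A real user would also provide .fault for the non-huge path. */
	.pmd_fault	= example_pmd_fault,
};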
@@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
3267 barrier(); 3286 barrier();
3268 if (!pte_present(entry)) { 3287 if (!pte_present(entry)) {
3269 if (pte_none(entry)) { 3288 if (pte_none(entry)) {
3270 if (vma->vm_ops) 3289 if (vma_is_anonymous(vma))
3290 return do_anonymous_page(mm, vma, address,
3291 pte, pmd, flags);
3292 else
3271 return do_fault(mm, vma, address, pte, pmd, 3293 return do_fault(mm, vma, address, pte, pmd,
3272 flags, entry); 3294 flags, entry);
3273
3274 return do_anonymous_page(mm, vma, address, pte, pmd,
3275 flags);
3276 } 3295 }
3277 return do_swap_page(mm, vma, address, 3296 return do_swap_page(mm, vma, address,
3278 pte, pmd, flags, entry); 3297 pte, pmd, flags, entry);
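vma_is_anonymous() is introduced in include/linux/mm.h earlier in this series; assuming it is the obvious predicate, it reads:

static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
	return !vma->vm_ops;
}

which is why the !vma->vm_ops test here simply folds into the new helper; the mm/mmap.c hunks below switch their !vma->vm_file checks to the same predicate.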
@@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3334 if (!pmd) 3353 if (!pmd)
3335 return VM_FAULT_OOM; 3354 return VM_FAULT_OOM;
3336 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3355 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3337 int ret = VM_FAULT_FALLBACK; 3356 int ret = create_huge_pmd(mm, vma, address, pmd, flags);
3338 if (!vma->vm_ops)
3339 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3340 pmd, flags);
3341 if (!(ret & VM_FAULT_FALLBACK)) 3357 if (!(ret & VM_FAULT_FALLBACK))
3342 return ret; 3358 return ret;
3343 } else { 3359 } else {
@@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3361 orig_pmd, pmd); 3377 orig_pmd, pmd);
3362 3378
3363 if (dirty && !pmd_write(orig_pmd)) { 3379 if (dirty && !pmd_write(orig_pmd)) {
3364 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3380 ret = wp_huge_pmd(mm, vma, address, pmd,
3365 orig_pmd); 3381 orig_pmd, flags);
3366 if (!(ret & VM_FAULT_FALLBACK)) 3382 if (!(ret & VM_FAULT_FALLBACK))
3367 return ret; 3383 return ret;
3368 } else { 3384 } else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a7f1e0d1d6b8..87a177917cb2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
608 608
609 qp->prev = vma; 609 qp->prev = vma;
610 610
611 if (vma->vm_flags & VM_PFNMAP)
612 return 1;
613
614 if (flags & MPOL_MF_LAZY) { 611 if (flags & MPOL_MF_LAZY) {
615 /* Similar to task_numa_work, skip inaccessible VMAs */ 612 /* Similar to task_numa_work, skip inaccessible VMAs */
616 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 613 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
945 return alloc_huge_page_node(page_hstate(compound_head(page)), 942 return alloc_huge_page_node(page_hstate(compound_head(page)),
946 node); 943 node);
947 else 944 else
948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | 945 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
949 __GFP_THISNODE, 0); 946 __GFP_THISNODE, 0);
950} 947}
951 948
@@ -2001,7 +1998,7 @@ retry_cpuset:
2001 nmask = policy_nodemask(gfp, pol); 1998 nmask = policy_nodemask(gfp, pol);
2002 if (!nmask || node_isset(hpage_node, *nmask)) { 1999 if (!nmask || node_isset(hpage_node, *nmask)) {
2003 mpol_cond_put(pol); 2000 mpol_cond_put(pol);
2004 page = alloc_pages_exact_node(hpage_node, 2001 page = __alloc_pages_node(hpage_node,
2005 gfp | __GFP_THISNODE, order); 2002 gfp | __GFP_THISNODE, order);
2006 goto out; 2003 goto out;
2007 } 2004 }
diff --git a/mm/mempool.c b/mm/mempool.c
index 2cc08de8b1db..4c533bc51d73 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
150 */ 150 */
151void mempool_destroy(mempool_t *pool) 151void mempool_destroy(mempool_t *pool)
152{ 152{
153 if (unlikely(!pool))
154 return;
155
153 while (pool->curr_nr) { 156 while (pool->curr_nr) {
154 void *element = remove_element(pool); 157 void *element = remove_element(pool);
155 pool->free(element, pool->pool_data); 158 pool->free(element, pool->pool_data);
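With the NULL check above, mempool_destroy(NULL) becomes a no-op, mirroring kfree(NULL) and letting callers drop their own guards. A hypothetical error path (example_dev and its pool field are illustrative names only):

static void example_teardown(struct example_dev *dev)
{
	/* Safe even if the pool was never allocated. */
	mempool_destroy(dev->pool);
	kfree(dev);
}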
diff --git a/mm/memtest.c b/mm/memtest.c
index 0a1cc133f6d7..8eaa4c3a5f65 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -1,11 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h> 2#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h> 3#include <linux/init.h>
8#include <linux/pfn.h>
9#include <linux/memblock.h> 4#include <linux/memblock.h>
10 5
11static u64 patterns[] __initdata = { 6static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
31 26
32static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) 27static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
33{ 28{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", 29 pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
35 (unsigned long long) pattern, 30 cpu_to_be64(pattern), &start_bad, &end_bad);
36 (unsigned long long) start_bad,
37 (unsigned long long) end_bad);
38 memblock_reserve(start_bad, end_bad - start_bad); 31 memblock_reserve(start_bad, end_bad - start_bad);
39} 32}
40 33
@@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
79 this_start = clamp(this_start, start, end); 72 this_start = clamp(this_start, start, end);
80 this_end = clamp(this_end, start, end); 73 this_end = clamp(this_end, start, end);
81 if (this_start < this_end) { 74 if (this_start < this_end) {
82 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", 75 pr_info(" %pa - %pa pattern %016llx\n",
83 (unsigned long long)this_start, 76 &this_start, &this_end, cpu_to_be64(pattern));
84 (unsigned long long)this_end,
85 (unsigned long long)cpu_to_be64(pattern));
86 memtest(pattern, this_start, this_end - this_start); 77 memtest(pattern, this_start, this_end - this_start);
87 } 78 }
88 } 79 }
89} 80}
90 81
91/* default is disabled */ 82/* default is disabled */
92static int memtest_pattern __initdata; 83static unsigned int memtest_pattern __initdata;
93 84
94static int __init parse_memtest(char *arg) 85static int __init parse_memtest(char *arg)
95{ 86{
87 int ret = 0;
88
96 if (arg) 89 if (arg)
97 memtest_pattern = simple_strtoul(arg, NULL, 0); 90 ret = kstrtouint(arg, 0, &memtest_pattern);
98 else 91 else
99 memtest_pattern = ARRAY_SIZE(patterns); 92 memtest_pattern = ARRAY_SIZE(patterns);
100 93
101 return 0; 94 return ret;
102} 95}
103 96
104early_param("memtest", parse_memtest); 97early_param("memtest", parse_memtest);
@@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
111 if (!memtest_pattern) 104 if (!memtest_pattern)
112 return; 105 return;
113 106
114 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); 107 pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
115 for (i = memtest_pattern-1; i < UINT_MAX; --i) { 108 for (i = memtest_pattern-1; i < UINT_MAX; --i) {
116 idx = i % ARRAY_SIZE(patterns); 109 idx = i % ARRAY_SIZE(patterns);
117 do_one_pass(patterns[idx], start, end); 110 do_one_pass(patterns[idx], start, end);
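Switching parse_memtest() to kstrtouint() also tightens the parsing: unlike simple_strtoul(), it rejects malformed values, and the error is now propagated through early_param handling instead of a truncated number being used silently. Roughly:

unsigned int val;

kstrtouint("4", 0, &val);	/* returns 0, val == 4: run four patterns */
kstrtouint("4k", 0, &val);	/* returns -EINVAL, val is left untouched */

So "memtest=4" on the kernel command line still selects four test patterns, while garbage after the number now fails the parse.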
diff --git a/mm/migrate.c b/mm/migrate.c
index 5c08cab5419e..02ce25df16c2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
880 /* Establish migration ptes or remove ptes */ 880 /* Establish migration ptes or remove ptes */
881 if (page_mapped(page)) { 881 if (page_mapped(page)) {
882 try_to_unmap(page, 882 try_to_unmap(page,
883 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| 883 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
884 TTU_IGNORE_HWPOISON);
885 page_was_mapped = 1; 884 page_was_mapped = 1;
886 } 885 }
887 886
@@ -952,9 +951,11 @@ out:
952 dec_zone_page_state(page, NR_ISOLATED_ANON + 951 dec_zone_page_state(page, NR_ISOLATED_ANON +
953 page_is_file_cache(page)); 952 page_is_file_cache(page));
954 /* Soft-offlined page shouldn't go through lru cache list */ 953 /* Soft-offlined page shouldn't go through lru cache list */
955 if (reason == MR_MEMORY_FAILURE) 954 if (reason == MR_MEMORY_FAILURE) {
956 put_page(page); 955 put_page(page);
957 else 956 if (!test_set_page_hwpoison(page))
957 num_poisoned_pages_inc();
958 } else
958 putback_lru_page(page); 959 putback_lru_page(page);
959 } 960 }
960 961
@@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
1194 return alloc_huge_page_node(page_hstate(compound_head(p)), 1195 return alloc_huge_page_node(page_hstate(compound_head(p)),
1195 pm->node); 1196 pm->node);
1196 else 1197 else
1197 return alloc_pages_exact_node(pm->node, 1198 return __alloc_pages_node(pm->node,
1198 GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); 1199 GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1199} 1200}
1200 1201
@@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1554 int nid = (int) data; 1555 int nid = (int) data;
1555 struct page *newpage; 1556 struct page *newpage;
1556 1557
1557 newpage = alloc_pages_exact_node(nid, 1558 newpage = __alloc_pages_node(nid,
1558 (GFP_HIGHUSER_MOVABLE | 1559 (GFP_HIGHUSER_MOVABLE |
1559 __GFP_THISNODE | __GFP_NOMEMALLOC | 1560 __GFP_THISNODE | __GFP_NOMEMALLOC |
1560 __GFP_NORETRY | __GFP_NOWARN) & 1561 __GFP_NORETRY | __GFP_NOWARN) &
diff --git a/mm/mmap.c b/mm/mmap.c
index 82db4fc0a9d3..b6be3249f0a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2455 unsigned long addr, int new_below) 2455 unsigned long addr, int new_below)
2456{ 2456{
2457 struct vm_area_struct *new; 2457 struct vm_area_struct *new;
2458 int err = -ENOMEM; 2458 int err;
2459 2459
2460 if (is_vm_hugetlb_page(vma) && (addr & 2460 if (is_vm_hugetlb_page(vma) && (addr &
2461 ~(huge_page_mask(hstate_vma(vma))))) 2461 ~(huge_page_mask(hstate_vma(vma)))))
@@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2463 2463
2464 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2464 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2465 if (!new) 2465 if (!new)
2466 goto out_err; 2466 return -ENOMEM;
2467 2467
2468 /* most fields are the same, copy all, and then fixup */ 2468 /* most fields are the same, copy all, and then fixup */
2469 *new = *vma; 2469 *new = *vma;
@@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2511 mpol_put(vma_policy(new)); 2511 mpol_put(vma_policy(new));
2512 out_free_vma: 2512 out_free_vma:
2513 kmem_cache_free(vm_area_cachep, new); 2513 kmem_cache_free(vm_area_cachep, new);
2514 out_err:
2515 return err; 2514 return err;
2516} 2515}
2517 2516
@@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2872 struct vm_area_struct *prev; 2871 struct vm_area_struct *prev;
2873 struct rb_node **rb_link, *rb_parent; 2872 struct rb_node **rb_link, *rb_parent;
2874 2873
2874 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2875 &prev, &rb_link, &rb_parent))
2876 return -ENOMEM;
2877 if ((vma->vm_flags & VM_ACCOUNT) &&
2878 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2879 return -ENOMEM;
2880
2875 /* 2881 /*
2876 * The vm_pgoff of a purely anonymous vma should be irrelevant 2882 * The vm_pgoff of a purely anonymous vma should be irrelevant
2877 * until its first write fault, when page's anon_vma and index 2883 * until its first write fault, when page's anon_vma and index
@@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2884 * using the existing file pgoff checks and manipulations. 2890 * using the existing file pgoff checks and manipulations.
2885 * Similarly in do_mmap_pgoff and in do_brk. 2891 * Similarly in do_mmap_pgoff and in do_brk.
2886 */ 2892 */
2887 if (!vma->vm_file) { 2893 if (vma_is_anonymous(vma)) {
2888 BUG_ON(vma->anon_vma); 2894 BUG_ON(vma->anon_vma);
2889 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2895 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2890 } 2896 }
2891 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2892 &prev, &rb_link, &rb_parent))
2893 return -ENOMEM;
2894 if ((vma->vm_flags & VM_ACCOUNT) &&
2895 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2896 return -ENOMEM;
2897 2897
2898 vma_link(mm, vma, prev, rb_link, rb_parent); 2898 vma_link(mm, vma, prev, rb_link, rb_parent);
2899 return 0; 2899 return 0;
@@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2918 * If anonymous vma has not yet been faulted, update new pgoff 2918 * If anonymous vma has not yet been faulted, update new pgoff
2919 * to match new location, to increase its chance of merging. 2919 * to match new location, to increase its chance of merging.
2920 */ 2920 */
2921 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2921 if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
2922 pgoff = addr >> PAGE_SHIFT; 2922 pgoff = addr >> PAGE_SHIFT;
2923 faulted_in_anon_vma = false; 2923 faulted_in_anon_vma = false;
2924 } 2924 }
@@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2952 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2952 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2953 } else { 2953 } else {
2954 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2954 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2955 if (new_vma) { 2955 if (!new_vma)
2956 *new_vma = *vma; 2956 goto out;
2957 new_vma->vm_start = addr; 2957 *new_vma = *vma;
2958 new_vma->vm_end = addr + len; 2958 new_vma->vm_start = addr;
2959 new_vma->vm_pgoff = pgoff; 2959 new_vma->vm_end = addr + len;
2960 if (vma_dup_policy(vma, new_vma)) 2960 new_vma->vm_pgoff = pgoff;
2961 goto out_free_vma; 2961 if (vma_dup_policy(vma, new_vma))
2962 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2962 goto out_free_vma;
2963 if (anon_vma_clone(new_vma, vma)) 2963 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2964 goto out_free_mempol; 2964 if (anon_vma_clone(new_vma, vma))
2965 if (new_vma->vm_file) 2965 goto out_free_mempol;
2966 get_file(new_vma->vm_file); 2966 if (new_vma->vm_file)
2967 if (new_vma->vm_ops && new_vma->vm_ops->open) 2967 get_file(new_vma->vm_file);
2968 new_vma->vm_ops->open(new_vma); 2968 if (new_vma->vm_ops && new_vma->vm_ops->open)
2969 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2969 new_vma->vm_ops->open(new_vma);
2970 *need_rmap_locks = false; 2970 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2971 } 2971 *need_rmap_locks = false;
2972 } 2972 }
2973 return new_vma; 2973 return new_vma;
2974 2974
2975 out_free_mempol: 2975out_free_mempol:
2976 mpol_put(vma_policy(new_vma)); 2976 mpol_put(vma_policy(new_vma));
2977 out_free_vma: 2977out_free_vma:
2978 kmem_cache_free(vm_area_cachep, new_vma); 2978 kmem_cache_free(vm_area_cachep, new_vma);
2979out:
2979 return NULL; 2980 return NULL;
2980} 2981}
2981 2982
@@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
3027 pgoff_t pgoff; 3028 pgoff_t pgoff;
3028 struct page **pages; 3029 struct page **pages;
3029 3030
3030 /*
3031 * special mappings have no vm_file, and in that case, the mm
3032 * uses vm_pgoff internally. So we have to subtract it from here.
3033 * We are allowed to do this because we are the mm; do not copy
3034 * this code into drivers!
3035 */
3036 pgoff = vmf->pgoff - vma->vm_pgoff;
3037
3038 if (vma->vm_ops == &legacy_special_mapping_vmops) 3031 if (vma->vm_ops == &legacy_special_mapping_vmops)
3039 pages = vma->vm_private_data; 3032 pages = vma->vm_private_data;
3040 else 3033 else
3041 pages = ((struct vm_special_mapping *)vma->vm_private_data)-> 3034 pages = ((struct vm_special_mapping *)vma->vm_private_data)->
3042 pages; 3035 pages;
3043 3036
3044 for (; pgoff && *pages; ++pages) 3037 for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
3045 pgoff--; 3038 pgoff--;
3046 3039
3047 if (*pages) { 3040 if (*pages) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e0681e..1ecc0bcaecc5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
196 * Determine the type of allocation constraint. 196 * Determine the type of allocation constraint.
197 */ 197 */
198#ifdef CONFIG_NUMA 198#ifdef CONFIG_NUMA
199static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 199static enum oom_constraint constrained_alloc(struct oom_control *oc,
200 gfp_t gfp_mask, nodemask_t *nodemask, 200 unsigned long *totalpages)
201 unsigned long *totalpages)
202{ 201{
203 struct zone *zone; 202 struct zone *zone;
204 struct zoneref *z; 203 struct zoneref *z;
205 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 204 enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
206 bool cpuset_limited = false; 205 bool cpuset_limited = false;
207 int nid; 206 int nid;
208 207
209 /* Default to all available memory */ 208 /* Default to all available memory */
210 *totalpages = totalram_pages + total_swap_pages; 209 *totalpages = totalram_pages + total_swap_pages;
211 210
212 if (!zonelist) 211 if (!oc->zonelist)
213 return CONSTRAINT_NONE; 212 return CONSTRAINT_NONE;
214 /* 213 /*
215 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 214 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
216 * to kill current.We have to random task kill in this case. 215 * to kill current.We have to random task kill in this case.
217 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. 216 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
218 */ 217 */
219 if (gfp_mask & __GFP_THISNODE) 218 if (oc->gfp_mask & __GFP_THISNODE)
220 return CONSTRAINT_NONE; 219 return CONSTRAINT_NONE;
221 220
222 /* 221 /*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
224 * the page allocator means a mempolicy is in effect. Cpuset policy 223 * the page allocator means a mempolicy is in effect. Cpuset policy
225 * is enforced in get_page_from_freelist(). 224 * is enforced in get_page_from_freelist().
226 */ 225 */
227 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { 226 if (oc->nodemask &&
227 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
228 *totalpages = total_swap_pages; 228 *totalpages = total_swap_pages;
229 for_each_node_mask(nid, *nodemask) 229 for_each_node_mask(nid, *oc->nodemask)
230 *totalpages += node_spanned_pages(nid); 230 *totalpages += node_spanned_pages(nid);
231 return CONSTRAINT_MEMORY_POLICY; 231 return CONSTRAINT_MEMORY_POLICY;
232 } 232 }
233 233
234 /* Check this allocation failure is caused by cpuset's wall function */ 234 /* Check this allocation failure is caused by cpuset's wall function */
235 for_each_zone_zonelist_nodemask(zone, z, zonelist, 235 for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
236 high_zoneidx, nodemask) 236 high_zoneidx, oc->nodemask)
237 if (!cpuset_zone_allowed(zone, gfp_mask)) 237 if (!cpuset_zone_allowed(zone, oc->gfp_mask))
238 cpuset_limited = true; 238 cpuset_limited = true;
239 239
240 if (cpuset_limited) { 240 if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
246 return CONSTRAINT_NONE; 246 return CONSTRAINT_NONE;
247} 247}
248#else 248#else
249static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 249static enum oom_constraint constrained_alloc(struct oom_control *oc,
250 gfp_t gfp_mask, nodemask_t *nodemask, 250 unsigned long *totalpages)
251 unsigned long *totalpages)
252{ 251{
253 *totalpages = totalram_pages + total_swap_pages; 252 *totalpages = totalram_pages + total_swap_pages;
254 return CONSTRAINT_NONE; 253 return CONSTRAINT_NONE;
255} 254}
256#endif 255#endif
257 256
258enum oom_scan_t oom_scan_process_thread(struct task_struct *task, 257enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
259 unsigned long totalpages, const nodemask_t *nodemask, 258 struct task_struct *task, unsigned long totalpages)
260 bool force_kill)
261{ 259{
262 if (oom_unkillable_task(task, NULL, nodemask)) 260 if (oom_unkillable_task(task, NULL, oc->nodemask))
263 return OOM_SCAN_CONTINUE; 261 return OOM_SCAN_CONTINUE;
264 262
265 /* 263 /*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
267 * Don't allow any other task to have access to the reserves. 265 * Don't allow any other task to have access to the reserves.
268 */ 266 */
269 if (test_tsk_thread_flag(task, TIF_MEMDIE)) { 267 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
270 if (!force_kill) 268 if (oc->order != -1)
271 return OOM_SCAN_ABORT; 269 return OOM_SCAN_ABORT;
272 } 270 }
273 if (!task->mm) 271 if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
280 if (oom_task_origin(task)) 278 if (oom_task_origin(task))
281 return OOM_SCAN_SELECT; 279 return OOM_SCAN_SELECT;
282 280
283 if (task_will_free_mem(task) && !force_kill) 281 if (task_will_free_mem(task) && oc->order != -1)
284 return OOM_SCAN_ABORT; 282 return OOM_SCAN_ABORT;
285 283
286 return OOM_SCAN_OK; 284 return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
289/* 287/*
290 * Simple selection loop. We chose the process with the highest 288 * Simple selection loop. We chose the process with the highest
291 * number of 'points'. Returns -1 on scan abort. 289 * number of 'points'. Returns -1 on scan abort.
292 *
293 * (not docbooked, we don't want this one cluttering up the manual)
294 */ 290 */
295static struct task_struct *select_bad_process(unsigned int *ppoints, 291static struct task_struct *select_bad_process(struct oom_control *oc,
296 unsigned long totalpages, const nodemask_t *nodemask, 292 unsigned int *ppoints, unsigned long totalpages)
297 bool force_kill)
298{ 293{
299 struct task_struct *g, *p; 294 struct task_struct *g, *p;
300 struct task_struct *chosen = NULL; 295 struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
304 for_each_process_thread(g, p) { 299 for_each_process_thread(g, p) {
305 unsigned int points; 300 unsigned int points;
306 301
307 switch (oom_scan_process_thread(p, totalpages, nodemask, 302 switch (oom_scan_process_thread(oc, p, totalpages)) {
308 force_kill)) {
309 case OOM_SCAN_SELECT: 303 case OOM_SCAN_SELECT:
310 chosen = p; 304 chosen = p;
311 chosen_points = ULONG_MAX; 305 chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
318 case OOM_SCAN_OK: 312 case OOM_SCAN_OK:
319 break; 313 break;
320 }; 314 };
321 points = oom_badness(p, NULL, nodemask, totalpages); 315 points = oom_badness(p, NULL, oc->nodemask, totalpages);
322 if (!points || points < chosen_points) 316 if (!points || points < chosen_points)
323 continue; 317 continue;
324 /* Prefer thread group leaders for display purposes */ 318 /* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
380 rcu_read_unlock(); 374 rcu_read_unlock();
381} 375}
382 376
383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 377static void dump_header(struct oom_control *oc, struct task_struct *p,
384 struct mem_cgroup *memcg, const nodemask_t *nodemask) 378 struct mem_cgroup *memcg)
385{ 379{
386 task_lock(current); 380 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 381 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
388 "oom_score_adj=%hd\n", 382 "oom_score_adj=%hd\n",
389 current->comm, gfp_mask, order, 383 current->comm, oc->gfp_mask, oc->order,
390 current->signal->oom_score_adj); 384 current->signal->oom_score_adj);
391 cpuset_print_task_mems_allowed(current); 385 cpuset_print_task_mems_allowed(current);
392 task_unlock(current); 386 task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
396 else 390 else
397 show_mem(SHOW_MEM_FILTER_NODES); 391 show_mem(SHOW_MEM_FILTER_NODES);
398 if (sysctl_oom_dump_tasks) 392 if (sysctl_oom_dump_tasks)
399 dump_tasks(memcg, nodemask); 393 dump_tasks(memcg, oc->nodemask);
400} 394}
401 395
402/* 396/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
487 * Must be called while holding a reference to p, which will be released upon 481 * Must be called while holding a reference to p, which will be released upon
488 * returning. 482 * returning.
489 */ 483 */
490void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 484void oom_kill_process(struct oom_control *oc, struct task_struct *p,
491 unsigned int points, unsigned long totalpages, 485 unsigned int points, unsigned long totalpages,
492 struct mem_cgroup *memcg, nodemask_t *nodemask, 486 struct mem_cgroup *memcg, const char *message)
493 const char *message)
494{ 487{
495 struct task_struct *victim = p; 488 struct task_struct *victim = p;
496 struct task_struct *child; 489 struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
514 task_unlock(p); 507 task_unlock(p);
515 508
516 if (__ratelimit(&oom_rs)) 509 if (__ratelimit(&oom_rs))
517 dump_header(p, gfp_mask, order, memcg, nodemask); 510 dump_header(oc, p, memcg);
518 511
519 task_lock(p); 512 task_lock(p);
520 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", 513 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
537 /* 530 /*
538 * oom_badness() returns 0 if the thread is unkillable 531 * oom_badness() returns 0 if the thread is unkillable
539 */ 532 */
540 child_points = oom_badness(child, memcg, nodemask, 533 child_points = oom_badness(child, memcg, oc->nodemask,
541 totalpages); 534 totalpages);
542 if (child_points > victim_points) { 535 if (child_points > victim_points) {
543 put_task_struct(victim); 536 put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
600/* 593/*
601 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 594 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
602 */ 595 */
603void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 596void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
604 int order, const nodemask_t *nodemask,
605 struct mem_cgroup *memcg) 597 struct mem_cgroup *memcg)
606{ 598{
607 if (likely(!sysctl_panic_on_oom)) 599 if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
615 if (constraint != CONSTRAINT_NONE) 607 if (constraint != CONSTRAINT_NONE)
616 return; 608 return;
617 } 609 }
618 dump_header(NULL, gfp_mask, order, memcg, nodemask); 610 /* Do not panic for oom kills triggered by sysrq */
611 if (oc->order == -1)
612 return;
613 dump_header(oc, NULL, memcg);
619 panic("Out of memory: %s panic_on_oom is enabled\n", 614 panic("Out of memory: %s panic_on_oom is enabled\n",
620 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 615 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
621} 616}
@@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
635EXPORT_SYMBOL_GPL(unregister_oom_notifier); 630EXPORT_SYMBOL_GPL(unregister_oom_notifier);
636 631
637/** 632/**
638 * __out_of_memory - kill the "best" process when we run out of memory 633 * out_of_memory - kill the "best" process when we run out of memory
639 * @zonelist: zonelist pointer 634 * @oc: pointer to struct oom_control
640 * @gfp_mask: memory allocation flags
641 * @order: amount of memory being requested as a power of 2
642 * @nodemask: nodemask passed to page allocator
643 * @force_kill: true if a task must be killed, even if others are exiting
644 * 635 *
645 * If we run out of memory, we have the choice between either 636 * If we run out of memory, we have the choice between either
646 * killing a random task (bad), letting the system crash (worse) 637 * killing a random task (bad), letting the system crash (worse)
647 * OR try to be smart about which process to kill. Note that we 638 * OR try to be smart about which process to kill. Note that we
648 * don't have to be perfect here, we just have to be good. 639 * don't have to be perfect here, we just have to be good.
649 */ 640 */
650bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 641bool out_of_memory(struct oom_control *oc)
651 int order, nodemask_t *nodemask, bool force_kill)
652{ 642{
653 const nodemask_t *mpol_mask;
654 struct task_struct *p; 643 struct task_struct *p;
655 unsigned long totalpages; 644 unsigned long totalpages;
656 unsigned long freed = 0; 645 unsigned long freed = 0;
657 unsigned int uninitialized_var(points); 646 unsigned int uninitialized_var(points);
658 enum oom_constraint constraint = CONSTRAINT_NONE; 647 enum oom_constraint constraint = CONSTRAINT_NONE;
659 int killed = 0;
660 648
661 if (oom_killer_disabled) 649 if (oom_killer_disabled)
662 return false; 650 return false;
@@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
664 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 652 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
665 if (freed > 0) 653 if (freed > 0)
666 /* Got some memory back in the last second. */ 654 /* Got some memory back in the last second. */
667 goto out; 655 return true;
668 656
669 /* 657 /*
670 * If current has a pending SIGKILL or is exiting, then automatically 658 * If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
677 if (current->mm && 665 if (current->mm &&
678 (fatal_signal_pending(current) || task_will_free_mem(current))) { 666 (fatal_signal_pending(current) || task_will_free_mem(current))) {
679 mark_oom_victim(current); 667 mark_oom_victim(current);
680 goto out; 668 return true;
681 } 669 }
682 670
683 /* 671 /*
684 * Check if there were limitations on the allocation (only relevant for 672 * Check if there were limitations on the allocation (only relevant for
685 * NUMA) that may require different handling. 673 * NUMA) that may require different handling.
686 */ 674 */
687 constraint = constrained_alloc(zonelist, gfp_mask, nodemask, 675 constraint = constrained_alloc(oc, &totalpages);
688 &totalpages); 676 if (constraint != CONSTRAINT_MEMORY_POLICY)
689 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 677 oc->nodemask = NULL;
690 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); 678 check_panic_on_oom(oc, constraint, NULL);
691 679
692 if (sysctl_oom_kill_allocating_task && current->mm && 680 if (sysctl_oom_kill_allocating_task && current->mm &&
693 !oom_unkillable_task(current, NULL, nodemask) && 681 !oom_unkillable_task(current, NULL, oc->nodemask) &&
694 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 682 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
695 get_task_struct(current); 683 get_task_struct(current);
696 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 684 oom_kill_process(oc, current, 0, totalpages, NULL,
697 nodemask,
698 "Out of memory (oom_kill_allocating_task)"); 685 "Out of memory (oom_kill_allocating_task)");
699 goto out; 686 return true;
700 } 687 }
701 688
702 p = select_bad_process(&points, totalpages, mpol_mask, force_kill); 689 p = select_bad_process(oc, &points, totalpages);
703 /* Found nothing?!?! Either we hang forever, or we panic. */ 690 /* Found nothing?!?! Either we hang forever, or we panic. */
704 if (!p) { 691 if (!p && oc->order != -1) {
705 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 692 dump_header(oc, NULL, NULL);
706 panic("Out of memory and no killable processes...\n"); 693 panic("Out of memory and no killable processes...\n");
707 } 694 }
708 if (p != (void *)-1UL) { 695 if (p && p != (void *)-1UL) {
709 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 696 oom_kill_process(oc, p, points, totalpages, NULL,
710 nodemask, "Out of memory"); 697 "Out of memory");
711 killed = 1; 698 /*
712 } 699 * Give the killed process a good chance to exit before trying
713out: 700 * to allocate memory again.
714 /* 701 */
715 * Give the killed threads a good chance of exiting before trying to
716 * allocate memory again.
717 */
718 if (killed)
719 schedule_timeout_killable(1); 702 schedule_timeout_killable(1);
720 703 }
721 return true; 704 return true;
722} 705}
723 706
@@ -728,13 +711,20 @@ out:
728 */ 711 */
729void pagefault_out_of_memory(void) 712void pagefault_out_of_memory(void)
730{ 713{
714 struct oom_control oc = {
715 .zonelist = NULL,
716 .nodemask = NULL,
717 .gfp_mask = 0,
718 .order = 0,
719 };
720
731 if (mem_cgroup_oom_synchronize(true)) 721 if (mem_cgroup_oom_synchronize(true))
732 return; 722 return;
733 723
734 if (!mutex_trylock(&oom_lock)) 724 if (!mutex_trylock(&oom_lock))
735 return; 725 return;
736 726
737 if (!out_of_memory(NULL, 0, 0, NULL, false)) { 727 if (!out_of_memory(&oc)) {
738 /* 728 /*
739 * There shouldn't be any user tasks runnable while the 729 * There shouldn't be any user tasks runnable while the
740 * OOM killer is disabled, so the current task has to 730 * OOM killer is disabled, so the current task has to
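The refactor above threads a single struct oom_control through the OOM killer instead of passing zonelist, gfp_mask, order, nodemask and force_kill separately. Assuming the declaration added to include/linux/oom.h in this series, it is essentially:

struct oom_control {
	/* Used to determine the degree of memory/node/cpuset constraint. */
	struct zonelist *zonelist;
	nodemask_t	*nodemask;
	gfp_t		 gfp_mask;
	/*
	 * order == -1 marks a forced (sysrq-style) kill, which is what the
	 * "oc->order != -1" tests above use in place of the old force_kill
	 * flag, and why check_panic_on_oom() skips panicking for it.
	 */
	int		 order;
};

pagefault_out_of_memory() above and __alloc_pages_may_oom() in the mm/page_alloc.c hunks below simply fill one of these in at the call site.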
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b401d40cb4fd..48aaf7b9f253 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
125int percpu_pagelist_fraction; 125int percpu_pagelist_fraction;
126gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 126gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
127 127
128/*
129 * A cached value of the page's pageblock's migratetype, used when the page is
130 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
131 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
132 * Also the migratetype set in the page does not necessarily match the pcplist
133 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
134 * other index - this ensures that it will be put on the correct CMA freelist.
135 */
136static inline int get_pcppage_migratetype(struct page *page)
137{
138 return page->index;
139}
140
141static inline void set_pcppage_migratetype(struct page *page, int migratetype)
142{
143 page->index = migratetype;
144}
145
128#ifdef CONFIG_PM_SLEEP 146#ifdef CONFIG_PM_SLEEP
129/* 147/*
130 * The following functions are used by the suspend/hibernate code to temporarily 148 * The following functions are used by the suspend/hibernate code to temporarily
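The cached value rides in page->index, which is otherwise unused while a page sits on a pcplist. Staleness only matters when a pageblock gets isolated after the value was cached, which is why the free path re-reads the pageblock migratetype in that case; conversely, caching MIGRATE_CMA is what routes such pages back onto the CMA freelist regardless of which pcplist index they sat on. In sketch form, the contract the later hunks rely on is:

/* Allocation side, e.g. __rmqueue_smallest(): remember the source list. */
set_pcppage_migratetype(page, migratetype);

/*
 * Free side, e.g. free_pcppages_bulk(): trust the cache unless a pageblock
 * may have been isolated in the meantime, in which case re-read it.
 */
mt = get_pcppage_migratetype(page);
if (unlikely(has_isolate_pageblock(zone)))
	mt = get_pageblock_migratetype(page);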
@@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
791 page = list_entry(list->prev, struct page, lru); 809 page = list_entry(list->prev, struct page, lru);
792 /* must delete as __free_one_page list manipulates */ 810 /* must delete as __free_one_page list manipulates */
793 list_del(&page->lru); 811 list_del(&page->lru);
794 mt = get_freepage_migratetype(page); 812
813 mt = get_pcppage_migratetype(page);
814 /* MIGRATE_ISOLATE page should not go to pcplists */
815 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
816 /* Pageblock could have been isolated meanwhile */
795 if (unlikely(has_isolate_pageblock(zone))) 817 if (unlikely(has_isolate_pageblock(zone)))
796 mt = get_pageblock_migratetype(page); 818 mt = get_pageblock_migratetype(page);
797 819
@@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
955 migratetype = get_pfnblock_migratetype(page, pfn); 977 migratetype = get_pfnblock_migratetype(page, pfn);
956 local_irq_save(flags); 978 local_irq_save(flags);
957 __count_vm_events(PGFREE, 1 << order); 979 __count_vm_events(PGFREE, 1 << order);
958 set_freepage_migratetype(page, migratetype);
959 free_one_page(page_zone(page), page, pfn, order, migratetype); 980 free_one_page(page_zone(page), page, pfn, order, migratetype);
960 local_irq_restore(flags); 981 local_irq_restore(flags);
961} 982}
@@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1383 rmv_page_order(page); 1404 rmv_page_order(page);
1384 area->nr_free--; 1405 area->nr_free--;
1385 expand(zone, page, order, current_order, area, migratetype); 1406 expand(zone, page, order, current_order, area, migratetype);
1386 set_freepage_migratetype(page, migratetype); 1407 set_pcppage_migratetype(page, migratetype);
1387 return page; 1408 return page;
1388 } 1409 }
1389 1410
@@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone,
1460 order = page_order(page); 1481 order = page_order(page);
1461 list_move(&page->lru, 1482 list_move(&page->lru,
1462 &zone->free_area[order].free_list[migratetype]); 1483 &zone->free_area[order].free_list[migratetype]);
1463 set_freepage_migratetype(page, migratetype);
1464 page += 1 << order; 1484 page += 1 << order;
1465 pages_moved += 1 << order; 1485 pages_moved += 1 << order;
1466 } 1486 }
@@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1630 expand(zone, page, order, current_order, area, 1650 expand(zone, page, order, current_order, area,
1631 start_migratetype); 1651 start_migratetype);
1632 /* 1652 /*
1633 * The freepage_migratetype may differ from pageblock's 1653 * The pcppage_migratetype may differ from pageblock's
1634 * migratetype depending on the decisions in 1654 * migratetype depending on the decisions in
1635 * try_to_steal_freepages(). This is OK as long as it 1655 * find_suitable_fallback(). This is OK as long as it does not
1636 * does not differ for MIGRATE_CMA pageblocks. For CMA 1656 * differ for MIGRATE_CMA pageblocks. Those can be used as
1637 * we need to make sure unallocated pages flushed from 1657 * fallback only via special __rmqueue_cma_fallback() function
1638 * pcp lists are returned to the correct freelist.
1639 */ 1658 */
1640 set_freepage_migratetype(page, start_migratetype); 1659 set_pcppage_migratetype(page, start_migratetype);
1641 1660
1642 trace_mm_page_alloc_extfrag(page, order, current_order, 1661 trace_mm_page_alloc_extfrag(page, order, current_order,
1643 start_migratetype, fallback_mt); 1662 start_migratetype, fallback_mt);
@@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1713 else 1732 else
1714 list_add_tail(&page->lru, list); 1733 list_add_tail(&page->lru, list);
1715 list = &page->lru; 1734 list = &page->lru;
1716 if (is_migrate_cma(get_freepage_migratetype(page))) 1735 if (is_migrate_cma(get_pcppage_migratetype(page)))
1717 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1736 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1718 -(1 << order)); 1737 -(1 << order));
1719 } 1738 }
@@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1910 return; 1929 return;
1911 1930
1912 migratetype = get_pfnblock_migratetype(page, pfn); 1931 migratetype = get_pfnblock_migratetype(page, pfn);
1913 set_freepage_migratetype(page, migratetype); 1932 set_pcppage_migratetype(page, migratetype);
1914 local_irq_save(flags); 1933 local_irq_save(flags);
1915 __count_vm_event(PGFREE); 1934 __count_vm_event(PGFREE);
1916 1935
@@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
2115 if (!page) 2134 if (!page)
2116 goto failed; 2135 goto failed;
2117 __mod_zone_freepage_state(zone, -(1 << order), 2136 __mod_zone_freepage_state(zone, -(1 << order),
2118 get_freepage_migratetype(page)); 2137 get_pcppage_migratetype(page));
2119 } 2138 }
2120 2139
2121 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 2140 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2696,6 +2715,12 @@ static inline struct page *
2696__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2715__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2697 const struct alloc_context *ac, unsigned long *did_some_progress) 2716 const struct alloc_context *ac, unsigned long *did_some_progress)
2698{ 2717{
2718 struct oom_control oc = {
2719 .zonelist = ac->zonelist,
2720 .nodemask = ac->nodemask,
2721 .gfp_mask = gfp_mask,
2722 .order = order,
2723 };
2699 struct page *page; 2724 struct page *page;
2700 2725
2701 *did_some_progress = 0; 2726 *did_some_progress = 0;
@@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2747 goto out; 2772 goto out;
2748 } 2773 }
2749 /* Exhausted what can be done so it's blamo time */ 2774 /* Exhausted what can be done so it's blamo time */
2750 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) 2775 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2751 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2752 *did_some_progress = 1; 2776 *did_some_progress = 1;
2753out: 2777out:
2754 mutex_unlock(&oom_lock); 2778 mutex_unlock(&oom_lock);
@@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
3490 * 3514 *
3491 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 3515 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
3492 * back. 3516 * back.
3493 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
3494 * but is not exact.
3495 */ 3517 */
3496void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 3518void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
3497{ 3519{
@@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
5066{ 5088{
5067 unsigned long zone_start_pfn, zone_end_pfn; 5089 unsigned long zone_start_pfn, zone_end_pfn;
5068 5090
5069 /* When hotadd a new node, the node should be empty */ 5091 /* When hotadd a new node from cpu_up(), the node should be empty */
5070 if (!node_start_pfn && !node_end_pfn) 5092 if (!node_start_pfn && !node_end_pfn)
5071 return 0; 5093 return 0;
5072 5094
@@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5133 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5155 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5134 unsigned long zone_start_pfn, zone_end_pfn; 5156 unsigned long zone_start_pfn, zone_end_pfn;
5135 5157
5136 /* When hotadd a new node, the node should be empty */ 5158 /* When hotadd a new node from cpu_up(), the node should be empty */
5137 if (!node_start_pfn && !node_end_pfn) 5159 if (!node_start_pfn && !node_end_pfn)
5138 return 0; 5160 return 0;
5139 5161
@@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
5306 * 5328 *
5307 * NOTE: pgdat should get zeroed by caller. 5329 * NOTE: pgdat should get zeroed by caller.
5308 */ 5330 */
5309static void __paginginit free_area_init_core(struct pglist_data *pgdat, 5331static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5310 unsigned long node_start_pfn, unsigned long node_end_pfn)
5311{ 5332{
5312 enum zone_type j; 5333 enum zone_type j;
5313 int nid = pgdat->node_id; 5334 int nid = pgdat->node_id;
@@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5458#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5479#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5459 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5480 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5460 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5481 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5461 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); 5482 (u64)start_pfn << PAGE_SHIFT,
5483 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
5462#endif 5484#endif
5463 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5485 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5464 zones_size, zholes_size); 5486 zones_size, zholes_size);
@@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5470 (unsigned long)pgdat->node_mem_map); 5492 (unsigned long)pgdat->node_mem_map);
5471#endif 5493#endif
5472 5494
5473 free_area_init_core(pgdat, start_pfn, end_pfn); 5495 free_area_init_core(pgdat);
5474} 5496}
5475 5497
5476#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5498#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5481 */ 5503 */
5482void __init setup_nr_node_ids(void) 5504void __init setup_nr_node_ids(void)
5483{ 5505{
5484 unsigned int node; 5506 unsigned int highest;
5485 unsigned int highest = 0;
5486 5507
5487 for_each_node_mask(node, node_possible_map) 5508 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
5488 highest = node;
5489 nr_node_ids = highest + 1; 5509 nr_node_ids = highest + 1;
5490} 5510}
5491#endif 5511#endif
@@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
6006 * set_dma_reserve - set the specified number of pages reserved in the first zone 6026 * set_dma_reserve - set the specified number of pages reserved in the first zone
6007 * @new_dma_reserve: The number of pages to mark reserved 6027 * @new_dma_reserve: The number of pages to mark reserved
6008 * 6028 *
6009 * The per-cpu batchsize and zone watermarks are determined by present_pages. 6029 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
6010 * In the DMA zone, a significant percentage may be consumed by kernel image 6030 * In the DMA zone, a significant percentage may be consumed by kernel image
6011 * and other unfreeable allocations which can skew the watermarks badly. This 6031 * and other unfreeable allocations which can skew the watermarks badly. This
6012 * function may optionally be used to account for unfreeable pages in the 6032 * function may optionally be used to account for unfreeable pages in the
@@ -6059,7 +6079,7 @@ void __init page_alloc_init(void)
6059} 6079}
6060 6080
6061/* 6081/*
6062 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 6082 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6063 * or min_free_kbytes changes. 6083 * or min_free_kbytes changes.
6064 */ 6084 */
6065static void calculate_totalreserve_pages(void) 6085static void calculate_totalreserve_pages(void)
@@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
6103 6123
6104/* 6124/*
6105 * setup_per_zone_lowmem_reserve - called whenever 6125 * setup_per_zone_lowmem_reserve - called whenever
6106 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 6126 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
6107 * has a correct pages reserved value, so an adequate number of 6127 * has a correct pages reserved value, so an adequate number of
6108 * pages are left in the zone after a successful __alloc_pages(). 6128 * pages are left in the zone after a successful __alloc_pages().
6109 */ 6129 */
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908790ef..4568fd58f70a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,7 +9,8 @@
9#include <linux/hugetlb.h> 9#include <linux/hugetlb.h>
10#include "internal.h" 10#include "internal.h"
11 11
12int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) 12static int set_migratetype_isolate(struct page *page,
13 bool skip_hwpoisoned_pages)
13{ 14{
14 struct zone *zone; 15 struct zone *zone;
15 unsigned long flags, pfn; 16 unsigned long flags, pfn;
@@ -72,7 +73,7 @@ out:
72 return ret; 73 return ret;
73} 74}
74 75
75void unset_migratetype_isolate(struct page *page, unsigned migratetype) 76static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
76{ 77{
77 struct zone *zone; 78 struct zone *zone;
78 unsigned long flags, nr_pages; 79 unsigned long flags, nr_pages;
@@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
223 continue; 224 continue;
224 } 225 }
225 page = pfn_to_page(pfn); 226 page = pfn_to_page(pfn);
226 if (PageBuddy(page)) { 227 if (PageBuddy(page))
227 /* 228 /*
228 * If race between isolatation and allocation happens, 229 * If the page is on a free list, it has to be on
229 * some free pages could be in MIGRATE_MOVABLE list 230 * the correct MIGRATE_ISOLATE freelist. There is no
230 * although pageblock's migratation type of the page 231 * simple way to verify that as VM_BUG_ON(), though.
231 * is MIGRATE_ISOLATE. Catch it and move the page into
232 * MIGRATE_ISOLATE list.
233 */ 232 */
234 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
235 struct page *end_page;
236
237 end_page = page + (1 << page_order(page)) - 1;
238 move_freepages(page_zone(page), page, end_page,
239 MIGRATE_ISOLATE);
240 }
241 pfn += 1 << page_order(page); 233 pfn += 1 << page_order(page);
242 } 234 else if (skip_hwpoisoned_pages && PageHWPoison(page))
243 else if (page_count(page) == 0 && 235 /* A HWPoisoned page cannot be also PageBuddy */
244 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
245 pfn += 1;
246 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
247 /*
248 * The HWPoisoned page may be not in buddy
249 * system, and page_count() is not 0.
250 */
251 pfn++; 236 pfn++;
252 continue;
253 }
254 else 237 else
255 break; 238 break;
256 } 239 }
diff --git a/mm/shmem.c b/mm/shmem.c
index dbe0c1e8349c..48ce82926d93 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
542} 542}
543EXPORT_SYMBOL_GPL(shmem_truncate_range); 543EXPORT_SYMBOL_GPL(shmem_truncate_range);
544 544
545static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
546 struct kstat *stat)
547{
548 struct inode *inode = dentry->d_inode;
549 struct shmem_inode_info *info = SHMEM_I(inode);
550
551 spin_lock(&info->lock);
552 shmem_recalc_inode(inode);
553 spin_unlock(&info->lock);
554
555 generic_fillattr(inode, stat);
556
557 return 0;
558}
559
545static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 560static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
546{ 561{
547 struct inode *inode = d_inode(dentry); 562 struct inode *inode = d_inode(dentry);
@@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = {
3122}; 3137};
3123 3138
3124static const struct inode_operations shmem_inode_operations = { 3139static const struct inode_operations shmem_inode_operations = {
3140 .getattr = shmem_getattr,
3125 .setattr = shmem_setattr, 3141 .setattr = shmem_setattr,
3126#ifdef CONFIG_TMPFS_XATTR 3142#ifdef CONFIG_TMPFS_XATTR
3127 .setxattr = shmem_setxattr, 3143 .setxattr = shmem_setxattr,
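With ->getattr wired up as above, tmpfs recalculates the inode's accounting under info->lock before generic_fillattr() fills in struct kstat, so the intent appears to be that an ordinary stat() reports up-to-date block usage. A hedged userspace check (the path is whatever tmpfs file you point it at):

#include <stdio.h>
#include <sys/stat.h>

/* Print size and block usage of a (presumably tmpfs) file given on argv[1]. */
int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || stat(argv[1], &st)) {
		perror("stat");
		return 1;
	}
	printf("%s: size=%lld blocks=%lld\n", argv[1],
	       (long long)st.st_size, (long long)st.st_blocks);
	return 0;
}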
diff --git a/mm/slab.c b/mm/slab.c
index 60c936938b84..c77ebe6cc87c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1595 if (memcg_charge_slab(cachep, flags, cachep->gfporder)) 1595 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1596 return NULL; 1596 return NULL;
1597 1597
1598 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1598 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1599 if (!page) { 1599 if (!page) {
1600 memcg_uncharge_slab(cachep, cachep->gfporder); 1600 memcg_uncharge_slab(cachep, cachep->gfporder);
1601 slab_out_of_memory(cachep, flags, nodeid); 1601 slab_out_of_memory(cachep, flags, nodeid);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c26829fe4e37..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
500 struct kmem_cache *root_cache) 500 struct kmem_cache *root_cache)
501{ 501{
502 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ 502 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
503 struct cgroup_subsys_state *css = mem_cgroup_css(memcg); 503 struct cgroup_subsys_state *css = &memcg->css;
504 struct memcg_cache_array *arr; 504 struct memcg_cache_array *arr;
505 struct kmem_cache *s = NULL; 505 struct kmem_cache *s = NULL;
506 char *cache_name; 506 char *cache_name;
@@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
640 bool need_rcu_barrier = false; 640 bool need_rcu_barrier = false;
641 bool busy = false; 641 bool busy = false;
642 642
643 if (unlikely(!s))
644 return;
645
643 BUG_ON(!is_root_cache(s)); 646 BUG_ON(!is_root_cache(s));
644 647
645 get_online_cpus(); 648 get_online_cpus();
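The slab_common.c hunk above makes kmem_cache_destroy() return early on a NULL cache, mirroring kfree(NULL), which lets teardown and error-unwind paths destroy unconditionally. A small standalone sketch of that calling pattern (userspace stand-ins, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct cache { const char *name; };

/* Tolerating NULL lets callers destroy unconditionally on error paths. */
static void cache_destroy(struct cache *c)
{
	if (!c)
		return;
	printf("destroying %s\n", c->name);
	free(c);
}

int main(void)
{
	struct cache *a = malloc(sizeof(*a)), *b = NULL;

	if (a)
		a->name = "demo";
	/* error-unwind style: no per-pointer NULL checks needed */
	cache_destroy(a);
	cache_destroy(b);
	return 0;
}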
diff --git a/mm/slob.c b/mm/slob.c
index 165bbd3cd606..0d7e5df74d1f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -45,7 +45,7 @@
45 * NUMA support in SLOB is fairly simplistic, pushing most of the real 45 * NUMA support in SLOB is fairly simplistic, pushing most of the real
46 * logic down to the page allocator, and simply doing the node accounting 46 * logic down to the page allocator, and simply doing the node accounting
47 * on the upper levels. In the event that a node id is explicitly 47 * on the upper levels. In the event that a node id is explicitly
48 * provided, alloc_pages_exact_node() with the specified node id is used 48 * provided, __alloc_pages_node() with the specified node id is used
49 * instead. The common case (or when the node id isn't explicitly provided) 49 * instead. The common case (or when the node id isn't explicitly provided)
50 * will default to the current node, as per numa_node_id(). 50 * will default to the current node, as per numa_node_id().
51 * 51 *
@@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
193 193
194#ifdef CONFIG_NUMA 194#ifdef CONFIG_NUMA
195 if (node != NUMA_NO_NODE) 195 if (node != NUMA_NO_NODE)
196 page = alloc_pages_exact_node(node, gfp, order); 196 page = __alloc_pages_node(node, gfp, order);
197 else 197 else
198#endif 198#endif
199 page = alloc_pages(gfp, order); 199 page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 084184e706c6..f614b5dc396b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1334 if (node == NUMA_NO_NODE) 1334 if (node == NUMA_NO_NODE)
1335 page = alloc_pages(flags, order); 1335 page = alloc_pages(flags, order);
1336 else 1336 else
1337 page = alloc_pages_exact_node(node, flags, order); 1337 page = __alloc_pages_node(node, flags, order);
1338 1338
1339 if (!page) 1339 if (!page)
1340 memcg_uncharge_slab(s, order); 1340 memcg_uncharge_slab(s, order);
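slab, slob and slub all switch from alloc_pages_exact_node() to __alloc_pages_node(); the call shape stays "explicit node if one was requested, otherwise the current node". A minimal userspace model of that selection (the NUMA_NO_NODE sentinel and allocator names here are stand-ins for illustration only):

#include <stdio.h>
#include <stdlib.h>

#define NUMA_NO_NODE	(-1)

/* Stand-ins for the two allocation paths in the slab code above. */
static void *alloc_on_node(int node, size_t size)
{
	printf("allocating %zu bytes on node %d\n", size, node);
	return malloc(size);
}

static void *alloc_local(size_t size)
{
	printf("allocating %zu bytes on the local node\n", size);
	return malloc(size);
}

static void *slab_page_alloc(int node, size_t size)
{
	if (node != NUMA_NO_NODE)
		return alloc_on_node(node, size);
	return alloc_local(size);
}

int main(void)
{
	free(slab_page_alloc(NUMA_NO_NODE, 4096));
	free(slab_page_alloc(1, 4096));
	return 0;
}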
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da..d504adb7fa5f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
288 return page; 288 return page;
289} 289}
290 290
291/* 291struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
292 * Locate a page of swap in physical memory, reserving swap cache space 292 struct vm_area_struct *vma, unsigned long addr,
293 * and reading the disk if it is not already cached. 293 bool *new_page_allocated)
294 * A failure return means that either the page allocation failed or that
295 * the swap entry is no longer in use.
296 */
297struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
298 struct vm_area_struct *vma, unsigned long addr)
299{ 294{
300 struct page *found_page, *new_page = NULL; 295 struct page *found_page, *new_page = NULL;
296 struct address_space *swapper_space = swap_address_space(entry);
301 int err; 297 int err;
298 *new_page_allocated = false;
302 299
303 do { 300 do {
304 /* 301 /*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
306 * called after lookup_swap_cache() failed, re-calling 303 * called after lookup_swap_cache() failed, re-calling
307 * that would confuse statistics. 304 * that would confuse statistics.
308 */ 305 */
309 found_page = find_get_page(swap_address_space(entry), 306 found_page = find_get_page(swapper_space, entry.val);
310 entry.val);
311 if (found_page) 307 if (found_page)
312 break; 308 break;
313 309
@@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
366 * Initiate read into locked page and return. 362 * Initiate read into locked page and return.
367 */ 363 */
368 lru_cache_add_anon(new_page); 364 lru_cache_add_anon(new_page);
369 swap_readpage(new_page); 365 *new_page_allocated = true;
370 return new_page; 366 return new_page;
371 } 367 }
372 radix_tree_preload_end(); 368 radix_tree_preload_end();
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
384 return found_page; 380 return found_page;
385} 381}
386 382
383/*
384 * Locate a page of swap in physical memory, reserving swap cache space
385 * and reading the disk if it is not already cached.
386 * A failure return means that either the page allocation failed or that
387 * the swap entry is no longer in use.
388 */
389struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
390 struct vm_area_struct *vma, unsigned long addr)
391{
392 bool page_was_allocated;
393 struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
394 vma, addr, &page_was_allocated);
395
396 if (page_was_allocated)
397 swap_readpage(retpage);
398
399 return retpage;
400}
401
387static unsigned long swapin_nr_pages(unsigned long offset) 402static unsigned long swapin_nr_pages(unsigned long offset)
388{ 403{
389 static unsigned long prev_offset; 404 static unsigned long prev_offset;
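The swap_state.c refactoring splits read_swap_cache_async() into __read_swap_cache_async(), which only finds or inserts the swapcache page and reports whether it allocated one, and a thin wrapper that issues swap_readpage() only when the out-parameter says a new page went in. The same "lookup with an allocated flag, side effect in the wrapper" split can be modelled in plain C (all names here are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Look up an entry in a toy cache, allocating a slot on miss and telling
 * the caller whether it did so. */
static int *cache_lookup_or_add(int key, bool *newly_allocated)
{
	static int *slots[16];

	*newly_allocated = false;
	if (key < 0 || key >= 16)
		return NULL;
	if (!slots[key]) {
		slots[key] = malloc(sizeof(int));
		if (!slots[key])
			return NULL;
		*slots[key] = 0;
		*newly_allocated = true;
	}
	return slots[key];
}

/* Wrapper keeping the old behaviour: populate the slot only when it was
 * freshly allocated (read_swap_cache_async() does swap_readpage() here). */
static int *cache_read(int key)
{
	bool was_allocated;
	int *slot = cache_lookup_or_add(key, &was_allocated);

	if (slot && was_allocated)
		*slot = key * 100;	/* "read from disk" */
	return slot;
}

int main(void)
{
	int *p = cache_read(3);	/* miss: allocates and fills */
	int *q = cache_read(3);	/* hit: no refill */

	if (p && q)
		printf("%d %d\n", *p, *q);
	return 0;
}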
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aebc2dd6e649..58877312cf6b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
875} 875}
876 876
877/* 877/*
878 * How many references to @entry are currently swapped out?
879 * This considers COUNT_CONTINUED so it returns the exact answer.
880 */
881int swp_swapcount(swp_entry_t entry)
882{
883 int count, tmp_count, n;
884 struct swap_info_struct *p;
885 struct page *page;
886 pgoff_t offset;
887 unsigned char *map;
888
889 p = swap_info_get(entry);
890 if (!p)
891 return 0;
892
893 count = swap_count(p->swap_map[swp_offset(entry)]);
894 if (!(count & COUNT_CONTINUED))
895 goto out;
896
897 count &= ~COUNT_CONTINUED;
898 n = SWAP_MAP_MAX + 1;
899
900 offset = swp_offset(entry);
901 page = vmalloc_to_page(p->swap_map + offset);
902 offset &= ~PAGE_MASK;
903 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
904
905 do {
906 page = list_entry(page->lru.next, struct page, lru);
907 map = kmap_atomic(page);
908 tmp_count = map[offset];
909 kunmap_atomic(map);
910
911 count += (tmp_count & ~COUNT_CONTINUED) * n;
912 n *= (SWAP_CONT_MAX + 1);
913 } while (tmp_count & COUNT_CONTINUED);
914out:
915 spin_unlock(&p->lock);
916 return count;
917}
918
919/*
878 * We can write to an anon page without COW if there are no other references 920 * We can write to an anon page without COW if there are no other references
879 * to it. And as a side-effect, free up its swap: because the old content 921 * to it. And as a side-effect, free up its swap: because the old content
880 * on disk will never be read, and seeking back there to write new content 922 * on disk will never be read, and seeking back there to write new content
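The new swp_swapcount() above reconstructs the exact reference count by starting from the per-entry map byte and walking the continuation pages: each continuation byte contributes its value times a growing radix, first (SWAP_MAP_MAX + 1) and then successive multiples of (SWAP_CONT_MAX + 1). A standalone sketch of just the arithmetic; the constant values are quoted from the swap headers from memory and should be treated as an assumption:

#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* max count held in the base map byte */
#define SWAP_CONT_MAX	0x7f	/* max count held in a continuation byte */
#define COUNT_CONTINUED	0x80	/* "look at the next continuation byte" */

/* Reconstruct an exact count from a base byte plus continuation bytes. */
static int exact_swapcount(unsigned char base, const unsigned char *cont,
			   int ncont)
{
	int count = base & ~COUNT_CONTINUED;
	int n = SWAP_MAP_MAX + 1;
	int i;

	if (!(base & COUNT_CONTINUED))
		return count;

	for (i = 0; i < ncont; i++) {
		count += (cont[i] & ~COUNT_CONTINUED) * n;
		n *= SWAP_CONT_MAX + 1;
		if (!(cont[i] & COUNT_CONTINUED))
			break;
	}
	return count;
}

int main(void)
{
	/* base byte saturated and continued, one continuation byte of 2 */
	unsigned char cont[] = { 2 };

	printf("%d\n", exact_swapcount(SWAP_MAP_MAX | COUNT_CONTINUED,
				       cont, 1));	/* 62 + 2*63 = 188 */
	return 0;
}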
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1139039122a..2d978b28a410 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
175 if (!memcg) 175 if (!memcg)
176 return true; 176 return true;
177#ifdef CONFIG_CGROUP_WRITEBACK 177#ifdef CONFIG_CGROUP_WRITEBACK
178 if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup)) 178 if (memcg->css.cgroup)
179 return true; 179 return true;
180#endif 180#endif
181 return false; 181 return false;
@@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
985 * __GFP_IO|__GFP_FS for this reason); but more thought 985 * __GFP_IO|__GFP_FS for this reason); but more thought
986 * would probably show more reasons. 986 * would probably show more reasons.
987 * 987 *
988 * 3) Legacy memcg encounters a page that is not already marked 988 * 3) Legacy memcg encounters a page that is already marked
989 * PageReclaim. memcg does not have any dirty pages 989 * PageReclaim. memcg does not have any dirty pages
990 * throttling so we could easily OOM just because too many 990 * throttling so we could easily OOM just because too many
991 * pages are in writeback and there is nothing else to 991 * pages are in writeback and there is nothing else to
@@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1015 */ 1015 */
1016 SetPageReclaim(page); 1016 SetPageReclaim(page);
1017 nr_writeback++; 1017 nr_writeback++;
1018
1019 goto keep_locked; 1018 goto keep_locked;
1020 1019
1021 /* Case 3 above */ 1020 /* Case 3 above */
1022 } else { 1021 } else {
1022 unlock_page(page);
1023 wait_on_page_writeback(page); 1023 wait_on_page_writeback(page);
1024 /* then go back and try same page again */
1025 list_add_tail(&page->lru, page_list);
1026 continue;
1024 } 1027 }
1025 } 1028 }
1026 1029
@@ -1196,7 +1199,7 @@ cull_mlocked:
1196 if (PageSwapCache(page)) 1199 if (PageSwapCache(page))
1197 try_to_free_swap(page); 1200 try_to_free_swap(page);
1198 unlock_page(page); 1201 unlock_page(page);
1199 putback_lru_page(page); 1202 list_add(&page->lru, &ret_pages);
1200 continue; 1203 continue;
1201 1204
1202activate_locked: 1205activate_locked:
@@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1359 unsigned long nr_taken = 0; 1362 unsigned long nr_taken = 0;
1360 unsigned long scan; 1363 unsigned long scan;
1361 1364
1362 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1365 for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
1366 !list_empty(src); scan++) {
1363 struct page *page; 1367 struct page *page;
1364 int nr_pages; 1368 int nr_pages;
1365 1369
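The isolate_lru_pages() hunk above also bounds the loop on nr_taken, which matters because a single compound page can contribute many base pages and overshoot the request. A small model of the bounded scan (plain C, with made-up page sizes):

#include <stdio.h>

/* Each entry is the number of base pages an LRU "page" represents
 * (e.g. 512 for a THP). */
static unsigned long bounded_isolate(const int *nr_pages, int count,
				     unsigned long nr_to_scan)
{
	unsigned long nr_taken = 0;
	unsigned long scan;
	int i = 0;

	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
			i < count; scan++, i++)
		nr_taken += nr_pages[i];

	return nr_taken;
}

int main(void)
{
	int lru[] = { 512, 512, 1, 1, 1 };	/* two THPs first */

	/* asking for 32 pages now stops after the first THP instead of
	 * continuing to pull entries off the list */
	printf("taken: %lu\n", bounded_isolate(lru, 5, 32));
	return 0;
}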
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7627d8..fa48bcdff9d5 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@ struct zbud_pool {
96 struct list_head buddied; 96 struct list_head buddied;
97 struct list_head lru; 97 struct list_head lru;
98 u64 pages_nr; 98 u64 pages_nr;
99 struct zbud_ops *ops; 99 const struct zbud_ops *ops;
100#ifdef CONFIG_ZPOOL 100#ifdef CONFIG_ZPOOL
101 struct zpool *zpool; 101 struct zpool *zpool;
102 struct zpool_ops *zpool_ops; 102 const struct zpool_ops *zpool_ops;
103#endif 103#endif
104}; 104};
105 105
@@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
133 return -ENOENT; 133 return -ENOENT;
134} 134}
135 135
136static struct zbud_ops zbud_zpool_ops = { 136static const struct zbud_ops zbud_zpool_ops = {
137 .evict = zbud_zpool_evict 137 .evict = zbud_zpool_evict
138}; 138};
139 139
140static void *zbud_zpool_create(char *name, gfp_t gfp, 140static void *zbud_zpool_create(char *name, gfp_t gfp,
141 struct zpool_ops *zpool_ops, 141 const struct zpool_ops *zpool_ops,
142 struct zpool *zpool) 142 struct zpool *zpool)
143{ 143{
144 struct zbud_pool *pool; 144 struct zbud_pool *pool;
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
302 * Return: pointer to the new zbud pool or NULL if the metadata allocation 302 * Return: pointer to the new zbud pool or NULL if the metadata allocation
303 * failed. 303 * failed.
304 */ 304 */
305struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) 305struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
306{ 306{
307 struct zbud_pool *pool; 307 struct zbud_pool *pool;
308 int i; 308 int i;
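The zbud, zpool and zswap ops tables are constified above, so the method tables can no longer be modified at run time and can be placed in read-only data. A tiny illustration of the pattern (names are invented for the example):

#include <stdio.h>

struct ops {
	int (*evict)(int handle);
};

static int demo_evict(int handle)
{
	printf("evict %d\n", handle);
	return 0;
}

/* a const ops table is immutable after build and ends up in .rodata */
static const struct ops demo_ops = {
	.evict = demo_evict,
};

int main(void)
{
	return demo_ops.evict(42);
}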
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f60e90b..68d2dd8ed2d8 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@ struct zpool {
22 22
23 struct zpool_driver *driver; 23 struct zpool_driver *driver;
24 void *pool; 24 void *pool;
25 struct zpool_ops *ops; 25 const struct zpool_ops *ops;
26 26
27 struct list_head list; 27 struct list_head list;
28}; 28};
@@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
115 * Returns: New zpool on success, NULL on failure. 115 * Returns: New zpool on success, NULL on failure.
116 */ 116 */
117struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, 117struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
118 struct zpool_ops *ops) 118 const struct zpool_ops *ops)
119{ 119{
120 struct zpool_driver *driver; 120 struct zpool_driver *driver;
121 struct zpool *zpool; 121 struct zpool *zpool;
@@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
320 return zpool->driver->total_size(zpool->pool); 320 return zpool->driver->total_size(zpool->pool);
321} 321}
322 322
323static int __init init_zpool(void)
324{
325 pr_info("loaded\n");
326 return 0;
327}
328
329static void __exit exit_zpool(void)
330{
331 pr_info("unloaded\n");
332}
333
334module_init(init_zpool);
335module_exit(exit_zpool);
336
337MODULE_LICENSE("GPL"); 323MODULE_LICENSE("GPL");
338MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); 324MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
339MODULE_DESCRIPTION("Common API for compressed memory storage"); 325MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0a7f81aa2249..f135b1b6fcdc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -169,14 +169,12 @@ enum zs_stat_type {
169 NR_ZS_STAT_TYPE, 169 NR_ZS_STAT_TYPE,
170}; 170};
171 171
172#ifdef CONFIG_ZSMALLOC_STAT
173
174static struct dentry *zs_stat_root;
175
176struct zs_size_stat { 172struct zs_size_stat {
177 unsigned long objs[NR_ZS_STAT_TYPE]; 173 unsigned long objs[NR_ZS_STAT_TYPE];
178}; 174};
179 175
176#ifdef CONFIG_ZSMALLOC_STAT
177static struct dentry *zs_stat_root;
180#endif 178#endif
181 179
182/* 180/*
@@ -201,6 +199,8 @@ static int zs_size_classes;
201static const int fullness_threshold_frac = 4; 199static const int fullness_threshold_frac = 4;
202 200
203struct size_class { 201struct size_class {
202 spinlock_t lock;
203 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
204 /* 204 /*
205 * Size of objects stored in this class. Must be multiple 205 * Size of objects stored in this class. Must be multiple
206 * of ZS_ALIGN. 206 * of ZS_ALIGN.
@@ -210,16 +210,10 @@ struct size_class {
210 210
211 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 211 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
212 int pages_per_zspage; 212 int pages_per_zspage;
213 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
214 bool huge;
215
216#ifdef CONFIG_ZSMALLOC_STAT
217 struct zs_size_stat stats; 213 struct zs_size_stat stats;
218#endif
219
220 spinlock_t lock;
221 214
222 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 215 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
216 bool huge;
223}; 217};
224 218
225/* 219/*
@@ -251,6 +245,15 @@ struct zs_pool {
251 gfp_t flags; /* allocation flags used when growing pool */ 245 gfp_t flags; /* allocation flags used when growing pool */
252 atomic_long_t pages_allocated; 246 atomic_long_t pages_allocated;
253 247
248 struct zs_pool_stats stats;
249
250 /* Compact classes */
251 struct shrinker shrinker;
252 /*
253 * To signify that register_shrinker() was successful
254 * and unregister_shrinker() will not Oops.
255 */
256 bool shrinker_enabled;
254#ifdef CONFIG_ZSMALLOC_STAT 257#ifdef CONFIG_ZSMALLOC_STAT
255 struct dentry *stat_dentry; 258 struct dentry *stat_dentry;
256#endif 259#endif
@@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
285 288
286static void destroy_handle_cache(struct zs_pool *pool) 289static void destroy_handle_cache(struct zs_pool *pool)
287{ 290{
288 if (pool->handle_cachep) 291 kmem_cache_destroy(pool->handle_cachep);
289 kmem_cache_destroy(pool->handle_cachep);
290} 292}
291 293
292static unsigned long alloc_handle(struct zs_pool *pool) 294static unsigned long alloc_handle(struct zs_pool *pool)
@@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
309 311
310#ifdef CONFIG_ZPOOL 312#ifdef CONFIG_ZPOOL
311 313
312static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops, 314static void *zs_zpool_create(char *name, gfp_t gfp,
315 const struct zpool_ops *zpool_ops,
313 struct zpool *zpool) 316 struct zpool *zpool)
314{ 317{
315 return zs_create_pool(name, gfp); 318 return zs_create_pool(name, gfp);
@@ -441,8 +444,6 @@ static int get_size_class_index(int size)
441 return min(zs_size_classes - 1, idx); 444 return min(zs_size_classes - 1, idx);
442} 445}
443 446
444#ifdef CONFIG_ZSMALLOC_STAT
445
446static inline void zs_stat_inc(struct size_class *class, 447static inline void zs_stat_inc(struct size_class *class,
447 enum zs_stat_type type, unsigned long cnt) 448 enum zs_stat_type type, unsigned long cnt)
448{ 449{
@@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
461 return class->stats.objs[type]; 462 return class->stats.objs[type];
462} 463}
463 464
465#ifdef CONFIG_ZSMALLOC_STAT
466
464static int __init zs_stat_init(void) 467static int __init zs_stat_init(void)
465{ 468{
466 if (!debugfs_initialized()) 469 if (!debugfs_initialized())
@@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
576} 579}
577 580
578#else /* CONFIG_ZSMALLOC_STAT */ 581#else /* CONFIG_ZSMALLOC_STAT */
579
580static inline void zs_stat_inc(struct size_class *class,
581 enum zs_stat_type type, unsigned long cnt)
582{
583}
584
585static inline void zs_stat_dec(struct size_class *class,
586 enum zs_stat_type type, unsigned long cnt)
587{
588}
589
590static inline unsigned long zs_stat_get(struct size_class *class,
591 enum zs_stat_type type)
592{
593 return 0;
594}
595
596static int __init zs_stat_init(void) 582static int __init zs_stat_init(void)
597{ 583{
598 return 0; 584 return 0;
@@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
610static inline void zs_pool_stat_destroy(struct zs_pool *pool) 596static inline void zs_pool_stat_destroy(struct zs_pool *pool)
611{ 597{
612} 598}
613
614#endif 599#endif
615 600
616 601
@@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
658 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 643 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
659 return; 644 return;
660 645
661 head = &class->fullness_list[fullness];
662 if (*head)
663 list_add_tail(&page->lru, &(*head)->lru);
664
665 *head = page;
666 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? 646 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
667 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); 647 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
648
649 head = &class->fullness_list[fullness];
650 if (!*head) {
651 *head = page;
652 return;
653 }
654
655 /*
656 * We want to see more ZS_FULL pages and less almost
657 * empty/full. Put pages with higher ->inuse first.
658 */
659 list_add_tail(&page->lru, &(*head)->lru);
660 if (page->inuse >= (*head)->inuse)
661 *head = page;
668} 662}
669 663
670/* 664/*
@@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
1495} 1489}
1496EXPORT_SYMBOL_GPL(zs_free); 1490EXPORT_SYMBOL_GPL(zs_free);
1497 1491
1498static void zs_object_copy(unsigned long src, unsigned long dst, 1492static void zs_object_copy(unsigned long dst, unsigned long src,
1499 struct size_class *class) 1493 struct size_class *class)
1500{ 1494{
1501 struct page *s_page, *d_page; 1495 struct page *s_page, *d_page;
@@ -1602,8 +1596,6 @@ struct zs_compact_control {
1602 /* Starting object index within @s_page which used for live object 1596 /* Starting object index within @s_page which used for live object
1603 * in the subpage. */ 1597 * in the subpage. */
1604 int index; 1598 int index;
1605 /* how many of objects are migrated */
1606 int nr_migrated;
1607}; 1599};
1608 1600
1609static int migrate_zspage(struct zs_pool *pool, struct size_class *class, 1601static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1614 struct page *s_page = cc->s_page; 1606 struct page *s_page = cc->s_page;
1615 struct page *d_page = cc->d_page; 1607 struct page *d_page = cc->d_page;
1616 unsigned long index = cc->index; 1608 unsigned long index = cc->index;
1617 int nr_migrated = 0;
1618 int ret = 0; 1609 int ret = 0;
1619 1610
1620 while (1) { 1611 while (1) {
@@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1636 1627
1637 used_obj = handle_to_obj(handle); 1628 used_obj = handle_to_obj(handle);
1638 free_obj = obj_malloc(d_page, class, handle); 1629 free_obj = obj_malloc(d_page, class, handle);
1639 zs_object_copy(used_obj, free_obj, class); 1630 zs_object_copy(free_obj, used_obj, class);
1640 index++; 1631 index++;
1641 record_obj(handle, free_obj); 1632 record_obj(handle, free_obj);
1642 unpin_tag(handle); 1633 unpin_tag(handle);
1643 obj_free(pool, class, used_obj); 1634 obj_free(pool, class, used_obj);
1644 nr_migrated++;
1645 } 1635 }
1646 1636
1647 /* Remember last position in this iteration */ 1637 /* Remember last position in this iteration */
1648 cc->s_page = s_page; 1638 cc->s_page = s_page;
1649 cc->index = index; 1639 cc->index = index;
1650 cc->nr_migrated = nr_migrated;
1651 1640
1652 return ret; 1641 return ret;
1653} 1642}
1654 1643
1655static struct page *alloc_target_page(struct size_class *class) 1644static struct page *isolate_target_page(struct size_class *class)
1656{ 1645{
1657 int i; 1646 int i;
1658 struct page *page; 1647 struct page *page;
@@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
1668 return page; 1657 return page;
1669} 1658}
1670 1659
1671static void putback_zspage(struct zs_pool *pool, struct size_class *class, 1660/*
1672 struct page *first_page) 1661 * putback_zspage - add @first_page into right class's fullness list
1662 * @pool: target pool
1663 * @class: destination class
1664 * @first_page: target page
1665 *
1666 * Return @first_page's fullness_group
1667 */
1668static enum fullness_group putback_zspage(struct zs_pool *pool,
1669 struct size_class *class,
1670 struct page *first_page)
1673{ 1671{
1674 enum fullness_group fullness; 1672 enum fullness_group fullness;
1675 1673
@@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
1687 1685
1688 free_zspage(first_page); 1686 free_zspage(first_page);
1689 } 1687 }
1688
1689 return fullness;
1690} 1690}
1691 1691
1692static struct page *isolate_source_page(struct size_class *class) 1692static struct page *isolate_source_page(struct size_class *class)
1693{ 1693{
1694 struct page *page; 1694 int i;
1695 struct page *page = NULL;
1695 1696
1696 page = class->fullness_list[ZS_ALMOST_EMPTY]; 1697 for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
1697 if (page) 1698 page = class->fullness_list[i];
1698 remove_zspage(page, class, ZS_ALMOST_EMPTY); 1699 if (!page)
1700 continue;
1701
1702 remove_zspage(page, class, i);
1703 break;
1704 }
1699 1705
1700 return page; 1706 return page;
1701} 1707}
1702 1708
1703static unsigned long __zs_compact(struct zs_pool *pool, 1709/*
1704 struct size_class *class) 1710 *
1711 * Based on the number of unused allocated objects calculate
1712 * and return the number of pages that we can free.
1713 */
1714static unsigned long zs_can_compact(struct size_class *class)
1715{
1716 unsigned long obj_wasted;
1717
1718 obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
1719 zs_stat_get(class, OBJ_USED);
1720
1721 obj_wasted /= get_maxobj_per_zspage(class->size,
1722 class->pages_per_zspage);
1723
1724 return obj_wasted * class->pages_per_zspage;
1725}
1726
1727static void __zs_compact(struct zs_pool *pool, struct size_class *class)
1705{ 1728{
1706 int nr_to_migrate;
1707 struct zs_compact_control cc; 1729 struct zs_compact_control cc;
1708 struct page *src_page; 1730 struct page *src_page;
1709 struct page *dst_page = NULL; 1731 struct page *dst_page = NULL;
1710 unsigned long nr_total_migrated = 0;
1711 1732
1712 spin_lock(&class->lock); 1733 spin_lock(&class->lock);
1713 while ((src_page = isolate_source_page(class))) { 1734 while ((src_page = isolate_source_page(class))) {
1714 1735
1715 BUG_ON(!is_first_page(src_page)); 1736 BUG_ON(!is_first_page(src_page));
1716 1737
1717 /* The goal is to migrate all live objects in source page */ 1738 if (!zs_can_compact(class))
1718 nr_to_migrate = src_page->inuse; 1739 break;
1740
1719 cc.index = 0; 1741 cc.index = 0;
1720 cc.s_page = src_page; 1742 cc.s_page = src_page;
1721 1743
1722 while ((dst_page = alloc_target_page(class))) { 1744 while ((dst_page = isolate_target_page(class))) {
1723 cc.d_page = dst_page; 1745 cc.d_page = dst_page;
1724 /* 1746 /*
1725 * If there is no more space in dst_page, try to 1747 * If there is no more space in dst_page, resched
1726 * allocate another zspage. 1748 * and see if anyone had allocated another zspage.
1727 */ 1749 */
1728 if (!migrate_zspage(pool, class, &cc)) 1750 if (!migrate_zspage(pool, class, &cc))
1729 break; 1751 break;
1730 1752
1731 putback_zspage(pool, class, dst_page); 1753 putback_zspage(pool, class, dst_page);
1732 nr_total_migrated += cc.nr_migrated;
1733 nr_to_migrate -= cc.nr_migrated;
1734 } 1754 }
1735 1755
1736 /* Stop if we couldn't find slot */ 1756 /* Stop if we couldn't find slot */
@@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
1738 break; 1758 break;
1739 1759
1740 putback_zspage(pool, class, dst_page); 1760 putback_zspage(pool, class, dst_page);
1741 putback_zspage(pool, class, src_page); 1761 if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
1762 pool->stats.pages_compacted += class->pages_per_zspage;
1742 spin_unlock(&class->lock); 1763 spin_unlock(&class->lock);
1743 nr_total_migrated += cc.nr_migrated;
1744 cond_resched(); 1764 cond_resched();
1745 spin_lock(&class->lock); 1765 spin_lock(&class->lock);
1746 } 1766 }
@@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
1749 putback_zspage(pool, class, src_page); 1769 putback_zspage(pool, class, src_page);
1750 1770
1751 spin_unlock(&class->lock); 1771 spin_unlock(&class->lock);
1752
1753 return nr_total_migrated;
1754} 1772}
1755 1773
1756unsigned long zs_compact(struct zs_pool *pool) 1774unsigned long zs_compact(struct zs_pool *pool)
1757{ 1775{
1758 int i; 1776 int i;
1759 unsigned long nr_migrated = 0;
1760 struct size_class *class; 1777 struct size_class *class;
1761 1778
1762 for (i = zs_size_classes - 1; i >= 0; i--) { 1779 for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
1765 continue; 1782 continue;
1766 if (class->index != i) 1783 if (class->index != i)
1767 continue; 1784 continue;
1768 nr_migrated += __zs_compact(pool, class); 1785 __zs_compact(pool, class);
1769 } 1786 }
1770 1787
1771 return nr_migrated; 1788 return pool->stats.pages_compacted;
1772} 1789}
1773EXPORT_SYMBOL_GPL(zs_compact); 1790EXPORT_SYMBOL_GPL(zs_compact);
1774 1791
1792void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
1793{
1794 memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
1795}
1796EXPORT_SYMBOL_GPL(zs_pool_stats);
1797
1798static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
1799 struct shrink_control *sc)
1800{
1801 unsigned long pages_freed;
1802 struct zs_pool *pool = container_of(shrinker, struct zs_pool,
1803 shrinker);
1804
1805 pages_freed = pool->stats.pages_compacted;
1806 /*
1807 * Compact classes and calculate compaction delta.
1808 * Can run concurrently with a manually triggered
1809 * (by user) compaction.
1810 */
1811 pages_freed = zs_compact(pool) - pages_freed;
1812
1813 return pages_freed ? pages_freed : SHRINK_STOP;
1814}
1815
1816static unsigned long zs_shrinker_count(struct shrinker *shrinker,
1817 struct shrink_control *sc)
1818{
1819 int i;
1820 struct size_class *class;
1821 unsigned long pages_to_free = 0;
1822 struct zs_pool *pool = container_of(shrinker, struct zs_pool,
1823 shrinker);
1824
1825 if (!pool->shrinker_enabled)
1826 return 0;
1827
1828 for (i = zs_size_classes - 1; i >= 0; i--) {
1829 class = pool->size_class[i];
1830 if (!class)
1831 continue;
1832 if (class->index != i)
1833 continue;
1834
1835 pages_to_free += zs_can_compact(class);
1836 }
1837
1838 return pages_to_free;
1839}
1840
1841static void zs_unregister_shrinker(struct zs_pool *pool)
1842{
1843 if (pool->shrinker_enabled) {
1844 unregister_shrinker(&pool->shrinker);
1845 pool->shrinker_enabled = false;
1846 }
1847}
1848
1849static int zs_register_shrinker(struct zs_pool *pool)
1850{
1851 pool->shrinker.scan_objects = zs_shrinker_scan;
1852 pool->shrinker.count_objects = zs_shrinker_count;
1853 pool->shrinker.batch = 0;
1854 pool->shrinker.seeks = DEFAULT_SEEKS;
1855
1856 return register_shrinker(&pool->shrinker);
1857}
1858
1775/** 1859/**
1776 * zs_create_pool - Creates an allocation pool to work from. 1860 * zs_create_pool - Creates an allocation pool to work from.
1777 * @flags: allocation flags used to allocate pool metadata 1861 * @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1857 if (zs_pool_stat_create(name, pool)) 1941 if (zs_pool_stat_create(name, pool))
1858 goto err; 1942 goto err;
1859 1943
1944 /*
1945 * Not critical, we still can use the pool
1946 * and user can trigger compaction manually.
1947 */
1948 if (zs_register_shrinker(pool) == 0)
1949 pool->shrinker_enabled = true;
1860 return pool; 1950 return pool;
1861 1951
1862err: 1952err:
@@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1869{ 1959{
1870 int i; 1960 int i;
1871 1961
1962 zs_unregister_shrinker(pool);
1872 zs_pool_stat_destroy(pool); 1963 zs_pool_stat_destroy(pool);
1873 1964
1874 for (i = 0; i < zs_size_classes; i++) { 1965 for (i = 0; i < zs_size_classes; i++) {
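The new zs_can_compact() above estimates reclaimable memory from the allocated/used gap: unused objects divided by objects per zspage gives whole zspages that compaction could empty, and multiplying by pages_per_zspage converts that to pages. The shrinker's count callback sums this over all classes, while its scan callback reports the delta in pages_compacted. A worked model of the estimate (the class geometry here is made up):

#include <stdio.h>

struct toy_class {
	unsigned long obj_allocated;
	unsigned long obj_used;
	int maxobj_per_zspage;
	int pages_per_zspage;
};

/* Same arithmetic as zs_can_compact(): whole zspages worth of unused
 * objects, expressed in pages. */
static unsigned long can_compact(const struct toy_class *c)
{
	unsigned long obj_wasted = c->obj_allocated - c->obj_used;

	obj_wasted /= c->maxobj_per_zspage;
	return obj_wasted * c->pages_per_zspage;
}

int main(void)
{
	/* hypothetical class: 4-page zspages holding 32 objects each */
	struct toy_class c = {
		.obj_allocated	   = 320,	/* 10 zspages worth */
		.obj_used	   = 200,
		.maxobj_per_zspage = 32,
		.pages_per_zspage  = 4,
	};

	/* 120 wasted objects -> 3 whole zspages -> 12 pages */
	printf("compactable pages: %lu\n", can_compact(&c));
	return 0;
}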
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727baed59..48a1d081e2a5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
446static int zswap_get_swap_cache_page(swp_entry_t entry, 446static int zswap_get_swap_cache_page(swp_entry_t entry,
447 struct page **retpage) 447 struct page **retpage)
448{ 448{
449 struct page *found_page, *new_page = NULL; 449 bool page_was_allocated;
450 struct address_space *swapper_space = swap_address_space(entry);
451 int err;
452 450
453 *retpage = NULL; 451 *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
454 do { 452 NULL, 0, &page_was_allocated);
455 /* 453 if (page_was_allocated)
456 * First check the swap cache. Since this is normally 454 return ZSWAP_SWAPCACHE_NEW;
457 * called after lookup_swap_cache() failed, re-calling 455 if (!*retpage)
458 * that would confuse statistics.
459 */
460 found_page = find_get_page(swapper_space, entry.val);
461 if (found_page)
462 break;
463
464 /*
465 * Get a new page to read into from swap.
466 */
467 if (!new_page) {
468 new_page = alloc_page(GFP_KERNEL);
469 if (!new_page)
470 break; /* Out of memory */
471 }
472
473 /*
474 * call radix_tree_preload() while we can wait.
475 */
476 err = radix_tree_preload(GFP_KERNEL);
477 if (err)
478 break;
479
480 /*
481 * Swap entry may have been freed since our caller observed it.
482 */
483 err = swapcache_prepare(entry);
484 if (err == -EEXIST) { /* seems racy */
485 radix_tree_preload_end();
486 continue;
487 }
488 if (err) { /* swp entry is obsolete ? */
489 radix_tree_preload_end();
490 break;
491 }
492
493 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
494 __set_page_locked(new_page);
495 SetPageSwapBacked(new_page);
496 err = __add_to_swap_cache(new_page, entry);
497 if (likely(!err)) {
498 radix_tree_preload_end();
499 lru_cache_add_anon(new_page);
500 *retpage = new_page;
501 return ZSWAP_SWAPCACHE_NEW;
502 }
503 radix_tree_preload_end();
504 ClearPageSwapBacked(new_page);
505 __clear_page_locked(new_page);
506 /*
507 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
508 * clear SWAP_HAS_CACHE flag.
509 */
510 swapcache_free(entry);
511 } while (err != -ENOMEM);
512
513 if (new_page)
514 page_cache_release(new_page);
515 if (!found_page)
516 return ZSWAP_SWAPCACHE_FAIL; 456 return ZSWAP_SWAPCACHE_FAIL;
517 *retpage = found_page;
518 return ZSWAP_SWAPCACHE_EXIST; 457 return ZSWAP_SWAPCACHE_EXIST;
519} 458}
520 459
@@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
816 zswap_trees[type] = NULL; 755 zswap_trees[type] = NULL;
817} 756}
818 757
819static struct zpool_ops zswap_zpool_ops = { 758static const struct zpool_ops zswap_zpool_ops = {
820 .evict = zswap_writeback_entry 759 .evict = zswap_writeback_entry
821}; 760};
822 761
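After the zswap.c simplification above, zswap_get_swap_cache_page() just maps the __read_swap_cache_async() outcome onto its three return codes: a freshly allocated page means ZSWAP_SWAPCACHE_NEW, an existing page means ZSWAP_SWAPCACHE_EXIST, and no page means ZSWAP_SWAPCACHE_FAIL. A compact standalone model of that mapping (types and the lookup helper are stand-ins):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum cache_ret { CACHE_NEW, CACHE_EXIST, CACHE_FAIL };

/* Stand-in for __read_swap_cache_async(): may return an existing page,
 * a newly allocated one, or NULL. */
static void *lookup_or_alloc(bool simulate_hit, bool *was_allocated)
{
	static int existing;

	*was_allocated = false;
	if (simulate_hit)
		return &existing;
	*was_allocated = true;
	return malloc(sizeof(int));
}

static enum cache_ret get_cache_page(bool simulate_hit, void **retpage)
{
	bool was_allocated;

	*retpage = lookup_or_alloc(simulate_hit, &was_allocated);
	if (was_allocated)
		return CACHE_NEW;
	if (!*retpage)
		return CACHE_FAIL;
	return CACHE_EXIST;
}

int main(void)
{
	void *page;
	enum cache_ret r;

	r = get_cache_page(false, &page);	/* miss: CACHE_NEW */
	printf("%d\n", r);
	if (r == CACHE_NEW)
		free(page);

	r = get_cache_page(true, &page);	/* hit: CACHE_EXIST */
	printf("%d\n", r);
	return 0;
}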