author     Linus Torvalds <torvalds@linux-foundation.org>  2014-01-21 22:05:45 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-01-21 22:05:45 -0500
commit     df32e43a54d04eda35d2859beaf90e3864d53288 (patch)
tree       7a61cf658b2949bd426285eb9902be7758ced1ba /mm
parent     fbd918a2026d0464ce9c23f57b7de4bcfccdc2e6 (diff)
parent     78d5506e82b21a1a1de68c24182db2c2fe521422 (diff)
Merge branch 'akpm' (incoming from Andrew)
Merge first patch-bomb from Andrew Morton:
- a couple of misc things
- inotify/fsnotify work from Jan
- ocfs2 updates (partial)
- about half of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits)
mm/migrate: remove unused function, fail_migrate_page()
mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages
mm/migrate: correct failure handling if !hugepage_migration_support()
mm/migrate: add comment about permanent failure path
mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure
mm: compaction: reset scanner positions immediately when they meet
mm: compaction: do not mark unmovable pageblocks as skipped in async compaction
mm: compaction: detect when scanners meet in isolate_freepages
mm: compaction: reset cached scanner pfn's before reading them
mm: compaction: encapsulate defer reset logic
mm: compaction: trace compaction begin and end
memcg, oom: lock mem_cgroup_print_oom_info
sched: add tracepoints related to NUMA task migration
mm: numa: do not automatically migrate KSM pages
mm: numa: trace tasks that fail migration due to rate limiting
mm: numa: limit scope of lock for NUMA migrate rate limiting
mm: numa: make NUMA-migrate related functions static
lib/show_mem.c: show num_poisoned_pages when oom
mm/hwpoison: add '#' to hwpoison_inject
mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c      |  61
-rw-r--r--  mm/hugetlb.c         |  46
-rw-r--r--  mm/hwpoison-inject.c |   2
-rw-r--r--  mm/internal.h        |   4
-rw-r--r--  mm/ksm.c             | 121
-rw-r--r--  mm/memblock.c        | 387
-rw-r--r--  mm/memcontrol.c      |  17
-rw-r--r--  mm/memory-failure.c  |  10
-rw-r--r--  mm/memory.c          |  16
-rw-r--r--  mm/memory_hotplug.c  |   4
-rw-r--r--  mm/migrate.c         |  89
-rw-r--r--  mm/mlock.c           |  18
-rw-r--r--  mm/mmap.c            |  46
-rw-r--r--  mm/mprotect.c        |   3
-rw-r--r--  mm/nobootmem.c       |  10
-rw-r--r--  mm/nommu.c           |   1
-rw-r--r--  mm/oom_kill.c        |  51
-rw-r--r--  mm/page_alloc.c      |  89
-rw-r--r--  mm/page_cgroup.c     |   5
-rw-r--r--  mm/percpu.c          |  38
-rw-r--r--  mm/rmap.c            | 580
-rw-r--r--  mm/sparse-vmemmap.c  |   6
-rw-r--r--  mm/sparse.c          |  27
-rw-r--r--  mm/swap.c            | 278
-rw-r--r--  mm/util.c            |  36
-rw-r--r--  mm/vmalloc.c         |  20
26 files changed, 1145 insertions, 820 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..3a91a2ea3d34 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
459 | unsigned long flags; | 459 | unsigned long flags; |
460 | bool locked = false; | 460 | bool locked = false; |
461 | struct page *page = NULL, *valid_page = NULL; | 461 | struct page *page = NULL, *valid_page = NULL; |
462 | bool skipped_async_unsuitable = false; | ||
462 | 463 | ||
463 | /* | 464 | /* |
464 | * Ensure that there are not too many pages isolated from the LRU | 465 | * Ensure that there are not too many pages isolated from the LRU |
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
534 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 535 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
535 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 536 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
536 | cc->finished_update_migrate = true; | 537 | cc->finished_update_migrate = true; |
538 | skipped_async_unsuitable = true; | ||
537 | goto next_pageblock; | 539 | goto next_pageblock; |
538 | } | 540 | } |
539 | 541 | ||
@@ -627,8 +629,13 @@ next_pageblock: | |||
627 | if (locked) | 629 | if (locked) |
628 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 630 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
629 | 631 | ||
630 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 632 | /* |
631 | if (low_pfn == end_pfn) | 633 | * Update the pageblock-skip information and cached scanner pfn, |
634 | * if the whole pageblock was scanned without isolating any page. | ||
635 | * This is not done when pageblock was skipped due to being unsuitable | ||
636 | * for async compaction, so that eventual sync compaction can try. | ||
637 | */ | ||
638 | if (low_pfn == end_pfn && !skipped_async_unsuitable) | ||
632 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 639 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
633 | 640 | ||
634 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 641 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone, | |||
660 | * is the end of the pageblock the migration scanner is using. | 667 | * is the end of the pageblock the migration scanner is using. |
661 | */ | 668 | */ |
662 | pfn = cc->free_pfn; | 669 | pfn = cc->free_pfn; |
663 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | 670 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
664 | 671 | ||
665 | /* | 672 | /* |
666 | * Take care that if the migration scanner is at the end of the zone | 673 | * Take care that if the migration scanner is at the end of the zone |
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone, | |||
676 | * pages on cc->migratepages. We stop searching if the migrate | 683 | * pages on cc->migratepages. We stop searching if the migrate |
677 | * and free page scanners meet or enough free pages are isolated. | 684 | * and free page scanners meet or enough free pages are isolated. |
678 | */ | 685 | */ |
679 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | 686 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
680 | pfn -= pageblock_nr_pages) { | 687 | pfn -= pageblock_nr_pages) { |
681 | unsigned long isolated; | 688 | unsigned long isolated; |
682 | 689 | ||
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone, | |||
738 | /* split_free_page does not map the pages */ | 745 | /* split_free_page does not map the pages */ |
739 | map_pages(freelist); | 746 | map_pages(freelist); |
740 | 747 | ||
741 | cc->free_pfn = high_pfn; | 748 | /* |
749 | * If we crossed the migrate scanner, we want to keep it that way | ||
750 | * so that compact_finished() may detect this | ||
751 | */ | ||
752 | if (pfn < low_pfn) | ||
753 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | ||
754 | else | ||
755 | cc->free_pfn = high_pfn; | ||
742 | cc->nr_freepages = nr_freepages; | 756 | cc->nr_freepages = nr_freepages; |
743 | } | 757 | } |
744 | 758 | ||
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone, | |||
837 | 851 | ||
838 | /* Compaction run completes if the migrate and free scanner meet */ | 852 | /* Compaction run completes if the migrate and free scanner meet */ |
839 | if (cc->free_pfn <= cc->migrate_pfn) { | 853 | if (cc->free_pfn <= cc->migrate_pfn) { |
854 | /* Let the next compaction start anew. */ | ||
855 | zone->compact_cached_migrate_pfn = zone->zone_start_pfn; | ||
856 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | ||
857 | |||
840 | /* | 858 | /* |
841 | * Mark that the PG_migrate_skip information should be cleared | 859 | * Mark that the PG_migrate_skip information should be cleared |
842 | * by kswapd when it goes to sleep. kswapd does not set the | 860 | * by kswapd when it goes to sleep. kswapd does not set the |
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
947 | } | 965 | } |
948 | 966 | ||
949 | /* | 967 | /* |
968 | * Clear pageblock skip if there were failures recently and compaction | ||
969 | * is about to be retried after being deferred. kswapd does not do | ||
970 | * this reset as it'll reset the cached information when going to sleep. | ||
971 | */ | ||
972 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
973 | __reset_isolation_suitable(zone); | ||
974 | |||
975 | /* | ||
950 | * Setup to move all movable pages to the end of the zone. Used cached | 976 | * Setup to move all movable pages to the end of the zone. Used cached |
951 | * information on where the scanners should start but check that it | 977 | * information on where the scanners should start but check that it |
952 | * is initialised by ensuring the values are within zone boundaries. | 978 | * is initialised by ensuring the values are within zone boundaries. |
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
962 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 988 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; |
963 | } | 989 | } |
964 | 990 | ||
965 | /* | 991 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); |
966 | * Clear pageblock skip if there were failures recently and compaction | ||
967 | * is about to be retried after being deferred. kswapd does not do | ||
968 | * this reset as it'll reset the cached information when going to sleep. | ||
969 | */ | ||
970 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
971 | __reset_isolation_suitable(zone); | ||
972 | 992 | ||
973 | migrate_prep_local(); | 993 | migrate_prep_local(); |
974 | 994 | ||
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1003 | if (err) { | 1023 | if (err) { |
1004 | putback_movable_pages(&cc->migratepages); | 1024 | putback_movable_pages(&cc->migratepages); |
1005 | cc->nr_migratepages = 0; | 1025 | cc->nr_migratepages = 0; |
1006 | if (err == -ENOMEM) { | 1026 | /* |
1027 | * migrate_pages() may return -ENOMEM when scanners meet | ||
1028 | * and we want compact_finished() to detect it | ||
1029 | */ | ||
1030 | if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { | ||
1007 | ret = COMPACT_PARTIAL; | 1031 | ret = COMPACT_PARTIAL; |
1008 | goto out; | 1032 | goto out; |
1009 | } | 1033 | } |
@@ -1015,6 +1039,8 @@ out: | |||
1015 | cc->nr_freepages -= release_freepages(&cc->freepages); | 1039 | cc->nr_freepages -= release_freepages(&cc->freepages); |
1016 | VM_BUG_ON(cc->nr_freepages != 0); | 1040 | VM_BUG_ON(cc->nr_freepages != 0); |
1017 | 1041 | ||
1042 | trace_mm_compaction_end(ret); | ||
1043 | |||
1018 | return ret; | 1044 | return ret; |
1019 | } | 1045 | } |
1020 | 1046 | ||
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1120 | compact_zone(zone, cc); | 1146 | compact_zone(zone, cc); |
1121 | 1147 | ||
1122 | if (cc->order > 0) { | 1148 | if (cc->order > 0) { |
1123 | int ok = zone_watermark_ok(zone, cc->order, | 1149 | if (zone_watermark_ok(zone, cc->order, |
1124 | low_wmark_pages(zone), 0, 0); | 1150 | low_wmark_pages(zone), 0, 0)) |
1125 | if (ok && cc->order >= zone->compact_order_failed) | 1151 | compaction_defer_reset(zone, cc->order, false); |
1126 | zone->compact_order_failed = cc->order + 1; | ||
1127 | /* Currently async compaction is never deferred. */ | 1152 | /* Currently async compaction is never deferred. */ |
1128 | else if (!ok && cc->sync) | 1153 | else if (cc->sync) |
1129 | defer_compaction(zone, cc->order); | 1154 | defer_compaction(zone, cc->order); |
1130 | } | 1155 | } |
1131 | 1156 | ||
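Most of the compaction hunks above revolve around the point where the two scanners meet: the migrate scanner walks up from the zone start, the free scanner walks down from the zone end, and compact_finished() treats free_pfn <= migrate_pfn as completion, now also resetting the cached scanner positions so the next run starts fresh. A minimal sketch of that invariant with toy stand-in types (toy_cc and toy_compact_finished are illustrative names, not kernel code):

    #include <stdbool.h>

    /* toy_cc stands in for the relevant compact_control fields. */
    struct toy_cc {
            unsigned long migrate_pfn;   /* scans upward from the zone start */
            unsigned long free_pfn;      /* scans downward from the zone end */
    };

    static bool toy_compact_finished(struct toy_cc *cc,
                                     unsigned long zone_start_pfn,
                                     unsigned long zone_end_pfn,
                                     unsigned long *cached_migrate_pfn,
                                     unsigned long *cached_free_pfn)
    {
            if (cc->free_pfn <= cc->migrate_pfn) {
                    /* Scanners met or crossed: reset the cached positions so
                     * the next compaction run starts anew, as the hunk in
                     * compact_finished() above does. */
                    *cached_migrate_pfn = zone_start_pfn;
                    *cached_free_pfn = zone_end_pfn;
                    return true;
            }
            return false;
    }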
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..04306b9de90d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
690 | */ | 690 | */ |
691 | int PageHuge(struct page *page) | 691 | int PageHuge(struct page *page) |
692 | { | 692 | { |
693 | compound_page_dtor *dtor; | ||
694 | |||
695 | if (!PageCompound(page)) | 693 | if (!PageCompound(page)) |
696 | return 0; | 694 | return 0; |
697 | 695 | ||
698 | page = compound_head(page); | 696 | page = compound_head(page); |
699 | dtor = get_compound_page_dtor(page); | 697 | return get_compound_page_dtor(page) == free_huge_page; |
700 | |||
701 | return dtor == free_huge_page; | ||
702 | } | 698 | } |
703 | EXPORT_SYMBOL_GPL(PageHuge); | 699 | EXPORT_SYMBOL_GPL(PageHuge); |
704 | 700 | ||
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge); | |||
708 | */ | 704 | */ |
709 | int PageHeadHuge(struct page *page_head) | 705 | int PageHeadHuge(struct page *page_head) |
710 | { | 706 | { |
711 | compound_page_dtor *dtor; | ||
712 | |||
713 | if (!PageHead(page_head)) | 707 | if (!PageHead(page_head)) |
714 | return 0; | 708 | return 0; |
715 | 709 | ||
716 | dtor = get_compound_page_dtor(page_head); | 710 | return get_compound_page_dtor(page_head) == free_huge_page; |
717 | |||
718 | return dtor == free_huge_page; | ||
719 | } | 711 | } |
720 | EXPORT_SYMBOL_GPL(PageHeadHuge); | ||
721 | 712 | ||
722 | pgoff_t __basepage_index(struct page *page) | 713 | pgoff_t __basepage_index(struct page *page) |
723 | { | 714 | { |
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1280 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { | 1271 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { |
1281 | void *addr; | 1272 | void *addr; |
1282 | 1273 | ||
1283 | addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), | 1274 | addr = memblock_virt_alloc_try_nid_nopanic( |
1284 | huge_page_size(h), huge_page_size(h), 0); | 1275 | huge_page_size(h), huge_page_size(h), |
1285 | 1276 | 0, BOOTMEM_ALLOC_ACCESSIBLE, node); | |
1286 | if (addr) { | 1277 | if (addr) { |
1287 | /* | 1278 | /* |
1288 | * Use the beginning of the huge page to store the | 1279 | * Use the beginning of the huge page to store the |
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void) | |||
1322 | 1313 | ||
1323 | #ifdef CONFIG_HIGHMEM | 1314 | #ifdef CONFIG_HIGHMEM |
1324 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | 1315 | page = pfn_to_page(m->phys >> PAGE_SHIFT); |
1325 | free_bootmem_late((unsigned long)m, | 1316 | memblock_free_late(__pa(m), |
1326 | sizeof(struct huge_bootmem_page)); | 1317 | sizeof(struct huge_bootmem_page)); |
1327 | #else | 1318 | #else |
1328 | page = virt_to_page(m); | 1319 | page = virt_to_page(m); |
1329 | #endif | 1320 | #endif |
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2355 | int cow; | 2346 | int cow; |
2356 | struct hstate *h = hstate_vma(vma); | 2347 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2348 | unsigned long sz = huge_page_size(h); |
2349 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2350 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2351 | int ret = 0; | ||
2358 | 2352 | ||
2359 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 2353 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
2360 | 2354 | ||
2355 | mmun_start = vma->vm_start; | ||
2356 | mmun_end = vma->vm_end; | ||
2357 | if (cow) | ||
2358 | mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); | ||
2359 | |||
2361 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | 2360 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
2362 | spinlock_t *src_ptl, *dst_ptl; | 2361 | spinlock_t *src_ptl, *dst_ptl; |
2363 | src_pte = huge_pte_offset(src, addr); | 2362 | src_pte = huge_pte_offset(src, addr); |
2364 | if (!src_pte) | 2363 | if (!src_pte) |
2365 | continue; | 2364 | continue; |
2366 | dst_pte = huge_pte_alloc(dst, addr, sz); | 2365 | dst_pte = huge_pte_alloc(dst, addr, sz); |
2367 | if (!dst_pte) | 2366 | if (!dst_pte) { |
2368 | goto nomem; | 2367 | ret = -ENOMEM; |
2368 | break; | ||
2369 | } | ||
2369 | 2370 | ||
2370 | /* If the pagetables are shared don't copy or take references */ | 2371 | /* If the pagetables are shared don't copy or take references */ |
2371 | if (dst_pte == src_pte) | 2372 | if (dst_pte == src_pte) |
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2386 | spin_unlock(src_ptl); | 2387 | spin_unlock(src_ptl); |
2387 | spin_unlock(dst_ptl); | 2388 | spin_unlock(dst_ptl); |
2388 | } | 2389 | } |
2389 | return 0; | ||
2390 | 2390 | ||
2391 | nomem: | 2391 | if (cow) |
2392 | return -ENOMEM; | 2392 | mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); |
2393 | |||
2394 | return ret; | ||
2393 | } | 2395 | } |
2394 | 2396 | ||
2395 | static int is_hugetlb_entry_migration(pte_t pte) | 2397 | static int is_hugetlb_entry_migration(pte_t pte) |
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3079 | same_page: | 3081 | same_page: |
3080 | if (pages) { | 3082 | if (pages) { |
3081 | pages[i] = mem_map_offset(page, pfn_offset); | 3083 | pages[i] = mem_map_offset(page, pfn_offset); |
3082 | get_page(pages[i]); | 3084 | get_page_foll(pages[i]); |
3083 | } | 3085 | } |
3084 | 3086 | ||
3085 | if (vmas) | 3087 | if (vmas) |
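The copy_hugetlb_page_range() hunks above do two related things: when a private (cow) mapping is copied, the whole copy loop is bracketed in an mmu_notifier invalidation window, and the old "goto nomem" exit is replaced by a break plus a single return so the matching _end call is always issued. Schematically (kernel context, condensed from the new side of the diff, not a standalone program):

    mmun_start = vma->vm_start;
    mmun_end   = vma->vm_end;
    if (cow)
            mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);

    for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
            dst_pte = huge_pte_alloc(dst, addr, sz);
            if (!dst_pte) {
                    ret = -ENOMEM;
                    break;          /* single exit keeps start/end balanced */
            }
            /* copy one huge PTE (unchanged from the old code) */
    }

    if (cow)
            mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
    return ret;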
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
55 | return 0; | 55 | return 0; |
56 | 56 | ||
57 | inject: | 57 | inject: |
58 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | 58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); |
59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); | 59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); |
60 | } | 60 | } |
61 | 61 | ||
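The '#' added to the format above is the standard printf alternate-form flag, so the pfn is now printed with a leading 0x. A quick userspace illustration (the pfn value is hypothetical):

    #include <stdio.h>

    int main(void)
    {
            unsigned long pfn = 0x12345;    /* hypothetical pfn */

            printf("pfn %lx\n", pfn);       /* prints: pfn 12345   */
            printf("pfn %#lx\n", pfn);      /* prints: pfn 0x12345 */
            return 0;
    }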
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..a346ba120e42 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page, | |||
47 | * page_cache_get_speculative()) on tail pages. | 47 | * page_cache_get_speculative()) on tail pages. |
48 | */ | 48 | */ |
49 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | 49 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); |
50 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
51 | VM_BUG_ON(page_mapcount(page) < 0); | ||
52 | if (get_page_head) | 50 | if (get_page_head) |
53 | atomic_inc(&page->first_page->_count); | 51 | atomic_inc(&page->first_page->_count); |
54 | atomic_inc(&page->_mapcount); | 52 | get_huge_page_tail(page); |
55 | } | 53 | } |
56 | 54 | ||
57 | /* | 55 | /* |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
1891 | return new_page; | 1891 | return new_page; |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | 1894 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) |
1895 | unsigned long *vm_flags) | ||
1896 | { | 1895 | { |
1897 | struct stable_node *stable_node; | 1896 | struct stable_node *stable_node; |
1898 | struct rmap_item *rmap_item; | 1897 | struct rmap_item *rmap_item; |
1899 | unsigned int mapcount = page_mapcount(page); | 1898 | int ret = SWAP_AGAIN; |
1900 | int referenced = 0; | ||
1901 | int search_new_forks = 0; | 1899 | int search_new_forks = 0; |
1902 | 1900 | ||
1903 | VM_BUG_ON(!PageKsm(page)); | 1901 | VM_BUG_ON(!PageKsm(page)); |
1902 | |||
1903 | /* | ||
1904 | * Rely on the page lock to protect against concurrent modifications | ||
1905 | * to that page's node of the stable tree. | ||
1906 | */ | ||
1904 | VM_BUG_ON(!PageLocked(page)); | 1907 | VM_BUG_ON(!PageLocked(page)); |
1905 | 1908 | ||
1906 | stable_node = page_stable_node(page); | 1909 | stable_node = page_stable_node(page); |
1907 | if (!stable_node) | 1910 | if (!stable_node) |
1908 | return 0; | 1911 | return ret; |
1909 | again: | 1912 | again: |
1910 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | 1913 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1911 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1914 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
@@ -1928,113 +1931,16 @@ again: | |||
1928 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | 1931 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) |
1929 | continue; | 1932 | continue; |
1930 | 1933 | ||
1931 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | 1934 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1932 | continue; | ||
1933 | |||
1934 | referenced += page_referenced_one(page, vma, | ||
1935 | rmap_item->address, &mapcount, vm_flags); | ||
1936 | if (!search_new_forks || !mapcount) | ||
1937 | break; | ||
1938 | } | ||
1939 | anon_vma_unlock_read(anon_vma); | ||
1940 | if (!mapcount) | ||
1941 | goto out; | ||
1942 | } | ||
1943 | if (!search_new_forks++) | ||
1944 | goto again; | ||
1945 | out: | ||
1946 | return referenced; | ||
1947 | } | ||
1948 | |||
1949 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1950 | { | ||
1951 | struct stable_node *stable_node; | ||
1952 | struct rmap_item *rmap_item; | ||
1953 | int ret = SWAP_AGAIN; | ||
1954 | int search_new_forks = 0; | ||
1955 | |||
1956 | VM_BUG_ON(!PageKsm(page)); | ||
1957 | VM_BUG_ON(!PageLocked(page)); | ||
1958 | |||
1959 | stable_node = page_stable_node(page); | ||
1960 | if (!stable_node) | ||
1961 | return SWAP_FAIL; | ||
1962 | again: | ||
1963 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
1964 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1965 | struct anon_vma_chain *vmac; | ||
1966 | struct vm_area_struct *vma; | ||
1967 | |||
1968 | anon_vma_lock_read(anon_vma); | ||
1969 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
1970 | 0, ULONG_MAX) { | ||
1971 | vma = vmac->vma; | ||
1972 | if (rmap_item->address < vma->vm_start || | ||
1973 | rmap_item->address >= vma->vm_end) | ||
1974 | continue; | ||
1975 | /* | ||
1976 | * Initially we examine only the vma which covers this | ||
1977 | * rmap_item; but later, if there is still work to do, | ||
1978 | * we examine covering vmas in other mms: in case they | ||
1979 | * were forked from the original since ksmd passed. | ||
1980 | */ | ||
1981 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1982 | continue; | 1935 | continue; |
1983 | 1936 | ||
1984 | ret = try_to_unmap_one(page, vma, | 1937 | ret = rwc->rmap_one(page, vma, |
1985 | rmap_item->address, flags); | 1938 | rmap_item->address, rwc->arg); |
1986 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | 1939 | if (ret != SWAP_AGAIN) { |
1987 | anon_vma_unlock_read(anon_vma); | 1940 | anon_vma_unlock_read(anon_vma); |
1988 | goto out; | 1941 | goto out; |
1989 | } | 1942 | } |
1990 | } | 1943 | if (rwc->done && rwc->done(page)) { |
1991 | anon_vma_unlock_read(anon_vma); | ||
1992 | } | ||
1993 | if (!search_new_forks++) | ||
1994 | goto again; | ||
1995 | out: | ||
1996 | return ret; | ||
1997 | } | ||
1998 | |||
1999 | #ifdef CONFIG_MIGRATION | ||
2000 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
2001 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
2002 | { | ||
2003 | struct stable_node *stable_node; | ||
2004 | struct rmap_item *rmap_item; | ||
2005 | int ret = SWAP_AGAIN; | ||
2006 | int search_new_forks = 0; | ||
2007 | |||
2008 | VM_BUG_ON(!PageKsm(page)); | ||
2009 | VM_BUG_ON(!PageLocked(page)); | ||
2010 | |||
2011 | stable_node = page_stable_node(page); | ||
2012 | if (!stable_node) | ||
2013 | return ret; | ||
2014 | again: | ||
2015 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
2016 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
2017 | struct anon_vma_chain *vmac; | ||
2018 | struct vm_area_struct *vma; | ||
2019 | |||
2020 | anon_vma_lock_read(anon_vma); | ||
2021 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
2022 | 0, ULONG_MAX) { | ||
2023 | vma = vmac->vma; | ||
2024 | if (rmap_item->address < vma->vm_start || | ||
2025 | rmap_item->address >= vma->vm_end) | ||
2026 | continue; | ||
2027 | /* | ||
2028 | * Initially we examine only the vma which covers this | ||
2029 | * rmap_item; but later, if there is still work to do, | ||
2030 | * we examine covering vmas in other mms: in case they | ||
2031 | * were forked from the original since ksmd passed. | ||
2032 | */ | ||
2033 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
2034 | continue; | ||
2035 | |||
2036 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
2037 | if (ret != SWAP_AGAIN) { | ||
2038 | anon_vma_unlock_read(anon_vma); | 1944 | anon_vma_unlock_read(anon_vma); |
2039 | goto out; | 1945 | goto out; |
2040 | } | 1946 | } |
@@ -2047,6 +1953,7 @@ out: | |||
2047 | return ret; | 1953 | return ret; |
2048 | } | 1954 | } |
2049 | 1955 | ||
1956 | #ifdef CONFIG_MIGRATION | ||
2050 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | 1957 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) |
2051 | { | 1958 | { |
2052 | struct stable_node *stable_node; | 1959 | struct stable_node *stable_node; |
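The ksm.c rewrite above folds page_referenced_ksm() and try_to_unmap_ksm() into a single rmap_walk_ksm() driven by a struct rmap_walk_control. Going only by the fields this hunk dereferences (rmap_one, arg, invalid_vma, done), a caller would look roughly like the sketch below; the my_* names and the cookie are hypothetical:

    static int my_rmap_one(struct page *page, struct vm_area_struct *vma,
                           unsigned long address, void *arg)
    {
            /* examine or modify the mapping of @page at @address */
            return SWAP_AGAIN;              /* keep walking */
    }

    static int my_done(struct page *page)
    {
            return 0;                       /* non-zero stops the walk early */
    }

    static int my_walk(struct page *page, void *cookie)
    {
            struct rmap_walk_control rwc = {
                    .rmap_one = my_rmap_one,
                    .arg = cookie,          /* handed back to the callbacks */
                    .done = my_done,
                    /* .invalid_vma can be set to skip VMAs, e.g. by memcg,
                     * as the page_referenced() path needs */
            };

            return rmap_walk_ksm(page, &rwc);
    }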
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..1c2ef2c7edab 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@ | |||
21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
22 | 22 | ||
23 | #include <asm-generic/sections.h> | 23 | #include <asm-generic/sections.h> |
24 | #include <linux/io.h> | ||
25 | |||
26 | #include "internal.h" | ||
24 | 27 | ||
25 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
26 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = { | |||
39 | }; | 42 | }; |
40 | 43 | ||
41 | int memblock_debug __initdata_memblock; | 44 | int memblock_debug __initdata_memblock; |
45 | #ifdef CONFIG_MOVABLE_NODE | ||
46 | bool movable_node_enabled __initdata_memblock = false; | ||
47 | #endif | ||
42 | static int memblock_can_resize __initdata_memblock; | 48 | static int memblock_can_resize __initdata_memblock; |
43 | static int memblock_memory_in_slab __initdata_memblock = 0; | 49 | static int memblock_memory_in_slab __initdata_memblock = 0; |
44 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 50 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
91 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 97 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
92 | * @size: size of free area to find | 98 | * @size: size of free area to find |
93 | * @align: alignment of free area to find | 99 | * @align: alignment of free area to find |
94 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 100 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
95 | * | 101 | * |
96 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | 102 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. |
97 | * | 103 | * |
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | |||
123 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 129 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
124 | * @size: size of free area to find | 130 | * @size: size of free area to find |
125 | * @align: alignment of free area to find | 131 | * @align: alignment of free area to find |
126 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 132 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
127 | * | 133 | * |
128 | * Utility called from memblock_find_in_range_node(), find free area top-down. | 134 | * Utility called from memblock_find_in_range_node(), find free area top-down. |
129 | * | 135 | * |
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
154 | 160 | ||
155 | /** | 161 | /** |
156 | * memblock_find_in_range_node - find free area in given range and node | 162 | * memblock_find_in_range_node - find free area in given range and node |
157 | * @start: start of candidate range | ||
158 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
159 | * @size: size of free area to find | 163 | * @size: size of free area to find |
160 | * @align: alignment of free area to find | 164 | * @align: alignment of free area to find |
161 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 165 | * @start: start of candidate range |
166 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
167 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
162 | * | 168 | * |
163 | * Find @size free area aligned to @align in the specified range and node. | 169 | * Find @size free area aligned to @align in the specified range and node. |
164 | * | 170 | * |
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
173 | * RETURNS: | 179 | * RETURNS: |
174 | * Found address on success, 0 on failure. | 180 | * Found address on success, 0 on failure. |
175 | */ | 181 | */ |
176 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 182 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, |
177 | phys_addr_t end, phys_addr_t size, | 183 | phys_addr_t align, phys_addr_t start, |
178 | phys_addr_t align, int nid) | 184 | phys_addr_t end, int nid) |
179 | { | 185 | { |
180 | int ret; | 186 | int ret; |
181 | phys_addr_t kernel_end; | 187 | phys_addr_t kernel_end; |
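The prototype above also swaps the argument order: @size and @align now come first, followed by the [@start, @end) candidate range and then @nid. The first four parameters are all phys_addr_t, so a caller left in the old order would still compile silently; the call sites updated later in this diff show the new shape, for example:

    /* old order: (start, end, size, align, nid) */
    /* new order: (size, align, start, end, nid) */
    found = memblock_find_in_range_node(size, align, 0, max_addr, nid);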
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
238 | phys_addr_t end, phys_addr_t size, | 244 | phys_addr_t end, phys_addr_t size, |
239 | phys_addr_t align) | 245 | phys_addr_t align) |
240 | { | 246 | { |
241 | return memblock_find_in_range_node(start, end, size, align, | 247 | return memblock_find_in_range_node(size, align, start, end, |
242 | MAX_NUMNODES); | 248 | NUMA_NO_NODE); |
243 | } | 249 | } |
244 | 250 | ||
245 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 251 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
@@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
255 | type->cnt = 1; | 261 | type->cnt = 1; |
256 | type->regions[0].base = 0; | 262 | type->regions[0].base = 0; |
257 | type->regions[0].size = 0; | 263 | type->regions[0].size = 0; |
264 | type->regions[0].flags = 0; | ||
258 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); | 265 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); |
259 | } | 266 | } |
260 | } | 267 | } |
@@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( | |||
265 | if (memblock.reserved.regions == memblock_reserved_init_regions) | 272 | if (memblock.reserved.regions == memblock_reserved_init_regions) |
266 | return 0; | 273 | return 0; |
267 | 274 | ||
275 | /* | ||
276 | * Don't allow nobootmem allocator to free reserved memory regions | ||
277 | * array if | ||
278 | * - CONFIG_DEBUG_FS is enabled; | ||
279 | * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; | ||
280 | * - reserved memory regions array have been resized during boot. | ||
281 | * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" | ||
282 | * will show garbage instead of state of memory reservations. | ||
283 | */ | ||
284 | if (IS_ENABLED(CONFIG_DEBUG_FS) && | ||
285 | !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) | ||
286 | return 0; | ||
287 | |||
268 | *addr = __pa(memblock.reserved.regions); | 288 | *addr = __pa(memblock.reserved.regions); |
269 | 289 | ||
270 | return PAGE_ALIGN(sizeof(struct memblock_region) * | 290 | return PAGE_ALIGN(sizeof(struct memblock_region) * |
@@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
405 | 425 | ||
406 | if (this->base + this->size != next->base || | 426 | if (this->base + this->size != next->base || |
407 | memblock_get_region_node(this) != | 427 | memblock_get_region_node(this) != |
408 | memblock_get_region_node(next)) { | 428 | memblock_get_region_node(next) || |
429 | this->flags != next->flags) { | ||
409 | BUG_ON(this->base + this->size > next->base); | 430 | BUG_ON(this->base + this->size > next->base); |
410 | i++; | 431 | i++; |
411 | continue; | 432 | continue; |
@@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
425 | * @base: base address of the new region | 446 | * @base: base address of the new region |
426 | * @size: size of the new region | 447 | * @size: size of the new region |
427 | * @nid: node id of the new region | 448 | * @nid: node id of the new region |
449 | * @flags: flags of the new region | ||
428 | * | 450 | * |
429 | * Insert new memblock region [@base,@base+@size) into @type at @idx. | 451 | * Insert new memblock region [@base,@base+@size) into @type at @idx. |
430 | * @type must already have extra room to accomodate the new region. | 452 | * @type must already have extra room to accomodate the new region. |
431 | */ | 453 | */ |
432 | static void __init_memblock memblock_insert_region(struct memblock_type *type, | 454 | static void __init_memblock memblock_insert_region(struct memblock_type *type, |
433 | int idx, phys_addr_t base, | 455 | int idx, phys_addr_t base, |
434 | phys_addr_t size, int nid) | 456 | phys_addr_t size, |
457 | int nid, unsigned long flags) | ||
435 | { | 458 | { |
436 | struct memblock_region *rgn = &type->regions[idx]; | 459 | struct memblock_region *rgn = &type->regions[idx]; |
437 | 460 | ||
@@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
439 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); | 462 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); |
440 | rgn->base = base; | 463 | rgn->base = base; |
441 | rgn->size = size; | 464 | rgn->size = size; |
465 | rgn->flags = flags; | ||
442 | memblock_set_region_node(rgn, nid); | 466 | memblock_set_region_node(rgn, nid); |
443 | type->cnt++; | 467 | type->cnt++; |
444 | type->total_size += size; | 468 | type->total_size += size; |
@@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
450 | * @base: base address of the new region | 474 | * @base: base address of the new region |
451 | * @size: size of the new region | 475 | * @size: size of the new region |
452 | * @nid: nid of the new region | 476 | * @nid: nid of the new region |
477 | * @flags: flags of the new region | ||
453 | * | 478 | * |
454 | * Add new memblock region [@base,@base+@size) into @type. The new region | 479 | * Add new memblock region [@base,@base+@size) into @type. The new region |
455 | * is allowed to overlap with existing ones - overlaps don't affect already | 480 | * is allowed to overlap with existing ones - overlaps don't affect already |
@@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
460 | * 0 on success, -errno on failure. | 485 | * 0 on success, -errno on failure. |
461 | */ | 486 | */ |
462 | static int __init_memblock memblock_add_region(struct memblock_type *type, | 487 | static int __init_memblock memblock_add_region(struct memblock_type *type, |
463 | phys_addr_t base, phys_addr_t size, int nid) | 488 | phys_addr_t base, phys_addr_t size, |
489 | int nid, unsigned long flags) | ||
464 | { | 490 | { |
465 | bool insert = false; | 491 | bool insert = false; |
466 | phys_addr_t obase = base; | 492 | phys_addr_t obase = base; |
@@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, | |||
475 | WARN_ON(type->cnt != 1 || type->total_size); | 501 | WARN_ON(type->cnt != 1 || type->total_size); |
476 | type->regions[0].base = base; | 502 | type->regions[0].base = base; |
477 | type->regions[0].size = size; | 503 | type->regions[0].size = size; |
504 | type->regions[0].flags = flags; | ||
478 | memblock_set_region_node(&type->regions[0], nid); | 505 | memblock_set_region_node(&type->regions[0], nid); |
479 | type->total_size = size; | 506 | type->total_size = size; |
480 | return 0; | 507 | return 0; |
@@ -505,7 +532,8 @@ repeat: | |||
505 | nr_new++; | 532 | nr_new++; |
506 | if (insert) | 533 | if (insert) |
507 | memblock_insert_region(type, i++, base, | 534 | memblock_insert_region(type, i++, base, |
508 | rbase - base, nid); | 535 | rbase - base, nid, |
536 | flags); | ||
509 | } | 537 | } |
510 | /* area below @rend is dealt with, forget about it */ | 538 | /* area below @rend is dealt with, forget about it */ |
511 | base = min(rend, end); | 539 | base = min(rend, end); |
@@ -515,7 +543,8 @@ repeat: | |||
515 | if (base < end) { | 543 | if (base < end) { |
516 | nr_new++; | 544 | nr_new++; |
517 | if (insert) | 545 | if (insert) |
518 | memblock_insert_region(type, i, base, end - base, nid); | 546 | memblock_insert_region(type, i, base, end - base, |
547 | nid, flags); | ||
519 | } | 548 | } |
520 | 549 | ||
521 | /* | 550 | /* |
@@ -537,12 +566,13 @@ repeat: | |||
537 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | 566 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, |
538 | int nid) | 567 | int nid) |
539 | { | 568 | { |
540 | return memblock_add_region(&memblock.memory, base, size, nid); | 569 | return memblock_add_region(&memblock.memory, base, size, nid, 0); |
541 | } | 570 | } |
542 | 571 | ||
543 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 572 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
544 | { | 573 | { |
545 | return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); | 574 | return memblock_add_region(&memblock.memory, base, size, |
575 | MAX_NUMNODES, 0); | ||
546 | } | 576 | } |
547 | 577 | ||
548 | /** | 578 | /** |
@@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
597 | rgn->size -= base - rbase; | 627 | rgn->size -= base - rbase; |
598 | type->total_size -= base - rbase; | 628 | type->total_size -= base - rbase; |
599 | memblock_insert_region(type, i, rbase, base - rbase, | 629 | memblock_insert_region(type, i, rbase, base - rbase, |
600 | memblock_get_region_node(rgn)); | 630 | memblock_get_region_node(rgn), |
631 | rgn->flags); | ||
601 | } else if (rend > end) { | 632 | } else if (rend > end) { |
602 | /* | 633 | /* |
603 | * @rgn intersects from above. Split and redo the | 634 | * @rgn intersects from above. Split and redo the |
@@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
607 | rgn->size -= end - rbase; | 638 | rgn->size -= end - rbase; |
608 | type->total_size -= end - rbase; | 639 | type->total_size -= end - rbase; |
609 | memblock_insert_region(type, i--, rbase, end - rbase, | 640 | memblock_insert_region(type, i--, rbase, end - rbase, |
610 | memblock_get_region_node(rgn)); | 641 | memblock_get_region_node(rgn), |
642 | rgn->flags); | ||
611 | } else { | 643 | } else { |
612 | /* @rgn is fully contained, record it */ | 644 | /* @rgn is fully contained, record it */ |
613 | if (!*end_rgn) | 645 | if (!*end_rgn) |
@@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | |||
643 | { | 675 | { |
644 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", | 676 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", |
645 | (unsigned long long)base, | 677 | (unsigned long long)base, |
646 | (unsigned long long)base + size, | 678 | (unsigned long long)base + size - 1, |
647 | (void *)_RET_IP_); | 679 | (void *)_RET_IP_); |
648 | 680 | ||
649 | return __memblock_remove(&memblock.reserved, base, size); | 681 | return __memblock_remove(&memblock.reserved, base, size); |
650 | } | 682 | } |
651 | 683 | ||
652 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 684 | static int __init_memblock memblock_reserve_region(phys_addr_t base, |
685 | phys_addr_t size, | ||
686 | int nid, | ||
687 | unsigned long flags) | ||
653 | { | 688 | { |
654 | struct memblock_type *_rgn = &memblock.reserved; | 689 | struct memblock_type *_rgn = &memblock.reserved; |
655 | 690 | ||
656 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", | 691 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
657 | (unsigned long long)base, | 692 | (unsigned long long)base, |
658 | (unsigned long long)base + size, | 693 | (unsigned long long)base + size - 1, |
659 | (void *)_RET_IP_); | 694 | flags, (void *)_RET_IP_); |
660 | 695 | ||
661 | return memblock_add_region(_rgn, base, size, MAX_NUMNODES); | 696 | return memblock_add_region(_rgn, base, size, nid, flags); |
697 | } | ||
698 | |||
699 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | ||
700 | { | ||
701 | return memblock_reserve_region(base, size, MAX_NUMNODES, 0); | ||
702 | } | ||
703 | |||
704 | /** | ||
705 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. | ||
706 | * @base: the base phys addr of the region | ||
707 | * @size: the size of the region | ||
708 | * | ||
709 | * This function isolates region [@base, @base + @size), and mark it with flag | ||
710 | * MEMBLOCK_HOTPLUG. | ||
711 | * | ||
712 | * Return 0 on succees, -errno on failure. | ||
713 | */ | ||
714 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | ||
715 | { | ||
716 | struct memblock_type *type = &memblock.memory; | ||
717 | int i, ret, start_rgn, end_rgn; | ||
718 | |||
719 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
720 | if (ret) | ||
721 | return ret; | ||
722 | |||
723 | for (i = start_rgn; i < end_rgn; i++) | ||
724 | memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); | ||
725 | |||
726 | memblock_merge_regions(type); | ||
727 | return 0; | ||
728 | } | ||
729 | |||
730 | /** | ||
731 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | ||
732 | * @base: the base phys addr of the region | ||
733 | * @size: the size of the region | ||
734 | * | ||
735 | * This function isolates region [@base, @base + @size), and clear flag | ||
736 | * MEMBLOCK_HOTPLUG for the isolated regions. | ||
737 | * | ||
738 | * Return 0 on succees, -errno on failure. | ||
739 | */ | ||
740 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | ||
741 | { | ||
742 | struct memblock_type *type = &memblock.memory; | ||
743 | int i, ret, start_rgn, end_rgn; | ||
744 | |||
745 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
746 | if (ret) | ||
747 | return ret; | ||
748 | |||
749 | for (i = start_rgn; i < end_rgn; i++) | ||
750 | memblock_clear_region_flags(&type->regions[i], | ||
751 | MEMBLOCK_HOTPLUG); | ||
752 | |||
753 | memblock_merge_regions(type); | ||
754 | return 0; | ||
662 | } | 755 | } |
663 | 756 | ||
664 | /** | 757 | /** |
665 | * __next_free_mem_range - next function for for_each_free_mem_range() | 758 | * __next_free_mem_range - next function for for_each_free_mem_range() |
666 | * @idx: pointer to u64 loop variable | 759 | * @idx: pointer to u64 loop variable |
667 | * @nid: node selector, %MAX_NUMNODES for all nodes | 760 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
668 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 761 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
669 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 762 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
670 | * @out_nid: ptr to int for nid of the range, can be %NULL | 763 | * @out_nid: ptr to int for nid of the range, can be %NULL |
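memblock_mark_hotplug() and memblock_clear_hotplug() above isolate the given range and then set or clear MEMBLOCK_HOTPLUG on every region inside it. A minimal hypothetical caller, assuming @base and @size come from firmware (for example an SRAT entry flagged hot-pluggable):

    /* Hypothetical early-boot caller; base/size come from firmware. */
    if (memblock_mark_hotplug(base, size))
            pr_warn("memblock: failed to mark [%#llx-%#llx) hotpluggable\n",
                    (unsigned long long)base,
                    (unsigned long long)(base + size));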
@@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
693 | int mi = *idx & 0xffffffff; | 786 | int mi = *idx & 0xffffffff; |
694 | int ri = *idx >> 32; | 787 | int ri = *idx >> 32; |
695 | 788 | ||
789 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
790 | nid = NUMA_NO_NODE; | ||
791 | |||
696 | for ( ; mi < mem->cnt; mi++) { | 792 | for ( ; mi < mem->cnt; mi++) { |
697 | struct memblock_region *m = &mem->regions[mi]; | 793 | struct memblock_region *m = &mem->regions[mi]; |
698 | phys_addr_t m_start = m->base; | 794 | phys_addr_t m_start = m->base; |
699 | phys_addr_t m_end = m->base + m->size; | 795 | phys_addr_t m_end = m->base + m->size; |
700 | 796 | ||
701 | /* only memory regions are associated with nodes, check it */ | 797 | /* only memory regions are associated with nodes, check it */ |
702 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 798 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
703 | continue; | 799 | continue; |
704 | 800 | ||
705 | /* scan areas before each reservation for intersection */ | 801 | /* scan areas before each reservation for intersection */ |
@@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
740 | /** | 836 | /** |
741 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 837 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
742 | * @idx: pointer to u64 loop variable | 838 | * @idx: pointer to u64 loop variable |
743 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 839 | * @nid: nid: node selector, %NUMA_NO_NODE for all nodes |
744 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 840 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
745 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 841 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
746 | * @out_nid: ptr to int for nid of the range, can be %NULL | 842 | * @out_nid: ptr to int for nid of the range, can be %NULL |
747 | * | 843 | * |
748 | * Reverse of __next_free_mem_range(). | 844 | * Reverse of __next_free_mem_range(). |
845 | * | ||
846 | * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't | ||
847 | * be able to hot-remove hotpluggable memory used by the kernel. So this | ||
848 | * function skip hotpluggable regions if needed when allocating memory for the | ||
849 | * kernel. | ||
749 | */ | 850 | */ |
750 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | 851 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, |
751 | phys_addr_t *out_start, | 852 | phys_addr_t *out_start, |
@@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
756 | int mi = *idx & 0xffffffff; | 857 | int mi = *idx & 0xffffffff; |
757 | int ri = *idx >> 32; | 858 | int ri = *idx >> 32; |
758 | 859 | ||
860 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
861 | nid = NUMA_NO_NODE; | ||
862 | |||
759 | if (*idx == (u64)ULLONG_MAX) { | 863 | if (*idx == (u64)ULLONG_MAX) { |
760 | mi = mem->cnt - 1; | 864 | mi = mem->cnt - 1; |
761 | ri = rsv->cnt; | 865 | ri = rsv->cnt; |
@@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
767 | phys_addr_t m_end = m->base + m->size; | 871 | phys_addr_t m_end = m->base + m->size; |
768 | 872 | ||
769 | /* only memory regions are associated with nodes, check it */ | 873 | /* only memory regions are associated with nodes, check it */ |
770 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 874 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
875 | continue; | ||
876 | |||
877 | /* skip hotpluggable memory regions if needed */ | ||
878 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | ||
771 | continue; | 879 | continue; |
772 | 880 | ||
773 | /* scan areas before each reservation for intersection */ | 881 | /* scan areas before each reservation for intersection */ |
@@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, | |||
837 | * memblock_set_node - set node ID on memblock regions | 945 | * memblock_set_node - set node ID on memblock regions |
838 | * @base: base of area to set node ID for | 946 | * @base: base of area to set node ID for |
839 | * @size: size of area to set node ID for | 947 | * @size: size of area to set node ID for |
948 | * @type: memblock type to set node ID for | ||
840 | * @nid: node ID to set | 949 | * @nid: node ID to set |
841 | * | 950 | * |
842 | * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. | 951 | * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. |
843 | * Regions which cross the area boundaries are split as necessary. | 952 | * Regions which cross the area boundaries are split as necessary. |
844 | * | 953 | * |
845 | * RETURNS: | 954 | * RETURNS: |
846 | * 0 on success, -errno on failure. | 955 | * 0 on success, -errno on failure. |
847 | */ | 956 | */ |
848 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | 957 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, |
849 | int nid) | 958 | struct memblock_type *type, int nid) |
850 | { | 959 | { |
851 | struct memblock_type *type = &memblock.memory; | ||
852 | int start_rgn, end_rgn; | 960 | int start_rgn, end_rgn; |
853 | int i, ret; | 961 | int i, ret; |
854 | 962 | ||
@@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | |||
870 | { | 978 | { |
871 | phys_addr_t found; | 979 | phys_addr_t found; |
872 | 980 | ||
873 | if (WARN_ON(!align)) | 981 | if (!align) |
874 | align = __alignof__(long long); | 982 | align = SMP_CACHE_BYTES; |
875 | 983 | ||
876 | /* align @size to avoid excessive fragmentation on reserved array */ | 984 | /* align @size to avoid excessive fragmentation on reserved array */ |
877 | size = round_up(size, align); | 985 | size = round_up(size, align); |
878 | 986 | ||
879 | found = memblock_find_in_range_node(0, max_addr, size, align, nid); | 987 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); |
880 | if (found && !memblock_reserve(found, size)) | 988 | if (found && !memblock_reserve(found, size)) |
881 | return found; | 989 | return found; |
882 | 990 | ||
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n | |||
890 | 998 | ||
891 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 999 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
892 | { | 1000 | { |
893 | return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); | 1001 | return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); |
894 | } | 1002 | } |
895 | 1003 | ||
896 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 1004 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i | |||
920 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); | 1028 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); |
921 | } | 1029 | } |
922 | 1030 | ||
1031 | /** | ||
1032 | * memblock_virt_alloc_internal - allocate boot memory block | ||
1033 | * @size: size of memory block to be allocated in bytes | ||
1034 | * @align: alignment of the region and block's size | ||
1035 | * @min_addr: the lower bound of the memory region to allocate (phys address) | ||
1036 | * @max_addr: the upper bound of the memory region to allocate (phys address) | ||
1037 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1038 | * | ||
1039 | * The @min_addr limit is dropped if it can not be satisfied and the allocation | ||
1040 | * will fall back to memory below @min_addr. Also, allocation may fall back | ||
1041 | * to any node in the system if the specified node can not | ||
1042 | * hold the requested memory. | ||
1043 | * | ||
1044 | * The allocation is performed from memory region limited by | ||
1045 | * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. | ||
1046 | * | ||
1047 | * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. | ||
1048 | * | ||
1049 | * The phys address of allocated boot memory block is converted to virtual and | ||
1050 | * allocated memory is reset to 0. | ||
1051 | * | ||
1052 | * In addition, function sets the min_count to 0 using kmemleak_alloc for | ||
1053 | * allocated boot memory block, so that it is never reported as leaks. | ||
1054 | * | ||
1055 | * RETURNS: | ||
1056 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1057 | */ | ||
1058 | static void * __init memblock_virt_alloc_internal( | ||
1059 | phys_addr_t size, phys_addr_t align, | ||
1060 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1061 | int nid) | ||
1062 | { | ||
1063 | phys_addr_t alloc; | ||
1064 | void *ptr; | ||
1065 | |||
1066 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
1067 | nid = NUMA_NO_NODE; | ||
1068 | |||
1069 | /* | ||
1070 | * Detect any accidental use of these APIs after slab is ready, as at | ||
1071 | * this moment memblock may be deinitialized already and its | ||
1072 | * internal data may be destroyed (after execution of free_all_bootmem) | ||
1073 | */ | ||
1074 | if (WARN_ON_ONCE(slab_is_available())) | ||
1075 | return kzalloc_node(size, GFP_NOWAIT, nid); | ||
1076 | |||
1077 | if (!align) | ||
1078 | align = SMP_CACHE_BYTES; | ||
1079 | |||
1080 | /* align @size to avoid excessive fragmentation on reserved array */ | ||
1081 | size = round_up(size, align); | ||
1082 | |||
1083 | again: | ||
1084 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, | ||
1085 | nid); | ||
1086 | if (alloc) | ||
1087 | goto done; | ||
1088 | |||
1089 | if (nid != NUMA_NO_NODE) { | ||
1090 | alloc = memblock_find_in_range_node(size, align, min_addr, | ||
1091 | max_addr, NUMA_NO_NODE); | ||
1092 | if (alloc) | ||
1093 | goto done; | ||
1094 | } | ||
1095 | |||
1096 | if (min_addr) { | ||
1097 | min_addr = 0; | ||
1098 | goto again; | ||
1099 | } else { | ||
1100 | goto error; | ||
1101 | } | ||
1102 | |||
1103 | done: | ||
1104 | memblock_reserve(alloc, size); | ||
1105 | ptr = phys_to_virt(alloc); | ||
1106 | memset(ptr, 0, size); | ||
1107 | |||
1108 | /* | ||
1109 | * The min_count is set to 0 so that bootmem allocated blocks | ||
1110 | * are never reported as leaks. This is because many of these blocks | ||
1111 | * are only referred via the physical address which is not | ||
1112 | * looked up by kmemleak. | ||
1113 | */ | ||
1114 | kmemleak_alloc(ptr, size, 0, 0); | ||
1115 | |||
1116 | return ptr; | ||
1117 | |||
1118 | error: | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block | ||
1124 | * @size: size of memory block to be allocated in bytes | ||
1125 | * @align: alignment of the region and block's size | ||
1126 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1127 | * is preferred (phys address) | ||
1128 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1129 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1130 | * allocate only from memory limited by memblock.current_limit value | ||
1131 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1132 | * | ||
1133 | * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides | ||
1134 | * additional debug information (including caller info), if enabled. | ||
1135 | * | ||
1136 | * RETURNS: | ||
1137 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1138 | */ | ||
1139 | void * __init memblock_virt_alloc_try_nid_nopanic( | ||
1140 | phys_addr_t size, phys_addr_t align, | ||
1141 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1142 | int nid) | ||
1143 | { | ||
1144 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1145 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1146 | (u64)max_addr, (void *)_RET_IP_); | ||
1147 | return memblock_virt_alloc_internal(size, align, min_addr, | ||
1148 | max_addr, nid); | ||
1149 | } | ||
1150 | |||
1151 | /** | ||
1152 | * memblock_virt_alloc_try_nid - allocate boot memory block with panicking | ||
1153 | * @size: size of memory block to be allocated in bytes | ||
1154 | * @align: alignment of the region and block's size | ||
1155 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1156 | * is preferred (phys address) | ||
1157 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1158 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1159 | * allocate only from memory limited by memblock.current_limit value | ||
1160 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1161 | * | ||
1162 | * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() | ||
1163 | * which provides debug information (including caller info), if enabled, | ||
1164 | * and panics if the request cannot be satisfied. | ||
1165 | * | ||
1166 | * RETURNS: | ||
1167 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1168 | */ | ||
1169 | void * __init memblock_virt_alloc_try_nid( | ||
1170 | phys_addr_t size, phys_addr_t align, | ||
1171 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1172 | int nid) | ||
1173 | { | ||
1174 | void *ptr; | ||
1175 | |||
1176 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1177 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1178 | (u64)max_addr, (void *)_RET_IP_); | ||
1179 | ptr = memblock_virt_alloc_internal(size, align, | ||
1180 | min_addr, max_addr, nid); | ||
1181 | if (ptr) | ||
1182 | return ptr; | ||
1183 | |||
1184 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", | ||
1185 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1186 | (u64)max_addr); | ||
1187 | return NULL; | ||
1188 | } | ||
1189 | |||
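memblock_virt_alloc_try_nid() is just the _nopanic variant plus a panic on failure, the usual shape for boot-time allocations the kernel cannot recover from. A hedged userspace sketch of the same wrapper pattern; xmalloc() and try_alloc() are made-up names, not kernel functions.

/* Userspace sketch of the "try, then die on failure" wrapper pattern. */
#include <stdio.h>
#include <stdlib.h>

static void *try_alloc(size_t size)		/* plays the "nopanic" role */
{
	return malloc(size);
}

static void *xmalloc(size_t size)		/* plays the "panicking" role */
{
	void *p = try_alloc(size);

	if (!p) {
		fprintf(stderr, "xmalloc: failed to allocate %zu bytes\n", size);
		abort();			/* the kernel would call panic() */
	}
	return p;
}

int main(void)
{
	free(xmalloc(64));
	return 0;
}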
1190 | /** | ||
1191 | * __memblock_free_early - free boot memory block | ||
1192 | * @base: phys starting address of the boot memory block | ||
1193 | * @size: size of the boot memory block in bytes | ||
1194 | * | ||
1195 | * Free a boot memory block previously allocated by the memblock_virt_alloc_xx() API. | ||
1196 | * The freed memory will not be released to the buddy allocator. | ||
1197 | */ | ||
1198 | void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | ||
1199 | { | ||
1200 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1201 | __func__, (u64)base, (u64)base + size - 1, | ||
1202 | (void *)_RET_IP_); | ||
1203 | kmemleak_free_part(__va(base), size); | ||
1204 | __memblock_remove(&memblock.reserved, base, size); | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * __memblock_free_late - free bootmem block pages directly to buddy allocator | ||
1209 | * @addr: phys starting address of the boot memory block | ||
1210 | * @size: size of the boot memory block in bytes | ||
1211 | * | ||
1212 | * This is only useful when the bootmem allocator has already been torn | ||
1213 | * down, but we are still initializing the system. Pages are released directly | ||
1214 | * to the buddy allocator; no bootmem metadata is updated because it is gone. | ||
1215 | */ | ||
1216 | void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) | ||
1217 | { | ||
1218 | u64 cursor, end; | ||
1219 | |||
1220 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1221 | __func__, (u64)base, (u64)base + size - 1, | ||
1222 | (void *)_RET_IP_); | ||
1223 | kmemleak_free_part(__va(base), size); | ||
1224 | cursor = PFN_UP(base); | ||
1225 | end = PFN_DOWN(base + size); | ||
1226 | |||
1227 | for (; cursor < end; cursor++) { | ||
1228 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
1229 | totalram_pages++; | ||
1230 | } | ||
1231 | } | ||
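Because __memblock_free_late() hands pages to the buddy allocator one at a time, it rounds the start up and the end down so that only pages wholly contained in [base, base + size) are freed. A small sketch of that arithmetic, assuming 4 KiB pages (the real PAGE_SHIFT is per architecture).

/* Sketch of the inclusive-page rounding used by __memblock_free_late(). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	uint64_t base = 0x1080;			/* not page aligned on purpose */
	uint64_t size = 3 * PAGE_SIZE;

	uint64_t first = PFN_UP(base);		/* round the start up   */
	uint64_t end   = PFN_DOWN(base + size);	/* round the end down   */

	printf("freeing pfns [%llu, %llu): %llu whole pages\n",
	       (unsigned long long)first, (unsigned long long)end,
	       (unsigned long long)(end - first));
	return 0;
}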
923 | 1232 | ||
924 | /* | 1233 | /* |
925 | * Remaining API functions | 1234 | * Remaining API functions |
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) | |||
1101 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) | 1410 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) |
1102 | { | 1411 | { |
1103 | unsigned long long base, size; | 1412 | unsigned long long base, size; |
1413 | unsigned long flags; | ||
1104 | int i; | 1414 | int i; |
1105 | 1415 | ||
1106 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); | 1416 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); |
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name | |||
1111 | 1421 | ||
1112 | base = rgn->base; | 1422 | base = rgn->base; |
1113 | size = rgn->size; | 1423 | size = rgn->size; |
1424 | flags = rgn->flags; | ||
1114 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 1425 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
1115 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) | 1426 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) |
1116 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", | 1427 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", |
1117 | memblock_get_region_node(rgn)); | 1428 | memblock_get_region_node(rgn)); |
1118 | #endif | 1429 | #endif |
1119 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", | 1430 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", |
1120 | name, i, base, base + size - 1, size, nid_buf); | 1431 | name, i, base, base + size - 1, size, nid_buf, flags); |
1121 | } | 1432 | } |
1122 | } | 1433 | } |
1123 | 1434 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7caff36180cd..67dd2a881433 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
1688 | */ | 1688 | */ |
1689 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1689 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
1690 | { | 1690 | { |
1691 | struct cgroup *task_cgrp; | ||
1692 | struct cgroup *mem_cgrp; | ||
1693 | /* | 1691 | /* |
1694 | * Need a buffer in BSS, can't rely on allocations. The code relies | 1692 | * protects memcg_name and makes sure that parallel ooms do not |
1695 | * on the assumption that OOM is serialized for memory controller. | 1693 | * interleave |
1696 | * If this assumption is broken, revisit this code. | ||
1697 | */ | 1694 | */ |
1695 | static DEFINE_SPINLOCK(oom_info_lock); | ||
1696 | struct cgroup *task_cgrp; | ||
1697 | struct cgroup *mem_cgrp; | ||
1698 | static char memcg_name[PATH_MAX]; | 1698 | static char memcg_name[PATH_MAX]; |
1699 | int ret; | 1699 | int ret; |
1700 | struct mem_cgroup *iter; | 1700 | struct mem_cgroup *iter; |
@@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1703 | if (!p) | 1703 | if (!p) |
1704 | return; | 1704 | return; |
1705 | 1705 | ||
1706 | spin_lock(&oom_info_lock); | ||
1706 | rcu_read_lock(); | 1707 | rcu_read_lock(); |
1707 | 1708 | ||
1708 | mem_cgrp = memcg->css.cgroup; | 1709 | mem_cgrp = memcg->css.cgroup; |
@@ -1771,6 +1772,7 @@ done: | |||
1771 | 1772 | ||
1772 | pr_cont("\n"); | 1773 | pr_cont("\n"); |
1773 | } | 1774 | } |
1775 | spin_unlock(&oom_info_lock); | ||
1774 | } | 1776 | } |
1775 | 1777 | ||
1776 | /* | 1778 | /* |
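The new oom_info_lock exists because memcg_name is a single static buffer; without serialization, two concurrent OOM reports could interleave and corrupt each other's output. A rough userspace analogue of the pattern, using a pthread mutex in place of the kernel spinlock; nothing here is memcg code.

/* Sketch: a function-local static lock serializing users of a static buffer. */
#include <pthread.h>
#include <stdio.h>

static void print_report(const char *who)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static char buf[256];			/* shared scratch buffer */

	pthread_mutex_lock(&lock);
	snprintf(buf, sizeof(buf), "report from %s", who);
	puts(buf);				/* buf cannot be clobbered here */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	print_report("thread A");
	print_report("thread B");
	return 0;
}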
@@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex); | |||
3000 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 3002 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
3001 | { | 3003 | { |
3002 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | 3004 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && |
3003 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | 3005 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == |
3006 | KMEM_ACCOUNTED_MASK; | ||
3004 | } | 3007 | } |
3005 | 3008 | ||
3006 | /* | 3009 | /* |
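The memcg_can_account_kmem() change tightens the test from "any bit of KMEM_ACCOUNTED_MASK set" to "all bits set". The difference is easy to miss; here is a tiny standalone sketch with a hypothetical two-bit layout (the flag values are illustrative, not the kernel's).

/* Sketch of "any bit" vs "all bits" mask tests. */
#include <stdbool.h>
#include <stdio.h>

#define ACCOUNTED_ACTIVE	(1 << 0)	/* hypothetical bit layout */
#define ACCOUNTED_ACTIVATED	(1 << 1)
#define ACCOUNTED_MASK		(ACCOUNTED_ACTIVE | ACCOUNTED_ACTIVATED)

static bool any_bit(unsigned flags)
{
	return flags & ACCOUNTED_MASK;			/* old style check */
}

static bool all_bits(unsigned flags)
{
	return (flags & ACCOUNTED_MASK) == ACCOUNTED_MASK;	/* new style check */
}

int main(void)
{
	unsigned flags = ACCOUNTED_ACTIVE;	/* only one of the two bits set */

	printf("any=%d all=%d\n", any_bit(flags), all_bits(flags));	/* any=1 all=0 */
	return 0;
}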
@@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
3126 | * But when we create a new cache, we can call this as well if its parent | 3129 | * But when we create a new cache, we can call this as well if its parent |
3127 | * is kmem-limited. That will have to hold set_limit_mutex as well. | 3130 | * is kmem-limited. That will have to hold set_limit_mutex as well. |
3128 | */ | 3131 | */ |
3129 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | 3132 | static int memcg_update_cache_sizes(struct mem_cgroup *memcg) |
3130 | { | 3133 | { |
3131 | int num, ret; | 3134 | int num, ret; |
3132 | 3135 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..b25ed321e667 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
611 | } | 611 | } |
612 | 612 | ||
613 | /* | 613 | /* |
614 | * Dirty cache page page | 614 | * Dirty pagecache page |
615 | * Issues: when the error hit a hole page the error is not properly | 615 | * Issues: when the error hit a hole page the error is not properly |
616 | * propagated. | 616 | * propagated. |
617 | */ | 617 | */ |
@@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1587 | if (ret) { | 1587 | if (ret) { |
1588 | putback_lru_pages(&pagelist); | 1588 | if (!list_empty(&pagelist)) { |
1589 | list_del(&page->lru); | ||
1590 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1591 | page_is_file_cache(page)); | ||
1592 | putback_lru_page(page); | ||
1593 | } | ||
1594 | |||
1589 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1595 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1590 | pfn, ret, page->flags); | 1596 | pfn, ret, page->flags); |
1591 | if (ret > 0) | 1597 | if (ret > 0) |
diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..86487dfa5e59 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | ||
62 | 63 | ||
63 | #include <asm/io.h> | 64 | #include <asm/io.h> |
64 | #include <asm/pgalloc.h> | 65 | #include <asm/pgalloc.h> |
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2559 | 2560 | ||
2560 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2561 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2561 | { | 2562 | { |
2563 | debug_dma_assert_idle(src); | ||
2564 | |||
2562 | /* | 2565 | /* |
2563 | * If the source page was a PFN mapping, we don't have | 2566 | * If the source page was a PFN mapping, we don't have |
2564 | * a "struct page" for it. We do a best-effort copy by | 2567 | * a "struct page" for it. We do a best-effort copy by |
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, | |||
4272 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 4275 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
4273 | 4276 | ||
4274 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS | 4277 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS |
4278 | |||
4279 | static struct kmem_cache *page_ptl_cachep; | ||
4280 | |||
4281 | void __init ptlock_cache_init(void) | ||
4282 | { | ||
4283 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | ||
4284 | SLAB_PANIC, NULL); | ||
4285 | } | ||
4286 | |||
4275 | bool ptlock_alloc(struct page *page) | 4287 | bool ptlock_alloc(struct page *page) |
4276 | { | 4288 | { |
4277 | spinlock_t *ptl; | 4289 | spinlock_t *ptl; |
4278 | 4290 | ||
4279 | ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); | 4291 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); |
4280 | if (!ptl) | 4292 | if (!ptl) |
4281 | return false; | 4293 | return false; |
4282 | page->ptl = ptl; | 4294 | page->ptl = ptl; |
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page) | |||
4285 | 4297 | ||
4286 | void ptlock_free(struct page *page) | 4298 | void ptlock_free(struct page *page) |
4287 | { | 4299 | { |
4288 | kfree(page->ptl); | 4300 | kmem_cache_free(page_ptl_cachep, page->ptl); |
4289 | } | 4301 | } |
4290 | #endif | 4302 | #endif |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..cc2ab37220b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/bootmem.h> | ||
13 | #include <linux/compiler.h> | 12 | #include <linux/compiler.h> |
14 | #include <linux/export.h> | 13 | #include <linux/export.h> |
15 | #include <linux/pagevec.h> | 14 | #include <linux/pagevec.h> |
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
269 | } | 268 | } |
270 | 269 | ||
271 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | 270 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or |
272 | * alloc_bootmem_node_nopanic() */ | 271 | * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ |
273 | static int __ref ensure_zone_is_initialized(struct zone *zone, | 272 | static int __ref ensure_zone_is_initialized(struct zone *zone, |
274 | unsigned long start_pfn, unsigned long num_pages) | 273 | unsigned long start_pfn, unsigned long num_pages) |
275 | { | 274 | { |
@@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p) | |||
1446 | * the kernel away from hotpluggable memory. | 1445 | * the kernel away from hotpluggable memory. |
1447 | */ | 1446 | */ |
1448 | memblock_set_bottom_up(true); | 1447 | memblock_set_bottom_up(true); |
1448 | movable_node_enabled = true; | ||
1449 | #else | 1449 | #else |
1450 | pr_warn("movable_node option not supported\n"); | 1450 | pr_warn("movable_node option not supported\n"); |
1451 | #endif | 1451 | #endif |
diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..a8025befc323 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -72,28 +72,12 @@ int migrate_prep_local(void) | |||
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * Add isolated pages on the list back to the LRU under page lock | ||
76 | * to avoid leaking evictable pages back onto unevictable list. | ||
77 | */ | ||
78 | void putback_lru_pages(struct list_head *l) | ||
79 | { | ||
80 | struct page *page; | ||
81 | struct page *page2; | ||
82 | |||
83 | list_for_each_entry_safe(page, page2, l, lru) { | ||
84 | list_del(&page->lru); | ||
85 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
86 | page_is_file_cache(page)); | ||
87 | putback_lru_page(page); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Put previously isolated pages back onto the appropriate lists | 75 | * Put previously isolated pages back onto the appropriate lists |
93 | * from where they were once taken off for compaction/migration. | 76 | * from where they were once taken off for compaction/migration. |
94 | * | 77 | * |
95 | * This function shall be used instead of putback_lru_pages(), | 78 | * This function shall be used whenever the isolated pageset has been |
96 | * whenever the isolated pageset has been built by isolate_migratepages_range() | 79 | * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() |
80 | * and isolate_huge_page(). | ||
97 | */ | 81 | */ |
98 | void putback_movable_pages(struct list_head *l) | 82 | void putback_movable_pages(struct list_head *l) |
99 | { | 83 | { |
@@ -199,7 +183,12 @@ out: | |||
199 | */ | 183 | */ |
200 | static void remove_migration_ptes(struct page *old, struct page *new) | 184 | static void remove_migration_ptes(struct page *old, struct page *new) |
201 | { | 185 | { |
202 | rmap_walk(new, remove_migration_pte, old); | 186 | struct rmap_walk_control rwc = { |
187 | .rmap_one = remove_migration_pte, | ||
188 | .arg = old, | ||
189 | }; | ||
190 | |||
191 | rmap_walk(new, &rwc); | ||
203 | } | 192 | } |
204 | 193 | ||
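remove_migration_ptes() now passes a struct rmap_walk_control instead of separate callback and argument parameters, so callers can extend the walk later without changing rmap_walk()'s signature. A userspace sketch of the same control-struct pattern; the field names only mirror the kernel's, and none of this is rmap code.

/* Sketch of a callback-plus-argument control struct driving a walk. */
#include <stdio.h>

struct item { int id; };

struct walk_control {
	int  (*visit)(struct item *item, void *arg);	/* like .rmap_one */
	void *arg;					/* like .arg      */
};

static void walk(struct item *items, int n, struct walk_control *wc)
{
	for (int i = 0; i < n; i++)
		if (!wc->visit(&items[i], wc->arg))	/* callback may stop the walk */
			break;
}

static int print_visit(struct item *item, void *arg)
{
	printf("%s item %d\n", (char *)arg, item->id);
	return 1;					/* keep walking */
}

int main(void)
{
	struct item items[] = { {1}, {2}, {3} };
	struct walk_control wc = { .visit = print_visit, .arg = "visiting" };

	walk(items, 3, &wc);
	return 0;
}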
205 | /* | 194 | /* |
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
563 | * Migration functions | 552 | * Migration functions |
564 | ***********************************************************/ | 553 | ***********************************************************/ |
565 | 554 | ||
566 | /* Always fail migration. Used for mappings that are not movable */ | ||
567 | int fail_migrate_page(struct address_space *mapping, | ||
568 | struct page *newpage, struct page *page) | ||
569 | { | ||
570 | return -EIO; | ||
571 | } | ||
572 | EXPORT_SYMBOL(fail_migrate_page); | ||
573 | |||
574 | /* | 555 | /* |
575 | * Common logic to directly migrate a single page suitable for | 556 | * Common logic to directly migrate a single page suitable for |
576 | * pages that do not use PagePrivate/PagePrivate2. | 557 | * pages that do not use PagePrivate/PagePrivate2. |
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1008 | { | 989 | { |
1009 | int rc = 0; | 990 | int rc = 0; |
1010 | int *result = NULL; | 991 | int *result = NULL; |
1011 | struct page *new_hpage = get_new_page(hpage, private, &result); | 992 | struct page *new_hpage; |
1012 | struct anon_vma *anon_vma = NULL; | 993 | struct anon_vma *anon_vma = NULL; |
1013 | 994 | ||
1014 | /* | 995 | /* |
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1018 | * tables or check whether the hugepage is pmd-based or not before | 999 | * tables or check whether the hugepage is pmd-based or not before |
1019 | * kicking migration. | 1000 | * kicking migration. |
1020 | */ | 1001 | */ |
1021 | if (!hugepage_migration_support(page_hstate(hpage))) | 1002 | if (!hugepage_migration_support(page_hstate(hpage))) { |
1003 | putback_active_hugepage(hpage); | ||
1022 | return -ENOSYS; | 1004 | return -ENOSYS; |
1005 | } | ||
1023 | 1006 | ||
1007 | new_hpage = get_new_page(hpage, private, &result); | ||
1024 | if (!new_hpage) | 1008 | if (!new_hpage) |
1025 | return -ENOMEM; | 1009 | return -ENOMEM; |
1026 | 1010 | ||
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1120 | nr_succeeded++; | 1104 | nr_succeeded++; |
1121 | break; | 1105 | break; |
1122 | default: | 1106 | default: |
1123 | /* Permanent failure */ | 1107 | /* |
1108 | * Permanent failure (-EBUSY, -ENOSYS, etc.): | ||
1109 | * unlike -EAGAIN case, the failed page is | ||
1110 | * removed from migration page list and not | ||
1111 | * retried in the next outer loop. | ||
1112 | */ | ||
1124 | nr_failed++; | 1113 | nr_failed++; |
1125 | break; | 1114 | break; |
1126 | } | 1115 | } |
@@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node) | |||
1594 | } | 1583 | } |
1595 | 1584 | ||
1596 | /* Returns true if the node is migrate rate-limited after the update */ | 1585 | /* Returns true if the node is migrate rate-limited after the update */ |
1597 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | 1586 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
1587 | unsigned long nr_pages) | ||
1598 | { | 1588 | { |
1599 | bool rate_limited = false; | ||
1600 | |||
1601 | /* | 1589 | /* |
1602 | * Rate-limit the amount of data that is being migrated to a node. | 1590 | * Rate-limit the amount of data that is being migrated to a node. |
1603 | * Optimal placement is no good if the memory bus is saturated and | 1591 | * Optimal placement is no good if the memory bus is saturated and |
1604 | * all the time is being spent migrating! | 1592 | * all the time is being spent migrating! |
1605 | */ | 1593 | */ |
1606 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1607 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | 1594 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { |
1595 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1608 | pgdat->numabalancing_migrate_nr_pages = 0; | 1596 | pgdat->numabalancing_migrate_nr_pages = 0; |
1609 | pgdat->numabalancing_migrate_next_window = jiffies + | 1597 | pgdat->numabalancing_migrate_next_window = jiffies + |
1610 | msecs_to_jiffies(migrate_interval_millisecs); | 1598 | msecs_to_jiffies(migrate_interval_millisecs); |
1599 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1611 | } | 1600 | } |
1612 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | 1601 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { |
1613 | rate_limited = true; | 1602 | trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, |
1614 | else | 1603 | nr_pages); |
1615 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | 1604 | return true; |
1616 | spin_unlock(&pgdat->numabalancing_migrate_lock); | 1605 | } |
1617 | 1606 | ||
1618 | return rate_limited; | 1607 | /* |
1608 | * This is an unlocked non-atomic update so errors are possible. | ||
1609 | * The consequences are failing to migrate when we potentiall should | ||
1610 | * have which is not severe enough to warrant locking. If it is ever | ||
1611 | * a problem, it can be converted to a per-cpu counter. | ||
1612 | */ | ||
1613 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1614 | return false; | ||
1619 | } | 1615 | } |
1620 | 1616 | ||
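After this change numamigrate_update_ratelimit() only takes the spinlock to reset the window, and the per-window page counter is bumped without locking, accepting small races. A userspace sketch of the same windowed limiter; the budget and window length are illustrative, not the kernel's tunables.

/* Sketch of a time-windowed rate limiter with an unlocked counter update. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_SECS	1
#define BUDGET_PAGES	128

static time_t window_end;
static unsigned long window_pages;

static bool rate_limited(unsigned long nr_pages)
{
	time_t now = time(NULL);

	if (now >= window_end) {		/* new window: reset the counter */
		window_end = now + WINDOW_SECS;
		window_pages = 0;
	}
	if (window_pages > BUDGET_PAGES)	/* budget blown: refuse */
		return true;

	window_pages += nr_pages;		/* unlocked update, as in the patch */
	return false;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("migrate 64 pages: %s\n",
		       rate_limited(64) ? "rate limited" : "ok");
	return 0;
}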
1621 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1617 | static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
1622 | { | 1618 | { |
1623 | int page_lru; | 1619 | int page_lru; |
1624 | 1620 | ||
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1705 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1701 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1706 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1702 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
1707 | if (nr_remaining) { | 1703 | if (nr_remaining) { |
1708 | putback_lru_pages(&migratepages); | 1704 | if (!list_empty(&migratepages)) { |
1705 | list_del(&page->lru); | ||
1706 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1707 | page_is_file_cache(page)); | ||
1708 | putback_lru_page(page); | ||
1709 | } | ||
1709 | isolated = 0; | 1710 | isolated = 0; |
1710 | } else | 1711 | } else |
1711 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1712 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..10819ed4df3e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
709 | 709 | ||
710 | lru_add_drain_all(); /* flush pagevec */ | 710 | lru_add_drain_all(); /* flush pagevec */ |
711 | 711 | ||
712 | down_write(¤t->mm->mmap_sem); | ||
713 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 712 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
714 | start &= PAGE_MASK; | 713 | start &= PAGE_MASK; |
715 | 714 | ||
716 | locked = len >> PAGE_SHIFT; | ||
717 | locked += current->mm->locked_vm; | ||
718 | |||
719 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 715 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
720 | lock_limit >>= PAGE_SHIFT; | 716 | lock_limit >>= PAGE_SHIFT; |
717 | locked = len >> PAGE_SHIFT; | ||
718 | |||
719 | down_write(¤t->mm->mmap_sem); | ||
720 | |||
721 | locked += current->mm->locked_vm; | ||
721 | 722 | ||
722 | /* check against resource limits */ | 723 | /* check against resource limits */ |
723 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 724 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
724 | error = do_mlock(start, len, 1); | 725 | error = do_mlock(start, len, 1); |
726 | |||
725 | up_write(¤t->mm->mmap_sem); | 727 | up_write(¤t->mm->mmap_sem); |
726 | if (!error) | 728 | if (!error) |
727 | error = __mm_populate(start, len, 0); | 729 | error = __mm_populate(start, len, 0); |
@@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | |||
732 | { | 734 | { |
733 | int ret; | 735 | int ret; |
734 | 736 | ||
735 | down_write(¤t->mm->mmap_sem); | ||
736 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 737 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
737 | start &= PAGE_MASK; | 738 | start &= PAGE_MASK; |
739 | |||
740 | down_write(¤t->mm->mmap_sem); | ||
738 | ret = do_mlock(start, len, 0); | 741 | ret = do_mlock(start, len, 0); |
739 | up_write(¤t->mm->mmap_sem); | 742 | up_write(¤t->mm->mmap_sem); |
743 | |||
740 | return ret; | 744 | return ret; |
741 | } | 745 | } |
742 | 746 | ||
@@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
781 | if (flags & MCL_CURRENT) | 785 | if (flags & MCL_CURRENT) |
782 | lru_add_drain_all(); /* flush pagevec */ | 786 | lru_add_drain_all(); /* flush pagevec */ |
783 | 787 | ||
784 | down_write(¤t->mm->mmap_sem); | ||
785 | |||
786 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 788 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
787 | lock_limit >>= PAGE_SHIFT; | 789 | lock_limit >>= PAGE_SHIFT; |
788 | 790 | ||
789 | ret = -ENOMEM; | 791 | ret = -ENOMEM; |
792 | down_write(¤t->mm->mmap_sem); | ||
793 | |||
790 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | 794 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
791 | capable(CAP_IPC_LOCK)) | 795 | capable(CAP_IPC_LOCK)) |
792 | ret = do_mlockall(flags); | 796 | ret = do_mlockall(flags); |
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
86 | 86 | ||
87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
89 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 90 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 91 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 92 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1190 | return hint; | 1191 | return hint; |
1191 | } | 1192 | } |
1192 | 1193 | ||
1194 | static inline int mlock_future_check(struct mm_struct *mm, | ||
1195 | unsigned long flags, | ||
1196 | unsigned long len) | ||
1197 | { | ||
1198 | unsigned long locked, lock_limit; | ||
1199 | |||
1200 | /* mlock MCL_FUTURE? */ | ||
1201 | if (flags & VM_LOCKED) { | ||
1202 | locked = len >> PAGE_SHIFT; | ||
1203 | locked += mm->locked_vm; | ||
1204 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1205 | lock_limit >>= PAGE_SHIFT; | ||
1206 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1207 | return -EAGAIN; | ||
1208 | } | ||
1209 | return 0; | ||
1210 | } | ||
1211 | |||
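mlock_future_check() centralizes the RLIMIT_MEMLOCK test that was previously duplicated in do_mmap_pgoff() and do_brk(). A hedged userspace approximation of the same arithmetic, using getrlimit() and assuming 4 KiB pages; the kernel version also exempts CAP_IPC_LOCK holders.

/* Sketch: would locking len more bytes exceed RLIMIT_MEMLOCK? */
#include <stdio.h>
#include <sys/resource.h>

#define PAGE_SHIFT 12

static int mlock_would_exceed(unsigned long long already_locked_pages,
			      unsigned long long len)
{
	struct rlimit rl;
	unsigned long long locked, lock_limit;

	if (getrlimit(RLIMIT_MEMLOCK, &rl))
		return -1;

	locked = (len >> PAGE_SHIFT) + already_locked_pages;	/* pages after the call */
	lock_limit = (unsigned long long)rl.rlim_cur >> PAGE_SHIFT; /* limit in pages */

	return locked > lock_limit;
}

int main(void)
{
	printf("would exceed: %d\n", mlock_would_exceed(0, 1 << 20));
	return 0;
}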
1193 | /* | 1212 | /* |
1194 | * The caller must hold down_write(¤t->mm->mmap_sem). | 1213 | * The caller must hold down_write(¤t->mm->mmap_sem). |
1195 | */ | 1214 | */ |
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1251 | if (!can_do_mlock()) | 1270 | if (!can_do_mlock()) |
1252 | return -EPERM; | 1271 | return -EPERM; |
1253 | 1272 | ||
1254 | /* mlock MCL_FUTURE? */ | 1273 | if (mlock_future_check(mm, vm_flags, len)) |
1255 | if (vm_flags & VM_LOCKED) { | 1274 | return -EAGAIN; |
1256 | unsigned long locked, lock_limit; | ||
1257 | locked = len >> PAGE_SHIFT; | ||
1258 | locked += mm->locked_vm; | ||
1259 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1260 | lock_limit >>= PAGE_SHIFT; | ||
1261 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1262 | return -EAGAIN; | ||
1263 | } | ||
1264 | 1275 | ||
1265 | if (file) { | 1276 | if (file) { |
1266 | struct inode *inode = file_inode(file); | 1277 | struct inode *inode = file_inode(file); |
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2591 | if (error & ~PAGE_MASK) | 2602 | if (error & ~PAGE_MASK) |
2592 | return error; | 2603 | return error; |
2593 | 2604 | ||
2594 | /* | 2605 | error = mlock_future_check(mm, mm->def_flags, len); |
2595 | * mlock MCL_FUTURE? | 2606 | if (error) |
2596 | */ | 2607 | return error; |
2597 | if (mm->def_flags & VM_LOCKED) { | ||
2598 | unsigned long locked, lock_limit; | ||
2599 | locked = len >> PAGE_SHIFT; | ||
2600 | locked += mm->locked_vm; | ||
2601 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
2602 | lock_limit >>= PAGE_SHIFT; | ||
2603 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
2604 | return -EAGAIN; | ||
2605 | } | ||
2606 | 2608 | ||
2607 | /* | 2609 | /* |
2608 | * mm->mmap_sem is required to protect against another thread | 2610 | * mm->mmap_sem is required to protect against another thread |
diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..7332c1785744 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mmu_notifier.h> | 23 | #include <linux/mmu_notifier.h> |
24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/ksm.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
63 | 64 | ||
64 | ptent = *pte; | 65 | ptent = *pte; |
65 | page = vm_normal_page(vma, addr, oldpte); | 66 | page = vm_normal_page(vma, addr, oldpte); |
66 | if (page) { | 67 | if (page && !PageKsm(page)) { |
67 | if (!pte_numa(oldpte)) { | 68 | if (!pte_numa(oldpte)) { |
68 | ptent = pte_mknuma(ptent); | 69 | ptent = pte_mknuma(ptent); |
69 | set_pte_at(mm, addr, pte, ptent); | 70 | set_pte_at(mm, addr, pte, ptent); |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..19121ceb8874 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
41 | if (limit > memblock.current_limit) | 41 | if (limit > memblock.current_limit) |
42 | limit = memblock.current_limit; | 42 | limit = memblock.current_limit; |
43 | 43 | ||
44 | addr = memblock_find_in_range_node(goal, limit, size, align, nid); | 44 | addr = memblock_find_in_range_node(size, align, goal, limit, nid); |
45 | if (!addr) | 45 | if (!addr) |
46 | return NULL; | 46 | return NULL; |
47 | 47 | ||
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) | |||
117 | phys_addr_t start, end, size; | 117 | phys_addr_t start, end, size; |
118 | u64 i; | 118 | u64 i; |
119 | 119 | ||
120 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 120 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) |
121 | count += __free_memory_core(start, end); | 121 | count += __free_memory_core(start, end); |
122 | 122 | ||
123 | /* free range that is used for reserved array if we allocate it */ | 123 | /* free range that is used for reserved array if we allocate it */ |
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) | |||
161 | reset_all_zones_managed_pages(); | 161 | reset_all_zones_managed_pages(); |
162 | 162 | ||
163 | /* | 163 | /* |
164 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 164 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id |
165 | * because in some case like Node0 doesn't have RAM installed | 165 | * because in some case like Node0 doesn't have RAM installed |
166 | * low ram will be on Node1 | 166 | * low ram will be on Node1 |
167 | */ | 167 | */ |
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
215 | 215 | ||
216 | restart: | 216 | restart: |
217 | 217 | ||
218 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | 218 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); |
219 | 219 | ||
220 | if (ptr) | 220 | if (ptr) |
221 | return ptr; | 221 | return ptr; |
@@ -299,7 +299,7 @@ again: | |||
299 | if (ptr) | 299 | if (ptr) |
300 | return ptr; | 300 | return ptr; |
301 | 301 | ||
302 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 302 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, |
303 | goal, limit); | 303 | goal, limit); |
304 | if (ptr) | 304 | if (ptr) |
305 | return ptr; | 305 | return ptr; |
diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; | |||
60 | struct percpu_counter vm_committed_as; | 60 | struct percpu_counter vm_committed_as; |
61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
63 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
63 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
64 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
65 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..054ff47c4478 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock); | |||
47 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
48 | /** | 48 | /** |
49 | * has_intersects_mems_allowed() - check task eligiblity for kill | 49 | * has_intersects_mems_allowed() - check task eligiblity for kill |
50 | * @tsk: task struct of which task to consider | 50 | * @start: task struct of which task to consider |
51 | * @mask: nodemask passed to page allocator for mempolicy ooms | 51 | * @mask: nodemask passed to page allocator for mempolicy ooms |
52 | * | 52 | * |
53 | * Task eligibility is determined by whether or not a candidate task, @tsk, | 53 | * Task eligibility is determined by whether or not a candidate task, @tsk, |
54 | * shares the same mempolicy nodes as current if it is bound by such a policy | 54 | * shares the same mempolicy nodes as current if it is bound by such a policy |
55 | * and whether or not it has the same set of allowed cpuset nodes. | 55 | * and whether or not it has the same set of allowed cpuset nodes. |
56 | */ | 56 | */ |
57 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 57 | static bool has_intersects_mems_allowed(struct task_struct *start, |
58 | const nodemask_t *mask) | 58 | const nodemask_t *mask) |
59 | { | 59 | { |
60 | struct task_struct *start = tsk; | 60 | struct task_struct *tsk; |
61 | bool ret = false; | ||
61 | 62 | ||
62 | do { | 63 | rcu_read_lock(); |
64 | for_each_thread(start, tsk) { | ||
63 | if (mask) { | 65 | if (mask) { |
64 | /* | 66 | /* |
65 | * If this is a mempolicy constrained oom, tsk's | 67 | * If this is a mempolicy constrained oom, tsk's |
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
67 | * mempolicy intersects current, otherwise it may be | 69 | * mempolicy intersects current, otherwise it may be |
68 | * needlessly killed. | 70 | * needlessly killed. |
69 | */ | 71 | */ |
70 | if (mempolicy_nodemask_intersects(tsk, mask)) | 72 | ret = mempolicy_nodemask_intersects(tsk, mask); |
71 | return true; | ||
72 | } else { | 73 | } else { |
73 | /* | 74 | /* |
74 | * This is not a mempolicy constrained oom, so only | 75 | * This is not a mempolicy constrained oom, so only |
75 | * check the mems of tsk's cpuset. | 76 | * check the mems of tsk's cpuset. |
76 | */ | 77 | */ |
77 | if (cpuset_mems_allowed_intersects(current, tsk)) | 78 | ret = cpuset_mems_allowed_intersects(current, tsk); |
78 | return true; | ||
79 | } | 79 | } |
80 | } while_each_thread(start, tsk); | 80 | if (ret) |
81 | break; | ||
82 | } | ||
83 | rcu_read_unlock(); | ||
81 | 84 | ||
82 | return false; | 85 | return ret; |
83 | } | 86 | } |
84 | #else | 87 | #else |
85 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 88 | static bool has_intersects_mems_allowed(struct task_struct *tsk, |
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
97 | */ | 100 | */ |
98 | struct task_struct *find_lock_task_mm(struct task_struct *p) | 101 | struct task_struct *find_lock_task_mm(struct task_struct *p) |
99 | { | 102 | { |
100 | struct task_struct *t = p; | 103 | struct task_struct *t; |
101 | 104 | ||
102 | do { | 105 | rcu_read_lock(); |
106 | |||
107 | for_each_thread(p, t) { | ||
103 | task_lock(t); | 108 | task_lock(t); |
104 | if (likely(t->mm)) | 109 | if (likely(t->mm)) |
105 | return t; | 110 | goto found; |
106 | task_unlock(t); | 111 | task_unlock(t); |
107 | } while_each_thread(p, t); | 112 | } |
113 | t = NULL; | ||
114 | found: | ||
115 | rcu_read_unlock(); | ||
108 | 116 | ||
109 | return NULL; | 117 | return t; |
110 | } | 118 | } |
111 | 119 | ||
112 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
301 | unsigned long chosen_points = 0; | 309 | unsigned long chosen_points = 0; |
302 | 310 | ||
303 | rcu_read_lock(); | 311 | rcu_read_lock(); |
304 | do_each_thread(g, p) { | 312 | for_each_process_thread(g, p) { |
305 | unsigned int points; | 313 | unsigned int points; |
306 | 314 | ||
307 | switch (oom_scan_process_thread(p, totalpages, nodemask, | 315 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
323 | chosen = p; | 331 | chosen = p; |
324 | chosen_points = points; | 332 | chosen_points = points; |
325 | } | 333 | } |
326 | } while_each_thread(g, p); | 334 | } |
327 | if (chosen) | 335 | if (chosen) |
328 | get_task_struct(chosen); | 336 | get_task_struct(chosen); |
329 | rcu_read_unlock(); | 337 | rcu_read_unlock(); |
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
406 | { | 414 | { |
407 | struct task_struct *victim = p; | 415 | struct task_struct *victim = p; |
408 | struct task_struct *child; | 416 | struct task_struct *child; |
409 | struct task_struct *t = p; | 417 | struct task_struct *t; |
410 | struct mm_struct *mm; | 418 | struct mm_struct *mm; |
411 | unsigned int victim_points = 0; | 419 | unsigned int victim_points = 0; |
412 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 420 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
437 | * still freeing memory. | 445 | * still freeing memory. |
438 | */ | 446 | */ |
439 | read_lock(&tasklist_lock); | 447 | read_lock(&tasklist_lock); |
440 | do { | 448 | for_each_thread(p, t) { |
441 | list_for_each_entry(child, &t->children, sibling) { | 449 | list_for_each_entry(child, &t->children, sibling) { |
442 | unsigned int child_points; | 450 | unsigned int child_points; |
443 | 451 | ||
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
455 | get_task_struct(victim); | 463 | get_task_struct(victim); |
456 | } | 464 | } |
457 | } | 465 | } |
458 | } while_each_thread(p, t); | 466 | } |
459 | read_unlock(&tasklist_lock); | 467 | read_unlock(&tasklist_lock); |
460 | 468 | ||
461 | rcu_read_lock(); | ||
462 | p = find_lock_task_mm(victim); | 469 | p = find_lock_task_mm(victim); |
463 | if (!p) { | 470 | if (!p) { |
464 | rcu_read_unlock(); | ||
465 | put_task_struct(victim); | 471 | put_task_struct(victim); |
466 | return; | 472 | return; |
467 | } else if (victim != p) { | 473 | } else if (victim != p) { |
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
487 | * That thread will now get access to memory reserves since it has a | 493 | * That thread will now get access to memory reserves since it has a |
488 | * pending fatal signal. | 494 | * pending fatal signal. |
489 | */ | 495 | */ |
496 | rcu_read_lock(); | ||
490 | for_each_process(p) | 497 | for_each_process(p) |
491 | if (p->mm == mm && !same_thread_group(p, victim) && | 498 | if (p->mm == mm && !same_thread_group(p, victim) && |
492 | !(p->flags & PF_KTHREAD)) { | 499 | !(p->flags & PF_KTHREAD)) { |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..533e2147d14f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
2072 | return; | 2072 | return; |
2073 | 2073 | ||
2074 | /* | 2074 | /* |
2075 | * Walking all memory to count page types is very expensive and should | ||
2076 | * be inhibited in non-blockable contexts. | ||
2077 | */ | ||
2078 | if (!(gfp_mask & __GFP_WAIT)) | ||
2079 | filter |= SHOW_MEM_FILTER_PAGE_COUNT; | ||
2080 | |||
2081 | /* | ||
2082 | * This documents exceptions given to allocations in certain | 2075 | * This documents exceptions given to allocations in certain |
2083 | * contexts that are allowed to allocate outside current's set | 2076 | * contexts that are allowed to allocate outside current's set |
2084 | * of allowed nodes. | 2077 | * of allowed nodes. |
@@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2242 | preferred_zone, migratetype); | 2235 | preferred_zone, migratetype); |
2243 | if (page) { | 2236 | if (page) { |
2244 | preferred_zone->compact_blockskip_flush = false; | 2237 | preferred_zone->compact_blockskip_flush = false; |
2245 | preferred_zone->compact_considered = 0; | 2238 | compaction_defer_reset(preferred_zone, order, true); |
2246 | preferred_zone->compact_defer_shift = 0; | ||
2247 | if (order >= preferred_zone->compact_order_failed) | ||
2248 | preferred_zone->compact_order_failed = order + 1; | ||
2249 | count_vm_event(COMPACTSUCCESS); | 2239 | count_vm_event(COMPACTSUCCESS); |
2250 | return page; | 2240 | return page; |
2251 | } | 2241 | } |
@@ -2535,8 +2525,15 @@ rebalance: | |||
2535 | } | 2525 | } |
2536 | 2526 | ||
2537 | /* Atomic allocations - we can't balance anything */ | 2527 | /* Atomic allocations - we can't balance anything */ |
2538 | if (!wait) | 2528 | if (!wait) { |
2529 | /* | ||
2530 | * All existing users of the deprecated __GFP_NOFAIL are | ||
2531 | * blockable, so warn of any new users that actually allow this | ||
2532 | * type of allocation to fail. | ||
2533 | */ | ||
2534 | WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); | ||
2539 | goto nopage; | 2535 | goto nopage; |
2536 | } | ||
2540 | 2537 | ||
2541 | /* Avoid recursion of direct reclaim */ | 2538 | /* Avoid recursion of direct reclaim */ |
2542 | if (current->flags & PF_MEMALLOC) | 2539 | if (current->flags & PF_MEMALLOC) |
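The new warning fires when a non-blockable allocation carries __GFP_NOFAIL, a combination the page allocator cannot honor. It uses the warn-once idiom so a hot path cannot flood the log. A userspace sketch of that idiom follows; it relies on GCC statement expressions, as the kernel headers do, and is not the kernel's actual WARN_ON_ONCE().

/* Sketch of a warn-once macro: report the first offender, stay silent after. */
#include <stdbool.h>
#include <stdio.h>

#define WARN_ON_ONCE_SKETCH(cond)					\
	({								\
		static bool warned;					\
		bool c = (cond);					\
		if (c && !warned) {					\
			warned = true;					\
			fprintf(stderr, "warning: %s\n", #cond);	\
		}							\
		c;							\
	})

int main(void)
{
	for (int i = 0; i < 3; i++)
		if (WARN_ON_ONCE_SKETCH(i >= 0))	/* prints once, not three times */
			continue;
	return 0;
}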
@@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3901 | struct page *page; | 3898 | struct page *page; |
3902 | unsigned long block_migratetype; | 3899 | unsigned long block_migratetype; |
3903 | int reserve; | 3900 | int reserve; |
3901 | int old_reserve; | ||
3904 | 3902 | ||
3905 | /* | 3903 | /* |
3906 | * Get the start pfn, end pfn and the number of blocks to reserve | 3904 | * Get the start pfn, end pfn and the number of blocks to reserve |
@@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3922 | * future allocation of hugepages at runtime. | 3920 | * future allocation of hugepages at runtime. |
3923 | */ | 3921 | */ |
3924 | reserve = min(2, reserve); | 3922 | reserve = min(2, reserve); |
3923 | old_reserve = zone->nr_migrate_reserve_block; | ||
3924 | |||
3925 | /* When memory is hot-added, we almost always need to do nothing */ | ||
3926 | if (reserve == old_reserve) | ||
3927 | return; | ||
3928 | zone->nr_migrate_reserve_block = reserve; | ||
3925 | 3929 | ||
3926 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3930 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3927 | if (!pfn_valid(pfn)) | 3931 | if (!pfn_valid(pfn)) |
@@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3959 | reserve--; | 3963 | reserve--; |
3960 | continue; | 3964 | continue; |
3961 | } | 3965 | } |
3966 | } else if (!old_reserve) { | ||
3967 | /* | ||
3968 | * At boot time we don't need to scan the whole zone | ||
3969 | * for turning off MIGRATE_RESERVE. | ||
3970 | */ | ||
3971 | break; | ||
3962 | } | 3972 | } |
3963 | 3973 | ||
3964 | /* | 3974 | /* |
@@ -4209,7 +4219,6 @@ static noinline __init_refok | |||
4209 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 4219 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
4210 | { | 4220 | { |
4211 | int i; | 4221 | int i; |
4212 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
4213 | size_t alloc_size; | 4222 | size_t alloc_size; |
4214 | 4223 | ||
4215 | /* | 4224 | /* |
@@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
4225 | 4234 | ||
4226 | if (!slab_is_available()) { | 4235 | if (!slab_is_available()) { |
4227 | zone->wait_table = (wait_queue_head_t *) | 4236 | zone->wait_table = (wait_queue_head_t *) |
4228 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 4237 | memblock_virt_alloc_node_nopanic( |
4238 | alloc_size, zone->zone_pgdat->node_id); | ||
4229 | } else { | 4239 | } else { |
4230 | /* | 4240 | /* |
4231 | * This case means that a zone whose size was 0 gets new memory | 4241 | * This case means that a zone whose size was 0 gets new memory |
@@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4345 | #endif | 4355 | #endif |
4346 | 4356 | ||
4347 | /** | 4357 | /** |
4348 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4358 | * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
4349 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4359 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4350 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4360 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4351 | * | 4361 | * |
4352 | * If an architecture guarantees that all ranges registered with | 4362 | * If an architecture guarantees that all ranges registered with |
4353 | * add_active_ranges() contain no holes and may be freed, this | 4363 | * add_active_ranges() contain no holes and may be freed, this |
4354 | * this function may be used instead of calling free_bootmem() manually. | 4364 | * this function may be used instead of calling memblock_free_early_nid() |
4365 | * manually. | ||
4355 | */ | 4366 | */ |
4356 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4367 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4357 | { | 4368 | { |
@@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4363 | end_pfn = min(end_pfn, max_low_pfn); | 4374 | end_pfn = min(end_pfn, max_low_pfn); |
4364 | 4375 | ||
4365 | if (start_pfn < end_pfn) | 4376 | if (start_pfn < end_pfn) |
4366 | free_bootmem_node(NODE_DATA(this_nid), | 4377 | memblock_free_early_nid(PFN_PHYS(start_pfn), |
4367 | PFN_PHYS(start_pfn), | 4378 | (end_pfn - start_pfn) << PAGE_SHIFT, |
4368 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4379 | this_nid); |
4369 | } | 4380 | } |
4370 | } | 4381 | } |
4371 | 4382 | ||
@@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4636 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); | 4647 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
4637 | zone->pageblock_flags = NULL; | 4648 | zone->pageblock_flags = NULL; |
4638 | if (usemapsize) | 4649 | if (usemapsize) |
4639 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4650 | zone->pageblock_flags = |
4640 | usemapsize); | 4651 | memblock_virt_alloc_node_nopanic(usemapsize, |
4652 | pgdat->node_id); | ||
4641 | } | 4653 | } |
4642 | #else | 4654 | #else |
4643 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | 4655 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
@@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4831 | size = (end - start) * sizeof(struct page); | 4843 | size = (end - start) * sizeof(struct page); |
4832 | map = alloc_remap(pgdat->node_id, size); | 4844 | map = alloc_remap(pgdat->node_id, size); |
4833 | if (!map) | 4845 | if (!map) |
4834 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4846 | map = memblock_virt_alloc_node_nopanic(size, |
4847 | pgdat->node_id); | ||
4835 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4848 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4836 | } | 4849 | } |
4837 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4850 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5012 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5025 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
5013 | unsigned long totalpages = early_calculate_totalpages(); | 5026 | unsigned long totalpages = early_calculate_totalpages(); |
5014 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5027 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
5028 | struct memblock_type *type = &memblock.memory; | ||
5029 | |||
5030 | /* Need to find movable_zone earlier when movable_node is specified. */ | ||
5031 | find_usable_zone_for_movable(); | ||
5032 | |||
5033 | /* | ||
5034 | * If movable_node is specified, ignore kernelcore and movablecore | ||
5035 | * options. | ||
5036 | */ | ||
5037 | if (movable_node_is_enabled()) { | ||
5038 | for (i = 0; i < type->cnt; i++) { | ||
5039 | if (!memblock_is_hotpluggable(&type->regions[i])) | ||
5040 | continue; | ||
5041 | |||
5042 | nid = type->regions[i].nid; | ||
5043 | |||
5044 | usable_startpfn = PFN_DOWN(type->regions[i].base); | ||
5045 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5046 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5047 | usable_startpfn; | ||
5048 | } | ||
5049 | |||
5050 | goto out2; | ||
5051 | } | ||
5015 | 5052 | ||
5016 | /* | 5053 | /* |
5017 | * If movablecore was specified, calculate what size of | 5054 | * If movablecore=nn[KMG] was specified, calculate what size of |
5018 | * kernelcore that corresponds so that memory usable for | 5055 | * kernelcore that corresponds so that memory usable for |
5019 | * any allocation type is evenly spread. If both kernelcore | 5056 | * any allocation type is evenly spread. If both kernelcore |
5020 | * and movablecore are specified, then the value of kernelcore | 5057 | * and movablecore are specified, then the value of kernelcore |
@@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5040 | goto out; | 5077 | goto out; |
5041 | 5078 | ||
5042 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 5079 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
5043 | find_usable_zone_for_movable(); | ||
5044 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 5080 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
5045 | 5081 | ||
5046 | restart: | 5082 | restart: |
@@ -5131,6 +5167,7 @@ restart: | |||
5131 | if (usable_nodes && required_kernelcore > usable_nodes) | 5167 | if (usable_nodes && required_kernelcore > usable_nodes) |
5132 | goto restart; | 5168 | goto restart; |
5133 | 5169 | ||
5170 | out2: | ||
5134 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5171 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
5135 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5172 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
5136 | zone_movable_pfn[nid] = | 5173 | zone_movable_pfn[nid] = |
@@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5857 | do { | 5894 | do { |
5858 | size = bucketsize << log2qty; | 5895 | size = bucketsize << log2qty; |
5859 | if (flags & HASH_EARLY) | 5896 | if (flags & HASH_EARLY) |
5860 | table = alloc_bootmem_nopanic(size); | 5897 | table = memblock_virt_alloc_nopanic(size, 0); |
5861 | else if (hashdist) | 5898 | else if (hashdist) |
5862 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5899 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
5863 | else { | 5900 | else { |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3bd0b8e6ab12..cfd162882c00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) | |||
54 | 54 | ||
55 | table_size = sizeof(struct page_cgroup) * nr_pages; | 55 | table_size = sizeof(struct page_cgroup) * nr_pages; |
56 | 56 | ||
57 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | 57 | base = memblock_virt_alloc_try_nid_nopanic( |
58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
59 | if (!base) | 60 | if (!base) |
60 | return -ENOMEM; | 61 | return -ENOMEM; |
61 | NODE_DATA(nid)->node_page_cgroup = base; | 62 | NODE_DATA(nid)->node_page_cgroup = base; |
diff --git a/mm/percpu.c b/mm/percpu.c index afbf352ae580..036cfe07050f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1063 | __alignof__(ai->groups[0].cpu_map[0])); | 1063 | __alignof__(ai->groups[0].cpu_map[0])); |
1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); | 1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); |
1065 | 1065 | ||
1066 | ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); | 1066 | ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); |
1067 | if (!ptr) | 1067 | if (!ptr) |
1068 | return NULL; | 1068 | return NULL; |
1069 | ai = ptr; | 1069 | ai = ptr; |
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1088 | */ | 1088 | */ |
1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) | 1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) |
1090 | { | 1090 | { |
1091 | free_bootmem(__pa(ai), ai->__ai_size); | 1091 | memblock_free_early(__pa(ai), ai->__ai_size); |
1092 | } | 1092 | } |
1093 | 1093 | ||
1094 | /** | 1094 | /** |
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | 1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); |
1247 | 1247 | ||
1248 | /* process group information and build config tables accordingly */ | 1248 | /* process group information and build config tables accordingly */ |
1249 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1249 | group_offsets = memblock_virt_alloc(ai->nr_groups * |
1250 | group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); | 1250 | sizeof(group_offsets[0]), 0); |
1251 | unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); | 1251 | group_sizes = memblock_virt_alloc(ai->nr_groups * |
1252 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); | 1252 | sizeof(group_sizes[0]), 0); |
1253 | unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); | ||
1254 | unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); | ||
1253 | 1255 | ||
1254 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1256 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1255 | unit_map[cpu] = UINT_MAX; | 1257 | unit_map[cpu] = UINT_MAX; |
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1311 | * empty chunks. | 1313 | * empty chunks. |
1312 | */ | 1314 | */ |
1313 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; | 1315 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; |
1314 | pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); | 1316 | pcpu_slot = memblock_virt_alloc( |
1317 | pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); | ||
1315 | for (i = 0; i < pcpu_nr_slots; i++) | 1318 | for (i = 0; i < pcpu_nr_slots; i++) |
1316 | INIT_LIST_HEAD(&pcpu_slot[i]); | 1319 | INIT_LIST_HEAD(&pcpu_slot[i]); |
1317 | 1320 | ||
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1322 | * covers static area + reserved area (mostly used for module | 1325 | * covers static area + reserved area (mostly used for module |
1323 | * static percpu allocation). | 1326 | * static percpu allocation). |
1324 | */ | 1327 | */ |
1325 | schunk = alloc_bootmem(pcpu_chunk_struct_size); | 1328 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1326 | INIT_LIST_HEAD(&schunk->list); | 1329 | INIT_LIST_HEAD(&schunk->list); |
1327 | schunk->base_addr = base_addr; | 1330 | schunk->base_addr = base_addr; |
1328 | schunk->map = smap; | 1331 | schunk->map = smap; |
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1346 | 1349 | ||
1347 | /* init dynamic chunk if necessary */ | 1350 | /* init dynamic chunk if necessary */ |
1348 | if (dyn_size) { | 1351 | if (dyn_size) { |
1349 | dchunk = alloc_bootmem(pcpu_chunk_struct_size); | 1352 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1350 | INIT_LIST_HEAD(&dchunk->list); | 1353 | INIT_LIST_HEAD(&dchunk->list); |
1351 | dchunk->base_addr = base_addr; | 1354 | dchunk->base_addr = base_addr; |
1352 | dchunk->map = dmap; | 1355 | dchunk->map = dmap; |
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1626 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; | 1629 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; |
1627 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); | 1630 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); |
1628 | 1631 | ||
1629 | areas = alloc_bootmem_nopanic(areas_size); | 1632 | areas = memblock_virt_alloc_nopanic(areas_size, 0); |
1630 | if (!areas) { | 1633 | if (!areas) { |
1631 | rc = -ENOMEM; | 1634 | rc = -ENOMEM; |
1632 | goto out_free; | 1635 | goto out_free; |
@@ -1712,7 +1715,7 @@ out_free_areas: | |||
1712 | out_free: | 1715 | out_free: |
1713 | pcpu_free_alloc_info(ai); | 1716 | pcpu_free_alloc_info(ai); |
1714 | if (areas) | 1717 | if (areas) |
1715 | free_bootmem(__pa(areas), areas_size); | 1718 | memblock_free_early(__pa(areas), areas_size); |
1716 | return rc; | 1719 | return rc; |
1717 | } | 1720 | } |
1718 | #endif /* BUILD_EMBED_FIRST_CHUNK */ | 1721 | #endif /* BUILD_EMBED_FIRST_CHUNK */ |
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, | |||
1760 | /* unaligned allocations can't be freed, round up to page size */ | 1763 | /* unaligned allocations can't be freed, round up to page size */ |
1761 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * | 1764 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * |
1762 | sizeof(pages[0])); | 1765 | sizeof(pages[0])); |
1763 | pages = alloc_bootmem(pages_size); | 1766 | pages = memblock_virt_alloc(pages_size, 0); |
1764 | 1767 | ||
1765 | /* allocate pages */ | 1768 | /* allocate pages */ |
1766 | j = 0; | 1769 | j = 0; |
@@ -1823,7 +1826,7 @@ enomem: | |||
1823 | free_fn(page_address(pages[j]), PAGE_SIZE); | 1826 | free_fn(page_address(pages[j]), PAGE_SIZE); |
1824 | rc = -ENOMEM; | 1827 | rc = -ENOMEM; |
1825 | out_free_ar: | 1828 | out_free_ar: |
1826 | free_bootmem(__pa(pages), pages_size); | 1829 | memblock_free_early(__pa(pages), pages_size); |
1827 | pcpu_free_alloc_info(ai); | 1830 | pcpu_free_alloc_info(ai); |
1828 | return rc; | 1831 | return rc; |
1829 | } | 1832 | } |
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
1848 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, | 1851 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, |
1849 | size_t align) | 1852 | size_t align) |
1850 | { | 1853 | { |
1851 | return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); | 1854 | return memblock_virt_alloc_from_nopanic( |
1855 | size, align, __pa(MAX_DMA_ADDRESS)); | ||
1852 | } | 1856 | } |
1853 | 1857 | ||
1854 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) | 1858 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) |
1855 | { | 1859 | { |
1856 | free_bootmem(__pa(ptr), size); | 1860 | memblock_free_early(__pa(ptr), size); |
1857 | } | 1861 | } |
1858 | 1862 | ||
1859 | void __init setup_per_cpu_areas(void) | 1863 | void __init setup_per_cpu_areas(void) |
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) | |||
1896 | void *fc; | 1900 | void *fc; |
1897 | 1901 | ||
1898 | ai = pcpu_alloc_alloc_info(1, 1); | 1902 | ai = pcpu_alloc_alloc_info(1, 1); |
1899 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1903 | fc = memblock_virt_alloc_from_nopanic(unit_size, |
1904 | PAGE_SIZE, | ||
1905 | __pa(MAX_DMA_ADDRESS)); | ||
1900 | if (!ai || !fc) | 1906 | if (!ai || !fc) |
1901 | panic("Failed to allocate memory for percpu areas."); | 1907 | panic("Failed to allocate memory for percpu areas."); |
1902 | /* kmemleak tracks the percpu allocations separately */ | 1908 | /* kmemleak tracks the percpu allocations separately */ |
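The remaining percpu conversions pair the allocation changes with their release and goal-constrained counterparts. A condensed sketch of the mapping used in this hunk, taken directly from the lines above:

        /* Hedged summary of the other substitutions in mm/percpu.c. */
        ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));          /* old */
        ptr = memblock_virt_alloc_from_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); /* new */

        free_bootmem(__pa(ptr), size);          /* old */
        memblock_free_early(__pa(ptr), size);   /* new */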
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
660 | return 1; | 660 | return 1; |
661 | } | 661 | } |
662 | 662 | ||
663 | struct page_referenced_arg { | ||
664 | int mapcount; | ||
665 | int referenced; | ||
666 | unsigned long vm_flags; | ||
667 | struct mem_cgroup *memcg; | ||
668 | }; | ||
663 | /* | 669 | /* |
664 | * Subfunctions of page_referenced: page_referenced_one called | 670 | * arg: page_referenced_arg will be passed |
665 | * repeatedly from either page_referenced_anon or page_referenced_file. | ||
666 | */ | 671 | */ |
667 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
668 | unsigned long address, unsigned int *mapcount, | 673 | unsigned long address, void *arg) |
669 | unsigned long *vm_flags) | ||
670 | { | 674 | { |
671 | struct mm_struct *mm = vma->vm_mm; | 675 | struct mm_struct *mm = vma->vm_mm; |
672 | spinlock_t *ptl; | 676 | spinlock_t *ptl; |
673 | int referenced = 0; | 677 | int referenced = 0; |
678 | struct page_referenced_arg *pra = arg; | ||
674 | 679 | ||
675 | if (unlikely(PageTransHuge(page))) { | 680 | if (unlikely(PageTransHuge(page))) { |
676 | pmd_t *pmd; | 681 | pmd_t *pmd; |
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
682 | pmd = page_check_address_pmd(page, mm, address, | 687 | pmd = page_check_address_pmd(page, mm, address, |
683 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | 688 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); |
684 | if (!pmd) | 689 | if (!pmd) |
685 | goto out; | 690 | return SWAP_AGAIN; |
686 | 691 | ||
687 | if (vma->vm_flags & VM_LOCKED) { | 692 | if (vma->vm_flags & VM_LOCKED) { |
688 | spin_unlock(ptl); | 693 | spin_unlock(ptl); |
689 | *mapcount = 0; /* break early from loop */ | 694 | pra->vm_flags |= VM_LOCKED; |
690 | *vm_flags |= VM_LOCKED; | 695 | return SWAP_FAIL; /* To break the loop */ |
691 | goto out; | ||
692 | } | 696 | } |
693 | 697 | ||
694 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 698 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
704 | */ | 708 | */ |
705 | pte = page_check_address(page, mm, address, &ptl, 0); | 709 | pte = page_check_address(page, mm, address, &ptl, 0); |
706 | if (!pte) | 710 | if (!pte) |
707 | goto out; | 711 | return SWAP_AGAIN; |
708 | 712 | ||
709 | if (vma->vm_flags & VM_LOCKED) { | 713 | if (vma->vm_flags & VM_LOCKED) { |
710 | pte_unmap_unlock(pte, ptl); | 714 | pte_unmap_unlock(pte, ptl); |
711 | *mapcount = 0; /* break early from loop */ | 715 | pra->vm_flags |= VM_LOCKED; |
712 | *vm_flags |= VM_LOCKED; | 716 | return SWAP_FAIL; /* To break the loop */ |
713 | goto out; | ||
714 | } | 717 | } |
715 | 718 | ||
716 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 719 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
727 | pte_unmap_unlock(pte, ptl); | 730 | pte_unmap_unlock(pte, ptl); |
728 | } | 731 | } |
729 | 732 | ||
730 | (*mapcount)--; | 733 | if (referenced) { |
731 | 734 | pra->referenced++; | |
732 | if (referenced) | 735 | pra->vm_flags |= vma->vm_flags; |
733 | *vm_flags |= vma->vm_flags; | ||
734 | out: | ||
735 | return referenced; | ||
736 | } | ||
737 | |||
738 | static int page_referenced_anon(struct page *page, | ||
739 | struct mem_cgroup *memcg, | ||
740 | unsigned long *vm_flags) | ||
741 | { | ||
742 | unsigned int mapcount; | ||
743 | struct anon_vma *anon_vma; | ||
744 | pgoff_t pgoff; | ||
745 | struct anon_vma_chain *avc; | ||
746 | int referenced = 0; | ||
747 | |||
748 | anon_vma = page_lock_anon_vma_read(page); | ||
749 | if (!anon_vma) | ||
750 | return referenced; | ||
751 | |||
752 | mapcount = page_mapcount(page); | ||
753 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
754 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
755 | struct vm_area_struct *vma = avc->vma; | ||
756 | unsigned long address = vma_address(page, vma); | ||
757 | /* | ||
758 | * If we are reclaiming on behalf of a cgroup, skip | ||
759 | * counting on behalf of references from different | ||
760 | * cgroups | ||
761 | */ | ||
762 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
763 | continue; | ||
764 | referenced += page_referenced_one(page, vma, address, | ||
765 | &mapcount, vm_flags); | ||
766 | if (!mapcount) | ||
767 | break; | ||
768 | } | 736 | } |
769 | 737 | ||
770 | page_unlock_anon_vma_read(anon_vma); | 738 | pra->mapcount--; |
771 | return referenced; | 739 | if (!pra->mapcount) |
740 | return SWAP_SUCCESS; /* To break the loop */ | ||
741 | |||
742 | return SWAP_AGAIN; | ||
772 | } | 743 | } |
773 | 744 | ||
774 | /** | 745 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) |
775 | * page_referenced_file - referenced check for object-based rmap | ||
776 | * @page: the page we're checking references on. | ||
777 | * @memcg: target memory control group | ||
778 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
779 | * | ||
780 | * For an object-based mapped page, find all the places it is mapped and | ||
781 | * check/clear the referenced flag. This is done by following the page->mapping | ||
782 | * pointer, then walking the chain of vmas it holds. It returns the number | ||
783 | * of references it found. | ||
784 | * | ||
785 | * This function is only called from page_referenced for object-based pages. | ||
786 | */ | ||
787 | static int page_referenced_file(struct page *page, | ||
788 | struct mem_cgroup *memcg, | ||
789 | unsigned long *vm_flags) | ||
790 | { | 746 | { |
791 | unsigned int mapcount; | 747 | struct page_referenced_arg *pra = arg; |
792 | struct address_space *mapping = page->mapping; | 748 | struct mem_cgroup *memcg = pra->memcg; |
793 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
794 | struct vm_area_struct *vma; | ||
795 | int referenced = 0; | ||
796 | |||
797 | /* | ||
798 | * The caller's checks on page->mapping and !PageAnon have made | ||
799 | * sure that this is a file page: the check for page->mapping | ||
800 | * excludes the case just before it gets set on an anon page. | ||
801 | */ | ||
802 | BUG_ON(PageAnon(page)); | ||
803 | |||
804 | /* | ||
805 | * The page lock not only makes sure that page->mapping cannot | ||
806 | * suddenly be NULLified by truncation, it makes sure that the | ||
807 | * structure at mapping cannot be freed and reused yet, | ||
808 | * so we can safely take mapping->i_mmap_mutex. | ||
809 | */ | ||
810 | BUG_ON(!PageLocked(page)); | ||
811 | |||
812 | mutex_lock(&mapping->i_mmap_mutex); | ||
813 | 749 | ||
814 | /* | 750 | if (!mm_match_cgroup(vma->vm_mm, memcg)) |
815 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount | 751 | return true; |
816 | * is more likely to be accurate if we note it after spinning. | ||
817 | */ | ||
818 | mapcount = page_mapcount(page); | ||
819 | |||
820 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
821 | unsigned long address = vma_address(page, vma); | ||
822 | /* | ||
823 | * If we are reclaiming on behalf of a cgroup, skip | ||
824 | * counting on behalf of references from different | ||
825 | * cgroups | ||
826 | */ | ||
827 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
828 | continue; | ||
829 | referenced += page_referenced_one(page, vma, address, | ||
830 | &mapcount, vm_flags); | ||
831 | if (!mapcount) | ||
832 | break; | ||
833 | } | ||
834 | 752 | ||
835 | mutex_unlock(&mapping->i_mmap_mutex); | 753 | return false; |
836 | return referenced; | ||
837 | } | 754 | } |
838 | 755 | ||
839 | /** | 756 | /** |
@@ -851,41 +768,57 @@ int page_referenced(struct page *page, | |||
851 | struct mem_cgroup *memcg, | 768 | struct mem_cgroup *memcg, |
852 | unsigned long *vm_flags) | 769 | unsigned long *vm_flags) |
853 | { | 770 | { |
854 | int referenced = 0; | 771 | int ret; |
855 | int we_locked = 0; | 772 | int we_locked = 0; |
773 | struct page_referenced_arg pra = { | ||
774 | .mapcount = page_mapcount(page), | ||
775 | .memcg = memcg, | ||
776 | }; | ||
777 | struct rmap_walk_control rwc = { | ||
778 | .rmap_one = page_referenced_one, | ||
779 | .arg = (void *)&pra, | ||
780 | .anon_lock = page_lock_anon_vma_read, | ||
781 | }; | ||
856 | 782 | ||
857 | *vm_flags = 0; | 783 | *vm_flags = 0; |
858 | if (page_mapped(page) && page_rmapping(page)) { | 784 | if (!page_mapped(page)) |
859 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 785 | return 0; |
860 | we_locked = trylock_page(page); | 786 | |
861 | if (!we_locked) { | 787 | if (!page_rmapping(page)) |
862 | referenced++; | 788 | return 0; |
863 | goto out; | 789 | |
864 | } | 790 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
865 | } | 791 | we_locked = trylock_page(page); |
866 | if (unlikely(PageKsm(page))) | 792 | if (!we_locked) |
867 | referenced += page_referenced_ksm(page, memcg, | 793 | return 1; |
868 | vm_flags); | ||
869 | else if (PageAnon(page)) | ||
870 | referenced += page_referenced_anon(page, memcg, | ||
871 | vm_flags); | ||
872 | else if (page->mapping) | ||
873 | referenced += page_referenced_file(page, memcg, | ||
874 | vm_flags); | ||
875 | if (we_locked) | ||
876 | unlock_page(page); | ||
877 | } | 794 | } |
878 | out: | 795 | |
879 | return referenced; | 796 | /* |
797 | * If we are reclaiming on behalf of a cgroup, skip | ||
798 | * counting on behalf of references from different | ||
799 | * cgroups | ||
800 | */ | ||
801 | if (memcg) { | ||
802 | rwc.invalid_vma = invalid_page_referenced_vma; | ||
803 | } | ||
804 | |||
805 | ret = rmap_walk(page, &rwc); | ||
806 | *vm_flags = pra.vm_flags; | ||
807 | |||
808 | if (we_locked) | ||
809 | unlock_page(page); | ||
810 | |||
811 | return pra.referenced; | ||
880 | } | 812 | } |
881 | 813 | ||
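For context, a hedged sketch of how a caller consumes the reworked page_referenced(): the walk now accumulates results in page_referenced_arg, while the caller still only sees the aggregate count plus the OR-ed vm_flags. The helper below and the (page, is_locked, memcg, vm_flags) parameter order are illustrative assumptions, not part of this patch:

        /* Hedged usage sketch: a reclaim-style consumer of page_referenced(). */
        static bool page_looks_hot(struct page *page, struct mem_cgroup *memcg)
        {
                unsigned long vm_flags;
                int referenced;

                referenced = page_referenced(page, 0 /* !is_locked */, memcg, &vm_flags);

                /* e.g. keep pages referenced from an executable mapping around longer */
                return referenced > 1 || (referenced && (vm_flags & VM_EXEC));
        }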
882 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 814 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
883 | unsigned long address) | 815 | unsigned long address, void *arg) |
884 | { | 816 | { |
885 | struct mm_struct *mm = vma->vm_mm; | 817 | struct mm_struct *mm = vma->vm_mm; |
886 | pte_t *pte; | 818 | pte_t *pte; |
887 | spinlock_t *ptl; | 819 | spinlock_t *ptl; |
888 | int ret = 0; | 820 | int ret = 0; |
821 | int *cleaned = arg; | ||
889 | 822 | ||
890 | pte = page_check_address(page, mm, address, &ptl, 1); | 823 | pte = page_check_address(page, mm, address, &ptl, 1); |
891 | if (!pte) | 824 | if (!pte) |
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
904 | 837 | ||
905 | pte_unmap_unlock(pte, ptl); | 838 | pte_unmap_unlock(pte, ptl); |
906 | 839 | ||
907 | if (ret) | 840 | if (ret) { |
908 | mmu_notifier_invalidate_page(mm, address); | 841 | mmu_notifier_invalidate_page(mm, address); |
842 | (*cleaned)++; | ||
843 | } | ||
909 | out: | 844 | out: |
910 | return ret; | 845 | return SWAP_AGAIN; |
911 | } | 846 | } |
912 | 847 | ||
913 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 848 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) |
914 | { | 849 | { |
915 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 850 | if (vma->vm_flags & VM_SHARED) |
916 | struct vm_area_struct *vma; | 851 | return 0; |
917 | int ret = 0; | ||
918 | |||
919 | BUG_ON(PageAnon(page)); | ||
920 | 852 | ||
921 | mutex_lock(&mapping->i_mmap_mutex); | 853 | return 1; |
922 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
923 | if (vma->vm_flags & VM_SHARED) { | ||
924 | unsigned long address = vma_address(page, vma); | ||
925 | ret += page_mkclean_one(page, vma, address); | ||
926 | } | ||
927 | } | ||
928 | mutex_unlock(&mapping->i_mmap_mutex); | ||
929 | return ret; | ||
930 | } | 854 | } |
931 | 855 | ||
932 | int page_mkclean(struct page *page) | 856 | int page_mkclean(struct page *page) |
933 | { | 857 | { |
934 | int ret = 0; | 858 | int cleaned = 0; |
859 | struct address_space *mapping; | ||
860 | struct rmap_walk_control rwc = { | ||
861 | .arg = (void *)&cleaned, | ||
862 | .rmap_one = page_mkclean_one, | ||
863 | .invalid_vma = invalid_mkclean_vma, | ||
864 | }; | ||
935 | 865 | ||
936 | BUG_ON(!PageLocked(page)); | 866 | BUG_ON(!PageLocked(page)); |
937 | 867 | ||
938 | if (page_mapped(page)) { | 868 | if (!page_mapped(page)) |
939 | struct address_space *mapping = page_mapping(page); | 869 | return 0; |
940 | if (mapping) | ||
941 | ret = page_mkclean_file(mapping, page); | ||
942 | } | ||
943 | 870 | ||
944 | return ret; | 871 | mapping = page_mapping(page); |
872 | if (!mapping) | ||
873 | return 0; | ||
874 | |||
875 | rmap_walk(page, &rwc); | ||
876 | |||
877 | return cleaned; | ||
945 | } | 878 | } |
946 | EXPORT_SYMBOL_GPL(page_mkclean); | 879 | EXPORT_SYMBOL_GPL(page_mkclean); |
947 | 880 | ||
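page_mkclean() keeps its external contract: it expects a locked page and returns how many ptes were write-protected, now counted through the cleaned pointer passed as arg. A hedged sketch of a typical writeback-style caller, not taken from this patch:

        /* Hedged sketch: illustrative caller; page_mkclean() asserts PageLocked(). */
        static void writeback_style_caller(struct page *page)
        {
                lock_page(page);
                if (page_mkclean(page))
                        set_page_dirty(page);   /* it was mapped writable somewhere */
                unlock_page(page);
        }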
@@ -1177,17 +1110,17 @@ out: | |||
1177 | } | 1110 | } |
1178 | 1111 | ||
1179 | /* | 1112 | /* |
1180 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1113 | * @arg: enum ttu_flags will be passed to this argument |
1181 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. | ||
1182 | */ | 1114 | */ |
1183 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1184 | unsigned long address, enum ttu_flags flags) | 1116 | unsigned long address, void *arg) |
1185 | { | 1117 | { |
1186 | struct mm_struct *mm = vma->vm_mm; | 1118 | struct mm_struct *mm = vma->vm_mm; |
1187 | pte_t *pte; | 1119 | pte_t *pte; |
1188 | pte_t pteval; | 1120 | pte_t pteval; |
1189 | spinlock_t *ptl; | 1121 | spinlock_t *ptl; |
1190 | int ret = SWAP_AGAIN; | 1122 | int ret = SWAP_AGAIN; |
1123 | enum ttu_flags flags = (enum ttu_flags)arg; | ||
1191 | 1124 | ||
1192 | pte = page_check_address(page, mm, address, &ptl, 0); | 1125 | pte = page_check_address(page, mm, address, &ptl, 0); |
1193 | if (!pte) | 1126 | if (!pte) |
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1426 | return ret; | 1359 | return ret; |
1427 | } | 1360 | } |
1428 | 1361 | ||
1429 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1362 | static int try_to_unmap_nonlinear(struct page *page, |
1430 | { | 1363 | struct address_space *mapping, struct vm_area_struct *vma) |
1431 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1432 | |||
1433 | if (!maybe_stack) | ||
1434 | return false; | ||
1435 | |||
1436 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1437 | VM_STACK_INCOMPLETE_SETUP) | ||
1438 | return true; | ||
1439 | |||
1440 | return false; | ||
1441 | } | ||
1442 | |||
1443 | /** | ||
1444 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
1445 | * rmap method | ||
1446 | * @page: the page to unmap/unlock | ||
1447 | * @flags: action and flags | ||
1448 | * | ||
1449 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1450 | * contained in the anon_vma struct it points to. | ||
1451 | * | ||
1452 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1453 | * anonymous pages. | ||
1454 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1455 | * where the page was found will be held for write. So, we won't recheck | ||
1456 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1457 | * 'LOCKED. | ||
1458 | */ | ||
1459 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | ||
1460 | { | ||
1461 | struct anon_vma *anon_vma; | ||
1462 | pgoff_t pgoff; | ||
1463 | struct anon_vma_chain *avc; | ||
1464 | int ret = SWAP_AGAIN; | ||
1465 | |||
1466 | anon_vma = page_lock_anon_vma_read(page); | ||
1467 | if (!anon_vma) | ||
1468 | return ret; | ||
1469 | |||
1470 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1471 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1472 | struct vm_area_struct *vma = avc->vma; | ||
1473 | unsigned long address; | ||
1474 | |||
1475 | /* | ||
1476 | * During exec, a temporary VMA is setup and later moved. | ||
1477 | * The VMA is moved under the anon_vma lock but not the | ||
1478 | * page tables leading to a race where migration cannot | ||
1479 | * find the migration ptes. Rather than increasing the | ||
1480 | * locking requirements of exec(), migration skips | ||
1481 | * temporary VMAs until after exec() completes. | ||
1482 | */ | ||
1483 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && | ||
1484 | is_vma_temporary_stack(vma)) | ||
1485 | continue; | ||
1486 | |||
1487 | address = vma_address(page, vma); | ||
1488 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1489 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1490 | break; | ||
1491 | } | ||
1492 | |||
1493 | page_unlock_anon_vma_read(anon_vma); | ||
1494 | return ret; | ||
1495 | } | ||
1496 | |||
1497 | /** | ||
1498 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | ||
1499 | * @page: the page to unmap/unlock | ||
1500 | * @flags: action and flags | ||
1501 | * | ||
1502 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1503 | * contained in the address_space struct it points to. | ||
1504 | * | ||
1505 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1506 | * object-based pages. | ||
1507 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1508 | * where the page was found will be held for write. So, we won't recheck | ||
1509 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1510 | * 'LOCKED. | ||
1511 | */ | ||
1512 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | ||
1513 | { | 1364 | { |
1514 | struct address_space *mapping = page->mapping; | ||
1515 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1516 | struct vm_area_struct *vma; | ||
1517 | int ret = SWAP_AGAIN; | 1365 | int ret = SWAP_AGAIN; |
1518 | unsigned long cursor; | 1366 | unsigned long cursor; |
1519 | unsigned long max_nl_cursor = 0; | 1367 | unsigned long max_nl_cursor = 0; |
1520 | unsigned long max_nl_size = 0; | 1368 | unsigned long max_nl_size = 0; |
1521 | unsigned int mapcount; | 1369 | unsigned int mapcount; |
1522 | 1370 | ||
1523 | if (PageHuge(page)) | 1371 | list_for_each_entry(vma, |
1524 | pgoff = page->index << compound_order(page); | 1372 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1525 | 1373 | ||
1526 | mutex_lock(&mapping->i_mmap_mutex); | ||
1527 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
1528 | unsigned long address = vma_address(page, vma); | ||
1529 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1530 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1531 | goto out; | ||
1532 | } | ||
1533 | |||
1534 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
1535 | goto out; | ||
1536 | |||
1537 | /* | ||
1538 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1539 | * It's costly. Instead, later, page reclaim logic may call | ||
1540 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1541 | */ | ||
1542 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1543 | goto out; | ||
1544 | |||
1545 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
1546 | shared.nonlinear) { | ||
1547 | cursor = (unsigned long) vma->vm_private_data; | 1374 | cursor = (unsigned long) vma->vm_private_data; |
1548 | if (cursor > max_nl_cursor) | 1375 | if (cursor > max_nl_cursor) |
1549 | max_nl_cursor = cursor; | 1376 | max_nl_cursor = cursor; |
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1553 | } | 1380 | } |
1554 | 1381 | ||
1555 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | 1382 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
1556 | ret = SWAP_FAIL; | 1383 | return SWAP_FAIL; |
1557 | goto out; | ||
1558 | } | 1384 | } |
1559 | 1385 | ||
1560 | /* | 1386 | /* |
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1566 | */ | 1392 | */ |
1567 | mapcount = page_mapcount(page); | 1393 | mapcount = page_mapcount(page); |
1568 | if (!mapcount) | 1394 | if (!mapcount) |
1569 | goto out; | 1395 | return ret; |
1396 | |||
1570 | cond_resched(); | 1397 | cond_resched(); |
1571 | 1398 | ||
1572 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1399 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1574 | max_nl_cursor = CLUSTER_SIZE; | 1401 | max_nl_cursor = CLUSTER_SIZE; |
1575 | 1402 | ||
1576 | do { | 1403 | do { |
1577 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1404 | list_for_each_entry(vma, |
1578 | shared.nonlinear) { | 1405 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1406 | |||
1579 | cursor = (unsigned long) vma->vm_private_data; | 1407 | cursor = (unsigned long) vma->vm_private_data; |
1580 | while ( cursor < max_nl_cursor && | 1408 | while (cursor < max_nl_cursor && |
1581 | cursor < vma->vm_end - vma->vm_start) { | 1409 | cursor < vma->vm_end - vma->vm_start) { |
1582 | if (try_to_unmap_cluster(cursor, &mapcount, | 1410 | if (try_to_unmap_cluster(cursor, &mapcount, |
1583 | vma, page) == SWAP_MLOCK) | 1411 | vma, page) == SWAP_MLOCK) |
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1585 | cursor += CLUSTER_SIZE; | 1413 | cursor += CLUSTER_SIZE; |
1586 | vma->vm_private_data = (void *) cursor; | 1414 | vma->vm_private_data = (void *) cursor; |
1587 | if ((int)mapcount <= 0) | 1415 | if ((int)mapcount <= 0) |
1588 | goto out; | 1416 | return ret; |
1589 | } | 1417 | } |
1590 | vma->vm_private_data = (void *) max_nl_cursor; | 1418 | vma->vm_private_data = (void *) max_nl_cursor; |
1591 | } | 1419 | } |
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1600 | */ | 1428 | */ |
1601 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | 1429 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1602 | vma->vm_private_data = NULL; | 1430 | vma->vm_private_data = NULL; |
1603 | out: | 1431 | |
1604 | mutex_unlock(&mapping->i_mmap_mutex); | ||
1605 | return ret; | 1432 | return ret; |
1606 | } | 1433 | } |
1607 | 1434 | ||
1435 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | ||
1436 | { | ||
1437 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1438 | |||
1439 | if (!maybe_stack) | ||
1440 | return false; | ||
1441 | |||
1442 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1443 | VM_STACK_INCOMPLETE_SETUP) | ||
1444 | return true; | ||
1445 | |||
1446 | return false; | ||
1447 | } | ||
1448 | |||
1449 | static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) | ||
1450 | { | ||
1451 | return is_vma_temporary_stack(vma); | ||
1452 | } | ||
1453 | |||
1454 | static int page_not_mapped(struct page *page) | ||
1455 | { | ||
1456 | return !page_mapped(page); | ||
1457 | }; | ||
1458 | |||
1608 | /** | 1459 | /** |
1609 | * try_to_unmap - try to remove all page table mappings to a page | 1460 | * try_to_unmap - try to remove all page table mappings to a page |
1610 | * @page: the page to get unmapped | 1461 | * @page: the page to get unmapped |
@@ -1622,16 +1473,29 @@ out: | |||
1622 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1473 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
1623 | { | 1474 | { |
1624 | int ret; | 1475 | int ret; |
1476 | struct rmap_walk_control rwc = { | ||
1477 | .rmap_one = try_to_unmap_one, | ||
1478 | .arg = (void *)flags, | ||
1479 | .done = page_not_mapped, | ||
1480 | .file_nonlinear = try_to_unmap_nonlinear, | ||
1481 | .anon_lock = page_lock_anon_vma_read, | ||
1482 | }; | ||
1625 | 1483 | ||
1626 | BUG_ON(!PageLocked(page)); | ||
1627 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | 1484 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); |
1628 | 1485 | ||
1629 | if (unlikely(PageKsm(page))) | 1486 | /* |
1630 | ret = try_to_unmap_ksm(page, flags); | 1487 | * During exec, a temporary VMA is setup and later moved. |
1631 | else if (PageAnon(page)) | 1488 | * The VMA is moved under the anon_vma lock but not the |
1632 | ret = try_to_unmap_anon(page, flags); | 1489 | * page tables leading to a race where migration cannot |
1633 | else | 1490 | * find the migration ptes. Rather than increasing the |
1634 | ret = try_to_unmap_file(page, flags); | 1491 | * locking requirements of exec(), migration skips |
1492 | * temporary VMAs until after exec() completes. | ||
1493 | */ | ||
1494 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | ||
1495 | rwc.invalid_vma = invalid_migration_vma; | ||
1496 | |||
1497 | ret = rmap_walk(page, &rwc); | ||
1498 | |||
1635 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1499 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
1636 | ret = SWAP_SUCCESS; | 1500 | ret = SWAP_SUCCESS; |
1637 | return ret; | 1501 | return ret; |
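try_to_unmap() still reports the walk's outcome through the SWAP_* codes used above. A hedged, illustrative caller follows; the function name and flag handling are assumptions, while the return-code meanings come from the hunks themselves:

        /* Hedged sketch: how a caller interprets try_to_unmap() after the rework. */
        static bool unmap_or_give_up(struct page *page, enum ttu_flags flags)
        {
                switch (try_to_unmap(page, flags)) {
                case SWAP_SUCCESS:
                        /* all ptes are gone and page_mapped(page) is now false */
                        return true;
                case SWAP_MLOCK:
                        /* the page turned out to be mlocked; leave it mapped */
                        return false;
                case SWAP_AGAIN:
                case SWAP_FAIL:
                default:
                        /* some mappings survived (e.g. locked nonlinear vmas) */
                        return false;
                }
        }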
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1654 | */ | 1518 | */ |
1655 | int try_to_munlock(struct page *page) | 1519 | int try_to_munlock(struct page *page) |
1656 | { | 1520 | { |
1521 | int ret; | ||
1522 | struct rmap_walk_control rwc = { | ||
1523 | .rmap_one = try_to_unmap_one, | ||
1524 | .arg = (void *)TTU_MUNLOCK, | ||
1525 | .done = page_not_mapped, | ||
1526 | /* | ||
1527 | * We don't bother to try to find the munlocked page in | ||
1528 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
1529 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
1530 | */ | ||
1531 | .file_nonlinear = NULL, | ||
1532 | .anon_lock = page_lock_anon_vma_read, | ||
1533 | |||
1534 | }; | ||
1535 | |||
1657 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1536 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1658 | 1537 | ||
1659 | if (unlikely(PageKsm(page))) | 1538 | ret = rmap_walk(page, &rwc); |
1660 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | 1539 | return ret; |
1661 | else if (PageAnon(page)) | ||
1662 | return try_to_unmap_anon(page, TTU_MUNLOCK); | ||
1663 | else | ||
1664 | return try_to_unmap_file(page, TTU_MUNLOCK); | ||
1665 | } | 1540 | } |
1666 | 1541 | ||
1667 | void __put_anon_vma(struct anon_vma *anon_vma) | 1542 | void __put_anon_vma(struct anon_vma *anon_vma) |
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma) | |||
1674 | anon_vma_free(anon_vma); | 1549 | anon_vma_free(anon_vma); |
1675 | } | 1550 | } |
1676 | 1551 | ||
1677 | #ifdef CONFIG_MIGRATION | 1552 | static struct anon_vma *rmap_walk_anon_lock(struct page *page, |
1678 | /* | 1553 | struct rmap_walk_control *rwc) |
1679 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1680 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1681 | */ | ||
1682 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1683 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1684 | { | 1554 | { |
1685 | struct anon_vma *anon_vma; | 1555 | struct anon_vma *anon_vma; |
1686 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1556 | |
1687 | struct anon_vma_chain *avc; | 1557 | if (rwc->anon_lock) |
1688 | int ret = SWAP_AGAIN; | 1558 | return rwc->anon_lock(page); |
1689 | 1559 | ||
1690 | /* | 1560 | /* |
1691 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() | 1561 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1695 | */ | 1565 | */ |
1696 | anon_vma = page_anon_vma(page); | 1566 | anon_vma = page_anon_vma(page); |
1697 | if (!anon_vma) | 1567 | if (!anon_vma) |
1698 | return ret; | 1568 | return NULL; |
1569 | |||
1699 | anon_vma_lock_read(anon_vma); | 1570 | anon_vma_lock_read(anon_vma); |
1571 | return anon_vma; | ||
1572 | } | ||
1573 | |||
1574 | /* | ||
1575 | * rmap_walk_anon - do something to anonymous page using the object-based | ||
1576 | * rmap method | ||
1577 | * @page: the page to be handled | ||
1578 | * @rwc: control variable according to each walk type | ||
1579 | * | ||
1580 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1581 | * contained in the anon_vma struct it points to. | ||
1582 | * | ||
1583 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1584 | * where the page was found will be held for write. So, we won't recheck | ||
1585 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1586 | * LOCKED. | ||
1587 | */ | ||
1588 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | ||
1589 | { | ||
1590 | struct anon_vma *anon_vma; | ||
1591 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1592 | struct anon_vma_chain *avc; | ||
1593 | int ret = SWAP_AGAIN; | ||
1594 | |||
1595 | anon_vma = rmap_walk_anon_lock(page, rwc); | ||
1596 | if (!anon_vma) | ||
1597 | return ret; | ||
1598 | |||
1700 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1599 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1701 | struct vm_area_struct *vma = avc->vma; | 1600 | struct vm_area_struct *vma = avc->vma; |
1702 | unsigned long address = vma_address(page, vma); | 1601 | unsigned long address = vma_address(page, vma); |
1703 | ret = rmap_one(page, vma, address, arg); | 1602 | |
1603 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1604 | continue; | ||
1605 | |||
1606 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1704 | if (ret != SWAP_AGAIN) | 1607 | if (ret != SWAP_AGAIN) |
1705 | break; | 1608 | break; |
1609 | if (rwc->done && rwc->done(page)) | ||
1610 | break; | ||
1706 | } | 1611 | } |
1707 | anon_vma_unlock_read(anon_vma); | 1612 | anon_vma_unlock_read(anon_vma); |
1708 | return ret; | 1613 | return ret; |
1709 | } | 1614 | } |
1710 | 1615 | ||
1711 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | 1616 | /* |
1712 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1617 | * rmap_walk_file - do something to file page using the object-based rmap method |
1618 | * @page: the page to be handled | ||
1619 | * @rwc: control variable according to each walk type | ||
1620 | * | ||
1621 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1622 | * contained in the address_space struct it points to. | ||
1623 | * | ||
1624 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1625 | * where the page was found will be held for write. So, we won't recheck | ||
1626 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1627 | * LOCKED. | ||
1628 | */ | ||
1629 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | ||
1713 | { | 1630 | { |
1714 | struct address_space *mapping = page->mapping; | 1631 | struct address_space *mapping = page->mapping; |
1715 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1632 | pgoff_t pgoff = page->index << compound_order(page); |
1716 | struct vm_area_struct *vma; | 1633 | struct vm_area_struct *vma; |
1717 | int ret = SWAP_AGAIN; | 1634 | int ret = SWAP_AGAIN; |
1718 | 1635 | ||
1636 | /* | ||
1637 | * The page lock not only makes sure that page->mapping cannot | ||
1638 | * suddenly be NULLified by truncation, it makes sure that the | ||
1639 | * structure at mapping cannot be freed and reused yet, | ||
1640 | * so we can safely take mapping->i_mmap_mutex. | ||
1641 | */ | ||
1642 | VM_BUG_ON(!PageLocked(page)); | ||
1643 | |||
1719 | if (!mapping) | 1644 | if (!mapping) |
1720 | return ret; | 1645 | return ret; |
1721 | mutex_lock(&mapping->i_mmap_mutex); | 1646 | mutex_lock(&mapping->i_mmap_mutex); |
1722 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1647 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1723 | unsigned long address = vma_address(page, vma); | 1648 | unsigned long address = vma_address(page, vma); |
1724 | ret = rmap_one(page, vma, address, arg); | 1649 | |
1650 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1651 | continue; | ||
1652 | |||
1653 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1725 | if (ret != SWAP_AGAIN) | 1654 | if (ret != SWAP_AGAIN) |
1726 | break; | 1655 | goto done; |
1656 | if (rwc->done && rwc->done(page)) | ||
1657 | goto done; | ||
1727 | } | 1658 | } |
1728 | /* | 1659 | |
1729 | * No nonlinear handling: being always shared, nonlinear vmas | 1660 | if (!rwc->file_nonlinear) |
1730 | * never contain migration ptes. Decide what to do about this | 1661 | goto done; |
1731 | * limitation to linear when we need rmap_walk() on nonlinear. | 1662 | |
1732 | */ | 1663 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1664 | goto done; | ||
1665 | |||
1666 | ret = rwc->file_nonlinear(page, mapping, vma); | ||
1667 | |||
1668 | done: | ||
1733 | mutex_unlock(&mapping->i_mmap_mutex); | 1669 | mutex_unlock(&mapping->i_mmap_mutex); |
1734 | return ret; | 1670 | return ret; |
1735 | } | 1671 | } |
1736 | 1672 | ||
1737 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | 1673 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc) |
1738 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1739 | { | 1674 | { |
1740 | VM_BUG_ON(!PageLocked(page)); | ||
1741 | |||
1742 | if (unlikely(PageKsm(page))) | 1675 | if (unlikely(PageKsm(page))) |
1743 | return rmap_walk_ksm(page, rmap_one, arg); | 1676 | return rmap_walk_ksm(page, rwc); |
1744 | else if (PageAnon(page)) | 1677 | else if (PageAnon(page)) |
1745 | return rmap_walk_anon(page, rmap_one, arg); | 1678 | return rmap_walk_anon(page, rwc); |
1746 | else | 1679 | else |
1747 | return rmap_walk_file(page, rmap_one, arg); | 1680 | return rmap_walk_file(page, rwc); |
1748 | } | 1681 | } |
1749 | #endif /* CONFIG_MIGRATION */ | ||
1750 | 1682 | ||
1751 | #ifdef CONFIG_HUGETLB_PAGE | 1683 | #ifdef CONFIG_HUGETLB_PAGE |
1752 | /* | 1684 | /* |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 40 | unsigned long align, |
41 | unsigned long goal) | 41 | unsigned long goal) |
42 | { | 42 | { |
43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); | 43 | return memblock_virt_alloc_try_nid(size, align, goal, |
44 | BOOTMEM_ALLOC_ACCESSIBLE, node); | ||
44 | } | 45 | } |
45 | 46 | ||
46 | static void *vmemmap_buf; | 47 | static void *vmemmap_buf; |
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
226 | 227 | ||
227 | if (vmemmap_buf_start) { | 228 | if (vmemmap_buf_start) { |
228 | /* need to free left buf */ | 229 | /* need to free left buf */ |
229 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | 230 | memblock_free_early(__pa(vmemmap_buf), |
231 | vmemmap_buf_end - vmemmap_buf); | ||
230 | vmemmap_buf = NULL; | 232 | vmemmap_buf = NULL; |
231 | vmemmap_buf_end = NULL; | 233 | vmemmap_buf_end = NULL; |
232 | } | 234 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
69 | else | 69 | else |
70 | section = kzalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else { | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = memblock_virt_alloc_node(array_size, nid); |
73 | } | 73 | } |
74 | 74 | ||
75 | return section; | 75 | return section; |
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
279 | limit = goal + (1UL << PA_SECTION_SHIFT); | 279 | limit = goal + (1UL << PA_SECTION_SHIFT); |
280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | 280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
281 | again: | 281 | again: |
282 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | 282 | p = memblock_virt_alloc_try_nid_nopanic(size, |
283 | SMP_CACHE_BYTES, goal, limit); | 283 | SMP_CACHE_BYTES, goal, limit, |
284 | nid); | ||
284 | if (!p && limit) { | 285 | if (!p && limit) { |
285 | limit = 0; | 286 | limit = 0; |
286 | goto again; | 287 | goto again; |
@@ -331,7 +332,7 @@ static unsigned long * __init | |||
331 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 332 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
332 | unsigned long size) | 333 | unsigned long size) |
333 | { | 334 | { |
334 | return alloc_bootmem_node_nopanic(pgdat, size); | 335 | return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
335 | } | 336 | } |
336 | 337 | ||
337 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 338 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
376 | return map; | 377 | return map; |
377 | 378 | ||
378 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); | 379 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
379 | map = __alloc_bootmem_node_high(NODE_DATA(nid), size, | 380 | map = memblock_virt_alloc_try_nid(size, |
380 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 381 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
382 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
381 | return map; | 383 | return map; |
382 | } | 384 | } |
383 | void __init sparse_mem_maps_populate_node(struct page **map_map, | 385 | void __init sparse_mem_maps_populate_node(struct page **map_map, |
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
401 | } | 403 | } |
402 | 404 | ||
403 | size = PAGE_ALIGN(size); | 405 | size = PAGE_ALIGN(size); |
404 | map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, | 406 | map = memblock_virt_alloc_try_nid(size * map_count, |
405 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 407 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
408 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); | ||
406 | if (map) { | 409 | if (map) { |
407 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 410 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
408 | if (!present_section_nr(pnum)) | 411 | if (!present_section_nr(pnum)) |
@@ -545,7 +548,7 @@ void __init sparse_init(void) | |||
545 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. | 548 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. |
546 | */ | 549 | */ |
547 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; | 550 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; |
548 | usemap_map = alloc_bootmem(size); | 551 | usemap_map = memblock_virt_alloc(size, 0); |
549 | if (!usemap_map) | 552 | if (!usemap_map) |
550 | panic("can not allocate usemap_map\n"); | 553 | panic("can not allocate usemap_map\n"); |
551 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, | 554 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, |
@@ -553,7 +556,7 @@ void __init sparse_init(void) | |||
553 | 556 | ||
554 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 557 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
555 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | 558 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; |
556 | map_map = alloc_bootmem(size2); | 559 | map_map = memblock_virt_alloc(size2, 0); |
557 | if (!map_map) | 560 | if (!map_map) |
558 | panic("can not allocate map_map\n"); | 561 | panic("can not allocate map_map\n"); |
559 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, | 562 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, |
@@ -583,9 +586,9 @@ void __init sparse_init(void) | |||
583 | vmemmap_populate_print_last(); | 586 | vmemmap_populate_print_last(); |
584 | 587 | ||
585 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 588 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
586 | free_bootmem(__pa(map_map), size2); | 589 | memblock_free_early(__pa(map_map), size2); |
587 | #endif | 590 | #endif |
588 | free_bootmem(__pa(usemap_map), size); | 591 | memblock_free_early(__pa(usemap_map), size); |
589 | } | 592 | } |
590 | 593 | ||
591 | #ifdef CONFIG_MEMORY_HOTPLUG | 594 | #ifdef CONFIG_MEMORY_HOTPLUG |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | ||
35 | 34 | ||
36 | #include "internal.h" | 35 | #include "internal.h" |
37 | 36 | ||
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page) | |||
82 | 81 | ||
83 | static void put_compound_page(struct page *page) | 82 | static void put_compound_page(struct page *page) |
84 | { | 83 | { |
85 | if (unlikely(PageTail(page))) { | 84 | struct page *page_head; |
86 | /* __split_huge_page_refcount can run under us */ | ||
87 | struct page *page_head = compound_trans_head(page); | ||
88 | |||
89 | if (likely(page != page_head && | ||
90 | get_page_unless_zero(page_head))) { | ||
91 | unsigned long flags; | ||
92 | 85 | ||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
93 | /* | 88 | /* |
94 | * THP can not break up slab pages so avoid taking | 89 | * By the time all refcounts have been released |
95 | * compound_lock(). Slab performs non-atomic bit ops | 90 | * split_huge_page cannot run anymore from under us. |
96 | * on page->flags for better performance. In particular | ||
97 | * slab_unlock() in slub used to be a hot path. It is | ||
98 | * still hot on arches that do not support | ||
99 | * this_cpu_cmpxchg_double(). | ||
100 | */ | 91 | */ |
101 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 92 | if (PageHead(page)) |
102 | if (likely(PageTail(page))) { | 93 | __put_compound_page(page); |
103 | /* | 94 | else |
104 | * __split_huge_page_refcount | 95 | __put_single_page(page); |
105 | * cannot race here. | 96 | } |
106 | */ | 97 | return; |
107 | VM_BUG_ON(!PageHead(page_head)); | 98 | } |
108 | atomic_dec(&page->_mapcount); | 99 | |
109 | if (put_page_testzero(page_head)) | 100 | /* __split_huge_page_refcount can run under us */ |
110 | VM_BUG_ON(1); | 101 | page_head = compound_trans_head(page); |
111 | if (put_page_testzero(page_head)) | 102 | |
112 | __put_compound_page(page_head); | 103 | /* |
113 | return; | 104 | * THP can not break up slab pages so avoid taking |
114 | } else | 105 | * compound_lock() and skip the tail page refcounting (in |
115 | /* | 106 | * _mapcount) too. Slab performs non-atomic bit ops on |
116 | * __split_huge_page_refcount | 107 | * page->flags for better performance. In particular |
117 | * run before us, "page" was a | 108 | * slab_unlock() in slub used to be a hot path. It is still |
118 | * THP tail. The split | 109 | * hot on arches that do not support |
119 | * page_head has been freed | 110 | * this_cpu_cmpxchg_double(). |
120 | * and reallocated as slab or | 111 | * |
121 | * hugetlbfs page of smaller | 112 | * If "page" is part of a slab or hugetlbfs page it cannot be |
122 | * order (only possible if | 113 | * splitted and the head page cannot change from under us. And |
123 | * reallocated as slab on | 114 | * if "page" is part of a THP page under splitting, if the |
124 | * x86). | 115 | * head page pointed by the THP tail isn't a THP head anymore, |
125 | */ | 116 | * we'll find PageTail clear after smp_rmb() and we'll treat |
126 | goto skip_lock; | 117 | * it as a single page. |
127 | } | 118 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | ||
120 | /* | ||
121 | * If "page" is a THP tail, we must read the tail page | ||
122 | * flags after the head page flags. The | ||
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | ||
127 | smp_rmb(); | ||
128 | if (likely(PageTail(page))) { | ||
128 | /* | 129 | /* |
129 | * page_head wasn't a dangling pointer but it | 130 | * __split_huge_page_refcount cannot race |
130 | * may not be a head page anymore by the time | 131 | * here. |
131 | * we obtain the lock. That is ok as long as it | ||
132 | * can't be freed from under us. | ||
133 | */ | 132 | */ |
134 | flags = compound_lock_irqsave(page_head); | 133 | VM_BUG_ON(!PageHead(page_head)); |
135 | if (unlikely(!PageTail(page))) { | 134 | VM_BUG_ON(page_mapcount(page) != 0); |
136 | /* __split_huge_page_refcount run before us */ | 135 | if (put_page_testzero(page_head)) { |
137 | compound_unlock_irqrestore(page_head, flags); | 136 | /* |
138 | skip_lock: | 137 | * If this is the tail of a slab |
139 | if (put_page_testzero(page_head)) { | 138 | * compound page, the tail pin must |
140 | /* | 139 | * not be the last reference held on |
141 | * The head page may have been | 140 | * the page, because the PG_slab |
142 | * freed and reallocated as a | 141 | * cannot be cleared before all tail |
143 | * compound page of smaller | 142 | * pins (which skips the _mapcount |
144 | * order and then freed again. | 143 | * tail refcounting) have been |
145 | * All we know is that it | 144 | * released. For hugetlbfs the tail |
146 | * cannot have become: a THP | 145 | * pin may be the last reference on |
147 | * page, a compound page of | 146 | * the page instead, because |
148 | * higher order, a tail page. | 147 | * PageHeadHuge will not go away until |
149 | * That is because we still | 148 | * the compound page enters the buddy |
150 | * hold the refcount of the | 149 | * allocator. |
151 | * split THP tail and | 150 | */ |
152 | * page_head was the THP head | 151 | VM_BUG_ON(PageSlab(page_head)); |
153 | * before the split. | 152 | __put_compound_page(page_head); |
154 | */ | ||
155 | if (PageHead(page_head)) | ||
156 | __put_compound_page(page_head); | ||
157 | else | ||
158 | __put_single_page(page_head); | ||
159 | } | ||
160 | out_put_single: | ||
161 | if (put_page_testzero(page)) | ||
162 | __put_single_page(page); | ||
163 | return; | ||
164 | } | 153 | } |
165 | VM_BUG_ON(page_head != page->first_page); | 154 | return; |
155 | } else | ||
166 | /* | 156 | /* |
167 | * We can release the refcount taken by | 157 | * __split_huge_page_refcount run before us, |
168 | * get_page_unless_zero() now that | 158 | * "page" was a THP tail. The split page_head |
169 | * __split_huge_page_refcount() is blocked on | 159 | * has been freed and reallocated as slab or |
170 | * the compound_lock. | 160 | * hugetlbfs page of smaller order (only |
161 | * possible if reallocated as slab on x86). | ||
171 | */ | 162 | */ |
172 | if (put_page_testzero(page_head)) | 163 | goto out_put_single; |
173 | VM_BUG_ON(1); | 164 | } |
174 | /* __split_huge_page_refcount will wait now */ | ||
175 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
176 | atomic_dec(&page->_mapcount); | ||
177 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
178 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
179 | compound_unlock_irqrestore(page_head, flags); | ||
180 | 165 | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
167 | unsigned long flags; | ||
168 | |||
169 | /* | ||
170 | * page_head wasn't a dangling pointer but it may not | ||
171 | * be a head page anymore by the time we obtain the | ||
172 | * lock. That is ok as long as it can't be freed from | ||
173 | * under us. | ||
174 | */ | ||
175 | flags = compound_lock_irqsave(page_head); | ||
176 | if (unlikely(!PageTail(page))) { | ||
177 | /* __split_huge_page_refcount run before us */ | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
181 | if (put_page_testzero(page_head)) { | 179 | if (put_page_testzero(page_head)) { |
180 | /* | ||
181 | * The head page may have been freed | ||
182 | * and reallocated as a compound page | ||
183 | * of smaller order and then freed | ||
184 | * again. All we know is that it | ||
185 | * cannot have become: a THP page, a | ||
186 | * compound page of higher order, a | ||
187 | * tail page. That is because we | ||
188 | * still hold the refcount of the | ||
189 | * split THP tail and page_head was | ||
190 | * the THP head before the split. | ||
191 | */ | ||
182 | if (PageHead(page_head)) | 192 | if (PageHead(page_head)) |
183 | __put_compound_page(page_head); | 193 | __put_compound_page(page_head); |
184 | else | 194 | else |
185 | __put_single_page(page_head); | 195 | __put_single_page(page_head); |
186 | } | 196 | } |
187 | } else { | 197 | out_put_single: |
188 | /* page_head is a dangling pointer */ | 198 | if (put_page_testzero(page)) |
189 | VM_BUG_ON(PageTail(page)); | 199 | __put_single_page(page); |
190 | goto out_put_single; | 200 | return; |
191 | } | 201 | } |
192 | } else if (put_page_testzero(page)) { | 202 | VM_BUG_ON(page_head != page->first_page); |
193 | if (PageHead(page)) | 203 | /* |
194 | __put_compound_page(page); | 204 | * We can release the refcount taken by |
195 | else | 205 | * get_page_unless_zero() now that |
196 | __put_single_page(page); | 206 | * __split_huge_page_refcount() is blocked on the |
207 | * compound_lock. | ||
208 | */ | ||
209 | if (put_page_testzero(page_head)) | ||
210 | VM_BUG_ON(1); | ||
211 | /* __split_huge_page_refcount will wait now */ | ||
212 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
213 | atomic_dec(&page->_mapcount); | ||
214 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
215 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
216 | compound_unlock_irqrestore(page_head, flags); | ||
217 | |||
218 | if (put_page_testzero(page_head)) { | ||
219 | if (PageHead(page_head)) | ||
220 | __put_compound_page(page_head); | ||
221 | else | ||
222 | __put_single_page(page_head); | ||
223 | } | ||
224 | } else { | ||
225 | /* page_head is a dangling pointer */ | ||
226 | VM_BUG_ON(PageTail(page)); | ||
227 | goto out_put_single; | ||
197 | } | 228 | } |
198 | } | 229 | } |
199 | 230 | ||
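Both this hunk and the __get_page_tail() hunk below now branch on whether the compound page's tails are individually refcounted. The predicate itself is not shown in this diff; going by the open-coded test it replaces below (PageSlab(page_head) || PageHeadHuge(page_head)), a plausible sketch of the mm/internal.h helper is:

/* Sketch only, assuming the helper mirrors the open-coded check it replaces:
 * slab and hugetlbfs heads never refcount their tail pages, so a put on one
 * of their tails can be redirected straight at the head page.
 */
static inline bool __compound_tail_refcounted(struct page *page)
{
	return !PageSlab(page) && !PageHeadHuge(page);
}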
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page) | |||
221 | * split_huge_page(). | 252 | * split_huge_page(). |
222 | */ | 253 | */ |
223 | unsigned long flags; | 254 | unsigned long flags; |
224 | bool got = false; | 255 | bool got; |
225 | struct page *page_head = compound_trans_head(page); | 256 | struct page *page_head = compound_trans_head(page); |
226 | 257 | ||
227 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 258 | /* Ref to put_compound_page() comment. */ |
228 | /* Ref to put_compound_page() comment. */ | 259 | if (!__compound_tail_refcounted(page_head)) { |
229 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 260 | smp_rmb(); |
230 | if (likely(PageTail(page))) { | 261 | if (likely(PageTail(page))) { |
231 | /* | 262 | /* |
232 | * This is a hugetlbfs page or a slab | 263 | * This is a hugetlbfs page or a slab |
233 | * page. __split_huge_page_refcount | 264 | * page. __split_huge_page_refcount |
234 | * cannot race here. | 265 | * cannot race here. |
235 | */ | 266 | */ |
236 | VM_BUG_ON(!PageHead(page_head)); | 267 | VM_BUG_ON(!PageHead(page_head)); |
237 | __get_page_tail_foll(page, false); | 268 | __get_page_tail_foll(page, true); |
238 | return true; | 269 | return true; |
239 | } else { | 270 | } else { |
240 | /* | 271 | /* |
241 | * __split_huge_page_refcount run | 272 | * __split_huge_page_refcount run |
242 | * before us, "page" was a THP | 273 | * before us, "page" was a THP |
243 | * tail. The split page_head has been | 274 | * tail. The split page_head has been |
244 | * freed and reallocated as slab or | 275 | * freed and reallocated as slab or |
245 | * hugetlbfs page of smaller order | 276 | * hugetlbfs page of smaller order |
246 | * (only possible if reallocated as | 277 | * (only possible if reallocated as |
247 | * slab on x86). | 278 | * slab on x86). |
248 | */ | 279 | */ |
249 | put_page(page_head); | 280 | return false; |
250 | return false; | ||
251 | } | ||
252 | } | 281 | } |
282 | } | ||
253 | 283 | ||
284 | got = false; | ||
285 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
254 | /* | 286 | /* |
255 | * page_head wasn't a dangling pointer but it | 287 | * page_head wasn't a dangling pointer but it |
256 | * may not be a head page anymore by the time | 288 | * may not be a head page anymore by the time |
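One detail worth calling out in the hunk above: __get_page_tail_foll(page, false) becomes __get_page_tail_foll(page, true) because the slab/hugetlbfs fast path now runs before the get_page_unless_zero(page_head) block, so the reference on the head has to be taken inside the helper itself. A rough sketch of that helper, as assumed in mm/internal.h (it is not part of this diff):

/* Assumed shape of the helper: the boolean chooses whether the head
 * page's _count is elevated in addition to the tail-page bookkeeping
 * done by get_huge_page_tail().
 */
static inline void __get_page_tail_foll(struct page *page, bool get_page_head)
{
	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
	if (get_page_head)
		atomic_inc(&page->first_page->_count);
	get_huge_page_tail(page);
}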
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) | |||
404 | return mapping; | 404 | return mapping; |
405 | } | 405 | } |
406 | 406 | ||
407 | int overcommit_ratio_handler(struct ctl_table *table, int write, | ||
408 | void __user *buffer, size_t *lenp, | ||
409 | loff_t *ppos) | ||
410 | { | ||
411 | int ret; | ||
412 | |||
413 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
414 | if (ret == 0 && write) | ||
415 | sysctl_overcommit_kbytes = 0; | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | int overcommit_kbytes_handler(struct ctl_table *table, int write, | ||
420 | void __user *buffer, size_t *lenp, | ||
421 | loff_t *ppos) | ||
422 | { | ||
423 | int ret; | ||
424 | |||
425 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | ||
426 | if (ret == 0 && write) | ||
427 | sysctl_overcommit_ratio = 0; | ||
428 | return ret; | ||
429 | } | ||
430 | |||
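The two handlers above keep vm.overcommit_ratio and vm.overcommit_kbytes mutually exclusive: writing either knob through /proc/sys/vm clears the other, so only one policy input is ever in effect. The corresponding sysctl table hookup lives outside mm/ and is not part of this diff; it would look roughly like the following (the array name here is illustrative):

/* Illustrative ctl_table entries; the real hookup is in kernel/sysctl.c. */
static struct ctl_table overcommit_table[] = {
	{
		.procname	= "overcommit_ratio",
		.data		= &sysctl_overcommit_ratio,
		.maxlen		= sizeof(sysctl_overcommit_ratio),
		.mode		= 0644,
		.proc_handler	= overcommit_ratio_handler,
	},
	{
		.procname	= "overcommit_kbytes",
		.data		= &sysctl_overcommit_kbytes,
		.maxlen		= sizeof(sysctl_overcommit_kbytes),
		.mode		= 0644,
		.proc_handler	= overcommit_kbytes_handler,
	},
	{ }
};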
407 | /* | 431 | /* |
408 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | 432 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used |
409 | */ | 433 | */ |
410 | unsigned long vm_commit_limit(void) | 434 | unsigned long vm_commit_limit(void) |
411 | { | 435 | { |
412 | return ((totalram_pages - hugetlb_total_pages()) | 436 | unsigned long allowed; |
413 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 437 | |
438 | if (sysctl_overcommit_kbytes) | ||
439 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); | ||
440 | else | ||
441 | allowed = ((totalram_pages - hugetlb_total_pages()) | ||
442 | * sysctl_overcommit_ratio / 100); | ||
443 | allowed += total_swap_pages; | ||
444 | |||
445 | return allowed; | ||
414 | } | 446 | } |
415 | 447 | ||
416 | 448 | ||
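A quick worked example of the new vm_commit_limit(), assuming 4 KiB pages (PAGE_SHIFT == 12): with vm.overcommit_kbytes = 1048576, the limit becomes 1048576 >> (12 - 10) = 262144 pages, i.e. a fixed 1 GiB of commit plus total_swap_pages, independent of how much RAM the machine has. With vm.overcommit_kbytes left at 0, the old percentage path still applies: (totalram_pages - hugetlb_total_pages()) * vm.overcommit_ratio / 100, again plus swap.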
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..e4f0db2a3eae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Walk a vmap address to the struct page it maps. | 223 | * Walk a vmap address to the physical pfn it maps to. |
224 | */ | 224 | */ |
225 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 225 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) |
226 | { | 226 | { |
227 | unsigned long addr = (unsigned long) vmalloc_addr; | 227 | unsigned long addr = (unsigned long) vmalloc_addr; |
228 | struct page *page = NULL; | 228 | unsigned long pfn = 0; |
229 | pgd_t *pgd = pgd_offset_k(addr); | 229 | pgd_t *pgd = pgd_offset_k(addr); |
230 | 230 | ||
231 | /* | 231 | /* |
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) | |||
244 | ptep = pte_offset_map(pmd, addr); | 244 | ptep = pte_offset_map(pmd, addr); |
245 | pte = *ptep; | 245 | pte = *ptep; |
246 | if (pte_present(pte)) | 246 | if (pte_present(pte)) |
247 | page = pte_page(pte); | 247 | pfn = pte_pfn(pte); |
248 | pte_unmap(ptep); | 248 | pte_unmap(ptep); |
249 | } | 249 | } |
250 | } | 250 | } |
251 | } | 251 | } |
252 | return page; | 252 | return pfn; |
253 | } | 253 | } |
254 | EXPORT_SYMBOL(vmalloc_to_page); | 254 | EXPORT_SYMBOL(vmalloc_to_pfn); |
255 | 255 | ||
256 | /* | 256 | /* |
257 | * Map a vmalloc()-space virtual address to the physical page frame number. | 257 | * Map a vmalloc()-space virtual address to the struct page. |
258 | */ | 258 | */ |
259 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | 259 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
260 | { | 260 | { |
261 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); | 261 | return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); |
262 | } | 262 | } |
263 | EXPORT_SYMBOL(vmalloc_to_pfn); | 263 | EXPORT_SYMBOL(vmalloc_to_page); |
264 | 264 | ||
265 | 265 | ||
266 | /*** Global kva allocator ***/ | 266 | /*** Global kva allocator ***/ |
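For callers nothing changes with the vmalloc_to_page()/vmalloc_to_pfn() inversion above; only the layering flips, with the page-table walk now living in vmalloc_to_pfn() and vmalloc_to_page() reduced to pfn_to_page() on its result. A minimal, purely illustrative caller (the function name is made up for this sketch):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Hypothetical demo: both lookups must agree on the backing frame. */
static void vmalloc_lookup_demo(void)
{
	void *buf = vmalloc(PAGE_SIZE);

	if (!buf)
		return;

	/* vmalloc_to_pfn() walks the kernel page tables;
	 * vmalloc_to_page() is now just pfn_to_page() of that pfn.
	 */
	WARN_ON(vmalloc_to_page(buf) != pfn_to_page(vmalloc_to_pfn(buf)));
	vfree(buf);
}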