author     H. Peter Anvin <hpa@linux.intel.com>  2014-02-07 14:27:30 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>  2014-02-07 14:27:30 -0500
commit     a3b072cd180c12e8fe0ece9487b9065808327640 (patch)
tree       62b982041be84748852d77cdf6ca5639ef40858f /mm
parent     75a1ba5b2c529db60ca49626bcaf0bddf4548438 (diff)
parent     081cd62a010f97b5bc1d2b0cd123c5abc692b68a (diff)

Merge tag 'efi-urgent' into x86/urgent

* Avoid WARN_ON() when mapping BGRT on Baytrail (EFI 32-bit)

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 25
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/balloon_compaction.c | 4
-rw-r--r--  mm/bounce.c | 44
-rw-r--r--  mm/cleancache.c | 6
-rw-r--r--  mm/compaction.c | 68
-rw-r--r--  mm/filemap.c | 58
-rw-r--r--  mm/huge_memory.c | 60
-rw-r--r--  mm/hugetlb.c | 56
-rw-r--r--  mm/hugetlb_cgroup.c | 24
-rw-r--r--  mm/hwpoison-inject.c | 2
-rw-r--r--  mm/internal.h | 24
-rw-r--r--  mm/ksm.c | 135
-rw-r--r--  mm/memblock.c | 393
-rw-r--r--  mm/memcontrol.c | 968
-rw-r--r--  mm/memory-failure.c | 31
-rw-r--r--  mm/memory.c | 26
-rw-r--r--  mm/memory_hotplug.c | 13
-rw-r--r--  mm/mempolicy.c | 32
-rw-r--r--  mm/migrate.c | 99
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 126
-rw-r--r--  mm/mm_init.c | 3
-rw-r--r--  mm/mmap.c | 64
-rw-r--r--  mm/mmu_notifier.c | 3
-rw-r--r--  mm/mprotect.c | 3
-rw-r--r--  mm/nobootmem.c | 35
-rw-r--r--  mm/nommu.c | 1
-rw-r--r--  mm/oom_kill.c | 65
-rw-r--r--  mm/page-writeback.c | 57
-rw-r--r--  mm/page_alloc.c | 191
-rw-r--r--  mm/page_cgroup.c | 7
-rw-r--r--  mm/page_io.c | 14
-rw-r--r--  mm/percpu.c | 42
-rw-r--r--  mm/readahead.c | 15
-rw-r--r--  mm/rmap.c | 590
-rw-r--r--  mm/shmem.c | 65
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slab.h | 26
-rw-r--r--  mm/slab_common.c | 89
-rw-r--r--  mm/slub.c | 75
-rw-r--r--  mm/sparse-vmemmap.c | 6
-rw-r--r--  mm/sparse.c | 27
-rw-r--r--  mm/swap.c | 294
-rw-r--r--  mm/swap_state.c | 16
-rw-r--r--  mm/swapfile.c | 11
-rw-r--r--  mm/util.c | 36
-rw-r--r--  mm/vmpressure.c | 26
-rw-r--r--  mm/vmscan.c | 87
-rw-r--r--  mm/zsmalloc.c | 1106
-rw-r--r--  mm/zswap.c | 4
51 files changed, 3496 insertions, 1666 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 723bbe04a0b0..2d9f1504d75e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY
552 it can be cleared by hands. 552 it can be cleared by hands.
553 553
554 See Documentation/vm/soft-dirty.txt for more details. 554 See Documentation/vm/soft-dirty.txt for more details.
555
556config ZSMALLOC
557 bool "Memory allocator for compressed pages"
558 depends on MMU
559 default n
560 help
561 zsmalloc is a slab-based memory allocator designed to store
562 compressed RAM pages. zsmalloc uses virtual memory mapping
563 in order to reduce fragmentation. However, this results in a
564 non-standard allocator interface where a handle, not a pointer, is
565 returned by an alloc(). This handle must be mapped in order to
566 access the allocated space.
567
568config PGTABLE_MAPPING
569 bool "Use page table mapping to access object in zsmalloc"
570 depends on ZSMALLOC
571 help
572 By default, zsmalloc uses a copy-based object mapping method to
573 access allocations that span two pages. However, if a particular
574 architecture (ex, ARM) performs VM mapping faster than copying,
575 then you should select this. This causes zsmalloc to use page table
576 mapping rather than copying for object mapping.
577
578 You can check speed with zsmalloc benchmark[1].
579 [1] https://github.com/spartacus06/zsmalloc
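
The ZSMALLOC help text above describes a handle-based interface: an allocation returns an opaque handle rather than a pointer, and the handle must be mapped before the memory can be touched. A minimal caller sketch, assuming the zsmalloc API of this kernel generation (zs_create_pool(), zs_malloc(), zs_map_object()/zs_unmap_object(), zs_free()); it is illustrative only and not part of this patch:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

/* Hypothetical caller showing the handle-based allocation flow. */
static int zsmalloc_demo(void)
{
	struct zs_pool *pool;
	unsigned long handle;
	void *obj;

	pool = zs_create_pool(GFP_KERNEL);
	if (!pool)
		return -ENOMEM;

	/* zs_malloc() hands back an opaque handle, not a kernel pointer. */
	handle = zs_malloc(pool, 128);
	if (!handle) {
		zs_destroy_pool(pool);
		return -ENOMEM;
	}

	/* The handle has to be mapped before the memory can be accessed... */
	obj = zs_map_object(pool, handle, ZS_MM_RW);
	memset(obj, 0, 128);
	/* ...and unmapped again promptly; the mapping is short-lived. */
	zs_unmap_object(pool, handle);

	zs_free(pool, handle);
	zs_destroy_pool(pool);
	return 0;
}
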
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..310c90a09264 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
60obj-$(CONFIG_CLEANCACHE) += cleancache.o 60obj-$(CONFIG_CLEANCACHE) += cleancache.o
61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
62obj-$(CONFIG_ZBUD) += zbud.o 62obj-$(CONFIG_ZBUD) += zbud.o
63obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..6e45a5074bf0 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page)
267 put_page(page); 267 put_page(page);
268 } else { 268 } else {
269 WARN_ON(1); 269 WARN_ON(1);
270 dump_page(page); 270 dump_page(page, "not movable balloon page");
271 } 271 }
272 unlock_page(page); 272 unlock_page(page);
273} 273}
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage,
287 BUG_ON(!trylock_page(newpage)); 287 BUG_ON(!trylock_page(newpage));
288 288
289 if (WARN_ON(!__is_movable_balloon_page(page))) { 289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page); 290 dump_page(page, "not movable balloon page");
291 unlock_page(newpage); 291 unlock_page(newpage);
292 return rc; 292 return rc;
293 } 293 }
diff --git a/mm/bounce.c b/mm/bounce.c
index 5a7d58fb883b..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -98,27 +98,24 @@ int init_emergency_isa_pool(void)
98static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 98static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
99{ 99{
100 unsigned char *vfrom; 100 unsigned char *vfrom;
101 struct bio_vec *tovec, *fromvec; 101 struct bio_vec tovec, *fromvec = from->bi_io_vec;
102 int i; 102 struct bvec_iter iter;
103 103
104 bio_for_each_segment(tovec, to, i) { 104 bio_for_each_segment(tovec, to, iter) {
105 fromvec = from->bi_io_vec + i; 105 if (tovec.bv_page != fromvec->bv_page) {
106 106 /*
107 /* 107 * fromvec->bv_offset and fromvec->bv_len might have
108 * not bounced 108 * been modified by the block layer, so use the original
109 */ 109 * copy, bounce_copy_vec already uses tovec->bv_len
110 if (tovec->bv_page == fromvec->bv_page) 110 */
111 continue; 111 vfrom = page_address(fromvec->bv_page) +
112 112 tovec.bv_offset;
113 /* 113
114 * fromvec->bv_offset and fromvec->bv_len might have been 114 bounce_copy_vec(&tovec, vfrom);
115 * modified by the block layer, so use the original copy, 115 flush_dcache_page(tovec.bv_page);
116 * bounce_copy_vec already uses tovec->bv_len 116 }
117 */
118 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
119 117
120 bounce_copy_vec(tovec, vfrom); 118 fromvec++;
121 flush_dcache_page(tovec->bv_page);
122 } 119 }
123} 120}
124 121
@@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
201{ 198{
202 struct bio *bio; 199 struct bio *bio;
203 int rw = bio_data_dir(*bio_orig); 200 int rw = bio_data_dir(*bio_orig);
204 struct bio_vec *to, *from; 201 struct bio_vec *to, from;
202 struct bvec_iter iter;
205 unsigned i; 203 unsigned i;
206 204
207 if (force) 205 if (force)
208 goto bounce; 206 goto bounce;
209 bio_for_each_segment(from, *bio_orig, i) 207 bio_for_each_segment(from, *bio_orig, iter)
210 if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) 208 if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
211 goto bounce; 209 goto bounce;
212 210
213 return; 211 return;
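
The mm/bounce.c hunk above is part of the immutable-biovec conversion: bio_for_each_segment() now walks a struct bvec_iter and yields each struct bio_vec by value instead of a pointer plus integer index. A standalone sketch of the new iteration idiom (not taken from this patch):

#include <linux/bio.h>

/* Count the bytes carried by a bio using the bvec_iter-based walk. */
static unsigned int bio_payload_bytes(struct bio *bio)
{
	struct bio_vec bvec;	/* filled in by value on each step */
	struct bvec_iter iter;	/* replaces the old integer index */
	unsigned int bytes = 0;

	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;

	return bytes;
}
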
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5875f48ce279..d0eac4350403 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page)
237 goto out; 237 goto out;
238 } 238 }
239 239
240 VM_BUG_ON(!PageLocked(page)); 240 VM_BUG_ON_PAGE(!PageLocked(page), page);
241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
242 if (fake_pool_id < 0) 242 if (fake_pool_id < 0)
243 goto out; 243 goto out;
@@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page)
279 return; 279 return;
280 } 280 }
281 281
282 VM_BUG_ON(!PageLocked(page)); 282 VM_BUG_ON_PAGE(!PageLocked(page), page);
283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
284 if (fake_pool_id < 0) 284 if (fake_pool_id < 0)
285 return; 285 return;
@@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping,
318 if (pool_id < 0) 318 if (pool_id < 0)
319 return; 319 return;
320 320
321 VM_BUG_ON(!PageLocked(page)); 321 VM_BUG_ON_PAGE(!PageLocked(page), page);
322 if (cleancache_get_key(mapping->host, &key) >= 0) { 322 if (cleancache_get_key(mapping->host, &key) >= 0) {
323 cleancache_ops->invalidate_page(pool_id, 323 cleancache_ops->invalidate_page(pool_id,
324 key, page->index); 324 key, page->index);
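
The VM_BUG_ON(...) to VM_BUG_ON_PAGE(..., page) conversions in this and the following hunks attach the affected page to the assertion so a failure also dumps the page's state via dump_page(). Roughly, the macro behaves like the sketch below; this is illustrative, not the literal <linux/mmdebug.h> definition:

/* Illustrative only -- not the exact upstream macro text. */
#define VM_BUG_ON_PAGE(cond, page)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_page(page, "VM_BUG_ON_PAGE");		\
			BUG();						\
		}							\
	} while (0)
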
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..b48c5259ea33 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
459 unsigned long flags; 459 unsigned long flags;
460 bool locked = false; 460 bool locked = false;
461 struct page *page = NULL, *valid_page = NULL; 461 struct page *page = NULL, *valid_page = NULL;
462 bool skipped_async_unsuitable = false;
462 463
463 /* 464 /*
464 * Ensure that there are not too many pages isolated from the LRU 465 * Ensure that there are not too many pages isolated from the LRU
@@ -522,7 +523,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
522 if (!isolation_suitable(cc, page)) 523 if (!isolation_suitable(cc, page))
523 goto next_pageblock; 524 goto next_pageblock;
524 525
525 /* Skip if free */ 526 /*
527 * Skip if free. page_order cannot be used without zone->lock
528 * as nothing prevents parallel allocations or buddy merging.
529 */
526 if (PageBuddy(page)) 530 if (PageBuddy(page))
527 continue; 531 continue;
528 532
@@ -534,6 +538,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
534 if (!cc->sync && last_pageblock_nr != pageblock_nr && 538 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
535 !migrate_async_suitable(get_pageblock_migratetype(page))) { 539 !migrate_async_suitable(get_pageblock_migratetype(page))) {
536 cc->finished_update_migrate = true; 540 cc->finished_update_migrate = true;
541 skipped_async_unsuitable = true;
537 goto next_pageblock; 542 goto next_pageblock;
538 } 543 }
539 544
@@ -599,7 +604,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
599 if (__isolate_lru_page(page, mode) != 0) 604 if (__isolate_lru_page(page, mode) != 0)
600 continue; 605 continue;
601 606
602 VM_BUG_ON(PageTransCompound(page)); 607 VM_BUG_ON_PAGE(PageTransCompound(page), page);
603 608
604 /* Successfully isolated */ 609 /* Successfully isolated */
605 cc->finished_update_migrate = true; 610 cc->finished_update_migrate = true;
@@ -627,8 +632,13 @@ next_pageblock:
627 if (locked) 632 if (locked)
628 spin_unlock_irqrestore(&zone->lru_lock, flags); 633 spin_unlock_irqrestore(&zone->lru_lock, flags);
629 634
630 /* Update the pageblock-skip if the whole pageblock was scanned */ 635 /*
631 if (low_pfn == end_pfn) 636 * Update the pageblock-skip information and cached scanner pfn,
637 * if the whole pageblock was scanned without isolating any page.
638 * This is not done when pageblock was skipped due to being unsuitable
639 * for async compaction, so that eventual sync compaction can try.
640 */
641 if (low_pfn == end_pfn && !skipped_async_unsuitable)
632 update_pageblock_skip(cc, valid_page, nr_isolated, true); 642 update_pageblock_skip(cc, valid_page, nr_isolated, true);
633 643
634 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 644 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +670,7 @@ static void isolate_freepages(struct zone *zone,
660 * is the end of the pageblock the migration scanner is using. 670 * is the end of the pageblock the migration scanner is using.
661 */ 671 */
662 pfn = cc->free_pfn; 672 pfn = cc->free_pfn;
663 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 673 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
664 674
665 /* 675 /*
666 * Take care that if the migration scanner is at the end of the zone 676 * Take care that if the migration scanner is at the end of the zone
@@ -676,7 +686,7 @@ static void isolate_freepages(struct zone *zone,
676 * pages on cc->migratepages. We stop searching if the migrate 686 * pages on cc->migratepages. We stop searching if the migrate
677 * and free page scanners meet or enough free pages are isolated. 687 * and free page scanners meet or enough free pages are isolated.
678 */ 688 */
679 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 689 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
680 pfn -= pageblock_nr_pages) { 690 pfn -= pageblock_nr_pages) {
681 unsigned long isolated; 691 unsigned long isolated;
682 692
@@ -738,7 +748,14 @@ static void isolate_freepages(struct zone *zone,
738 /* split_free_page does not map the pages */ 748 /* split_free_page does not map the pages */
739 map_pages(freelist); 749 map_pages(freelist);
740 750
741 cc->free_pfn = high_pfn; 751 /*
752 * If we crossed the migrate scanner, we want to keep it that way
753 * so that compact_finished() may detect this
754 */
755 if (pfn < low_pfn)
756 cc->free_pfn = max(pfn, zone->zone_start_pfn);
757 else
758 cc->free_pfn = high_pfn;
742 cc->nr_freepages = nr_freepages; 759 cc->nr_freepages = nr_freepages;
743} 760}
744 761
@@ -837,6 +854,10 @@ static int compact_finished(struct zone *zone,
837 854
838 /* Compaction run completes if the migrate and free scanner meet */ 855 /* Compaction run completes if the migrate and free scanner meet */
839 if (cc->free_pfn <= cc->migrate_pfn) { 856 if (cc->free_pfn <= cc->migrate_pfn) {
857 /* Let the next compaction start anew. */
858 zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
859 zone->compact_cached_free_pfn = zone_end_pfn(zone);
860
840 /* 861 /*
841 * Mark that the PG_migrate_skip information should be cleared 862 * Mark that the PG_migrate_skip information should be cleared
842 * by kswapd when it goes to sleep. kswapd does not set the 863 * by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +968,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
947 } 968 }
948 969
949 /* 970 /*
971 * Clear pageblock skip if there were failures recently and compaction
972 * is about to be retried after being deferred. kswapd does not do
973 * this reset as it'll reset the cached information when going to sleep.
974 */
975 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
976 __reset_isolation_suitable(zone);
977
978 /*
950 * Setup to move all movable pages to the end of the zone. Used cached 979 * Setup to move all movable pages to the end of the zone. Used cached
951 * information on where the scanners should start but check that it 980 * information on where the scanners should start but check that it
952 * is initialised by ensuring the values are within zone boundaries. 981 * is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +991,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
962 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 991 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
963 } 992 }
964 993
965 /* 994 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
966 * Clear pageblock skip if there were failures recently and compaction
967 * is about to be retried after being deferred. kswapd does not do
968 * this reset as it'll reset the cached information when going to sleep.
969 */
970 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
971 __reset_isolation_suitable(zone);
972 995
973 migrate_prep_local(); 996 migrate_prep_local();
974 997
@@ -1003,7 +1026,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1003 if (err) { 1026 if (err) {
1004 putback_movable_pages(&cc->migratepages); 1027 putback_movable_pages(&cc->migratepages);
1005 cc->nr_migratepages = 0; 1028 cc->nr_migratepages = 0;
1006 if (err == -ENOMEM) { 1029 /*
1030 * migrate_pages() may return -ENOMEM when scanners meet
1031 * and we want compact_finished() to detect it
1032 */
1033 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
1007 ret = COMPACT_PARTIAL; 1034 ret = COMPACT_PARTIAL;
1008 goto out; 1035 goto out;
1009 } 1036 }
@@ -1015,6 +1042,8 @@ out:
1015 cc->nr_freepages -= release_freepages(&cc->freepages); 1042 cc->nr_freepages -= release_freepages(&cc->freepages);
1016 VM_BUG_ON(cc->nr_freepages != 0); 1043 VM_BUG_ON(cc->nr_freepages != 0);
1017 1044
1045 trace_mm_compaction_end(ret);
1046
1018 return ret; 1047 return ret;
1019} 1048}
1020 1049
@@ -1120,12 +1149,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1120 compact_zone(zone, cc); 1149 compact_zone(zone, cc);
1121 1150
1122 if (cc->order > 0) { 1151 if (cc->order > 0) {
1123 int ok = zone_watermark_ok(zone, cc->order, 1152 if (zone_watermark_ok(zone, cc->order,
1124 low_wmark_pages(zone), 0, 0); 1153 low_wmark_pages(zone), 0, 0))
1125 if (ok && cc->order >= zone->compact_order_failed) 1154 compaction_defer_reset(zone, cc->order, false);
1126 zone->compact_order_failed = cc->order + 1;
1127 /* Currently async compaction is never deferred. */ 1155 /* Currently async compaction is never deferred. */
1128 else if (!ok && cc->sync) 1156 else if (cc->sync)
1129 defer_compaction(zone, cc->order); 1157 defer_compaction(zone, cc->order);
1130 } 1158 }
1131 1159
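
In __compact_pgdat() above, the open-coded update of compact_order_failed is replaced by compaction_defer_reset(), a helper introduced in the same series. A sketch of what that helper does (paraphrased for context, not copied verbatim from mm/compaction.c):

/* Sketch: reset compaction deferral state after success at this order. */
void compaction_defer_reset(struct zone *zone, int order, bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;
}
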
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..d56d3c145b9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
409{ 409{
410 int error; 410 int error;
411 411
412 VM_BUG_ON(!PageLocked(old)); 412 VM_BUG_ON_PAGE(!PageLocked(old), old);
413 VM_BUG_ON(!PageLocked(new)); 413 VM_BUG_ON_PAGE(!PageLocked(new), new);
414 VM_BUG_ON(new->mapping); 414 VM_BUG_ON_PAGE(new->mapping, new);
415 415
416 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 416 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
417 if (!error) { 417 if (!error) {
@@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461{ 461{
462 int error; 462 int error;
463 463
464 VM_BUG_ON(!PageLocked(page)); 464 VM_BUG_ON_PAGE(!PageLocked(page), page);
465 VM_BUG_ON(PageSwapBacked(page)); 465 VM_BUG_ON_PAGE(PageSwapBacked(page), page);
466 466
467 error = mem_cgroup_cache_charge(page, current->mm, 467 error = mem_cgroup_cache_charge(page, current->mm,
468 gfp_mask & GFP_RECLAIM_MASK); 468 gfp_mask & GFP_RECLAIM_MASK);
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
607 */ 607 */
608void unlock_page(struct page *page) 608void unlock_page(struct page *page)
609{ 609{
610 VM_BUG_ON(!PageLocked(page)); 610 VM_BUG_ON_PAGE(!PageLocked(page), page);
611 clear_bit_unlock(PG_locked, &page->flags); 611 clear_bit_unlock(PG_locked, &page->flags);
612 smp_mb__after_clear_bit(); 612 smp_mb__after_clear_bit();
613 wake_up_page(page, PG_locked); 613 wake_up_page(page, PG_locked);
@@ -760,7 +760,7 @@ repeat:
760 page_cache_release(page); 760 page_cache_release(page);
761 goto repeat; 761 goto repeat;
762 } 762 }
763 VM_BUG_ON(page->index != offset); 763 VM_BUG_ON_PAGE(page->index != offset, page);
764 } 764 }
765 return page; 765 return page;
766} 766}
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1428 if (!count) 1428 if (!count)
1429 goto out; /* skip atime */ 1429 goto out; /* skip atime */
1430 size = i_size_read(inode); 1430 size = i_size_read(inode);
1431 if (pos < size) { 1431 retval = filemap_write_and_wait_range(mapping, pos,
1432 retval = filemap_write_and_wait_range(mapping, pos,
1433 pos + iov_length(iov, nr_segs) - 1); 1432 pos + iov_length(iov, nr_segs) - 1);
1434 if (!retval) { 1433 if (!retval) {
1435 retval = mapping->a_ops->direct_IO(READ, iocb, 1434 retval = mapping->a_ops->direct_IO(READ, iocb,
1436 iov, pos, nr_segs); 1435 iov, pos, nr_segs);
1437 } 1436 }
1438 if (retval > 0) { 1437 if (retval > 0) {
1439 *ppos = pos + retval; 1438 *ppos = pos + retval;
1440 count -= retval; 1439 count -= retval;
1441 } 1440 }
1442 1441
1443 /* 1442 /*
1444 * Btrfs can have a short DIO read if we encounter 1443 * Btrfs can have a short DIO read if we encounter
1445 * compressed extents, so if there was an error, or if 1444 * compressed extents, so if there was an error, or if
1446 * we've already read everything we wanted to, or if 1445 * we've already read everything we wanted to, or if
1447 * there was a short read because we hit EOF, go ahead 1446 * there was a short read because we hit EOF, go ahead
1448 * and return. Otherwise fallthrough to buffered io for 1447 * and return. Otherwise fallthrough to buffered io for
1449 * the rest of the read. 1448 * the rest of the read.
1450 */ 1449 */
1451 if (retval < 0 || !count || *ppos >= size) { 1450 if (retval < 0 || !count || *ppos >= size) {
1452 file_accessed(filp); 1451 file_accessed(filp);
1453 goto out; 1452 goto out;
1454 }
1455 } 1453 }
1456 } 1454 }
1457 1455
@@ -1656,7 +1654,7 @@ retry_find:
1656 put_page(page); 1654 put_page(page);
1657 goto retry_find; 1655 goto retry_find;
1658 } 1656 }
1659 VM_BUG_ON(page->index != offset); 1657 VM_BUG_ON_PAGE(page->index != offset, page);
1660 1658
1661 /* 1659 /*
1662 * We have a locked page in the page cache, now we need to check 1660 * We have a locked page in the page cache, now we need to check
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..82166bf974e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void)
130 (unsigned long) nr_free_buffer_pages() / 20); 130 (unsigned long) nr_free_buffer_pages() / 20);
131 recommended_min <<= (PAGE_SHIFT-10); 131 recommended_min <<= (PAGE_SHIFT-10);
132 132
133 if (recommended_min > min_free_kbytes) 133 if (recommended_min > min_free_kbytes) {
134 if (user_min_free_kbytes >= 0)
135 pr_info("raising min_free_kbytes from %d to %lu "
136 "to help transparent hugepage allocations\n",
137 min_free_kbytes, recommended_min);
138
134 min_free_kbytes = recommended_min; 139 min_free_kbytes = recommended_min;
140 }
135 setup_per_zone_wmarks(); 141 setup_per_zone_wmarks();
136 return 0; 142 return 0;
137} 143}
@@ -655,7 +661,7 @@ out:
655 hugepage_exit_sysfs(hugepage_kobj); 661 hugepage_exit_sysfs(hugepage_kobj);
656 return err; 662 return err;
657} 663}
658module_init(hugepage_init) 664subsys_initcall(hugepage_init);
659 665
660static int __init setup_transparent_hugepage(char *str) 666static int __init setup_transparent_hugepage(char *str)
661{ 667{
@@ -712,7 +718,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
712 pgtable_t pgtable; 718 pgtable_t pgtable;
713 spinlock_t *ptl; 719 spinlock_t *ptl;
714 720
715 VM_BUG_ON(!PageCompound(page)); 721 VM_BUG_ON_PAGE(!PageCompound(page), page);
716 pgtable = pte_alloc_one(mm, haddr); 722 pgtable = pte_alloc_one(mm, haddr);
717 if (unlikely(!pgtable)) 723 if (unlikely(!pgtable))
718 return VM_FAULT_OOM; 724 return VM_FAULT_OOM;
@@ -893,7 +899,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
893 goto out; 899 goto out;
894 } 900 }
895 src_page = pmd_page(pmd); 901 src_page = pmd_page(pmd);
896 VM_BUG_ON(!PageHead(src_page)); 902 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
897 get_page(src_page); 903 get_page(src_page);
898 page_dup_rmap(src_page); 904 page_dup_rmap(src_page);
899 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 905 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1067,7 +1073,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1067 ptl = pmd_lock(mm, pmd); 1073 ptl = pmd_lock(mm, pmd);
1068 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1074 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1069 goto out_free_pages; 1075 goto out_free_pages;
1070 VM_BUG_ON(!PageHead(page)); 1076 VM_BUG_ON_PAGE(!PageHead(page), page);
1071 1077
1072 pmdp_clear_flush(vma, haddr, pmd); 1078 pmdp_clear_flush(vma, haddr, pmd);
1073 /* leave pmd empty until pte is filled */ 1079 /* leave pmd empty until pte is filled */
@@ -1133,7 +1139,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1133 goto out_unlock; 1139 goto out_unlock;
1134 1140
1135 page = pmd_page(orig_pmd); 1141 page = pmd_page(orig_pmd);
1136 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1142 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1137 if (page_mapcount(page) == 1) { 1143 if (page_mapcount(page) == 1) {
1138 pmd_t entry; 1144 pmd_t entry;
1139 entry = pmd_mkyoung(orig_pmd); 1145 entry = pmd_mkyoung(orig_pmd);
@@ -1211,7 +1217,7 @@ alloc:
1211 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1217 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1212 put_huge_zero_page(); 1218 put_huge_zero_page();
1213 } else { 1219 } else {
1214 VM_BUG_ON(!PageHead(page)); 1220 VM_BUG_ON_PAGE(!PageHead(page), page);
1215 page_remove_rmap(page); 1221 page_remove_rmap(page);
1216 put_page(page); 1222 put_page(page);
1217 } 1223 }
@@ -1249,7 +1255,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1249 goto out; 1255 goto out;
1250 1256
1251 page = pmd_page(*pmd); 1257 page = pmd_page(*pmd);
1252 VM_BUG_ON(!PageHead(page)); 1258 VM_BUG_ON_PAGE(!PageHead(page), page);
1253 if (flags & FOLL_TOUCH) { 1259 if (flags & FOLL_TOUCH) {
1254 pmd_t _pmd; 1260 pmd_t _pmd;
1255 /* 1261 /*
@@ -1274,7 +1280,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1274 } 1280 }
1275 } 1281 }
1276 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1282 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1277 VM_BUG_ON(!PageCompound(page)); 1283 VM_BUG_ON_PAGE(!PageCompound(page), page);
1278 if (flags & FOLL_GET) 1284 if (flags & FOLL_GET)
1279 get_page_foll(page); 1285 get_page_foll(page);
1280 1286
@@ -1432,9 +1438,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1432 } else { 1438 } else {
1433 page = pmd_page(orig_pmd); 1439 page = pmd_page(orig_pmd);
1434 page_remove_rmap(page); 1440 page_remove_rmap(page);
1435 VM_BUG_ON(page_mapcount(page) < 0); 1441 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1436 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1442 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1437 VM_BUG_ON(!PageHead(page)); 1443 VM_BUG_ON_PAGE(!PageHead(page), page);
1438 atomic_long_dec(&tlb->mm->nr_ptes); 1444 atomic_long_dec(&tlb->mm->nr_ptes);
1439 spin_unlock(ptl); 1445 spin_unlock(ptl);
1440 tlb_remove_page(tlb, page); 1446 tlb_remove_page(tlb, page);
@@ -1502,19 +1508,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1502 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1508 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1503 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1509 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1504 VM_BUG_ON(!pmd_none(*new_pmd)); 1510 VM_BUG_ON(!pmd_none(*new_pmd));
1505 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1506 if (new_ptl != old_ptl) {
1507 pgtable_t pgtable;
1508 1511
1509 /* 1512 if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
1510 * Move preallocated PTE page table if new_pmd is on 1513 pgtable_t pgtable;
1511 * different PMD page table.
1512 */
1513 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1514 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1514 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1515 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1515
1516 spin_unlock(new_ptl);
1517 } 1516 }
1517 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1518 if (new_ptl != old_ptl)
1519 spin_unlock(new_ptl);
1518 spin_unlock(old_ptl); 1520 spin_unlock(old_ptl);
1519 } 1521 }
1520out: 1522out:
@@ -2176,9 +2178,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2176 if (unlikely(!page)) 2178 if (unlikely(!page))
2177 goto out; 2179 goto out;
2178 2180
2179 VM_BUG_ON(PageCompound(page)); 2181 VM_BUG_ON_PAGE(PageCompound(page), page);
2180 BUG_ON(!PageAnon(page)); 2182 VM_BUG_ON_PAGE(!PageAnon(page), page);
2181 VM_BUG_ON(!PageSwapBacked(page)); 2183 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
2182 2184
2183 /* cannot use mapcount: can't collapse if there's a gup pin */ 2185 /* cannot use mapcount: can't collapse if there's a gup pin */
2184 if (page_count(page) != 1) 2186 if (page_count(page) != 1)
@@ -2201,8 +2203,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2201 } 2203 }
2202 /* 0 stands for page_is_file_cache(page) == false */ 2204 /* 0 stands for page_is_file_cache(page) == false */
2203 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2205 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2204 VM_BUG_ON(!PageLocked(page)); 2206 VM_BUG_ON_PAGE(!PageLocked(page), page);
2205 VM_BUG_ON(PageLRU(page)); 2207 VM_BUG_ON_PAGE(PageLRU(page), page);
2206 2208
2207 /* If there is no mapped pte young don't collapse the page */ 2209 /* If there is no mapped pte young don't collapse the page */
2208 if (pte_young(pteval) || PageReferenced(page) || 2210 if (pte_young(pteval) || PageReferenced(page) ||
@@ -2232,7 +2234,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2232 } else { 2234 } else {
2233 src_page = pte_page(pteval); 2235 src_page = pte_page(pteval);
2234 copy_user_highpage(page, src_page, address, vma); 2236 copy_user_highpage(page, src_page, address, vma);
2235 VM_BUG_ON(page_mapcount(src_page) != 1); 2237 VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
2236 release_pte_page(src_page); 2238 release_pte_page(src_page);
2237 /* 2239 /*
2238 * ptl mostly unnecessary, but preempt has to 2240 * ptl mostly unnecessary, but preempt has to
@@ -2311,7 +2313,7 @@ static struct page
2311 struct vm_area_struct *vma, unsigned long address, 2313 struct vm_area_struct *vma, unsigned long address,
2312 int node) 2314 int node)
2313{ 2315{
2314 VM_BUG_ON(*hpage); 2316 VM_BUG_ON_PAGE(*hpage, *hpage);
2315 /* 2317 /*
2316 * Allocate the page while the vma is still valid and under 2318 * Allocate the page while the vma is still valid and under
2317 * the mmap_sem read mode so there is no memory allocation 2319 * the mmap_sem read mode so there is no memory allocation
@@ -2580,7 +2582,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2580 */ 2582 */
2581 node = page_to_nid(page); 2583 node = page_to_nid(page);
2582 khugepaged_node_load[node]++; 2584 khugepaged_node_load[node]++;
2583 VM_BUG_ON(PageCompound(page)); 2585 VM_BUG_ON_PAGE(PageCompound(page), page);
2584 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2586 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2585 goto out_unmap; 2587 goto out_unmap;
2586 /* cannot use mapcount: can't collapse if there's a gup pin */ 2588 /* cannot use mapcount: can't collapse if there's a gup pin */
@@ -2876,7 +2878,7 @@ again:
2876 return; 2878 return;
2877 } 2879 }
2878 page = pmd_page(*pmd); 2880 page = pmd_page(*pmd);
2879 VM_BUG_ON(!page_count(page)); 2881 VM_BUG_ON_PAGE(!page_count(page), page);
2880 get_page(page); 2882 get_page(page);
2881 spin_unlock(ptl); 2883 spin_unlock(ptl);
2882 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2884 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..c01cb9fedb18 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
584 1 << PG_active | 1 << PG_reserved | 584 1 << PG_active | 1 << PG_reserved |
585 1 << PG_private | 1 << PG_writeback); 585 1 << PG_private | 1 << PG_writeback);
586 } 586 }
587 VM_BUG_ON(hugetlb_cgroup_from_page(page)); 587 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
588 set_compound_page_dtor(page, NULL); 588 set_compound_page_dtor(page, NULL);
589 set_page_refcounted(page); 589 set_page_refcounted(page);
590 arch_release_hugepage(page); 590 arch_release_hugepage(page);
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
690 */ 690 */
691int PageHuge(struct page *page) 691int PageHuge(struct page *page)
692{ 692{
693 compound_page_dtor *dtor;
694
695 if (!PageCompound(page)) 693 if (!PageCompound(page))
696 return 0; 694 return 0;
697 695
698 page = compound_head(page); 696 page = compound_head(page);
699 dtor = get_compound_page_dtor(page); 697 return get_compound_page_dtor(page) == free_huge_page;
700
701 return dtor == free_huge_page;
702} 698}
703EXPORT_SYMBOL_GPL(PageHuge); 699EXPORT_SYMBOL_GPL(PageHuge);
704 700
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
708 */ 704 */
709int PageHeadHuge(struct page *page_head) 705int PageHeadHuge(struct page *page_head)
710{ 706{
711 compound_page_dtor *dtor;
712
713 if (!PageHead(page_head)) 707 if (!PageHead(page_head))
714 return 0; 708 return 0;
715 709
716 dtor = get_compound_page_dtor(page_head); 710 return get_compound_page_dtor(page_head) == free_huge_page;
717
718 return dtor == free_huge_page;
719} 711}
720EXPORT_SYMBOL_GPL(PageHeadHuge);
721 712
722pgoff_t __basepage_index(struct page *page) 713pgoff_t __basepage_index(struct page *page)
723{ 714{
@@ -1098,7 +1089,7 @@ retry:
1098 * no users -- drop the buddy allocator's reference. 1089 * no users -- drop the buddy allocator's reference.
1099 */ 1090 */
1100 put_page_testzero(page); 1091 put_page_testzero(page);
1101 VM_BUG_ON(page_count(page)); 1092 VM_BUG_ON_PAGE(page_count(page), page);
1102 enqueue_huge_page(h, page); 1093 enqueue_huge_page(h, page);
1103 } 1094 }
1104free: 1095free:
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1280 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1271 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1281 void *addr; 1272 void *addr;
1282 1273
1283 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 1274 addr = memblock_virt_alloc_try_nid_nopanic(
1284 huge_page_size(h), huge_page_size(h), 0); 1275 huge_page_size(h), huge_page_size(h),
1285 1276 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1286 if (addr) { 1277 if (addr) {
1287 /* 1278 /*
1288 * Use the beginning of the huge page to store the 1279 * Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
1322 1313
1323#ifdef CONFIG_HIGHMEM 1314#ifdef CONFIG_HIGHMEM
1324 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1315 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1325 free_bootmem_late((unsigned long)m, 1316 memblock_free_late(__pa(m),
1326 sizeof(struct huge_bootmem_page)); 1317 sizeof(struct huge_bootmem_page));
1327#else 1318#else
1328 page = virt_to_page(m); 1319 page = virt_to_page(m);
1329#endif 1320#endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2355 int cow; 2346 int cow;
2356 struct hstate *h = hstate_vma(vma); 2347 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2348 unsigned long sz = huge_page_size(h);
2349 unsigned long mmun_start; /* For mmu_notifiers */
2350 unsigned long mmun_end; /* For mmu_notifiers */
2351 int ret = 0;
2358 2352
2359 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2353 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2360 2354
2355 mmun_start = vma->vm_start;
2356 mmun_end = vma->vm_end;
2357 if (cow)
2358 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2359
2361 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2360 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2362 spinlock_t *src_ptl, *dst_ptl; 2361 spinlock_t *src_ptl, *dst_ptl;
2363 src_pte = huge_pte_offset(src, addr); 2362 src_pte = huge_pte_offset(src, addr);
2364 if (!src_pte) 2363 if (!src_pte)
2365 continue; 2364 continue;
2366 dst_pte = huge_pte_alloc(dst, addr, sz); 2365 dst_pte = huge_pte_alloc(dst, addr, sz);
2367 if (!dst_pte) 2366 if (!dst_pte) {
2368 goto nomem; 2367 ret = -ENOMEM;
2368 break;
2369 }
2369 2370
2370 /* If the pagetables are shared don't copy or take references */ 2371 /* If the pagetables are shared don't copy or take references */
2371 if (dst_pte == src_pte) 2372 if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2386 spin_unlock(src_ptl); 2387 spin_unlock(src_ptl);
2387 spin_unlock(dst_ptl); 2388 spin_unlock(dst_ptl);
2388 } 2389 }
2389 return 0;
2390 2390
2391nomem: 2391 if (cow)
2392 return -ENOMEM; 2392 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2393
2394 return ret;
2393} 2395}
2394 2396
2395static int is_hugetlb_entry_migration(pte_t pte) 2397static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3079same_page: 3081same_page:
3080 if (pages) { 3082 if (pages) {
3081 pages[i] = mem_map_offset(page, pfn_offset); 3083 pages[i] = mem_map_offset(page, pfn_offset);
3082 get_page(pages[i]); 3084 get_page_foll(pages[i]);
3083 } 3085 }
3084 3086
3085 if (vmas) 3087 if (vmas)
@@ -3501,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3501 3503
3502bool isolate_huge_page(struct page *page, struct list_head *list) 3504bool isolate_huge_page(struct page *page, struct list_head *list)
3503{ 3505{
3504 VM_BUG_ON(!PageHead(page)); 3506 VM_BUG_ON_PAGE(!PageHead(page), page);
3505 if (!get_page_unless_zero(page)) 3507 if (!get_page_unless_zero(page))
3506 return false; 3508 return false;
3507 spin_lock(&hugetlb_lock); 3509 spin_lock(&hugetlb_lock);
@@ -3512,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
3512 3514
3513void putback_active_hugepage(struct page *page) 3515void putback_active_hugepage(struct page *page)
3514{ 3516{
3515 VM_BUG_ON(!PageHead(page)); 3517 VM_BUG_ON_PAGE(!PageHead(page), page);
3516 spin_lock(&hugetlb_lock); 3518 spin_lock(&hugetlb_lock);
3517 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3519 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3518 spin_unlock(&hugetlb_lock); 3520 spin_unlock(&hugetlb_lock);
@@ -3521,7 +3523,7 @@ void putback_active_hugepage(struct page *page)
3521 3523
3522bool is_hugepage_active(struct page *page) 3524bool is_hugepage_active(struct page *page)
3523{ 3525{
3524 VM_BUG_ON(!PageHuge(page)); 3526 VM_BUG_ON_PAGE(!PageHuge(page), page);
3525 /* 3527 /*
3526 * This function can be called for a tail page because the caller, 3528 * This function can be called for a tail page because the caller,
3527 * scan_movable_pages, scans through a given pfn-range which typically 3529 * scan_movable_pages, scans through a given pfn-range which typically
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bda8e44f6fde..cb00829bb466 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
242 return; 242 return;
243} 243}
244 244
245static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, 245static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
246 struct cftype *cft, struct file *file, 246 struct cftype *cft)
247 char __user *buf, size_t nbytes,
248 loff_t *ppos)
249{ 247{
250 u64 val; 248 int idx, name;
251 char str[64];
252 int idx, name, len;
253 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 249 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
254 250
255 idx = MEMFILE_IDX(cft->private); 251 idx = MEMFILE_IDX(cft->private);
256 name = MEMFILE_ATTR(cft->private); 252 name = MEMFILE_ATTR(cft->private);
257 253
258 val = res_counter_read_u64(&h_cg->hugepage[idx], name); 254 return res_counter_read_u64(&h_cg->hugepage[idx], name);
259 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
260 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
261} 255}
262 256
263static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 257static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
337 cft = &h->cgroup_files[0]; 331 cft = &h->cgroup_files[0];
338 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 332 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
339 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 333 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
340 cft->read = hugetlb_cgroup_read; 334 cft->read_u64 = hugetlb_cgroup_read_u64;
341 cft->write_string = hugetlb_cgroup_write; 335 cft->write_string = hugetlb_cgroup_write;
342 336
343 /* Add the usage file */ 337 /* Add the usage file */
344 cft = &h->cgroup_files[1]; 338 cft = &h->cgroup_files[1];
345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); 339 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
346 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 340 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
347 cft->read = hugetlb_cgroup_read; 341 cft->read_u64 = hugetlb_cgroup_read_u64;
348 342
349 /* Add the MAX usage file */ 343 /* Add the MAX usage file */
350 cft = &h->cgroup_files[2]; 344 cft = &h->cgroup_files[2];
351 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
352 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 346 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
353 cft->trigger = hugetlb_cgroup_reset; 347 cft->trigger = hugetlb_cgroup_reset;
354 cft->read = hugetlb_cgroup_read; 348 cft->read_u64 = hugetlb_cgroup_read_u64;
355 349
356 /* Add the failcntfile */ 350 /* Add the failcntfile */
357 cft = &h->cgroup_files[3]; 351 cft = &h->cgroup_files[3];
358 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); 352 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
359 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 353 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
360 cft->trigger = hugetlb_cgroup_reset; 354 cft->trigger = hugetlb_cgroup_reset;
361 cft->read = hugetlb_cgroup_read; 355 cft->read_u64 = hugetlb_cgroup_read_u64;
362 356
363 /* NULL terminate the last cft */ 357 /* NULL terminate the last cft */
364 cft = &h->cgroup_files[4]; 358 cft = &h->cgroup_files[4];
@@ -396,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
396 if (hugetlb_cgroup_disabled()) 390 if (hugetlb_cgroup_disabled())
397 return; 391 return;
398 392
399 VM_BUG_ON(!PageHuge(oldhpage)); 393 VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
400 spin_lock(&hugetlb_lock); 394 spin_lock(&hugetlb_lock);
401 h_cg = hugetlb_cgroup_from_page(oldhpage); 395 h_cg = hugetlb_cgroup_from_page(oldhpage);
402 set_hugetlb_cgroup(oldhpage, NULL); 396 set_hugetlb_cgroup(oldhpage, NULL);
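
The hugetlb_cgroup change above drops the hand-rolled read file operation in favour of the cgroup core's read_u64 callback, which takes care of formatting the value. For context, a cftype using .read_u64 is typically declared like the sketch below (illustrative names; the real code fills h->cgroup_files[] at runtime as shown in the hunk):

/* Illustrative static table; hugetlb_cgroup builds its entries at runtime. */
static struct cftype demo_hugetlb_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(0, RES_USAGE),
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{ }	/* terminate the array */
};
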
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 return 0; 55 return 0;
56 56
57inject: 57inject:
58 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60} 60}
61 61
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..29e1e761f9eb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v)
27 */ 27 */
28static inline void set_page_refcounted(struct page *page) 28static inline void set_page_refcounted(struct page *page)
29{ 29{
30 VM_BUG_ON(PageTail(page)); 30 VM_BUG_ON_PAGE(PageTail(page), page);
31 VM_BUG_ON(atomic_read(&page->_count)); 31 VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
32 set_page_count(page, 1); 32 set_page_count(page, 1);
33} 33}
34 34
@@ -46,12 +46,10 @@ static inline void __get_page_tail_foll(struct page *page,
46 * speculative page access (like in 46 * speculative page access (like in
47 * page_cache_get_speculative()) on tail pages. 47 * page_cache_get_speculative()) on tail pages.
48 */ 48 */
49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 49 VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
50 VM_BUG_ON(atomic_read(&page->_count) != 0);
51 VM_BUG_ON(page_mapcount(page) < 0);
52 if (get_page_head) 50 if (get_page_head)
53 atomic_inc(&page->first_page->_count); 51 atomic_inc(&page->first_page->_count);
54 atomic_inc(&page->_mapcount); 52 get_huge_page_tail(page);
55} 53}
56 54
57/* 55/*
@@ -73,7 +71,7 @@ static inline void get_page_foll(struct page *page)
73 * Getting a normal page or the head of a compound page 71 * Getting a normal page or the head of a compound page
74 * requires to already have an elevated page->_count. 72 * requires to already have an elevated page->_count.
75 */ 73 */
76 VM_BUG_ON(atomic_read(&page->_count) <= 0); 74 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
77 atomic_inc(&page->_count); 75 atomic_inc(&page->_count);
78 } 76 }
79} 77}
@@ -85,7 +83,6 @@ extern unsigned long highest_memmap_pfn;
85 */ 83 */
86extern int isolate_lru_page(struct page *page); 84extern int isolate_lru_page(struct page *page);
87extern void putback_lru_page(struct page *page); 85extern void putback_lru_page(struct page *page);
88extern unsigned long zone_reclaimable_pages(struct zone *zone);
89extern bool zone_reclaimable(struct zone *zone); 86extern bool zone_reclaimable(struct zone *zone);
90 87
91/* 88/*
@@ -101,6 +98,7 @@ extern void prep_compound_page(struct page *page, unsigned long order);
101#ifdef CONFIG_MEMORY_FAILURE 98#ifdef CONFIG_MEMORY_FAILURE
102extern bool is_free_buddy_page(struct page *page); 99extern bool is_free_buddy_page(struct page *page);
103#endif 100#endif
101extern int user_min_free_kbytes;
104 102
105#if defined CONFIG_COMPACTION || defined CONFIG_CMA 103#if defined CONFIG_COMPACTION || defined CONFIG_CMA
106 104
@@ -144,9 +142,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
144#endif 142#endif
145 143
146/* 144/*
147 * function for dealing with page's order in buddy system. 145 * This function returns the order of a free page in the buddy system. In
148 * zone->lock is already acquired when we use these. 146 * general, page_zone(page)->lock must be held by the caller to prevent the
149 * So, we don't need atomic page->flags operations here. 147 * page from being allocated in parallel and returning garbage as the order.
148 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
149 * page cannot be allocated or merged in parallel.
150 */ 150 */
151static inline unsigned long page_order(struct page *page) 151static inline unsigned long page_order(struct page *page)
152{ 152{
@@ -175,7 +175,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
175static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 175static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
176 struct page *page) 176 struct page *page)
177{ 177{
178 VM_BUG_ON(PageLRU(page)); 178 VM_BUG_ON_PAGE(PageLRU(page), page);
179 179
180 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) 180 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
181 return 0; 181 return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc95..aa4c7c7250c1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
1891 return new_page; 1891 return new_page;
1892} 1892}
1893 1893
1894int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1894int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
1895 unsigned long *vm_flags)
1896{ 1895{
1897 struct stable_node *stable_node; 1896 struct stable_node *stable_node;
1898 struct rmap_item *rmap_item; 1897 struct rmap_item *rmap_item;
1899 unsigned int mapcount = page_mapcount(page); 1898 int ret = SWAP_AGAIN;
1900 int referenced = 0;
1901 int search_new_forks = 0; 1899 int search_new_forks = 0;
1902 1900
1903 VM_BUG_ON(!PageKsm(page)); 1901 VM_BUG_ON_PAGE(!PageKsm(page), page);
1904 VM_BUG_ON(!PageLocked(page)); 1902
1903 /*
1904 * Rely on the page lock to protect against concurrent modifications
1905 * to that page's node of the stable tree.
1906 */
1907 VM_BUG_ON_PAGE(!PageLocked(page), page);
1905 1908
1906 stable_node = page_stable_node(page); 1909 stable_node = page_stable_node(page);
1907 if (!stable_node) 1910 if (!stable_node)
1908 return 0; 1911 return ret;
1909again: 1912again:
1910 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { 1913 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1911 struct anon_vma *anon_vma = rmap_item->anon_vma; 1914 struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
1928 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1931 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1929 continue; 1932 continue;
1930 1933
1931 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1934 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1932 continue;
1933
1934 referenced += page_referenced_one(page, vma,
1935 rmap_item->address, &mapcount, vm_flags);
1936 if (!search_new_forks || !mapcount)
1937 break;
1938 }
1939 anon_vma_unlock_read(anon_vma);
1940 if (!mapcount)
1941 goto out;
1942 }
1943 if (!search_new_forks++)
1944 goto again;
1945out:
1946 return referenced;
1947}
1948
1949int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1950{
1951 struct stable_node *stable_node;
1952 struct rmap_item *rmap_item;
1953 int ret = SWAP_AGAIN;
1954 int search_new_forks = 0;
1955
1956 VM_BUG_ON(!PageKsm(page));
1957 VM_BUG_ON(!PageLocked(page));
1958
1959 stable_node = page_stable_node(page);
1960 if (!stable_node)
1961 return SWAP_FAIL;
1962again:
1963 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1964 struct anon_vma *anon_vma = rmap_item->anon_vma;
1965 struct anon_vma_chain *vmac;
1966 struct vm_area_struct *vma;
1967
1968 anon_vma_lock_read(anon_vma);
1969 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1970 0, ULONG_MAX) {
1971 vma = vmac->vma;
1972 if (rmap_item->address < vma->vm_start ||
1973 rmap_item->address >= vma->vm_end)
1974 continue;
1975 /*
1976 * Initially we examine only the vma which covers this
1977 * rmap_item; but later, if there is still work to do,
1978 * we examine covering vmas in other mms: in case they
1979 * were forked from the original since ksmd passed.
1980 */
1981 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1982 continue; 1935 continue;
1983 1936
1984 ret = try_to_unmap_one(page, vma, 1937 ret = rwc->rmap_one(page, vma,
1985 rmap_item->address, flags); 1938 rmap_item->address, rwc->arg);
1986 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1939 if (ret != SWAP_AGAIN) {
1987 anon_vma_unlock_read(anon_vma); 1940 anon_vma_unlock_read(anon_vma);
1988 goto out; 1941 goto out;
1989 } 1942 }
1990 } 1943 if (rwc->done && rwc->done(page)) {
1991 anon_vma_unlock_read(anon_vma);
1992 }
1993 if (!search_new_forks++)
1994 goto again;
1995out:
1996 return ret;
1997}
1998
1999#ifdef CONFIG_MIGRATION
2000int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
2001 struct vm_area_struct *, unsigned long, void *), void *arg)
2002{
2003 struct stable_node *stable_node;
2004 struct rmap_item *rmap_item;
2005 int ret = SWAP_AGAIN;
2006 int search_new_forks = 0;
2007
2008 VM_BUG_ON(!PageKsm(page));
2009 VM_BUG_ON(!PageLocked(page));
2010
2011 stable_node = page_stable_node(page);
2012 if (!stable_node)
2013 return ret;
2014again:
2015 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2016 struct anon_vma *anon_vma = rmap_item->anon_vma;
2017 struct anon_vma_chain *vmac;
2018 struct vm_area_struct *vma;
2019
2020 anon_vma_lock_read(anon_vma);
2021 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2022 0, ULONG_MAX) {
2023 vma = vmac->vma;
2024 if (rmap_item->address < vma->vm_start ||
2025 rmap_item->address >= vma->vm_end)
2026 continue;
2027 /*
2028 * Initially we examine only the vma which covers this
2029 * rmap_item; but later, if there is still work to do,
2030 * we examine covering vmas in other mms: in case they
2031 * were forked from the original since ksmd passed.
2032 */
2033 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2034 continue;
2035
2036 ret = rmap_one(page, vma, rmap_item->address, arg);
2037 if (ret != SWAP_AGAIN) {
2038 anon_vma_unlock_read(anon_vma); 1944 anon_vma_unlock_read(anon_vma);
2039 goto out; 1945 goto out;
2040 } 1946 }
@@ -2047,17 +1953,18 @@ out:
2047 return ret; 1953 return ret;
2048} 1954}
2049 1955
1956#ifdef CONFIG_MIGRATION
2050void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1957void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2051{ 1958{
2052 struct stable_node *stable_node; 1959 struct stable_node *stable_node;
2053 1960
2054 VM_BUG_ON(!PageLocked(oldpage)); 1961 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2055 VM_BUG_ON(!PageLocked(newpage)); 1962 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2056 VM_BUG_ON(newpage->mapping != oldpage->mapping); 1963 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2057 1964
2058 stable_node = page_stable_node(newpage); 1965 stable_node = page_stable_node(newpage);
2059 if (stable_node) { 1966 if (stable_node) {
2060 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 1967 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2061 stable_node->kpfn = page_to_pfn(newpage); 1968 stable_node->kpfn = page_to_pfn(newpage);
2062 /* 1969 /*
2063 * newpage->mapping was set in advance; now we need smp_wmb() 1970 * newpage->mapping was set in advance; now we need smp_wmb()
@@ -2438,4 +2345,4 @@ out_free:
2438out: 2345out:
2439 return err; 2346 return err;
2440} 2347}
2441module_init(ksm_init) 2348subsys_initcall(ksm_init);
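
The ksm.c rewrite above collapses page_referenced_ksm() and try_to_unmap_ksm() into a single rmap_walk_ksm() driven by caller-supplied callbacks. The control structure implied by the callbacks the hunk uses (rmap_one, done, invalid_vma, arg) looks roughly like the sketch below; the authoritative definition lives in include/linux/rmap.h and carries additional fields:

/* Sketch of the rmap walk control interface used by rmap_walk_ksm(). */
struct rmap_walk_control {
	void *arg;				/* passed through to rmap_one */
	/* visit one mapping; return SWAP_AGAIN to keep walking */
	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
			unsigned long addr, void *arg);
	/* optional: stop early once the walk's goal is met */
	int (*done)(struct page *page);
	/* optional: skip VMAs the caller is not interested in */
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
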
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..39a31e7f0045 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h> 23#include <asm-generic/sections.h>
24#include <linux/io.h>
25
26#include "internal.h"
24 27
25static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 28static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
26static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 29static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
39}; 42};
40 43
41int memblock_debug __initdata_memblock; 44int memblock_debug __initdata_memblock;
45#ifdef CONFIG_MOVABLE_NODE
46bool movable_node_enabled __initdata_memblock = false;
47#endif
42static int memblock_can_resize __initdata_memblock; 48static int memblock_can_resize __initdata_memblock;
43static int memblock_memory_in_slab __initdata_memblock = 0; 49static int memblock_memory_in_slab __initdata_memblock = 0;
44static int memblock_reserved_in_slab __initdata_memblock = 0; 50static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 97 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find 98 * @size: size of free area to find
93 * @align: alignment of free area to find 99 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 100 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
95 * 101 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 102 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 * 103 *
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 129 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
124 * @size: size of free area to find 130 * @size: size of free area to find
125 * @align: alignment of free area to find 131 * @align: alignment of free area to find
126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 132 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
127 * 133 *
128 * Utility called from memblock_find_in_range_node(), find free area top-down. 134 * Utility called from memblock_find_in_range_node(), find free area top-down.
129 * 135 *
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
154 160
155/** 161/**
156 * memblock_find_in_range_node - find free area in given range and node 162 * memblock_find_in_range_node - find free area in given range and node
157 * @start: start of candidate range
158 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
159 * @size: size of free area to find 163 * @size: size of free area to find
160 * @align: alignment of free area to find 164 * @align: alignment of free area to find
161 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 165 * @start: start of candidate range
166 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
167 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
162 * 168 *
163 * Find @size free area aligned to @align in the specified range and node. 169 * Find @size free area aligned to @align in the specified range and node.
164 * 170 *
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
173 * RETURNS: 179 * RETURNS:
174 * Found address on success, 0 on failure. 180 * Found address on success, 0 on failure.
175 */ 181 */
176phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 182phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
177 phys_addr_t end, phys_addr_t size, 183 phys_addr_t align, phys_addr_t start,
178 phys_addr_t align, int nid) 184 phys_addr_t end, int nid)
179{ 185{
180 int ret; 186 int ret;
181 phys_addr_t kernel_end; 187 phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
238 phys_addr_t end, phys_addr_t size, 244 phys_addr_t end, phys_addr_t size,
239 phys_addr_t align) 245 phys_addr_t align)
240{ 246{
241 return memblock_find_in_range_node(start, end, size, align, 247 return memblock_find_in_range_node(size, align, start, end,
242 MAX_NUMNODES); 248 NUMA_NO_NODE);
243} 249}
244 250
245static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 251static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
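[Editor's note] With the reordered prototype, a call now reads as "find @size bytes at @align somewhere in [@start, @end) on @nid". A minimal sketch of a caller under the new ordering, assuming an early-boot context; the sizes and the 2 GiB cap are invented for illustration, and the result still has to be handed to memblock_reserve() before it is safe to use:

#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/sizes.h>

/* Sketch: find and claim 1 MiB, 2 MiB-aligned, below an arbitrary 2 GiB cap. */
static phys_addr_t __init claim_low_chunk(void)
{
	/* New argument order: size, align, start, end, nid. */
	phys_addr_t base = memblock_find_in_range_node(SZ_1M, SZ_2M, 0, SZ_2G,
						       NUMA_NO_NODE);

	if (base)
		memblock_reserve(base, SZ_1M);	/* the find itself reserves nothing */
	return base;
}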
@@ -255,10 +261,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
255 type->cnt = 1; 261 type->cnt = 1;
256 type->regions[0].base = 0; 262 type->regions[0].base = 0;
257 type->regions[0].size = 0; 263 type->regions[0].size = 0;
264 type->regions[0].flags = 0;
258 memblock_set_region_node(&type->regions[0], MAX_NUMNODES); 265 memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
259 } 266 }
260} 267}
261 268
269#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
270
262phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( 271phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
263 phys_addr_t *addr) 272 phys_addr_t *addr)
264{ 273{
@@ -271,6 +280,20 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
271 memblock.reserved.max); 280 memblock.reserved.max);
272} 281}
273 282
283phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
284 phys_addr_t *addr)
285{
286 if (memblock.memory.regions == memblock_memory_init_regions)
287 return 0;
288
289 *addr = __pa(memblock.memory.regions);
290
291 return PAGE_ALIGN(sizeof(struct memblock_region) *
292 memblock.memory.max);
293}
294
295#endif
296
274/** 297/**
275 * memblock_double_array - double the size of the memblock regions array 298 * memblock_double_array - double the size of the memblock regions array
276 * @type: memblock type of the regions array being doubled 299 * @type: memblock type of the regions array being doubled
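[Editor's note] The new helper above mirrors the reserved-array variant next to it: both return 0 while the static __initdata arrays are still in use, and otherwise report where a dynamically grown array lives so late boot code can give those pages back. A purely illustrative teardown sketch, assuming CONFIG_ARCH_DISCARD_MEMBLOCK=y and using only functions visible in this patch; the real caller sits in the bootmem/nobootmem teardown path and differs in detail:

#include <linux/bootmem.h>
#include <linux/memblock.h>

static void __init discard_grown_memblock_arrays(void)
{
	phys_addr_t addr, size;

	size = get_allocated_memblock_reserved_regions_info(&addr);
	if (size)	/* non-zero only if memblock_double_array() ever ran */
		__memblock_free_late(addr, size);

	size = get_allocated_memblock_memory_regions_info(&addr);
	if (size)
		__memblock_free_late(addr, size);
}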
@@ -405,7 +428,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
405 428
406 if (this->base + this->size != next->base || 429 if (this->base + this->size != next->base ||
407 memblock_get_region_node(this) != 430 memblock_get_region_node(this) !=
408 memblock_get_region_node(next)) { 431 memblock_get_region_node(next) ||
432 this->flags != next->flags) {
409 BUG_ON(this->base + this->size > next->base); 433 BUG_ON(this->base + this->size > next->base);
410 i++; 434 i++;
411 continue; 435 continue;
@@ -425,13 +449,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
425 * @base: base address of the new region 449 * @base: base address of the new region
426 * @size: size of the new region 450 * @size: size of the new region
427 * @nid: node id of the new region 451 * @nid: node id of the new region
452 * @flags: flags of the new region
428 * 453 *
429 * Insert new memblock region [@base,@base+@size) into @type at @idx. 454 * Insert new memblock region [@base,@base+@size) into @type at @idx.
430 * @type must already have extra room to accommodate the new region. 455 * @type must already have extra room to accommodate the new region.
431 */ 456 */
432static void __init_memblock memblock_insert_region(struct memblock_type *type, 457static void __init_memblock memblock_insert_region(struct memblock_type *type,
433 int idx, phys_addr_t base, 458 int idx, phys_addr_t base,
434 phys_addr_t size, int nid) 459 phys_addr_t size,
460 int nid, unsigned long flags)
435{ 461{
436 struct memblock_region *rgn = &type->regions[idx]; 462 struct memblock_region *rgn = &type->regions[idx];
437 463
@@ -439,6 +465,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
439 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); 465 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
440 rgn->base = base; 466 rgn->base = base;
441 rgn->size = size; 467 rgn->size = size;
468 rgn->flags = flags;
442 memblock_set_region_node(rgn, nid); 469 memblock_set_region_node(rgn, nid);
443 type->cnt++; 470 type->cnt++;
444 type->total_size += size; 471 type->total_size += size;
@@ -450,6 +477,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
450 * @base: base address of the new region 477 * @base: base address of the new region
451 * @size: size of the new region 478 * @size: size of the new region
452 * @nid: nid of the new region 479 * @nid: nid of the new region
480 * @flags: flags of the new region
453 * 481 *
454 * Add new memblock region [@base,@base+@size) into @type. The new region 482 * Add new memblock region [@base,@base+@size) into @type. The new region
455 * is allowed to overlap with existing ones - overlaps don't affect already 483 * is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +488,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
460 * 0 on success, -errno on failure. 488 * 0 on success, -errno on failure.
461 */ 489 */
462static int __init_memblock memblock_add_region(struct memblock_type *type, 490static int __init_memblock memblock_add_region(struct memblock_type *type,
463 phys_addr_t base, phys_addr_t size, int nid) 491 phys_addr_t base, phys_addr_t size,
492 int nid, unsigned long flags)
464{ 493{
465 bool insert = false; 494 bool insert = false;
466 phys_addr_t obase = base; 495 phys_addr_t obase = base;
@@ -475,6 +504,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
475 WARN_ON(type->cnt != 1 || type->total_size); 504 WARN_ON(type->cnt != 1 || type->total_size);
476 type->regions[0].base = base; 505 type->regions[0].base = base;
477 type->regions[0].size = size; 506 type->regions[0].size = size;
507 type->regions[0].flags = flags;
478 memblock_set_region_node(&type->regions[0], nid); 508 memblock_set_region_node(&type->regions[0], nid);
479 type->total_size = size; 509 type->total_size = size;
480 return 0; 510 return 0;
@@ -505,7 +535,8 @@ repeat:
505 nr_new++; 535 nr_new++;
506 if (insert) 536 if (insert)
507 memblock_insert_region(type, i++, base, 537 memblock_insert_region(type, i++, base,
508 rbase - base, nid); 538 rbase - base, nid,
539 flags);
509 } 540 }
510 /* area below @rend is dealt with, forget about it */ 541 /* area below @rend is dealt with, forget about it */
511 base = min(rend, end); 542 base = min(rend, end);
@@ -515,7 +546,8 @@ repeat:
515 if (base < end) { 546 if (base < end) {
516 nr_new++; 547 nr_new++;
517 if (insert) 548 if (insert)
518 memblock_insert_region(type, i, base, end - base, nid); 549 memblock_insert_region(type, i, base, end - base,
550 nid, flags);
519 } 551 }
520 552
521 /* 553 /*
@@ -537,12 +569,13 @@ repeat:
537int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 569int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
538 int nid) 570 int nid)
539{ 571{
540 return memblock_add_region(&memblock.memory, base, size, nid); 572 return memblock_add_region(&memblock.memory, base, size, nid, 0);
541} 573}
542 574
543int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 575int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
544{ 576{
545 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); 577 return memblock_add_region(&memblock.memory, base, size,
578 MAX_NUMNODES, 0);
546} 579}
547 580
548/** 581/**
@@ -597,7 +630,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
597 rgn->size -= base - rbase; 630 rgn->size -= base - rbase;
598 type->total_size -= base - rbase; 631 type->total_size -= base - rbase;
599 memblock_insert_region(type, i, rbase, base - rbase, 632 memblock_insert_region(type, i, rbase, base - rbase,
600 memblock_get_region_node(rgn)); 633 memblock_get_region_node(rgn),
634 rgn->flags);
601 } else if (rend > end) { 635 } else if (rend > end) {
602 /* 636 /*
603 * @rgn intersects from above. Split and redo the 637 * @rgn intersects from above. Split and redo the
@@ -607,7 +641,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
607 rgn->size -= end - rbase; 641 rgn->size -= end - rbase;
608 type->total_size -= end - rbase; 642 type->total_size -= end - rbase;
609 memblock_insert_region(type, i--, rbase, end - rbase, 643 memblock_insert_region(type, i--, rbase, end - rbase,
610 memblock_get_region_node(rgn)); 644 memblock_get_region_node(rgn),
645 rgn->flags);
611 } else { 646 } else {
612 /* @rgn is fully contained, record it */ 647 /* @rgn is fully contained, record it */
613 if (!*end_rgn) 648 if (!*end_rgn)
@@ -643,28 +678,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
643{ 678{
644 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 679 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
645 (unsigned long long)base, 680 (unsigned long long)base,
646 (unsigned long long)base + size, 681 (unsigned long long)base + size - 1,
647 (void *)_RET_IP_); 682 (void *)_RET_IP_);
648 683
649 return __memblock_remove(&memblock.reserved, base, size); 684 return __memblock_remove(&memblock.reserved, base, size);
650} 685}
651 686
652int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 687static int __init_memblock memblock_reserve_region(phys_addr_t base,
688 phys_addr_t size,
689 int nid,
690 unsigned long flags)
653{ 691{
654 struct memblock_type *_rgn = &memblock.reserved; 692 struct memblock_type *_rgn = &memblock.reserved;
655 693
656 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", 694 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
657 (unsigned long long)base, 695 (unsigned long long)base,
658 (unsigned long long)base + size, 696 (unsigned long long)base + size - 1,
659 (void *)_RET_IP_); 697 flags, (void *)_RET_IP_);
698
699 return memblock_add_region(_rgn, base, size, nid, flags);
700}
701
702int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
703{
704 return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
705}
706
707/**
708 * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
709 * @base: the base phys addr of the region
710 * @size: the size of the region
711 *
712 * This function isolates region [@base, @base + @size), and marks it with flag
713 * MEMBLOCK_HOTPLUG.
714 *
715 * Return 0 on success, -errno on failure.
716 */
717int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
718{
719 struct memblock_type *type = &memblock.memory;
720 int i, ret, start_rgn, end_rgn;
721
722 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
723 if (ret)
724 return ret;
725
726 for (i = start_rgn; i < end_rgn; i++)
727 memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
660 728
661 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 729 memblock_merge_regions(type);
730 return 0;
731}
732
733/**
734 * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
735 * @base: the base phys addr of the region
736 * @size: the size of the region
737 *
738 * This function isolates region [@base, @base + @size), and clears flag
739 * MEMBLOCK_HOTPLUG for the isolated regions.
740 *
741 * Return 0 on success, -errno on failure.
742 */
743int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
744{
745 struct memblock_type *type = &memblock.memory;
746 int i, ret, start_rgn, end_rgn;
747
748 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
749 if (ret)
750 return ret;
751
752 for (i = start_rgn; i < end_rgn; i++)
753 memblock_clear_region_flags(&type->regions[i],
754 MEMBLOCK_HOTPLUG);
755
756 memblock_merge_regions(type);
757 return 0;
662} 758}
663 759
664/** 760/**
665 * __next_free_mem_range - next function for for_each_free_mem_range() 761 * __next_free_mem_range - next function for for_each_free_mem_range()
666 * @idx: pointer to u64 loop variable 762 * @idx: pointer to u64 loop variable
667 * @nid: node selector, %MAX_NUMNODES for all nodes 763 * @nid: node selector, %NUMA_NO_NODE for all nodes
668 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 764 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
669 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 765 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
670 * @out_nid: ptr to int for nid of the range, can be %NULL 766 * @out_nid: ptr to int for nid of the range, can be %NULL
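[Editor's note] The expected users of the two markers added above are early NUMA table parsers: firmware says which physical ranges are hotpluggable, marking them records that in memblock, and (with the iterator change a few hunks below) movable_node then keeps kernel allocations out of those ranges. A hedged sketch of such a caller; the function and its arguments are invented, only memblock_add_node() and memblock_mark_hotplug() are real:

#include <linux/memblock.h>

/*
 * Hypothetical hook called once per firmware-described memory affinity
 * entry, e.g. while walking an SRAT-like table during early boot.
 */
static int __init register_fw_range(phys_addr_t base, phys_addr_t size,
				    int nid, bool hotpluggable)
{
	int ret = memblock_add_node(base, size, nid);

	if (!ret && hotpluggable)
		ret = memblock_mark_hotplug(base, size);
	return ret;
}

memblock_clear_hotplug() is the symmetric escape hatch for when the kernel later decides it must use such a range after all.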
@@ -693,13 +789,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
693 int mi = *idx & 0xffffffff; 789 int mi = *idx & 0xffffffff;
694 int ri = *idx >> 32; 790 int ri = *idx >> 32;
695 791
792 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
793 nid = NUMA_NO_NODE;
794
696 for ( ; mi < mem->cnt; mi++) { 795 for ( ; mi < mem->cnt; mi++) {
697 struct memblock_region *m = &mem->regions[mi]; 796 struct memblock_region *m = &mem->regions[mi];
698 phys_addr_t m_start = m->base; 797 phys_addr_t m_start = m->base;
699 phys_addr_t m_end = m->base + m->size; 798 phys_addr_t m_end = m->base + m->size;
700 799
701 /* only memory regions are associated with nodes, check it */ 800 /* only memory regions are associated with nodes, check it */
702 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 801 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
703 continue; 802 continue;
704 803
705 /* scan areas before each reservation for intersection */ 804 /* scan areas before each reservation for intersection */
@@ -740,12 +839,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
740/** 839/**
741 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 840 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
742 * @idx: pointer to u64 loop variable 841 * @idx: pointer to u64 loop variable
743 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 842 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
744 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 843 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
745 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 844 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
746 * @out_nid: ptr to int for nid of the range, can be %NULL 845 * @out_nid: ptr to int for nid of the range, can be %NULL
747 * 846 *
748 * Reverse of __next_free_mem_range(). 847 * Reverse of __next_free_mem_range().
848 *
849 * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
850 * be able to hot-remove hotpluggable memory used by the kernel. So this
851 * function skip hotpluggable regions if needed when allocating memory for the
852 * kernel.
749 */ 853 */
750void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 854void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
751 phys_addr_t *out_start, 855 phys_addr_t *out_start,
@@ -756,6 +860,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
756 int mi = *idx & 0xffffffff; 860 int mi = *idx & 0xffffffff;
757 int ri = *idx >> 32; 861 int ri = *idx >> 32;
758 862
863 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
864 nid = NUMA_NO_NODE;
865
759 if (*idx == (u64)ULLONG_MAX) { 866 if (*idx == (u64)ULLONG_MAX) {
760 mi = mem->cnt - 1; 867 mi = mem->cnt - 1;
761 ri = rsv->cnt; 868 ri = rsv->cnt;
@@ -767,7 +874,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
767 phys_addr_t m_end = m->base + m->size; 874 phys_addr_t m_end = m->base + m->size;
768 875
769 /* only memory regions are associated with nodes, check it */ 876 /* only memory regions are associated with nodes, check it */
770 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 877 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
878 continue;
879
880 /* skip hotpluggable memory regions if needed */
881 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
771 continue; 882 continue;
772 883
773 /* scan areas before each reservation for intersection */ 884 /* scan areas before each reservation for intersection */
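[Editor's note] The check lands in the top-down iterator because that is what kernel-side allocations use; anything driving for_each_free_mem_range_reverse() now silently skips hotpluggable ranges once movable_node is enabled. A small sketch of such a loop, assuming the usual iterator macro from <linux/memblock.h>; the printout is only for illustration:

#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/printk.h>

static void __init dump_free_ranges_top_down(void)
{
	phys_addr_t start, end;
	int nid;
	u64 i;

	/* With movable_node enabled, hotpluggable regions never show up here. */
	for_each_free_mem_range_reverse(i, NUMA_NO_NODE, &start, &end, &nid)
		pr_info("free: [%pa-%pa] node %d\n", &start, &end, nid);
}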
@@ -837,18 +948,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
837 * memblock_set_node - set node ID on memblock regions 948 * memblock_set_node - set node ID on memblock regions
838 * @base: base of area to set node ID for 949 * @base: base of area to set node ID for
839 * @size: size of area to set node ID for 950 * @size: size of area to set node ID for
951 * @type: memblock type to set node ID for
840 * @nid: node ID to set 952 * @nid: node ID to set
841 * 953 *
842 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. 954 * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
843 * Regions which cross the area boundaries are split as necessary. 955 * Regions which cross the area boundaries are split as necessary.
844 * 956 *
845 * RETURNS: 957 * RETURNS:
846 * 0 on success, -errno on failure. 958 * 0 on success, -errno on failure.
847 */ 959 */
848int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, 960int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
849 int nid) 961 struct memblock_type *type, int nid)
850{ 962{
851 struct memblock_type *type = &memblock.memory;
852 int start_rgn, end_rgn; 963 int start_rgn, end_rgn;
853 int i, ret; 964 int i, ret;
854 965
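[Editor's note] Callers now say which region array they are annotating instead of the function assuming memblock.memory; for almost everyone that just means threading &memblock.memory through. A minimal sketch of the updated call, assuming CONFIG_HAVE_MEMBLOCK_NODE_MAP and a range supplied by the caller:

#include <linux/memblock.h>

/* Sketch: record that [base, base + size) belongs to NUMA node @nid. */
static int __init set_range_node(phys_addr_t base, phys_addr_t size, int nid)
{
	/* The target memblock type is now an explicit argument. */
	return memblock_set_node(base, size, &memblock.memory, nid);
}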
@@ -870,13 +981,10 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
870{ 981{
871 phys_addr_t found; 982 phys_addr_t found;
872 983
873 if (WARN_ON(!align)) 984 if (!align)
874 align = __alignof__(long long); 985 align = SMP_CACHE_BYTES;
875 986
876 /* align @size to avoid excessive fragmentation on reserved array */ 987 found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
877 size = round_up(size, align);
878
879 found = memblock_find_in_range_node(0, max_addr, size, align, nid);
880 if (found && !memblock_reserve(found, size)) 988 if (found && !memblock_reserve(found, size))
881 return found; 989 return found;
882 990
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
890 998
891phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 999phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
892{ 1000{
893 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 1001 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
894} 1002}
895 1003
896phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1004phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
920 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 1028 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
921} 1029}
922 1030
1031/**
1032 * memblock_virt_alloc_internal - allocate boot memory block
1033 * @size: size of memory block to be allocated in bytes
1034 * @align: alignment of the region and block's size
1035 * @min_addr: the lower bound of the memory region to allocate (phys address)
1036 * @max_addr: the upper bound of the memory region to allocate (phys address)
1037 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1038 *
1039 * The @min_addr limit is dropped if it can not be satisfied and the allocation
1040 * will fall back to memory below @min_addr. Also, allocation may fall back
1041 * to any node in the system if the specified node can not
1042 * hold the requested memory.
1043 *
1044 * The allocation is performed from memory region limited by
1045 * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
1046 *
1047 * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
1048 *
1049 * The phys address of allocated boot memory block is converted to virtual and
1050 * allocated memory is reset to 0.
1051 *
1052 * In addition, the function sets the min_count to 0 using kmemleak_alloc for
1053 * the allocated boot memory block, so that it is never reported as a leak.
1054 *
1055 * RETURNS:
1056 * Virtual address of allocated memory block on success, NULL on failure.
1057 */
1058static void * __init memblock_virt_alloc_internal(
1059 phys_addr_t size, phys_addr_t align,
1060 phys_addr_t min_addr, phys_addr_t max_addr,
1061 int nid)
1062{
1063 phys_addr_t alloc;
1064 void *ptr;
1065
1066 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
1067 nid = NUMA_NO_NODE;
1068
1069 /*
1070 * Detect any accidental use of these APIs after slab is ready, as at
1071 * this moment memblock may be deinitialized already and its
1072 * internal data may be destroyed (after execution of free_all_bootmem)
1073 */
1074 if (WARN_ON_ONCE(slab_is_available()))
1075 return kzalloc_node(size, GFP_NOWAIT, nid);
1076
1077 if (!align)
1078 align = SMP_CACHE_BYTES;
1079
1080 if (max_addr > memblock.current_limit)
1081 max_addr = memblock.current_limit;
1082
1083again:
1084 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
1085 nid);
1086 if (alloc)
1087 goto done;
1088
1089 if (nid != NUMA_NO_NODE) {
1090 alloc = memblock_find_in_range_node(size, align, min_addr,
1091 max_addr, NUMA_NO_NODE);
1092 if (alloc)
1093 goto done;
1094 }
1095
1096 if (min_addr) {
1097 min_addr = 0;
1098 goto again;
1099 } else {
1100 goto error;
1101 }
1102
1103done:
1104 memblock_reserve(alloc, size);
1105 ptr = phys_to_virt(alloc);
1106 memset(ptr, 0, size);
1107
1108 /*
1109 * The min_count is set to 0 so that bootmem allocated blocks
1110 * are never reported as leaks. This is because many of these blocks
1111 * are only referred via the physical address which is not
1112 * looked up by kmemleak.
1113 */
1114 kmemleak_alloc(ptr, size, 0, 0);
1115
1116 return ptr;
1117
1118error:
1119 return NULL;
1120}
1121
1122/**
1123 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
1124 * @size: size of memory block to be allocated in bytes
1125 * @align: alignment of the region and block's size
1126 * @min_addr: the lower bound of the memory region from where the allocation
1127 * is preferred (phys address)
1128 * @max_addr: the upper bound of the memory region from where the allocation
1129 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1130 * allocate only from memory limited by memblock.current_limit value
1131 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1132 *
1133 * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
1134 * additional debug information (including caller info), if enabled.
1135 *
1136 * RETURNS:
1137 * Virtual address of allocated memory block on success, NULL on failure.
1138 */
1139void * __init memblock_virt_alloc_try_nid_nopanic(
1140 phys_addr_t size, phys_addr_t align,
1141 phys_addr_t min_addr, phys_addr_t max_addr,
1142 int nid)
1143{
1144 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1145 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1146 (u64)max_addr, (void *)_RET_IP_);
1147 return memblock_virt_alloc_internal(size, align, min_addr,
1148 max_addr, nid);
1149}
1150
1151/**
1152 * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
1153 * @size: size of memory block to be allocated in bytes
1154 * @align: alignment of the region and block's size
1155 * @min_addr: the lower bound of the memory region from where the allocation
1156 * is preferred (phys address)
1157 * @max_addr: the upper bound of the memory region from where the allocation
1158 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1159 * allocate only from memory limited by memblock.current_limit value
1160 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1161 *
1162 * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
1163 * which provides debug information (including caller info), if enabled,
1164 * and panics if the request can not be satisfied.
1165 *
1166 * RETURNS:
1167 * Virtual address of allocated memory block on success, NULL on failure.
1168 */
1169void * __init memblock_virt_alloc_try_nid(
1170 phys_addr_t size, phys_addr_t align,
1171 phys_addr_t min_addr, phys_addr_t max_addr,
1172 int nid)
1173{
1174 void *ptr;
1175
1176 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1177 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1178 (u64)max_addr, (void *)_RET_IP_);
1179 ptr = memblock_virt_alloc_internal(size, align,
1180 min_addr, max_addr, nid);
1181 if (ptr)
1182 return ptr;
1183
1184 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
1185 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1186 (u64)max_addr);
1187 return NULL;
1188}
1189
1190/**
1191 * __memblock_free_early - free boot memory block
1192 * @base: phys starting address of the boot memory block
1193 * @size: size of the boot memory block in bytes
1194 *
1195 * Free a boot memory block previously allocated by the memblock_virt_alloc_xx() API.
1196 * The freed memory will not be released to the buddy allocator.
1197 */
1198void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1199{
1200 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1201 __func__, (u64)base, (u64)base + size - 1,
1202 (void *)_RET_IP_);
1203 kmemleak_free_part(__va(base), size);
1204 __memblock_remove(&memblock.reserved, base, size);
1205}
1206
1207/*
1208 * __memblock_free_late - free bootmem block pages directly to buddy allocator
1209 * @addr: phys starting address of the boot memory block
1210 * @size: size of the boot memory block in bytes
1211 *
1212 * This is only useful when the bootmem allocator has already been torn
1213 * down, but we are still initializing the system. Pages are released directly
1214 * to the buddy allocator, no bootmem metadata is updated because it is gone.
1215 */
1216void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
1217{
1218 u64 cursor, end;
1219
1220 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1221 __func__, (u64)base, (u64)base + size - 1,
1222 (void *)_RET_IP_);
1223 kmemleak_free_part(__va(base), size);
1224 cursor = PFN_UP(base);
1225 end = PFN_DOWN(base + size);
1226
1227 for (; cursor < end; cursor++) {
1228 __free_pages_bootmem(pfn_to_page(cursor), 0);
1229 totalram_pages++;
1230 }
1231}
923 1232
924/* 1233/*
925 * Remaining API functions 1234 * Remaining API functions
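[Editor's note] Taken together, the block added above gives boot code a bootmem-style front end on top of memblock: zeroed memory, virtual return values, kmemleak registration, and a matching pair of free helpers for before and after the page allocator is live. A hedged usage sketch; the size, node and the choice of the panicking variant are illustrative, and BOOTMEM_ALLOC_ACCESSIBLE is assumed to be the usual "up to current_limit" sentinel from <linux/bootmem.h>:

#include <linux/bootmem.h>
#include <linux/memblock.h>

static void * __init alloc_early_table(unsigned long bytes, int nid)
{
	/*
	 * align == 0 falls back to SMP_CACHE_BYTES; use the _nopanic
	 * variant instead if the caller can cope with a NULL return.
	 */
	return memblock_virt_alloc_try_nid(bytes, 0, 0,
					   BOOTMEM_ALLOC_ACCESSIBLE, nid);
}

static void __init free_early_table(void *table, unsigned long bytes)
{
	/* Still before free_all_bootmem(): hand the block back to memblock. */
	__memblock_free_early(__pa(table), bytes);
}

__memblock_free_late() is the counterpart for the window after the bootmem teardown, when pages have to go straight to the buddy allocator.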
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
1101static void __init_memblock memblock_dump(struct memblock_type *type, char *name) 1410static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
1102{ 1411{
1103 unsigned long long base, size; 1412 unsigned long long base, size;
1413 unsigned long flags;
1104 int i; 1414 int i;
1105 1415
1106 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); 1416 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
1111 1421
1112 base = rgn->base; 1422 base = rgn->base;
1113 size = rgn->size; 1423 size = rgn->size;
1424 flags = rgn->flags;
1114#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1425#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1115 if (memblock_get_region_node(rgn) != MAX_NUMNODES) 1426 if (memblock_get_region_node(rgn) != MAX_NUMNODES)
1116 snprintf(nid_buf, sizeof(nid_buf), " on node %d", 1427 snprintf(nid_buf, sizeof(nid_buf), " on node %d",
1117 memblock_get_region_node(rgn)); 1428 memblock_get_region_node(rgn));
1118#endif 1429#endif
1119 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", 1430 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
1120 name, i, base, base + size - 1, size, nid_buf); 1431 name, i, base, base + size - 1, size, nid_buf, flags);
1121 } 1432 }
1122} 1433}
1123 1434
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f1a356153c0..53385cd4e6f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,16 +45,17 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
58#include <linux/file.h>
58#include "internal.h" 59#include "internal.h"
59#include <net/sock.h> 60#include <net/sock.h>
60#include <net/ip.h> 61#include <net/ip.h>
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
148 * matches memcg->dead_count of the hierarchy root group. 149 * matches memcg->dead_count of the hierarchy root group.
149 */ 150 */
150 struct mem_cgroup *last_visited; 151 struct mem_cgroup *last_visited;
151 unsigned long last_dead_count; 152 int last_dead_count;
152 153
153 /* scan generation, increased every round-trip */ 154 /* scan generation, increased every round-trip */
154 unsigned int generation; 155 unsigned int generation;
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
228}; 229};
229 230
231/*
232 * cgroup_event represents events which userspace want to receive.
233 */
234struct mem_cgroup_event {
235 /*
236 * memcg which the event belongs to.
237 */
238 struct mem_cgroup *memcg;
239 /*
240 * eventfd to signal userspace about the event.
241 */
242 struct eventfd_ctx *eventfd;
243 /*
244 * Each of these stored in a list by the cgroup.
245 */
246 struct list_head list;
247 /*
248 * register_event() callback will be used to add new userspace
249 * waiter for changes related to this event. Use eventfd_signal()
250 * on eventfd to send notification to userspace.
251 */
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254 /*
255 * unregister_event() callback will be called when userspace closes
256 * the eventfd or when the cgroup is removed. This callback must be set
257 * if you want to provide notification functionality.
258 */
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261 /*
262 * All fields below needed to unregister event when
263 * userspace closes eventfd.
264 */
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 273
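[Editor's note] This struct is the kernel half of the memcg (cgroup v1) notification interface: userspace creates an eventfd, writes "<event fd> <fd of a control file> [args]" to cgroup.event_control, and the matching register_event()/unregister_event() callbacks hook the eventfd into the threshold or OOM notifier lists; the poll/wait members let the event tear itself down when the eventfd is closed. A hedged userspace sketch of arming a usage threshold; the mount point, group name and 64 MiB value are assumptions:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/demo";	/* assumed v1 mount */
	char path[256], cmd[64];
	uint64_t ticks;
	int efd, ufd, cfd;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	ufd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <target_fd> <threshold>" arms a usage threshold. */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	read(efd, &ticks, sizeof(ticks));	/* blocks until usage crosses 64 MiB */
	return 0;
}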
@@ -331,27 +372,20 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 372 atomic_t numainfo_updating;
332#endif 373#endif
333 374
375 /* List of events which userspace want to receive */
376 struct list_head event_list;
377 spinlock_t event_list_lock;
378
334 struct mem_cgroup_per_node *nodeinfo[0]; 379 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 380 /* WARNING: nodeinfo must be the last member here */
336}; 381};
337 382
338static size_t memcg_size(void)
339{
340 return sizeof(struct mem_cgroup) +
341 nr_node_ids * sizeof(struct mem_cgroup_per_node *);
342}
343
344/* internal only representation about the status of kmem accounting. */ 383/* internal only representation about the status of kmem accounting. */
345enum { 384enum {
346 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
347 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
348 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
349}; 387};
350 388
351/* We account when limit is on, but only after call sites are patched */
352#define KMEM_ACCOUNTED_MASK \
353 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
354
355#ifdef CONFIG_MEMCG_KMEM 389#ifdef CONFIG_MEMCG_KMEM
356static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
357{ 391{
@@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
363 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
364} 398}
365 399
366static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
369}
370
371static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
372{
373 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
377{ 401{
378 /* 402 /*
@@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 515}
492 516
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 518{
500 return (memcg == root_mem_cgroup); 519 return (memcg == root_mem_cgroup);
@@ -1098,16 +1117,22 @@ skip_node:
1098 * skipped and we should continue the tree walk. 1117 * skipped and we should continue the tree walk.
1099 * last_visited css is safe to use because it is 1118 * last_visited css is safe to use because it is
1100 * protected by css_get and the tree walk is rcu safe. 1119 * protected by css_get and the tree walk is rcu safe.
1120 *
1121 * We do not take a reference on the root of the tree walk
1122 * because we might race with the root removal when it would
1123 * be the only node in the iterated hierarchy and mem_cgroup_iter
1124 * would end up in an endless loop because it expects that at
1125 * least one valid node will be returned. Root cannot disappear
1126 * because caller of the iterator should hold it already so
1127 * skipping css reference should be safe.
1101 */ 1128 */
1102 if (next_css) { 1129 if (next_css) {
1103 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1130 if ((next_css->flags & CSS_ONLINE) &&
1131 (next_css == &root->css || css_tryget(next_css)))
1132 return mem_cgroup_from_css(next_css);
1104 1133
1105 if (css_tryget(&mem->css)) 1134 prev_css = next_css;
1106 return mem; 1135 goto skip_node;
1107 else {
1108 prev_css = next_css;
1109 goto skip_node;
1110 }
1111 } 1136 }
1112 1137
1113 return NULL; 1138 return NULL;
@@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1141 if (iter->last_dead_count == *sequence) { 1166 if (iter->last_dead_count == *sequence) {
1142 smp_rmb(); 1167 smp_rmb();
1143 position = iter->last_visited; 1168 position = iter->last_visited;
1144 if (position && !css_tryget(&position->css)) 1169
1170 /*
1171 * We cannot take a reference to root because we might race
1172 * with root removal and returning NULL would end up in
1173 * an endless loop on the iterator user level when root
1174 * would be returned all the time.
1175 */
1176 if (position && position != root &&
1177 !css_tryget(&position->css))
1145 position = NULL; 1178 position = NULL;
1146 } 1179 }
1147 return position; 1180 return position;
@@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1150static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1183static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1151 struct mem_cgroup *last_visited, 1184 struct mem_cgroup *last_visited,
1152 struct mem_cgroup *new_position, 1185 struct mem_cgroup *new_position,
1186 struct mem_cgroup *root,
1153 int sequence) 1187 int sequence)
1154{ 1188{
1155 if (last_visited) 1189 /* root reference counting symmetric to mem_cgroup_iter_load */
1190 if (last_visited && last_visited != root)
1156 css_put(&last_visited->css); 1191 css_put(&last_visited->css);
1157 /* 1192 /*
1158 * We store the sequence count from the time @last_visited was 1193 * We store the sequence count from the time @last_visited was
@@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1227 memcg = __mem_cgroup_iter_next(root, last_visited); 1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1228 1263
1229 if (reclaim) { 1264 if (reclaim) {
1230 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1265 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1266 seq);
1231 1267
1232 if (!memcg) 1268 if (!memcg)
1233 iter->generation++; 1269 iter->generation++;
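[Editor's note] For reference, the contract these reference-counting changes preserve: a caller does a pre-order walk by feeding the previous result back in, and must either run the loop to completion or call mem_cgroup_iter_break() so the css reference taken by the iterator gets dropped. A sketch of the non-reclaim form; the predicate is a placeholder:

#include <linux/memcontrol.h>

static bool wants_to_stop(struct mem_cgroup *memcg)
{
	return false;	/* placeholder predicate for the sketch */
}

/* Visit @root and every descendant memcg in its hierarchy. */
static void walk_memcg_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for (iter = mem_cgroup_iter(root, NULL, NULL);
	     iter;
	     iter = mem_cgroup_iter(root, iter, NULL)) {
		if (wants_to_stop(iter)) {
			/* Breaking out early must drop the iterator's reference. */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}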
@@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1647 */ 1683 */
1648void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1649{ 1685{
1650 struct cgroup *task_cgrp;
1651 struct cgroup *mem_cgrp;
1652 /* 1686 /*
1653 * Need a buffer in BSS, can't rely on allocations. The code relies 1687 * protects memcg_name and makes sure that parallel ooms do not
1654 * on the assumption that OOM is serialized for memory controller. 1688 * interleave
1655 * If this assumption is broken, revisit this code.
1656 */ 1689 */
1690 static DEFINE_SPINLOCK(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1657 static char memcg_name[PATH_MAX]; 1693 static char memcg_name[PATH_MAX];
1658 int ret; 1694 int ret;
1659 struct mem_cgroup *iter; 1695 struct mem_cgroup *iter;
@@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1662 if (!p) 1698 if (!p)
1663 return; 1699 return;
1664 1700
1701 spin_lock(&oom_info_lock);
1665 rcu_read_lock(); 1702 rcu_read_lock();
1666 1703
1667 mem_cgrp = memcg->css.cgroup; 1704 mem_cgrp = memcg->css.cgroup;
@@ -1730,6 +1767,7 @@ done:
1730 1767
1731 pr_cont("\n"); 1768 pr_cont("\n");
1732 } 1769 }
1770 spin_unlock(&oom_info_lock);
1733} 1771}
1734 1772
1735/* 1773/*
@@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1822 break; 1860 break;
1823 }; 1861 };
1824 points = oom_badness(task, memcg, NULL, totalpages); 1862 points = oom_badness(task, memcg, NULL, totalpages);
1825 if (points > chosen_points) { 1863 if (!points || points < chosen_points)
1826 if (chosen) 1864 continue;
1827 put_task_struct(chosen); 1865 /* Prefer thread group leaders for display purposes */
1828 chosen = task; 1866 if (points == chosen_points &&
1829 chosen_points = points; 1867 thread_group_leader(chosen))
1830 get_task_struct(chosen); 1868 continue;
1831 } 1869
1870 if (chosen)
1871 put_task_struct(chosen);
1872 chosen = task;
1873 chosen_points = points;
1874 get_task_struct(chosen);
1832 } 1875 }
1833 css_task_iter_end(&it); 1876 css_task_iter_end(&it);
1834 } 1877 }
@@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2861 unsigned short id; 2904 unsigned short id;
2862 swp_entry_t ent; 2905 swp_entry_t ent;
2863 2906
2864 VM_BUG_ON(!PageLocked(page)); 2907 VM_BUG_ON_PAGE(!PageLocked(page), page);
2865 2908
2866 pc = lookup_page_cgroup(page); 2909 pc = lookup_page_cgroup(page);
2867 lock_page_cgroup(pc); 2910 lock_page_cgroup(pc);
@@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2895 bool anon; 2938 bool anon;
2896 2939
2897 lock_page_cgroup(pc); 2940 lock_page_cgroup(pc);
2898 VM_BUG_ON(PageCgroupUsed(pc)); 2941 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2899 /* 2942 /*
2900 * we don't need page_cgroup_lock about tail pages, because they are not 2943 * we don't need page_cgroup_lock about tail pages, because they are not
2901 * accessed by any other context at this point. 2944 * accessed by any other context at this point.
@@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2930 if (lrucare) { 2973 if (lrucare) {
2931 if (was_on_lru) { 2974 if (was_on_lru) {
2932 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2975 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2933 VM_BUG_ON(PageLRU(page)); 2976 VM_BUG_ON_PAGE(PageLRU(page), page);
2934 SetPageLRU(page); 2977 SetPageLRU(page);
2935 add_page_to_lru_list(page, lruvec, page_lru(page)); 2978 add_page_to_lru_list(page, lruvec, page_lru(page));
2936 } 2979 }
@@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2956static DEFINE_MUTEX(set_limit_mutex); 2999static DEFINE_MUTEX(set_limit_mutex);
2957 3000
2958#ifdef CONFIG_MEMCG_KMEM 3001#ifdef CONFIG_MEMCG_KMEM
3002static DEFINE_MUTEX(activate_kmem_mutex);
3003
2959static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3004static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2960{ 3005{
2961 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3006 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2962 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3007 memcg_kmem_is_active(memcg);
2963} 3008}
2964 3009
2965/* 3010/*
@@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2976} 3021}
2977 3022
2978#ifdef CONFIG_SLABINFO 3023#ifdef CONFIG_SLABINFO
2979static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3024static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2980 struct cftype *cft, struct seq_file *m)
2981{ 3025{
2982 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3026 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2983 struct memcg_cache_params *params; 3027 struct memcg_cache_params *params;
2984 3028
2985 if (!memcg_can_account_kmem(memcg)) 3029 if (!memcg_can_account_kmem(memcg))
@@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3059 css_put(&memcg->css); 3103 css_put(&memcg->css);
3060} 3104}
3061 3105
3062void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3063{
3064 if (!memcg)
3065 return;
3066
3067 mutex_lock(&memcg->slab_caches_mutex);
3068 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3069 mutex_unlock(&memcg->slab_caches_mutex);
3070}
3071
3072/* 3106/*
3073 * helper for accessing a memcg's index. It will be used as an index in the 3107 * helper for accessing a memcg's index. It will be used as an index in the
3074 * child cache array in kmem_cache, and also to derive its name. This function 3108 * child cache array in kmem_cache, and also to derive its name. This function
@@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
3079 return memcg ? memcg->kmemcg_id : -1; 3113 return memcg ? memcg->kmemcg_id : -1;
3080} 3114}
3081 3115
3082/*
3083 * This ends up being protected by the set_limit mutex, during normal
3084 * operation, because that is its main call site.
3085 *
3086 * But when we create a new cache, we can call this as well if its parent
3087 * is kmem-limited. That will have to hold set_limit_mutex as well.
3088 */
3089int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3090{
3091 int num, ret;
3092
3093 num = ida_simple_get(&kmem_limited_groups,
3094 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3095 if (num < 0)
3096 return num;
3097 /*
3098 * After this point, kmem_accounted (that we test atomically in
3099 * the beginning of this conditional), is no longer 0. This
3100 * guarantees only one process will set the following boolean
3101 * to true. We don't need test_and_set because we're protected
3102 * by the set_limit_mutex anyway.
3103 */
3104 memcg_kmem_set_activated(memcg);
3105
3106 ret = memcg_update_all_caches(num+1);
3107 if (ret) {
3108 ida_simple_remove(&kmem_limited_groups, num);
3109 memcg_kmem_clear_activated(memcg);
3110 return ret;
3111 }
3112
3113 memcg->kmemcg_id = num;
3114 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3115 mutex_init(&memcg->slab_caches_mutex);
3116 return 0;
3117}
3118
3119static size_t memcg_caches_array_size(int num_groups) 3116static size_t memcg_caches_array_size(int num_groups)
3120{ 3117{
3121 ssize_t size; 3118 ssize_t size;
@@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3152 3149
3153 if (num_groups > memcg_limited_groups_array_size) { 3150 if (num_groups > memcg_limited_groups_array_size) {
3154 int i; 3151 int i;
3152 struct memcg_cache_params *new_params;
3155 ssize_t size = memcg_caches_array_size(num_groups); 3153 ssize_t size = memcg_caches_array_size(num_groups);
3156 3154
3157 size *= sizeof(void *); 3155 size *= sizeof(void *);
3158 size += offsetof(struct memcg_cache_params, memcg_caches); 3156 size += offsetof(struct memcg_cache_params, memcg_caches);
3159 3157
3160 s->memcg_params = kzalloc(size, GFP_KERNEL); 3158 new_params = kzalloc(size, GFP_KERNEL);
3161 if (!s->memcg_params) { 3159 if (!new_params)
3162 s->memcg_params = cur_params;
3163 return -ENOMEM; 3160 return -ENOMEM;
3164 }
3165 3161
3166 s->memcg_params->is_root_cache = true; 3162 new_params->is_root_cache = true;
3167 3163
3168 /* 3164 /*
3169 * There is the chance it will be bigger than 3165 * There is the chance it will be bigger than
@@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3177 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3173 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3178 if (!cur_params->memcg_caches[i]) 3174 if (!cur_params->memcg_caches[i])
3179 continue; 3175 continue;
3180 s->memcg_params->memcg_caches[i] = 3176 new_params->memcg_caches[i] =
3181 cur_params->memcg_caches[i]; 3177 cur_params->memcg_caches[i];
3182 } 3178 }
3183 3179
@@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3190 * bigger than the others. And all updates will reset this 3186 * bigger than the others. And all updates will reset this
3191 * anyway. 3187 * anyway.
3192 */ 3188 */
3193 kfree(cur_params); 3189 rcu_assign_pointer(s->memcg_params, new_params);
3190 if (cur_params)
3191 kfree_rcu(cur_params, rcu_head);
3194 } 3192 }
3195 return 0; 3193 return 0;
3196} 3194}
3197 3195
3198int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3196int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3199 struct kmem_cache *root_cache) 3197 struct kmem_cache *root_cache)
3200{ 3198{
3201 size_t size; 3199 size_t size;
3202 3200
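[Editor's note] The replacement above is the classic RCU publish pattern: build the larger array off to the side, publish it with rcu_assign_pointer(), and let kfree_rcu() delay the free past a grace period so lockless readers such as cache_from_memcg_idx() can never dereference freed memory. A generic, self-contained sketch of the same pattern with an invented structure; writers are assumed to serialize on an external lock, as slab_mutex does here:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

struct table {
	int nr;
	struct rcu_head rcu_head;	/* required by kfree_rcu() */
	void *slots[];
};

static struct table __rcu *active;

/* Writer side: caller holds the (external) update lock. */
static int grow_table(int new_nr)
{
	struct table *old = rcu_dereference_protected(active, true);
	struct table *new;

	new = kzalloc(sizeof(*new) + new_nr * sizeof(void *), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->nr = new_nr;
	if (old)
		memcpy(new->slots, old->slots, old->nr * sizeof(void *));

	rcu_assign_pointer(active, new);	/* publish only after initialization */
	if (old)
		kfree_rcu(old, rcu_head);	/* freed after a grace period */
	return 0;
}

/* Reader side: no lock, just an RCU read-side critical section. */
static void *table_lookup(int idx)
{
	struct table *t;
	void *p = NULL;

	rcu_read_lock();
	t = rcu_dereference(active);
	if (t && idx < t->nr)
		p = t->slots[idx];
	rcu_read_unlock();
	return p;
}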
@@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3224 return 0; 3222 return 0;
3225} 3223}
3226 3224
3227void memcg_release_cache(struct kmem_cache *s) 3225void memcg_free_cache_params(struct kmem_cache *s)
3226{
3227 kfree(s->memcg_params);
3228}
3229
3230void memcg_register_cache(struct kmem_cache *s)
3228{ 3231{
3229 struct kmem_cache *root; 3232 struct kmem_cache *root;
3230 struct mem_cgroup *memcg; 3233 struct mem_cgroup *memcg;
3231 int id; 3234 int id;
3232 3235
3233 /* 3236 if (is_root_cache(s))
3234 * This happens, for instance, when a root cache goes away before we
3235 * add any memcg.
3236 */
3237 if (!s->memcg_params)
3238 return; 3237 return;
3239 3238
3240 if (s->memcg_params->is_root_cache) 3239 /*
3241 goto out; 3240 * Holding the slab_mutex assures nobody will touch the memcg_caches
3241 * array while we are modifying it.
3242 */
3243 lockdep_assert_held(&slab_mutex);
3242 3244
3245 root = s->memcg_params->root_cache;
3243 memcg = s->memcg_params->memcg; 3246 memcg = s->memcg_params->memcg;
3244 id = memcg_cache_id(memcg); 3247 id = memcg_cache_id(memcg);
3248
3249 css_get(&memcg->css);
3250
3251
3252 /*
3253 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3254 * barrier here to ensure nobody will see the kmem_cache partially
3255 * initialized.
3256 */
3257 smp_wmb();
3258
3259 /*
3260 * Initialize the pointer to this cache in its parent's memcg_params
3261 * before adding it to the memcg_slab_caches list, otherwise we can
3262 * fail to convert memcg_params_to_cache() while traversing the list.
3263 */
3264 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3265 root->memcg_params->memcg_caches[id] = s;
3266
3267 mutex_lock(&memcg->slab_caches_mutex);
3268 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3269 mutex_unlock(&memcg->slab_caches_mutex);
3270}
3271
3272void memcg_unregister_cache(struct kmem_cache *s)
3273{
3274 struct kmem_cache *root;
3275 struct mem_cgroup *memcg;
3276 int id;
3277
3278 if (is_root_cache(s))
3279 return;
3280
3281 /*
3282 * Holding the slab_mutex assures nobody will touch the memcg_caches
3283 * array while we are modifying it.
3284 */
3285 lockdep_assert_held(&slab_mutex);
3245 3286
3246 root = s->memcg_params->root_cache; 3287 root = s->memcg_params->root_cache;
3247 root->memcg_params->memcg_caches[id] = NULL; 3288 memcg = s->memcg_params->memcg;
3289 id = memcg_cache_id(memcg);
3248 3290
3249 mutex_lock(&memcg->slab_caches_mutex); 3291 mutex_lock(&memcg->slab_caches_mutex);
3250 list_del(&s->memcg_params->list); 3292 list_del(&s->memcg_params->list);
3251 mutex_unlock(&memcg->slab_caches_mutex); 3293 mutex_unlock(&memcg->slab_caches_mutex);
3252 3294
3295 /*
3296 * Clear the pointer to this cache in its parent's memcg_params only
3297 * after removing it from the memcg_slab_caches list, otherwise we can
3298 * fail to convert memcg_params_to_cache() while traversing the list.
3299 */
3300 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3301 root->memcg_params->memcg_caches[id] = NULL;
3302
3253 css_put(&memcg->css); 3303 css_put(&memcg->css);
3254out:
3255 kfree(s->memcg_params);
3256} 3304}
3257 3305
3258/* 3306/*
@@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
3311 * So if we aren't down to zero, we'll just schedule a worker and try 3359 * So if we aren't down to zero, we'll just schedule a worker and try
3312 * again 3360 * again
3313 */ 3361 */
3314 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3362 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3315 kmem_cache_shrink(cachep); 3363 kmem_cache_shrink(cachep);
3316 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3364 else
3317 return;
3318 } else
3319 kmem_cache_destroy(cachep); 3365 kmem_cache_destroy(cachep);
3320} 3366}
3321 3367
@@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3351 schedule_work(&cachep->memcg_params->destroy); 3397 schedule_work(&cachep->memcg_params->destroy);
3352} 3398}
3353 3399
3354/* 3400static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3355 * This lock protects updaters, not readers. We want readers to be as fast as 3401 struct kmem_cache *s)
3356 * they can, and they will either see NULL or a valid cache value. Our model
3357 * allow them to see NULL, in which case the root memcg will be selected.
3358 *
3359 * We need this lock because multiple allocations to the same cache from a non
3360 * will span more than one worker. Only one of them can create the cache.
3361 */
3362static DEFINE_MUTEX(memcg_cache_mutex);
3363
3364/*
3365 * Called with memcg_cache_mutex held
3366 */
3367static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3368 struct kmem_cache *s)
3369{ 3402{
3370 struct kmem_cache *new; 3403 struct kmem_cache *new = NULL;
3371 static char *tmp_name = NULL; 3404 static char *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3372 3406
3373 lockdep_assert_held(&memcg_cache_mutex); 3407 BUG_ON(!memcg_can_account_kmem(memcg));
3374 3408
3409 mutex_lock(&mutex);
3375 /* 3410 /*
3376 * kmem_cache_create_memcg duplicates the given name and 3411 * kmem_cache_create_memcg duplicates the given name and
3377 * cgroup_name for this name requires RCU context. 3412 * cgroup_name for this name requires RCU context.
@@ -3381,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3381 if (!tmp_name) { 3416 if (!tmp_name) {
3382 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3383 if (!tmp_name) 3418 if (!tmp_name)
3384 return NULL; 3419 goto out;
3385 } 3420 }
3386 3421
3387 rcu_read_lock(); 3422 rcu_read_lock();
@@ -3391,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3391 3426
3392 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3393 (s->flags & ~SLAB_PANIC), s->ctor, s); 3428 (s->flags & ~SLAB_PANIC), s->ctor, s);
3394
3395 if (new) 3429 if (new)
3396 new->allocflags |= __GFP_KMEMCG; 3430 new->allocflags |= __GFP_KMEMCG;
3397 3431 else
3398 return new; 3432 new = s;
3399}
3400
3401static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3402 struct kmem_cache *cachep)
3403{
3404 struct kmem_cache *new_cachep;
3405 int idx;
3406
3407 BUG_ON(!memcg_can_account_kmem(memcg));
3408
3409 idx = memcg_cache_id(memcg);
3410
3411 mutex_lock(&memcg_cache_mutex);
3412 new_cachep = cache_from_memcg_idx(cachep, idx);
3413 if (new_cachep) {
3414 css_put(&memcg->css);
3415 goto out;
3416 }
3417
3418 new_cachep = kmem_cache_dup(memcg, cachep);
3419 if (new_cachep == NULL) {
3420 new_cachep = cachep;
3421 css_put(&memcg->css);
3422 goto out;
3423 }
3424
3425 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3426
3427 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3428 /*
3429 * the readers won't lock, make sure everybody sees the updated value,
3430 * so they won't put stuff in the queue again for no reason
3431 */
3432 wmb();
3433out: 3433out:
3434 mutex_unlock(&memcg_cache_mutex); 3434 mutex_unlock(&mutex);
3435 return new_cachep; 3435 return new;
3436} 3436}
3437 3437
3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
@@ -3452,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3452 * 3452 *
3453 * Still, we don't want anyone else freeing memcg_caches under our 3453 * Still, we don't want anyone else freeing memcg_caches under our
3454 * noses, which can happen if a new memcg comes to life. As usual, 3454 * noses, which can happen if a new memcg comes to life. As usual,
3455 * we'll take the set_limit_mutex to protect ourselves against this. 3455 * we'll take the activate_kmem_mutex to protect ourselves against
3456 * this.
3456 */ 3457 */
3457 mutex_lock(&set_limit_mutex); 3458 mutex_lock(&activate_kmem_mutex);
3458 for_each_memcg_cache_index(i) { 3459 for_each_memcg_cache_index(i) {
3459 c = cache_from_memcg_idx(s, i); 3460 c = cache_from_memcg_idx(s, i);
3460 if (!c) 3461 if (!c)
@@ -3477,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3477 cancel_work_sync(&c->memcg_params->destroy); 3478 cancel_work_sync(&c->memcg_params->destroy);
3478 kmem_cache_destroy(c); 3479 kmem_cache_destroy(c);
3479 } 3480 }
3480 mutex_unlock(&set_limit_mutex); 3481 mutex_unlock(&activate_kmem_mutex);
3481} 3482}
3482 3483
3483struct create_work { 3484struct create_work {
@@ -3509,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3509 3510
3510 cw = container_of(w, struct create_work, work); 3511 cw = container_of(w, struct create_work, work);
3511 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3512 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3513 css_put(&cw->memcg->css);
3512 kfree(cw); 3514 kfree(cw);
3513} 3515}
3514 3516
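The css_put() added to the work function changes where the memcg reference pinned for asynchronous cache creation gets released: the worker now drops it itself once creation has been attempted. The enqueue side is not part of this hunk, so the pairing below is only a sketch with assumed names; the point is the get/put balance.

struct create_work_sketch {
        struct mem_cgroup *memcg;
        struct kmem_cache *cachep;
        struct work_struct work;
};

static void create_work_fn_sketch(struct work_struct *w)
{
        struct create_work_sketch *cw =
                container_of(w, struct create_work_sketch, work);

        /* ... create the per-memcg copy of cw->cachep ... */
        css_put(&cw->memcg->css);       /* balances the pin taken at enqueue */
        kfree(cw);
}

static void enqueue_sketch(struct mem_cgroup *memcg, struct kmem_cache *cachep)
{
        struct create_work_sketch *cw;

        if (!css_tryget(&memcg->css))   /* pin the memcg for the worker */
                return;
        cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
        if (!cw) {
                css_put(&memcg->css);
                return;
        }
        cw->memcg = memcg;
        cw->cachep = cachep;
        INIT_WORK(&cw->work, create_work_fn_sketch);
        schedule_work(&cw->work);
}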
@@ -3568,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3568 gfp_t gfp) 3570 gfp_t gfp)
3569{ 3571{
3570 struct mem_cgroup *memcg; 3572 struct mem_cgroup *memcg;
3571 int idx; 3573 struct kmem_cache *memcg_cachep;
3572 3574
3573 VM_BUG_ON(!cachep->memcg_params); 3575 VM_BUG_ON(!cachep->memcg_params);
3574 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3576 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3582,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3582 if (!memcg_can_account_kmem(memcg)) 3584 if (!memcg_can_account_kmem(memcg))
3583 goto out; 3585 goto out;
3584 3586
3585 idx = memcg_cache_id(memcg); 3587 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3586 3588 if (likely(memcg_cachep)) {
3587 /* 3589 cachep = memcg_cachep;
 3588 * barrier to make sure we're always seeing the up to date value. The

3589 * code updating memcg_caches will issue a write barrier to match this.
3590 */
3591 read_barrier_depends();
3592 if (likely(cache_from_memcg_idx(cachep, idx))) {
3593 cachep = cache_from_memcg_idx(cachep, idx);
3594 goto out; 3590 goto out;
3595 } 3591 }
3596 3592
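__memcg_kmem_get_cache() now reads the per-memcg cache pointer exactly once and drops the explicit read_barrier_depends(); presumably the ordering it relied on lives inside cache_from_memcg_idx() after this series, but that helper's body is not part of this hunk, so treat that as an assumption. The fast path reduces to roughly:

static struct kmem_cache *
kmem_get_cache_fastpath_sketch(struct kmem_cache *cachep,
                               struct mem_cgroup *memcg)
{
        struct kmem_cache *memcg_cachep;

        memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
        if (likely(memcg_cachep))
                return memcg_cachep;    /* per-memcg copy already exists */

        /* otherwise the caller falls back to the root cache and kicks
         * off asynchronous creation of the per-memcg copy */
        return cachep;
}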
@@ -3744,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3744 if (!memcg) 3740 if (!memcg)
3745 return; 3741 return;
3746 3742
3747 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3743 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3748 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3744 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3749} 3745}
3750#else 3746#else
@@ -3823,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page,
3823 bool anon = PageAnon(page); 3819 bool anon = PageAnon(page);
3824 3820
3825 VM_BUG_ON(from == to); 3821 VM_BUG_ON(from == to);
3826 VM_BUG_ON(PageLRU(page)); 3822 VM_BUG_ON_PAGE(PageLRU(page), page);
3827 /* 3823 /*
3828 * The page is isolated from LRU. So, collapse function 3824 * The page is isolated from LRU. So, collapse function
3829 * will not handle this page. But page splitting can happen. 3825 * will not handle this page. But page splitting can happen.
@@ -3916,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page,
3916 parent = root_mem_cgroup; 3912 parent = root_mem_cgroup;
3917 3913
3918 if (nr_pages > 1) { 3914 if (nr_pages > 1) {
3919 VM_BUG_ON(!PageTransHuge(page)); 3915 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3920 flags = compound_lock_irqsave(page); 3916 flags = compound_lock_irqsave(page);
3921 } 3917 }
3922 3918
@@ -3950,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3950 3946
3951 if (PageTransHuge(page)) { 3947 if (PageTransHuge(page)) {
3952 nr_pages <<= compound_order(page); 3948 nr_pages <<= compound_order(page);
3953 VM_BUG_ON(!PageTransHuge(page)); 3949 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3954 /* 3950 /*
3955 * Never OOM-kill a process for a huge page. The 3951 * Never OOM-kill a process for a huge page. The
3956 * fault handler will fall back to regular pages. 3952 * fault handler will fall back to regular pages.
@@ -3970,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page,
3970{ 3966{
3971 if (mem_cgroup_disabled()) 3967 if (mem_cgroup_disabled())
3972 return 0; 3968 return 0;
3973 VM_BUG_ON(page_mapped(page)); 3969 VM_BUG_ON_PAGE(page_mapped(page), page);
3974 VM_BUG_ON(page->mapping && !PageAnon(page)); 3970 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3975 VM_BUG_ON(!mm); 3971 VM_BUG_ON(!mm);
3976 return mem_cgroup_charge_common(page, mm, gfp_mask, 3972 return mem_cgroup_charge_common(page, mm, gfp_mask,
3977 MEM_CGROUP_CHARGE_TYPE_ANON); 3973 MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4175,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4175 4171
4176 if (PageTransHuge(page)) { 4172 if (PageTransHuge(page)) {
4177 nr_pages <<= compound_order(page); 4173 nr_pages <<= compound_order(page);
4178 VM_BUG_ON(!PageTransHuge(page)); 4174 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4179 } 4175 }
4180 /* 4176 /*
4181 * Check if our page_cgroup is valid 4177 * Check if our page_cgroup is valid
@@ -4267,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page)
4267 /* early check. */ 4263 /* early check. */
4268 if (page_mapped(page)) 4264 if (page_mapped(page))
4269 return; 4265 return;
4270 VM_BUG_ON(page->mapping && !PageAnon(page)); 4266 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4271 /* 4267 /*
4272 * If the page is in swap cache, uncharge should be deferred 4268 * If the page is in swap cache, uncharge should be deferred
4273 * to the swap path, which also properly accounts swap usage 4269 * to the swap path, which also properly accounts swap usage
@@ -4287,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page)
4287 4283
4288void mem_cgroup_uncharge_cache_page(struct page *page) 4284void mem_cgroup_uncharge_cache_page(struct page *page)
4289{ 4285{
4290 VM_BUG_ON(page_mapped(page)); 4286 VM_BUG_ON_PAGE(page_mapped(page), page);
4291 VM_BUG_ON(page->mapping); 4287 VM_BUG_ON_PAGE(page->mapping, page);
4292 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4288 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4293} 4289}
4294 4290
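Many hunks in this diff replace VM_BUG_ON(cond) with VM_BUG_ON_PAGE(cond, page). Roughly, the new macro dumps the offending page before BUG() fires, which is also why dump_page() grows a reason string elsewhere in this merge. The definition sketched below approximates include/linux/mmdebug.h from this series; the exact message string may differ.

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_PAGE(cond, page)                                      \
        do {                                                            \
                if (unlikely(cond)) {                                   \
                        dump_page(page, "VM_BUG_ON_PAGE(" #cond ")");   \
                        BUG();                                          \
                }                                                       \
        } while (0)
#else
#define VM_BUG_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#endif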
@@ -5112,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5112 return val << PAGE_SHIFT; 5108 return val << PAGE_SHIFT;
5113} 5109}
5114 5110
5115static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5111static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5116 struct cftype *cft, struct file *file, 5112 struct cftype *cft)
5117 char __user *buf, size_t nbytes, loff_t *ppos)
5118{ 5113{
5119 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5114 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5120 char str[64];
5121 u64 val; 5115 u64 val;
5122 int name, len; 5116 int name;
5123 enum res_type type; 5117 enum res_type type;
5124 5118
5125 type = MEMFILE_TYPE(cft->private); 5119 type = MEMFILE_TYPE(cft->private);
@@ -5145,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5145 BUG(); 5139 BUG();
5146 } 5140 }
5147 5141
5148 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5142 return val;
5149 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5150} 5143}
5151 5144
5152static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5153{
5154 int ret = -EINVAL;
5155#ifdef CONFIG_MEMCG_KMEM 5145#ifdef CONFIG_MEMCG_KMEM
5156 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5146/* should be called with activate_kmem_mutex held */
5147static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5148 unsigned long long limit)
5149{
5150 int err = 0;
5151 int memcg_id;
5152
5153 if (memcg_kmem_is_active(memcg))
5154 return 0;
5155
5156 /*
5157 * We are going to allocate memory for data shared by all memory
5158 * cgroups so let's stop accounting here.
5159 */
5160 memcg_stop_kmem_account();
5161
5157 /* 5162 /*
5158 * For simplicity, we won't allow this to be disabled. It also can't 5163 * For simplicity, we won't allow this to be disabled. It also can't
5159 * be changed if the cgroup has children already, or if tasks had 5164 * be changed if the cgroup has children already, or if tasks had
@@ -5167,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5167 * of course permitted. 5172 * of course permitted.
5168 */ 5173 */
5169 mutex_lock(&memcg_create_mutex); 5174 mutex_lock(&memcg_create_mutex);
5170 mutex_lock(&set_limit_mutex); 5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
5171 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 5176 err = -EBUSY;
5172 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 5177 mutex_unlock(&memcg_create_mutex);
5173 ret = -EBUSY; 5178 if (err)
5174 goto out; 5179 goto out;
5175 }
5176 ret = res_counter_set_limit(&memcg->kmem, val);
5177 VM_BUG_ON(ret);
5178 5180
5179 ret = memcg_update_cache_sizes(memcg); 5181 memcg_id = ida_simple_get(&kmem_limited_groups,
5180 if (ret) { 5182 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5181 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 5183 if (memcg_id < 0) {
5182 goto out; 5184 err = memcg_id;
5183 } 5185 goto out;
5184 static_key_slow_inc(&memcg_kmem_enabled_key); 5186 }
5185 /* 5187
5186 * setting the active bit after the inc will guarantee no one 5188 /*
5187 * starts accounting before all call sites are patched 5189 * Make sure we have enough space for this cgroup in each root cache's
5188 */ 5190 * memcg_params.
5189 memcg_kmem_set_active(memcg); 5191 */
5190 } else 5192 err = memcg_update_all_caches(memcg_id + 1);
5191 ret = res_counter_set_limit(&memcg->kmem, val); 5193 if (err)
5194 goto out_rmid;
5195
5196 memcg->kmemcg_id = memcg_id;
5197 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5198 mutex_init(&memcg->slab_caches_mutex);
5199
5200 /*
5201 * We couldn't have accounted to this cgroup, because it hasn't got the
5202 * active bit set yet, so this should succeed.
5203 */
5204 err = res_counter_set_limit(&memcg->kmem, limit);
5205 VM_BUG_ON(err);
5206
5207 static_key_slow_inc(&memcg_kmem_enabled_key);
5208 /*
5209 * Setting the active bit after enabling static branching will
5210 * guarantee no one starts accounting before all call sites are
5211 * patched.
5212 */
5213 memcg_kmem_set_active(memcg);
5192out: 5214out:
5193 mutex_unlock(&set_limit_mutex); 5215 memcg_resume_kmem_account();
5194 mutex_unlock(&memcg_create_mutex); 5216 return err;
5195#endif 5217
5218out_rmid:
5219 ida_simple_remove(&kmem_limited_groups, memcg_id);
5220 goto out;
5221}
5222
5223static int memcg_activate_kmem(struct mem_cgroup *memcg,
5224 unsigned long long limit)
5225{
5226 int ret;
5227
5228 mutex_lock(&activate_kmem_mutex);
5229 ret = __memcg_activate_kmem(memcg, limit);
5230 mutex_unlock(&activate_kmem_mutex);
5231 return ret;
5232}
5233
5234static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5235 unsigned long long val)
5236{
5237 int ret;
5238
5239 if (!memcg_kmem_is_active(memcg))
5240 ret = memcg_activate_kmem(memcg, val);
5241 else
5242 ret = res_counter_set_limit(&memcg->kmem, val);
5196 return ret; 5243 return ret;
5197} 5244}
5198 5245
5199#ifdef CONFIG_MEMCG_KMEM
5200static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5246static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5201{ 5247{
5202 int ret = 0; 5248 int ret = 0;
5203 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5249 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5204 if (!parent)
5205 goto out;
5206 5250
5207 memcg->kmem_account_flags = parent->kmem_account_flags; 5251 if (!parent)
5208 /* 5252 return 0;
5209 * When that happen, we need to disable the static branch only on those
5210 * memcgs that enabled it. To achieve this, we would be forced to
5211 * complicate the code by keeping track of which memcgs were the ones
5212 * that actually enabled limits, and which ones got it from its
5213 * parents.
5214 *
5215 * It is a lot simpler just to do static_key_slow_inc() on every child
5216 * that is accounted.
5217 */
5218 if (!memcg_kmem_is_active(memcg))
5219 goto out;
5220 5253
5254 mutex_lock(&activate_kmem_mutex);
5221 /* 5255 /*
5222 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5256 * If the parent cgroup is not kmem-active now, it cannot be activated
5223 * memcg is active already. If the later initialization fails then the 5257 * after this point, because it has at least one child already.
5224 * cgroup core triggers the cleanup so we do not have to do it here.
5225 */ 5258 */
5226 static_key_slow_inc(&memcg_kmem_enabled_key); 5259 if (memcg_kmem_is_active(parent))
5227 5260 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5228 mutex_lock(&set_limit_mutex); 5261 mutex_unlock(&activate_kmem_mutex);
5229 memcg_stop_kmem_account();
5230 ret = memcg_update_cache_sizes(memcg);
5231 memcg_resume_kmem_account();
5232 mutex_unlock(&set_limit_mutex);
5233out:
5234 return ret; 5262 return ret;
5235} 5263}
5264#else
5265static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5266 unsigned long long val)
5267{
5268 return -EINVAL;
5269}
5236#endif /* CONFIG_MEMCG_KMEM */ 5270#endif /* CONFIG_MEMCG_KMEM */
5237 5271
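The kmem activation rework above is mostly about ordering. Condensed, with the no-children/no-tasks check and some error handling trimmed, the sequence the new __memcg_activate_kmem() enforces looks like the sketch below; the comments spell out why each step precedes the next.

static int activate_kmem_ordering_sketch(struct mem_cgroup *memcg,
                                         unsigned long long limit)
{
        int id, err;

        if (memcg_kmem_is_active(memcg))
                return 0;

        memcg_stop_kmem_account();      /* don't account our own allocations */

        id = ida_simple_get(&kmem_limited_groups, 0,
                            MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
        if (id < 0) {
                err = id;
                goto out;
        }

        /* grow every root cache's memcg_params before the id is published */
        err = memcg_update_all_caches(id + 1);
        if (err) {
                ida_simple_remove(&kmem_limited_groups, id);
                goto out;
        }

        memcg->kmemcg_id = id;

        /* nothing can have been charged yet, so this cannot fail */
        err = res_counter_set_limit(&memcg->kmem, limit);

        /* enable the static branch before the active bit so no task can
         * start charging before every call site has been patched */
        static_key_slow_inc(&memcg_kmem_enabled_key);
        memcg_kmem_set_active(memcg);
out:
        memcg_resume_kmem_account();
        return err;
}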
5238/* 5272/*
@@ -5266,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5266 else if (type == _MEMSWAP) 5300 else if (type == _MEMSWAP)
5267 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5301 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5268 else if (type == _KMEM) 5302 else if (type == _KMEM)
5269 ret = memcg_update_kmem_limit(css, val); 5303 ret = memcg_update_kmem_limit(memcg, val);
5270 else 5304 else
5271 return -EINVAL; 5305 return -EINVAL;
5272 break; 5306 break;
@@ -5383,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5383#endif 5417#endif
5384 5418
5385#ifdef CONFIG_NUMA 5419#ifdef CONFIG_NUMA
5386static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5420static int memcg_numa_stat_show(struct seq_file *m, void *v)
5387 struct cftype *cft, struct seq_file *m)
5388{ 5421{
5389 struct numa_stat { 5422 struct numa_stat {
5390 const char *name; 5423 const char *name;
@@ -5400,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5400 const struct numa_stat *stat; 5433 const struct numa_stat *stat;
5401 int nid; 5434 int nid;
5402 unsigned long nr; 5435 unsigned long nr;
5403 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5436 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5404 5437
5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5438 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5439 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5439,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5472 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5440} 5473}
5441 5474
5442static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5475static int memcg_stat_show(struct seq_file *m, void *v)
5443 struct seq_file *m)
5444{ 5476{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5477 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5446 struct mem_cgroup *mi; 5478 struct mem_cgroup *mi;
5447 unsigned int i; 5479 unsigned int i;
5448 5480
@@ -5651,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5651 mem_cgroup_oom_notify_cb(iter); 5683 mem_cgroup_oom_notify_cb(iter);
5652} 5684}
5653 5685
5654static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5686static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5655 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5687 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5656{ 5688{
5657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5658 struct mem_cgroup_thresholds *thresholds; 5689 struct mem_cgroup_thresholds *thresholds;
5659 struct mem_cgroup_threshold_ary *new; 5690 struct mem_cgroup_threshold_ary *new;
5660 enum res_type type = MEMFILE_TYPE(cft->private);
5661 u64 threshold, usage; 5691 u64 threshold, usage;
5662 int i, size, ret; 5692 int i, size, ret;
5663 5693
@@ -5734,13 +5764,23 @@ unlock:
5734 return ret; 5764 return ret;
5735} 5765}
5736 5766
5737static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5767static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5738 struct cftype *cft, struct eventfd_ctx *eventfd) 5768 struct eventfd_ctx *eventfd, const char *args)
5769{
5770 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5771}
5772
5773static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5777}
5778
5779static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, enum res_type type)
5739{ 5781{
5740 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5741 struct mem_cgroup_thresholds *thresholds; 5782 struct mem_cgroup_thresholds *thresholds;
5742 struct mem_cgroup_threshold_ary *new; 5783 struct mem_cgroup_threshold_ary *new;
5743 enum res_type type = MEMFILE_TYPE(cft->private);
5744 u64 usage; 5784 u64 usage;
5745 int i, j, size; 5785 int i, j, size;
5746 5786
@@ -5813,14 +5853,23 @@ unlock:
5813 mutex_unlock(&memcg->thresholds_lock); 5853 mutex_unlock(&memcg->thresholds_lock);
5814} 5854}
5815 5855
5816static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5856static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5817 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5857 struct eventfd_ctx *eventfd)
5858{
5859 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5860}
5861
5862static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5866}
5867
5868static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd, const char *args)
5818{ 5870{
5819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5820 struct mem_cgroup_eventfd_list *event; 5871 struct mem_cgroup_eventfd_list *event;
5821 enum res_type type = MEMFILE_TYPE(cft->private);
5822 5872
5823 BUG_ON(type != _OOM_TYPE);
5824 event = kmalloc(sizeof(*event), GFP_KERNEL); 5873 event = kmalloc(sizeof(*event), GFP_KERNEL);
5825 if (!event) 5874 if (!event)
5826 return -ENOMEM; 5875 return -ENOMEM;
@@ -5838,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5838 return 0; 5887 return 0;
5839} 5888}
5840 5889
5841static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5890static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5842 struct cftype *cft, struct eventfd_ctx *eventfd) 5891 struct eventfd_ctx *eventfd)
5843{ 5892{
5844 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5845 struct mem_cgroup_eventfd_list *ev, *tmp; 5893 struct mem_cgroup_eventfd_list *ev, *tmp;
5846 enum res_type type = MEMFILE_TYPE(cft->private);
5847
5848 BUG_ON(type != _OOM_TYPE);
5849 5894
5850 spin_lock(&memcg_oom_lock); 5895 spin_lock(&memcg_oom_lock);
5851 5896
@@ -5859,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859 spin_unlock(&memcg_oom_lock); 5904 spin_unlock(&memcg_oom_lock);
5860} 5905}
5861 5906
5862static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5907static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5863 struct cftype *cft, struct cgroup_map_cb *cb)
5864{ 5908{
5865 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5909 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5866 5910
5867 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5911 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5868 5912 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5869 if (atomic_read(&memcg->under_oom))
5870 cb->fill(cb, "under_oom", 1);
5871 else
5872 cb->fill(cb, "under_oom", 0);
5873 return 0; 5913 return 0;
5874} 5914}
5875 5915
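The cgroup read_map/cgroup_map_cb interface is dropped: oom_control becomes a plain seq_file show routine that resolves its memcg from the seq_file via seq_css(), and the cftype entry (further down in this diff) wires it up through .seq_show instead of .read_map. A compact restatement of that pattern, mirroring the new code rather than adding to it:

static int oom_control_show_sketch(struct seq_file *sf, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));

        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
        seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
        return 0;
}

static struct cftype oom_control_cft_sketch = {
        .name           = "oom_control",
        .seq_show       = oom_control_show_sketch,
        .write_u64      = mem_cgroup_oom_control_write,
        .private        = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
};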
@@ -5962,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5962} 6002}
5963#endif 6003#endif
5964 6004
6005/*
6006 * DO NOT USE IN NEW FILES.
6007 *
6008 * "cgroup.event_control" implementation.
6009 *
6010 * This is way over-engineered. It tries to support fully configurable
6011 * events for each user. Such level of flexibility is completely
6012 * unnecessary especially in the light of the planned unified hierarchy.
6013 *
6014 * Please deprecate this and replace with something simpler if at all
6015 * possible.
6016 */
6017
6018/*
6019 * Unregister event and free resources.
6020 *
6021 * Gets called from workqueue.
6022 */
6023static void memcg_event_remove(struct work_struct *work)
6024{
6025 struct mem_cgroup_event *event =
6026 container_of(work, struct mem_cgroup_event, remove);
6027 struct mem_cgroup *memcg = event->memcg;
6028
6029 remove_wait_queue(event->wqh, &event->wait);
6030
6031 event->unregister_event(memcg, event->eventfd);
6032
6033 /* Notify userspace the event is going away. */
6034 eventfd_signal(event->eventfd, 1);
6035
6036 eventfd_ctx_put(event->eventfd);
6037 kfree(event);
6038 css_put(&memcg->css);
6039}
6040
6041/*
6042 * Gets called on POLLHUP on eventfd when user closes it.
6043 *
6044 * Called with wqh->lock held and interrupts disabled.
6045 */
6046static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6047 int sync, void *key)
6048{
6049 struct mem_cgroup_event *event =
6050 container_of(wait, struct mem_cgroup_event, wait);
6051 struct mem_cgroup *memcg = event->memcg;
6052 unsigned long flags = (unsigned long)key;
6053
6054 if (flags & POLLHUP) {
6055 /*
6056 * If the event has been detached at cgroup removal, we
6057 * can simply return knowing the other side will cleanup
6058 * for us.
6059 *
6060 * We can't race against event freeing since the other
6061 * side will require wqh->lock via remove_wait_queue(),
6062 * which we hold.
6063 */
6064 spin_lock(&memcg->event_list_lock);
6065 if (!list_empty(&event->list)) {
6066 list_del_init(&event->list);
6067 /*
6068 * We are in atomic context, but cgroup_event_remove()
6069 * may sleep, so we have to call it in workqueue.
6070 */
6071 schedule_work(&event->remove);
6072 }
6073 spin_unlock(&memcg->event_list_lock);
6074 }
6075
6076 return 0;
6077}
6078
6079static void memcg_event_ptable_queue_proc(struct file *file,
6080 wait_queue_head_t *wqh, poll_table *pt)
6081{
6082 struct mem_cgroup_event *event =
6083 container_of(pt, struct mem_cgroup_event, pt);
6084
6085 event->wqh = wqh;
6086 add_wait_queue(wqh, &event->wait);
6087}
6088
6089/*
6090 * DO NOT USE IN NEW FILES.
6091 *
6092 * Parse input and register new cgroup event handler.
6093 *
6094 * Input must be in format '<event_fd> <control_fd> <args>'.
6095 * Interpretation of args is defined by control file implementation.
6096 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer)
6099{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event;
6102 struct cgroup_subsys_state *cfile_css;
6103 unsigned int efd, cfd;
6104 struct fd efile;
6105 struct fd cfile;
6106 const char *name;
6107 char *endp;
6108 int ret;
6109
6110 efd = simple_strtoul(buffer, &endp, 10);
6111 if (*endp != ' ')
6112 return -EINVAL;
6113 buffer = endp + 1;
6114
6115 cfd = simple_strtoul(buffer, &endp, 10);
6116 if ((*endp != ' ') && (*endp != '\0'))
6117 return -EINVAL;
6118 buffer = endp + 1;
6119
6120 event = kzalloc(sizeof(*event), GFP_KERNEL);
6121 if (!event)
6122 return -ENOMEM;
6123
6124 event->memcg = memcg;
6125 INIT_LIST_HEAD(&event->list);
6126 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6127 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6128 INIT_WORK(&event->remove, memcg_event_remove);
6129
6130 efile = fdget(efd);
6131 if (!efile.file) {
6132 ret = -EBADF;
6133 goto out_kfree;
6134 }
6135
6136 event->eventfd = eventfd_ctx_fileget(efile.file);
6137 if (IS_ERR(event->eventfd)) {
6138 ret = PTR_ERR(event->eventfd);
6139 goto out_put_efile;
6140 }
6141
6142 cfile = fdget(cfd);
6143 if (!cfile.file) {
6144 ret = -EBADF;
6145 goto out_put_eventfd;
6146 }
6147
 6148 /* the process needs read permission on the control file */
6149 /* AV: shouldn't we check that it's been opened for read instead? */
6150 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6151 if (ret < 0)
6152 goto out_put_cfile;
6153
6154 /*
6155 * Determine the event callbacks and set them in @event. This used
6156 * to be done via struct cftype but cgroup core no longer knows
6157 * about these events. The following is crude but the whole thing
6158 * is for compatibility anyway.
6159 *
6160 * DO NOT ADD NEW FILES.
6161 */
6162 name = cfile.file->f_dentry->d_name.name;
6163
6164 if (!strcmp(name, "memory.usage_in_bytes")) {
6165 event->register_event = mem_cgroup_usage_register_event;
6166 event->unregister_event = mem_cgroup_usage_unregister_event;
6167 } else if (!strcmp(name, "memory.oom_control")) {
6168 event->register_event = mem_cgroup_oom_register_event;
6169 event->unregister_event = mem_cgroup_oom_unregister_event;
6170 } else if (!strcmp(name, "memory.pressure_level")) {
6171 event->register_event = vmpressure_register_event;
6172 event->unregister_event = vmpressure_unregister_event;
6173 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6174 event->register_event = memsw_cgroup_usage_register_event;
6175 event->unregister_event = memsw_cgroup_usage_unregister_event;
6176 } else {
6177 ret = -EINVAL;
6178 goto out_put_cfile;
6179 }
6180
6181 /*
6182 * Verify @cfile should belong to @css. Also, remaining events are
6183 * automatically removed on cgroup destruction but the removal is
6184 * asynchronous, so take an extra ref on @css.
6185 */
6186 rcu_read_lock();
6187
6188 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6190 &mem_cgroup_subsys);
6191 if (cfile_css == css && css_tryget(css))
6192 ret = 0;
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile;
6197
6198 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret)
6200 goto out_put_css;
6201
6202 efile.file->f_op->poll(efile.file, &event->pt);
6203
6204 spin_lock(&memcg->event_list_lock);
6205 list_add(&event->list, &memcg->event_list);
6206 spin_unlock(&memcg->event_list_lock);
6207
6208 fdput(cfile);
6209 fdput(efile);
6210
6211 return 0;
6212
6213out_put_css:
6214 css_put(css);
6215out_put_cfile:
6216 fdput(cfile);
6217out_put_eventfd:
6218 eventfd_ctx_put(event->eventfd);
6219out_put_efile:
6220 fdput(efile);
6221out_kfree:
6222 kfree(event);
6223
6224 return ret;
6225}
6226
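memcg_write_event_control() parses "<event_fd> <control_fd> <args>" written to cgroup.event_control. A hypothetical userspace snippet that arms a usage threshold the way this parser expects is shown below; the cgroup path and the 1G threshold are made-up values, and error handling is minimal.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        int efd = eventfd(0, 0);
        int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
                       O_RDONLY);
        int ecfd = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
                        O_WRONLY);
        char buf[64];
        uint64_t cnt;

        if (efd < 0 || cfd < 0 || ecfd < 0)
                return 1;

        /* "<event_fd> <control_fd> <args>": arm a 1G usage threshold */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 1024ULL << 20);
        if (write(ecfd, buf, strlen(buf)) < 0)
                return 1;

        read(efd, &cnt, sizeof(cnt));   /* blocks until the threshold fires */
        printf("threshold crossed %llu time(s)\n", (unsigned long long)cnt);
        return 0;
}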
5965static struct cftype mem_cgroup_files[] = { 6227static struct cftype mem_cgroup_files[] = {
5966 { 6228 {
5967 .name = "usage_in_bytes", 6229 .name = "usage_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6230 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5969 .read = mem_cgroup_read, 6231 .read_u64 = mem_cgroup_read_u64,
5970 .register_event = mem_cgroup_usage_register_event,
5971 .unregister_event = mem_cgroup_usage_unregister_event,
5972 }, 6232 },
5973 { 6233 {
5974 .name = "max_usage_in_bytes", 6234 .name = "max_usage_in_bytes",
5975 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6235 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5976 .trigger = mem_cgroup_reset, 6236 .trigger = mem_cgroup_reset,
5977 .read = mem_cgroup_read, 6237 .read_u64 = mem_cgroup_read_u64,
5978 }, 6238 },
5979 { 6239 {
5980 .name = "limit_in_bytes", 6240 .name = "limit_in_bytes",
5981 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5982 .write_string = mem_cgroup_write, 6242 .write_string = mem_cgroup_write,
5983 .read = mem_cgroup_read, 6243 .read_u64 = mem_cgroup_read_u64,
5984 }, 6244 },
5985 { 6245 {
5986 .name = "soft_limit_in_bytes", 6246 .name = "soft_limit_in_bytes",
5987 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6247 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5988 .write_string = mem_cgroup_write, 6248 .write_string = mem_cgroup_write,
5989 .read = mem_cgroup_read, 6249 .read_u64 = mem_cgroup_read_u64,
5990 }, 6250 },
5991 { 6251 {
5992 .name = "failcnt", 6252 .name = "failcnt",
5993 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6253 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5994 .trigger = mem_cgroup_reset, 6254 .trigger = mem_cgroup_reset,
5995 .read = mem_cgroup_read, 6255 .read_u64 = mem_cgroup_read_u64,
5996 }, 6256 },
5997 { 6257 {
5998 .name = "stat", 6258 .name = "stat",
5999 .read_seq_string = memcg_stat_show, 6259 .seq_show = memcg_stat_show,
6000 }, 6260 },
6001 { 6261 {
6002 .name = "force_empty", 6262 .name = "force_empty",
@@ -6009,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = {
6009 .read_u64 = mem_cgroup_hierarchy_read, 6269 .read_u64 = mem_cgroup_hierarchy_read,
6010 }, 6270 },
6011 { 6271 {
6272 .name = "cgroup.event_control", /* XXX: for compat */
6273 .write_string = memcg_write_event_control,
6274 .flags = CFTYPE_NO_PREFIX,
6275 .mode = S_IWUGO,
6276 },
6277 {
6012 .name = "swappiness", 6278 .name = "swappiness",
6013 .read_u64 = mem_cgroup_swappiness_read, 6279 .read_u64 = mem_cgroup_swappiness_read,
6014 .write_u64 = mem_cgroup_swappiness_write, 6280 .write_u64 = mem_cgroup_swappiness_write,
@@ -6020,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = {
6020 }, 6286 },
6021 { 6287 {
6022 .name = "oom_control", 6288 .name = "oom_control",
6023 .read_map = mem_cgroup_oom_control_read, 6289 .seq_show = mem_cgroup_oom_control_read,
6024 .write_u64 = mem_cgroup_oom_control_write, 6290 .write_u64 = mem_cgroup_oom_control_write,
6025 .register_event = mem_cgroup_oom_register_event,
6026 .unregister_event = mem_cgroup_oom_unregister_event,
6027 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6291 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6028 }, 6292 },
6029 { 6293 {
6030 .name = "pressure_level", 6294 .name = "pressure_level",
6031 .register_event = vmpressure_register_event,
6032 .unregister_event = vmpressure_unregister_event,
6033 }, 6295 },
6034#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
6035 { 6297 {
6036 .name = "numa_stat", 6298 .name = "numa_stat",
6037 .read_seq_string = memcg_numa_stat_show, 6299 .seq_show = memcg_numa_stat_show,
6038 }, 6300 },
6039#endif 6301#endif
6040#ifdef CONFIG_MEMCG_KMEM 6302#ifdef CONFIG_MEMCG_KMEM
@@ -6042,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = {
6042 .name = "kmem.limit_in_bytes", 6304 .name = "kmem.limit_in_bytes",
6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6305 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6044 .write_string = mem_cgroup_write, 6306 .write_string = mem_cgroup_write,
6045 .read = mem_cgroup_read, 6307 .read_u64 = mem_cgroup_read_u64,
6046 }, 6308 },
6047 { 6309 {
6048 .name = "kmem.usage_in_bytes", 6310 .name = "kmem.usage_in_bytes",
6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6311 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6050 .read = mem_cgroup_read, 6312 .read_u64 = mem_cgroup_read_u64,
6051 }, 6313 },
6052 { 6314 {
6053 .name = "kmem.failcnt", 6315 .name = "kmem.failcnt",
6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6316 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6055 .trigger = mem_cgroup_reset, 6317 .trigger = mem_cgroup_reset,
6056 .read = mem_cgroup_read, 6318 .read_u64 = mem_cgroup_read_u64,
6057 }, 6319 },
6058 { 6320 {
6059 .name = "kmem.max_usage_in_bytes", 6321 .name = "kmem.max_usage_in_bytes",
6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6322 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6061 .trigger = mem_cgroup_reset, 6323 .trigger = mem_cgroup_reset,
6062 .read = mem_cgroup_read, 6324 .read_u64 = mem_cgroup_read_u64,
6063 }, 6325 },
6064#ifdef CONFIG_SLABINFO 6326#ifdef CONFIG_SLABINFO
6065 { 6327 {
6066 .name = "kmem.slabinfo", 6328 .name = "kmem.slabinfo",
6067 .read_seq_string = mem_cgroup_slabinfo_read, 6329 .seq_show = mem_cgroup_slabinfo_read,
6068 }, 6330 },
6069#endif 6331#endif
6070#endif 6332#endif
@@ -6076,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = {
6076 { 6338 {
6077 .name = "memsw.usage_in_bytes", 6339 .name = "memsw.usage_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6340 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6079 .read = mem_cgroup_read, 6341 .read_u64 = mem_cgroup_read_u64,
6080 .register_event = mem_cgroup_usage_register_event,
6081 .unregister_event = mem_cgroup_usage_unregister_event,
6082 }, 6342 },
6083 { 6343 {
6084 .name = "memsw.max_usage_in_bytes", 6344 .name = "memsw.max_usage_in_bytes",
6085 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6345 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6086 .trigger = mem_cgroup_reset, 6346 .trigger = mem_cgroup_reset,
6087 .read = mem_cgroup_read, 6347 .read_u64 = mem_cgroup_read_u64,
6088 }, 6348 },
6089 { 6349 {
6090 .name = "memsw.limit_in_bytes", 6350 .name = "memsw.limit_in_bytes",
6091 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6092 .write_string = mem_cgroup_write, 6352 .write_string = mem_cgroup_write,
6093 .read = mem_cgroup_read, 6353 .read_u64 = mem_cgroup_read_u64,
6094 }, 6354 },
6095 { 6355 {
6096 .name = "memsw.failcnt", 6356 .name = "memsw.failcnt",
6097 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6098 .trigger = mem_cgroup_reset, 6358 .trigger = mem_cgroup_reset,
6099 .read = mem_cgroup_read, 6359 .read_u64 = mem_cgroup_read_u64,
6100 }, 6360 },
6101 { }, /* terminate */ 6361 { }, /* terminate */
6102}; 6362};
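Every ".read = mem_cgroup_read" entry above collapses into ".read_u64 = mem_cgroup_read_u64" because the cgroup core now does the formatting itself. A sketch of what such a handler reduces to, mirroring mem_cgroup_read_u64() earlier in this diff:

static u64 example_read_u64(struct cgroup_subsys_state *css,
                            struct cftype *cft)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        /* RES_USAGE / RES_LIMIT / ... is selected via cft->private */
        return res_counter_read_u64(&memcg->res, MEMFILE_ATTR(cft->private));
}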
@@ -6139,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6139static struct mem_cgroup *mem_cgroup_alloc(void) 6399static struct mem_cgroup *mem_cgroup_alloc(void)
6140{ 6400{
6141 struct mem_cgroup *memcg; 6401 struct mem_cgroup *memcg;
6142 size_t size = memcg_size(); 6402 size_t size;
6143 6403
6144 /* Can be very big if nr_node_ids is very big */ 6404 size = sizeof(struct mem_cgroup);
6145 if (size < PAGE_SIZE) 6405 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6146 memcg = kzalloc(size, GFP_KERNEL);
6147 else
6148 memcg = vzalloc(size);
6149 6406
6407 memcg = kzalloc(size, GFP_KERNEL);
6150 if (!memcg) 6408 if (!memcg)
6151 return NULL; 6409 return NULL;
6152 6410
@@ -6157,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
6157 return memcg; 6415 return memcg;
6158 6416
6159out_free: 6417out_free:
6160 if (size < PAGE_SIZE) 6418 kfree(memcg);
6161 kfree(memcg);
6162 else
6163 vfree(memcg);
6164 return NULL; 6419 return NULL;
6165} 6420}
6166 6421
@@ -6178,7 +6433,6 @@ out_free:
6178static void __mem_cgroup_free(struct mem_cgroup *memcg) 6433static void __mem_cgroup_free(struct mem_cgroup *memcg)
6179{ 6434{
6180 int node; 6435 int node;
6181 size_t size = memcg_size();
6182 6436
6183 mem_cgroup_remove_from_trees(memcg); 6437 mem_cgroup_remove_from_trees(memcg);
6184 6438
@@ -6199,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
6199 * the cgroup_lock. 6453 * the cgroup_lock.
6200 */ 6454 */
6201 disarm_static_keys(memcg); 6455 disarm_static_keys(memcg);
6202 if (size < PAGE_SIZE) 6456 kfree(memcg);
6203 kfree(memcg);
6204 else
6205 vfree(memcg);
6206} 6457}
6207 6458
6208/* 6459/*
@@ -6268,6 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6268 mutex_init(&memcg->thresholds_lock); 6519 mutex_init(&memcg->thresholds_lock);
6269 spin_lock_init(&memcg->move_lock); 6520 spin_lock_init(&memcg->move_lock);
6270 vmpressure_init(&memcg->vmpressure); 6521 vmpressure_init(&memcg->vmpressure);
6522 INIT_LIST_HEAD(&memcg->event_list);
6523 spin_lock_init(&memcg->event_list_lock);
6271 6524
6272 return &memcg->css; 6525 return &memcg->css;
6273 6526
@@ -6281,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6281{ 6534{
6282 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6535 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6283 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6536 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6284 int error = 0;
6285 6537
6286 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6538 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6287 return -ENOSPC; 6539 return -ENOSPC;
@@ -6316,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6316 if (parent != root_mem_cgroup) 6568 if (parent != root_mem_cgroup)
6317 mem_cgroup_subsys.broken_hierarchy = true; 6569 mem_cgroup_subsys.broken_hierarchy = true;
6318 } 6570 }
6319
6320 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6321 mutex_unlock(&memcg_create_mutex); 6571 mutex_unlock(&memcg_create_mutex);
6322 return error; 6572
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys);
6323} 6574}
6324 6575
6325/* 6576/*
@@ -6343,6 +6594,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6343static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6594static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6344{ 6595{
6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597 struct mem_cgroup_event *event, *tmp;
6598
6599 /*
6600 * Unregister events and notify userspace.
6601 * Notify userspace about cgroup removing only after rmdir of cgroup
6602 * directory to avoid race between userspace and kernelspace.
6603 */
6604 spin_lock(&memcg->event_list_lock);
6605 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6606 list_del_init(&event->list);
6607 schedule_work(&event->remove);
6608 }
6609 spin_unlock(&memcg->event_list_lock);
6346 6610
6347 kmem_cgroup_css_offline(memcg); 6611 kmem_cgroup_css_offline(memcg);
6348 6612
@@ -6615,7 +6879,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6615 enum mc_target_type ret = MC_TARGET_NONE; 6879 enum mc_target_type ret = MC_TARGET_NONE;
6616 6880
6617 page = pmd_page(pmd); 6881 page = pmd_page(pmd);
6618 VM_BUG_ON(!page || !PageHead(page)); 6882 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6619 if (!move_anon()) 6883 if (!move_anon())
6620 return ret; 6884 return ret;
6621 pc = lookup_page_cgroup(page); 6885 pc = lookup_page_cgroup(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fabe55046c1d..4f08a2d61487 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
611} 611}
612 612
613/* 613/*
614 * Dirty cache page page 614 * Dirty pagecache page
615 * Issues: when the error hit a hole page the error is not properly 615 * Issues: when the error hit a hole page the error is not properly
616 * propagated. 616 * propagated.
617 */ 617 */
@@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p,
856 * the pages and send SIGBUS to the processes if the data was dirty. 856 * the pages and send SIGBUS to the processes if the data was dirty.
857 */ 857 */
858static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 858static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
859 int trapno, int flags) 859 int trapno, int flags, struct page **hpagep)
860{ 860{
861 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 861 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
862 struct address_space *mapping; 862 struct address_space *mapping;
863 LIST_HEAD(tokill); 863 LIST_HEAD(tokill);
864 int ret; 864 int ret;
865 int kill = 1, forcekill; 865 int kill = 1, forcekill;
866 struct page *hpage = compound_head(p); 866 struct page *hpage = *hpagep;
867 struct page *ppage; 867 struct page *ppage;
868 868
869 if (PageReserved(p) || PageSlab(p)) 869 if (PageReserved(p) || PageSlab(p))
@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
942 * We pinned the head page for hwpoison handling, 942 * We pinned the head page for hwpoison handling,
943 * now we split the thp and we are interested in 943 * now we split the thp and we are interested in
944 * the hwpoisoned raw page, so move the refcount 944 * the hwpoisoned raw page, so move the refcount
945 * to it. 945 * to it. Similarly, page lock is shifted.
946 */ 946 */
947 if (hpage != p) { 947 if (hpage != p) {
948 put_page(hpage); 948 put_page(hpage);
949 get_page(p); 949 get_page(p);
950 lock_page(p);
951 unlock_page(hpage);
952 *hpagep = p;
950 } 953 }
951 /* THP is split, so ppage should be the real poisoned page. */ 954 /* THP is split, so ppage should be the real poisoned page. */
952 ppage = p; 955 ppage = p;
@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
964 if (kill) 967 if (kill)
965 collect_procs(ppage, &tokill); 968 collect_procs(ppage, &tokill);
966 969
967 if (hpage != ppage)
968 lock_page(ppage);
969
970 ret = try_to_unmap(ppage, ttu); 970 ret = try_to_unmap(ppage, ttu);
971 if (ret != SWAP_SUCCESS) 971 if (ret != SWAP_SUCCESS)
972 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 972 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
973 pfn, page_mapcount(ppage)); 973 pfn, page_mapcount(ppage));
974 974
975 if (hpage != ppage)
976 unlock_page(ppage);
977
978 /* 975 /*
979 * Now that the dirty bit has been propagated to the 976 * Now that the dirty bit has been propagated to the
980 * struct page and all unmaps done we can decide if 977 * struct page and all unmaps done we can decide if
@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1193 /* 1190 /*
1194 * Now take care of user space mappings. 1191 * Now take care of user space mappings.
1195 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1192 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1193 *
1194 * When the raw error page is thp tail page, hpage points to the raw
1195 * page after thp split.
1196 */ 1196 */
1197 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { 1197 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1198 != SWAP_SUCCESS) {
1198 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1199 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1199 res = -EBUSY; 1200 res = -EBUSY;
1200 goto out; 1201 goto out;
@@ -1585,7 +1586,13 @@ static int __soft_offline_page(struct page *page, int flags)
1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1586 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1586 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1587 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1587 if (ret) { 1588 if (ret) {
1588 putback_lru_pages(&pagelist); 1589 if (!list_empty(&pagelist)) {
1590 list_del(&page->lru);
1591 dec_zone_page_state(page, NR_ISOLATED_ANON +
1592 page_is_file_cache(page));
1593 putback_lru_page(page);
1594 }
1595
1589 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1596 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1590 pfn, ret, page->flags); 1597 pfn, ret, page->flags);
1591 if (ret > 0) 1598 if (ret > 0)
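The subtle part of the memory-failure changes is the handoff added in hwpoison_user_mappings(): after a THP split, both the extra refcount and the page lock migrate from the head page to the raw poisoned page, and *hpagep is updated so that memory_failure() later unlocks and releases the right page. A comment-annotated restatement of that fragment (a sketch, not new behaviour):

static void thp_split_handoff_sketch(struct page *p, struct page *hpage,
                                     struct page **hpagep)
{
        if (hpage != p) {
                put_page(hpage);        /* drop the pin on the old head page */
                get_page(p);            /* pin the raw poisoned page instead */
                lock_page(p);           /* the page lock shifts the same way */
                unlock_page(hpage);
                *hpagep = p;            /* memory_failure() now operates on p */
        }
}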
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..be6a0c0d4ae0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h> 60#include <linux/migrate.h>
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h>
62 63
63#include <asm/io.h> 64#include <asm/io.h>
64#include <asm/pgalloc.h> 65#include <asm/pgalloc.h>
@@ -288,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
288 return 0; 289 return 0;
289 batch = tlb->active; 290 batch = tlb->active;
290 } 291 }
291 VM_BUG_ON(batch->nr > batch->max); 292 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
292 293
293 return batch->max - batch->nr; 294 return batch->max - batch->nr;
294} 295}
@@ -670,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
670 current->comm, 671 current->comm,
671 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 672 (long long)pte_val(pte), (long long)pmd_val(*pmd));
672 if (page) 673 if (page)
673 dump_page(page); 674 dump_page(page, "bad pte");
674 printk(KERN_ALERT 675 printk(KERN_ALERT
675 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 676 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
676 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 677 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2559 2560
2560static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2561static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2561{ 2562{
2563 debug_dma_assert_idle(src);
2564
2562 /* 2565 /*
2563 * If the source page was a PFN mapping, we don't have 2566 * If the source page was a PFN mapping, we don't have
2564 * a "struct page" for it. We do a best-effort copy by 2567 * a "struct page" for it. We do a best-effort copy by
@@ -2699,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2699 goto unwritable_page; 2702 goto unwritable_page;
2700 } 2703 }
2701 } else 2704 } else
2702 VM_BUG_ON(!PageLocked(old_page)); 2705 VM_BUG_ON_PAGE(!PageLocked(old_page), old_page);
2703 2706
2704 /* 2707 /*
2705 * Since we dropped the lock we need to revalidate 2708 * Since we dropped the lock we need to revalidate
@@ -3355,7 +3358,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3355 if (unlikely(!(ret & VM_FAULT_LOCKED))) 3358 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3356 lock_page(vmf.page); 3359 lock_page(vmf.page);
3357 else 3360 else
3358 VM_BUG_ON(!PageLocked(vmf.page)); 3361 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
3359 3362
3360 /* 3363 /*
3361 * Should we do an early C-O-W break? 3364 * Should we do an early C-O-W break?
@@ -3392,7 +3395,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3392 goto unwritable_page; 3395 goto unwritable_page;
3393 } 3396 }
3394 } else 3397 } else
3395 VM_BUG_ON(!PageLocked(page)); 3398 VM_BUG_ON_PAGE(!PageLocked(page), page);
3396 page_mkwrite = 1; 3399 page_mkwrite = 1;
3397 } 3400 }
3398 } 3401 }
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src,
4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4275#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4273 4276
4274#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 4277#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4278
4279static struct kmem_cache *page_ptl_cachep;
4280
4281void __init ptlock_cache_init(void)
4282{
4283 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4284 SLAB_PANIC, NULL);
4285}
4286
4275bool ptlock_alloc(struct page *page) 4287bool ptlock_alloc(struct page *page)
4276{ 4288{
4277 spinlock_t *ptl; 4289 spinlock_t *ptl;
4278 4290
4279 ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); 4291 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4280 if (!ptl) 4292 if (!ptl)
4281 return false; 4293 return false;
4282 page->ptl = ptl; 4294 page->ptl = ptl;
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page)
4285 4297
4286void ptlock_free(struct page *page) 4298void ptlock_free(struct page *page)
4287{ 4299{
4288 kfree(page->ptl); 4300 kmem_cache_free(page_ptl_cachep, page->ptl);
4289} 4301}
4290#endif 4302#endif
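Split-PTL spinlocks now come from a dedicated slab cache instead of kmalloc(), which keeps them out of the generic kmalloc-N caches and makes their footprint visible as a named cache (that motivation is an inference, not stated in the diff). Where ptlock_cache_init() gets called from lies outside this hunk; the sketch below just restates the create/alloc pattern.

static struct kmem_cache *ptl_cachep_sketch;

void __init ptlock_cache_init_sketch(void)
{
        /* SLAB_PANIC: there is no sensible recovery this early in boot */
        ptl_cachep_sketch = kmem_cache_create("page->ptl", sizeof(spinlock_t),
                                              0, SLAB_PANIC, NULL);
}

bool ptlock_alloc_sketch(struct page *page)
{
        spinlock_t *ptl = kmem_cache_alloc(ptl_cachep_sketch, GFP_KERNEL);

        if (!ptl)
                return false;
        page->ptl = ptl;
        return true;
}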
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502db..a650db29606f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
9#include <linux/swap.h> 9#include <linux/swap.h>
10#include <linux/interrupt.h> 10#include <linux/interrupt.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h>
13#include <linux/compiler.h> 12#include <linux/compiler.h>
14#include <linux/export.h> 13#include <linux/export.h>
15#include <linux/pagevec.h> 14#include <linux/pagevec.h>
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
269} 268}
270 269
271/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 270/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
272 * alloc_bootmem_node_nopanic() */ 271 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
273static int __ref ensure_zone_is_initialized(struct zone *zone, 272static int __ref ensure_zone_is_initialized(struct zone *zone,
274 unsigned long start_pfn, unsigned long num_pages) 273 unsigned long start_pfn, unsigned long num_pages)
275{ 274{
@@ -1108,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
1108 if (ret) 1107 if (ret)
1109 return ret; 1108 return ret;
1110 1109
1111 lock_memory_hotplug();
1112
1113 res = register_memory_resource(start, size); 1110 res = register_memory_resource(start, size);
1114 ret = -EEXIST; 1111 ret = -EEXIST;
1115 if (!res) 1112 if (!res)
1116 goto out; 1113 return ret;
1117 1114
1118 { /* Stupid hack to suppress address-never-null warning */ 1115 { /* Stupid hack to suppress address-never-null warning */
1119 void *p = NODE_DATA(nid); 1116 void *p = NODE_DATA(nid);
1120 new_pgdat = !p; 1117 new_pgdat = !p;
1121 } 1118 }
1119
1120 lock_memory_hotplug();
1121
1122 new_node = !node_online(nid); 1122 new_node = !node_online(nid);
1123 if (new_node) { 1123 if (new_node) {
1124 pgdat = hotadd_new_pgdat(nid, start); 1124 pgdat = hotadd_new_pgdat(nid, start);
@@ -1310,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1310#ifdef CONFIG_DEBUG_VM 1310#ifdef CONFIG_DEBUG_VM
1311 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 1311 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
1312 pfn); 1312 pfn);
1313 dump_page(page); 1313 dump_page(page, "failed to remove from LRU");
1314#endif 1314#endif
1315 put_page(page); 1315 put_page(page);
1316 /* Because we don't have big zone->lock. we should 1316 /* Because we don't have big zone->lock. we should
@@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p)
1446 * the kernel away from hotpluggable memory. 1446 * the kernel away from hotpluggable memory.
1447 */ 1447 */
1448 memblock_set_bottom_up(true); 1448 memblock_set_bottom_up(true);
1449 movable_node_enabled = true;
1449#else 1450#else
1450 pr_warn("movable_node option not supported\n"); 1451 pr_warn("movable_node option not supported\n");
1451#endif 1452#endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0cd2c4d4e270..ae3c8f3595d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
613 return 0; 613 return 0;
614} 614}
615 615
616#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE 616#ifdef CONFIG_NUMA_BALANCING
617/* 617/*
618 * This is used to mark a range of virtual addresses to be inaccessible. 618 * This is used to mark a range of virtual addresses to be inaccessible.
619 * These are later cleared by a NUMA hinting fault. Depending on these 619 * These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
627 unsigned long addr, unsigned long end) 627 unsigned long addr, unsigned long end)
628{ 628{
629 int nr_updated; 629 int nr_updated;
630 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
631 630
632 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 631 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
633 if (nr_updated) 632 if (nr_updated)
@@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
641{ 640{
642 return 0; 641 return 0;
643} 642}
644#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ 643#endif /* CONFIG_NUMA_BALANCING */
645 644
646/* 645/*
647 * Walk through page tables and collect pages to be migrated. 646 * Walk through page tables and collect pages to be migrated.
@@ -1199,10 +1198,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
1199 } 1198 }
1200 1199
1201 if (PageHuge(page)) { 1200 if (PageHuge(page)) {
1202 if (vma) 1201 BUG_ON(!vma);
1203 return alloc_huge_page_noerr(vma, address, 1); 1202 return alloc_huge_page_noerr(vma, address, 1);
1204 else
1205 return NULL;
1206 } 1203 }
1207 /* 1204 /*
1208 * if !vma, alloc_page_vma() will use task or system default policy 1205 * if !vma, alloc_page_vma() will use task or system default policy
@@ -2657,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
2657} 2654}
2658 2655
2659#ifdef CONFIG_NUMA_BALANCING 2656#ifdef CONFIG_NUMA_BALANCING
2660static bool __initdata numabalancing_override; 2657static int __initdata numabalancing_override;
2661 2658
2662static void __init check_numabalancing_enable(void) 2659static void __init check_numabalancing_enable(void)
2663{ 2660{
@@ -2666,9 +2663,15 @@ static void __init check_numabalancing_enable(void)
2666 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 2663 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2667 numabalancing_default = true; 2664 numabalancing_default = true;
2668 2665
2666 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2667 if (numabalancing_override)
2668 set_numabalancing_state(numabalancing_override == 1);
2669
2669 if (nr_node_ids > 1 && !numabalancing_override) { 2670 if (nr_node_ids > 1 && !numabalancing_override) {
2670 printk(KERN_INFO "Enabling automatic NUMA balancing. " 2671 pr_info("%s automatic NUMA balancing. "
2671 "Configure with numa_balancing= or sysctl"); 2672 "Configure with numa_balancing= or the "
2673 "kernel.numa_balancing sysctl",
2674 numabalancing_default ? "Enabling" : "Disabling");
2672 set_numabalancing_state(numabalancing_default); 2675 set_numabalancing_state(numabalancing_default);
2673 } 2676 }
2674} 2677}
@@ -2678,18 +2681,17 @@ static int __init setup_numabalancing(char *str)
2678 int ret = 0; 2681 int ret = 0;
2679 if (!str) 2682 if (!str)
2680 goto out; 2683 goto out;
2681 numabalancing_override = true;
2682 2684
2683 if (!strcmp(str, "enable")) { 2685 if (!strcmp(str, "enable")) {
2684 set_numabalancing_state(true); 2686 numabalancing_override = 1;
2685 ret = 1; 2687 ret = 1;
2686 } else if (!strcmp(str, "disable")) { 2688 } else if (!strcmp(str, "disable")) {
2687 set_numabalancing_state(false); 2689 numabalancing_override = -1;
2688 ret = 1; 2690 ret = 1;
2689 } 2691 }
2690out: 2692out:
2691 if (!ret) 2693 if (!ret)
2692 printk(KERN_WARNING "Unable to parse numa_balancing=\n"); 2694 pr_warn("Unable to parse numa_balancing=\n");
2693 2695
2694 return ret; 2696 return ret;
2695} 2697}
@@ -2928,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2928 unsigned short mode = MPOL_DEFAULT; 2930 unsigned short mode = MPOL_DEFAULT;
2929 unsigned short flags = 0; 2931 unsigned short flags = 0;
2930 2932
2931 if (pol && pol != &default_policy) { 2933 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2932 mode = pol->mode; 2934 mode = pol->mode;
2933 flags = pol->flags; 2935 flags = pol->flags;
2934 } 2936 }
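The mempolicy.c change turns numabalancing_override from a bool into a tri-state so the command-line parser only records the request and check_numabalancing_enable() applies it later, once it can also report whether the Kconfig default enables or disables balancing. A small sketch of the resulting decision, with the values taken from the comment added above (1 = enable, -1 = disable, 0 = unset):

static void __init numabalancing_decide_sketch(void)
{
        bool default_on = IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        if (numabalancing_override)     /* explicit numa_balancing= option */
                set_numabalancing_state(numabalancing_override == 1);
        else if (nr_node_ids > 1)       /* otherwise fall back to the default */
                set_numabalancing_state(default_on);
}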
diff --git a/mm/migrate.c b/mm/migrate.c
index 9194375b2307..482a33d89134 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -72,28 +72,12 @@ int migrate_prep_local(void)
72} 72}
73 73
74/* 74/*
75 * Add isolated pages on the list back to the LRU under page lock
76 * to avoid leaking evictable pages back onto unevictable list.
77 */
78void putback_lru_pages(struct list_head *l)
79{
80 struct page *page;
81 struct page *page2;
82
83 list_for_each_entry_safe(page, page2, l, lru) {
84 list_del(&page->lru);
85 dec_zone_page_state(page, NR_ISOLATED_ANON +
86 page_is_file_cache(page));
87 putback_lru_page(page);
88 }
89}
90
91/*
92 * Put previously isolated pages back onto the appropriate lists 75 * Put previously isolated pages back onto the appropriate lists
93 * from where they were once taken off for compaction/migration. 76 * from where they were once taken off for compaction/migration.
94 * 77 *
95 * This function shall be used instead of putback_lru_pages(), 78 * This function shall be used whenever the isolated pageset has been
96 * whenever the isolated pageset has been built by isolate_migratepages_range() 79 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
80 * and isolate_huge_page().
97 */ 81 */
98void putback_movable_pages(struct list_head *l) 82void putback_movable_pages(struct list_head *l)
99{ 83{
@@ -199,7 +183,12 @@ out:
199 */ 183 */
200static void remove_migration_ptes(struct page *old, struct page *new) 184static void remove_migration_ptes(struct page *old, struct page *new)
201{ 185{
202 rmap_walk(new, remove_migration_pte, old); 186 struct rmap_walk_control rwc = {
187 .rmap_one = remove_migration_pte,
188 .arg = old,
189 };
190
191 rmap_walk(new, &rwc);
203} 192}
204 193
205/* 194/*
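The remove_migration_ptes() change above is part of the rmap_walk() rework: instead of passing a callback and an extra argument separately, callers fill in a struct rmap_walk_control and hand that to the walker. A small self-contained sketch of that control-struct pattern in plain C; the names here are hypothetical, not the kernel's rmap API:

#include <stdio.h>

/* Bundle the per-item callback and its private argument, so the walker's
 * signature stays stable as more fields (filters, locking hooks) are added. */
struct walk_control {
        int (*visit_one)(int item, void *arg);
        void *arg;
};

static int print_with_prefix(int item, void *arg)
{
        printf("%s%d\n", (const char *)arg, item);
        return 0;                       /* non-zero would stop the walk */
}

static void walk(const int *items, int n, const struct walk_control *wc)
{
        for (int i = 0; i < n; i++)
                if (wc->visit_one(items[i], wc->arg))
                        break;
}

int main(void)
{
        int items[] = { 1, 2, 3 };
        struct walk_control wc = {
                .visit_one = print_with_prefix,
                .arg = "item ",
        };

        walk(items, 3, &wc);
        return 0;
}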
@@ -510,7 +499,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
510 if (PageUptodate(page)) 499 if (PageUptodate(page))
511 SetPageUptodate(newpage); 500 SetPageUptodate(newpage);
512 if (TestClearPageActive(page)) { 501 if (TestClearPageActive(page)) {
513 VM_BUG_ON(PageUnevictable(page)); 502 VM_BUG_ON_PAGE(PageUnevictable(page), page);
514 SetPageActive(newpage); 503 SetPageActive(newpage);
515 } else if (TestClearPageUnevictable(page)) 504 } else if (TestClearPageUnevictable(page))
516 SetPageUnevictable(newpage); 505 SetPageUnevictable(newpage);
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
563 * Migration functions 552 * Migration functions
564 ***********************************************************/ 553 ***********************************************************/
565 554
566/* Always fail migration. Used for mappings that are not movable */
567int fail_migrate_page(struct address_space *mapping,
568 struct page *newpage, struct page *page)
569{
570 return -EIO;
571}
572EXPORT_SYMBOL(fail_migrate_page);
573
574/* 555/*
575 * Common logic to directly migrate a single page suitable for 556 * Common logic to directly migrate a single page suitable for
576 * pages that do not use PagePrivate/PagePrivate2. 557 * pages that do not use PagePrivate/PagePrivate2.
@@ -890,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
890 * free the metadata, so the page can be freed. 871 * free the metadata, so the page can be freed.
891 */ 872 */
892 if (!page->mapping) { 873 if (!page->mapping) {
893 VM_BUG_ON(PageAnon(page)); 874 VM_BUG_ON_PAGE(PageAnon(page), page);
894 if (page_has_private(page)) { 875 if (page_has_private(page)) {
895 try_to_free_buffers(page); 876 try_to_free_buffers(page);
896 goto uncharge; 877 goto uncharge;
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1008{ 989{
1009 int rc = 0; 990 int rc = 0;
1010 int *result = NULL; 991 int *result = NULL;
1011 struct page *new_hpage = get_new_page(hpage, private, &result); 992 struct page *new_hpage;
1012 struct anon_vma *anon_vma = NULL; 993 struct anon_vma *anon_vma = NULL;
1013 994
1014 /* 995 /*
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1018 * tables or check whether the hugepage is pmd-based or not before 999 * tables or check whether the hugepage is pmd-based or not before
1019 * kicking migration. 1000 * kicking migration.
1020 */ 1001 */
1021 if (!hugepage_migration_support(page_hstate(hpage))) 1002 if (!hugepage_migration_support(page_hstate(hpage))) {
1003 putback_active_hugepage(hpage);
1022 return -ENOSYS; 1004 return -ENOSYS;
1005 }
1023 1006
1007 new_hpage = get_new_page(hpage, private, &result);
1024 if (!new_hpage) 1008 if (!new_hpage)
1025 return -ENOMEM; 1009 return -ENOMEM;
1026 1010
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1120 nr_succeeded++; 1104 nr_succeeded++;
1121 break; 1105 break;
1122 default: 1106 default:
1123 /* Permanent failure */ 1107 /*
1108 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1109 * unlike the -EAGAIN case, the failed page is
1110 * removed from the migration page list and not
1111 * retried in the next outer loop.
1112 */
1124 nr_failed++; 1113 nr_failed++;
1125 break; 1114 break;
1126 } 1115 }
@@ -1559,8 +1548,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1559 __GFP_NOMEMALLOC | __GFP_NORETRY | 1548 __GFP_NOMEMALLOC | __GFP_NORETRY |
1560 __GFP_NOWARN) & 1549 __GFP_NOWARN) &
1561 ~GFP_IOFS, 0); 1550 ~GFP_IOFS, 0);
1562 if (newpage)
1563 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1564 1551
1565 return newpage; 1552 return newpage;
1566} 1553}
@@ -1594,35 +1581,42 @@ bool migrate_ratelimited(int node)
1594} 1581}
1595 1582
1596/* Returns true if the node is migrate rate-limited after the update */ 1583/* Returns true if the node is migrate rate-limited after the update */
1597bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) 1584static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1585 unsigned long nr_pages)
1598{ 1586{
1599 bool rate_limited = false;
1600
1601 /* 1587 /*
1602 * Rate-limit the amount of data that is being migrated to a node. 1588 * Rate-limit the amount of data that is being migrated to a node.
1603 * Optimal placement is no good if the memory bus is saturated and 1589 * Optimal placement is no good if the memory bus is saturated and
1604 * all the time is being spent migrating! 1590 * all the time is being spent migrating!
1605 */ 1591 */
1606 spin_lock(&pgdat->numabalancing_migrate_lock);
1607 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1592 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1593 spin_lock(&pgdat->numabalancing_migrate_lock);
1608 pgdat->numabalancing_migrate_nr_pages = 0; 1594 pgdat->numabalancing_migrate_nr_pages = 0;
1609 pgdat->numabalancing_migrate_next_window = jiffies + 1595 pgdat->numabalancing_migrate_next_window = jiffies +
1610 msecs_to_jiffies(migrate_interval_millisecs); 1596 msecs_to_jiffies(migrate_interval_millisecs);
1597 spin_unlock(&pgdat->numabalancing_migrate_lock);
1611 } 1598 }
1612 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) 1599 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1613 rate_limited = true; 1600 trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1614 else 1601 nr_pages);
1615 pgdat->numabalancing_migrate_nr_pages += nr_pages; 1602 return true;
1616 spin_unlock(&pgdat->numabalancing_migrate_lock); 1603 }
1617 1604
1618 return rate_limited; 1605 /*
1606 * This is an unlocked non-atomic update so errors are possible.
1607 * The consequences are failing to migrate when we potentially should
1608 * have, which is not severe enough to warrant locking. If it is ever
1609 * a problem, it can be converted to a per-cpu counter.
1610 */
1611 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1612 return false;
1619} 1613}
1620 1614
1621int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1615static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1622{ 1616{
1623 int page_lru; 1617 int page_lru;
1624 1618
1625 VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); 1619 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1626 1620
1627 /* Avoid migrating to a node that is nearly full */ 1621 /* Avoid migrating to a node that is nearly full */
1628 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) 1622 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
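The rewritten numamigrate_update_ratelimit() above only takes the spinlock to reset the window and tolerates an unlocked counter update afterwards. A rough userspace sketch of that windowed rate limit, with a one-second window and a made-up page budget standing in for migrate_interval_millisecs and ratelimit_pages:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_SECS   1
#define BUDGET_PAGES  128

static time_t window_end;
static unsigned long window_pages;

static bool rate_limited(unsigned long nr_pages)
{
        time_t now = time(NULL);

        if (now >= window_end) {        /* window expired: reset the counter */
                window_end = now + WINDOW_SECS;
                window_pages = 0;
        }
        if (window_pages > BUDGET_PAGES)
                return true;            /* over budget: skip this batch */

        window_pages += nr_pages;       /* deliberately unlocked, as in the hunk above */
        return false;
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                printf("batch %d rate limited: %d\n", i, rate_limited(64));
        return 0;
}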
@@ -1705,7 +1699,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1705 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1699 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1706 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1700 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1707 if (nr_remaining) { 1701 if (nr_remaining) {
1708 putback_lru_pages(&migratepages); 1702 if (!list_empty(&migratepages)) {
1703 list_del(&page->lru);
1704 dec_zone_page_state(page, NR_ISOLATED_ANON +
1705 page_is_file_cache(page));
1706 putback_lru_page(page);
1707 }
1709 isolated = 0; 1708 isolated = 0;
1710 } else 1709 } else
1711 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1710 count_vm_numa_event(NUMA_PAGE_MIGRATE);
@@ -1752,8 +1751,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1752 if (!new_page) 1751 if (!new_page)
1753 goto out_fail; 1752 goto out_fail;
1754 1753
1755 page_cpupid_xchg_last(new_page, page_cpupid_last(page));
1756
1757 isolated = numamigrate_isolate_page(pgdat, page); 1754 isolated = numamigrate_isolate_page(pgdat, page);
1758 if (!isolated) { 1755 if (!isolated) {
1759 put_page(new_page); 1756 put_page(new_page);
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..101623378fbf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
225 225
226 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); 226 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
227 227
228 if (is_vm_hugetlb_page(vma)) {
229 mincore_hugetlb_page_range(vma, addr, end, vec);
230 return (end - addr) >> PAGE_SHIFT;
231 }
232
233 end = pmd_addr_end(addr, end);
234
235 if (is_vm_hugetlb_page(vma)) 228 if (is_vm_hugetlb_page(vma))
236 mincore_hugetlb_page_range(vma, addr, end, vec); 229 mincore_hugetlb_page_range(vma, addr, end, vec);
237 else 230 else
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6eebe4f2..4e1a68162285 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page)
91} 91}
92 92
93/* 93/*
94 * Isolate a page from LRU with optional get_page() pin.
95 * Assumes lru_lock already held and page already pinned.
96 */
97static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
98{
99 if (PageLRU(page)) {
100 struct lruvec *lruvec;
101
102 lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
103 if (getpage)
104 get_page(page);
105 ClearPageLRU(page);
106 del_page_from_lru_list(page, lruvec, page_lru(page));
107 return true;
108 }
109
110 return false;
111}
112
113/*
94 * Finish munlock after successful page isolation 114 * Finish munlock after successful page isolation
95 * 115 *
96 * Page must be locked. This is a wrapper for try_to_munlock() 116 * Page must be locked. This is a wrapper for try_to_munlock()
@@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page)
126static void __munlock_isolation_failed(struct page *page) 146static void __munlock_isolation_failed(struct page *page)
127{ 147{
128 if (PageUnevictable(page)) 148 if (PageUnevictable(page))
129 count_vm_event(UNEVICTABLE_PGSTRANDED); 149 __count_vm_event(UNEVICTABLE_PGSTRANDED);
130 else 150 else
131 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 151 __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
132} 152}
133 153
134/** 154/**
@@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page)
152unsigned int munlock_vma_page(struct page *page) 172unsigned int munlock_vma_page(struct page *page)
153{ 173{
154 unsigned int nr_pages; 174 unsigned int nr_pages;
175 struct zone *zone = page_zone(page);
155 176
156 BUG_ON(!PageLocked(page)); 177 BUG_ON(!PageLocked(page));
157 178
158 if (TestClearPageMlocked(page)) {
159 nr_pages = hpage_nr_pages(page);
160 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
161 if (!isolate_lru_page(page))
162 __munlock_isolated_page(page);
163 else
164 __munlock_isolation_failed(page);
165 } else {
166 nr_pages = hpage_nr_pages(page);
167 }
168
169 /* 179 /*
170 * Regardless of the original PageMlocked flag, we determine nr_pages 180 * Serialize with any parallel __split_huge_page_refcount() which
171 * after touching the flag. This leaves a possible race with a THP page 181 * might otherwise copy PageMlocked to part of the tail pages before
172 * split, such that a whole THP page was munlocked, but nr_pages == 1. 182 * we clear it in the head page. It also stabilizes hpage_nr_pages().
173 * Returning a smaller mask due to that is OK, the worst that can
174 * happen is subsequent useless scanning of the former tail pages.
175 * The NR_MLOCK accounting can however become broken.
176 */ 183 */
184 spin_lock_irq(&zone->lru_lock);
185
186 nr_pages = hpage_nr_pages(page);
187 if (!TestClearPageMlocked(page))
188 goto unlock_out;
189
190 __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
191
192 if (__munlock_isolate_lru_page(page, true)) {
193 spin_unlock_irq(&zone->lru_lock);
194 __munlock_isolated_page(page);
195 goto out;
196 }
197 __munlock_isolation_failed(page);
198
199unlock_out:
200 spin_unlock_irq(&zone->lru_lock);
201
202out:
177 return nr_pages - 1; 203 return nr_pages - 1;
178} 204}
179 205
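In the munlock_vma_page() rewrite above, zone->lru_lock is taken before TestClearPageMlocked() so that clearing the flag and reading hpage_nr_pages() happen under the same lock and cannot race with a THP split. A minimal pthread sketch of that test-and-clear-under-lock ordering; the flag and page count are stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool mlocked = true;
static int nr_pages = 512;              /* could change under concurrent updaters */

static int clear_and_count(void)
{
        int ret = 0;

        pthread_mutex_lock(&lock);
        if (mlocked) {
                mlocked = false;
                ret = nr_pages;         /* read while still holding the lock */
        }
        pthread_mutex_unlock(&lock);
        return ret;
}

int main(void)
{
        int first = clear_and_count();
        int second = clear_and_count();

        printf("first: %d, second: %d\n", first, second);
        return 0;
}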
@@ -253,8 +279,8 @@ static int __mlock_posix_error_return(long retval)
253static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, 279static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
254 int *pgrescued) 280 int *pgrescued)
255{ 281{
256 VM_BUG_ON(PageLRU(page)); 282 VM_BUG_ON_PAGE(PageLRU(page), page);
257 VM_BUG_ON(!PageLocked(page)); 283 VM_BUG_ON_PAGE(!PageLocked(page), page);
258 284
259 if (page_mapcount(page) <= 1 && page_evictable(page)) { 285 if (page_mapcount(page) <= 1 && page_evictable(page)) {
260 pagevec_add(pvec, page); 286 pagevec_add(pvec, page);
@@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
310 struct page *page = pvec->pages[i]; 336 struct page *page = pvec->pages[i];
311 337
312 if (TestClearPageMlocked(page)) { 338 if (TestClearPageMlocked(page)) {
313 struct lruvec *lruvec;
314 int lru;
315
316 if (PageLRU(page)) {
317 lruvec = mem_cgroup_page_lruvec(page, zone);
318 lru = page_lru(page);
319 /*
320 * We already have pin from follow_page_mask()
321 * so we can spare the get_page() here.
322 */
323 ClearPageLRU(page);
324 del_page_from_lru_list(page, lruvec, lru);
325 } else {
326 __munlock_isolation_failed(page);
327 goto skip_munlock;
328 }
329
330 } else {
331skip_munlock:
332 /* 339 /*
333 * We won't be munlocking this page in the next phase 340 * We already have pin from follow_page_mask()
334 * but we still need to release the follow_page_mask() 341 * so we can spare the get_page() here.
335 * pin. We cannot do it under lru_lock however. If it's
336 * the last pin, __page_cache_release would deadlock.
337 */ 342 */
338 pagevec_add(&pvec_putback, pvec->pages[i]); 343 if (__munlock_isolate_lru_page(page, false))
339 pvec->pages[i] = NULL; 344 continue;
345 else
346 __munlock_isolation_failed(page);
340 } 347 }
348
349 /*
350 * We won't be munlocking this page in the next phase
351 * but we still need to release the follow_page_mask()
352 * pin. We cannot do it under lru_lock however. If it's
353 * the last pin, __page_cache_release() would deadlock.
354 */
355 pagevec_add(&pvec_putback, pvec->pages[i]);
356 pvec->pages[i] = NULL;
341 } 357 }
342 delta_munlocked = -nr + pagevec_count(&pvec_putback); 358 delta_munlocked = -nr + pagevec_count(&pvec_putback);
343 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 359 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
@@ -709,19 +725,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
709 725
710 lru_add_drain_all(); /* flush pagevec */ 726 lru_add_drain_all(); /* flush pagevec */
711 727
712 down_write(&current->mm->mmap_sem);
713 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 728 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
714 start &= PAGE_MASK; 729 start &= PAGE_MASK;
715 730
716 locked = len >> PAGE_SHIFT;
717 locked += current->mm->locked_vm;
718
719 lock_limit = rlimit(RLIMIT_MEMLOCK); 731 lock_limit = rlimit(RLIMIT_MEMLOCK);
720 lock_limit >>= PAGE_SHIFT; 732 lock_limit >>= PAGE_SHIFT;
733 locked = len >> PAGE_SHIFT;
734
735 down_write(&current->mm->mmap_sem);
736
737 locked += current->mm->locked_vm;
721 738
722 /* check against resource limits */ 739 /* check against resource limits */
723 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 740 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
724 error = do_mlock(start, len, 1); 741 error = do_mlock(start, len, 1);
742
725 up_write(&current->mm->mmap_sem); 743 up_write(&current->mm->mmap_sem);
726 if (!error) 744 if (!error)
727 error = __mm_populate(start, len, 0); 745 error = __mm_populate(start, len, 0);
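The sys_mlock() hunk above computes the page-aligned length and the RLIMIT_MEMLOCK budget before taking mmap_sem, shrinking the critical section. A userspace sketch of the same alignment and limit arithmetic (the real check also adds mm->locked_vm and honours CAP_IPC_LOCK):

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
        unsigned long page = (unsigned long)sysconf(_SC_PAGESIZE);
        unsigned long mask = page - 1;
        unsigned long start = 0x1234, len = 3 * page;
        struct rlimit rl;

        /* PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; */
        len = (len + (start & mask) + mask) & ~mask;
        start &= ~mask;

        if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
                return 1;

        unsigned long lock_limit = rl.rlim_cur / page;
        unsigned long locked = len / page;

        printf("request: %lu pages, limit: %lu pages -> %s\n", locked, lock_limit,
               locked <= lock_limit ? "ok" : "would fail with -EAGAIN");
        return 0;
}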
@@ -732,11 +750,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
732{ 750{
733 int ret; 751 int ret;
734 752
735 down_write(&current->mm->mmap_sem);
736 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 753 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
737 start &= PAGE_MASK; 754 start &= PAGE_MASK;
755
756 down_write(&current->mm->mmap_sem);
738 ret = do_mlock(start, len, 0); 757 ret = do_mlock(start, len, 0);
739 up_write(&current->mm->mmap_sem); 758 up_write(&current->mm->mmap_sem);
759
740 return ret; 760 return ret;
741} 761}
742 762
@@ -781,12 +801,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
781 if (flags & MCL_CURRENT) 801 if (flags & MCL_CURRENT)
782 lru_add_drain_all(); /* flush pagevec */ 802 lru_add_drain_all(); /* flush pagevec */
783 803
784 down_write(&current->mm->mmap_sem);
785
786 lock_limit = rlimit(RLIMIT_MEMLOCK); 804 lock_limit = rlimit(RLIMIT_MEMLOCK);
787 lock_limit >>= PAGE_SHIFT; 805 lock_limit >>= PAGE_SHIFT;
788 806
789 ret = -ENOMEM; 807 ret = -ENOMEM;
808 down_write(&current->mm->mmap_sem);
809
790 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 810 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
791 capable(CAP_IPC_LOCK)) 811 capable(CAP_IPC_LOCK))
792 ret = do_mlockall(flags); 812 ret = do_mlockall(flags);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 68562e92d50c..4074caf9936b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void)
202 202
203 return 0; 203 return 0;
204} 204}
205 205postcore_initcall(mm_sysfs_init);
206__initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..20ff0c33274c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
86 86
87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89unsigned long sysctl_overcommit_kbytes __read_mostly;
89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 90int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 91unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 92unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -893,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end);
893static inline int is_mergeable_vma(struct vm_area_struct *vma, 894static inline int is_mergeable_vma(struct vm_area_struct *vma,
894 struct file *file, unsigned long vm_flags) 895 struct file *file, unsigned long vm_flags)
895{ 896{
896 if (vma->vm_flags ^ vm_flags) 897 /*
898 * VM_SOFTDIRTY should not prevent VMA merging if the flags
899 * match except for the dirty bit -- the caller should mark the
900 * merged VMA as dirty. If the dirty bit were not excluded from
901 * the comparison, we would increase pressure on the memory
902 * system by forcing the kernel to generate new VMAs when old
903 * ones could be extended instead.
904 */
905 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
897 return 0; 906 return 0;
898 if (vma->vm_file != file) 907 if (vma->vm_file != file)
899 return 0; 908 return 0;
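The is_mergeable_vma() change above compares the two flag words with XOR and masks out VM_SOFTDIRTY, so VMAs that differ only in the soft-dirty bit can still be merged. A tiny sketch of that masked comparison; the bit values below are illustrative constants, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

#define VM_READ       0x00000001UL
#define VM_WRITE      0x00000002UL
#define VM_SOFTDIRTY  0x08000000UL      /* placeholder value for the demo */

static bool flags_mergeable(unsigned long a, unsigned long b)
{
        return ((a ^ b) & ~VM_SOFTDIRTY) == 0;
}

int main(void)
{
        unsigned long a = VM_READ | VM_WRITE | VM_SOFTDIRTY;
        unsigned long b = VM_READ | VM_WRITE;

        printf("%d\n", flags_mergeable(a, b));          /* 1: only soft-dirty differs */
        printf("%d\n", flags_mergeable(a, VM_READ));    /* 0: write permission differs */
        return 0;
}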
@@ -1082,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1082 return a->vm_end == b->vm_start && 1091 return a->vm_end == b->vm_start &&
1083 mpol_equal(vma_policy(a), vma_policy(b)) && 1092 mpol_equal(vma_policy(a), vma_policy(b)) &&
1084 a->vm_file == b->vm_file && 1093 a->vm_file == b->vm_file &&
1085 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && 1094 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1086 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1095 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1087} 1096}
1088 1097
@@ -1190,6 +1199,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1190 return hint; 1199 return hint;
1191} 1200}
1192 1201
1202static inline int mlock_future_check(struct mm_struct *mm,
1203 unsigned long flags,
1204 unsigned long len)
1205{
1206 unsigned long locked, lock_limit;
1207
1208 /* mlock MCL_FUTURE? */
1209 if (flags & VM_LOCKED) {
1210 locked = len >> PAGE_SHIFT;
1211 locked += mm->locked_vm;
1212 lock_limit = rlimit(RLIMIT_MEMLOCK);
1213 lock_limit >>= PAGE_SHIFT;
1214 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1215 return -EAGAIN;
1216 }
1217 return 0;
1218}
1219
1193/* 1220/*
1194 * The caller must hold down_write(&current->mm->mmap_sem). 1221 * The caller must hold down_write(&current->mm->mmap_sem).
1195 */ 1222 */
@@ -1251,16 +1278,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1251 if (!can_do_mlock()) 1278 if (!can_do_mlock())
1252 return -EPERM; 1279 return -EPERM;
1253 1280
1254 /* mlock MCL_FUTURE? */ 1281 if (mlock_future_check(mm, vm_flags, len))
1255 if (vm_flags & VM_LOCKED) { 1282 return -EAGAIN;
1256 unsigned long locked, lock_limit;
1257 locked = len >> PAGE_SHIFT;
1258 locked += mm->locked_vm;
1259 lock_limit = rlimit(RLIMIT_MEMLOCK);
1260 lock_limit >>= PAGE_SHIFT;
1261 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1262 return -EAGAIN;
1263 }
1264 1283
1265 if (file) { 1284 if (file) {
1266 struct inode *inode = file_inode(file); 1285 struct inode *inode = file_inode(file);
@@ -2591,18 +2610,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2591 if (error & ~PAGE_MASK) 2610 if (error & ~PAGE_MASK)
2592 return error; 2611 return error;
2593 2612
2594 /* 2613 error = mlock_future_check(mm, mm->def_flags, len);
2595 * mlock MCL_FUTURE? 2614 if (error)
2596 */ 2615 return error;
2597 if (mm->def_flags & VM_LOCKED) {
2598 unsigned long locked, lock_limit;
2599 locked = len >> PAGE_SHIFT;
2600 locked += mm->locked_vm;
2601 lock_limit = rlimit(RLIMIT_MEMLOCK);
2602 lock_limit >>= PAGE_SHIFT;
2603 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2604 return -EAGAIN;
2605 }
2606 2616
2607 /* 2617 /*
2608 * mm->mmap_sem is required to protect against another thread 2618 * mm->mmap_sem is required to protect against another thread
@@ -3140,7 +3150,7 @@ static int init_user_reserve(void)
3140 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3150 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3141 return 0; 3151 return 0;
3142} 3152}
3143module_init(init_user_reserve) 3153subsys_initcall(init_user_reserve);
3144 3154
3145/* 3155/*
3146 * Initialise sysctl_admin_reserve_kbytes. 3156 * Initialise sysctl_admin_reserve_kbytes.
@@ -3161,7 +3171,7 @@ static int init_admin_reserve(void)
3161 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3171 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3162 return 0; 3172 return 0;
3163} 3173}
3164module_init(init_admin_reserve) 3174subsys_initcall(init_admin_reserve);
3165 3175
3166/* 3176/*
3167 * Reinititalise user and admin reserves if memory is added or removed. 3177 * Reinititalise user and admin reserves if memory is added or removed.
@@ -3231,4 +3241,4 @@ static int __meminit init_reserve_notifier(void)
3231 3241
3232 return 0; 3242 return 0;
3233} 3243}
3234module_init(init_reserve_notifier) 3244subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089cb456..41cefdf0aadd 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void)
329{ 329{
330 return init_srcu_struct(&srcu); 330 return init_srcu_struct(&srcu);
331} 331}
332 332subsys_initcall(mmu_notifier_init);
333module_init(mmu_notifier_init);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bb53a6591aea..7332c1785744 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/mmu_notifier.h> 23#include <linux/mmu_notifier.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/ksm.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 64
64 ptent = *pte; 65 ptent = *pte;
65 page = vm_normal_page(vma, addr, oldpte); 66 page = vm_normal_page(vma, addr, oldpte);
66 if (page) { 67 if (page && !PageKsm(page)) {
67 if (!pte_numa(oldpte)) { 68 if (!pte_numa(oldpte)) {
68 ptent = pte_mknuma(ptent); 69 ptent = pte_mknuma(ptent);
69 set_pte_at(mm, addr, pte, ptent); 70 set_pte_at(mm, addr, pte, ptent);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d374655..f73f2987a852 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,11 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
41 if (limit > memblock.current_limit) 41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 42 limit = memblock.current_limit;
43 43
44 addr = memblock_find_in_range_node(goal, limit, size, align, nid); 44 addr = memblock_find_in_range_node(size, align, goal, limit, nid);
45 if (!addr) 45 if (!addr)
46 return NULL; 46 return NULL;
47 47
48 memblock_reserve(addr, size); 48 if (memblock_reserve(addr, size))
49 return NULL;
50
49 ptr = phys_to_virt(addr); 51 ptr = phys_to_virt(addr);
50 memset(ptr, 0, size); 52 memset(ptr, 0, size);
51 /* 53 /*
@@ -114,16 +116,27 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
114static unsigned long __init free_low_memory_core_early(void) 116static unsigned long __init free_low_memory_core_early(void)
115{ 117{
116 unsigned long count = 0; 118 unsigned long count = 0;
117 phys_addr_t start, end, size; 119 phys_addr_t start, end;
118 u64 i; 120 u64 i;
119 121
120 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 122 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
121 count += __free_memory_core(start, end); 123 count += __free_memory_core(start, end);
122 124
123 /* free range that is used for reserved array if we allocate it */ 125#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
124 size = get_allocated_memblock_reserved_regions_info(&start); 126 {
125 if (size) 127 phys_addr_t size;
126 count += __free_memory_core(start, start + size); 128
129 /* Free memblock.reserved array if it was allocated */
130 size = get_allocated_memblock_reserved_regions_info(&start);
131 if (size)
132 count += __free_memory_core(start, start + size);
133
134 /* Free memblock.memory array if it was allocated */
135 size = get_allocated_memblock_memory_regions_info(&start);
136 if (size)
137 count += __free_memory_core(start, start + size);
138 }
139#endif
127 140
128 return count; 141 return count;
129} 142}
@@ -161,7 +174,7 @@ unsigned long __init free_all_bootmem(void)
161 reset_all_zones_managed_pages(); 174 reset_all_zones_managed_pages();
162 175
163 /* 176 /*
164 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 177 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
165 * because in some case like Node0 doesn't have RAM installed 178 * because in some case like Node0 doesn't have RAM installed
166 * low ram will be on Node1 179 * low ram will be on Node1
167 */ 180 */
@@ -215,7 +228,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
215 228
216restart: 229restart:
217 230
218 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); 231 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
219 232
220 if (ptr) 233 if (ptr)
221 return ptr; 234 return ptr;
@@ -299,7 +312,7 @@ again:
299 if (ptr) 312 if (ptr)
300 return ptr; 313 return ptr;
301 314
302 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 315 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
303 goal, limit); 316 goal, limit);
304 if (ptr) 317 if (ptr)
305 return ptr; 318 return ptr;
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9c..8740213b1647 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
60struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
62int sysctl_overcommit_ratio = 50; /* default is 50% */ 62int sysctl_overcommit_ratio = 50; /* default is 50% */
63unsigned long sysctl_overcommit_kbytes __read_mostly;
63int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
64int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
65unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a6163..3291e82d4352 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
47#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
48/** 48/**
49 * has_intersects_mems_allowed() - check task eligiblity for kill 49 * has_intersects_mems_allowed() - check task eligiblity for kill
50 * @tsk: task struct of which task to consider 50 * @start: task struct of which task to consider
51 * @mask: nodemask passed to page allocator for mempolicy ooms 51 * @mask: nodemask passed to page allocator for mempolicy ooms
52 * 52 *
53 * Task eligibility is determined by whether or not a candidate task, @tsk, 53 * Task eligibility is determined by whether or not a candidate task, @tsk,
54 * shares the same mempolicy nodes as current if it is bound by such a policy 54 * shares the same mempolicy nodes as current if it is bound by such a policy
55 * and whether or not it has the same set of allowed cpuset nodes. 55 * and whether or not it has the same set of allowed cpuset nodes.
56 */ 56 */
57static bool has_intersects_mems_allowed(struct task_struct *tsk, 57static bool has_intersects_mems_allowed(struct task_struct *start,
58 const nodemask_t *mask) 58 const nodemask_t *mask)
59{ 59{
60 struct task_struct *start = tsk; 60 struct task_struct *tsk;
61 bool ret = false;
61 62
62 do { 63 rcu_read_lock();
64 for_each_thread(start, tsk) {
63 if (mask) { 65 if (mask) {
64 /* 66 /*
65 * If this is a mempolicy constrained oom, tsk's 67 * If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
67 * mempolicy intersects current, otherwise it may be 69 * mempolicy intersects current, otherwise it may be
68 * needlessly killed. 70 * needlessly killed.
69 */ 71 */
70 if (mempolicy_nodemask_intersects(tsk, mask)) 72 ret = mempolicy_nodemask_intersects(tsk, mask);
71 return true;
72 } else { 73 } else {
73 /* 74 /*
74 * This is not a mempolicy constrained oom, so only 75 * This is not a mempolicy constrained oom, so only
75 * check the mems of tsk's cpuset. 76 * check the mems of tsk's cpuset.
76 */ 77 */
77 if (cpuset_mems_allowed_intersects(current, tsk)) 78 ret = cpuset_mems_allowed_intersects(current, tsk);
78 return true;
79 } 79 }
80 } while_each_thread(start, tsk); 80 if (ret)
81 break;
82 }
83 rcu_read_unlock();
81 84
82 return false; 85 return ret;
83} 86}
84#else 87#else
85static bool has_intersects_mems_allowed(struct task_struct *tsk, 88static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
97 */ 100 */
98struct task_struct *find_lock_task_mm(struct task_struct *p) 101struct task_struct *find_lock_task_mm(struct task_struct *p)
99{ 102{
100 struct task_struct *t = p; 103 struct task_struct *t;
101 104
102 do { 105 rcu_read_lock();
106
107 for_each_thread(p, t) {
103 task_lock(t); 108 task_lock(t);
104 if (likely(t->mm)) 109 if (likely(t->mm))
105 return t; 110 goto found;
106 task_unlock(t); 111 task_unlock(t);
107 } while_each_thread(p, t); 112 }
113 t = NULL;
114found:
115 rcu_read_unlock();
108 116
109 return NULL; 117 return t;
110} 118}
111 119
112/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
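find_lock_task_mm() above now walks the thread group with for_each_thread() under rcu_read_lock() and returns the first thread that still has an mm, with its task lock held. A simplified pthread sketch of that find-and-return-locked pattern; the task array and fields are stand-ins, not kernel structures:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct task {
        pthread_mutex_t lock;
        void *mm;                       /* NULL means "no address space" */
};

/* Returns the first task with an mm, still holding its lock, or NULL. */
static struct task *find_lock_task_with_mm(struct task *tasks, int n)
{
        for (int i = 0; i < n; i++) {
                pthread_mutex_lock(&tasks[i].lock);
                if (tasks[i].mm)
                        return &tasks[i];
                pthread_mutex_unlock(&tasks[i].lock);
        }
        return NULL;
}

int main(void)
{
        int dummy;
        struct task tasks[3] = {
                { PTHREAD_MUTEX_INITIALIZER, NULL },
                { PTHREAD_MUTEX_INITIALIZER, &dummy },
                { PTHREAD_MUTEX_INITIALIZER, NULL },
        };
        struct task *t = find_lock_task_with_mm(tasks, 3);

        printf("found task %ld\n", t ? (long)(t - tasks) : -1L);
        if (t)
                pthread_mutex_unlock(&t->lock);
        return 0;
}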
@@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
170 * implementation used by LSMs. 178 * implementation used by LSMs.
171 */ 179 */
172 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 180 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
173 adj -= 30; 181 points -= (points * 3) / 100;
174 182
175 /* Normalize to oom_score_adj units */ 183 /* Normalize to oom_score_adj units */
176 adj *= totalpages / 1000; 184 adj *= totalpages / 1000;
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
301 unsigned long chosen_points = 0; 309 unsigned long chosen_points = 0;
302 310
303 rcu_read_lock(); 311 rcu_read_lock();
304 do_each_thread(g, p) { 312 for_each_process_thread(g, p) {
305 unsigned int points; 313 unsigned int points;
306 314
307 switch (oom_scan_process_thread(p, totalpages, nodemask, 315 switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -319,11 +327,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 break; 327 break;
320 }; 328 };
321 points = oom_badness(p, NULL, nodemask, totalpages); 329 points = oom_badness(p, NULL, nodemask, totalpages);
322 if (points > chosen_points) { 330 if (!points || points < chosen_points)
323 chosen = p; 331 continue;
324 chosen_points = points; 332 /* Prefer thread group leaders for display purposes */
325 } 333 if (points == chosen_points && thread_group_leader(chosen))
326 } while_each_thread(g, p); 334 continue;
335
336 chosen = p;
337 chosen_points = points;
338 }
327 if (chosen) 339 if (chosen)
328 get_task_struct(chosen); 340 get_task_struct(chosen);
329 rcu_read_unlock(); 341 rcu_read_unlock();
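The select_bad_process() loop above skips zero scores, keeps the highest score seen so far, and on a tie prefers to keep an already-chosen thread group leader so the report names the group. A small sketch of that selection rule with made-up candidates:

#include <stdbool.h>
#include <stdio.h>

struct cand {
        const char *name;
        unsigned int points;
        bool leader;
};

int main(void)
{
        struct cand c[] = {
                { "leader",   40, true  },
                { "worker-1", 40, false },
                { "worker-2", 10, false },
        };
        struct cand *chosen = NULL;
        unsigned int chosen_points = 0;

        for (int i = 0; i < 3; i++) {
                unsigned int points = c[i].points;

                if (!points || points < chosen_points)
                        continue;
                /* On ties, keep the current choice if it is a group leader. */
                if (chosen && points == chosen_points && chosen->leader)
                        continue;
                chosen = &c[i];
                chosen_points = points;
        }
        printf("chosen: %s\n", chosen ? chosen->name : "none");
        return 0;
}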
@@ -406,7 +418,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
406{ 418{
407 struct task_struct *victim = p; 419 struct task_struct *victim = p;
408 struct task_struct *child; 420 struct task_struct *child;
409 struct task_struct *t = p; 421 struct task_struct *t;
410 struct mm_struct *mm; 422 struct mm_struct *mm;
411 unsigned int victim_points = 0; 423 unsigned int victim_points = 0;
412 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 424 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
437 * still freeing memory. 449 * still freeing memory.
438 */ 450 */
439 read_lock(&tasklist_lock); 451 read_lock(&tasklist_lock);
440 do { 452 for_each_thread(p, t) {
441 list_for_each_entry(child, &t->children, sibling) { 453 list_for_each_entry(child, &t->children, sibling) {
442 unsigned int child_points; 454 unsigned int child_points;
443 455
@@ -455,13 +467,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455 get_task_struct(victim); 467 get_task_struct(victim);
456 } 468 }
457 } 469 }
458 } while_each_thread(p, t); 470 }
459 read_unlock(&tasklist_lock); 471 read_unlock(&tasklist_lock);
460 472
461 rcu_read_lock();
462 p = find_lock_task_mm(victim); 473 p = find_lock_task_mm(victim);
463 if (!p) { 474 if (!p) {
464 rcu_read_unlock();
465 put_task_struct(victim); 475 put_task_struct(victim);
466 return; 476 return;
467 } else if (victim != p) { 477 } else if (victim != p) {
@@ -487,6 +497,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
487 * That thread will now get access to memory reserves since it has a 497 * That thread will now get access to memory reserves since it has a
488 * pending fatal signal. 498 * pending fatal signal.
489 */ 499 */
500 rcu_read_lock();
490 for_each_process(p) 501 for_each_process(p)
491 if (p->mm == mm && !same_thread_group(p, victim) && 502 if (p->mm == mm && !same_thread_group(p, victim) &&
492 !(p->flags & PF_KTHREAD)) { 503 !(p->flags & PF_KTHREAD)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63807583d8e8..2d30e2cfe804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -191,6 +191,26 @@ static unsigned long writeout_period_time = 0;
191 * global dirtyable memory first. 191 * global dirtyable memory first.
192 */ 192 */
193 193
194/**
195 * zone_dirtyable_memory - number of dirtyable pages in a zone
196 * @zone: the zone
197 *
198 * Returns the zone's number of pages potentially available for dirty
199 * page cache. This is the base value for the per-zone dirty limits.
200 */
201static unsigned long zone_dirtyable_memory(struct zone *zone)
202{
203 unsigned long nr_pages;
204
205 nr_pages = zone_page_state(zone, NR_FREE_PAGES);
206 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
207
208 nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
209 nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
210
211 return nr_pages;
212}
213
194static unsigned long highmem_dirtyable_memory(unsigned long total) 214static unsigned long highmem_dirtyable_memory(unsigned long total)
195{ 215{
196#ifdef CONFIG_HIGHMEM 216#ifdef CONFIG_HIGHMEM
@@ -198,11 +218,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
198 unsigned long x = 0; 218 unsigned long x = 0;
199 219
200 for_each_node_state(node, N_HIGH_MEMORY) { 220 for_each_node_state(node, N_HIGH_MEMORY) {
201 struct zone *z = 221 struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
202 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
203 222
204 x += zone_page_state(z, NR_FREE_PAGES) + 223 x += zone_dirtyable_memory(z);
205 zone_reclaimable_pages(z) - z->dirty_balance_reserve;
206 } 224 }
207 /* 225 /*
208 * Unreclaimable memory (kernel memory or anonymous memory 226 * Unreclaimable memory (kernel memory or anonymous memory
@@ -238,9 +256,12 @@ static unsigned long global_dirtyable_memory(void)
238{ 256{
239 unsigned long x; 257 unsigned long x;
240 258
241 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); 259 x = global_page_state(NR_FREE_PAGES);
242 x -= min(x, dirty_balance_reserve); 260 x -= min(x, dirty_balance_reserve);
243 261
262 x += global_page_state(NR_INACTIVE_FILE);
263 x += global_page_state(NR_ACTIVE_FILE);
264
244 if (!vm_highmem_is_dirtyable) 265 if (!vm_highmem_is_dirtyable)
245 x -= highmem_dirtyable_memory(x); 266 x -= highmem_dirtyable_memory(x);
246 267
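global_dirtyable_memory() above now counts free pages minus the dirty balance reserve plus the file LRU pages, instead of the old free-plus-reclaimable estimate. A back-of-the-envelope sketch of that formula with made-up page counts:

#include <stdio.h>

int main(void)
{
        unsigned long nr_free = 100000, reserve = 8000;
        unsigned long inactive_file = 30000, active_file = 20000;
        unsigned long x = nr_free;

        x -= (reserve < x) ? reserve : x;       /* x -= min(x, reserve): no underflow */
        x += inactive_file + active_file;

        printf("dirtyable pages: %lu\n", x);    /* 142000 with the numbers above */
        return 0;
}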
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
289} 310}
290 311
291/** 312/**
292 * zone_dirtyable_memory - number of dirtyable pages in a zone
293 * @zone: the zone
294 *
295 * Returns the zone's number of pages potentially available for dirty
296 * page cache. This is the base value for the per-zone dirty limits.
297 */
298static unsigned long zone_dirtyable_memory(struct zone *zone)
299{
300 /*
301 * The effective global number of dirtyable pages may exclude
302 * highmem as a big-picture measure to keep the ratio between
303 * dirty memory and lowmem reasonable.
304 *
305 * But this function is purely about the individual zone and a
306 * highmem zone can hold its share of dirty pages, so we don't
307 * care about vm_highmem_is_dirtyable here.
308 */
309 unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
310 zone_reclaimable_pages(zone);
311
312 /* don't allow this to underflow */
313 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
314 return nr_pages;
315}
316
317/**
318 * zone_dirty_limit - maximum number of dirty pages allowed in a zone 313 * zone_dirty_limit - maximum number of dirty pages allowed in a zone
319 * @zone: the zone 314 * @zone: the zone
320 * 315 *
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4..e3758a09a009 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
205}; 205};
206 206
207int min_free_kbytes = 1024; 207int min_free_kbytes = 1024;
208int user_min_free_kbytes; 208int user_min_free_kbytes = -1;
209 209
210static unsigned long __meminitdata nr_kernel_pages; 210static unsigned long __meminitdata nr_kernel_pages;
211static unsigned long __meminitdata nr_all_pages; 211static unsigned long __meminitdata nr_all_pages;
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
295} 295}
296#endif 296#endif
297 297
298static void bad_page(struct page *page) 298static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
299{ 299{
300 static unsigned long resume; 300 static unsigned long resume;
301 static unsigned long nr_shown; 301 static unsigned long nr_shown;
@@ -329,7 +329,7 @@ static void bad_page(struct page *page)
329 329
330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
331 current->comm, page_to_pfn(page)); 331 current->comm, page_to_pfn(page));
332 dump_page(page); 332 dump_page_badflags(page, reason, bad_flags);
333 333
334 print_modules(); 334 print_modules();
335 dump_stack(); 335 dump_stack();
@@ -383,7 +383,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
383 int bad = 0; 383 int bad = 0;
384 384
385 if (unlikely(compound_order(page) != order)) { 385 if (unlikely(compound_order(page) != order)) {
386 bad_page(page); 386 bad_page(page, "wrong compound order", 0);
387 bad++; 387 bad++;
388 } 388 }
389 389
@@ -392,8 +392,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
392 for (i = 1; i < nr_pages; i++) { 392 for (i = 1; i < nr_pages; i++) {
393 struct page *p = page + i; 393 struct page *p = page + i;
394 394
395 if (unlikely(!PageTail(p) || (p->first_page != page))) { 395 if (unlikely(!PageTail(p))) {
396 bad_page(page); 396 bad_page(page, "PageTail not set", 0);
397 bad++;
398 } else if (unlikely(p->first_page != page)) {
399 bad_page(page, "first_page not consistent", 0);
397 bad++; 400 bad++;
398 } 401 }
399 __ClearPageTail(p); 402 __ClearPageTail(p);
@@ -506,12 +509,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
506 return 0; 509 return 0;
507 510
508 if (page_is_guard(buddy) && page_order(buddy) == order) { 511 if (page_is_guard(buddy) && page_order(buddy) == order) {
509 VM_BUG_ON(page_count(buddy) != 0); 512 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
510 return 1; 513 return 1;
511 } 514 }
512 515
513 if (PageBuddy(buddy) && page_order(buddy) == order) { 516 if (PageBuddy(buddy) && page_order(buddy) == order) {
514 VM_BUG_ON(page_count(buddy) != 0); 517 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
515 return 1; 518 return 1;
516 } 519 }
517 return 0; 520 return 0;
@@ -561,8 +564,8 @@ static inline void __free_one_page(struct page *page,
561 564
562 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
563 566
564 VM_BUG_ON(page_idx & ((1 << order) - 1)); 567 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
565 VM_BUG_ON(bad_range(zone, page)); 568 VM_BUG_ON_PAGE(bad_range(zone, page), page);
566 569
567 while (order < MAX_ORDER-1) { 570 while (order < MAX_ORDER-1) {
568 buddy_idx = __find_buddy_index(page_idx, order); 571 buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +621,23 @@ out:
618 621
619static inline int free_pages_check(struct page *page) 622static inline int free_pages_check(struct page *page)
620{ 623{
621 if (unlikely(page_mapcount(page) | 624 char *bad_reason = NULL;
622 (page->mapping != NULL) | 625 unsigned long bad_flags = 0;
623 (atomic_read(&page->_count) != 0) | 626
624 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 627 if (unlikely(page_mapcount(page)))
625 (mem_cgroup_bad_page_check(page)))) { 628 bad_reason = "nonzero mapcount";
626 bad_page(page); 629 if (unlikely(page->mapping != NULL))
630 bad_reason = "non-NULL mapping";
631 if (unlikely(atomic_read(&page->_count) != 0))
632 bad_reason = "nonzero _count";
633 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
634 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
635 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
636 }
637 if (unlikely(mem_cgroup_bad_page_check(page)))
638 bad_reason = "cgroup check failed";
639 if (unlikely(bad_reason)) {
640 bad_page(page, bad_reason, bad_flags);
627 return 1; 641 return 1;
628 } 642 }
629 page_cpupid_reset_last(page); 643 page_cpupid_reset_last(page);
@@ -813,7 +827,7 @@ static inline void expand(struct zone *zone, struct page *page,
813 area--; 827 area--;
814 high--; 828 high--;
815 size >>= 1; 829 size >>= 1;
816 VM_BUG_ON(bad_range(zone, &page[size])); 830 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
817 831
818#ifdef CONFIG_DEBUG_PAGEALLOC 832#ifdef CONFIG_DEBUG_PAGEALLOC
819 if (high < debug_guardpage_minorder()) { 833 if (high < debug_guardpage_minorder()) {
@@ -843,12 +857,23 @@ static inline void expand(struct zone *zone, struct page *page,
843 */ 857 */
844static inline int check_new_page(struct page *page) 858static inline int check_new_page(struct page *page)
845{ 859{
846 if (unlikely(page_mapcount(page) | 860 char *bad_reason = NULL;
847 (page->mapping != NULL) | 861 unsigned long bad_flags = 0;
848 (atomic_read(&page->_count) != 0) | 862
849 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 863 if (unlikely(page_mapcount(page)))
850 (mem_cgroup_bad_page_check(page)))) { 864 bad_reason = "nonzero mapcount";
851 bad_page(page); 865 if (unlikely(page->mapping != NULL))
866 bad_reason = "non-NULL mapping";
867 if (unlikely(atomic_read(&page->_count) != 0))
868 bad_reason = "nonzero _count";
869 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
870 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
871 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
872 }
873 if (unlikely(mem_cgroup_bad_page_check(page)))
874 bad_reason = "cgroup check failed";
875 if (unlikely(bad_reason)) {
876 bad_page(page, bad_reason, bad_flags);
852 return 1; 877 return 1;
853 } 878 }
854 return 0; 879 return 0;
@@ -955,7 +980,7 @@ int move_freepages(struct zone *zone,
955 980
956 for (page = start_page; page <= end_page;) { 981 for (page = start_page; page <= end_page;) {
957 /* Make sure we are not inadvertently changing nodes */ 982 /* Make sure we are not inadvertently changing nodes */
958 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 983 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
959 984
960 if (!pfn_valid_within(page_to_pfn(page))) { 985 if (!pfn_valid_within(page_to_pfn(page))) {
961 page++; 986 page++;
@@ -1404,8 +1429,8 @@ void split_page(struct page *page, unsigned int order)
1404{ 1429{
1405 int i; 1430 int i;
1406 1431
1407 VM_BUG_ON(PageCompound(page)); 1432 VM_BUG_ON_PAGE(PageCompound(page), page);
1408 VM_BUG_ON(!page_count(page)); 1433 VM_BUG_ON_PAGE(!page_count(page), page);
1409 1434
1410#ifdef CONFIG_KMEMCHECK 1435#ifdef CONFIG_KMEMCHECK
1411 /* 1436 /*
@@ -1552,7 +1577,7 @@ again:
1552 zone_statistics(preferred_zone, zone, gfp_flags); 1577 zone_statistics(preferred_zone, zone, gfp_flags);
1553 local_irq_restore(flags); 1578 local_irq_restore(flags);
1554 1579
1555 VM_BUG_ON(bad_range(zone, page)); 1580 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1556 if (prep_new_page(page, order, gfp_flags)) 1581 if (prep_new_page(page, order, gfp_flags))
1557 goto again; 1582 goto again;
1558 return page; 1583 return page;
@@ -2072,13 +2097,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2072 return; 2097 return;
2073 2098
2074 /* 2099 /*
2075 * Walking all memory to count page types is very expensive and should
2076 * be inhibited in non-blockable contexts.
2077 */
2078 if (!(gfp_mask & __GFP_WAIT))
2079 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2080
2081 /*
2082 * This documents exceptions given to allocations in certain 2100 * This documents exceptions given to allocations in certain
2083 * contexts that are allowed to allocate outside current's set 2101 * contexts that are allowed to allocate outside current's set
2084 * of allowed nodes. 2102 * of allowed nodes.
@@ -2242,10 +2260,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2242 preferred_zone, migratetype); 2260 preferred_zone, migratetype);
2243 if (page) { 2261 if (page) {
2244 preferred_zone->compact_blockskip_flush = false; 2262 preferred_zone->compact_blockskip_flush = false;
2245 preferred_zone->compact_considered = 0; 2263 compaction_defer_reset(preferred_zone, order, true);
2246 preferred_zone->compact_defer_shift = 0;
2247 if (order >= preferred_zone->compact_order_failed)
2248 preferred_zone->compact_order_failed = order + 1;
2249 count_vm_event(COMPACTSUCCESS); 2264 count_vm_event(COMPACTSUCCESS);
2250 return page; 2265 return page;
2251 } 2266 }
@@ -2535,8 +2550,15 @@ rebalance:
2535 } 2550 }
2536 2551
2537 /* Atomic allocations - we can't balance anything */ 2552 /* Atomic allocations - we can't balance anything */
2538 if (!wait) 2553 if (!wait) {
2554 /*
2555 * All existing users of the deprecated __GFP_NOFAIL are
2556 * blockable, so warn of any new users that actually allow this
2557 * type of allocation to fail.
2558 */
2559 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
2539 goto nopage; 2560 goto nopage;
2561 }
2540 2562
2541 /* Avoid recursion of direct reclaim */ 2563 /* Avoid recursion of direct reclaim */
2542 if (current->flags & PF_MEMALLOC) 2564 if (current->flags & PF_MEMALLOC)
@@ -3901,6 +3923,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3901 struct page *page; 3923 struct page *page;
3902 unsigned long block_migratetype; 3924 unsigned long block_migratetype;
3903 int reserve; 3925 int reserve;
3926 int old_reserve;
3904 3927
3905 /* 3928 /*
3906 * Get the start pfn, end pfn and the number of blocks to reserve 3929 * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3945,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3922 * future allocation of hugepages at runtime. 3945 * future allocation of hugepages at runtime.
3923 */ 3946 */
3924 reserve = min(2, reserve); 3947 reserve = min(2, reserve);
3948 old_reserve = zone->nr_migrate_reserve_block;
3949
3950 /* On memory hot-add, we almost always need to do nothing */
3951 if (reserve == old_reserve)
3952 return;
3953 zone->nr_migrate_reserve_block = reserve;
3925 3954
3926 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3955 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3927 if (!pfn_valid(pfn)) 3956 if (!pfn_valid(pfn))
@@ -3959,6 +3988,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3959 reserve--; 3988 reserve--;
3960 continue; 3989 continue;
3961 } 3990 }
3991 } else if (!old_reserve) {
3992 /*
3993 * At boot time we don't need to scan the whole zone
3994 * for turning off MIGRATE_RESERVE.
3995 */
3996 break;
3962 } 3997 }
3963 3998
3964 /* 3999 /*
@@ -4209,7 +4244,6 @@ static noinline __init_refok
4209int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4244int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4210{ 4245{
4211 int i; 4246 int i;
4212 struct pglist_data *pgdat = zone->zone_pgdat;
4213 size_t alloc_size; 4247 size_t alloc_size;
4214 4248
4215 /* 4249 /*
@@ -4225,7 +4259,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4225 4259
4226 if (!slab_is_available()) { 4260 if (!slab_is_available()) {
4227 zone->wait_table = (wait_queue_head_t *) 4261 zone->wait_table = (wait_queue_head_t *)
4228 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4262 memblock_virt_alloc_node_nopanic(
4263 alloc_size, zone->zone_pgdat->node_id);
4229 } else { 4264 } else {
4230 /* 4265 /*
4231 * This case means that a zone whose size was 0 gets new memory 4266 * This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4380,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4345#endif 4380#endif
4346 4381
4347/** 4382/**
4348 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4383 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
4349 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4384 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4350 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4385 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
4351 * 4386 *
4352 * If an architecture guarantees that all ranges registered with 4387 * If an architecture guarantees that all ranges registered with
4353 * add_active_ranges() contain no holes and may be freed, this 4388 * add_active_ranges() contain no holes and may be freed, this
4354 * this function may be used instead of calling free_bootmem() manually. 4389 * this function may be used instead of calling memblock_free_early_nid()
4390 * manually.
4355 */ 4391 */
4356void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4392void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4357{ 4393{
@@ -4363,9 +4399,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4363 end_pfn = min(end_pfn, max_low_pfn); 4399 end_pfn = min(end_pfn, max_low_pfn);
4364 4400
4365 if (start_pfn < end_pfn) 4401 if (start_pfn < end_pfn)
4366 free_bootmem_node(NODE_DATA(this_nid), 4402 memblock_free_early_nid(PFN_PHYS(start_pfn),
4367 PFN_PHYS(start_pfn), 4403 (end_pfn - start_pfn) << PAGE_SHIFT,
4368 (end_pfn - start_pfn) << PAGE_SHIFT); 4404 this_nid);
4369 } 4405 }
4370} 4406}
4371 4407
@@ -4636,8 +4672,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4636 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4672 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4637 zone->pageblock_flags = NULL; 4673 zone->pageblock_flags = NULL;
4638 if (usemapsize) 4674 if (usemapsize)
4639 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4675 zone->pageblock_flags =
4640 usemapsize); 4676 memblock_virt_alloc_node_nopanic(usemapsize,
4677 pgdat->node_id);
4641} 4678}
4642#else 4679#else
4643static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4680static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4868,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4831 size = (end - start) * sizeof(struct page); 4868 size = (end - start) * sizeof(struct page);
4832 map = alloc_remap(pgdat->node_id, size); 4869 map = alloc_remap(pgdat->node_id, size);
4833 if (!map) 4870 if (!map)
4834 map = alloc_bootmem_node_nopanic(pgdat, size); 4871 map = memblock_virt_alloc_node_nopanic(size,
4872 pgdat->node_id);
4835 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4873 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4836 } 4874 }
4837#ifndef CONFIG_NEED_MULTIPLE_NODES 4875#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5050,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5012 nodemask_t saved_node_state = node_states[N_MEMORY]; 5050 nodemask_t saved_node_state = node_states[N_MEMORY];
5013 unsigned long totalpages = early_calculate_totalpages(); 5051 unsigned long totalpages = early_calculate_totalpages();
5014 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5052 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5053 struct memblock_type *type = &memblock.memory;
5054
5055 /* Need to find movable_zone earlier when movable_node is specified. */
5056 find_usable_zone_for_movable();
5057
5058 /*
5059 * If movable_node is specified, ignore kernelcore and movablecore
5060 * options.
5061 */
5062 if (movable_node_is_enabled()) {
5063 for (i = 0; i < type->cnt; i++) {
5064 if (!memblock_is_hotpluggable(&type->regions[i]))
5065 continue;
5066
5067 nid = type->regions[i].nid;
5068
5069 usable_startpfn = PFN_DOWN(type->regions[i].base);
5070 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5071 min(usable_startpfn, zone_movable_pfn[nid]) :
5072 usable_startpfn;
5073 }
5074
5075 goto out2;
5076 }
5015 5077
5016 /* 5078 /*
5017 * If movablecore was specified, calculate what size of 5079 * If movablecore=nn[KMG] was specified, calculate what size of
5018 * kernelcore that corresponds so that memory usable for 5080 * kernelcore that corresponds so that memory usable for
5019 * any allocation type is evenly spread. If both kernelcore 5081 * any allocation type is evenly spread. If both kernelcore
5020 * and movablecore are specified, then the value of kernelcore 5082 * and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5102,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5040 goto out; 5102 goto out;
5041 5103
5042 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5104 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5043 find_usable_zone_for_movable();
5044 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5105 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5045 5106
5046restart: 5107restart:
@@ -5131,6 +5192,7 @@ restart:
5131 if (usable_nodes && required_kernelcore > usable_nodes) 5192 if (usable_nodes && required_kernelcore > usable_nodes)
5132 goto restart; 5193 goto restart;
5133 5194
5195out2:
5134 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5196 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5135 for (nid = 0; nid < MAX_NUMNODES; nid++) 5197 for (nid = 0; nid < MAX_NUMNODES; nid++)
5136 zone_movable_pfn[nid] = 5198 zone_movable_pfn[nid] =
@@ -5692,7 +5754,12 @@ module_init(init_per_zone_wmark_min)
5692int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5754int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5693 void __user *buffer, size_t *length, loff_t *ppos) 5755 void __user *buffer, size_t *length, loff_t *ppos)
5694{ 5756{
5695 proc_dointvec(table, write, buffer, length, ppos); 5757 int rc;
5758
5759 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5760 if (rc)
5761 return rc;
5762
5696 if (write) { 5763 if (write) {
5697 user_min_free_kbytes = min_free_kbytes; 5764 user_min_free_kbytes = min_free_kbytes;
5698 setup_per_zone_wmarks(); 5765 setup_per_zone_wmarks();
@@ -5857,7 +5924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
5857 do { 5924 do {
5858 size = bucketsize << log2qty; 5925 size = bucketsize << log2qty;
5859 if (flags & HASH_EARLY) 5926 if (flags & HASH_EARLY)
5860 table = alloc_bootmem_nopanic(size); 5927 table = memblock_virt_alloc_nopanic(size, 0);
5861 else if (hashdist) 5928 else if (hashdist)
5862 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5929 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5863 else { 5930 else {
@@ -5959,7 +6026,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5959 pfn = page_to_pfn(page); 6026 pfn = page_to_pfn(page);
5960 bitmap = get_pageblock_bitmap(zone, pfn); 6027 bitmap = get_pageblock_bitmap(zone, pfn);
5961 bitidx = pfn_to_bitidx(zone, pfn); 6028 bitidx = pfn_to_bitidx(zone, pfn);
5962 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6029 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
5963 6030
5964 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6031 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5965 if (flags & value) 6032 if (flags & value)
@@ -6457,12 +6524,24 @@ static void dump_page_flags(unsigned long flags)
6457 printk(")\n"); 6524 printk(")\n");
6458} 6525}
6459 6526
6460void dump_page(struct page *page) 6527void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
6461{ 6528{
6462 printk(KERN_ALERT 6529 printk(KERN_ALERT
6463 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6530 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6464 page, atomic_read(&page->_count), page_mapcount(page), 6531 page, atomic_read(&page->_count), page_mapcount(page),
6465 page->mapping, page->index); 6532 page->mapping, page->index);
6466 dump_page_flags(page->flags); 6533 dump_page_flags(page->flags);
6534 if (reason)
6535 pr_alert("page dumped because: %s\n", reason);
6536 if (page->flags & badflags) {
6537 pr_alert("bad because of flags:\n");
6538 dump_page_flags(page->flags & badflags);
6539 }
6467 mem_cgroup_print_bad_page(page); 6540 mem_cgroup_print_bad_page(page);
6468} 6541}
6542
6543void dump_page(struct page *page, char *reason)
6544{
6545 dump_page_badflags(page, reason, 0);
6546}
6547EXPORT_SYMBOL_GPL(dump_page);
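
The dump_page() rework above splits the helper in two: dump_page_badflags() prints the usual page state plus whichever flag bits the caller considers bad, and dump_page() becomes a thin wrapper passing a zero mask. A minimal user-space sketch of that split; the types and flag values are made up for illustration and only stand in for struct page and its flags:

    /*
     * Sketch only: fake_page and the flag values below are invented
     * stand-ins for struct page, not kernel definitions.
     */
    #include <stdio.h>

    struct fake_page {
        unsigned long flags;
        int count;
        int mapcount;
    };

    static void dump_flags(unsigned long flags)
    {
        printf("flags: %#lx\n", flags);
    }

    static void dump_page_badflags(struct fake_page *page, const char *reason,
                                   unsigned long badflags)
    {
        printf("page:%p count:%d mapcount:%d\n",
               (void *)page, page->count, page->mapcount);
        dump_flags(page->flags);
        if (reason)
            printf("page dumped because: %s\n", reason);
        if (page->flags & badflags) {
            printf("bad because of flags:\n");
            dump_flags(page->flags & badflags);
        }
    }

    static void dump_page(struct fake_page *page, const char *reason)
    {
        /* the plain dump is just the badflags variant with no bad bits */
        dump_page_badflags(page, reason, 0);
    }

    int main(void)
    {
        struct fake_page p = { .flags = 0x5, .count = 1, .mapcount = 0 };

        dump_page(&p, "demo");
        dump_page_badflags(&p, "demo with badflags", 0x4);
        return 0;
    }
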
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a872a..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
54 54
55 table_size = sizeof(struct page_cgroup) * nr_pages; 55 table_size = sizeof(struct page_cgroup) * nr_pages;
56 56
57 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
59 if (!base) 60 if (!base)
60 return -ENOMEM; 61 return -ENOMEM;
61 NODE_DATA(nid)->node_page_cgroup = base; 62 NODE_DATA(nid)->node_page_cgroup = base;
@@ -451,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
451 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry 452 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
452 * @ent: swap entry to be looked up. 453 * @ent: swap entry to be looked up.
453 * 454 *
454 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) 455 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
455 */ 456 */
456unsigned short lookup_swap_cgroup_id(swp_entry_t ent) 457unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
457{ 458{
diff --git a/mm/page_io.c b/mm/page_io.c
index 8c79a4764be0..7c59ef681381 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
31 31
32 bio = bio_alloc(gfp_flags, 1); 32 bio = bio_alloc(gfp_flags, 1);
33 if (bio) { 33 if (bio) {
34 bio->bi_sector = map_swap_page(page, &bio->bi_bdev); 34 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
35 bio->bi_sector <<= PAGE_SHIFT - 9; 35 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
36 bio->bi_io_vec[0].bv_page = page; 36 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 37 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0; 38 bio->bi_io_vec[0].bv_offset = 0;
39 bio->bi_vcnt = 1; 39 bio->bi_vcnt = 1;
40 bio->bi_size = PAGE_SIZE; 40 bio->bi_iter.bi_size = PAGE_SIZE;
41 bio->bi_end_io = end_io; 41 bio->bi_end_io = end_io;
42 } 42 }
43 return bio; 43 return bio;
@@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err)
62 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", 62 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
63 imajor(bio->bi_bdev->bd_inode), 63 imajor(bio->bi_bdev->bd_inode),
64 iminor(bio->bi_bdev->bd_inode), 64 iminor(bio->bi_bdev->bd_inode),
65 (unsigned long long)bio->bi_sector); 65 (unsigned long long)bio->bi_iter.bi_sector);
66 ClearPageReclaim(page); 66 ClearPageReclaim(page);
67 } 67 }
68 end_page_writeback(page); 68 end_page_writeback(page);
@@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err)
80 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", 80 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
81 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
82 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
83 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_iter.bi_sector);
84 goto out; 84 goto out;
85 } 85 }
86 86
@@ -320,8 +320,8 @@ int swap_readpage(struct page *page)
320 int ret = 0; 320 int ret = 0;
321 struct swap_info_struct *sis = page_swap_info(page); 321 struct swap_info_struct *sis = page_swap_info(page);
322 322
323 VM_BUG_ON(!PageLocked(page)); 323 VM_BUG_ON_PAGE(!PageLocked(page), page);
324 VM_BUG_ON(PageUptodate(page)); 324 VM_BUG_ON_PAGE(PageUptodate(page), page);
325 if (frontswap_load(page) == 0) { 325 if (frontswap_load(page) == 0) {
326 SetPageUptodate(page); 326 SetPageUptodate(page);
327 unlock_page(page); 327 unlock_page(page);
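
The page_io.c hunks track the block layer's move of the per-bio position fields into bio->bi_iter. A cut-down user-space sketch of that layout change and of the page-to-sector shift used in get_swap_bio(); the struct and field names are simplified stand-ins, not the real block-layer definitions:

    /*
     * Sketch only: bvec_iter_sketch/bio_sketch are trimmed stand-ins for
     * the block-layer structures; PAGE_SHIFT is assumed to be 12 here.
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct bvec_iter_sketch {
        unsigned long long bi_sector;   /* device sector the bio starts at */
        unsigned int bi_size;           /* bytes left to transfer */
    };

    struct bio_sketch {
        struct bvec_iter_sketch bi_iter; /* was: bi_sector/bi_size directly */
    };

    int main(void)
    {
        struct bio_sketch bio;
        unsigned long long swap_page_no = 42; /* page-sized slot on the device */

        /* same conversion as get_swap_bio(): pages -> 512-byte sectors */
        bio.bi_iter.bi_sector = swap_page_no << (PAGE_SHIFT - 9);
        bio.bi_iter.bi_size = 1U << PAGE_SHIFT;

        printf("sector %llu, %u bytes\n",
               bio.bi_iter.bi_sector, bio.bi_iter.bi_size);
        return 0;
    }
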
diff --git a/mm/percpu.c b/mm/percpu.c
index 0d10defe951e..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1063 __alignof__(ai->groups[0].cpu_map[0])); 1063 __alignof__(ai->groups[0].cpu_map[0]));
1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); 1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1065 1065
1066 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); 1066 ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
1067 if (!ptr) 1067 if (!ptr)
1068 return NULL; 1068 return NULL;
1069 ai = ptr; 1069 ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1088 */ 1088 */
1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) 1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1090{ 1090{
1091 free_bootmem(__pa(ai), ai->__ai_size); 1091 memblock_free_early(__pa(ai), ai->__ai_size);
1092} 1092}
1093 1093
1094/** 1094/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1247 1247
1248 /* process group information and build config tables accordingly */ 1248 /* process group information and build config tables accordingly */
1249 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1249 group_offsets = memblock_virt_alloc(ai->nr_groups *
1250 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); 1250 sizeof(group_offsets[0]), 0);
1251 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); 1251 group_sizes = memblock_virt_alloc(ai->nr_groups *
1252 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1252 sizeof(group_sizes[0]), 0);
1253 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1254 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1253 1255
1254 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1256 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1255 unit_map[cpu] = UINT_MAX; 1257 unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1311 * empty chunks. 1313 * empty chunks.
1312 */ 1314 */
1313 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1315 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1314 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1316 pcpu_slot = memblock_virt_alloc(
1317 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1315 for (i = 0; i < pcpu_nr_slots; i++) 1318 for (i = 0; i < pcpu_nr_slots; i++)
1316 INIT_LIST_HEAD(&pcpu_slot[i]); 1319 INIT_LIST_HEAD(&pcpu_slot[i]);
1317 1320
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1322 * covers static area + reserved area (mostly used for module 1325 * covers static area + reserved area (mostly used for module
1323 * static percpu allocation). 1326 * static percpu allocation).
1324 */ 1327 */
1325 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1328 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1326 INIT_LIST_HEAD(&schunk->list); 1329 INIT_LIST_HEAD(&schunk->list);
1327 schunk->base_addr = base_addr; 1330 schunk->base_addr = base_addr;
1328 schunk->map = smap; 1331 schunk->map = smap;
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1346 1349
1347 /* init dynamic chunk if necessary */ 1350 /* init dynamic chunk if necessary */
1348 if (dyn_size) { 1351 if (dyn_size) {
1349 dchunk = alloc_bootmem(pcpu_chunk_struct_size); 1352 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1350 INIT_LIST_HEAD(&dchunk->list); 1353 INIT_LIST_HEAD(&dchunk->list);
1351 dchunk->base_addr = base_addr; 1354 dchunk->base_addr = base_addr;
1352 dchunk->map = dmap; 1355 dchunk->map = dmap;
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1626 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; 1629 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1627 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); 1630 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1628 1631
1629 areas = alloc_bootmem_nopanic(areas_size); 1632 areas = memblock_virt_alloc_nopanic(areas_size, 0);
1630 if (!areas) { 1633 if (!areas) {
1631 rc = -ENOMEM; 1634 rc = -ENOMEM;
1632 goto out_free; 1635 goto out_free;
@@ -1686,10 +1689,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1686 max_distance += ai->unit_size; 1689 max_distance += ai->unit_size;
1687 1690
1688 /* warn if maximum distance is further than 75% of vmalloc space */ 1691 /* warn if maximum distance is further than 75% of vmalloc space */
1689 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1692 if (max_distance > VMALLOC_TOTAL * 3 / 4) {
1690 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1693 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1691 "space 0x%lx\n", max_distance, 1694 "space 0x%lx\n", max_distance,
1692 (unsigned long)(VMALLOC_END - VMALLOC_START)); 1695 VMALLOC_TOTAL);
1693#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1696#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1694 /* and fail if we have fallback */ 1697 /* and fail if we have fallback */
1695 rc = -EINVAL; 1698 rc = -EINVAL;
@@ -1712,7 +1715,7 @@ out_free_areas:
1712out_free: 1715out_free:
1713 pcpu_free_alloc_info(ai); 1716 pcpu_free_alloc_info(ai);
1714 if (areas) 1717 if (areas)
1715 free_bootmem(__pa(areas), areas_size); 1718 memblock_free_early(__pa(areas), areas_size);
1716 return rc; 1719 return rc;
1717} 1720}
1718#endif /* BUILD_EMBED_FIRST_CHUNK */ 1721#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
1760 /* unaligned allocations can't be freed, round up to page size */ 1763 /* unaligned allocations can't be freed, round up to page size */
1761 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * 1764 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1762 sizeof(pages[0])); 1765 sizeof(pages[0]));
1763 pages = alloc_bootmem(pages_size); 1766 pages = memblock_virt_alloc(pages_size, 0);
1764 1767
1765 /* allocate pages */ 1768 /* allocate pages */
1766 j = 0; 1769 j = 0;
@@ -1823,7 +1826,7 @@ enomem:
1823 free_fn(page_address(pages[j]), PAGE_SIZE); 1826 free_fn(page_address(pages[j]), PAGE_SIZE);
1824 rc = -ENOMEM; 1827 rc = -ENOMEM;
1825out_free_ar: 1828out_free_ar:
1826 free_bootmem(__pa(pages), pages_size); 1829 memblock_free_early(__pa(pages), pages_size);
1827 pcpu_free_alloc_info(ai); 1830 pcpu_free_alloc_info(ai);
1828 return rc; 1831 return rc;
1829} 1832}
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
1848static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, 1851static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1849 size_t align) 1852 size_t align)
1850{ 1853{
1851 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 1854 return memblock_virt_alloc_from_nopanic(
1855 size, align, __pa(MAX_DMA_ADDRESS));
1852} 1856}
1853 1857
1854static void __init pcpu_dfl_fc_free(void *ptr, size_t size) 1858static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1855{ 1859{
1856 free_bootmem(__pa(ptr), size); 1860 memblock_free_early(__pa(ptr), size);
1857} 1861}
1858 1862
1859void __init setup_per_cpu_areas(void) 1863void __init setup_per_cpu_areas(void)
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void)
1896 void *fc; 1900 void *fc;
1897 1901
1898 ai = pcpu_alloc_alloc_info(1, 1); 1902 ai = pcpu_alloc_alloc_info(1, 1);
1899 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1903 fc = memblock_virt_alloc_from_nopanic(unit_size,
1904 PAGE_SIZE,
1905 __pa(MAX_DMA_ADDRESS));
1900 if (!ai || !fc) 1906 if (!ai || !fc)
1901 panic("Failed to allocate memory for percpu areas."); 1907 panic("Failed to allocate memory for percpu areas.");
1902 /* kmemleak tracks the percpu allocations separately */ 1908 /* kmemleak tracks the percpu allocations separately */
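
The percpu changes above are a mechanical switch from the bootmem wrappers to their memblock counterparts; as the diff shows, the _nopanic variants may return NULL and are checked, while the plain variants are treated as infallible. A rough user-space model of that convention (the allocator internals here are obviously stand-ins, not memblock):

    /*
     * Sketch only: early_alloc*() are invented helpers modelling the
     * "checked _nopanic vs. infallible plain" split seen in the hunks.
     */
    #include <stdio.h>
    #include <stdlib.h>

    static void *early_alloc_nopanic(size_t size)
    {
        return calloc(1, size);     /* may fail: caller must check */
    }

    static void *early_alloc(size_t size)
    {
        void *p = early_alloc_nopanic(size);

        if (!p) {
            fprintf(stderr, "early allocation of %zu bytes failed\n", size);
            exit(1);                /* the kernel variant panics instead */
        }
        return p;
    }

    int main(void)
    {
        void *ai = early_alloc_nopanic(128); /* optional: check for NULL */
        void *slots = early_alloc(4096);     /* assumed to never fail */

        free(ai);
        free(slots);
        return 0;
    }
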
diff --git a/mm/readahead.c b/mm/readahead.c
index 7cdbb44aa90b..0de2360d65f3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -211,8 +211,6 @@ out:
211int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 211int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
212 pgoff_t offset, unsigned long nr_to_read) 212 pgoff_t offset, unsigned long nr_to_read)
213{ 213{
214 int ret = 0;
215
216 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 214 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
217 return -EINVAL; 215 return -EINVAL;
218 216
@@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
226 this_chunk = nr_to_read; 224 this_chunk = nr_to_read;
227 err = __do_page_cache_readahead(mapping, filp, 225 err = __do_page_cache_readahead(mapping, filp,
228 offset, this_chunk, 0); 226 offset, this_chunk, 0);
229 if (err < 0) { 227 if (err < 0)
230 ret = err; 228 return err;
231 break; 229
232 }
233 ret += err;
234 offset += this_chunk; 230 offset += this_chunk;
235 nr_to_read -= this_chunk; 231 nr_to_read -= this_chunk;
236 } 232 }
237 return ret; 233 return 0;
238} 234}
239 235
240/* 236/*
@@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
576 if (!mapping || !mapping->a_ops) 572 if (!mapping || !mapping->a_ops)
577 return -EINVAL; 573 return -EINVAL;
578 574
579 force_page_cache_readahead(mapping, filp, index, nr); 575 return force_page_cache_readahead(mapping, filp, index, nr);
580 return 0;
581} 576}
582 577
583SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) 578SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
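
After this change force_page_cache_readahead() reports the first chunk that fails instead of summing per-chunk results, and do_readahead() propagates that value to the readahead() syscall. The chunked loop reduces to the shape below; read_chunk() is a hypothetical stand-in for __do_page_cache_readahead(), and the chunk size is only an assumption for the example:

    /*
     * Sketch only: MAX_CHUNK and read_chunk() are illustrative, not the
     * kernel's actual chunking constant or readahead implementation.
     */
    #include <stdio.h>

    #define MAX_CHUNK 512           /* pages per submitted chunk */

    static int read_chunk(unsigned long offset, unsigned long nr)
    {
        printf("readahead %lu pages at %lu\n", nr, offset);
        return 0;                   /* pretend the I/O always succeeds */
    }

    static int force_readahead(unsigned long offset, unsigned long nr_to_read)
    {
        while (nr_to_read) {
            unsigned long this_chunk = MAX_CHUNK;
            int err;

            if (this_chunk > nr_to_read)
                this_chunk = nr_to_read;

            err = read_chunk(offset, this_chunk);
            if (err < 0)
                return err;         /* first failure wins */

            offset += this_chunk;
            nr_to_read -= this_chunk;
        }
        return 0;
    }

    int main(void)
    {
        return force_readahead(0, 1200) ? 1 : 0;
    }
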
diff --git a/mm/rmap.c b/mm/rmap.c
index 068522d8502a..d9d42316a99a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
660 return 1; 660 return 1;
661} 661}
662 662
663struct page_referenced_arg {
664 int mapcount;
665 int referenced;
666 unsigned long vm_flags;
667 struct mem_cgroup *memcg;
668};
663/* 669/*
664 * Subfunctions of page_referenced: page_referenced_one called 670 * arg: page_referenced_arg will be passed
665 * repeatedly from either page_referenced_anon or page_referenced_file.
666 */ 671 */
667int page_referenced_one(struct page *page, struct vm_area_struct *vma, 672int page_referenced_one(struct page *page, struct vm_area_struct *vma,
668 unsigned long address, unsigned int *mapcount, 673 unsigned long address, void *arg)
669 unsigned long *vm_flags)
670{ 674{
671 struct mm_struct *mm = vma->vm_mm; 675 struct mm_struct *mm = vma->vm_mm;
672 spinlock_t *ptl; 676 spinlock_t *ptl;
673 int referenced = 0; 677 int referenced = 0;
678 struct page_referenced_arg *pra = arg;
674 679
675 if (unlikely(PageTransHuge(page))) { 680 if (unlikely(PageTransHuge(page))) {
676 pmd_t *pmd; 681 pmd_t *pmd;
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
682 pmd = page_check_address_pmd(page, mm, address, 687 pmd = page_check_address_pmd(page, mm, address,
683 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 688 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
684 if (!pmd) 689 if (!pmd)
685 goto out; 690 return SWAP_AGAIN;
686 691
687 if (vma->vm_flags & VM_LOCKED) { 692 if (vma->vm_flags & VM_LOCKED) {
688 spin_unlock(ptl); 693 spin_unlock(ptl);
689 *mapcount = 0; /* break early from loop */ 694 pra->vm_flags |= VM_LOCKED;
690 *vm_flags |= VM_LOCKED; 695 return SWAP_FAIL; /* To break the loop */
691 goto out;
692 } 696 }
693 697
694 /* go ahead even if the pmd is pmd_trans_splitting() */ 698 /* go ahead even if the pmd is pmd_trans_splitting() */
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
704 */ 708 */
705 pte = page_check_address(page, mm, address, &ptl, 0); 709 pte = page_check_address(page, mm, address, &ptl, 0);
706 if (!pte) 710 if (!pte)
707 goto out; 711 return SWAP_AGAIN;
708 712
709 if (vma->vm_flags & VM_LOCKED) { 713 if (vma->vm_flags & VM_LOCKED) {
710 pte_unmap_unlock(pte, ptl); 714 pte_unmap_unlock(pte, ptl);
711 *mapcount = 0; /* break early from loop */ 715 pra->vm_flags |= VM_LOCKED;
712 *vm_flags |= VM_LOCKED; 716 return SWAP_FAIL; /* To break the loop */
713 goto out;
714 } 717 }
715 718
716 if (ptep_clear_flush_young_notify(vma, address, pte)) { 719 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
727 pte_unmap_unlock(pte, ptl); 730 pte_unmap_unlock(pte, ptl);
728 } 731 }
729 732
730 (*mapcount)--; 733 if (referenced) {
731 734 pra->referenced++;
732 if (referenced) 735 pra->vm_flags |= vma->vm_flags;
733 *vm_flags |= vma->vm_flags;
734out:
735 return referenced;
736}
737
738static int page_referenced_anon(struct page *page,
739 struct mem_cgroup *memcg,
740 unsigned long *vm_flags)
741{
742 unsigned int mapcount;
743 struct anon_vma *anon_vma;
744 pgoff_t pgoff;
745 struct anon_vma_chain *avc;
746 int referenced = 0;
747
748 anon_vma = page_lock_anon_vma_read(page);
749 if (!anon_vma)
750 return referenced;
751
752 mapcount = page_mapcount(page);
753 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
754 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
755 struct vm_area_struct *vma = avc->vma;
756 unsigned long address = vma_address(page, vma);
757 /*
758 * If we are reclaiming on behalf of a cgroup, skip
759 * counting on behalf of references from different
760 * cgroups
761 */
762 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
763 continue;
764 referenced += page_referenced_one(page, vma, address,
765 &mapcount, vm_flags);
766 if (!mapcount)
767 break;
768 } 736 }
769 737
770 page_unlock_anon_vma_read(anon_vma); 738 pra->mapcount--;
771 return referenced; 739 if (!pra->mapcount)
740 return SWAP_SUCCESS; /* To break the loop */
741
742 return SWAP_AGAIN;
772} 743}
773 744
774/** 745static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
775 * page_referenced_file - referenced check for object-based rmap
776 * @page: the page we're checking references on.
777 * @memcg: target memory control group
778 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
779 *
780 * For an object-based mapped page, find all the places it is mapped and
781 * check/clear the referenced flag. This is done by following the page->mapping
782 * pointer, then walking the chain of vmas it holds. It returns the number
783 * of references it found.
784 *
785 * This function is only called from page_referenced for object-based pages.
786 */
787static int page_referenced_file(struct page *page,
788 struct mem_cgroup *memcg,
789 unsigned long *vm_flags)
790{ 746{
791 unsigned int mapcount; 747 struct page_referenced_arg *pra = arg;
792 struct address_space *mapping = page->mapping; 748 struct mem_cgroup *memcg = pra->memcg;
793 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
794 struct vm_area_struct *vma;
795 int referenced = 0;
796 749
797 /* 750 if (!mm_match_cgroup(vma->vm_mm, memcg))
798 * The caller's checks on page->mapping and !PageAnon have made 751 return true;
799 * sure that this is a file page: the check for page->mapping
800 * excludes the case just before it gets set on an anon page.
801 */
802 BUG_ON(PageAnon(page));
803
804 /*
805 * The page lock not only makes sure that page->mapping cannot
806 * suddenly be NULLified by truncation, it makes sure that the
807 * structure at mapping cannot be freed and reused yet,
808 * so we can safely take mapping->i_mmap_mutex.
809 */
810 BUG_ON(!PageLocked(page));
811
812 mutex_lock(&mapping->i_mmap_mutex);
813
814 /*
815 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
816 * is more likely to be accurate if we note it after spinning.
817 */
818 mapcount = page_mapcount(page);
819
820 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
821 unsigned long address = vma_address(page, vma);
822 /*
823 * If we are reclaiming on behalf of a cgroup, skip
824 * counting on behalf of references from different
825 * cgroups
826 */
827 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
828 continue;
829 referenced += page_referenced_one(page, vma, address,
830 &mapcount, vm_flags);
831 if (!mapcount)
832 break;
833 }
834 752
835 mutex_unlock(&mapping->i_mmap_mutex); 753 return false;
836 return referenced;
837} 754}
838 755
839/** 756/**
@@ -851,41 +768,57 @@ int page_referenced(struct page *page,
851 struct mem_cgroup *memcg, 768 struct mem_cgroup *memcg,
852 unsigned long *vm_flags) 769 unsigned long *vm_flags)
853{ 770{
854 int referenced = 0; 771 int ret;
855 int we_locked = 0; 772 int we_locked = 0;
773 struct page_referenced_arg pra = {
774 .mapcount = page_mapcount(page),
775 .memcg = memcg,
776 };
777 struct rmap_walk_control rwc = {
778 .rmap_one = page_referenced_one,
779 .arg = (void *)&pra,
780 .anon_lock = page_lock_anon_vma_read,
781 };
856 782
857 *vm_flags = 0; 783 *vm_flags = 0;
858 if (page_mapped(page) && page_rmapping(page)) { 784 if (!page_mapped(page))
859 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 785 return 0;
860 we_locked = trylock_page(page); 786
861 if (!we_locked) { 787 if (!page_rmapping(page))
862 referenced++; 788 return 0;
863 goto out; 789
864 } 790 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
865 } 791 we_locked = trylock_page(page);
866 if (unlikely(PageKsm(page))) 792 if (!we_locked)
867 referenced += page_referenced_ksm(page, memcg, 793 return 1;
868 vm_flags);
869 else if (PageAnon(page))
870 referenced += page_referenced_anon(page, memcg,
871 vm_flags);
872 else if (page->mapping)
873 referenced += page_referenced_file(page, memcg,
874 vm_flags);
875 if (we_locked)
876 unlock_page(page);
877 } 794 }
878out: 795
879 return referenced; 796 /*
797 * If we are reclaiming on behalf of a cgroup, skip
798 * counting on behalf of references from different
799 * cgroups
800 */
801 if (memcg) {
802 rwc.invalid_vma = invalid_page_referenced_vma;
803 }
804
805 ret = rmap_walk(page, &rwc);
806 *vm_flags = pra.vm_flags;
807
808 if (we_locked)
809 unlock_page(page);
810
811 return pra.referenced;
880} 812}
881 813
882static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 814static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
883 unsigned long address) 815 unsigned long address, void *arg)
884{ 816{
885 struct mm_struct *mm = vma->vm_mm; 817 struct mm_struct *mm = vma->vm_mm;
886 pte_t *pte; 818 pte_t *pte;
887 spinlock_t *ptl; 819 spinlock_t *ptl;
888 int ret = 0; 820 int ret = 0;
821 int *cleaned = arg;
889 822
890 pte = page_check_address(page, mm, address, &ptl, 1); 823 pte = page_check_address(page, mm, address, &ptl, 1);
891 if (!pte) 824 if (!pte)
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
904 837
905 pte_unmap_unlock(pte, ptl); 838 pte_unmap_unlock(pte, ptl);
906 839
907 if (ret) 840 if (ret) {
908 mmu_notifier_invalidate_page(mm, address); 841 mmu_notifier_invalidate_page(mm, address);
842 (*cleaned)++;
843 }
909out: 844out:
910 return ret; 845 return SWAP_AGAIN;
911} 846}
912 847
913static int page_mkclean_file(struct address_space *mapping, struct page *page) 848static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
914{ 849{
915 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 850 if (vma->vm_flags & VM_SHARED)
916 struct vm_area_struct *vma; 851 return false;
917 int ret = 0;
918
919 BUG_ON(PageAnon(page));
920 852
921 mutex_lock(&mapping->i_mmap_mutex); 853 return true;
922 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
923 if (vma->vm_flags & VM_SHARED) {
924 unsigned long address = vma_address(page, vma);
925 ret += page_mkclean_one(page, vma, address);
926 }
927 }
928 mutex_unlock(&mapping->i_mmap_mutex);
929 return ret;
930} 854}
931 855
932int page_mkclean(struct page *page) 856int page_mkclean(struct page *page)
933{ 857{
934 int ret = 0; 858 int cleaned = 0;
859 struct address_space *mapping;
860 struct rmap_walk_control rwc = {
861 .arg = (void *)&cleaned,
862 .rmap_one = page_mkclean_one,
863 .invalid_vma = invalid_mkclean_vma,
864 };
935 865
936 BUG_ON(!PageLocked(page)); 866 BUG_ON(!PageLocked(page));
937 867
938 if (page_mapped(page)) { 868 if (!page_mapped(page))
939 struct address_space *mapping = page_mapping(page); 869 return 0;
940 if (mapping)
941 ret = page_mkclean_file(mapping, page);
942 }
943 870
944 return ret; 871 mapping = page_mapping(page);
872 if (!mapping)
873 return 0;
874
875 rmap_walk(page, &rwc);
876
877 return cleaned;
945} 878}
946EXPORT_SYMBOL_GPL(page_mkclean); 879EXPORT_SYMBOL_GPL(page_mkclean);
947 880
@@ -961,9 +894,9 @@ void page_move_anon_rmap(struct page *page,
961{ 894{
962 struct anon_vma *anon_vma = vma->anon_vma; 895 struct anon_vma *anon_vma = vma->anon_vma;
963 896
964 VM_BUG_ON(!PageLocked(page)); 897 VM_BUG_ON_PAGE(!PageLocked(page), page);
965 VM_BUG_ON(!anon_vma); 898 VM_BUG_ON(!anon_vma);
966 VM_BUG_ON(page->index != linear_page_index(vma, address)); 899 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
967 900
968 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 901 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
969 page->mapping = (struct address_space *) anon_vma; 902 page->mapping = (struct address_space *) anon_vma;
@@ -1062,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page,
1062 if (unlikely(PageKsm(page))) 995 if (unlikely(PageKsm(page)))
1063 return; 996 return;
1064 997
1065 VM_BUG_ON(!PageLocked(page)); 998 VM_BUG_ON_PAGE(!PageLocked(page), page);
1066 /* address might be in next vma when migration races vma_adjust */ 999 /* address might be in next vma when migration races vma_adjust */
1067 if (first) 1000 if (first)
1068 __page_set_anon_rmap(page, vma, address, exclusive); 1001 __page_set_anon_rmap(page, vma, address, exclusive);
@@ -1177,17 +1110,17 @@ out:
1177} 1110}
1178 1111
1179/* 1112/*
1180 * Subfunctions of try_to_unmap: try_to_unmap_one called 1113 * @arg: enum ttu_flags will be passed to this argument
1181 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1182 */ 1114 */
1183int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1115int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1184 unsigned long address, enum ttu_flags flags) 1116 unsigned long address, void *arg)
1185{ 1117{
1186 struct mm_struct *mm = vma->vm_mm; 1118 struct mm_struct *mm = vma->vm_mm;
1187 pte_t *pte; 1119 pte_t *pte;
1188 pte_t pteval; 1120 pte_t pteval;
1189 spinlock_t *ptl; 1121 spinlock_t *ptl;
1190 int ret = SWAP_AGAIN; 1122 int ret = SWAP_AGAIN;
1123 enum ttu_flags flags = (enum ttu_flags)arg;
1191 1124
1192 pte = page_check_address(page, mm, address, &ptl, 0); 1125 pte = page_check_address(page, mm, address, &ptl, 0);
1193 if (!pte) 1126 if (!pte)
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1426 return ret; 1359 return ret;
1427} 1360}
1428 1361
1429bool is_vma_temporary_stack(struct vm_area_struct *vma) 1362static int try_to_unmap_nonlinear(struct page *page,
1430{ 1363 struct address_space *mapping, struct vm_area_struct *vma)
1431 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1432
1433 if (!maybe_stack)
1434 return false;
1435
1436 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1437 VM_STACK_INCOMPLETE_SETUP)
1438 return true;
1439
1440 return false;
1441}
1442
1443/**
1444 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1445 * rmap method
1446 * @page: the page to unmap/unlock
1447 * @flags: action and flags
1448 *
1449 * Find all the mappings of a page using the mapping pointer and the vma chains
1450 * contained in the anon_vma struct it points to.
1451 *
1452 * This function is only called from try_to_unmap/try_to_munlock for
1453 * anonymous pages.
1454 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1455 * where the page was found will be held for write. So, we won't recheck
1456 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1457 * 'LOCKED.
1458 */
1459static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1460{ 1364{
1461 struct anon_vma *anon_vma;
1462 pgoff_t pgoff;
1463 struct anon_vma_chain *avc;
1464 int ret = SWAP_AGAIN;
1465
1466 anon_vma = page_lock_anon_vma_read(page);
1467 if (!anon_vma)
1468 return ret;
1469
1470 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1471 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1472 struct vm_area_struct *vma = avc->vma;
1473 unsigned long address;
1474
1475 /*
1476 * During exec, a temporary VMA is setup and later moved.
1477 * The VMA is moved under the anon_vma lock but not the
1478 * page tables leading to a race where migration cannot
1479 * find the migration ptes. Rather than increasing the
1480 * locking requirements of exec(), migration skips
1481 * temporary VMAs until after exec() completes.
1482 */
1483 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1484 is_vma_temporary_stack(vma))
1485 continue;
1486
1487 address = vma_address(page, vma);
1488 ret = try_to_unmap_one(page, vma, address, flags);
1489 if (ret != SWAP_AGAIN || !page_mapped(page))
1490 break;
1491 }
1492
1493 page_unlock_anon_vma_read(anon_vma);
1494 return ret;
1495}
1496
1497/**
1498 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1499 * @page: the page to unmap/unlock
1500 * @flags: action and flags
1501 *
1502 * Find all the mappings of a page using the mapping pointer and the vma chains
1503 * contained in the address_space struct it points to.
1504 *
1505 * This function is only called from try_to_unmap/try_to_munlock for
1506 * object-based pages.
1507 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1508 * where the page was found will be held for write. So, we won't recheck
1509 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1510 * 'LOCKED.
1511 */
1512static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1513{
1514 struct address_space *mapping = page->mapping;
1515 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1516 struct vm_area_struct *vma;
1517 int ret = SWAP_AGAIN; 1365 int ret = SWAP_AGAIN;
1518 unsigned long cursor; 1366 unsigned long cursor;
1519 unsigned long max_nl_cursor = 0; 1367 unsigned long max_nl_cursor = 0;
1520 unsigned long max_nl_size = 0; 1368 unsigned long max_nl_size = 0;
1521 unsigned int mapcount; 1369 unsigned int mapcount;
1522 1370
1523 if (PageHuge(page)) 1371 list_for_each_entry(vma,
1524 pgoff = page->index << compound_order(page); 1372 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1525 1373
1526 mutex_lock(&mapping->i_mmap_mutex);
1527 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1528 unsigned long address = vma_address(page, vma);
1529 ret = try_to_unmap_one(page, vma, address, flags);
1530 if (ret != SWAP_AGAIN || !page_mapped(page))
1531 goto out;
1532 }
1533
1534 if (list_empty(&mapping->i_mmap_nonlinear))
1535 goto out;
1536
1537 /*
1538 * We don't bother to try to find the munlocked page in nonlinears.
1539 * It's costly. Instead, later, page reclaim logic may call
1540 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1541 */
1542 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1543 goto out;
1544
1545 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1546 shared.nonlinear) {
1547 cursor = (unsigned long) vma->vm_private_data; 1374 cursor = (unsigned long) vma->vm_private_data;
1548 if (cursor > max_nl_cursor) 1375 if (cursor > max_nl_cursor)
1549 max_nl_cursor = cursor; 1376 max_nl_cursor = cursor;
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1553 } 1380 }
1554 1381
1555 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ 1382 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1556 ret = SWAP_FAIL; 1383 return SWAP_FAIL;
1557 goto out;
1558 } 1384 }
1559 1385
1560 /* 1386 /*
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1566 */ 1392 */
1567 mapcount = page_mapcount(page); 1393 mapcount = page_mapcount(page);
1568 if (!mapcount) 1394 if (!mapcount)
1569 goto out; 1395 return ret;
1396
1570 cond_resched(); 1397 cond_resched();
1571 1398
1572 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1399 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1574 max_nl_cursor = CLUSTER_SIZE; 1401 max_nl_cursor = CLUSTER_SIZE;
1575 1402
1576 do { 1403 do {
1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1404 list_for_each_entry(vma,
1578 shared.nonlinear) { 1405 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1406
1579 cursor = (unsigned long) vma->vm_private_data; 1407 cursor = (unsigned long) vma->vm_private_data;
1580 while ( cursor < max_nl_cursor && 1408 while (cursor < max_nl_cursor &&
1581 cursor < vma->vm_end - vma->vm_start) { 1409 cursor < vma->vm_end - vma->vm_start) {
1582 if (try_to_unmap_cluster(cursor, &mapcount, 1410 if (try_to_unmap_cluster(cursor, &mapcount,
1583 vma, page) == SWAP_MLOCK) 1411 vma, page) == SWAP_MLOCK)
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1585 cursor += CLUSTER_SIZE; 1413 cursor += CLUSTER_SIZE;
1586 vma->vm_private_data = (void *) cursor; 1414 vma->vm_private_data = (void *) cursor;
1587 if ((int)mapcount <= 0) 1415 if ((int)mapcount <= 0)
1588 goto out; 1416 return ret;
1589 } 1417 }
1590 vma->vm_private_data = (void *) max_nl_cursor; 1418 vma->vm_private_data = (void *) max_nl_cursor;
1591 } 1419 }
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1600 */ 1428 */
1601 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) 1429 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1602 vma->vm_private_data = NULL; 1430 vma->vm_private_data = NULL;
1603out: 1431
1604 mutex_unlock(&mapping->i_mmap_mutex);
1605 return ret; 1432 return ret;
1606} 1433}
1607 1434
1435bool is_vma_temporary_stack(struct vm_area_struct *vma)
1436{
1437 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1438
1439 if (!maybe_stack)
1440 return false;
1441
1442 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1443 VM_STACK_INCOMPLETE_SETUP)
1444 return true;
1445
1446 return false;
1447}
1448
1449static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1450{
1451 return is_vma_temporary_stack(vma);
1452}
1453
1454static int page_not_mapped(struct page *page)
1455{
1456 return !page_mapped(page);
1457};
1458
1608/** 1459/**
1609 * try_to_unmap - try to remove all page table mappings to a page 1460 * try_to_unmap - try to remove all page table mappings to a page
1610 * @page: the page to get unmapped 1461 * @page: the page to get unmapped
@@ -1622,16 +1473,29 @@ out:
1622int try_to_unmap(struct page *page, enum ttu_flags flags) 1473int try_to_unmap(struct page *page, enum ttu_flags flags)
1623{ 1474{
1624 int ret; 1475 int ret;
1476 struct rmap_walk_control rwc = {
1477 .rmap_one = try_to_unmap_one,
1478 .arg = (void *)flags,
1479 .done = page_not_mapped,
1480 .file_nonlinear = try_to_unmap_nonlinear,
1481 .anon_lock = page_lock_anon_vma_read,
1482 };
1625 1483
1626 BUG_ON(!PageLocked(page)); 1484 VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
1627 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); 1485
1486 /*
1487 * During exec, a temporary VMA is setup and later moved.
1488 * The VMA is moved under the anon_vma lock but not the
1489 * page tables leading to a race where migration cannot
1490 * find the migration ptes. Rather than increasing the
1491 * locking requirements of exec(), migration skips
1492 * temporary VMAs until after exec() completes.
1493 */
1494 if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page))
1495 rwc.invalid_vma = invalid_migration_vma;
1496
1497 ret = rmap_walk(page, &rwc);
1628 1498
1629 if (unlikely(PageKsm(page)))
1630 ret = try_to_unmap_ksm(page, flags);
1631 else if (PageAnon(page))
1632 ret = try_to_unmap_anon(page, flags);
1633 else
1634 ret = try_to_unmap_file(page, flags);
1635 if (ret != SWAP_MLOCK && !page_mapped(page)) 1499 if (ret != SWAP_MLOCK && !page_mapped(page))
1636 ret = SWAP_SUCCESS; 1500 ret = SWAP_SUCCESS;
1637 return ret; 1501 return ret;
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1654 */ 1518 */
1655int try_to_munlock(struct page *page) 1519int try_to_munlock(struct page *page)
1656{ 1520{
1657 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1521 int ret;
1522 struct rmap_walk_control rwc = {
1523 .rmap_one = try_to_unmap_one,
1524 .arg = (void *)TTU_MUNLOCK,
1525 .done = page_not_mapped,
1526 /*
1527 * We don't bother to try to find the munlocked page in
1528 * nonlinears. It's costly. Instead, later, page reclaim logic
1529 * may call try_to_unmap() and recover PG_mlocked lazily.
1530 */
1531 .file_nonlinear = NULL,
1532 .anon_lock = page_lock_anon_vma_read,
1658 1533
1659 if (unlikely(PageKsm(page))) 1534 };
1660 return try_to_unmap_ksm(page, TTU_MUNLOCK); 1535
1661 else if (PageAnon(page)) 1536 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
1662 return try_to_unmap_anon(page, TTU_MUNLOCK); 1537
1663 else 1538 ret = rmap_walk(page, &rwc);
1664 return try_to_unmap_file(page, TTU_MUNLOCK); 1539 return ret;
1665} 1540}
1666 1541
1667void __put_anon_vma(struct anon_vma *anon_vma) 1542void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
1674 anon_vma_free(anon_vma); 1549 anon_vma_free(anon_vma);
1675} 1550}
1676 1551
1677#ifdef CONFIG_MIGRATION 1552static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1678/* 1553 struct rmap_walk_control *rwc)
1679 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1680 * Called by migrate.c to remove migration ptes, but might be used more later.
1681 */
1682static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1683 struct vm_area_struct *, unsigned long, void *), void *arg)
1684{ 1554{
1685 struct anon_vma *anon_vma; 1555 struct anon_vma *anon_vma;
1686 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1556
1687 struct anon_vma_chain *avc; 1557 if (rwc->anon_lock)
1688 int ret = SWAP_AGAIN; 1558 return rwc->anon_lock(page);
1689 1559
1690 /* 1560 /*
1691 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1561 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1695 */ 1565 */
1696 anon_vma = page_anon_vma(page); 1566 anon_vma = page_anon_vma(page);
1697 if (!anon_vma) 1567 if (!anon_vma)
1698 return ret; 1568 return NULL;
1569
1699 anon_vma_lock_read(anon_vma); 1570 anon_vma_lock_read(anon_vma);
1571 return anon_vma;
1572}
1573
1574/*
1575 * rmap_walk_anon - do something to anonymous page using the object-based
1576 * rmap method
1577 * @page: the page to be handled
1578 * @rwc: control variable according to each walk type
1579 *
1580 * Find all the mappings of a page using the mapping pointer and the vma chains
1581 * contained in the anon_vma struct it points to.
1582 *
1583 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1584 * where the page was found will be held for write. So, we won't recheck
1585 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1586 * LOCKED.
1587 */
1588static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1589{
1590 struct anon_vma *anon_vma;
1591 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1592 struct anon_vma_chain *avc;
1593 int ret = SWAP_AGAIN;
1594
1595 anon_vma = rmap_walk_anon_lock(page, rwc);
1596 if (!anon_vma)
1597 return ret;
1598
1700 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1599 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1701 struct vm_area_struct *vma = avc->vma; 1600 struct vm_area_struct *vma = avc->vma;
1702 unsigned long address = vma_address(page, vma); 1601 unsigned long address = vma_address(page, vma);
1703 ret = rmap_one(page, vma, address, arg); 1602
1603 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1604 continue;
1605
1606 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1704 if (ret != SWAP_AGAIN) 1607 if (ret != SWAP_AGAIN)
1705 break; 1608 break;
1609 if (rwc->done && rwc->done(page))
1610 break;
1706 } 1611 }
1707 anon_vma_unlock_read(anon_vma); 1612 anon_vma_unlock_read(anon_vma);
1708 return ret; 1613 return ret;
1709} 1614}
1710 1615
1711static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, 1616/*
1712 struct vm_area_struct *, unsigned long, void *), void *arg) 1617 * rmap_walk_file - do something to file page using the object-based rmap method
1618 * @page: the page to be handled
1619 * @rwc: control variable according to each walk type
1620 *
1621 * Find all the mappings of a page using the mapping pointer and the vma chains
1622 * contained in the address_space struct it points to.
1623 *
1624 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1625 * where the page was found will be held for write. So, we won't recheck
1626 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1627 * LOCKED.
1628 */
1629static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1713{ 1630{
1714 struct address_space *mapping = page->mapping; 1631 struct address_space *mapping = page->mapping;
1715 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1632 pgoff_t pgoff = page->index << compound_order(page);
1716 struct vm_area_struct *vma; 1633 struct vm_area_struct *vma;
1717 int ret = SWAP_AGAIN; 1634 int ret = SWAP_AGAIN;
1718 1635
1636 /*
1637 * The page lock not only makes sure that page->mapping cannot
1638 * suddenly be NULLified by truncation, it makes sure that the
1639 * structure at mapping cannot be freed and reused yet,
1640 * so we can safely take mapping->i_mmap_mutex.
1641 */
1642 VM_BUG_ON(!PageLocked(page));
1643
1719 if (!mapping) 1644 if (!mapping)
1720 return ret; 1645 return ret;
1721 mutex_lock(&mapping->i_mmap_mutex); 1646 mutex_lock(&mapping->i_mmap_mutex);
1722 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1647 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1723 unsigned long address = vma_address(page, vma); 1648 unsigned long address = vma_address(page, vma);
1724 ret = rmap_one(page, vma, address, arg); 1649
1650 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1651 continue;
1652
1653 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1725 if (ret != SWAP_AGAIN) 1654 if (ret != SWAP_AGAIN)
1726 break; 1655 goto done;
1656 if (rwc->done && rwc->done(page))
1657 goto done;
1727 } 1658 }
1728 /* 1659
1729 * No nonlinear handling: being always shared, nonlinear vmas 1660 if (!rwc->file_nonlinear)
1730 * never contain migration ptes. Decide what to do about this 1661 goto done;
1731 * limitation to linear when we need rmap_walk() on nonlinear. 1662
1732 */ 1663 if (list_empty(&mapping->i_mmap_nonlinear))
1664 goto done;
1665
1666 ret = rwc->file_nonlinear(page, mapping, vma);
1667
1668done:
1733 mutex_unlock(&mapping->i_mmap_mutex); 1669 mutex_unlock(&mapping->i_mmap_mutex);
1734 return ret; 1670 return ret;
1735} 1671}
1736 1672
1737int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 1673int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1738 struct vm_area_struct *, unsigned long, void *), void *arg)
1739{ 1674{
1740 VM_BUG_ON(!PageLocked(page));
1741
1742 if (unlikely(PageKsm(page))) 1675 if (unlikely(PageKsm(page)))
1743 return rmap_walk_ksm(page, rmap_one, arg); 1676 return rmap_walk_ksm(page, rwc);
1744 else if (PageAnon(page)) 1677 else if (PageAnon(page))
1745 return rmap_walk_anon(page, rmap_one, arg); 1678 return rmap_walk_anon(page, rwc);
1746 else 1679 else
1747 return rmap_walk_file(page, rmap_one, arg); 1680 return rmap_walk_file(page, rwc);
1748} 1681}
1749#endif /* CONFIG_MIGRATION */
1750 1682
1751#ifdef CONFIG_HUGETLB_PAGE 1683#ifdef CONFIG_HUGETLB_PAGE
1752/* 1684/*
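
The rmap.c rework replaces the page_referenced_anon/file and try_to_unmap_anon/file walker pairs with a single rmap_walk() driven by a caller-supplied struct rmap_walk_control: rmap_one does the per-vma work, invalid_vma skips uninteresting vmas, done stops the walk early, and arg carries per-caller state such as page_referenced_arg. A user-space sketch of that callback-table pattern follows; the vma list, the SWAP_* values and every name are simplified stand-ins, not the kernel types:

    /*
     * Sketch only: a flat array of fake vmas and trimmed callbacks model
     * the rmap_walk_control pattern; nothing here is real kernel code.
     */
    #include <stdio.h>
    #include <stdbool.h>

    enum { SWAP_AGAIN, SWAP_SUCCESS, SWAP_FAIL };

    struct vma_sketch {
        int id;
        bool locked;                            /* stands in for VM_LOCKED */
    };

    struct rmap_walk_control_sketch {
        int (*rmap_one)(struct vma_sketch *vma, void *arg);     /* per-vma work */
        bool (*invalid_vma)(struct vma_sketch *vma, void *arg); /* skip this vma? */
        int (*done)(void *arg);                                 /* stop early? */
        void *arg;                                              /* caller state */
    };

    static int rmap_walk(struct vma_sketch *vmas, int nr,
                         struct rmap_walk_control_sketch *rwc)
    {
        int i, ret = SWAP_AGAIN;

        for (i = 0; i < nr; i++) {
            if (rwc->invalid_vma && rwc->invalid_vma(&vmas[i], rwc->arg))
                continue;
            ret = rwc->rmap_one(&vmas[i], rwc->arg);
            if (ret != SWAP_AGAIN)
                break;
            if (rwc->done && rwc->done(rwc->arg))
                break;
        }
        return ret;
    }

    /* one concrete walk: count referenced vmas, like page_referenced() */
    struct referenced_arg { int mapcount; int referenced; };

    static int referenced_one(struct vma_sketch *vma, void *arg)
    {
        struct referenced_arg *pra = arg;

        if (vma->locked)
            return SWAP_FAIL;                   /* break the loop, as in the patch */
        pra->referenced++;
        if (--pra->mapcount == 0)
            return SWAP_SUCCESS;                /* nothing left to visit */
        return SWAP_AGAIN;
    }

    int main(void)
    {
        struct vma_sketch vmas[] = { {1, false}, {2, false}, {3, false} };
        struct referenced_arg pra = { .mapcount = 3, .referenced = 0 };
        struct rmap_walk_control_sketch rwc = {
            .rmap_one = referenced_one,
            .arg = &pra,
        };

        rmap_walk(vmas, 3, &rwc);
        printf("referenced: %d\n", pra.referenced);
        return 0;
    }

The payoff visible in the diff is that page_referenced(), page_mkclean(), try_to_unmap() and try_to_munlock() shrink to filling in a control structure and calling the one walker.
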
diff --git a/mm/shmem.c b/mm/shmem.c
index 902a14842b74..1f18c9d0d93e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt;
45#include <linux/xattr.h> 45#include <linux/xattr.h>
46#include <linux/exportfs.h> 46#include <linux/exportfs.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/generic_acl.h> 48#include <linux/posix_acl_xattr.h>
49#include <linux/mman.h> 49#include <linux/mman.h>
50#include <linux/string.h> 50#include <linux/string.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
@@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page,
285{ 285{
286 int error; 286 int error;
287 287
288 VM_BUG_ON(!PageLocked(page)); 288 VM_BUG_ON_PAGE(!PageLocked(page), page);
289 VM_BUG_ON(!PageSwapBacked(page)); 289 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
290 290
291 page_cache_get(page); 291 page_cache_get(page);
292 page->mapping = mapping; 292 page->mapping = mapping;
@@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
491 continue; 491 continue;
492 if (!unfalloc || !PageUptodate(page)) { 492 if (!unfalloc || !PageUptodate(page)) {
493 if (page->mapping == mapping) { 493 if (page->mapping == mapping) {
494 VM_BUG_ON(PageWriteback(page)); 494 VM_BUG_ON_PAGE(PageWriteback(page), page);
495 truncate_inode_page(mapping, page); 495 truncate_inode_page(mapping, page);
496 } 496 }
497 } 497 }
@@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
568 lock_page(page); 568 lock_page(page);
569 if (!unfalloc || !PageUptodate(page)) { 569 if (!unfalloc || !PageUptodate(page)) {
570 if (page->mapping == mapping) { 570 if (page->mapping == mapping) {
571 VM_BUG_ON(PageWriteback(page)); 571 VM_BUG_ON_PAGE(PageWriteback(page), page);
572 truncate_inode_page(mapping, page); 572 truncate_inode_page(mapping, page);
573 } 573 }
574 } 574 }
@@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
620 } 620 }
621 621
622 setattr_copy(inode, attr); 622 setattr_copy(inode, attr);
623#ifdef CONFIG_TMPFS_POSIX_ACL
624 if (attr->ia_valid & ATTR_MODE) 623 if (attr->ia_valid & ATTR_MODE)
625 error = generic_acl_chmod(inode); 624 error = posix_acl_chmod(inode, inode->i_mode);
626#endif
627 return error; 625 return error;
628} 626}
629 627
@@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1937 1935
1938 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1936 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1939 if (inode) { 1937 if (inode) {
1940#ifdef CONFIG_TMPFS_POSIX_ACL 1938 error = simple_acl_create(dir, inode);
1941 error = generic_acl_init(inode, dir); 1939 if (error)
1942 if (error) { 1940 goto out_iput;
1943 iput(inode);
1944 return error;
1945 }
1946#endif
1947 error = security_inode_init_security(inode, dir, 1941 error = security_inode_init_security(inode, dir,
1948 &dentry->d_name, 1942 &dentry->d_name,
1949 shmem_initxattrs, NULL); 1943 shmem_initxattrs, NULL);
1950 if (error) { 1944 if (error && error != -EOPNOTSUPP)
1951 if (error != -EOPNOTSUPP) { 1945 goto out_iput;
1952 iput(inode);
1953 return error;
1954 }
1955 }
1956 1946
1957 error = 0; 1947 error = 0;
1958 dir->i_size += BOGO_DIRENT_SIZE; 1948 dir->i_size += BOGO_DIRENT_SIZE;
@@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1961 dget(dentry); /* Extra count - pin the dentry in core */ 1951 dget(dentry); /* Extra count - pin the dentry in core */
1962 } 1952 }
1963 return error; 1953 return error;
1954out_iput:
1955 iput(inode);
1956 return error;
1964} 1957}
1965 1958
1966static int 1959static int
@@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1974 error = security_inode_init_security(inode, dir, 1967 error = security_inode_init_security(inode, dir,
1975 NULL, 1968 NULL,
1976 shmem_initxattrs, NULL); 1969 shmem_initxattrs, NULL);
1977 if (error) { 1970 if (error && error != -EOPNOTSUPP)
1978 if (error != -EOPNOTSUPP) { 1971 goto out_iput;
1979 iput(inode); 1972 error = simple_acl_create(dir, inode);
1980 return error; 1973 if (error)
1981 } 1974 goto out_iput;
1982 }
1983#ifdef CONFIG_TMPFS_POSIX_ACL
1984 error = generic_acl_init(inode, dir);
1985 if (error) {
1986 iput(inode);
1987 return error;
1988 }
1989#else
1990 error = 0;
1991#endif
1992 d_tmpfile(dentry, inode); 1975 d_tmpfile(dentry, inode);
1993 } 1976 }
1994 return error; 1977 return error;
1978out_iput:
1979 iput(inode);
1980 return error;
1995} 1981}
1996 1982
1997static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1983static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode,
2223 2209
2224static const struct xattr_handler *shmem_xattr_handlers[] = { 2210static const struct xattr_handler *shmem_xattr_handlers[] = {
2225#ifdef CONFIG_TMPFS_POSIX_ACL 2211#ifdef CONFIG_TMPFS_POSIX_ACL
2226 &generic_acl_access_handler, 2212 &posix_acl_access_xattr_handler,
2227 &generic_acl_default_handler, 2213 &posix_acl_default_xattr_handler,
2228#endif 2214#endif
2229 NULL 2215 NULL
2230}; 2216};
@@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = {
2740 .getxattr = shmem_getxattr, 2726 .getxattr = shmem_getxattr,
2741 .listxattr = shmem_listxattr, 2727 .listxattr = shmem_listxattr,
2742 .removexattr = shmem_removexattr, 2728 .removexattr = shmem_removexattr,
2729 .set_acl = simple_set_acl,
2743#endif 2730#endif
2744}; 2731};
2745 2732
@@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2764#endif 2751#endif
2765#ifdef CONFIG_TMPFS_POSIX_ACL 2752#ifdef CONFIG_TMPFS_POSIX_ACL
2766 .setattr = shmem_setattr, 2753 .setattr = shmem_setattr,
2754 .set_acl = simple_set_acl,
2767#endif 2755#endif
2768}; 2756};
2769 2757
@@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2776#endif 2764#endif
2777#ifdef CONFIG_TMPFS_POSIX_ACL 2765#ifdef CONFIG_TMPFS_POSIX_ACL
2778 .setattr = shmem_setattr, 2766 .setattr = shmem_setattr,
2767 .set_acl = simple_set_acl,
2779#endif 2768#endif
2780}; 2769};
2781 2770
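
Besides switching tmpfs from the old generic_acl helpers to the common posix_acl code, the shmem_mknod()/shmem_tmpfile() hunks fold their repeated "iput(inode); return error;" branches into a single out_iput label. A small stand-alone sketch of that single-exit error path; every name below is hypothetical:

    /*
     * Sketch only: get_inode()/iput() and the init stubs are invented to
     * show the goto-based cleanup shape, not VFS or shmem internals.
     */
    #include <stdlib.h>

    struct inode_sketch { int users; };

    static struct inode_sketch *get_inode(void)
    {
        struct inode_sketch *inode = malloc(sizeof(*inode));

        if (inode)
            inode->users = 1;
        return inode;
    }

    static void iput(struct inode_sketch *inode)
    {
        free(inode);                /* drop the reference taken above */
    }

    static int acl_create(struct inode_sketch *inode)    { (void)inode; return 0; }
    static int security_init(struct inode_sketch *inode) { (void)inode; return 0; }

    static int mknod_sketch(void)
    {
        struct inode_sketch *inode = get_inode();
        int error;

        if (!inode)
            return -1;

        error = acl_create(inode);
        if (error)
            goto out_iput;

        error = security_init(inode);
        if (error)
            goto out_iput;

        return 0;                   /* success: caller keeps the reference */

    out_iput:
        iput(inode);                /* one place to undo the get */
        return error;
    }

    int main(void)
    {
        return mknod_sketch() ? 1 : 0;
    }
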
diff --git a/mm/slab.c b/mm/slab.c
index eb043bf05f4c..b264214c77ea 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1946,7 +1946,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1946/** 1946/**
1947 * slab_destroy - destroy and release all objects in a slab 1947 * slab_destroy - destroy and release all objects in a slab
1948 * @cachep: cache pointer being destroyed 1948 * @cachep: cache pointer being destroyed
1949 * @slabp: slab pointer being destroyed 1949 * @page: page pointer being destroyed
1950 * 1950 *
1951 * Destroy all the objs in a slab, and release the mem back to the system. 1951 * Destroy all the objs in a slab, and release the mem back to the system.
1952 * Before calling the slab must have been unlinked from the cache. The 1952 * Before calling the slab must have been unlinked from the cache. The
diff --git a/mm/slab.h b/mm/slab.h
index 0859c4241ba1..8184a7cde272 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,12 +160,36 @@ static inline const char *cache_name(struct kmem_cache *s)
160 return s->name; 160 return s->name;
161} 161}
162 162
163/*
164 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
165 * That said the caller must assure the memcg's cache won't go away. Since once
166 * created a memcg's cache is destroyed only along with the root cache, it is
167 * true if we are going to allocate from the cache or hold a reference to the
168 * root cache by other means. Otherwise, we should hold either the slab_mutex
169 * or the memcg's slab_caches_mutex while calling this function and accessing
170 * the returned value.
171 */
163static inline struct kmem_cache * 172static inline struct kmem_cache *
164cache_from_memcg_idx(struct kmem_cache *s, int idx) 173cache_from_memcg_idx(struct kmem_cache *s, int idx)
165{ 174{
175 struct kmem_cache *cachep;
176 struct memcg_cache_params *params;
177
166 if (!s->memcg_params) 178 if (!s->memcg_params)
167 return NULL; 179 return NULL;
168 return s->memcg_params->memcg_caches[idx]; 180
181 rcu_read_lock();
182 params = rcu_dereference(s->memcg_params);
183 cachep = params->memcg_caches[idx];
184 rcu_read_unlock();
185
186 /*
187 * Make sure we will access the up-to-date value. The code updating
188 * memcg_caches issues a write barrier to match this (see
189 * memcg_register_cache()).
190 */
191 smp_read_barrier_depends();
192 return cachep;
169} 193}
170 194
171static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) 195static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
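The new cache_from_memcg_idx() pairs an RCU-protected pointer load with smp_read_barrier_depends() so that the memcg_caches[] contents a reader sees are at least as new as the pointer it dereferences. As a rough standalone illustration of that publish/then-read ordering (plain C11, not kernel code; the struct and function names below are invented for the sketch, and acquire ordering is used as a stronger stand-in for the dependency barrier):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cache_stub {
        const char *name;
};

struct params_stub {
        struct cache_stub *caches[4];   /* analogue of memcg_caches[] */
};

static _Atomic(struct params_stub *) published_params;

/* writer: fill in the array first, then publish the pointer with release
 * semantics so readers that observe the pointer also observe the contents */
static void publish(struct cache_stub *c, int idx)
{
        struct params_stub *p = calloc(1, sizeof(*p));

        if (!p)
                return;
        p->caches[idx] = c;
        atomic_store_explicit(&published_params, p, memory_order_release);
}

/* reader: rough analogue of the reworked cache_from_memcg_idx() */
static struct cache_stub *cache_from_idx(int idx)
{
        struct params_stub *p;

        p = atomic_load_explicit(&published_params, memory_order_acquire);
        if (!p)
                return NULL;
        return p->caches[idx];
}

int main(void)
{
        struct cache_stub c = { .name = "memcg-cache" };

        publish(&c, 2);
        printf("%s\n", cache_from_idx(2) ? cache_from_idx(2)->name : "(none)");
        return 0;
}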
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb399b0e4..1ec3c619ba04 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -171,13 +171,26 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
171 struct kmem_cache *parent_cache) 171 struct kmem_cache *parent_cache)
172{ 172{
173 struct kmem_cache *s = NULL; 173 struct kmem_cache *s = NULL;
174 int err = 0; 174 int err;
175 175
176 get_online_cpus(); 176 get_online_cpus();
177 mutex_lock(&slab_mutex); 177 mutex_lock(&slab_mutex);
178 178
179 if (!kmem_cache_sanity_check(memcg, name, size) == 0) 179 err = kmem_cache_sanity_check(memcg, name, size);
180 goto out_locked; 180 if (err)
181 goto out_unlock;
182
183 if (memcg) {
184 /*
185 * Since per-memcg caches are created asynchronously on first
186 * allocation (see memcg_kmem_get_cache()), several threads can
187 * try to create the same cache, but only one of them may
188 * succeed. Therefore if we get here and see the cache has
189 * already been created, we silently return NULL.
190 */
191 if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
192 goto out_unlock;
193 }
181 194
182 /* 195 /*
183 * Some allocators will constraint the set of valid flags to a subset 196 * Some allocators will constraint the set of valid flags to a subset
@@ -189,44 +202,47 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
189 202
190 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); 203 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
191 if (s) 204 if (s)
192 goto out_locked; 205 goto out_unlock;
193 206
207 err = -ENOMEM;
194 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 208 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
195 if (s) { 209 if (!s)
196 s->object_size = s->size = size; 210 goto out_unlock;
197 s->align = calculate_alignment(flags, align, size);
198 s->ctor = ctor;
199 211
200 if (memcg_register_cache(memcg, s, parent_cache)) { 212 s->object_size = s->size = size;
201 kmem_cache_free(kmem_cache, s); 213 s->align = calculate_alignment(flags, align, size);
202 err = -ENOMEM; 214 s->ctor = ctor;
203 goto out_locked;
204 }
205 215
206 s->name = kstrdup(name, GFP_KERNEL); 216 s->name = kstrdup(name, GFP_KERNEL);
207 if (!s->name) { 217 if (!s->name)
208 kmem_cache_free(kmem_cache, s); 218 goto out_free_cache;
209 err = -ENOMEM;
210 goto out_locked;
211 }
212 219
213 err = __kmem_cache_create(s, flags); 220 err = memcg_alloc_cache_params(memcg, s, parent_cache);
214 if (!err) { 221 if (err)
215 s->refcount = 1; 222 goto out_free_cache;
216 list_add(&s->list, &slab_caches); 223
217 memcg_cache_list_add(memcg, s); 224 err = __kmem_cache_create(s, flags);
218 } else { 225 if (err)
219 kfree(s->name); 226 goto out_free_cache;
220 kmem_cache_free(kmem_cache, s); 227
221 } 228 s->refcount = 1;
222 } else 229 list_add(&s->list, &slab_caches);
223 err = -ENOMEM; 230 memcg_register_cache(s);
224 231
225out_locked: 232out_unlock:
226 mutex_unlock(&slab_mutex); 233 mutex_unlock(&slab_mutex);
227 put_online_cpus(); 234 put_online_cpus();
228 235
229 if (err) { 236 if (err) {
237 /*
238 * There is no point in flooding logs with warnings or
239 * especially crashing the system if we fail to create a cache
240 * for a memcg. In this case we will be accounting the memcg
241 * allocation to the root cgroup until we succeed to create its
242 * own cache, but it isn't that critical.
243 */
244 if (!memcg)
245 return NULL;
230 246
231 if (flags & SLAB_PANIC) 247 if (flags & SLAB_PANIC)
232 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", 248 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
@@ -236,11 +252,15 @@ out_locked:
236 name, err); 252 name, err);
237 dump_stack(); 253 dump_stack();
238 } 254 }
239
240 return NULL; 255 return NULL;
241 } 256 }
242
243 return s; 257 return s;
258
259out_free_cache:
260 memcg_free_cache_params(s);
261 kfree(s->name);
262 kmem_cache_free(kmem_cache, s);
263 goto out_unlock;
244} 264}
245 265
246struct kmem_cache * 266struct kmem_cache *
@@ -263,11 +283,12 @@ void kmem_cache_destroy(struct kmem_cache *s)
263 list_del(&s->list); 283 list_del(&s->list);
264 284
265 if (!__kmem_cache_shutdown(s)) { 285 if (!__kmem_cache_shutdown(s)) {
286 memcg_unregister_cache(s);
266 mutex_unlock(&slab_mutex); 287 mutex_unlock(&slab_mutex);
267 if (s->flags & SLAB_DESTROY_BY_RCU) 288 if (s->flags & SLAB_DESTROY_BY_RCU)
268 rcu_barrier(); 289 rcu_barrier();
269 290
270 memcg_release_cache(s); 291 memcg_free_cache_params(s);
271 kfree(s->name); 292 kfree(s->name);
272 kmem_cache_free(kmem_cache, s); 293 kmem_cache_free(kmem_cache, s);
273 } else { 294 } else {
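The kmem_cache_create_memcg() rework above replaces per-branch cleanup with forward gotos to a single unwind path. A minimal userspace sketch of the same error-unwinding shape (hypothetical names; malloc/strdup stand in for the kernel allocations, and the locking is elided):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cache {
        char *name;
        void *params;
};

/* every failure jumps forward, and cleanup runs in reverse allocation order */
static struct cache *create_cache(const char *name, int *errp)
{
        struct cache *s;
        int err;

        err = -ENOMEM;
        s = calloc(1, sizeof(*s));
        if (!s)
                goto out;

        s->name = strdup(name);
        if (!s->name)
                goto out_free_cache;

        s->params = calloc(1, 64);      /* stands in for memcg_alloc_cache_params() */
        if (!s->params)
                goto out_free_name;

        *errp = 0;
        return s;

out_free_name:
        free(s->name);
out_free_cache:
        free(s);
out:
        *errp = err;
        return NULL;
}

int main(void)
{
        int err;
        struct cache *c = create_cache("demo", &err);

        if (!c) {
                fprintf(stderr, "create failed: %d\n", err);
                return 1;
        }
        printf("created %s\n", c->name);
        free(c->params);
        free(c->name);
        free(c);
        return 0;
}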
diff --git a/mm/slub.c b/mm/slub.c
index 545a170ebf9f..7e3e0458bce4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page)
355 __bit_spin_unlock(PG_locked, &page->flags); 355 __bit_spin_unlock(PG_locked, &page->flags);
356} 356}
357 357
358static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
359{
360 struct page tmp;
361 tmp.counters = counters_new;
362 /*
363 * page->counters can cover frozen/inuse/objects as well
364 * as page->_count. If we assign to ->counters directly
365 * we run the risk of losing updates to page->_count, so
366 * be careful and only assign to the fields we need.
367 */
368 page->frozen = tmp.frozen;
369 page->inuse = tmp.inuse;
370 page->objects = tmp.objects;
371}
372
358/* Interrupts must be disabled (for the fallback code to work right) */ 373/* Interrupts must be disabled (for the fallback code to work right) */
359static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 374static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
360 void *freelist_old, unsigned long counters_old, 375 void *freelist_old, unsigned long counters_old,
@@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
376 if (page->freelist == freelist_old && 391 if (page->freelist == freelist_old &&
377 page->counters == counters_old) { 392 page->counters == counters_old) {
378 page->freelist = freelist_new; 393 page->freelist = freelist_new;
379 page->counters = counters_new; 394 set_page_slub_counters(page, counters_new);
380 slab_unlock(page); 395 slab_unlock(page);
381 return 1; 396 return 1;
382 } 397 }
@@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
415 if (page->freelist == freelist_old && 430 if (page->freelist == freelist_old &&
416 page->counters == counters_old) { 431 page->counters == counters_old) {
417 page->freelist = freelist_new; 432 page->freelist = freelist_new;
418 page->counters = counters_new; 433 set_page_slub_counters(page, counters_new);
419 slab_unlock(page); 434 slab_unlock(page);
420 local_irq_restore(flags); 435 local_irq_restore(flags);
421 return 1; 436 return 1;
@@ -985,23 +1000,22 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
985 1000
986/* 1001/*
987 * Tracking of fully allocated slabs for debugging purposes. 1002 * Tracking of fully allocated slabs for debugging purposes.
988 *
989 * list_lock must be held.
990 */ 1003 */
991static void add_full(struct kmem_cache *s, 1004static void add_full(struct kmem_cache *s,
992 struct kmem_cache_node *n, struct page *page) 1005 struct kmem_cache_node *n, struct page *page)
993{ 1006{
1007 lockdep_assert_held(&n->list_lock);
1008
994 if (!(s->flags & SLAB_STORE_USER)) 1009 if (!(s->flags & SLAB_STORE_USER))
995 return; 1010 return;
996 1011
997 list_add(&page->lru, &n->full); 1012 list_add(&page->lru, &n->full);
998} 1013}
999 1014
1000/* 1015static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
1001 * list_lock must be held.
1002 */
1003static void remove_full(struct kmem_cache *s, struct page *page)
1004{ 1016{
1017 lockdep_assert_held(&n->list_lock);
1018
1005 if (!(s->flags & SLAB_STORE_USER)) 1019 if (!(s->flags & SLAB_STORE_USER))
1006 return; 1020 return;
1007 1021
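Replacing the "list_lock must be held" comments with lockdep_assert_held() turns the locking rule into a runtime check. A toy userspace equivalent (compile with -pthread; the owner-tracking lock and the add_partial() stand-in below are illustrative only):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/* toy lock that remembers its owner so helpers can assert they are called
 * with the lock held, in the spirit of lockdep_assert_held() */
struct checked_lock {
        pthread_mutex_t mutex;
        pthread_t owner;
        int held;
};

static void checked_lock_acquire(struct checked_lock *l)
{
        pthread_mutex_lock(&l->mutex);
        l->owner = pthread_self();
        l->held = 1;
}

static void checked_lock_release(struct checked_lock *l)
{
        l->held = 0;
        pthread_mutex_unlock(&l->mutex);
}

static void assert_held(struct checked_lock *l)
{
        assert(l->held && pthread_equal(l->owner, pthread_self()));
}

/* analogue of add_partial(): the check documents and enforces the rule */
static void add_partial(struct checked_lock *list_lock, int *nr_partial)
{
        assert_held(list_lock);
        (*nr_partial)++;
}

int main(void)
{
        struct checked_lock l;
        int nr_partial = 0;

        pthread_mutex_init(&l.mutex, NULL);
        l.held = 0;

        checked_lock_acquire(&l);
        add_partial(&l, &nr_partial);
        checked_lock_release(&l);
        printf("nr_partial=%d\n", nr_partial);
        return 0;
}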
@@ -1250,7 +1264,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1250 void *object, u8 val) { return 1; } 1264 void *object, u8 val) { return 1; }
1251static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1265static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1252 struct page *page) {} 1266 struct page *page) {}
1253static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1267static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1268 struct page *page) {}
1254static inline unsigned long kmem_cache_flags(unsigned long object_size, 1269static inline unsigned long kmem_cache_flags(unsigned long object_size,
1255 unsigned long flags, const char *name, 1270 unsigned long flags, const char *name,
1256 void (*ctor)(void *)) 1271 void (*ctor)(void *))
@@ -1504,12 +1519,12 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1504 1519
1505/* 1520/*
1506 * Management of partially allocated slabs. 1521 * Management of partially allocated slabs.
1507 *
1508 * list_lock must be held.
1509 */ 1522 */
1510static inline void add_partial(struct kmem_cache_node *n, 1523static inline void add_partial(struct kmem_cache_node *n,
1511 struct page *page, int tail) 1524 struct page *page, int tail)
1512{ 1525{
1526 lockdep_assert_held(&n->list_lock);
1527
1513 n->nr_partial++; 1528 n->nr_partial++;
1514 if (tail == DEACTIVATE_TO_TAIL) 1529 if (tail == DEACTIVATE_TO_TAIL)
1515 list_add_tail(&page->lru, &n->partial); 1530 list_add_tail(&page->lru, &n->partial);
@@ -1517,12 +1532,11 @@ static inline void add_partial(struct kmem_cache_node *n,
1517 list_add(&page->lru, &n->partial); 1532 list_add(&page->lru, &n->partial);
1518} 1533}
1519 1534
1520/*
1521 * list_lock must be held.
1522 */
1523static inline void remove_partial(struct kmem_cache_node *n, 1535static inline void remove_partial(struct kmem_cache_node *n,
1524 struct page *page) 1536 struct page *page)
1525{ 1537{
1538 lockdep_assert_held(&n->list_lock);
1539
1526 list_del(&page->lru); 1540 list_del(&page->lru);
1527 n->nr_partial--; 1541 n->nr_partial--;
1528} 1542}
@@ -1532,8 +1546,6 @@ static inline void remove_partial(struct kmem_cache_node *n,
1532 * return the pointer to the freelist. 1546 * return the pointer to the freelist.
1533 * 1547 *
1534 * Returns a list of objects or NULL if it fails. 1548 * Returns a list of objects or NULL if it fails.
1535 *
1536 * Must hold list_lock since we modify the partial list.
1537 */ 1549 */
1538static inline void *acquire_slab(struct kmem_cache *s, 1550static inline void *acquire_slab(struct kmem_cache *s,
1539 struct kmem_cache_node *n, struct page *page, 1551 struct kmem_cache_node *n, struct page *page,
@@ -1543,6 +1555,8 @@ static inline void *acquire_slab(struct kmem_cache *s,
1543 unsigned long counters; 1555 unsigned long counters;
1544 struct page new; 1556 struct page new;
1545 1557
1558 lockdep_assert_held(&n->list_lock);
1559
1546 /* 1560 /*
1547 * Zap the freelist and set the frozen bit. 1561 * Zap the freelist and set the frozen bit.
1548 * The old freelist is the list of objects for the 1562 * The old freelist is the list of objects for the
@@ -1887,7 +1901,7 @@ redo:
1887 1901
1888 else if (l == M_FULL) 1902 else if (l == M_FULL)
1889 1903
1890 remove_full(s, page); 1904 remove_full(s, n, page);
1891 1905
1892 if (m == M_PARTIAL) { 1906 if (m == M_PARTIAL) {
1893 1907
@@ -2541,7 +2555,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2541 new.inuse--; 2555 new.inuse--;
2542 if ((!new.inuse || !prior) && !was_frozen) { 2556 if ((!new.inuse || !prior) && !was_frozen) {
2543 2557
2544 if (kmem_cache_has_cpu_partial(s) && !prior) 2558 if (kmem_cache_has_cpu_partial(s) && !prior) {
2545 2559
2546 /* 2560 /*
2547 * Slab was on no list before and will be 2561 * Slab was on no list before and will be
@@ -2551,7 +2565,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2551 */ 2565 */
2552 new.frozen = 1; 2566 new.frozen = 1;
2553 2567
2554 else { /* Needs to be taken off a list */ 2568 } else { /* Needs to be taken off a list */
2555 2569
2556 n = get_node(s, page_to_nid(page)); 2570 n = get_node(s, page_to_nid(page));
2557 /* 2571 /*
@@ -2600,7 +2614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2600 */ 2614 */
2601 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 2615 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2602 if (kmem_cache_debug(s)) 2616 if (kmem_cache_debug(s))
2603 remove_full(s, page); 2617 remove_full(s, n, page);
2604 add_partial(n, page, DEACTIVATE_TO_TAIL); 2618 add_partial(n, page, DEACTIVATE_TO_TAIL);
2605 stat(s, FREE_ADD_PARTIAL); 2619 stat(s, FREE_ADD_PARTIAL);
2606 } 2620 }
@@ -2614,9 +2628,10 @@ slab_empty:
2614 */ 2628 */
2615 remove_partial(n, page); 2629 remove_partial(n, page);
2616 stat(s, FREE_REMOVE_PARTIAL); 2630 stat(s, FREE_REMOVE_PARTIAL);
2617 } else 2631 } else {
2618 /* Slab must be on the full list */ 2632 /* Slab must be on the full list */
2619 remove_full(s, page); 2633 remove_full(s, n, page);
2634 }
2620 2635
2621 spin_unlock_irqrestore(&n->list_lock, flags); 2636 spin_unlock_irqrestore(&n->list_lock, flags);
2622 stat(s, FREE_SLAB); 2637 stat(s, FREE_SLAB);
@@ -2890,7 +2905,13 @@ static void early_kmem_cache_node_alloc(int node)
2890 init_kmem_cache_node(n); 2905 init_kmem_cache_node(n);
2891 inc_slabs_node(kmem_cache_node, node, page->objects); 2906 inc_slabs_node(kmem_cache_node, node, page->objects);
2892 2907
2908 /*
2909 * the lock is for lockdep's sake, not for any actual
2910 * race protection
2911 */
2912 spin_lock(&n->list_lock);
2893 add_partial(n, page, DEACTIVATE_TO_HEAD); 2913 add_partial(n, page, DEACTIVATE_TO_HEAD);
2914 spin_unlock(&n->list_lock);
2894} 2915}
2895 2916
2896static void free_kmem_cache_nodes(struct kmem_cache *s) 2917static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -4299,7 +4320,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4299 4320
4300 page = ACCESS_ONCE(c->partial); 4321 page = ACCESS_ONCE(c->partial);
4301 if (page) { 4322 if (page) {
4302 x = page->pobjects; 4323 node = page_to_nid(page);
4324 if (flags & SO_TOTAL)
4325 WARN_ON_ONCE(1);
4326 else if (flags & SO_OBJECTS)
4327 WARN_ON_ONCE(1);
4328 else
4329 x = page->pages;
4303 total += x; 4330 total += x;
4304 nodes[node] += x; 4331 nodes[node] += x;
4305 } 4332 }
@@ -5163,7 +5190,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5163 } 5190 }
5164 5191
5165 s->kobj.kset = slab_kset; 5192 s->kobj.kset = slab_kset;
5166 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5193 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5167 if (err) { 5194 if (err) {
5168 kobject_put(&s->kobj); 5195 kobject_put(&s->kobj);
5169 return err; 5196 return err;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be757..4cba9c2783a1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); 43 return memblock_virt_alloc_try_nid(size, align, goal,
44 BOOTMEM_ALLOC_ACCESSIBLE, node);
44} 45}
45 46
46static void *vmemmap_buf; 47static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
226 227
227 if (vmemmap_buf_start) { 228 if (vmemmap_buf_start) {
228 /* need to free left buf */ 229 /* need to free left buf */
229 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 230 memblock_free_early(__pa(vmemmap_buf),
231 vmemmap_buf_end - vmemmap_buf);
230 vmemmap_buf = NULL; 232 vmemmap_buf = NULL;
231 vmemmap_buf_end = NULL; 233 vmemmap_buf_end = NULL;
232 } 234 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e9590..63c3ea5c119c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
69 else 69 else
70 section = kzalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else { 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = memblock_virt_alloc_node(array_size, nid);
73 } 73 }
74 74
75 return section; 75 return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
279 limit = goal + (1UL << PA_SECTION_SHIFT); 279 limit = goal + (1UL << PA_SECTION_SHIFT);
280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT); 280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
281again: 281again:
282 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, 282 p = memblock_virt_alloc_try_nid_nopanic(size,
283 SMP_CACHE_BYTES, goal, limit); 283 SMP_CACHE_BYTES, goal, limit,
284 nid);
284 if (!p && limit) { 285 if (!p && limit) {
285 limit = 0; 286 limit = 0;
286 goto again; 287 goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
331sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 332sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
332 unsigned long size) 333 unsigned long size)
333{ 334{
334 return alloc_bootmem_node_nopanic(pgdat, size); 335 return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
335} 336}
336 337
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 338static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
376 return map; 377 return map;
377 378
378 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); 379 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
379 map = __alloc_bootmem_node_high(NODE_DATA(nid), size, 380 map = memblock_virt_alloc_try_nid(size,
380 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 381 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
382 BOOTMEM_ALLOC_ACCESSIBLE, nid);
381 return map; 383 return map;
382} 384}
383void __init sparse_mem_maps_populate_node(struct page **map_map, 385void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
401 } 403 }
402 404
403 size = PAGE_ALIGN(size); 405 size = PAGE_ALIGN(size);
404 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, 406 map = memblock_virt_alloc_try_nid(size * map_count,
405 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 407 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
408 BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
406 if (map) { 409 if (map) {
407 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 410 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
408 if (!present_section_nr(pnum)) 411 if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
545 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 548 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
546 */ 549 */
547 size = sizeof(unsigned long *) * NR_MEM_SECTIONS; 550 size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
548 usemap_map = alloc_bootmem(size); 551 usemap_map = memblock_virt_alloc(size, 0);
549 if (!usemap_map) 552 if (!usemap_map)
550 panic("can not allocate usemap_map\n"); 553 panic("can not allocate usemap_map\n");
551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, 554 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)
553 556
554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 557#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 558 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
556 map_map = alloc_bootmem(size2); 559 map_map = memblock_virt_alloc(size2, 0);
557 if (!map_map) 560 if (!map_map)
558 panic("can not allocate map_map\n"); 561 panic("can not allocate map_map\n");
559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, 562 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
583 vmemmap_populate_print_last(); 586 vmemmap_populate_print_last();
584 587
585#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 588#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
586 free_bootmem(__pa(map_map), size2); 589 memblock_free_early(__pa(map_map), size2);
587#endif 590#endif
588 free_bootmem(__pa(usemap_map), size); 591 memblock_free_early(__pa(usemap_map), size);
589} 592}
590 593
591#ifdef CONFIG_MEMORY_HOTPLUG 594#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..b31ba67d440a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
35 34
36#include "internal.h" 35#include "internal.h"
37 36
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page)
58 57
59 spin_lock_irqsave(&zone->lru_lock, flags); 58 spin_lock_irqsave(&zone->lru_lock, flags);
60 lruvec = mem_cgroup_page_lruvec(page, zone); 59 lruvec = mem_cgroup_page_lruvec(page, zone);
61 VM_BUG_ON(!PageLRU(page)); 60 VM_BUG_ON_PAGE(!PageLRU(page), page);
62 __ClearPageLRU(page); 61 __ClearPageLRU(page);
63 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 62 del_page_from_lru_list(page, lruvec, page_off_lru(page));
64 spin_unlock_irqrestore(&zone->lru_lock, flags); 63 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
82 81
83static void put_compound_page(struct page *page) 82static void put_compound_page(struct page *page)
84{ 83{
85 if (unlikely(PageTail(page))) { 84 struct page *page_head;
86 /* __split_huge_page_refcount can run under us */
87 struct page *page_head = compound_trans_head(page);
88
89 if (likely(page != page_head &&
90 get_page_unless_zero(page_head))) {
91 unsigned long flags;
92 85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
93 /* 88 /*
94 * THP can not break up slab pages so avoid taking 89 * By the time all refcounts have been released
95 * compound_lock(). Slab performs non-atomic bit ops 90 * split_huge_page cannot run anymore from under us.
96 * on page->flags for better performance. In particular
97 * slab_unlock() in slub used to be a hot path. It is
98 * still hot on arches that do not support
99 * this_cpu_cmpxchg_double().
100 */ 91 */
101 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 92 if (PageHead(page))
102 if (likely(PageTail(page))) { 93 __put_compound_page(page);
103 /* 94 else
104 * __split_huge_page_refcount 95 __put_single_page(page);
105 * cannot race here. 96 }
106 */ 97 return;
107 VM_BUG_ON(!PageHead(page_head)); 98 }
108 atomic_dec(&page->_mapcount); 99
109 if (put_page_testzero(page_head)) 100 /* __split_huge_page_refcount can run under us */
110 VM_BUG_ON(1); 101 page_head = compound_trans_head(page);
111 if (put_page_testzero(page_head)) 102
112 __put_compound_page(page_head); 103 /*
113 return; 104 * THP can not break up slab pages so avoid taking
114 } else 105 * compound_lock() and skip the tail page refcounting (in
115 /* 106 * _mapcount) too. Slab performs non-atomic bit ops on
116 * __split_huge_page_refcount 107 * page->flags for better performance. In particular
117 * run before us, "page" was a 108 * slab_unlock() in slub used to be a hot path. It is still
118 * THP tail. The split 109 * hot on arches that do not support
119 * page_head has been freed 110 * this_cpu_cmpxchg_double().
120 * and reallocated as slab or 111 *
121 * hugetlbfs page of smaller 112 * If "page" is part of a slab or hugetlbfs page it cannot be
122 * order (only possible if 113 * splitted and the head page cannot change from under us. And
123 * reallocated as slab on 114 * if "page" is part of a THP page under splitting, if the
124 * x86). 115 * head page pointed by the THP tail isn't a THP head anymore,
125 */ 116 * we'll find PageTail clear after smp_rmb() and we'll treat
126 goto skip_lock; 117 * it as a single page.
127 } 118 */
119 if (!__compound_tail_refcounted(page_head)) {
120 /*
121 * If "page" is a THP tail, we must read the tail page
122 * flags after the head page flags. The
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */
127 smp_rmb();
128 if (likely(PageTail(page))) {
128 /* 129 /*
129 * page_head wasn't a dangling pointer but it 130 * __split_huge_page_refcount cannot race
130 * may not be a head page anymore by the time 131 * here.
131 * we obtain the lock. That is ok as long as it
132 * can't be freed from under us.
133 */ 132 */
134 flags = compound_lock_irqsave(page_head); 133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
135 if (unlikely(!PageTail(page))) { 134 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
136 /* __split_huge_page_refcount run before us */ 135 if (put_page_testzero(page_head)) {
137 compound_unlock_irqrestore(page_head, flags); 136 /*
138skip_lock: 137 * If this is the tail of a slab
139 if (put_page_testzero(page_head)) { 138 * compound page, the tail pin must
140 /* 139 * not be the last reference held on
141 * The head page may have been 140 * the page, because the PG_slab
142 * freed and reallocated as a 141 * cannot be cleared before all tail
143 * compound page of smaller 142 * pins (which skips the _mapcount
144 * order and then freed again. 143 * tail refcounting) have been
145 * All we know is that it 144 * released. For hugetlbfs the tail
146 * cannot have become: a THP 145 * pin may be the last reference on
147 * page, a compound page of 146 * the page instead, because
148 * higher order, a tail page. 147 * PageHeadHuge will not go away until
149 * That is because we still 148 * the compound page enters the buddy
150 * hold the refcount of the 149 * allocator.
151 * split THP tail and 150 */
152 * page_head was the THP head 151 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
153 * before the split. 152 __put_compound_page(page_head);
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
160out_put_single:
161 if (put_page_testzero(page))
162 __put_single_page(page);
163 return;
164 } 153 }
165 VM_BUG_ON(page_head != page->first_page); 154 return;
155 } else
166 /* 156 /*
167 * We can release the refcount taken by 157 * __split_huge_page_refcount run before us,
168 * get_page_unless_zero() now that 158 * "page" was a THP tail. The split page_head
169 * __split_huge_page_refcount() is blocked on 159 * has been freed and reallocated as slab or
170 * the compound_lock. 160 * hugetlbfs page of smaller order (only
161 * possible if reallocated as slab on x86).
171 */ 162 */
172 if (put_page_testzero(page_head)) 163 goto out_put_single;
173 VM_BUG_ON(1); 164 }
174 /* __split_huge_page_refcount will wait now */
175 VM_BUG_ON(page_mapcount(page) <= 0);
176 atomic_dec(&page->_mapcount);
177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
178 VM_BUG_ON(atomic_read(&page->_count) != 0);
179 compound_unlock_irqrestore(page_head, flags);
180 165
166 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags;
168
169 /*
170 * page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from
173 * under us.
174 */
175 flags = compound_lock_irqsave(page_head);
176 if (unlikely(!PageTail(page))) {
177 /* __split_huge_page_refcount run before us */
178 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) { 179 if (put_page_testzero(page_head)) {
180 /*
181 * The head page may have been freed
182 * and reallocated as a compound page
183 * of smaller order and then freed
184 * again. All we know is that it
185 * cannot have become: a THP page, a
186 * compound page of higher order, a
187 * tail page. That is because we
188 * still hold the refcount of the
189 * split THP tail and page_head was
190 * the THP head before the split.
191 */
182 if (PageHead(page_head)) 192 if (PageHead(page_head))
183 __put_compound_page(page_head); 193 __put_compound_page(page_head);
184 else 194 else
185 __put_single_page(page_head); 195 __put_single_page(page_head);
186 } 196 }
187 } else { 197out_put_single:
188 /* page_head is a dangling pointer */ 198 if (put_page_testzero(page))
189 VM_BUG_ON(PageTail(page)); 199 __put_single_page(page);
190 goto out_put_single; 200 return;
191 } 201 }
192 } else if (put_page_testzero(page)) { 202 VM_BUG_ON_PAGE(page_head != page->first_page, page);
193 if (PageHead(page)) 203 /*
194 __put_compound_page(page); 204 * We can release the refcount taken by
195 else 205 * get_page_unless_zero() now that
196 __put_single_page(page); 206 * __split_huge_page_refcount() is blocked on the
207 * compound_lock.
208 */
209 if (put_page_testzero(page_head))
210 VM_BUG_ON_PAGE(1, page_head);
211 /* __split_huge_page_refcount will wait now */
212 VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
213 atomic_dec(&page->_mapcount);
214 VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
215 VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
216 compound_unlock_irqrestore(page_head, flags);
217
218 if (put_page_testzero(page_head)) {
219 if (PageHead(page_head))
220 __put_compound_page(page_head);
221 else
222 __put_single_page(page_head);
223 }
224 } else {
225 /* page_head is a dangling pointer */
226 VM_BUG_ON_PAGE(PageTail(page), page);
227 goto out_put_single;
197 } 228 }
198} 229}
199 230
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
221 * split_huge_page(). 252 * split_huge_page().
222 */ 253 */
223 unsigned long flags; 254 unsigned long flags;
224 bool got = false; 255 bool got;
225 struct page *page_head = compound_trans_head(page); 256 struct page *page_head = compound_trans_head(page);
226 257
227 if (likely(page != page_head && get_page_unless_zero(page_head))) { 258 /* Ref to put_compound_page() comment. */
228 /* Ref to put_compound_page() comment. */ 259 if (!__compound_tail_refcounted(page_head)) {
229 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 260 smp_rmb();
230 if (likely(PageTail(page))) { 261 if (likely(PageTail(page))) {
231 /* 262 /*
232 * This is a hugetlbfs page or a slab 263 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount 264 * page. __split_huge_page_refcount
234 * cannot race here. 265 * cannot race here.
235 */ 266 */
236 VM_BUG_ON(!PageHead(page_head)); 267 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
237 __get_page_tail_foll(page, false); 268 __get_page_tail_foll(page, true);
238 return true; 269 return true;
239 } else { 270 } else {
240 /* 271 /*
241 * __split_huge_page_refcount run 272 * __split_huge_page_refcount run
242 * before us, "page" was a THP 273 * before us, "page" was a THP
243 * tail. The split page_head has been 274 * tail. The split page_head has been
244 * freed and reallocated as slab or 275 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order 276 * hugetlbfs page of smaller order
246 * (only possible if reallocated as 277 * (only possible if reallocated as
247 * slab on x86). 278 * slab on x86).
248 */ 279 */
249 put_page(page_head); 280 return false;
250 return false;
251 }
252 } 281 }
282 }
253 283
284 got = false;
285 if (likely(page != page_head && get_page_unless_zero(page_head))) {
254 /* 286 /*
255 * page_head wasn't a dangling pointer but it 287 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time 288 * may not be a head page anymore by the time
@@ -572,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add);
572 */ 604 */
573void lru_cache_add(struct page *page) 605void lru_cache_add(struct page *page)
574{ 606{
575 VM_BUG_ON(PageActive(page) && PageUnevictable(page)); 607 VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
576 VM_BUG_ON(PageLRU(page)); 608 VM_BUG_ON_PAGE(PageLRU(page), page);
577 __lru_cache_add(page); 609 __lru_cache_add(page);
578} 610}
579 611
@@ -814,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold)
814 } 846 }
815 847
816 lruvec = mem_cgroup_page_lruvec(page, zone); 848 lruvec = mem_cgroup_page_lruvec(page, zone);
817 VM_BUG_ON(!PageLRU(page)); 849 VM_BUG_ON_PAGE(!PageLRU(page), page);
818 __ClearPageLRU(page); 850 __ClearPageLRU(page);
819 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 851 del_page_from_lru_list(page, lruvec, page_off_lru(page));
820 } 852 }
@@ -856,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
856{ 888{
857 const int file = 0; 889 const int file = 0;
858 890
859 VM_BUG_ON(!PageHead(page)); 891 VM_BUG_ON_PAGE(!PageHead(page), page);
860 VM_BUG_ON(PageCompound(page_tail)); 892 VM_BUG_ON_PAGE(PageCompound(page_tail), page);
861 VM_BUG_ON(PageLRU(page_tail)); 893 VM_BUG_ON_PAGE(PageLRU(page_tail), page);
862 VM_BUG_ON(NR_CPUS != 1 && 894 VM_BUG_ON(NR_CPUS != 1 &&
863 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); 895 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
864 896
@@ -897,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
897 int active = PageActive(page); 929 int active = PageActive(page);
898 enum lru_list lru = page_lru(page); 930 enum lru_list lru = page_lru(page);
899 931
900 VM_BUG_ON(PageLRU(page)); 932 VM_BUG_ON_PAGE(PageLRU(page), page);
901 933
902 SetPageLRU(page); 934 SetPageLRU(page);
903 add_page_to_lru_list(page, lruvec, lru); 935 add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2af..98e85e9c2b2d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,9 +83,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
83 int error; 83 int error;
84 struct address_space *address_space; 84 struct address_space *address_space;
85 85
86 VM_BUG_ON(!PageLocked(page)); 86 VM_BUG_ON_PAGE(!PageLocked(page), page);
87 VM_BUG_ON(PageSwapCache(page)); 87 VM_BUG_ON_PAGE(PageSwapCache(page), page);
88 VM_BUG_ON(!PageSwapBacked(page)); 88 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
89 89
90 page_cache_get(page); 90 page_cache_get(page);
91 SetPageSwapCache(page); 91 SetPageSwapCache(page);
@@ -139,9 +139,9 @@ void __delete_from_swap_cache(struct page *page)
139 swp_entry_t entry; 139 swp_entry_t entry;
140 struct address_space *address_space; 140 struct address_space *address_space;
141 141
142 VM_BUG_ON(!PageLocked(page)); 142 VM_BUG_ON_PAGE(!PageLocked(page), page);
143 VM_BUG_ON(!PageSwapCache(page)); 143 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
144 VM_BUG_ON(PageWriteback(page)); 144 VM_BUG_ON_PAGE(PageWriteback(page), page);
145 145
146 entry.val = page_private(page); 146 entry.val = page_private(page);
147 address_space = swap_address_space(entry); 147 address_space = swap_address_space(entry);
@@ -165,8 +165,8 @@ int add_to_swap(struct page *page, struct list_head *list)
165 swp_entry_t entry; 165 swp_entry_t entry;
166 int err; 166 int err;
167 167
168 VM_BUG_ON(!PageLocked(page)); 168 VM_BUG_ON_PAGE(!PageLocked(page), page);
169 VM_BUG_ON(!PageUptodate(page)); 169 VM_BUG_ON_PAGE(!PageUptodate(page), page);
170 170
171 entry = get_swap_page(); 171 entry = get_swap_page();
172 if (!entry.val) 172 if (!entry.val)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 612a7c9795f6..c6c13b050a58 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -616,7 +616,7 @@ scan:
616 } 616 }
617 } 617 }
618 offset = si->lowest_bit; 618 offset = si->lowest_bit;
619 while (++offset < scan_base) { 619 while (offset < scan_base) {
620 if (!si->swap_map[offset]) { 620 if (!si->swap_map[offset]) {
621 spin_lock(&si->lock); 621 spin_lock(&si->lock);
622 goto checks; 622 goto checks;
@@ -629,6 +629,7 @@ scan:
629 cond_resched(); 629 cond_resched();
630 latency_ration = LATENCY_LIMIT; 630 latency_ration = LATENCY_LIMIT;
631 } 631 }
632 offset++;
632 } 633 }
633 spin_lock(&si->lock); 634 spin_lock(&si->lock);
634 635
@@ -906,7 +907,7 @@ int reuse_swap_page(struct page *page)
906{ 907{
907 int count; 908 int count;
908 909
909 VM_BUG_ON(!PageLocked(page)); 910 VM_BUG_ON_PAGE(!PageLocked(page), page);
910 if (unlikely(PageKsm(page))) 911 if (unlikely(PageKsm(page)))
911 return 0; 912 return 0;
912 count = page_mapcount(page); 913 count = page_mapcount(page);
@@ -926,7 +927,7 @@ int reuse_swap_page(struct page *page)
926 */ 927 */
927int try_to_free_swap(struct page *page) 928int try_to_free_swap(struct page *page)
928{ 929{
929 VM_BUG_ON(!PageLocked(page)); 930 VM_BUG_ON_PAGE(!PageLocked(page), page);
930 931
931 if (!PageSwapCache(page)) 932 if (!PageSwapCache(page))
932 return 0; 933 return 0;
@@ -2714,7 +2715,7 @@ struct swap_info_struct *page_swap_info(struct page *page)
2714 */ 2715 */
2715struct address_space *__page_file_mapping(struct page *page) 2716struct address_space *__page_file_mapping(struct page *page)
2716{ 2717{
2717 VM_BUG_ON(!PageSwapCache(page)); 2718 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2718 return page_swap_info(page)->swap_file->f_mapping; 2719 return page_swap_info(page)->swap_file->f_mapping;
2719} 2720}
2720EXPORT_SYMBOL_GPL(__page_file_mapping); 2721EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -2722,7 +2723,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
2722pgoff_t __page_file_index(struct page *page) 2723pgoff_t __page_file_index(struct page *page)
2723{ 2724{
2724 swp_entry_t swap = { .val = page_private(page) }; 2725 swp_entry_t swap = { .val = page_private(page) };
2725 VM_BUG_ON(!PageSwapCache(page)); 2726 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2726 return swp_offset(swap); 2727 return swp_offset(swap);
2727} 2728}
2728EXPORT_SYMBOL_GPL(__page_file_index); 2729EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/util.c b/mm/util.c
index 808f375648e7..a24aa22f2473 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page)
404 return mapping; 404 return mapping;
405} 405}
406 406
407int overcommit_ratio_handler(struct ctl_table *table, int write,
408 void __user *buffer, size_t *lenp,
409 loff_t *ppos)
410{
411 int ret;
412
413 ret = proc_dointvec(table, write, buffer, lenp, ppos);
414 if (ret == 0 && write)
415 sysctl_overcommit_kbytes = 0;
416 return ret;
417}
418
419int overcommit_kbytes_handler(struct ctl_table *table, int write,
420 void __user *buffer, size_t *lenp,
421 loff_t *ppos)
422{
423 int ret;
424
425 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
426 if (ret == 0 && write)
427 sysctl_overcommit_ratio = 0;
428 return ret;
429}
430
407/* 431/*
408 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used 432 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
409 */ 433 */
410unsigned long vm_commit_limit(void) 434unsigned long vm_commit_limit(void)
411{ 435{
412 return ((totalram_pages - hugetlb_total_pages()) 436 unsigned long allowed;
413 * sysctl_overcommit_ratio / 100) + total_swap_pages; 437
438 if (sysctl_overcommit_kbytes)
439 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
440 else
441 allowed = ((totalram_pages - hugetlb_total_pages())
442 * sysctl_overcommit_ratio / 100);
443 allowed += total_swap_pages;
444
445 return allowed;
414} 446}
415 447
416 448
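With the new overcommit_kbytes knob, vm_commit_limit() prefers the absolute limit when it is non-zero and otherwise falls back to the percentage ratio; the two sysctl handlers above keep the knobs mutually exclusive by zeroing the other one on write. A standalone sketch of the resulting arithmetic (4K pages and the sample sizes below are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4K pages */

static unsigned long commit_limit(unsigned long total_ram_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long total_swap_pages,
                                  unsigned long overcommit_kbytes,
                                  int overcommit_ratio)
{
        unsigned long allowed;

        if (overcommit_kbytes)
                allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);  /* kB -> pages */
        else
                allowed = (total_ram_pages - hugetlb_pages)
                          * overcommit_ratio / 100;
        allowed += total_swap_pages;

        return allowed;
}

int main(void)
{
        /* 4 GiB of RAM, no hugetlb, 1 GiB of swap, all in 4K pages */
        unsigned long ram = 1048576, swap = 262144;

        printf("ratio=50%%              -> %lu pages\n",
               commit_limit(ram, 0, swap, 0, 50));
        printf("kbytes=3145728 (3 GiB) -> %lu pages\n",
               commit_limit(ram, 0, swap, 3145728, 50));
        return 0;
}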
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index e0f62837c3f4..196970a4541f 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
278 278
279/** 279/**
280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
281 * @css: css that is interested in vmpressure notifications 281 * @memcg: memcg that is interested in vmpressure notifications
282 * @cft: cgroup control files handle
283 * @eventfd: eventfd context to link notifications with 282 * @eventfd: eventfd context to link notifications with
284 * @args: event arguments (used to set up a pressure level threshold) 283 * @args: event arguments (used to set up a pressure level threshold)
285 * 284 *
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 288 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
290 * "critical"). 289 * "critical").
291 * 290 *
292 * This function should not be used directly, just pass it to (struct 291 * To be used as memcg event method.
293 * cftype).register_event, and then cgroup core will handle everything by
294 * itself.
295 */ 292 */
296int vmpressure_register_event(struct cgroup_subsys_state *css, 293int vmpressure_register_event(struct mem_cgroup *memcg,
297 struct cftype *cft, struct eventfd_ctx *eventfd, 294 struct eventfd_ctx *eventfd, const char *args)
298 const char *args)
299{ 295{
300 struct vmpressure *vmpr = css_to_vmpressure(css); 296 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
301 struct vmpressure_event *ev; 297 struct vmpressure_event *ev;
302 int level; 298 int level;
303 299
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
325 321
326/** 322/**
327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 323 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
328 * @css: css handle 324 * @memcg: memcg handle
329 * @cft: cgroup control files handle
330 * @eventfd: eventfd context that was used to link vmpressure with the @cg 325 * @eventfd: eventfd context that was used to link vmpressure with the @cg
331 * 326 *
332 * This function does internal manipulations to detach the @eventfd from 327 * This function does internal manipulations to detach the @eventfd from
333 * the vmpressure notifications, and then frees internal resources 328 * the vmpressure notifications, and then frees internal resources
334 * associated with the @eventfd (but the @eventfd itself is not freed). 329 * associated with the @eventfd (but the @eventfd itself is not freed).
335 * 330 *
336 * This function should not be used directly, just pass it to (struct 331 * To be used as memcg event method.
337 * cftype).unregister_event, and then cgroup core will handle everything
338 * by itself.
339 */ 332 */
340void vmpressure_unregister_event(struct cgroup_subsys_state *css, 333void vmpressure_unregister_event(struct mem_cgroup *memcg,
341 struct cftype *cft,
342 struct eventfd_ctx *eventfd) 334 struct eventfd_ctx *eventfd)
343{ 335{
344 struct vmpressure *vmpr = css_to_vmpressure(css); 336 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
345 struct vmpressure_event *ev; 337 struct vmpressure_event *ev;
346 338
347 mutex_lock(&vmpr->events_lock); 339 mutex_lock(&vmpr->events_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d9cff6..a9c74b409681 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
147} 147}
148#endif 148#endif
149 149
150unsigned long zone_reclaimable_pages(struct zone *zone) 150static unsigned long zone_reclaimable_pages(struct zone *zone)
151{ 151{
152 int nr; 152 int nr;
153 153
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
281 nr_pages_scanned, lru_pages, 281 nr_pages_scanned, lru_pages,
282 max_pass, delta, total_scan); 282 max_pass, delta, total_scan);
283 283
284 while (total_scan >= batch_size) { 284 /*
285 * Normally, we should not scan less than batch_size objects in one
286 * pass to avoid too frequent shrinker calls, but if the slab has less
287 * than batch_size objects in total and we are really tight on memory,
288 * we will try to reclaim all available objects, otherwise we can end
289 * up failing allocations although there are plenty of reclaimable
290 * objects spread over several slabs with usage less than the
291 * batch_size.
292 *
293 * We detect the "tight on memory" situations by looking at the total
294 * number of objects we want to scan (total_scan). If it is greater
295 * than the total number of objects on slab (max_pass), we must be
296 * scanning at high prio and therefore should try to reclaim as much as
297 * possible.
298 */
299 while (total_scan >= batch_size ||
300 total_scan >= max_pass) {
285 unsigned long ret; 301 unsigned long ret;
302 unsigned long nr_to_scan = min(batch_size, total_scan);
286 303
287 shrinkctl->nr_to_scan = batch_size; 304 shrinkctl->nr_to_scan = nr_to_scan;
288 ret = shrinker->scan_objects(shrinker, shrinkctl); 305 ret = shrinker->scan_objects(shrinker, shrinkctl);
289 if (ret == SHRINK_STOP) 306 if (ret == SHRINK_STOP)
290 break; 307 break;
291 freed += ret; 308 freed += ret;
292 309
293 count_vm_events(SLABS_SCANNED, batch_size); 310 count_vm_events(SLABS_SCANNED, nr_to_scan);
294 total_scan -= batch_size; 311 total_scan -= nr_to_scan;
295 312
296 cond_resched(); 313 cond_resched();
297 } 314 }
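The reworked loop still scans in batch_size chunks but also makes a final sub-batch pass when total_scan covers the whole (small) slab, so tiny caches are no longer skipped when memory is tight. A self-contained sketch of just that loop condition (the trivial scan_objects() stub is invented for the demo):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* pretend shrinker: frees whatever it is asked to scan */
static unsigned long scan_objects(unsigned long nr_to_scan)
{
        return nr_to_scan;
}

static unsigned long shrink_node(unsigned long total_scan,
                                 unsigned long max_pass,
                                 unsigned long batch_size)
{
        unsigned long freed = 0;

        if (!max_pass)                  /* nothing to reclaim on this node */
                return 0;

        while (total_scan >= batch_size || total_scan >= max_pass) {
                unsigned long nr_to_scan = MIN(batch_size, total_scan);

                freed += scan_objects(nr_to_scan);
                total_scan -= nr_to_scan;
        }
        return freed;
}

int main(void)
{
        /* a cache with only 60 objects and a batch size of 128: the old
         * "while (total_scan >= batch_size)" never called the shrinker,
         * the new condition still reclaims all 60 */
        printf("freed %lu objects\n", shrink_node(60, 60, 128));
        return 0;
}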
@@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
352 } 369 }
353 370
354 list_for_each_entry(shrinker, &shrinker_list, list) { 371 list_for_each_entry(shrinker, &shrinker_list, list) {
355 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 372 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
356 if (!node_online(shrinkctl->nid)) 373 shrinkctl->nid = 0;
357 continue;
358
359 if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
360 (shrinkctl->nid != 0))
361 break;
362
363 freed += shrink_slab_node(shrinkctl, shrinker, 374 freed += shrink_slab_node(shrinkctl, shrinker,
364 nr_pages_scanned, lru_pages); 375 nr_pages_scanned, lru_pages);
376 continue;
377 }
378
379 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
380 if (node_online(shrinkctl->nid))
381 freed += shrink_slab_node(shrinkctl, shrinker,
382 nr_pages_scanned, lru_pages);
365 383
366 } 384 }
367 } 385 }
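After this change, shrinkers without SHRINKER_NUMA_AWARE are invoked exactly once with nid 0, while NUMA-aware ones are invoked once per online node. A simplified standalone sketch of that dispatch (the shrinker_stub type, node array and names are made up):

#include <stdio.h>

#define SHRINKER_NUMA_AWARE 0x1
#define MAX_NODES 4

struct shrinker_stub {
        const char *name;
        unsigned int flags;
};

static unsigned long shrink_one(struct shrinker_stub *s, int nid)
{
        printf("  %s: scanning node %d\n", s->name, nid);
        return 1;
}

static unsigned long shrink_all(struct shrinker_stub *shrinkers, int n,
                                const int *node_online)
{
        unsigned long freed = 0;
        int i, nid;

        for (i = 0; i < n; i++) {
                struct shrinker_stub *s = &shrinkers[i];

                if (!(s->flags & SHRINKER_NUMA_AWARE)) {
                        freed += shrink_one(s, 0);
                        continue;
                }
                for (nid = 0; nid < MAX_NODES; nid++)
                        if (node_online[nid])
                                freed += shrink_one(s, nid);
        }
        return freed;
}

int main(void)
{
        struct shrinker_stub shrinkers[] = {
                { "numa-aware", SHRINKER_NUMA_AWARE },
                { "simple", 0 },
        };
        int online[MAX_NODES] = { 1, 1, 0, 1 };

        printf("freed %lu batches\n", shrink_all(shrinkers, 2, online));
        return 0;
}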
@@ -603,7 +621,7 @@ void putback_lru_page(struct page *page)
603 bool is_unevictable; 621 bool is_unevictable;
604 int was_unevictable = PageUnevictable(page); 622 int was_unevictable = PageUnevictable(page);
605 623
606 VM_BUG_ON(PageLRU(page)); 624 VM_BUG_ON_PAGE(PageLRU(page), page);
607 625
608redo: 626redo:
609 ClearPageUnevictable(page); 627 ClearPageUnevictable(page);
@@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
794 if (!trylock_page(page)) 812 if (!trylock_page(page))
795 goto keep; 813 goto keep;
796 814
797 VM_BUG_ON(PageActive(page)); 815 VM_BUG_ON_PAGE(PageActive(page), page);
798 VM_BUG_ON(page_zone(page) != zone); 816 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
799 817
800 sc->nr_scanned++; 818 sc->nr_scanned++;
801 819
@@ -1079,14 +1097,14 @@ activate_locked:
1079 /* Not a candidate for swapping, so reclaim swap space. */ 1097 /* Not a candidate for swapping, so reclaim swap space. */
1080 if (PageSwapCache(page) && vm_swap_full()) 1098 if (PageSwapCache(page) && vm_swap_full())
1081 try_to_free_swap(page); 1099 try_to_free_swap(page);
1082 VM_BUG_ON(PageActive(page)); 1100 VM_BUG_ON_PAGE(PageActive(page), page);
1083 SetPageActive(page); 1101 SetPageActive(page);
1084 pgactivate++; 1102 pgactivate++;
1085keep_locked: 1103keep_locked:
1086 unlock_page(page); 1104 unlock_page(page);
1087keep: 1105keep:
1088 list_add(&page->lru, &ret_pages); 1106 list_add(&page->lru, &ret_pages);
1089 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1107 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1090 } 1108 }
1091 1109
1092 free_hot_cold_page_list(&free_pages, 1); 1110 free_hot_cold_page_list(&free_pages, 1);
@@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1240 page = lru_to_page(src); 1258 page = lru_to_page(src);
1241 prefetchw_prev_lru_page(page, src, flags); 1259 prefetchw_prev_lru_page(page, src, flags);
1242 1260
1243 VM_BUG_ON(!PageLRU(page)); 1261 VM_BUG_ON_PAGE(!PageLRU(page), page);
1244 1262
1245 switch (__isolate_lru_page(page, mode)) { 1263 switch (__isolate_lru_page(page, mode)) {
1246 case 0: 1264 case 0:
@@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page)
1295{ 1313{
1296 int ret = -EBUSY; 1314 int ret = -EBUSY;
1297 1315
1298 VM_BUG_ON(!page_count(page)); 1316 VM_BUG_ON_PAGE(!page_count(page), page);
1299 1317
1300 if (PageLRU(page)) { 1318 if (PageLRU(page)) {
1301 struct zone *zone = page_zone(page); 1319 struct zone *zone = page_zone(page);
@@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1366 struct page *page = lru_to_page(page_list); 1384 struct page *page = lru_to_page(page_list);
1367 int lru; 1385 int lru;
1368 1386
1369 VM_BUG_ON(PageLRU(page)); 1387 VM_BUG_ON_PAGE(PageLRU(page), page);
1370 list_del(&page->lru); 1388 list_del(&page->lru);
1371 if (unlikely(!page_evictable(page))) { 1389 if (unlikely(!page_evictable(page))) {
1372 spin_unlock_irq(&zone->lru_lock); 1390 spin_unlock_irq(&zone->lru_lock);
@@ -1586,7 +1604,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
1586 page = lru_to_page(list); 1604 page = lru_to_page(list);
1587 lruvec = mem_cgroup_page_lruvec(page, zone); 1605 lruvec = mem_cgroup_page_lruvec(page, zone);
1588 1606
1589 VM_BUG_ON(PageLRU(page)); 1607 VM_BUG_ON_PAGE(PageLRU(page), page);
1590 SetPageLRU(page); 1608 SetPageLRU(page);
1591 1609
1592 nr_pages = hpage_nr_pages(page); 1610 nr_pages = hpage_nr_pages(page);
@@ -3297,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3297 wake_up_interruptible(&pgdat->kswapd_wait); 3315 wake_up_interruptible(&pgdat->kswapd_wait);
3298} 3316}
3299 3317
3300/*
3301 * The reclaimable count would be mostly accurate.
3302 * The less reclaimable pages may be
3303 * - mlocked pages, which will be moved to unevictable list when encountered
3304 * - mapped pages, which may require several travels to be reclaimed
3305 * - dirty pages, which is not "instantly" reclaimable
3306 */
3307unsigned long global_reclaimable_pages(void)
3308{
3309 int nr;
3310
3311 nr = global_page_state(NR_ACTIVE_FILE) +
3312 global_page_state(NR_INACTIVE_FILE);
3313
3314 if (get_nr_swap_pages() > 0)
3315 nr += global_page_state(NR_ACTIVE_ANON) +
3316 global_page_state(NR_INACTIVE_ANON);
3317
3318 return nr;
3319}
3320
3321#ifdef CONFIG_HIBERNATION 3318#ifdef CONFIG_HIBERNATION
3322/* 3319/*
3323 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3320 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3701,7 +3698,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3701 if (page_evictable(page)) { 3698 if (page_evictable(page)) {
3702 enum lru_list lru = page_lru_base_type(page); 3699 enum lru_list lru = page_lru_base_type(page);
3703 3700
3704 VM_BUG_ON(PageActive(page)); 3701 VM_BUG_ON_PAGE(PageActive(page), page);
3705 ClearPageUnevictable(page); 3702 ClearPageUnevictable(page);
3706 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); 3703 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3707 add_page_to_lru_list(page, lruvec, lru); 3704 add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 000000000000..c03ca5e9fe15
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,1106 @@
1/*
2 * zsmalloc memory allocator
3 *
4 * Copyright (C) 2011 Nitin Gupta
5 * Copyright (C) 2012, 2013 Minchan Kim
6 *
7 * This code is released using a dual license strategy: BSD/GPL
8 * You can choose the license that better fits your requirements.
9 *
10 * Released under the terms of 3-clause BSD License
11 * Released under the terms of GNU General Public License Version 2.0
12 */
13
14/*
15 * This allocator is designed for use with zram. Thus, the allocator is
16 * supposed to work well under low memory conditions. In particular, it
17 * never attempts higher order page allocation which is very likely to
18 * fail under memory pressure. On the other hand, if we just use single
19 * (0-order) pages, it would suffer from very high fragmentation --
20 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
21 * This was one of the major issues with its predecessor (xvmalloc).
22 *
23 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
24 * and links them together using various 'struct page' fields. These linked
25 * pages act as a single higher-order page i.e. an object can span 0-order
26 * page boundaries. The code refers to these linked pages as a single entity
27 * called zspage.
28 *
29 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
30 * since this satisfies the requirements of all its current users (in the
31 * worst case, page is incompressible and is thus stored "as-is" i.e. in
32 * uncompressed form). For allocation requests larger than this size, failure
33 * is returned (see zs_malloc).
34 *
35 * Additionally, zs_malloc() does not return a dereferenceable pointer.
36 * Instead, it returns an opaque handle (unsigned long) which encodes actual
37 * location of the allocated object. The reason for this indirection is that
38 * zsmalloc does not keep zspages permanently mapped since that would cause
39 * issues on 32-bit systems where the VA region for kernel space mappings
 40 * is very small. So, before using the allocated memory, the object has to
41 * be mapped using zs_map_object() to get a usable pointer and subsequently
42 * unmapped using zs_unmap_object().
43 *
44 * Following is how we use various fields and flags of underlying
45 * struct page(s) to form a zspage.
46 *
47 * Usage of struct page fields:
48 * page->first_page: points to the first component (0-order) page
49 * page->index (union with page->freelist): offset of the first object
50 * starting in this page. For the first page, this is
51 * always 0, so we use this field (aka freelist) to point
52 * to the first free object in zspage.
53 * page->lru: links together all component pages (except the first page)
54 * of a zspage
55 *
56 * For _first_ page only:
57 *
58 * page->private (union with page->first_page): refers to the
59 * component page after the first page
60 * page->freelist: points to the first free object in zspage.
61 * Free objects are linked together using in-place
62 * metadata.
63 * page->objects: maximum number of objects we can store in this
64 * zspage (class->zspage_order * PAGE_SIZE / class->size)
65 * page->lru: links together first pages of various zspages.
66 * Basically forming list of zspages in a fullness group.
67 * page->mapping: class index and fullness group of the zspage
68 *
69 * Usage of struct page flags:
70 * PG_private: identifies the first component page
71 * PG_private2: identifies the last component page
72 *
73 */
74
75#ifdef CONFIG_ZSMALLOC_DEBUG
76#define DEBUG
77#endif
78
79#include <linux/module.h>
80#include <linux/kernel.h>
81#include <linux/bitops.h>
82#include <linux/errno.h>
83#include <linux/highmem.h>
84#include <linux/string.h>
85#include <linux/slab.h>
86#include <asm/tlbflush.h>
87#include <asm/pgtable.h>
88#include <linux/cpumask.h>
89#include <linux/cpu.h>
90#include <linux/vmalloc.h>
91#include <linux/hardirq.h>
92#include <linux/spinlock.h>
93#include <linux/types.h>
94#include <linux/zsmalloc.h>
95
96/*
 97 * This must be a power of 2 and greater than or equal to sizeof(link_free).
98 * These two conditions ensure that any 'struct link_free' itself doesn't
99 * span more than 1 page which avoids complex case of mapping 2 pages simply
100 * to restore link_free pointer values.
101 */
102#define ZS_ALIGN 8
103
104/*
105 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
106 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
107 */
108#define ZS_MAX_ZSPAGE_ORDER 2
109#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
110
111/*
112 * Object location (<PFN>, <obj_idx>) is encoded as
 113 * a single (unsigned long) handle value.
114 *
115 * Note that object index <obj_idx> is relative to system
116 * page <PFN> it is stored in, so for each sub-page belonging
117 * to a zspage, obj_idx starts with 0.
118 *
119 * This is made more complicated by various memory models and PAE.
120 */
121
122#ifndef MAX_PHYSMEM_BITS
123#ifdef CONFIG_HIGHMEM64G
124#define MAX_PHYSMEM_BITS 36
125#else /* !CONFIG_HIGHMEM64G */
126/*
127 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
128 * be PAGE_SHIFT
129 */
130#define MAX_PHYSMEM_BITS BITS_PER_LONG
131#endif
132#endif
133#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
134#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
135#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
136
137#define MAX(a, b) ((a) >= (b) ? (a) : (b))
138 /* ZS_MIN_ALLOC_SIZE must be a multiple of ZS_ALIGN */
139#define ZS_MIN_ALLOC_SIZE \
140 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
141#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
142
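/*
 * Worked example (illustrative): with the BITS_PER_LONG fallback above on
 * a 64-bit system with 4K pages (PAGE_SHIFT == 12), _PFN_BITS == 52 and
 * OBJ_INDEX_BITS == 12, so a handle carries the PFN in its upper 52 bits
 * and the (adjusted) object index in its lower 12 bits, and
 * ZS_MIN_ALLOC_SIZE == MAX(32, (4 << 12) >> 12) == 32 bytes.
 */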
143/*
144 * On systems with 4K page size, this gives 254 size classes! There is a
145 * trade-off here:
146 * - A large number of size classes is potentially wasteful as free pages are
147 * spread across these classes
148 * - A small number of size classes causes large internal fragmentation
149 * - Probably it's better to use specific size classes (empirically
150 * determined). NOTE: all those class sizes must be set as a multiple of
151 * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
152 *
153 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiples of ZS_ALIGN
154 * (reason above)
155 */
156#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
157#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
158 ZS_SIZE_CLASS_DELTA + 1)
159
160/*
161 * We do not maintain any list for completely empty or full pages
162 */
163enum fullness_group {
164 ZS_ALMOST_FULL,
165 ZS_ALMOST_EMPTY,
166 _ZS_NR_FULLNESS_GROUPS,
167
168 ZS_EMPTY,
169 ZS_FULL
170};
171
172/*
173 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
174 * n <= N / f, where
175 * n = number of allocated objects
176 * N = total number of objects zspage can store
177 * f = 1/fullness_threshold_frac
178 *
179 * Similarly, we assign zspage to:
180 * ZS_ALMOST_FULL when n > N / f
181 * ZS_EMPTY when n == 0
182 * ZS_FULL when n == N
183 *
184 * (see: fix_fullness_group())
185 */
186static const int fullness_threshold_frac = 4;
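/*
 * Worked example (illustrative): for a zspage that can hold N == 128
 * objects with fullness_threshold_frac == 4, get_fullness_group() below
 * reports ZS_EMPTY at n == 0, ZS_ALMOST_EMPTY for 1 <= n <= 32,
 * ZS_ALMOST_FULL for 33 <= n <= 127 and ZS_FULL at n == 128.
 */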
187
188struct size_class {
189 /*
190 * Size of objects stored in this class. Must be a multiple
191 * of ZS_ALIGN.
192 */
193 int size;
194 unsigned int index;
195
196 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
197 int pages_per_zspage;
198
199 spinlock_t lock;
200
201 /* stats */
202 u64 pages_allocated;
203
204 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
205};
206
207/*
208 * Placed within free objects to form a singly linked list.
209 * For every zspage, first_page->freelist gives head of this list.
210 *
211 * This must be a power of 2 and less than or equal to ZS_ALIGN
212 */
213struct link_free {
214 /* Handle of next free chunk (encodes <PFN, obj_idx>) */
215 void *next;
216};
217
218struct zs_pool {
219 struct size_class size_class[ZS_SIZE_CLASSES];
220
221 gfp_t flags; /* allocation flags used when growing pool */
222};
223
224/*
225 * A zspage's class index and fullness group
226 * are encoded in its (first)page->mapping
227 */
228#define CLASS_IDX_BITS 28
229#define FULLNESS_BITS 4
230#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
231#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
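/*
 * Worked example (illustrative): for class_idx == 5 and fullness ==
 * ZS_ALMOST_EMPTY (1), set_zspage_mapping() below stores
 * (5 << FULLNESS_BITS) | 1 == 0x51 in first_page->mapping;
 * get_zspage_mapping() recovers both values by shifting and masking.
 */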
232
233struct mapping_area {
234#ifdef CONFIG_PGTABLE_MAPPING
235 struct vm_struct *vm; /* vm area for mapping objects that span pages */
236#else
237 char *vm_buf; /* copy buffer for objects that span pages */
238#endif
239 char *vm_addr; /* address of kmap_atomic()'ed pages */
240 enum zs_mapmode vm_mm; /* mapping mode */
241};
242
243
244/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
245static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
246
247static int is_first_page(struct page *page)
248{
249 return PagePrivate(page);
250}
251
252static int is_last_page(struct page *page)
253{
254 return PagePrivate2(page);
255}
256
257static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
258 enum fullness_group *fullness)
259{
260 unsigned long m;
261 BUG_ON(!is_first_page(page));
262
263 m = (unsigned long)page->mapping;
264 *fullness = m & FULLNESS_MASK;
265 *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
266}
267
268static void set_zspage_mapping(struct page *page, unsigned int class_idx,
269 enum fullness_group fullness)
270{
271 unsigned long m;
272 BUG_ON(!is_first_page(page));
273
274 m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
275 (fullness & FULLNESS_MASK);
276 page->mapping = (struct address_space *)m;
277}
278
279/*
280 * zsmalloc divides the pool into various size classes, where each
281 * class maintains a list of zspages and each zspage is divided
282 * into equal-sized chunks. Each allocation falls into one of these
283 * classes depending on its size. This function returns the index of the
284 * size class whose chunk size is big enough to hold the given size.
285 */
286static int get_size_class_index(int size)
287{
288 int idx = 0;
289
290 if (likely(size > ZS_MIN_ALLOC_SIZE))
291 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
292 ZS_SIZE_CLASS_DELTA);
293
294 return idx;
295}
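/*
 * Worked example (illustrative, assuming ZS_MIN_ALLOC_SIZE == 32 and
 * ZS_SIZE_CLASS_DELTA == 16, i.e. 4K pages): a request of 100 bytes gives
 * idx = DIV_ROUND_UP(100 - 32, 16) = 5, i.e. the class with chunk size
 * 32 + 5 * 16 = 112, the smallest class able to hold 100 bytes.
 */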
296
297/*
298 * For each size class, zspages are divided into different groups
299 * depending on how "full" they are. This was done so that we could
300 * easily find empty or nearly empty zspages when we try to shrink
301 * the pool (not yet implemented). This function returns the fullness
302 * status of the given page.
303 */
304static enum fullness_group get_fullness_group(struct page *page)
305{
306 int inuse, max_objects;
307 enum fullness_group fg;
308 BUG_ON(!is_first_page(page));
309
310 inuse = page->inuse;
311 max_objects = page->objects;
312
313 if (inuse == 0)
314 fg = ZS_EMPTY;
315 else if (inuse == max_objects)
316 fg = ZS_FULL;
317 else if (inuse <= max_objects / fullness_threshold_frac)
318 fg = ZS_ALMOST_EMPTY;
319 else
320 fg = ZS_ALMOST_FULL;
321
322 return fg;
323}
324
325/*
326 * Each size class maintains various freelists and zspages are assigned
327 * to one of these freelists based on the number of live objects they
328 * have. This function inserts the given zspage into the freelist
329 * identified by <class, fullness_group>.
330 */
331static void insert_zspage(struct page *page, struct size_class *class,
332 enum fullness_group fullness)
333{
334 struct page **head;
335
336 BUG_ON(!is_first_page(page));
337
338 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
339 return;
340
341 head = &class->fullness_list[fullness];
342 if (*head)
343 list_add_tail(&page->lru, &(*head)->lru);
344
345 *head = page;
346}
347
348/*
349 * This function removes the given zspage from the freelist identified
350 * by <class, fullness_group>.
351 */
352static void remove_zspage(struct page *page, struct size_class *class,
353 enum fullness_group fullness)
354{
355 struct page **head;
356
357 BUG_ON(!is_first_page(page));
358
359 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
360 return;
361
362 head = &class->fullness_list[fullness];
363 BUG_ON(!*head);
364 if (list_empty(&(*head)->lru))
365 *head = NULL;
366 else if (*head == page)
367 *head = (struct page *)list_entry((*head)->lru.next,
368 struct page, lru);
369
370 list_del_init(&page->lru);
371}
372
373/*
374 * Each size class maintains zspages in different fullness groups depending
375 * on the number of live objects they contain. When allocating or freeing
376 * objects, the fullness status of the page can change, say, from ALMOST_FULL
377 * to ALMOST_EMPTY when freeing an object. This function checks if such
378 * a status change has occurred for the given page and accordingly moves the
379 * page from the freelist of the old fullness group to that of the new
380 * fullness group.
381 */
382static enum fullness_group fix_fullness_group(struct zs_pool *pool,
383 struct page *page)
384{
385 int class_idx;
386 struct size_class *class;
387 enum fullness_group currfg, newfg;
388
389 BUG_ON(!is_first_page(page));
390
391 get_zspage_mapping(page, &class_idx, &currfg);
392 newfg = get_fullness_group(page);
393 if (newfg == currfg)
394 goto out;
395
396 class = &pool->size_class[class_idx];
397 remove_zspage(page, class, currfg);
398 insert_zspage(page, class, newfg);
399 set_zspage_mapping(page, class_idx, newfg);
400
401out:
402 return newfg;
403}
404
405/*
406 * We have to decide on how many pages to link together
407 * to form a zspage for each size class. This is important
408 * to reduce wastage due to unusable space left at the end of
409 * each zspage, which is given as:
410 * wastage = Zp % size_class
411 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
412 *
413 * For example, for a size class of 3/8 * PAGE_SIZE, we should
414 * link together 3 PAGE_SIZE sized pages to form a zspage
415 * since then we can perfectly fit 8 such objects.
416 */
417static int get_pages_per_zspage(int class_size)
418{
419 int i, max_usedpc = 0;
420 /* zspage order which gives maximum used size per KB */
421 int max_usedpc_order = 1;
422
423 for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
424 int zspage_size;
425 int waste, usedpc;
426
427 zspage_size = i * PAGE_SIZE;
428 waste = zspage_size % class_size;
429 usedpc = (zspage_size - waste) * 100 / zspage_size;
430
431 if (usedpc > max_usedpc) {
432 max_usedpc = usedpc;
433 max_usedpc_order = i;
434 }
435 }
436
437 return max_usedpc_order;
438}
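/*
 * Worked example (illustrative, assuming 4K pages): for
 * class_size == 3/8 * PAGE_SIZE == 1536, the loop above computes
 *   i == 1: waste == 4096 % 1536 == 1024, usedpc == 75
 *   i == 2: waste ==  512, usedpc == 93
 *   i == 3: waste ==    0, usedpc == 100
 *   i == 4: waste == 1024, usedpc == 93
 * so 3 pages per zspage are chosen, matching the 3/8 * PAGE_SIZE example
 * in the comment above.
 */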
439
440/*
441 * A single 'zspage' is composed of many system pages which are
442 * linked together using fields in struct page. This function finds
443 * the first/head page, given any component page of a zspage.
444 */
445static struct page *get_first_page(struct page *page)
446{
447 if (is_first_page(page))
448 return page;
449 else
450 return page->first_page;
451}
452
453static struct page *get_next_page(struct page *page)
454{
455 struct page *next;
456
457 if (is_last_page(page))
458 next = NULL;
459 else if (is_first_page(page))
460 next = (struct page *)page_private(page);
461 else
462 next = list_entry(page->lru.next, struct page, lru);
463
464 return next;
465}
466
467/*
468 * Encode <page, obj_idx> as a single handle value.
469 * On hardware platforms with physical memory starting at 0x0 the pfn
470 * could be 0, so we ensure that the handle will never be 0 by adjusting the
471 * obj_idx value before encoding.
472 */
473static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
474{
475 unsigned long handle;
476
477 if (!page) {
478 BUG_ON(obj_idx);
479 return NULL;
480 }
481
482 handle = page_to_pfn(page) << OBJ_INDEX_BITS;
483 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
484
485 return (void *)handle;
486}
487
488/*
489 * Decode <page, obj_idx> pair from the given object handle. We adjust the
490 * decoded obj_idx back to its original value since it was adjusted in
491 * obj_location_to_handle().
492 */
493static void obj_handle_to_location(unsigned long handle, struct page **page,
494 unsigned long *obj_idx)
495{
496 *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
497 *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
498}
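/*
 * Illustrative round trip (assuming OBJ_INDEX_BITS == 12): for a page with
 * PFN 0x1000 and obj_idx 3, obj_location_to_handle() produces
 * (0x1000 << 12) | (3 + 1) == 0x1000004, and obj_handle_to_location()
 * recovers PFN 0x1000 and obj_idx 3 by undoing the +1 adjustment. The +1
 * is what keeps the handle non-zero even when the PFN is 0.
 */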
499
500static unsigned long obj_idx_to_offset(struct page *page,
501 unsigned long obj_idx, int class_size)
502{
503 unsigned long off = 0;
504
505 if (!is_first_page(page))
506 off = page->index;
507
508 return off + obj_idx * class_size;
509}
510
511static void reset_page(struct page *page)
512{
513 clear_bit(PG_private, &page->flags);
514 clear_bit(PG_private_2, &page->flags);
515 set_page_private(page, 0);
516 page->mapping = NULL;
517 page->freelist = NULL;
518 page_mapcount_reset(page);
519}
520
521static void free_zspage(struct page *first_page)
522{
523 struct page *nextp, *tmp, *head_extra;
524
525 BUG_ON(!is_first_page(first_page));
526 BUG_ON(first_page->inuse);
527
528 head_extra = (struct page *)page_private(first_page);
529
530 reset_page(first_page);
531 __free_page(first_page);
532
533 /* zspage with only 1 system page */
534 if (!head_extra)
535 return;
536
537 list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
538 list_del(&nextp->lru);
539 reset_page(nextp);
540 __free_page(nextp);
541 }
542 reset_page(head_extra);
543 __free_page(head_extra);
544}
545
546/* Initialize a newly allocated zspage */
547static void init_zspage(struct page *first_page, struct size_class *class)
548{
549 unsigned long off = 0;
550 struct page *page = first_page;
551
552 BUG_ON(!is_first_page(first_page));
553 while (page) {
554 struct page *next_page;
555 struct link_free *link;
556 unsigned int i, objs_on_page;
557
558 /*
559 * page->index stores offset of first object starting
560 * in the page. For the first page, this is always 0,
561 * so we use first_page->index (aka ->freelist) to store
562 * head of corresponding zspage's freelist.
563 */
564 if (page != first_page)
565 page->index = off;
566
567 link = (struct link_free *)kmap_atomic(page) +
568 off / sizeof(*link);
569 objs_on_page = (PAGE_SIZE - off) / class->size;
570
571 for (i = 1; i <= objs_on_page; i++) {
572 off += class->size;
573 if (off < PAGE_SIZE) {
574 link->next = obj_location_to_handle(page, i);
575 link += class->size / sizeof(*link);
576 }
577 }
578
579 /*
580 * We now come to the last (full or partial) object on this
581 * page, which must point to the first object on the next
582 * page (if present)
583 */
584 next_page = get_next_page(page);
585 link->next = obj_location_to_handle(next_page, 0);
586 kunmap_atomic(link);
587 page = next_page;
588 off = (off + class->size) % PAGE_SIZE;
589 }
590}
591
592/*
593 * Allocate a zspage for the given size class
594 */
595static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
596{
597 int i, error;
598 struct page *first_page = NULL, *uninitialized_var(prev_page);
599
600 /*
601 * Allocate individual pages and link them together as:
602 * 1. first page->private = first sub-page
603 * 2. all sub-pages are linked together using page->lru
604 * 3. each sub-page is linked to the first page using page->first_page
605 *
606 * For each size class, First/Head pages are linked together using
607 * page->lru. Also, we set PG_private to identify the first page
608 * (i.e. no other sub-page has this flag set) and PG_private_2 to
609 * identify the last page.
610 */
611 error = -ENOMEM;
612 for (i = 0; i < class->pages_per_zspage; i++) {
613 struct page *page;
614
615 page = alloc_page(flags);
616 if (!page)
617 goto cleanup;
618
619 INIT_LIST_HEAD(&page->lru);
620 if (i == 0) { /* first page */
621 SetPagePrivate(page);
622 set_page_private(page, 0);
623 first_page = page;
624 first_page->inuse = 0;
625 }
626 if (i == 1)
627 set_page_private(first_page, (unsigned long)page);
628 if (i >= 1)
629 page->first_page = first_page;
630 if (i >= 2)
631 list_add(&page->lru, &prev_page->lru);
632 if (i == class->pages_per_zspage - 1) /* last page */
633 SetPagePrivate2(page);
634 prev_page = page;
635 }
636
637 init_zspage(first_page, class);
638
639 first_page->freelist = obj_location_to_handle(first_page, 0);
640 /* Maximum number of objects we can store in this zspage */
641 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
642
643 error = 0; /* Success */
644
645cleanup:
646 if (unlikely(error) && first_page) {
647 free_zspage(first_page);
648 first_page = NULL;
649 }
650
651 return first_page;
652}
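/*
 * Illustrative layout (not part of the original file) of a zspage built by
 * alloc_zspage() above with pages_per_zspage == 3:
 *
 *   page0: PG_private set, page_private(page0) == page1,
 *          ->freelist == head of the zspage's free list
 *   page1: ->first_page == page0, ->index == offset of its first object,
 *          chained to page2 through ->lru
 *   page2: PG_private_2 set, ->first_page == page0, ->index as for page1
 */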
653
654static struct page *find_get_zspage(struct size_class *class)
655{
656 int i;
657 struct page *page;
658
659 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
660 page = class->fullness_list[i];
661 if (page)
662 break;
663 }
664
665 return page;
666}
667
668#ifdef CONFIG_PGTABLE_MAPPING
669static inline int __zs_cpu_up(struct mapping_area *area)
670{
671 /*
672 * Make sure we don't leak memory if a cpu UP notification
673 * and zs_init() race and both call zs_cpu_up() on the same cpu
674 */
675 if (area->vm)
676 return 0;
677 area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
678 if (!area->vm)
679 return -ENOMEM;
680 return 0;
681}
682
683static inline void __zs_cpu_down(struct mapping_area *area)
684{
685 if (area->vm)
686 free_vm_area(area->vm);
687 area->vm = NULL;
688}
689
690static inline void *__zs_map_object(struct mapping_area *area,
691 struct page *pages[2], int off, int size)
692{
693 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
694 area->vm_addr = area->vm->addr;
695 return area->vm_addr + off;
696}
697
698static inline void __zs_unmap_object(struct mapping_area *area,
699 struct page *pages[2], int off, int size)
700{
701 unsigned long addr = (unsigned long)area->vm_addr;
702
703 unmap_kernel_range(addr, PAGE_SIZE * 2);
704}
705
706#else /* CONFIG_PGTABLE_MAPPING */
707
708static inline int __zs_cpu_up(struct mapping_area *area)
709{
710 /*
711 * Make sure we don't leak memory if a cpu UP notification
712 * and zs_init() race and both call zs_cpu_up() on the same cpu
713 */
714 if (area->vm_buf)
715 return 0;
716 area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
717 if (!area->vm_buf)
718 return -ENOMEM;
719 return 0;
720}
721
722static inline void __zs_cpu_down(struct mapping_area *area)
723{
724 if (area->vm_buf)
725 free_page((unsigned long)area->vm_buf);
726 area->vm_buf = NULL;
727}
728
729static void *__zs_map_object(struct mapping_area *area,
730 struct page *pages[2], int off, int size)
731{
732 int sizes[2];
733 void *addr;
734 char *buf = area->vm_buf;
735
736 /* disable page faults to match kmap_atomic() return conditions */
737 pagefault_disable();
738
739 /* no read fastpath */
740 if (area->vm_mm == ZS_MM_WO)
741 goto out;
742
743 sizes[0] = PAGE_SIZE - off;
744 sizes[1] = size - sizes[0];
745
746 /* copy object to per-cpu buffer */
747 addr = kmap_atomic(pages[0]);
748 memcpy(buf, addr + off, sizes[0]);
749 kunmap_atomic(addr);
750 addr = kmap_atomic(pages[1]);
751 memcpy(buf + sizes[0], addr, sizes[1]);
752 kunmap_atomic(addr);
753out:
754 return area->vm_buf;
755}
756
757static void __zs_unmap_object(struct mapping_area *area,
758 struct page *pages[2], int off, int size)
759{
760 int sizes[2];
761 void *addr;
762 char *buf = area->vm_buf;
763
764 /* no write fastpath */
765 if (area->vm_mm == ZS_MM_RO)
766 goto out;
767
768 sizes[0] = PAGE_SIZE - off;
769 sizes[1] = size - sizes[0];
770
771 /* copy per-cpu buffer to object */
772 addr = kmap_atomic(pages[0]);
773 memcpy(addr + off, buf, sizes[0]);
774 kunmap_atomic(addr);
775 addr = kmap_atomic(pages[1]);
776 memcpy(addr, buf + sizes[0], sizes[1]);
777 kunmap_atomic(addr);
778
779out:
780 /* enable page faults to match kunmap_atomic() return conditions */
781 pagefault_enable();
782}
783
784#endif /* CONFIG_PGTABLE_MAPPING */
785
786static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
787 void *pcpu)
788{
789 int ret, cpu = (long)pcpu;
790 struct mapping_area *area;
791
792 switch (action) {
793 case CPU_UP_PREPARE:
794 area = &per_cpu(zs_map_area, cpu);
795 ret = __zs_cpu_up(area);
796 if (ret)
797 return notifier_from_errno(ret);
798 break;
799 case CPU_DEAD:
800 case CPU_UP_CANCELED:
801 area = &per_cpu(zs_map_area, cpu);
802 __zs_cpu_down(area);
803 break;
804 }
805
806 return NOTIFY_OK;
807}
808
809static struct notifier_block zs_cpu_nb = {
810 .notifier_call = zs_cpu_notifier
811};
812
813static void zs_exit(void)
814{
815 int cpu;
816
817 for_each_online_cpu(cpu)
818 zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
819 unregister_cpu_notifier(&zs_cpu_nb);
820}
821
822static int zs_init(void)
823{
824 int cpu, ret;
825
826 register_cpu_notifier(&zs_cpu_nb);
827 for_each_online_cpu(cpu) {
828 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
829 if (notifier_to_errno(ret))
830 goto fail;
831 }
832 return 0;
833fail:
834 zs_exit();
835 return notifier_to_errno(ret);
836}
837
838/**
839 * zs_create_pool - Creates an allocation pool to work from.
840 * @flags: allocation flags used to allocate pool metadata
841 *
842 * This function must be called before anything else when using
843 * the zsmalloc allocator.
844 *
845 * On success, a pointer to the newly created pool is returned,
846 * otherwise NULL.
847 */
848struct zs_pool *zs_create_pool(gfp_t flags)
849{
850 int i, ovhd_size;
851 struct zs_pool *pool;
852
853 ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
854 pool = kzalloc(ovhd_size, GFP_KERNEL);
855 if (!pool)
856 return NULL;
857
858 for (i = 0; i < ZS_SIZE_CLASSES; i++) {
859 int size;
860 struct size_class *class;
861
862 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
863 if (size > ZS_MAX_ALLOC_SIZE)
864 size = ZS_MAX_ALLOC_SIZE;
865
866 class = &pool->size_class[i];
867 class->size = size;
868 class->index = i;
869 spin_lock_init(&class->lock);
870 class->pages_per_zspage = get_pages_per_zspage(size);
871
872 }
873
874 pool->flags = flags;
875
876 return pool;
877}
878EXPORT_SYMBOL_GPL(zs_create_pool);
879
880void zs_destroy_pool(struct zs_pool *pool)
881{
882 int i;
883
884 for (i = 0; i < ZS_SIZE_CLASSES; i++) {
885 int fg;
886 struct size_class *class = &pool->size_class[i];
887
888 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
889 if (class->fullness_list[fg]) {
890 pr_info("Freeing non-empty class with size %db, fullness group %d\n",
891 class->size, fg);
892 }
893 }
894 }
895 kfree(pool);
896}
897EXPORT_SYMBOL_GPL(zs_destroy_pool);
898
899/**
900 * zs_malloc - Allocate block of given size from pool.
901 * @pool: pool to allocate from
902 * @size: size of block to allocate
903 *
904 * On success, a handle to the allocated object is returned,
905 * otherwise 0.
906 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
907 */
908unsigned long zs_malloc(struct zs_pool *pool, size_t size)
909{
910 unsigned long obj;
911 struct link_free *link;
912 int class_idx;
913 struct size_class *class;
914
915 struct page *first_page, *m_page;
916 unsigned long m_objidx, m_offset;
917
918 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
919 return 0;
920
921 class_idx = get_size_class_index(size);
922 class = &pool->size_class[class_idx];
923 BUG_ON(class_idx != class->index);
924
925 spin_lock(&class->lock);
926 first_page = find_get_zspage(class);
927
928 if (!first_page) {
929 spin_unlock(&class->lock);
930 first_page = alloc_zspage(class, pool->flags);
931 if (unlikely(!first_page))
932 return 0;
933
934 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
935 spin_lock(&class->lock);
936 class->pages_allocated += class->pages_per_zspage;
937 }
938
939 obj = (unsigned long)first_page->freelist;
940 obj_handle_to_location(obj, &m_page, &m_objidx);
941 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
942
943 link = (struct link_free *)kmap_atomic(m_page) +
944 m_offset / sizeof(*link);
945 first_page->freelist = link->next;
946 memset(link, POISON_INUSE, sizeof(*link));
947 kunmap_atomic(link);
948
949 first_page->inuse++;
950 /* Now move the zspage to another fullness group, if required */
951 fix_fullness_group(pool, first_page);
952 spin_unlock(&class->lock);
953
954 return obj;
955}
956EXPORT_SYMBOL_GPL(zs_malloc);
957
958void zs_free(struct zs_pool *pool, unsigned long obj)
959{
960 struct link_free *link;
961 struct page *first_page, *f_page;
962 unsigned long f_objidx, f_offset;
963
964 int class_idx;
965 struct size_class *class;
966 enum fullness_group fullness;
967
968 if (unlikely(!obj))
969 return;
970
971 obj_handle_to_location(obj, &f_page, &f_objidx);
972 first_page = get_first_page(f_page);
973
974 get_zspage_mapping(first_page, &class_idx, &fullness);
975 class = &pool->size_class[class_idx];
976 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
977
978 spin_lock(&class->lock);
979
980 /* Insert this object in containing zspage's freelist */
981 link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
982 + f_offset);
983 link->next = first_page->freelist;
984 kunmap_atomic(link);
985 first_page->freelist = (void *)obj;
986
987 first_page->inuse--;
988 fullness = fix_fullness_group(pool, first_page);
989
990 if (fullness == ZS_EMPTY)
991 class->pages_allocated -= class->pages_per_zspage;
992
993 spin_unlock(&class->lock);
994
995 if (fullness == ZS_EMPTY)
996 free_zspage(first_page);
997}
998EXPORT_SYMBOL_GPL(zs_free);
999
1000/**
1001 * zs_map_object - get address of allocated object from handle.
1002 * @pool: pool from which the object was allocated
1003 * @handle: handle returned from zs_malloc
1004 *
1005 * Before using an object allocated from zs_malloc, it must be mapped using
1006 * this function. When done with the object, it must be unmapped using
1007 * zs_unmap_object.
1008 *
1009 * Only one object can be mapped per cpu at a time. There is no protection
1010 * against nested mappings.
1011 *
1012 * This function returns with preemption and page faults disabled.
1013 */
1014void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1015 enum zs_mapmode mm)
1016{
1017 struct page *page;
1018 unsigned long obj_idx, off;
1019
1020 unsigned int class_idx;
1021 enum fullness_group fg;
1022 struct size_class *class;
1023 struct mapping_area *area;
1024 struct page *pages[2];
1025
1026 BUG_ON(!handle);
1027
1028 /*
1029 * Because we use per-cpu mapping areas shared among the
1030 * pools/users, we can't allow mapping in interrupt context
1031 * because it can corrupt another user's mappings.
1032 */
1033 BUG_ON(in_interrupt());
1034
1035 obj_handle_to_location(handle, &page, &obj_idx);
1036 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1037 class = &pool->size_class[class_idx];
1038 off = obj_idx_to_offset(page, obj_idx, class->size);
1039
1040 area = &get_cpu_var(zs_map_area);
1041 area->vm_mm = mm;
1042 if (off + class->size <= PAGE_SIZE) {
1043 /* this object is contained entirely within a page */
1044 area->vm_addr = kmap_atomic(page);
1045 return area->vm_addr + off;
1046 }
1047
1048 /* this object spans two pages */
1049 pages[0] = page;
1050 pages[1] = get_next_page(page);
1051 BUG_ON(!pages[1]);
1052
1053 return __zs_map_object(area, pages, off, class->size);
1054}
1055EXPORT_SYMBOL_GPL(zs_map_object);
1056
1057void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1058{
1059 struct page *page;
1060 unsigned long obj_idx, off;
1061
1062 unsigned int class_idx;
1063 enum fullness_group fg;
1064 struct size_class *class;
1065 struct mapping_area *area;
1066
1067 BUG_ON(!handle);
1068
1069 obj_handle_to_location(handle, &page, &obj_idx);
1070 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1071 class = &pool->size_class[class_idx];
1072 off = obj_idx_to_offset(page, obj_idx, class->size);
1073
1074 area = &__get_cpu_var(zs_map_area);
1075 if (off + class->size <= PAGE_SIZE)
1076 kunmap_atomic(area->vm_addr);
1077 else {
1078 struct page *pages[2];
1079
1080 pages[0] = page;
1081 pages[1] = get_next_page(page);
1082 BUG_ON(!pages[1]);
1083
1084 __zs_unmap_object(area, pages, off, class->size);
1085 }
1086 put_cpu_var(zs_map_area);
1087}
1088EXPORT_SYMBOL_GPL(zs_unmap_object);
1089
1090u64 zs_get_total_size_bytes(struct zs_pool *pool)
1091{
1092 int i;
1093 u64 npages = 0;
1094
1095 for (i = 0; i < ZS_SIZE_CLASSES; i++)
1096 npages += pool->size_class[i].pages_allocated;
1097
1098 return npages << PAGE_SHIFT;
1099}
1100EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
1101
1102module_init(zs_init);
1103module_exit(zs_exit);
1104
1105MODULE_LICENSE("Dual BSD/GPL");
1106MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/mm/zswap.c b/mm/zswap.c
index 5a63f78a5601..e55bab9dc41f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry;
77**********************************/ 77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */ 78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly; 79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0); 80module_param_named(enabled, zswap_enabled, bool, 0444);
81 81
82/* Compressor to be used by zswap (fixed at boot for now) */ 82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo" 83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0); 85module_param_named(compressor, zswap_compressor, charp, 0444);
86 86
87/* The maximum percentage of memory that the compressed pool can occupy */ 87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20; 88static unsigned int zswap_max_pool_percent = 20;