author     Linus Torvalds <torvalds@linux-foundation.org>  2016-03-16 14:51:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-16 14:51:08 -0400
commit     271ecc5253e2b317d729d366560789cd7f93836c (patch)
tree       d3a60bc4dfa8245ff934f357f2367db76b59e7cf /mm
parent     aa6865d836418eb2ba888a4cb1318a28e9aa2e0c (diff)
parent     63c06227a22b098a3849c5c99e836aea161ca0d7 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton:

 - some misc things
 - ocfs2 updates
 - about half of MM
 - checkpatch updates
 - autofs4 update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
  autofs4: fix string.h include in auto_dev-ioctl.h
  autofs4: use pr_xxx() macros directly for logging
  autofs4: change log print macros to not insert newline
  autofs4: make autofs log prints consistent
  autofs4: fix some white space errors
  autofs4: fix invalid ioctl return in autofs4_root_ioctl_unlocked()
  autofs4: fix coding style line length in autofs4_wait()
  autofs4: fix coding style problem in autofs4_get_set_timeout()
  autofs4: coding style fixes
  autofs: show pipe inode in mount options
  kallsyms: add support for relative offsets in kallsyms address table
  kallsyms: don't overload absolute symbol type for percpu symbols
  x86: kallsyms: disable absolute percpu symbols on !SMP
  checkpatch: fix another left brace warning
  checkpatch: improve UNSPECIFIED_INT test for bare signed/unsigned uses
  checkpatch: warn on bare unsigned or signed declarations without int
  checkpatch: exclude asm volatile from complex macro check
  mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate()
  mm: migrate: consolidate mem_cgroup_migrate() calls
  mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug | 57
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/compaction.c | 93
-rw-r--r--  mm/debug.c | 165
-rw-r--r--  mm/failslab.c | 12
-rw-r--r--  mm/filemap.c | 113
-rw-r--r--  mm/huge_memory.c | 20
-rw-r--r--  mm/internal.h | 18
-rw-r--r--  mm/kmemcheck.c | 3
-rw-r--r--  mm/madvise.c | 19
-rw-r--r--  mm/memblock.c | 8
-rw-r--r--  mm/memcontrol.c | 92
-rw-r--r--  mm/memory-failure.c | 2
-rw-r--r--  mm/memory.c | 7
-rw-r--r--  mm/memory_hotplug.c | 30
-rw-r--r--  mm/mempolicy.c | 4
-rw-r--r--  mm/migrate.c | 23
-rw-r--r--  mm/oom_kill.c | 7
-rw-r--r--  mm/page-writeback.c | 62
-rw-r--r--  mm/page_alloc.c | 295
-rw-r--r--  mm/page_ext.c | 10
-rw-r--r--  mm/page_owner.c | 100
-rw-r--r--  mm/page_poison.c (renamed from mm/debug-pagealloc.c) | 67
-rw-r--r--  mm/rmap.c | 16
-rw-r--r--  mm/shmem.c | 2
-rw-r--r--  mm/slab.c | 1037
-rw-r--r--  mm/slab.h | 69
-rw-r--r--  mm/slab_common.c | 8
-rw-r--r--  mm/slub.c | 325
-rw-r--r--  mm/truncate.c | 6
-rw-r--r--  mm/vmscan.c | 47
-rw-r--r--  mm/vmstat.c | 15
-rw-r--r--  mm/workingset.c | 160
33 files changed, 1707 insertions, 1187 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..5c50b238b770 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
 	  Unmap pages from the kernel linear mapping after free_pages().
-	  This results in a large slowdown, but helps to find certain types
-	  of memory corruption.
+	  Depending on runtime enablement, this results in a small or large
+	  slowdown, but helps to find certain types of memory corruption.
 
 	  For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
 	  fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC
 	  that would result in incorrect warnings of memory corruption after
 	  a resume because free pages are not saved to the suspend image.
 
+	  By default this option will have a small overhead, e.g. by not
+	  allowing the kernel mapping to be backed by large pages on some
+	  architectures. Even bigger overhead comes when the debugging is
+	  enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+	  command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+	bool "Enable debug page memory allocations by default?"
+	default n
+	depends on DEBUG_PAGEALLOC
+	---help---
+	  Enable debug page memory allocations by default? This value
+	  can be overridden by debug_pagealloc=off|on.
+
 config PAGE_POISONING
-	bool
+	bool "Poison pages after freeing"
+	select PAGE_EXTENSION
+	select PAGE_POISONING_NO_SANITY if HIBERNATION
+	---help---
+	  Fill the pages with poison patterns after free_pages() and verify
+	  the patterns before alloc_pages. The filling of the memory helps
+	  reduce the risk of information leaks from freed data. This does
+	  have a potential performance impact.
+
+	  Note that "poison" here is not the same thing as the "HWPoison"
+	  for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+	  If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+	depends on PAGE_POISONING
+	bool "Only poison, don't sanity check"
+	---help---
+	  Skip the sanity checking on alloc, only fill the pages with
+	  poison on free. This reduces some of the overhead of the
+	  poisoning feature.
+
+	  If you are only interested in sanitization, say Y. Otherwise
+	  say N.
+
+config PAGE_POISONING_ZERO
+	bool "Use zero for poisoning instead of random data"
+	depends on PAGE_POISONING
+	---help---
+	  Instead of using the existing poison value, fill the pages with
+	  zeros. This makes it harder to detect when errors are occurring
+	  due to sanitization but the zeroing at free means that it is
+	  no longer necessary to write zeros when GFP_ZERO is used on
+	  allocation.
+
+	  Enabling page poisoning with this option will disable hibernation
+
+	  If unsure, say N
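
For orientation only, here is a minimal userspace C sketch of the poison-on-free / verify-on-alloc idea that the PAGE_POISONING options above control. The names and the poison value are illustrative; this is not kernel code from the patch.

/* Hedged sketch: fill a freed page with a poison pattern, verify it on
 * reallocation so any stray write to freed memory is detected. */
#include <string.h>

#define DEMO_POISON	0xaa
#define DEMO_PAGE_SIZE	4096

static void demo_poison(unsigned char *page)
{
	memset(page, DEMO_POISON, DEMO_PAGE_SIZE);	/* on free */
}

static int demo_check(const unsigned char *page)
{
	size_t i;

	for (i = 0; i < DEMO_PAGE_SIZE; i++)		/* on alloc */
		if (page[i] != DEMO_POISON)
			return 1;			/* corruption found */
	return 0;
}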
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..cfdd481d27a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..93f71d968098 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
-				unsigned long end_pfn, struct zone *zone)
-{
-	struct page *start_page;
-	struct page *end_page;
-
-	/* end_pfn is one past the range we are checking */
-	end_pfn--;
-
-	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
-		return NULL;
-
-	start_page = pfn_to_page(start_pfn);
-
-	if (page_zone(start_page) != zone)
-		return NULL;
-
-	end_page = pfn_to_page(end_pfn);
-
-	/* This gives a shorter code than deriving page_zone(end_page) */
-	if (page_zone_id(start_page) != page_zone_id(end_page))
-		return NULL;
-
-	return start_page;
-}
-
 #ifdef CONFIG_COMPACTION
 
 /* Do not skip compaction more than 64 times */
@@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone)
 {
 	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
 	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
-	zone->compact_cached_free_pfn = zone_end_pfn(zone);
+	zone->compact_cached_free_pfn =
+			round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
 }
 
 /*
@@ -554,13 +512,17 @@ unsigned long
 isolate_freepages_range(struct compact_control *cc,
 			unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long isolated, pfn, block_end_pfn;
+	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
 	LIST_HEAD(freelist);
 
 	pfn = start_pfn;
+	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	if (block_start_pfn < cc->zone->zone_start_pfn)
+		block_start_pfn = cc->zone->zone_start_pfn;
 	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 
 	for (; pfn < end_pfn; pfn += isolated,
+				block_start_pfn = block_end_pfn,
 				block_end_pfn += pageblock_nr_pages) {
 		/* Protect pfn from changing by isolate_freepages_block */
 		unsigned long isolate_start_pfn = pfn;
@@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc,
 		 * scanning range to right one.
 		 */
 		if (pfn >= block_end_pfn) {
+			block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
 			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 			block_end_pfn = min(block_end_pfn, end_pfn);
 		}
 
-		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+		if (!pageblock_pfn_to_page(block_start_pfn,
+					block_end_pfn, cc->zone))
 			break;
 
 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +827,23 @@ unsigned long
 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 							unsigned long end_pfn)
 {
-	unsigned long pfn, block_end_pfn;
+	unsigned long pfn, block_start_pfn, block_end_pfn;
 
 	/* Scan block by block. First and last block may be incomplete */
 	pfn = start_pfn;
+	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	if (block_start_pfn < cc->zone->zone_start_pfn)
+		block_start_pfn = cc->zone->zone_start_pfn;
 	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 
 	for (; pfn < end_pfn; pfn = block_end_pfn,
+				block_start_pfn = block_end_pfn,
 				block_end_pfn += pageblock_nr_pages) {
 
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+		if (!pageblock_pfn_to_page(block_start_pfn,
+						block_end_pfn, cc->zone))
 			continue;
 
 		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
 static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
-	unsigned long low_pfn, end_pfn;
+	unsigned long block_start_pfn;
+	unsigned long block_end_pfn;
+	unsigned long low_pfn;
 	unsigned long isolate_start_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
@@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	 * initialized by compact_zone()
 	 */
 	low_pfn = cc->migrate_pfn;
+	block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+	if (block_start_pfn < zone->zone_start_pfn)
+		block_start_pfn = zone->zone_start_pfn;
 
 	/* Only scan within a pageblock boundary */
-	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+	block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
 
 	/*
 	 * Iterate over whole pageblocks until we find the first suitable.
 	 * Do not cross the free scanner.
 	 */
-	for (; end_pfn <= cc->free_pfn;
-			low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+	for (; block_end_pfn <= cc->free_pfn;
+			low_pfn = block_end_pfn,
+			block_start_pfn = block_end_pfn,
+			block_end_pfn += pageblock_nr_pages) {
 
 		/*
 		 * This can potentially iterate a massively long zone with
@@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 						&& compact_should_abort(cc))
 			break;
 
-		page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+									zone);
 		if (!page)
 			continue;
 
@@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
 		/* Perform the isolation */
 		isolate_start_pfn = low_pfn;
-		low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
-							isolate_mode);
+		low_pfn = isolate_migratepages_block(cc, low_pfn,
+						block_end_pfn, isolate_mode);
 
 		if (!low_pfn || cc->contended) {
 			acct_isolated(zone, cc);
@@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 */
 	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
-	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
-		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+		cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
 		zone->compact_cached_free_pfn = cc->free_pfn;
 	}
-	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
 		cc->migrate_pfn = start_pfn;
 		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
 		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5d6481..df7247b0b532 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,75 +9,38 @@
 #include <linux/mm.h>
 #include <linux/trace_events.h>
 #include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
-	{1UL << PG_locked,		"locked"	},
-	{1UL << PG_error,		"error"		},
-	{1UL << PG_referenced,		"referenced"	},
-	{1UL << PG_uptodate,		"uptodate"	},
-	{1UL << PG_dirty,		"dirty"		},
-	{1UL << PG_lru,			"lru"		},
-	{1UL << PG_active,		"active"	},
-	{1UL << PG_slab,		"slab"		},
-	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
-	{1UL << PG_arch_1,		"arch_1"	},
-	{1UL << PG_reserved,		"reserved"	},
-	{1UL << PG_private,		"private"	},
-	{1UL << PG_private_2,		"private_2"	},
-	{1UL << PG_writeback,		"writeback"	},
-	{1UL << PG_head,		"head"		},
-	{1UL << PG_swapcache,		"swapcache"	},
-	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
-	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_swapbacked,		"swapbacked"	},
-	{1UL << PG_unevictable,		"unevictable"	},
-#ifdef CONFIG_MMU
-	{1UL << PG_mlocked,		"mlocked"	},
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-	{1UL << PG_uncached,		"uncached"	},
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
-	{1UL << PG_hwpoison,		"hwpoison"	},
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-	{1UL << PG_young,		"young"		},
-	{1UL << PG_idle,		"idle"		},
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+	"compaction",
+	"memory_failure",
+	"memory_hotplug",
+	"syscall_or_cpuset",
+	"mempolicy_mbind",
+	"numa_misplaced",
+	"cma",
 };
 
-static void dump_flags(unsigned long flags,
-			const struct trace_print_flags *names, int count)
-{
-	const char *delim = "";
-	unsigned long mask;
-	int i;
-
-	pr_emerg("flags: %#lx(", flags);
-
-	/* remove zone id */
-	flags &= (1UL << NR_PAGEFLAGS) - 1;
-
-	for (i = 0; i < count && flags; i++) {
-
-		mask = names[i].mask;
-		if ((flags & mask) != mask)
-			continue;
-
-		flags &= ~mask;
-		pr_cont("%s%s", delim, names[i].name);
-		delim = "|";
-	}
+const struct trace_print_flags pageflag_names[] = {
+	__def_pageflag_names,
+	{0, NULL}
+};
 
-	/* check for left over flags */
-	if (flags)
-		pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+	__def_gfpflag_names,
+	{0, NULL}
+};
 
-	pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+	__def_vmaflag_names,
+	{0, NULL}
+};
 
-void dump_page_badflags(struct page *page, const char *reason,
-		unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
 {
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
 		page, atomic_read(&page->_count), page_mapcount(page),
@@ -85,15 +48,13 @@ void dump_page_badflags(struct page *page, const char *reason,
 	if (PageCompound(page))
 		pr_cont(" compound_mapcount: %d", compound_mapcount(page));
 	pr_cont("\n");
-	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-	dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+	pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
 	if (reason)
 		pr_alert("page dumped because: %s\n", reason);
-	if (page->flags & badflags) {
-		pr_alert("bad because of flags:\n");
-		dump_flags(page->flags & badflags,
-			pageflag_names, ARRAY_SIZE(pageflag_names));
-	}
+
 #ifdef CONFIG_MEMCG
 	if (page->mem_cgroup)
 		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
 
 void dump_page(struct page *page, const char *reason)
 {
-	dump_page_badflags(page, reason, 0);
+	__dump_page(page, reason);
+	dump_page_owner(page);
 }
 EXPORT_SYMBOL(dump_page);
 
 #ifdef CONFIG_DEBUG_VM
 
-static const struct trace_print_flags vmaflags_names[] = {
-	{VM_READ,			"read"		},
-	{VM_WRITE,			"write"		},
-	{VM_EXEC,			"exec"		},
-	{VM_SHARED,			"shared"	},
-	{VM_MAYREAD,			"mayread"	},
-	{VM_MAYWRITE,			"maywrite"	},
-	{VM_MAYEXEC,			"mayexec"	},
-	{VM_MAYSHARE,			"mayshare"	},
-	{VM_GROWSDOWN,			"growsdown"	},
-	{VM_PFNMAP,			"pfnmap"	},
-	{VM_DENYWRITE,			"denywrite"	},
-	{VM_LOCKONFAULT,		"lockonfault"	},
-	{VM_LOCKED,			"locked"	},
-	{VM_IO,				"io"		},
-	{VM_SEQ_READ,			"seqread"	},
-	{VM_RAND_READ,			"randread"	},
-	{VM_DONTCOPY,			"dontcopy"	},
-	{VM_DONTEXPAND,			"dontexpand"	},
-	{VM_ACCOUNT,			"account"	},
-	{VM_NORESERVE,			"noreserve"	},
-	{VM_HUGETLB,			"hugetlb"	},
-#if defined(CONFIG_X86)
-	{VM_PAT,			"pat"		},
-#elif defined(CONFIG_PPC)
-	{VM_SAO,			"sao"		},
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
-	{VM_GROWSUP,			"growsup"	},
-#elif !defined(CONFIG_MMU)
-	{VM_MAPPED_COPY,		"mappedcopy"	},
-#else
-	{VM_ARCH_1,			"arch_1"	},
-#endif
-	{VM_DONTDUMP,			"dontdump"	},
-#ifdef CONFIG_MEM_SOFT_DIRTY
-	{VM_SOFTDIRTY,			"softdirty"	},
-#endif
-	{VM_MIXEDMAP,			"mixedmap"	},
-	{VM_HUGEPAGE,			"hugepage"	},
-	{VM_NOHUGEPAGE,			"nohugepage"	},
-	{VM_MERGEABLE,			"mergeable"	},
-};
-
 void dump_vma(const struct vm_area_struct *vma)
 {
 	pr_emerg("vma %p start %p end %p\n"
 		"next %p prev %p mm %p\n"
 		"prot %lx anon_vma %p vm_ops %p\n"
-		"pgoff %lx file %p private_data %p\n",
+		"pgoff %lx file %p private_data %p\n"
+		"flags: %#lx(%pGv)\n",
 		vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
 		vma->vm_prev, vma->vm_mm,
 		(unsigned long)pgprot_val(vma->vm_page_prot),
 		vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
-		vma->vm_file, vma->vm_private_data);
-	dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+		vma->vm_file, vma->vm_private_data,
+		vma->vm_flags, &vma->vm_flags);
 }
 EXPORT_SYMBOL(dump_vma);
 
@@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 		"tlb_flush_pending %d\n"
 #endif
-		"%s", /* This is here to hold the comma */
+		"def_flags: %#lx(%pGv)\n",
 
 		mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
 #ifdef CONFIG_MMU
@@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 		mm->tlb_flush_pending,
 #endif
-		"" /* This is here to not have a comma! */
+		mm->def_flags, &mm->def_flags
 	);
-
-	dump_flags(mm->def_flags, vmaflags_names,
-			ARRAY_SIZE(vmaflags_names));
 }
 
 #endif /* CONFIG_DEBUG_VM */
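
As a usage note, a hedged sketch (not part of this patch) of how a caller might print flags with the new %pGp/%pGv printk specifiers that replace the old hand-rolled dump_flags() helper:

static void example_dump_flags(struct page *page, struct vm_area_struct *vma)
{
	/* decode page and vma flags symbolically via the exported name tables */
	pr_alert("page flags: %#lx(%pGp)\n", page->flags, &page->flags);
	pr_alert("vma flags:  %#lx(%pGv)\n", vma->vm_flags, &vma->vm_flags);
}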
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5826..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
 
 static struct {
 	struct fault_attr attr;
@@ -11,18 +13,22 @@ static struct {
 	.cache_filter = false,
 };
 
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
 {
+	/* No fault-injection for bootstrap cache */
+	if (unlikely(s == kmem_cache))
+		return false;
+
 	if (gfpflags & __GFP_NOFAIL)
 		return false;
 
 	if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
 		return false;
 
-	if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+	if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
 		return false;
 
-	return should_fail(&failslab.attr, size);
+	return should_fail(&failslab.attr, s->object_size);
 }
 
 static int __init setup_failslab(char *str)
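
A hedged example (not from this patch) of the cache_filter side of the check above: when failslab's cache_filter is enabled, only caches explicitly created with SLAB_FAILSLAB are eligible for injected allocation failures.

static struct kmem_cache *demo_cache;	/* illustrative name */

static int __init demo_failslab_init(void)
{
	/* opt this cache in to fault injection via SLAB_FAILSLAB */
	demo_cache = kmem_cache_create("demo_cache", 128, 0, SLAB_FAILSLAB, NULL);
	return demo_cache ? 0 : -ENOMEM;
}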
diff --git a/mm/filemap.c b/mm/filemap.c
index da7a35d83de7..61b441b191ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
- *    ->memcg->move_lock	(page_remove_rmap->mem_cgroup_begin_page_stat)
+ *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
-void __delete_from_page_cache(struct page *page, void *shadow,
-			      struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping, memcg,
-				     inode_to_wb(mapping->host));
+		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
 }
 
 /**
@@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-	struct mem_cgroup *memcg;
 	unsigned long flags;
 
 	void (*freepage)(struct page *);
@@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page)
 
 	freepage = mapping->a_ops->freepage;
 
-	memcg = mem_cgroup_begin_page_stat(page);
 	spin_lock_irqsave(&mapping->tree_lock, flags);
-	__delete_from_page_cache(page, NULL, memcg);
+	__delete_from_page_cache(page, NULL);
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	mem_cgroup_end_page_stat(memcg);
 
 	if (freepage)
 		freepage(page);
@@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (!error) {
 		struct address_space *mapping = old->mapping;
 		void (*freepage)(struct page *);
-		struct mem_cgroup *memcg;
 		unsigned long flags;
 
 		pgoff_t offset = old->index;
@@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->mapping = mapping;
 		new->index = offset;
 
-		memcg = mem_cgroup_begin_page_stat(old);
 		spin_lock_irqsave(&mapping->tree_lock, flags);
-		__delete_from_page_cache(old, NULL, memcg);
+		__delete_from_page_cache(old, NULL);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
@@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		mem_cgroup_end_page_stat(memcg);
-		mem_cgroup_replace_page(old, new);
+		mem_cgroup_migrate(old, new);
 		radix_tree_preload_end();
 		if (freepage)
 			freepage(old);
@@ -1668,6 +1659,15 @@ find_page:
 					index, last_index - index);
 		}
 		if (!PageUptodate(page)) {
+			/*
+			 * See comment in do_read_cache_page on why
+			 * wait_on_page_locked is used to avoid unnecessarily
+			 * serialisations and why it's safe.
+			 */
+			wait_on_page_locked_killable(page);
+			if (PageUptodate(page))
+				goto page_ok;
+
 			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
 					!mapping->a_ops->is_partially_uptodate)
 				goto page_not_up_to_date;
@@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page)
 	return page;
 }
 
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
 				pgoff_t index,
 				int (*filler)(void *, struct page *),
 				void *data,
@@ -2325,53 +2325,74 @@ repeat:
 		/* Presumably ENOMEM for radix tree node */
 		return ERR_PTR(err);
 	}
+
+filler:
 		err = filler(data, page);
 		if (err < 0) {
 			page_cache_release(page);
-			page = ERR_PTR(err);
-		} else {
-			page = wait_on_page_read(page);
+			return ERR_PTR(err);
 		}
-	}
-	return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
-				pgoff_t index,
-				int (*filler)(void *, struct page *),
-				void *data,
-				gfp_t gfp)
 
-{
-	struct page *page;
-	int err;
+		page = wait_on_page_read(page);
+		if (IS_ERR(page))
+			return page;
+		goto out;
+	}
+	if (PageUptodate(page))
+		goto out;
 
-retry:
-	page = __read_cache_page(mapping, index, filler, data, gfp);
-	if (IS_ERR(page))
-		return page;
+	/*
+	 * Page is not up to date and may be locked due one of the following
+	 * case a: Page is being filled and the page lock is held
+	 * case b: Read/write error clearing the page uptodate status
+	 * case c: Truncation in progress (page locked)
+	 * case d: Reclaim in progress
+	 *
+	 * Case a, the page will be up to date when the page is unlocked.
+	 *    There is no need to serialise on the page lock here as the page
+	 *    is pinned so the lock gives no additional protection. Even if the
+	 *    the page is truncated, the data is still valid if PageUptodate as
+	 *    it's a race vs truncate race.
+	 * Case b, the page will not be up to date
+	 * Case c, the page may be truncated but in itself, the data may still
+	 *    be valid after IO completes as it's a read vs truncate race. The
+	 *    operation must restart if the page is not uptodate on unlock but
+	 *    otherwise serialising on page lock to stabilise the mapping gives
+	 *    no additional guarantees to the caller as the page lock is
+	 *    released before return.
+	 * Case d, similar to truncation. If reclaim holds the page lock, it
+	 *    will be a race with remove_mapping that determines if the mapping
+	 *    is valid on unlock but otherwise the data is valid and there is
+	 *    no need to serialise with page lock.
+	 *
+	 * As the page lock gives no additional guarantee, we optimistically
+	 * wait on the page to be unlocked and check if it's up to date and
+	 * use the page if it is. Otherwise, the page lock is required to
+	 * distinguish between the different cases. The motivation is that we
+	 * avoid spurious serialisations and wakeups when multiple processes
+	 * wait on the same page for IO to complete.
+	 */
+	wait_on_page_locked(page);
 	if (PageUptodate(page))
 		goto out;
 
+	/* Distinguish between all the cases under the safety of the lock */
 	lock_page(page);
+
+	/* Case c or d, restart the operation */
 	if (!page->mapping) {
 		unlock_page(page);
 		page_cache_release(page);
-		goto retry;
+		goto repeat;
 	}
+
+	/* Someone else locked and filled the page in a very small window */
 	if (PageUptodate(page)) {
 		unlock_page(page);
 		goto out;
 	}
-	err = filler(data, page);
-	if (err < 0) {
-		page_cache_release(page);
-		return ERR_PTR(err);
-	} else {
-		page = wait_on_page_read(page);
-		if (IS_ERR(page))
-			return page;
-	}
+	goto filler;
+
 out:
 	mark_page_accessed(page);
 	return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e10a4fee88d2..1ea21e203a70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
 	}
 }
 
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
 		struct lruvec *lruvec, struct list_head *list)
 {
-	int mapcount;
 	struct page *page_tail = head + tail;
 
-	mapcount = atomic_read(&page_tail->_mapcount) + 1;
+	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
 	VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
 
 	/*
 	 * tail_page->_count is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
-	 * tail_page. If we used atomic_set() below instead of atomic_add(), we
+	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
 	 * would then run atomic_set() concurrently with
 	 * get_page_unless_zero(), and atomic_set() is implemented in C not
 	 * using locked ops. spin_unlock on x86 sometime uses locked ops
 	 * because of PPro errata 66, 92, so unless somebody can guarantee
 	 * atomic_set() here would be safe on all archs (and not only on x86),
-	 * it's safer to use atomic_add().
+	 * it's safer to use atomic_inc().
 	 */
-	atomic_add(mapcount + 1, &page_tail->_count);
-
+	atomic_inc(&page_tail->_count);
 
 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	page_tail->flags |= (head->flags &
@@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
 	page_tail->index = head->index + tail;
 	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
 	lru_add_page_tail(head, page_tail, lruvec, list);
-
-	return mapcount;
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
-	int i, tail_mapcount;
+	int i;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
@@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list)
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	tail_mapcount = 0;
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
-		tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
-	atomic_sub(tail_mapcount, &head->_count);
+		__split_huge_page_tail(head, i, lruvec, list);
 
 	ClearPageCompound(head);
 	spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebddb4..ad9400d759c8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
 
 /*
  * The set of flags that only affect watermark checking and reclaim
@@ -131,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
 	return page_idx ^ (1 << order);
 }
 
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone)
+{
+	if (zone->contiguous)
+		return pfn_to_page(start_pfn);
+
+	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
 extern int __isolate_free_page(struct page *page, unsigned int order);
 extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
 					unsigned int order);
@@ -466,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void)
 }
 
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..6f4f424037c0 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
 void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
 			  size_t size)
 {
+	if (unlikely(!object)) /* Skip object if allocation failed */
+		return;
+
 	/*
 	 * Has already been memset(), which initializes the shadow for us
 	 * as well.
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d2e1..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 		}
 		pr_info("Injecting memory failure for page %#lx at %#lx\n",
 		       page_to_pfn(p), start);
-		/* Ignore return value for now */
-		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior)
  *  some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_FREE - the application marks pages in the given range as lazy free,
+ *		where actual purges are postponed until memory pressure happens.
  *  MADV_REMOVE - the application wants to free up the given range of
  *		pages and associated backing store.
  *  MADV_DONTFORK - omit this area from child's address space when forking:
  *		typically, to avoid COWing pages pinned by get_user_pages().
  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ *  MADV_HWPOISON - trigger memory error handler as if the given memory range
+ *		were corrupted by unrecoverable hardware memory failure.
+ *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
  *		this area with pages of identical content from other such areas.
  *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ *  MADV_HUGEPAGE - the application wants to back the given range by transparent
+ *		huge pages in the future. Existing pages might be coalesced and
+ *		new pages might be allocated as THP.
+ *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ *		transparent huge pages so the existing pages will not be
+ *		coalesced into THP and new pages will not be allocated as THP.
+ *  MADV_DONTDUMP - the application wants to prevent pages in the given range
+ *		from being included in its core dump.
+ *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
  *
  * return values:
  *  zero    - success
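
A hedged userspace sketch of MADV_FREE as documented in the comment above: the kernel may reclaim the range lazily under memory pressure, and the contents survive until reclaim actually happens or the pages are written again. The wrapper name is illustrative.

#include <sys/mman.h>
#include <stddef.h>

static void drop_scratch_buffer(void *buf, size_t len)
{
	/* lazy-free hint; not an immediate unmap or zeroing of the range */
	madvise(buf, len, MADV_FREE);
}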
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929f13..fc7824fa1b42 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
 				int nid,
 				unsigned long flags)
 {
-	struct memblock_type *type = &memblock.memory;
-
 	memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
 		     (unsigned long long)base + size - 1,
 		     flags, (void *)_RET_IP_);
 
-	return memblock_add_range(type, base, size, nid, flags);
+	return memblock_add_range(&memblock.memory, base, size, nid, flags);
 }
 
 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
 				int nid,
 				unsigned long flags)
 {
-	struct memblock_type *type = &memblock.reserved;
-
 	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
 		     (unsigned long long)base + size - 1,
 		     flags, (void *)_RET_IP_);
 
-	return memblock_add_range(type, base, size, nid, flags);
+	return memblock_add_range(&memblock.reserved, base, size, nid, flags);
 }
 
 int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..42882c1e7fce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 	return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX	USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-	return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-	struct cgroup_subsys_state *css;
-
-	css = css_from_id(id, &memory_cgrp_subsys);
-	return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -1709,19 +1684,13 @@ cleanup:
 }
 
 /**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
  *
- *   memcg = mem_cgroup_begin_page_stat(page);
- *   if (TestClearPageState(page))
- *     mem_cgroup_update_page_stat(memcg, state, -1);
- *   mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
  */
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
 {
 	struct mem_cgroup *memcg;
 	unsigned long flags;
@@ -1730,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
 	 * The RCU lock is held throughout the transaction. The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
-	 *
-	 * The RCU lock also protects the memcg from being freed when
-	 * the page state that is going to change is the only thing
-	 * preventing the page from being uncharged.
-	 * E.g. end-writeback clearing PageWriteback(), which allows
-	 * migration to go ahead and uncharge the page before the
-	 * account transaction might be complete.
 	 */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
-		return NULL;
+		return;
 again:
 	memcg = page->mem_cgroup;
 	if (unlikely(!memcg))
-		return NULL;
+		return;
 
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return memcg;
+		return;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
 	if (memcg != page->mem_cgroup) {
@@ -1759,21 +1721,23 @@ again:
 	/*
 	 * When charge migration first begins, we can have locked and
 	 * unlocked page stat updates happening concurrently. Track
-	 * the task who has the lock for mem_cgroup_end_page_stat().
+	 * the task who has the lock for unlock_page_memcg().
 	 */
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
 
-	return memcg;
+	return;
 }
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
  */
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
 {
+	struct mem_cgroup *memcg = page->mem_cgroup;
+
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
 
@@ -1785,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -4488,7 +4452,7 @@ static int mem_cgroup_move_account(struct page *page,
 	VM_BUG_ON(compound && !PageTransHuge(page));
 
 	/*
-	 * Prevent mem_cgroup_replace_page() from looking at
+	 * Prevent mem_cgroup_migrate() from looking at
 	 * page->mem_cgroup of its source page while we change it.
 	 */
 	ret = -EBUSY;
@@ -4923,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 
 	lru_add_drain_all();
 	/*
-	 * Signal mem_cgroup_begin_page_stat() to take the memcg's
-	 * move_lock while we're moving its pages to another memcg.
-	 * Then wait for already started RCU-only updates to finish.
+	 * Signal lock_page_memcg() to take the memcg's move_lock
+	 * while we're moving its pages to another memcg. Then wait
+	 * for already started RCU-only updates to finish.
 	 */
 	atomic_inc(&mc.from->moving_account);
 	synchronize_rcu();
@@ -5517,16 +5481,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
  *
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
  *
  * Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
  */
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 {
 	struct mem_cgroup *memcg;
 	unsigned int nr_pages;
@@ -5559,7 +5523,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
 		page_counter_charge(&memcg->memsw, nr_pages);
 	css_get_many(&memcg->css, nr_pages);
 
-	commit_charge(newpage, memcg, true);
+	commit_charge(newpage, memcg, false);
 
 	local_irq_disable();
 	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
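
A hedged sketch (not from this patch) of the new API shape: a page-state update is bracketed by lock_page_memcg()/unlock_page_memcg() instead of carrying a struct mem_cgroup pointer around, mirroring the filemap.c changes earlier in this diff. The helper name is illustrative and writeback-lock handling is elided.

static void example_clear_dirty(struct page *page, struct address_space *mapping)
{
	lock_page_memcg(page);		/* stabilize page->mem_cgroup */
	if (TestClearPageDirty(page))
		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
	unlock_page_memcg(page);
}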
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ac595e7a3a95..67c30eb993f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -826,8 +826,6 @@ static struct page_state {
826#undef lru 826#undef lru
827#undef swapbacked 827#undef swapbacked
828#undef head 828#undef head
829#undef tail
830#undef compound
831#undef slab 829#undef slab
832#undef reserved 830#undef reserved
833 831
diff --git a/mm/memory.c b/mm/memory.c
index 906d8e3b42c0..0e247642ed5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1897,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1897 unsigned long end = addr + size; 1897 unsigned long end = addr + size;
1898 int err; 1898 int err;
1899 1899
1900 BUG_ON(addr >= end); 1900 if (WARN_ON(addr >= end))
1901 return -EINVAL;
1902
1901 pgd = pgd_offset(mm, addr); 1903 pgd = pgd_offset(mm, addr);
1902 do { 1904 do {
1903 next = pgd_addr_end(addr, end); 1905 next = pgd_addr_end(addr, end);
@@ -3143,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 unsigned long address, pte_t *page_table, pmd_t *pmd, 3145 unsigned long address, pte_t *page_table, pmd_t *pmd,
3144 unsigned int flags, pte_t orig_pte) 3146 unsigned int flags, pte_t orig_pte)
3145{ 3147{
3146 pgoff_t pgoff = (((address & PAGE_MASK) 3148 pgoff_t pgoff = linear_page_index(vma, address);
3147 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3148 3149
3149 pte_unmap(page_table); 3150 pte_unmap(page_table);
3150 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3151 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
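
In mm/memory.c, do_fault() now derives the file offset with linear_page_index() instead of open-coding the arithmetic, and apply_to_page_range() returns -EINVAL on an empty range rather than hitting a BUG_ON. The standalone userspace model below only illustrates that offset calculation (it assumes 4 KiB pages, PAGE_SHIFT == 12); it is not kernel code from this series.

/* Userspace model of the pgoff arithmetic replaced by linear_page_index(). */
#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

static unsigned long linear_page_index_model(unsigned long vm_start,
					      unsigned long vm_pgoff,
					      unsigned long address)
{
	return (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	/* mapping starts at 0x40000000, backed by the file 16 pages in */
	unsigned long pgoff = linear_page_index_model(0x40000000UL, 16,
						      0x40003abcUL);
	printf("pgoff = %lu\n", pgoff);	/* 3 pages into the VMA -> 19 */
	return 0;
}
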
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 979b18cbd343..24ea06393816 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -77,6 +77,9 @@ static struct {
77#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 77#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
78#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 78#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
79 79
80bool memhp_auto_online;
81EXPORT_SYMBOL_GPL(memhp_auto_online);
82
80void get_online_mems(void) 83void get_online_mems(void)
81{ 84{
82 might_sleep(); 85 might_sleep();
@@ -509,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
509 int start_sec, end_sec; 512 int start_sec, end_sec;
510 struct vmem_altmap *altmap; 513 struct vmem_altmap *altmap;
511 514
515 clear_zone_contiguous(zone);
516
512 /* during initialize mem_map, align hot-added range to section */ 517 /* during initialize mem_map, align hot-added range to section */
513 start_sec = pfn_to_section_nr(phys_start_pfn); 518 start_sec = pfn_to_section_nr(phys_start_pfn);
514 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 519 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -521,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
521 if (altmap->base_pfn != phys_start_pfn 526 if (altmap->base_pfn != phys_start_pfn
522 || vmem_altmap_offset(altmap) > nr_pages) { 527 || vmem_altmap_offset(altmap) > nr_pages) {
523 pr_warn_once("memory add fail, invalid altmap\n"); 528 pr_warn_once("memory add fail, invalid altmap\n");
524 return -EINVAL; 529 err = -EINVAL;
530 goto out;
525 } 531 }
526 altmap->alloc = 0; 532 altmap->alloc = 0;
527 } 533 }
@@ -539,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
539 err = 0; 545 err = 0;
540 } 546 }
541 vmemmap_populate_print_last(); 547 vmemmap_populate_print_last();
542 548out:
549 set_zone_contiguous(zone);
543 return err; 550 return err;
544} 551}
545EXPORT_SYMBOL_GPL(__add_pages); 552EXPORT_SYMBOL_GPL(__add_pages);
@@ -811,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
811 } 818 }
812 } 819 }
813 820
821 clear_zone_contiguous(zone);
822
814 /* 823 /*
815 * We can only remove entire sections 824 * We can only remove entire sections
816 */ 825 */
@@ -826,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
826 if (ret) 835 if (ret)
827 break; 836 break;
828 } 837 }
838
839 set_zone_contiguous(zone);
840
829 return ret; 841 return ret;
830} 842}
831EXPORT_SYMBOL_GPL(__remove_pages); 843EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1261,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
1261 return zone_default; 1273 return zone_default;
1262} 1274}
1263 1275
1276static int online_memory_block(struct memory_block *mem, void *arg)
1277{
1278 return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
1279}
1280
1264/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1281/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1265int __ref add_memory_resource(int nid, struct resource *res) 1282int __ref add_memory_resource(int nid, struct resource *res, bool online)
1266{ 1283{
1267 u64 start, size; 1284 u64 start, size;
1268 pg_data_t *pgdat = NULL; 1285 pg_data_t *pgdat = NULL;
@@ -1322,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
1322 /* create new memmap entry */ 1339 /* create new memmap entry */
1323 firmware_map_add_hotplug(start, start + size, "System RAM"); 1340 firmware_map_add_hotplug(start, start + size, "System RAM");
1324 1341
1342 /* online pages if requested */
1343 if (online)
1344 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1345 NULL, online_memory_block);
1346
1325 goto out; 1347 goto out;
1326 1348
1327error: 1349error:
@@ -1345,7 +1367,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
1345 if (IS_ERR(res)) 1367 if (IS_ERR(res))
1346 return PTR_ERR(res); 1368 return PTR_ERR(res);
1347 1369
1348 ret = add_memory_resource(nid, res); 1370 ret = add_memory_resource(nid, res, memhp_auto_online);
1349 if (ret < 0) 1371 if (ret < 0)
1350 release_memory_resource(res); 1372 release_memory_resource(res);
1351 return ret; 1373 return ret;
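
The memory_hotplug.c changes add a memhp_auto_online toggle and a third argument to add_memory_resource(), so hot-added blocks can be onlined as part of the add itself, while __add_pages()/__remove_pages() clear and re-establish the zone's contiguous flag around the operation. A kernel-style sketch of a caller using the new argument follows; probe_example() is a hypothetical hotplug path, not code from this series.

/* Sketch only: probe_example() is hypothetical; the bool argument of
 * add_memory_resource() is the one introduced in the hunks above. */
#include <linux/memory_hotplug.h>
#include <linux/ioport.h>
#include <linux/printk.h>

static int probe_example(int nid, struct resource *res)
{
	/* true: online the new memory blocks immediately, as with
	 * memhp_auto_online; false keeps the old deferred behaviour. */
	int ret = add_memory_resource(nid, res, true);

	if (ret < 0)
		pr_err("hot-add of %pR failed: %d\n", res, ret);
	return ret;
}
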
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9a3f6b90e628..8cbc74387df3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
643 643
644 if (flags & MPOL_MF_LAZY) { 644 if (flags & MPOL_MF_LAZY) {
645 /* Similar to task_numa_work, skip inaccessible VMAs */ 645 /* Similar to task_numa_work, skip inaccessible VMAs */
646 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 646 if (!is_vm_hugetlb_page(vma) &&
647 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
648 !(vma->vm_flags & VM_MIXEDMAP))
647 change_prot_numa(vma, start, endvma); 649 change_prot_numa(vma, start, endvma);
648 return 1; 650 return 1;
649 } 651 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 3ad0fea5c438..568284ec75d4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
38#include <linux/balloon_compaction.h> 38#include <linux/balloon_compaction.h>
39#include <linux/mmu_notifier.h> 39#include <linux/mmu_notifier.h>
40#include <linux/page_idle.h> 40#include <linux/page_idle.h>
41#include <linux/page_owner.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43 44
@@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
325 return -EAGAIN; 326 return -EAGAIN;
326 327
327 /* No turning back from here */ 328 /* No turning back from here */
328 set_page_memcg(newpage, page_memcg(page));
329 newpage->index = page->index; 329 newpage->index = page->index;
330 newpage->mapping = page->mapping; 330 newpage->mapping = page->mapping;
331 if (PageSwapBacked(page)) 331 if (PageSwapBacked(page))
@@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
372 * Now we know that no one else is looking at the page: 372 * Now we know that no one else is looking at the page:
373 * no turning back from here. 373 * no turning back from here.
374 */ 374 */
375 set_page_memcg(newpage, page_memcg(page));
376 newpage->index = page->index; 375 newpage->index = page->index;
377 newpage->mapping = page->mapping; 376 newpage->mapping = page->mapping;
378 if (PageSwapBacked(page)) 377 if (PageSwapBacked(page))
@@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
457 return -EAGAIN; 456 return -EAGAIN;
458 } 457 }
459 458
460 set_page_memcg(newpage, page_memcg(page));
461 newpage->index = page->index; 459 newpage->index = page->index;
462 newpage->mapping = page->mapping; 460 newpage->mapping = page->mapping;
461
463 get_page(newpage); 462 get_page(newpage);
464 463
465 radix_tree_replace_slot(pslot, newpage); 464 radix_tree_replace_slot(pslot, newpage);
@@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
467 page_unfreeze_refs(page, expected_count - 1); 466 page_unfreeze_refs(page, expected_count - 1);
468 467
469 spin_unlock_irq(&mapping->tree_lock); 468 spin_unlock_irq(&mapping->tree_lock);
469
470 return MIGRATEPAGE_SUCCESS; 470 return MIGRATEPAGE_SUCCESS;
471} 471}
472 472
@@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
578 */ 578 */
579 if (PageWriteback(newpage)) 579 if (PageWriteback(newpage))
580 end_page_writeback(newpage); 580 end_page_writeback(newpage);
581
582 copy_page_owner(page, newpage);
583
584 mem_cgroup_migrate(page, newpage);
581} 585}
582 586
583/************************************************************ 587/************************************************************
@@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
772 * page is freed; but stats require that PageAnon be left as PageAnon. 776 * page is freed; but stats require that PageAnon be left as PageAnon.
773 */ 777 */
774 if (rc == MIGRATEPAGE_SUCCESS) { 778 if (rc == MIGRATEPAGE_SUCCESS) {
775 set_page_memcg(page, NULL);
776 if (!PageAnon(page)) 779 if (!PageAnon(page))
777 page->mapping = NULL; 780 page->mapping = NULL;
778 } 781 }
@@ -952,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
952 } 955 }
953 956
954 rc = __unmap_and_move(page, newpage, force, mode); 957 rc = __unmap_and_move(page, newpage, force, mode);
955 if (rc == MIGRATEPAGE_SUCCESS) 958 if (rc == MIGRATEPAGE_SUCCESS) {
956 put_new_page = NULL; 959 put_new_page = NULL;
960 set_page_owner_migrate_reason(newpage, reason);
961 }
957 962
958out: 963out:
959 if (rc != -EAGAIN) { 964 if (rc != -EAGAIN) {
@@ -1018,7 +1023,7 @@ out:
1018static int unmap_and_move_huge_page(new_page_t get_new_page, 1023static int unmap_and_move_huge_page(new_page_t get_new_page,
1019 free_page_t put_new_page, unsigned long private, 1024 free_page_t put_new_page, unsigned long private,
1020 struct page *hpage, int force, 1025 struct page *hpage, int force,
1021 enum migrate_mode mode) 1026 enum migrate_mode mode, int reason)
1022{ 1027{
1023 int rc = -EAGAIN; 1028 int rc = -EAGAIN;
1024 int *result = NULL; 1029 int *result = NULL;
@@ -1076,6 +1081,7 @@ put_anon:
1076 if (rc == MIGRATEPAGE_SUCCESS) { 1081 if (rc == MIGRATEPAGE_SUCCESS) {
1077 hugetlb_cgroup_migrate(hpage, new_hpage); 1082 hugetlb_cgroup_migrate(hpage, new_hpage);
1078 put_new_page = NULL; 1083 put_new_page = NULL;
1084 set_page_owner_migrate_reason(new_hpage, reason);
1079 } 1085 }
1080 1086
1081 unlock_page(hpage); 1087 unlock_page(hpage);
@@ -1148,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1148 if (PageHuge(page)) 1154 if (PageHuge(page))
1149 rc = unmap_and_move_huge_page(get_new_page, 1155 rc = unmap_and_move_huge_page(get_new_page,
1150 put_new_page, private, page, 1156 put_new_page, private, page,
1151 pass > 2, mode); 1157 pass > 2, mode, reason);
1152 else 1158 else
1153 rc = unmap_and_move(get_new_page, put_new_page, 1159 rc = unmap_and_move(get_new_page, put_new_page,
1154 private, page, pass > 2, mode, 1160 private, page, pass > 2, mode,
@@ -1836,9 +1842,8 @@ fail_putback:
1836 } 1842 }
1837 1843
1838 mlock_migrate_page(new_page, page); 1844 mlock_migrate_page(new_page, page);
1839 set_page_memcg(new_page, page_memcg(page));
1840 set_page_memcg(page, NULL);
1841 page_remove_rmap(page, true); 1845 page_remove_rmap(page, true);
1846 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
1842 1847
1843 spin_unlock(ptl); 1848 spin_unlock(ptl);
1844 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1849 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
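
With mem_cgroup_migrate() and copy_page_owner() folded into migrate_page_copy(), the scattered set_page_memcg() calls above disappear: any migration path that copies page state through migrate_page_copy() gets the memcg charge transfer and the page_owner record for free. A kernel-style sketch of a minimal ->migratepage() callback written against that behaviour follows; the function name is hypothetical and the migrate_page_move_mapping() argument list is assumed from this kernel tree.

/* Sketch only: example_migratepage() is hypothetical. */
#include <linux/migrate.h>
#include <linux/fs.h>

static int example_migratepage(struct address_space *mapping,
			       struct page *newpage, struct page *page,
			       enum migrate_mode mode)
{
	int rc;

	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	/* Copies flags and dirty state, and now also transfers the memcg
	 * charge and the page_owner information to newpage. */
	migrate_page_copy(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
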
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dc490c06941b..e97a05d9621f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
386static void dump_header(struct oom_control *oc, struct task_struct *p, 386static void dump_header(struct oom_control *oc, struct task_struct *p,
387 struct mem_cgroup *memcg) 387 struct mem_cgroup *memcg)
388{ 388{
389 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 389 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
390 "oom_score_adj=%hd\n", 390 "oom_score_adj=%hd\n",
391 current->comm, oc->gfp_mask, oc->order, 391 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
392 current->signal->oom_score_adj); 392 current->signal->oom_score_adj);
393
393 cpuset_print_current_mems_allowed(); 394 cpuset_print_current_mems_allowed();
394 dump_stack(); 395 dump_stack();
395 if (memcg) 396 if (memcg)
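
The oom_kill.c hunk switches dump_header() to pr_warn() and to the new %pGg printk extension, which decodes a gfp_t symbolically from a pointer to the mask, so the raw value and the decoded flag names print together. A one-line kernel-style illustration of that format follows; report_alloc() is a hypothetical helper.

/* Sketch only: report_alloc() is hypothetical; %pGg takes &gfp_mask. */
#include <linux/printk.h>
#include <linux/gfp.h>

static void report_alloc(gfp_t gfp_mask, unsigned int order)
{
	pr_warn("allocation attempt: order:%u, mode:%#x(%pGg)\n",
		order, gfp_mask, &gfp_mask);
}
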
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6fe7d15bd1f7..11ff8f758631 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1169 unsigned long balanced_dirty_ratelimit; 1169 unsigned long balanced_dirty_ratelimit;
1170 unsigned long step; 1170 unsigned long step;
1171 unsigned long x; 1171 unsigned long x;
1172 unsigned long shift;
1172 1173
1173 /* 1174 /*
1174 * The dirty rate will match the writeout rate in long term, except 1175 * The dirty rate will match the writeout rate in long term, except
@@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
1293 * rate itself is constantly fluctuating. So decrease the track speed 1294 * rate itself is constantly fluctuating. So decrease the track speed
1294 * when it gets close to the target. Helps eliminate pointless tremors. 1295 * when it gets close to the target. Helps eliminate pointless tremors.
1295 */ 1296 */
1296 step >>= dirty_ratelimit / (2 * step + 1); 1297 shift = dirty_ratelimit / (2 * step + 1);
1297 /* 1298 if (shift < BITS_PER_LONG)
1298 * Limit the tracking speed to avoid overshooting. 1299 step = DIV_ROUND_UP(step >> shift, 8);
1299 */ 1300 else
1300 step = (step + 7) / 8; 1301 step = 0;
1301 1302
1302 if (dirty_ratelimit < balanced_dirty_ratelimit) 1303 if (dirty_ratelimit < balanced_dirty_ratelimit)
1303 dirty_ratelimit += step; 1304 dirty_ratelimit += step;
@@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page)
2409/* 2410/*
2410 * Helper function for set_page_dirty family. 2411 * Helper function for set_page_dirty family.
2411 * 2412 *
2412 * Caller must hold mem_cgroup_begin_page_stat(). 2413 * Caller must hold lock_page_memcg().
2413 * 2414 *
2414 * NOTE: This relies on being atomic wrt interrupts. 2415 * NOTE: This relies on being atomic wrt interrupts.
2415 */ 2416 */
2416void account_page_dirtied(struct page *page, struct address_space *mapping, 2417void account_page_dirtied(struct page *page, struct address_space *mapping)
2417 struct mem_cgroup *memcg)
2418{ 2418{
2419 struct inode *inode = mapping->host; 2419 struct inode *inode = mapping->host;
2420 2420
@@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
2426 inode_attach_wb(inode, page); 2426 inode_attach_wb(inode, page);
2427 wb = inode_to_wb(inode); 2427 wb = inode_to_wb(inode);
2428 2428
2429 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2429 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2430 __inc_zone_page_state(page, NR_FILE_DIRTY); 2430 __inc_zone_page_state(page, NR_FILE_DIRTY);
2431 __inc_zone_page_state(page, NR_DIRTIED); 2431 __inc_zone_page_state(page, NR_DIRTIED);
2432 __inc_wb_stat(wb, WB_RECLAIMABLE); 2432 __inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied);
2441/* 2441/*
2442 * Helper function for deaccounting dirty page without writeback. 2442 * Helper function for deaccounting dirty page without writeback.
2443 * 2443 *
2444 * Caller must hold mem_cgroup_begin_page_stat(). 2444 * Caller must hold lock_page_memcg().
2445 */ 2445 */
2446void account_page_cleaned(struct page *page, struct address_space *mapping, 2446void account_page_cleaned(struct page *page, struct address_space *mapping,
2447 struct mem_cgroup *memcg, struct bdi_writeback *wb) 2447 struct bdi_writeback *wb)
2448{ 2448{
2449 if (mapping_cap_account_dirty(mapping)) { 2449 if (mapping_cap_account_dirty(mapping)) {
2450 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2450 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2451 dec_zone_page_state(page, NR_FILE_DIRTY); 2451 dec_zone_page_state(page, NR_FILE_DIRTY);
2452 dec_wb_stat(wb, WB_RECLAIMABLE); 2452 dec_wb_stat(wb, WB_RECLAIMABLE);
2453 task_io_account_cancelled_write(PAGE_CACHE_SIZE); 2453 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
2468 */ 2468 */
2469int __set_page_dirty_nobuffers(struct page *page) 2469int __set_page_dirty_nobuffers(struct page *page)
2470{ 2470{
2471 struct mem_cgroup *memcg; 2471 lock_page_memcg(page);
2472
2473 memcg = mem_cgroup_begin_page_stat(page);
2474 if (!TestSetPageDirty(page)) { 2472 if (!TestSetPageDirty(page)) {
2475 struct address_space *mapping = page_mapping(page); 2473 struct address_space *mapping = page_mapping(page);
2476 unsigned long flags; 2474 unsigned long flags;
2477 2475
2478 if (!mapping) { 2476 if (!mapping) {
2479 mem_cgroup_end_page_stat(memcg); 2477 unlock_page_memcg(page);
2480 return 1; 2478 return 1;
2481 } 2479 }
2482 2480
2483 spin_lock_irqsave(&mapping->tree_lock, flags); 2481 spin_lock_irqsave(&mapping->tree_lock, flags);
2484 BUG_ON(page_mapping(page) != mapping); 2482 BUG_ON(page_mapping(page) != mapping);
2485 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2483 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2486 account_page_dirtied(page, mapping, memcg); 2484 account_page_dirtied(page, mapping);
2487 radix_tree_tag_set(&mapping->page_tree, page_index(page), 2485 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2488 PAGECACHE_TAG_DIRTY); 2486 PAGECACHE_TAG_DIRTY);
2489 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2487 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2490 mem_cgroup_end_page_stat(memcg); 2488 unlock_page_memcg(page);
2491 2489
2492 if (mapping->host) { 2490 if (mapping->host) {
2493 /* !PageAnon && !swapper_space */ 2491 /* !PageAnon && !swapper_space */
@@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page)
2495 } 2493 }
2496 return 1; 2494 return 1;
2497 } 2495 }
2498 mem_cgroup_end_page_stat(memcg); 2496 unlock_page_memcg(page);
2499 return 0; 2497 return 0;
2500} 2498}
2501EXPORT_SYMBOL(__set_page_dirty_nobuffers); 2499EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page)
2625 if (mapping_cap_account_dirty(mapping)) { 2623 if (mapping_cap_account_dirty(mapping)) {
2626 struct inode *inode = mapping->host; 2624 struct inode *inode = mapping->host;
2627 struct bdi_writeback *wb; 2625 struct bdi_writeback *wb;
2628 struct mem_cgroup *memcg;
2629 bool locked; 2626 bool locked;
2630 2627
2631 memcg = mem_cgroup_begin_page_stat(page); 2628 lock_page_memcg(page);
2632 wb = unlocked_inode_to_wb_begin(inode, &locked); 2629 wb = unlocked_inode_to_wb_begin(inode, &locked);
2633 2630
2634 if (TestClearPageDirty(page)) 2631 if (TestClearPageDirty(page))
2635 account_page_cleaned(page, mapping, memcg, wb); 2632 account_page_cleaned(page, mapping, wb);
2636 2633
2637 unlocked_inode_to_wb_end(inode, locked); 2634 unlocked_inode_to_wb_end(inode, locked);
2638 mem_cgroup_end_page_stat(memcg); 2635 unlock_page_memcg(page);
2639 } else { 2636 } else {
2640 ClearPageDirty(page); 2637 ClearPageDirty(page);
2641 } 2638 }
@@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page)
2666 if (mapping && mapping_cap_account_dirty(mapping)) { 2663 if (mapping && mapping_cap_account_dirty(mapping)) {
2667 struct inode *inode = mapping->host; 2664 struct inode *inode = mapping->host;
2668 struct bdi_writeback *wb; 2665 struct bdi_writeback *wb;
2669 struct mem_cgroup *memcg;
2670 bool locked; 2666 bool locked;
2671 2667
2672 /* 2668 /*
@@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page)
2704 * always locked coming in here, so we get the desired 2700 * always locked coming in here, so we get the desired
2705 * exclusion. 2701 * exclusion.
2706 */ 2702 */
2707 memcg = mem_cgroup_begin_page_stat(page);
2708 wb = unlocked_inode_to_wb_begin(inode, &locked); 2703 wb = unlocked_inode_to_wb_begin(inode, &locked);
2709 if (TestClearPageDirty(page)) { 2704 if (TestClearPageDirty(page)) {
2710 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); 2705 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2711 dec_zone_page_state(page, NR_FILE_DIRTY); 2706 dec_zone_page_state(page, NR_FILE_DIRTY);
2712 dec_wb_stat(wb, WB_RECLAIMABLE); 2707 dec_wb_stat(wb, WB_RECLAIMABLE);
2713 ret = 1; 2708 ret = 1;
2714 } 2709 }
2715 unlocked_inode_to_wb_end(inode, locked); 2710 unlocked_inode_to_wb_end(inode, locked);
2716 mem_cgroup_end_page_stat(memcg);
2717 return ret; 2711 return ret;
2718 } 2712 }
2719 return TestClearPageDirty(page); 2713 return TestClearPageDirty(page);
@@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
2723int test_clear_page_writeback(struct page *page) 2717int test_clear_page_writeback(struct page *page)
2724{ 2718{
2725 struct address_space *mapping = page_mapping(page); 2719 struct address_space *mapping = page_mapping(page);
2726 struct mem_cgroup *memcg;
2727 int ret; 2720 int ret;
2728 2721
2729 memcg = mem_cgroup_begin_page_stat(page); 2722 lock_page_memcg(page);
2730 if (mapping) { 2723 if (mapping) {
2731 struct inode *inode = mapping->host; 2724 struct inode *inode = mapping->host;
2732 struct backing_dev_info *bdi = inode_to_bdi(inode); 2725 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page)
2750 ret = TestClearPageWriteback(page); 2743 ret = TestClearPageWriteback(page);
2751 } 2744 }
2752 if (ret) { 2745 if (ret) {
2753 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2746 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2754 dec_zone_page_state(page, NR_WRITEBACK); 2747 dec_zone_page_state(page, NR_WRITEBACK);
2755 inc_zone_page_state(page, NR_WRITTEN); 2748 inc_zone_page_state(page, NR_WRITTEN);
2756 } 2749 }
2757 mem_cgroup_end_page_stat(memcg); 2750 unlock_page_memcg(page);
2758 return ret; 2751 return ret;
2759} 2752}
2760 2753
2761int __test_set_page_writeback(struct page *page, bool keep_write) 2754int __test_set_page_writeback(struct page *page, bool keep_write)
2762{ 2755{
2763 struct address_space *mapping = page_mapping(page); 2756 struct address_space *mapping = page_mapping(page);
2764 struct mem_cgroup *memcg;
2765 int ret; 2757 int ret;
2766 2758
2767 memcg = mem_cgroup_begin_page_stat(page); 2759 lock_page_memcg(page);
2768 if (mapping) { 2760 if (mapping) {
2769 struct inode *inode = mapping->host; 2761 struct inode *inode = mapping->host;
2770 struct backing_dev_info *bdi = inode_to_bdi(inode); 2762 struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2792 ret = TestSetPageWriteback(page); 2784 ret = TestSetPageWriteback(page);
2793 } 2785 }
2794 if (!ret) { 2786 if (!ret) {
2795 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2787 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2796 inc_zone_page_state(page, NR_WRITEBACK); 2788 inc_zone_page_state(page, NR_WRITEBACK);
2797 } 2789 }
2798 mem_cgroup_end_page_stat(memcg); 2790 unlock_page_memcg(page);
2799 return ret; 2791 return ret;
2800 2792
2801} 2793}
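
The wb_update_dirty_ratelimit() change replaces a right shift whose count could reach or exceed BITS_PER_LONG (undefined behaviour in C) with an explicit shift variable that is range-checked before use, folding the old "limit the tracking speed" rounding into the same branch. The standalone userspace model below illustrates that guard with made-up values; it is not the kernel function itself.

/* Userspace model of the guarded shift: shifting by >= the operand width
 * is undefined, so the count is checked against BITS_PER_LONG first. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long damp_step(unsigned long step, unsigned long dirty_ratelimit)
{
	unsigned long shift = dirty_ratelimit / (2 * step + 1);

	if (shift < BITS_PER_LONG)
		return DIV_ROUND_UP(step >> shift, 8);
	return 0;	/* shift would be undefined behaviour; fully damp the step */
}

int main(void)
{
	printf("%lu\n", damp_step(4096, 8192));		/* small ratio: 512 */
	printf("%lu\n", damp_step(1, 1UL << 20));	/* huge ratio: step collapses to 0 */
	return 0;
}
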
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..c46b75d14b6f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
223#endif 223#endif
224}; 224};
225 225
226char * const migratetype_names[MIGRATE_TYPES] = {
227 "Unmovable",
228 "Movable",
229 "Reclaimable",
230 "HighAtomic",
231#ifdef CONFIG_CMA
232 "CMA",
233#endif
234#ifdef CONFIG_MEMORY_ISOLATION
235 "Isolate",
236#endif
237};
238
226compound_page_dtor * const compound_page_dtors[] = { 239compound_page_dtor * const compound_page_dtors[] = {
227 NULL, 240 NULL,
228 free_compound_page, 241 free_compound_page,
@@ -247,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
247static unsigned long __initdata required_kernelcore; 260static unsigned long __initdata required_kernelcore;
248static unsigned long __initdata required_movablecore; 261static unsigned long __initdata required_movablecore;
249static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 262static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
263static bool mirrored_kernelcore;
250 264
251/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 265/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
252int movable_zone; 266int movable_zone;
@@ -416,7 +430,7 @@ static void bad_page(struct page *page, const char *reason,
416 goto out; 430 goto out;
417 } 431 }
418 if (nr_unshown) { 432 if (nr_unshown) {
419 printk(KERN_ALERT 433 pr_alert(
420 "BUG: Bad page state: %lu messages suppressed\n", 434 "BUG: Bad page state: %lu messages suppressed\n",
421 nr_unshown); 435 nr_unshown);
422 nr_unshown = 0; 436 nr_unshown = 0;
@@ -426,9 +440,14 @@ static void bad_page(struct page *page, const char *reason,
426 if (nr_shown++ == 0) 440 if (nr_shown++ == 0)
427 resume = jiffies + 60 * HZ; 441 resume = jiffies + 60 * HZ;
428 442
429 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 443 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
430 current->comm, page_to_pfn(page)); 444 current->comm, page_to_pfn(page));
431 dump_page_badflags(page, reason, bad_flags); 445 __dump_page(page, reason);
446 bad_flags &= page->flags;
447 if (bad_flags)
448 pr_alert("bad because of flags: %#lx(%pGp)\n",
449 bad_flags, &bad_flags);
450 dump_page_owner(page);
432 451
433 print_modules(); 452 print_modules();
434 dump_stack(); 453 dump_stack();
@@ -477,7 +496,8 @@ void prep_compound_page(struct page *page, unsigned int order)
477 496
478#ifdef CONFIG_DEBUG_PAGEALLOC 497#ifdef CONFIG_DEBUG_PAGEALLOC
479unsigned int _debug_guardpage_minorder; 498unsigned int _debug_guardpage_minorder;
480bool _debug_pagealloc_enabled __read_mostly; 499bool _debug_pagealloc_enabled __read_mostly
500 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
481bool _debug_guardpage_enabled __read_mostly; 501bool _debug_guardpage_enabled __read_mostly;
482 502
483static int __init early_debug_pagealloc(char *buf) 503static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf)
488 if (strcmp(buf, "on") == 0) 508 if (strcmp(buf, "on") == 0)
489 _debug_pagealloc_enabled = true; 509 _debug_pagealloc_enabled = true;
490 510
511 if (strcmp(buf, "off") == 0)
512 _debug_pagealloc_enabled = false;
513
491 return 0; 514 return 0;
492} 515}
493early_param("debug_pagealloc", early_debug_pagealloc); 516early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1002,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
1002 PAGE_SIZE << order); 1025 PAGE_SIZE << order);
1003 } 1026 }
1004 arch_free_page(page, order); 1027 arch_free_page(page, order);
1028 kernel_poison_pages(page, 1 << order, 0);
1005 kernel_map_pages(page, 1 << order, 0); 1029 kernel_map_pages(page, 1 << order, 0);
1006 1030
1007 return true; 1031 return true;
@@ -1104,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1104 return __free_pages_boot_core(page, pfn, order); 1128 return __free_pages_boot_core(page, pfn, order);
1105} 1129}
1106 1130
1131/*
1132 * Check that the whole (or subset of) a pageblock given by the interval of
1133 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1134 * with the migration of free compaction scanner. The scanners then need to
1135 * use only pfn_valid_within() check for arches that allow holes within
1136 * pageblocks.
1137 *
1138 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1139 *
1140 * It's possible on some configurations to have a setup like node0 node1 node0
1141 * i.e. it's possible that all pages within a zones range of pages do not
1142 * belong to a single zone. We assume that a border between node0 and node1
1143 * can occur within a single pageblock, but not a node0 node1 node0
1144 * interleaving within a single pageblock. It is therefore sufficient to check
1145 * the first and last page of a pageblock and avoid checking each individual
1146 * page in a pageblock.
1147 */
1148struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1149 unsigned long end_pfn, struct zone *zone)
1150{
1151 struct page *start_page;
1152 struct page *end_page;
1153
1154 /* end_pfn is one past the range we are checking */
1155 end_pfn--;
1156
1157 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1158 return NULL;
1159
1160 start_page = pfn_to_page(start_pfn);
1161
1162 if (page_zone(start_page) != zone)
1163 return NULL;
1164
1165 end_page = pfn_to_page(end_pfn);
1166
1167 /* This gives a shorter code than deriving page_zone(end_page) */
1168 if (page_zone_id(start_page) != page_zone_id(end_page))
1169 return NULL;
1170
1171 return start_page;
1172}
1173
1174void set_zone_contiguous(struct zone *zone)
1175{
1176 unsigned long block_start_pfn = zone->zone_start_pfn;
1177 unsigned long block_end_pfn;
1178
1179 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1180 for (; block_start_pfn < zone_end_pfn(zone);
1181 block_start_pfn = block_end_pfn,
1182 block_end_pfn += pageblock_nr_pages) {
1183
1184 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1185
1186 if (!__pageblock_pfn_to_page(block_start_pfn,
1187 block_end_pfn, zone))
1188 return;
1189 }
1190
1191 /* We confirm that there is no hole */
1192 zone->contiguous = true;
1193}
1194
1195void clear_zone_contiguous(struct zone *zone)
1196{
1197 zone->contiguous = false;
1198}
1199
1107#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1200#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1108static void __init deferred_free_range(struct page *page, 1201static void __init deferred_free_range(struct page *page,
1109 unsigned long pfn, int nr_pages) 1202 unsigned long pfn, int nr_pages)
@@ -1254,9 +1347,13 @@ free_range:
1254 pgdat_init_report_one_done(); 1347 pgdat_init_report_one_done();
1255 return 0; 1348 return 0;
1256} 1349}
1350#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1257 1351
1258void __init page_alloc_init_late(void) 1352void __init page_alloc_init_late(void)
1259{ 1353{
1354 struct zone *zone;
1355
1356#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1260 int nid; 1357 int nid;
1261 1358
1262 /* There will be num_node_state(N_MEMORY) threads */ 1359 /* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1367,11 @@ void __init page_alloc_init_late(void)
1270 1367
1271 /* Reinit limits that are based on free pages after the kernel is up */ 1368 /* Reinit limits that are based on free pages after the kernel is up */
1272 files_maxfiles_init(); 1369 files_maxfiles_init();
1370#endif
1371
1372 for_each_populated_zone(zone)
1373 set_zone_contiguous(zone);
1273} 1374}
1274#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1275 1375
1276#ifdef CONFIG_CMA 1376#ifdef CONFIG_CMA
1277/* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1377/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1381,15 +1481,24 @@ static inline int check_new_page(struct page *page)
1381 return 0; 1481 return 0;
1382} 1482}
1383 1483
1484static inline bool free_pages_prezeroed(bool poisoned)
1485{
1486 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1487 page_poisoning_enabled() && poisoned;
1488}
1489
1384static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1490static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1385 int alloc_flags) 1491 int alloc_flags)
1386{ 1492{
1387 int i; 1493 int i;
1494 bool poisoned = true;
1388 1495
1389 for (i = 0; i < (1 << order); i++) { 1496 for (i = 0; i < (1 << order); i++) {
1390 struct page *p = page + i; 1497 struct page *p = page + i;
1391 if (unlikely(check_new_page(p))) 1498 if (unlikely(check_new_page(p)))
1392 return 1; 1499 return 1;
1500 if (poisoned)
1501 poisoned &= page_is_poisoned(p);
1393 } 1502 }
1394 1503
1395 set_page_private(page, 0); 1504 set_page_private(page, 0);
@@ -1397,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1397 1506
1398 arch_alloc_page(page, order); 1507 arch_alloc_page(page, order);
1399 kernel_map_pages(page, 1 << order, 1); 1508 kernel_map_pages(page, 1 << order, 1);
1509 kernel_poison_pages(page, 1 << order, 1);
1400 kasan_alloc_pages(page, order); 1510 kasan_alloc_pages(page, order);
1401 1511
1402 if (gfp_flags & __GFP_ZERO) 1512 if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
1403 for (i = 0; i < (1 << order); i++) 1513 for (i = 0; i < (1 << order); i++)
1404 clear_highpage(page + i); 1514 clear_highpage(page + i);
1405 1515
@@ -2690,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
2690 va_end(args); 2800 va_end(args);
2691 } 2801 }
2692 2802
2693 pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", 2803 pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
2694 current->comm, order, gfp_mask); 2804 current->comm, order, gfp_mask, &gfp_mask);
2695
2696 dump_stack(); 2805 dump_stack();
2697 if (!should_suppress_show_mem()) 2806 if (!should_suppress_show_mem())
2698 show_mem(filter); 2807 show_mem(filter);
@@ -4491,6 +4600,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4491 pg_data_t *pgdat = NODE_DATA(nid); 4600 pg_data_t *pgdat = NODE_DATA(nid);
4492 unsigned long pfn; 4601 unsigned long pfn;
4493 unsigned long nr_initialised = 0; 4602 unsigned long nr_initialised = 0;
4603#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4604 struct memblock_region *r = NULL, *tmp;
4605#endif
4494 4606
4495 if (highest_memmap_pfn < end_pfn - 1) 4607 if (highest_memmap_pfn < end_pfn - 1)
4496 highest_memmap_pfn = end_pfn - 1; 4608 highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4616,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4504 4616
4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4617 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4506 /* 4618 /*
4507 * There can be holes in boot-time mem_map[]s 4619 * There can be holes in boot-time mem_map[]s handed to this
4508 * handed to this function. They do not 4620 * function. They do not exist on hotplugged memory.
4509 * exist on hotplugged memory.
4510 */ 4621 */
4511 if (context == MEMMAP_EARLY) { 4622 if (context != MEMMAP_EARLY)
4512 if (!early_pfn_valid(pfn)) 4623 goto not_early;
4624
4625 if (!early_pfn_valid(pfn))
4626 continue;
4627 if (!early_pfn_in_nid(pfn, nid))
4628 continue;
4629 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
4630 break;
4631
4632#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4633 /*
4634 * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
4635 * from zone_movable_pfn[nid] to end of each node should be
4636 * ZONE_MOVABLE not ZONE_NORMAL. skip it.
4637 */
4638 if (!mirrored_kernelcore && zone_movable_pfn[nid])
4639 if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
4513 continue; 4640 continue;
4514 if (!early_pfn_in_nid(pfn, nid)) 4641
4642 /*
4643 * Check given memblock attribute by firmware which can affect
4644 * kernel memory layout. If zone==ZONE_MOVABLE but memory is
4645 * mirrored, it's an overlapped memmap init. skip it.
4646 */
4647 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
4648 if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
4649 for_each_memblock(memory, tmp)
4650 if (pfn < memblock_region_memory_end_pfn(tmp))
4651 break;
4652 r = tmp;
4653 }
4654 if (pfn >= memblock_region_memory_base_pfn(r) &&
4655 memblock_is_mirror(r)) {
4656 /* already initialized as NORMAL */
4657 pfn = memblock_region_memory_end_pfn(r);
4515 continue; 4658 continue;
4516 if (!update_defer_init(pgdat, pfn, end_pfn, 4659 }
4517 &nr_initialised))
4518 break;
4519 } 4660 }
4661#endif
4520 4662
4663not_early:
4521 /* 4664 /*
4522 * Mark the block movable so that blocks are reserved for 4665 * Mark the block movable so that blocks are reserved for
4523 * movable at startup. This will force kernel allocations 4666 * movable at startup. This will force kernel allocations
@@ -4934,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
4934 *zone_end_pfn = min(node_end_pfn, 5077 *zone_end_pfn = min(node_end_pfn,
4935 arch_zone_highest_possible_pfn[movable_zone]); 5078 arch_zone_highest_possible_pfn[movable_zone]);
4936 5079
4937 /* Adjust for ZONE_MOVABLE starting within this range */
4938 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4939 *zone_end_pfn > zone_movable_pfn[nid]) {
4940 *zone_end_pfn = zone_movable_pfn[nid];
4941
4942 /* Check if this whole range is within ZONE_MOVABLE */ 5080 /* Check if this whole range is within ZONE_MOVABLE */
4943 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5081 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4944 *zone_start_pfn = *zone_end_pfn; 5082 *zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4953 unsigned long zone_type, 5091 unsigned long zone_type,
4954 unsigned long node_start_pfn, 5092 unsigned long node_start_pfn,
4955 unsigned long node_end_pfn, 5093 unsigned long node_end_pfn,
5094 unsigned long *zone_start_pfn,
5095 unsigned long *zone_end_pfn,
4956 unsigned long *ignored) 5096 unsigned long *ignored)
4957{ 5097{
4958 unsigned long zone_start_pfn, zone_end_pfn;
4959
4960 /* When hotadd a new node from cpu_up(), the node should be empty */ 5098 /* When hotadd a new node from cpu_up(), the node should be empty */
4961 if (!node_start_pfn && !node_end_pfn) 5099 if (!node_start_pfn && !node_end_pfn)
4962 return 0; 5100 return 0;
4963 5101
4964 /* Get the start and end of the zone */ 5102 /* Get the start and end of the zone */
4965 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 5103 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4966 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 5104 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4967 adjust_zone_range_for_zone_movable(nid, zone_type, 5105 adjust_zone_range_for_zone_movable(nid, zone_type,
4968 node_start_pfn, node_end_pfn, 5106 node_start_pfn, node_end_pfn,
4969 &zone_start_pfn, &zone_end_pfn); 5107 zone_start_pfn, zone_end_pfn);
4970 5108
4971 /* Check that this node has pages within the zone's required range */ 5109 /* Check that this node has pages within the zone's required range */
4972 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 5110 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
4973 return 0; 5111 return 0;
4974 5112
4975 /* Move the zone boundaries inside the node if necessary */ 5113 /* Move the zone boundaries inside the node if necessary */
4976 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 5114 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
4977 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 5115 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
4978 5116
4979 /* Return the spanned pages */ 5117 /* Return the spanned pages */
4980 return zone_end_pfn - zone_start_pfn; 5118 return *zone_end_pfn - *zone_start_pfn;
4981} 5119}
4982 5120
4983/* 5121/*
@@ -5023,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5023 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5161 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5024 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5162 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5025 unsigned long zone_start_pfn, zone_end_pfn; 5163 unsigned long zone_start_pfn, zone_end_pfn;
5164 unsigned long nr_absent;
5026 5165
5027 /* When hotadd a new node from cpu_up(), the node should be empty */ 5166 /* When hotadd a new node from cpu_up(), the node should be empty */
5028 if (!node_start_pfn && !node_end_pfn) 5167 if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5034 adjust_zone_range_for_zone_movable(nid, zone_type, 5173 adjust_zone_range_for_zone_movable(nid, zone_type,
5035 node_start_pfn, node_end_pfn, 5174 node_start_pfn, node_end_pfn,
5036 &zone_start_pfn, &zone_end_pfn); 5175 &zone_start_pfn, &zone_end_pfn);
5037 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5176 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5177
5178 /*
5179 * ZONE_MOVABLE handling.
5180 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
5181 * and vice versa.
5182 */
5183 if (zone_movable_pfn[nid]) {
5184 if (mirrored_kernelcore) {
5185 unsigned long start_pfn, end_pfn;
5186 struct memblock_region *r;
5187
5188 for_each_memblock(memory, r) {
5189 start_pfn = clamp(memblock_region_memory_base_pfn(r),
5190 zone_start_pfn, zone_end_pfn);
5191 end_pfn = clamp(memblock_region_memory_end_pfn(r),
5192 zone_start_pfn, zone_end_pfn);
5193
5194 if (zone_type == ZONE_MOVABLE &&
5195 memblock_is_mirror(r))
5196 nr_absent += end_pfn - start_pfn;
5197
5198 if (zone_type == ZONE_NORMAL &&
5199 !memblock_is_mirror(r))
5200 nr_absent += end_pfn - start_pfn;
5201 }
5202 } else {
5203 if (zone_type == ZONE_NORMAL)
5204 nr_absent += node_end_pfn - zone_movable_pfn[nid];
5205 }
5206 }
5207
5208 return nr_absent;
5038} 5209}
5039 5210
5040#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5211#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5042 unsigned long zone_type, 5213 unsigned long zone_type,
5043 unsigned long node_start_pfn, 5214 unsigned long node_start_pfn,
5044 unsigned long node_end_pfn, 5215 unsigned long node_end_pfn,
5216 unsigned long *zone_start_pfn,
5217 unsigned long *zone_end_pfn,
5045 unsigned long *zones_size) 5218 unsigned long *zones_size)
5046{ 5219{
5220 unsigned int zone;
5221
5222 *zone_start_pfn = node_start_pfn;
5223 for (zone = 0; zone < zone_type; zone++)
5224 *zone_start_pfn += zones_size[zone];
5225
5226 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5227
5047 return zones_size[zone_type]; 5228 return zones_size[zone_type];
5048} 5229}
5049 5230
@@ -5072,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5072 5253
5073 for (i = 0; i < MAX_NR_ZONES; i++) { 5254 for (i = 0; i < MAX_NR_ZONES; i++) {
5074 struct zone *zone = pgdat->node_zones + i; 5255 struct zone *zone = pgdat->node_zones + i;
5256 unsigned long zone_start_pfn, zone_end_pfn;
5075 unsigned long size, real_size; 5257 unsigned long size, real_size;
5076 5258
5077 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5259 size = zone_spanned_pages_in_node(pgdat->node_id, i,
5078 node_start_pfn, 5260 node_start_pfn,
5079 node_end_pfn, 5261 node_end_pfn,
5262 &zone_start_pfn,
5263 &zone_end_pfn,
5080 zones_size); 5264 zones_size);
5081 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5265 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5082 node_start_pfn, node_end_pfn, 5266 node_start_pfn, node_end_pfn,
5083 zholes_size); 5267 zholes_size);
5268 if (size)
5269 zone->zone_start_pfn = zone_start_pfn;
5270 else
5271 zone->zone_start_pfn = 0;
5084 zone->spanned_pages = size; 5272 zone->spanned_pages = size;
5085 zone->present_pages = real_size; 5273 zone->present_pages = real_size;
5086 5274
@@ -5201,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5201{ 5389{
5202 enum zone_type j; 5390 enum zone_type j;
5203 int nid = pgdat->node_id; 5391 int nid = pgdat->node_id;
5204 unsigned long zone_start_pfn = pgdat->node_start_pfn;
5205 int ret; 5392 int ret;
5206 5393
5207 pgdat_resize_init(pgdat); 5394 pgdat_resize_init(pgdat);
@@ -5222,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5222 for (j = 0; j < MAX_NR_ZONES; j++) { 5409 for (j = 0; j < MAX_NR_ZONES; j++) {
5223 struct zone *zone = pgdat->node_zones + j; 5410 struct zone *zone = pgdat->node_zones + j;
5224 unsigned long size, realsize, freesize, memmap_pages; 5411 unsigned long size, realsize, freesize, memmap_pages;
5412 unsigned long zone_start_pfn = zone->zone_start_pfn;
5225 5413
5226 size = zone->spanned_pages; 5414 size = zone->spanned_pages;
5227 realsize = freesize = zone->present_pages; 5415 realsize = freesize = zone->present_pages;
@@ -5290,7 +5478,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5290 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 5478 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
5291 BUG_ON(ret); 5479 BUG_ON(ret);
5292 memmap_init(size, nid, j, zone_start_pfn); 5480 memmap_init(size, nid, j, zone_start_pfn);
5293 zone_start_pfn += size;
5294 } 5481 }
5295} 5482}
5296 5483
@@ -5358,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5358 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5545 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5359 (u64)start_pfn << PAGE_SHIFT, 5546 (u64)start_pfn << PAGE_SHIFT,
5360 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 5547 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
5548#else
5549 start_pfn = node_start_pfn;
5361#endif 5550#endif
5362 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5551 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5363 zones_size, zholes_size); 5552 zones_size, zholes_size);
@@ -5529,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5529 } 5718 }
5530 5719
5531 /* 5720 /*
5721 * If kernelcore=mirror is specified, ignore movablecore option
5722 */
5723 if (mirrored_kernelcore) {
5724 bool mem_below_4gb_not_mirrored = false;
5725
5726 for_each_memblock(memory, r) {
5727 if (memblock_is_mirror(r))
5728 continue;
5729
5730 nid = r->nid;
5731
5732 usable_startpfn = memblock_region_memory_base_pfn(r);
5733
5734 if (usable_startpfn < 0x100000) {
5735 mem_below_4gb_not_mirrored = true;
5736 continue;
5737 }
5738
5739 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5740 min(usable_startpfn, zone_movable_pfn[nid]) :
5741 usable_startpfn;
5742 }
5743
5744 if (mem_below_4gb_not_mirrored)
5745 pr_warn("This configuration results in unmirrored kernel memory.");
5746
5747 goto out2;
5748 }
5749
5750 /*
5532 * If movablecore=nn[KMG] was specified, calculate what size of 5751 * If movablecore=nn[KMG] was specified, calculate what size of
5533 * kernelcore that corresponds so that memory usable for 5752 * kernelcore that corresponds so that memory usable for
5534 * any allocation type is evenly spread. If both kernelcore 5753 * any allocation type is evenly spread. If both kernelcore
@@ -5788,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
5788 */ 6007 */
5789static int __init cmdline_parse_kernelcore(char *p) 6008static int __init cmdline_parse_kernelcore(char *p)
5790{ 6009{
6010 /* parse kernelcore=mirror */
6011 if (parse_option_str(p, "mirror")) {
6012 mirrored_kernelcore = true;
6013 return 0;
6014 }
6015
5791 return cmdline_parse_core(p, &required_kernelcore); 6016 return cmdline_parse_core(p, &required_kernelcore);
5792} 6017}
5793 6018
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 292ca7b8debd..2d864e64f7fe 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page)
106 struct page_ext *base; 106 struct page_ext *base;
107 107
108 base = NODE_DATA(page_to_nid(page))->node_page_ext; 108 base = NODE_DATA(page_to_nid(page))->node_page_ext;
109#ifdef CONFIG_DEBUG_VM 109#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
110 /* 110 /*
111 * The sanity checks the page allocator does upon freeing a 111 * The sanity checks the page allocator does upon freeing a
112 * page can reach here before the page_ext arrays are 112 * page can reach here before the page_ext arrays are
113 * allocated when feeding a range of pages to the allocator 113 * allocated when feeding a range of pages to the allocator
114 * for the first time during bootup or memory hotplug. 114 * for the first time during bootup or memory hotplug.
115 *
116 * This check is also necessary for ensuring page poisoning
117 * works as expected when enabled
115 */ 118 */
116 if (unlikely(!base)) 119 if (unlikely(!base))
117 return NULL; 120 return NULL;
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page)
180{ 183{
181 unsigned long pfn = page_to_pfn(page); 184 unsigned long pfn = page_to_pfn(page);
182 struct mem_section *section = __pfn_to_section(pfn); 185 struct mem_section *section = __pfn_to_section(pfn);
183#ifdef CONFIG_DEBUG_VM 186#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
184 /* 187 /*
185 * The sanity checks the page allocator does upon freeing a 188 * The sanity checks the page allocator does upon freeing a
186 * page can reach here before the page_ext arrays are 189 * page can reach here before the page_ext arrays are
187 * allocated when feeding a range of pages to the allocator 190 * allocated when feeding a range of pages to the allocator
188 * for the first time during bootup or memory hotplug. 191 * for the first time during bootup or memory hotplug.
192 *
193 * This check is also necessary for ensuring page poisoning
194 * works as expected when enabled
189 */ 195 */
190 if (!section->page_ext) 196 if (!section->page_ext)
191 return NULL; 197 return NULL;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..44ad1f00c4e1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
5#include <linux/bootmem.h> 5#include <linux/bootmem.h>
6#include <linux/stacktrace.h> 6#include <linux/stacktrace.h>
7#include <linux/page_owner.h> 7#include <linux/page_owner.h>
8#include <linux/jump_label.h>
9#include <linux/migrate.h>
8#include "internal.h" 10#include "internal.h"
9 11
10static bool page_owner_disabled = true; 12static bool page_owner_disabled = true;
11bool page_owner_inited __read_mostly; 13DEFINE_STATIC_KEY_FALSE(page_owner_inited);
12 14
13static void init_early_allocated_pages(void); 15static void init_early_allocated_pages(void);
14 16
@@ -37,7 +39,7 @@ static void init_page_owner(void)
37 if (page_owner_disabled) 39 if (page_owner_disabled)
38 return; 40 return;
39 41
40 page_owner_inited = true; 42 static_branch_enable(&page_owner_inited);
41 init_early_allocated_pages(); 43 init_early_allocated_pages();
42} 44}
43 45
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
72 page_ext->order = order; 74 page_ext->order = order;
73 page_ext->gfp_mask = gfp_mask; 75 page_ext->gfp_mask = gfp_mask;
74 page_ext->nr_entries = trace.nr_entries; 76 page_ext->nr_entries = trace.nr_entries;
77 page_ext->last_migrate_reason = -1;
75 78
76 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 79 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
77} 80}
78 81
82void __set_page_owner_migrate_reason(struct page *page, int reason)
83{
84 struct page_ext *page_ext = lookup_page_ext(page);
85
86 page_ext->last_migrate_reason = reason;
87}
88
79gfp_t __get_page_owner_gfp(struct page *page) 89gfp_t __get_page_owner_gfp(struct page *page)
80{ 90{
81 struct page_ext *page_ext = lookup_page_ext(page); 91 struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
83 return page_ext->gfp_mask; 93 return page_ext->gfp_mask;
84} 94}
85 95
96void __copy_page_owner(struct page *oldpage, struct page *newpage)
97{
98 struct page_ext *old_ext = lookup_page_ext(oldpage);
99 struct page_ext *new_ext = lookup_page_ext(newpage);
100 int i;
101
102 new_ext->order = old_ext->order;
103 new_ext->gfp_mask = old_ext->gfp_mask;
104 new_ext->nr_entries = old_ext->nr_entries;
105
106 for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
107 new_ext->trace_entries[i] = old_ext->trace_entries[i];
108
109 /*
110 * We don't clear the bit on the oldpage as it's going to be freed
111 * after migration. Until then, the info can be useful in case of
 112	 * a bug, and the overall stats will be off a bit only temporarily.
113 * Also, migrate_misplaced_transhuge_page() can still fail the
114 * migration and then we want the oldpage to retain the info. But
115 * in that case we also don't need to explicitly clear the info from
116 * the new page, which will be freed.
117 */
118 __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
119}
120
86static ssize_t 121static ssize_t
87print_page_owner(char __user *buf, size_t count, unsigned long pfn, 122print_page_owner(char __user *buf, size_t count, unsigned long pfn,
88 struct page *page, struct page_ext *page_ext) 123 struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
100 return -ENOMEM; 135 return -ENOMEM;
101 136
102 ret = snprintf(kbuf, count, 137 ret = snprintf(kbuf, count,
103 "Page allocated via order %u, mask 0x%x\n", 138 "Page allocated via order %u, mask %#x(%pGg)\n",
104 page_ext->order, page_ext->gfp_mask); 139 page_ext->order, page_ext->gfp_mask,
140 &page_ext->gfp_mask);
105 141
106 if (ret >= count) 142 if (ret >= count)
107 goto err; 143 goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
110 pageblock_mt = get_pfnblock_migratetype(page, pfn); 146 pageblock_mt = get_pfnblock_migratetype(page, pfn);
111 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); 147 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
112 ret += snprintf(kbuf + ret, count - ret, 148 ret += snprintf(kbuf + ret, count - ret,
113 "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", 149 "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
114 pfn, 150 pfn,
151 migratetype_names[page_mt],
115 pfn >> pageblock_order, 152 pfn >> pageblock_order,
116 pageblock_mt, 153 migratetype_names[pageblock_mt],
117 pageblock_mt != page_mt ? "Fallback" : " ", 154 page->flags, &page->flags);
118 PageLocked(page) ? "K" : " ",
119 PageError(page) ? "E" : " ",
120 PageReferenced(page) ? "R" : " ",
121 PageUptodate(page) ? "U" : " ",
122 PageDirty(page) ? "D" : " ",
123 PageLRU(page) ? "L" : " ",
124 PageActive(page) ? "A" : " ",
125 PageSlab(page) ? "S" : " ",
126 PageWriteback(page) ? "W" : " ",
127 PageCompound(page) ? "C" : " ",
128 PageSwapCache(page) ? "B" : " ",
129 PageMappedToDisk(page) ? "M" : " ");
130 155
131 if (ret >= count) 156 if (ret >= count)
132 goto err; 157 goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
135 if (ret >= count) 160 if (ret >= count)
136 goto err; 161 goto err;
137 162
163 if (page_ext->last_migrate_reason != -1) {
164 ret += snprintf(kbuf + ret, count - ret,
165 "Page has been migrated, last migrate reason: %s\n",
166 migrate_reason_names[page_ext->last_migrate_reason]);
167 if (ret >= count)
168 goto err;
169 }
170
138 ret += snprintf(kbuf + ret, count - ret, "\n"); 171 ret += snprintf(kbuf + ret, count - ret, "\n");
139 if (ret >= count) 172 if (ret >= count)
140 goto err; 173 goto err;
@@ -150,6 +183,31 @@ err:
150 return -ENOMEM; 183 return -ENOMEM;
151} 184}
152 185
186void __dump_page_owner(struct page *page)
187{
188 struct page_ext *page_ext = lookup_page_ext(page);
189 struct stack_trace trace = {
190 .nr_entries = page_ext->nr_entries,
191 .entries = &page_ext->trace_entries[0],
192 };
193 gfp_t gfp_mask = page_ext->gfp_mask;
194 int mt = gfpflags_to_migratetype(gfp_mask);
195
196 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
197 pr_alert("page_owner info is not active (free page?)\n");
198 return;
199 }
200
201 pr_alert("page allocated via order %u, migratetype %s, "
202 "gfp_mask %#x(%pGg)\n", page_ext->order,
203 migratetype_names[mt], gfp_mask, &gfp_mask);
204 print_stack_trace(&trace, 0);
205
206 if (page_ext->last_migrate_reason != -1)
207 pr_alert("page has been migrated, last migrate reason: %s\n",
208 migrate_reason_names[page_ext->last_migrate_reason]);
209}
210
153static ssize_t 211static ssize_t
154read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) 212read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
155{ 213{
@@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
157 struct page *page; 215 struct page *page;
158 struct page_ext *page_ext; 216 struct page_ext *page_ext;
159 217
160 if (!page_owner_inited) 218 if (!static_branch_unlikely(&page_owner_inited))
161 return -EINVAL; 219 return -EINVAL;
162 220
163 page = NULL; 221 page = NULL;
@@ -305,7 +363,7 @@ static int __init pageowner_init(void)
305{ 363{
306 struct dentry *dentry; 364 struct dentry *dentry;
307 365
308 if (!page_owner_inited) { 366 if (!static_branch_unlikely(&page_owner_inited)) {
309 pr_info("page_owner is disabled\n"); 367 pr_info("page_owner is disabled\n");
310 return 0; 368 return 0;
311 } 369 }
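
For reference, the records printed by print_page_owner() are consumed from user space through the page_owner debugfs file. A minimal sketch of such a reader follows; it assumes CONFIG_PAGE_OWNER=y, a kernel booted with page_owner=on, debugfs mounted at /sys/kernel/debug, and root privileges, none of which are part of this patch.

/* Minimal sketch: stream page_owner records to stdout. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const char *path = "/sys/kernel/debug/page_owner"; /* assumed mount point */
    char buf[4096];
    size_t n;
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return EXIT_FAILURE;
    }
    /* Each record is a small text block; just pass it through. */
    while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
        fwrite(buf, 1, n, stdout);
    fclose(f);
    return 0;
}
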
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c
index 5bf5906ce13b..479e7ea2bea6 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/page_poison.c
@@ -6,22 +6,48 @@
6#include <linux/poison.h> 6#include <linux/poison.h>
7#include <linux/ratelimit.h> 7#include <linux/ratelimit.h>
8 8
9static bool page_poisoning_enabled __read_mostly; 9static bool __page_poisoning_enabled __read_mostly;
10static bool want_page_poisoning __read_mostly;
10 11
11static bool need_page_poisoning(void) 12static int early_page_poison_param(char *buf)
12{ 13{
13 if (!debug_pagealloc_enabled()) 14 if (!buf)
14 return false; 15 return -EINVAL;
16
17 if (strcmp(buf, "on") == 0)
18 want_page_poisoning = true;
19 else if (strcmp(buf, "off") == 0)
20 want_page_poisoning = false;
15 21
16 return true; 22 return 0;
23}
24early_param("page_poison", early_page_poison_param);
25
26bool page_poisoning_enabled(void)
27{
28 return __page_poisoning_enabled;
29}
30
31static bool need_page_poisoning(void)
32{
33 return want_page_poisoning;
17} 34}
18 35
19static void init_page_poisoning(void) 36static void init_page_poisoning(void)
20{ 37{
21 if (!debug_pagealloc_enabled()) 38 /*
22 return; 39 * page poisoning is debug page alloc for some arches. If either
40 * of those options are enabled, enable poisoning
41 */
42 if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
43 if (!want_page_poisoning && !debug_pagealloc_enabled())
44 return;
45 } else {
46 if (!want_page_poisoning)
47 return;
48 }
23 49
24 page_poisoning_enabled = true; 50 __page_poisoning_enabled = true;
25} 51}
26 52
27struct page_ext_operations page_poisoning_ops = { 53struct page_ext_operations page_poisoning_ops = {
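
A minimal stand-alone sketch of the enablement rule added above, with plain booleans standing in for CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC, the page_poison= boot parameter and debug_pagealloc_enabled(); the names are illustrative, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's config/boot state. */
static bool poisoning_wanted(bool arch_supports_debug_pagealloc,
                             bool want_page_poisoning,
                             bool debug_pagealloc)
{
    if (!arch_supports_debug_pagealloc) {
        /* Poisoning stands in for debug_pagealloc on these arches. */
        return want_page_poisoning || debug_pagealloc;
    }
    /* Real debug_pagealloc exists; only honour the explicit request. */
    return want_page_poisoning;
}

int main(void)
{
    printf("%d\n", poisoning_wanted(false, false, true)); /* 1 */
    printf("%d\n", poisoning_wanted(true, false, true));  /* 0 */
    return 0;
}
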
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page)
45 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); 71 __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
46} 72}
47 73
48static inline bool page_poison(struct page *page) 74bool page_is_poisoned(struct page *page)
49{ 75{
50 struct page_ext *page_ext; 76 struct page_ext *page_ext;
51 77
52 page_ext = lookup_page_ext(page); 78 page_ext = lookup_page_ext(page);
79 if (!page_ext)
80 return false;
81
53 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); 82 return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
54} 83}
55 84
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
83 unsigned char *start; 112 unsigned char *start;
84 unsigned char *end; 113 unsigned char *end;
85 114
115 if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
116 return;
117
86 start = memchr_inv(mem, PAGE_POISON, bytes); 118 start = memchr_inv(mem, PAGE_POISON, bytes);
87 if (!start) 119 if (!start)
88 return; 120 return;
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
95 if (!__ratelimit(&ratelimit)) 127 if (!__ratelimit(&ratelimit))
96 return; 128 return;
97 else if (start == end && single_bit_flip(*start, PAGE_POISON)) 129 else if (start == end && single_bit_flip(*start, PAGE_POISON))
98 printk(KERN_ERR "pagealloc: single bit error\n"); 130 pr_err("pagealloc: single bit error\n");
99 else 131 else
100 printk(KERN_ERR "pagealloc: memory corruption\n"); 132 pr_err("pagealloc: memory corruption\n");
101 133
102 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, 134 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
103 end - start + 1, 1); 135 end - start + 1, 1);
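
The classification done by check_poison_mem() can be reproduced in user space. A sketch follows, assuming the usual PAGE_POISON value of 0xaa from include/linux/poison.h; buffer size and the flipped bit are arbitrary.

#include <stdio.h>
#include <string.h>

#define POISON_BYTE 0xaa    /* mirrors PAGE_POISON */

/* True if got differs from expected in exactly one bit. */
static int single_bit_flip(unsigned char got, unsigned char expected)
{
    unsigned char diff = got ^ expected;

    return diff && !(diff & (diff - 1));
}

static void check_poison(const unsigned char *mem, size_t bytes)
{
    const unsigned char *start = mem, *end = mem + bytes - 1;

    while (start <= end && *start == POISON_BYTE)
        start++;
    if (start > end) {
        puts("still fully poisoned");
        return;
    }
    while (*end == POISON_BYTE)
        end--;
    if (start == end && single_bit_flip(*start, POISON_BYTE))
        puts("single bit error");
    else
        puts("memory corruption");
}

int main(void)
{
    unsigned char page[64];

    memset(page, POISON_BYTE, sizeof(page));
    page[10] ^= 0x08;       /* flip one bit */
    check_poison(page, sizeof(page));
    return 0;
}
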
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page)
108{ 140{
109 void *addr; 141 void *addr;
110 142
111 if (!page_poison(page)) 143 if (!page_is_poisoned(page))
112 return; 144 return;
113 145
114 addr = kmap_atomic(page); 146 addr = kmap_atomic(page);
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n)
125 unpoison_page(page + i); 157 unpoison_page(page + i);
126} 158}
127 159
128void __kernel_map_pages(struct page *page, int numpages, int enable) 160void kernel_poison_pages(struct page *page, int numpages, int enable)
129{ 161{
130 if (!page_poisoning_enabled) 162 if (!page_poisoning_enabled())
131 return; 163 return;
132 164
133 if (enable) 165 if (enable)
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
135 else 167 else
136 poison_pages(page, numpages); 168 poison_pages(page, numpages);
137} 169}
170
171#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
172void __kernel_map_pages(struct page *page, int numpages, int enable)
173{
174 /* This function does nothing, all work is done via poison pages */
175}
176#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 79f3bf047f38..02f0bfc3c80a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page,
1287 */ 1287 */
1288void page_add_file_rmap(struct page *page) 1288void page_add_file_rmap(struct page *page)
1289{ 1289{
1290 struct mem_cgroup *memcg; 1290 lock_page_memcg(page);
1291
1292 memcg = mem_cgroup_begin_page_stat(page);
1293 if (atomic_inc_and_test(&page->_mapcount)) { 1291 if (atomic_inc_and_test(&page->_mapcount)) {
1294 __inc_zone_page_state(page, NR_FILE_MAPPED); 1292 __inc_zone_page_state(page, NR_FILE_MAPPED);
1295 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1293 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1296 } 1294 }
1297 mem_cgroup_end_page_stat(memcg); 1295 unlock_page_memcg(page);
1298} 1296}
1299 1297
1300static void page_remove_file_rmap(struct page *page) 1298static void page_remove_file_rmap(struct page *page)
1301{ 1299{
1302 struct mem_cgroup *memcg; 1300 lock_page_memcg(page);
1303
1304 memcg = mem_cgroup_begin_page_stat(page);
1305 1301
1306 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1302 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1307 if (unlikely(PageHuge(page))) { 1303 if (unlikely(PageHuge(page))) {
@@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page)
1320 * pte lock(a spinlock) is held, which implies preemption disabled. 1316 * pte lock(a spinlock) is held, which implies preemption disabled.
1321 */ 1317 */
1322 __dec_zone_page_state(page, NR_FILE_MAPPED); 1318 __dec_zone_page_state(page, NR_FILE_MAPPED);
1323 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1319 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1324 1320
1325 if (unlikely(PageMlocked(page))) 1321 if (unlikely(PageMlocked(page)))
1326 clear_page_mlock(page); 1322 clear_page_mlock(page);
1327out: 1323out:
1328 mem_cgroup_end_page_stat(memcg); 1324 unlock_page_memcg(page);
1329} 1325}
1330 1326
1331static void page_remove_anon_compound_rmap(struct page *page) 1327static void page_remove_anon_compound_rmap(struct page *page)
diff --git a/mm/shmem.c b/mm/shmem.c
index 440e2a7e6c1c..1acfdbc4bd9e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1116 */ 1116 */
1117 oldpage = newpage; 1117 oldpage = newpage;
1118 } else { 1118 } else {
1119 mem_cgroup_replace_page(oldpage, newpage); 1119 mem_cgroup_migrate(oldpage, newpage);
1120 lru_cache_add_anon(newpage); 1120 lru_cache_add_anon(newpage);
1121 *pagep = newpage; 1121 *pagep = newpage;
1122 } 1122 }
diff --git a/mm/slab.c b/mm/slab.c
index 621fbcb35a36..852fc5c79829 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
169#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 169#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
170 170
171/* 171/*
172 * true if a page was allocated from pfmemalloc reserves for network-based
173 * swap
174 */
175static bool pfmemalloc_active __read_mostly;
176
177/*
178 * struct array_cache 172 * struct array_cache
179 * 173 *
180 * Purpose: 174 * Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
195 * Must have this definition in here for the proper 189 * Must have this definition in here for the proper
196 * alignment of array_cache. Also simplifies accessing 190 * alignment of array_cache. Also simplifies accessing
197 * the entries. 191 * the entries.
198 *
199 * Entries should not be directly dereferenced as
200 * entries belonging to slabs marked pfmemalloc will
201 * have the lower bits set SLAB_OBJ_PFMEMALLOC
202 */ 192 */
203}; 193};
204 194
@@ -207,33 +197,6 @@ struct alien_cache {
207 struct array_cache ac; 197 struct array_cache ac;
208}; 198};
209 199
210#define SLAB_OBJ_PFMEMALLOC 1
211static inline bool is_obj_pfmemalloc(void *objp)
212{
213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
214}
215
216static inline void set_obj_pfmemalloc(void **objp)
217{
218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
219 return;
220}
221
222static inline void clear_obj_pfmemalloc(void **objp)
223{
224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
225}
226
227/*
228 * bootstrap: The caches do not work without cpuarrays anymore, but the
229 * cpuarrays are allocated from the generic caches...
230 */
231#define BOOT_CPUCACHE_ENTRIES 1
232struct arraycache_init {
233 struct array_cache cache;
234 void *entries[BOOT_CPUCACHE_ENTRIES];
235};
236
237/* 200/*
238 * Need this for bootstrapping a per node allocator. 201 * Need this for bootstrapping a per node allocator.
239 */ 202 */
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
280 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 243 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
281 } while (0) 244 } while (0)
282 245
246#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
283#define CFLGS_OFF_SLAB (0x80000000UL) 247#define CFLGS_OFF_SLAB (0x80000000UL)
248#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
284#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 249#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
285#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
286 250
287#define BATCHREFILL_LIMIT 16 251#define BATCHREFILL_LIMIT 16
288/* 252/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
390 354
391#endif 355#endif
392 356
393#define OBJECT_FREE (0)
394#define OBJECT_ACTIVE (1)
395
396#ifdef CONFIG_DEBUG_SLAB_LEAK 357#ifdef CONFIG_DEBUG_SLAB_LEAK
397 358
398static void set_obj_status(struct page *page, int idx, int val) 359static inline bool is_store_user_clean(struct kmem_cache *cachep)
399{ 360{
400 int freelist_size; 361 return atomic_read(&cachep->store_user_clean) == 1;
401 char *status;
402 struct kmem_cache *cachep = page->slab_cache;
403
404 freelist_size = cachep->num * sizeof(freelist_idx_t);
405 status = (char *)page->freelist + freelist_size;
406 status[idx] = val;
407} 362}
408 363
409static inline unsigned int get_obj_status(struct page *page, int idx) 364static inline void set_store_user_clean(struct kmem_cache *cachep)
410{ 365{
411 int freelist_size; 366 atomic_set(&cachep->store_user_clean, 1);
412 char *status; 367}
413 struct kmem_cache *cachep = page->slab_cache;
414
415 freelist_size = cachep->num * sizeof(freelist_idx_t);
416 status = (char *)page->freelist + freelist_size;
417 368
418 return status[idx]; 369static inline void set_store_user_dirty(struct kmem_cache *cachep)
370{
371 if (is_store_user_clean(cachep))
372 atomic_set(&cachep->store_user_clean, 0);
419} 373}
420 374
421#else 375#else
422static inline void set_obj_status(struct page *page, int idx, int val) {} 376static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
423 377
424#endif 378#endif
425 379
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
457 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 411 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
458} 412}
459 413
414#define BOOT_CPUCACHE_ENTRIES 1
460/* internal cache of cache description objs */ 415/* internal cache of cache description objs */
461static struct kmem_cache kmem_cache_boot = { 416static struct kmem_cache kmem_cache_boot = {
462 .batchcount = 1, 417 .batchcount = 1,
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
475 return this_cpu_ptr(cachep->cpu_cache); 430 return this_cpu_ptr(cachep->cpu_cache);
476} 431}
477 432
478static size_t calculate_freelist_size(int nr_objs, size_t align)
479{
480 size_t freelist_size;
481
482 freelist_size = nr_objs * sizeof(freelist_idx_t);
483 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
484 freelist_size += nr_objs * sizeof(char);
485
486 if (align)
487 freelist_size = ALIGN(freelist_size, align);
488
489 return freelist_size;
490}
491
492static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
493 size_t idx_size, size_t align)
494{
495 int nr_objs;
496 size_t remained_size;
497 size_t freelist_size;
498 int extra_space = 0;
499
500 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
501 extra_space = sizeof(char);
502 /*
503 * Ignore padding for the initial guess. The padding
504 * is at most @align-1 bytes, and @buffer_size is at
505 * least @align. In the worst case, this result will
506 * be one greater than the number of objects that fit
507 * into the memory allocation when taking the padding
508 * into account.
509 */
510 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
511
512 /*
513 * This calculated number will be either the right
514 * amount, or one greater than what we want.
515 */
516 remained_size = slab_size - nr_objs * buffer_size;
517 freelist_size = calculate_freelist_size(nr_objs, align);
518 if (remained_size < freelist_size)
519 nr_objs--;
520
521 return nr_objs;
522}
523
524/* 433/*
525 * Calculate the number of objects and left-over bytes for a given buffer size. 434 * Calculate the number of objects and left-over bytes for a given buffer size.
526 */ 435 */
527static void cache_estimate(unsigned long gfporder, size_t buffer_size, 436static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
528 size_t align, int flags, size_t *left_over, 437 unsigned long flags, size_t *left_over)
529 unsigned int *num)
530{ 438{
531 int nr_objs; 439 unsigned int num;
532 size_t mgmt_size;
533 size_t slab_size = PAGE_SIZE << gfporder; 440 size_t slab_size = PAGE_SIZE << gfporder;
534 441
535 /* 442 /*
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
537 * on it. For the latter case, the memory allocated for a 444 * on it. For the latter case, the memory allocated for a
538 * slab is used for: 445 * slab is used for:
539 * 446 *
540 * - One unsigned int for each object
541 * - Padding to respect alignment of @align
542 * - @buffer_size bytes for each object 447 * - @buffer_size bytes for each object
448 * - One freelist_idx_t for each object
449 *
450 * We don't need to consider alignment of freelist because
451 * freelist will be at the end of slab page. The objects will be
452 * at the correct alignment.
543 * 453 *
544 * If the slab management structure is off the slab, then the 454 * If the slab management structure is off the slab, then the
545 * alignment will already be calculated into the size. Because 455 * alignment will already be calculated into the size. Because
546 * the slabs are all pages aligned, the objects will be at the 456 * the slabs are all pages aligned, the objects will be at the
547 * correct alignment when allocated. 457 * correct alignment when allocated.
548 */ 458 */
549 if (flags & CFLGS_OFF_SLAB) { 459 if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
550 mgmt_size = 0; 460 num = slab_size / buffer_size;
551 nr_objs = slab_size / buffer_size; 461 *left_over = slab_size % buffer_size;
552
553 } else { 462 } else {
554 nr_objs = calculate_nr_objs(slab_size, buffer_size, 463 num = slab_size / (buffer_size + sizeof(freelist_idx_t));
555 sizeof(freelist_idx_t), align); 464 *left_over = slab_size %
556 mgmt_size = calculate_freelist_size(nr_objs, align); 465 (buffer_size + sizeof(freelist_idx_t));
557 } 466 }
558 *num = nr_objs; 467
559 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 468 return num;
560} 469}
561 470
562#if DEBUG 471#if DEBUG
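
A stand-alone sketch of the simplified cache_estimate() arithmetic, assuming a 4 KiB page and a two-byte freelist_idx_t (the type is unsigned short unless the byte-sized index is in use); the numbers in main() are arbitrary.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL    /* assumption for the example */
typedef unsigned short freelist_idx_t;

/* Mirrors the reworked cache_estimate(): one index per object for
 * on-slab freelists, plain division for off-slab/objfreelist. */
static unsigned int estimate(unsigned long gfporder, size_t buffer_size,
                             int freelist_off_slab, size_t *left_over)
{
    size_t slab_size = PAGE_SIZE << gfporder;
    unsigned int num;

    if (freelist_off_slab) {
        num = slab_size / buffer_size;
        *left_over = slab_size % buffer_size;
    } else {
        num = slab_size / (buffer_size + sizeof(freelist_idx_t));
        *left_over = slab_size % (buffer_size + sizeof(freelist_idx_t));
    }
    return num;
}

int main(void)
{
    size_t left;
    unsigned int num = estimate(0, 256, 0, &left);

    printf("order 0, 256-byte objects: %u objects, %zu bytes left\n",
           num, left);
    return 0;
}
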
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries,
687 return ac; 596 return ac;
688} 597}
689 598
690static inline bool is_slab_pfmemalloc(struct page *page) 599static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
691{ 600 struct page *page, void *objp)
692 return PageSlabPfmemalloc(page);
693}
694
695/* Clears pfmemalloc_active if no slabs have pfmalloc set */
696static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
697 struct array_cache *ac)
698{
699 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
700 struct page *page;
701 unsigned long flags;
702
703 if (!pfmemalloc_active)
704 return;
705
706 spin_lock_irqsave(&n->list_lock, flags);
707 list_for_each_entry(page, &n->slabs_full, lru)
708 if (is_slab_pfmemalloc(page))
709 goto out;
710
711 list_for_each_entry(page, &n->slabs_partial, lru)
712 if (is_slab_pfmemalloc(page))
713 goto out;
714
715 list_for_each_entry(page, &n->slabs_free, lru)
716 if (is_slab_pfmemalloc(page))
717 goto out;
718
719 pfmemalloc_active = false;
720out:
721 spin_unlock_irqrestore(&n->list_lock, flags);
722}
723
724static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
725 gfp_t flags, bool force_refill)
726{ 601{
727 int i; 602 struct kmem_cache_node *n;
728 void *objp = ac->entry[--ac->avail]; 603 int page_node;
729 604 LIST_HEAD(list);
730 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
731 if (unlikely(is_obj_pfmemalloc(objp))) {
732 struct kmem_cache_node *n;
733
734 if (gfp_pfmemalloc_allowed(flags)) {
735 clear_obj_pfmemalloc(&objp);
736 return objp;
737 }
738
739 /* The caller cannot use PFMEMALLOC objects, find another one */
740 for (i = 0; i < ac->avail; i++) {
741 /* If a !PFMEMALLOC object is found, swap them */
742 if (!is_obj_pfmemalloc(ac->entry[i])) {
743 objp = ac->entry[i];
744 ac->entry[i] = ac->entry[ac->avail];
745 ac->entry[ac->avail] = objp;
746 return objp;
747 }
748 }
749
750 /*
751 * If there are empty slabs on the slabs_free list and we are
752 * being forced to refill the cache, mark this one !pfmemalloc.
753 */
754 n = get_node(cachep, numa_mem_id());
755 if (!list_empty(&n->slabs_free) && force_refill) {
756 struct page *page = virt_to_head_page(objp);
757 ClearPageSlabPfmemalloc(page);
758 clear_obj_pfmemalloc(&objp);
759 recheck_pfmemalloc_active(cachep, ac);
760 return objp;
761 }
762
763 /* No !PFMEMALLOC objects available */
764 ac->avail++;
765 objp = NULL;
766 }
767
768 return objp;
769}
770
771static inline void *ac_get_obj(struct kmem_cache *cachep,
772 struct array_cache *ac, gfp_t flags, bool force_refill)
773{
774 void *objp;
775
776 if (unlikely(sk_memalloc_socks()))
777 objp = __ac_get_obj(cachep, ac, flags, force_refill);
778 else
779 objp = ac->entry[--ac->avail];
780
781 return objp;
782}
783
784static noinline void *__ac_put_obj(struct kmem_cache *cachep,
785 struct array_cache *ac, void *objp)
786{
787 if (unlikely(pfmemalloc_active)) {
788 /* Some pfmemalloc slabs exist, check if this is one */
789 struct page *page = virt_to_head_page(objp);
790 if (PageSlabPfmemalloc(page))
791 set_obj_pfmemalloc(&objp);
792 }
793 605
794 return objp; 606 page_node = page_to_nid(page);
795} 607 n = get_node(cachep, page_node);
796 608
797static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 609 spin_lock(&n->list_lock);
798 void *objp) 610 free_block(cachep, &objp, 1, page_node, &list);
799{ 611 spin_unlock(&n->list_lock);
800 if (unlikely(sk_memalloc_socks()))
801 objp = __ac_put_obj(cachep, ac, objp);
802 612
803 ac->entry[ac->avail++] = objp; 613 slabs_destroy(cachep, &list);
804} 614}
805 615
806/* 616/*
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
1003 STATS_INC_ACOVERFLOW(cachep); 813 STATS_INC_ACOVERFLOW(cachep);
1004 __drain_alien_cache(cachep, ac, page_node, &list); 814 __drain_alien_cache(cachep, ac, page_node, &list);
1005 } 815 }
1006 ac_put_obj(cachep, ac, objp); 816 ac->entry[ac->avail++] = objp;
1007 spin_unlock(&alien->lock); 817 spin_unlock(&alien->lock);
1008 slabs_destroy(cachep, &list); 818 slabs_destroy(cachep, &list);
1009 } else { 819 } else {
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1540 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1350 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1541 return; 1351 return;
1542 1352
1543 printk(KERN_WARNING 1353 pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
1544 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1354 nodeid, gfpflags, &gfpflags);
1545 nodeid, gfpflags); 1355 pr_warn(" cache: %s, object size: %d, order: %d\n",
1546 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1547 cachep->name, cachep->size, cachep->gfporder); 1356 cachep->name, cachep->size, cachep->gfporder);
1548 1357
1549 for_each_kmem_cache_node(cachep, node, n) { 1358 for_each_kmem_cache_node(cachep, node, n) {
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1567 1376
1568 num_slabs += active_slabs; 1377 num_slabs += active_slabs;
1569 num_objs = num_slabs * cachep->num; 1378 num_objs = num_slabs * cachep->num;
1570 printk(KERN_WARNING 1379 pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1571 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1572 node, active_slabs, num_slabs, active_objs, num_objs, 1380 node, active_slabs, num_slabs, active_objs, num_objs,
1573 free_objects); 1381 free_objects);
1574 } 1382 }
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1604 return NULL; 1412 return NULL;
1605 } 1413 }
1606 1414
1607 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1608 if (page_is_pfmemalloc(page))
1609 pfmemalloc_active = true;
1610
1611 nr_pages = (1 << cachep->gfporder); 1415 nr_pages = (1 << cachep->gfporder);
1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1416 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1613 add_zone_page_state(page_zone(page), 1417 add_zone_page_state(page_zone(page),
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1615 else 1419 else
1616 add_zone_page_state(page_zone(page), 1420 add_zone_page_state(page_zone(page),
1617 NR_SLAB_UNRECLAIMABLE, nr_pages); 1421 NR_SLAB_UNRECLAIMABLE, nr_pages);
1422
1618 __SetPageSlab(page); 1423 __SetPageSlab(page);
1619 if (page_is_pfmemalloc(page)) 1424 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1425 if (sk_memalloc_socks() && page_is_pfmemalloc(page))
1620 SetPageSlabPfmemalloc(page); 1426 SetPageSlabPfmemalloc(page);
1621 1427
1622 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1428 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head)
1670} 1476}
1671 1477
1672#if DEBUG 1478#if DEBUG
1479static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
1480{
1481 if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
1482 (cachep->size % PAGE_SIZE) == 0)
1483 return true;
1484
1485 return false;
1486}
1673 1487
1674#ifdef CONFIG_DEBUG_PAGEALLOC 1488#ifdef CONFIG_DEBUG_PAGEALLOC
1675static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1489static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1703 } 1517 }
1704 *addr++ = 0x87654321; 1518 *addr++ = 0x87654321;
1705} 1519}
1520
1521static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1522 int map, unsigned long caller)
1523{
1524 if (!is_debug_pagealloc_cache(cachep))
1525 return;
1526
1527 if (caller)
1528 store_stackinfo(cachep, objp, caller);
1529
1530 kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
1531}
1532
1533#else
1534static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
1535 int map, unsigned long caller) {}
1536
1706#endif 1537#endif
1707 1538
1708static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1539static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1781 int size, i; 1612 int size, i;
1782 int lines = 0; 1613 int lines = 0;
1783 1614
1615 if (is_debug_pagealloc_cache(cachep))
1616 return;
1617
1784 realobj = (char *)objp + obj_offset(cachep); 1618 realobj = (char *)objp + obj_offset(cachep);
1785 size = cachep->object_size; 1619 size = cachep->object_size;
1786 1620
@@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1842 struct page *page) 1676 struct page *page)
1843{ 1677{
1844 int i; 1678 int i;
1679
1680 if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
1681 poison_obj(cachep, page->freelist - obj_offset(cachep),
1682 POISON_FREE);
1683 }
1684
1845 for (i = 0; i < cachep->num; i++) { 1685 for (i = 0; i < cachep->num; i++) {
1846 void *objp = index_to_obj(cachep, page, i); 1686 void *objp = index_to_obj(cachep, page, i);
1847 1687
1848 if (cachep->flags & SLAB_POISON) { 1688 if (cachep->flags & SLAB_POISON) {
1849#ifdef CONFIG_DEBUG_PAGEALLOC
1850 if (cachep->size % PAGE_SIZE == 0 &&
1851 OFF_SLAB(cachep))
1852 kernel_map_pages(virt_to_page(objp),
1853 cachep->size / PAGE_SIZE, 1);
1854 else
1855 check_poison_obj(cachep, objp);
1856#else
1857 check_poison_obj(cachep, objp); 1689 check_poison_obj(cachep, objp);
1858#endif 1690 slab_kernel_map(cachep, objp, 1, 0);
1859 } 1691 }
1860 if (cachep->flags & SLAB_RED_ZONE) { 1692 if (cachep->flags & SLAB_RED_ZONE) {
1861 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1693 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -1916,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1916 * calculate_slab_order - calculate size (page order) of slabs 1748 * calculate_slab_order - calculate size (page order) of slabs
1917 * @cachep: pointer to the cache that is being created 1749 * @cachep: pointer to the cache that is being created
1918 * @size: size of objects to be created in this cache. 1750 * @size: size of objects to be created in this cache.
1919 * @align: required alignment for the objects.
1920 * @flags: slab allocation flags 1751 * @flags: slab allocation flags
1921 * 1752 *
1922 * Also calculates the number of objects per slab. 1753 * Also calculates the number of objects per slab.
@@ -1926,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1926 * towards high-order requests, this should be changed. 1757 * towards high-order requests, this should be changed.
1927 */ 1758 */
1928static size_t calculate_slab_order(struct kmem_cache *cachep, 1759static size_t calculate_slab_order(struct kmem_cache *cachep,
1929 size_t size, size_t align, unsigned long flags) 1760 size_t size, unsigned long flags)
1930{ 1761{
1931 unsigned long offslab_limit;
1932 size_t left_over = 0; 1762 size_t left_over = 0;
1933 int gfporder; 1763 int gfporder;
1934 1764
@@ -1936,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1936 unsigned int num; 1766 unsigned int num;
1937 size_t remainder; 1767 size_t remainder;
1938 1768
1939 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1769 num = cache_estimate(gfporder, size, flags, &remainder);
1940 if (!num) 1770 if (!num)
1941 continue; 1771 continue;
1942 1772
@@ -1945,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1945 break; 1775 break;
1946 1776
1947 if (flags & CFLGS_OFF_SLAB) { 1777 if (flags & CFLGS_OFF_SLAB) {
1948 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 1778 struct kmem_cache *freelist_cache;
1779 size_t freelist_size;
1780
1781 freelist_size = num * sizeof(freelist_idx_t);
1782 freelist_cache = kmalloc_slab(freelist_size, 0u);
1783 if (!freelist_cache)
1784 continue;
1785
1949 /* 1786 /*
1950 * Max number of objs-per-slab for caches which 1787 * Needed to avoid possible looping condition
1951 * use off-slab slabs. Needed to avoid a possible 1788 * in cache_grow()
1952 * looping condition in cache_grow().
1953 */ 1789 */
1954 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 1790 if (OFF_SLAB(freelist_cache))
1955 freelist_size_per_obj += sizeof(char); 1791 continue;
1956 offslab_limit = size;
1957 offslab_limit /= freelist_size_per_obj;
1958 1792
1959 if (num > offslab_limit) 1793 /* check if off slab has enough benefit */
1960 break; 1794 if (freelist_cache->size > cachep->size / 2)
1795 continue;
1961 } 1796 }
1962 1797
1963 /* Found something acceptable - save it away */ 1798 /* Found something acceptable - save it away */
@@ -2075,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
2075 return cachep; 1910 return cachep;
2076} 1911}
2077 1912
1913static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1914 size_t size, unsigned long flags)
1915{
1916 size_t left;
1917
1918 cachep->num = 0;
1919
1920 if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
1921 return false;
1922
1923 left = calculate_slab_order(cachep, size,
1924 flags | CFLGS_OBJFREELIST_SLAB);
1925 if (!cachep->num)
1926 return false;
1927
1928 if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
1929 return false;
1930
1931 cachep->colour = left / cachep->colour_off;
1932
1933 return true;
1934}
1935
1936static bool set_off_slab_cache(struct kmem_cache *cachep,
1937 size_t size, unsigned long flags)
1938{
1939 size_t left;
1940
1941 cachep->num = 0;
1942
1943 /*
1944 * Always use on-slab management when SLAB_NOLEAKTRACE
1945 * to avoid recursive calls into kmemleak.
1946 */
1947 if (flags & SLAB_NOLEAKTRACE)
1948 return false;
1949
1950 /*
1951 * Size is large, assume best to place the slab management obj
1952 * off-slab (should allow better packing of objs).
1953 */
1954 left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
1955 if (!cachep->num)
1956 return false;
1957
1958 /*
1959 * If the slab has been placed off-slab, and we have enough space then
1960 * move it on-slab. This is at the expense of any extra colouring.
1961 */
1962 if (left >= cachep->num * sizeof(freelist_idx_t))
1963 return false;
1964
1965 cachep->colour = left / cachep->colour_off;
1966
1967 return true;
1968}
1969
1970static bool set_on_slab_cache(struct kmem_cache *cachep,
1971 size_t size, unsigned long flags)
1972{
1973 size_t left;
1974
1975 cachep->num = 0;
1976
1977 left = calculate_slab_order(cachep, size, flags);
1978 if (!cachep->num)
1979 return false;
1980
1981 cachep->colour = left / cachep->colour_off;
1982
1983 return true;
1984}
1985
2078/** 1986/**
2079 * __kmem_cache_create - Create a cache. 1987 * __kmem_cache_create - Create a cache.
2080 * @cachep: cache management descriptor 1988 * @cachep: cache management descriptor
@@ -2099,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
2099int 2007int
2100__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2008__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2101{ 2009{
2102 size_t left_over, freelist_size;
2103 size_t ralign = BYTES_PER_WORD; 2010 size_t ralign = BYTES_PER_WORD;
2104 gfp_t gfp; 2011 gfp_t gfp;
2105 int err; 2012 int err;
@@ -2119,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2119 if (!(flags & SLAB_DESTROY_BY_RCU)) 2026 if (!(flags & SLAB_DESTROY_BY_RCU))
2120 flags |= SLAB_POISON; 2027 flags |= SLAB_POISON;
2121#endif 2028#endif
2122 if (flags & SLAB_DESTROY_BY_RCU)
2123 BUG_ON(flags & SLAB_POISON);
2124#endif 2029#endif
2125 2030
2126 /* 2031 /*
@@ -2152,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2152 * 4) Store it. 2057 * 4) Store it.
2153 */ 2058 */
2154 cachep->align = ralign; 2059 cachep->align = ralign;
2060 cachep->colour_off = cache_line_size();
2061 /* Offset must be a multiple of the alignment. */
2062 if (cachep->colour_off < cachep->align)
2063 cachep->colour_off = cachep->align;
2155 2064
2156 if (slab_is_available()) 2065 if (slab_is_available())
2157 gfp = GFP_KERNEL; 2066 gfp = GFP_KERNEL;
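
The colouring arithmetic reduces to one division once colour_off is fixed. A sketch with assumed numbers (64-byte cache line, 8-byte alignment, 226 leftover bytes):

#include <stdio.h>

int main(void)
{
    unsigned int colour_off = 64;   /* assumed cache_line_size() */
    unsigned int align = 8;         /* assumed object alignment */
    unsigned int left_over = 226;   /* e.g. from the estimate above */

    /* Offset must be a multiple of the alignment. */
    if (colour_off < align)
        colour_off = align;
    /* Number of distinct colour offsets this cache can cycle through. */
    unsigned int colour = left_over / colour_off;

    printf("colour_off=%u, colours=%u\n", colour_off, colour);
    return 0;
}
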
@@ -2179,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2179 else 2088 else
2180 size += BYTES_PER_WORD; 2089 size += BYTES_PER_WORD;
2181 } 2090 }
2182#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2183 /*
2184 * To activate debug pagealloc, off-slab management is necessary
2185 * requirement. In early phase of initialization, small sized slab
2186 * doesn't get initialized so it would not be possible. So, we need
2187 * to check size >= 256. It guarantees that all necessary small
2188 * sized slab is initialized in current slab initialization sequence.
2189 */
2190 if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
2191 size >= 256 && cachep->object_size > cache_line_size() &&
2192 ALIGN(size, cachep->align) < PAGE_SIZE) {
2193 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2194 size = PAGE_SIZE;
2195 }
2196#endif
2197#endif 2091#endif
2198 2092
2199 /*
2200 * Determine if the slab management is 'on' or 'off' slab.
2201 * (bootstrapping cannot cope with offslab caches so don't do
2202 * it too early on. Always use on-slab management when
2203 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2204 */
2205 if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
2206 !(flags & SLAB_NOLEAKTRACE))
2207 /*
2208 * Size is large, assume best to place the slab management obj
2209 * off-slab (should allow better packing of objs).
2210 */
2211 flags |= CFLGS_OFF_SLAB;
2212
2213 size = ALIGN(size, cachep->align); 2093 size = ALIGN(size, cachep->align);
2214 /* 2094 /*
2215 * We should restrict the number of objects in a slab to implement 2095 * We should restrict the number of objects in a slab to implement
@@ -2218,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2218 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2098 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
2219 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2099 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
2220 2100
2221 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2101#if DEBUG
2222
2223 if (!cachep->num)
2224 return -E2BIG;
2225
2226 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2227
2228 /* 2102 /*
2229 * If the slab has been placed off-slab, and we have enough space then 2103 * To activate debug pagealloc, off-slab management is necessary
2230 * move it on-slab. This is at the expense of any extra colouring. 2104 * requirement. In the early phase of initialization, small sized slabs
 2105 * don't get initialized, so it would not be possible. So we need
 2106 * to check size >= 256. That guarantees that all the necessary small
 2107 * sized slabs are initialized in the current slab initialization sequence.
2231 */ 2108 */
2232 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2109 if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
2233 flags &= ~CFLGS_OFF_SLAB; 2110 size >= 256 && cachep->object_size > cache_line_size()) {
2234 left_over -= freelist_size; 2111 if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
2112 size_t tmp_size = ALIGN(size, PAGE_SIZE);
2113
2114 if (set_off_slab_cache(cachep, tmp_size, flags)) {
2115 flags |= CFLGS_OFF_SLAB;
2116 cachep->obj_offset += tmp_size - size;
2117 size = tmp_size;
2118 goto done;
2119 }
2120 }
2235 } 2121 }
2122#endif
2236 2123
2237 if (flags & CFLGS_OFF_SLAB) { 2124 if (set_objfreelist_slab_cache(cachep, size, flags)) {
2238 /* really off slab. No need for manual alignment */ 2125 flags |= CFLGS_OBJFREELIST_SLAB;
2239 freelist_size = calculate_freelist_size(cachep->num, 0); 2126 goto done;
2127 }
2240 2128
2241#ifdef CONFIG_PAGE_POISONING 2129 if (set_off_slab_cache(cachep, size, flags)) {
2242 /* If we're going to use the generic kernel_map_pages() 2130 flags |= CFLGS_OFF_SLAB;
2243 * poisoning, then it's going to smash the contents of 2131 goto done;
2244 * the redzone and userword anyhow, so switch them off.
2245 */
2246 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2247 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2248#endif
2249 } 2132 }
2250 2133
2251 cachep->colour_off = cache_line_size(); 2134 if (set_on_slab_cache(cachep, size, flags))
2252 /* Offset must be a multiple of the alignment. */ 2135 goto done;
2253 if (cachep->colour_off < cachep->align) 2136
2254 cachep->colour_off = cachep->align; 2137 return -E2BIG;
2255 cachep->colour = left_over / cachep->colour_off; 2138
2256 cachep->freelist_size = freelist_size; 2139done:
2140 cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
2257 cachep->flags = flags; 2141 cachep->flags = flags;
2258 cachep->allocflags = __GFP_COMP; 2142 cachep->allocflags = __GFP_COMP;
2259 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2143 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
@@ -2261,16 +2145,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2261 cachep->size = size; 2145 cachep->size = size;
2262 cachep->reciprocal_buffer_size = reciprocal_value(size); 2146 cachep->reciprocal_buffer_size = reciprocal_value(size);
2263 2147
2264 if (flags & CFLGS_OFF_SLAB) { 2148#if DEBUG
2265 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2149 /*
2266 /* 2150 * If we're going to use the generic kernel_map_pages()
2267 * This is a possibility for one of the kmalloc_{dma,}_caches. 2151 * poisoning, then it's going to smash the contents of
2268 * But since we go off slab only for object size greater than 2152 * the redzone and userword anyhow, so switch them off.
2269 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created 2153 */
2270 * in ascending order,this should not happen at all. 2154 if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
2271 * But leave a BUG_ON for some lucky dude. 2155 (cachep->flags & SLAB_POISON) &&
2272 */ 2156 is_debug_pagealloc_cache(cachep))
2273 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2157 cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2158#endif
2159
2160 if (OFF_SLAB(cachep)) {
2161 cachep->freelist_cache =
2162 kmalloc_slab(cachep->freelist_size, 0u);
2274 } 2163 }
2275 2164
2276 err = setup_cpu_cache(cachep, gfp); 2165 err = setup_cpu_cache(cachep, gfp);
@@ -2377,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache,
2377 } 2266 }
2378 2267
2379 page = list_entry(p, struct page, lru); 2268 page = list_entry(p, struct page, lru);
2380#if DEBUG
2381 BUG_ON(page->active);
2382#endif
2383 list_del(&page->lru); 2269 list_del(&page->lru);
2384 /* 2270 /*
2385 * Safe to drop the lock. The slab is no longer linked 2271 * Safe to drop the lock. The slab is no longer linked
@@ -2454,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2454 void *freelist; 2340 void *freelist;
2455 void *addr = page_address(page); 2341 void *addr = page_address(page);
2456 2342
2457 if (OFF_SLAB(cachep)) { 2343 page->s_mem = addr + colour_off;
2344 page->active = 0;
2345
2346 if (OBJFREELIST_SLAB(cachep))
2347 freelist = NULL;
2348 else if (OFF_SLAB(cachep)) {
2458 /* Slab management obj is off-slab. */ 2349 /* Slab management obj is off-slab. */
2459 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2350 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2460 local_flags, nodeid); 2351 local_flags, nodeid);
2461 if (!freelist) 2352 if (!freelist)
2462 return NULL; 2353 return NULL;
2463 } else { 2354 } else {
2464 freelist = addr + colour_off; 2355 /* We will use the last bytes of the slab page for the freelist */
2465 colour_off += cachep->freelist_size; 2356 freelist = addr + (PAGE_SIZE << cachep->gfporder) -
2357 cachep->freelist_size;
2466 } 2358 }
2467 page->active = 0; 2359
2468 page->s_mem = addr + colour_off;
2469 return freelist; 2360 return freelist;
2470} 2361}
2471 2362
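
For the plain on-slab case, alloc_slabmgmt() now parks the freelist in the last bytes of the slab page rather than in front of the objects. A short arithmetic sketch with assumed sizes:

#include <stdio.h>

#define PAGE_SIZE 4096UL    /* assumption for the example */
typedef unsigned short freelist_idx_t;

int main(void)
{
    unsigned long gfporder = 0;     /* assumed order-0 slab */
    unsigned int num = 15;          /* e.g. 256-byte objects, see estimate above */
    unsigned long slab_bytes = PAGE_SIZE << gfporder;
    unsigned long freelist_size = num * sizeof(freelist_idx_t);

    /* On-slab case: freelist occupies the tail of the slab page,
     * objects start at s_mem near the front. */
    unsigned long freelist_off = slab_bytes - freelist_size;

    printf("freelist at offset %lu, %lu bytes\n", freelist_off, freelist_size);
    return 0;
}
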
@@ -2480,17 +2371,14 @@ static inline void set_free_obj(struct page *page,
2480 ((freelist_idx_t *)(page->freelist))[idx] = val; 2371 ((freelist_idx_t *)(page->freelist))[idx] = val;
2481} 2372}
2482 2373
2483static void cache_init_objs(struct kmem_cache *cachep, 2374static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
2484 struct page *page)
2485{ 2375{
2376#if DEBUG
2486 int i; 2377 int i;
2487 2378
2488 for (i = 0; i < cachep->num; i++) { 2379 for (i = 0; i < cachep->num; i++) {
2489 void *objp = index_to_obj(cachep, page, i); 2380 void *objp = index_to_obj(cachep, page, i);
2490#if DEBUG 2381
2491 /* need to poison the objs? */
2492 if (cachep->flags & SLAB_POISON)
2493 poison_obj(cachep, objp, POISON_FREE);
2494 if (cachep->flags & SLAB_STORE_USER) 2382 if (cachep->flags & SLAB_STORE_USER)
2495 *dbg_userword(cachep, objp) = NULL; 2383 *dbg_userword(cachep, objp) = NULL;
2496 2384
@@ -2514,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep,
2514 slab_error(cachep, "constructor overwrote the" 2402 slab_error(cachep, "constructor overwrote the"
2515 " start of an object"); 2403 " start of an object");
2516 } 2404 }
2517 if ((cachep->size % PAGE_SIZE) == 0 && 2405 /* need to poison the objs? */
2518 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2406 if (cachep->flags & SLAB_POISON) {
2519 kernel_map_pages(virt_to_page(objp), 2407 poison_obj(cachep, objp, POISON_FREE);
2520 cachep->size / PAGE_SIZE, 0); 2408 slab_kernel_map(cachep, objp, 0, 0);
2521#else 2409 }
2522 if (cachep->ctor) 2410 }
2523 cachep->ctor(objp);
2524#endif 2411#endif
2525 set_obj_status(page, i, OBJECT_FREE); 2412}
2413
2414static void cache_init_objs(struct kmem_cache *cachep,
2415 struct page *page)
2416{
2417 int i;
2418
2419 cache_init_objs_debug(cachep, page);
2420
2421 if (OBJFREELIST_SLAB(cachep)) {
2422 page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
2423 obj_offset(cachep);
2424 }
2425
2426 for (i = 0; i < cachep->num; i++) {
2427 /* constructor could break poison info */
2428 if (DEBUG == 0 && cachep->ctor)
2429 cachep->ctor(index_to_obj(cachep, page, i));
2430
2526 set_free_obj(page, i, i); 2431 set_free_obj(page, i, i);
2527 } 2432 }
2528} 2433}
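
A stand-alone simulation of the OBJFREELIST_SLAB layout set up above: the freelist index array is written into the last, still-free object instead of dedicated management space. Object size, page size and a zero obj_offset are assumptions for the example.

#include <stdio.h>
#include <string.h>

#define SLAB_SIZE 4096
#define OBJ_SIZE 256
typedef unsigned short freelist_idx_t;

int main(void)
{
    static unsigned char slab[SLAB_SIZE];
    unsigned int num = SLAB_SIZE / OBJ_SIZE;    /* 16 objects */
    freelist_idx_t *freelist;
    unsigned int i, active = 0;

    /* Objfreelist: reuse the last (currently free) object as the index
     * array, as cache_init_objs() does above; the indices easily fit
     * because num * sizeof(freelist_idx_t) <= OBJ_SIZE. */
    freelist = (freelist_idx_t *)(slab + (num - 1) * OBJ_SIZE);

    /* set_free_obj(page, i, i) */
    for (i = 0; i < num; i++)
        freelist[i] = (freelist_idx_t)i;

    /* slab_get_obj(): hand out the object named by the next free index. */
    unsigned char *obj = slab + freelist[active] * OBJ_SIZE;
    active++;   /* mirrors page->active++ */

    printf("first object at offset %td, %u objects total\n", obj - slab, num);
    return 0;
}
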
@@ -2537,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2537 } 2442 }
2538} 2443}
2539 2444
2540static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2445static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
2541 int nodeid)
2542{ 2446{
2543 void *objp; 2447 void *objp;
2544 2448
2545 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2449 objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
2546 page->active++; 2450 page->active++;
2451
2547#if DEBUG 2452#if DEBUG
2548 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2453 if (cachep->flags & SLAB_STORE_USER)
2454 set_store_user_dirty(cachep);
2549#endif 2455#endif
2550 2456
2551 return objp; 2457 return objp;
2552} 2458}
2553 2459
2554static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2460static void slab_put_obj(struct kmem_cache *cachep,
2555 void *objp, int nodeid) 2461 struct page *page, void *objp)
2556{ 2462{
2557 unsigned int objnr = obj_to_index(cachep, page, objp); 2463 unsigned int objnr = obj_to_index(cachep, page, objp);
2558#if DEBUG 2464#if DEBUG
2559 unsigned int i; 2465 unsigned int i;
2560 2466
2561 /* Verify that the slab belongs to the intended node */
2562 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2563
2564 /* Verify double free bug */ 2467 /* Verify double free bug */
2565 for (i = page->active; i < cachep->num; i++) { 2468 for (i = page->active; i < cachep->num; i++) {
2566 if (get_free_obj(page, i) == objnr) { 2469 if (get_free_obj(page, i) == objnr) {
@@ -2571,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2571 } 2474 }
2572#endif 2475#endif
2573 page->active--; 2476 page->active--;
2477 if (!page->freelist)
2478 page->freelist = objp + obj_offset(cachep);
2479
2574 set_free_obj(page, page->active, objnr); 2480 set_free_obj(page, page->active, objnr);
2575} 2481}
2576 2482
@@ -2645,7 +2551,7 @@ static int cache_grow(struct kmem_cache *cachep,
2645 /* Get slab management. */ 2551 /* Get slab management. */
2646 freelist = alloc_slabmgmt(cachep, page, offset, 2552 freelist = alloc_slabmgmt(cachep, page, offset,
2647 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2553 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2648 if (!freelist) 2554 if (OFF_SLAB(cachep) && !freelist)
2649 goto opps1; 2555 goto opps1;
2650 2556
2651 slab_map_pages(cachep, page, freelist); 2557 slab_map_pages(cachep, page, freelist);
@@ -2726,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2726 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2632 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2727 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2633 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2728 } 2634 }
2729 if (cachep->flags & SLAB_STORE_USER) 2635 if (cachep->flags & SLAB_STORE_USER) {
2636 set_store_user_dirty(cachep);
2730 *dbg_userword(cachep, objp) = (void *)caller; 2637 *dbg_userword(cachep, objp) = (void *)caller;
2638 }
2731 2639
2732 objnr = obj_to_index(cachep, page, objp); 2640 objnr = obj_to_index(cachep, page, objp);
2733 2641
2734 BUG_ON(objnr >= cachep->num); 2642 BUG_ON(objnr >= cachep->num);
2735 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2643 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2736 2644
2737 set_obj_status(page, objnr, OBJECT_FREE);
2738 if (cachep->flags & SLAB_POISON) { 2645 if (cachep->flags & SLAB_POISON) {
2739#ifdef CONFIG_DEBUG_PAGEALLOC
2740 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2741 store_stackinfo(cachep, objp, caller);
2742 kernel_map_pages(virt_to_page(objp),
2743 cachep->size / PAGE_SIZE, 0);
2744 } else {
2745 poison_obj(cachep, objp, POISON_FREE);
2746 }
2747#else
2748 poison_obj(cachep, objp, POISON_FREE); 2646 poison_obj(cachep, objp, POISON_FREE);
2749#endif 2647 slab_kernel_map(cachep, objp, 0, caller);
2750 } 2648 }
2751 return objp; 2649 return objp;
2752} 2650}
@@ -2756,7 +2654,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2756#define cache_free_debugcheck(x,objp,z) (objp) 2654#define cache_free_debugcheck(x,objp,z) (objp)
2757#endif 2655#endif
2758 2656
2759static struct page *get_first_slab(struct kmem_cache_node *n) 2657static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
2658 void **list)
2659{
2660#if DEBUG
2661 void *next = *list;
2662 void *objp;
2663
2664 while (next) {
2665 objp = next - obj_offset(cachep);
2666 next = *(void **)next;
2667 poison_obj(cachep, objp, POISON_FREE);
2668 }
2669#endif
2670}
2671
2672static inline void fixup_slab_list(struct kmem_cache *cachep,
2673 struct kmem_cache_node *n, struct page *page,
2674 void **list)
2675{
2676 /* move slabp to correct slabp list: */
2677 list_del(&page->lru);
2678 if (page->active == cachep->num) {
2679 list_add(&page->lru, &n->slabs_full);
2680 if (OBJFREELIST_SLAB(cachep)) {
2681#if DEBUG
2682 /* Poisoning will be done without holding the lock */
2683 if (cachep->flags & SLAB_POISON) {
2684 void **objp = page->freelist;
2685
2686 *objp = *list;
2687 *list = objp;
2688 }
2689#endif
2690 page->freelist = NULL;
2691 }
2692 } else
2693 list_add(&page->lru, &n->slabs_partial);
2694}
2695
2696/* Try to find non-pfmemalloc slab if needed */
2697static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
2698 struct page *page, bool pfmemalloc)
2699{
2700 if (!page)
2701 return NULL;
2702
2703 if (pfmemalloc)
2704 return page;
2705
2706 if (!PageSlabPfmemalloc(page))
2707 return page;
2708
2709 /* No need to keep pfmemalloc slab if we have enough free objects */
2710 if (n->free_objects > n->free_limit) {
2711 ClearPageSlabPfmemalloc(page);
2712 return page;
2713 }
2714
2715 /* Move pfmemalloc slab to the end of list to speed up next search */
2716 list_del(&page->lru);
2717 if (!page->active)
2718 list_add_tail(&page->lru, &n->slabs_free);
2719 else
2720 list_add_tail(&page->lru, &n->slabs_partial);
2721
2722 list_for_each_entry(page, &n->slabs_partial, lru) {
2723 if (!PageSlabPfmemalloc(page))
2724 return page;
2725 }
2726
2727 list_for_each_entry(page, &n->slabs_free, lru) {
2728 if (!PageSlabPfmemalloc(page))
2729 return page;
2730 }
2731
2732 return NULL;
2733}
2734
2735static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
2760{ 2736{
2761 struct page *page; 2737 struct page *page;
2762 2738
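
fixup_slab_list() and fixup_objfreelist_debug() above defer poisoning by threading a list through the first word of each affected object while the node lock is held, then walking that list afterwards. A stand-alone sketch of the trick (obj_offset taken as zero, names and sizes illustrative):

#include <stdio.h>
#include <string.h>

#define NOBJ 4
#define OBJ_SIZE 64

int main(void)
{
    static unsigned char objs[NOBJ][OBJ_SIZE];
    void *list = NULL;
    int i;

    /* "fixup": push each object on the list using its own storage. */
    for (i = 0; i < NOBJ; i++) {
        void **objp = (void **)objs[i];

        *objp = list;
        list = objp;
    }

    /* "debug fixup": walk the chain and poison each object. */
    while (list) {
        void *next = *(void **)list;    /* save before overwriting */

        memset(list, 0x6b, OBJ_SIZE);   /* POISON_FREE-style fill */
        list = next;
    }
    printf("poisoned %d objects via the embedded list\n", NOBJ);
    return 0;
}
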
@@ -2768,21 +2744,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n)
2768 struct page, lru); 2744 struct page, lru);
2769 } 2745 }
2770 2746
2747 if (sk_memalloc_socks())
2748 return get_valid_first_slab(n, page, pfmemalloc);
2749
2771 return page; 2750 return page;
2772} 2751}
2773 2752
2774static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2753static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
2775 bool force_refill) 2754 struct kmem_cache_node *n, gfp_t flags)
2755{
2756 struct page *page;
2757 void *obj;
2758 void *list = NULL;
2759
2760 if (!gfp_pfmemalloc_allowed(flags))
2761 return NULL;
2762
2763 spin_lock(&n->list_lock);
2764 page = get_first_slab(n, true);
2765 if (!page) {
2766 spin_unlock(&n->list_lock);
2767 return NULL;
2768 }
2769
2770 obj = slab_get_obj(cachep, page);
2771 n->free_objects--;
2772
2773 fixup_slab_list(cachep, n, page, &list);
2774
2775 spin_unlock(&n->list_lock);
2776 fixup_objfreelist_debug(cachep, &list);
2777
2778 return obj;
2779}
2780
2781static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2776{ 2782{
2777 int batchcount; 2783 int batchcount;
2778 struct kmem_cache_node *n; 2784 struct kmem_cache_node *n;
2779 struct array_cache *ac; 2785 struct array_cache *ac;
2780 int node; 2786 int node;
2787 void *list = NULL;
2781 2788
2782 check_irq_off(); 2789 check_irq_off();
2783 node = numa_mem_id(); 2790 node = numa_mem_id();
2784 if (unlikely(force_refill)) 2791
2785 goto force_grow;
2786retry: 2792retry:
2787 ac = cpu_cache_get(cachep); 2793 ac = cpu_cache_get(cachep);
2788 batchcount = ac->batchcount; 2794 batchcount = ac->batchcount;
@@ -2808,7 +2814,7 @@ retry:
2808 while (batchcount > 0) { 2814 while (batchcount > 0) {
2809 struct page *page; 2815 struct page *page;
2810 /* Get slab alloc is to come from. */ 2816 /* Get slab alloc is to come from. */
2811 page = get_first_slab(n); 2817 page = get_first_slab(n, false);
2812 if (!page) 2818 if (!page)
2813 goto must_grow; 2819 goto must_grow;
2814 2820
@@ -2826,26 +2832,29 @@ retry:
2826 STATS_INC_ACTIVE(cachep); 2832 STATS_INC_ACTIVE(cachep);
2827 STATS_SET_HIGH(cachep); 2833 STATS_SET_HIGH(cachep);
2828 2834
2829 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2835 ac->entry[ac->avail++] = slab_get_obj(cachep, page);
2830 node));
2831 } 2836 }
2832 2837
2833 /* move slabp to correct slabp list: */ 2838 fixup_slab_list(cachep, n, page, &list);
2834 list_del(&page->lru);
2835 if (page->active == cachep->num)
2836 list_add(&page->lru, &n->slabs_full);
2837 else
2838 list_add(&page->lru, &n->slabs_partial);
2839 } 2839 }
2840 2840
2841must_grow: 2841must_grow:
2842 n->free_objects -= ac->avail; 2842 n->free_objects -= ac->avail;
2843alloc_done: 2843alloc_done:
2844 spin_unlock(&n->list_lock); 2844 spin_unlock(&n->list_lock);
2845 fixup_objfreelist_debug(cachep, &list);
2845 2846
2846 if (unlikely(!ac->avail)) { 2847 if (unlikely(!ac->avail)) {
2847 int x; 2848 int x;
2848force_grow: 2849
2850 /* Check if we can use obj in pfmemalloc slab */
2851 if (sk_memalloc_socks()) {
2852 void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
2853
2854 if (obj)
2855 return obj;
2856 }
2857
2849 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); 2858 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
2850 2859
2851 /* cache_grow can reenable interrupts, then ac could change. */ 2860 /* cache_grow can reenable interrupts, then ac could change. */
@@ -2853,7 +2862,7 @@ force_grow:
2853 node = numa_mem_id(); 2862 node = numa_mem_id();
2854 2863
2855 /* no objects in sight? abort */ 2864 /* no objects in sight? abort */
2856 if (!x && (ac->avail == 0 || force_refill)) 2865 if (!x && ac->avail == 0)
2857 return NULL; 2866 return NULL;
2858 2867
2859 if (!ac->avail) /* objects refilled by interrupt? */ 2868 if (!ac->avail) /* objects refilled by interrupt? */
@@ -2861,7 +2870,7 @@ force_grow:
2861 } 2870 }
2862 ac->touched = 1; 2871 ac->touched = 1;
2863 2872
2864 return ac_get_obj(cachep, ac, flags, force_refill); 2873 return ac->entry[--ac->avail];
2865} 2874}
2866 2875
2867static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2876static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2877,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2877static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2886static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2878 gfp_t flags, void *objp, unsigned long caller) 2887 gfp_t flags, void *objp, unsigned long caller)
2879{ 2888{
2880 struct page *page;
2881
2882 if (!objp) 2889 if (!objp)
2883 return objp; 2890 return objp;
2884 if (cachep->flags & SLAB_POISON) { 2891 if (cachep->flags & SLAB_POISON) {
2885#ifdef CONFIG_DEBUG_PAGEALLOC
2886 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2887 kernel_map_pages(virt_to_page(objp),
2888 cachep->size / PAGE_SIZE, 1);
2889 else
2890 check_poison_obj(cachep, objp);
2891#else
2892 check_poison_obj(cachep, objp); 2892 check_poison_obj(cachep, objp);
2893#endif 2893 slab_kernel_map(cachep, objp, 1, 0);
2894 poison_obj(cachep, objp, POISON_INUSE); 2894 poison_obj(cachep, objp, POISON_INUSE);
2895 } 2895 }
2896 if (cachep->flags & SLAB_STORE_USER) 2896 if (cachep->flags & SLAB_STORE_USER)
@@ -2910,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2910 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2910 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2911 } 2911 }
2912 2912
2913 page = virt_to_head_page(objp);
2914 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2915 objp += obj_offset(cachep); 2913 objp += obj_offset(cachep);
2916 if (cachep->ctor && cachep->flags & SLAB_POISON) 2914 if (cachep->ctor && cachep->flags & SLAB_POISON)
2917 cachep->ctor(objp); 2915 cachep->ctor(objp);
@@ -2926,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2926#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2924#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2927#endif 2925#endif
2928 2926
2929static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
2930{
2931 if (unlikely(cachep == kmem_cache))
2932 return false;
2933
2934 return should_failslab(cachep->object_size, flags, cachep->flags);
2935}
2936
2937static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2927static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2938{ 2928{
2939 void *objp; 2929 void *objp;
2940 struct array_cache *ac; 2930 struct array_cache *ac;
2941 bool force_refill = false;
2942 2931
2943 check_irq_off(); 2932 check_irq_off();
2944 2933
2945 ac = cpu_cache_get(cachep); 2934 ac = cpu_cache_get(cachep);
2946 if (likely(ac->avail)) { 2935 if (likely(ac->avail)) {
2947 ac->touched = 1; 2936 ac->touched = 1;
2948 objp = ac_get_obj(cachep, ac, flags, false); 2937 objp = ac->entry[--ac->avail];
2949 2938
2950 /* 2939 STATS_INC_ALLOCHIT(cachep);
2951 * Allow for the possibility all avail objects are not allowed 2940 goto out;
2952 * by the current flags
2953 */
2954 if (objp) {
2955 STATS_INC_ALLOCHIT(cachep);
2956 goto out;
2957 }
2958 force_refill = true;
2959 } 2941 }
2960 2942
2961 STATS_INC_ALLOCMISS(cachep); 2943 STATS_INC_ALLOCMISS(cachep);
2962 objp = cache_alloc_refill(cachep, flags, force_refill); 2944 objp = cache_alloc_refill(cachep, flags);
2963 /* 2945 /*
2964 * the 'ac' may be updated by cache_alloc_refill(), 2946 * the 'ac' may be updated by cache_alloc_refill(),
2965 * and kmemleak_erase() requires its correct value. 2947 * and kmemleak_erase() requires its correct value.
@@ -3097,6 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3097 struct page *page; 3079 struct page *page;
3098 struct kmem_cache_node *n; 3080 struct kmem_cache_node *n;
3099 void *obj; 3081 void *obj;
3082 void *list = NULL;
3100 int x; 3083 int x;
3101 3084
3102 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); 3085 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3106,7 +3089,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3106retry: 3089retry:
3107 check_irq_off(); 3090 check_irq_off();
3108 spin_lock(&n->list_lock); 3091 spin_lock(&n->list_lock);
3109 page = get_first_slab(n); 3092 page = get_first_slab(n, false);
3110 if (!page) 3093 if (!page)
3111 goto must_grow; 3094 goto must_grow;
3112 3095
@@ -3118,17 +3101,13 @@ retry:
3118 3101
3119 BUG_ON(page->active == cachep->num); 3102 BUG_ON(page->active == cachep->num);
3120 3103
3121 obj = slab_get_obj(cachep, page, nodeid); 3104 obj = slab_get_obj(cachep, page);
3122 n->free_objects--; 3105 n->free_objects--;
3123 /* move slabp to correct slabp list: */
3124 list_del(&page->lru);
3125 3106
3126 if (page->active == cachep->num) 3107 fixup_slab_list(cachep, n, page, &list);
3127 list_add(&page->lru, &n->slabs_full);
3128 else
3129 list_add(&page->lru, &n->slabs_partial);
3130 3108
3131 spin_unlock(&n->list_lock); 3109 spin_unlock(&n->list_lock);
3110 fixup_objfreelist_debug(cachep, &list);
3132 goto done; 3111 goto done;
3133 3112
3134must_grow: 3113must_grow:
@@ -3152,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3152 int slab_node = numa_mem_id(); 3131 int slab_node = numa_mem_id();
3153 3132
3154 flags &= gfp_allowed_mask; 3133 flags &= gfp_allowed_mask;
3155 3134 cachep = slab_pre_alloc_hook(cachep, flags);
3156 lockdep_trace_alloc(flags); 3135 if (unlikely(!cachep))
3157
3158 if (slab_should_failslab(cachep, flags))
3159 return NULL; 3136 return NULL;
3160 3137
3161 cachep = memcg_kmem_get_cache(cachep, flags);
3162
3163 cache_alloc_debugcheck_before(cachep, flags); 3138 cache_alloc_debugcheck_before(cachep, flags);
3164 local_irq_save(save_flags); 3139 local_irq_save(save_flags);
3165 3140
@@ -3188,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3188 out: 3163 out:
3189 local_irq_restore(save_flags); 3164 local_irq_restore(save_flags);
3190 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3165 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3191 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3192 flags);
3193 3166
3194 if (likely(ptr)) { 3167 if (unlikely(flags & __GFP_ZERO) && ptr)
3195 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3168 memset(ptr, 0, cachep->object_size);
3196 if (unlikely(flags & __GFP_ZERO))
3197 memset(ptr, 0, cachep->object_size);
3198 }
3199 3169
3200 memcg_kmem_put_cache(cachep); 3170 slab_post_alloc_hook(cachep, flags, 1, &ptr);
3201 return ptr; 3171 return ptr;
3202} 3172}
3203 3173
@@ -3240,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3240 void *objp; 3210 void *objp;
3241 3211
3242 flags &= gfp_allowed_mask; 3212 flags &= gfp_allowed_mask;
3243 3213 cachep = slab_pre_alloc_hook(cachep, flags);
3244 lockdep_trace_alloc(flags); 3214 if (unlikely(!cachep))
3245
3246 if (slab_should_failslab(cachep, flags))
3247 return NULL; 3215 return NULL;
3248 3216
3249 cachep = memcg_kmem_get_cache(cachep, flags);
3250
3251 cache_alloc_debugcheck_before(cachep, flags); 3217 cache_alloc_debugcheck_before(cachep, flags);
3252 local_irq_save(save_flags); 3218 local_irq_save(save_flags);
3253 objp = __do_cache_alloc(cachep, flags); 3219 objp = __do_cache_alloc(cachep, flags);
3254 local_irq_restore(save_flags); 3220 local_irq_restore(save_flags);
3255 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3221 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3256 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3257 flags);
3258 prefetchw(objp); 3222 prefetchw(objp);
3259 3223
3260 if (likely(objp)) { 3224 if (unlikely(flags & __GFP_ZERO) && objp)
3261 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3225 memset(objp, 0, cachep->object_size);
3262 if (unlikely(flags & __GFP_ZERO))
3263 memset(objp, 0, cachep->object_size);
3264 }
3265 3226
3266 memcg_kmem_put_cache(cachep); 3227 slab_post_alloc_hook(cachep, flags, 1, &objp);
3267 return objp; 3228 return objp;
3268} 3229}
3269 3230
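
After the hook consolidation, __GFP_ZERO is honoured by a single memset on the returned pointer rather than a branch nested under likely(objp). Callers see no behavioural difference; a minimal usage sketch (the "foo" cache is hypothetical, not part of this patch):

#include <linux/slab.h>

struct foo { int a; void *b; };
static struct kmem_cache *foo_cache;	/* created elsewhere with kmem_cache_create() */

static struct foo *foo_alloc(void)
{
	/* kmem_cache_zalloc() is just kmem_cache_alloc() with __GFP_ZERO
	 * folded into the flags, so both helpers return zeroed objects. */
	return kmem_cache_zalloc(foo_cache, GFP_KERNEL);
}

static struct foo *foo_alloc_atomic(void)
{
	return kmem_cache_alloc(foo_cache, GFP_ATOMIC | __GFP_ZERO);
}
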
@@ -3281,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
3281 void *objp; 3242 void *objp;
3282 struct page *page; 3243 struct page *page;
3283 3244
3284 clear_obj_pfmemalloc(&objpp[i]);
3285 objp = objpp[i]; 3245 objp = objpp[i];
3286 3246
3287 page = virt_to_head_page(objp); 3247 page = virt_to_head_page(objp);
3288 list_del(&page->lru); 3248 list_del(&page->lru);
3289 check_spinlock_acquired_node(cachep, node); 3249 check_spinlock_acquired_node(cachep, node);
3290 slab_put_obj(cachep, page, objp, node); 3250 slab_put_obj(cachep, page, objp);
3291 STATS_DEC_ACTIVE(cachep); 3251 STATS_DEC_ACTIVE(cachep);
3292 n->free_objects++; 3252 n->free_objects++;
3293 3253
@@ -3317,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3317 LIST_HEAD(list); 3277 LIST_HEAD(list);
3318 3278
3319 batchcount = ac->batchcount; 3279 batchcount = ac->batchcount;
3320#if DEBUG 3280
3321 BUG_ON(!batchcount || batchcount > ac->avail);
3322#endif
3323 check_irq_off(); 3281 check_irq_off();
3324 n = get_node(cachep, node); 3282 n = get_node(cachep, node);
3325 spin_lock(&n->list_lock); 3283 spin_lock(&n->list_lock);
@@ -3389,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3389 cache_flusharray(cachep, ac); 3347 cache_flusharray(cachep, ac);
3390 } 3348 }
3391 3349
3392 ac_put_obj(cachep, ac, objp); 3350 if (sk_memalloc_socks()) {
3351 struct page *page = virt_to_head_page(objp);
3352
3353 if (unlikely(PageSlabPfmemalloc(page))) {
3354 cache_free_pfmemalloc(cachep, page, objp);
3355 return;
3356 }
3357 }
3358
3359 ac->entry[ac->avail++] = objp;
3393} 3360}
3394 3361
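
The free path now open-codes the pfmemalloc handling: ordinary objects are pushed straight back onto the per-cpu array, while objects backed by an emergency (pfmemalloc) page are diverted before they can be recycled to normal allocations. A toy userspace model of that shape (is_reserve_backed() stands in for PageSlabPfmemalloc() and is an assumption of the sketch, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

struct obj_cache {
	unsigned int avail, limit;
	void *entry[8];
};

/* Fake marker: the low pointer bit plays the role of the pfmemalloc flag. */
static bool is_reserve_backed(void *obj) { return (unsigned long)obj & 1; }
static void free_to_slab(void *obj)      { printf("slow-path free %p\n", obj); }

static void cache_release(struct obj_cache *ac, void *obj)
{
	if (is_reserve_backed(obj)) {
		free_to_slab(obj);		/* never recycle reserve memory */
		return;
	}
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = obj;	/* fast path: LIFO push */
	else
		free_to_slab(obj);		/* array full: flush to slab lists */
}

int main(void)
{
	struct obj_cache ac = { .avail = 0, .limit = 8 };

	cache_release(&ac, (void *)0x1000);	/* cached for reuse */
	cache_release(&ac, (void *)0x1001);	/* "reserve" object, slow path */
	printf("cached objects: %u\n", ac.avail);
	return 0;
}
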
3395/** 3362/**
@@ -3411,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3411} 3378}
3412EXPORT_SYMBOL(kmem_cache_alloc); 3379EXPORT_SYMBOL(kmem_cache_alloc);
3413 3380
3414void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 3381static __always_inline void
3382cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
3383 size_t size, void **p, unsigned long caller)
3415{ 3384{
3416 __kmem_cache_free_bulk(s, size, p); 3385 size_t i;
3386
3387 for (i = 0; i < size; i++)
3388 p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
3417} 3389}
3418EXPORT_SYMBOL(kmem_cache_free_bulk);
3419 3390
3420int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 3391int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3421 void **p) 3392 void **p)
3422{ 3393{
3423 return __kmem_cache_alloc_bulk(s, flags, size, p); 3394 size_t i;
3395
3396 s = slab_pre_alloc_hook(s, flags);
3397 if (!s)
3398 return 0;
3399
3400 cache_alloc_debugcheck_before(s, flags);
3401
3402 local_irq_disable();
3403 for (i = 0; i < size; i++) {
3404 void *objp = __do_cache_alloc(s, flags);
3405
3406 if (unlikely(!objp))
3407 goto error;
3408 p[i] = objp;
3409 }
3410 local_irq_enable();
3411
3412 cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
3413
3414 /* Clear memory outside IRQ disabled section */
3415 if (unlikely(flags & __GFP_ZERO))
3416 for (i = 0; i < size; i++)
3417 memset(p[i], 0, s->object_size);
3418
3419 slab_post_alloc_hook(s, flags, size, p);
3420 /* FIXME: Trace call missing. Christoph would like a bulk variant */
3421 return size;
3422error:
3423 local_irq_enable();
3424 cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
3425 slab_post_alloc_hook(s, flags, i, p);
3426 __kmem_cache_free_bulk(s, i, p);
3427 return 0;
3424} 3428}
3425EXPORT_SYMBOL(kmem_cache_alloc_bulk); 3429EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3426 3430
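
kmem_cache_alloc_bulk() keeps an all-or-nothing contract: it returns the number of objects placed into the array (here size) or 0, and the error path above has already returned any partially allocated objects. A caller therefore only needs to test the return value; a small usage sketch (the "foo" cache is hypothetical):

#include <linux/errno.h>
#include <linux/slab.h>

#define NR_OBJS 16

static struct kmem_cache *foo_cache;	/* hypothetical example cache */

static int foo_fill(void *objs[NR_OBJS])
{
	/* Either all NR_OBJS pointers are valid afterwards, or none are. */
	if (kmem_cache_alloc_bulk(foo_cache, GFP_KERNEL, NR_OBJS, objs) == 0)
		return -ENOMEM;
	return 0;
}

static void foo_drain(void *objs[NR_OBJS])
{
	kmem_cache_free_bulk(foo_cache, NR_OBJS, objs);
}
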
@@ -3567,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3567} 3571}
3568EXPORT_SYMBOL(kmem_cache_free); 3572EXPORT_SYMBOL(kmem_cache_free);
3569 3573
3574void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
3575{
3576 struct kmem_cache *s;
3577 size_t i;
3578
3579 local_irq_disable();
3580 for (i = 0; i < size; i++) {
3581 void *objp = p[i];
3582
3583 if (!orig_s) /* called via kfree_bulk */
3584 s = virt_to_cache(objp);
3585 else
3586 s = cache_from_obj(orig_s, objp);
3587
3588 debug_check_no_locks_freed(objp, s->object_size);
3589 if (!(s->flags & SLAB_DEBUG_OBJECTS))
3590 debug_check_no_obj_freed(objp, s->object_size);
3591
3592 __cache_free(s, objp, _RET_IP_);
3593 }
3594 local_irq_enable();
3595
3596 /* FIXME: add tracing */
3597}
3598EXPORT_SYMBOL(kmem_cache_free_bulk);
3599
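
This SLAB kmem_cache_free_bulk() also serves as the backend for kfree_bulk(): when it is entered with a NULL cache, each object's cache is looked up with virt_to_cache(), so kmalloc'ed buffers of different sizes can be released in one call. Caller-side sketch (the buffer array is illustrative):

#include <linux/slab.h>

static void drop_buffers(void **bufs, size_t nr)
{
	/* kfree_bulk(nr, p) boils down to kmem_cache_free_bulk(NULL, nr, p);
	 * entries are expected to be pointers previously returned by
	 * kmalloc() or kmem_cache_alloc(). */
	kfree_bulk(nr, bufs);
}
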
3570/** 3600/**
3571 * kfree - free previously allocated memory 3601 * kfree - free previously allocated memory
3572 * @objp: pointer returned by kmalloc. 3602 * @objp: pointer returned by kmalloc.
@@ -4102,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
4102 struct page *page) 4132 struct page *page)
4103{ 4133{
4104 void *p; 4134 void *p;
4105 int i; 4135 int i, j;
4136 unsigned long v;
4106 4137
4107 if (n[0] == n[1]) 4138 if (n[0] == n[1])
4108 return; 4139 return;
4109 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4140 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4110 if (get_obj_status(page, i) != OBJECT_ACTIVE) 4141 bool active = true;
4142
4143 for (j = page->active; j < c->num; j++) {
4144 if (get_free_obj(page, j) == i) {
4145 active = false;
4146 break;
4147 }
4148 }
4149
4150 if (!active)
4151 continue;
4152
4153 /*
4154 * probe_kernel_read() is used for DEBUG_PAGEALLOC. The page table
4155 * mapping is only established while the object is actually
4156 * allocated, so we could otherwise fault on an unmapped object
4157 * still sitting in the cpu cache.
4158 */
4159 if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
4111 continue; 4160 continue;
4112 4161
4113 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4162 if (!add_caller(n, v))
4114 return; 4163 return;
4115 } 4164 }
4116} 4165}
@@ -4146,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p)
4146 if (!(cachep->flags & SLAB_RED_ZONE)) 4195 if (!(cachep->flags & SLAB_RED_ZONE))
4147 return 0; 4196 return 0;
4148 4197
4149 /* OK, we can do it */ 4198 /*
4199 * Set store_user_clean and start to grab stored user information
4200 * for all objects on this cache. If any alloc/free requests come in
4201 * while we are processing, the information would be stale, so restart
4202 * the whole scan.
4203 */
4204 do {
4205 set_store_user_clean(cachep);
4206 drain_cpu_caches(cachep);
4150 4207
4151 x[1] = 0; 4208 x[1] = 0;
4152 4209
4153 for_each_kmem_cache_node(cachep, node, n) { 4210 for_each_kmem_cache_node(cachep, node, n) {
4154 4211
4155 check_irq_on(); 4212 check_irq_on();
4156 spin_lock_irq(&n->list_lock); 4213 spin_lock_irq(&n->list_lock);
4214
4215 list_for_each_entry(page, &n->slabs_full, lru)
4216 handle_slab(x, cachep, page);
4217 list_for_each_entry(page, &n->slabs_partial, lru)
4218 handle_slab(x, cachep, page);
4219 spin_unlock_irq(&n->list_lock);
4220 }
4221 } while (!is_store_user_clean(cachep));
4157 4222
4158 list_for_each_entry(page, &n->slabs_full, lru)
4159 handle_slab(x, cachep, page);
4160 list_for_each_entry(page, &n->slabs_partial, lru)
4161 handle_slab(x, cachep, page);
4162 spin_unlock_irq(&n->list_lock);
4163 }
4164 name = cachep->name; 4223 name = cachep->name;
4165 if (x[0] == x[1]) { 4224 if (x[0] == x[1]) {
4166 /* Increase the buffer size */ 4225 /* Increase the buffer size */
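
Without the per-object OBJECT_ACTIVE status byte (dropped along with set_obj_status()), handle_slab() decides whether object i is live by scanning the slab's free-index array: positions page->active..num-1 name the free slots, so an index that appears there is not active. A standalone sketch of that test:

#include <stdbool.h>
#include <stdio.h>

/* free_idx[active..num-1] holds the indices of the free objects in a
 * slab, mirroring how the freelist array is used above; anything not
 * listed there is currently allocated. */
static bool object_is_active(const unsigned char *free_idx,
			     unsigned int active, unsigned int num,
			     unsigned int i)
{
	for (unsigned int j = active; j < num; j++)
		if (free_idx[j] == i)
			return false;
	return true;
}

int main(void)
{
	/* 4-object slab with 2 objects allocated: slots 3 and 0 are free. */
	unsigned char free_idx[4] = { 1, 2, 3, 0 };
	unsigned int active = 2, num = 4;

	for (unsigned int i = 0; i < num; i++)
		printf("object %u: %s\n", i,
		       object_is_active(free_idx, active, num, i) ? "active" : "free");
	return 0;
}
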
diff --git a/mm/slab.h b/mm/slab.h
index 2eedacea439d..b7934361f026 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
38#endif 38#endif
39 39
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/fault-inject.h>
42#include <linux/kmemcheck.h>
43#include <linux/kasan.h>
44#include <linux/kmemleak.h>
41 45
42/* 46/*
43 * State of the slab allocator. 47 * State of the slab allocator.
@@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
121#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 125#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
122#elif defined(CONFIG_SLUB_DEBUG) 126#elif defined(CONFIG_SLUB_DEBUG)
123#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 127#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
124 SLAB_TRACE | SLAB_DEBUG_FREE) 128 SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
125#else 129#else
126#define SLAB_DEBUG_FLAGS (0) 130#define SLAB_DEBUG_FLAGS (0)
127#endif 131#endif
@@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
168/* 172/*
169 * Generic implementation of bulk operations 173 * Generic implementation of bulk operations
170 * These are useful for situations in which the allocator cannot 174 * These are useful for situations in which the allocator cannot
171 * perform optimizations. In that case segments of the objecct listed 175 * perform optimizations. In that case segments of the object listed
172 * may be allocated or freed using these operations. 176 * may be allocated or freed using these operations.
173 */ 177 */
174void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); 178void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
@@ -307,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
307 * to not do even the assignment. In that case, slab_equal_or_root 311 * to not do even the assignment. In that case, slab_equal_or_root
308 * will also be a constant. 312 * will also be a constant.
309 */ 313 */
310 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) 314 if (!memcg_kmem_enabled() &&
315 !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
311 return s; 316 return s;
312 317
313 page = virt_to_head_page(x); 318 page = virt_to_head_page(x);
@@ -321,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
321 return s; 326 return s;
322} 327}
323 328
329static inline size_t slab_ksize(const struct kmem_cache *s)
330{
331#ifndef CONFIG_SLUB
332 return s->object_size;
333
334#else /* CONFIG_SLUB */
335# ifdef CONFIG_SLUB_DEBUG
336 /*
337 * Debugging requires use of the padding between object
338 * and whatever may come after it.
339 */
340 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
341 return s->object_size;
342# endif
343 /*
344 * If we have the need to store the freelist pointer
345 * back there or track user information then we can
346 * only use the space before that information.
347 */
348 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
349 return s->inuse;
350 /*
351 * Else we can use all the padding etc for the allocation
352 */
353 return s->size;
354#endif
355}
356
357static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
358 gfp_t flags)
359{
360 flags &= gfp_allowed_mask;
361 lockdep_trace_alloc(flags);
362 might_sleep_if(gfpflags_allow_blocking(flags));
363
364 if (should_failslab(s, flags))
365 return NULL;
366
367 return memcg_kmem_get_cache(s, flags);
368}
369
370static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
371 size_t size, void **p)
372{
373 size_t i;
374
375 flags &= gfp_allowed_mask;
376 for (i = 0; i < size; i++) {
377 void *object = p[i];
378
379 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
380 kmemleak_alloc_recursive(object, s->object_size, 1,
381 s->flags, flags);
382 kasan_slab_alloc(s, object);
383 }
384 memcg_kmem_put_cache(s);
385}
386
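
With slab_pre_alloc_hook()/slab_post_alloc_hook() living in mm/slab.h, SLAB and SLUB now share one definition of the bookkeeping that brackets every allocation. Roughly, an allocation path is expected to look like the sketch below (simplified; do_raw_alloc() is a placeholder for the allocator-specific fast path, not a function in this patch, and the real callers also do debug checks and NUMA placement):

extern void *do_raw_alloc(struct kmem_cache *s, gfp_t flags);	/* placeholder */

static void *example_alloc(struct kmem_cache *s, gfp_t flags)
{
	void *obj;

	s = slab_pre_alloc_hook(s, flags);	/* failslab + memcg cache selection */
	if (unlikely(!s))
		return NULL;

	obj = do_raw_alloc(s, flags);		/* allocator-specific work */

	if (unlikely(flags & __GFP_ZERO) && obj)
		memset(obj, 0, s->object_size);

	slab_post_alloc_hook(s, flags, 1, &obj);	/* kmemcheck, kmemleak, kasan */
	return obj;
}
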
324#ifndef CONFIG_SLOB 387#ifndef CONFIG_SLOB
325/* 388/*
326 * The slab lists for all objects. 389 * The slab lists for all objects.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 065b7bdabdc3..6afb2263a5c5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
109{ 109{
110 size_t i; 110 size_t i;
111 111
112 for (i = 0; i < nr; i++) 112 for (i = 0; i < nr; i++) {
113 kmem_cache_free(s, p[i]); 113 if (s)
114 kmem_cache_free(s, p[i]);
115 else
116 kfree(p[i]);
117 }
114} 118}
115 119
116int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, 120int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
diff --git a/mm/slub.c b/mm/slub.c
index d8fbd4a6ed59..6c91324f9370 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
124#endif 124#endif
125} 125}
126 126
127static inline void *fixup_red_left(struct kmem_cache *s, void *p)
128{
129 if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
130 p += s->red_left_pad;
131
132 return p;
133}
134
127static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 135static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
128{ 136{
129#ifdef CONFIG_SLUB_CPU_PARTIAL 137#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
160 */ 168 */
161#define MAX_PARTIAL 10 169#define MAX_PARTIAL 10
162 170
163#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 171#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
164 SLAB_POISON | SLAB_STORE_USER) 172 SLAB_POISON | SLAB_STORE_USER)
165 173
166/* 174/*
175 * These debug flags cannot use CMPXCHG because there might be consistency
176 * issues when checking or reading debug information
177 */
178#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
179 SLAB_TRACE)
180
181
182/*
167 * Debugging flags that require metadata to be stored in the slab. These get 183 * Debugging flags that require metadata to be stored in the slab. These get
168 * disabled when slub_debug=O is used and a cache's min order increases with 184 * disabled when slub_debug=O is used and a cache's min order increases with
169 * metadata. 185 * metadata.
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
224 * Core slab cache functions 240 * Core slab cache functions
225 *******************************************************************/ 241 *******************************************************************/
226 242
227/* Verify that a pointer has an address that is valid within a slab page */
228static inline int check_valid_pointer(struct kmem_cache *s,
229 struct page *page, const void *object)
230{
231 void *base;
232
233 if (!object)
234 return 1;
235
236 base = page_address(page);
237 if (object < base || object >= base + page->objects * s->size ||
238 (object - base) % s->size) {
239 return 0;
240 }
241
242 return 1;
243}
244
245static inline void *get_freepointer(struct kmem_cache *s, void *object) 243static inline void *get_freepointer(struct kmem_cache *s, void *object)
246{ 244{
247 return *(void **)(object + s->offset); 245 return *(void **)(object + s->offset);
@@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
271 269
272/* Loop over all objects in a slab */ 270/* Loop over all objects in a slab */
273#define for_each_object(__p, __s, __addr, __objects) \ 271#define for_each_object(__p, __s, __addr, __objects) \
274 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 272 for (__p = fixup_red_left(__s, __addr); \
275 __p += (__s)->size) 273 __p < (__addr) + (__objects) * (__s)->size; \
274 __p += (__s)->size)
276 275
277#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ 276#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
278 for (__p = (__addr), __idx = 1; __idx <= __objects;\ 277 for (__p = fixup_red_left(__s, __addr), __idx = 1; \
279 __p += (__s)->size, __idx++) 278 __idx <= __objects; \
279 __p += (__s)->size, __idx++)
280 280
281/* Determine object index from a given position */ 281/* Determine object index from a given position */
282static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 282static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
284 return (p - addr) / s->size; 284 return (p - addr) / s->size;
285} 285}
286 286
287static inline size_t slab_ksize(const struct kmem_cache *s)
288{
289#ifdef CONFIG_SLUB_DEBUG
290 /*
291 * Debugging requires use of the padding between object
292 * and whatever may come after it.
293 */
294 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
295 return s->object_size;
296
297#endif
298 /*
299 * If we have the need to store the freelist pointer
300 * back there or track user information then we can
301 * only use the space before that information.
302 */
303 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
304 return s->inuse;
305 /*
306 * Else we can use all the padding etc for the allocation
307 */
308 return s->size;
309}
310
311static inline int order_objects(int order, unsigned long size, int reserved) 287static inline int order_objects(int order, unsigned long size, int reserved)
312{ 288{
313 return ((PAGE_SIZE << order) - reserved) / size; 289 return ((PAGE_SIZE << order) - reserved) / size;
@@ -458,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
458 set_bit(slab_index(p, s, addr), map); 434 set_bit(slab_index(p, s, addr), map);
459} 435}
460 436
437static inline int size_from_object(struct kmem_cache *s)
438{
439 if (s->flags & SLAB_RED_ZONE)
440 return s->size - s->red_left_pad;
441
442 return s->size;
443}
444
445static inline void *restore_red_left(struct kmem_cache *s, void *p)
446{
447 if (s->flags & SLAB_RED_ZONE)
448 p -= s->red_left_pad;
449
450 return p;
451}
452
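
The new left red zone shifts every object the caller sees red_left_pad bytes into its slot, with poisonable padding on both sides. fixup_red_left() applies the shift when pointers are handed out; restore_red_left() and size_from_object() undo it when the debug code needs the true slot. A small standalone sketch of the arithmetic (pad and slot sizes are illustrative, not the kernel's values):

#include <assert.h>
#include <stddef.h>

/* Slot layout with SLAB_RED_ZONE (simplified):
 *
 *   | left red zone | object ... | right red zone / metadata |
 *   ^ slot start    ^ pointer returned to the caller
 */
#define RED_LEFT_PAD	8
#define SLOT_SIZE	64

static char *fixup_red_left(char *slot)  { return slot + RED_LEFT_PAD; }
static char *restore_red_left(char *obj) { return obj - RED_LEFT_PAD; }
static size_t size_from_object(void)     { return SLOT_SIZE - RED_LEFT_PAD; }

int main(void)
{
	char slab[4 * SLOT_SIZE];
	char *obj = fixup_red_left(slab + SLOT_SIZE);	/* second slot */

	/* Validity checks first strip the pad again, so the pointer is
	 * still slot-aligned from the allocator's point of view. */
	assert(restore_red_left(obj) == slab + SLOT_SIZE);
	assert((size_t)(restore_red_left(obj) - slab) % SLOT_SIZE == 0);
	assert(size_from_object() + RED_LEFT_PAD == SLOT_SIZE);
	return 0;
}
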
461/* 453/*
462 * Debug settings: 454 * Debug settings:
463 */ 455 */
@@ -491,6 +483,26 @@ static inline void metadata_access_disable(void)
491/* 483/*
492 * Object debugging 484 * Object debugging
493 */ 485 */
486
487/* Verify that a pointer has an address that is valid within a slab page */
488static inline int check_valid_pointer(struct kmem_cache *s,
489 struct page *page, void *object)
490{
491 void *base;
492
493 if (!object)
494 return 1;
495
496 base = page_address(page);
497 object = restore_red_left(s, object);
498 if (object < base || object >= base + page->objects * s->size ||
499 (object - base) % s->size) {
500 return 0;
501 }
502
503 return 1;
504}
505
494static void print_section(char *text, u8 *addr, unsigned int length) 506static void print_section(char *text, u8 *addr, unsigned int length)
495{ 507{
496 metadata_access_enable(); 508 metadata_access_enable();
@@ -630,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
630 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 642 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
631 p, p - addr, get_freepointer(s, p)); 643 p, p - addr, get_freepointer(s, p));
632 644
633 if (p > addr + 16) 645 if (s->flags & SLAB_RED_ZONE)
646 print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
647 else if (p > addr + 16)
634 print_section("Bytes b4 ", p - 16, 16); 648 print_section("Bytes b4 ", p - 16, 16);
635 649
636 print_section("Object ", p, min_t(unsigned long, s->object_size, 650 print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -647,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
647 if (s->flags & SLAB_STORE_USER) 661 if (s->flags & SLAB_STORE_USER)
648 off += 2 * sizeof(struct track); 662 off += 2 * sizeof(struct track);
649 663
650 if (off != s->size) 664 if (off != size_from_object(s))
651 /* Beginning of the filler is the free pointer */ 665 /* Beginning of the filler is the free pointer */
652 print_section("Padding ", p + off, s->size - off); 666 print_section("Padding ", p + off, size_from_object(s) - off);
653 667
654 dump_stack(); 668 dump_stack();
655} 669}
@@ -679,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
679{ 693{
680 u8 *p = object; 694 u8 *p = object;
681 695
696 if (s->flags & SLAB_RED_ZONE)
697 memset(p - s->red_left_pad, val, s->red_left_pad);
698
682 if (s->flags & __OBJECT_POISON) { 699 if (s->flags & __OBJECT_POISON) {
683 memset(p, POISON_FREE, s->object_size - 1); 700 memset(p, POISON_FREE, s->object_size - 1);
684 p[s->object_size - 1] = POISON_END; 701 p[s->object_size - 1] = POISON_END;
@@ -771,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
771 /* We also have user information there */ 788 /* We also have user information there */
772 off += 2 * sizeof(struct track); 789 off += 2 * sizeof(struct track);
773 790
774 if (s->size == off) 791 if (size_from_object(s) == off)
775 return 1; 792 return 1;
776 793
777 return check_bytes_and_report(s, page, p, "Object padding", 794 return check_bytes_and_report(s, page, p, "Object padding",
778 p + off, POISON_INUSE, s->size - off); 795 p + off, POISON_INUSE, size_from_object(s) - off);
779} 796}
780 797
781/* Check the pad bytes at the end of a slab page */ 798/* Check the pad bytes at the end of a slab page */
@@ -820,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
820 837
821 if (s->flags & SLAB_RED_ZONE) { 838 if (s->flags & SLAB_RED_ZONE) {
822 if (!check_bytes_and_report(s, page, object, "Redzone", 839 if (!check_bytes_and_report(s, page, object, "Redzone",
840 object - s->red_left_pad, val, s->red_left_pad))
841 return 0;
842
843 if (!check_bytes_and_report(s, page, object, "Redzone",
823 endobject, val, s->inuse - s->object_size)) 844 endobject, val, s->inuse - s->object_size))
824 return 0; 845 return 0;
825 } else { 846 } else {
@@ -1031,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
1031 init_tracking(s, object); 1052 init_tracking(s, object);
1032} 1053}
1033 1054
1034static noinline int alloc_debug_processing(struct kmem_cache *s, 1055static inline int alloc_consistency_checks(struct kmem_cache *s,
1035 struct page *page, 1056 struct page *page,
1036 void *object, unsigned long addr) 1057 void *object, unsigned long addr)
1037{ 1058{
1038 if (!check_slab(s, page)) 1059 if (!check_slab(s, page))
1039 goto bad; 1060 return 0;
1040 1061
1041 if (!check_valid_pointer(s, page, object)) { 1062 if (!check_valid_pointer(s, page, object)) {
1042 object_err(s, page, object, "Freelist Pointer check fails"); 1063 object_err(s, page, object, "Freelist Pointer check fails");
1043 goto bad; 1064 return 0;
1044 } 1065 }
1045 1066
1046 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1067 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1047 goto bad; 1068 return 0;
1069
1070 return 1;
1071}
1072
1073static noinline int alloc_debug_processing(struct kmem_cache *s,
1074 struct page *page,
1075 void *object, unsigned long addr)
1076{
1077 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1078 if (!alloc_consistency_checks(s, page, object, addr))
1079 goto bad;
1080 }
1048 1081
1049 /* Success perform special debug activities for allocs */ 1082 /* Success perform special debug activities for allocs */
1050 if (s->flags & SLAB_STORE_USER) 1083 if (s->flags & SLAB_STORE_USER)
@@ -1067,37 +1100,21 @@ bad:
1067 return 0; 1100 return 0;
1068} 1101}
1069 1102
1070/* Supports checking bulk free of a constructed freelist */ 1103static inline int free_consistency_checks(struct kmem_cache *s,
1071static noinline struct kmem_cache_node *free_debug_processing( 1104 struct page *page, void *object, unsigned long addr)
1072 struct kmem_cache *s, struct page *page,
1073 void *head, void *tail, int bulk_cnt,
1074 unsigned long addr, unsigned long *flags)
1075{ 1105{
1076 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1077 void *object = head;
1078 int cnt = 0;
1079
1080 spin_lock_irqsave(&n->list_lock, *flags);
1081 slab_lock(page);
1082
1083 if (!check_slab(s, page))
1084 goto fail;
1085
1086next_object:
1087 cnt++;
1088
1089 if (!check_valid_pointer(s, page, object)) { 1106 if (!check_valid_pointer(s, page, object)) {
1090 slab_err(s, page, "Invalid object pointer 0x%p", object); 1107 slab_err(s, page, "Invalid object pointer 0x%p", object);
1091 goto fail; 1108 return 0;
1092 } 1109 }
1093 1110
1094 if (on_freelist(s, page, object)) { 1111 if (on_freelist(s, page, object)) {
1095 object_err(s, page, object, "Object already free"); 1112 object_err(s, page, object, "Object already free");
1096 goto fail; 1113 return 0;
1097 } 1114 }
1098 1115
1099 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1116 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1100 goto out; 1117 return 0;
1101 1118
1102 if (unlikely(s != page->slab_cache)) { 1119 if (unlikely(s != page->slab_cache)) {
1103 if (!PageSlab(page)) { 1120 if (!PageSlab(page)) {
@@ -1110,7 +1127,37 @@ next_object:
1110 } else 1127 } else
1111 object_err(s, page, object, 1128 object_err(s, page, object,
1112 "page slab pointer corrupt."); 1129 "page slab pointer corrupt.");
1113 goto fail; 1130 return 0;
1131 }
1132 return 1;
1133}
1134
1135/* Supports checking bulk free of a constructed freelist */
1136static noinline int free_debug_processing(
1137 struct kmem_cache *s, struct page *page,
1138 void *head, void *tail, int bulk_cnt,
1139 unsigned long addr)
1140{
1141 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1142 void *object = head;
1143 int cnt = 0;
1144 unsigned long uninitialized_var(flags);
1145 int ret = 0;
1146
1147 spin_lock_irqsave(&n->list_lock, flags);
1148 slab_lock(page);
1149
1150 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1151 if (!check_slab(s, page))
1152 goto out;
1153 }
1154
1155next_object:
1156 cnt++;
1157
1158 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1159 if (!free_consistency_checks(s, page, object, addr))
1160 goto out;
1114 } 1161 }
1115 1162
1116 if (s->flags & SLAB_STORE_USER) 1163 if (s->flags & SLAB_STORE_USER)
@@ -1124,23 +1171,18 @@ next_object:
1124 object = get_freepointer(s, object); 1171 object = get_freepointer(s, object);
1125 goto next_object; 1172 goto next_object;
1126 } 1173 }
1174 ret = 1;
1175
1127out: 1176out:
1128 if (cnt != bulk_cnt) 1177 if (cnt != bulk_cnt)
1129 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", 1178 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
1130 bulk_cnt, cnt); 1179 bulk_cnt, cnt);
1131 1180
1132 slab_unlock(page); 1181 slab_unlock(page);
1133 /* 1182 spin_unlock_irqrestore(&n->list_lock, flags);
1134 * Keep node_lock to preserve integrity 1183 if (!ret)
1135 * until the object is actually freed 1184 slab_fix(s, "Object at 0x%p not freed", object);
1136 */ 1185 return ret;
1137 return n;
1138
1139fail:
1140 slab_unlock(page);
1141 spin_unlock_irqrestore(&n->list_lock, *flags);
1142 slab_fix(s, "Object at 0x%p not freed", object);
1143 return NULL;
1144} 1186}
1145 1187
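
free_debug_processing() now returns a plain int and always drops its locks itself instead of handing back a locked node on success, and the consistency checks run only when SLAB_CONSISTENCY_CHECKS is set. The resulting control flow is the familiar gated-checks, single-exit pattern; a self-contained toy model (the types, flag and helpers are illustrative, and a boolean stands in for the real list_lock/slab_lock pair):

#include <stdbool.h>
#include <stdio.h>

struct toy_cache { unsigned int flags; };
struct toy_slab  { bool locked; };
#define TOY_CONSISTENCY_CHECKS 0x1

static bool pointer_is_valid(void *obj) { return obj != NULL; }
static bool object_is_free(void *obj)   { (void)obj; return false; }

static int checked_free(struct toy_cache *s, struct toy_slab *slab, void *obj)
{
	int ret = 0;

	slab->locked = true;			/* take the lock once */

	if (s->flags & TOY_CONSISTENCY_CHECKS) {
		if (!pointer_is_valid(obj) || object_is_free(obj))
			goto out;		/* still unlock below */
	}

	/* ... put the object back on the freelist here ... */
	ret = 1;
out:
	slab->locked = false;			/* unlock on every path */
	if (!ret)
		fprintf(stderr, "object %p not freed\n", obj);
	return ret;
}

int main(void)
{
	struct toy_cache c = { .flags = TOY_CONSISTENCY_CHECKS };
	struct toy_slab sl = { .locked = false };
	int dummy;

	printf("valid free -> %d\n", checked_free(&c, &sl, &dummy));
	printf("NULL free  -> %d\n", checked_free(&c, &sl, NULL));
	return 0;
}
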
1146static int __init setup_slub_debug(char *str) 1188static int __init setup_slub_debug(char *str)
@@ -1172,7 +1214,7 @@ static int __init setup_slub_debug(char *str)
1172 for (; *str && *str != ','; str++) { 1214 for (; *str && *str != ','; str++) {
1173 switch (tolower(*str)) { 1215 switch (tolower(*str)) {
1174 case 'f': 1216 case 'f':
1175 slub_debug |= SLAB_DEBUG_FREE; 1217 slub_debug |= SLAB_CONSISTENCY_CHECKS;
1176 break; 1218 break;
1177 case 'z': 1219 case 'z':
1178 slub_debug |= SLAB_RED_ZONE; 1220 slub_debug |= SLAB_RED_ZONE;
@@ -1231,10 +1273,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
1231static inline int alloc_debug_processing(struct kmem_cache *s, 1273static inline int alloc_debug_processing(struct kmem_cache *s,
1232 struct page *page, void *object, unsigned long addr) { return 0; } 1274 struct page *page, void *object, unsigned long addr) { return 0; }
1233 1275
1234static inline struct kmem_cache_node *free_debug_processing( 1276static inline int free_debug_processing(
1235 struct kmem_cache *s, struct page *page, 1277 struct kmem_cache *s, struct page *page,
1236 void *head, void *tail, int bulk_cnt, 1278 void *head, void *tail, int bulk_cnt,
1237 unsigned long addr, unsigned long *flags) { return NULL; } 1279 unsigned long addr) { return 0; }
1238 1280
1239static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1281static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1240 { return 1; } 1282 { return 1; }
@@ -1281,36 +1323,6 @@ static inline void kfree_hook(const void *x)
1281 kasan_kfree_large(x); 1323 kasan_kfree_large(x);
1282} 1324}
1283 1325
1284static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1285 gfp_t flags)
1286{
1287 flags &= gfp_allowed_mask;
1288 lockdep_trace_alloc(flags);
1289 might_sleep_if(gfpflags_allow_blocking(flags));
1290
1291 if (should_failslab(s->object_size, flags, s->flags))
1292 return NULL;
1293
1294 return memcg_kmem_get_cache(s, flags);
1295}
1296
1297static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1298 size_t size, void **p)
1299{
1300 size_t i;
1301
1302 flags &= gfp_allowed_mask;
1303 for (i = 0; i < size; i++) {
1304 void *object = p[i];
1305
1306 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1307 kmemleak_alloc_recursive(object, s->object_size, 1,
1308 s->flags, flags);
1309 kasan_slab_alloc(s, object);
1310 }
1311 memcg_kmem_put_cache(s);
1312}
1313
1314static inline void slab_free_hook(struct kmem_cache *s, void *x) 1326static inline void slab_free_hook(struct kmem_cache *s, void *x)
1315{ 1327{
1316 kmemleak_free_recursive(x, s->flags); 1328 kmemleak_free_recursive(x, s->flags);
@@ -1470,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1470 set_freepointer(s, p, NULL); 1482 set_freepointer(s, p, NULL);
1471 } 1483 }
1472 1484
1473 page->freelist = start; 1485 page->freelist = fixup_red_left(s, start);
1474 page->inuse = page->objects; 1486 page->inuse = page->objects;
1475 page->frozen = 1; 1487 page->frozen = 1;
1476 1488
@@ -1506,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1506 int order = compound_order(page); 1518 int order = compound_order(page);
1507 int pages = 1 << order; 1519 int pages = 1 << order;
1508 1520
1509 if (kmem_cache_debug(s)) { 1521 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1510 void *p; 1522 void *p;
1511 1523
1512 slab_pad_check(s, page); 1524 slab_pad_check(s, page);
@@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2224 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 2236 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2225 return; 2237 return;
2226 2238
2227 pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2239 pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2228 nid, gfpflags); 2240 nid, gfpflags, &gfpflags);
2229 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", 2241 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
2230 s->name, s->object_size, s->size, oo_order(s->oo), 2242 s->name, s->object_size, s->size, oo_order(s->oo),
2231 oo_order(s->min)); 2243 oo_order(s->min));
@@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2642 stat(s, FREE_SLOWPATH); 2654 stat(s, FREE_SLOWPATH);
2643 2655
2644 if (kmem_cache_debug(s) && 2656 if (kmem_cache_debug(s) &&
2645 !(n = free_debug_processing(s, page, head, tail, cnt, 2657 !free_debug_processing(s, page, head, tail, cnt, addr))
2646 addr, &flags)))
2647 return; 2658 return;
2648 2659
2649 do { 2660 do {
@@ -2815,6 +2826,7 @@ struct detached_freelist {
2815 void *tail; 2826 void *tail;
2816 void *freelist; 2827 void *freelist;
2817 int cnt; 2828 int cnt;
2829 struct kmem_cache *s;
2818}; 2830};
2819 2831
2820/* 2832/*
@@ -2829,26 +2841,45 @@ struct detached_freelist {
2829 * synchronization primitive. Look ahead in the array is limited due 2841 * synchronization primitive. Look ahead in the array is limited due
2830 * to performance reasons. 2842 * to performance reasons.
2831 */ 2843 */
2832static int build_detached_freelist(struct kmem_cache *s, size_t size, 2844static inline
2833 void **p, struct detached_freelist *df) 2845int build_detached_freelist(struct kmem_cache *s, size_t size,
2846 void **p, struct detached_freelist *df)
2834{ 2847{
2835 size_t first_skipped_index = 0; 2848 size_t first_skipped_index = 0;
2836 int lookahead = 3; 2849 int lookahead = 3;
2837 void *object; 2850 void *object;
2851 struct page *page;
2838 2852
2839 /* Always re-init detached_freelist */ 2853 /* Always re-init detached_freelist */
2840 df->page = NULL; 2854 df->page = NULL;
2841 2855
2842 do { 2856 do {
2843 object = p[--size]; 2857 object = p[--size];
2858 /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
2844 } while (!object && size); 2859 } while (!object && size);
2845 2860
2846 if (!object) 2861 if (!object)
2847 return 0; 2862 return 0;
2848 2863
2864 page = virt_to_head_page(object);
2865 if (!s) {
2866 /* Handle kmalloc'ed objects */
2867 if (unlikely(!PageSlab(page))) {
2868 BUG_ON(!PageCompound(page));
2869 kfree_hook(object);
2870 __free_kmem_pages(page, compound_order(page));
2871 p[size] = NULL; /* mark object processed */
2872 return size;
2873 }
2874 /* Derive kmem_cache from object */
2875 df->s = page->slab_cache;
2876 } else {
2877 df->s = cache_from_obj(s, object); /* Support for memcg */
2878 }
2879
2849 /* Start new detached freelist */ 2880 /* Start new detached freelist */
2850 set_freepointer(s, object, NULL); 2881 df->page = page;
2851 df->page = virt_to_head_page(object); 2882 set_freepointer(df->s, object, NULL);
2852 df->tail = object; 2883 df->tail = object;
2853 df->freelist = object; 2884 df->freelist = object;
2854 p[size] = NULL; /* mark object processed */ 2885 p[size] = NULL; /* mark object processed */
@@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
2862 /* df->page is always set at this point */ 2893 /* df->page is always set at this point */
2863 if (df->page == virt_to_head_page(object)) { 2894 if (df->page == virt_to_head_page(object)) {
2864 /* Opportunity build freelist */ 2895 /* Opportunity build freelist */
2865 set_freepointer(s, object, df->freelist); 2896 set_freepointer(df->s, object, df->freelist);
2866 df->freelist = object; 2897 df->freelist = object;
2867 df->cnt++; 2898 df->cnt++;
2868 p[size] = NULL; /* mark object processed */ 2899 p[size] = NULL; /* mark object processed */
@@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
2881 return first_skipped_index; 2912 return first_skipped_index;
2882} 2913}
2883 2914
2884
2885/* Note that interrupts must be enabled when calling this function. */ 2915/* Note that interrupts must be enabled when calling this function. */
2886void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) 2916void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2887{ 2917{
2888 if (WARN_ON(!size)) 2918 if (WARN_ON(!size))
2889 return; 2919 return;
2890 2920
2891 do { 2921 do {
2892 struct detached_freelist df; 2922 struct detached_freelist df;
2893 struct kmem_cache *s;
2894
2895 /* Support for memcg */
2896 s = cache_from_obj(orig_s, p[size - 1]);
2897 2923
2898 size = build_detached_freelist(s, size, p, &df); 2924 size = build_detached_freelist(s, size, p, &df);
2899 if (unlikely(!df.page)) 2925 if (unlikely(!df.page))
2900 continue; 2926 continue;
2901 2927
2902 slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); 2928 slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
2903 } while (likely(size)); 2929 } while (likely(size));
2904} 2930}
2905EXPORT_SYMBOL(kmem_cache_free_bulk); 2931EXPORT_SYMBOL(kmem_cache_free_bulk);
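
The bulk-free path builds a detached freelist: it repeatedly picks an unprocessed object, chains every other object that lives on the same slab page to it, and releases the whole chain with one slow-path call; carrying df->s per group is what allows the NULL-cache (kfree_bulk) and memcg cases to resolve the right cache. A simplified userspace model of just the grouping step (the kernel additionally limits its lookahead and links a real freelist instead of counting):

#include <stddef.h>
#include <stdio.h>

#define PAGE_GROUP(obj) ((unsigned long)(obj) >> 12)	/* stand-in for virt_to_head_page() */

static void bulk_free(void **p, size_t size)
{
	while (size) {
		size_t i = size;
		unsigned long group;
		unsigned int cnt;

		/* pick the last unprocessed object as the group leader */
		while (i && !p[i - 1])
			i--;
		if (!i)
			return;
		i--;
		group = PAGE_GROUP(p[i]);
		p[i] = NULL;		/* mark processed */
		cnt = 1;

		/* sweep the remaining entries for members of the same page */
		for (size_t j = 0; j < i; j++) {
			if (p[j] && PAGE_GROUP(p[j]) == group) {
				p[j] = NULL;
				cnt++;
			}
		}
		printf("free %u object(s) from page group %#lx\n", cnt, group);
		size = i;
	}
}

int main(void)
{
	void *objs[] = { (void *)0x1008, (void *)0x2010, (void *)0x1010,
			 (void *)0x2020, (void *)0x3000 };

	bulk_free(objs, 5);
	return 0;
}
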
@@ -3285,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3285 */ 3311 */
3286 size += 2 * sizeof(struct track); 3312 size += 2 * sizeof(struct track);
3287 3313
3288 if (flags & SLAB_RED_ZONE) 3314 if (flags & SLAB_RED_ZONE) {
3289 /* 3315 /*
3290 * Add some empty padding so that we can catch 3316 * Add some empty padding so that we can catch
3291 * overwrites from earlier objects rather than let 3317 * overwrites from earlier objects rather than let
@@ -3294,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3294 * of the object. 3320 * of the object.
3295 */ 3321 */
3296 size += sizeof(void *); 3322 size += sizeof(void *);
3323
3324 s->red_left_pad = sizeof(void *);
3325 s->red_left_pad = ALIGN(s->red_left_pad, s->align);
3326 size += s->red_left_pad;
3327 }
3297#endif 3328#endif
3298 3329
3299 /* 3330 /*
@@ -3357,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3357 3388
3358#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3389#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3359 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3390 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3360 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3391 if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
3361 /* Enable fast mode */ 3392 /* Enable fast mode */
3362 s->flags |= __CMPXCHG_DOUBLE; 3393 s->flags |= __CMPXCHG_DOUBLE;
3363#endif 3394#endif
@@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects);
4812 4843
4813static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4844static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4814{ 4845{
4815 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4846 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
4816} 4847}
4817 4848
4818static ssize_t sanity_checks_store(struct kmem_cache *s, 4849static ssize_t sanity_checks_store(struct kmem_cache *s,
4819 const char *buf, size_t length) 4850 const char *buf, size_t length)
4820{ 4851{
4821 s->flags &= ~SLAB_DEBUG_FREE; 4852 s->flags &= ~SLAB_CONSISTENCY_CHECKS;
4822 if (buf[0] == '1') { 4853 if (buf[0] == '1') {
4823 s->flags &= ~__CMPXCHG_DOUBLE; 4854 s->flags &= ~__CMPXCHG_DOUBLE;
4824 s->flags |= SLAB_DEBUG_FREE; 4855 s->flags |= SLAB_CONSISTENCY_CHECKS;
4825 } 4856 }
4826 return length; 4857 return length;
4827} 4858}
@@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4865 4896
4866 s->flags &= ~SLAB_RED_ZONE; 4897 s->flags &= ~SLAB_RED_ZONE;
4867 if (buf[0] == '1') { 4898 if (buf[0] == '1') {
4868 s->flags &= ~__CMPXCHG_DOUBLE;
4869 s->flags |= SLAB_RED_ZONE; 4899 s->flags |= SLAB_RED_ZONE;
4870 } 4900 }
4871 calculate_sizes(s, -1); 4901 calculate_sizes(s, -1);
@@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s,
4886 4916
4887 s->flags &= ~SLAB_POISON; 4917 s->flags &= ~SLAB_POISON;
4888 if (buf[0] == '1') { 4918 if (buf[0] == '1') {
4889 s->flags &= ~__CMPXCHG_DOUBLE;
4890 s->flags |= SLAB_POISON; 4919 s->flags |= SLAB_POISON;
4891 } 4920 }
4892 calculate_sizes(s, -1); 4921 calculate_sizes(s, -1);
@@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s)
5356 *p++ = 'd'; 5385 *p++ = 'd';
5357 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5386 if (s->flags & SLAB_RECLAIM_ACCOUNT)
5358 *p++ = 'a'; 5387 *p++ = 'a';
5359 if (s->flags & SLAB_DEBUG_FREE) 5388 if (s->flags & SLAB_CONSISTENCY_CHECKS)
5360 *p++ = 'F'; 5389 *p++ = 'F';
5361 if (!(s->flags & SLAB_NOTRACK)) 5390 if (!(s->flags & SLAB_NOTRACK))
5362 *p++ = 't'; 5391 *p++ = 't';
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27cd17..7598b552ae03 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
519static int 519static int
520invalidate_complete_page2(struct address_space *mapping, struct page *page) 520invalidate_complete_page2(struct address_space *mapping, struct page *page)
521{ 521{
522 struct mem_cgroup *memcg;
523 unsigned long flags; 522 unsigned long flags;
524 523
525 if (page->mapping != mapping) 524 if (page->mapping != mapping)
@@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
528 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) 527 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
529 return 0; 528 return 0;
530 529
531 memcg = mem_cgroup_begin_page_stat(page);
532 spin_lock_irqsave(&mapping->tree_lock, flags); 530 spin_lock_irqsave(&mapping->tree_lock, flags);
533 if (PageDirty(page)) 531 if (PageDirty(page))
534 goto failed; 532 goto failed;
535 533
536 BUG_ON(page_has_private(page)); 534 BUG_ON(page_has_private(page));
537 __delete_from_page_cache(page, NULL, memcg); 535 __delete_from_page_cache(page, NULL);
538 spin_unlock_irqrestore(&mapping->tree_lock, flags); 536 spin_unlock_irqrestore(&mapping->tree_lock, flags);
539 mem_cgroup_end_page_stat(memcg);
540 537
541 if (mapping->a_ops->freepage) 538 if (mapping->a_ops->freepage)
542 mapping->a_ops->freepage(page); 539 mapping->a_ops->freepage(page);
@@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
545 return 1; 542 return 1;
546failed: 543failed:
547 spin_unlock_irqrestore(&mapping->tree_lock, flags); 544 spin_unlock_irqrestore(&mapping->tree_lock, flags);
548 mem_cgroup_end_page_stat(memcg);
549 return 0; 545 return 0;
550} 546}
551 547
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71b1c29948db..dd984470248f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
195{ 195{
196 unsigned long nr; 196 unsigned long nr;
197 197
198 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 198 nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
199 zone_page_state(zone, NR_INACTIVE_FILE) + 199 zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
200 zone_page_state(zone, NR_ISOLATED_FILE); 200 zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
201 201
202 if (get_nr_swap_pages() > 0) 202 if (get_nr_swap_pages() > 0)
203 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 203 nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
204 zone_page_state(zone, NR_INACTIVE_ANON) + 204 zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
205 zone_page_state(zone, NR_ISOLATED_ANON); 205 zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
206 206
207 return nr; 207 return nr;
208} 208}
209 209
210bool zone_reclaimable(struct zone *zone) 210bool zone_reclaimable(struct zone *zone)
211{ 211{
212 return zone_page_state(zone, NR_PAGES_SCANNED) < 212 return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
213 zone_reclaimable_pages(zone) * 6; 213 zone_reclaimable_pages(zone) * 6;
214} 214}
215 215
216static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 216unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
217{ 217{
218 if (!mem_cgroup_disabled()) 218 if (!mem_cgroup_disabled())
219 return mem_cgroup_get_lru_size(lruvec, lru); 219 return mem_cgroup_get_lru_size(lruvec, lru);
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
228{ 228{
229 size_t size = sizeof(*shrinker->nr_deferred); 229 size_t size = sizeof(*shrinker->nr_deferred);
230 230
231 /*
232 * If we only have one possible node in the system anyway, save
233 * ourselves the trouble and disable NUMA aware behavior. This way we
234 * will save memory and some small loop time later.
235 */
236 if (nr_node_ids == 1)
237 shrinker->flags &= ~SHRINKER_NUMA_AWARE;
238
239 if (shrinker->flags & SHRINKER_NUMA_AWARE) 231 if (shrinker->flags & SHRINKER_NUMA_AWARE)
240 size *= nr_node_ids; 232 size *= nr_node_ids;
241 233
@@ -611,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
611 bool reclaimed) 603 bool reclaimed)
612{ 604{
613 unsigned long flags; 605 unsigned long flags;
614 struct mem_cgroup *memcg;
615 606
616 BUG_ON(!PageLocked(page)); 607 BUG_ON(!PageLocked(page));
617 BUG_ON(mapping != page_mapping(page)); 608 BUG_ON(mapping != page_mapping(page));
618 609
619 memcg = mem_cgroup_begin_page_stat(page);
620 spin_lock_irqsave(&mapping->tree_lock, flags); 610 spin_lock_irqsave(&mapping->tree_lock, flags);
621 /* 611 /*
622 * The non racy check for a busy page. 612 * The non racy check for a busy page.
@@ -656,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
656 mem_cgroup_swapout(page, swap); 646 mem_cgroup_swapout(page, swap);
657 __delete_from_swap_cache(page); 647 __delete_from_swap_cache(page);
658 spin_unlock_irqrestore(&mapping->tree_lock, flags); 648 spin_unlock_irqrestore(&mapping->tree_lock, flags);
659 mem_cgroup_end_page_stat(memcg);
660 swapcache_free(swap); 649 swapcache_free(swap);
661 } else { 650 } else {
662 void (*freepage)(struct page *); 651 void (*freepage)(struct page *);
@@ -682,9 +671,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
682 if (reclaimed && page_is_file_cache(page) && 671 if (reclaimed && page_is_file_cache(page) &&
683 !mapping_exiting(mapping) && !dax_mapping(mapping)) 672 !mapping_exiting(mapping) && !dax_mapping(mapping))
684 shadow = workingset_eviction(mapping, page); 673 shadow = workingset_eviction(mapping, page);
685 __delete_from_page_cache(page, shadow, memcg); 674 __delete_from_page_cache(page, shadow);
686 spin_unlock_irqrestore(&mapping->tree_lock, flags); 675 spin_unlock_irqrestore(&mapping->tree_lock, flags);
687 mem_cgroup_end_page_stat(memcg);
688 676
689 if (freepage != NULL) 677 if (freepage != NULL)
690 freepage(page); 678 freepage(page);
@@ -694,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
694 682
695cannot_free: 683cannot_free:
696 spin_unlock_irqrestore(&mapping->tree_lock, flags); 684 spin_unlock_irqrestore(&mapping->tree_lock, flags);
697 mem_cgroup_end_page_stat(memcg);
698 return 0; 685 return 0;
699} 686}
700 687
@@ -1931,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
1931 unsigned long inactive; 1918 unsigned long inactive;
1932 unsigned long active; 1919 unsigned long active;
1933 1920
1934 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1921 inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
1935 active = get_lru_size(lruvec, LRU_ACTIVE_FILE); 1922 active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
1936 1923
1937 return active > inactive; 1924 return active > inactive;
1938} 1925}
@@ -2071,7 +2058,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2071 * system is under heavy pressure. 2058 * system is under heavy pressure.
2072 */ 2059 */
2073 if (!inactive_file_is_low(lruvec) && 2060 if (!inactive_file_is_low(lruvec) &&
2074 get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { 2061 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
2075 scan_balance = SCAN_FILE; 2062 scan_balance = SCAN_FILE;
2076 goto out; 2063 goto out;
2077 } 2064 }
@@ -2097,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2097 * anon in [0], file in [1] 2084 * anon in [0], file in [1]
2098 */ 2085 */
2099 2086
2100 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + 2087 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
2101 get_lru_size(lruvec, LRU_INACTIVE_ANON); 2088 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
2102 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 2089 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
2103 get_lru_size(lruvec, LRU_INACTIVE_FILE); 2090 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
2104 2091
2105 spin_lock_irq(&zone->lru_lock); 2092 spin_lock_irq(&zone->lru_lock);
2106 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 2093 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2125,7 @@ out:
2138 unsigned long size; 2125 unsigned long size;
2139 unsigned long scan; 2126 unsigned long scan;
2140 2127
2141 size = get_lru_size(lruvec, lru); 2128 size = lruvec_lru_size(lruvec, lru);
2142 scan = size >> sc->priority; 2129 scan = size >> sc->priority;
2143 2130
2144 if (!scan && pass && force_scan) 2131 if (!scan && pass && force_scan)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 084c6725b373..69ce64f7b8d7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
924#endif 924#endif
925 925
926#ifdef CONFIG_PROC_FS 926#ifdef CONFIG_PROC_FS
927static char * const migratetype_names[MIGRATE_TYPES] = {
928 "Unmovable",
929 "Movable",
930 "Reclaimable",
931 "HighAtomic",
932#ifdef CONFIG_CMA
933 "CMA",
934#endif
935#ifdef CONFIG_MEMORY_ISOLATION
936 "Isolate",
937#endif
938};
939
940static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 927static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
941 struct zone *zone) 928 struct zone *zone)
942{ 929{
@@ -1133,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1133#ifdef CONFIG_PAGE_OWNER 1120#ifdef CONFIG_PAGE_OWNER
1134 int mtype; 1121 int mtype;
1135 1122
1136 if (!page_owner_inited) 1123 if (!static_branch_unlikely(&page_owner_inited))
1137 return; 1124 return;
1138 1125
1139 drain_all_pages(NULL); 1126 drain_all_pages(NULL);
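
page_owner_inited is now a static key, so the common no-page_owner case costs a patched-out branch instead of a memory load. The usual jump-label pattern, as a hedged sketch of kernel code (the feature name is made up, the API calls are the real ones from linux/jump_label.h):

#include <linux/jump_label.h>

/* Declared false: the branch below is compiled out until enabled. */
static DEFINE_STATIC_KEY_FALSE(myfeature_inited);

static void myfeature_init(void)
{
	/* ... set up whatever state the feature needs ... */
	static_branch_enable(&myfeature_inited);	/* patch the branch in */
}

static void myfeature_hook(void)
{
	if (!static_branch_unlikely(&myfeature_inited))
		return;			/* near-zero cost while disabled */

	/* ... slow, debug-only work ... */
}
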
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..6130ba0b2641 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
152 * refault distance will immediately activate the refaulting page. 152 * refault distance will immediately activate the refaulting page.
153 */ 153 */
154 154
155static void *pack_shadow(unsigned long eviction, struct zone *zone) 155#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
156 ZONES_SHIFT + NODES_SHIFT + \
157 MEM_CGROUP_ID_SHIFT)
158#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
159
160/*
161 * Eviction timestamps need to be able to cover the full range of
162 * actionable refaults. However, bits are tight in the radix tree
163 * entry, and after storing the identifier for the lruvec there might
164 * not be enough left to represent every single actionable refault. In
165 * that case, we have to sacrifice granularity for distance, and group
166 * evictions into coarser buckets by shaving off lower timestamp bits.
167 */
168static unsigned int bucket_order __read_mostly;
169
170static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
156{ 171{
172 eviction >>= bucket_order;
173 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
157 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); 174 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
158 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); 175 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
159 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); 176 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
161 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); 178 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
162} 179}
163 180
164static void unpack_shadow(void *shadow, 181static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
165 struct zone **zone, 182 unsigned long *evictionp)
166 unsigned long *distance)
167{ 183{
168 unsigned long entry = (unsigned long)shadow; 184 unsigned long entry = (unsigned long)shadow;
169 unsigned long eviction; 185 int memcgid, nid, zid;
170 unsigned long refault;
171 unsigned long mask;
172 int zid, nid;
173 186
174 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; 187 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
175 zid = entry & ((1UL << ZONES_SHIFT) - 1); 188 zid = entry & ((1UL << ZONES_SHIFT) - 1);
176 entry >>= ZONES_SHIFT; 189 entry >>= ZONES_SHIFT;
177 nid = entry & ((1UL << NODES_SHIFT) - 1); 190 nid = entry & ((1UL << NODES_SHIFT) - 1);
178 entry >>= NODES_SHIFT; 191 entry >>= NODES_SHIFT;
179 eviction = entry; 192 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
193 entry >>= MEM_CGROUP_ID_SHIFT;
180 194
181 *zone = NODE_DATA(nid)->node_zones + zid; 195 *memcgidp = memcgid;
182 196 *zonep = NODE_DATA(nid)->node_zones + zid;
183 refault = atomic_long_read(&(*zone)->inactive_age); 197 *evictionp = entry << bucket_order;
184 mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
185 RADIX_TREE_EXCEPTIONAL_SHIFT);
186 /*
187 * The unsigned subtraction here gives an accurate distance
188 * across inactive_age overflows in most cases.
189 *
190 * There is a special case: usually, shadow entries have a
191 * short lifetime and are either refaulted or reclaimed along
192 * with the inode before they get too old. But it is not
193 * impossible for the inactive_age to lap a shadow entry in
 194 the field, which can then result in a false small
195 * refault distance, leading to a false activation should this
196 * old entry actually refault again. However, earlier kernels
197 * used to deactivate unconditionally with *every* reclaim
198 * invocation for the longest time, so the occasional
199 * inappropriate activation leading to pressure on the active
200 * list is not a problem.
201 */
202 *distance = (refault - eviction) & mask;
203} 198}
204 199
205/** 200/**
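
The new shadow entry packs, from the low bits up: the radix-tree exceptional tag, the zone index, the node id, the memcg id, and finally the bucketed eviction timestamp. A self-contained userspace sketch of that round trip, assuming a 64-bit build and example field widths (the kernel derives ZONES_SHIFT and NODES_SHIFT from the config; MEM_CGROUP_ID_SHIFT is 16 in this series; bucket_order is computed at boot, see workingset_init() below).

/*
 * Standalone model of the shadow entry layout in this hunk.  The widths
 * are examples only; the real values come from the kernel config.
 */
#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_SHIFT  2        /* low tag bits claimed by the radix tree */
#define ZONES_SHIFT        2
#define NODES_SHIFT        6
#define MEMCG_ID_SHIFT     16
static unsigned int bucket_order;   /* stays 0 unless memory is very large */

static unsigned long pack(int memcgid, int nid, int zid, unsigned long eviction)
{
        eviction >>= bucket_order;
        eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | nid;
        eviction = (eviction << ZONES_SHIFT) | zid;
        return (eviction << EXCEPTIONAL_SHIFT) | 0x2;   /* exceptional tag */
}

static void unpack(unsigned long entry, int *memcgid, int *nid, int *zid,
                   unsigned long *eviction)
{
        entry >>= EXCEPTIONAL_SHIFT;
        *zid = entry & ((1UL << ZONES_SHIFT) - 1);
        entry >>= ZONES_SHIFT;
        *nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        *memcgid = entry & ((1UL << MEMCG_ID_SHIFT) - 1);
        entry >>= MEMCG_ID_SHIFT;
        *eviction = entry << bucket_order;
}

int main(void)
{
        int memcgid, nid, zid;
        unsigned long ev;

        unpack(pack(42, 3, 1, 123456), &memcgid, &nid, &zid, &ev);
        assert(memcgid == 42 && nid == 3 && zid == 1 && ev == 123456);
        printf("round-trip ok: memcg=%d nid=%d zid=%d eviction=%lu\n",
               memcgid, nid, zid, ev);
        return 0;
}

With these example widths, EVICTION_SHIFT is 2 + 2 + 6 + 16 = 26, leaving 38 bits of timestamp on a 64-bit build before any bucketing is required.
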
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
212 */ 207 */
213void *workingset_eviction(struct address_space *mapping, struct page *page) 208void *workingset_eviction(struct address_space *mapping, struct page *page)
214{ 209{
210 struct mem_cgroup *memcg = page_memcg(page);
215 struct zone *zone = page_zone(page); 211 struct zone *zone = page_zone(page);
212 int memcgid = mem_cgroup_id(memcg);
216 unsigned long eviction; 213 unsigned long eviction;
214 struct lruvec *lruvec;
217 215
218 eviction = atomic_long_inc_return(&zone->inactive_age); 216 /* Page is fully exclusive and pins page->mem_cgroup */
219 return pack_shadow(eviction, zone); 217 VM_BUG_ON_PAGE(PageLRU(page), page);
218 VM_BUG_ON_PAGE(page_count(page), page);
219 VM_BUG_ON_PAGE(!PageLocked(page), page);
220
221 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
222 eviction = atomic_long_inc_return(&lruvec->inactive_age);
223 return pack_shadow(memcgid, zone, eviction);
220} 224}
221 225
222/** 226/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
231bool workingset_refault(void *shadow) 235bool workingset_refault(void *shadow)
232{ 236{
233 unsigned long refault_distance; 237 unsigned long refault_distance;
238 unsigned long active_file;
239 struct mem_cgroup *memcg;
240 unsigned long eviction;
241 struct lruvec *lruvec;
242 unsigned long refault;
234 struct zone *zone; 243 struct zone *zone;
244 int memcgid;
245
246 unpack_shadow(shadow, &memcgid, &zone, &eviction);
247
248 rcu_read_lock();
249 /*
250 * Look up the memcg associated with the stored ID. It might
251 * have been deleted since the page's eviction.
252 *
253 * Note that in rare events the ID could have been recycled
254 * for a new cgroup that refaults a shared page. This is
255 * impossible to tell from the available data. However, this
256 * should be a rare and limited disturbance, and activations
257 * are always speculative anyway. Ultimately, it's the aging
258 * algorithm's job to shake out the minimum access frequency
259 * for the active cache.
260 *
261 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
262 * would be better if the root_mem_cgroup existed in all
263 * configurations instead.
264 */
265 memcg = mem_cgroup_from_id(memcgid);
266 if (!mem_cgroup_disabled() && !memcg) {
267 rcu_read_unlock();
268 return false;
269 }
270 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
271 refault = atomic_long_read(&lruvec->inactive_age);
272 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
273 rcu_read_unlock();
274
275 /*
276 * The unsigned subtraction here gives an accurate distance
277 * across inactive_age overflows in most cases.
278 *
279 * There is a special case: usually, shadow entries have a
280 * short lifetime and are either refaulted or reclaimed along
281 * with the inode before they get too old. But it is not
282 * impossible for the inactive_age to lap a shadow entry in
 283 the field, which can then result in a false small
284 * refault distance, leading to a false activation should this
285 * old entry actually refault again. However, earlier kernels
286 * used to deactivate unconditionally with *every* reclaim
287 * invocation for the longest time, so the occasional
288 * inappropriate activation leading to pressure on the active
289 * list is not a problem.
290 */
291 refault_distance = (refault - eviction) & EVICTION_MASK;
235 292
236 unpack_shadow(shadow, &zone, &refault_distance);
237 inc_zone_state(zone, WORKINGSET_REFAULT); 293 inc_zone_state(zone, WORKINGSET_REFAULT);
238 294
239 if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { 295 if (refault_distance <= active_file) {
240 inc_zone_state(zone, WORKINGSET_ACTIVATE); 296 inc_zone_state(zone, WORKINGSET_ACTIVATE);
241 return true; 297 return true;
242 } 298 }
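
The activation test above reduces to: how many evictions has this lruvec seen between the page's eviction and its refault, and does that distance fit within the currently active file cache? A toy model of the unsigned-wraparound distance check, reusing the example EVICTION_SHIFT from the earlier sketch and assuming a 64-bit build.

#include <assert.h>

#define EVICTION_SHIFT  26      /* example: tag + zone + node + memcg bits */
#define EVICTION_MASK   (~0UL >> EVICTION_SHIFT)

static int should_activate(unsigned long eviction, unsigned long refault,
                           unsigned long active_file)
{
        /* Unsigned subtraction stays correct across inactive_age wrap. */
        unsigned long distance = (refault - eviction) & EVICTION_MASK;

        return distance <= active_file;
}

int main(void)
{
        /* Recently evicted page: only 100 evictions in between, activate. */
        assert(should_activate(1000, 1100, 4096));
        /* Distance larger than the active list: leave it inactive. */
        assert(!should_activate(1000, 1000000, 4096));
        /* Counter wrapped between eviction and refault: still ~200 apart. */
        assert(should_activate(EVICTION_MASK - 100, 100, 4096));
        return 0;
}
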
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
249 */ 305 */
250void workingset_activation(struct page *page) 306void workingset_activation(struct page *page)
251{ 307{
252 atomic_long_inc(&page_zone(page)->inactive_age); 308 struct lruvec *lruvec;
309
310 lock_page_memcg(page);
311 /*
312 * Filter non-memcg pages here, e.g. unmap can call
313 * mark_page_accessed() on VDSO pages.
314 *
315 * XXX: See workingset_refault() - this should return
316 * root_mem_cgroup even for !CONFIG_MEMCG.
317 */
318 if (!mem_cgroup_disabled() && !page_memcg(page))
319 goto out;
320 lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
321 atomic_long_inc(&lruvec->inactive_age);
322out:
323 unlock_page_memcg(page);
253} 324}
254 325
255/* 326/*
@@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key;
398 469
399static int __init workingset_init(void) 470static int __init workingset_init(void)
400{ 471{
472 unsigned int timestamp_bits;
473 unsigned int max_order;
401 int ret; 474 int ret;
402 475
476 BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
477 /*
478 * Calculate the eviction bucket size to cover the longest
479 * actionable refault distance, which is currently half of
480 * memory (totalram_pages/2). However, memory hotplug may add
481 * some more pages at runtime, so keep working with up to
482 * double the initial memory by using totalram_pages as-is.
483 */
484 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
485 max_order = fls_long(totalram_pages - 1);
486 if (max_order > timestamp_bits)
487 bucket_order = max_order - timestamp_bits;
488 printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
489 timestamp_bits, max_order, bucket_order);
490
403 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); 491 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
404 if (ret) 492 if (ret)
405 goto err; 493 goto err;
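
workingset_init() trades timestamp granularity for range: when the machine has more pages than the timestamp field can count, bucket_order low bits are shaved off so that distances up to roughly total memory stay representable. A standalone sketch of that calculation, with fls_long() open-coded because it is a kernel helper, EVICTION_SHIFT again using the example widths from above, and 64-bit longs assumed; the memory sizes are illustrative (4KB pages).

#include <stdio.h>

#define BITS_PER_LONG   64
#define EVICTION_SHIFT  26      /* example: tag + zone + node + memcg bits */

/* Minimal stand-in for the kernel's fls_long(): position of highest set bit. */
static unsigned int fls_long(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static unsigned int bucket_order_for(unsigned long totalram_pages)
{
        unsigned int timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
        unsigned int max_order = fls_long(totalram_pages - 1);

        return max_order > timestamp_bits ? max_order - timestamp_bits : 0;
}

int main(void)
{
        /* 16GB of 4K pages: 4M pages, far below 2^38 -> no bucketing. */
        printf("16GB : bucket_order=%u\n", bucket_order_for(4UL << 20));
        /* A hypothetical 2PB machine: 2^39 pages -> drop one timestamp bit. */
        printf("2PB  : bucket_order=%u\n", bucket_order_for(1UL << 39));
        return 0;
}

With these example widths, the 38 timestamp bits already cover 2^38 evictions, so bucket_order stays 0 on all but enormous 64-bit machines; on 32-bit, where only a few timestamp bits remain after the identifier fields, the bucketing kicks in at ordinary memory sizes.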