-rw-r--r--  Documentation/lockup-watchdogs.txt | 18
-rw-r--r--  Documentation/sysctl/kernel.txt | 21
-rw-r--r--  Documentation/vm/unevictable-lru.txt | 8
-rw-r--r--  arch/alpha/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arm/include/asm/hugetlb.h | 4
-rw-r--r--  arch/arm/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arm/mm/hugetlbpage.c | 5
-rw-r--r--  arch/arm64/include/asm/hugetlb.h | 4
-rw-r--r--  arch/arm64/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 7
-rw-r--r--  arch/avr32/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/blackfin/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/c6x/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/cris/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/frv/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/hexagon/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/ia64/include/asm/hugetlb.h | 4
-rw-r--r--  arch/ia64/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/m32r/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/m68k/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/metag/include/asm/dma-mapping.h | 14
-rw-r--r--  arch/metag/include/asm/hugetlb.h | 4
-rw-r--r--  arch/metag/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/metag/mm/hugetlbpage.c | 5
-rw-r--r--  arch/microblaze/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/mips/include/asm/hugetlb.h | 4
-rw-r--r--  arch/mips/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/mips/include/asm/pgtable.h | 8
-rw-r--r--  arch/mips/mm/hugetlbpage.c | 5
-rw-r--r--  arch/mn10300/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/nios2/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/openrisc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/parisc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/parisc/kernel/pci-dma.c | 27
-rw-r--r--  arch/powerpc/include/asm/hugetlb.h | 5
-rw-r--r--  arch/powerpc/include/asm/mm-arch-hooks.h | 28
-rw-r--r--  arch/powerpc/include/asm/mmu_context.h | 23
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64.h | 14
-rw-r--r--  arch/powerpc/kernel/vio.c | 10
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/powerpc/mm/pgtable_64.c | 73
-rw-r--r--  arch/s390/include/asm/hugetlb.h | 1
-rw-r--r--  arch/s390/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/s390/include/asm/pgtable.h | 30
-rw-r--r--  arch/s390/kernel/crash_dump.c | 5
-rw-r--r--  arch/s390/mm/hugetlbpage.c | 5
-rw-r--r--  arch/score/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/sh/include/asm/hugetlb.h | 3
-rw-r--r--  arch/sh/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sparc/include/asm/hugetlb.h | 4
-rw-r--r--  arch/sparc/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 8
-rw-r--r--  arch/sparc/kernel/ldc.c | 8
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sparc/mm/init_64.c | 6
-rw-r--r--  arch/tile/include/asm/hugetlb.h | 4
-rw-r--r--  arch/tile/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/tile/include/asm/pgtable.h | 8
-rw-r--r--  arch/tile/mm/hugetlbpage.c | 5
-rw-r--r--  arch/um/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/unicore32/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 3
-rw-r--r--  arch/x86/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  arch/x86/include/asm/pgtable.h | 4
-rw-r--r--  arch/x86/kernel/check.c | 3
-rw-r--r--  arch/x86/kernel/e820.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 3
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/platform/efi/efi.c | 21
-rw-r--r--  arch/xtensa/include/asm/dma-mapping.h | 19
-rw-r--r--  arch/xtensa/include/asm/mm-arch-hooks.h | 15
-rw-r--r--  drivers/staging/android/lowmemorykiller.c | 2
-rw-r--r--  drivers/tty/sysrq.c | 2
-rw-r--r--  drivers/xen/tmem.c | 8
-rw-r--r--  fs/configfs/item.c | 3
-rw-r--r--  fs/hugetlbfs/inode.c | 1
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ntfs/malloc.h | 7
-rw-r--r--  fs/ocfs2/alloc.c | 37
-rw-r--r--  fs/ocfs2/aops.c | 23
-rw-r--r--  fs/ocfs2/aops.h | 7
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 34
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 42
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dir.c | 25
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 1
-rw-r--r--  fs/ocfs2/file.c | 31
-rw-r--r--  fs/ocfs2/journal.c | 76
-rw-r--r--  fs/ocfs2/namei.c | 33
-rw-r--r--  fs/ocfs2/namei.h | 4
-rw-r--r--  fs/ocfs2/ocfs2.h | 10
-rw-r--r--  fs/ocfs2/refcounttree.c | 6
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/proc/array.c | 8
-rw-r--r--  fs/splice.c | 2
-rw-r--r--  include/asm-generic/pgtable.h | 34
-rw-r--r--  include/linux/bootmem.h | 8
-rw-r--r--  include/linux/configfs.h | 1
-rw-r--r--  include/linux/efi.h | 3
-rw-r--r--  include/linux/frontswap.h | 14
-rw-r--r--  include/linux/fsnotify_backend.h | 2
-rw-r--r--  include/linux/kmemleak.h | 6
-rw-r--r--  include/linux/memblock.h | 49
-rw-r--r--  include/linux/mm-arch-hooks.h | 25
-rw-r--r--  include/linux/mm.h | 37
-rw-r--r--  include/linux/mmu_notifier.h | 12
-rw-r--r--  include/linux/nmi.h | 3
-rw-r--r--  include/linux/oom.h | 12
-rw-r--r--  include/linux/slab.h | 26
-rw-r--r--  include/linux/smpboot.h | 5
-rw-r--r--  include/ras/ras_event.h | 85
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/smpboot.c | 60
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/watchdog.c | 67
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/cma.c | 10
-rw-r--r--  mm/filemap.c | 23
-rw-r--r--  mm/frontswap.c | 215
-rw-r--r--  mm/huge_memory.c | 22
-rw-r--r--  mm/hugetlb.c | 187
-rw-r--r--  mm/hwpoison-inject.c | 4
-rw-r--r--  mm/kmemleak.c | 168
-rw-r--r--  mm/memblock.c | 123
-rw-r--r--  mm/memcontrol.c | 59
-rw-r--r--  mm/memory-failure.c | 351
-rw-r--r--  mm/memory.c | 10
-rw-r--r--  mm/memory_hotplug.c | 1
-rw-r--r--  mm/mempolicy.c | 38
-rw-r--r--  mm/memtest.c | 3
-rw-r--r--  mm/migrate.c | 11
-rw-r--r--  mm/mmap.c | 6
-rw-r--r--  mm/mprotect.c | 11
-rw-r--r--  mm/mremap.c | 17
-rw-r--r--  mm/nobootmem.c | 14
-rw-r--r--  mm/nommu.c | 112
-rw-r--r--  mm/oom_kill.c | 158
-rw-r--r--  mm/page_alloc.c | 177
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgtable-generic.c | 29
-rw-r--r--  mm/rmap.c | 9
-rw-r--r--  mm/shmem.c | 2
-rw-r--r--  mm/slab.c | 1
-rw-r--r--  mm/slab.h | 1
-rw-r--r--  mm/slab_common.c | 98
-rw-r--r--  mm/slub.c | 1
-rw-r--r--  mm/swap.c | 1
-rw-r--r--  mm/vmscan.c | 15
151 files changed, 2277 insertions, 1321 deletions
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
index ab0baa692c13..22dd6af2e4bd 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/lockup-watchdogs.txt
@@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
 administrators to configure the period of the hrtimer and the perf
 event. The right value for a particular environment is a trade-off
 between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores. However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument. If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up. However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl. For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index c831001c45f1..e5d528e0c46e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -923,6 +923,27 @@ and nmi_watchdog.
 
 ==============================================================
 
+watchdog_cpumask:
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say:
+
+  echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+==============================================================
+
 watchdog_thresh:
 
 This value can be used to control the frequency of hrtimer and NMI
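For reference, a cpumask sysctl of this kind is typically exported through proc_do_large_bitmap(), which accepts and prints the cpulist format shown above. The sketch below shows the general wiring only; the variable and table names are illustrative assumptions, not a claim about what kernel/watchdog.c and kernel/sysctl.c do in this series.

/* Sketch: exposing a cpumask as a cpulist-formatted sysctl. */
#include <linux/cpumask.h>
#include <linux/sysctl.h>

static struct cpumask example_watchdog_cpumask;
static unsigned long *example_watchdog_cpumask_bits =
		cpumask_bits(&example_watchdog_cpumask);

static struct ctl_table example_watchdog_table[] = {
	{
		.procname	= "watchdog_cpumask",
		.data		= &example_watchdog_cpumask_bits,
		.maxlen		= NR_CPUS,	/* number of bits, not bytes */
		.mode		= 0644,
		.proc_handler	= proc_do_large_bitmap,
	},
	{ }
};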
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 3be0bfc4738d..32ee3a67dba2 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -467,7 +467,13 @@ mmap(MAP_LOCKED) SYSTEM CALL HANDLING
 
 In addition the mlock()/mlockall() system calls, an application can request
 that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
-call. Furthermore, any mmap() call or brk() call that expands the heap by a
+call. There is one important and subtle difference here, though. mmap() + mlock()
+will fail if the range cannot be faulted in (e.g. because mm_populate fails)
+and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. The mmaped
+area will still have properties of the locked area - aka. pages will not get
+swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a
 task that has previously called mlockall() with the MCL_FUTURE flag will result
 in the newly mapped memory being mlocked. Before the unevictable/mlock
 changes, the kernel simply called make_pages_present() to allocate pages and
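From user space, the difference documented above reads like this (a minimal sketch; the size and error handling are illustrative):

/* mmap()+mlock() reports a population failure; mmap(MAP_LOCKED) does not. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* may exceed RLIMIT_MEMLOCK on purpose */
	void *p, *q;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p != MAP_FAILED && mlock(p, len) != 0)
		printf("mlock: %s\n", strerror(errno));	/* e.g. ENOMEM */

	q = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
	if (q == MAP_FAILED)
		printf("mmap(MAP_LOCKED): %s\n", strerror(errno));
	/* q is usually valid even if the pages could not all be populated;
	 * touching them later may still take major page faults. */
	return 0;
}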
diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b07fd862fec3
--- /dev/null
+++ b/arch/alpha/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H
13#define _ASM_ALPHA_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */
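The same empty header is added for every architecture below. The point of the per-arch files is to let include/linux/mm-arch-hooks.h (added by this series but not shown here) fall back to no-op hooks whenever an architecture does not override them; a sketch of that usual fallback pattern, for illustration only:

/* Sketch of the generic-fallback pattern; not quoted from this series. */
#ifndef _LINUX_MM_ARCH_HOOKS_H
#define _LINUX_MM_ARCH_HOOKS_H

#include <asm/mm-arch-hooks.h>

#ifndef arch_remap
static inline void arch_remap(struct mm_struct *mm,
			      unsigned long old_start, unsigned long old_end,
			      unsigned long new_start, unsigned long new_end)
{
}
#define arch_remap arch_remap
#endif

#endif /* _LINUX_MM_ARCH_HOOKS_H */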
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..c37541c5f8ba
--- /dev/null
+++ b/arch/arc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ARC_MM_ARCH_HOOKS_H
13#define _ASM_ARC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 1f1b1cd112f3..31bb7dccb971 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file,
 	return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline int huge_pte_none(pte_t pte)
 {
 	return pte_none(pte);
diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..7056660c7cc4
--- /dev/null
+++ b/arch/arm/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ARM_MM_ARCH_HOOKS_H
13#define _ASM_ARM_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index c72412415093..fcafb521f14e 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -41,11 +41,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-	return 0;
-}
-
 int pmd_huge(pmd_t pmd)
 {
 	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
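This stub, and the identical ones removed from the other architectures in this series, can only go away because a single shared definition takes their place (mm/hugetlb.c is changed by this series but its body is not shown here). The usual shape of such a fallback, as an illustration:

/* Sketch only: a generic fallback guarded by the PMD-sharing option. */
#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;	/* nothing is shared, so there is nothing to unshare */
}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */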
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 5b7ca8ace95f..734c17e89e94 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file,
86 return 0; 86 return 0;
87} 87}
88 88
89static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
90{
91}
92
93static inline int huge_pte_none(pte_t pte) 89static inline int huge_pte_none(pte_t pte)
94{ 90{
95 return pte_none(pte); 91 return pte_none(pte);
diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..562b655f5ba9
--- /dev/null
+++ b/arch/arm64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H
13#define _ASM_ARM64_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2de9d2e59d96..cccc4af87a03 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -31,13 +31,6 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/pgalloc.h> 32#include <asm/pgalloc.h>
33 33
34#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
35int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
36{
37 return 0;
38}
39#endif
40
41int pmd_huge(pmd_t pmd) 34int pmd_huge(pmd_t pmd)
42{ 35{
43 return !(pmd_val(pmd) & PMD_TABLE_BIT); 36 return !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..145452ffbdad
--- /dev/null
+++ b/arch/avr32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H
13#define _ASM_AVR32_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */
diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..1c5211ec338f
--- /dev/null
+++ b/arch/blackfin/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H
13#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */
diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..bb3c4a6ce8e9
--- /dev/null
+++ b/arch/c6x/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_C6X_MM_ARCH_HOOKS_H
13#define _ASM_C6X_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */
diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..314f774db2b0
--- /dev/null
+++ b/arch/cris/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H
13#define _ASM_CRIS_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */
diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..51d13a870404
--- /dev/null
+++ b/arch/frv/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_FRV_MM_ARCH_HOOKS_H
13#define _ASM_FRV_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */
diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..05e8b939e416
--- /dev/null
+++ b/arch/hexagon/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H
13#define _ASM_HEXAGON_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index aa910054b8e7..ff1377bc02a6 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
20 REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); 20 REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
21} 21}
22 22
23static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
24{
25}
26
27static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 23static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
28 pte_t *ptep, pte_t pte) 24 pte_t *ptep, pte_t pte)
29{ 25{
diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..ab4b5c698322
--- /dev/null
+++ b/arch/ia64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_IA64_MM_ARCH_HOOKS_H
13#define _ASM_IA64_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 52b7604b5215..f50d4b3f501a 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
65 return pte; 65 return pte;
66} 66}
67 67
68int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
69{
70 return 0;
71}
72
73#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } 68#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
74 69
75/* 70/*
diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..6d60b4750f41
--- /dev/null
+++ b/arch/m32r/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_M32R_MM_ARCH_HOOKS_H
13#define _ASM_M32R_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */
diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..7e8709bc90ae
--- /dev/null
+++ b/arch/m68k/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_M68K_MM_ARCH_HOOKS_H
13#define _ASM_M68K_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index 14b23efd9b7a..eb5cdec94be0 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -134,20 +134,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
 		enum dma_data_direction direction)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		enum dma_data_direction direction)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 }
 
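The conversions in this series from an open-coded "sg++" walk to for_each_sg() matter because a scatterlist may be chained: the next entry is not always the next array element, and sg_next(), which for_each_sg() uses, is what follows the chain links. A minimal sketch of the pattern:

#include <linux/scatterlist.h>

/* Sketch: visit every entry of a possibly chained scatterlist. */
static void example_walk_sg(struct scatterlist *sglist, int nelems)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i) {
		void *vaddr = sg_virt(sg);
		unsigned int len = sg->length;

		/* per-entry cache maintenance would go here */
		(void)vaddr;
		(void)len;
	}
}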
diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h
index 471f481e67f3..f730b396d79b 100644
--- a/arch/metag/include/asm/hugetlb.h
+++ b/arch/metag/include/asm/hugetlb.h
@@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
14int prepare_hugepage_range(struct file *file, unsigned long addr, 14int prepare_hugepage_range(struct file *file, unsigned long addr,
15 unsigned long len); 15 unsigned long len);
16 16
17static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
18{
19}
20
21static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 17static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
22 unsigned long addr, unsigned long end, 18 unsigned long addr, unsigned long end,
23 unsigned long floor, 19 unsigned long floor,
diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b0072b2eb0de
--- /dev/null
+++ b/arch/metag/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_METAG_MM_ARCH_HOOKS_H
13#define _ASM_METAG_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 7ca80ac42ed5..53f0f6c47027 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
89 return pte; 89 return pte;
90} 90}
91 91
92int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
93{
94 return 0;
95}
96
97int pmd_huge(pmd_t pmd) 92int pmd_huge(pmd_t pmd)
98{ 93{
99 return pmd_page_shift(pmd) > PAGE_SHIFT; 94 return pmd_page_shift(pmd) > PAGE_SHIFT;
diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..5c4065911bda
--- /dev/null
+++ b/arch/microblaze/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
13#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index fe0d15d32660..4a5bb5453408 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file,
38 return 0; 38 return 0;
39} 39}
40 40
41static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
42{
43}
44
45static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 41static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
46 unsigned long addr, 42 unsigned long addr,
47 unsigned long end, 43 unsigned long end,
diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b5609fe8e475
--- /dev/null
+++ b/arch/mips/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H
13#define _ASM_MIPS_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 819af9d057a8..9d8106758142 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 }
 
 /*
- * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a
  * different prototype.
  */
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					unsigned long address, pmd_t *pmdp)
 {
 	pmd_t old = *pmdp;
 
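The __HAVE_ARCH_PMDP_HUGE_* renames across the architectures follow the usual asm-generic convention: the generic header only supplies a definition when the architecture has not claimed the symbol. A sketch of that convention (illustrative, not quoted from the include/asm-generic/pgtable.h changes in this series):

#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
					    unsigned long address,
					    pmd_t *pmdp)
{
	pmd_t pmd = *pmdp;

	pmd_clear(pmdp);	/* architecture hooks may add TLB work */
	return pmd;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */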
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 06e0f421b41b..74aa6f62468f 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
51 return (pte_t *) pmd; 51 return (pte_t *) pmd;
52} 52}
53 53
54int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
55{
56 return 0;
57}
58
59/* 54/*
60 * This function checks for proper alignment of input addr and len parameters. 55 * This function checks for proper alignment of input addr and len parameters.
61 */ 56 */
diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..e2029a652f4c
--- /dev/null
+++ b/arch/mn10300/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H
13#define _ASM_MN10300_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */
diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d7290dc68558
--- /dev/null
+++ b/arch/nios2/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H
13#define _ASM_NIOS2_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */
diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..6d33cb555fe1
--- /dev/null
+++ b/arch/openrisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H
13#define _ASM_OPENRISC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..654ec63b0ee9
--- /dev/null
+++ b/arch/parisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H
13#define _ASM_PARISC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index ff834fd67478..b9402c9b3454 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz
 static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
-	for (i = 0; i < nents; i++, sglist++ ) {
-		unsigned long vaddr = (unsigned long)sg_virt(sglist);
-		sg_dma_address(sglist) = (dma_addr_t) virt_to_phys(vaddr);
-		sg_dma_len(sglist) = sglist->length;
-		flush_kernel_dcache_range(vaddr, sglist->length);
+	for_each_sg(sglist, sg, nents, i) {
+		unsigned long vaddr = (unsigned long)sg_virt(sg);
+
+		sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
+		sg_dma_len(sg) = sg->length;
+		flush_kernel_dcache_range(vaddr, sg->length);
 	}
 	return nents;
 }
@@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n
 static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
@@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-	for (i = 0; i < nents; i++, sglist++ )
-		flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+	for_each_sg(sglist, sg, nents, i)
+		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 	return;
 }
 
@@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h
 static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-	for (i = 0; i < nents; i++, sglist++ )
-		flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+	for_each_sg(sglist, sg, nents, i)
+		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
 static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-	for (i = 0; i < nents; i++, sglist++ )
-		flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+	for_each_sg(sglist, sg, nents, i)
+		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
 struct hppa_dma_ops pcxl_dma_ops = {
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 1d53a65b4ec1..4bbd3c8c2888 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file,
112 return 0; 112 return 0;
113} 113}
114 114
115static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
116{
117}
118
119
120static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 115static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
121 pte_t *ptep, pte_t pte) 116 pte_t *ptep, pte_t pte)
122{ 117{
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..f2a2da895897
--- /dev/null
+++ b/arch/powerpc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,28 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
13#define _ASM_POWERPC_MM_ARCH_HOOKS_H
14
15static inline void arch_remap(struct mm_struct *mm,
16 unsigned long old_start, unsigned long old_end,
17 unsigned long new_start, unsigned long new_end)
18{
19 /*
20 * mremap() doesn't allow moving multiple vmas so we can limit the
21 * check to old_start == vdso_base.
22 */
23 if (old_start == mm->context.vdso_base)
24 mm->context.vdso_base = new_start;
25}
26#define arch_remap arch_remap
27
28#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
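arch_remap() exists so the core mremap() path can tell the architecture that a mapping has moved; on powerpc the only mapping it cares about is the VDSO. A hedged sketch of the expected call site (the surrounding function is illustrative; mm/mremap.c is part of this series but not reproduced here):

#include <linux/mm-arch-hooks.h>

/* Sketch: after a vma has been moved, let the architecture adjust its state. */
static void example_after_move(struct mm_struct *mm,
			       unsigned long old_addr, unsigned long new_addr,
			       unsigned long len)
{
	arch_remap(mm, old_addr, old_addr + len, new_addr, new_addr + len);
}

Defining arch_remap() and then "#define arch_remap arch_remap" lets generic code test for the hook with #ifdef and fall back to a no-op when an architecture provides nothing.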
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 3e5184210d9b..878c27771717 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -8,7 +8,6 @@
 #include <linux/spinlock.h>
 #include <asm/mmu.h>
 #include <asm/cputable.h>
-#include <asm-generic/mm_hooks.h>
 #include <asm/cputhreads.h>
 
 /*
@@ -127,5 +126,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+				 struct mm_struct *mm)
+{
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+			      struct vm_area_struct *vma,
+			      unsigned long start, unsigned long end)
+{
+	if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
+		mm->context.vdso_base = 0;
+}
+
+static inline void arch_bprm_mm_init(struct mm_struct *mm,
+				     struct vm_area_struct *vma)
+{
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index f890f7ce1593..3bb7488bd24b 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -569,13 +569,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 				  unsigned long address, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 				unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-			      pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
@@ -592,6 +588,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp);
 
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
+#define pmdp_collapse_flush pmdp_collapse_flush
+
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 				       pgtable_t pgtable);
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b41426c60ef6..5f8dcdaa2820 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct iommu_table *tbl;
 	struct scatterlist *sgl;
-	int ret, count = 0;
+	int ret, count;
 	size_t alloc_size = 0;
 
 	tbl = get_iommu_table_base(dev);
-	for (sgl = sglist; count < nelems; count++, sgl++)
+	for_each_sg(sglist, sgl, nelems, count)
 		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
 	if (vio_cmo_alloc(viodev, alloc_size)) {
@@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 		return ret;
 	}
 
-	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+	for_each_sg(sglist, sgl, ret, count)
 		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 	if (alloc_size)
 		vio_cmo_dealloc(viodev, alloc_size);
@@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
 	struct iommu_table *tbl;
 	struct scatterlist *sgl;
 	size_t alloc_size = 0;
-	int count = 0;
+	int count;
 
 	tbl = get_iommu_table_base(dev);
-	for (sgl = sglist; count < nelems; count++, sgl++)
+	for_each_sg(sglist, sgl, nelems, count)
 		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 
 	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 3385e3d0506e..38bd5d998c81 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
439} 439}
440#endif 440#endif
441 441
442int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
443{
444 return 0;
445}
446
447#ifdef CONFIG_PPC_FSL_BOOK3E 442#ifdef CONFIG_PPC_FSL_BOOK3E
448#define HUGEPD_FREELIST_SIZE \ 443#define HUGEPD_FREELIST_SIZE \
449 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 444 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 6bfadf1aa5cb..876232d64126 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -554,47 +554,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 	return old;
 }
 
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 		       pmd_t *pmdp)
 {
 	pmd_t pmd;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	if (pmd_trans_huge(*pmdp)) {
-		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
-	} else {
-		/*
-		 * khugepaged calls this for normal pmd
-		 */
-		pmd = *pmdp;
-		pmd_clear(pmdp);
-		/*
-		 * Wait for all pending hash_page to finish. This is needed
-		 * in case of subpage collapse. When we collapse normal pages
-		 * to hugepage, we first clear the pmd, then invalidate all
-		 * the PTE entries. The assumption here is that any low level
-		 * page fault will see a none pmd and take the slow path that
-		 * will wait on mmap_sem. But we could very well be in a
-		 * hash_page with local ptep pointer value. Such a hash page
-		 * can result in adding new HPTE entries for normal subpages.
-		 * That means we could be modifying the page content as we
-		 * copy them to a huge page. So wait for parallel hash_page
-		 * to finish before invalidating HPTE entries. We can do this
-		 * by sending an IPI to all the cpus and executing a dummy
-		 * function there.
-		 */
-		kick_all_cpus_sync();
-		/*
-		 * Now invalidate the hpte entries in the range
-		 * covered by pmd. This make sure we take a
-		 * fault and will find the pmd as none, which will
-		 * result in a major fault which takes mmap_sem and
-		 * hence wait for collapse to complete. Without this
-		 * the __collapse_huge_page_copy can result in copying
-		 * the old content.
-		 */
-		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-	}
+	VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+	pmd = *pmdp;
+	pmd_clear(pmdp);
+	/*
+	 * Wait for all pending hash_page to finish. This is needed
+	 * in case of subpage collapse. When we collapse normal pages
+	 * to hugepage, we first clear the pmd, then invalidate all
+	 * the PTE entries. The assumption here is that any low level
+	 * page fault will see a none pmd and take the slow path that
+	 * will wait on mmap_sem. But we could very well be in a
+	 * hash_page with local ptep pointer value. Such a hash page
+	 * can result in adding new HPTE entries for normal subpages.
+	 * That means we could be modifying the page content as we
+	 * copy them to a huge page. So wait for parallel hash_page
+	 * to finish before invalidating HPTE entries. We can do this
+	 * by sending an IPI to all the cpus and executing a dummy
+	 * function there.
+	 */
+	kick_all_cpus_sync();
+	/*
+	 * Now invalidate the hpte entries in the range
+	 * covered by pmd. This make sure we take a
+	 * fault and will find the pmd as none, which will
+	 * result in a major fault which takes mmap_sem and
+	 * hence wait for collapse to complete. Without this
+	 * the __collapse_huge_page_copy can result in copying
+	 * the old content.
+	 */
+	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
 	return pmd;
 }
 
@@ -817,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return;
 }
 
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 			 unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t old_pmd;
 	pgtable_t pgtable;
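The old pmdp_clear_flush() mixed two very different cases; after this series the transparent-hugepage case goes through pmdp_huge_get_and_clear()/pmdp_huge_clear_flush(), while the khugepaged "collapse a normal pmd" case gets its own pmdp_collapse_flush(), which is where the hash_page synchronisation above now lives. A sketch of the two call sites after the split (the caller names are illustrative, not taken from this series):

/* khugepaged: pmdp still points at a normal page-table page */
static pmd_t example_collapse(struct vm_area_struct *vma,
			      unsigned long haddr, pmd_t *pmdp)
{
	return pmdp_collapse_flush(vma, haddr, pmdp);
}

/* zap/split path: pmdp maps a transparent huge page */
static pmd_t example_zap_huge(struct vm_area_struct *vma,
			      unsigned long haddr, pmd_t *pmdp)
{
	return pmdp_huge_clear_flush(vma, haddr, pmdp);
}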
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 11eae5f55b70..dfb542ade6b1 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -35,7 +35,6 @@ static inline int prepare_hugepage_range(struct file *file,
35 return 0; 35 return 0;
36} 36}
37 37
38#define hugetlb_prefault_arch_hook(mm) do { } while (0)
39#define arch_clear_hugepage_flags(page) do { } while (0) 38#define arch_clear_hugepage_flags(page) do { } while (0)
40 39
41int arch_prepare_hugepage(struct page *page); 40int arch_prepare_hugepage(struct page *page);
diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..07680b2f3c59
--- /dev/null
+++ b/arch/s390/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_S390_MM_ARCH_HOOKS_H
13#define _ASM_S390_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_S390_MM_ARCH_HOOKS_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0bb2da79adf3..f66d82798a6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 	return pmd_young(pmd);
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address, pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 						 unsigned long address,
 						 pmd_t *pmdp, int full)
 {
 	pmd_t pmd = *pmdp;
 
@@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 					  unsigned long address, pmd_t *pmdp)
 {
-	return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
@@ -1548,6 +1548,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 	}
 }
 
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+					unsigned long address,
+					pmd_t *pmdp)
+{
+	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+}
+#define pmdp_collapse_flush pmdp_collapse_flush
+
 #define pfn_pmd(pfn, pgprot)	mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
 #define mk_pmd(page, pgprot)	pfn_pmd(page_to_pfn(page), (pgprot))
 
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d9f0dcfcae5e..7a75ad4594e3 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = {
 };
 
 #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid)		\
-	for (i = 0, __next_mem_range(&i, nid, &memblock.physmem,	\
+	for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE,		\
+			       &memblock.physmem,			\
 			       &oldmem_type, p_start,			\
 			       p_end, p_nid);				\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range(&i, nid, &memblock.physmem,		\
+	     __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
 			      &oldmem_type,				\
 			      p_start, p_end, p_nid))
 
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index e617e74b7be2..c3f8e3df92ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -193,11 +193,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
193 return (pte_t *) pmdp; 193 return (pte_t *) pmdp;
194} 194}
195 195
196int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
197{
198 return 0;
199}
200
201int pmd_huge(pmd_t pmd) 196int pmd_huge(pmd_t pmd)
202{ 197{
203 if (!MACHINE_HAS_HPAGE) 198 if (!MACHINE_HAS_HPAGE)
diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..5e38689f189a
--- /dev/null
+++ b/arch/score/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H
13#define _ASM_SCORE_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 699255d6d1c6..b788a9bc8918 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
26 return 0; 26 return 0;
27} 27}
28 28
29static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
30}
31
32static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 29static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
33 unsigned long addr, unsigned long end, 30 unsigned long addr, unsigned long end,
34 unsigned long floor, 31 unsigned long floor,
diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..18087298b728
--- /dev/null
+++ b/arch/sh/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_SH_MM_ARCH_HOOKS_H
13#define _ASM_SH_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_SH_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 534bc978af8a..6385f60209b6 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
62 return pte; 62 return pte;
63} 63}
64 64
65int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
66{
67 return 0;
68}
69
70int pmd_huge(pmd_t pmd) 65int pmd_huge(pmd_t pmd)
71{ 66{
72 return 0; 67 return 0;
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index e4cab465b81f..3130d7636312 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
11pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, 11pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
12 pte_t *ptep); 12 pte_t *ptep);
13 13
14static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
15{
16}
17
18static inline int is_hugepage_only_range(struct mm_struct *mm, 14static inline int is_hugepage_only_range(struct mm_struct *mm,
19 unsigned long addr, 15 unsigned long addr,
20 unsigned long len) { 16 unsigned long len) {
diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b89ba44c16f1
--- /dev/null
+++ b/arch/sparc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H
13#define _ASM_SPARC_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2a52c91d2c8a..131d36fcd07a 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -865,10 +865,10 @@ static inline unsigned long pud_pfn(pud_t pud)
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
 		   pte_t *ptep, pte_t orig, int fullmm);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long addr,
 					    pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 	set_pmd_at(mm, addr, pmdp, __pmd(0UL));
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 7d3ca30fcd15..1ae5eb1bb045 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp,
 	struct cookie_state state;
 	struct ldc_iommu *iommu;
 	int err;
+	struct scatterlist *s;
 
 	if (map_perm & ~LDC_MAP_ALL)
 		return -EINVAL;
@@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp,
 	state.pte_idx = (base - iommu->page_table);
 	state.nc = 0;
 
-	for (i = 0; i < num_sg; i++)
-		fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT,
-			     sg[i].offset, sg[i].length);
+	for_each_sg(sg, s, num_sg, i) {
+		fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT,
+			     s->offset, s->length);
+	}
 
 	return state.nc;
 }
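
The ldc_map_sg() hunk above replaces open-coded &sg[i] indexing with the for_each_sg() iterator, which advances via sg_next() and therefore also walks chained scatterlists correctly. A minimal sketch of the pattern, using a hypothetical helper that is not part of this patch:

#include <linux/scatterlist.h>

/* Hypothetical helper: total up the bytes described by a scatterlist.
 * for_each_sg() steps with sg_next(), so chained tables are handled;
 * plain &sgl[i] indexing assumes a single flat array. */
static unsigned long count_sg_bytes(struct scatterlist *sgl, int nents)
{
	struct scatterlist *sg;
	unsigned long total = 0;
	int i;

	for_each_sg(sgl, sg, nents, i)
		total += sg->length;

	return total;
}
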
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 4242eab12e10..131eaf4ad7f5 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
172 return pte; 172 return pte;
173} 173}
174 174
175int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
176{
177 return 0;
178}
179
180void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 175void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
181 pte_t *ptep, pte_t entry) 176 pte_t *ptep, pte_t entry)
182{ 177{
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c5d08b89a96c..4ac88b757514 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void)
 	phys_addr_t pa_start, pa_end;
 	u64 i;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+				&pa_end, NULL)
 		available = available + (pa_end - pa_start);
 
 	return available;
@@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram)
 	if (limit_ram >= avail_ram)
 		return;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+				&pa_end, NULL) {
 		phys_addr_t region_size = pa_end - pa_start;
 		phys_addr_t clip_start = pa_start;
 
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index 3257733003f8..1abd00c55236 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file,
40 return 0; 40 return 0;
41} 41}
42 42
43static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
44{
45}
46
47static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 43static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
48 unsigned long addr, unsigned long end, 44 unsigned long addr, unsigned long end,
49 unsigned long floor, 45 unsigned long floor,
diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d1709ea774f7
--- /dev/null
+++ b/arch/tile/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_TILE_MM_ARCH_HOOKS_H
13#define _ASM_TILE_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 95a4f19d16c5..2b05ccbebed9 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 }
 
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
 					    pmd_t *pmdp)
 {
 	return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
 }
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 8416240c322c..c034dc3fe2d4 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -160,11 +160,6 @@ int pud_huge(pud_t pud)
160 return !!(pud_val(pud) & _PAGE_HUGE_PAGE); 160 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
161} 161}
162 162
163int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
164{
165 return 0;
166}
167
168#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 163#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
169static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, 164static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
170 unsigned long addr, unsigned long len, 165 unsigned long addr, unsigned long len,
diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..a7c8b0dfdd4e
--- /dev/null
+++ b/arch/um/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_UM_MM_ARCH_HOOKS_H
13#define _ASM_UM_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_UM_MM_ARCH_HOOKS_H */
diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..4d79a850c509
--- /dev/null
+++ b/arch/unicore32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H
13#define _ASM_UNICORE32_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 68c05398bba9..dab7a3a750bf 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
26 return 0; 26 return 0;
27} 27}
28 28
29static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
30}
31
32static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 29static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
33 unsigned long addr, unsigned long end, 30 unsigned long addr, unsigned long end,
34 unsigned long floor, 31 unsigned long floor,
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..4e881a342236
--- /dev/null
+++ b/arch/x86/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_X86_MM_ARCH_HOOKS_H
13#define _ASM_X86_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2562e303405b..867da5bbb4a3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_RW;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pmd_t *pmdp)
 {
 	pmd_t pmd = native_pmdp_get_and_clear(pmdp);
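
The pmdp_get_and_clear() -> pmdp_huge_get_and_clear() renames (here and in the sparc, tile and asm-generic hunks) make it explicit that these helpers only ever operate on huge PMDs. A rough sketch of how a THP-style caller uses the renamed helper; the function below is hypothetical and only illustrates the calling convention:

/* Sketch, not from this series: atomically clear a huge PMD under the
 * page table lock, then flush the TLB for the huge-page range. */
static void demo_clear_huge_pmd(struct vm_area_struct *vma,
				unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
	flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
	/* 'old' would normally be handed on to the zap/split path. */
	(void)old;
}
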
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6..58118e207a69 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db2283..c8dda42cb6a3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void)
 		nr_pages += end_pfn - start_pfn;
 	}
 
-	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
 		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
 		if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 39ca113676fe..d3b95b89e9b2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1105,6 +1105,9 @@ void __init setup_arch(char **cmdline_p)
1105 memblock_set_current_limit(ISA_END_ADDRESS); 1105 memblock_set_current_limit(ISA_END_ADDRESS);
1106 memblock_x86_fill(); 1106 memblock_x86_fill();
1107 1107
1108 if (efi_enabled(EFI_BOOT))
1109 efi_find_mirror();
1110
1108 /* 1111 /*
1109 * The EFI specification says that boot service code won't be called 1112 * The EFI specification says that boot service code won't be called
1110 * after ExitBootServices(). This is, in fact, a lie. 1113 * after ExitBootServices(). This is, in fact, a lie.
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a..8340e45c891a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
 		phys_addr_t start, end;
 		u64 i;
 
-		for_each_free_mem_range(i, nid, &start, &end, NULL) {
+		for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
 			unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
 						    start_pfn, end_pfn);
 			unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 3b984c3aa1b0..c1c382c58c60 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
117 now->tv_nsec = 0; 117 now->tv_nsec = 0;
118} 118}
119 119
120void __init efi_find_mirror(void)
121{
122 void *p;
123 u64 mirror_size = 0, total_size = 0;
124
125 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
126 efi_memory_desc_t *md = p;
127 unsigned long long start = md->phys_addr;
128 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
129
130 total_size += size;
131 if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
132 memblock_mark_mirror(start, size);
133 mirror_size += size;
134 }
135 }
136 if (mirror_size)
137 pr_info("Memory: %lldM/%lldM mirrored memory\n",
138 mirror_size>>20, total_size>>20);
139}
140
120/* 141/*
121 * Tell the kernel about the EFI memory map. This might include 142 * Tell the kernel about the EFI memory map. This might include
122 * more than the max 128 entries that can fit in the e820 legacy 143 * more than the max 128 entries that can fit in the e820 legacy
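
efi_find_mirror() above only marks EFI_MEMORY_MORE_RELIABLE ranges in memblock; consumers then select ranges through the flags argument that the for_each_free_mem_range() iterators now take. A hedged sketch of how mirrored ranges could be inspected, assuming the MEMBLOCK_MIRROR flag introduced by the memblock side of this series:

#include <linux/memblock.h>

/* Sketch: walk only the free ranges that were marked as mirrored. */
static void __init dump_mirrored_ranges(void)
{
	phys_addr_t start, end;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_MIRROR,
				&start, &end, NULL)
		pr_info("mirrored free range: %pa-%pa\n", &start, &end);
}
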
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index ba78ccf651e7..1f5f6dc09736 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 }
 
 static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	   enum dma_data_direction direction)
 {
 	int i;
+	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
-	for (i = 0; i < nents; i++, sg++ ) {
+	for_each_sg(sglist, sg, nents, i) {
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
@@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 	consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction);
 }
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
 		    enum dma_data_direction dir)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		consistent_sync(sg_virt(sg), sg->length, dir);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		       enum dma_data_direction dir)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+		       int nelems, enum dma_data_direction dir)
 {
 	int i;
-	for (i = 0; i < nelems; i++, sg++)
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, nelems, i)
 		consistent_sync(sg_virt(sg), sg->length, dir);
 }
 static inline int
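
The xtensa dma_map_sg()/dma_sync_sg_*() conversion above follows the same for_each_sg() pattern as the sparc ldc.c hunk. From a driver's point of view the DMA API is unchanged; a generic usage sketch (hypothetical device code, not tied to xtensa):

#include <linux/dma-mapping.h>

/* Sketch: map a scatterlist for a device-bound transfer and unmap it. */
static int demo_map_for_device(struct device *dev, struct scatterlist *sgl,
			       int nents)
{
	int mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

	if (mapped == 0)
		return -EIO;	/* mapping failed */

	/* ... program the hardware with 'mapped' entries, not 'nents' ... */

	dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
	return 0;
}
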
diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d2e5cfd3dd02
--- /dev/null
+++ b/arch/xtensa/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
1/*
2 * Architecture specific mm hooks
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H
13#define _ASM_XTENSA_MM_ARCH_HOOKS_H
14
15#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index feafa172b155..2345ee7342d9 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -165,7 +165,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 		 * infrastructure. There is no real reason why the selected
 		 * task should have access to the memory reserves.
 		 */
-		mark_tsk_oom_victim(selected);
+		mark_oom_victim(selected);
 		send_sig(SIGKILL, selected, 0);
 		rem += selected_tasksize;
 	}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 9ffdfcf2ec6e..1c4791033b72 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,11 @@ static struct sysrq_key_op sysrq_term_op = {
353 353
354static void moom_callback(struct work_struct *ignored) 354static void moom_callback(struct work_struct *ignored)
355{ 355{
356 mutex_lock(&oom_lock);
356 if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), 357 if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
357 GFP_KERNEL, 0, NULL, true)) 358 GFP_KERNEL, 0, NULL, true))
358 pr_info("OOM request ignored because killer is disabled\n"); 359 pr_info("OOM request ignored because killer is disabled\n");
360 mutex_unlock(&oom_lock);
359} 361}
360 362
361static DECLARE_WORK(moom_work, moom_callback); 363static DECLARE_WORK(moom_work, moom_callback);
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index c4211a31612d..d88f36754bf7 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void)
 #ifdef CONFIG_FRONTSWAP
 	if (tmem_enabled && frontswap) {
 		char *s = "";
-		struct frontswap_ops *old_ops;
 
 		tmem_frontswap_poolid = -1;
-		old_ops = frontswap_register_ops(&tmem_frontswap_ops);
-		if (IS_ERR(old_ops) || old_ops) {
-			if (IS_ERR(old_ops))
-				return PTR_ERR(old_ops);
-			s = " (WARNING: frontswap_ops overridden)";
-		}
+		frontswap_register_ops(&tmem_frontswap_ops);
 		pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
 			s);
 	}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e65f9ffbb999..4d6a30e76168 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
  * config_item_init - initialize item.
  * @item: item in question.
  */
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
 {
 	kref_init(&item->ci_kref);
 	INIT_LIST_HEAD(&item->ci_entry);
 }
-EXPORT_SYMBOL(config_item_init);
 
 /**
  * config_item_set_name - Set the name of an item
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87724c1d7be6..0cf74df68617 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
130 goto out; 130 goto out;
131 131
132 ret = 0; 132 ret = 0;
133 hugetlb_prefault_arch_hook(vma->vm_mm);
134 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 133 if (vma->vm_flags & VM_WRITE && inode->i_size < len)
135 inode->i_size = len; 134 inode->i_size = len;
136out: 135out:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7bb487e663b4..2cd653670764 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 			}
 		}
 		err = add_to_page_cache_lru(*cached_page, mapping,
-				index, GFP_KERNEL);
+				index,
+				GFP_KERNEL & mapping_gfp_mask(mapping));
 		if (unlikely(err)) {
 			if (err == -EEXIST)
 				continue;
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index a44b14cbceeb..ab172e5f51d9 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 
 static inline void ntfs_free(void *addr)
 {
-	if (!is_vmalloc_addr(addr)) {
-		kfree(addr);
-		/* free_page((unsigned long)addr); */
-		return;
-	}
-	vfree(addr);
+	kvfree(addr);
 }
 
 #endif /* _LINUX_NTFS_MALLOC_H */
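
ntfs_free() above collapses the is_vmalloc_addr() branching into kvfree(), which accepts pointers from either kmalloc() or vmalloc(). A small sketch of the matching allocation side (hypothetical, only to show the pairing):

#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Sketch: size-dependent allocation whose result is freed with kvfree(). */
static void *demo_alloc(unsigned long size)
{
	if (size <= PAGE_SIZE)
		return kmalloc(size, GFP_NOFS);
	return vmalloc(size);		/* kvfree() handles both cases */
}
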
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2d7f76e52c37..5997c00a1515 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2925 struct ocfs2_path *right_path = NULL; 2925 struct ocfs2_path *right_path = NULL;
2926 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 2926 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2927 2927
2928 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2928 if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
2929 return 0;
2929 2930
2930 *empty_extent_path = NULL; 2931 *empty_extent_path = NULL;
2931 2932
@@ -4311,13 +4312,13 @@ out:
4311 return ret; 4312 return ret;
4312} 4313}
4313 4314
4314static enum ocfs2_contig_type 4315static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4315ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4316 struct ocfs2_path *path, 4316 struct ocfs2_path *path,
4317 struct ocfs2_extent_list *el, int index, 4317 struct ocfs2_extent_list *el, int index,
4318 struct ocfs2_extent_rec *split_rec) 4318 struct ocfs2_extent_rec *split_rec,
4319 struct ocfs2_merge_ctxt *ctxt)
4319{ 4320{
4320 int status; 4321 int status = 0;
4321 enum ocfs2_contig_type ret = CONTIG_NONE; 4322 enum ocfs2_contig_type ret = CONTIG_NONE;
4322 u32 left_cpos, right_cpos; 4323 u32 left_cpos, right_cpos;
4323 struct ocfs2_extent_rec *rec = NULL; 4324 struct ocfs2_extent_rec *rec = NULL;
@@ -4336,8 +4337,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4336 4337
4337 if (left_cpos != 0) { 4338 if (left_cpos != 0) {
4338 left_path = ocfs2_new_path_from_path(path); 4339 left_path = ocfs2_new_path_from_path(path);
4339 if (!left_path) 4340 if (!left_path) {
4341 status = -ENOMEM;
4342 mlog_errno(status);
4340 goto exit; 4343 goto exit;
4344 }
4341 4345
4342 status = ocfs2_find_path(et->et_ci, left_path, 4346 status = ocfs2_find_path(et->et_ci, left_path,
4343 left_cpos); 4347 left_cpos);
@@ -4392,8 +4396,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4392 goto free_left_path; 4396 goto free_left_path;
4393 4397
4394 right_path = ocfs2_new_path_from_path(path); 4398 right_path = ocfs2_new_path_from_path(path);
4395 if (!right_path) 4399 if (!right_path) {
4400 status = -ENOMEM;
4401 mlog_errno(status);
4396 goto free_left_path; 4402 goto free_left_path;
4403 }
4397 4404
4398 status = ocfs2_find_path(et->et_ci, right_path, right_cpos); 4405 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4399 if (status) 4406 if (status)
@@ -4433,7 +4440,10 @@ free_right_path:
4433free_left_path: 4440free_left_path:
4434 ocfs2_free_path(left_path); 4441 ocfs2_free_path(left_path);
4435exit: 4442exit:
4436 return ret; 4443 if (status == 0)
4444 ctxt->c_contig_type = ret;
4445
4446 return status;
4437} 4447}
4438 4448
4439static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, 4449static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -5039,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle,
5039 goto out; 5049 goto out;
5040 } 5050 }
5041 5051
5042 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, 5052 ret = ocfs2_figure_merge_contig_type(et, path, el,
5043 split_index, 5053 split_index,
5044 split_rec); 5054 split_rec,
5055 &ctxt);
5056 if (ret) {
5057 mlog_errno(ret);
5058 goto out;
5059 }
5045 5060
5046 /* 5061 /*
5047 * The core merge / split code wants to know how much room is 5062 * The core merge / split code wants to know how much room is
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f906a250da6a..1a35c6139656 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size; 525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0; 526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527 527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); 528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529 529
@@ -560,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
560 /* fill hole, allocate blocks can't be larger than the size 560 /* fill hole, allocate blocks can't be larger than the size
561 * of the hole */ 561 * of the hole */
562 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); 562 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
563 if (clusters_to_alloc > contig_blocks) 563 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
564 clusters_to_alloc = contig_blocks; 564 contig_blocks);
565 if (clusters_to_alloc > contig_clusters)
566 clusters_to_alloc = contig_clusters;
565 567
566 /* allocate extent and insert them into the extent tree */ 568 /* allocate extent and insert them into the extent tree */
567 ret = ocfs2_extend_allocation(inode, cpos, 569 ret = ocfs2_extend_allocation(inode, cpos,
@@ -619,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
619 /* this io's submitter should not have unlocked this before we could */ 621 /* this io's submitter should not have unlocked this before we could */
620 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 622 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
621 623
622 if (ocfs2_iocb_is_sem_locked(iocb))
623 ocfs2_iocb_clear_sem_locked(iocb);
624
625 if (ocfs2_iocb_is_unaligned_aio(iocb)) { 624 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
626 ocfs2_iocb_clear_unaligned_aio(iocb); 625 ocfs2_iocb_clear_unaligned_aio(iocb);
627 626
@@ -925,13 +924,23 @@ clean_orphan:
925 int update_isize = written > 0 ? 1 : 0; 924 int update_isize = written > 0 ? 1 : 0;
926 loff_t end = update_isize ? offset + written : 0; 925 loff_t end = update_isize ? offset + written : 0;
927 926
928 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, 927 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
928 if (tmp_ret < 0) {
929 ret = tmp_ret;
930 mlog_errno(ret);
931 goto out;
932 }
933
934 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
929 update_isize, end); 935 update_isize, end);
930 if (tmp_ret < 0) { 936 if (tmp_ret < 0) {
931 ret = tmp_ret; 937 ret = tmp_ret;
938 mlog_errno(ret);
932 goto out; 939 goto out;
933 } 940 }
934 941
942 ocfs2_inode_unlock(inode, 1);
943
935 tmp_ret = jbd2_journal_force_commit(journal); 944 tmp_ret = jbd2_journal_force_commit(journal);
936 if (tmp_ret < 0) { 945 if (tmp_ret < 0) {
937 ret = tmp_ret; 946 ret = tmp_ret;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index dd59599b022d..24e496d6bdcd 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
79enum ocfs2_iocb_lock_bits { 79enum ocfs2_iocb_lock_bits {
80 OCFS2_IOCB_RW_LOCK = 0, 80 OCFS2_IOCB_RW_LOCK = 0,
81 OCFS2_IOCB_RW_LOCK_LEVEL, 81 OCFS2_IOCB_RW_LOCK_LEVEL,
82 OCFS2_IOCB_SEM,
83 OCFS2_IOCB_UNALIGNED_IO, 82 OCFS2_IOCB_UNALIGNED_IO,
84 OCFS2_IOCB_NUM_LOCKS 83 OCFS2_IOCB_NUM_LOCKS
85}; 84};
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits {
88 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) 87 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
89#define ocfs2_iocb_rw_locked_level(iocb) \ 88#define ocfs2_iocb_rw_locked_level(iocb) \
90 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
91#define ocfs2_iocb_set_sem_locked(iocb) \
92 set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
93#define ocfs2_iocb_clear_sem_locked(iocb) \
94 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
95#define ocfs2_iocb_is_sem_locked(iocb) \
96 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
97 90
98#define ocfs2_iocb_set_unaligned_aio(iocb) \ 91#define ocfs2_iocb_set_unaligned_aio(iocb) \
99 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 92 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index af7598bff1b5..dfe162f5fd4c 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
64 return count; 64 return count;
65} 65}
66 66
67void __mlog_printk(const u64 *mask, const char *func, int line,
68 const char *fmt, ...)
69{
70 struct va_format vaf;
71 va_list args;
72 const char *level;
73 const char *prefix = "";
74
75 if (!__mlog_test_u64(*mask, mlog_and_bits) ||
76 __mlog_test_u64(*mask, mlog_not_bits))
77 return;
78
79 if (*mask & ML_ERROR) {
80 level = KERN_ERR;
81 prefix = "ERROR: ";
82 } else if (*mask & ML_NOTICE) {
83 level = KERN_NOTICE;
84 } else {
85 level = KERN_INFO;
86 }
87
88 va_start(args, fmt);
89
90 vaf.fmt = fmt;
91 vaf.va = &args;
92
93 printk("%s(%s,%u,%u):%s:%d %s%pV",
94 level, current->comm, task_pid_nr(current),
95 raw_smp_processor_id(), func, line, prefix, &vaf);
96
97 va_end(args);
98}
99EXPORT_SYMBOL_GPL(__mlog_printk);
100
67struct mlog_attribute { 101struct mlog_attribute {
68 struct attribute attr; 102 struct attribute attr;
69 u64 mask; 103 u64 mask;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7fdc25a4d8c0..308ea0eb35fd 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
162 162
163#endif 163#endif
164 164
165/* 165__printf(4, 5)
166 * smp_processor_id() "helpfully" screams when called outside preemptible 166void __mlog_printk(const u64 *m, const char *func, int line,
167 * regions in current kernels. sles doesn't have the variants that don't 167 const char *fmt, ...);
168 * scream. just do this instead of trying to guess which we're building
169 * against.. *sigh*.
170 */
171#define __mlog_cpu_guess ({ \
172 unsigned long _cpu = get_cpu(); \
173 put_cpu(); \
174 _cpu; \
175})
176 168
177/* In the following two macros, the whitespace after the ',' just 169/*
178 * before ##args is intentional. Otherwise, gcc 2.95 will eat the 170 * Testing before the __mlog_printk call lets the compiler eliminate the
179 * previous token if args expands to nothing. 171 * call completely when (m & ML_ALLOWED_BITS) is 0.
180 */ 172 */
181#define __mlog_printk(level, fmt, args...) \ 173#define mlog(mask, fmt, ...) \
182 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ 174do { \
183 task_pid_nr(current), __mlog_cpu_guess, \ 175 u64 _m = MLOG_MASK_PREFIX | (mask); \
184 __PRETTY_FUNCTION__, __LINE__ , ##args) 176 if (_m & ML_ALLOWED_BITS) \
185 177 __mlog_printk(&_m, __func__, __LINE__, fmt, \
186#define mlog(mask, fmt, args...) do { \ 178 ##__VA_ARGS__); \
187 u64 __m = MLOG_MASK_PREFIX | (mask); \
188 if ((__m & ML_ALLOWED_BITS) && \
189 __mlog_test_u64(__m, mlog_and_bits) && \
190 !__mlog_test_u64(__m, mlog_not_bits)) { \
191 if (__m & ML_ERROR) \
192 __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
193 else if (__m & ML_NOTICE) \
194 __mlog_printk(KERN_NOTICE, fmt , ##args); \
195 else __mlog_printk(KERN_INFO, fmt , ##args); \
196 } \
197} while (0) 179} while (0)
198 180
199#define mlog_errno(st) ({ \ 181#define mlog_errno(st) ({ \
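
The masklog rework above moves the formatting into __mlog_printk() and relies on the %pV printk extension, so the cheap mask test stays in the macro while the vararg handling lives in one out-of-line function. A hedged sketch of the %pV pattern itself (generic, not ocfs2-specific):

#include <linux/printk.h>

/* Sketch: forward a caller's format string and arguments via %pV. */
__printf(1, 2)
static void demo_log(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_INFO "demo: %pV", &vaf);
	va_end(args);
}
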
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 56c403a563bc..2d0acd6678fe 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2204,7 +2204,7 @@ out:
2204 kfree(o2net_hand); 2204 kfree(o2net_hand);
2205 kfree(o2net_keep_req); 2205 kfree(o2net_keep_req);
2206 kfree(o2net_keep_resp); 2206 kfree(o2net_keep_resp);
2207 2207 o2net_debugfs_exit();
2208 o2quo_exit(); 2208 o2quo_exit();
2209 return -ENOMEM; 2209 return -ENOMEM;
2210} 2210}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ccd4dcfc3645..02878a83f0b4 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle,
1617 struct ocfs2_dir_entry *de, *de1; 1617 struct ocfs2_dir_entry *de, *de1;
1618 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1618 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1619 struct super_block *sb = dir->i_sb; 1619 struct super_block *sb = dir->i_sb;
1620 int retval, status; 1620 int retval;
1621 unsigned int size = sb->s_blocksize; 1621 unsigned int size = sb->s_blocksize;
1622 struct buffer_head *insert_bh = lookup->dl_leaf_bh; 1622 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1623 char *data_start = insert_bh->b_data; 1623 char *data_start = insert_bh->b_data;
@@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle,
1695 } 1695 }
1696 1696
1697 if (insert_bh == parent_fe_bh) 1697 if (insert_bh == parent_fe_bh)
1698 status = ocfs2_journal_access_di(handle, 1698 retval = ocfs2_journal_access_di(handle,
1699 INODE_CACHE(dir), 1699 INODE_CACHE(dir),
1700 insert_bh, 1700 insert_bh,
1701 OCFS2_JOURNAL_ACCESS_WRITE); 1701 OCFS2_JOURNAL_ACCESS_WRITE);
1702 else { 1702 else {
1703 status = ocfs2_journal_access_db(handle, 1703 retval = ocfs2_journal_access_db(handle,
1704 INODE_CACHE(dir), 1704 INODE_CACHE(dir),
1705 insert_bh, 1705 insert_bh,
1706 OCFS2_JOURNAL_ACCESS_WRITE); 1706 OCFS2_JOURNAL_ACCESS_WRITE);
1707 1707
1708 if (ocfs2_dir_indexed(dir)) { 1708 if (!retval && ocfs2_dir_indexed(dir))
1709 status = ocfs2_dx_dir_insert(dir, 1709 retval = ocfs2_dx_dir_insert(dir,
1710 handle, 1710 handle,
1711 lookup); 1711 lookup);
1712 if (status) { 1712 }
1713 mlog_errno(status); 1713
1714 goto bail; 1714 if (retval) {
1715 } 1715 mlog_errno(retval);
1716 } 1716 goto bail;
1717 } 1717 }
1718 1718
1719 /* By now the buffer is marked for journaling */ 1719 /* By now the buffer is marked for journaling */
@@ -3543,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size)
3543{ 3543{
3544 struct ocfs2_dx_entry *entry1 = a; 3544 struct ocfs2_dx_entry *entry1 = a;
3545 struct ocfs2_dx_entry *entry2 = b; 3545 struct ocfs2_dx_entry *entry2 = b;
3546 struct ocfs2_dx_entry tmp;
3547 3546
3548 BUG_ON(size != sizeof(*entry1)); 3547 BUG_ON(size != sizeof(*entry1));
3549 3548
3550 tmp = *entry1; 3549 swap(*entry1, *entry2);
3551 *entry1 = *entry2;
3552 *entry2 = tmp;
3553} 3550}
3554 3551
3555static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) 3552static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
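
dx_leaf_sort_swap() above (like the refcounttree.c and namei.c hunks later in this series) replaces the open-coded three-assignment exchange with the swap() helper from linux/kernel.h. A minimal sketch of a sort() swap callback written this way, over a hypothetical array of ints:

#include <linux/kernel.h>	/* swap() */
#include <linux/sort.h>

/* Sketch: swap callback for sort() over an int array. */
static void demo_swap_ints(void *a, void *b, int size)
{
	swap(*(int *)a, *(int *)b);
}
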
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fae17c640df3..e88ccf8c83ff 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1014 1014
1015/* will exit holding res->spinlock, but may drop in function */ 1015/* will exit holding res->spinlock, but may drop in function */
1016void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); 1016void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
1017void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
1018 1017
1019/* will exit holding res->spinlock, but may drop in function */ 1018/* will exit holding res->spinlock, but may drop in function */
1020static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) 1019static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd909..fbfadb289e62 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2250,7 +2250,7 @@ out:
2250static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2250static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2251 struct iov_iter *from) 2251 struct iov_iter *from)
2252{ 2252{
2253 int direct_io, appending, rw_level, have_alloc_sem = 0; 2253 int direct_io, appending, rw_level;
2254 int can_do_direct, has_refcount = 0; 2254 int can_do_direct, has_refcount = 0;
2255 ssize_t written = 0; 2255 ssize_t written = 0;
2256 ssize_t ret; 2256 ssize_t ret;
@@ -2279,16 +2279,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2279 2279
2280 mutex_lock(&inode->i_mutex); 2280 mutex_lock(&inode->i_mutex);
2281 2281
2282 ocfs2_iocb_clear_sem_locked(iocb);
2283
2284relock: 2282relock:
2285 /* to match setattr's i_mutex -> rw_lock ordering */
2286 if (direct_io) {
2287 have_alloc_sem = 1;
2288 /* communicate with ocfs2_dio_end_io */
2289 ocfs2_iocb_set_sem_locked(iocb);
2290 }
2291
2292 /* 2283 /*
2293 * Concurrent O_DIRECT writes are allowed with 2284 * Concurrent O_DIRECT writes are allowed with
2294 * mount_option "coherency=buffered". 2285 * mount_option "coherency=buffered".
@@ -2298,7 +2289,7 @@ relock:
2298 ret = ocfs2_rw_lock(inode, rw_level); 2289 ret = ocfs2_rw_lock(inode, rw_level);
2299 if (ret < 0) { 2290 if (ret < 0) {
2300 mlog_errno(ret); 2291 mlog_errno(ret);
2301 goto out_sems; 2292 goto out_mutex;
2302 } 2293 }
2303 2294
2304 /* 2295 /*
@@ -2347,7 +2338,6 @@ relock:
2347 if (direct_io && !can_do_direct) { 2338 if (direct_io && !can_do_direct) {
2348 ocfs2_rw_unlock(inode, rw_level); 2339 ocfs2_rw_unlock(inode, rw_level);
2349 2340
2350 have_alloc_sem = 0;
2351 rw_level = -1; 2341 rw_level = -1;
2352 2342
2353 direct_io = 0; 2343 direct_io = 0;
@@ -2416,7 +2406,6 @@ no_sync:
2416 */ 2406 */
2417 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2407 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2418 rw_level = -1; 2408 rw_level = -1;
2419 have_alloc_sem = 0;
2420 unaligned_dio = 0; 2409 unaligned_dio = 0;
2421 } 2410 }
2422 2411
@@ -2429,10 +2418,7 @@ out:
2429 if (rw_level != -1) 2418 if (rw_level != -1)
2430 ocfs2_rw_unlock(inode, rw_level); 2419 ocfs2_rw_unlock(inode, rw_level);
2431 2420
2432out_sems: 2421out_mutex:
2433 if (have_alloc_sem)
2434 ocfs2_iocb_clear_sem_locked(iocb);
2435
2436 mutex_unlock(&inode->i_mutex); 2422 mutex_unlock(&inode->i_mutex);
2437 2423
2438 if (written) 2424 if (written)
@@ -2473,7 +2459,7 @@ bail:
2473static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, 2459static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2474 struct iov_iter *to) 2460 struct iov_iter *to)
2475{ 2461{
2476 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2462 int ret = 0, rw_level = -1, lock_level = 0;
2477 struct file *filp = iocb->ki_filp; 2463 struct file *filp = iocb->ki_filp;
2478 struct inode *inode = file_inode(filp); 2464 struct inode *inode = file_inode(filp);
2479 2465
@@ -2490,16 +2476,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2490 goto bail; 2476 goto bail;
2491 } 2477 }
2492 2478
2493 ocfs2_iocb_clear_sem_locked(iocb);
2494
2495 /* 2479 /*
2496 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2480 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2497 * need locks to protect pending reads from racing with truncate. 2481 * need locks to protect pending reads from racing with truncate.
2498 */ 2482 */
2499 if (iocb->ki_flags & IOCB_DIRECT) { 2483 if (iocb->ki_flags & IOCB_DIRECT) {
2500 have_alloc_sem = 1;
2501 ocfs2_iocb_set_sem_locked(iocb);
2502
2503 ret = ocfs2_rw_lock(inode, 0); 2484 ret = ocfs2_rw_lock(inode, 0);
2504 if (ret < 0) { 2485 if (ret < 0) {
2505 mlog_errno(ret); 2486 mlog_errno(ret);
@@ -2535,13 +2516,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2535 /* see ocfs2_file_write_iter */ 2516 /* see ocfs2_file_write_iter */
2536 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2517 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2537 rw_level = -1; 2518 rw_level = -1;
2538 have_alloc_sem = 0;
2539 } 2519 }
2540 2520
2541bail: 2521bail:
2542 if (have_alloc_sem)
2543 ocfs2_iocb_clear_sem_locked(iocb);
2544
2545 if (rw_level != -1) 2522 if (rw_level != -1)
2546 ocfs2_rw_unlock(inode, rw_level); 2523 ocfs2_rw_unlock(inode, rw_level);
2547 2524
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff531928269e..7c099f7032fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -108,7 +108,7 @@ struct ocfs2_replay_map {
108 unsigned char rm_replay_slots[0]; 108 unsigned char rm_replay_slots[0];
109}; 109};
110 110
111void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) 111static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
112{ 112{
113 if (!osb->replay_map) 113 if (!osb->replay_map)
114 return; 114 return;
@@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
153 return 0; 153 return 0;
154} 154}
155 155
156void ocfs2_queue_replay_slots(struct ocfs2_super *osb, 156static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
157 enum ocfs2_orphan_reco_type orphan_reco_type) 157 enum ocfs2_orphan_reco_type orphan_reco_type)
158{ 158{
159 struct ocfs2_replay_map *replay_map = osb->replay_map; 159 struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
173 replay_map->rm_state = REPLAY_DONE; 173 replay_map->rm_state = REPLAY_DONE;
174} 174}
175 175
176void ocfs2_free_replay_slots(struct ocfs2_super *osb) 176static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
177{ 177{
178 struct ocfs2_replay_map *replay_map = osb->replay_map; 178 struct ocfs2_replay_map *replay_map = osb->replay_map;
179 179
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
571 (unsigned long)bh, 571 (unsigned long)bh,
572 (unsigned long long)bh->b_blocknr); 572 (unsigned long long)bh->b_blocknr);
573 573
574 /* We aren't guaranteed to have the superblock here - but if we 574 ocfs2_error(bh->b_bdev->bd_super,
575 * don't, it'll just crash. */
576 ocfs2_error(bh->b_assoc_map->host->i_sb,
577 "JBD2 has aborted our journal, ocfs2 cannot continue\n"); 575 "JBD2 has aborted our journal, ocfs2 cannot continue\n");
578} 576}
579 577
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
775 trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); 773 trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
776 774
777 status = jbd2_journal_dirty_metadata(handle, bh); 775 status = jbd2_journal_dirty_metadata(handle, bh);
778 BUG_ON(status); 776 if (status) {
777 mlog_errno(status);
778 if (!is_handle_aborted(handle)) {
779 journal_t *journal = handle->h_transaction->t_journal;
780 struct super_block *sb = bh->b_bdev->bd_super;
781
782 mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
783 "Aborting transaction and journal.\n");
784 handle->h_err = status;
785 jbd2_journal_abort_handle(handle);
786 jbd2_journal_abort(journal, status);
787 ocfs2_abort(sb, "Journal already aborted.\n");
788 }
789 }
779} 790}
780 791
781#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 792#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
1884 * hasn't happened. The node queues a scan and increments the 1895 * hasn't happened. The node queues a scan and increments the
1885 * sequence number in the LVB. 1896 * sequence number in the LVB.
1886 */ 1897 */
1887void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) 1898static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888{ 1899{
1889 struct ocfs2_orphan_scan *os; 1900 struct ocfs2_orphan_scan *os;
1890 int status, i; 1901 int status, i;
@@ -1933,7 +1944,7 @@ out:
1933} 1944}
1934 1945
1935/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ 1946/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
1936void ocfs2_orphan_scan_work(struct work_struct *work) 1947static void ocfs2_orphan_scan_work(struct work_struct *work)
1937{ 1948{
1938 struct ocfs2_orphan_scan *os; 1949 struct ocfs2_orphan_scan *os;
1939 struct ocfs2_super *osb; 1950 struct ocfs2_super *osb;
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2137 struct inode *inode = NULL; 2148 struct inode *inode = NULL;
2138 struct inode *iter; 2149 struct inode *iter;
2139 struct ocfs2_inode_info *oi; 2150 struct ocfs2_inode_info *oi;
2151 struct buffer_head *di_bh = NULL;
2152 struct ocfs2_dinode *di = NULL;
2140 2153
2141 trace_ocfs2_recover_orphans(slot); 2154 trace_ocfs2_recover_orphans(slot);
2142 2155
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2157 iter = oi->ip_next_orphan; 2170 iter = oi->ip_next_orphan;
2158 oi->ip_next_orphan = NULL; 2171 oi->ip_next_orphan = NULL;
2159 2172
2173 ret = ocfs2_rw_lock(inode, 1);
2174 if (ret < 0) {
2175 mlog_errno(ret);
2176 goto next;
2177 }
2160 /* 2178 /*
2161 * We need to take and drop the inode lock to 2179 * We need to take and drop the inode lock to
2162 * force read inode from disk. 2180 * force read inode from disk.
2163 */ 2181 */
2164 ret = ocfs2_inode_lock(inode, NULL, 0); 2182 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2165 if (ret) { 2183 if (ret) {
2166 mlog_errno(ret); 2184 mlog_errno(ret);
2167 goto next; 2185 goto unlock_rw;
2168 } 2186 }
2169 ocfs2_inode_unlock(inode, 0); 2187
2188 di = (struct ocfs2_dinode *)di_bh->b_data;
2170 2189
2171 if (inode->i_nlink == 0) { 2190 if (inode->i_nlink == 0) {
2172 spin_lock(&oi->ip_lock); 2191 spin_lock(&oi->ip_lock);
@@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2174 * ocfs2_delete_inode. */ 2193 * ocfs2_delete_inode. */
2175 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 2194 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2176 spin_unlock(&oi->ip_lock); 2195 spin_unlock(&oi->ip_lock);
2177 } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { 2196 } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
2178 struct buffer_head *di_bh = NULL; 2197 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2179
2180 ret = ocfs2_rw_lock(inode, 1);
2181 if (ret) {
2182 mlog_errno(ret);
2183 goto next;
2184 }
2185
2186 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2187 if (ret < 0) {
2188 ocfs2_rw_unlock(inode, 1);
2189 mlog_errno(ret);
2190 goto next;
2191 }
2192
2193 ret = ocfs2_truncate_file(inode, di_bh, 2198 ret = ocfs2_truncate_file(inode, di_bh,
2194 i_size_read(inode)); 2199 i_size_read(inode));
2195 ocfs2_inode_unlock(inode, 1);
2196 ocfs2_rw_unlock(inode, 1);
2197 brelse(di_bh);
2198 if (ret < 0) { 2200 if (ret < 0) {
2199 if (ret != -ENOSPC) 2201 if (ret != -ENOSPC)
2200 mlog_errno(ret); 2202 mlog_errno(ret);
2201 goto next; 2203 goto unlock_inode;
2202 } 2204 }
2203 2205
2204 ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); 2206 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2205 if (ret) 2207 if (ret)
2206 mlog_errno(ret); 2208 mlog_errno(ret);
2207 2209
2208 wake_up(&OCFS2_I(inode)->append_dio_wq); 2210 wake_up(&OCFS2_I(inode)->append_dio_wq);
2209 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ 2211 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2210 2212unlock_inode:
2213 ocfs2_inode_unlock(inode, 1);
2214unlock_rw:
2215 ocfs2_rw_unlock(inode, 1);
2211next: 2216next:
2212 iput(inode); 2217 iput(inode);
2213 2218 brelse(di_bh);
2219 di_bh = NULL;
2214 inode = iter; 2220 inode = iter;
2215 } 2221 }
2216 2222
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 176fe6afd94e..6e6abb93fda5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1116 int inode1_is_ancestor, inode2_is_ancestor; 1116 int inode1_is_ancestor, inode2_is_ancestor;
1117 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); 1117 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
1118 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); 1118 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
1119 struct buffer_head **tmpbh;
1120 struct inode *tmpinode;
1121 1119
1122 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, 1120 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
1123 (unsigned long long)oi2->ip_blkno); 1121 (unsigned long long)oi2->ip_blkno);
@@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1148 (oi1->ip_blkno < oi2->ip_blkno && 1146 (oi1->ip_blkno < oi2->ip_blkno &&
1149 inode2_is_ancestor == 0)) { 1147 inode2_is_ancestor == 0)) {
1150 /* switch id1 and id2 around */ 1148 /* switch id1 and id2 around */
1151 tmpbh = bh2; 1149 swap(bh2, bh1);
1152 bh2 = bh1; 1150 swap(inode2, inode1);
1153 bh1 = tmpbh;
1154
1155 tmpinode = inode2;
1156 inode2 = inode1;
1157 inode1 = tmpinode;
1158 } 1151 }
1159 /* lock id2 */ 1152 /* lock id2 */
1160 status = ocfs2_inode_lock_nested(inode2, bh2, 1, 1153 status = ocfs2_inode_lock_nested(inode2, bh2, 1,
@@ -2670,30 +2663,22 @@ bail:
2670} 2663}
2671 2664
2672int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, 2665int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2673 struct inode *inode, int update_isize, 2666 struct inode *inode, struct buffer_head *di_bh,
2674 loff_t end) 2667 int update_isize, loff_t end)
2675{ 2668{
2676 struct inode *orphan_dir_inode = NULL; 2669 struct inode *orphan_dir_inode = NULL;
2677 struct buffer_head *orphan_dir_bh = NULL; 2670 struct buffer_head *orphan_dir_bh = NULL;
2678 struct buffer_head *di_bh = NULL; 2671 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2679 struct ocfs2_dinode *di = NULL;
2680 handle_t *handle = NULL; 2672 handle_t *handle = NULL;
2681 int status = 0; 2673 int status = 0;
2682 2674
2683 status = ocfs2_inode_lock(inode, &di_bh, 1);
2684 if (status < 0) {
2685 mlog_errno(status);
2686 goto bail;
2687 }
2688 di = (struct ocfs2_dinode *) di_bh->b_data;
2689
2690 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 2675 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2691 ORPHAN_DIR_SYSTEM_INODE, 2676 ORPHAN_DIR_SYSTEM_INODE,
2692 le16_to_cpu(di->i_dio_orphaned_slot)); 2677 le16_to_cpu(di->i_dio_orphaned_slot));
2693 if (!orphan_dir_inode) { 2678 if (!orphan_dir_inode) {
2694 status = -ENOENT; 2679 status = -ENOENT;
2695 mlog_errno(status); 2680 mlog_errno(status);
2696 goto bail_unlock_inode; 2681 goto bail;
2697 } 2682 }
2698 2683
2699 mutex_lock(&orphan_dir_inode->i_mutex); 2684 mutex_lock(&orphan_dir_inode->i_mutex);
@@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2702 mutex_unlock(&orphan_dir_inode->i_mutex); 2687 mutex_unlock(&orphan_dir_inode->i_mutex);
2703 iput(orphan_dir_inode); 2688 iput(orphan_dir_inode);
2704 mlog_errno(status); 2689 mlog_errno(status);
2705 goto bail_unlock_inode; 2690 goto bail;
2706 } 2691 }
2707 2692
2708 handle = ocfs2_start_trans(osb, 2693 handle = ocfs2_start_trans(osb,
@@ -2749,10 +2734,6 @@ bail_unlock_orphan:
2749 brelse(orphan_dir_bh); 2734 brelse(orphan_dir_bh);
2750 iput(orphan_dir_inode); 2735 iput(orphan_dir_inode);
2751 2736
2752bail_unlock_inode:
2753 ocfs2_inode_unlock(inode, 1);
2754 brelse(di_bh);
2755
2756bail: 2737bail:
2757 return status; 2738 return status;
2758} 2739}
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 5ddecce172fa..e173329eb830 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, 42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
43 struct inode *inode); 43 struct inode *inode);
44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, 44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
45 struct inode *inode, int update_isize, 45 struct inode *inode, struct buffer_head *di_bh,
46 loff_t end); 46 int update_isize, loff_t end);
47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
48 struct inode *new_inode, 48 struct inode *new_inode,
49 struct dentry *new_dentry); 49 struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 460c6c37e683..690ddc60189b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
717 return (u64)clusters << c_to_b_bits; 717 return (u64)clusters << c_to_b_bits;
718} 718}
719 719
720static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
721 u64 blocks)
722{
723 int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
724 sb->s_blocksize_bits;
725
726 blocks += (1 << b_to_c_bits) - 1;
727 return (u32)(blocks >> b_to_c_bits);
728}
729
720static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, 730static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
721 u64 blocks) 731 u64 blocks)
722{ 732{
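
ocfs2_clusters_for_blocks() added above rounds up, unlike ocfs2_blocks_to_clusters() below it, which truncates. A worked example with illustrative numbers: for 4 KiB blocks and a 32 KiB cluster size, b_to_c_bits is 3, so 9 blocks become (9 + 7) >> 3 = 2 clusters, while the truncating conversion would give 1. A standalone sketch of the same arithmetic:

/* Sketch of the round-up conversion with illustrative parameters. */
static inline u32 demo_blocks_to_clusters_round_up(u64 blocks, int b_to_c_bits)
{
	return (u32)((blocks + (1ULL << b_to_c_bits) - 1) >> b_to_c_bits);
}
/* demo_blocks_to_clusters_round_up(9, 3) == 2; plain (9 >> 3) would be 1. */
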
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d8c6af101f3f..b69dd14c0b9b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1406 1406
1407static void swap_refcount_rec(void *a, void *b, int size) 1407static void swap_refcount_rec(void *a, void *b, int size)
1408{ 1408{
1409 struct ocfs2_refcount_rec *l = a, *r = b, tmp; 1409 struct ocfs2_refcount_rec *l = a, *r = b;
1410 1410
1411 tmp = *l; 1411 swap(*l, *r);
1412 *l = *r;
1413 *r = tmp;
1414} 1412}
1415 1413
1416/* 1414/*
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03bfbf3d27d..889f3796a0d7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7271,7 +7271,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7271 name, value, size, flags); 7271 name, value, size, flags);
7272} 7272}
7273 7273
7274int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, 7274static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
7275 void *fs_info) 7275 void *fs_info)
7276{ 7276{
7277 const struct xattr *xattr; 7277 const struct xattr *xattr;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd02a9ebfc30..3f57dac31ba6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk)
126{ 126{
127 unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; 127 unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
128 128
129 /*
130 * Parked tasks do not run; they sit in __kthread_parkme().
131 * Without this check, we would report them as running, which is
132 * clearly wrong, so we report them as sleeping instead.
133 */
134 if (tsk->state == TASK_PARKED)
135 state = TASK_INTERRUPTIBLE;
136
129 BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); 137 BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
130 138
131 return task_state_array[fls(state)]; 139 return task_state_array[fls(state)];
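
The get_task_state() hunk above matters because the reported state indexes task_state_array via fls(): a parked task's state bit is outside TASK_REPORT, so the masked value is 0 and slot 0 ("R (running)") would be chosen. A standalone sketch of that indexing, with made-up bit values rather than the kernel's definitions:

#include <stdio.h>

static const char *task_state_array[] = { "R (running)", "S (sleeping)", "D (disk sleep)" };

static int fls(unsigned int x)			/* last set bit, 1-based; 0 when x == 0 */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int report_mask = 0x3;		/* assumed: only the first two sleep states are reportable */
	unsigned int parked	 = 0x200;	/* assumed stand-in for TASK_PARKED */

	printf("%s\n", task_state_array[fls(parked & report_mask)]);	/* "R (running)" - the old report */
	printf("%s\n", task_state_array[fls(0x1)]);			/* "S (sleeping)" - the new report */
	return 0;
}
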
diff --git a/fs/splice.c b/fs/splice.c
index 4f355a1c1a9e..5fc1e50a7f30 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
360 break; 360 break;
361 361
362 error = add_to_page_cache_lru(page, mapping, index, 362 error = add_to_page_cache_lru(page, mapping, index,
363 GFP_KERNEL); 363 GFP_KERNEL & mapping_gfp_mask(mapping));
364 if (unlikely(error)) { 364 if (unlikely(error)) {
365 page_cache_release(page); 365 page_cache_release(page);
366 if (error == -EEXIST) 366 if (error == -EEXIST)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bd910ceaccfa..29c57b2cb344 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
96} 96}
97#endif 97#endif
98 98
99#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR 99#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
100#ifdef CONFIG_TRANSPARENT_HUGEPAGE 100#ifdef CONFIG_TRANSPARENT_HUGEPAGE
101static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, 101static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
102 unsigned long address, 102 unsigned long address,
103 pmd_t *pmdp) 103 pmd_t *pmdp)
104{ 104{
105 pmd_t pmd = *pmdp; 105 pmd_t pmd = *pmdp;
106 pmd_clear(pmdp); 106 pmd_clear(pmdp);
@@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
109#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 109#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
110#endif 110#endif
111 111
112#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL 112#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
113#ifdef CONFIG_TRANSPARENT_HUGEPAGE 113#ifdef CONFIG_TRANSPARENT_HUGEPAGE
114static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, 114static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
115 unsigned long address, pmd_t *pmdp, 115 unsigned long address, pmd_t *pmdp,
116 int full) 116 int full)
117{ 117{
118 return pmdp_get_and_clear(mm, address, pmdp); 118 return pmdp_huge_get_and_clear(mm, address, pmdp);
119} 119}
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
121#endif 121#endif
@@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
152 pte_t *ptep); 152 pte_t *ptep);
153#endif 153#endif
154 154
155#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH 155#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
156extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, 156extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
157 unsigned long address, 157 unsigned long address,
158 pmd_t *pmdp); 158 pmd_t *pmdp);
159#endif 159#endif
@@ -189,6 +189,22 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
189 unsigned long address, pmd_t *pmdp); 189 unsigned long address, pmd_t *pmdp);
190#endif 190#endif
191 191
192#ifndef pmdp_collapse_flush
193#ifdef CONFIG_TRANSPARENT_HUGEPAGE
194extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
195 unsigned long address, pmd_t *pmdp);
196#else
197static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
198 unsigned long address,
199 pmd_t *pmdp)
200{
201 BUILD_BUG();
202 return *pmdp;
203}
204#define pmdp_collapse_flush pmdp_collapse_flush
205#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
206#endif
207
192#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 208#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
193extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 209extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
194 pgtable_t pgtable); 210 pgtable_t pgtable);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0995c2de8162..f589222bfa87 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename,
357/* Only NUMA needs hash distribution. 64bit NUMA architectures have 357/* Only NUMA needs hash distribution. 64bit NUMA architectures have
358 * sufficient vmalloc space. 358 * sufficient vmalloc space.
359 */ 359 */
360#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) 360#ifdef CONFIG_NUMA
361#define HASHDIST_DEFAULT 1 361#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
362extern int hashdist; /* Distribute hashes across NUMA nodes? */
362#else 363#else
363#define HASHDIST_DEFAULT 0 364#define hashdist (0)
364#endif 365#endif
365extern int hashdist; /* Distribute hashes across NUMA nodes? */
366 366
367 367
368#endif /* _LINUX_BOOTMEM_H */ 368#endif /* _LINUX_BOOTMEM_H */
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 34025df61829..c9e5c57e4edf 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item)
71 return item->ci_name; 71 return item->ci_name;
72} 72}
73 73
74extern void config_item_init(struct config_item *);
75extern void config_item_init_type_name(struct config_item *item, 74extern void config_item_init_type_name(struct config_item *item,
76 const char *name, 75 const char *name,
77 struct config_item_type *type); 76 struct config_item_type *type);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2092965afca3..5f19efe4eb3f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -96,6 +96,8 @@ typedef struct {
96#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ 96#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */
97#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ 97#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */
98#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ 98#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */
99#define EFI_MEMORY_MORE_RELIABLE \
100 ((u64)0x0000000000010000ULL) /* higher reliability */
99#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ 101#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */
100#define EFI_MEMORY_DESCRIPTOR_VERSION 1 102#define EFI_MEMORY_DESCRIPTOR_VERSION 1
101 103
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos
868extern void efi_late_init(void); 870extern void efi_late_init(void);
869extern void efi_free_boot_services(void); 871extern void efi_free_boot_services(void);
870extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size); 872extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
873extern void efi_find_mirror(void);
871#else 874#else
872static inline void efi_late_init(void) {} 875static inline void efi_late_init(void) {}
873static inline void efi_free_boot_services(void) {} 876static inline void efi_free_boot_services(void) {}
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 8293262401de..e65ef959546c 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -6,16 +6,16 @@
6#include <linux/bitops.h> 6#include <linux/bitops.h>
7 7
8struct frontswap_ops { 8struct frontswap_ops {
9 void (*init)(unsigned); 9 void (*init)(unsigned); /* this swap type was just swapon'ed */
10 int (*store)(unsigned, pgoff_t, struct page *); 10 int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
11 int (*load)(unsigned, pgoff_t, struct page *); 11 int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
12 void (*invalidate_page)(unsigned, pgoff_t); 12 void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
13 void (*invalidate_area)(unsigned); 13 void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
14 struct frontswap_ops *next; /* private pointer to next ops */
14}; 15};
15 16
16extern bool frontswap_enabled; 17extern bool frontswap_enabled;
17extern struct frontswap_ops * 18extern void frontswap_register_ops(struct frontswap_ops *ops);
18 frontswap_register_ops(struct frontswap_ops *ops);
19extern void frontswap_shrink(unsigned long); 19extern void frontswap_shrink(unsigned long);
20extern unsigned long frontswap_curr_pages(void); 20extern unsigned long frontswap_curr_pages(void);
21extern void frontswap_writethrough(bool); 21extern void frontswap_writethrough(bool);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0f313f93c586..65a517dd32f7 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,8 +84,6 @@ struct fsnotify_fname;
84 * Each group much define these ops. The fsnotify infrastructure will call 84 * Each group much define these ops. The fsnotify infrastructure will call
85 * these operations for each relevant group. 85 * these operations for each relevant group.
86 * 86 *
87 * should_send_event - given a group, inode, and mask this function determines
88 * if the group is interested in this event.
89 * handle_event - main call for a group to handle an fs event 87 * handle_event - main call for a group to handle an fs event
90 * free_group_priv - called when a group refcnt hits 0 to clean up the private union 88 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
91 * freeing_mark - called when a mark is being destroyed for some reason. The group 89 * freeing_mark - called when a mark is being destroyed for some reason. The group
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index e705467ddb47..d0a1f99e24e3 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -28,7 +28,8 @@
28extern void kmemleak_init(void) __ref; 28extern void kmemleak_init(void) __ref;
29extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, 29extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
30 gfp_t gfp) __ref; 30 gfp_t gfp) __ref;
31extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; 31extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
32 gfp_t gfp) __ref;
32extern void kmemleak_free(const void *ptr) __ref; 33extern void kmemleak_free(const void *ptr) __ref;
33extern void kmemleak_free_part(const void *ptr, size_t size) __ref; 34extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
34extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; 35extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
71 gfp_t gfp) 72 gfp_t gfp)
72{ 73{
73} 74}
74static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) 75static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
76 gfp_t gfp)
75{ 77{
76} 78}
77static inline void kmemleak_free(const void *ptr) 79static inline void kmemleak_free(const void *ptr)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9497ec7c77ea..0215ffd63069 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -21,7 +21,11 @@
21#define INIT_PHYSMEM_REGIONS 4 21#define INIT_PHYSMEM_REGIONS 4
22 22
23/* Definition of memblock flags. */ 23/* Definition of memblock flags. */
24#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ 24enum {
25 MEMBLOCK_NONE = 0x0, /* No special request */
26 MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
27 MEMBLOCK_MIRROR = 0x2, /* mirrored region */
28};
25 29
26struct memblock_region { 30struct memblock_region {
27 phys_addr_t base; 31 phys_addr_t base;
@@ -61,7 +65,7 @@ extern bool movable_node_enabled;
61 65
62phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, 66phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
63 phys_addr_t start, phys_addr_t end, 67 phys_addr_t start, phys_addr_t end,
64 int nid); 68 int nid, ulong flags);
65phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 69phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
66 phys_addr_t size, phys_addr_t align); 70 phys_addr_t size, phys_addr_t align);
67phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); 71phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -75,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
75void memblock_trim_memory(phys_addr_t align); 79void memblock_trim_memory(phys_addr_t align);
76int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); 80int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
77int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); 81int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
82int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
83ulong choose_memblock_flags(void);
78 84
79/* Low level functions */ 85/* Low level functions */
80int memblock_add_range(struct memblock_type *type, 86int memblock_add_range(struct memblock_type *type,
@@ -85,11 +91,13 @@ int memblock_remove_range(struct memblock_type *type,
85 phys_addr_t base, 91 phys_addr_t base,
86 phys_addr_t size); 92 phys_addr_t size);
87 93
88void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a, 94void __next_mem_range(u64 *idx, int nid, ulong flags,
95 struct memblock_type *type_a,
89 struct memblock_type *type_b, phys_addr_t *out_start, 96 struct memblock_type *type_b, phys_addr_t *out_start,
90 phys_addr_t *out_end, int *out_nid); 97 phys_addr_t *out_end, int *out_nid);
91 98
92void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, 99void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
100 struct memblock_type *type_a,
93 struct memblock_type *type_b, phys_addr_t *out_start, 101 struct memblock_type *type_b, phys_addr_t *out_start,
94 phys_addr_t *out_end, int *out_nid); 102 phys_addr_t *out_end, int *out_nid);
95 103
@@ -100,16 +108,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
100 * @type_a: ptr to memblock_type to iterate 108 * @type_a: ptr to memblock_type to iterate
101 * @type_b: ptr to memblock_type which excludes from the iteration 109 * @type_b: ptr to memblock_type which excludes from the iteration
102 * @nid: node selector, %NUMA_NO_NODE for all nodes 110 * @nid: node selector, %NUMA_NO_NODE for all nodes
111 * @flags: pick from blocks based on memory attributes
103 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 112 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
104 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 113 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
105 * @p_nid: ptr to int for nid of the range, can be %NULL 114 * @p_nid: ptr to int for nid of the range, can be %NULL
106 */ 115 */
107#define for_each_mem_range(i, type_a, type_b, nid, \ 116#define for_each_mem_range(i, type_a, type_b, nid, flags, \
108 p_start, p_end, p_nid) \ 117 p_start, p_end, p_nid) \
109 for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \ 118 for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
110 p_start, p_end, p_nid); \ 119 p_start, p_end, p_nid); \
111 i != (u64)ULLONG_MAX; \ 120 i != (u64)ULLONG_MAX; \
112 __next_mem_range(&i, nid, type_a, type_b, \ 121 __next_mem_range(&i, nid, flags, type_a, type_b, \
113 p_start, p_end, p_nid)) 122 p_start, p_end, p_nid))
114 123
115/** 124/**
@@ -119,17 +128,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
119 * @type_a: ptr to memblock_type to iterate 128 * @type_a: ptr to memblock_type to iterate
120 * @type_b: ptr to memblock_type which excludes from the iteration 129 * @type_b: ptr to memblock_type which excludes from the iteration
121 * @nid: node selector, %NUMA_NO_NODE for all nodes 130 * @nid: node selector, %NUMA_NO_NODE for all nodes
131 * @flags: pick from blocks based on memory attributes
122 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 132 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
123 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 133 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
124 * @p_nid: ptr to int for nid of the range, can be %NULL 134 * @p_nid: ptr to int for nid of the range, can be %NULL
125 */ 135 */
126#define for_each_mem_range_rev(i, type_a, type_b, nid, \ 136#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
127 p_start, p_end, p_nid) \ 137 p_start, p_end, p_nid) \
128 for (i = (u64)ULLONG_MAX, \ 138 for (i = (u64)ULLONG_MAX, \
129 __next_mem_range_rev(&i, nid, type_a, type_b, \ 139 __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
130 p_start, p_end, p_nid); \ 140 p_start, p_end, p_nid); \
131 i != (u64)ULLONG_MAX; \ 141 i != (u64)ULLONG_MAX; \
132 __next_mem_range_rev(&i, nid, type_a, type_b, \ 142 __next_mem_range_rev(&i, nid, flags, type_a, type_b, \
133 p_start, p_end, p_nid)) 143 p_start, p_end, p_nid))
134 144
135#ifdef CONFIG_MOVABLE_NODE 145#ifdef CONFIG_MOVABLE_NODE
@@ -153,6 +163,11 @@ static inline bool movable_node_is_enabled(void)
153} 163}
154#endif 164#endif
155 165
166static inline bool memblock_is_mirror(struct memblock_region *m)
167{
168 return m->flags & MEMBLOCK_MIRROR;
169}
170
156#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 171#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
157int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, 172int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
158 unsigned long *end_pfn); 173 unsigned long *end_pfn);
@@ -181,13 +196,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
181 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 196 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
182 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 197 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
183 * @p_nid: ptr to int for nid of the range, can be %NULL 198 * @p_nid: ptr to int for nid of the range, can be %NULL
199 * @flags: pick from blocks based on memory attributes
184 * 200 *
185 * Walks over free (memory && !reserved) areas of memblock. Available as 201 * Walks over free (memory && !reserved) areas of memblock. Available as
186 * soon as memblock is initialized. 202 * soon as memblock is initialized.
187 */ 203 */
188#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ 204#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
189 for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ 205 for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
190 nid, p_start, p_end, p_nid) 206 nid, flags, p_start, p_end, p_nid)
191 207
192/** 208/**
193 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 209 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +212,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
196 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 212 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
197 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 213 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
198 * @p_nid: ptr to int for nid of the range, can be %NULL 214 * @p_nid: ptr to int for nid of the range, can be %NULL
215 * @flags: pick from blocks based on memory attributes
199 * 216 *
200 * Walks over free (memory && !reserved) areas of memblock in reverse 217 * Walks over free (memory && !reserved) areas of memblock in reverse
201 * order. Available as soon as memblock is initialized. 218 * order. Available as soon as memblock is initialized.
202 */ 219 */
203#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ 220#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
221 p_nid) \
204 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ 222 for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
205 nid, p_start, p_end, p_nid) 223 nid, flags, p_start, p_end, p_nid)
206 224
207static inline void memblock_set_region_flags(struct memblock_region *r, 225static inline void memblock_set_region_flags(struct memblock_region *r,
208 unsigned long flags) 226 unsigned long flags)
@@ -273,7 +291,8 @@ static inline bool memblock_bottom_up(void) { return false; }
273#define MEMBLOCK_ALLOC_ACCESSIBLE 0 291#define MEMBLOCK_ALLOC_ACCESSIBLE 0
274 292
275phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, 293phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
276 phys_addr_t start, phys_addr_t end); 294 phys_addr_t start, phys_addr_t end,
295 ulong flags);
277phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, 296phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
278 phys_addr_t max_addr); 297 phys_addr_t max_addr);
279phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, 298phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
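
A standalone sketch of the flag handling these memblock changes add: each region carries a flags word, and memblock_is_mirror() is a plain bit test against MEMBLOCK_MIRROR. struct region below is a reduced stand-in for struct memblock_region.

#include <stdbool.h>
#include <stdio.h>

enum {
	MEMBLOCK_NONE	 = 0x0,	/* no special request */
	MEMBLOCK_HOTPLUG = 0x1,	/* hotpluggable region */
	MEMBLOCK_MIRROR	 = 0x2,	/* mirrored region */
};

struct region {
	unsigned long base, size, flags;
};

static bool region_is_mirror(const struct region *r)
{
	return r->flags & MEMBLOCK_MIRROR;
}

int main(void)
{
	struct region r = { .base = 0x100000, .size = 0x200000, .flags = MEMBLOCK_MIRROR };

	printf("mirrored: %d\n", region_is_mirror(&r));	/* 1 */
	return 0;
}
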
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
new file mode 100644
index 000000000000..4efc3f56e6df
--- /dev/null
+++ b/include/linux/mm-arch-hooks.h
@@ -0,0 +1,25 @@
1/*
2 * Generic mm no-op hooks.
3 *
4 * Copyright (C) 2015, IBM Corporation
5 * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef _LINUX_MM_ARCH_HOOKS_H
12#define _LINUX_MM_ARCH_HOOKS_H
13
14#include <asm/mm-arch-hooks.h>
15
16#ifndef arch_remap
17static inline void arch_remap(struct mm_struct *mm,
18 unsigned long old_start, unsigned long old_end,
19 unsigned long new_start, unsigned long new_end)
20{
21}
22#define arch_remap arch_remap
23#endif
24
25#endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7..24ad583596d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,7 +499,7 @@ static inline int page_count(struct page *page)
499 499
500static inline bool __compound_tail_refcounted(struct page *page) 500static inline bool __compound_tail_refcounted(struct page *page)
501{ 501{
502 return !PageSlab(page) && !PageHeadHuge(page); 502 return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
503} 503}
504 504
505/* 505/*
@@ -2146,12 +2146,47 @@ enum mf_flags {
2146extern int memory_failure(unsigned long pfn, int trapno, int flags); 2146extern int memory_failure(unsigned long pfn, int trapno, int flags);
2147extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); 2147extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
2148extern int unpoison_memory(unsigned long pfn); 2148extern int unpoison_memory(unsigned long pfn);
2149extern int get_hwpoison_page(struct page *page);
2149extern int sysctl_memory_failure_early_kill; 2150extern int sysctl_memory_failure_early_kill;
2150extern int sysctl_memory_failure_recovery; 2151extern int sysctl_memory_failure_recovery;
2151extern void shake_page(struct page *p, int access); 2152extern void shake_page(struct page *p, int access);
2152extern atomic_long_t num_poisoned_pages; 2153extern atomic_long_t num_poisoned_pages;
2153extern int soft_offline_page(struct page *page, int flags); 2154extern int soft_offline_page(struct page *page, int flags);
2154 2155
2156
2157/*
2158 * Error handlers for various types of pages.
2159 */
2160enum mf_result {
2161 MF_IGNORED, /* Error: cannot be handled */
2162 MF_FAILED, /* Error: handling failed */
2163 MF_DELAYED, /* Will be handled later */
2164 MF_RECOVERED, /* Successfully recovered */
2165};
2166
2167enum mf_action_page_type {
2168 MF_MSG_KERNEL,
2169 MF_MSG_KERNEL_HIGH_ORDER,
2170 MF_MSG_SLAB,
2171 MF_MSG_DIFFERENT_COMPOUND,
2172 MF_MSG_POISONED_HUGE,
2173 MF_MSG_HUGE,
2174 MF_MSG_FREE_HUGE,
2175 MF_MSG_UNMAP_FAILED,
2176 MF_MSG_DIRTY_SWAPCACHE,
2177 MF_MSG_CLEAN_SWAPCACHE,
2178 MF_MSG_DIRTY_MLOCKED_LRU,
2179 MF_MSG_CLEAN_MLOCKED_LRU,
2180 MF_MSG_DIRTY_UNEVICTABLE_LRU,
2181 MF_MSG_CLEAN_UNEVICTABLE_LRU,
2182 MF_MSG_DIRTY_LRU,
2183 MF_MSG_CLEAN_LRU,
2184 MF_MSG_TRUNCATED_LRU,
2185 MF_MSG_BUDDY,
2186 MF_MSG_BUDDY_2ND,
2187 MF_MSG_UNKNOWN,
2188};
2189
2155#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 2190#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
2156extern void clear_huge_page(struct page *page, 2191extern void clear_huge_page(struct page *page,
2157 unsigned long addr, 2192 unsigned long addr,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 95243d28a0ee..61cd67f4d788 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
324 ___pte; \ 324 ___pte; \
325}) 325})
326 326
327#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ 327#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
328({ \ 328({ \
329 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ 329 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
330 struct mm_struct *___mm = (__vma)->vm_mm; \ 330 struct mm_struct *___mm = (__vma)->vm_mm; \
331 pmd_t ___pmd; \ 331 pmd_t ___pmd; \
332 \ 332 \
333 ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ 333 ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
334 mmu_notifier_invalidate_range(___mm, ___haddr, \ 334 mmu_notifier_invalidate_range(___mm, ___haddr, \
335 ___haddr + HPAGE_PMD_SIZE); \ 335 ___haddr + HPAGE_PMD_SIZE); \
336 \ 336 \
337 ___pmd; \ 337 ___pmd; \
338}) 338})
339 339
340#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ 340#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \
341({ \ 341({ \
342 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ 342 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
343 pmd_t ___pmd; \ 343 pmd_t ___pmd; \
344 \ 344 \
345 ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ 345 ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \
346 mmu_notifier_invalidate_range(__mm, ___haddr, \ 346 mmu_notifier_invalidate_range(__mm, ___haddr, \
347 ___haddr + HPAGE_PMD_SIZE); \ 347 ___haddr + HPAGE_PMD_SIZE); \
348 \ 348 \
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
428#define ptep_clear_flush_young_notify ptep_clear_flush_young 428#define ptep_clear_flush_young_notify ptep_clear_flush_young
429#define pmdp_clear_flush_young_notify pmdp_clear_flush_young 429#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
430#define ptep_clear_flush_notify ptep_clear_flush 430#define ptep_clear_flush_notify ptep_clear_flush
431#define pmdp_clear_flush_notify pmdp_clear_flush 431#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
432#define pmdp_get_and_clear_notify pmdp_get_and_clear 432#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
433#define set_pte_at_notify set_pte_at 433#define set_pte_at_notify set_pte_at
434 434
435#endif /* CONFIG_MMU_NOTIFIER */ 435#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 3d46fb4708e0..f94da0e65dea 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
67extern int soft_watchdog_enabled; 67extern int soft_watchdog_enabled;
68extern int watchdog_user_enabled; 68extern int watchdog_user_enabled;
69extern int watchdog_thresh; 69extern int watchdog_thresh;
70extern unsigned long *watchdog_cpumask_bits;
70extern int sysctl_softlockup_all_cpu_backtrace; 71extern int sysctl_softlockup_all_cpu_backtrace;
71struct ctl_table; 72struct ctl_table;
72extern int proc_watchdog(struct ctl_table *, int , 73extern int proc_watchdog(struct ctl_table *, int ,
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
77 void __user *, size_t *, loff_t *); 78 void __user *, size_t *, loff_t *);
78extern int proc_watchdog_thresh(struct ctl_table *, int , 79extern int proc_watchdog_thresh(struct ctl_table *, int ,
79 void __user *, size_t *, loff_t *); 80 void __user *, size_t *, loff_t *);
81extern int proc_watchdog_cpumask(struct ctl_table *, int,
82 void __user *, size_t *, loff_t *);
80#endif 83#endif
81 84
82#ifdef CONFIG_HAVE_ACPI_APEI_NMI 85#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 44b2f6f7bbd8..7deecb7bca5e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,6 +32,8 @@ enum oom_scan_t {
32/* Thread is the potential origin of an oom condition; kill first on oom */ 32/* Thread is the potential origin of an oom condition; kill first on oom */
33#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) 33#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
34 34
35extern struct mutex oom_lock;
36
35static inline void set_current_oom_origin(void) 37static inline void set_current_oom_origin(void)
36{ 38{
37 current->signal->oom_flags |= OOM_FLAG_ORIGIN; 39 current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -47,9 +49,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
47 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); 49 return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
48} 50}
49 51
50extern void mark_tsk_oom_victim(struct task_struct *tsk); 52extern void mark_oom_victim(struct task_struct *tsk);
51
52extern void unmark_oom_victim(void);
53 53
54extern unsigned long oom_badness(struct task_struct *p, 54extern unsigned long oom_badness(struct task_struct *p,
55 struct mem_cgroup *memcg, const nodemask_t *nodemask, 55 struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -62,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
62 struct mem_cgroup *memcg, nodemask_t *nodemask, 62 struct mem_cgroup *memcg, nodemask_t *nodemask,
63 const char *message); 63 const char *message);
64 64
65extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
66extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
67
68extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 65extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
69 int order, const nodemask_t *nodemask, 66 int order, const nodemask_t *nodemask,
70 struct mem_cgroup *memcg); 67 struct mem_cgroup *memcg);
@@ -75,6 +72,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
75 72
76extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 73extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
77 int order, nodemask_t *mask, bool force_kill); 74 int order, nodemask_t *mask, bool force_kill);
75
76extern void exit_oom_victim(void);
77
78extern int register_oom_notifier(struct notifier_block *nb); 78extern int register_oom_notifier(struct notifier_block *nb);
79extern int unregister_oom_notifier(struct notifier_block *nb); 79extern int unregister_oom_notifier(struct notifier_block *nb);
80 80
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ffd24c830151..9de2fdc8b5e4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,8 +153,30 @@ size_t ksize(const void *);
153#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN 153#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
154#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN 154#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
155#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) 155#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
156/*
157 * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
158 * to create the kmalloc_caches object in create_kmalloc_caches(). The first
159 * and the second are 96 and 192. You can see that in the kmalloc_index(), if
160 * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
161 * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
162 * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
163 */
164#if KMALLOC_MIN_SIZE <= 32
165#define KMALLOC_LOOP_LOW 1
166#elif KMALLOC_MIN_SIZE <= 64
167#define KMALLOC_LOOP_LOW 2
168#else
169#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
170#endif
171
156#else 172#else
157#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 173#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
174/*
175 * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
176 * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
177 * initialized.
178 */
179#define KMALLOC_LOOP_LOW 1
158#endif 180#endif
159 181
160/* 182/*
@@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
240 * belongs to. 262 * belongs to.
241 * 0 = zero alloc 263 * 0 = zero alloc
242 * 1 = 65 .. 96 bytes 264 * 1 = 65 .. 96 bytes
243 * 2 = 120 .. 192 bytes 265 * 2 = 129 .. 192 bytes
244 * n = 2^(n-1) .. 2^n -1 266 * n = 2^(n-1)+1 .. 2^n
245 */ 267 */
246static __always_inline int kmalloc_index(size_t size) 268static __always_inline int kmalloc_index(size_t size)
247{ 269{
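
The corrected kmalloc_index() comment above describes the size-class boundaries: index 1 covers 65..96 bytes, index 2 covers 129..192 bytes, and index n covers 2^(n-1)+1 .. 2^n bytes. A standalone sketch that mirrors that mapping (it is not the kernel function and ignores the KMALLOC_MIN_SIZE special cases):

#include <stdio.h>

static int kmalloc_index_sketch(size_t size)
{
	int n = 3;

	if (size == 0)
		return 0;
	if (size > 64 && size <= 96)
		return 1;
	if (size > 128 && size <= 192)
		return 2;
	while ((1UL << n) < size)	/* smallest n with 2^n >= size */
		n++;
	return n;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       kmalloc_index_sketch(96),	/* 1 */
	       kmalloc_index_sketch(129),	/* 2 */
	       kmalloc_index_sketch(192),	/* 2 */
	       kmalloc_index_sketch(193));	/* 8, since 2^8 = 256 */
	return 0;
}
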
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index d600afb21926..da3c593f9845 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -27,6 +27,8 @@ struct smpboot_thread_data;
27 * @pre_unpark: Optional unpark function, called before the thread is 27 * @pre_unpark: Optional unpark function, called before the thread is
28 * unparked (cpu online). This is not guaranteed to be 28 * unparked (cpu online). This is not guaranteed to be
29 * called on the target cpu of the thread. Careful! 29 * called on the target cpu of the thread. Careful!
30 * @cpumask: Internal state. To update which threads are unparked,
31 * call smpboot_update_cpumask_percpu_thread().
30 * @selfparking: Thread is not parked by the park function. 32 * @selfparking: Thread is not parked by the park function.
31 * @thread_comm: The base name of the thread 33 * @thread_comm: The base name of the thread
32 */ 34 */
@@ -41,11 +43,14 @@ struct smp_hotplug_thread {
41 void (*park)(unsigned int cpu); 43 void (*park)(unsigned int cpu);
42 void (*unpark)(unsigned int cpu); 44 void (*unpark)(unsigned int cpu);
43 void (*pre_unpark)(unsigned int cpu); 45 void (*pre_unpark)(unsigned int cpu);
46 cpumask_var_t cpumask;
44 bool selfparking; 47 bool selfparking;
45 const char *thread_comm; 48 const char *thread_comm;
46}; 49};
47 50
48int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); 51int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
49void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); 52void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
53int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
54 const struct cpumask *);
50 55
51#endif 56#endif
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 79abb9c71772..1443d79e4fe6 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -11,6 +11,7 @@
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/aer.h> 12#include <linux/aer.h>
13#include <linux/cper.h> 13#include <linux/cper.h>
14#include <linux/mm.h>
14 15
15/* 16/*
16 * MCE Extended Error Log trace event 17 * MCE Extended Error Log trace event
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event,
232 __print_flags(__entry->status, "|", aer_uncorrectable_errors)) 233 __print_flags(__entry->status, "|", aer_uncorrectable_errors))
233); 234);
234 235
236/*
237 * memory-failure recovery action result event
238 *
239 * unsigned long pfn - Page Frame Number of the corrupted page
240 * int type - Page types of the corrupted page
241 * int result - Result of recovery action
242 */
243
244#ifdef CONFIG_MEMORY_FAILURE
245#define MF_ACTION_RESULT \
246 EM ( MF_IGNORED, "Ignored" ) \
247 EM ( MF_FAILED, "Failed" ) \
248 EM ( MF_DELAYED, "Delayed" ) \
249 EMe ( MF_RECOVERED, "Recovered" )
250
251#define MF_PAGE_TYPE \
252 EM ( MF_MSG_KERNEL, "reserved kernel page" ) \
253 EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \
254 EM ( MF_MSG_SLAB, "kernel slab page" ) \
255 EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
256 EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \
257 EM ( MF_MSG_HUGE, "huge page" ) \
258 EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
259 EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \
260 EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \
261 EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \
262 EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \
263 EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \
264 EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \
265 EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \
266 EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \
267 EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \
268 EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \
269 EM ( MF_MSG_BUDDY, "free buddy page" ) \
270 EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \
271 EMe ( MF_MSG_UNKNOWN, "unknown page" )
272
273/*
274 * First define the enums in MM_ACTION_RESULT to be exported to userspace
275 * via TRACE_DEFINE_ENUM().
276 */
277#undef EM
278#undef EMe
279#define EM(a, b) TRACE_DEFINE_ENUM(a);
280#define EMe(a, b) TRACE_DEFINE_ENUM(a);
281
282MF_ACTION_RESULT
283MF_PAGE_TYPE
284
285/*
286 * Now redefine the EM() and EMe() macros to map the enums to the strings
287 * that will be printed in the output.
288 */
289#undef EM
290#undef EMe
291#define EM(a, b) { a, b },
292#define EMe(a, b) { a, b }
293
294TRACE_EVENT(memory_failure_event,
295 TP_PROTO(unsigned long pfn,
296 int type,
297 int result),
298
299 TP_ARGS(pfn, type, result),
300
301 TP_STRUCT__entry(
302 __field(unsigned long, pfn)
303 __field(int, type)
304 __field(int, result)
305 ),
306
307 TP_fast_assign(
308 __entry->pfn = pfn;
309 __entry->type = type;
310 __entry->result = result;
311 ),
312
313 TP_printk("pfn %#lx: recovery action for %s: %s",
314 __entry->pfn,
315 __print_symbolic(__entry->type, MF_PAGE_TYPE),
316 __print_symbolic(__entry->result, MF_ACTION_RESULT)
317 )
318);
319#endif /* CONFIG_MEMORY_FAILURE */
235#endif /* _TRACE_HW_EVENT_MC_H */ 320#endif /* _TRACE_HW_EVENT_MC_H */
236 321
237/* This part must be outside protection */ 322/* This part must be outside protection */
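
The EM()/EMe() dance above is the usual X-macro pattern: the same list is expanded once so TRACE_DEFINE_ENUM() can export the values and once to build the symbolic-print table. A standalone sketch of that double expansion in plain C, with no tracing infrastructure involved:

#include <stdio.h>

#define MF_RESULTS \
	EM(MF_IGNORED,   "Ignored")  \
	EM(MF_FAILED,    "Failed")   \
	EM(MF_DELAYED,   "Delayed")  \
	EMe(MF_RECOVERED, "Recovered")

/* First expansion: declare the enum constants. */
#define EM(a, b)  a,
#define EMe(a, b) a
enum mf_result_sketch { MF_RESULTS };
#undef EM
#undef EMe

/* Second expansion: build a value-to-string table from the same list. */
#define EM(a, b)  { a, b },
#define EMe(a, b) { a, b }
static const struct { int val; const char *name; } mf_names[] = { MF_RESULTS };

int main(void)
{
	printf("%s\n", mf_names[MF_DELAYED].name);	/* "Delayed" */
	return 0;
}
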
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05dec40..185752a729f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
436 mm_update_next_owner(mm); 436 mm_update_next_owner(mm);
437 mmput(mm); 437 mmput(mm);
438 if (test_thread_flag(TIF_MEMDIE)) 438 if (test_thread_flag(TIF_MEMDIE))
439 unmark_oom_victim(); 439 exit_oom_victim();
440} 440}
441 441
442static struct task_struct *find_alive_thread(struct task_struct *p) 442static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c697f73d82d6..7c434c39f02a 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
232 232
233 mutex_lock(&smpboot_threads_lock); 233 mutex_lock(&smpboot_threads_lock);
234 list_for_each_entry(cur, &hotplug_threads, list) 234 list_for_each_entry(cur, &hotplug_threads, list)
235 smpboot_unpark_thread(cur, cpu); 235 if (cpumask_test_cpu(cpu, cur->cpumask))
236 smpboot_unpark_thread(cur, cpu);
236 mutex_unlock(&smpboot_threads_lock); 237 mutex_unlock(&smpboot_threads_lock);
237} 238}
238 239
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
258{ 259{
259 unsigned int cpu; 260 unsigned int cpu;
260 261
262 /* Unpark any threads that were voluntarily parked. */
263 for_each_cpu_not(cpu, ht->cpumask) {
264 if (cpu_online(cpu)) {
265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
266 if (tsk)
267 kthread_unpark(tsk);
268 }
269 }
270
261 /* We need to destroy also the parked threads of offline cpus */ 271 /* We need to destroy also the parked threads of offline cpus */
262 for_each_possible_cpu(cpu) { 272 for_each_possible_cpu(cpu) {
263 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 273 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
281 unsigned int cpu; 291 unsigned int cpu;
282 int ret = 0; 292 int ret = 0;
283 293
294 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
295 return -ENOMEM;
296 cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
297
284 get_online_cpus(); 298 get_online_cpus();
285 mutex_lock(&smpboot_threads_lock); 299 mutex_lock(&smpboot_threads_lock);
286 for_each_online_cpu(cpu) { 300 for_each_online_cpu(cpu) {
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
313 smpboot_destroy_threads(plug_thread); 327 smpboot_destroy_threads(plug_thread);
314 mutex_unlock(&smpboot_threads_lock); 328 mutex_unlock(&smpboot_threads_lock);
315 put_online_cpus(); 329 put_online_cpus();
330 free_cpumask_var(plug_thread->cpumask);
316} 331}
317EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 332EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
318 333
334/**
335 * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
336 * @plug_thread: Hotplug thread descriptor
337 * @new: Revised mask to use
338 *
339 * The cpumask field in the smp_hotplug_thread must not be updated directly
340 * by the client, but only by calling this function.
341 * This function can only be called on a registered smp_hotplug_thread.
342 */
343int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
344 const struct cpumask *new)
345{
346 struct cpumask *old = plug_thread->cpumask;
347 cpumask_var_t tmp;
348 unsigned int cpu;
349
350 if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
351 return -ENOMEM;
352
353 get_online_cpus();
354 mutex_lock(&smpboot_threads_lock);
355
356 /* Park threads that were exclusively enabled on the old mask. */
357 cpumask_andnot(tmp, old, new);
358 for_each_cpu_and(cpu, tmp, cpu_online_mask)
359 smpboot_park_thread(plug_thread, cpu);
360
361 /* Unpark threads that are exclusively enabled on the new mask. */
362 cpumask_andnot(tmp, new, old);
363 for_each_cpu_and(cpu, tmp, cpu_online_mask)
364 smpboot_unpark_thread(plug_thread, cpu);
365
366 cpumask_copy(old, new);
367
368 mutex_unlock(&smpboot_threads_lock);
369 put_online_cpus();
370
371 free_cpumask_var(tmp);
372
373 return 0;
374}
375EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
376
319static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); 377static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
320 378
321/* 379/*
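
A standalone sketch of the mask arithmetic inside smpboot_update_cpumask_percpu_thread() above: CPUs present only in the old mask are parked, CPUs present only in the new mask are unparked. Plain unsigned longs stand in for cpumask_var_t here.

#include <stdio.h>

int main(void)
{
	unsigned long old = 0x0fUL;	/* threads currently enabled on CPUs 0-3 */
	unsigned long new = 0x3cUL;	/* requested mask: CPUs 2-5 */

	unsigned long to_park   = old & ~new;	/* CPUs 0-1 */
	unsigned long to_unpark = new & ~old;	/* CPUs 4-5 */

	printf("park 0x%lx, unpark 0x%lx\n", to_park, to_unpark);	/* park 0x3, unpark 0x30 */
	return 0;
}
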
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b13e9d2de302..812fcc3fd390 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -872,6 +872,13 @@ static struct ctl_table kern_table[] = {
872 .extra2 = &one, 872 .extra2 = &one,
873 }, 873 },
874 { 874 {
875 .procname = "watchdog_cpumask",
876 .data = &watchdog_cpumask_bits,
877 .maxlen = NR_CPUS,
878 .mode = 0644,
879 .proc_handler = proc_watchdog_cpumask,
880 },
881 {
875 .procname = "softlockup_panic", 882 .procname = "softlockup_panic",
876 .data = &softlockup_panic, 883 .data = &softlockup_panic,
877 .maxlen = sizeof(int), 884 .maxlen = sizeof(int),
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 581a68a04c64..a6ffa43f2993 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/smpboot.h> 20#include <linux/smpboot.h>
21#include <linux/sched/rt.h> 21#include <linux/sched/rt.h>
22#include <linux/tick.h>
22 23
23#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
24#include <linux/kvm_para.h> 25#include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
58#else 59#else
59#define sysctl_softlockup_all_cpu_backtrace 0 60#define sysctl_softlockup_all_cpu_backtrace 0
60#endif 61#endif
62static struct cpumask watchdog_cpumask __read_mostly;
63unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
64
65/* Helper for online, unparked cpus. */
66#define for_each_watchdog_cpu(cpu) \
67 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
61 68
62static int __read_mostly watchdog_running; 69static int __read_mostly watchdog_running;
63static u64 __read_mostly sample_period; 70static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
207 * do we care if a 0 races with a timestamp? 214 * do we care if a 0 races with a timestamp?
208 * all it means is the softlock check starts one cycle later 215 * all it means is the softlock check starts one cycle later
209 */ 216 */
210 for_each_online_cpu(cpu) 217 for_each_watchdog_cpu(cpu)
211 per_cpu(watchdog_touch_ts, cpu) = 0; 218 per_cpu(watchdog_touch_ts, cpu) = 0;
212} 219}
213 220
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
616 goto unlock; 623 goto unlock;
617 624
618 get_online_cpus(); 625 get_online_cpus();
619 for_each_online_cpu(cpu) 626 for_each_watchdog_cpu(cpu)
620 watchdog_nmi_enable(cpu); 627 watchdog_nmi_enable(cpu);
621 put_online_cpus(); 628 put_online_cpus();
622 629
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
634 goto unlock; 641 goto unlock;
635 642
636 get_online_cpus(); 643 get_online_cpus();
637 for_each_online_cpu(cpu) 644 for_each_watchdog_cpu(cpu)
638 watchdog_nmi_disable(cpu); 645 watchdog_nmi_disable(cpu);
639 put_online_cpus(); 646 put_online_cpus();
640 647
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
696 int cpu; 703 int cpu;
697 704
698 get_online_cpus(); 705 get_online_cpus();
699 for_each_online_cpu(cpu) 706 for_each_watchdog_cpu(cpu)
700 update_watchdog(cpu); 707 update_watchdog(cpu);
701 put_online_cpus(); 708 put_online_cpus();
702} 709}
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
709 err = smpboot_register_percpu_thread(&watchdog_threads); 716 err = smpboot_register_percpu_thread(&watchdog_threads);
710 if (err) 717 if (err)
711 pr_err("Failed to create watchdog threads, disabled\n"); 718 pr_err("Failed to create watchdog threads, disabled\n");
712 else 719 else {
720 if (smpboot_update_cpumask_percpu_thread(
721 &watchdog_threads, &watchdog_cpumask))
722 pr_err("Failed to set cpumask for watchdog threads\n");
713 watchdog_running = 1; 723 watchdog_running = 1;
724 }
714 } else { 725 } else {
715 /* 726 /*
716 * Enable/disable the lockup detectors or 727 * Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
879 mutex_unlock(&watchdog_proc_mutex); 890 mutex_unlock(&watchdog_proc_mutex);
880 return err; 891 return err;
881} 892}
893
894/*
895 * The cpumask is the mask of possible cpus that the watchdog can run
896 * on, not the mask of cpus it is actually running on. This allows the
897 * user to specify a mask that will include cpus that have not yet
898 * been brought online, if desired.
899 */
900int proc_watchdog_cpumask(struct ctl_table *table, int write,
901 void __user *buffer, size_t *lenp, loff_t *ppos)
902{
903 int err;
904
905 mutex_lock(&watchdog_proc_mutex);
906 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
907 if (!err && write) {
908 /* Remove impossible cpus to keep sysctl output cleaner. */
909 cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
910 cpu_possible_mask);
911
912 if (watchdog_running) {
913 /*
914 * Failure would be due to being unable to allocate
915 * a temporary cpumask, so we are likely not in a
916 * position to do much else to make things better.
917 */
918 if (smpboot_update_cpumask_percpu_thread(
919 &watchdog_threads, &watchdog_cpumask) != 0)
920 pr_err("cpumask update failed\n");
921 }
922 }
923 mutex_unlock(&watchdog_proc_mutex);
924 return err;
925}
926
882#endif /* CONFIG_SYSCTL */ 927#endif /* CONFIG_SYSCTL */
883 928
884void __init lockup_detector_init(void) 929void __init lockup_detector_init(void)
885{ 930{
886 set_sample_period(); 931 set_sample_period();
887 932
933#ifdef CONFIG_NO_HZ_FULL
934 if (tick_nohz_full_enabled()) {
935 if (!cpumask_empty(tick_nohz_full_mask))
936 pr_info("Disabling watchdog on nohz_full cores by default\n");
937 cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
938 tick_nohz_full_mask);
939 } else
940 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
941#else
942 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
943#endif
944
888 if (watchdog_enabled) 945 if (watchdog_enabled)
889 watchdog_enable_all_cpus(); 946 watchdog_enable_all_cpus();
890} 947}
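
Given the kern_table entry and the proc_do_large_bitmap() handler above, the new knob should appear as /proc/sys/kernel/watchdog_cpumask and accept a cpulist string; a small illustrative userspace writer (path and format are inferred from the patch, run as root):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/watchdog_cpumask", "w");

	if (!f) {
		perror("watchdog_cpumask");
		return 1;
	}
	fputs("0-3\n", f);	/* restrict the watchdog to CPUs 0-3 */
	return fclose(f) ? 1 : 0;
}
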
diff --git a/mm/Kconfig b/mm/Kconfig
index 390214da4546..c180af880ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
368 depends on ARCH_SUPPORTS_MEMORY_FAILURE 368 depends on ARCH_SUPPORTS_MEMORY_FAILURE
369 bool "Enable recovery from hardware memory errors" 369 bool "Enable recovery from hardware memory errors"
370 select MEMORY_ISOLATION 370 select MEMORY_ISOLATION
371 select RAS
371 help 372 help
372 Enables code to recover from some memory failures on systems 373 Enables code to recover from some memory failures on systems
373 with MCA recovery. This allows a system to continue running 374 with MCA recovery. This allows a system to continue running
diff --git a/mm/cma.c b/mm/cma.c
index 3a7a67b93394..e7d1db533025 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
182 if (!size || !memblock_is_region_reserved(base, size)) 182 if (!size || !memblock_is_region_reserved(base, size))
183 return -EINVAL; 183 return -EINVAL;
184 184
185 /* ensure minimal alignment requied by mm core */ 185 /* ensure minimal alignment required by mm core */
186 alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); 186 alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
187 187
188 /* alignment should be aligned with order_per_bit */ 188 /* alignment should be aligned with order_per_bit */
@@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
238 /* 238 /*
239 * high_memory isn't direct mapped memory so retrieving its physical 239 * high_memory isn't direct mapped memory so retrieving its physical
240 * address isn't appropriate. But it would be useful to check the 240 * address isn't appropriate. But it would be useful to check the
241 * physical address of the highmem boundary so it's justfiable to get 241 * physical address of the highmem boundary so it's justifiable to get
242 * the physical address from it. On x86 there is a validation check for 242 * the physical address from it. On x86 there is a validation check for
243 * this case, so the following workaround is needed to avoid it. 243 * this case, so the following workaround is needed to avoid it.
244 */ 244 */
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
316 */ 316 */
317 if (base < highmem_start && limit > highmem_start) { 317 if (base < highmem_start && limit > highmem_start) {
318 addr = memblock_alloc_range(size, alignment, 318 addr = memblock_alloc_range(size, alignment,
319 highmem_start, limit); 319 highmem_start, limit,
320 MEMBLOCK_NONE);
320 limit = highmem_start; 321 limit = highmem_start;
321 } 322 }
322 323
323 if (!addr) { 324 if (!addr) {
324 addr = memblock_alloc_range(size, alignment, base, 325 addr = memblock_alloc_range(size, alignment, base,
325 limit); 326 limit,
327 MEMBLOCK_NONE);
326 if (!addr) { 328 if (!addr) {
327 ret = -ENOMEM; 329 ret = -ENOMEM;
328 goto err; 330 goto err;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a..8d17ceea8dbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -196,7 +196,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
196 page->mapping = NULL; 196 page->mapping = NULL;
197 /* Leave page->index set: truncation lookup relies upon it */ 197 /* Leave page->index set: truncation lookup relies upon it */
198 198
199 __dec_zone_page_state(page, NR_FILE_PAGES); 199 /* hugetlb pages do not participate in page cache accounting. */
200 if (!PageHuge(page))
201 __dec_zone_page_state(page, NR_FILE_PAGES);
200 if (PageSwapBacked(page)) 202 if (PageSwapBacked(page))
201 __dec_zone_page_state(page, NR_SHMEM); 203 __dec_zone_page_state(page, NR_SHMEM);
202 BUG_ON(page_mapped(page)); 204 BUG_ON(page_mapped(page));
@@ -483,7 +485,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
483 error = radix_tree_insert(&mapping->page_tree, offset, new); 485 error = radix_tree_insert(&mapping->page_tree, offset, new);
484 BUG_ON(error); 486 BUG_ON(error);
485 mapping->nrpages++; 487 mapping->nrpages++;
486 __inc_zone_page_state(new, NR_FILE_PAGES); 488
489 /*
490 * hugetlb pages do not participate in page cache accounting.
491 */
492 if (!PageHuge(new))
493 __inc_zone_page_state(new, NR_FILE_PAGES);
487 if (PageSwapBacked(new)) 494 if (PageSwapBacked(new))
488 __inc_zone_page_state(new, NR_SHMEM); 495 __inc_zone_page_state(new, NR_SHMEM);
489 spin_unlock_irq(&mapping->tree_lock); 496 spin_unlock_irq(&mapping->tree_lock);
@@ -575,7 +582,10 @@ static int __add_to_page_cache_locked(struct page *page,
575 radix_tree_preload_end(); 582 radix_tree_preload_end();
576 if (unlikely(error)) 583 if (unlikely(error))
577 goto err_insert; 584 goto err_insert;
578 __inc_zone_page_state(page, NR_FILE_PAGES); 585
586 /* hugetlb pages do not participate in page cache accounting. */
587 if (!huge)
588 __inc_zone_page_state(page, NR_FILE_PAGES);
579 spin_unlock_irq(&mapping->tree_lock); 589 spin_unlock_irq(&mapping->tree_lock);
580 if (!huge) 590 if (!huge)
581 mem_cgroup_commit_charge(page, memcg, false); 591 mem_cgroup_commit_charge(page, memcg, false);
@@ -1654,8 +1664,8 @@ no_cached_page:
1654 error = -ENOMEM; 1664 error = -ENOMEM;
1655 goto out; 1665 goto out;
1656 } 1666 }
1657 error = add_to_page_cache_lru(page, mapping, 1667 error = add_to_page_cache_lru(page, mapping, index,
1658 index, GFP_KERNEL); 1668 GFP_KERNEL & mapping_gfp_mask(mapping));
1659 if (error) { 1669 if (error) {
1660 page_cache_release(page); 1670 page_cache_release(page);
1661 if (error == -EEXIST) { 1671 if (error == -EEXIST) {
@@ -1756,7 +1766,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1756 if (!page) 1766 if (!page)
1757 return -ENOMEM; 1767 return -ENOMEM;
1758 1768
1759 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); 1769 ret = add_to_page_cache_lru(page, mapping, offset,
1770 GFP_KERNEL & mapping_gfp_mask(mapping));
1760 if (ret == 0) 1771 if (ret == 0)
1761 ret = mapping->a_ops->readpage(file, page); 1772 ret = mapping->a_ops->readpage(file, page);
1762 else if (ret == -EEXIST) 1773 else if (ret == -EEXIST)
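
The add_to_page_cache_lru() call sites above now intersect GFP_KERNEL with the mapping's own gfp mask, so the allocation never uses flags the mapping forbids. A standalone sketch of that intersection, with made-up flag values rather than the kernel's:

#include <stdio.h>

#define GFP_WAIT   0x1u
#define GFP_IO     0x2u
#define GFP_FS     0x4u
#define GFP_KERNEL (GFP_WAIT | GFP_IO | GFP_FS)

int main(void)
{
	unsigned int mapping_gfp = GFP_WAIT | GFP_IO;		/* e.g. a mapping that disallows FS recursion */
	unsigned int effective	 = GFP_KERNEL & mapping_gfp;

	printf("0x%x\n", effective);	/* 0x3: GFP_FS has been dropped */
	return 0;
}
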
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809eb085..27a9924caf61 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
21#include <linux/swapfile.h> 21#include <linux/swapfile.h>
22 22
23/* 23/*
24 * frontswap_ops is set by frontswap_register_ops to contain the pointers 24 * frontswap_ops are added by frontswap_register_ops, and provide the
25 * to the frontswap "backend" implementation functions. 25 * frontswap "backend" implementation functions. Multiple implementations
26 * may be registered, but implementations can never deregister. This
27 * is a simple singly-linked list of all registered implementations.
26 */ 28 */
27static struct frontswap_ops *frontswap_ops __read_mostly; 29static struct frontswap_ops *frontswap_ops __read_mostly;
28 30
31#define for_each_frontswap_ops(ops) \
32 for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
33
29/* 34/*
30 * If enabled, frontswap_store will return failure even on success. As 35 * If enabled, frontswap_store will return failure even on success. As
31 * a result, the swap subsystem will always write the page to swap, in 36 * a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
79 * on all frontswap functions to not call the backend until the backend 84 * on all frontswap functions to not call the backend until the backend
80 * has registered. 85 * has registered.
81 * 86 *
82 * Specifically when no backend is registered (nobody called
83 * frontswap_register_ops) all calls to frontswap_init (which is done via
84 * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
85 * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
86 * backend registers with frontswap at some later point the previous
87 * calls to frontswap_init are executed (by iterating over the need_init
88 * bitmap) to create tmem_pools and set the respective poolids. All of that is
89 * guarded by us using atomic bit operations on the 'need_init' bitmap.
90 *
 91 * This would not guard us against the user deciding to call swapoff right as 87 * This would not guard us against the user deciding to call swapoff right as
92 * we are calling the backend to initialize (so swapon is in action). 88 * we are calling the backend to initialize (so swapon is in action).
 93 * Fortunately for us, the swapon_mutex has been taken by the callee so we are 89 * Fortunately for us, the swapon_mutex has been taken by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
106 * 102 *
107 * Obviously the opposite (unloading the backend) must be done after all 103 * Obviously the opposite (unloading the backend) must be done after all
108 * the frontswap_[store|load|invalidate_area|invalidate_page] start 104 * the frontswap_[store|load|invalidate_area|invalidate_page] start
109 * ignorning or failing the requests - at which point frontswap_ops 105 * ignoring or failing the requests. However, there is currently no way
110 * would have to be made in some fashion atomic. 106 * to unload a backend once it is registered.
111 */ 107 */
112static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
113 108
114/* 109/*
115 * Register operations for frontswap, returning previous thus allowing 110 * Register operations for frontswap
116 * detection of multiple backends and possible nesting.
117 */ 111 */
118struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) 112void frontswap_register_ops(struct frontswap_ops *ops)
119{ 113{
120 struct frontswap_ops *old = frontswap_ops; 114 DECLARE_BITMAP(a, MAX_SWAPFILES);
121 int i; 115 DECLARE_BITMAP(b, MAX_SWAPFILES);
122 116 struct swap_info_struct *si;
123 for (i = 0; i < MAX_SWAPFILES; i++) { 117 unsigned int i;
124 if (test_and_clear_bit(i, need_init)) { 118
125 struct swap_info_struct *sis = swap_info[i]; 119 bitmap_zero(a, MAX_SWAPFILES);
126 /* __frontswap_init _should_ have set it! */ 120 bitmap_zero(b, MAX_SWAPFILES);
127 if (!sis->frontswap_map) 121
128 return ERR_PTR(-EINVAL); 122 spin_lock(&swap_lock);
129 ops->init(i); 123 plist_for_each_entry(si, &swap_active_head, list) {
130 } 124 if (!WARN_ON(!si->frontswap_map))
125 set_bit(si->type, a);
131 } 126 }
127 spin_unlock(&swap_lock);
128
129 /* the new ops needs to know the currently active swap devices */
130 for_each_set_bit(i, a, MAX_SWAPFILES)
131 ops->init(i);
132
132 /* 133 /*
133 * We MUST have frontswap_ops set _after_ the frontswap_init's 134 * Setting frontswap_ops must happen after the ops->init() calls
134 * have been called. Otherwise __frontswap_store might fail. Hence 135 * above; cmpxchg implies smp_mb() which will ensure the init is
135 * the barrier to make sure compiler does not re-order us. 136 * complete at this point.
136 */ 137 */
137 barrier(); 138 do {
138 frontswap_ops = ops; 139 ops->next = frontswap_ops;
139 return old; 140 } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
141
142 spin_lock(&swap_lock);
143 plist_for_each_entry(si, &swap_active_head, list) {
144 if (si->frontswap_map)
145 set_bit(si->type, b);
146 }
147 spin_unlock(&swap_lock);
148
149 /*
150 * On the very unlikely chance that a swap device was added or
151 * removed between setting the "a" list bits and the ops init
152 * calls, we re-check and do init or invalidate for any changed
153 * bits.
154 */
155 if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
156 for (i = 0; i < MAX_SWAPFILES; i++) {
157 if (!test_bit(i, a) && test_bit(i, b))
158 ops->init(i);
159 else if (test_bit(i, a) && !test_bit(i, b))
160 ops->invalidate_area(i);
161 }
162 }
140} 163}
141EXPORT_SYMBOL(frontswap_register_ops); 164EXPORT_SYMBOL(frontswap_register_ops);
142 165
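frontswap_register_ops() now pushes the new backend onto a singly-linked list with a cmpxchg loop; the implied full barrier guarantees the ops->init() calls above are visible before the list head is published. A compilable userspace sketch of that lock-free push, using the GCC/Clang __atomic builtins in place of the kernel's cmpxchg (struct and function names are illustrative):

    #include <stdio.h>

    struct ops {
            const char *name;
            struct ops *next;
    };

    static struct ops *ops_list;    /* head of the singly-linked list */

    #define for_each_ops(o) for ((o) = ops_list; (o); (o) = (o)->next)

    static void register_ops(struct ops *new)
    {
            struct ops *head;

            /* lock-free push: retry until the head we linked against is still the head */
            do {
                    head = __atomic_load_n(&ops_list, __ATOMIC_RELAXED);
                    new->next = head;
            } while (!__atomic_compare_exchange_n(&ops_list, &head, new, 0,
                                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
    }

    int main(void)
    {
            static struct ops a = { "backend-a", NULL }, b = { "backend-b", NULL };
            struct ops *o;

            register_ops(&a);
            register_ops(&b);       /* the most recently registered backend is walked first */
            for_each_ops(o)
                    printf("%s\n", o->name);
            return 0;
    }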
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
164void __frontswap_init(unsigned type, unsigned long *map) 187void __frontswap_init(unsigned type, unsigned long *map)
165{ 188{
166 struct swap_info_struct *sis = swap_info[type]; 189 struct swap_info_struct *sis = swap_info[type];
190 struct frontswap_ops *ops;
167 191
168 BUG_ON(sis == NULL); 192 BUG_ON(sis == NULL);
169 193
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
179 * p->frontswap set to something valid to work properly. 203 * p->frontswap set to something valid to work properly.
180 */ 204 */
181 frontswap_map_set(sis, map); 205 frontswap_map_set(sis, map);
182 if (frontswap_ops) 206
183 frontswap_ops->init(type); 207 for_each_frontswap_ops(ops)
184 else { 208 ops->init(type);
185 BUG_ON(type >= MAX_SWAPFILES);
186 set_bit(type, need_init);
187 }
188} 209}
189EXPORT_SYMBOL(__frontswap_init); 210EXPORT_SYMBOL(__frontswap_init);
190 211
191bool __frontswap_test(struct swap_info_struct *sis, 212bool __frontswap_test(struct swap_info_struct *sis,
192 pgoff_t offset) 213 pgoff_t offset)
193{ 214{
194 bool ret = false; 215 if (sis->frontswap_map)
195 216 return test_bit(offset, sis->frontswap_map);
196 if (frontswap_ops && sis->frontswap_map) 217 return false;
197 ret = test_bit(offset, sis->frontswap_map);
198 return ret;
199} 218}
200EXPORT_SYMBOL(__frontswap_test); 219EXPORT_SYMBOL(__frontswap_test);
201 220
221static inline void __frontswap_set(struct swap_info_struct *sis,
222 pgoff_t offset)
223{
224 set_bit(offset, sis->frontswap_map);
225 atomic_inc(&sis->frontswap_pages);
226}
227
202static inline void __frontswap_clear(struct swap_info_struct *sis, 228static inline void __frontswap_clear(struct swap_info_struct *sis,
203 pgoff_t offset) 229 pgoff_t offset)
204{ 230{
205 clear_bit(offset, sis->frontswap_map); 231 clear_bit(offset, sis->frontswap_map);
206 atomic_dec(&sis->frontswap_pages); 232 atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
215 */ 241 */
216int __frontswap_store(struct page *page) 242int __frontswap_store(struct page *page)
217{ 243{
218 int ret = -1, dup = 0; 244 int ret = -1;
219 swp_entry_t entry = { .val = page_private(page), }; 245 swp_entry_t entry = { .val = page_private(page), };
220 int type = swp_type(entry); 246 int type = swp_type(entry);
221 struct swap_info_struct *sis = swap_info[type]; 247 struct swap_info_struct *sis = swap_info[type];
222 pgoff_t offset = swp_offset(entry); 248 pgoff_t offset = swp_offset(entry);
249 struct frontswap_ops *ops;
223 250
224 /* 251 /*
 225 * Return if no backend registered. 252 * Return if no backend registered.
226 * Don't need to inc frontswap_failed_stores here. 253 * Don't need to inc frontswap_failed_stores here.
227 */ 254 */
228 if (!frontswap_ops) 255 if (!frontswap_ops)
229 return ret; 256 return -1;
230 257
231 BUG_ON(!PageLocked(page)); 258 BUG_ON(!PageLocked(page));
232 BUG_ON(sis == NULL); 259 BUG_ON(sis == NULL);
233 if (__frontswap_test(sis, offset)) 260
234 dup = 1; 261 /*
235 ret = frontswap_ops->store(type, offset, page); 262 * If a dup, we must remove the old page first; we can't leave the
263 * old page no matter if the store of the new page succeeds or fails,
264 * and we can't rely on the new page replacing the old page as we may
265 * not store to the same implementation that contains the old page.
266 */
267 if (__frontswap_test(sis, offset)) {
268 __frontswap_clear(sis, offset);
269 for_each_frontswap_ops(ops)
270 ops->invalidate_page(type, offset);
271 }
272
273 /* Try to store in each implementation, until one succeeds. */
274 for_each_frontswap_ops(ops) {
275 ret = ops->store(type, offset, page);
276 if (!ret) /* successful store */
277 break;
278 }
236 if (ret == 0) { 279 if (ret == 0) {
237 set_bit(offset, sis->frontswap_map); 280 __frontswap_set(sis, offset);
238 inc_frontswap_succ_stores(); 281 inc_frontswap_succ_stores();
239 if (!dup)
240 atomic_inc(&sis->frontswap_pages);
241 } else { 282 } else {
242 /*
243 failed dup always results in automatic invalidate of
244 the (older) page from frontswap
245 */
246 inc_frontswap_failed_stores(); 283 inc_frontswap_failed_stores();
247 if (dup) {
248 __frontswap_clear(sis, offset);
249 frontswap_ops->invalidate_page(type, offset);
250 }
251 } 284 }
252 if (frontswap_writethrough_enabled) 285 if (frontswap_writethrough_enabled)
253 /* report failure so swap also writes to swap device */ 286 /* report failure so swap also writes to swap device */
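__frontswap_store() now drops any duplicate copy up front, since the old page may live in a different backend than the one the new store lands in, and then walks the backend list until one store succeeds. A hedged userspace sketch of that "invalidate everywhere, then first-success-wins" loop; the backend struct and the stub implementations are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    struct backend {
            const char *name;
            int (*store)(unsigned long offset);
            void (*invalidate)(unsigned long offset);
            struct backend *next;
    };

    static int fail_store(unsigned long off) { (void)off; return -1; }
    static int ok_store(unsigned long off) { printf("stored %lu\n", off); return 0; }
    static void inval(unsigned long off) { printf("invalidate %lu\n", off); }

    /* Try each registered backend until one accepts the page. */
    static int store_page(struct backend *list, bool dup, unsigned long offset)
    {
            struct backend *b;
            int ret = -1;

            if (dup)        /* the old copy may live in any backend: drop it everywhere */
                    for (b = list; b; b = b->next)
                            b->invalidate(offset);

            for (b = list; b; b = b->next) {
                    ret = b->store(offset);
                    if (!ret)       /* first successful store wins */
                            break;
            }
            return ret;
    }

    int main(void)
    {
            struct backend b2 = { "second", ok_store, inval, NULL };
            struct backend b1 = { "first", fail_store, inval, &b2 };

            return store_page(&b1, true, 42);
    }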
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
268 int type = swp_type(entry); 301 int type = swp_type(entry);
269 struct swap_info_struct *sis = swap_info[type]; 302 struct swap_info_struct *sis = swap_info[type];
270 pgoff_t offset = swp_offset(entry); 303 pgoff_t offset = swp_offset(entry);
304 struct frontswap_ops *ops;
305
306 if (!frontswap_ops)
307 return -1;
271 308
272 BUG_ON(!PageLocked(page)); 309 BUG_ON(!PageLocked(page));
273 BUG_ON(sis == NULL); 310 BUG_ON(sis == NULL);
274 /* 311 if (!__frontswap_test(sis, offset))
275 * __frontswap_test() will check whether there is backend registered 312 return -1;
276 */ 313
277 if (__frontswap_test(sis, offset)) 314 /* Try loading from each implementation, until one succeeds. */
278 ret = frontswap_ops->load(type, offset, page); 315 for_each_frontswap_ops(ops) {
316 ret = ops->load(type, offset, page);
317 if (!ret) /* successful load */
318 break;
319 }
279 if (ret == 0) { 320 if (ret == 0) {
280 inc_frontswap_loads(); 321 inc_frontswap_loads();
281 if (frontswap_tmem_exclusive_gets_enabled) { 322 if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
294void __frontswap_invalidate_page(unsigned type, pgoff_t offset) 335void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
295{ 336{
296 struct swap_info_struct *sis = swap_info[type]; 337 struct swap_info_struct *sis = swap_info[type];
338 struct frontswap_ops *ops;
339
340 if (!frontswap_ops)
341 return;
297 342
298 BUG_ON(sis == NULL); 343 BUG_ON(sis == NULL);
299 /* 344 if (!__frontswap_test(sis, offset))
300 * __frontswap_test() will check whether there is backend registered 345 return;
301 */ 346
302 if (__frontswap_test(sis, offset)) { 347 for_each_frontswap_ops(ops)
303 frontswap_ops->invalidate_page(type, offset); 348 ops->invalidate_page(type, offset);
304 __frontswap_clear(sis, offset); 349 __frontswap_clear(sis, offset);
305 inc_frontswap_invalidates(); 350 inc_frontswap_invalidates();
306 }
307} 351}
308EXPORT_SYMBOL(__frontswap_invalidate_page); 352EXPORT_SYMBOL(__frontswap_invalidate_page);
309 353
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
314void __frontswap_invalidate_area(unsigned type) 358void __frontswap_invalidate_area(unsigned type)
315{ 359{
316 struct swap_info_struct *sis = swap_info[type]; 360 struct swap_info_struct *sis = swap_info[type];
361 struct frontswap_ops *ops;
317 362
318 if (frontswap_ops) { 363 if (!frontswap_ops)
319 BUG_ON(sis == NULL); 364 return;
320 if (sis->frontswap_map == NULL) 365
321 return; 366 BUG_ON(sis == NULL);
322 frontswap_ops->invalidate_area(type); 367 if (sis->frontswap_map == NULL)
323 atomic_set(&sis->frontswap_pages, 0); 368 return;
324 bitmap_zero(sis->frontswap_map, sis->max); 369
325 } 370 for_each_frontswap_ops(ops)
326 clear_bit(type, need_init); 371 ops->invalidate_area(type);
372 atomic_set(&sis->frontswap_pages, 0);
373 bitmap_zero(sis->frontswap_map, sis->max);
327} 374}
328EXPORT_SYMBOL(__frontswap_invalidate_area); 375EXPORT_SYMBOL(__frontswap_invalidate_area);
329 376
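The new __frontswap_set() helper mirrors __frontswap_clear(): the per-offset bitmap and the frontswap_pages counter are always updated together, so they can no longer drift apart as they could when the bit and the counter were handled separately in __frontswap_store(). A plain-C model of that paired update, with an ordinary bitmap and a long standing in for the kernel primitives:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define MAP_BITS 4096
    #define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long map[MAP_BITS / WORD_BITS];
    static long pages;      /* models atomic_t frontswap_pages */

    static void fs_set(unsigned long off)
    {
            map[off / WORD_BITS] |= 1UL << (off % WORD_BITS);
            pages++;        /* always adjusted together with the bitmap */
    }

    static void fs_clear(unsigned long off)
    {
            map[off / WORD_BITS] &= ~(1UL << (off % WORD_BITS));
            pages--;
    }

    static bool fs_test(unsigned long off)
    {
            return map[off / WORD_BITS] & (1UL << (off % WORD_BITS));
    }

    int main(void)
    {
            fs_set(10);
            printf("offset 10 present: %d, pages: %ld\n", fs_test(10), pages);
            fs_clear(10);
            printf("offset 10 present: %d, pages: %ld\n", fs_test(10), pages);
            return 0;
    }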
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf3636..c107094f79ba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1031 goto out_free_pages; 1031 goto out_free_pages;
1032 VM_BUG_ON_PAGE(!PageHead(page), page); 1032 VM_BUG_ON_PAGE(!PageHead(page), page);
1033 1033
1034 pmdp_clear_flush_notify(vma, haddr, pmd); 1034 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1035 /* leave pmd empty until pte is filled */ 1035 /* leave pmd empty until pte is filled */
1036 1036
1037 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1037 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1174,7 @@ alloc:
1174 pmd_t entry; 1174 pmd_t entry;
1175 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1175 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1176 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1176 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1177 pmdp_clear_flush_notify(vma, haddr, pmd); 1177 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1178 page_add_new_anon_rmap(new_page, vma, haddr); 1178 page_add_new_anon_rmap(new_page, vma, haddr);
1179 mem_cgroup_commit_charge(new_page, memcg, false); 1179 mem_cgroup_commit_charge(new_page, memcg, false);
1180 lru_cache_add_active_or_unevictable(new_page, vma); 1180 lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1396 pmd_t orig_pmd; 1396 pmd_t orig_pmd;
1397 /* 1397 /*
1398 * For architectures like ppc64 we look at deposited pgtable 1398 * For architectures like ppc64 we look at deposited pgtable
1399 * when calling pmdp_get_and_clear. So do the 1399 * when calling pmdp_huge_get_and_clear. So do the
1400 * pgtable_trans_huge_withdraw after finishing pmdp related 1400 * pgtable_trans_huge_withdraw after finishing pmdp related
1401 * operations. 1401 * operations.
1402 */ 1402 */
1403 orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, 1403 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1404 tlb->fullmm); 1404 tlb->fullmm);
1405 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1405 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1406 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); 1406 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
1407 if (is_huge_zero_pmd(orig_pmd)) { 1407 if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1459 new_ptl = pmd_lockptr(mm, new_pmd); 1459 new_ptl = pmd_lockptr(mm, new_pmd);
1460 if (new_ptl != old_ptl) 1460 if (new_ptl != old_ptl)
1461 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1461 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1462 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1462 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1463 VM_BUG_ON(!pmd_none(*new_pmd)); 1463 VM_BUG_ON(!pmd_none(*new_pmd));
1464 1464
1465 if (pmd_move_must_withdraw(new_ptl, old_ptl)) { 1465 if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1505 } 1505 }
1506 1506
1507 if (!prot_numa || !pmd_protnone(*pmd)) { 1507 if (!prot_numa || !pmd_protnone(*pmd)) {
1508 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1508 entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
1509 entry = pmd_modify(entry, newprot); 1509 entry = pmd_modify(entry, newprot);
1510 if (preserve_write) 1510 if (preserve_write)
1511 entry = pmd_mkwrite(entry); 1511 entry = pmd_mkwrite(entry);
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2499 * huge and small TLB entries for the same virtual address 2499 * huge and small TLB entries for the same virtual address
2500 * to avoid the risk of CPU bugs in that area. 2500 * to avoid the risk of CPU bugs in that area.
2501 */ 2501 */
2502 _pmd = pmdp_clear_flush(vma, address, pmd); 2502 _pmd = pmdp_collapse_flush(vma, address, pmd);
2503 spin_unlock(pmd_ptl); 2503 spin_unlock(pmd_ptl);
2504 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2504 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2505 2505
@@ -2799,7 +2799,7 @@ static void khugepaged_do_scan(void)
2799 2799
2800 cond_resched(); 2800 cond_resched();
2801 2801
2802 if (unlikely(kthread_should_stop() || freezing(current))) 2802 if (unlikely(kthread_should_stop() || try_to_freeze()))
2803 break; 2803 break;
2804 2804
2805 spin_lock(&khugepaged_mm_lock); 2805 spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2820,6 @@ static void khugepaged_do_scan(void)
2820 2820
2821static void khugepaged_wait_work(void) 2821static void khugepaged_wait_work(void)
2822{ 2822{
2823 try_to_freeze();
2824
2825 if (khugepaged_has_work()) { 2823 if (khugepaged_has_work()) {
2826 if (!khugepaged_scan_sleep_millisecs) 2824 if (!khugepaged_scan_sleep_millisecs)
2827 return; 2825 return;
@@ -2865,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2865 pmd_t _pmd; 2863 pmd_t _pmd;
2866 int i; 2864 int i;
2867 2865
2868 pmdp_clear_flush_notify(vma, haddr, pmd); 2866 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2869 /* leave pmd empty until pte is filled */ 2867 /* leave pmd empty until pte is filled */
2870 2868
2871 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2869 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
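Besides the pmdp_* renames, the huge_memory.c hunks move the freezer handling: khugepaged now calls try_to_freeze() at the same break point where kthread_should_stop() is checked, instead of testing freezing() there and freezing later in the wait path. A rough userspace sketch of that batch-loop shape; only the inner batch loop is modelled and both predicates are plain stubs:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins for kthread_should_stop() and try_to_freeze(). */
    static bool should_stop(void) { static int calls; return ++calls >= 4; }
    static bool try_freeze(void) { return false; }  /* would sleep while the system freezes */

    int main(void)
    {
            unsigned int batches = 0;

            for (;;) {
                    batches++;      /* ... scan one batch of mm's ... */

                    /* stop and freeze are both handled at the same break point;
                     * the outer thread loop would re-evaluate after a freeze */
                    if (should_stop() || try_freeze())
                            break;
            }
            printf("ran %u scan batches\n", batches);
            return 0;
    }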
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 271e4432734c..75c0eef52c5d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
40int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
41unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
42struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
43/*
44 * Minimum page order among possible hugepage sizes, set to a proper value
45 * at boot time.
46 */
47static unsigned int minimum_order __read_mostly = UINT_MAX;
43 48
44__initdata LIST_HEAD(huge_boot_pages); 49__initdata LIST_HEAD(huge_boot_pages);
45 50
@@ -212,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
212 * Region tracking -- allows tracking of reservations and instantiated pages 217 * Region tracking -- allows tracking of reservations and instantiated pages
213 * across the pages in a mapping. 218 * across the pages in a mapping.
214 * 219 *
215 * The region data structures are embedded into a resv_map and 220 * The region data structures are embedded into a resv_map and protected
216 * protected by a resv_map's lock 221 * by a resv_map's lock. The set of regions within the resv_map represent
222 * reservations for huge pages, or huge pages that have already been
223 * instantiated within the map. The from and to elements are huge page
 224 * indices into the associated mapping. from indicates the starting index
225 * of the region. to represents the first index past the end of the region.
226 *
227 * For example, a file region structure with from == 0 and to == 4 represents
228 * four huge pages in a mapping. It is important to note that the to element
229 * represents the first element past the end of the region. This is used in
230 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
231 *
232 * Interval notation of the form [from, to) will be used to indicate that
233 * the endpoint from is inclusive and to is exclusive.
217 */ 234 */
218struct file_region { 235struct file_region {
219 struct list_head link; 236 struct list_head link;
@@ -221,10 +238,22 @@ struct file_region {
221 long to; 238 long to;
222}; 239};
223 240
241/*
242 * Add the huge page range represented by [f, t) to the reserve
243 * map. Existing regions will be expanded to accommodate the
244 * specified range. We know only existing regions need to be
245 * expanded, because region_add is only called after region_chg
246 * with the same range. If a new file_region structure must
247 * be allocated, it is done in region_chg.
248 *
249 * Return the number of new huge pages added to the map. This
250 * number is greater than or equal to zero.
251 */
224static long region_add(struct resv_map *resv, long f, long t) 252static long region_add(struct resv_map *resv, long f, long t)
225{ 253{
226 struct list_head *head = &resv->regions; 254 struct list_head *head = &resv->regions;
227 struct file_region *rg, *nrg, *trg; 255 struct file_region *rg, *nrg, *trg;
256 long add = 0;
228 257
229 spin_lock(&resv->lock); 258 spin_lock(&resv->lock);
230 /* Locate the region we are either in or before. */ 259 /* Locate the region we are either in or before. */
@@ -250,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t)
250 if (rg->to > t) 279 if (rg->to > t)
251 t = rg->to; 280 t = rg->to;
252 if (rg != nrg) { 281 if (rg != nrg) {
282 /* Decrement return value by the deleted range.
283 * Another range will span this area so that by
284 * end of routine add will be >= zero
285 */
286 add -= (rg->to - rg->from);
253 list_del(&rg->link); 287 list_del(&rg->link);
254 kfree(rg); 288 kfree(rg);
255 } 289 }
256 } 290 }
291
292 add += (nrg->from - f); /* Added to beginning of region */
257 nrg->from = f; 293 nrg->from = f;
294 add += t - nrg->to; /* Added to end of region */
258 nrg->to = t; 295 nrg->to = t;
296
259 spin_unlock(&resv->lock); 297 spin_unlock(&resv->lock);
260 return 0; 298 VM_BUG_ON(add < 0);
299 return add;
261} 300}
262 301
302/*
303 * Examine the existing reserve map and determine how many
304 * huge pages in the specified range [f, t) are NOT currently
305 * represented. This routine is called before a subsequent
306 * call to region_add that will actually modify the reserve
307 * map to add the specified range [f, t). region_chg does
308 * not change the number of huge pages represented by the
309 * map. However, if the existing regions in the map can not
310 * be expanded to represent the new range, a new file_region
311 * structure is added to the map as a placeholder. This is
312 * so that the subsequent region_add call will have all the
313 * regions it needs and will not fail.
314 *
315 * Returns the number of huge pages that need to be added
316 * to the existing reservation map for the range [f, t).
317 * This number is greater or equal to zero. -ENOMEM is
318 * returned if a new file_region structure is needed and can
319 * not be allocated.
320 */
263static long region_chg(struct resv_map *resv, long f, long t) 321static long region_chg(struct resv_map *resv, long f, long t)
264{ 322{
265 struct list_head *head = &resv->regions; 323 struct list_head *head = &resv->regions;
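The new reserve-map comments describe half-open [from, to) regions and the region_chg()/region_add() pairing: region_chg() reports how many pages of [f, t) are not yet covered (allocating a placeholder region if needed), and the later region_add() returns how many it actually added, normally the same number. A compact userspace model of the interval arithmetic only, using a sorted array of non-overlapping regions in place of the kernel's locked list; region_add() itself is not modelled:

    #include <stdio.h>

    /* Half-open reserved intervals, kept sorted and non-overlapping. */
    struct region { long from, to; };

    /* How many pages of [f, t) are NOT yet covered by the existing regions? */
    static long region_chg(const struct region *r, int n, long f, long t)
    {
            long missing = t - f;
            int i;

            for (i = 0; i < n; i++) {
                    long lo = r[i].from > f ? r[i].from : f;
                    long hi = r[i].to < t ? r[i].to : t;

                    if (hi > lo)
                            missing -= hi - lo;     /* already-reserved overlap */
            }
            return missing;
    }

    int main(void)
    {
            struct region map[] = { { 0, 4 }, { 8, 10 } };  /* pages 0-3 and 8-9 */

            /* [2, 9) spans 7 pages; 3 of them (2, 3 and 8) are covered, so 4 are missing */
            printf("missing in [2, 9): %ld\n", region_chg(map, 2, 2, 9));
            return 0;
    }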
@@ -326,6 +384,11 @@ out_nrg:
326 return chg; 384 return chg;
327} 385}
328 386
387/*
388 * Truncate the reserve map at index 'end'. Modify/truncate any
389 * region which contains end. Delete any regions past end.
390 * Return the number of huge pages removed from the map.
391 */
329static long region_truncate(struct resv_map *resv, long end) 392static long region_truncate(struct resv_map *resv, long end)
330{ 393{
331 struct list_head *head = &resv->regions; 394 struct list_head *head = &resv->regions;
@@ -361,6 +424,10 @@ out:
361 return chg; 424 return chg;
362} 425}
363 426
427/*
428 * Count and return the number of huge pages in the reserve map
429 * that intersect with the range [f, t).
430 */
364static long region_count(struct resv_map *resv, long f, long t) 431static long region_count(struct resv_map *resv, long f, long t)
365{ 432{
366 struct list_head *head = &resv->regions; 433 struct list_head *head = &resv->regions;
@@ -1188,19 +1255,13 @@ static void dissolve_free_huge_page(struct page *page)
1188 */ 1255 */
1189void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1256void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1190{ 1257{
1191 unsigned int order = 8 * sizeof(void *);
1192 unsigned long pfn; 1258 unsigned long pfn;
1193 struct hstate *h;
1194 1259
1195 if (!hugepages_supported()) 1260 if (!hugepages_supported())
1196 return; 1261 return;
1197 1262
1198 /* Set scan step to minimum hugepage size */ 1263 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
1199 for_each_hstate(h) 1264 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
1200 if (order > huge_page_order(h))
1201 order = huge_page_order(h);
1202 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
1203 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
1204 dissolve_free_huge_page(pfn_to_page(pfn)); 1265 dissolve_free_huge_page(pfn_to_page(pfn));
1205} 1266}
1206 1267
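dissolve_free_huge_pages() now strides through the PFN range by 1 << minimum_order, a value computed once in hugetlb_init_hstates() as the smallest order of any registered hstate, rather than rescanning the hstates on every call. A small sketch of that stride computation; the example orders are the usual x86-64 2MB and 1GB hugepage orders and are only illustrative:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int orders[] = { 9, 18 };      /* e.g. 2MB (order 9) and 1GB (order 18) */
            unsigned int minimum_order = UINT_MAX;
            unsigned long start_pfn = 0, end_pfn = 1UL << 12, pfn;
            unsigned long steps = 0;
            unsigned int i;

            /* smallest hugepage order among all registered sizes */
            for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++)
                    if (orders[i] < minimum_order)
                            minimum_order = orders[i];

            /* visit one candidate per smallest hugepage, not per base page */
            for (pfn = start_pfn; pfn < end_pfn; pfn += 1UL << minimum_order)
                    steps++;

            printf("minimum_order=%u, %lu candidates in %lu pfns\n",
                   minimum_order, steps, end_pfn - start_pfn);
            return 0;
    }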
@@ -1423,46 +1484,56 @@ static void return_unused_surplus_pages(struct hstate *h,
1423} 1484}
1424 1485
1425/* 1486/*
1426 * Determine if the huge page at addr within the vma has an associated 1487 * vma_needs_reservation and vma_commit_reservation are used by the huge
1427 * reservation. Where it does not we will need to logically increase 1488 * page allocation routines to manage reservations.
1428 * reservation and actually increase subpool usage before an allocation 1489 *
1429 * can occur. Where any new reservation would be required the 1490 * vma_needs_reservation is called to determine if the huge page at addr
1430 * reservation change is prepared, but not committed. Once the page 1491 * within the vma has an associated reservation. If a reservation is
1431 * has been allocated from the subpool and instantiated the change should 1492 * needed, the value 1 is returned. The caller is then responsible for
1432 * be committed via vma_commit_reservation. No action is required on 1493 * managing the global reservation and subpool usage counts. After
1433 * failure. 1494 * the huge page has been allocated, vma_commit_reservation is called
1495 * to add the page to the reservation map.
1496 *
1497 * In the normal case, vma_commit_reservation returns the same value
1498 * as the preceding vma_needs_reservation call. The only time this
1499 * is not the case is if a reserve map was changed between calls. It
1500 * is the responsibility of the caller to notice the difference and
1501 * take appropriate action.
1434 */ 1502 */
1435static long vma_needs_reservation(struct hstate *h, 1503static long __vma_reservation_common(struct hstate *h,
1436 struct vm_area_struct *vma, unsigned long addr) 1504 struct vm_area_struct *vma, unsigned long addr,
1505 bool commit)
1437{ 1506{
1438 struct resv_map *resv; 1507 struct resv_map *resv;
1439 pgoff_t idx; 1508 pgoff_t idx;
1440 long chg; 1509 long ret;
1441 1510
1442 resv = vma_resv_map(vma); 1511 resv = vma_resv_map(vma);
1443 if (!resv) 1512 if (!resv)
1444 return 1; 1513 return 1;
1445 1514
1446 idx = vma_hugecache_offset(h, vma, addr); 1515 idx = vma_hugecache_offset(h, vma, addr);
1447 chg = region_chg(resv, idx, idx + 1); 1516 if (commit)
1517 ret = region_add(resv, idx, idx + 1);
1518 else
1519 ret = region_chg(resv, idx, idx + 1);
1448 1520
1449 if (vma->vm_flags & VM_MAYSHARE) 1521 if (vma->vm_flags & VM_MAYSHARE)
1450 return chg; 1522 return ret;
1451 else 1523 else
1452 return chg < 0 ? chg : 0; 1524 return ret < 0 ? ret : 0;
1453} 1525}
1454static void vma_commit_reservation(struct hstate *h, 1526
1527static long vma_needs_reservation(struct hstate *h,
1455 struct vm_area_struct *vma, unsigned long addr) 1528 struct vm_area_struct *vma, unsigned long addr)
1456{ 1529{
1457 struct resv_map *resv; 1530 return __vma_reservation_common(h, vma, addr, false);
1458 pgoff_t idx; 1531}
1459
1460 resv = vma_resv_map(vma);
1461 if (!resv)
1462 return;
1463 1532
1464 idx = vma_hugecache_offset(h, vma, addr); 1533static long vma_commit_reservation(struct hstate *h,
1465 region_add(resv, idx, idx + 1); 1534 struct vm_area_struct *vma, unsigned long addr)
1535{
1536 return __vma_reservation_common(h, vma, addr, true);
1466} 1537}
1467 1538
1468static struct page *alloc_huge_page(struct vm_area_struct *vma, 1539static struct page *alloc_huge_page(struct vm_area_struct *vma,
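vma_needs_reservation() and vma_commit_reservation() collapse into __vma_reservation_common(), which differs only in whether it calls region_chg() or region_add(); both wrappers now return a count so alloc_huge_page() can compare the two and spot a racing reserve-map update. A hedged sketch of that wrapper shape with the region calls reduced to stubs:

    #include <stdio.h>

    /* Stubs standing in for region_chg()/region_add() on a reserve map. */
    static long region_chg_stub(long idx) { (void)idx; return 1; }
    static long region_add_stub(long idx) { (void)idx; return 1; }

    static long reservation_common(long idx, int commit, int may_share)
    {
            long ret = commit ? region_add_stub(idx) : region_chg_stub(idx);

            /* shared mappings report the raw count; private ones only errors */
            if (may_share)
                    return ret;
            return ret < 0 ? ret : 0;
    }

    static long needs_reservation(long idx, int may_share)
    {
            return reservation_common(idx, 0, may_share);
    }

    static long commit_reservation(long idx, int may_share)
    {
            return reservation_common(idx, 1, may_share);
    }

    int main(void)
    {
            long chg = needs_reservation(3, 1);
            long commit = commit_reservation(3, 1);

            if (chg > commit)
                    printf("race: reserve map changed between the two calls\n");
            else
                    printf("chg=%ld commit=%ld, no adjustment needed\n", chg, commit);
            return 0;
    }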
@@ -1471,7 +1542,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1471 struct hugepage_subpool *spool = subpool_vma(vma); 1542 struct hugepage_subpool *spool = subpool_vma(vma);
1472 struct hstate *h = hstate_vma(vma); 1543 struct hstate *h = hstate_vma(vma);
1473 struct page *page; 1544 struct page *page;
1474 long chg; 1545 long chg, commit;
1475 int ret, idx; 1546 int ret, idx;
1476 struct hugetlb_cgroup *h_cg; 1547 struct hugetlb_cgroup *h_cg;
1477 1548
@@ -1512,7 +1583,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1512 1583
1513 set_page_private(page, (unsigned long)spool); 1584 set_page_private(page, (unsigned long)spool);
1514 1585
1515 vma_commit_reservation(h, vma, addr); 1586 commit = vma_commit_reservation(h, vma, addr);
1587 if (unlikely(chg > commit)) {
1588 /*
1589 * The page was added to the reservation map between
1590 * vma_needs_reservation and vma_commit_reservation.
1591 * This indicates a race with hugetlb_reserve_pages.
1592 * Adjust for the subpool count incremented above AND
1593 * in hugetlb_reserve_pages for the same page. Also,
1594 * the reservation count added in hugetlb_reserve_pages
1595 * no longer applies.
1596 */
1597 long rsv_adjust;
1598
1599 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
1600 hugetlb_acct_memory(h, -rsv_adjust);
1601 }
1516 return page; 1602 return page;
1517 1603
1518out_uncharge_cgroup: 1604out_uncharge_cgroup:
@@ -1627,10 +1713,14 @@ static void __init hugetlb_init_hstates(void)
1627 struct hstate *h; 1713 struct hstate *h;
1628 1714
1629 for_each_hstate(h) { 1715 for_each_hstate(h) {
1716 if (minimum_order > huge_page_order(h))
1717 minimum_order = huge_page_order(h);
1718
1630 /* oversize hugepages were init'ed in early boot */ 1719 /* oversize hugepages were init'ed in early boot */
1631 if (!hstate_is_gigantic(h)) 1720 if (!hstate_is_gigantic(h))
1632 hugetlb_hstate_alloc_pages(h); 1721 hugetlb_hstate_alloc_pages(h);
1633 } 1722 }
1723 VM_BUG_ON(minimum_order == UINT_MAX);
1634} 1724}
1635 1725
1636static char * __init memfmt(char *buf, unsigned long n) 1726static char * __init memfmt(char *buf, unsigned long n)
@@ -3626,8 +3716,24 @@ int hugetlb_reserve_pages(struct inode *inode,
3626 * consumed reservations are stored in the map. Hence, nothing 3716 * consumed reservations are stored in the map. Hence, nothing
3627 * else has to be done for private mappings here 3717 * else has to be done for private mappings here
3628 */ 3718 */
3629 if (!vma || vma->vm_flags & VM_MAYSHARE) 3719 if (!vma || vma->vm_flags & VM_MAYSHARE) {
3630 region_add(resv_map, from, to); 3720 long add = region_add(resv_map, from, to);
3721
3722 if (unlikely(chg > add)) {
3723 /*
3724 * pages in this range were added to the reserve
3725 * map between region_chg and region_add. This
3726 * indicates a race with alloc_huge_page. Adjust
3727 * the subpool and reserve counts modified above
3728 * based on the difference.
3729 */
3730 long rsv_adjust;
3731
3732 rsv_adjust = hugepage_subpool_put_pages(spool,
3733 chg - add);
3734 hugetlb_acct_memory(h, -rsv_adjust);
3735 }
3736 }
3631 return 0; 3737 return 0;
3632out_err: 3738out_err:
3633 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3739 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
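When region_add() ends up covering fewer pages than the earlier region_chg() predicted, hugetlb_reserve_pages() hands the difference back to the subpool and shrinks the accounted reservation by the same amount. The adjustment is simply the gap between the two counts, as in this toy calculation (the numbers are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
            long chg = 16;  /* pages region_chg() said would be needed */
            long add = 14;  /* pages region_add() actually added; a racing alloc took 2 */
            long subpool_pages = 100, accounted = 100;

            if (chg > add) {
                    long rsv_adjust = chg - add;

                    subpool_pages += rsv_adjust;    /* models hugepage_subpool_put_pages() */
                    accounted -= rsv_adjust;        /* models hugetlb_acct_memory(h, -rsv_adjust) */
            }
            printf("subpool=%ld accounted=%ld\n", subpool_pages, accounted);
            return 0;
    }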
@@ -3789,6 +3895,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3789{ 3895{
3790 return NULL; 3896 return NULL;
3791} 3897}
3898
3899int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3900{
3901 return 0;
3902}
3792#define want_pmd_share() (0) 3903#define want_pmd_share() (0)
3793#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3904#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
3794 3905
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4ca5fe0042e1..bf73ac17dad4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val)
28 /* 28 /*
29 * This implies unable to support free buddy pages. 29 * This implies unable to support free buddy pages.
30 */ 30 */
31 if (!get_page_unless_zero(hpage)) 31 if (!get_hwpoison_page(p))
32 return 0; 32 return 0;
33 33
34 if (!hwpoison_filter_enable) 34 if (!hwpoison_filter_enable)
@@ -58,7 +58,7 @@ inject:
58 pr_info("Injecting memory failure at pfn %#lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60put_out: 60put_out:
61 put_page(hpage); 61 put_page(p);
62 return 0; 62 return 0;
63} 63}
64 64
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f0fe4f2c1fa7..cf79f110157c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -53,6 +53,13 @@
53 * modifications to the memory scanning parameters including the scan_thread 53 * modifications to the memory scanning parameters including the scan_thread
54 * pointer 54 * pointer
55 * 55 *
56 * Locks and mutexes are acquired/nested in the following order:
57 *
58 * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING)
59 *
60 * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex
61 * regions.
62 *
56 * The kmemleak_object structures have a use_count incremented or decremented 63 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes 64 * using the get_object()/put_object() functions. When the use_count becomes
58 * 0, this count can no longer be incremented and put_object() schedules the 65 * 0, this count can no longer be incremented and put_object() schedules the
@@ -195,6 +202,8 @@ static struct kmem_cache *scan_area_cache;
195 202
196/* set if tracing memory operations is enabled */ 203/* set if tracing memory operations is enabled */
197static int kmemleak_enabled; 204static int kmemleak_enabled;
205/* same as above but only for the kmemleak_free() callback */
206static int kmemleak_free_enabled;
198/* set in the late_initcall if there were no errors */ 207/* set in the late_initcall if there were no errors */
199static int kmemleak_initialized; 208static int kmemleak_initialized;
200/* enables or disables early logging of the memory operations */ 209/* enables or disables early logging of the memory operations */
@@ -483,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
483 492
484 rcu_read_lock(); 493 rcu_read_lock();
485 read_lock_irqsave(&kmemleak_lock, flags); 494 read_lock_irqsave(&kmemleak_lock, flags);
486 if (ptr >= min_addr && ptr < max_addr) 495 object = lookup_object(ptr, alias);
487 object = lookup_object(ptr, alias);
488 read_unlock_irqrestore(&kmemleak_lock, flags); 496 read_unlock_irqrestore(&kmemleak_lock, flags);
489 497
490 /* check whether the object is still available */ 498 /* check whether the object is still available */
@@ -496,6 +504,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
496} 504}
497 505
498/* 506/*
507 * Look up an object in the object search tree and remove it from both
508 * object_tree_root and object_list. The returned object's use_count should be
509 * at least 1, as initially set by create_object().
510 */
511static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
512{
513 unsigned long flags;
514 struct kmemleak_object *object;
515
516 write_lock_irqsave(&kmemleak_lock, flags);
517 object = lookup_object(ptr, alias);
518 if (object) {
519 rb_erase(&object->rb_node, &object_tree_root);
520 list_del_rcu(&object->object_list);
521 }
522 write_unlock_irqrestore(&kmemleak_lock, flags);
523
524 return object;
525}
526
527/*
499 * Save stack trace to the given array of MAX_TRACE size. 528 * Save stack trace to the given array of MAX_TRACE size.
500 */ 529 */
501static int __save_stack_trace(unsigned long *trace) 530static int __save_stack_trace(unsigned long *trace)
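kmemleak object teardown is rebuilt around find_and_remove_object(), which looks the object up and unlinks it from both the tree and the list inside a single write-locked section, leaving only the refcount/RCU release for later. A userspace sketch of that "look up and unlink in one critical section" idea, with a mutex-protected singly-linked list standing in for the kernel's rwlock, rbtree and RCU:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct object {
            unsigned long ptr;
            struct object *next;
    };

    static struct object *objects;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Find the object for ptr and unlink it in the same critical section. */
    static struct object *find_and_remove(unsigned long ptr)
    {
            struct object **pp, *obj = NULL;

            pthread_mutex_lock(&lock);
            for (pp = &objects; *pp; pp = &(*pp)->next) {
                    if ((*pp)->ptr == ptr) {
                            obj = *pp;
                            *pp = obj->next;        /* unlink before anyone else can find it */
                            break;
                    }
            }
            pthread_mutex_unlock(&lock);
            return obj;     /* caller releases it once no lookup can return it */
    }

    int main(void)
    {
            struct object *o = malloc(sizeof(*o));

            o->ptr = 0x1000;
            o->next = objects;
            objects = o;

            free(find_and_remove(0x1000));
            printf("second removal: %p\n", (void *)find_and_remove(0x1000));
            return 0;
    }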
@@ -580,11 +609,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
580 kmemleak_stop("Cannot insert 0x%lx into the object " 609 kmemleak_stop("Cannot insert 0x%lx into the object "
581 "search tree (overlaps existing)\n", 610 "search tree (overlaps existing)\n",
582 ptr); 611 ptr);
612 /*
613 * No need for parent->lock here since "parent" cannot
614 * be freed while the kmemleak_lock is held.
615 */
616 dump_object_info(parent);
583 kmem_cache_free(object_cache, object); 617 kmem_cache_free(object_cache, object);
584 object = parent; 618 object = NULL;
585 spin_lock(&object->lock);
586 dump_object_info(object);
587 spin_unlock(&object->lock);
588 goto out; 619 goto out;
589 } 620 }
590 } 621 }
@@ -598,20 +629,14 @@ out:
598} 629}
599 630
600/* 631/*
601 * Remove the metadata (struct kmemleak_object) for a memory block from the 632 * Mark the object as not allocated and schedule RCU freeing via put_object().
602 * object_list and object_tree_root and decrement its use_count.
603 */ 633 */
604static void __delete_object(struct kmemleak_object *object) 634static void __delete_object(struct kmemleak_object *object)
605{ 635{
606 unsigned long flags; 636 unsigned long flags;
607 637
608 write_lock_irqsave(&kmemleak_lock, flags);
609 rb_erase(&object->rb_node, &object_tree_root);
610 list_del_rcu(&object->object_list);
611 write_unlock_irqrestore(&kmemleak_lock, flags);
612
613 WARN_ON(!(object->flags & OBJECT_ALLOCATED)); 638 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
614 WARN_ON(atomic_read(&object->use_count) < 2); 639 WARN_ON(atomic_read(&object->use_count) < 1);
615 640
616 /* 641 /*
617 * Locking here also ensures that the corresponding memory block 642 * Locking here also ensures that the corresponding memory block
@@ -631,7 +656,7 @@ static void delete_object_full(unsigned long ptr)
631{ 656{
632 struct kmemleak_object *object; 657 struct kmemleak_object *object;
633 658
634 object = find_and_get_object(ptr, 0); 659 object = find_and_remove_object(ptr, 0);
635 if (!object) { 660 if (!object) {
636#ifdef DEBUG 661#ifdef DEBUG
637 kmemleak_warn("Freeing unknown object at 0x%08lx\n", 662 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -640,7 +665,6 @@ static void delete_object_full(unsigned long ptr)
640 return; 665 return;
641 } 666 }
642 __delete_object(object); 667 __delete_object(object);
643 put_object(object);
644} 668}
645 669
646/* 670/*
@@ -653,7 +677,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
653 struct kmemleak_object *object; 677 struct kmemleak_object *object;
654 unsigned long start, end; 678 unsigned long start, end;
655 679
656 object = find_and_get_object(ptr, 1); 680 object = find_and_remove_object(ptr, 1);
657 if (!object) { 681 if (!object) {
658#ifdef DEBUG 682#ifdef DEBUG
659 kmemleak_warn("Partially freeing unknown object at 0x%08lx " 683 kmemleak_warn("Partially freeing unknown object at 0x%08lx "
@@ -661,7 +685,6 @@ static void delete_object_part(unsigned long ptr, size_t size)
661#endif 685#endif
662 return; 686 return;
663 } 687 }
664 __delete_object(object);
665 688
666 /* 689 /*
667 * Create one or two objects that may result from the memory block 690 * Create one or two objects that may result from the memory block
@@ -679,7 +702,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
679 create_object(ptr + size, end - ptr - size, object->min_count, 702 create_object(ptr + size, end - ptr - size, object->min_count,
680 GFP_KERNEL); 703 GFP_KERNEL);
681 704
682 put_object(object); 705 __delete_object(object);
683} 706}
684 707
685static void __paint_it(struct kmemleak_object *object, int color) 708static void __paint_it(struct kmemleak_object *object, int color)
@@ -907,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
907 * kmemleak_alloc_percpu - register a newly allocated __percpu object 930 * kmemleak_alloc_percpu - register a newly allocated __percpu object
908 * @ptr: __percpu pointer to beginning of the object 931 * @ptr: __percpu pointer to beginning of the object
909 * @size: size of the object 932 * @size: size of the object
933 * @gfp: flags used for kmemleak internal memory allocations
910 * 934 *
911 * This function is called from the kernel percpu allocator when a new object 935 * This function is called from the kernel percpu allocator when a new object
912 * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL 936 * (memory block) is allocated (alloc_percpu).
913 * allocation.
914 */ 937 */
915void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) 938void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
939 gfp_t gfp)
916{ 940{
917 unsigned int cpu; 941 unsigned int cpu;
918 942
@@ -925,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
925 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 949 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
926 for_each_possible_cpu(cpu) 950 for_each_possible_cpu(cpu)
927 create_object((unsigned long)per_cpu_ptr(ptr, cpu), 951 create_object((unsigned long)per_cpu_ptr(ptr, cpu),
928 size, 0, GFP_KERNEL); 952 size, 0, gfp);
929 else if (kmemleak_early_log) 953 else if (kmemleak_early_log)
930 log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); 954 log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
931} 955}
@@ -942,7 +966,7 @@ void __ref kmemleak_free(const void *ptr)
942{ 966{
943 pr_debug("%s(0x%p)\n", __func__, ptr); 967 pr_debug("%s(0x%p)\n", __func__, ptr);
944 968
945 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 969 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
946 delete_object_full((unsigned long)ptr); 970 delete_object_full((unsigned long)ptr);
947 else if (kmemleak_early_log) 971 else if (kmemleak_early_log)
948 log_early(KMEMLEAK_FREE, ptr, 0, 0); 972 log_early(KMEMLEAK_FREE, ptr, 0, 0);
@@ -982,7 +1006,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
982 1006
983 pr_debug("%s(0x%p)\n", __func__, ptr); 1007 pr_debug("%s(0x%p)\n", __func__, ptr);
984 1008
985 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1009 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
986 for_each_possible_cpu(cpu) 1010 for_each_possible_cpu(cpu)
987 delete_object_full((unsigned long)per_cpu_ptr(ptr, 1011 delete_object_full((unsigned long)per_cpu_ptr(ptr,
988 cpu)); 1012 cpu));
@@ -1148,19 +1172,18 @@ static int scan_should_stop(void)
1148 * found to the gray list. 1172 * found to the gray list.
1149 */ 1173 */
1150static void scan_block(void *_start, void *_end, 1174static void scan_block(void *_start, void *_end,
1151 struct kmemleak_object *scanned, int allow_resched) 1175 struct kmemleak_object *scanned)
1152{ 1176{
1153 unsigned long *ptr; 1177 unsigned long *ptr;
1154 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); 1178 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
1155 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 1179 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
1180 unsigned long flags;
1156 1181
1182 read_lock_irqsave(&kmemleak_lock, flags);
1157 for (ptr = start; ptr < end; ptr++) { 1183 for (ptr = start; ptr < end; ptr++) {
1158 struct kmemleak_object *object; 1184 struct kmemleak_object *object;
1159 unsigned long flags;
1160 unsigned long pointer; 1185 unsigned long pointer;
1161 1186
1162 if (allow_resched)
1163 cond_resched();
1164 if (scan_should_stop()) 1187 if (scan_should_stop())
1165 break; 1188 break;
1166 1189
@@ -1173,26 +1196,31 @@ static void scan_block(void *_start, void *_end,
1173 pointer = *ptr; 1196 pointer = *ptr;
1174 kasan_enable_current(); 1197 kasan_enable_current();
1175 1198
1176 object = find_and_get_object(pointer, 1); 1199 if (pointer < min_addr || pointer >= max_addr)
1200 continue;
1201
1202 /*
1203 * No need for get_object() here since we hold kmemleak_lock.
1204 * object->use_count cannot be dropped to 0 while the object
1205 * is still present in object_tree_root and object_list
1206 * (with updates protected by kmemleak_lock).
1207 */
1208 object = lookup_object(pointer, 1);
1177 if (!object) 1209 if (!object)
1178 continue; 1210 continue;
1179 if (object == scanned) { 1211 if (object == scanned)
1180 /* self referenced, ignore */ 1212 /* self referenced, ignore */
1181 put_object(object);
1182 continue; 1213 continue;
1183 }
1184 1214
1185 /* 1215 /*
1186 * Avoid the lockdep recursive warning on object->lock being 1216 * Avoid the lockdep recursive warning on object->lock being
1187 * previously acquired in scan_object(). These locks are 1217 * previously acquired in scan_object(). These locks are
1188 * enclosed by scan_mutex. 1218 * enclosed by scan_mutex.
1189 */ 1219 */
1190 spin_lock_irqsave_nested(&object->lock, flags, 1220 spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
1191 SINGLE_DEPTH_NESTING);
1192 if (!color_white(object)) { 1221 if (!color_white(object)) {
1193 /* non-orphan, ignored or new */ 1222 /* non-orphan, ignored or new */
1194 spin_unlock_irqrestore(&object->lock, flags); 1223 spin_unlock(&object->lock);
1195 put_object(object);
1196 continue; 1224 continue;
1197 } 1225 }
1198 1226
@@ -1204,13 +1232,27 @@ static void scan_block(void *_start, void *_end,
1204 */ 1232 */
1205 object->count++; 1233 object->count++;
1206 if (color_gray(object)) { 1234 if (color_gray(object)) {
1235 /* put_object() called when removing from gray_list */
1236 WARN_ON(!get_object(object));
1207 list_add_tail(&object->gray_list, &gray_list); 1237 list_add_tail(&object->gray_list, &gray_list);
1208 spin_unlock_irqrestore(&object->lock, flags);
1209 continue;
1210 } 1238 }
1239 spin_unlock(&object->lock);
1240 }
1241 read_unlock_irqrestore(&kmemleak_lock, flags);
1242}
1211 1243
1212 spin_unlock_irqrestore(&object->lock, flags); 1244/*
1213 put_object(object); 1245 * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
1246 */
1247static void scan_large_block(void *start, void *end)
1248{
1249 void *next;
1250
1251 while (start < end) {
1252 next = min(start + MAX_SCAN_SIZE, end);
1253 scan_block(start, next, NULL);
1254 start = next;
1255 cond_resched();
1214 } 1256 }
1215} 1257}
1216 1258
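Because scan_block() now runs with kmemleak_lock read-held and interrupts disabled, large regions are fed through scan_large_block(), which walks them in MAX_SCAN_SIZE chunks and reschedules between chunks so scan latency stays bounded. The chunking itself is simple pointer arithmetic, sketched here with cond_resched() reduced to a comment:

    #include <stdio.h>

    #define MAX_SCAN_SIZE 4096UL

    static void scan_chunk(char *start, char *end)
    {
            /* stands in for scan_block(start, end, NULL) */
            printf("scan %lu bytes\n", (unsigned long)(end - start));
    }

    static void scan_large_block(char *start, char *end)
    {
            char *next;

            while (start < end) {
                    next = start + MAX_SCAN_SIZE < end ? start + MAX_SCAN_SIZE : end;
                    scan_chunk(start, next);
                    start = next;
                    /* cond_resched() would go here, between chunks */
            }
    }

    int main(void)
    {
            static char data[10000];

            scan_large_block(data, data + sizeof(data));
            return 0;
    }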
@@ -1236,22 +1278,25 @@ static void scan_object(struct kmemleak_object *object)
1236 if (hlist_empty(&object->area_list)) { 1278 if (hlist_empty(&object->area_list)) {
1237 void *start = (void *)object->pointer; 1279 void *start = (void *)object->pointer;
1238 void *end = (void *)(object->pointer + object->size); 1280 void *end = (void *)(object->pointer + object->size);
1281 void *next;
1239 1282
1240 while (start < end && (object->flags & OBJECT_ALLOCATED) && 1283 do {
1241 !(object->flags & OBJECT_NO_SCAN)) { 1284 next = min(start + MAX_SCAN_SIZE, end);
1242 scan_block(start, min(start + MAX_SCAN_SIZE, end), 1285 scan_block(start, next, object);
1243 object, 0); 1286
1244 start += MAX_SCAN_SIZE; 1287 start = next;
1288 if (start >= end)
1289 break;
1245 1290
1246 spin_unlock_irqrestore(&object->lock, flags); 1291 spin_unlock_irqrestore(&object->lock, flags);
1247 cond_resched(); 1292 cond_resched();
1248 spin_lock_irqsave(&object->lock, flags); 1293 spin_lock_irqsave(&object->lock, flags);
1249 } 1294 } while (object->flags & OBJECT_ALLOCATED);
1250 } else 1295 } else
1251 hlist_for_each_entry(area, &object->area_list, node) 1296 hlist_for_each_entry(area, &object->area_list, node)
1252 scan_block((void *)area->start, 1297 scan_block((void *)area->start,
1253 (void *)(area->start + area->size), 1298 (void *)(area->start + area->size),
1254 object, 0); 1299 object);
1255out: 1300out:
1256 spin_unlock_irqrestore(&object->lock, flags); 1301 spin_unlock_irqrestore(&object->lock, flags);
1257} 1302}
@@ -1328,14 +1373,14 @@ static void kmemleak_scan(void)
1328 rcu_read_unlock(); 1373 rcu_read_unlock();
1329 1374
1330 /* data/bss scanning */ 1375 /* data/bss scanning */
1331 scan_block(_sdata, _edata, NULL, 1); 1376 scan_large_block(_sdata, _edata);
1332 scan_block(__bss_start, __bss_stop, NULL, 1); 1377 scan_large_block(__bss_start, __bss_stop);
1333 1378
1334#ifdef CONFIG_SMP 1379#ifdef CONFIG_SMP
1335 /* per-cpu sections scanning */ 1380 /* per-cpu sections scanning */
1336 for_each_possible_cpu(i) 1381 for_each_possible_cpu(i)
1337 scan_block(__per_cpu_start + per_cpu_offset(i), 1382 scan_large_block(__per_cpu_start + per_cpu_offset(i),
1338 __per_cpu_end + per_cpu_offset(i), NULL, 1); 1383 __per_cpu_end + per_cpu_offset(i));
1339#endif 1384#endif
1340 1385
1341 /* 1386 /*
@@ -1356,7 +1401,7 @@ static void kmemleak_scan(void)
1356 /* only scan if page is in use */ 1401 /* only scan if page is in use */
1357 if (page_count(page) == 0) 1402 if (page_count(page) == 0)
1358 continue; 1403 continue;
1359 scan_block(page, page + 1, NULL, 1); 1404 scan_block(page, page + 1, NULL);
1360 } 1405 }
1361 } 1406 }
1362 put_online_mems(); 1407 put_online_mems();
@@ -1370,7 +1415,7 @@ static void kmemleak_scan(void)
1370 read_lock(&tasklist_lock); 1415 read_lock(&tasklist_lock);
1371 do_each_thread(g, p) { 1416 do_each_thread(g, p) {
1372 scan_block(task_stack_page(p), task_stack_page(p) + 1417 scan_block(task_stack_page(p), task_stack_page(p) +
1373 THREAD_SIZE, NULL, 0); 1418 THREAD_SIZE, NULL);
1374 } while_each_thread(g, p); 1419 } while_each_thread(g, p);
1375 read_unlock(&tasklist_lock); 1420 read_unlock(&tasklist_lock);
1376 } 1421 }
@@ -1747,15 +1792,20 @@ static void __kmemleak_do_cleanup(void)
1747 */ 1792 */
1748static void kmemleak_do_cleanup(struct work_struct *work) 1793static void kmemleak_do_cleanup(struct work_struct *work)
1749{ 1794{
1750 mutex_lock(&scan_mutex);
1751 stop_scan_thread(); 1795 stop_scan_thread();
1752 1796
1797 /*
1798 * Once the scan thread has stopped, it is safe to no longer track
1799 * object freeing. Ordering of the scan thread stopping and the memory
1800 * accesses below is guaranteed by the kthread_stop() function.
1801 */
1802 kmemleak_free_enabled = 0;
1803
1753 if (!kmemleak_found_leaks) 1804 if (!kmemleak_found_leaks)
1754 __kmemleak_do_cleanup(); 1805 __kmemleak_do_cleanup();
1755 else 1806 else
1756 pr_info("Kmemleak disabled without freeing internal data. " 1807 pr_info("Kmemleak disabled without freeing internal data. "
1757 "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); 1808 "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
1758 mutex_unlock(&scan_mutex);
1759} 1809}
1760 1810
1761static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); 1811static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1776,6 +1826,8 @@ static void kmemleak_disable(void)
1776 /* check whether it is too early for a kernel thread */ 1826 /* check whether it is too early for a kernel thread */
1777 if (kmemleak_initialized) 1827 if (kmemleak_initialized)
1778 schedule_work(&cleanup_work); 1828 schedule_work(&cleanup_work);
1829 else
1830 kmemleak_free_enabled = 0;
1779 1831
1780 pr_info("Kernel memory leak detector disabled\n"); 1832 pr_info("Kernel memory leak detector disabled\n");
1781} 1833}
@@ -1840,8 +1892,10 @@ void __init kmemleak_init(void)
1840 if (kmemleak_error) { 1892 if (kmemleak_error) {
1841 local_irq_restore(flags); 1893 local_irq_restore(flags);
1842 return; 1894 return;
1843 } else 1895 } else {
1844 kmemleak_enabled = 1; 1896 kmemleak_enabled = 1;
1897 kmemleak_free_enabled = 1;
1898 }
1845 local_irq_restore(flags); 1899 local_irq_restore(flags);
1846 1900
1847 /* 1901 /*
diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b567ed79..1b444c730846 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
54#ifdef CONFIG_MOVABLE_NODE 54#ifdef CONFIG_MOVABLE_NODE
55bool movable_node_enabled __initdata_memblock = false; 55bool movable_node_enabled __initdata_memblock = false;
56#endif 56#endif
57static bool system_has_some_mirror __initdata_memblock = false;
57static int memblock_can_resize __initdata_memblock; 58static int memblock_can_resize __initdata_memblock;
58static int memblock_memory_in_slab __initdata_memblock = 0; 59static int memblock_memory_in_slab __initdata_memblock = 0;
59static int memblock_reserved_in_slab __initdata_memblock = 0; 60static int memblock_reserved_in_slab __initdata_memblock = 0;
60 61
62ulong __init_memblock choose_memblock_flags(void)
63{
64 return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
65}
66
61/* inline so we don't get a warning when pr_debug is compiled out */ 67/* inline so we don't get a warning when pr_debug is compiled out */
62static __init_memblock const char * 68static __init_memblock const char *
63memblock_type_name(struct memblock_type *type) 69memblock_type_name(struct memblock_type *type)
@@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
107 * @size: size of free area to find 113 * @size: size of free area to find
108 * @align: alignment of free area to find 114 * @align: alignment of free area to find
109 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 115 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
116 * @flags: pick from blocks based on memory attributes
110 * 117 *
111 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 118 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
112 * 119 *
@@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
115 */ 122 */
116static phys_addr_t __init_memblock 123static phys_addr_t __init_memblock
117__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, 124__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
118 phys_addr_t size, phys_addr_t align, int nid) 125 phys_addr_t size, phys_addr_t align, int nid,
126 ulong flags)
119{ 127{
120 phys_addr_t this_start, this_end, cand; 128 phys_addr_t this_start, this_end, cand;
121 u64 i; 129 u64 i;
122 130
123 for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { 131 for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
124 this_start = clamp(this_start, start, end); 132 this_start = clamp(this_start, start, end);
125 this_end = clamp(this_end, start, end); 133 this_end = clamp(this_end, start, end);
126 134
@@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
139 * @size: size of free area to find 147 * @size: size of free area to find
140 * @align: alignment of free area to find 148 * @align: alignment of free area to find
141 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 149 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
150 * @flags: pick from blocks based on memory attributes
142 * 151 *
143 * Utility called from memblock_find_in_range_node(), find free area top-down. 152 * Utility called from memblock_find_in_range_node(), find free area top-down.
144 * 153 *
@@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
147 */ 156 */
148static phys_addr_t __init_memblock 157static phys_addr_t __init_memblock
149__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, 158__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
150 phys_addr_t size, phys_addr_t align, int nid) 159 phys_addr_t size, phys_addr_t align, int nid,
160 ulong flags)
151{ 161{
152 phys_addr_t this_start, this_end, cand; 162 phys_addr_t this_start, this_end, cand;
153 u64 i; 163 u64 i;
154 164
155 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { 165 for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
166 NULL) {
156 this_start = clamp(this_start, start, end); 167 this_start = clamp(this_start, start, end);
157 this_end = clamp(this_end, start, end); 168 this_end = clamp(this_end, start, end);
158 169
@@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
174 * @start: start of candidate range 185 * @start: start of candidate range
175 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 186 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
176 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 187 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
188 * @flags: pick from blocks based on memory attributes
177 * 189 *
178 * Find @size free area aligned to @align in the specified range and node. 190 * Find @size free area aligned to @align in the specified range and node.
179 * 191 *
@@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
190 */ 202 */
191phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, 203phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
192 phys_addr_t align, phys_addr_t start, 204 phys_addr_t align, phys_addr_t start,
193 phys_addr_t end, int nid) 205 phys_addr_t end, int nid, ulong flags)
194{ 206{
195 phys_addr_t kernel_end, ret; 207 phys_addr_t kernel_end, ret;
196 208
@@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
215 227
216 /* ok, try bottom-up allocation first */ 228 /* ok, try bottom-up allocation first */
217 ret = __memblock_find_range_bottom_up(bottom_up_start, end, 229 ret = __memblock_find_range_bottom_up(bottom_up_start, end,
218 size, align, nid); 230 size, align, nid, flags);
219 if (ret) 231 if (ret)
220 return ret; 232 return ret;
221 233
@@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
233 "memory hotunplug may be affected\n"); 245 "memory hotunplug may be affected\n");
234 } 246 }
235 247
236 return __memblock_find_range_top_down(start, end, size, align, nid); 248 return __memblock_find_range_top_down(start, end, size, align, nid,
249 flags);
237} 250}
238 251
239/** 252/**
@@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
252 phys_addr_t end, phys_addr_t size, 265 phys_addr_t end, phys_addr_t size,
253 phys_addr_t align) 266 phys_addr_t align)
254{ 267{
255 return memblock_find_in_range_node(size, align, start, end, 268 phys_addr_t ret;
256 NUMA_NO_NODE); 269 ulong flags = choose_memblock_flags();
270
271again:
272 ret = memblock_find_in_range_node(size, align, start, end,
273 NUMA_NO_NODE, flags);
274
275 if (!ret && (flags & MEMBLOCK_MIRROR)) {
276 pr_warn("Could not allocate %pap bytes of mirrored memory\n",
277 &size);
278 flags &= ~MEMBLOCK_MIRROR;
279 goto again;
280 }
281
282 return ret;
257} 283}
258 284
259static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 285static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
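
The allocation paths in these hunks all follow one retry pattern: start with the flags returned by choose_memblock_flags(), and if no mirrored range satisfies the request, clear MEMBLOCK_MIRROR and search ordinary memory instead. A minimal sketch of that pattern follows; choose_memblock_flags() itself is not part of this hunk, so the body shown is an assumption based on the system_has_some_mirror flag set by memblock_mark_mirror() later in the patch.

/*
 * Sketch only: assumed shape of choose_memblock_flags() plus the
 * mirror-first retry used by memblock_find_in_range() above.
 */
static ulong choose_memblock_flags(void)
{
	/* Prefer mirrored memory once any mirrored range has been marked. */
	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}

static phys_addr_t example_find_free(phys_addr_t size, phys_addr_t align,
				     phys_addr_t start, phys_addr_t end)
{
	ulong flags = choose_memblock_flags();
	phys_addr_t ret;

again:
	ret = memblock_find_in_range_node(size, align, start, end,
					  NUMA_NO_NODE, flags);
	if (!ret && (flags & MEMBLOCK_MIRROR)) {
		/* Nothing mirrored fits: fall back to any free memory. */
		flags &= ~MEMBLOCK_MIRROR;
		goto again;
	}
	return ret;
}
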
@@ -779,9 +805,25 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
779} 805}
780 806
781/** 807/**
808 * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
809 * @base: the base phys addr of the region
810 * @size: the size of the region
811 *
 812 * Return 0 on success, -errno on failure.
813 */
814int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
815{
816 system_has_some_mirror = true;
817
818 return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
819}
820
821
822/**
782 * __next__mem_range - next function for for_each_free_mem_range() etc. 823 * __next__mem_range - next function for for_each_free_mem_range() etc.
783 * @idx: pointer to u64 loop variable 824 * @idx: pointer to u64 loop variable
784 * @nid: node selector, %NUMA_NO_NODE for all nodes 825 * @nid: node selector, %NUMA_NO_NODE for all nodes
826 * @flags: pick from blocks based on memory attributes
785 * @type_a: pointer to memblock_type from where the range is taken 827 * @type_a: pointer to memblock_type from where the range is taken
786 * @type_b: pointer to memblock_type which excludes memory from being taken 828 * @type_b: pointer to memblock_type which excludes memory from being taken
787 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 829 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +845,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
803 * As both region arrays are sorted, the function advances the two indices 845 * As both region arrays are sorted, the function advances the two indices
804 * in lockstep and returns each intersection. 846 * in lockstep and returns each intersection.
805 */ 847 */
806void __init_memblock __next_mem_range(u64 *idx, int nid, 848void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
807 struct memblock_type *type_a, 849 struct memblock_type *type_a,
808 struct memblock_type *type_b, 850 struct memblock_type *type_b,
809 phys_addr_t *out_start, 851 phys_addr_t *out_start,
@@ -831,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
831 if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) 873 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
832 continue; 874 continue;
833 875
 876 /* if we want mirror memory, skip non-mirror memory regions */
877 if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
878 continue;
879
834 if (!type_b) { 880 if (!type_b) {
835 if (out_start) 881 if (out_start)
836 *out_start = m_start; 882 *out_start = m_start;
@@ -895,6 +941,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
895 * 941 *
896 * @idx: pointer to u64 loop variable 942 * @idx: pointer to u64 loop variable
897 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes 943 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
944 * @flags: pick from blocks based on memory attributes
898 * @type_a: pointer to memblock_type from where the range is taken 945 * @type_a: pointer to memblock_type from where the range is taken
899 * @type_b: pointer to memblock_type which excludes memory from being taken 946 * @type_b: pointer to memblock_type which excludes memory from being taken
900 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 947 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +950,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
903 * 950 *
904 * Reverse of __next_mem_range(). 951 * Reverse of __next_mem_range().
905 */ 952 */
906void __init_memblock __next_mem_range_rev(u64 *idx, int nid, 953void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
907 struct memblock_type *type_a, 954 struct memblock_type *type_a,
908 struct memblock_type *type_b, 955 struct memblock_type *type_b,
909 phys_addr_t *out_start, 956 phys_addr_t *out_start,
@@ -935,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
935 if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) 982 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
936 continue; 983 continue;
937 984
 985 /* if we want mirror memory, skip non-mirror memory regions */
986 if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
987 continue;
988
938 if (!type_b) { 989 if (!type_b) {
939 if (out_start) 990 if (out_start)
940 *out_start = m_start; 991 *out_start = m_start;
@@ -1050,14 +1101,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
1050 1101
1051static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, 1102static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
1052 phys_addr_t align, phys_addr_t start, 1103 phys_addr_t align, phys_addr_t start,
1053 phys_addr_t end, int nid) 1104 phys_addr_t end, int nid, ulong flags)
1054{ 1105{
1055 phys_addr_t found; 1106 phys_addr_t found;
1056 1107
1057 if (!align) 1108 if (!align)
1058 align = SMP_CACHE_BYTES; 1109 align = SMP_CACHE_BYTES;
1059 1110
1060 found = memblock_find_in_range_node(size, align, start, end, nid); 1111 found = memblock_find_in_range_node(size, align, start, end, nid,
1112 flags);
1061 if (found && !memblock_reserve(found, size)) { 1113 if (found && !memblock_reserve(found, size)) {
1062 /* 1114 /*
1063 * The min_count is set to 0 so that memblock allocations are 1115 * The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1122,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
1070} 1122}
1071 1123
1072phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, 1124phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
1073 phys_addr_t start, phys_addr_t end) 1125 phys_addr_t start, phys_addr_t end,
1126 ulong flags)
1074{ 1127{
1075 return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); 1128 return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
1129 flags);
1076} 1130}
1077 1131
1078static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, 1132static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
1079 phys_addr_t align, phys_addr_t max_addr, 1133 phys_addr_t align, phys_addr_t max_addr,
1080 int nid) 1134 int nid, ulong flags)
1081{ 1135{
1082 return memblock_alloc_range_nid(size, align, 0, max_addr, nid); 1136 return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
1083} 1137}
1084 1138
1085phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 1139phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
1086{ 1140{
1087 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 1141 ulong flags = choose_memblock_flags();
1142 phys_addr_t ret;
1143
1144again:
1145 ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
1146 nid, flags);
1147
1148 if (!ret && (flags & MEMBLOCK_MIRROR)) {
1149 flags &= ~MEMBLOCK_MIRROR;
1150 goto again;
1151 }
1152 return ret;
1088} 1153}
1089 1154
1090phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1155phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
1091{ 1156{
1092 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); 1157 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
1158 MEMBLOCK_NONE);
1093} 1159}
1094 1160
1095phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1161phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1153,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal(
1153{ 1219{
1154 phys_addr_t alloc; 1220 phys_addr_t alloc;
1155 void *ptr; 1221 void *ptr;
1222 ulong flags = choose_memblock_flags();
1156 1223
1157 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) 1224 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
1158 nid = NUMA_NO_NODE; 1225 nid = NUMA_NO_NODE;
@@ -1173,13 +1240,14 @@ static void * __init memblock_virt_alloc_internal(
1173 1240
1174again: 1241again:
1175 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, 1242 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
1176 nid); 1243 nid, flags);
1177 if (alloc) 1244 if (alloc)
1178 goto done; 1245 goto done;
1179 1246
1180 if (nid != NUMA_NO_NODE) { 1247 if (nid != NUMA_NO_NODE) {
1181 alloc = memblock_find_in_range_node(size, align, min_addr, 1248 alloc = memblock_find_in_range_node(size, align, min_addr,
1182 max_addr, NUMA_NO_NODE); 1249 max_addr, NUMA_NO_NODE,
1250 flags);
1183 if (alloc) 1251 if (alloc)
1184 goto done; 1252 goto done;
1185 } 1253 }
@@ -1187,10 +1255,16 @@ again:
1187 if (min_addr) { 1255 if (min_addr) {
1188 min_addr = 0; 1256 min_addr = 0;
1189 goto again; 1257 goto again;
1190 } else {
1191 goto error;
1192 } 1258 }
1193 1259
1260 if (flags & MEMBLOCK_MIRROR) {
1261 flags &= ~MEMBLOCK_MIRROR;
1262 pr_warn("Could not allocate %pap bytes of mirrored memory\n",
1263 &size);
1264 goto again;
1265 }
1266
1267 return NULL;
1194done: 1268done:
1195 memblock_reserve(alloc, size); 1269 memblock_reserve(alloc, size);
1196 ptr = phys_to_virt(alloc); 1270 ptr = phys_to_virt(alloc);
@@ -1205,9 +1279,6 @@ done:
1205 kmemleak_alloc(ptr, size, 0, 0); 1279 kmemleak_alloc(ptr, size, 0, 0);
1206 1280
1207 return ptr; 1281 return ptr;
1208
1209error:
1210 return NULL;
1211} 1282}
1212 1283
1213/** 1284/**
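
memblock_virt_alloc_internal() now works through a longer fallback chain before failing. Condensed into one place, and returning the found physical address rather than reserving and mapping it, the order is roughly the following sketch (same helpers as the hunks above):

/* Condensed, illustrative view of the fallback order above. */
static phys_addr_t example_virt_alloc_order(phys_addr_t size, phys_addr_t align,
					    phys_addr_t min_addr,
					    phys_addr_t max_addr, int nid)
{
	ulong flags = choose_memblock_flags();
	phys_addr_t alloc;

again:
	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
					    nid, flags);
	if (alloc)
		return alloc;

	if (nid != NUMA_NO_NODE) {
		/* 1) the requested node is full: try any node */
		alloc = memblock_find_in_range_node(size, align, min_addr,
						    max_addr, NUMA_NO_NODE,
						    flags);
		if (alloc)
			return alloc;
	}

	if (min_addr) {
		/* 2) relax the lower bound and start over */
		min_addr = 0;
		goto again;
	}

	if (flags & MEMBLOCK_MIRROR) {
		/* 3) finally give up on mirrored memory */
		flags &= ~MEMBLOCK_MIRROR;
		goto again;
	}

	return 0;
}
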
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a04225d372ba..e65f7b0131d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -285,9 +285,9 @@ struct mem_cgroup {
285 */ 285 */
286 bool use_hierarchy; 286 bool use_hierarchy;
287 287
288 /* protected by memcg_oom_lock */
288 bool oom_lock; 289 bool oom_lock;
289 atomic_t under_oom; 290 int under_oom;
290 atomic_t oom_wakeups;
291 291
292 int swappiness; 292 int swappiness;
293 /* OOM-Killer disable */ 293 /* OOM-Killer disable */
@@ -1530,14 +1530,16 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1530 unsigned int points = 0; 1530 unsigned int points = 0;
1531 struct task_struct *chosen = NULL; 1531 struct task_struct *chosen = NULL;
1532 1532
1533 mutex_lock(&oom_lock);
1534
1533 /* 1535 /*
1534 * If current has a pending SIGKILL or is exiting, then automatically 1536 * If current has a pending SIGKILL or is exiting, then automatically
1535 * select it. The goal is to allow it to allocate so that it may 1537 * select it. The goal is to allow it to allocate so that it may
1536 * quickly exit and free its memory. 1538 * quickly exit and free its memory.
1537 */ 1539 */
1538 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1540 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1539 mark_tsk_oom_victim(current); 1541 mark_oom_victim(current);
1540 return; 1542 goto unlock;
1541 } 1543 }
1542 1544
1543 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); 1545 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
@@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1564 mem_cgroup_iter_break(memcg, iter); 1566 mem_cgroup_iter_break(memcg, iter);
1565 if (chosen) 1567 if (chosen)
1566 put_task_struct(chosen); 1568 put_task_struct(chosen);
1567 return; 1569 goto unlock;
1568 case OOM_SCAN_OK: 1570 case OOM_SCAN_OK:
1569 break; 1571 break;
1570 }; 1572 };
@@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1585 css_task_iter_end(&it); 1587 css_task_iter_end(&it);
1586 } 1588 }
1587 1589
1588 if (!chosen) 1590 if (chosen) {
1589 return; 1591 points = chosen_points * 1000 / totalpages;
1590 points = chosen_points * 1000 / totalpages; 1592 oom_kill_process(chosen, gfp_mask, order, points, totalpages,
1591 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1593 memcg, NULL, "Memory cgroup out of memory");
1592 NULL, "Memory cgroup out of memory"); 1594 }
1595unlock:
1596 mutex_unlock(&oom_lock);
1593} 1597}
1594 1598
1595#if MAX_NUMNODES > 1 1599#if MAX_NUMNODES > 1
@@ -1806,8 +1810,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1806{ 1810{
1807 struct mem_cgroup *iter; 1811 struct mem_cgroup *iter;
1808 1812
1813 spin_lock(&memcg_oom_lock);
1809 for_each_mem_cgroup_tree(iter, memcg) 1814 for_each_mem_cgroup_tree(iter, memcg)
1810 atomic_inc(&iter->under_oom); 1815 iter->under_oom++;
1816 spin_unlock(&memcg_oom_lock);
1811} 1817}
1812 1818
1813static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1819static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1816,11 +1822,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1816 1822
1817 /* 1823 /*
1818 * When a new child is created while the hierarchy is under oom, 1824 * When a new child is created while the hierarchy is under oom,
1819 * mem_cgroup_oom_lock() may not be called. We have to use 1825 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1820 * atomic_add_unless() here.
1821 */ 1826 */
1827 spin_lock(&memcg_oom_lock);
1822 for_each_mem_cgroup_tree(iter, memcg) 1828 for_each_mem_cgroup_tree(iter, memcg)
1823 atomic_add_unless(&iter->under_oom, -1, 0); 1829 if (iter->under_oom > 0)
1830 iter->under_oom--;
1831 spin_unlock(&memcg_oom_lock);
1824} 1832}
1825 1833
1826static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1834static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1846,17 +1854,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
1846 return autoremove_wake_function(wait, mode, sync, arg); 1854 return autoremove_wake_function(wait, mode, sync, arg);
1847} 1855}
1848 1856
1849static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1850{
1851 atomic_inc(&memcg->oom_wakeups);
1852 /* for filtering, pass "memcg" as argument. */
1853 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1854}
1855
1856static void memcg_oom_recover(struct mem_cgroup *memcg) 1857static void memcg_oom_recover(struct mem_cgroup *memcg)
1857{ 1858{
1858 if (memcg && atomic_read(&memcg->under_oom)) 1859 /*
1859 memcg_wakeup_oom(memcg); 1860 * For the following lockless ->under_oom test, the only required
1861 * guarantee is that it must see the state asserted by an OOM when
1862 * this function is called as a result of userland actions
1863 * triggered by the notification of the OOM. This is trivially
1864 * achieved by invoking mem_cgroup_mark_under_oom() before
1865 * triggering notification.
1866 */
1867 if (memcg && memcg->under_oom)
1868 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1860} 1869}
1861 1870
1862static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1871static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@ -3864,7 +3873,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3864 list_add(&event->list, &memcg->oom_notify); 3873 list_add(&event->list, &memcg->oom_notify);
3865 3874
3866 /* already in OOM ? */ 3875 /* already in OOM ? */
3867 if (atomic_read(&memcg->under_oom)) 3876 if (memcg->under_oom)
3868 eventfd_signal(eventfd, 1); 3877 eventfd_signal(eventfd, 1);
3869 spin_unlock(&memcg_oom_lock); 3878 spin_unlock(&memcg_oom_lock);
3870 3879
@@ -3893,7 +3902,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3893 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3902 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3894 3903
3895 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3904 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3896 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 3905 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3897 return 0; 3906 return 0;
3898} 3907}
3899 3908
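
The memcontrol.c changes above serialize OOM handling on the global oom_lock mutex and turn under_oom from an atomic counter into a plain int that is only touched under memcg_oom_lock, with an explicit underflow check replacing the old atomic_add_unless(). The same idiom in isolation, with hypothetical names that are not part of the patch:

#include <linux/spinlock.h>

/* Hypothetical stand-alone version of the under_oom bookkeeping above. */
static DEFINE_SPINLOCK(example_oom_lock);
static int example_under_oom;	/* only modified under example_oom_lock */

static void example_mark_under_oom(void)
{
	spin_lock(&example_oom_lock);
	example_under_oom++;
	spin_unlock(&example_oom_lock);
}

static void example_unmark_under_oom(void)
{
	spin_lock(&example_oom_lock);
	/* a child created while under OOM may unmark without a prior mark */
	if (example_under_oom > 0)
		example_under_oom--;
	spin_unlock(&example_oom_lock);
}
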
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 501820c815b3..c53543d89282 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -20,6 +20,14 @@
20 * this code has to be extremely careful. Generally it tries to use 20 * this code has to be extremely careful. Generally it tries to use
21 * normal locking rules, as in get the standard locks, even if that means 21 * normal locking rules, as in get the standard locks, even if that means
22 * the error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
23 *
24 * It can be very tempting to add handling for obscure cases here.
25 * In general any code for handling new cases should only be added iff:
26 * - You know how to test it.
27 * - You have a test that can be added to mce-test
28 * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
29 * - The case actually shows up as a frequent (top 10) page state in
30 * tools/vm/page-types when running a real workload.
23 * 31 *
24 * There are several operations here with exponential complexity because 32 * There are several operations here with exponential complexity because
25 * of unsuitable VM data structures. For example the operation to map back 33 * of unsuitable VM data structures. For example the operation to map back
@@ -28,13 +36,6 @@
28 * are rare we hope to get away with this. This avoids impacting the core 36 * are rare we hope to get away with this. This avoids impacting the core
29 * VM. 37 * VM.
30 */ 38 */
31
32/*
33 * Notebook:
34 * - hugetlb needs more code
35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
36 * - pass bad pages to kdump next kernel
37 */
38#include <linux/kernel.h> 39#include <linux/kernel.h>
39#include <linux/mm.h> 40#include <linux/mm.h>
40#include <linux/page-flags.h> 41#include <linux/page-flags.h>
@@ -56,6 +57,7 @@
56#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
57#include <linux/kfifo.h> 58#include <linux/kfifo.h>
58#include "internal.h" 59#include "internal.h"
60#include "ras/ras_event.h"
59 61
60int sysctl_memory_failure_early_kill __read_mostly = 0; 62int sysctl_memory_failure_early_kill __read_mostly = 0;
61 63
@@ -503,68 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
503 kfree(tk); 505 kfree(tk);
504} 506}
505 507
506/*
507 * Error handlers for various types of pages.
508 */
509
510enum outcome {
511 IGNORED, /* Error: cannot be handled */
512 FAILED, /* Error: handling failed */
513 DELAYED, /* Will be handled later */
514 RECOVERED, /* Successfully recovered */
515};
516
517static const char *action_name[] = { 508static const char *action_name[] = {
518 [IGNORED] = "Ignored", 509 [MF_IGNORED] = "Ignored",
519 [FAILED] = "Failed", 510 [MF_FAILED] = "Failed",
520 [DELAYED] = "Delayed", 511 [MF_DELAYED] = "Delayed",
521 [RECOVERED] = "Recovered", 512 [MF_RECOVERED] = "Recovered",
522};
523
524enum action_page_type {
525 MSG_KERNEL,
526 MSG_KERNEL_HIGH_ORDER,
527 MSG_SLAB,
528 MSG_DIFFERENT_COMPOUND,
529 MSG_POISONED_HUGE,
530 MSG_HUGE,
531 MSG_FREE_HUGE,
532 MSG_UNMAP_FAILED,
533 MSG_DIRTY_SWAPCACHE,
534 MSG_CLEAN_SWAPCACHE,
535 MSG_DIRTY_MLOCKED_LRU,
536 MSG_CLEAN_MLOCKED_LRU,
537 MSG_DIRTY_UNEVICTABLE_LRU,
538 MSG_CLEAN_UNEVICTABLE_LRU,
539 MSG_DIRTY_LRU,
540 MSG_CLEAN_LRU,
541 MSG_TRUNCATED_LRU,
542 MSG_BUDDY,
543 MSG_BUDDY_2ND,
544 MSG_UNKNOWN,
545}; 513};
546 514
547static const char * const action_page_types[] = { 515static const char * const action_page_types[] = {
548 [MSG_KERNEL] = "reserved kernel page", 516 [MF_MSG_KERNEL] = "reserved kernel page",
549 [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", 517 [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
550 [MSG_SLAB] = "kernel slab page", 518 [MF_MSG_SLAB] = "kernel slab page",
551 [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", 519 [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
552 [MSG_POISONED_HUGE] = "huge page already hardware poisoned", 520 [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
553 [MSG_HUGE] = "huge page", 521 [MF_MSG_HUGE] = "huge page",
554 [MSG_FREE_HUGE] = "free huge page", 522 [MF_MSG_FREE_HUGE] = "free huge page",
555 [MSG_UNMAP_FAILED] = "unmapping failed page", 523 [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
556 [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", 524 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
557 [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", 525 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
558 [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", 526 [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
559 [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", 527 [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
560 [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", 528 [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
561 [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", 529 [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
562 [MSG_DIRTY_LRU] = "dirty LRU page", 530 [MF_MSG_DIRTY_LRU] = "dirty LRU page",
563 [MSG_CLEAN_LRU] = "clean LRU page", 531 [MF_MSG_CLEAN_LRU] = "clean LRU page",
564 [MSG_TRUNCATED_LRU] = "already truncated LRU page", 532 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
565 [MSG_BUDDY] = "free buddy page", 533 [MF_MSG_BUDDY] = "free buddy page",
566 [MSG_BUDDY_2ND] = "free buddy page (2nd try)", 534 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
567 [MSG_UNKNOWN] = "unknown page", 535 [MF_MSG_UNKNOWN] = "unknown page",
568}; 536};
569 537
570/* 538/*
@@ -598,7 +566,7 @@ static int delete_from_lru_cache(struct page *p)
598 */ 566 */
599static int me_kernel(struct page *p, unsigned long pfn) 567static int me_kernel(struct page *p, unsigned long pfn)
600{ 568{
601 return IGNORED; 569 return MF_IGNORED;
602} 570}
603 571
604/* 572/*
@@ -607,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
607static int me_unknown(struct page *p, unsigned long pfn) 575static int me_unknown(struct page *p, unsigned long pfn)
608{ 576{
609 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); 577 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
610 return FAILED; 578 return MF_FAILED;
611} 579}
612 580
613/* 581/*
@@ -616,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
616static int me_pagecache_clean(struct page *p, unsigned long pfn) 584static int me_pagecache_clean(struct page *p, unsigned long pfn)
617{ 585{
618 int err; 586 int err;
619 int ret = FAILED; 587 int ret = MF_FAILED;
620 struct address_space *mapping; 588 struct address_space *mapping;
621 589
622 delete_from_lru_cache(p); 590 delete_from_lru_cache(p);
@@ -626,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
626 * should be the one m_f() holds. 594 * should be the one m_f() holds.
627 */ 595 */
628 if (PageAnon(p)) 596 if (PageAnon(p))
629 return RECOVERED; 597 return MF_RECOVERED;
630 598
631 /* 599 /*
632 * Now truncate the page in the page cache. This is really 600 * Now truncate the page in the page cache. This is really
@@ -640,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
640 /* 608 /*
641 * Page has been teared down in the meanwhile 609 * Page has been teared down in the meanwhile
642 */ 610 */
643 return FAILED; 611 return MF_FAILED;
644 } 612 }
645 613
646 /* 614 /*
@@ -657,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
657 !try_to_release_page(p, GFP_NOIO)) { 625 !try_to_release_page(p, GFP_NOIO)) {
658 pr_info("MCE %#lx: failed to release buffers\n", pfn); 626 pr_info("MCE %#lx: failed to release buffers\n", pfn);
659 } else { 627 } else {
660 ret = RECOVERED; 628 ret = MF_RECOVERED;
661 } 629 }
662 } else { 630 } else {
663 /* 631 /*
@@ -665,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
665 * This fails on dirty or anything with private pages 633 * This fails on dirty or anything with private pages
666 */ 634 */
667 if (invalidate_inode_page(p)) 635 if (invalidate_inode_page(p))
668 ret = RECOVERED; 636 ret = MF_RECOVERED;
669 else 637 else
670 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", 638 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
671 pfn); 639 pfn);
@@ -751,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
751 ClearPageUptodate(p); 719 ClearPageUptodate(p);
752 720
753 if (!delete_from_lru_cache(p)) 721 if (!delete_from_lru_cache(p))
754 return DELAYED; 722 return MF_DELAYED;
755 else 723 else
756 return FAILED; 724 return MF_FAILED;
757} 725}
758 726
759static int me_swapcache_clean(struct page *p, unsigned long pfn) 727static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -761,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
761 delete_from_swap_cache(p); 729 delete_from_swap_cache(p);
762 730
763 if (!delete_from_lru_cache(p)) 731 if (!delete_from_lru_cache(p))
764 return RECOVERED; 732 return MF_RECOVERED;
765 else 733 else
766 return FAILED; 734 return MF_FAILED;
767} 735}
768 736
769/* 737/*
@@ -776,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
776{ 744{
777 int res = 0; 745 int res = 0;
778 struct page *hpage = compound_head(p); 746 struct page *hpage = compound_head(p);
747
748 if (!PageHuge(hpage))
749 return MF_DELAYED;
750
779 /* 751 /*
780 * We can safely recover from error on free or reserved (i.e. 752 * We can safely recover from error on free or reserved (i.e.
781 * not in-use) hugepage by dequeuing it from freelist. 753 * not in-use) hugepage by dequeuing it from freelist.
@@ -789,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
789 if (!(page_mapping(hpage) || PageAnon(hpage))) { 761 if (!(page_mapping(hpage) || PageAnon(hpage))) {
790 res = dequeue_hwpoisoned_huge_page(hpage); 762 res = dequeue_hwpoisoned_huge_page(hpage);
791 if (!res) 763 if (!res)
792 return RECOVERED; 764 return MF_RECOVERED;
793 } 765 }
794 return DELAYED; 766 return MF_DELAYED;
795} 767}
796 768
797/* 769/*
@@ -823,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
823static struct page_state { 795static struct page_state {
824 unsigned long mask; 796 unsigned long mask;
825 unsigned long res; 797 unsigned long res;
826 enum action_page_type type; 798 enum mf_action_page_type type;
827 int (*action)(struct page *p, unsigned long pfn); 799 int (*action)(struct page *p, unsigned long pfn);
828} error_states[] = { 800} error_states[] = {
829 { reserved, reserved, MSG_KERNEL, me_kernel }, 801 { reserved, reserved, MF_MSG_KERNEL, me_kernel },
830 /* 802 /*
831 * free pages are specially detected outside this table: 803 * free pages are specially detected outside this table:
832 * PG_buddy pages only make a small fraction of all free pages. 804 * PG_buddy pages only make a small fraction of all free pages.
@@ -837,31 +809,31 @@ static struct page_state {
837 * currently unused objects without touching them. But just 809 * currently unused objects without touching them. But just
838 * treat it as standard kernel for now. 810 * treat it as standard kernel for now.
839 */ 811 */
840 { slab, slab, MSG_SLAB, me_kernel }, 812 { slab, slab, MF_MSG_SLAB, me_kernel },
841 813
842#ifdef CONFIG_PAGEFLAGS_EXTENDED 814#ifdef CONFIG_PAGEFLAGS_EXTENDED
843 { head, head, MSG_HUGE, me_huge_page }, 815 { head, head, MF_MSG_HUGE, me_huge_page },
844 { tail, tail, MSG_HUGE, me_huge_page }, 816 { tail, tail, MF_MSG_HUGE, me_huge_page },
845#else 817#else
846 { compound, compound, MSG_HUGE, me_huge_page }, 818 { compound, compound, MF_MSG_HUGE, me_huge_page },
847#endif 819#endif
848 820
849 { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, 821 { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
850 { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, 822 { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
851 823
852 { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, 824 { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
853 { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, 825 { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
854 826
855 { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, 827 { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
856 { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, 828 { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
857 829
858 { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, 830 { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
859 { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, 831 { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
860 832
861 /* 833 /*
862 * Catchall entry: must be at end. 834 * Catchall entry: must be at end.
863 */ 835 */
864 { 0, 0, MSG_UNKNOWN, me_unknown }, 836 { 0, 0, MF_MSG_UNKNOWN, me_unknown },
865}; 837};
866 838
867#undef dirty 839#undef dirty
@@ -881,8 +853,11 @@ static struct page_state {
881 * "Dirty/Clean" indication is not 100% accurate due to the possibility of 853 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
882 * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 854 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
883 */ 855 */
884static void action_result(unsigned long pfn, enum action_page_type type, int result) 856static void action_result(unsigned long pfn, enum mf_action_page_type type,
857 enum mf_result result)
885{ 858{
859 trace_memory_failure_event(pfn, type, result);
860
886 pr_err("MCE %#lx: recovery action for %s: %s\n", 861 pr_err("MCE %#lx: recovery action for %s: %s\n",
887 pfn, action_page_types[type], action_name[result]); 862 pfn, action_page_types[type], action_name[result]);
888} 863}
@@ -896,13 +871,13 @@ static int page_action(struct page_state *ps, struct page *p,
896 result = ps->action(p, pfn); 871 result = ps->action(p, pfn);
897 872
898 count = page_count(p) - 1; 873 count = page_count(p) - 1;
899 if (ps->action == me_swapcache_dirty && result == DELAYED) 874 if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
900 count--; 875 count--;
901 if (count != 0) { 876 if (count != 0) {
902 printk(KERN_ERR 877 printk(KERN_ERR
903 "MCE %#lx: %s still referenced by %d users\n", 878 "MCE %#lx: %s still referenced by %d users\n",
904 pfn, action_page_types[ps->type], count); 879 pfn, action_page_types[ps->type], count);
905 result = FAILED; 880 result = MF_FAILED;
906 } 881 }
907 action_result(pfn, ps->type, result); 882 action_result(pfn, ps->type, result);
908 883
@@ -911,9 +886,42 @@ static int page_action(struct page_state *ps, struct page *p,
911 * Could adjust zone counters here to correct for the missing page. 886 * Could adjust zone counters here to correct for the missing page.
912 */ 887 */
913 888
914 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 889 return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
915} 890}
916 891
892/**
893 * get_hwpoison_page() - Get refcount for memory error handling:
894 * @page: raw error page (hit by memory error)
895 *
 896 * Return: 0 if the refcount could not be grabbed, otherwise true (some
 897 * non-zero value).
898 */
899int get_hwpoison_page(struct page *page)
900{
901 struct page *head = compound_head(page);
902
903 if (PageHuge(head))
904 return get_page_unless_zero(head);
905
906 /*
 907 * THP tail pages have a special refcounting rule (their refcount is
 908 * stored in ->_mapcount), so we can't call get_page_unless_zero()
 909 * directly on a tail page.
910 */
911 if (PageTransHuge(head)) {
912 if (get_page_unless_zero(head)) {
913 if (PageTail(page))
914 get_page(page);
915 return 1;
916 } else {
917 return 0;
918 }
919 }
920
921 return get_page_unless_zero(page);
922}
923EXPORT_SYMBOL_GPL(get_hwpoison_page);
924
917/* 925/*
918 * Do all that is necessary to remove user space mappings. Unmap 926 * Do all that is necessary to remove user space mappings. Unmap
919 * the pages and send SIGBUS to the processes if the data was dirty. 927 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -927,7 +935,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
927 int ret; 935 int ret;
928 int kill = 1, forcekill; 936 int kill = 1, forcekill;
929 struct page *hpage = *hpagep; 937 struct page *hpage = *hpagep;
930 struct page *ppage;
931 938
932 /* 939 /*
933 * Here we are interested only in user-mapped pages, so skip any 940 * Here we are interested only in user-mapped pages, so skip any
@@ -977,59 +984,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
977 } 984 }
978 985
979 /* 986 /*
980 * ppage: poisoned page
981 * if p is regular page(4k page)
982 * ppage == real poisoned page;
983 * else p is hugetlb or THP, ppage == head page.
984 */
985 ppage = hpage;
986
987 if (PageTransHuge(hpage)) {
988 /*
989 * Verify that this isn't a hugetlbfs head page, the check for
990 * PageAnon is just for avoid tripping a split_huge_page
991 * internal debug check, as split_huge_page refuses to deal with
992 * anything that isn't an anon page. PageAnon can't go away fro
993 * under us because we hold a refcount on the hpage, without a
994 * refcount on the hpage. split_huge_page can't be safely called
995 * in the first place, having a refcount on the tail isn't
996 * enough * to be safe.
997 */
998 if (!PageHuge(hpage) && PageAnon(hpage)) {
999 if (unlikely(split_huge_page(hpage))) {
1000 /*
1001 * FIXME: if splitting THP is failed, it is
1002 * better to stop the following operation rather
1003 * than causing panic by unmapping. System might
1004 * survive if the page is freed later.
1005 */
1006 printk(KERN_INFO
1007 "MCE %#lx: failed to split THP\n", pfn);
1008
1009 BUG_ON(!PageHWPoison(p));
1010 return SWAP_FAIL;
1011 }
1012 /*
1013 * We pinned the head page for hwpoison handling,
1014 * now we split the thp and we are interested in
1015 * the hwpoisoned raw page, so move the refcount
1016 * to it. Similarly, page lock is shifted.
1017 */
1018 if (hpage != p) {
1019 if (!(flags & MF_COUNT_INCREASED)) {
1020 put_page(hpage);
1021 get_page(p);
1022 }
1023 lock_page(p);
1024 unlock_page(hpage);
1025 *hpagep = p;
1026 }
1027 /* THP is split, so ppage should be the real poisoned page. */
1028 ppage = p;
1029 }
1030 }
1031
1032 /*
1033 * First collect all the processes that have the page 987 * First collect all the processes that have the page
1034 * mapped in dirty form. This has to be done before try_to_unmap, 988 * mapped in dirty form. This has to be done before try_to_unmap,
1035 * because ttu takes the rmap data structures down. 989 * because ttu takes the rmap data structures down.
@@ -1038,12 +992,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
1038 * there's nothing that can be done. 992 * there's nothing that can be done.
1039 */ 993 */
1040 if (kill) 994 if (kill)
1041 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); 995 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1042 996
1043 ret = try_to_unmap(ppage, ttu); 997 ret = try_to_unmap(hpage, ttu);
1044 if (ret != SWAP_SUCCESS) 998 if (ret != SWAP_SUCCESS)
1045 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 999 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
1046 pfn, page_mapcount(ppage)); 1000 pfn, page_mapcount(hpage));
1047 1001
1048 /* 1002 /*
1049 * Now that the dirty bit has been propagated to the 1003 * Now that the dirty bit has been propagated to the
@@ -1055,7 +1009,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
1055 * use a more force-full uncatchable kill to prevent 1009 * use a more force-full uncatchable kill to prevent
1056 * any accesses to the poisoned memory. 1010 * any accesses to the poisoned memory.
1057 */ 1011 */
1058 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); 1012 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1059 kill_procs(&tokill, forcekill, trapno, 1013 kill_procs(&tokill, forcekill, trapno,
1060 ret != SWAP_SUCCESS, p, pfn, flags); 1014 ret != SWAP_SUCCESS, p, pfn, flags);
1061 1015
@@ -1101,6 +1055,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1101 struct page_state *ps; 1055 struct page_state *ps;
1102 struct page *p; 1056 struct page *p;
1103 struct page *hpage; 1057 struct page *hpage;
1058 struct page *orig_head;
1104 int res; 1059 int res;
1105 unsigned int nr_pages; 1060 unsigned int nr_pages;
1106 unsigned long page_flags; 1061 unsigned long page_flags;
@@ -1116,7 +1071,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1116 } 1071 }
1117 1072
1118 p = pfn_to_page(pfn); 1073 p = pfn_to_page(pfn);
1119 hpage = compound_head(p); 1074 orig_head = hpage = compound_head(p);
1120 if (TestSetPageHWPoison(p)) { 1075 if (TestSetPageHWPoison(p)) {
1121 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 1076 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1122 return 0; 1077 return 0;
@@ -1149,10 +1104,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1149 * In fact it's dangerous to directly bump up page count from 0, 1104 * In fact it's dangerous to directly bump up page count from 0,
1150 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 1105 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
1151 */ 1106 */
1152 if (!(flags & MF_COUNT_INCREASED) && 1107 if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1153 !get_page_unless_zero(hpage)) {
1154 if (is_free_buddy_page(p)) { 1108 if (is_free_buddy_page(p)) {
1155 action_result(pfn, MSG_BUDDY, DELAYED); 1109 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1156 return 0; 1110 return 0;
1157 } else if (PageHuge(hpage)) { 1111 } else if (PageHuge(hpage)) {
1158 /* 1112 /*
@@ -1169,16 +1123,39 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1169 } 1123 }
1170 set_page_hwpoison_huge_page(hpage); 1124 set_page_hwpoison_huge_page(hpage);
1171 res = dequeue_hwpoisoned_huge_page(hpage); 1125 res = dequeue_hwpoisoned_huge_page(hpage);
1172 action_result(pfn, MSG_FREE_HUGE, 1126 action_result(pfn, MF_MSG_FREE_HUGE,
1173 res ? IGNORED : DELAYED); 1127 res ? MF_IGNORED : MF_DELAYED);
1174 unlock_page(hpage); 1128 unlock_page(hpage);
1175 return res; 1129 return res;
1176 } else { 1130 } else {
1177 action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); 1131 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
1178 return -EBUSY; 1132 return -EBUSY;
1179 } 1133 }
1180 } 1134 }
1181 1135
1136 if (!PageHuge(p) && PageTransHuge(hpage)) {
1137 if (!PageAnon(hpage)) {
1138 pr_err("MCE: %#lx: non anonymous thp\n", pfn);
1139 if (TestClearPageHWPoison(p))
1140 atomic_long_sub(nr_pages, &num_poisoned_pages);
1141 put_page(p);
1142 if (p != hpage)
1143 put_page(hpage);
1144 return -EBUSY;
1145 }
1146 if (unlikely(split_huge_page(hpage))) {
1147 pr_err("MCE: %#lx: thp split failed\n", pfn);
1148 if (TestClearPageHWPoison(p))
1149 atomic_long_sub(nr_pages, &num_poisoned_pages);
1150 put_page(p);
1151 if (p != hpage)
1152 put_page(hpage);
1153 return -EBUSY;
1154 }
1155 VM_BUG_ON_PAGE(!page_count(p), p);
1156 hpage = compound_head(p);
1157 }
1158
1182 /* 1159 /*
1183 * We ignore non-LRU pages for good reasons. 1160 * We ignore non-LRU pages for good reasons.
1184 * - PG_locked is only well defined for LRU pages and a few others 1161 * - PG_locked is only well defined for LRU pages and a few others
@@ -1188,18 +1165,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1188 * walked by the page reclaim code, however that's not a big loss. 1165 * walked by the page reclaim code, however that's not a big loss.
1189 */ 1166 */
1190 if (!PageHuge(p)) { 1167 if (!PageHuge(p)) {
1191 if (!PageLRU(hpage)) 1168 if (!PageLRU(p))
1192 shake_page(hpage, 0); 1169 shake_page(p, 0);
1193 if (!PageLRU(hpage)) { 1170 if (!PageLRU(p)) {
1194 /* 1171 /*
1195 * shake_page could have turned it free. 1172 * shake_page could have turned it free.
1196 */ 1173 */
1197 if (is_free_buddy_page(p)) { 1174 if (is_free_buddy_page(p)) {
1198 if (flags & MF_COUNT_INCREASED) 1175 if (flags & MF_COUNT_INCREASED)
1199 action_result(pfn, MSG_BUDDY, DELAYED); 1176 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1200 else 1177 else
1201 action_result(pfn, MSG_BUDDY_2ND, 1178 action_result(pfn, MF_MSG_BUDDY_2ND,
1202 DELAYED); 1179 MF_DELAYED);
1203 return 0; 1180 return 0;
1204 } 1181 }
1205 } 1182 }
@@ -1211,8 +1188,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1211 * The page could have changed compound pages during the locking. 1188 * The page could have changed compound pages during the locking.
1212 * If this happens just bail out. 1189 * If this happens just bail out.
1213 */ 1190 */
1214 if (compound_head(p) != hpage) { 1191 if (PageCompound(p) && compound_head(p) != orig_head) {
1215 action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); 1192 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
1216 res = -EBUSY; 1193 res = -EBUSY;
1217 goto out; 1194 goto out;
1218 } 1195 }
@@ -1252,7 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1252 * on the head page to show that the hugepage is hwpoisoned 1229 * on the head page to show that the hugepage is hwpoisoned
1253 */ 1230 */
1254 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1231 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1255 action_result(pfn, MSG_POISONED_HUGE, IGNORED); 1232 action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
1256 unlock_page(hpage); 1233 unlock_page(hpage);
1257 put_page(hpage); 1234 put_page(hpage);
1258 return 0; 1235 return 0;
@@ -1281,7 +1258,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1281 */ 1258 */
1282 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1259 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1283 != SWAP_SUCCESS) { 1260 != SWAP_SUCCESS) {
1284 action_result(pfn, MSG_UNMAP_FAILED, IGNORED); 1261 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1285 res = -EBUSY; 1262 res = -EBUSY;
1286 goto out; 1263 goto out;
1287 } 1264 }
@@ -1290,7 +1267,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1290 * Torn down by someone else? 1267 * Torn down by someone else?
1291 */ 1268 */
1292 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1269 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1293 action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); 1270 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
1294 res = -EBUSY; 1271 res = -EBUSY;
1295 goto out; 1272 goto out;
1296 } 1273 }
@@ -1450,12 +1427,12 @@ int unpoison_memory(unsigned long pfn)
1450 */ 1427 */
1451 if (!PageHuge(page) && PageTransHuge(page)) { 1428 if (!PageHuge(page) && PageTransHuge(page)) {
1452 pr_info("MCE: Memory failure is now running on %#lx\n", pfn); 1429 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1453 return 0; 1430 return 0;
1454 } 1431 }
1455 1432
1456 nr_pages = 1 << compound_order(page); 1433 nr_pages = 1 << compound_order(page);
1457 1434
1458 if (!get_page_unless_zero(page)) { 1435 if (!get_hwpoison_page(p)) {
1459 /* 1436 /*
1460 * Since HWPoisoned hugepage should have non-zero refcount, 1437 * Since HWPoisoned hugepage should have non-zero refcount,
1461 * race between memory failure and unpoison seems to happen. 1438 * race between memory failure and unpoison seems to happen.
@@ -1523,7 +1500,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1523 * When the target page is a free hugepage, just remove it 1500 * When the target page is a free hugepage, just remove it
1524 * from free hugepage list. 1501 * from free hugepage list.
1525 */ 1502 */
1526 if (!get_page_unless_zero(compound_head(p))) { 1503 if (!get_hwpoison_page(p)) {
1527 if (PageHuge(p)) { 1504 if (PageHuge(p)) {
1528 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1505 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1529 ret = 0; 1506 ret = 0;
@@ -1694,20 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags)
1694 if (ret > 0) 1671 if (ret > 0)
1695 ret = -EIO; 1672 ret = -EIO;
1696 } else { 1673 } else {
1697 /*
1698 * After page migration succeeds, the source page can
1699 * be trapped in pagevec and actual freeing is delayed.
1700 * Freeing code works differently based on PG_hwpoison,
1701 * so there's a race. We need to make sure that the
1702 * source page should be freed back to buddy before
1703 * setting PG_hwpoison.
1704 */
1705 if (!is_free_buddy_page(page))
1706 drain_all_pages(page_zone(page));
1707 SetPageHWPoison(page); 1674 SetPageHWPoison(page);
1708 if (!is_free_buddy_page(page))
1709 pr_info("soft offline: %#lx: page leaked\n",
1710 pfn);
1711 atomic_long_inc(&num_poisoned_pages); 1675 atomic_long_inc(&num_poisoned_pages);
1712 } 1676 }
1713 } else { 1677 } else {
@@ -1759,14 +1723,6 @@ int soft_offline_page(struct page *page, int flags)
1759 1723
1760 get_online_mems(); 1724 get_online_mems();
1761 1725
1762 /*
1763 * Isolate the page, so that it doesn't get reallocated if it
1764 * was free. This flag should be kept set until the source page
1765 * is freed and PG_hwpoison on it is set.
1766 */
1767 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
1768 set_migratetype_isolate(page, true);
1769
1770 ret = get_any_page(page, pfn, flags); 1726 ret = get_any_page(page, pfn, flags);
1771 put_online_mems(); 1727 put_online_mems();
1772 if (ret > 0) { /* for in-use pages */ 1728 if (ret > 0) { /* for in-use pages */
@@ -1785,6 +1741,5 @@ int soft_offline_page(struct page *page, int flags)
1785 atomic_long_inc(&num_poisoned_pages); 1741 atomic_long_inc(&num_poisoned_pages);
1786 } 1742 }
1787 } 1743 }
1788 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1789 return ret; 1744 return ret;
1790} 1745}
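
Across memory-failure.c the outcome and page-type enums gain MF_ prefixes, apparently so the trace_memory_failure_event() call added in action_result() can share them with the new ras/ras_event.h header. How a poisoned page is classified is unchanged: the first error_states[] entry whose mask/res pair matches the page flags wins, with the zero-mask catchall at the end. A sketch of that lookup, using the table and struct shown above:

/* Sketch of the error_states[] lookup performed for a poisoned page. */
static struct page_state *example_page_state(unsigned long page_flags)
{
	struct page_state *ps;

	for (ps = error_states; ps->mask; ps++)
		if ((page_flags & ps->mask) == ps->res)
			return ps;

	return ps;	/* zero-mask catchall: MF_MSG_UNKNOWN / me_unknown */
}
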
diff --git a/mm/memory.c b/mm/memory.c
index 17734c3c1183..11b9ca176740 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2081,11 +2081,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2081 goto oom; 2081 goto oom;
2082 cow_user_page(new_page, old_page, address, vma); 2082 cow_user_page(new_page, old_page, address, vma);
2083 } 2083 }
2084 __SetPageUptodate(new_page);
2085 2084
2086 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2085 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2087 goto oom_free_new; 2086 goto oom_free_new;
2088 2087
2088 __SetPageUptodate(new_page);
2089
2089 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2090 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2090 2091
2091 /* 2092 /*
@@ -2689,6 +2690,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2689 page = alloc_zeroed_user_highpage_movable(vma, address); 2690 page = alloc_zeroed_user_highpage_movable(vma, address);
2690 if (!page) 2691 if (!page)
2691 goto oom; 2692 goto oom;
2693
2694 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2695 goto oom_free_page;
2696
2692 /* 2697 /*
2693 * The memory barrier inside __SetPageUptodate makes sure that 2698 * The memory barrier inside __SetPageUptodate makes sure that
2694 * preceeding stores to the page contents become visible before 2699 * preceeding stores to the page contents become visible before
@@ -2696,9 +2701,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2696 */ 2701 */
2697 __SetPageUptodate(page); 2702 __SetPageUptodate(page);
2698 2703
2699 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2700 goto oom_free_page;
2701
2702 entry = mk_pte(page, vma->vm_page_prot); 2704 entry = mk_pte(page, vma->vm_page_prot);
2703 if (vma->vm_flags & VM_WRITE) 2705 if (vma->vm_flags & VM_WRITE)
2704 entry = pte_mkwrite(pte_mkdirty(entry)); 2706 entry = pte_mkwrite(pte_mkdirty(entry));
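
Both memory.c hunks make the same reordering: the memcg charge now happens before __SetPageUptodate(). Read together, the do_anonymous_page() path ends up ordered as in this reduced sketch (error labels replaced by direct returns, mapping steps elided):

/* Reduced sketch of the resulting order; not the full fault path. */
static int example_anon_fault_order(struct mm_struct *mm,
				    struct vm_area_struct *vma,
				    unsigned long address)
{
	struct mem_cgroup *memcg;
	struct page *page;

	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		return VM_FAULT_OOM;

	/* charge first ... */
	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
		page_cache_release(page);
		return VM_FAULT_OOM;
	}

	/* ... then publish the zeroed contents */
	__SetPageUptodate(page);

	/* mk_pte(), set_pte_at() and friends follow here (elided) */
	return 0;
}
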
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9e88f749aa51..26fbba7d888f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -513,6 +513,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
513 break; 513 break;
514 err = 0; 514 err = 0;
515 } 515 }
516 vmemmap_populate_print_last();
516 517
517 return err; 518 return err;
518} 519}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 747743237d9f..99d4c1d0b858 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1972,35 +1972,41 @@ retry_cpuset:
1972 pol = get_vma_policy(vma, addr); 1972 pol = get_vma_policy(vma, addr);
1973 cpuset_mems_cookie = read_mems_allowed_begin(); 1973 cpuset_mems_cookie = read_mems_allowed_begin();
1974 1974
1975 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && 1975 if (pol->mode == MPOL_INTERLEAVE) {
1976 pol->mode != MPOL_INTERLEAVE)) { 1976 unsigned nid;
1977
1978 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1979 mpol_cond_put(pol);
1980 page = alloc_page_interleave(gfp, order, nid);
1981 goto out;
1982 }
1983
1984 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1985 int hpage_node = node;
1986
1977 /* 1987 /*
1978 * For hugepage allocation and non-interleave policy which 1988 * For hugepage allocation and non-interleave policy which
1979 * allows the current node, we only try to allocate from the 1989 * allows the current node (or other explicitly preferred
1980 * current node and don't fall back to other nodes, as the 1990 * node) we only try to allocate from the current/preferred
1981 * cost of remote accesses would likely offset THP benefits. 1991 * node and don't fall back to other nodes, as the cost of
1992 * remote accesses would likely offset THP benefits.
1982 * 1993 *
1983 * If the policy is interleave, or does not allow the current 1994 * If the policy is interleave, or does not allow the current
1984 * node in its nodemask, we allocate the standard way. 1995 * node in its nodemask, we allocate the standard way.
1985 */ 1996 */
1997 if (pol->mode == MPOL_PREFERRED &&
1998 !(pol->flags & MPOL_F_LOCAL))
1999 hpage_node = pol->v.preferred_node;
2000
1986 nmask = policy_nodemask(gfp, pol); 2001 nmask = policy_nodemask(gfp, pol);
1987 if (!nmask || node_isset(node, *nmask)) { 2002 if (!nmask || node_isset(hpage_node, *nmask)) {
1988 mpol_cond_put(pol); 2003 mpol_cond_put(pol);
1989 page = alloc_pages_exact_node(node, 2004 page = alloc_pages_exact_node(hpage_node,
1990 gfp | __GFP_THISNODE, order); 2005 gfp | __GFP_THISNODE, order);
1991 goto out; 2006 goto out;
1992 } 2007 }
1993 } 2008 }
1994 2009
1995 if (pol->mode == MPOL_INTERLEAVE) {
1996 unsigned nid;
1997
1998 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1999 mpol_cond_put(pol);
2000 page = alloc_page_interleave(gfp, order, nid);
2001 goto out;
2002 }
2003
2004 nmask = policy_nodemask(gfp, pol); 2010 nmask = policy_nodemask(gfp, pol);
2005 zl = policy_zonelist(gfp, pol, node); 2011 zl = policy_zonelist(gfp, pol, node);
2006 mpol_cond_put(pol); 2012 mpol_cond_put(pol);
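
After the reorder, alloc_pages_vma() handles MPOL_INTERLEAVE first and then, for transparent hugepage faults, allocates strictly on a single node: the local node by default, or the node named by an MPOL_PREFERRED policy without MPOL_F_LOCAL, always with __GFP_THISNODE so a THP is never pulled from a remote node. A condensed sketch of just that node choice (policy refcounting elided):

/* Condensed sketch of the THP node choice above. */
static struct page *example_thp_alloc(struct mempolicy *pol, gfp_t gfp,
				      int order, int node)
{
	int hpage_node = node;		/* default: current node */
	nodemask_t *nmask;

	if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
		hpage_node = pol->v.preferred_node;	/* explicit preference */

	nmask = policy_nodemask(gfp, pol);
	if (!nmask || node_isset(hpage_node, *nmask))
		return alloc_pages_exact_node(hpage_node,
					      gfp | __GFP_THISNODE, order);

	return NULL;	/* policy forbids that node: caller falls back */
}
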
diff --git a/mm/memtest.c b/mm/memtest.c
index 1997d934b13b..0a1cc133f6d7 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
74 u64 i; 74 u64 i;
75 phys_addr_t this_start, this_end; 75 phys_addr_t this_start, this_end;
76 76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { 77 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
78 &this_end, NULL) {
78 this_start = clamp(this_start, start, end); 79 this_start = clamp(this_start, start, end);
79 this_end = clamp(this_end, start, end); 80 this_end = clamp(this_end, start, end);
80 if (this_start < this_end) { 81 if (this_start < this_end) {
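
memtest keeps its previous behaviour by passing MEMBLOCK_NONE to the now flag-aware iterator. A caller that wanted to restrict itself to mirrored ranges would pass MEMBLOCK_MIRROR instead, for example:

/* Illustrative only: walk just the mirrored free ranges. */
static void __init example_dump_mirrored_ranges(void)
{
	phys_addr_t this_start, this_end;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_MIRROR,
				&this_start, &this_end, NULL)
		pr_info("mirrored free range: %pa - %pa\n",
			&this_start, &this_end);
}
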
diff --git a/mm/migrate.c b/mm/migrate.c
index f53838fe3dfe..ee401e4e5ef1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -918,7 +918,8 @@ out:
918static ICE_noinline int unmap_and_move(new_page_t get_new_page, 918static ICE_noinline int unmap_and_move(new_page_t get_new_page,
919 free_page_t put_new_page, 919 free_page_t put_new_page,
920 unsigned long private, struct page *page, 920 unsigned long private, struct page *page,
921 int force, enum migrate_mode mode) 921 int force, enum migrate_mode mode,
922 enum migrate_reason reason)
922{ 923{
923 int rc = 0; 924 int rc = 0;
924 int *result = NULL; 925 int *result = NULL;
@@ -949,7 +950,8 @@ out:
949 list_del(&page->lru); 950 list_del(&page->lru);
950 dec_zone_page_state(page, NR_ISOLATED_ANON + 951 dec_zone_page_state(page, NR_ISOLATED_ANON +
951 page_is_file_cache(page)); 952 page_is_file_cache(page));
952 putback_lru_page(page); 953 if (reason != MR_MEMORY_FAILURE)
954 putback_lru_page(page);
953 } 955 }
954 956
955 /* 957 /*
@@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1122 pass > 2, mode); 1124 pass > 2, mode);
1123 else 1125 else
1124 rc = unmap_and_move(get_new_page, put_new_page, 1126 rc = unmap_and_move(get_new_page, put_new_page,
1125 private, page, pass > 2, mode); 1127 private, page, pass > 2, mode,
1128 reason);
1126 1129
1127 switch(rc) { 1130 switch(rc) {
1128 case -ENOMEM: 1131 case -ENOMEM:
@@ -1796,7 +1799,7 @@ fail_putback:
1796 */ 1799 */
1797 flush_cache_range(vma, mmun_start, mmun_end); 1800 flush_cache_range(vma, mmun_start, mmun_end);
1798 page_add_anon_rmap(new_page, vma, mmun_start); 1801 page_add_anon_rmap(new_page, vma, mmun_start);
1799 pmdp_clear_flush_notify(vma, mmun_start, pmd); 1802 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
1800 set_pmd_at(mm, mmun_start, pmd, entry); 1803 set_pmd_at(mm, mmun_start, pmd, entry);
1801 flush_tlb_range(vma, mmun_start, mmun_end); 1804 flush_tlb_range(vma, mmun_start, mmun_end);
1802 update_mmu_cache_pmd(vma, address, &entry); 1805 update_mmu_cache_pmd(vma, address, &entry);
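
The new reason argument lets unmap_and_move() skip putback_lru_page() when a migration was triggered by memory failure, so a hwpoisoned source page is not handed back to the LRU behind the hwpoison code's back. The caller that benefits is soft offline; its migrate_pages() call looks roughly like this sketch (reconstructed from the mainline soft-offline path, not shown in this diff; new_page is the allocation callback defined in memory-failure.c):

/* Sketch of the soft-offline call site that passes MR_MEMORY_FAILURE. */
static int example_soft_offline_migrate(struct page *page)
{
	LIST_HEAD(pagelist);
	int ret;

	list_add(&page->lru, &pagelist);
	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
			    MIGRATE_SYNC, MR_MEMORY_FAILURE);
	/* on failure the real caller puts the page back itself (elided) */
	return ret;
}
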
diff --git a/mm/mmap.c b/mm/mmap.c
index bb50cacc3ea5..aa632ade2be7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1258 1258
1259 *populate = 0; 1259 *populate = 0;
1260 1260
1261 if (!len)
1262 return -EINVAL;
1263
1261 /* 1264 /*
1262 * Does the application expect PROT_READ to imply PROT_EXEC? 1265 * Does the application expect PROT_READ to imply PROT_EXEC?
1263 * 1266 *
@@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1268 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1271 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1269 prot |= PROT_EXEC; 1272 prot |= PROT_EXEC;
1270 1273
1271 if (!len)
1272 return -EINVAL;
1273
1274 if (!(flags & MAP_FIXED)) 1274 if (!(flags & MAP_FIXED))
1275 addr = round_hint_to_min(addr); 1275 addr = round_hint_to_min(addr);
1276 1276
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88584838e704..e7d6f1171ecb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,8 @@
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32#include "internal.h"
33
32/* 34/*
33 * For a prot_numa update we only hold mmap_sem for read so there is a 35 * For a prot_numa update we only hold mmap_sem for read so there is a
34 * potential race with faulting where a pmd was temporarily none. This 36 * potential race with faulting where a pmd was temporarily none. This
@@ -322,6 +324,15 @@ success:
322 change_protection(vma, start, end, vma->vm_page_prot, 324 change_protection(vma, start, end, vma->vm_page_prot,
323 dirty_accountable, 0); 325 dirty_accountable, 0);
324 326
327 /*
328 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
329 * fault on access.
330 */
331 if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
332 (newflags & VM_WRITE)) {
333 populate_vma_page_range(vma, start, end, NULL);
334 }
335
325 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 336 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
326 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 337 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
327 perf_event_mmap(vma); 338 perf_event_mmap(vma);
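
Note on the mm/mprotect.c hunks above: the block added to mprotect's success path pre-faults a private, mlocked VMA as soon as write permission is granted, so the COW copies are made up front instead of through major faults later (which is also why mm/internal.h is now included, for populate_vma_page_range()). The flag test is the interesting part; the sketch below only exercises that predicate -- the VM_* constants and the stub are illustrative, not taken from kernel headers.

#include <stdbool.h>
#include <stdio.h>

#define VM_WRITE   0x2UL
#define VM_SHARED  0x8UL
#define VM_LOCKED  0x2000UL   /* example values, chosen for this sketch */

/*
 * Mirror of the added condition: the old flags must be locked, private
 * (no VM_SHARED) and not yet writable, and the new flags must grant
 * write -- only then is the range populated eagerly.
 */
static bool should_populate(unsigned long oldflags, unsigned long newflags)
{
        return (oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
               (newflags & VM_WRITE);
}

int main(void)
{
        printf("%d\n", should_populate(VM_LOCKED, VM_LOCKED | VM_WRITE));              /* 1 */
        printf("%d\n", should_populate(VM_LOCKED | VM_SHARED, VM_LOCKED | VM_WRITE));  /* 0: shared mapping */
        printf("%d\n", should_populate(VM_LOCKED | VM_WRITE, VM_LOCKED | VM_WRITE));   /* 0: was already writable */
        return 0;
}
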
diff --git a/mm/mremap.c b/mm/mremap.c
index 034e2d360652..a7c93eceb1c8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
22#include <linux/mmu_notifier.h> 22#include <linux/mmu_notifier.h>
23#include <linux/sched/sysctl.h> 23#include <linux/sched/sysctl.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/mm-arch-hooks.h>
25 26
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 28#include <asm/tlbflush.h>
@@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
286 old_len = new_len; 287 old_len = new_len;
287 old_addr = new_addr; 288 old_addr = new_addr;
288 new_addr = -ENOMEM; 289 new_addr = -ENOMEM;
289 } else if (vma->vm_file && vma->vm_file->f_op->mremap) { 290 } else {
290 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); 291 if (vma->vm_file && vma->vm_file->f_op->mremap) {
291 if (err < 0) { 292 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
292 move_page_tables(new_vma, new_addr, vma, old_addr, 293 if (err < 0) {
293 moved_len, true); 294 move_page_tables(new_vma, new_addr, vma,
294 return err; 295 old_addr, moved_len, true);
296 return err;
297 }
295 } 298 }
299 arch_remap(mm, old_addr, old_addr + old_len,
300 new_addr, new_addr + new_len);
296 } 301 }
297 302
298 /* Conceal VM_ACCOUNT so old reservation is not undone */ 303 /* Conceal VM_ACCOUNT so old reservation is not undone */
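
Note on the mm/mremap.c hunks above: move_vma() now calls an arch_remap() hook (pulled in via the new linux/mm-arch-hooks.h include) once the page tables have been moved successfully and any f_op->mremap callback has succeeded. The generic hook is presumably an empty stub; the rough userspace model below only illustrates the kind of per-mm fix-up an architecture-specific version might perform, with cached_user_addr standing in for whatever state an arch keys by user addresses.

#include <stdio.h>

struct mm { unsigned long cached_user_addr; };

/*
 * Model of an architecture-specific arch_remap(): if this mm caches a
 * user-space address inside the moved range, relocate it to the new
 * range.  Purely illustrative -- the real hook takes an mm_struct.
 */
static void arch_remap(struct mm *mm,
                       unsigned long old_start, unsigned long old_end,
                       unsigned long new_start, unsigned long new_end)
{
        (void)new_end;
        if (mm->cached_user_addr >= old_start && mm->cached_user_addr < old_end)
                mm->cached_user_addr = new_start +
                                       (mm->cached_user_addr - old_start);
}

int main(void)
{
        struct mm mm = { .cached_user_addr = 0x1008 };

        /* Pretend move_vma() just relocated [0x1000, 0x2000) to 0x5000. */
        arch_remap(&mm, 0x1000, 0x2000, 0x5000, 0x6000);
        printf("cached address is now %#lx\n", mm.cached_user_addr);  /* 0x5008 */
        return 0;
}
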
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b50468333e..5258386fa1be 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
37{ 37{
38 void *ptr; 38 void *ptr;
39 u64 addr; 39 u64 addr;
40 ulong flags = choose_memblock_flags();
40 41
41 if (limit > memblock.current_limit) 42 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 43 limit = memblock.current_limit;
43 44
44 addr = memblock_find_in_range_node(size, align, goal, limit, nid); 45again:
46 addr = memblock_find_in_range_node(size, align, goal, limit, nid,
47 flags);
48 if (!addr && (flags & MEMBLOCK_MIRROR)) {
49 flags &= ~MEMBLOCK_MIRROR;
50 pr_warn("Could not allocate %pap bytes of mirrored memory\n",
51 &size);
52 goto again;
53 }
45 if (!addr) 54 if (!addr)
46 return NULL; 55 return NULL;
47 56
@@ -121,7 +130,8 @@ static unsigned long __init free_low_memory_core_early(void)
121 130
122 memblock_clear_hotplug(0, -1); 131 memblock_clear_hotplug(0, -1);
123 132
124 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) 133 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
134 NULL)
125 count += __free_memory_core(start, end); 135 count += __free_memory_core(start, end);
126 136
127#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 137#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
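
Note on the mm/nobootmem.c hunks above (and the matching flags argument added in mm/memtest.c): early allocations now ask memblock for ranges matching choose_memblock_flags() -- for example mirrored memory -- and, if no suitable range exists, drop the MEMBLOCK_MIRROR bit, warn, and retry without the restriction. The "try the preferred attribute, then fall back" loop is easy to model in isolation; everything below is a stand-in for the memblock API, not the real interface.

#include <stdio.h>

#define MEMBLOCK_NONE    0x0UL
#define MEMBLOCK_MIRROR  0x2UL   /* stand-in flag values */

/* Pretend allocator: mirrored memory is exhausted, plain memory is not. */
static unsigned long find_in_range(unsigned long size, unsigned long flags)
{
        if (flags & MEMBLOCK_MIRROR)
                return 0;                       /* no mirrored range large enough */
        return size ? 0x100000UL : 0;           /* fake physical address */
}

static unsigned long alloc_core_early(unsigned long size, unsigned long flags)
{
        unsigned long addr;

again:
        addr = find_in_range(size, flags);
        if (!addr && (flags & MEMBLOCK_MIRROR)) {
                flags &= ~MEMBLOCK_MIRROR;
                fprintf(stderr,
                        "Could not allocate %lu bytes of mirrored memory\n", size);
                goto again;                     /* retry without the mirror requirement */
        }
        return addr;
}

int main(void)
{
        printf("got 0x%lx\n", alloc_core_early(4096, MEMBLOCK_MIRROR));
        return 0;
}
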
diff --git a/mm/nommu.c b/mm/nommu.c
index e544508e2a4b..05e7447d960b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,22 +42,6 @@
42#include <asm/mmu_context.h> 42#include <asm/mmu_context.h>
43#include "internal.h" 43#include "internal.h"
44 44
45#if 0
46#define kenter(FMT, ...) \
47 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
48#define kleave(FMT, ...) \
49 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
50#define kdebug(FMT, ...) \
51 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
52#else
53#define kenter(FMT, ...) \
54 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
55#define kleave(FMT, ...) \
56 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
57#define kdebug(FMT, ...) \
58 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
59#endif
60
61void *high_memory; 45void *high_memory;
62EXPORT_SYMBOL(high_memory); 46EXPORT_SYMBOL(high_memory);
63struct page *mem_map; 47struct page *mem_map;
@@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to)
665 for (; from < to; from += PAGE_SIZE) { 649 for (; from < to; from += PAGE_SIZE) {
666 struct page *page = virt_to_page(from); 650 struct page *page = virt_to_page(from);
667 651
668 kdebug("- free %lx", from);
669 atomic_long_dec(&mmap_pages_allocated); 652 atomic_long_dec(&mmap_pages_allocated);
670 if (page_count(page) != 1)
671 kdebug("free page %p: refcount not one: %d",
672 page, page_count(page));
673 put_page(page); 653 put_page(page);
674 } 654 }
675} 655}
@@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to)
683static void __put_nommu_region(struct vm_region *region) 663static void __put_nommu_region(struct vm_region *region)
684 __releases(nommu_region_sem) 664 __releases(nommu_region_sem)
685{ 665{
686 kenter("%p{%d}", region, region->vm_usage);
687
688 BUG_ON(!nommu_region_tree.rb_node); 666 BUG_ON(!nommu_region_tree.rb_node);
689 667
690 if (--region->vm_usage == 0) { 668 if (--region->vm_usage == 0) {
@@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region)
697 675
698 /* IO memory and memory shared directly out of the pagecache 676 /* IO memory and memory shared directly out of the pagecache
699 * from ramfs/tmpfs mustn't be released here */ 677 * from ramfs/tmpfs mustn't be released here */
700 if (region->vm_flags & VM_MAPPED_COPY) { 678 if (region->vm_flags & VM_MAPPED_COPY)
701 kdebug("free series");
702 free_page_series(region->vm_start, region->vm_top); 679 free_page_series(region->vm_start, region->vm_top);
703 }
704 kmem_cache_free(vm_region_jar, region); 680 kmem_cache_free(vm_region_jar, region);
705 } else { 681 } else {
706 up_write(&nommu_region_sem); 682 up_write(&nommu_region_sem);
@@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
744 struct address_space *mapping; 720 struct address_space *mapping;
745 struct rb_node **p, *parent, *rb_prev; 721 struct rb_node **p, *parent, *rb_prev;
746 722
747 kenter(",%p", vma);
748
749 BUG_ON(!vma->vm_region); 723 BUG_ON(!vma->vm_region);
750 724
751 mm->map_count++; 725 mm->map_count++;
@@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
813 struct mm_struct *mm = vma->vm_mm; 787 struct mm_struct *mm = vma->vm_mm;
814 struct task_struct *curr = current; 788 struct task_struct *curr = current;
815 789
816 kenter("%p", vma);
817
818 protect_vma(vma, 0); 790 protect_vma(vma, 0);
819 791
820 mm->map_count--; 792 mm->map_count--;
@@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
854 */ 826 */
855static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) 827static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
856{ 828{
857 kenter("%p", vma);
858 if (vma->vm_ops && vma->vm_ops->close) 829 if (vma->vm_ops && vma->vm_ops->close)
859 vma->vm_ops->close(vma); 830 vma->vm_ops->close(vma);
860 if (vma->vm_file) 831 if (vma->vm_file)
@@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file,
957 int ret; 928 int ret;
958 929
959 /* do the simple checks first */ 930 /* do the simple checks first */
960 if (flags & MAP_FIXED) { 931 if (flags & MAP_FIXED)
961 printk(KERN_DEBUG
962 "%d: Can't do fixed-address/overlay mmap of RAM\n",
963 current->pid);
964 return -EINVAL; 932 return -EINVAL;
965 }
966 933
967 if ((flags & MAP_TYPE) != MAP_PRIVATE && 934 if ((flags & MAP_TYPE) != MAP_PRIVATE &&
968 (flags & MAP_TYPE) != MAP_SHARED) 935 (flags & MAP_TYPE) != MAP_SHARED)
@@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file,
1060 ) { 1027 ) {
1061 capabilities &= ~NOMMU_MAP_DIRECT; 1028 capabilities &= ~NOMMU_MAP_DIRECT;
1062 if (flags & MAP_SHARED) { 1029 if (flags & MAP_SHARED) {
1063 printk(KERN_WARNING 1030 pr_warn("MAP_SHARED not completely supported on !MMU\n");
1064 "MAP_SHARED not completely supported on !MMU\n");
1065 return -EINVAL; 1031 return -EINVAL;
1066 } 1032 }
1067 } 1033 }
@@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
1205 * we're allocating is smaller than a page 1171 * we're allocating is smaller than a page
1206 */ 1172 */
1207 order = get_order(len); 1173 order = get_order(len);
1208 kdebug("alloc order %d for %lx", order, len);
1209
1210 total = 1 << order; 1174 total = 1 << order;
1211 point = len >> PAGE_SHIFT; 1175 point = len >> PAGE_SHIFT;
1212 1176
1213 /* we don't want to allocate a power-of-2 sized page set */ 1177 /* we don't want to allocate a power-of-2 sized page set */
1214 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { 1178 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
1215 total = point; 1179 total = point;
1216 kdebug("try to alloc exact %lu pages", total);
1217 }
1218 1180
1219 base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); 1181 base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
1220 if (!base) 1182 if (!base)
@@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1285 unsigned long capabilities, vm_flags, result; 1247 unsigned long capabilities, vm_flags, result;
1286 int ret; 1248 int ret;
1287 1249
1288 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1289
1290 *populate = 0; 1250 *populate = 0;
1291 1251
1292 /* decide whether we should attempt the mapping, and if so what sort of 1252 /* decide whether we should attempt the mapping, and if so what sort of
1293 * mapping */ 1253 * mapping */
1294 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1254 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1295 &capabilities); 1255 &capabilities);
1296 if (ret < 0) { 1256 if (ret < 0)
1297 kleave(" = %d [val]", ret);
1298 return ret; 1257 return ret;
1299 }
1300 1258
1301 /* we ignore the address hint */ 1259 /* we ignore the address hint */
1302 addr = 0; 1260 addr = 0;
@@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file,
1383 vma->vm_start = start; 1341 vma->vm_start = start;
1384 vma->vm_end = start + len; 1342 vma->vm_end = start + len;
1385 1343
1386 if (pregion->vm_flags & VM_MAPPED_COPY) { 1344 if (pregion->vm_flags & VM_MAPPED_COPY)
1387 kdebug("share copy");
1388 vma->vm_flags |= VM_MAPPED_COPY; 1345 vma->vm_flags |= VM_MAPPED_COPY;
1389 } else { 1346 else {
1390 kdebug("share mmap");
1391 ret = do_mmap_shared_file(vma); 1347 ret = do_mmap_shared_file(vma);
1392 if (ret < 0) { 1348 if (ret < 0) {
1393 vma->vm_region = NULL; 1349 vma->vm_region = NULL;
@@ -1467,7 +1423,6 @@ share:
1467 1423
1468 up_write(&nommu_region_sem); 1424 up_write(&nommu_region_sem);
1469 1425
1470 kleave(" = %lx", result);
1471 return result; 1426 return result;
1472 1427
1473error_just_free: 1428error_just_free:
@@ -1479,27 +1434,24 @@ error:
1479 if (vma->vm_file) 1434 if (vma->vm_file)
1480 fput(vma->vm_file); 1435 fput(vma->vm_file);
1481 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1482 kleave(" = %d", ret);
1483 return ret; 1437 return ret;
1484 1438
1485sharing_violation: 1439sharing_violation:
1486 up_write(&nommu_region_sem); 1440 up_write(&nommu_region_sem);
1487 printk(KERN_WARNING "Attempt to share mismatched mappings\n"); 1441 pr_warn("Attempt to share mismatched mappings\n");
1488 ret = -EINVAL; 1442 ret = -EINVAL;
1489 goto error; 1443 goto error;
1490 1444
1491error_getting_vma: 1445error_getting_vma:
1492 kmem_cache_free(vm_region_jar, region); 1446 kmem_cache_free(vm_region_jar, region);
1493 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1447 pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
1494 " from process %d failed\n", 1448 len, current->pid);
1495 len, current->pid);
1496 show_free_areas(0); 1449 show_free_areas(0);
1497 return -ENOMEM; 1450 return -ENOMEM;
1498 1451
1499error_getting_region: 1452error_getting_region:
1500 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1453 pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
1501 " from process %d failed\n", 1454 len, current->pid);
1502 len, current->pid);
1503 show_free_areas(0); 1455 show_free_areas(0);
1504 return -ENOMEM; 1456 return -ENOMEM;
1505} 1457}
@@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1563 struct vm_region *region; 1515 struct vm_region *region;
1564 unsigned long npages; 1516 unsigned long npages;
1565 1517
1566 kenter("");
1567
1568 /* we're only permitted to split anonymous regions (these should have 1518 /* we're only permitted to split anonymous regions (these should have
1569 * only a single usage on the region) */ 1519 * only a single usage on the region) */
1570 if (vma->vm_file) 1520 if (vma->vm_file)
@@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm,
1628{ 1578{
1629 struct vm_region *region; 1579 struct vm_region *region;
1630 1580
1631 kenter("");
1632
1633 /* adjust the VMA's pointers, which may reposition it in the MM's tree 1581 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1634 * and list */ 1582 * and list */
1635 delete_vma_from_mm(vma); 1583 delete_vma_from_mm(vma);
@@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1669 unsigned long end; 1617 unsigned long end;
1670 int ret; 1618 int ret;
1671 1619
1672 kenter(",%lx,%zx", start, len);
1673
1674 len = PAGE_ALIGN(len); 1620 len = PAGE_ALIGN(len);
1675 if (len == 0) 1621 if (len == 0)
1676 return -EINVAL; 1622 return -EINVAL;
@@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1682 if (!vma) { 1628 if (!vma) {
1683 static int limit; 1629 static int limit;
1684 if (limit < 5) { 1630 if (limit < 5) {
1685 printk(KERN_WARNING 1631 pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
1686 "munmap of memory not mmapped by process %d" 1632 current->pid, current->comm,
1687 " (%s): 0x%lx-0x%lx\n", 1633 start, start + len - 1);
1688 current->pid, current->comm,
1689 start, start + len - 1);
1690 limit++; 1634 limit++;
1691 } 1635 }
1692 return -EINVAL; 1636 return -EINVAL;
@@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1695 /* we're allowed to split an anonymous VMA but not a file-backed one */ 1639 /* we're allowed to split an anonymous VMA but not a file-backed one */
1696 if (vma->vm_file) { 1640 if (vma->vm_file) {
1697 do { 1641 do {
1698 if (start > vma->vm_start) { 1642 if (start > vma->vm_start)
1699 kleave(" = -EINVAL [miss]");
1700 return -EINVAL; 1643 return -EINVAL;
1701 }
1702 if (end == vma->vm_end) 1644 if (end == vma->vm_end)
1703 goto erase_whole_vma; 1645 goto erase_whole_vma;
1704 vma = vma->vm_next; 1646 vma = vma->vm_next;
1705 } while (vma); 1647 } while (vma);
1706 kleave(" = -EINVAL [split file]");
1707 return -EINVAL; 1648 return -EINVAL;
1708 } else { 1649 } else {
1709 /* the chunk must be a subset of the VMA found */ 1650 /* the chunk must be a subset of the VMA found */
1710 if (start == vma->vm_start && end == vma->vm_end) 1651 if (start == vma->vm_start && end == vma->vm_end)
1711 goto erase_whole_vma; 1652 goto erase_whole_vma;
1712 if (start < vma->vm_start || end > vma->vm_end) { 1653 if (start < vma->vm_start || end > vma->vm_end)
1713 kleave(" = -EINVAL [superset]");
1714 return -EINVAL; 1654 return -EINVAL;
1715 } 1655 if (start & ~PAGE_MASK)
1716 if (start & ~PAGE_MASK) {
1717 kleave(" = -EINVAL [unaligned start]");
1718 return -EINVAL; 1656 return -EINVAL;
1719 } 1657 if (end != vma->vm_end && end & ~PAGE_MASK)
1720 if (end != vma->vm_end && end & ~PAGE_MASK) {
1721 kleave(" = -EINVAL [unaligned split]");
1722 return -EINVAL; 1658 return -EINVAL;
1723 }
1724 if (start != vma->vm_start && end != vma->vm_end) { 1659 if (start != vma->vm_start && end != vma->vm_end) {
1725 ret = split_vma(mm, vma, start, 1); 1660 ret = split_vma(mm, vma, start, 1);
1726 if (ret < 0) { 1661 if (ret < 0)
1727 kleave(" = %d [split]", ret);
1728 return ret; 1662 return ret;
1729 }
1730 } 1663 }
1731 return shrink_vma(mm, vma, start, end); 1664 return shrink_vma(mm, vma, start, end);
1732 } 1665 }
@@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1734erase_whole_vma: 1667erase_whole_vma:
1735 delete_vma_from_mm(vma); 1668 delete_vma_from_mm(vma);
1736 delete_vma(mm, vma); 1669 delete_vma(mm, vma);
1737 kleave(" = 0");
1738 return 0; 1670 return 0;
1739} 1671}
1740EXPORT_SYMBOL(do_munmap); 1672EXPORT_SYMBOL(do_munmap);
@@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm)
1766 if (!mm) 1698 if (!mm)
1767 return; 1699 return;
1768 1700
1769 kenter("");
1770
1771 mm->total_vm = 0; 1701 mm->total_vm = 0;
1772 1702
1773 while ((vma = mm->mmap)) { 1703 while ((vma = mm->mmap)) {
@@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm)
1776 delete_vma(mm, vma); 1706 delete_vma(mm, vma);
1777 cond_resched(); 1707 cond_resched();
1778 } 1708 }
1779
1780 kleave("");
1781} 1709}
1782 1710
1783unsigned long vm_brk(unsigned long addr, unsigned long len) 1711unsigned long vm_brk(unsigned long addr, unsigned long len)
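
Note on the mm/nommu.c hunks above: this is mostly noise reduction -- the private kenter()/kleave()/kdebug() tracing macros (compiled out via no_printk() in the old code) are deleted, and the remaining printk(KERN_WARNING ...) calls become pr_warn() with their message strings joined onto single lines. The pattern being removed, a debug macro that keeps printf-style format checking but generates no code when disabled, looks roughly like the userspace sketch below; the macro name is the kernel's, the plumbing is not.

#include <stdio.h>

#define DEBUG_TRACING 0

#if DEBUG_TRACING
#define kenter(FMT, ...) \
        fprintf(stderr, "==> %s(" FMT ")\n", __func__, ##__VA_ARGS__)
#else
/*
 * Disabled: the if (0) branch is never taken and is optimized away, but
 * the compiler still type-checks the format string against its arguments.
 */
#define kenter(FMT, ...) \
        do { if (0) fprintf(stderr, "==> %s(" FMT ")\n", __func__, ##__VA_ARGS__); } while (0)
#endif

static int do_work(int n)
{
        kenter("%d", n);        /* compiles to nothing when DEBUG_TRACING is 0 */
        return n * 2;
}

int main(void)
{
        printf("%d\n", do_work(21));
        return 0;
}
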
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da1b3c9..dff991e0681e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,8 @@
42int sysctl_panic_on_oom; 42int sysctl_panic_on_oom;
43int sysctl_oom_kill_allocating_task; 43int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45
46DEFINE_MUTEX(oom_lock);
46 47
47#ifdef CONFIG_NUMA 48#ifdef CONFIG_NUMA
48/** 49/**
@@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
405static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); 406static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
406 407
407bool oom_killer_disabled __read_mostly; 408bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem);
409 409
410/** 410/**
411 * mark_tsk_oom_victim - marks the given task as OOM victim. 411 * mark_oom_victim - mark the given task as OOM victim
412 * @tsk: task to mark 412 * @tsk: task to mark
413 * 413 *
414 * Has to be called with oom_sem taken for read and never after 414 * Has to be called with oom_lock held and never after
415 * oom has been disabled already. 415 * oom has been disabled already.
416 */ 416 */
417void mark_tsk_oom_victim(struct task_struct *tsk) 417void mark_oom_victim(struct task_struct *tsk)
418{ 418{
419 WARN_ON(oom_killer_disabled); 419 WARN_ON(oom_killer_disabled);
420 /* OOM killer might race with memcg OOM */ 420 /* OOM killer might race with memcg OOM */
@@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
431} 431}
432 432
433/** 433/**
434 * unmark_oom_victim - unmarks the current task as OOM victim. 434 * exit_oom_victim - note the exit of an OOM victim
435 *
436 * Wakes up all waiters in oom_killer_disable()
437 */ 435 */
438void unmark_oom_victim(void) 436void exit_oom_victim(void)
439{ 437{
440 if (!test_and_clear_thread_flag(TIF_MEMDIE)) 438 clear_thread_flag(TIF_MEMDIE);
441 return;
442 439
443 down_read(&oom_sem); 440 if (!atomic_dec_return(&oom_victims))
444 /*
445 * There is no need to signal the lasst oom_victim if there
446 * is nobody who cares.
447 */
448 if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
449 wake_up_all(&oom_victims_wait); 441 wake_up_all(&oom_victims_wait);
450 up_read(&oom_sem);
451} 442}
452 443
453/** 444/**
@@ -469,14 +460,14 @@ bool oom_killer_disable(void)
469 * Make sure to not race with an ongoing OOM killer 460 * Make sure to not race with an ongoing OOM killer
470 * and that the current is not the victim. 461 * and that the current is not the victim.
471 */ 462 */
472 down_write(&oom_sem); 463 mutex_lock(&oom_lock);
473 if (test_thread_flag(TIF_MEMDIE)) { 464 if (test_thread_flag(TIF_MEMDIE)) {
474 up_write(&oom_sem); 465 mutex_unlock(&oom_lock);
475 return false; 466 return false;
476 } 467 }
477 468
478 oom_killer_disabled = true; 469 oom_killer_disabled = true;
479 up_write(&oom_sem); 470 mutex_unlock(&oom_lock);
480 471
481 wait_event(oom_victims_wait, !atomic_read(&oom_victims)); 472 wait_event(oom_victims_wait, !atomic_read(&oom_victims));
482 473
@@ -488,9 +479,7 @@ bool oom_killer_disable(void)
488 */ 479 */
489void oom_killer_enable(void) 480void oom_killer_enable(void)
490{ 481{
491 down_write(&oom_sem);
492 oom_killer_disabled = false; 482 oom_killer_disabled = false;
493 up_write(&oom_sem);
494} 483}
495 484
496#define K(x) ((x) << (PAGE_SHIFT-10)) 485#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -517,7 +506,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
517 */ 506 */
518 task_lock(p); 507 task_lock(p);
519 if (p->mm && task_will_free_mem(p)) { 508 if (p->mm && task_will_free_mem(p)) {
520 mark_tsk_oom_victim(p); 509 mark_oom_victim(p);
521 task_unlock(p); 510 task_unlock(p);
522 put_task_struct(p); 511 put_task_struct(p);
523 return; 512 return;
@@ -528,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
528 dump_header(p, gfp_mask, order, memcg, nodemask); 517 dump_header(p, gfp_mask, order, memcg, nodemask);
529 518
530 task_lock(p); 519 task_lock(p);
531 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 520 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
532 message, task_pid_nr(p), p->comm, points); 521 message, task_pid_nr(p), p->comm, points);
533 task_unlock(p); 522 task_unlock(p);
534 523
@@ -572,7 +561,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
572 561
573 /* mm cannot safely be dereferenced after task_unlock(victim) */ 562 /* mm cannot safely be dereferenced after task_unlock(victim) */
574 mm = victim->mm; 563 mm = victim->mm;
575 mark_tsk_oom_victim(victim); 564 mark_oom_victim(victim);
576 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 565 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
577 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 566 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
578 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 567 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -645,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
645} 634}
646EXPORT_SYMBOL_GPL(unregister_oom_notifier); 635EXPORT_SYMBOL_GPL(unregister_oom_notifier);
647 636
648/*
649 * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
650 * if a parallel OOM killing is already taking place that includes a zone in
651 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
652 */
653bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
654{
655 struct zoneref *z;
656 struct zone *zone;
657 bool ret = true;
658
659 spin_lock(&zone_scan_lock);
660 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
661 if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
662 ret = false;
663 goto out;
664 }
665
666 /*
667 * Lock each zone in the zonelist under zone_scan_lock so a parallel
668 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
669 */
670 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
671 set_bit(ZONE_OOM_LOCKED, &zone->flags);
672
673out:
674 spin_unlock(&zone_scan_lock);
675 return ret;
676}
677
678/*
679 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
680 * allocation attempts with zonelists containing them may now recall the OOM
681 * killer, if necessary.
682 */
683void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
684{
685 struct zoneref *z;
686 struct zone *zone;
687
688 spin_lock(&zone_scan_lock);
689 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
690 clear_bit(ZONE_OOM_LOCKED, &zone->flags);
691 spin_unlock(&zone_scan_lock);
692}
693
694/** 637/**
695 * __out_of_memory - kill the "best" process when we run out of memory 638 * __out_of_memory - kill the "best" process when we run out of memory
696 * @zonelist: zonelist pointer 639 * @zonelist: zonelist pointer
@@ -704,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
704 * OR try to be smart about which process to kill. Note that we 647 * OR try to be smart about which process to kill. Note that we
705 * don't have to be perfect here, we just have to be good. 648 * don't have to be perfect here, we just have to be good.
706 */ 649 */
707static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 650bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
708 int order, nodemask_t *nodemask, bool force_kill) 651 int order, nodemask_t *nodemask, bool force_kill)
709{ 652{
710 const nodemask_t *mpol_mask; 653 const nodemask_t *mpol_mask;
711 struct task_struct *p; 654 struct task_struct *p;
@@ -715,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
715 enum oom_constraint constraint = CONSTRAINT_NONE; 658 enum oom_constraint constraint = CONSTRAINT_NONE;
716 int killed = 0; 659 int killed = 0;
717 660
661 if (oom_killer_disabled)
662 return false;
663
718 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 664 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
719 if (freed > 0) 665 if (freed > 0)
720 /* Got some memory back in the last second. */ 666 /* Got some memory back in the last second. */
721 return; 667 goto out;
722 668
723 /* 669 /*
724 * If current has a pending SIGKILL or is exiting, then automatically 670 * If current has a pending SIGKILL or is exiting, then automatically
@@ -730,8 +676,8 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
730 */ 676 */
731 if (current->mm && 677 if (current->mm &&
732 (fatal_signal_pending(current) || task_will_free_mem(current))) { 678 (fatal_signal_pending(current) || task_will_free_mem(current))) {
733 mark_tsk_oom_victim(current); 679 mark_oom_victim(current);
734 return; 680 goto out;
735 } 681 }
736 682
737 /* 683 /*
@@ -771,32 +717,8 @@ out:
771 */ 717 */
772 if (killed) 718 if (killed)
773 schedule_timeout_killable(1); 719 schedule_timeout_killable(1);
774}
775
776/**
777 * out_of_memory - tries to invoke OOM killer.
778 * @zonelist: zonelist pointer
779 * @gfp_mask: memory allocation flags
780 * @order: amount of memory being requested as a power of 2
781 * @nodemask: nodemask passed to page allocator
782 * @force_kill: true if a task must be killed, even if others are exiting
783 *
784 * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
785 * when it returns false. Otherwise returns true.
786 */
787bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
788 int order, nodemask_t *nodemask, bool force_kill)
789{
790 bool ret = false;
791
792 down_read(&oom_sem);
793 if (!oom_killer_disabled) {
794 __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
795 ret = true;
796 }
797 up_read(&oom_sem);
798 720
799 return ret; 721 return true;
800} 722}
801 723
802/* 724/*
@@ -806,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
806 */ 728 */
807void pagefault_out_of_memory(void) 729void pagefault_out_of_memory(void)
808{ 730{
809 struct zonelist *zonelist;
810
811 down_read(&oom_sem);
812 if (mem_cgroup_oom_synchronize(true)) 731 if (mem_cgroup_oom_synchronize(true))
813 goto unlock; 732 return;
814 733
815 zonelist = node_zonelist(first_memory_node, GFP_KERNEL); 734 if (!mutex_trylock(&oom_lock))
816 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { 735 return;
817 if (!oom_killer_disabled)
818 __out_of_memory(NULL, 0, 0, NULL, false);
819 else
820 /*
821 * There shouldn't be any user tasks runable while the
822 * OOM killer is disabled so the current task has to
823 * be a racing OOM victim for which oom_killer_disable()
824 * is waiting for.
825 */
826 WARN_ON(test_thread_flag(TIF_MEMDIE));
827 736
828 oom_zonelist_unlock(zonelist, GFP_KERNEL); 737 if (!out_of_memory(NULL, 0, 0, NULL, false)) {
738 /*
739 * There shouldn't be any user tasks runnable while the
740 * OOM killer is disabled, so the current task has to
741 * be a racing OOM victim for which oom_killer_disable()
742 * is waiting for.
743 */
744 WARN_ON(test_thread_flag(TIF_MEMDIE));
829 } 745 }
830unlock: 746
831 up_read(&oom_sem); 747 mutex_unlock(&oom_lock);
832} 748}
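
Note on the mm/oom_kill.c hunks above: two serialization mechanisms -- the per-zonelist ZONE_OOM_LOCKED bits and the oom_sem rwsem -- collapse into a single global oom_lock mutex, and out_of_memory() itself now bails out when the killer is disabled. Callers that only want to avoid piling up concurrent OOM kills use mutex_trylock() and treat "somebody else holds the lock" as "somebody else is making progress". A small pthread model of that trylock-and-back-off pattern follows; the names echo the kernel's, but this is a sketch, not the kernel locking code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the expensive, serialized work (picking and killing a task). */
static void kill_something(void)
{
        puts("invoking the OOM killer");
}

static void *allocation_path(void *arg)
{
        (void)arg;
        /*
         * If another thread already holds oom_lock it is presumed to be
         * freeing memory for everyone, so back off and let the caller
         * retry its allocation instead of queuing a second kill.
         */
        if (pthread_mutex_trylock(&oom_lock) != 0) {
                puts("oom_lock busy - assuming someone else is making progress");
                return NULL;
        }
        kill_something();
        pthread_mutex_unlock(&oom_lock);
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, allocation_path, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}
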
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2fd31aebef30..5e6fa06f2784 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -380,20 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
380 } 380 }
381} 381}
382 382
383static inline void prep_zero_page(struct page *page, unsigned int order,
384 gfp_t gfp_flags)
385{
386 int i;
387
388 /*
389 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
390 * and __GFP_HIGHMEM from hard or soft interrupt context.
391 */
392 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
393 for (i = 0; i < (1 << order); i++)
394 clear_highpage(page + i);
395}
396
397#ifdef CONFIG_DEBUG_PAGEALLOC 383#ifdef CONFIG_DEBUG_PAGEALLOC
398unsigned int _debug_guardpage_minorder; 384unsigned int _debug_guardpage_minorder;
399bool _debug_pagealloc_enabled __read_mostly; 385bool _debug_pagealloc_enabled __read_mostly;
@@ -975,7 +961,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
975 kasan_alloc_pages(page, order); 961 kasan_alloc_pages(page, order);
976 962
977 if (gfp_flags & __GFP_ZERO) 963 if (gfp_flags & __GFP_ZERO)
978 prep_zero_page(page, order, gfp_flags); 964 for (i = 0; i < (1 << order); i++)
965 clear_highpage(page + i);
979 966
980 if (order && (gfp_flags & __GFP_COMP)) 967 if (order && (gfp_flags & __GFP_COMP))
981 prep_compound_page(page, order); 968 prep_compound_page(page, order);
@@ -2322,48 +2309,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2322 show_mem(filter); 2309 show_mem(filter);
2323} 2310}
2324 2311
2325static inline int
2326should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2327 unsigned long did_some_progress,
2328 unsigned long pages_reclaimed)
2329{
2330 /* Do not loop if specifically requested */
2331 if (gfp_mask & __GFP_NORETRY)
2332 return 0;
2333
2334 /* Always retry if specifically requested */
2335 if (gfp_mask & __GFP_NOFAIL)
2336 return 1;
2337
2338 /*
2339 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2340 * making forward progress without invoking OOM. Suspend also disables
2341 * storage devices so kswapd will not help. Bail if we are suspending.
2342 */
2343 if (!did_some_progress && pm_suspended_storage())
2344 return 0;
2345
2346 /*
2347 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2348 * means __GFP_NOFAIL, but that may not be true in other
2349 * implementations.
2350 */
2351 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2352 return 1;
2353
2354 /*
2355 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2356 * specified, then we retry until we no longer reclaim any pages
2357 * (above), or we've reclaimed an order of pages at least as
2358 * large as the allocation's order. In both cases, if the
2359 * allocation still fails, we stop retrying.
2360 */
2361 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2362 return 1;
2363
2364 return 0;
2365}
2366
2367static inline struct page * 2312static inline struct page *
2368__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2313__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2369 const struct alloc_context *ac, unsigned long *did_some_progress) 2314 const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2373,10 +2318,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2373 *did_some_progress = 0; 2318 *did_some_progress = 0;
2374 2319
2375 /* 2320 /*
2376 * Acquire the per-zone oom lock for each zone. If that 2321 * Acquire the oom lock. If that fails, somebody else is
2377 * fails, somebody else is making progress for us. 2322 * making progress for us.
2378 */ 2323 */
2379 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { 2324 if (!mutex_trylock(&oom_lock)) {
2380 *did_some_progress = 1; 2325 *did_some_progress = 1;
2381 schedule_timeout_uninterruptible(1); 2326 schedule_timeout_uninterruptible(1);
2382 return NULL; 2327 return NULL;
@@ -2402,16 +2347,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2402 /* The OOM killer does not needlessly kill tasks for lowmem */ 2347 /* The OOM killer does not needlessly kill tasks for lowmem */
2403 if (ac->high_zoneidx < ZONE_NORMAL) 2348 if (ac->high_zoneidx < ZONE_NORMAL)
2404 goto out; 2349 goto out;
2405 /* The OOM killer does not compensate for light reclaim */ 2350 /* The OOM killer does not compensate for IO-less reclaim */
2406 if (!(gfp_mask & __GFP_FS)) { 2351 if (!(gfp_mask & __GFP_FS)) {
2407 /* 2352 /*
2408 * XXX: Page reclaim didn't yield anything, 2353 * XXX: Page reclaim didn't yield anything,
2409 * and the OOM killer can't be invoked, but 2354 * and the OOM killer can't be invoked, but
2410 * keep looping as per should_alloc_retry(). 2355 * keep looping as per tradition.
2411 */ 2356 */
2412 *did_some_progress = 1; 2357 *did_some_progress = 1;
2413 goto out; 2358 goto out;
2414 } 2359 }
2360 if (pm_suspended_storage())
2361 goto out;
2415 /* The OOM killer may not free memory on a specific node */ 2362 /* The OOM killer may not free memory on a specific node */
2416 if (gfp_mask & __GFP_THISNODE) 2363 if (gfp_mask & __GFP_THISNODE)
2417 goto out; 2364 goto out;
@@ -2421,7 +2368,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2421 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) 2368 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2422 *did_some_progress = 1; 2369 *did_some_progress = 1;
2423out: 2370out:
2424 oom_zonelist_unlock(ac->zonelist, gfp_mask); 2371 mutex_unlock(&oom_lock);
2425 return page; 2372 return page;
2426} 2373}
2427 2374
@@ -2794,40 +2741,40 @@ retry:
2794 if (page) 2741 if (page)
2795 goto got_pg; 2742 goto got_pg;
2796 2743
2797 /* Check if we should retry the allocation */ 2744 /* Do not loop if specifically requested */
2745 if (gfp_mask & __GFP_NORETRY)
2746 goto noretry;
2747
2748 /* Keep reclaiming pages as long as there is reasonable progress */
2798 pages_reclaimed += did_some_progress; 2749 pages_reclaimed += did_some_progress;
2799 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2750 if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
2800 pages_reclaimed)) { 2751 ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
2801 /*
2802 * If we fail to make progress by freeing individual
2803 * pages, but the allocation wants us to keep going,
2804 * start OOM killing tasks.
2805 */
2806 if (!did_some_progress) {
2807 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2808 &did_some_progress);
2809 if (page)
2810 goto got_pg;
2811 if (!did_some_progress)
2812 goto nopage;
2813 }
2814 /* Wait for some write requests to complete then retry */ 2752 /* Wait for some write requests to complete then retry */
2815 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); 2753 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2816 goto retry; 2754 goto retry;
2817 } else {
2818 /*
2819 * High-order allocations do not necessarily loop after
2820 * direct reclaim and reclaim/compaction depends on compaction
2821 * being called after reclaim so call directly if necessary
2822 */
2823 page = __alloc_pages_direct_compact(gfp_mask, order,
2824 alloc_flags, ac, migration_mode,
2825 &contended_compaction,
2826 &deferred_compaction);
2827 if (page)
2828 goto got_pg;
2829 } 2755 }
2830 2756
2757 /* Reclaim has failed us, start killing things */
2758 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
2759 if (page)
2760 goto got_pg;
2761
2762 /* Retry as long as the OOM killer is making progress */
2763 if (did_some_progress)
2764 goto retry;
2765
2766noretry:
2767 /*
2768 * High-order allocations do not necessarily loop after
2769 * direct reclaim and reclaim/compaction depends on compaction
2770 * being called after reclaim so call directly if necessary
2771 */
2772 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
2773 ac, migration_mode,
2774 &contended_compaction,
2775 &deferred_compaction);
2776 if (page)
2777 goto got_pg;
2831nopage: 2778nopage:
2832 warn_alloc_failed(gfp_mask, order, NULL); 2779 warn_alloc_failed(gfp_mask, order, NULL);
2833got_pg: 2780got_pg:
@@ -4867,22 +4814,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4867 unsigned long *zones_size, 4814 unsigned long *zones_size,
4868 unsigned long *zholes_size) 4815 unsigned long *zholes_size)
4869{ 4816{
4870 unsigned long realtotalpages, totalpages = 0; 4817 unsigned long realtotalpages = 0, totalpages = 0;
4871 enum zone_type i; 4818 enum zone_type i;
4872 4819
4873 for (i = 0; i < MAX_NR_ZONES; i++) 4820 for (i = 0; i < MAX_NR_ZONES; i++) {
4874 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4821 struct zone *zone = pgdat->node_zones + i;
4875 node_start_pfn, 4822 unsigned long size, real_size;
4876 node_end_pfn, 4823
4877 zones_size); 4824 size = zone_spanned_pages_in_node(pgdat->node_id, i,
4878 pgdat->node_spanned_pages = totalpages; 4825 node_start_pfn,
4879 4826 node_end_pfn,
4880 realtotalpages = totalpages; 4827 zones_size);
4881 for (i = 0; i < MAX_NR_ZONES; i++) 4828 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
4882 realtotalpages -=
4883 zone_absent_pages_in_node(pgdat->node_id, i,
4884 node_start_pfn, node_end_pfn, 4829 node_start_pfn, node_end_pfn,
4885 zholes_size); 4830 zholes_size);
4831 zone->spanned_pages = size;
4832 zone->present_pages = real_size;
4833
4834 totalpages += size;
4835 realtotalpages += real_size;
4836 }
4837
4838 pgdat->node_spanned_pages = totalpages;
4886 pgdat->node_present_pages = realtotalpages; 4839 pgdat->node_present_pages = realtotalpages;
4887 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4840 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4888 realtotalpages); 4841 realtotalpages);
@@ -4992,8 +4945,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4992 * NOTE: pgdat should get zeroed by caller. 4945 * NOTE: pgdat should get zeroed by caller.
4993 */ 4946 */
4994static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4947static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4995 unsigned long node_start_pfn, unsigned long node_end_pfn, 4948 unsigned long node_start_pfn, unsigned long node_end_pfn)
4996 unsigned long *zones_size, unsigned long *zholes_size)
4997{ 4949{
4998 enum zone_type j; 4950 enum zone_type j;
4999 int nid = pgdat->node_id; 4951 int nid = pgdat->node_id;
@@ -5014,12 +4966,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
5014 struct zone *zone = pgdat->node_zones + j; 4966 struct zone *zone = pgdat->node_zones + j;
5015 unsigned long size, realsize, freesize, memmap_pages; 4967 unsigned long size, realsize, freesize, memmap_pages;
5016 4968
5017 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4969 size = zone->spanned_pages;
5018 node_end_pfn, zones_size); 4970 realsize = freesize = zone->present_pages;
5019 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
5020 node_start_pfn,
5021 node_end_pfn,
5022 zholes_size);
5023 4971
5024 /* 4972 /*
5025 * Adjust freesize so that it accounts for how much memory 4973 * Adjust freesize so that it accounts for how much memory
@@ -5054,8 +5002,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
5054 nr_kernel_pages -= memmap_pages; 5002 nr_kernel_pages -= memmap_pages;
5055 nr_all_pages += freesize; 5003 nr_all_pages += freesize;
5056 5004
5057 zone->spanned_pages = size;
5058 zone->present_pages = realsize;
5059 /* 5005 /*
5060 * Set an approximate value for lowmem here, it will be adjusted 5006 * Set an approximate value for lowmem here, it will be adjusted
5061 * when the bootmem allocator frees pages into the buddy system. 5007 * when the bootmem allocator frees pages into the buddy system.
@@ -5161,8 +5107,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5161 (unsigned long)pgdat->node_mem_map); 5107 (unsigned long)pgdat->node_mem_map);
5162#endif 5108#endif
5163 5109
5164 free_area_init_core(pgdat, start_pfn, end_pfn, 5110 free_area_init_core(pgdat, start_pfn, end_pfn);
5165 zones_size, zholes_size);
5166} 5111}
5167 5112
5168#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5113#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6111,9 +6056,9 @@ out:
6111 return ret; 6056 return ret;
6112} 6057}
6113 6058
6059#ifdef CONFIG_NUMA
6114int hashdist = HASHDIST_DEFAULT; 6060int hashdist = HASHDIST_DEFAULT;
6115 6061
6116#ifdef CONFIG_NUMA
6117static int __init set_hashdist(char *str) 6062static int __init set_hashdist(char *str)
6118{ 6063{
6119 if (!str) 6064 if (!str)
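
Note on the mm/page_alloc.c hunks above: prep_zero_page() is folded into its only caller as a clear_highpage() loop, calculate_node_totalpages() now stores each zone's spanned/present counts in the zone itself so free_area_init_core() no longer recomputes them, and -- the main change -- should_alloc_retry() is replaced by explicit control flow in the slow path: __GFP_NORETRY jumps straight to the compaction-then-give-up tail, reclaim is retried while it makes progress (for small orders, or with __GFP_REPEAT until enough pages were reclaimed), only then is the OOM killer tried, and that in turn is retried while it reports progress. The retry skeleton is the part worth seeing in one place; the sketch below keeps only that control flow, with every helper stubbed out and the GFP bits reduced to two booleans.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Stubs: pretend reclaim and the OOM killer find nothing and free nothing. */
static bool direct_reclaim(unsigned long *progress) { *progress = 0; return false; }
static bool oom_kill(unsigned long *progress)       { *progress = 0; return false; }
static bool try_compaction(void)                    { return false; }

/* Returns true if a "page" was obtained; mirrors only the new retry ordering. */
static bool slowpath(unsigned int order, bool noretry, bool repeat)
{
        unsigned long pages_reclaimed = 0, did_some_progress;

retry:
        if (direct_reclaim(&did_some_progress))
                return true;

        if (noretry)
                goto noretry_out;                 /* __GFP_NORETRY: do not loop */

        /* Keep reclaiming as long as there is reasonable progress. */
        pages_reclaimed += did_some_progress;
        if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
            (repeat && pages_reclaimed < (1UL << order)))
                goto retry;

        /* Reclaim has failed us, start killing things. */
        if (oom_kill(&did_some_progress))
                return true;
        if (did_some_progress)
                goto retry;                       /* OOM killer freed something */

noretry_out:
        return try_compaction();                  /* last-ditch high-order attempt */
}

int main(void)
{
        printf("order-2 allocation succeeded: %d\n", slowpath(2, false, false));
        return 0;
}
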
diff --git a/mm/percpu.c b/mm/percpu.c
index dfd02484e8de..2dd74487a0af 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1030,7 +1030,7 @@ area_found:
1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1031 1031
1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1033 kmemleak_alloc_percpu(ptr, size); 1033 kmemleak_alloc_percpu(ptr, size, gfp);
1034 return ptr; 1034 return ptr;
1035 1035
1036fail_unlock: 1036fail_unlock:
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c25f94b33811..6b674e00153c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
119} 119}
120#endif 120#endif
121 121
122#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH 122#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
123#ifdef CONFIG_TRANSPARENT_HUGEPAGE 123#ifdef CONFIG_TRANSPARENT_HUGEPAGE
124pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, 124pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
125 pmd_t *pmdp) 125 pmd_t *pmdp)
126{ 126{
127 pmd_t pmd; 127 pmd_t pmd;
128 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 128 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
129 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); 129 VM_BUG_ON(!pmd_trans_huge(*pmdp));
130 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
130 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 131 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
131 return pmd; 132 return pmd;
132} 133}
@@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
198} 199}
199#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 200#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
200#endif 201#endif
202
203#ifndef pmdp_collapse_flush
204#ifdef CONFIG_TRANSPARENT_HUGEPAGE
205pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
206 pmd_t *pmdp)
207{
208 /*
209 * pmd and hugepage pte format are same. So we could
210 * use the same function.
211 */
212 pmd_t pmd;
213
214 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
215 VM_BUG_ON(pmd_trans_huge(*pmdp));
216 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
217 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
218 return pmd;
219}
220#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
221#endif
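
Note on the mm/pgtable-generic.c hunks above (and the matching caller rename in mm/migrate.c): pmdp_clear_flush() becomes pmdp_huge_clear_flush() and now asserts it really is handed a huge PMD, while the new generic pmdp_collapse_flush() handles the khugepaged case of clearing a PMD that still points at a page-table page. Both follow the same fetch-and-clear-then-flush discipline: atomically take the entry out, flush the TLB range, and hand the old value back to the caller. That ordering is the transferable idea, modeled below with a C11 atomic standing in for the page-table entry and a printf standing in for the flush.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef _Atomic uint64_t pmd_t;

static void flush_tlb_range(uint64_t start, uint64_t end)
{
        printf("flush TLB for [%#lx, %#lx)\n",
               (unsigned long)start, (unsigned long)end);
}

/*
 * Clear the entry first so later walks see it empty, then flush so no CPU
 * keeps using a stale cached translation, then return the old value to
 * the caller that still needs its contents.
 */
static uint64_t pmdp_huge_clear_flush(pmd_t *pmdp, uint64_t addr, uint64_t size)
{
        uint64_t old = atomic_exchange(pmdp, 0);

        flush_tlb_range(addr, addr + size);
        return old;
}

int main(void)
{
        pmd_t entry = 0xdeadbeef000;
        uint64_t old = pmdp_huge_clear_flush(&entry, 0x200000, 0x200000);

        printf("old entry %#lx, new entry %#lx\n",
               (unsigned long)old, (unsigned long)atomic_load(&entry));
        return 0;
}
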
diff --git a/mm/rmap.c b/mm/rmap.c
index 24dd3f9fee27..7af1ecb21ccb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
625 625
626 pmd = pmd_offset(pud, address); 626 pmd = pmd_offset(pud, address);
627 /* 627 /*
628 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() 628 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
629 * without holding anon_vma lock for write. So when looking for a 629 * without holding anon_vma lock for write. So when looking for a
630 * genuine pmde (in which to find pte), test present and !THP together. 630 * genuine pmde (in which to find pte), test present and !THP together.
631 */ 631 */
@@ -950,7 +950,12 @@ void page_move_anon_rmap(struct page *page,
950 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); 950 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
951 951
952 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 952 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
953 page->mapping = (struct address_space *) anon_vma; 953 /*
954 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
955 * simultaneously, so a concurrent reader (eg page_referenced()'s
956 * PageAnon()) will not see one without the other.
957 */
958 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
954} 959}
955 960
956/** 961/**
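
Note on the mm/rmap.c hunks above: page_move_anon_rmap() now publishes the new anon_vma pointer with WRITE_ONCE(), so the pointer and its PAGE_MAPPING_ANON tag bit reach memory as one store that the compiler cannot split, keeping lockless readers such as page_referenced()'s PageAnon() check from seeing one without the other. A stripped-down model of that tagged-pointer publish follows; the WRITE_ONCE/READ_ONCE definitions are simplified userspace equivalents, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define PAGE_MAPPING_ANON 0x1UL

/* Simplified: force a single access through a volatile lvalue. */
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

struct anon_vma { int dummy; };
struct page { uintptr_t mapping; };

static void page_move_anon_rmap(struct page *page, struct anon_vma *av)
{
        /* Pointer and tag bit are combined first, then stored once. */
        WRITE_ONCE(page->mapping, (uintptr_t)av | PAGE_MAPPING_ANON);
}

static int PageAnon(struct page *page)
{
        return (READ_ONCE(page->mapping) & PAGE_MAPPING_ANON) != 0;
}

int main(void)
{
        static struct anon_vma av;
        struct page page = { .mapping = 0 };

        page_move_anon_rmap(&page, &av);
        printf("PageAnon = %d, anon_vma = %p\n", PageAnon(&page),
               (void *)(READ_ONCE(page.mapping) & ~PAGE_MAPPING_ANON));
        return 0;
}
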
diff --git a/mm/shmem.c b/mm/shmem.c
index 3759099d8ce4..4caf8ed24d65 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
569 i_size_write(inode, newsize); 569 i_size_write(inode, newsize);
570 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
571 } 571 }
572 if (newsize < oldsize) { 572 if (newsize <= oldsize) {
573 loff_t holebegin = round_up(newsize, PAGE_SIZE); 573 loff_t holebegin = round_up(newsize, PAGE_SIZE);
574 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 574 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
575 shmem_truncate_range(inode, newsize, (loff_t)-1); 575 shmem_truncate_range(inode, newsize, (loff_t)-1);
diff --git a/mm/slab.c b/mm/slab.c
index 7eb38dd1cefa..200e22412a16 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void)
1454 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", 1454 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
1455 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1455 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1456 slab_state = PARTIAL_NODE; 1456 slab_state = PARTIAL_NODE;
1457 setup_kmalloc_cache_index_table();
1457 1458
1458 slab_early_init = 0; 1459 slab_early_init = 0;
1459 1460
diff --git a/mm/slab.h b/mm/slab.h
index 4c3ac12dd644..8da63e4e470f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags,
71 71
72#ifndef CONFIG_SLOB 72#ifndef CONFIG_SLOB
73/* Kmalloc array related functions */ 73/* Kmalloc array related functions */
74void setup_kmalloc_cache_index_table(void);
74void create_kmalloc_caches(unsigned long); 75void create_kmalloc_caches(unsigned long);
75 76
76/* Find the kmalloc slab corresponding for a certain size */ 77/* Find the kmalloc slab corresponding for a certain size */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 999bb3424d44..9f8d71f78404 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -784,25 +784,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
784} 784}
785 785
786/* 786/*
787 * Create the kmalloc array. Some of the regular kmalloc arrays 787 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
788 * may already have been created because they were needed to 788 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
789 * enable allocations for slab creation. 789 * kmalloc-67108864.
790 */ 790 */
791void __init create_kmalloc_caches(unsigned long flags) 791static struct {
792 const char *name;
793 unsigned long size;
794} const kmalloc_info[] __initconst = {
795 {NULL, 0}, {"kmalloc-96", 96},
796 {"kmalloc-192", 192}, {"kmalloc-8", 8},
797 {"kmalloc-16", 16}, {"kmalloc-32", 32},
798 {"kmalloc-64", 64}, {"kmalloc-128", 128},
799 {"kmalloc-256", 256}, {"kmalloc-512", 512},
800 {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
801 {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
802 {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
803 {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
804 {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
805 {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
806 {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
807 {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
808 {"kmalloc-67108864", 67108864}
809};
810
811/*
812 * Patch up the size_index table if we have strange large alignment
813 * requirements for the kmalloc array. This is only the case for
814 * MIPS it seems. The standard arches will not generate any code here.
815 *
816 * Largest permitted alignment is 256 bytes due to the way we
817 * handle the index determination for the smaller caches.
818 *
819 * Make sure that nothing crazy happens if someone starts tinkering
820 * around with ARCH_KMALLOC_MINALIGN
821 */
822void __init setup_kmalloc_cache_index_table(void)
792{ 823{
793 int i; 824 int i;
794 825
795 /*
796 * Patch up the size_index table if we have strange large alignment
797 * requirements for the kmalloc array. This is only the case for
798 * MIPS it seems. The standard arches will not generate any code here.
799 *
800 * Largest permitted alignment is 256 bytes due to the way we
801 * handle the index determination for the smaller caches.
802 *
803 * Make sure that nothing crazy happens if someone starts tinkering
804 * around with ARCH_KMALLOC_MINALIGN
805 */
806 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 826 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
807 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 827 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
808 828
@@ -833,39 +853,41 @@ void __init create_kmalloc_caches(unsigned long flags)
833 for (i = 128 + 8; i <= 192; i += 8) 853 for (i = 128 + 8; i <= 192; i += 8)
834 size_index[size_index_elem(i)] = 8; 854 size_index[size_index_elem(i)] = 8;
835 } 855 }
836 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 856}
857
858/*
859 * Create the kmalloc array. Some of the regular kmalloc arrays
860 * may already have been created because they were needed to
861 * enable allocations for slab creation.
862 */
863void __init create_kmalloc_caches(unsigned long flags)
864{
865 int i;
866
867 for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
837 if (!kmalloc_caches[i]) { 868 if (!kmalloc_caches[i]) {
838 kmalloc_caches[i] = create_kmalloc_cache(NULL, 869 kmalloc_caches[i] = create_kmalloc_cache(
839 1 << i, flags); 870 kmalloc_info[i].name,
871 kmalloc_info[i].size,
872 flags);
840 } 873 }
841 874
842 /* 875 /*
843 * Caches that are not of the two-to-the-power-of size. 876 * "i == 2" is the "kmalloc-192" case which is the last special
844 * These have to be created immediately after the 877 * case for initialization and it's the point to jump to
845 * earlier power of two caches 878 * allocate the minimize size of the object. In slab allocator,
879 * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
880 * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
881 * defined, it may be larger than 2^5 and here is also the
882 * trick to skip the empty gap.
846 */ 883 */
847 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) 884 if (i == 2)
848 kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); 885 i = (KMALLOC_SHIFT_LOW - 1);
849
850 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
851 kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
852 } 886 }
853 887
854 /* Kmalloc array is now usable */ 888 /* Kmalloc array is now usable */
855 slab_state = UP; 889 slab_state = UP;
856 890
857 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
858 struct kmem_cache *s = kmalloc_caches[i];
859 char *n;
860
861 if (s) {
862 n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
863
864 BUG_ON(!n);
865 s->name = n;
866 }
867 }
868
869#ifdef CONFIG_ZONE_DMA 891#ifdef CONFIG_ZONE_DMA
870 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 892 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
871 struct kmem_cache *s = kmalloc_caches[i]; 893 struct kmem_cache *s = kmalloc_caches[i];
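
Note on the mm/slab_common.c hunks above (with the matching calls added in mm/slab.c, mm/slab.h and mm/slub.c): the size_index fix-ups move out of create_kmalloc_caches() into the new setup_kmalloc_cache_index_table(), now called explicitly by both SLAB and SLUB during kmem_cache_init(), and the kmalloc caches are created from the static kmalloc_info[] name/size table instead of being created nameless and renamed with kasprintf() afterwards -- which, per the patch's own comment, lets boot options like slub_debug=,kmalloc-xx match from the start. The table-driven idea reduces to the sketch below; cache creation is stubbed out and the table is abbreviated.

#include <stdio.h>

struct kmalloc_info_struct {
        const char *name;
        unsigned long size;
};

/* Abbreviated stand-in for the kernel's kmalloc_info[] boot table. */
static const struct kmalloc_info_struct kmalloc_info[] = {
        {NULL, 0},          {"kmalloc-96", 96},   {"kmalloc-192", 192},
        {"kmalloc-8", 8},   {"kmalloc-16", 16},   {"kmalloc-32", 32},
        {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256},
};

static void create_kmalloc_cache(const char *name, unsigned long size)
{
        printf("creating %s (%lu bytes)\n", name, size);
}

int main(void)
{
        /*
         * Index i in the real array is the kmalloc_index() slot, so each
         * cache gets its final, human-readable name at creation time
         * instead of being renamed later.
         */
        for (unsigned int i = 1; i < sizeof(kmalloc_info) / sizeof(kmalloc_info[0]); i++)
                create_kmalloc_cache(kmalloc_info[i].name, kmalloc_info[i].size);
        return 0;
}
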
diff --git a/mm/slub.c b/mm/slub.c
index 54c0876b43d5..816df0016555 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3700,6 +3700,7 @@ void __init kmem_cache_init(void)
3700 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 3700 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3701 3701
3702 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3702 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3703 setup_kmalloc_cache_index_table();
3703 create_kmalloc_caches(0); 3704 create_kmalloc_caches(0);
3704 3705
3705#ifdef CONFIG_SMP 3706#ifdef CONFIG_SMP
diff --git a/mm/swap.c b/mm/swap.c
index a7251a8ed532..a3a0a2f1f7c3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -131,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
131 * here, see the comment above this function. 131 * here, see the comment above this function.
132 */ 132 */
133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head); 133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
134 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
135 if (put_page_testzero(page_head)) { 134 if (put_page_testzero(page_head)) {
136 /* 135 /*
137 * If this is the tail of a slab THP page, 136 * If this is the tail of a slab THP page,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac..19ef01e90ac4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2646,7 +2646,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2646 2646
2647 for (i = 0; i <= ZONE_NORMAL; i++) { 2647 for (i = 0; i <= ZONE_NORMAL; i++) {
2648 zone = &pgdat->node_zones[i]; 2648 zone = &pgdat->node_zones[i];
2649 if (!populated_zone(zone)) 2649 if (!populated_zone(zone) ||
2650 zone_reclaimable_pages(zone) == 0)
2650 continue; 2651 continue;
2651 2652
2652 pfmemalloc_reserve += min_wmark_pages(zone); 2653 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -3596,7 +3597,7 @@ int zone_reclaim_mode __read_mostly;
3596#define RECLAIM_OFF 0 3597#define RECLAIM_OFF 0
3597#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3598#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
3598#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3599#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
3599#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3600#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
3600 3601
3601/* 3602/*
3602 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3603 * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -3638,12 +3639,12 @@ static long zone_pagecache_reclaimable(struct zone *zone)
3638 long delta = 0; 3639 long delta = 0;
3639 3640
3640 /* 3641 /*
3641 * If RECLAIM_SWAP is set, then all file pages are considered 3642 * If RECLAIM_UNMAP is set, then all file pages are considered
3642 * potentially reclaimable. Otherwise, we have to worry about 3643 * potentially reclaimable. Otherwise, we have to worry about
3643 * pages like swapcache and zone_unmapped_file_pages() provides 3644 * pages like swapcache and zone_unmapped_file_pages() provides
3644 * a better estimate 3645 * a better estimate
3645 */ 3646 */
3646 if (zone_reclaim_mode & RECLAIM_SWAP) 3647 if (zone_reclaim_mode & RECLAIM_UNMAP)
3647 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3648 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3648 else 3649 else
3649 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3650 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@ -3674,15 +3675,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3674 .order = order, 3675 .order = order,
3675 .priority = ZONE_RECLAIM_PRIORITY, 3676 .priority = ZONE_RECLAIM_PRIORITY,
3676 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3677 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3677 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3678 .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
3678 .may_swap = 1, 3679 .may_swap = 1,
3679 }; 3680 };
3680 3681
3681 cond_resched(); 3682 cond_resched();
3682 /* 3683 /*
3683 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3684 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
3684 * and we also need to be able to write out pages for RECLAIM_WRITE 3685 * and we also need to be able to write out pages for RECLAIM_WRITE
3685 * and RECLAIM_SWAP. 3686 * and RECLAIM_UNMAP.
3686 */ 3687 */
3687 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3688 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3688 lockdep_set_current_reclaim_state(gfp_mask); 3689 lockdep_set_current_reclaim_state(gfp_mask);
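
Note on the mm/vmscan.c hunks above: the zone-reclaim bit RECLAIM_SWAP is renamed RECLAIM_UNMAP, which better describes what it actually enables (unmapping pages during zone reclaim via the scan control's may_unmap, not swapping as such), and pfmemalloc_watermark_ok() now skips zones with no reclaimable pages so the reserve check is not skewed by zones that cannot be helped. As a reminder of how such a mode bitmask is consumed, a small sketch follows; the flag values mirror the ones above and the scan-control structure is reduced to two booleans.

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_ZONE  (1 << 0)   /* run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1 << 1)   /* write out pages during reclaim */
#define RECLAIM_UNMAP (1 << 2)   /* unmap pages during reclaim */

struct scan_control {
        bool may_writepage;
        bool may_unmap;
};

static struct scan_control make_scan_control(int zone_reclaim_mode)
{
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap     = !!(zone_reclaim_mode & RECLAIM_UNMAP),
        };
        return sc;
}

int main(void)
{
        struct scan_control sc = make_scan_control(RECLAIM_ZONE | RECLAIM_UNMAP);

        printf("may_writepage=%d may_unmap=%d\n", sc.may_writepage, sc.may_unmap);
        return 0;
}
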