153 files changed, 2312 insertions, 1419 deletions
diff --git a/Documentation/cma/debugfs.txt b/Documentation/cma/debugfs.txt
new file mode 100644
index 000000000000..6cef20a8cedc
--- /dev/null
+++ b/Documentation/cma/debugfs.txt
@@ -0,0 +1,21 @@
+The CMA debugfs interface is useful to retrieve basic information out of the
+different CMA areas and to test allocation/release in each of the areas.
+
+Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
+kernel's CMA index. So the first CMA zone would be:
+
+	<debugfs>/cma/cma-0
+
+The structure of the files created under that directory is as follows:
+
+ - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
+ - [RO] count: Amount of memory in the CMA area.
+ - [RO] order_per_bit: Order of pages represented by one bit.
+ - [RO] bitmap: The bitmap of page states in the zone.
+ - [WO] alloc: Allocate N pages from that CMA area. For example:
+
+	echo 5 > <debugfs>/cma/cma-2/alloc
+
+would try to allocate 5 pages from the cma-2 area.
+
+ - [WO] free: Free N pages from that CMA area, similar to the above.
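A quick way to exercise the interface documented above (illustrative only, not part of the patch, and assuming debugfs is mounted at the conventional /sys/kernel/debug):

    cat /sys/kernel/debug/cma/cma-0/count        # size of the first CMA area
    cat /sys/kernel/debug/cma/cma-0/base_pfn     # its base page frame number
    echo 5 > /sys/kernel/debug/cma/cma-0/alloc   # try to allocate 5 pages from it
    echo 5 > /sys/kernel/debug/cma/cma-0/free    # release those 5 pages again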
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 05c36118f8d7..327556349757 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1989,7 +1989,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 seconds. Use this parameter to check at some
 other rate. 0 disables periodic checking.

-memtest= [KNL,X86] Enable memtest
+memtest= [KNL,X86,ARM] Enable memtest
 Format: <integer>
 default : 0 <disable>
 Specifies the number of memtest passes to be
@@ -2236,8 +2236,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

 nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
 Format: [panic,][nopanic,][num]
-Valid num: 0
+Valid num: 0 or 1
 0 - turn nmi_watchdog off
+1 - turn nmi_watchdog on
 When panic is specified, panic when an NMI watchdog
 timeout occurs (or 'nopanic' to override the opposite
 default).
@@ -2322,6 +2323,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 register save and restore. The kernel will only save
 legacy floating-point registers on task switch.

+nohugeiomap [KNL,x86] Disable kernel huge I/O mappings.
+
 noxsave [BUGS=X86] Disables x86 extended register state save
 and restore using xsave. The kernel will fallback to
 enabling legacy floating-point and sse state.
@@ -2464,7 +2467,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

 nousb [USB] Disable the USB subsystem

-nowatchdog [KNL] Disable the lockup detector (NMI watchdog).
+nowatchdog [KNL] Disable both lockup detectors, i.e.
+soft-lockup and NMI watchdog (hard-lockup).

 nowb [ARM]

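For illustration only (a sketch, not text from the patch), a boot command line combining the parameters touched above might be:

    memtest=4 nmi_watchdog=panic,1 nohugeiomap

i.e. four memtest passes at boot, the NMI watchdog enabled and set to panic on a hard-lockup timeout, and kernel huge I/O mappings disabled.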
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 83ab25660fc9..99d7eb3a1416 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -77,12 +77,14 @@ show up in /proc/sys/kernel:
 - shmmax [ sysv ipc ]
 - shmmni
 - softlockup_all_cpu_backtrace
+- soft_watchdog
 - stop-a [ SPARC only ]
 - sysrq ==> Documentation/sysrq.txt
 - sysctl_writes_strict
 - tainted
 - threads-max
 - unknown_nmi_panic
+- watchdog
 - watchdog_thresh
 - version

@@ -417,16 +419,23 @@ successful IPC object allocation.

 nmi_watchdog:

-Enables/Disables the NMI watchdog on x86 systems. When the value is
-non-zero the NMI watchdog is enabled and will continuously test all
-online cpus to determine whether or not they are still functioning
-properly. Currently, passing "nmi_watchdog=" parameter at boot time is
-required for this function to work.
+This parameter can be used to control the NMI watchdog
+(i.e. the hard lockup detector) on x86 systems.

-If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel
-parameter), the NMI watchdog shares registers with oprofile. By
-disabling the NMI watchdog, oprofile may have more registers to
-utilize.
+0 - disable the hard lockup detector
+1 - enable the hard lockup detector
+
+The hard lockup detector monitors each CPU for its ability to respond to
+timer interrupts. The mechanism utilizes CPU performance counter registers
+that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
+while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
+
+The NMI watchdog is disabled by default if the kernel is running as a guest
+in a KVM virtual machine. This default can be overridden by adding
+
+nmi_watchdog=1
+
+to the guest kernel command line (see Documentation/kernel-parameters.txt).

 ==============================================================

@@ -816,6 +825,22 @@ NMI.

 ==============================================================

+soft_watchdog
+
+This parameter can be used to control the soft lockup detector.
+
+0 - disable the soft lockup detector
+1 - enable the soft lockup detector
+
+The soft lockup detector monitors CPUs for threads that are hogging the CPUs
+without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
+from running. The mechanism depends on the CPUs ability to respond to timer
+interrupts which are needed for the 'watchdog/N' threads to be woken up by
+the watchdog timer function, otherwise the NMI watchdog - if enabled - can
+detect a hard lockup condition.
+
+==============================================================
+
 tainted:

 Non-zero if the kernel has been tainted. Numeric values, which
@@ -858,6 +883,25 @@ example. If a system hangs up, try pressing the NMI switch.

 ==============================================================

+watchdog:
+
+This parameter can be used to disable or enable the soft lockup detector
+_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
+
+0 - disable both lockup detectors
+1 - enable both lockup detectors
+
+The soft lockup detector and the NMI watchdog can also be disabled or
+enabled individually, using the soft_watchdog and nmi_watchdog parameters.
+If the watchdog parameter is read, for example by executing
+
+cat /proc/sys/kernel/watchdog
+
+the output of this command (0 or 1) shows the logical OR of soft_watchdog
+and nmi_watchdog.
+
+==============================================================
+
 watchdog_thresh:

 This value can be used to control the frequency of hrtimer and NMI
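A minimal sketch of driving these knobs at runtime through the usual sysctl files (illustrative only, not part of the patch):

    cat /proc/sys/kernel/watchdog              # 1 if either lockup detector is enabled
    echo 0 > /proc/sys/kernel/soft_watchdog    # disable only the soft lockup detector
    echo 0 > /proc/sys/kernel/nmi_watchdog     # disable only the hard lockup detector
    echo 1 > /proc/sys/kernel/watchdog         # re-enable both at once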
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt
index 01d76282444e..e4b49df7a048 100644
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.txt
@@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW
 A cleancache "backend" that provides transcendent memory registers itself
 to the kernel's cleancache "frontend" by calling cleancache_register_ops,
 passing a pointer to a cleancache_ops structure with funcs set appropriately.
-Note that cleancache_register_ops returns the previous settings so that
-chaining can be performed if desired. The functions provided must conform to
-certain semantics as follows:
+The functions provided must conform to certain semantics as follows:

 Most important, cleancache is "ephemeral". Pages which are copied into
 cleancache have an indefinite lifetime which is completely unknowable
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 744f82f86c58..86cb4624fc5a 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -317,7 +317,7 @@ If the VMA passes some filtering as described in "Filtering Special Vmas"
 below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
 off a subset of the VMA if the range does not cover the entire VMA. Once the
 VMA has been merged or split or neither, mlock_fixup() will call
-__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to
+populate_vma_page_range() to fault in the pages via get_user_pages() and to
 mark the pages as mlocked via mlock_vma_page().

 Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
@@ -327,7 +327,7 @@ fault path or in vmscan.

 Also note that a page returned by get_user_pages() could be truncated or
 migrated out from under us, while we're trying to mlock it. To detect this,
-__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock.
+populate_vma_page_range() checks page_mapping() after acquiring the page lock.
 If the page is still associated with its mapping, we'll go ahead and call
 mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
 In the worst case, this will result in a page mapped in a VM_LOCKED VMA
@@ -392,7 +392,7 @@ ignored for munlock.

 If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
 specified range. The range is then munlocked via the function
-__mlock_vma_pages_range() - the same function used to mlock a VMA range -
+populate_vma_page_range() - the same function used to mlock a VMA range -
 passing a flag to indicate that munlock() is being performed.

 Because the VMA access protections could have been changed to PROT_NONE after
@@ -402,7 +402,7 @@ get_user_pages() was enhanced to accept a flag to ignore the permissions when
 fetching the pages - all of which should be resident as a result of previous
 mlocking.

-For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
+For munlock(), populate_vma_page_range() unlocks individual pages by calling
 munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
 flag using TestClearPageMlocked(). As with mlock_vma_page(),
 munlock_vma_page() use the Test*PageMlocked() function to handle the case where
@@ -463,21 +463,11 @@ populate the page table.

 To mlock a range of memory under the unevictable/mlock infrastructure, the
 mmap() handler and task address space expansion functions call
-mlock_vma_pages_range() specifying the vma and the address range to mlock.
-mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in
-"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have
-already been set by the caller, in filtered VMAs. Thus these VMA's need not be
-visited for munlock when the region is unmapped.
-
-For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
-fault/allocate the pages and mlock them. Again, like mlock_fixup(),
-mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
-attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
-back to write mode before returning.
-
-The callers of mlock_vma_pages_range() will have already added the memory range
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+The callers of populate_vma_page_range() will have already added the memory range
 to be mlocked to the task's "locked_vm". To account for filtered VMAs,
-mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the
+populate_vma_page_range() returns the number of pages NOT mlocked. All of the
 callers then subtract a non-negative return value from the task's locked_vm. A
 negative return value represent an error - for example, from get_user_pages()
 attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a458d5..e1068987bad1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 bool

+config HAVE_ARCH_HUGE_VMAP
+bool
+
 config HAVE_ARCH_SOFT_DIRTY
 bool

@@ -484,6 +487,18 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
 This spares a stack switch and improves cache usage on softirq
 processing.

+config PGTABLE_LEVELS
+int
+default 2
+
+config ARCH_HAS_ELF_RANDOMIZE
+bool
+help
+An architecture supports choosing randomized locations for
+stack, mmap, brk, and ET_DYN. Defined functions:
+- arch_mmap_rnd()
+- arch_randomize_brk()
+
 #
 # ABI hall of shame
 #
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b7ff9a318c31..bf9e9d3b3792 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -76,6 +76,10 @@ config GENERIC_ISA_DMA
 bool
 default y

+config PGTABLE_LEVELS
+int
+default 3
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index cf4c0c99aa25..4b62f4caf0ce 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1,8 +1,8 @@
 config ARM
 bool
 default y
-select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+select ARCH_HAS_ELF_RANDOMIZE
 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 select ARCH_HAVE_CUSTOM_GPIO_H
 select ARCH_HAS_GCOV_PROFILE_ALL
@@ -286,6 +286,11 @@ config GENERIC_BUG
 def_bool y
 depends on BUG

+config PGTABLE_LEVELS
+int
+default 3 if ARM_LPAE
+default 2
+
 source "init/Kconfig"

 source "kernel/Kconfig.freezer"
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index afb9cafd3786..c1ff8ab12914 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
 extern void elf_set_personality(const struct elf32_hdr *);
 #define SET_PERSONALITY(ex) elf_set_personality(&(ex))

-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 #ifdef CONFIG_MMU
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 struct linux_binprm;
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 1609b022a72f..3d0e9aed4b40 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -335,6 +335,9 @@ void __init bootmem_init(void)

 find_limits(&min, &max_low, &max_high);

+early_memtest((phys_addr_t)min << PAGE_SHIFT,
+(phys_addr_t)max_low << PAGE_SHIFT);
+
 /*
 * Sparsemem tries to allocate bootmem in memory_present(),
 * so must be done after the fixed reservations
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 5e85ed371364..407dc786583a 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -169,14 +169,22 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 return addr;
 }

+unsigned long arch_mmap_rnd(void)
+{
+unsigned long rnd;
+
+/* 8 bits of randomness in 20 address space bits */
+rnd = (unsigned long)get_random_int() % (1 << 8);
+
+return rnd << PAGE_SHIFT;
+}
+
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
 unsigned long random_factor = 0UL;

-/* 8 bits of randomness in 20 address space bits */
-if ((current->flags & PF_RANDOMIZE) &&
-!(current->personality & ADDR_NO_RANDOMIZE))
-random_factor = (get_random_int() % (1 << 8)) << PAGE_SHIFT;
+if (current->flags & PF_RANDOMIZE)
+random_factor = arch_mmap_rnd();

 if (mmap_is_legacy()) {
 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
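For scale (a back-of-the-envelope note, not part of the patch): with 4 KB pages the helper above produces rnd values of 0..255, so rnd << PAGE_SHIFT offsets the mmap base by up to 255 * 4 KB, just under 1 MB, which is what the "8 bits of randomness in 20 address space bits" comment refers to.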
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1b8e97331ffb..34f487d5d84e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,7 +1,7 @@
 config ARM64
 def_bool y
-select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+select ARCH_HAS_ELF_RANDOMIZE
 select ARCH_HAS_GCOV_PROFILE_ALL
 select ARCH_HAS_SG_CHAIN
 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
@@ -143,6 +143,13 @@ config KERNEL_MODE_NEON
 config FIX_EARLYCON_MEM
 def_bool y

+config PGTABLE_LEVELS
+int
+default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
+default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
+default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
+default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
+
 source "init/Kconfig"

 source "kernel/Kconfig.freezer"
@@ -413,13 +420,6 @@ config ARM64_VA_BITS
 default 42 if ARM64_VA_BITS_42
 default 48 if ARM64_VA_BITS_48

-config ARM64_PGTABLE_LEVELS
-int
-default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
-default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
-default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
-default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
-
 config CPU_BIG_ENDIAN
 bool "Build big-endian kernel"
 help
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 1f65be393139..faad6df49e5b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -125,7 +125,6 @@ typedef struct user_fpsimd_state elf_fpregset_t;
 * the loader. We need to make sure that it is out of the way of the program
 * that it will "exec", and that there is sufficient room for the brk.
 */
-extern unsigned long randomize_et_dyn(unsigned long base);
 #define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)

 /*
@@ -157,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12))
 #endif

-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 #ifdef CONFIG_COMPAT

 #ifdef __AARCH64EB__
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index bbfb600fa822..36250705dc4c 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -163,12 +163,12 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 /*
 * If we are concatenating first level stage-2 page tables, we would have less
 * than or equal to 16 pointers in the fake PGD, because that's what the
-* architecture allows. In this case, (4 - CONFIG_ARM64_PGTABLE_LEVELS)
+* architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
 * represents the first level for the host, and we add 1 to go to the next
 * level (which uses contatenation) for the stage-2 tables.
 */
 #if PTRS_PER_S2_PGD <= 16
-#define KVM_PREALLOC_LEVEL (4 - CONFIG_ARM64_PGTABLE_LEVELS + 1)
+#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1)
 #else
 #define KVM_PREALLOC_LEVEL (0)
 #endif
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 22b16232bd60..8fc8fa280e92 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -36,9 +36,9 @@
 * for more information).
 */
 #ifdef CONFIG_ARM64_64K_PAGES
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS)
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
 #else
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS - 1)
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
 #endif

 #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index e20df38a8ff3..76420568d66a 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -28,7 +28,7 @@

 #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)

-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2

 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
@@ -46,9 +46,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
 }

-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */

-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3

 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
@@ -66,7 +66,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
 }

-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */

 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 5f930cc9ea83..80f3d241cff8 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -21,7 +21,7 @@
 /*
 * PMD_SHIFT determines the size a level 2 page table entry can map.
 */
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 #define PMD_SHIFT ((PAGE_SHIFT - 3) * 2 + 3)
 #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
 #define PMD_MASK (~(PMD_SIZE-1))
@@ -31,7 +31,7 @@
 /*
 * PUD_SHIFT determines the size a level 1 page table entry can map.
 */
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 #define PUD_SHIFT ((PAGE_SHIFT - 3) * 3 + 3)
 #define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
 #define PUD_MASK (~(PUD_SIZE-1))
@@ -42,7 +42,7 @@
 * PGDIR_SHIFT determines the size a top-level page table entry can map
 * (depending on the configuration, this level can be 0, 1 or 2).
 */
-#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_ARM64_PGTABLE_LEVELS + 3)
+#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_PGTABLE_LEVELS + 3)
 #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK (~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT))
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index ca9df80af896..2b1bd7e52c3b 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -38,13 +38,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define pte_val(x) ((x).pte)
 #define __pte(x) ((pte_t) { (x) } )

-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 typedef struct { pmdval_t pmd; } pmd_t;
 #define pmd_val(x) ((x).pmd)
 #define __pmd(x) ((pmd_t) { (x) } )
 #endif

-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 typedef struct { pudval_t pud; } pud_t;
 #define pud_val(x) ((x).pud)
 #define __pud(x) ((pud_t) { (x) } )
@@ -64,13 +64,13 @@ typedef pteval_t pte_t;
 #define pte_val(x) (x)
 #define __pte(x) (x)

-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 typedef pmdval_t pmd_t;
 #define pmd_val(x) (x)
 #define __pmd(x) (x)
 #endif

-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 typedef pudval_t pud_t;
 #define pud_val(x) (x)
 #define __pud(x) (x)
@@ -86,9 +86,9 @@ typedef pteval_t pgprot_t;

 #endif /* STRICT_MM_TYPECHECKS */

-#if CONFIG_ARM64_PGTABLE_LEVELS == 2
+#if CONFIG_PGTABLE_LEVELS == 2
 #include <asm-generic/pgtable-nopmd.h>
-#elif CONFIG_ARM64_PGTABLE_LEVELS == 3
+#elif CONFIG_PGTABLE_LEVELS == 3
 #include <asm-generic/pgtable-nopud.h>
 #endif

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 800ec0e87ed9..56283f8a675c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -374,7 +374,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
 */
 #define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot)

-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2

 #define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))

@@ -409,9 +409,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)

 #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK))

-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */

-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3

 #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))

@@ -445,7 +445,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)

 #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK))

-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */

 #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 53d9c354219f..3a0242c7eb8d 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -53,7 +53,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 tlb_remove_entry(tlb, pte);
 }

-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 unsigned long addr)
 {
@@ -62,7 +62,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 }
 #endif

-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
 unsigned long addr)
 {
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ae85da6307bb..597831bdddf3 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -190,6 +190,8 @@ void __init bootmem_init(void)
 min = PFN_UP(memblock_start_of_DRAM());
 max = PFN_DOWN(memblock_end_of_DRAM());

+early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
+
 /*
 * Sparsemem tries to allocate bootmem in memory_present(), so must be
 * done after the fixed reservations.
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 54922d1275b8..ed177475dd8c 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -47,17 +47,16 @@ static int mmap_is_legacy(void)
 return sysctl_legacy_va_layout;
 }

-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
-unsigned long rnd = 0;
+unsigned long rnd;

-if (current->flags & PF_RANDOMIZE)
-rnd = (long)get_random_int() & STACK_RND_MASK;
+rnd = (unsigned long)get_random_int() & STACK_RND_MASK;

 return rnd << PAGE_SHIFT;
 }

-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
 {
 unsigned long gap = rlimit(RLIMIT_STACK);

@@ -66,7 +65,7 @@ static unsigned long mmap_base(void)
 else if (gap > MAX_GAP)
 gap = MAX_GAP;

-return PAGE_ALIGN(STACK_TOP - gap - mmap_rnd());
+return PAGE_ALIGN(STACK_TOP - gap - rnd);
 }

 /*
@@ -75,15 +74,20 @@ static unsigned long mmap_base(void)
 */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
+unsigned long random_factor = 0UL;
+
+if (current->flags & PF_RANDOMIZE)
+random_factor = arch_mmap_rnd();
+
 /*
 * Fall back to the standard layout if the personality bit is set, or
 * if the expected stack growth is unlimited:
 */
 if (mmap_is_legacy()) {
-mm->mmap_base = TASK_UNMAPPED_BASE;
+mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 mm->get_unmapped_area = arch_get_unmapped_area;
 } else {
-mm->mmap_base = mmap_base();
+mm->mmap_base = mmap_base(random_factor);
 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 }
 }
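A similar back-of-the-envelope note for the arm64 helper above (not part of the patch, and assuming 4 KB pages so that STACK_RND_MASK evaluates to the 0x3ffff value shown earlier in this diff): rnd can be as large as 0x3ffff, so rnd << PAGE_SHIFT allows mmap base offsets of up to roughly 1 GB.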
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index c6daaf6c6f97..79e01163a981 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -550,10 +550,10 @@ void vmemmap_free(unsigned long start, unsigned long end)
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */

 static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
 #endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
 #endif

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 074e52bf815c..4f9a6661491b 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -1,3 +1,8 @@
+config PGTABLE_LEVELS
+int "Page Table Levels" if !IA64_PAGE_SIZE_64KB
+range 3 4 if !IA64_PAGE_SIZE_64KB
+default 3
+
 source "init/Kconfig"

 source "kernel/Kconfig.freezer"
@@ -286,19 +291,6 @@ config IA64_PAGE_SIZE_64KB

 endchoice

-choice
-prompt "Page Table Levels"
-default PGTABLE_3
-
-config PGTABLE_3
-bool "3 Levels"
-
-config PGTABLE_4
-depends on !IA64_PAGE_SIZE_64KB
-bool "4 Levels"
-
-endchoice
-
 if IA64_HP_SIM
 config HZ
 default 32
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index 1f1bf144fe62..ec48bb9f95e1 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -173,7 +173,7 @@ get_order (unsigned long size)
 */
 typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 typedef struct { unsigned long pud; } pud_t;
 #endif
 typedef struct { unsigned long pgd; } pgd_t;
@@ -182,7 +182,7 @@ get_order (unsigned long size)

 # define pte_val(x) ((x).pte)
 # define pmd_val(x) ((x).pmd)
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 # define pud_val(x) ((x).pud)
 #endif
 # define pgd_val(x) ((x).pgd)
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index 5767cdfc08db..f5e70e961948 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -32,7 +32,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 quicklist_free(0, NULL, pgd);
 }

-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 static inline void
 pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
 {
@@ -49,7 +49,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 quicklist_free(0, NULL, pud);
 }
 #define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud)
-#endif /* CONFIG_PGTABLE_4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */

 static inline void
 pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 7b6f8801df57..9f3ed9ee8f13 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -99,7 +99,7 @@
 #define PMD_MASK (~(PMD_SIZE-1))
 #define PTRS_PER_PMD (1UL << (PTRS_PER_PTD_SHIFT))

-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 /*
 * Definitions for second level:
 *
@@ -117,7 +117,7 @@
 *
 * PGDIR_SHIFT determines what a first-level page table entry can map.
 */
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 #define PGDIR_SHIFT (PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
 #else
 #define PGDIR_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
@@ -180,7 +180,7 @@
 #define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX)

 #define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 #define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
 #endif
 #define pmd_ERROR(e) printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
@@ -281,7 +281,7 @@ extern unsigned long VMALLOC_END;
 #define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK))
 #define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET))

-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 #define pgd_none(pgd) (!pgd_val(pgd))
 #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd)))
 #define pgd_present(pgd) (pgd_val(pgd) != 0UL)
@@ -384,7 +384,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address)
 here. */
 #define pgd_offset_gate(mm, addr) pgd_offset_k(addr)

-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 /* Find an entry in the second-level page table.. */
 #define pud_offset(dir,addr) \
 ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
@@ -586,7 +586,7 @@ extern struct page *zero_page_memmap_ptr;
 #define __HAVE_ARCH_PGD_OFFSET_GATE


-#ifndef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 3
 #include <asm-generic/pgtable-nopud.h>
 #endif
 #include <asm-generic/pgtable.h>
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index 18e794a57248..e42bf7a913f3 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -146,7 +146,7 @@ ENTRY(vhpt_miss)
 (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
 cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 shr.u r28=r22,PUD_SHIFT // shift pud index into position
 #else
 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -155,7 +155,7 @@ ENTRY(vhpt_miss)
 ld8 r17=[r17] // get *pgd (may be 0)
 ;;
 (p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
 ;;
 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -222,13 +222,13 @@ ENTRY(vhpt_miss)
 */
 ld8 r25=[r21] // read *pte again
 ld8 r26=[r17] // read *pmd again
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 ld8 r19=[r28] // read *pud again
 #endif
 cmp.ne p6,p7=r0,r0
 ;;
 cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
 #endif
 mov r27=PAGE_SHIFT<<2
@@ -476,7 +476,7 @@ ENTRY(nested_dtlb_miss)
 (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
 cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 shr.u r18=r22,PUD_SHIFT // shift pud index into position
 #else
 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -487,7 +487,7 @@ ENTRY(nested_dtlb_miss)
 (p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
 dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
 ;;
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
 (p7) ld8 r17=[r17] // get *pud (may be 0)
 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
 ;;
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 5151a649c96b..b72cd7a07222 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -156,9 +156,9 @@ void arch_crash_save_vmcoreinfo(void)
 VMCOREINFO_OFFSET(node_memblk_s, start_paddr);
 VMCOREINFO_OFFSET(node_memblk_s, size);
 #endif
-#ifdef CONFIG_PGTABLE_3
+#if CONFIG_PGTABLE_LEVELS == 3
 VMCOREINFO_CONFIG(PGTABLE_3);
-#elif defined(CONFIG_PGTABLE_4)
+#elif CONFIG_PGTABLE_LEVELS == 4
 VMCOREINFO_CONFIG(PGTABLE_4);
 #endif
 }
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 87b7c7581b1d..2dd8f63bfbbb 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -67,6 +67,10 @@ config HZ
 default 1000 if CLEOPATRA
 default 100

+config PGTABLE_LEVELS
+default 2 if SUN3 || COLDFIRE
+default 3
+
 source "init/Kconfig"

 source "kernel/Kconfig.freezer"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c7a16904cd03..a326c4cb8cf0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig | |||
| @@ -23,7 +23,7 @@ config MIPS | |||
| 23 | select HAVE_KRETPROBES | 23 | select HAVE_KRETPROBES |
| 24 | select HAVE_DEBUG_KMEMLEAK | 24 | select HAVE_DEBUG_KMEMLEAK |
| 25 | select HAVE_SYSCALL_TRACEPOINTS | 25 | select HAVE_SYSCALL_TRACEPOINTS |
| 26 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 26 | select ARCH_HAS_ELF_RANDOMIZE |
| 27 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT | 27 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT |
| 28 | select RTC_LIB if !MACH_LOONGSON | 28 | select RTC_LIB if !MACH_LOONGSON |
| 29 | select GENERIC_ATOMIC64 if !64BIT | 29 | select GENERIC_ATOMIC64 if !64BIT |
| @@ -2600,6 +2600,11 @@ config STACKTRACE_SUPPORT | |||
| 2600 | bool | 2600 | bool |
| 2601 | default y | 2601 | default y |
| 2602 | 2602 | ||
| 2603 | config PGTABLE_LEVELS | ||
| 2604 | int | ||
| 2605 | default 3 if 64BIT && !PAGE_SIZE_64KB | ||
| 2606 | default 2 | ||
| 2607 | |||
| 2603 | source "init/Kconfig" | 2608 | source "init/Kconfig" |
| 2604 | 2609 | ||
| 2605 | source "kernel/Kconfig.freezer" | 2610 | source "kernel/Kconfig.freezer" |
diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index 535f196ffe02..31d747d46a23 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h | |||
| @@ -410,10 +410,6 @@ struct linux_binprm; | |||
| 410 | extern int arch_setup_additional_pages(struct linux_binprm *bprm, | 410 | extern int arch_setup_additional_pages(struct linux_binprm *bprm, |
| 411 | int uses_interp); | 411 | int uses_interp); |
| 412 | 412 | ||
| 413 | struct mm_struct; | ||
| 414 | extern unsigned long arch_randomize_brk(struct mm_struct *mm); | ||
| 415 | #define arch_randomize_brk arch_randomize_brk | ||
| 416 | |||
| 417 | struct arch_elf_state { | 413 | struct arch_elf_state { |
| 418 | int fp_abi; | 414 | int fp_abi; |
| 419 | int interp_fp_abi; | 415 | int interp_fp_abi; |
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index f1baadd56e82..5c81fdd032c3 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c | |||
| @@ -142,18 +142,26 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, | |||
| 142 | addr0, len, pgoff, flags, DOWN); | 142 | addr0, len, pgoff, flags, DOWN); |
| 143 | } | 143 | } |
| 144 | 144 | ||
| 145 | unsigned long arch_mmap_rnd(void) | ||
| 146 | { | ||
| 147 | unsigned long rnd; | ||
| 148 | |||
| 149 | rnd = (unsigned long)get_random_int(); | ||
| 150 | rnd <<= PAGE_SHIFT; | ||
| 151 | if (TASK_IS_32BIT_ADDR) | ||
| 152 | rnd &= 0xfffffful; | ||
| 153 | else | ||
| 154 | rnd &= 0xffffffful; | ||
| 155 | |||
| 156 | return rnd; | ||
| 157 | } | ||
| 158 | |||
| 145 | void arch_pick_mmap_layout(struct mm_struct *mm) | 159 | void arch_pick_mmap_layout(struct mm_struct *mm) |
| 146 | { | 160 | { |
| 147 | unsigned long random_factor = 0UL; | 161 | unsigned long random_factor = 0UL; |
| 148 | 162 | ||
| 149 | if (current->flags & PF_RANDOMIZE) { | 163 | if (current->flags & PF_RANDOMIZE) |
| 150 | random_factor = get_random_int(); | 164 | random_factor = arch_mmap_rnd(); |
| 151 | random_factor = random_factor << PAGE_SHIFT; | ||
| 152 | if (TASK_IS_32BIT_ADDR) | ||
| 153 | random_factor &= 0xfffffful; | ||
| 154 | else | ||
| 155 | random_factor &= 0xffffffful; | ||
| 156 | } | ||
| 157 | 165 | ||
| 158 | if (mmap_is_legacy()) { | 166 | if (mmap_is_legacy()) { |
| 159 | mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; | 167 | mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; |
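
The new MIPS arch_mmap_rnd() masks the page-shifted random value, so the resulting offset stays page aligned and bounded (under 16 MB for 32-bit tasks, under 256 MB for 64-bit ones). A small user-space sketch of the same arithmetic; PAGE_SHIFT = 12 is assumed and rand() stands in for get_random_int() purely for illustration:

/* Illustrative arithmetic only: mirrors the masking in the MIPS
 * arch_mmap_rnd() above.  Values assume 4 KB pages. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static unsigned long mmap_rnd(int is_32bit)
{
	unsigned long rnd = (unsigned long)rand();   /* get_random_int() in the kernel */

	rnd <<= PAGE_SHIFT;                          /* keep the offset page aligned */
	rnd &= is_32bit ? 0xfffffful : 0xffffffful;  /* < 16 MB or < 256 MB of slide */
	return rnd;
}

int main(void)
{
	printf("32-bit offset: %#lx\n", mmap_rnd(1));
	printf("64-bit offset: %#lx\n", mmap_rnd(0));
	return 0;
}
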
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8014727a2743..c36546959e86 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig | |||
| @@ -103,6 +103,11 @@ config ARCH_MAY_HAVE_PC_FDC | |||
| 103 | depends on BROKEN | 103 | depends on BROKEN |
| 104 | default y | 104 | default y |
| 105 | 105 | ||
| 106 | config PGTABLE_LEVELS | ||
| 107 | int | ||
| 108 | default 3 if 64BIT && PARISC_PAGE_SIZE_4KB | ||
| 109 | default 2 | ||
| 110 | |||
| 106 | source "init/Kconfig" | 111 | source "init/Kconfig" |
| 107 | 112 | ||
| 108 | source "kernel/Kconfig.freezer" | 113 | source "kernel/Kconfig.freezer" |
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index d17437238a2c..1ba29369257c 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h | |||
| @@ -51,7 +51,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
| 51 | free_pages((unsigned long)pgd, PGD_ALLOC_ORDER); | 51 | free_pages((unsigned long)pgd, PGD_ALLOC_ORDER); |
| 52 | } | 52 | } |
| 53 | 53 | ||
| 54 | #if PT_NLEVELS == 3 | 54 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 55 | 55 | ||
| 56 | /* Three Level Page Table Support for pmd's */ | 56 | /* Three Level Page Table Support for pmd's */ |
| 57 | 57 | ||
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 15207b9362bf..0a183756d6ec 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h | |||
| @@ -68,13 +68,11 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); | |||
| 68 | #define KERNEL_INITIAL_ORDER 24 /* 0 to 1<<24 = 16MB */ | 68 | #define KERNEL_INITIAL_ORDER 24 /* 0 to 1<<24 = 16MB */ |
| 69 | #define KERNEL_INITIAL_SIZE (1 << KERNEL_INITIAL_ORDER) | 69 | #define KERNEL_INITIAL_SIZE (1 << KERNEL_INITIAL_ORDER) |
| 70 | 70 | ||
| 71 | #if defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB) | 71 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 72 | #define PT_NLEVELS 3 | ||
| 73 | #define PGD_ORDER 1 /* Number of pages per pgd */ | 72 | #define PGD_ORDER 1 /* Number of pages per pgd */ |
| 74 | #define PMD_ORDER 1 /* Number of pages per pmd */ | 73 | #define PMD_ORDER 1 /* Number of pages per pmd */ |
| 75 | #define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */ | 74 | #define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */ |
| 76 | #else | 75 | #else |
| 77 | #define PT_NLEVELS 2 | ||
| 78 | #define PGD_ORDER 1 /* Number of pages per pgd */ | 76 | #define PGD_ORDER 1 /* Number of pages per pgd */ |
| 79 | #define PGD_ALLOC_ORDER PGD_ORDER | 77 | #define PGD_ALLOC_ORDER PGD_ORDER |
| 80 | #endif | 78 | #endif |
| @@ -93,7 +91,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); | |||
| 93 | #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) | 91 | #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) |
| 94 | #define PMD_SIZE (1UL << PMD_SHIFT) | 92 | #define PMD_SIZE (1UL << PMD_SHIFT) |
| 95 | #define PMD_MASK (~(PMD_SIZE-1)) | 93 | #define PMD_MASK (~(PMD_SIZE-1)) |
| 96 | #if PT_NLEVELS == 3 | 94 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 97 | #define BITS_PER_PMD (PAGE_SHIFT + PMD_ORDER - BITS_PER_PMD_ENTRY) | 95 | #define BITS_PER_PMD (PAGE_SHIFT + PMD_ORDER - BITS_PER_PMD_ENTRY) |
| 98 | #else | 96 | #else |
| 99 | #define __PAGETABLE_PMD_FOLDED | 97 | #define __PAGETABLE_PMD_FOLDED |
| @@ -277,7 +275,7 @@ extern unsigned long *empty_zero_page; | |||
| 277 | #define pgd_flag(x) (pgd_val(x) & PxD_FLAG_MASK) | 275 | #define pgd_flag(x) (pgd_val(x) & PxD_FLAG_MASK) |
| 278 | #define pgd_address(x) ((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT) | 276 | #define pgd_address(x) ((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT) |
| 279 | 277 | ||
| 280 | #if PT_NLEVELS == 3 | 278 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 281 | /* The first entry of the permanent pmd is not there if it contains | 279 | /* The first entry of the permanent pmd is not there if it contains |
| 282 | * the gateway marker */ | 280 | * the gateway marker */ |
| 283 | #define pmd_none(x) (!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED) | 281 | #define pmd_none(x) (!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED) |
| @@ -287,7 +285,7 @@ extern unsigned long *empty_zero_page; | |||
| 287 | #define pmd_bad(x) (!(pmd_flag(x) & PxD_FLAG_VALID)) | 285 | #define pmd_bad(x) (!(pmd_flag(x) & PxD_FLAG_VALID)) |
| 288 | #define pmd_present(x) (pmd_flag(x) & PxD_FLAG_PRESENT) | 286 | #define pmd_present(x) (pmd_flag(x) & PxD_FLAG_PRESENT) |
| 289 | static inline void pmd_clear(pmd_t *pmd) { | 287 | static inline void pmd_clear(pmd_t *pmd) { |
| 290 | #if PT_NLEVELS == 3 | 288 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 291 | if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) | 289 | if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) |
| 292 | /* This is the entry pointing to the permanent pmd | 290 | /* This is the entry pointing to the permanent pmd |
| 293 | * attached to the pgd; cannot clear it */ | 291 | * attached to the pgd; cannot clear it */ |
| @@ -299,7 +297,7 @@ static inline void pmd_clear(pmd_t *pmd) { | |||
| 299 | 297 | ||
| 300 | 298 | ||
| 301 | 299 | ||
| 302 | #if PT_NLEVELS == 3 | 300 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 303 | #define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd))) | 301 | #define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd))) |
| 304 | #define pgd_page(pgd) virt_to_page((void *)pgd_page_vaddr(pgd)) | 302 | #define pgd_page(pgd) virt_to_page((void *)pgd_page_vaddr(pgd)) |
| 305 | 303 | ||
| @@ -309,7 +307,7 @@ static inline void pmd_clear(pmd_t *pmd) { | |||
| 309 | #define pgd_bad(x) (!(pgd_flag(x) & PxD_FLAG_VALID)) | 307 | #define pgd_bad(x) (!(pgd_flag(x) & PxD_FLAG_VALID)) |
| 310 | #define pgd_present(x) (pgd_flag(x) & PxD_FLAG_PRESENT) | 308 | #define pgd_present(x) (pgd_flag(x) & PxD_FLAG_PRESENT) |
| 311 | static inline void pgd_clear(pgd_t *pgd) { | 309 | static inline void pgd_clear(pgd_t *pgd) { |
| 312 | #if PT_NLEVELS == 3 | 310 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 313 | if(pgd_flag(*pgd) & PxD_FLAG_ATTACHED) | 311 | if(pgd_flag(*pgd) & PxD_FLAG_ATTACHED) |
| 314 | /* This is the permanent pmd attached to the pgd; cannot | 312 | /* This is the permanent pmd attached to the pgd; cannot |
| 315 | * free it */ | 313 | * free it */ |
| @@ -393,7 +391,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |||
| 393 | 391 | ||
| 394 | /* Find an entry in the second-level page table.. */ | 392 | /* Find an entry in the second-level page table.. */ |
| 395 | 393 | ||
| 396 | #if PT_NLEVELS == 3 | 394 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 397 | #define pmd_offset(dir,address) \ | 395 | #define pmd_offset(dir,address) \ |
| 398 | ((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) | 396 | ((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) |
| 399 | #else | 397 | #else |
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 2ab16bb160a8..75819617f93b 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S | |||
| @@ -398,7 +398,7 @@ | |||
| 398 | * can address up to 1TB | 398 | * can address up to 1TB |
| 399 | */ | 399 | */ |
| 400 | .macro L2_ptep pmd,pte,index,va,fault | 400 | .macro L2_ptep pmd,pte,index,va,fault |
| 401 | #if PT_NLEVELS == 3 | 401 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 402 | extru \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index | 402 | extru \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index |
| 403 | #else | 403 | #else |
| 404 | # if defined(CONFIG_64BIT) | 404 | # if defined(CONFIG_64BIT) |
| @@ -436,7 +436,7 @@ | |||
| 436 | * all ILP32 processes and all the kernel for machines with | 436 | * all ILP32 processes and all the kernel for machines with |
| 437 | * under 4GB of memory) */ | 437 | * under 4GB of memory) */ |
| 438 | .macro L3_ptep pgd,pte,index,va,fault | 438 | .macro L3_ptep pgd,pte,index,va,fault |
| 439 | #if PT_NLEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */ | 439 | #if CONFIG_PGTABLE_LEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */ |
| 440 | extrd,u \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index | 440 | extrd,u \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index |
| 441 | copy %r0,\pte | 441 | copy %r0,\pte |
| 442 | extrd,u,*= \va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0 | 442 | extrd,u,*= \va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0 |
diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index d4dc588c0dc1..e7d64527aff9 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S | |||
| @@ -74,7 +74,7 @@ $bss_loop: | |||
| 74 | mtctl %r4,%cr24 /* Initialize kernel root pointer */ | 74 | mtctl %r4,%cr24 /* Initialize kernel root pointer */ |
| 75 | mtctl %r4,%cr25 /* Initialize user root pointer */ | 75 | mtctl %r4,%cr25 /* Initialize user root pointer */ |
| 76 | 76 | ||
| 77 | #if PT_NLEVELS == 3 | 77 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 78 | /* Set pmd in pgd */ | 78 | /* Set pmd in pgd */ |
| 79 | load32 PA(pmd0),%r5 | 79 | load32 PA(pmd0),%r5 |
| 80 | shrd %r5,PxD_VALUE_SHIFT,%r3 | 80 | shrd %r5,PxD_VALUE_SHIFT,%r3 |
| @@ -97,7 +97,7 @@ $bss_loop: | |||
| 97 | stw %r3,0(%r4) | 97 | stw %r3,0(%r4) |
| 98 | ldo (PAGE_SIZE >> PxD_VALUE_SHIFT)(%r3),%r3 | 98 | ldo (PAGE_SIZE >> PxD_VALUE_SHIFT)(%r3),%r3 |
| 99 | addib,> -1,%r1,1b | 99 | addib,> -1,%r1,1b |
| 100 | #if PT_NLEVELS == 3 | 100 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 101 | ldo ASM_PMD_ENTRY_SIZE(%r4),%r4 | 101 | ldo ASM_PMD_ENTRY_SIZE(%r4),%r4 |
| 102 | #else | 102 | #else |
| 103 | ldo ASM_PGD_ENTRY_SIZE(%r4),%r4 | 103 | ldo ASM_PGD_ENTRY_SIZE(%r4),%r4 |
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 15dbe81cf5f3..c229427fa546 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c | |||
| @@ -34,7 +34,7 @@ | |||
| 34 | extern int data_start; | 34 | extern int data_start; |
| 35 | extern void parisc_kernel_start(void); /* Kernel entry point in head.S */ | 35 | extern void parisc_kernel_start(void); /* Kernel entry point in head.S */ |
| 36 | 36 | ||
| 37 | #if PT_NLEVELS == 3 | 37 | #if CONFIG_PGTABLE_LEVELS == 3 |
| 38 | /* NOTE: This layout exactly conforms to the hybrid L2/L3 page table layout | 38 | /* NOTE: This layout exactly conforms to the hybrid L2/L3 page table layout |
| 39 | * with the first pmd adjacent to the pgd and below it. gcc doesn't actually | 39 | * with the first pmd adjacent to the pgd and below it. gcc doesn't actually |
| 40 | * guarantee that global objects will be laid out in memory in the same order | 40 | * guarantee that global objects will be laid out in memory in the same order |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 22b0940494bb..e99014adf017 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
| @@ -88,7 +88,7 @@ config PPC | |||
| 88 | select ARCH_MIGHT_HAVE_PC_PARPORT | 88 | select ARCH_MIGHT_HAVE_PC_PARPORT |
| 89 | select ARCH_MIGHT_HAVE_PC_SERIO | 89 | select ARCH_MIGHT_HAVE_PC_SERIO |
| 90 | select BINFMT_ELF | 90 | select BINFMT_ELF |
| 91 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 91 | select ARCH_HAS_ELF_RANDOMIZE |
| 92 | select OF | 92 | select OF |
| 93 | select OF_EARLY_FLATTREE | 93 | select OF_EARLY_FLATTREE |
| 94 | select OF_RESERVED_MEM | 94 | select OF_RESERVED_MEM |
| @@ -297,6 +297,12 @@ config ZONE_DMA32 | |||
| 297 | bool | 297 | bool |
| 298 | default y if PPC64 | 298 | default y if PPC64 |
| 299 | 299 | ||
| 300 | config PGTABLE_LEVELS | ||
| 301 | int | ||
| 302 | default 2 if !PPC64 | ||
| 303 | default 3 if PPC_64K_PAGES | ||
| 304 | default 4 | ||
| 305 | |||
| 300 | source "init/Kconfig" | 306 | source "init/Kconfig" |
| 301 | 307 | ||
| 302 | source "kernel/Kconfig.freezer" | 308 | source "kernel/Kconfig.freezer" |
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 57d289acb803..ee46ffef608e 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h | |||
| @@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, | |||
| 128 | (0x7ff >> (PAGE_SHIFT - 12)) : \ | 128 | (0x7ff >> (PAGE_SHIFT - 12)) : \ |
| 129 | (0x3ffff >> (PAGE_SHIFT - 12))) | 129 | (0x3ffff >> (PAGE_SHIFT - 12))) |
| 130 | 130 | ||
| 131 | extern unsigned long arch_randomize_brk(struct mm_struct *mm); | ||
| 132 | #define arch_randomize_brk arch_randomize_brk | ||
| 133 | |||
| 134 | |||
| 135 | #ifdef CONFIG_SPU_BASE | 131 | #ifdef CONFIG_SPU_BASE |
| 136 | /* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */ | 132 | /* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */ |
| 137 | #define NT_SPU 1 | 133 | #define NT_SPU 1 |
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index cb8bdbe4972f..0f0502e12f6c 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c | |||
| @@ -53,21 +53,20 @@ static inline int mmap_is_legacy(void) | |||
| 53 | return sysctl_legacy_va_layout; | 53 | return sysctl_legacy_va_layout; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | static unsigned long mmap_rnd(void) | 56 | unsigned long arch_mmap_rnd(void) |
| 57 | { | 57 | { |
| 58 | unsigned long rnd = 0; | 58 | unsigned long rnd; |
| 59 | |||
| 60 | /* 8MB for 32bit, 1GB for 64bit */ | ||
| 61 | if (is_32bit_task()) | ||
| 62 | rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT)); | ||
| 63 | else | ||
| 64 | rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT)); | ||
| 59 | 65 | ||
| 60 | if (current->flags & PF_RANDOMIZE) { | ||
| 61 | /* 8MB for 32bit, 1GB for 64bit */ | ||
| 62 | if (is_32bit_task()) | ||
| 63 | rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); | ||
| 64 | else | ||
| 65 | rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); | ||
| 66 | } | ||
| 67 | return rnd << PAGE_SHIFT; | 66 | return rnd << PAGE_SHIFT; |
| 68 | } | 67 | } |
| 69 | 68 | ||
| 70 | static inline unsigned long mmap_base(void) | 69 | static inline unsigned long mmap_base(unsigned long rnd) |
| 71 | { | 70 | { |
| 72 | unsigned long gap = rlimit(RLIMIT_STACK); | 71 | unsigned long gap = rlimit(RLIMIT_STACK); |
| 73 | 72 | ||
| @@ -76,7 +75,7 @@ static inline unsigned long mmap_base(void) | |||
| 76 | else if (gap > MAX_GAP) | 75 | else if (gap > MAX_GAP) |
| 77 | gap = MAX_GAP; | 76 | gap = MAX_GAP; |
| 78 | 77 | ||
| 79 | return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); | 78 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); |
| 80 | } | 79 | } |
| 81 | 80 | ||
| 82 | /* | 81 | /* |
| @@ -85,6 +84,11 @@ static inline unsigned long mmap_base(void) | |||
| 85 | */ | 84 | */ |
| 86 | void arch_pick_mmap_layout(struct mm_struct *mm) | 85 | void arch_pick_mmap_layout(struct mm_struct *mm) |
| 87 | { | 86 | { |
| 87 | unsigned long random_factor = 0UL; | ||
| 88 | |||
| 89 | if (current->flags & PF_RANDOMIZE) | ||
| 90 | random_factor = arch_mmap_rnd(); | ||
| 91 | |||
| 88 | /* | 92 | /* |
| 89 | * Fall back to the standard layout if the personality | 93 | * Fall back to the standard layout if the personality |
| 90 | * bit is set, or if the expected stack growth is unlimited: | 94 | * bit is set, or if the expected stack growth is unlimited: |
| @@ -93,7 +97,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
| 93 | mm->mmap_base = TASK_UNMAPPED_BASE; | 97 | mm->mmap_base = TASK_UNMAPPED_BASE; |
| 94 | mm->get_unmapped_area = arch_get_unmapped_area; | 98 | mm->get_unmapped_area = arch_get_unmapped_area; |
| 95 | } else { | 99 | } else { |
| 96 | mm->mmap_base = mmap_base(); | 100 | mm->mmap_base = mmap_base(random_factor); |
| 97 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | 101 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; |
| 98 | } | 102 | } |
| 99 | } | 103 | } |
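
The powerpc conversion keeps the 8 MB (32-bit) / 1 GB (64-bit) randomisation ranges but uses a modulo rather than a mask, and the PF_RANDOMIZE test moves out of the helper; mmap_base() now just subtracts the pre-computed factor. A user-space sketch of the range arithmetic, with PAGE_SHIFT = 12 assumed and rand() again standing in for get_random_int():

/* Illustrative only: the 8 MB / 1 GB ranges from the powerpc
 * arch_mmap_rnd() above, expressed as plain arithmetic. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static unsigned long arch_mmap_rnd(int is_32bit)
{
	unsigned long rnd;

	if (is_32bit)
		rnd = (unsigned long)rand() % (1UL << (23 - PAGE_SHIFT)); /* 8 MB worth of pages */
	else
		rnd = (unsigned long)rand() % (1UL << (30 - PAGE_SHIFT)); /* 1 GB worth of pages */

	return rnd << PAGE_SHIFT;   /* back to a byte offset */
}

int main(void)
{
	printf("32-bit slide < %#lx bytes\n", 1UL << 23);
	printf("64-bit slide < %#lx bytes\n", 1UL << 30);
	printf("sample: %#lx\n", arch_mmap_rnd(0));
	return 0;
}
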
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b2d7ec1669b4..6321fd8bf813 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig | |||
| @@ -65,6 +65,7 @@ config S390 | |||
| 65 | def_bool y | 65 | def_bool y |
| 66 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 66 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
| 67 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS | 67 | select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS |
| 68 | select ARCH_HAS_ELF_RANDOMIZE | ||
| 68 | select ARCH_HAS_GCOV_PROFILE_ALL | 69 | select ARCH_HAS_GCOV_PROFILE_ALL |
| 69 | select ARCH_HAS_SG_CHAIN | 70 | select ARCH_HAS_SG_CHAIN |
| 70 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 71 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
| @@ -156,6 +157,11 @@ config S390 | |||
| 156 | config SCHED_OMIT_FRAME_POINTER | 157 | config SCHED_OMIT_FRAME_POINTER |
| 157 | def_bool y | 158 | def_bool y |
| 158 | 159 | ||
| 160 | config PGTABLE_LEVELS | ||
| 161 | int | ||
| 162 | default 4 if 64BIT | ||
| 163 | default 2 | ||
| 164 | |||
| 159 | source "init/Kconfig" | 165 | source "init/Kconfig" |
| 160 | 166 | ||
| 161 | source "kernel/Kconfig.freezer" | 167 | source "kernel/Kconfig.freezer" |
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c9c875d9ed31..a5c4978462c1 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h | |||
| @@ -161,10 +161,11 @@ extern unsigned int vdso_enabled; | |||
| 161 | /* This is the location that an ET_DYN program is loaded if exec'ed. Typical | 161 | /* This is the location that an ET_DYN program is loaded if exec'ed. Typical |
| 162 | use of this is to invoke "./ld.so someprog" to test out a new version of | 162 | use of this is to invoke "./ld.so someprog" to test out a new version of |
| 163 | the loader. We need to make sure that it is out of the way of the program | 163 | the loader. We need to make sure that it is out of the way of the program |
| 164 | that it will "exec", and that there is sufficient room for the brk. */ | 164 | that it will "exec", and that there is sufficient room for the brk. 64-bit |
| 165 | 165 | tasks are aligned to 4GB. */ | |
| 166 | extern unsigned long randomize_et_dyn(void); | 166 | #define ELF_ET_DYN_BASE (is_32bit_task() ? \ |
| 167 | #define ELF_ET_DYN_BASE randomize_et_dyn() | 167 | (STACK_TOP / 3 * 2) : \ |
| 168 | (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) | ||
| 168 | 169 | ||
| 169 | /* This yields a mask that user programs can use to figure out what | 170 | /* This yields a mask that user programs can use to figure out what |
| 170 | instruction set this CPU supports. */ | 171 | instruction set this CPU supports. */ |
| @@ -225,9 +226,6 @@ struct linux_binprm; | |||
| 225 | #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 | 226 | #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 |
| 226 | int arch_setup_additional_pages(struct linux_binprm *, int); | 227 | int arch_setup_additional_pages(struct linux_binprm *, int); |
| 227 | 228 | ||
| 228 | extern unsigned long arch_randomize_brk(struct mm_struct *mm); | ||
| 229 | #define arch_randomize_brk arch_randomize_brk | ||
| 230 | |||
| 231 | void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs); | 229 | void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs); |
| 232 | 230 | ||
| 233 | #endif | 231 | #endif |
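
With randomize_et_dyn() removed, the s390 ET_DYN base becomes a pure compile-time expression: two thirds of STACK_TOP, rounded down to a 4 GB boundary for 64-bit tasks. A quick user-space check of that arithmetic; the STACK_TOP value below is a made-up example, not the real s390 constant:

/* Illustrative only: evaluates the new s390 ELF_ET_DYN_BASE expression
 * with an example STACK_TOP.  The real value depends on the task's
 * address-space size. */
#include <stdio.h>

#define EXAMPLE_STACK_TOP (1UL << 42)   /* assumption for the demo */

int main(void)
{
	unsigned long base32 = EXAMPLE_STACK_TOP / 3 * 2;
	unsigned long base64 = (EXAMPLE_STACK_TOP / 3 * 2) & ~((1UL << 32) - 1);

	printf("32-bit base: %#lx\n", base32);
	printf("64-bit base: %#lx (4 GB aligned)\n", base64);
	return 0;
}
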
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 179a2c20b01f..bb3367c5cb0b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c | |||
| @@ -60,22 +60,20 @@ static inline int mmap_is_legacy(void) | |||
| 60 | return sysctl_legacy_va_layout; | 60 | return sysctl_legacy_va_layout; |
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | static unsigned long mmap_rnd(void) | 63 | unsigned long arch_mmap_rnd(void) |
| 64 | { | 64 | { |
| 65 | if (!(current->flags & PF_RANDOMIZE)) | ||
| 66 | return 0; | ||
| 67 | if (is_32bit_task()) | 65 | if (is_32bit_task()) |
| 68 | return (get_random_int() & 0x7ff) << PAGE_SHIFT; | 66 | return (get_random_int() & 0x7ff) << PAGE_SHIFT; |
| 69 | else | 67 | else |
| 70 | return (get_random_int() & mmap_rnd_mask) << PAGE_SHIFT; | 68 | return (get_random_int() & mmap_rnd_mask) << PAGE_SHIFT; |
| 71 | } | 69 | } |
| 72 | 70 | ||
| 73 | static unsigned long mmap_base_legacy(void) | 71 | static unsigned long mmap_base_legacy(unsigned long rnd) |
| 74 | { | 72 | { |
| 75 | return TASK_UNMAPPED_BASE + mmap_rnd(); | 73 | return TASK_UNMAPPED_BASE + rnd; |
| 76 | } | 74 | } |
| 77 | 75 | ||
| 78 | static inline unsigned long mmap_base(void) | 76 | static inline unsigned long mmap_base(unsigned long rnd) |
| 79 | { | 77 | { |
| 80 | unsigned long gap = rlimit(RLIMIT_STACK); | 78 | unsigned long gap = rlimit(RLIMIT_STACK); |
| 81 | 79 | ||
| @@ -84,7 +82,7 @@ static inline unsigned long mmap_base(void) | |||
| 84 | else if (gap > MAX_GAP) | 82 | else if (gap > MAX_GAP) |
| 85 | gap = MAX_GAP; | 83 | gap = MAX_GAP; |
| 86 | gap &= PAGE_MASK; | 84 | gap &= PAGE_MASK; |
| 87 | return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap; | 85 | return STACK_TOP - stack_maxrandom_size() - rnd - gap; |
| 88 | } | 86 | } |
| 89 | 87 | ||
| 90 | unsigned long | 88 | unsigned long |
| @@ -179,17 +177,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
| 179 | return addr; | 177 | return addr; |
| 180 | } | 178 | } |
| 181 | 179 | ||
| 182 | unsigned long randomize_et_dyn(void) | ||
| 183 | { | ||
| 184 | unsigned long base; | ||
| 185 | |||
| 186 | base = STACK_TOP / 3 * 2; | ||
| 187 | if (!is_32bit_task()) | ||
| 188 | /* Align to 4GB */ | ||
| 189 | base &= ~((1UL << 32) - 1); | ||
| 190 | return base + mmap_rnd(); | ||
| 191 | } | ||
| 192 | |||
| 193 | #ifndef CONFIG_64BIT | 180 | #ifndef CONFIG_64BIT |
| 194 | 181 | ||
| 195 | /* | 182 | /* |
| @@ -198,15 +185,20 @@ unsigned long randomize_et_dyn(void) | |||
| 198 | */ | 185 | */ |
| 199 | void arch_pick_mmap_layout(struct mm_struct *mm) | 186 | void arch_pick_mmap_layout(struct mm_struct *mm) |
| 200 | { | 187 | { |
| 188 | unsigned long random_factor = 0UL; | ||
| 189 | |||
| 190 | if (current->flags & PF_RANDOMIZE) | ||
| 191 | random_factor = arch_mmap_rnd(); | ||
| 192 | |||
| 201 | /* | 193 | /* |
| 202 | * Fall back to the standard layout if the personality | 194 | * Fall back to the standard layout if the personality |
| 203 | * bit is set, or if the expected stack growth is unlimited: | 195 | * bit is set, or if the expected stack growth is unlimited: |
| 204 | */ | 196 | */ |
| 205 | if (mmap_is_legacy()) { | 197 | if (mmap_is_legacy()) { |
| 206 | mm->mmap_base = mmap_base_legacy(); | 198 | mm->mmap_base = mmap_base_legacy(random_factor); |
| 207 | mm->get_unmapped_area = arch_get_unmapped_area; | 199 | mm->get_unmapped_area = arch_get_unmapped_area; |
| 208 | } else { | 200 | } else { |
| 209 | mm->mmap_base = mmap_base(); | 201 | mm->mmap_base = mmap_base(random_factor); |
| 210 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | 202 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; |
| 211 | } | 203 | } |
| 212 | } | 204 | } |
| @@ -273,15 +265,20 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, | |||
| 273 | */ | 265 | */ |
| 274 | void arch_pick_mmap_layout(struct mm_struct *mm) | 266 | void arch_pick_mmap_layout(struct mm_struct *mm) |
| 275 | { | 267 | { |
| 268 | unsigned long random_factor = 0UL; | ||
| 269 | |||
| 270 | if (current->flags & PF_RANDOMIZE) | ||
| 271 | random_factor = arch_mmap_rnd(); | ||
| 272 | |||
| 276 | /* | 273 | /* |
| 277 | * Fall back to the standard layout if the personality | 274 | * Fall back to the standard layout if the personality |
| 278 | * bit is set, or if the expected stack growth is unlimited: | 275 | * bit is set, or if the expected stack growth is unlimited: |
| 279 | */ | 276 | */ |
| 280 | if (mmap_is_legacy()) { | 277 | if (mmap_is_legacy()) { |
| 281 | mm->mmap_base = mmap_base_legacy(); | 278 | mm->mmap_base = mmap_base_legacy(random_factor); |
| 282 | mm->get_unmapped_area = s390_get_unmapped_area; | 279 | mm->get_unmapped_area = s390_get_unmapped_area; |
| 283 | } else { | 280 | } else { |
| 284 | mm->mmap_base = mmap_base(); | 281 | mm->mmap_base = mmap_base(random_factor); |
| 285 | mm->get_unmapped_area = s390_get_unmapped_area_topdown; | 282 | mm->get_unmapped_area = s390_get_unmapped_area_topdown; |
| 286 | } | 283 | } |
| 287 | } | 284 | } |
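
Across the arch_pick_mmap_layout() conversions in this series the shape is the same: compute the random factor once, only when PF_RANDOMIZE is set, and pass it down to the base helpers. A condensed user-space model of that calling pattern — the names mirror the kernel code above, but the constants and helper bodies are illustrative assumptions, not the kernel source:

/* Illustrative model of the common arch_pick_mmap_layout() shape after
 * this series: the PF_RANDOMIZE check lives in the caller and the base
 * helpers only consume a pre-computed factor. */
#include <stdio.h>
#include <stdlib.h>

#define TASK_UNMAPPED_BASE 0x40000000UL    /* example value */
#define TASK_SIZE          0x800000000UL   /* example value */

static unsigned long arch_mmap_rnd(void)
{
	return ((unsigned long)rand() & 0x7ff) << 12;   /* s390 32-bit style mask */
}

static unsigned long mmap_base_legacy(unsigned long rnd)
{
	return TASK_UNMAPPED_BASE + rnd;
}

static unsigned long mmap_base(unsigned long rnd)
{
	return TASK_SIZE - 0x1000000UL - rnd;           /* stack-gap handling elided */
}

int main(int argc, char **argv)
{
	int randomize = argc > 1;          /* stands in for PF_RANDOMIZE */
	int legacy = argc > 2;             /* stands in for mmap_is_legacy() */
	unsigned long random_factor = 0UL;

	if (randomize)
		random_factor = arch_mmap_rnd();

	printf("mmap_base = %#lx\n",
	       legacy ? mmap_base_legacy(random_factor) : mmap_base(random_factor));
	return 0;
}
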
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index eb4ef274ae9b..50057fed819d 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig | |||
| @@ -162,6 +162,10 @@ config NEED_DMA_MAP_STATE | |||
| 162 | config NEED_SG_DMA_LENGTH | 162 | config NEED_SG_DMA_LENGTH |
| 163 | def_bool y | 163 | def_bool y |
| 164 | 164 | ||
| 165 | config PGTABLE_LEVELS | ||
| 166 | default 3 if X2TLB | ||
| 167 | default 2 | ||
| 168 | |||
| 165 | source "init/Kconfig" | 169 | source "init/Kconfig" |
| 166 | 170 | ||
| 167 | source "kernel/Kconfig.freezer" | 171 | source "kernel/Kconfig.freezer" |
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index 67a049e75ec1..9d209a07235e 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c | |||
| @@ -993,7 +993,7 @@ static struct unwinder dwarf_unwinder = { | |||
| 993 | .rating = 150, | 993 | .rating = 150, |
| 994 | }; | 994 | }; |
| 995 | 995 | ||
| 996 | static void dwarf_unwinder_cleanup(void) | 996 | static void __init dwarf_unwinder_cleanup(void) |
| 997 | { | 997 | { |
| 998 | struct dwarf_fde *fde, *next_fde; | 998 | struct dwarf_fde *fde, *next_fde; |
| 999 | struct dwarf_cie *cie, *next_cie; | 999 | struct dwarf_cie *cie, *next_cie; |
| @@ -1009,6 +1009,10 @@ static void dwarf_unwinder_cleanup(void) | |||
| 1009 | rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node) | 1009 | rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node) |
| 1010 | kfree(cie); | 1010 | kfree(cie); |
| 1011 | 1011 | ||
| 1012 | if (dwarf_reg_pool) | ||
| 1013 | mempool_destroy(dwarf_reg_pool); | ||
| 1014 | if (dwarf_frame_pool) | ||
| 1015 | mempool_destroy(dwarf_frame_pool); | ||
| 1012 | kmem_cache_destroy(dwarf_reg_cachep); | 1016 | kmem_cache_destroy(dwarf_reg_cachep); |
| 1013 | kmem_cache_destroy(dwarf_frame_cachep); | 1017 | kmem_cache_destroy(dwarf_frame_cachep); |
| 1014 | } | 1018 | } |
| @@ -1176,17 +1180,13 @@ static int __init dwarf_unwinder_init(void) | |||
| 1176 | sizeof(struct dwarf_reg), 0, | 1180 | sizeof(struct dwarf_reg), 0, |
| 1177 | SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); | 1181 | SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); |
| 1178 | 1182 | ||
| 1179 | dwarf_frame_pool = mempool_create(DWARF_FRAME_MIN_REQ, | 1183 | dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, |
| 1180 | mempool_alloc_slab, | 1184 | dwarf_frame_cachep); |
| 1181 | mempool_free_slab, | ||
| 1182 | dwarf_frame_cachep); | ||
| 1183 | if (!dwarf_frame_pool) | 1185 | if (!dwarf_frame_pool) |
| 1184 | goto out; | 1186 | goto out; |
| 1185 | 1187 | ||
| 1186 | dwarf_reg_pool = mempool_create(DWARF_REG_MIN_REQ, | 1188 | dwarf_reg_pool = mempool_create_slab_pool(DWARF_REG_MIN_REQ, |
| 1187 | mempool_alloc_slab, | 1189 | dwarf_reg_cachep); |
| 1188 | mempool_free_slab, | ||
| 1189 | dwarf_reg_cachep); | ||
| 1190 | if (!dwarf_reg_pool) | 1190 | if (!dwarf_reg_pool) |
| 1191 | goto out; | 1191 | goto out; |
| 1192 | 1192 | ||
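
The sh dwarf unwinder change is unrelated to page tables: it swaps the open-coded mempool_create(..., mempool_alloc_slab, mempool_free_slab, cache) calls for the mempool_create_slab_pool() shorthand and tears the pools down on the cleanup path, now marked __init. A kernel-context sketch of the shorthand (it will not build in user space); the cache name, object type and minimum count are made up for illustration and are not from this patch:

/* Kernel-context sketch only: mempool_create_slab_pool() is equivalent
 * to mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, cache). */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct demo_obj { int x; };

static struct kmem_cache *demo_cachep;
static mempool_t *demo_pool;

static int __init demo_pool_init(void)
{
	demo_cachep = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
					0, SLAB_HWCACHE_ALIGN, NULL);
	if (!demo_cachep)
		return -ENOMEM;

	demo_pool = mempool_create_slab_pool(8, demo_cachep);
	if (!demo_pool) {
		kmem_cache_destroy(demo_cachep);
		return -ENOMEM;
	}
	return 0;
}

static void demo_pool_exit(void)
{
	mempool_destroy(demo_pool);
	kmem_cache_destroy(demo_cachep);
}
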
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index efb00ec75805..e49502acbab4 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig | |||
| @@ -146,6 +146,10 @@ config GENERIC_ISA_DMA | |||
| 146 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC | 146 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC |
| 147 | def_bool y if SPARC64 | 147 | def_bool y if SPARC64 |
| 148 | 148 | ||
| 149 | config PGTABLE_LEVELS | ||
| 150 | default 4 if 64BIT | ||
| 151 | default 3 | ||
| 152 | |||
| 149 | source "init/Kconfig" | 153 | source "init/Kconfig" |
| 150 | 154 | ||
| 151 | source "kernel/Kconfig.freezer" | 155 | source "kernel/Kconfig.freezer" |
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 99632a87e697..26c80e18d7b1 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c | |||
| @@ -130,26 +130,26 @@ static struct mdesc_mem_ops memblock_mdesc_ops = { | |||
| 130 | static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) | 130 | static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) |
| 131 | { | 131 | { |
| 132 | unsigned int handle_size; | 132 | unsigned int handle_size; |
| 133 | struct mdesc_handle *hp; | ||
| 134 | unsigned long addr; | ||
| 133 | void *base; | 135 | void *base; |
| 134 | 136 | ||
| 135 | handle_size = (sizeof(struct mdesc_handle) - | 137 | handle_size = (sizeof(struct mdesc_handle) - |
| 136 | sizeof(struct mdesc_hdr) + | 138 | sizeof(struct mdesc_hdr) + |
| 137 | mdesc_size); | 139 | mdesc_size); |
| 138 | 140 | ||
| 141 | /* | ||
| 142 | * Allocation has to succeed because mdesc update would be missed | ||
| 143 | * and such events are not retransmitted. | ||
| 144 | */ | ||
| 139 | base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL); | 145 | base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL); |
| 140 | if (base) { | 146 | addr = (unsigned long)base; |
| 141 | struct mdesc_handle *hp; | 147 | addr = (addr + 15UL) & ~15UL; |
| 142 | unsigned long addr; | 148 | hp = (struct mdesc_handle *) addr; |
| 143 | |||
| 144 | addr = (unsigned long)base; | ||
| 145 | addr = (addr + 15UL) & ~15UL; | ||
| 146 | hp = (struct mdesc_handle *) addr; | ||
| 147 | 149 | ||
| 148 | mdesc_handle_init(hp, handle_size, base); | 150 | mdesc_handle_init(hp, handle_size, base); |
| 149 | return hp; | ||
| 150 | } | ||
| 151 | 151 | ||
| 152 | return NULL; | 152 | return hp; |
| 153 | } | 153 | } |
| 154 | 154 | ||
| 155 | static void mdesc_kfree(struct mdesc_handle *hp) | 155 | static void mdesc_kfree(struct mdesc_handle *hp) |
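
The mdesc_kmalloc() rewrite relies on __GFP_NOFAIL, so the NULL check and the nested block it forced disappear, and the new comment records why the allocation must not fail. The 16-byte alignment fix-up is ordinary round-up arithmetic, sketched below in user space with malloc() in place of kmalloc():

/* Illustrative only: the over-allocate-and-round-up idiom used by
 * mdesc_kmalloc() above. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t need = 100;
	void *base = malloc(need + 15);         /* slack for alignment */
	uintptr_t addr = (uintptr_t)base;

	if (!base)
		return 1;
	addr = (addr + 15) & ~(uintptr_t)15;    /* round up to a 16-byte boundary */
	printf("raw %p -> aligned %#lx\n", base, (unsigned long)addr);

	free(base);                             /* always free the original pointer */
	return 0;
}
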
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 7cca41842a9e..0142d578b5a8 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig | |||
| @@ -147,6 +147,11 @@ config ARCH_DEFCONFIG | |||
| 147 | default "arch/tile/configs/tilepro_defconfig" if !TILEGX | 147 | default "arch/tile/configs/tilepro_defconfig" if !TILEGX |
| 148 | default "arch/tile/configs/tilegx_defconfig" if TILEGX | 148 | default "arch/tile/configs/tilegx_defconfig" if TILEGX |
| 149 | 149 | ||
| 150 | config PGTABLE_LEVELS | ||
| 151 | int | ||
| 152 | default 3 if 64BIT | ||
| 153 | default 2 | ||
| 154 | |||
| 150 | source "init/Kconfig" | 155 | source "init/Kconfig" |
| 151 | 156 | ||
| 152 | source "kernel/Kconfig.freezer" | 157 | source "kernel/Kconfig.freezer" |
diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index a7520c90f62d..5dbfe3d9107c 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um | |||
| @@ -155,3 +155,8 @@ config MMAPPER | |||
| 155 | 155 | ||
| 156 | config NO_DMA | 156 | config NO_DMA |
| 157 | def_bool y | 157 | def_bool y |
| 158 | |||
| 159 | config PGTABLE_LEVELS | ||
| 160 | int | ||
| 161 | default 3 if 3_LEVEL_PGTABLES | ||
| 162 | default 2 | ||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faff6934c05a..d43e7e1c784b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -87,7 +87,7 @@ config X86 | |||
| 87 | select HAVE_ARCH_KMEMCHECK | 87 | select HAVE_ARCH_KMEMCHECK |
| 88 | select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP | 88 | select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP |
| 89 | select HAVE_USER_RETURN_NOTIFIER | 89 | select HAVE_USER_RETURN_NOTIFIER |
| 90 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 90 | select ARCH_HAS_ELF_RANDOMIZE |
| 91 | select HAVE_ARCH_JUMP_LABEL | 91 | select HAVE_ARCH_JUMP_LABEL |
| 92 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 92 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
| 93 | select SPARSE_IRQ | 93 | select SPARSE_IRQ |
| @@ -99,6 +99,7 @@ config X86 | |||
| 99 | select IRQ_FORCED_THREADING | 99 | select IRQ_FORCED_THREADING |
| 100 | select HAVE_BPF_JIT if X86_64 | 100 | select HAVE_BPF_JIT if X86_64 |
| 101 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE | 101 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE |
| 102 | select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) | ||
| 102 | select ARCH_HAS_SG_CHAIN | 103 | select ARCH_HAS_SG_CHAIN |
| 103 | select CLKEVT_I8253 | 104 | select CLKEVT_I8253 |
| 104 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 105 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
| @@ -277,6 +278,12 @@ config ARCH_SUPPORTS_UPROBES | |||
| 277 | config FIX_EARLYCON_MEM | 278 | config FIX_EARLYCON_MEM |
| 278 | def_bool y | 279 | def_bool y |
| 279 | 280 | ||
| 281 | config PGTABLE_LEVELS | ||
| 282 | int | ||
| 283 | default 4 if X86_64 | ||
| 284 | default 3 if X86_PAE | ||
| 285 | default 2 | ||
| 286 | |||
| 280 | source "init/Kconfig" | 287 | source "init/Kconfig" |
| 281 | source "kernel/Kconfig.freezer" | 288 | source "kernel/Kconfig.freezer" |
| 282 | 289 | ||
| @@ -714,17 +721,6 @@ endif #HYPERVISOR_GUEST | |||
| 714 | config NO_BOOTMEM | 721 | config NO_BOOTMEM |
| 715 | def_bool y | 722 | def_bool y |
| 716 | 723 | ||
| 717 | config MEMTEST | ||
| 718 | bool "Memtest" | ||
| 719 | ---help--- | ||
| 720 | This option adds a kernel parameter 'memtest', which allows memtest | ||
| 721 | to be set. | ||
| 722 | memtest=0, mean disabled; -- default | ||
| 723 | memtest=1, mean do 1 test pattern; | ||
| 724 | ... | ||
| 725 | memtest=4, mean do 4 test patterns. | ||
| 726 | If you are unsure how to answer this question, answer N. | ||
| 727 | |||
| 728 | source "arch/x86/Kconfig.cpu" | 724 | source "arch/x86/Kconfig.cpu" |
| 729 | 725 | ||
| 730 | config HPET_TIMER | 726 | config HPET_TIMER |
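
The MEMTEST Kconfig entry leaves arch/x86 here because the series moves memtest into common mm code; the generic side of the move is not part of this excerpt. The test itself is a simple pattern fill-and-verify pass over free memory. A rough user-space illustration of that idea — not the kernel implementation, which walks physical ranges via memblock and reserves anything that reads back wrong:

/* Rough illustration only of the fill-and-verify idea behind a memtest pass. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t words = 1 << 20;
	unsigned long *buf = malloc(words * sizeof(*buf));
	unsigned long pattern = 0x5555555555555555UL;
	size_t i, bad = 0;

	if (!buf)
		return 1;
	for (i = 0; i < words; i++)
		buf[i] = pattern;               /* fill */
	for (i = 0; i < words; i++)
		if (buf[i] != pattern)          /* verify */
			bad++;
	printf("%zu bad words\n", bad);
	free(buf);
	return 0;
}
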
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 779c2efe2e97..3ab0537872fb 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h | |||
| @@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) | |||
| 40 | } | 40 | } |
| 41 | #endif | 41 | #endif |
| 42 | 42 | ||
| 43 | #ifdef CONFIG_MEMTEST | ||
| 44 | extern void early_memtest(unsigned long start, unsigned long end); | ||
| 45 | #else | ||
| 46 | static inline void early_memtest(unsigned long start, unsigned long end) | ||
| 47 | { | ||
| 48 | } | ||
| 49 | #endif | ||
| 50 | |||
| 51 | extern unsigned long e820_end_of_ram_pfn(void); | 43 | extern unsigned long e820_end_of_ram_pfn(void); |
| 52 | extern unsigned long e820_end_of_low_ram_pfn(void); | 44 | extern unsigned long e820_end_of_low_ram_pfn(void); |
| 53 | extern u64 early_reserve_e820(u64 sizet, u64 align); | 45 | extern u64 early_reserve_e820(u64 sizet, u64 align); |
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 935588d95c82..f161c189c27b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h | |||
| @@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, | |||
| 339 | int uses_interp); | 339 | int uses_interp); |
| 340 | #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages | 340 | #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages |
| 341 | 341 | ||
| 342 | extern unsigned long arch_randomize_brk(struct mm_struct *mm); | ||
| 343 | #define arch_randomize_brk arch_randomize_brk | ||
| 344 | |||
| 345 | /* | 342 | /* |
| 346 | * True on X86_32 or when emulating IA32 on X86_64 | 343 | * True on X86_32 or when emulating IA32 on X86_64 |
| 347 | */ | 344 | */ |
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..c7c712f2648b 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h | |||
| @@ -40,8 +40,10 @@ | |||
| 40 | 40 | ||
| 41 | #ifdef CONFIG_X86_64 | 41 | #ifdef CONFIG_X86_64 |
| 42 | #include <asm/page_64_types.h> | 42 | #include <asm/page_64_types.h> |
| 43 | #define IOREMAP_MAX_ORDER (PUD_SHIFT) | ||
| 43 | #else | 44 | #else |
| 44 | #include <asm/page_32_types.h> | 45 | #include <asm/page_32_types.h> |
| 46 | #define IOREMAP_MAX_ORDER (PMD_SHIFT) | ||
| 45 | #endif /* CONFIG_X86_64 */ | 47 | #endif /* CONFIG_X86_64 */ |
| 46 | 48 | ||
| 47 | #ifndef __ASSEMBLY__ | 49 | #ifndef __ASSEMBLY__ |
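
IOREMAP_MAX_ORDER caps the largest mapping granule ioremap() may use: PUD_SHIFT (1 GB pages) on 64-bit and PMD_SHIFT on 32-bit (2 MB with PAE, 4 MB without). A quick user-space check of the implied sizes; the shift values below are typical x86 assumptions, not taken from this hunk:

/* Illustrative only: mapping sizes implied by IOREMAP_MAX_ORDER above. */
#include <stdio.h>

#define PUD_SHIFT 30   /* x86_64 */
#define PMD_SHIFT 21   /* x86_32 with PAE */

int main(void)
{
	printf("64-bit max ioremap granule: %lu MB\n", (1UL << PUD_SHIFT) >> 20);
	printf("32-bit max ioremap granule: %lu MB\n", (1UL << PMD_SHIFT) >> 20);
	return 0;
}
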
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5f6051d5d139..8957810ad7d1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
| @@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) | |||
| 545 | PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); | 545 | PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); |
| 546 | } | 546 | } |
| 547 | 547 | ||
| 548 | #if PAGETABLE_LEVELS >= 3 | 548 | #if CONFIG_PGTABLE_LEVELS >= 3 |
| 549 | static inline pmd_t __pmd(pmdval_t val) | 549 | static inline pmd_t __pmd(pmdval_t val) |
| 550 | { | 550 | { |
| 551 | pmdval_t ret; | 551 | pmdval_t ret; |
| @@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) | |||
| 585 | PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, | 585 | PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, |
| 586 | val); | 586 | val); |
| 587 | } | 587 | } |
| 588 | #if PAGETABLE_LEVELS == 4 | 588 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 589 | static inline pud_t __pud(pudval_t val) | 589 | static inline pud_t __pud(pudval_t val) |
| 590 | { | 590 | { |
| 591 | pudval_t ret; | 591 | pudval_t ret; |
| @@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp) | |||
| 636 | set_pud(pudp, __pud(0)); | 636 | set_pud(pudp, __pud(0)); |
| 637 | } | 637 | } |
| 638 | 638 | ||
| 639 | #endif /* PAGETABLE_LEVELS == 4 */ | 639 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
| 640 | 640 | ||
| 641 | #endif /* PAGETABLE_LEVELS >= 3 */ | 641 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
| 642 | 642 | ||
| 643 | #ifdef CONFIG_X86_PAE | 643 | #ifdef CONFIG_X86_PAE |
| 644 | /* Special-case pte-setting operations for PAE, which can't update a | 644 | /* Special-case pte-setting operations for PAE, which can't update a |
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..f7b0b5c112f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h | |||
| @@ -294,7 +294,7 @@ struct pv_mmu_ops { | |||
| 294 | struct paravirt_callee_save pgd_val; | 294 | struct paravirt_callee_save pgd_val; |
| 295 | struct paravirt_callee_save make_pgd; | 295 | struct paravirt_callee_save make_pgd; |
| 296 | 296 | ||
| 297 | #if PAGETABLE_LEVELS >= 3 | 297 | #if CONFIG_PGTABLE_LEVELS >= 3 |
| 298 | #ifdef CONFIG_X86_PAE | 298 | #ifdef CONFIG_X86_PAE |
| 299 | void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); | 299 | void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); |
| 300 | void (*pte_clear)(struct mm_struct *mm, unsigned long addr, | 300 | void (*pte_clear)(struct mm_struct *mm, unsigned long addr, |
| @@ -308,13 +308,13 @@ struct pv_mmu_ops { | |||
| 308 | struct paravirt_callee_save pmd_val; | 308 | struct paravirt_callee_save pmd_val; |
| 309 | struct paravirt_callee_save make_pmd; | 309 | struct paravirt_callee_save make_pmd; |
| 310 | 310 | ||
| 311 | #if PAGETABLE_LEVELS == 4 | 311 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 312 | struct paravirt_callee_save pud_val; | 312 | struct paravirt_callee_save pud_val; |
| 313 | struct paravirt_callee_save make_pud; | 313 | struct paravirt_callee_save make_pud; |
| 314 | 314 | ||
| 315 | void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); | 315 | void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); |
| 316 | #endif /* PAGETABLE_LEVELS == 4 */ | 316 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
| 317 | #endif /* PAGETABLE_LEVELS >= 3 */ | 317 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
| 318 | 318 | ||
| 319 | struct pv_lazy_ops lazy_mode; | 319 | struct pv_lazy_ops lazy_mode; |
| 320 | 320 | ||
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c4412e972bbd..bf7f8b55b0f9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h | |||
| @@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, | |||
| 77 | 77 | ||
| 78 | #define pmd_pgtable(pmd) pmd_page(pmd) | 78 | #define pmd_pgtable(pmd) pmd_page(pmd) |
| 79 | 79 | ||
| 80 | #if PAGETABLE_LEVELS > 2 | 80 | #if CONFIG_PGTABLE_LEVELS > 2 |
| 81 | static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) | 81 | static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) |
| 82 | { | 82 | { |
| 83 | struct page *page; | 83 | struct page *page; |
| @@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) | |||
| 116 | } | 116 | } |
| 117 | #endif /* CONFIG_X86_PAE */ | 117 | #endif /* CONFIG_X86_PAE */ |
| 118 | 118 | ||
| 119 | #if PAGETABLE_LEVELS > 3 | 119 | #if CONFIG_PGTABLE_LEVELS > 3 |
| 120 | static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) | 120 | static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) |
| 121 | { | 121 | { |
| 122 | paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); | 122 | paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); |
| @@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, | |||
| 142 | ___pud_free_tlb(tlb, pud); | 142 | ___pud_free_tlb(tlb, pud); |
| 143 | } | 143 | } |
| 144 | 144 | ||
| 145 | #endif /* PAGETABLE_LEVELS > 3 */ | 145 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
| 146 | #endif /* PAGETABLE_LEVELS > 2 */ | 146 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
| 147 | 147 | ||
| 148 | #endif /* _ASM_X86_PGALLOC_H */ | 148 | #endif /* _ASM_X86_PGALLOC_H */ |
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index daacc23e3fb9..392576433e77 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h | |||
| @@ -17,7 +17,6 @@ typedef union { | |||
| 17 | #endif /* !__ASSEMBLY__ */ | 17 | #endif /* !__ASSEMBLY__ */ |
| 18 | 18 | ||
| 19 | #define SHARED_KERNEL_PMD 0 | 19 | #define SHARED_KERNEL_PMD 0 |
| 20 | #define PAGETABLE_LEVELS 2 | ||
| 21 | 20 | ||
| 22 | /* | 21 | /* |
| 23 | * traditional i386 two-level paging structure: | 22 | * traditional i386 two-level paging structure: |
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 1bd5876c8649..bcc89625ebe5 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h | |||
| @@ -24,8 +24,6 @@ typedef union { | |||
| 24 | #define SHARED_KERNEL_PMD 1 | 24 | #define SHARED_KERNEL_PMD 1 |
| 25 | #endif | 25 | #endif |
| 26 | 26 | ||
| 27 | #define PAGETABLE_LEVELS 3 | ||
| 28 | |||
| 29 | /* | 27 | /* |
| 30 | * PGDIR_SHIFT determines what a top-level page table entry can map | 28 | * PGDIR_SHIFT determines what a top-level page table entry can map |
| 31 | */ | 29 | */ |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a0c35bf6cb92..fe57e7a98839 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
| @@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) | |||
| 551 | return npg >> (20 - PAGE_SHIFT); | 551 | return npg >> (20 - PAGE_SHIFT); |
| 552 | } | 552 | } |
| 553 | 553 | ||
| 554 | #if PAGETABLE_LEVELS > 2 | 554 | #if CONFIG_PGTABLE_LEVELS > 2 |
| 555 | static inline int pud_none(pud_t pud) | 555 | static inline int pud_none(pud_t pud) |
| 556 | { | 556 | { |
| 557 | return native_pud_val(pud) == 0; | 557 | return native_pud_val(pud) == 0; |
| @@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud) | |||
| 594 | { | 594 | { |
| 595 | return 0; | 595 | return 0; |
| 596 | } | 596 | } |
| 597 | #endif /* PAGETABLE_LEVELS > 2 */ | 597 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
| 598 | 598 | ||
| 599 | #if PAGETABLE_LEVELS > 3 | 599 | #if CONFIG_PGTABLE_LEVELS > 3 |
| 600 | static inline int pgd_present(pgd_t pgd) | 600 | static inline int pgd_present(pgd_t pgd) |
| 601 | { | 601 | { |
| 602 | return pgd_flags(pgd) & _PAGE_PRESENT; | 602 | return pgd_flags(pgd) & _PAGE_PRESENT; |
| @@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd) | |||
| 633 | { | 633 | { |
| 634 | return !native_pgd_val(pgd); | 634 | return !native_pgd_val(pgd); |
| 635 | } | 635 | } |
| 636 | #endif /* PAGETABLE_LEVELS > 3 */ | 636 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
| 637 | 637 | ||
| 638 | #endif /* __ASSEMBLY__ */ | 638 | #endif /* __ASSEMBLY__ */ |
| 639 | 639 | ||
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 602b6028c5b6..e6844dfb4471 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h | |||
| @@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t; | |||
| 20 | #endif /* !__ASSEMBLY__ */ | 20 | #endif /* !__ASSEMBLY__ */ |
| 21 | 21 | ||
| 22 | #define SHARED_KERNEL_PMD 0 | 22 | #define SHARED_KERNEL_PMD 0 |
| 23 | #define PAGETABLE_LEVELS 4 | ||
| 24 | 23 | ||
| 25 | /* | 24 | /* |
| 26 | * PGDIR_SHIFT determines what a top-level page table entry can map | 25 | * PGDIR_SHIFT determines what a top-level page table entry can map |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8c7c10802e9c..78f0c8cbe316 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
| @@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd) | |||
| 234 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; | 234 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; |
| 235 | } | 235 | } |
| 236 | 236 | ||
| 237 | #if PAGETABLE_LEVELS > 3 | 237 | #if CONFIG_PGTABLE_LEVELS > 3 |
| 238 | typedef struct { pudval_t pud; } pud_t; | 238 | typedef struct { pudval_t pud; } pud_t; |
| 239 | 239 | ||
| 240 | static inline pud_t native_make_pud(pmdval_t val) | 240 | static inline pud_t native_make_pud(pmdval_t val) |
| @@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud) | |||
| 255 | } | 255 | } |
| 256 | #endif | 256 | #endif |
| 257 | 257 | ||
| 258 | #if PAGETABLE_LEVELS > 2 | 258 | #if CONFIG_PGTABLE_LEVELS > 2 |
| 259 | typedef struct { pmdval_t pmd; } pmd_t; | 259 | typedef struct { pmdval_t pmd; } pmd_t; |
| 260 | 260 | ||
| 261 | static inline pmd_t native_make_pmd(pmdval_t val) | 261 | static inline pmd_t native_make_pmd(pmdval_t val) |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e354cc6446ab..9435620062df 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
| @@ -513,7 +513,7 @@ void __init kvm_guest_init(void) | |||
| 513 | * can get false positives too easily, for example if the host is | 513 | * can get false positives too easily, for example if the host is |
| 514 | * overcommitted. | 514 | * overcommitted. |
| 515 | */ | 515 | */ |
| 516 | watchdog_enable_hardlockup_detector(false); | 516 | hardlockup_detector_disable(); |
| 517 | } | 517 | } |
| 518 | 518 | ||
| 519 | static noinline uint32_t __kvm_cpuid_base(void) | 519 | static noinline uint32_t __kvm_cpuid_base(void) |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..c614dd492f5f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
| @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
| 443 | .ptep_modify_prot_start = __ptep_modify_prot_start, | 443 | .ptep_modify_prot_start = __ptep_modify_prot_start, |
| 444 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, | 444 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, |
| 445 | 445 | ||
| 446 | #if PAGETABLE_LEVELS >= 3 | 446 | #if CONFIG_PGTABLE_LEVELS >= 3 |
| 447 | #ifdef CONFIG_X86_PAE | 447 | #ifdef CONFIG_X86_PAE |
| 448 | .set_pte_atomic = native_set_pte_atomic, | 448 | .set_pte_atomic = native_set_pte_atomic, |
| 449 | .pte_clear = native_pte_clear, | 449 | .pte_clear = native_pte_clear, |
| @@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
| 454 | .pmd_val = PTE_IDENT, | 454 | .pmd_val = PTE_IDENT, |
| 455 | .make_pmd = PTE_IDENT, | 455 | .make_pmd = PTE_IDENT, |
| 456 | 456 | ||
| 457 | #if PAGETABLE_LEVELS == 4 | 457 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 458 | .pud_val = PTE_IDENT, | 458 | .pud_val = PTE_IDENT, |
| 459 | .make_pud = PTE_IDENT, | 459 | .make_pud = PTE_IDENT, |
| 460 | 460 | ||
| 461 | .set_pgd = native_set_pgd, | 461 | .set_pgd = native_set_pgd, |
| 462 | #endif | 462 | #endif |
| 463 | #endif /* PAGETABLE_LEVELS >= 3 */ | 463 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
| 464 | 464 | ||
| 465 | .pte_val = PTE_IDENT, | 465 | .pte_val = PTE_IDENT, |
| 466 | .pgd_val = PTE_IDENT, | 466 | .pgd_val = PTE_IDENT, |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c4cc74006c61..a482d105172b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
| @@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o | |||
| 32 | obj-$(CONFIG_ACPI_NUMA) += srat.o | 32 | obj-$(CONFIG_ACPI_NUMA) += srat.o |
| 33 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | 33 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
| 34 | 34 | ||
| 35 | obj-$(CONFIG_MEMTEST) += memtest.o | ||
| 36 | |||
| 37 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o | 35 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fdf617c00e2f..5ead4d6cf3a7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
| @@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, | |||
| 67 | 67 | ||
| 68 | /* | 68 | /* |
| 69 | * Remap an arbitrary physical address space into the kernel virtual | 69 | * Remap an arbitrary physical address space into the kernel virtual |
| 70 | * address space. Needed when the kernel wants to access high addresses | 70 | * address space. It transparently creates kernel huge I/O mapping when |
| 71 | * directly. | 71 | * the physical address is aligned by a huge page size (1GB or 2MB) and |
| 72 | * the requested size is at least the huge page size. | ||
| 73 | * | ||
| 74 | * NOTE: MTRRs can override PAT memory types with a 4KB granularity. | ||
| 75 | * Therefore, the mapping code falls back to use a smaller page toward 4KB | ||
| 76 | * when a mapping range is covered by non-WB type of MTRRs. | ||
| 72 | * | 77 | * |
| 73 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | 78 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously |
| 74 | * have to convert them into an offset in a page-aligned mapping, but the | 79 | * have to convert them into an offset in a page-aligned mapping, but the |
| @@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr) | |||
| 326 | } | 331 | } |
| 327 | EXPORT_SYMBOL(iounmap); | 332 | EXPORT_SYMBOL(iounmap); |
| 328 | 333 | ||
| 334 | int arch_ioremap_pud_supported(void) | ||
| 335 | { | ||
| 336 | #ifdef CONFIG_X86_64 | ||
| 337 | return cpu_has_gbpages; | ||
| 338 | #else | ||
| 339 | return 0; | ||
| 340 | #endif | ||
| 341 | } | ||
| 342 | |||
| 343 | int arch_ioremap_pmd_supported(void) | ||
| 344 | { | ||
| 345 | return cpu_has_pse; | ||
| 346 | } | ||
| 347 | |||
| 329 | /* | 348 | /* |
| 330 | * Convert a physical pointer to a virtual kernel pointer for /dev/mem | 349 | * Convert a physical pointer to a virtual kernel pointer for /dev/mem |
| 331 | * access | 350 | * access |
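
The two new predicates let generic code ask whether the CPU can actually back a huge ioremap: gbpages for 1 GB (PUD) mappings on 64-bit, PSE for 2 MB (PMD) mappings. A hedged kernel-context sketch of how a generic caller might consult them; only the two predicate names come from this patch, the caller, its policy and the shift macros are assumed from standard kernel headers:

/* Kernel-context sketch, not from this patch: one plausible way generic
 * ioremap code could pick a mapping granule from the new predicates. */
static unsigned long pick_ioremap_granule(unsigned long phys, unsigned long size)
{
	if (arch_ioremap_pud_supported() &&
	    !(phys & ((1UL << PUD_SHIFT) - 1)) && size >= (1UL << PUD_SHIFT))
		return 1UL << PUD_SHIFT;        /* 1 GB mapping */

	if (arch_ioremap_pmd_supported() &&
	    !(phys & ((1UL << PMD_SHIFT) - 1)) && size >= (1UL << PMD_SHIFT))
		return 1UL << PMD_SHIFT;        /* 2 MB mapping */

	return 1UL << PAGE_SHIFT;               /* fall back to 4 KB */
}
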
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index df4552bd239e..9d518d693b4b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c | |||
| @@ -65,24 +65,23 @@ static int mmap_is_legacy(void) | |||
| 65 | return sysctl_legacy_va_layout; | 65 | return sysctl_legacy_va_layout; |
| 66 | } | 66 | } |
| 67 | 67 | ||
| 68 | static unsigned long mmap_rnd(void) | 68 | unsigned long arch_mmap_rnd(void) |
| 69 | { | 69 | { |
| 70 | unsigned long rnd = 0; | 70 | unsigned long rnd; |
| 71 | 71 | ||
| 72 | /* | 72 | /* |
| 73 | * 8 bits of randomness in 32bit mmaps, 20 address space bits | 73 | * 8 bits of randomness in 32bit mmaps, 20 address space bits |
| 74 | * 28 bits of randomness in 64bit mmaps, 40 address space bits | 74 | * 28 bits of randomness in 64bit mmaps, 40 address space bits |
| 75 | */ | 75 | */ |
| 76 | if (current->flags & PF_RANDOMIZE) { | 76 | if (mmap_is_ia32()) |
| 77 | if (mmap_is_ia32()) | 77 | rnd = (unsigned long)get_random_int() % (1<<8); |
| 78 | rnd = get_random_int() % (1<<8); | 78 | else |
| 79 | else | 79 | rnd = (unsigned long)get_random_int() % (1<<28); |
| 80 | rnd = get_random_int() % (1<<28); | 80 | |
| 81 | } | ||
| 82 | return rnd << PAGE_SHIFT; | 81 | return rnd << PAGE_SHIFT; |
| 83 | } | 82 | } |
| 84 | 83 | ||
| 85 | static unsigned long mmap_base(void) | 84 | static unsigned long mmap_base(unsigned long rnd) |
| 86 | { | 85 | { |
| 87 | unsigned long gap = rlimit(RLIMIT_STACK); | 86 | unsigned long gap = rlimit(RLIMIT_STACK); |
| 88 | 87 | ||
| @@ -91,19 +90,19 @@ static unsigned long mmap_base(void) | |||
| 91 | else if (gap > MAX_GAP) | 90 | else if (gap > MAX_GAP) |
| 92 | gap = MAX_GAP; | 91 | gap = MAX_GAP; |
| 93 | 92 | ||
| 94 | return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); | 93 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); |
| 95 | } | 94 | } |
| 96 | 95 | ||
| 97 | /* | 96 | /* |
| 98 | * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 | 97 | * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 |
| 99 | * does, but not when emulating X86_32 | 98 | * does, but not when emulating X86_32 |
| 100 | */ | 99 | */ |
| 101 | static unsigned long mmap_legacy_base(void) | 100 | static unsigned long mmap_legacy_base(unsigned long rnd) |
| 102 | { | 101 | { |
| 103 | if (mmap_is_ia32()) | 102 | if (mmap_is_ia32()) |
| 104 | return TASK_UNMAPPED_BASE; | 103 | return TASK_UNMAPPED_BASE; |
| 105 | else | 104 | else |
| 106 | return TASK_UNMAPPED_BASE + mmap_rnd(); | 105 | return TASK_UNMAPPED_BASE + rnd; |
| 107 | } | 106 | } |
| 108 | 107 | ||
| 109 | /* | 108 | /* |
| @@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void) | |||
| 112 | */ | 111 | */ |
| 113 | void arch_pick_mmap_layout(struct mm_struct *mm) | 112 | void arch_pick_mmap_layout(struct mm_struct *mm) |
| 114 | { | 113 | { |
| 115 | mm->mmap_legacy_base = mmap_legacy_base(); | 114 | unsigned long random_factor = 0UL; |
| 116 | mm->mmap_base = mmap_base(); | 115 | |
| 116 | if (current->flags & PF_RANDOMIZE) | ||
| 117 | random_factor = arch_mmap_rnd(); | ||
| 118 | |||
| 119 | mm->mmap_legacy_base = mmap_legacy_base(random_factor); | ||
| 117 | 120 | ||
| 118 | if (mmap_is_legacy()) { | 121 | if (mmap_is_legacy()) { |
| 119 | mm->mmap_base = mm->mmap_legacy_base; | 122 | mm->mmap_base = mm->mmap_legacy_base; |
| 120 | mm->get_unmapped_area = arch_get_unmapped_area; | 123 | mm->get_unmapped_area = arch_get_unmapped_area; |
| 121 | } else { | 124 | } else { |
| 125 | mm->mmap_base = mmap_base(random_factor); | ||
| 122 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | 126 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; |
| 123 | } | 127 | } |
| 124 | } | 128 | } |
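The comment retained above encodes the arithmetic: 8 random bits shifted left by PAGE_SHIFT randomize bits 12-19 of the mmap base, 28 bits randomize bits 12-39. A small standalone program (plain C, assuming 4KB pages, i.e. PAGE_SHIFT == 12) that just prints the resulting spans:

#include <stdio.h>

int main(void)
{
	unsigned long long page_shift = 12;  /* PAGE_SHIFT for 4KB pages */

	/* 8 random bits << 12: offsets cover 2^20 bytes (bits 12..19 vary) */
	printf("32-bit span: %llu bytes\n", (1ULL << 8) << page_shift);
	/* 28 random bits << 12: offsets cover 2^40 bytes (bits 12..39 vary) */
	printf("64-bit span: %llu bytes\n", (1ULL << 28) << page_shift);
	return 0;
}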
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5a7e5252c878..0b97d2c75df3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | #include <asm/pgtable.h> | 4 | #include <asm/pgtable.h> |
| 5 | #include <asm/tlb.h> | 5 | #include <asm/tlb.h> |
| 6 | #include <asm/fixmap.h> | 6 | #include <asm/fixmap.h> |
| 7 | #include <asm/mtrr.h> | ||
| 7 | 8 | ||
| 8 | #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | 9 | #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO |
| 9 | 10 | ||
| @@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | |||
| 58 | tlb_remove_page(tlb, pte); | 59 | tlb_remove_page(tlb, pte); |
| 59 | } | 60 | } |
| 60 | 61 | ||
| 61 | #if PAGETABLE_LEVELS > 2 | 62 | #if CONFIG_PGTABLE_LEVELS > 2 |
| 62 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | 63 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) |
| 63 | { | 64 | { |
| 64 | struct page *page = virt_to_page(pmd); | 65 | struct page *page = virt_to_page(pmd); |
| @@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | |||
| 74 | tlb_remove_page(tlb, page); | 75 | tlb_remove_page(tlb, page); |
| 75 | } | 76 | } |
| 76 | 77 | ||
| 77 | #if PAGETABLE_LEVELS > 3 | 78 | #if CONFIG_PGTABLE_LEVELS > 3 |
| 78 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) | 79 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) |
| 79 | { | 80 | { |
| 80 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); | 81 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); |
| 81 | tlb_remove_page(tlb, virt_to_page(pud)); | 82 | tlb_remove_page(tlb, virt_to_page(pud)); |
| 82 | } | 83 | } |
| 83 | #endif /* PAGETABLE_LEVELS > 3 */ | 84 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
| 84 | #endif /* PAGETABLE_LEVELS > 2 */ | 85 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
| 85 | 86 | ||
| 86 | static inline void pgd_list_add(pgd_t *pgd) | 87 | static inline void pgd_list_add(pgd_t *pgd) |
| 87 | { | 88 | { |
| @@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) | |||
| 117 | /* If the pgd points to a shared pagetable level (either the | 118 | /* If the pgd points to a shared pagetable level (either the |
| 118 | ptes in non-PAE, or shared PMD in PAE), then just copy the | 119 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
| 119 | references from swapper_pg_dir. */ | 120 | references from swapper_pg_dir. */ |
| 120 | if (PAGETABLE_LEVELS == 2 || | 121 | if (CONFIG_PGTABLE_LEVELS == 2 || |
| 121 | (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || | 122 | (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || |
| 122 | PAGETABLE_LEVELS == 4) { | 123 | CONFIG_PGTABLE_LEVELS == 4) { |
| 123 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, | 124 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, |
| 124 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | 125 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
| 125 | KERNEL_PGD_PTRS); | 126 | KERNEL_PGD_PTRS); |
| @@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, | |||
| 560 | { | 561 | { |
| 561 | __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); | 562 | __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); |
| 562 | } | 563 | } |
| 564 | |||
| 565 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP | ||
| 566 | int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) | ||
| 567 | { | ||
| 568 | u8 mtrr; | ||
| 569 | |||
| 570 | /* | ||
| 571 | * Do not use a huge page when the range is covered by non-WB type | ||
| 572 | * of MTRRs. | ||
| 573 | */ | ||
| 574 | mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); | ||
| 575 | if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) | ||
| 576 | return 0; | ||
| 577 | |||
| 578 | prot = pgprot_4k_2_large(prot); | ||
| 579 | |||
| 580 | set_pte((pte_t *)pud, pfn_pte( | ||
| 581 | (u64)addr >> PAGE_SHIFT, | ||
| 582 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
| 583 | |||
| 584 | return 1; | ||
| 585 | } | ||
| 586 | |||
| 587 | int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) | ||
| 588 | { | ||
| 589 | u8 mtrr; | ||
| 590 | |||
| 591 | /* | ||
| 592 | * Do not use a huge page when the range is covered by non-WB type | ||
| 593 | * of MTRRs. | ||
| 594 | */ | ||
| 595 | mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); | ||
| 596 | if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) | ||
| 597 | return 0; | ||
| 598 | |||
| 599 | prot = pgprot_4k_2_large(prot); | ||
| 600 | |||
| 601 | set_pte((pte_t *)pmd, pfn_pte( | ||
| 602 | (u64)addr >> PAGE_SHIFT, | ||
| 603 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
| 604 | |||
| 605 | return 1; | ||
| 606 | } | ||
| 607 | |||
| 608 | int pud_clear_huge(pud_t *pud) | ||
| 609 | { | ||
| 610 | if (pud_large(*pud)) { | ||
| 611 | pud_clear(pud); | ||
| 612 | return 1; | ||
| 613 | } | ||
| 614 | |||
| 615 | return 0; | ||
| 616 | } | ||
| 617 | |||
| 618 | int pmd_clear_huge(pmd_t *pmd) | ||
| 619 | { | ||
| 620 | if (pmd_large(*pmd)) { | ||
| 621 | pmd_clear(pmd); | ||
| 622 | return 1; | ||
| 623 | } | ||
| 624 | |||
| 625 | return 0; | ||
| 626 | } | ||
| 627 | #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ | ||
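pud_set_huge()/pmd_set_huge() return 1 when they installed a large entry and 0 when the caller must fall back, for example because a non-WB MTRR covers the range. A hedged sketch of a caller, with map_ptes_4k() as a hypothetical stand-in for the 4KB fallback path:

static int map_pmd_range(pmd_t *pmd, phys_addr_t phys, unsigned long size,
			 pgprot_t prot)
{
	if (size >= PMD_SIZE && !(phys & ~PMD_MASK) &&
	    pmd_set_huge(pmd, phys, prot))
		return 0;                       /* mapped with one 2MB entry */

	return map_ptes_4k(pmd, phys, size, prot);  /* hypothetical fallback */
}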
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index adca9e2b6553..65083ad63b6f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
| @@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) | |||
| 502 | } | 502 | } |
| 503 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); | 503 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); |
| 504 | 504 | ||
| 505 | #if PAGETABLE_LEVELS == 4 | 505 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 506 | __visible pudval_t xen_pud_val(pud_t pud) | 506 | __visible pudval_t xen_pud_val(pud_t pud) |
| 507 | { | 507 | { |
| 508 | return pte_mfn_to_pfn(pud.pud); | 508 | return pte_mfn_to_pfn(pud.pud); |
| @@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
| 589 | 589 | ||
| 590 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 590 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
| 591 | } | 591 | } |
| 592 | #endif /* PAGETABLE_LEVELS == 4 */ | 592 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
| 593 | 593 | ||
| 594 | /* | 594 | /* |
| 595 | * (Yet another) pagetable walker. This one is intended for pinning a | 595 | * (Yet another) pagetable walker. This one is intended for pinning a |
| @@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn) | |||
| 1628 | xen_release_ptpage(pfn, PT_PMD); | 1628 | xen_release_ptpage(pfn, PT_PMD); |
| 1629 | } | 1629 | } |
| 1630 | 1630 | ||
| 1631 | #if PAGETABLE_LEVELS == 4 | 1631 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 1632 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) | 1632 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) |
| 1633 | { | 1633 | { |
| 1634 | xen_alloc_ptpage(mm, pfn, PT_PUD); | 1634 | xen_alloc_ptpage(mm, pfn, PT_PUD); |
| @@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void) | |||
| 2046 | pv_mmu_ops.set_pte = xen_set_pte; | 2046 | pv_mmu_ops.set_pte = xen_set_pte; |
| 2047 | pv_mmu_ops.set_pmd = xen_set_pmd; | 2047 | pv_mmu_ops.set_pmd = xen_set_pmd; |
| 2048 | pv_mmu_ops.set_pud = xen_set_pud; | 2048 | pv_mmu_ops.set_pud = xen_set_pud; |
| 2049 | #if PAGETABLE_LEVELS == 4 | 2049 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 2050 | pv_mmu_ops.set_pgd = xen_set_pgd; | 2050 | pv_mmu_ops.set_pgd = xen_set_pgd; |
| 2051 | #endif | 2051 | #endif |
| 2052 | 2052 | ||
| @@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void) | |||
| 2056 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; | 2056 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; |
| 2057 | pv_mmu_ops.release_pte = xen_release_pte; | 2057 | pv_mmu_ops.release_pte = xen_release_pte; |
| 2058 | pv_mmu_ops.release_pmd = xen_release_pmd; | 2058 | pv_mmu_ops.release_pmd = xen_release_pmd; |
| 2059 | #if PAGETABLE_LEVELS == 4 | 2059 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 2060 | pv_mmu_ops.alloc_pud = xen_alloc_pud; | 2060 | pv_mmu_ops.alloc_pud = xen_alloc_pud; |
| 2061 | pv_mmu_ops.release_pud = xen_release_pud; | 2061 | pv_mmu_ops.release_pud = xen_release_pud; |
| 2062 | #endif | 2062 | #endif |
| @@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { | |||
| 2122 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), | 2122 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), |
| 2123 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), | 2123 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), |
| 2124 | 2124 | ||
| 2125 | #if PAGETABLE_LEVELS == 4 | 2125 | #if CONFIG_PGTABLE_LEVELS == 4 |
| 2126 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), | 2126 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), |
| 2127 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), | 2127 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), |
| 2128 | .set_pgd = xen_set_pgd_hyper, | 2128 | .set_pgd = xen_set_pgd_hyper, |
| 2129 | 2129 | ||
| 2130 | .alloc_pud = xen_alloc_pmd_init, | 2130 | .alloc_pud = xen_alloc_pmd_init, |
| 2131 | .release_pud = xen_release_pmd_init, | 2131 | .release_pud = xen_release_pmd_init, |
| 2132 | #endif /* PAGETABLE_LEVELS == 4 */ | 2132 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
| 2133 | 2133 | ||
| 2134 | .activate_mm = xen_activate_mm, | 2134 | .activate_mm = xen_activate_mm, |
| 2135 | .dup_mmap = xen_dup_mmap, | 2135 | .dup_mmap = xen_dup_mmap, |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index af9c911cd6b5..2804aed3f416 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
| @@ -219,6 +219,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn) | |||
| 219 | /* | 219 | /* |
| 220 | * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is | 220 | * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is |
| 221 | * OK to have direct references to sparsemem variables in here. | 221 | * OK to have direct references to sparsemem variables in here. |
| 222 | * Must already be protected by mem_hotplug_begin(). | ||
| 222 | */ | 223 | */ |
| 223 | static int | 224 | static int |
| 224 | memory_block_action(unsigned long phys_index, unsigned long action, int online_type) | 225 | memory_block_action(unsigned long phys_index, unsigned long action, int online_type) |
| @@ -228,7 +229,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t | |||
| 228 | struct page *first_page; | 229 | struct page *first_page; |
| 229 | int ret; | 230 | int ret; |
| 230 | 231 | ||
| 231 | start_pfn = phys_index << PFN_SECTION_SHIFT; | 232 | start_pfn = section_nr_to_pfn(phys_index); |
| 232 | first_page = pfn_to_page(start_pfn); | 233 | first_page = pfn_to_page(start_pfn); |
| 233 | 234 | ||
| 234 | switch (action) { | 235 | switch (action) { |
| @@ -286,6 +287,7 @@ static int memory_subsys_online(struct device *dev) | |||
| 286 | if (mem->online_type < 0) | 287 | if (mem->online_type < 0) |
| 287 | mem->online_type = MMOP_ONLINE_KEEP; | 288 | mem->online_type = MMOP_ONLINE_KEEP; |
| 288 | 289 | ||
| 290 | /* Already under protection of mem_hotplug_begin() */ | ||
| 289 | ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | 291 | ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); |
| 290 | 292 | ||
| 291 | /* clear online_type */ | 293 | /* clear online_type */ |
| @@ -328,17 +330,19 @@ store_mem_state(struct device *dev, | |||
| 328 | goto err; | 330 | goto err; |
| 329 | } | 331 | } |
| 330 | 332 | ||
| 333 | /* | ||
| 334 | * Memory hotplug needs to hold mem_hotplug_begin() for probe to find | ||
| 335 | * the correct memory block to online before doing device_online(dev), | ||
| 336 | * which will take dev->mutex. Take the lock early to prevent an | ||
| 337 | * inversion; the memory_subsys_online() callback is implemented | ||
| 338 | * assuming the lock is already held. | ||
| 339 | */ | ||
| 340 | mem_hotplug_begin(); | ||
| 341 | |||
| 331 | switch (online_type) { | 342 | switch (online_type) { |
| 332 | case MMOP_ONLINE_KERNEL: | 343 | case MMOP_ONLINE_KERNEL: |
| 333 | case MMOP_ONLINE_MOVABLE: | 344 | case MMOP_ONLINE_MOVABLE: |
| 334 | case MMOP_ONLINE_KEEP: | 345 | case MMOP_ONLINE_KEEP: |
| 335 | /* | ||
| 336 | * mem->online_type is not protected so there can be a | ||
| 337 | * race here. However, when racing online, the first | ||
| 338 | * will succeed and the second will just return as the | ||
| 339 | * block will already be online. The online type | ||
| 340 | * could be either one, but that is expected. | ||
| 341 | */ | ||
| 342 | mem->online_type = online_type; | 346 | mem->online_type = online_type; |
| 343 | ret = device_online(&mem->dev); | 347 | ret = device_online(&mem->dev); |
| 344 | break; | 348 | break; |
| @@ -349,6 +353,7 @@ store_mem_state(struct device *dev, | |||
| 349 | ret = -EINVAL; /* should never happen */ | 353 | ret = -EINVAL; /* should never happen */ |
| 350 | } | 354 | } |
| 351 | 355 | ||
| 356 | mem_hotplug_done(); | ||
| 352 | err: | 357 | err: |
| 353 | unlock_device_hotplug(); | 358 | unlock_device_hotplug(); |
| 354 | 359 | ||
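The comments added above describe a lock-ordering rule: take mem_hotplug_begin() before device_online(), which acquires dev->mutex internally, so the store path uses the same order as the probe path. A minimal sketch of that pattern, with error handling trimmed:

static int online_block(struct memory_block *mem, int online_type)
{
	int ret;

	mem_hotplug_begin();             /* 1: memory hotplug lock */
	mem->online_type = online_type;
	ret = device_online(&mem->dev);  /* 2: takes dev->mutex internally */
	mem_hotplug_done();

	return ret;
}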
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 2c5d4567d1da..acde3f5d6e9e 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c | |||
| @@ -738,11 +738,11 @@ static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *act) | |||
| 738 | return ZFCP_ERP_FAILED; | 738 | return ZFCP_ERP_FAILED; |
| 739 | 739 | ||
| 740 | if (mempool_resize(act->adapter->pool.sr_data, | 740 | if (mempool_resize(act->adapter->pool.sr_data, |
| 741 | act->adapter->stat_read_buf_num, GFP_KERNEL)) | 741 | act->adapter->stat_read_buf_num)) |
| 742 | return ZFCP_ERP_FAILED; | 742 | return ZFCP_ERP_FAILED; |
| 743 | 743 | ||
| 744 | if (mempool_resize(act->adapter->pool.status_read_req, | 744 | if (mempool_resize(act->adapter->pool.status_read_req, |
| 745 | act->adapter->stat_read_buf_num, GFP_KERNEL)) | 745 | act->adapter->stat_read_buf_num)) |
| 746 | return ZFCP_ERP_FAILED; | 746 | return ZFCP_ERP_FAILED; |
| 747 | 747 | ||
| 748 | atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num); | 748 | atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num); |
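This hunk reflects an API change elsewhere in the series: mempool_resize() no longer takes a gfp_mask. A minimal sketch of the updated call, assuming the usual convention that 0 means success:

static int grow_request_pool(mempool_t *pool, int new_min_nr)
{
	/* old form was mempool_resize(pool, new_min_nr, GFP_KERNEL) */
	return mempool_resize(pool, new_min_nr);
}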
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h index a260e99a4447..d72605864b0a 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h +++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h | |||
| @@ -55,7 +55,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 55 | if (PagePrivate(page)) | 55 | if (PagePrivate(page)) |
| 56 | page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); | 56 | page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 57 | 57 | ||
| 58 | cancel_dirty_page(page, PAGE_SIZE); | 58 | if (TestClearPageDirty(page)) |
| 59 | account_page_cleaned(page, mapping); | ||
| 60 | |||
| 59 | ClearPageMappedToDisk(page); | 61 | ClearPageMappedToDisk(page); |
| 60 | ll_delete_from_page_cache(page); | 62 | ll_delete_from_page_cache(page); |
| 61 | } | 63 | } |
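The same open-coded replacement for cancel_dirty_page() appears here and in the fs/buffer.c hunk below: clear the dirty bit and, only if it was actually set, undo the dirty-page accounting. A hedged sketch of that pattern factored into a helper:

static void drop_dirty_accounting(struct page *page,
				  struct address_space *mapping)
{
	/* only pages that really were dirty have accounting to undo */
	if (TestClearPageDirty(page))
		account_page_cleaned(page, mapping);
}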
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 8a65423bc696..c4211a31612d 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c | |||
| @@ -397,13 +397,15 @@ static int __init xen_tmem_init(void) | |||
| 397 | #ifdef CONFIG_CLEANCACHE | 397 | #ifdef CONFIG_CLEANCACHE |
| 398 | BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); | 398 | BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); |
| 399 | if (tmem_enabled && cleancache) { | 399 | if (tmem_enabled && cleancache) { |
| 400 | char *s = ""; | 400 | int err; |
| 401 | struct cleancache_ops *old_ops = | 401 | |
| 402 | cleancache_register_ops(&tmem_cleancache_ops); | 402 | err = cleancache_register_ops(&tmem_cleancache_ops); |
| 403 | if (old_ops) | 403 | if (err) |
| 404 | s = " (WARNING: cleancache_ops overridden)"; | 404 | pr_warn("xen-tmem: failed to enable cleancache: %d\n", |
| 405 | pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", | 405 | err); |
| 406 | s); | 406 | else |
| 407 | pr_info("cleancache enabled, RAM provided by " | ||
| 408 | "Xen Transcendent Memory\n"); | ||
| 407 | } | 409 | } |
| 408 | #endif | 410 | #endif |
| 409 | #ifdef CONFIG_XEN_SELFBALLOONING | 411 | #ifdef CONFIG_XEN_SELFBALLOONING |
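With this series cleancache_register_ops() returns an errno instead of the previously registered ops, which is what the rewritten branch above checks. A hedged sketch of the registration pattern from a backend's point of view:

static int register_my_cleancache(struct cleancache_ops *ops)
{
	int err = cleancache_register_ops(ops);

	if (err)
		pr_warn("cleancache registration failed: %d\n", err);
	return err;  /* 0 on success */
}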
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 270c48148f79..2d0cbbd14cfc 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt | |||
| @@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF | |||
| 27 | bool | 27 | bool |
| 28 | depends on COMPAT && BINFMT_ELF | 28 | depends on COMPAT && BINFMT_ELF |
| 29 | 29 | ||
| 30 | config ARCH_BINFMT_ELF_RANDOMIZE_PIE | ||
| 31 | bool | ||
| 32 | |||
| 33 | config ARCH_BINFMT_ELF_STATE | 30 | config ARCH_BINFMT_ELF_STATE |
| 34 | bool | 31 | bool |
| 35 | 32 | ||
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 995986b8e36b..241ef68d2893 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
| 32 | #include <linux/random.h> | 32 | #include <linux/random.h> |
| 33 | #include <linux/elf.h> | 33 | #include <linux/elf.h> |
| 34 | #include <linux/elf-randomize.h> | ||
| 34 | #include <linux/utsname.h> | 35 | #include <linux/utsname.h> |
| 35 | #include <linux/coredump.h> | 36 | #include <linux/coredump.h> |
| 36 | #include <linux/sched.h> | 37 | #include <linux/sched.h> |
| @@ -862,6 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm) | |||
| 862 | i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { | 863 | i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { |
| 863 | int elf_prot = 0, elf_flags; | 864 | int elf_prot = 0, elf_flags; |
| 864 | unsigned long k, vaddr; | 865 | unsigned long k, vaddr; |
| 866 | unsigned long total_size = 0; | ||
| 865 | 867 | ||
| 866 | if (elf_ppnt->p_type != PT_LOAD) | 868 | if (elf_ppnt->p_type != PT_LOAD) |
| 867 | continue; | 869 | continue; |
| @@ -909,25 +911,20 @@ static int load_elf_binary(struct linux_binprm *bprm) | |||
| 909 | * default mmap base, as well as whatever program they | 911 | * default mmap base, as well as whatever program they |
| 910 | * might try to exec. This is because the brk will | 912 | * might try to exec. This is because the brk will |
| 911 | * follow the loader, and is not movable. */ | 913 | * follow the loader, and is not movable. */ |
| 912 | #ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE | 914 | load_bias = ELF_ET_DYN_BASE - vaddr; |
| 913 | /* Memory randomization might have been switched off | ||
| 914 | * in runtime via sysctl or explicit setting of | ||
| 915 | * personality flags. | ||
| 916 | * If that is the case, retain the original non-zero | ||
| 917 | * load_bias value in order to establish proper | ||
| 918 | * non-randomized mappings. | ||
| 919 | */ | ||
| 920 | if (current->flags & PF_RANDOMIZE) | 915 | if (current->flags & PF_RANDOMIZE) |
| 921 | load_bias = 0; | 916 | load_bias += arch_mmap_rnd(); |
| 922 | else | 917 | load_bias = ELF_PAGESTART(load_bias); |
| 923 | load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); | 918 | total_size = total_mapping_size(elf_phdata, |
| 924 | #else | 919 | loc->elf_ex.e_phnum); |
| 925 | load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); | 920 | if (!total_size) { |
| 926 | #endif | 921 | error = -EINVAL; |
| 922 | goto out_free_dentry; | ||
| 923 | } | ||
| 927 | } | 924 | } |
| 928 | 925 | ||
| 929 | error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, | 926 | error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, |
| 930 | elf_prot, elf_flags, 0); | 927 | elf_prot, elf_flags, total_size); |
| 931 | if (BAD_ADDR(error)) { | 928 | if (BAD_ADDR(error)) { |
| 932 | retval = IS_ERR((void *)error) ? | 929 | retval = IS_ERR((void *)error) ? |
| 933 | PTR_ERR((void*)error) : -EINVAL; | 930 | PTR_ERR((void*)error) : -EINVAL; |
| @@ -1053,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm) | |||
| 1053 | current->mm->end_data = end_data; | 1050 | current->mm->end_data = end_data; |
| 1054 | current->mm->start_stack = bprm->p; | 1051 | current->mm->start_stack = bprm->p; |
| 1055 | 1052 | ||
| 1056 | #ifdef arch_randomize_brk | ||
| 1057 | if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { | 1053 | if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { |
| 1058 | current->mm->brk = current->mm->start_brk = | 1054 | current->mm->brk = current->mm->start_brk = |
| 1059 | arch_randomize_brk(current->mm); | 1055 | arch_randomize_brk(current->mm); |
| 1060 | #ifdef CONFIG_COMPAT_BRK | 1056 | #ifdef compat_brk_randomized |
| 1061 | current->brk_randomized = 1; | 1057 | current->brk_randomized = 1; |
| 1062 | #endif | 1058 | #endif |
| 1063 | } | 1059 | } |
| 1064 | #endif | ||
| 1065 | 1060 | ||
| 1066 | if (current->personality & MMAP_PAGE_ZERO) { | 1061 | if (current->personality & MMAP_PAGE_ZERO) { |
| 1067 | /* Why this, you ask??? Well SVr4 maps page 0 as read-only, | 1062 | /* Why this, you ask??? Well SVr4 maps page 0 as read-only, |
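The rewritten PT_LOAD branch above computes the ET_DYN load bias in three steps: start from ELF_ET_DYN_BASE - vaddr, add the mmap randomization offset when PF_RANDOMIZE is set, then page-align the result. A standalone sketch of just that arithmetic; the values are illustrative and ELF_PAGESTART() is approximated for 4KB pages:

unsigned long compute_load_bias(unsigned long et_dyn_base,
				unsigned long vaddr,
				unsigned long rnd,   /* arch_mmap_rnd() result */
				int randomize)       /* PF_RANDOMIZE set? */
{
	unsigned long load_bias = et_dyn_base - vaddr;

	if (randomize)
		load_bias += rnd;

	return load_bias & ~(4096UL - 1);  /* ELF_PAGESTART() for 4KB pages */
}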
diff --git a/fs/buffer.c b/fs/buffer.c index 20805db2c987..c7a5602d01ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page) | |||
| 3243 | * to synchronise against __set_page_dirty_buffers and prevent the | 3243 | * to synchronise against __set_page_dirty_buffers and prevent the |
| 3244 | * dirty bit from being lost. | 3244 | * dirty bit from being lost. |
| 3245 | */ | 3245 | */ |
| 3246 | if (ret) | 3246 | if (ret && TestClearPageDirty(page)) |
| 3247 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 3247 | account_page_cleaned(page, mapping); |
| 3248 | spin_unlock(&mapping->private_lock); | 3248 | spin_unlock(&mapping->private_lock); |
| 3249 | out: | 3249 | out: |
| 3250 | if (buffers_to_free) { | 3250 | if (buffers_to_free) { |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 480cf9c81d50..f3bfe08e177b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
| @@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) | |||
| 773 | 773 | ||
| 774 | length = atomic_dec_return(&tcpSesAllocCount); | 774 | length = atomic_dec_return(&tcpSesAllocCount); |
| 775 | if (length > 0) | 775 | if (length > 0) |
| 776 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, | 776 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv); |
| 777 | GFP_KERNEL); | ||
| 778 | } | 777 | } |
| 779 | 778 | ||
| 780 | static int | 779 | static int |
| @@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p) | |||
| 848 | 847 | ||
| 849 | length = atomic_inc_return(&tcpSesAllocCount); | 848 | length = atomic_inc_return(&tcpSesAllocCount); |
| 850 | if (length > 1) | 849 | if (length > 1) |
| 851 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, | 850 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv); |
| 852 | GFP_KERNEL); | ||
| 853 | 851 | ||
| 854 | set_freezable(); | 852 | set_freezable(); |
| 855 | while (server->tcpStatus != CifsExiting) { | 853 | while (server->tcpStatus != CifsExiting) { |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..db76cec3ce21 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
| @@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, | |||
| 319 | 319 | ||
| 320 | static void truncate_huge_page(struct page *page) | 320 | static void truncate_huge_page(struct page *page) |
| 321 | { | 321 | { |
| 322 | cancel_dirty_page(page, /* No IO accounting for huge pages? */0); | 322 | ClearPageDirty(page); |
| 323 | ClearPageUptodate(page); | 323 | ClearPageUptodate(page); |
| 324 | delete_from_page_cache(page); | 324 | delete_from_page_cache(page); |
| 325 | } | 325 | } |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 849ed784d6ac..759931088094 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
| @@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) | |||
| 1876 | * request from the inode / page_private pointer and | 1876 | * request from the inode / page_private pointer and |
| 1877 | * release it */ | 1877 | * release it */ |
| 1878 | nfs_inode_remove_request(req); | 1878 | nfs_inode_remove_request(req); |
| 1879 | /* | ||
| 1880 | * In case nfs_inode_remove_request has marked the | ||
| 1881 | * page as being dirty | ||
| 1882 | */ | ||
| 1883 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | ||
| 1884 | nfs_unlock_and_release_request(req); | 1879 | nfs_unlock_and_release_request(req); |
| 1885 | } | 1880 | } |
| 1886 | 1881 | ||
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 044158bd22be..2d7f76e52c37 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
| @@ -3370,7 +3370,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, | |||
| 3370 | ret = ocfs2_get_right_path(et, left_path, &right_path); | 3370 | ret = ocfs2_get_right_path(et, left_path, &right_path); |
| 3371 | if (ret) { | 3371 | if (ret) { |
| 3372 | mlog_errno(ret); | 3372 | mlog_errno(ret); |
| 3373 | goto out; | 3373 | return ret; |
| 3374 | } | 3374 | } |
| 3375 | 3375 | ||
| 3376 | right_el = path_leaf_el(right_path); | 3376 | right_el = path_leaf_el(right_path); |
| @@ -3453,8 +3453,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, | |||
| 3453 | subtree_index); | 3453 | subtree_index); |
| 3454 | } | 3454 | } |
| 3455 | out: | 3455 | out: |
| 3456 | if (right_path) | 3456 | ocfs2_free_path(right_path); |
| 3457 | ocfs2_free_path(right_path); | ||
| 3458 | return ret; | 3457 | return ret; |
| 3459 | } | 3458 | } |
| 3460 | 3459 | ||
| @@ -3536,7 +3535,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, | |||
| 3536 | ret = ocfs2_get_left_path(et, right_path, &left_path); | 3535 | ret = ocfs2_get_left_path(et, right_path, &left_path); |
| 3537 | if (ret) { | 3536 | if (ret) { |
| 3538 | mlog_errno(ret); | 3537 | mlog_errno(ret); |
| 3539 | goto out; | 3538 | return ret; |
| 3540 | } | 3539 | } |
| 3541 | 3540 | ||
| 3542 | left_el = path_leaf_el(left_path); | 3541 | left_el = path_leaf_el(left_path); |
| @@ -3647,8 +3646,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, | |||
| 3647 | right_path, subtree_index); | 3646 | right_path, subtree_index); |
| 3648 | } | 3647 | } |
| 3649 | out: | 3648 | out: |
| 3650 | if (left_path) | 3649 | ocfs2_free_path(left_path); |
| 3651 | ocfs2_free_path(left_path); | ||
| 3652 | return ret; | 3650 | return ret; |
| 3653 | } | 3651 | } |
| 3654 | 3652 | ||
| @@ -4334,17 +4332,17 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
| 4334 | } else if (path->p_tree_depth > 0) { | 4332 | } else if (path->p_tree_depth > 0) { |
| 4335 | status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); | 4333 | status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); |
| 4336 | if (status) | 4334 | if (status) |
| 4337 | goto out; | 4335 | goto exit; |
| 4338 | 4336 | ||
| 4339 | if (left_cpos != 0) { | 4337 | if (left_cpos != 0) { |
| 4340 | left_path = ocfs2_new_path_from_path(path); | 4338 | left_path = ocfs2_new_path_from_path(path); |
| 4341 | if (!left_path) | 4339 | if (!left_path) |
| 4342 | goto out; | 4340 | goto exit; |
| 4343 | 4341 | ||
| 4344 | status = ocfs2_find_path(et->et_ci, left_path, | 4342 | status = ocfs2_find_path(et->et_ci, left_path, |
| 4345 | left_cpos); | 4343 | left_cpos); |
| 4346 | if (status) | 4344 | if (status) |
| 4347 | goto out; | 4345 | goto free_left_path; |
| 4348 | 4346 | ||
| 4349 | new_el = path_leaf_el(left_path); | 4347 | new_el = path_leaf_el(left_path); |
| 4350 | 4348 | ||
| @@ -4361,7 +4359,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
| 4361 | le16_to_cpu(new_el->l_next_free_rec), | 4359 | le16_to_cpu(new_el->l_next_free_rec), |
| 4362 | le16_to_cpu(new_el->l_count)); | 4360 | le16_to_cpu(new_el->l_count)); |
| 4363 | status = -EINVAL; | 4361 | status = -EINVAL; |
| 4364 | goto out; | 4362 | goto free_left_path; |
| 4365 | } | 4363 | } |
| 4366 | rec = &new_el->l_recs[ | 4364 | rec = &new_el->l_recs[ |
| 4367 | le16_to_cpu(new_el->l_next_free_rec) - 1]; | 4365 | le16_to_cpu(new_el->l_next_free_rec) - 1]; |
| @@ -4388,18 +4386,18 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
| 4388 | path->p_tree_depth > 0) { | 4386 | path->p_tree_depth > 0) { |
| 4389 | status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); | 4387 | status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); |
| 4390 | if (status) | 4388 | if (status) |
| 4391 | goto out; | 4389 | goto free_left_path; |
| 4392 | 4390 | ||
| 4393 | if (right_cpos == 0) | 4391 | if (right_cpos == 0) |
| 4394 | goto out; | 4392 | goto free_left_path; |
| 4395 | 4393 | ||
| 4396 | right_path = ocfs2_new_path_from_path(path); | 4394 | right_path = ocfs2_new_path_from_path(path); |
| 4397 | if (!right_path) | 4395 | if (!right_path) |
| 4398 | goto out; | 4396 | goto free_left_path; |
| 4399 | 4397 | ||
| 4400 | status = ocfs2_find_path(et->et_ci, right_path, right_cpos); | 4398 | status = ocfs2_find_path(et->et_ci, right_path, right_cpos); |
| 4401 | if (status) | 4399 | if (status) |
| 4402 | goto out; | 4400 | goto free_right_path; |
| 4403 | 4401 | ||
| 4404 | new_el = path_leaf_el(right_path); | 4402 | new_el = path_leaf_el(right_path); |
| 4405 | rec = &new_el->l_recs[0]; | 4403 | rec = &new_el->l_recs[0]; |
| @@ -4413,7 +4411,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
| 4413 | (unsigned long long)le64_to_cpu(eb->h_blkno), | 4411 | (unsigned long long)le64_to_cpu(eb->h_blkno), |
| 4414 | le16_to_cpu(new_el->l_next_free_rec)); | 4412 | le16_to_cpu(new_el->l_next_free_rec)); |
| 4415 | status = -EINVAL; | 4413 | status = -EINVAL; |
| 4416 | goto out; | 4414 | goto free_right_path; |
| 4417 | } | 4415 | } |
| 4418 | rec = &new_el->l_recs[1]; | 4416 | rec = &new_el->l_recs[1]; |
| 4419 | } | 4417 | } |
| @@ -4430,12 +4428,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
| 4430 | ret = contig_type; | 4428 | ret = contig_type; |
| 4431 | } | 4429 | } |
| 4432 | 4430 | ||
| 4433 | out: | 4431 | free_right_path: |
| 4434 | if (left_path) | 4432 | ocfs2_free_path(right_path); |
| 4435 | ocfs2_free_path(left_path); | 4433 | free_left_path: |
| 4436 | if (right_path) | 4434 | ocfs2_free_path(left_path); |
| 4437 | ocfs2_free_path(right_path); | 4435 | exit: |
| 4438 | |||
| 4439 | return ret; | 4436 | return ret; |
| 4440 | } | 4437 | } |
| 4441 | 4438 | ||
| @@ -6858,13 +6855,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, | |||
| 6858 | if (pages == NULL) { | 6855 | if (pages == NULL) { |
| 6859 | ret = -ENOMEM; | 6856 | ret = -ENOMEM; |
| 6860 | mlog_errno(ret); | 6857 | mlog_errno(ret); |
| 6861 | goto out; | 6858 | return ret; |
| 6862 | } | 6859 | } |
| 6863 | 6860 | ||
| 6864 | ret = ocfs2_reserve_clusters(osb, 1, &data_ac); | 6861 | ret = ocfs2_reserve_clusters(osb, 1, &data_ac); |
| 6865 | if (ret) { | 6862 | if (ret) { |
| 6866 | mlog_errno(ret); | 6863 | mlog_errno(ret); |
| 6867 | goto out; | 6864 | goto free_pages; |
| 6868 | } | 6865 | } |
| 6869 | } | 6866 | } |
| 6870 | 6867 | ||
| @@ -6996,9 +6993,8 @@ out_commit: | |||
| 6996 | out: | 6993 | out: |
| 6997 | if (data_ac) | 6994 | if (data_ac) |
| 6998 | ocfs2_free_alloc_context(data_ac); | 6995 | ocfs2_free_alloc_context(data_ac); |
| 6999 | if (pages) | 6996 | free_pages: |
| 7000 | kfree(pages); | 6997 | kfree(pages); |
| 7001 | |||
| 7002 | return ret; | 6998 | return ret; |
| 7003 | } | 6999 | } |
| 7004 | 7000 | ||
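Several hunks above replace a single catch-all "out:" label with one label per allocation, so each exit path frees exactly what was set up before the failure and the "if (ptr) free(ptr)" guards disappear. A hedged sketch of that layered-unwind shape; get_path()/free_path() are hypothetical stand-ins for ocfs2_new_path_from_path()/ocfs2_free_path():

int walk_both_leaves(void)
{
	void *left, *right;
	int ret = 0;

	left = get_path();            /* hypothetical allocation #1 */
	if (!left) {
		ret = -ENOMEM;
		goto exit;            /* nothing to free yet */
	}

	right = get_path();           /* hypothetical allocation #2 */
	if (!right) {
		ret = -ENOMEM;
		goto free_left_path;  /* only the left path exists */
	}

	/* ... work with both paths ... */

	free_path(right);
free_left_path:
	free_path(left);
exit:
	return ret;
}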
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e1bf18c5d25e..8d2bc840c288 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -664,6 +664,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb, | |||
| 664 | return 0; | 664 | return 0; |
| 665 | } | 665 | } |
| 666 | 666 | ||
| 667 | static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, | ||
| 668 | struct inode *inode, loff_t offset, | ||
| 669 | u64 zero_len, int cluster_align) | ||
| 670 | { | ||
| 671 | u32 p_cpos = 0; | ||
| 672 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); | ||
| 673 | unsigned int num_clusters = 0; | ||
| 674 | unsigned int ext_flags = 0; | ||
| 675 | int ret = 0; | ||
| 676 | |||
| 677 | if (offset <= i_size_read(inode) || cluster_align) | ||
| 678 | return 0; | ||
| 679 | |||
| 680 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, | ||
| 681 | &ext_flags); | ||
| 682 | if (ret < 0) { | ||
| 683 | mlog_errno(ret); | ||
| 684 | return ret; | ||
| 685 | } | ||
| 686 | |||
| 687 | if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { | ||
| 688 | u64 s = i_size_read(inode); | ||
| 689 | sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + | ||
| 690 | (do_div(s, osb->s_clustersize) >> 9); | ||
| 691 | |||
| 692 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, | ||
| 693 | zero_len >> 9, GFP_NOFS, false); | ||
| 694 | if (ret < 0) | ||
| 695 | mlog_errno(ret); | ||
| 696 | } | ||
| 697 | |||
| 698 | return ret; | ||
| 699 | } | ||
| 700 | |||
| 701 | static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, | ||
| 702 | struct inode *inode, loff_t offset) | ||
| 703 | { | ||
| 704 | u64 zero_start, zero_len, total_zero_len; | ||
| 705 | u32 p_cpos = 0, clusters_to_add; | ||
| 706 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); | ||
| 707 | unsigned int num_clusters = 0; | ||
| 708 | unsigned int ext_flags = 0; | ||
| 709 | u32 size_div, offset_div; | ||
| 710 | int ret = 0; | ||
| 711 | |||
| 712 | { | ||
| 713 | u64 o = offset; | ||
| 714 | u64 s = i_size_read(inode); | ||
| 715 | |||
| 716 | offset_div = do_div(o, osb->s_clustersize); | ||
| 717 | size_div = do_div(s, osb->s_clustersize); | ||
| 718 | } | ||
| 719 | |||
| 720 | if (offset <= i_size_read(inode)) | ||
| 721 | return 0; | ||
| 722 | |||
| 723 | clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - | ||
| 724 | ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); | ||
| 725 | total_zero_len = offset - i_size_read(inode); | ||
| 726 | if (clusters_to_add) | ||
| 727 | total_zero_len -= offset_div; | ||
| 728 | |||
| 729 | /* Allocate clusters to fill out holes, and this is only needed | ||
| 730 | * when we add more than one cluster. Otherwise the cluster will | ||
| 731 | * be allocated during direct IO */ | ||
| 732 | if (clusters_to_add > 1) { | ||
| 733 | ret = ocfs2_extend_allocation(inode, | ||
| 734 | OCFS2_I(inode)->ip_clusters, | ||
| 735 | clusters_to_add - 1, 0); | ||
| 736 | if (ret) { | ||
| 737 | mlog_errno(ret); | ||
| 738 | goto out; | ||
| 739 | } | ||
| 740 | } | ||
| 741 | |||
| 742 | while (total_zero_len) { | ||
| 743 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, | ||
| 744 | &ext_flags); | ||
| 745 | if (ret < 0) { | ||
| 746 | mlog_errno(ret); | ||
| 747 | goto out; | ||
| 748 | } | ||
| 749 | |||
| 750 | zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + | ||
| 751 | size_div; | ||
| 752 | zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - | ||
| 753 | size_div; | ||
| 754 | zero_len = min(total_zero_len, zero_len); | ||
| 755 | |||
| 756 | if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { | ||
| 757 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, | ||
| 758 | zero_start >> 9, zero_len >> 9, | ||
| 759 | GFP_NOFS, false); | ||
| 760 | if (ret < 0) { | ||
| 761 | mlog_errno(ret); | ||
| 762 | goto out; | ||
| 763 | } | ||
| 764 | } | ||
| 765 | |||
| 766 | total_zero_len -= zero_len; | ||
| 767 | v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); | ||
| 768 | |||
| 769 | /* Only the first iteration can start at a non-cluster-aligned | ||
| 770 | * offset, so set size_div to 0 for the rest */ | ||
| 771 | size_div = 0; | ||
| 772 | } | ||
| 773 | |||
| 774 | out: | ||
| 775 | return ret; | ||
| 776 | } | ||
| 777 | |||
| 667 | static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | 778 | static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, |
| 668 | struct iov_iter *iter, | 779 | struct iov_iter *iter, |
| 669 | loff_t offset) | 780 | loff_t offset) |
| @@ -678,8 +789,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
| 678 | struct buffer_head *di_bh = NULL; | 789 | struct buffer_head *di_bh = NULL; |
| 679 | size_t count = iter->count; | 790 | size_t count = iter->count; |
| 680 | journal_t *journal = osb->journal->j_journal; | 791 | journal_t *journal = osb->journal->j_journal; |
| 681 | u32 zero_len; | 792 | u64 zero_len_head, zero_len_tail; |
| 682 | int cluster_align; | 793 | int cluster_align_head, cluster_align_tail; |
| 683 | loff_t final_size = offset + count; | 794 | loff_t final_size = offset + count; |
| 684 | int append_write = offset >= i_size_read(inode) ? 1 : 0; | 795 | int append_write = offset >= i_size_read(inode) ? 1 : 0; |
| 685 | unsigned int num_clusters = 0; | 796 | unsigned int num_clusters = 0; |
| @@ -687,9 +798,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
| 687 | 798 | ||
| 688 | { | 799 | { |
| 689 | u64 o = offset; | 800 | u64 o = offset; |
| 801 | u64 s = i_size_read(inode); | ||
| 802 | |||
| 803 | zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); | ||
| 804 | cluster_align_head = !zero_len_head; | ||
| 690 | 805 | ||
| 691 | zero_len = do_div(o, 1 << osb->s_clustersize_bits); | 806 | zero_len_tail = osb->s_clustersize - |
| 692 | cluster_align = !zero_len; | 807 | do_div(s, osb->s_clustersize); |
| 808 | if ((offset - i_size_read(inode)) < zero_len_tail) | ||
| 809 | zero_len_tail = offset - i_size_read(inode); | ||
| 810 | cluster_align_tail = !zero_len_tail; | ||
| 693 | } | 811 | } |
| 694 | 812 | ||
| 695 | /* | 813 | /* |
| @@ -707,21 +825,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
| 707 | } | 825 | } |
| 708 | 826 | ||
| 709 | if (append_write) { | 827 | if (append_write) { |
| 710 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | 828 | ret = ocfs2_inode_lock(inode, NULL, 1); |
| 711 | if (ret < 0) { | 829 | if (ret < 0) { |
| 712 | mlog_errno(ret); | 830 | mlog_errno(ret); |
| 713 | goto clean_orphan; | 831 | goto clean_orphan; |
| 714 | } | 832 | } |
| 715 | 833 | ||
| 834 | /* zero out the tail of the previously allocated cluster | ||
| 835 | * that has not been zeroed yet */ | ||
| 716 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | 836 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) |
| 717 | ret = ocfs2_zero_extend(inode, di_bh, offset); | 837 | ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, |
| 838 | zero_len_tail, cluster_align_tail); | ||
| 718 | else | 839 | else |
| 719 | ret = ocfs2_extend_no_holes(inode, di_bh, offset, | 840 | ret = ocfs2_direct_IO_extend_no_holes(osb, inode, |
| 720 | offset); | 841 | offset); |
| 721 | if (ret < 0) { | 842 | if (ret < 0) { |
| 722 | mlog_errno(ret); | 843 | mlog_errno(ret); |
| 723 | ocfs2_inode_unlock(inode, 1); | 844 | ocfs2_inode_unlock(inode, 1); |
| 724 | brelse(di_bh); | ||
| 725 | goto clean_orphan; | 845 | goto clean_orphan; |
| 726 | } | 846 | } |
| 727 | 847 | ||
| @@ -729,13 +849,10 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
| 729 | if (is_overwrite < 0) { | 849 | if (is_overwrite < 0) { |
| 730 | mlog_errno(is_overwrite); | 850 | mlog_errno(is_overwrite); |
| 731 | ocfs2_inode_unlock(inode, 1); | 851 | ocfs2_inode_unlock(inode, 1); |
| 732 | brelse(di_bh); | ||
| 733 | goto clean_orphan; | 852 | goto clean_orphan; |
| 734 | } | 853 | } |
| 735 | 854 | ||
| 736 | ocfs2_inode_unlock(inode, 1); | 855 | ocfs2_inode_unlock(inode, 1); |
| 737 | brelse(di_bh); | ||
| 738 | di_bh = NULL; | ||
| 739 | } | 856 | } |
| 740 | 857 | ||
| 741 | written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, | 858 | written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, |
| @@ -772,15 +889,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
| 772 | if (ret < 0) | 889 | if (ret < 0) |
| 773 | mlog_errno(ret); | 890 | mlog_errno(ret); |
| 774 | } | 891 | } |
| 775 | } else if (written < 0 && append_write && !is_overwrite && | 892 | } else if (written > 0 && append_write && !is_overwrite && |
| 776 | !cluster_align) { | 893 | !cluster_align_head) { |
| 894 | /* zeroing out the allocated cluster head */ | ||
| 777 | u32 p_cpos = 0; | 895 | u32 p_cpos = 0; |
| 778 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); | 896 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); |
| 779 | 897 | ||
| 898 | ret = ocfs2_inode_lock(inode, NULL, 0); | ||
| 899 | if (ret < 0) { | ||
| 900 | mlog_errno(ret); | ||
| 901 | goto clean_orphan; | ||
| 902 | } | ||
| 903 | |||
| 780 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, | 904 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, |
| 781 | &num_clusters, &ext_flags); | 905 | &num_clusters, &ext_flags); |
| 782 | if (ret < 0) { | 906 | if (ret < 0) { |
| 783 | mlog_errno(ret); | 907 | mlog_errno(ret); |
| 908 | ocfs2_inode_unlock(inode, 0); | ||
| 784 | goto clean_orphan; | 909 | goto clean_orphan; |
| 785 | } | 910 | } |
| 786 | 911 | ||
| @@ -788,9 +913,11 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
| 788 | 913 | ||
| 789 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, | 914 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, |
| 790 | p_cpos << (osb->s_clustersize_bits - 9), | 915 | p_cpos << (osb->s_clustersize_bits - 9), |
| 791 | zero_len >> 9, GFP_KERNEL, false); | 916 | zero_len_head >> 9, GFP_NOFS, false); |
| 792 | if (ret < 0) | 917 | if (ret < 0) |
| 793 | mlog_errno(ret); | 918 | mlog_errno(ret); |
| 919 | |||
| 920 | ocfs2_inode_unlock(inode, 0); | ||
| 794 | } | 921 | } |
| 795 | 922 | ||
| 796 | clean_orphan: | 923 | clean_orphan: |
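The head/tail variables introduced above split the old single zero_len: the head gap is the write offset modulo the cluster size, while the tail gap is the unwritten remainder of the cluster holding the current EOF, capped at the size of the hole the append creates. A standalone arithmetic check with made-up example values (plain C, using % where the kernel code must use do_div()):

#include <stdio.h>

int main(void)
{
	unsigned long long clustersize = 32768;  /* example s_clustersize */
	unsigned long long offset     = 100000;  /* direct-IO write offset */
	unsigned long long i_size     = 40000;   /* current file size */

	unsigned long long zero_len_head = offset % clustersize;
	unsigned long long zero_len_tail = clustersize - (i_size % clustersize);

	if (offset - i_size < zero_len_tail)
		zero_len_tail = offset - i_size;

	/* prints: head gap 1696, tail gap 25536 */
	printf("head gap %llu, tail gap %llu\n", zero_len_head, zero_len_tail);
	return 0;
}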
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..8e19b9d7aba8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
| @@ -1312,7 +1312,9 @@ static int o2hb_debug_init(void) | |||
| 1312 | int ret = -ENOMEM; | 1312 | int ret = -ENOMEM; |
| 1313 | 1313 | ||
| 1314 | o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); | 1314 | o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); |
| 1315 | if (!o2hb_debug_dir) { | 1315 | if (IS_ERR_OR_NULL(o2hb_debug_dir)) { |
| 1316 | ret = o2hb_debug_dir ? | ||
| 1317 | PTR_ERR(o2hb_debug_dir) : -ENOMEM; | ||
| 1316 | mlog_errno(ret); | 1318 | mlog_errno(ret); |
| 1317 | goto bail; | 1319 | goto bail; |
| 1318 | } | 1320 | } |
| @@ -1325,7 +1327,9 @@ static int o2hb_debug_init(void) | |||
| 1325 | sizeof(o2hb_live_node_bitmap), | 1327 | sizeof(o2hb_live_node_bitmap), |
| 1326 | O2NM_MAX_NODES, | 1328 | O2NM_MAX_NODES, |
| 1327 | o2hb_live_node_bitmap); | 1329 | o2hb_live_node_bitmap); |
| 1328 | if (!o2hb_debug_livenodes) { | 1330 | if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) { |
| 1331 | ret = o2hb_debug_livenodes ? | ||
| 1332 | PTR_ERR(o2hb_debug_livenodes) : -ENOMEM; | ||
| 1329 | mlog_errno(ret); | 1333 | mlog_errno(ret); |
| 1330 | goto bail; | 1334 | goto bail; |
| 1331 | } | 1335 | } |
| @@ -1338,7 +1342,9 @@ static int o2hb_debug_init(void) | |||
| 1338 | sizeof(o2hb_live_region_bitmap), | 1342 | sizeof(o2hb_live_region_bitmap), |
| 1339 | O2NM_MAX_REGIONS, | 1343 | O2NM_MAX_REGIONS, |
| 1340 | o2hb_live_region_bitmap); | 1344 | o2hb_live_region_bitmap); |
| 1341 | if (!o2hb_debug_liveregions) { | 1345 | if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) { |
| 1346 | ret = o2hb_debug_liveregions ? | ||
| 1347 | PTR_ERR(o2hb_debug_liveregions) : -ENOMEM; | ||
| 1342 | mlog_errno(ret); | 1348 | mlog_errno(ret); |
| 1343 | goto bail; | 1349 | goto bail; |
| 1344 | } | 1350 | } |
| @@ -1352,7 +1358,9 @@ static int o2hb_debug_init(void) | |||
| 1352 | sizeof(o2hb_quorum_region_bitmap), | 1358 | sizeof(o2hb_quorum_region_bitmap), |
| 1353 | O2NM_MAX_REGIONS, | 1359 | O2NM_MAX_REGIONS, |
| 1354 | o2hb_quorum_region_bitmap); | 1360 | o2hb_quorum_region_bitmap); |
| 1355 | if (!o2hb_debug_quorumregions) { | 1361 | if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) { |
| 1362 | ret = o2hb_debug_quorumregions ? | ||
| 1363 | PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM; | ||
| 1356 | mlog_errno(ret); | 1364 | mlog_errno(ret); |
| 1357 | goto bail; | 1365 | goto bail; |
| 1358 | } | 1366 | } |
| @@ -1366,7 +1374,9 @@ static int o2hb_debug_init(void) | |||
| 1366 | sizeof(o2hb_failed_region_bitmap), | 1374 | sizeof(o2hb_failed_region_bitmap), |
| 1367 | O2NM_MAX_REGIONS, | 1375 | O2NM_MAX_REGIONS, |
| 1368 | o2hb_failed_region_bitmap); | 1376 | o2hb_failed_region_bitmap); |
| 1369 | if (!o2hb_debug_failedregions) { | 1377 | if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) { |
| 1378 | ret = o2hb_debug_failedregions ? | ||
| 1379 | PTR_ERR(o2hb_debug_failedregions) : -ENOMEM; | ||
| 1370 | mlog_errno(ret); | 1380 | mlog_errno(ret); |
| 1371 | goto bail; | 1381 | goto bail; |
| 1372 | } | 1382 | } |
| @@ -2000,7 +2010,8 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | |||
| 2000 | 2010 | ||
| 2001 | reg->hr_debug_dir = | 2011 | reg->hr_debug_dir = |
| 2002 | debugfs_create_dir(config_item_name(®->hr_item), dir); | 2012 | debugfs_create_dir(config_item_name(®->hr_item), dir); |
| 2003 | if (!reg->hr_debug_dir) { | 2013 | if (IS_ERR_OR_NULL(reg->hr_debug_dir)) { |
| 2014 | ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM; | ||
| 2004 | mlog_errno(ret); | 2015 | mlog_errno(ret); |
| 2005 | goto bail; | 2016 | goto bail; |
| 2006 | } | 2017 | } |
| @@ -2013,7 +2024,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | |||
| 2013 | O2HB_DB_TYPE_REGION_LIVENODES, | 2024 | O2HB_DB_TYPE_REGION_LIVENODES, |
| 2014 | sizeof(reg->hr_live_node_bitmap), | 2025 | sizeof(reg->hr_live_node_bitmap), |
| 2015 | O2NM_MAX_NODES, reg); | 2026 | O2NM_MAX_NODES, reg); |
| 2016 | if (!reg->hr_debug_livenodes) { | 2027 | if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) { |
| 2028 | ret = reg->hr_debug_livenodes ? | ||
| 2029 | PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM; | ||
| 2017 | mlog_errno(ret); | 2030 | mlog_errno(ret); |
| 2018 | goto bail; | 2031 | goto bail; |
| 2019 | } | 2032 | } |
| @@ -2025,7 +2038,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | |||
| 2025 | sizeof(*(reg->hr_db_regnum)), | 2038 | sizeof(*(reg->hr_db_regnum)), |
| 2026 | O2HB_DB_TYPE_REGION_NUMBER, | 2039 | O2HB_DB_TYPE_REGION_NUMBER, |
| 2027 | 0, O2NM_MAX_NODES, reg); | 2040 | 0, O2NM_MAX_NODES, reg); |
| 2028 | if (!reg->hr_debug_regnum) { | 2041 | if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) { |
| 2042 | ret = reg->hr_debug_regnum ? | ||
| 2043 | PTR_ERR(reg->hr_debug_regnum) : -ENOMEM; | ||
| 2029 | mlog_errno(ret); | 2044 | mlog_errno(ret); |
| 2030 | goto bail; | 2045 | goto bail; |
| 2031 | } | 2046 | } |
| @@ -2037,7 +2052,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | |||
| 2037 | sizeof(*(reg->hr_db_elapsed_time)), | 2052 | sizeof(*(reg->hr_db_elapsed_time)), |
| 2038 | O2HB_DB_TYPE_REGION_ELAPSED_TIME, | 2053 | O2HB_DB_TYPE_REGION_ELAPSED_TIME, |
| 2039 | 0, 0, reg); | 2054 | 0, 0, reg); |
| 2040 | if (!reg->hr_debug_elapsed_time) { | 2055 | if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) { |
| 2056 | ret = reg->hr_debug_elapsed_time ? | ||
| 2057 | PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM; | ||
| 2041 | mlog_errno(ret); | 2058 | mlog_errno(ret); |
| 2042 | goto bail; | 2059 | goto bail; |
| 2043 | } | 2060 | } |
| @@ -2049,13 +2066,16 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | |||
| 2049 | sizeof(*(reg->hr_db_pinned)), | 2066 | sizeof(*(reg->hr_db_pinned)), |
| 2050 | O2HB_DB_TYPE_REGION_PINNED, | 2067 | O2HB_DB_TYPE_REGION_PINNED, |
| 2051 | 0, 0, reg); | 2068 | 0, 0, reg); |
| 2052 | if (!reg->hr_debug_pinned) { | 2069 | if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) { |
| 2070 | ret = reg->hr_debug_pinned ? | ||
| 2071 | PTR_ERR(reg->hr_debug_pinned) : -ENOMEM; | ||
| 2053 | mlog_errno(ret); | 2072 | mlog_errno(ret); |
| 2054 | goto bail; | 2073 | goto bail; |
| 2055 | } | 2074 | } |
| 2056 | 2075 | ||
| 2057 | ret = 0; | 2076 | return 0; |
| 2058 | bail: | 2077 | bail: |
| 2078 | debugfs_remove_recursive(reg->hr_debug_dir); | ||
| 2059 | return ret; | 2079 | return ret; |
| 2060 | } | 2080 | } |
| 2061 | 2081 | ||
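The conversions above all follow one pattern: debugfs creation helpers can hand back an ERR_PTR as well as NULL, so the caller must distinguish the two to report a useful errno. A hedged sketch of that check factored into a helper:

static int create_debug_dir(struct dentry **out, const char *name)
{
	struct dentry *d = debugfs_create_dir(name, NULL);

	if (IS_ERR_OR_NULL(d))
		return d ? PTR_ERR(d) : -ENOMEM;  /* ERR_PTR vs. plain NULL */

	*out = d;
	return 0;
}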
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 2260fb9e6508..7fdc25a4d8c0 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h | |||
| @@ -196,13 +196,14 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; | |||
| 196 | } \ | 196 | } \ |
| 197 | } while (0) | 197 | } while (0) |
| 198 | 198 | ||
| 199 | #define mlog_errno(st) do { \ | 199 | #define mlog_errno(st) ({ \ |
| 200 | int _st = (st); \ | 200 | int _st = (st); \ |
| 201 | if (_st != -ERESTARTSYS && _st != -EINTR && \ | 201 | if (_st != -ERESTARTSYS && _st != -EINTR && \ |
| 202 | _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ | 202 | _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ |
| 203 | _st != -EDQUOT) \ | 203 | _st != -EDQUOT) \ |
| 204 | mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ | 204 | mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ |
| 205 | } while (0) | 205 | _st; \ |
| 206 | }) | ||
| 206 | 207 | ||
| 207 | #define mlog_bug_on_msg(cond, fmt, args...) do { \ | 208 | #define mlog_bug_on_msg(cond, fmt, args...) do { \ |
| 208 | if (cond) { \ | 209 | if (cond) { \ |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b08050bd3f2e..ccd4dcfc3645 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | * | 18 | * |
| 19 | * linux/fs/minix/dir.c | 19 | * linux/fs/minix/dir.c |
| 20 | * | 20 | * |
| 21 | * Copyright (C) 1991, 1992 Linux Torvalds | 21 | * Copyright (C) 1991, 1992 Linus Torvalds |
| 22 | * | 22 | * |
| 23 | * This program is free software; you can redistribute it and/or | 23 | * This program is free software; you can redistribute it and/or |
| 24 | * modify it under the terms of the GNU General Public | 24 | * modify it under the terms of the GNU General Public |
| @@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir, | |||
| 2047 | const char *name, | 2047 | const char *name, |
| 2048 | int namelen) | 2048 | int namelen) |
| 2049 | { | 2049 | { |
| 2050 | int ret; | 2050 | int ret = 0; |
| 2051 | struct ocfs2_dir_lookup_result lookup = { NULL, }; | 2051 | struct ocfs2_dir_lookup_result lookup = { NULL, }; |
| 2052 | 2052 | ||
| 2053 | trace_ocfs2_check_dir_for_entry( | 2053 | trace_ocfs2_check_dir_for_entry( |
| 2054 | (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); | 2054 | (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); |
| 2055 | 2055 | ||
| 2056 | ret = -EEXIST; | 2056 | if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { |
| 2057 | if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) | 2057 | ret = -EEXIST; |
| 2058 | goto bail; | 2058 | mlog_errno(ret); |
| 2059 | } | ||
| 2059 | 2060 | ||
| 2060 | ret = 0; | ||
| 2061 | bail: | ||
| 2062 | ocfs2_free_dir_lookup_result(&lookup); | 2061 | ocfs2_free_dir_lookup_result(&lookup); |
| 2063 | 2062 | ||
| 2064 | if (ret) | ||
| 2065 | mlog_errno(ret); | ||
| 2066 | return ret; | 2063 | return ret; |
| 2067 | } | 2064 | } |
| 2068 | 2065 | ||
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 11849a44dc5a..956edf67be20 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
| @@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, | |||
| 1391 | int noqueue_attempted = 0; | 1391 | int noqueue_attempted = 0; |
| 1392 | int dlm_locked = 0; | 1392 | int dlm_locked = 0; |
| 1393 | 1393 | ||
| 1394 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { | ||
| 1395 | mlog_errno(-EINVAL); | ||
| 1396 | return -EINVAL; | ||
| 1397 | } | ||
| 1398 | |||
| 1394 | ocfs2_init_mask_waiter(&mw); | 1399 | ocfs2_init_mask_waiter(&mw); |
| 1395 | 1400 | ||
| 1396 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) | 1401 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) |
| @@ -2954,7 +2959,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) | |||
| 2954 | osb->osb_debug_root, | 2959 | osb->osb_debug_root, |
| 2955 | osb, | 2960 | osb, |
| 2956 | &ocfs2_dlm_debug_fops); | 2961 | &ocfs2_dlm_debug_fops); |
| 2957 | if (!dlm_debug->d_locking_state) { | 2962 | if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) { |
| 2958 | ret = -EINVAL; | 2963 | ret = -EINVAL; |
| 2959 | mlog(ML_ERROR, | 2964 | mlog(ML_ERROR, |
| 2960 | "Unable to create locking state debugfs file.\n"); | 2965 | "Unable to create locking state debugfs file.\n"); |
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 29651167190d..540dc4bdd042 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c | |||
| @@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, | |||
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | status = ocfs2_test_inode_bit(osb, blkno, &set); | 84 | status = ocfs2_test_inode_bit(osb, blkno, &set); |
| 85 | trace_ocfs2_get_dentry_test_bit(status, set); | ||
| 86 | if (status < 0) { | 85 | if (status < 0) { |
| 87 | if (status == -EINVAL) { | 86 | if (status == -EINVAL) { |
| 88 | /* | 87 | /* |
| @@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, | |||
| 96 | goto unlock_nfs_sync; | 95 | goto unlock_nfs_sync; |
| 97 | } | 96 | } |
| 98 | 97 | ||
| 98 | trace_ocfs2_get_dentry_test_bit(status, set); | ||
| 99 | /* If the inode allocator bit is clear, this inode must be stale */ | 99 | /* If the inode allocator bit is clear, this inode must be stale */ |
| 100 | if (!set) { | 100 | if (!set) { |
| 101 | status = -ESTALE; | 101 | status = -ESTALE; |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3025c0da6b8a..be71ca0937f7 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
| @@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode, | |||
| 624 | ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, | 624 | ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, |
| 625 | le16_to_cpu(di->i_suballoc_slot)); | 625 | le16_to_cpu(di->i_suballoc_slot)); |
| 626 | if (!inode_alloc_inode) { | 626 | if (!inode_alloc_inode) { |
| 627 | status = -EEXIST; | 627 | status = -ENOENT; |
| 628 | mlog_errno(status); | 628 | mlog_errno(status); |
| 629 | goto bail; | 629 | goto bail; |
| 630 | } | 630 | } |
| @@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode, | |||
| 742 | ORPHAN_DIR_SYSTEM_INODE, | 742 | ORPHAN_DIR_SYSTEM_INODE, |
| 743 | orphaned_slot); | 743 | orphaned_slot); |
| 744 | if (!orphan_dir_inode) { | 744 | if (!orphan_dir_inode) { |
| 745 | status = -EEXIST; | 745 | status = -ENOENT; |
| 746 | mlog_errno(status); | 746 | mlog_errno(status); |
| 747 | goto bail; | 747 | goto bail; |
| 748 | } | 748 | } |
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 044013455621..857bbbcd39f3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
| @@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | |||
| 666 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) != | 666 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) != |
| 667 | ocfs2_local_alloc_count_bits(alloc)) { | 667 | ocfs2_local_alloc_count_bits(alloc)) { |
| 668 | ocfs2_error(osb->sb, "local alloc inode %llu says it has " | 668 | ocfs2_error(osb->sb, "local alloc inode %llu says it has " |
| 669 | "%u free bits, but a count shows %u", | 669 | "%u used bits, but a count shows %u", |
| 670 | (unsigned long long)le64_to_cpu(alloc->i_blkno), | 670 | (unsigned long long)le64_to_cpu(alloc->i_blkno), |
| 671 | le32_to_cpu(alloc->id1.bitmap1.i_used), | 671 | le32_to_cpu(alloc->id1.bitmap1.i_used), |
| 672 | ocfs2_local_alloc_count_bits(alloc)); | 672 | ocfs2_local_alloc_count_bits(alloc)); |
| @@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, | |||
| 839 | u32 *numbits, | 839 | u32 *numbits, |
| 840 | struct ocfs2_alloc_reservation *resv) | 840 | struct ocfs2_alloc_reservation *resv) |
| 841 | { | 841 | { |
| 842 | int numfound, bitoff, left, startoff, lastzero; | 842 | int numfound = 0, bitoff, left, startoff, lastzero; |
| 843 | int local_resv = 0; | 843 | int local_resv = 0; |
| 844 | struct ocfs2_alloc_reservation r; | 844 | struct ocfs2_alloc_reservation r; |
| 845 | void *bitmap = NULL; | 845 | void *bitmap = NULL; |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b5c3a5ea3ee6..09f90cbf0e24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
| @@ -2322,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, | |||
| 2322 | 2322 | ||
| 2323 | trace_ocfs2_orphan_del( | 2323 | trace_ocfs2_orphan_del( |
| 2324 | (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, | 2324 | (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, |
| 2325 | name, namelen); | 2325 | name, strlen(name)); |
| 2326 | 2326 | ||
| 2327 | /* find it's spot in the orphan directory */ | 2327 | /* find it's spot in the orphan directory */ |
| 2328 | status = ocfs2_find_entry(name, namelen, orphan_dir_inode, | 2328 | status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, |
| 2329 | &lookup); | 2329 | &lookup); |
| 2330 | if (status) { | 2330 | if (status) { |
| 2331 | mlog_errno(status); | 2331 | mlog_errno(status); |
| @@ -2808,7 +2808,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, | |||
| 2808 | ORPHAN_DIR_SYSTEM_INODE, | 2808 | ORPHAN_DIR_SYSTEM_INODE, |
| 2809 | osb->slot_num); | 2809 | osb->slot_num); |
| 2810 | if (!orphan_dir_inode) { | 2810 | if (!orphan_dir_inode) { |
| 2811 | status = -EEXIST; | 2811 | status = -ENOENT; |
| 2812 | mlog_errno(status); | 2812 | mlog_errno(status); |
| 2813 | goto leave; | 2813 | goto leave; |
| 2814 | } | 2814 | } |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ee541f92dab4..df3a500789c7 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
| @@ -4276,7 +4276,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, | |||
| 4276 | error = posix_acl_create(dir, &mode, &default_acl, &acl); | 4276 | error = posix_acl_create(dir, &mode, &default_acl, &acl); |
| 4277 | if (error) { | 4277 | if (error) { |
| 4278 | mlog_errno(error); | 4278 | mlog_errno(error); |
| 4279 | goto out; | 4279 | return error; |
| 4280 | } | 4280 | } |
| 4281 | 4281 | ||
| 4282 | error = ocfs2_create_inode_in_orphan(dir, mode, | 4282 | error = ocfs2_create_inode_in_orphan(dir, mode, |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d5493e361a38..e78a203d44c8 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
| @@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
| 427 | if (!si) { | 427 | if (!si) { |
| 428 | status = -ENOMEM; | 428 | status = -ENOMEM; |
| 429 | mlog_errno(status); | 429 | mlog_errno(status); |
| 430 | goto bail; | 430 | return status; |
| 431 | } | 431 | } |
| 432 | 432 | ||
| 433 | si->si_extended = ocfs2_uses_extended_slot_map(osb); | 433 | si->si_extended = ocfs2_uses_extended_slot_map(osb); |
| @@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
| 452 | 452 | ||
| 453 | osb->slot_info = (struct ocfs2_slot_info *)si; | 453 | osb->slot_info = (struct ocfs2_slot_info *)si; |
| 454 | bail: | 454 | bail: |
| 455 | if (status < 0 && si) | 455 | if (status < 0) |
| 456 | __ocfs2_free_slot_info(si); | 456 | __ocfs2_free_slot_info(si); |
| 457 | 457 | ||
| 458 | return status; | 458 | return status; |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 1724d43d3da1..220cae7bbdbc 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
| @@ -295,7 +295,7 @@ static int o2cb_cluster_check(void) | |||
| 295 | set_bit(node_num, netmap); | 295 | set_bit(node_num, netmap); |
| 296 | if (!memcmp(hbmap, netmap, sizeof(hbmap))) | 296 | if (!memcmp(hbmap, netmap, sizeof(hbmap))) |
| 297 | return 0; | 297 | return 0; |
| 298 | if (i < O2CB_MAP_STABILIZE_COUNT) | 298 | if (i < O2CB_MAP_STABILIZE_COUNT - 1) |
| 299 | msleep(1000); | 299 | msleep(1000); |
| 300 | } | 300 | } |
| 301 | 301 | ||
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 720aa389e0ea..2768eb1da2b8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c | |||
| @@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
| 1004 | BUG_ON(conn == NULL); | 1004 | BUG_ON(conn == NULL); |
| 1005 | 1005 | ||
| 1006 | lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); | 1006 | lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); |
| 1007 | if (!lc) { | 1007 | if (!lc) |
| 1008 | rc = -ENOMEM; | 1008 | return -ENOMEM; |
| 1009 | goto out; | ||
| 1010 | } | ||
| 1011 | 1009 | ||
| 1012 | init_waitqueue_head(&lc->oc_wait); | 1010 | init_waitqueue_head(&lc->oc_wait); |
| 1013 | init_completion(&lc->oc_sync_wait); | 1011 | init_completion(&lc->oc_sync_wait); |
| @@ -1063,7 +1061,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
| 1063 | } | 1061 | } |
| 1064 | 1062 | ||
| 1065 | out: | 1063 | out: |
| 1066 | if (rc && lc) | 1064 | if (rc) |
| 1067 | kfree(lc); | 1065 | kfree(lc); |
| 1068 | return rc; | 1066 | return rc; |
| 1069 | } | 1067 | } |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 0cb889a17ae1..4479029630bb 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
| @@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, | |||
| 2499 | alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); | 2499 | alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); |
| 2500 | if (status < 0) { | 2500 | if (status < 0) { |
| 2501 | mlog_errno(status); | 2501 | mlog_errno(status); |
| 2502 | ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, | ||
| 2503 | start_bit, count); | ||
| 2502 | goto bail; | 2504 | goto bail; |
| 2503 | } | 2505 | } |
| 2504 | 2506 | ||
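The added ocfs2_block_group_set_bits() call reads as an undo step: the corresponding bits in the group bitmap were already cleared earlier in this function, so a journal-access failure at this point has to re-set them, otherwise the cached bitmap would disagree with what ends up on disk. The general shape, sketched with placeholder helpers rather than the real ocfs2 ones:

    ret = group_clear_bits(group, start_bit, count);   /* step A: mutate state */
    if (ret)
        goto bail;

    ret = journal_access(handle, alloc_bh);            /* step B: may fail */
    if (ret) {
        /* Roll back step A so cached and on-disk state stay consistent. */
        group_set_bits(group, start_bit, count);
        goto bail;
    }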
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 26675185b886..837ddce4b659 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
| 1112 | 1112 | ||
| 1113 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, | 1113 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, |
| 1114 | ocfs2_debugfs_root); | 1114 | ocfs2_debugfs_root); |
| 1115 | if (!osb->osb_debug_root) { | 1115 | if (IS_ERR_OR_NULL(osb->osb_debug_root)) { |
| 1116 | status = -EINVAL; | 1116 | status = -EINVAL; |
| 1117 | mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); | 1117 | mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); |
| 1118 | goto read_super_error; | 1118 | goto read_super_error; |
| @@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
| 1122 | osb->osb_debug_root, | 1122 | osb->osb_debug_root, |
| 1123 | osb, | 1123 | osb, |
| 1124 | &ocfs2_osb_debug_fops); | 1124 | &ocfs2_osb_debug_fops); |
| 1125 | if (!osb->osb_ctxt) { | 1125 | if (IS_ERR_OR_NULL(osb->osb_ctxt)) { |
| 1126 | status = -EINVAL; | 1126 | status = -EINVAL; |
| 1127 | mlog_errno(status); | 1127 | mlog_errno(status); |
| 1128 | goto read_super_error; | 1128 | goto read_super_error; |
| @@ -1606,8 +1606,9 @@ static int __init ocfs2_init(void) | |||
| 1606 | } | 1606 | } |
| 1607 | 1607 | ||
| 1608 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | 1608 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); |
| 1609 | if (!ocfs2_debugfs_root) { | 1609 | if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) { |
| 1610 | status = -ENOMEM; | 1610 | status = ocfs2_debugfs_root ? |
| 1611 | PTR_ERR(ocfs2_debugfs_root) : -ENOMEM; | ||
| 1611 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1612 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
| 1612 | goto out4; | 1613 | goto out4; |
| 1613 | } | 1614 | } |
| @@ -2069,6 +2070,8 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 2069 | cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); | 2070 | cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); |
| 2070 | bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); | 2071 | bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); |
| 2071 | sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); | 2072 | sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); |
| 2073 | memcpy(sb->s_uuid, di->id2.i_super.s_uuid, | ||
| 2074 | sizeof(di->id2.i_super.s_uuid)); | ||
| 2072 | 2075 | ||
| 2073 | osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; | 2076 | osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; |
| 2074 | 2077 | ||
| @@ -2333,7 +2336,7 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 2333 | mlog_errno(status); | 2336 | mlog_errno(status); |
| 2334 | goto bail; | 2337 | goto bail; |
| 2335 | } | 2338 | } |
| 2336 | cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); | 2339 | cleancache_init_shared_fs(sb); |
| 2337 | 2340 | ||
| 2338 | bail: | 2341 | bail: |
| 2339 | return status; | 2342 | return status; |
| @@ -2563,22 +2566,22 @@ static void ocfs2_handle_error(struct super_block *sb) | |||
| 2563 | ocfs2_set_ro_flag(osb, 0); | 2566 | ocfs2_set_ro_flag(osb, 0); |
| 2564 | } | 2567 | } |
| 2565 | 2568 | ||
| 2566 | static char error_buf[1024]; | 2569 | void __ocfs2_error(struct super_block *sb, const char *function, |
| 2567 | 2570 | const char *fmt, ...) | |
| 2568 | void __ocfs2_error(struct super_block *sb, | ||
| 2569 | const char *function, | ||
| 2570 | const char *fmt, ...) | ||
| 2571 | { | 2571 | { |
| 2572 | struct va_format vaf; | ||
| 2572 | va_list args; | 2573 | va_list args; |
| 2573 | 2574 | ||
| 2574 | va_start(args, fmt); | 2575 | va_start(args, fmt); |
| 2575 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); | 2576 | vaf.fmt = fmt; |
| 2576 | va_end(args); | 2577 | vaf.va = &args; |
| 2577 | 2578 | ||
| 2578 | /* Not using mlog here because we want to show the actual | 2579 | /* Not using mlog here because we want to show the actual |
| 2579 | * function the error came from. */ | 2580 | * function the error came from. */ |
| 2580 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", | 2581 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", |
| 2581 | sb->s_id, function, error_buf); | 2582 | sb->s_id, function, &vaf); |
| 2583 | |||
| 2584 | va_end(args); | ||
| 2582 | 2585 | ||
| 2583 | ocfs2_handle_error(sb); | 2586 | ocfs2_handle_error(sb); |
| 2584 | } | 2587 | } |
| @@ -2586,18 +2589,21 @@ void __ocfs2_error(struct super_block *sb, | |||
| 2586 | /* Handle critical errors. This is intentionally more drastic than | 2589 | /* Handle critical errors. This is intentionally more drastic than |
| 2587 | * ocfs2_handle_error, so we only use for things like journal errors, | 2590 | * ocfs2_handle_error, so we only use for things like journal errors, |
| 2588 | * etc. */ | 2591 | * etc. */ |
| 2589 | void __ocfs2_abort(struct super_block* sb, | 2592 | void __ocfs2_abort(struct super_block *sb, const char *function, |
| 2590 | const char *function, | ||
| 2591 | const char *fmt, ...) | 2593 | const char *fmt, ...) |
| 2592 | { | 2594 | { |
| 2595 | struct va_format vaf; | ||
| 2593 | va_list args; | 2596 | va_list args; |
| 2594 | 2597 | ||
| 2595 | va_start(args, fmt); | 2598 | va_start(args, fmt); |
| 2596 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); | ||
| 2597 | va_end(args); | ||
| 2598 | 2599 | ||
| 2599 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", | 2600 | vaf.fmt = fmt; |
| 2600 | sb->s_id, function, error_buf); | 2601 | vaf.va = &args; |
| 2602 | |||
| 2603 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", | ||
| 2604 | sb->s_id, function, &vaf); | ||
| 2605 | |||
| 2606 | va_end(args); | ||
| 2601 | 2607 | ||
| 2602 | /* We don't have the cluster support yet to go straight to | 2608 | /* We don't have the cluster support yet to go straight to |
| 2603 | * hard readonly in here. Until then, we want to keep | 2609 | * hard readonly in here. Until then, we want to keep |
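Dropping the shared static error_buf in favour of struct va_format and printk's %pV extension removes a 1 KB static buffer and, more importantly, the unlocked sharing of it between concurrent __ocfs2_error()/__ocfs2_abort() callers; %pV lets printk expand the caller's format string and va_list in one pass. The idiom in isolation, using an illustrative wrapper name:

    static void my_warn(const char *prefix, const char *fmt, ...)
    {
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        /* printk() expands the nested (fmt, va_list) pair via %pV. */
        printk(KERN_WARNING "%s: %pV\n", prefix, &vaf);
        va_end(args);   /* args must stay live until after the printk */
    }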
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 85b190dc132f..4ca7533be479 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
| @@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode, | |||
| 1238 | i, | 1238 | i, |
| 1239 | &block_off, | 1239 | &block_off, |
| 1240 | &name_offset); | 1240 | &name_offset); |
| 1241 | if (ret) { | ||
| 1242 | mlog_errno(ret); | ||
| 1243 | goto cleanup; | ||
| 1244 | } | ||
| 1241 | xs->base = bucket_block(xs->bucket, block_off); | 1245 | xs->base = bucket_block(xs->bucket, block_off); |
| 1242 | } | 1246 | } |
| 1243 | if (ocfs2_xattr_is_local(xs->here)) { | 1247 | if (ocfs2_xattr_is_local(xs->here)) { |
| @@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, | |||
| 5665 | 5669 | ||
| 5666 | ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, | 5670 | ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, |
| 5667 | i, &xv, NULL); | 5671 | i, &xv, NULL); |
| 5672 | if (ret) { | ||
| 5673 | mlog_errno(ret); | ||
| 5674 | break; | ||
| 5675 | } | ||
| 5668 | 5676 | ||
| 5669 | ret = ocfs2_lock_xattr_remove_allocators(inode, xv, | 5677 | ret = ocfs2_lock_xattr_remove_allocators(inode, xv, |
| 5670 | args->ref_ci, | 5678 | args->ref_ci, |
diff --git a/fs/super.c b/fs/super.c index 2b7dc90ccdbb..928c20f47af9 100644 --- a/fs/super.c +++ b/fs/super.c | |||
| @@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
| 224 | s->s_maxbytes = MAX_NON_LFS; | 224 | s->s_maxbytes = MAX_NON_LFS; |
| 225 | s->s_op = &default_op; | 225 | s->s_op = &default_op; |
| 226 | s->s_time_gran = 1000000000; | 226 | s->s_time_gran = 1000000000; |
| 227 | s->cleancache_poolid = -1; | 227 | s->cleancache_poolid = CLEANCACHE_NO_POOL; |
| 228 | 228 | ||
| 229 | s->s_shrink.seeks = DEFAULT_SEEKS; | 229 | s->s_shrink.seeks = DEFAULT_SEEKS; |
| 230 | s->s_shrink.scan_objects = super_cache_scan; | 230 | s->s_shrink.scan_objects = super_cache_scan; |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 4d46085c1b90..39f1d6a2b04d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
| @@ -6,6 +6,12 @@ | |||
| 6 | 6 | ||
| 7 | #include <linux/mm_types.h> | 7 | #include <linux/mm_types.h> |
| 8 | #include <linux/bug.h> | 8 | #include <linux/bug.h> |
| 9 | #include <linux/errno.h> | ||
| 10 | |||
| 11 | #if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \ | ||
| 12 | CONFIG_PGTABLE_LEVELS | ||
| 13 | #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED | ||
| 14 | #endif | ||
| 9 | 15 | ||
| 10 | /* | 16 | /* |
| 11 | * On almost all architectures and configurations, 0 can be used as the | 17 | * On almost all architectures and configurations, 0 can be used as the |
| @@ -691,6 +697,30 @@ static inline int pmd_protnone(pmd_t pmd) | |||
| 691 | 697 | ||
| 692 | #endif /* CONFIG_MMU */ | 698 | #endif /* CONFIG_MMU */ |
| 693 | 699 | ||
| 700 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP | ||
| 701 | int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); | ||
| 702 | int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); | ||
| 703 | int pud_clear_huge(pud_t *pud); | ||
| 704 | int pmd_clear_huge(pmd_t *pmd); | ||
| 705 | #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ | ||
| 706 | static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) | ||
| 707 | { | ||
| 708 | return 0; | ||
| 709 | } | ||
| 710 | static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) | ||
| 711 | { | ||
| 712 | return 0; | ||
| 713 | } | ||
| 714 | static inline int pud_clear_huge(pud_t *pud) | ||
| 715 | { | ||
| 716 | return 0; | ||
| 717 | } | ||
| 718 | static inline int pmd_clear_huge(pmd_t *pmd) | ||
| 719 | { | ||
| 720 | return 0; | ||
| 721 | } | ||
| 722 | #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ | ||
| 723 | |||
| 694 | #endif /* !__ASSEMBLY__ */ | 724 | #endif /* !__ASSEMBLY__ */ |
| 695 | 725 | ||
| 696 | #ifndef io_remap_pfn_range | 726 | #ifndef io_remap_pfn_range |
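Two patterns in this hunk recur elsewhere in the series. The #if arithmetic at the top works because defined(X) evaluates to 1 or 0 inside a preprocessor expression, so 4 minus the number of folded levels must equal CONFIG_PGTABLE_LEVELS or the build stops. The stub block is the usual config-gated shape — real declarations when CONFIG_HAVE_ARCH_HUGE_VMAP is set, empty static inlines otherwise — which the io.h, memblock.h and memory_hotplug.h hunks below repeat. Roughly, with placeholder names:

    #ifdef CONFIG_HAVE_FOO
    void __init foo_init(void);                    /* real implementation elsewhere */
    int foo_supported(void);
    #else
    static inline void foo_init(void) { }          /* compiles away when FOO is off */
    static inline int foo_supported(void) { return 0; }
    #endif

Callers can then invoke foo_init() unconditionally, with no #ifdef at the call site.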
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 4ce9056b31a8..bda5ec0b4b4d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h | |||
| @@ -5,6 +5,10 @@ | |||
| 5 | #include <linux/exportfs.h> | 5 | #include <linux/exportfs.h> |
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | 7 | ||
| 8 | #define CLEANCACHE_NO_POOL -1 | ||
| 9 | #define CLEANCACHE_NO_BACKEND -2 | ||
| 10 | #define CLEANCACHE_NO_BACKEND_SHARED -3 | ||
| 11 | |||
| 8 | #define CLEANCACHE_KEY_MAX 6 | 12 | #define CLEANCACHE_KEY_MAX 6 |
| 9 | 13 | ||
| 10 | /* | 14 | /* |
| @@ -33,10 +37,9 @@ struct cleancache_ops { | |||
| 33 | void (*invalidate_fs)(int); | 37 | void (*invalidate_fs)(int); |
| 34 | }; | 38 | }; |
| 35 | 39 | ||
| 36 | extern struct cleancache_ops * | 40 | extern int cleancache_register_ops(struct cleancache_ops *ops); |
| 37 | cleancache_register_ops(struct cleancache_ops *ops); | ||
| 38 | extern void __cleancache_init_fs(struct super_block *); | 41 | extern void __cleancache_init_fs(struct super_block *); |
| 39 | extern void __cleancache_init_shared_fs(char *, struct super_block *); | 42 | extern void __cleancache_init_shared_fs(struct super_block *); |
| 40 | extern int __cleancache_get_page(struct page *); | 43 | extern int __cleancache_get_page(struct page *); |
| 41 | extern void __cleancache_put_page(struct page *); | 44 | extern void __cleancache_put_page(struct page *); |
| 42 | extern void __cleancache_invalidate_page(struct address_space *, struct page *); | 45 | extern void __cleancache_invalidate_page(struct address_space *, struct page *); |
| @@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb) | |||
| 78 | __cleancache_init_fs(sb); | 81 | __cleancache_init_fs(sb); |
| 79 | } | 82 | } |
| 80 | 83 | ||
| 81 | static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) | 84 | static inline void cleancache_init_shared_fs(struct super_block *sb) |
| 82 | { | 85 | { |
| 83 | if (cleancache_enabled) | 86 | if (cleancache_enabled) |
| 84 | __cleancache_init_shared_fs(uuid, sb); | 87 | __cleancache_init_shared_fs(sb); |
| 85 | } | 88 | } |
| 86 | 89 | ||
| 87 | static inline int cleancache_get_page(struct page *page) | 90 | static inline int cleancache_get_page(struct page *page) |
diff --git a/include/linux/cma.h b/include/linux/cma.h index 9384ba66e975..f7ef093ec49a 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h | |||
| @@ -16,16 +16,16 @@ | |||
| 16 | struct cma; | 16 | struct cma; |
| 17 | 17 | ||
| 18 | extern unsigned long totalcma_pages; | 18 | extern unsigned long totalcma_pages; |
| 19 | extern phys_addr_t cma_get_base(struct cma *cma); | 19 | extern phys_addr_t cma_get_base(const struct cma *cma); |
| 20 | extern unsigned long cma_get_size(struct cma *cma); | 20 | extern unsigned long cma_get_size(const struct cma *cma); |
| 21 | 21 | ||
| 22 | extern int __init cma_declare_contiguous(phys_addr_t base, | 22 | extern int __init cma_declare_contiguous(phys_addr_t base, |
| 23 | phys_addr_t size, phys_addr_t limit, | 23 | phys_addr_t size, phys_addr_t limit, |
| 24 | phys_addr_t alignment, unsigned int order_per_bit, | 24 | phys_addr_t alignment, unsigned int order_per_bit, |
| 25 | bool fixed, struct cma **res_cma); | 25 | bool fixed, struct cma **res_cma); |
| 26 | extern int cma_init_reserved_mem(phys_addr_t base, | 26 | extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, |
| 27 | phys_addr_t size, int order_per_bit, | 27 | unsigned int order_per_bit, |
| 28 | struct cma **res_cma); | 28 | struct cma **res_cma); |
| 29 | extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); | 29 | extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align); |
| 30 | extern bool cma_release(struct cma *cma, struct page *pages, int count); | 30 | extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); |
| 31 | #endif | 31 | #endif |
diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h new file mode 100644 index 000000000000..b5f0bda9472e --- /dev/null +++ b/include/linux/elf-randomize.h | |||
| @@ -0,0 +1,22 @@ | |||
| 1 | #ifndef _ELF_RANDOMIZE_H | ||
| 2 | #define _ELF_RANDOMIZE_H | ||
| 3 | |||
| 4 | struct mm_struct; | ||
| 5 | |||
| 6 | #ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE | ||
| 7 | static inline unsigned long arch_mmap_rnd(void) { return 0; } | ||
| 8 | # if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK) | ||
| 9 | # define compat_brk_randomized | ||
| 10 | # endif | ||
| 11 | # ifndef arch_randomize_brk | ||
| 12 | # define arch_randomize_brk(mm) (mm->brk) | ||
| 13 | # endif | ||
| 14 | #else | ||
| 15 | extern unsigned long arch_mmap_rnd(void); | ||
| 16 | extern unsigned long arch_randomize_brk(struct mm_struct *mm); | ||
| 17 | # ifdef CONFIG_COMPAT_BRK | ||
| 18 | # define compat_brk_randomized | ||
| 19 | # endif | ||
| 20 | #endif | ||
| 21 | |||
| 22 | #endif | ||
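The new header gathers the ELF ASLR hooks in one place: architectures selecting CONFIG_ARCH_HAS_ELF_RANDOMIZE supply arch_mmap_rnd() and arch_randomize_brk(), everyone else falls back to zero randomness and an identity brk. A hedged sketch of a consumer (the surrounding function is hypothetical):

    static unsigned long pick_mmap_base(unsigned long base)
    {
        unsigned long rnd = 0;

        if (current->flags & PF_RANDOMIZE)
            rnd = arch_mmap_rnd();      /* 0 unless the arch implements it */

        return base - rnd;
    }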
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 51bd1e72a917..97a9373e61e8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
| @@ -57,8 +57,10 @@ struct vm_area_struct; | |||
| 57 | * _might_ fail. This depends upon the particular VM implementation. | 57 | * _might_ fail. This depends upon the particular VM implementation. |
| 58 | * | 58 | * |
| 59 | * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller | 59 | * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller |
| 60 | * cannot handle allocation failures. This modifier is deprecated and no new | 60 | * cannot handle allocation failures. New users should be evaluated carefully |
| 61 | * users should be added. | 61 | * (and the flag should be used only when there is no reasonable failure policy) |
| 62 | * but it is definitely preferable to use the flag rather than opencode endless | ||
| 63 | * loop around allocator. | ||
| 62 | * | 64 | * |
| 63 | * __GFP_NORETRY: The VM implementation must not retry indefinitely. | 65 | * __GFP_NORETRY: The VM implementation must not retry indefinitely. |
| 64 | * | 66 | * |
| @@ -117,16 +119,6 @@ struct vm_area_struct; | |||
| 117 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ | 119 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ |
| 118 | __GFP_NO_KSWAPD) | 120 | __GFP_NO_KSWAPD) |
| 119 | 121 | ||
| 120 | /* | ||
| 121 | * GFP_THISNODE does not perform any reclaim, you most likely want to | ||
| 122 | * use __GFP_THISNODE to allocate from a given node without fallback! | ||
| 123 | */ | ||
| 124 | #ifdef CONFIG_NUMA | ||
| 125 | #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) | ||
| 126 | #else | ||
| 127 | #define GFP_THISNODE ((__force gfp_t)0) | ||
| 128 | #endif | ||
| 129 | |||
| 130 | /* This mask makes up all the page movable related flags */ | 122 | /* This mask makes up all the page movable related flags */ |
| 131 | #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) | 123 | #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) |
| 132 | 124 | ||
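The reworded __GFP_NOFAIL comment steers callers that truly cannot tolerate failure toward the flag instead of an open-coded retry loop: the allocator then sees the constraint explicitly and can make better progress decisions. The two shapes side by side, with illustrative function names and assuming a caller that has no reasonable failure policy:

    /* Discouraged: opencoded endless loop around the allocator. */
    static void *must_alloc_loop(size_t size)
    {
        void *p;

        do {
            p = kmalloc(size, GFP_NOFS);
        } while (!p);
        return p;
    }

    /* Preferred when no failure policy exists: let the allocator retry. */
    static void *must_alloc_nofail(size_t size)
    {
        return kmalloc(size, GFP_NOFS | __GFP_NOFAIL);
    }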
diff --git a/include/linux/io.h b/include/linux/io.h index fa02e55e5a2e..4cc299c598e0 100644 --- a/include/linux/io.h +++ b/include/linux/io.h | |||
| @@ -38,6 +38,14 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, | |||
| 38 | } | 38 | } |
| 39 | #endif | 39 | #endif |
| 40 | 40 | ||
| 41 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP | ||
| 42 | void __init ioremap_huge_init(void); | ||
| 43 | int arch_ioremap_pud_supported(void); | ||
| 44 | int arch_ioremap_pmd_supported(void); | ||
| 45 | #else | ||
| 46 | static inline void ioremap_huge_init(void) { } | ||
| 47 | #endif | ||
| 48 | |||
| 41 | /* | 49 | /* |
| 42 | * Managed iomap interface | 50 | * Managed iomap interface |
| 43 | */ | 51 | */ |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e8cc45307f8f..9497ec7c77ea 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
| @@ -365,6 +365,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo | |||
| 365 | #define __initdata_memblock | 365 | #define __initdata_memblock |
| 366 | #endif | 366 | #endif |
| 367 | 367 | ||
| 368 | #ifdef CONFIG_MEMTEST | ||
| 369 | extern void early_memtest(phys_addr_t start, phys_addr_t end); | ||
| 370 | #else | ||
| 371 | static inline void early_memtest(phys_addr_t start, phys_addr_t end) | ||
| 372 | { | ||
| 373 | } | ||
| 374 | #endif | ||
| 375 | |||
| 368 | #else | 376 | #else |
| 369 | static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) | 377 | static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) |
| 370 | { | 378 | { |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8f1a41951df9..6ffa0ac7f7d6 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
| @@ -192,6 +192,9 @@ extern void get_page_bootmem(unsigned long ingo, struct page *page, | |||
| 192 | void get_online_mems(void); | 192 | void get_online_mems(void); |
| 193 | void put_online_mems(void); | 193 | void put_online_mems(void); |
| 194 | 194 | ||
| 195 | void mem_hotplug_begin(void); | ||
| 196 | void mem_hotplug_done(void); | ||
| 197 | |||
| 195 | #else /* ! CONFIG_MEMORY_HOTPLUG */ | 198 | #else /* ! CONFIG_MEMORY_HOTPLUG */ |
| 196 | /* | 199 | /* |
| 197 | * Stub functions for when hotplug is off | 200 | * Stub functions for when hotplug is off |
| @@ -231,6 +234,9 @@ static inline int try_online_node(int nid) | |||
| 231 | static inline void get_online_mems(void) {} | 234 | static inline void get_online_mems(void) {} |
| 232 | static inline void put_online_mems(void) {} | 235 | static inline void put_online_mems(void) {} |
| 233 | 236 | ||
| 237 | static inline void mem_hotplug_begin(void) {} | ||
| 238 | static inline void mem_hotplug_done(void) {} | ||
| 239 | |||
| 234 | #endif /* ! CONFIG_MEMORY_HOTPLUG */ | 240 | #endif /* ! CONFIG_MEMORY_HOTPLUG */ |
| 235 | 241 | ||
| 236 | #ifdef CONFIG_MEMORY_HOTREMOVE | 242 | #ifdef CONFIG_MEMORY_HOTREMOVE |
diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 39ed62ab5b8a..b19b3023c880 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h | |||
| @@ -29,7 +29,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | |||
| 29 | mempool_free_t *free_fn, void *pool_data, | 29 | mempool_free_t *free_fn, void *pool_data, |
| 30 | gfp_t gfp_mask, int nid); | 30 | gfp_t gfp_mask, int nid); |
| 31 | 31 | ||
| 32 | extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); | 32 | extern int mempool_resize(mempool_t *pool, int new_min_nr); |
| 33 | extern void mempool_destroy(mempool_t *pool); | 33 | extern void mempool_destroy(mempool_t *pool); |
| 34 | extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); | 34 | extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); |
| 35 | extern void mempool_free(void *element, mempool_t *pool); | 35 | extern void mempool_free(void *element, mempool_t *pool); |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 78baed5f2952..cac1c0904d5f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
| @@ -69,7 +69,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
| 69 | extern bool pmd_trans_migrating(pmd_t pmd); | 69 | extern bool pmd_trans_migrating(pmd_t pmd); |
| 70 | extern int migrate_misplaced_page(struct page *page, | 70 | extern int migrate_misplaced_page(struct page *page, |
| 71 | struct vm_area_struct *vma, int node); | 71 | struct vm_area_struct *vma, int node); |
| 72 | extern bool migrate_ratelimited(int node); | ||
| 73 | #else | 72 | #else |
| 74 | static inline bool pmd_trans_migrating(pmd_t pmd) | 73 | static inline bool pmd_trans_migrating(pmd_t pmd) |
| 75 | { | 74 | { |
| @@ -80,10 +79,6 @@ static inline int migrate_misplaced_page(struct page *page, | |||
| 80 | { | 79 | { |
| 81 | return -EAGAIN; /* can't migrate now */ | 80 | return -EAGAIN; /* can't migrate now */ |
| 82 | } | 81 | } |
| 83 | static inline bool migrate_ratelimited(int node) | ||
| 84 | { | ||
| 85 | return false; | ||
| 86 | } | ||
| 87 | #endif /* CONFIG_NUMA_BALANCING */ | 82 | #endif /* CONFIG_NUMA_BALANCING */ |
| 88 | 83 | ||
| 89 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | 84 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 47a93928b90f..6571dd78e984 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -1294,9 +1294,11 @@ int __set_page_dirty_no_writeback(struct page *page); | |||
| 1294 | int redirty_page_for_writepage(struct writeback_control *wbc, | 1294 | int redirty_page_for_writepage(struct writeback_control *wbc, |
| 1295 | struct page *page); | 1295 | struct page *page); |
| 1296 | void account_page_dirtied(struct page *page, struct address_space *mapping); | 1296 | void account_page_dirtied(struct page *page, struct address_space *mapping); |
| 1297 | void account_page_cleaned(struct page *page, struct address_space *mapping); | ||
| 1297 | int set_page_dirty(struct page *page); | 1298 | int set_page_dirty(struct page *page); |
| 1298 | int set_page_dirty_lock(struct page *page); | 1299 | int set_page_dirty_lock(struct page *page); |
| 1299 | int clear_page_dirty_for_io(struct page *page); | 1300 | int clear_page_dirty_for_io(struct page *page); |
| 1301 | |||
| 1300 | int get_cmdline(struct task_struct *task, char *buffer, int buflen); | 1302 | int get_cmdline(struct task_struct *task, char *buffer, int buflen); |
| 1301 | 1303 | ||
| 1302 | /* Is the vma a continuation of the stack vma above it? */ | 1304 | /* Is the vma a continuation of the stack vma above it? */ |
| @@ -2109,7 +2111,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, | |||
| 2109 | #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ | 2111 | #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ |
| 2110 | #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO | 2112 | #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO |
| 2111 | * and return without waiting upon it */ | 2113 | * and return without waiting upon it */ |
| 2112 | #define FOLL_MLOCK 0x40 /* mark page as mlocked */ | 2114 | #define FOLL_POPULATE 0x40 /* fault in page */ |
| 2113 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ | 2115 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ |
| 2114 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ | 2116 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ |
| 2115 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ | 2117 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199a03aab8dc..590630eb59ba 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
| @@ -364,7 +364,9 @@ struct mm_struct { | |||
| 364 | atomic_t mm_users; /* How many users with user space? */ | 364 | atomic_t mm_users; /* How many users with user space? */ |
| 365 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | 365 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ |
| 366 | atomic_long_t nr_ptes; /* PTE page table pages */ | 366 | atomic_long_t nr_ptes; /* PTE page table pages */ |
| 367 | #if CONFIG_PGTABLE_LEVELS > 2 | ||
| 367 | atomic_long_t nr_pmds; /* PMD page table pages */ | 368 | atomic_long_t nr_pmds; /* PMD page table pages */ |
| 369 | #endif | ||
| 368 | int map_count; /* number of VMAs */ | 370 | int map_count; /* number of VMAs */ |
| 369 | 371 | ||
| 370 | spinlock_t page_table_lock; /* Protects page tables and some counters */ | 372 | spinlock_t page_table_lock; /* Protects page tables and some counters */ |
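Compiling nr_pmds only when CONFIG_PGTABLE_LEVELS > 2 shrinks mm_struct on two-level architectures, but it also means the counter cannot be touched unconditionally; the natural companion is an accessor that becomes a no-op when PMDs are folded. A sketch, with the helper name chosen here purely for illustration:

    #if CONFIG_PGTABLE_LEVELS > 2
    static inline void mm_inc_nr_pmds(struct mm_struct *mm)
    {
        atomic_long_inc(&mm->nr_pmds);
    }
    #else
    static inline void mm_inc_nr_pmds(struct mm_struct *mm) { }   /* field absent */
    #endif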
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9b2022ab4d85..3d46fb4708e0 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
| @@ -25,16 +25,11 @@ static inline void touch_nmi_watchdog(void) | |||
| 25 | #endif | 25 | #endif |
| 26 | 26 | ||
| 27 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) | 27 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) |
| 28 | extern void watchdog_enable_hardlockup_detector(bool val); | 28 | extern void hardlockup_detector_disable(void); |
| 29 | extern bool watchdog_hardlockup_detector_is_enabled(void); | ||
| 30 | #else | 29 | #else |
| 31 | static inline void watchdog_enable_hardlockup_detector(bool val) | 30 | static inline void hardlockup_detector_disable(void) |
| 32 | { | 31 | { |
| 33 | } | 32 | } |
| 34 | static inline bool watchdog_hardlockup_detector_is_enabled(void) | ||
| 35 | { | ||
| 36 | return true; | ||
| 37 | } | ||
| 38 | #endif | 33 | #endif |
| 39 | 34 | ||
| 40 | /* | 35 | /* |
| @@ -68,12 +63,20 @@ static inline bool trigger_allbutself_cpu_backtrace(void) | |||
| 68 | #ifdef CONFIG_LOCKUP_DETECTOR | 63 | #ifdef CONFIG_LOCKUP_DETECTOR |
| 69 | int hw_nmi_is_cpu_stuck(struct pt_regs *); | 64 | int hw_nmi_is_cpu_stuck(struct pt_regs *); |
| 70 | u64 hw_nmi_get_sample_period(int watchdog_thresh); | 65 | u64 hw_nmi_get_sample_period(int watchdog_thresh); |
| 66 | extern int nmi_watchdog_enabled; | ||
| 67 | extern int soft_watchdog_enabled; | ||
| 71 | extern int watchdog_user_enabled; | 68 | extern int watchdog_user_enabled; |
| 72 | extern int watchdog_thresh; | 69 | extern int watchdog_thresh; |
| 73 | extern int sysctl_softlockup_all_cpu_backtrace; | 70 | extern int sysctl_softlockup_all_cpu_backtrace; |
| 74 | struct ctl_table; | 71 | struct ctl_table; |
| 75 | extern int proc_dowatchdog(struct ctl_table *, int , | 72 | extern int proc_watchdog(struct ctl_table *, int , |
| 76 | void __user *, size_t *, loff_t *); | 73 | void __user *, size_t *, loff_t *); |
| 74 | extern int proc_nmi_watchdog(struct ctl_table *, int , | ||
| 75 | void __user *, size_t *, loff_t *); | ||
| 76 | extern int proc_soft_watchdog(struct ctl_table *, int , | ||
| 77 | void __user *, size_t *, loff_t *); | ||
| 78 | extern int proc_watchdog_thresh(struct ctl_table *, int , | ||
| 79 | void __user *, size_t *, loff_t *); | ||
| 77 | #endif | 80 | #endif |
| 78 | 81 | ||
| 79 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI | 82 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI |
diff --git a/include/linux/oom.h b/include/linux/oom.h index d5771bed59c9..44b2f6f7bbd8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
| @@ -66,7 +66,8 @@ extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); | |||
| 66 | extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); | 66 | extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); |
| 67 | 67 | ||
| 68 | extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 68 | extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
| 69 | int order, const nodemask_t *nodemask); | 69 | int order, const nodemask_t *nodemask, |
| 70 | struct mem_cgroup *memcg); | ||
| 70 | 71 | ||
| 71 | extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | 72 | extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, |
| 72 | unsigned long totalpages, const nodemask_t *nodemask, | 73 | unsigned long totalpages, const nodemask_t *nodemask, |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5ed7bdaf22d5..c851ff92d5b3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
| @@ -328,8 +328,6 @@ static inline void SetPageUptodate(struct page *page) | |||
| 328 | 328 | ||
| 329 | CLEARPAGEFLAG(Uptodate, uptodate) | 329 | CLEARPAGEFLAG(Uptodate, uptodate) |
| 330 | 330 | ||
| 331 | extern void cancel_dirty_page(struct page *page, unsigned int account_size); | ||
| 332 | |||
| 333 | int test_clear_page_writeback(struct page *page); | 331 | int test_clear_page_writeback(struct page *page); |
| 334 | int __test_set_page_writeback(struct page *page, bool keep_write); | 332 | int __test_set_page_writeback(struct page *page, bool keep_write); |
| 335 | 333 | ||
diff --git a/include/linux/slab.h b/include/linux/slab.h index 76f1feeabd38..ffd24c830151 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | 18 | ||
| 19 | /* | 19 | /* |
| 20 | * Flags to pass to kmem_cache_create(). | 20 | * Flags to pass to kmem_cache_create(). |
| 21 | * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set. | 21 | * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. |
| 22 | */ | 22 | */ |
| 23 | #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ | 23 | #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ |
| 24 | #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ | 24 | #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ |
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index d06b6da5c1e3..bce990f5a35d 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h | |||
| @@ -224,7 +224,7 @@ TRACE_EVENT(xen_mmu_pmd_clear, | |||
| 224 | TP_printk("pmdp %p", __entry->pmdp) | 224 | TP_printk("pmdp %p", __entry->pmdp) |
| 225 | ); | 225 | ); |
| 226 | 226 | ||
| 227 | #if PAGETABLE_LEVELS >= 4 | 227 | #if CONFIG_PGTABLE_LEVELS >= 4 |
| 228 | 228 | ||
| 229 | TRACE_EVENT(xen_mmu_set_pud, | 229 | TRACE_EVENT(xen_mmu_set_pud, |
| 230 | TP_PROTO(pud_t *pudp, pud_t pudval), | 230 | TP_PROTO(pud_t *pudp, pud_t pudval), |
diff --git a/init/main.c b/init/main.c index e82171b99874..a7e969d12f51 100644 --- a/init/main.c +++ b/init/main.c | |||
| @@ -80,6 +80,7 @@ | |||
| 80 | #include <linux/list.h> | 80 | #include <linux/list.h> |
| 81 | #include <linux/integrity.h> | 81 | #include <linux/integrity.h> |
| 82 | #include <linux/proc_ns.h> | 82 | #include <linux/proc_ns.h> |
| 83 | #include <linux/io.h> | ||
| 83 | 84 | ||
| 84 | #include <asm/io.h> | 85 | #include <asm/io.h> |
| 85 | #include <asm/bugs.h> | 86 | #include <asm/bugs.h> |
| @@ -485,6 +486,7 @@ static void __init mm_init(void) | |||
| 485 | percpu_init_late(); | 486 | percpu_init_late(); |
| 486 | pgtable_init(); | 487 | pgtable_init(); |
| 487 | vmalloc_init(); | 488 | vmalloc_init(); |
| 489 | ioremap_huge_init(); | ||
| 488 | } | 490 | } |
| 489 | 491 | ||
| 490 | asmlinkage __visible void __init start_kernel(void) | 492 | asmlinkage __visible void __init start_kernel(void) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c68f0721df10..ee14e3a35a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -2453,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2453 | * @node: is this an allowed node? | 2453 | * @node: is this an allowed node? |
| 2454 | * @gfp_mask: memory allocation flags | 2454 | * @gfp_mask: memory allocation flags |
| 2455 | * | 2455 | * |
| 2456 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is | 2456 | * If we're in interrupt, yes, we can always allocate. If @node is set in |
| 2457 | * set, yes, we can always allocate. If node is in our task's mems_allowed, | 2457 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this |
| 2458 | * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest | 2458 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, |
| 2459 | * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been | 2459 | * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. |
| 2460 | * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE | ||
| 2461 | * flag, yes. | ||
| 2462 | * Otherwise, no. | 2460 | * Otherwise, no. |
| 2463 | * | 2461 | * |
| 2464 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2465 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2466 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2467 | * any node on the zonelist except the first. By the time any such | ||
| 2468 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2469 | * | ||
| 2470 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2462 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, |
| 2471 | * and do not allow allocations outside the current tasks cpuset | 2463 | * and do not allow allocations outside the current tasks cpuset |
| 2472 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 2464 | * unless the task has been OOM killed as is marked TIF_MEMDIE. |
| @@ -2502,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) | |||
| 2502 | int allowed; /* is allocation in zone z allowed? */ | 2494 | int allowed; /* is allocation in zone z allowed? */ |
| 2503 | unsigned long flags; | 2495 | unsigned long flags; |
| 2504 | 2496 | ||
| 2505 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2497 | if (in_interrupt()) |
| 2506 | return 1; | 2498 | return 1; |
| 2507 | if (node_isset(node, current->mems_allowed)) | 2499 | if (node_isset(node, current->mems_allowed)) |
| 2508 | return 1; | 2500 | return 1; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4012336de30f..8c0eabd41886 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -847,7 +847,7 @@ static struct ctl_table kern_table[] = { | |||
| 847 | .data = &watchdog_user_enabled, | 847 | .data = &watchdog_user_enabled, |
| 848 | .maxlen = sizeof (int), | 848 | .maxlen = sizeof (int), |
| 849 | .mode = 0644, | 849 | .mode = 0644, |
| 850 | .proc_handler = proc_dowatchdog, | 850 | .proc_handler = proc_watchdog, |
| 851 | .extra1 = &zero, | 851 | .extra1 = &zero, |
| 852 | .extra2 = &one, | 852 | .extra2 = &one, |
| 853 | }, | 853 | }, |
| @@ -856,11 +856,33 @@ static struct ctl_table kern_table[] = { | |||
| 856 | .data = &watchdog_thresh, | 856 | .data = &watchdog_thresh, |
| 857 | .maxlen = sizeof(int), | 857 | .maxlen = sizeof(int), |
| 858 | .mode = 0644, | 858 | .mode = 0644, |
| 859 | .proc_handler = proc_dowatchdog, | 859 | .proc_handler = proc_watchdog_thresh, |
| 860 | .extra1 = &zero, | 860 | .extra1 = &zero, |
| 861 | .extra2 = &sixty, | 861 | .extra2 = &sixty, |
| 862 | }, | 862 | }, |
| 863 | { | 863 | { |
| 864 | .procname = "nmi_watchdog", | ||
| 865 | .data = &nmi_watchdog_enabled, | ||
| 866 | .maxlen = sizeof (int), | ||
| 867 | .mode = 0644, | ||
| 868 | .proc_handler = proc_nmi_watchdog, | ||
| 869 | .extra1 = &zero, | ||
| 870 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | ||
| 871 | .extra2 = &one, | ||
| 872 | #else | ||
| 873 | .extra2 = &zero, | ||
| 874 | #endif | ||
| 875 | }, | ||
| 876 | { | ||
| 877 | .procname = "soft_watchdog", | ||
| 878 | .data = &soft_watchdog_enabled, | ||
| 879 | .maxlen = sizeof (int), | ||
| 880 | .mode = 0644, | ||
| 881 | .proc_handler = proc_soft_watchdog, | ||
| 882 | .extra1 = &zero, | ||
| 883 | .extra2 = &one, | ||
| 884 | }, | ||
| 885 | { | ||
| 864 | .procname = "softlockup_panic", | 886 | .procname = "softlockup_panic", |
| 865 | .data = &softlockup_panic, | 887 | .data = &softlockup_panic, |
| 866 | .maxlen = sizeof(int), | 888 | .maxlen = sizeof(int), |
| @@ -880,15 +902,6 @@ static struct ctl_table kern_table[] = { | |||
| 880 | .extra2 = &one, | 902 | .extra2 = &one, |
| 881 | }, | 903 | }, |
| 882 | #endif /* CONFIG_SMP */ | 904 | #endif /* CONFIG_SMP */ |
| 883 | { | ||
| 884 | .procname = "nmi_watchdog", | ||
| 885 | .data = &watchdog_user_enabled, | ||
| 886 | .maxlen = sizeof (int), | ||
| 887 | .mode = 0644, | ||
| 888 | .proc_handler = proc_dowatchdog, | ||
| 889 | .extra1 = &zero, | ||
| 890 | .extra2 = &one, | ||
| 891 | }, | ||
| 892 | #endif | 905 | #endif |
| 893 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 906 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
| 894 | { | 907 | { |
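Splitting the old proc_dowatchdog handler into proc_watchdog, proc_nmi_watchdog, proc_soft_watchdog and proc_watchdog_thresh gives each /proc/sys/kernel knob its own handler and backing integer, with extra1/extra2 clamping the accepted range — note how the nmi_watchdog maximum collapses to zero on kernels without a hard lockup detector. For reference, the general shape of such a table entry, with placeholder names and a generic handler:

    static int my_knob;
    static int zero, one = 1;

    static struct ctl_table my_table[] = {
        {
            .procname     = "my_knob",
            .data         = &my_knob,
            .maxlen       = sizeof(int),
            .mode         = 0644,
            .proc_handler = proc_dointvec_minmax,
            .extra1       = &zero,    /* minimum accepted value */
            .extra2       = &one,     /* maximum accepted value */
        },
        { }
    };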
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9a056f5bc02c..2316f50b07a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -24,8 +24,33 @@ | |||
| 24 | #include <linux/kvm_para.h> | 24 | #include <linux/kvm_para.h> |
| 25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
| 26 | 26 | ||
| 27 | int watchdog_user_enabled = 1; | 27 | /* |
| 28 | * The run state of the lockup detectors is controlled by the content of the | ||
| 29 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
| 30 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
| 31 | * | ||
| 32 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
| 33 | * are variables that are only used as an 'interface' between the parameters | ||
| 34 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
| 35 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
| 36 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
| 37 | * is equal zero. | ||
| 38 | */ | ||
| 39 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
| 40 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
| 41 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
| 42 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
| 43 | |||
| 44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 45 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | ||
| 46 | #else | ||
| 47 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | ||
| 48 | #endif | ||
| 49 | int __read_mostly nmi_watchdog_enabled; | ||
| 50 | int __read_mostly soft_watchdog_enabled; | ||
| 51 | int __read_mostly watchdog_user_enabled; | ||
| 28 | int __read_mostly watchdog_thresh = 10; | 52 | int __read_mostly watchdog_thresh = 10; |
| 53 | |||
| 29 | #ifdef CONFIG_SMP | 54 | #ifdef CONFIG_SMP |
| 30 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 55 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
| 31 | #else | 56 | #else |
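With the run state folded into a single watchdog_enabled bitmask, switching either detector on or off becomes a mask operation — hardlockup_detector_disable() further down is just a clear of the NMI bit, and the boot parameters below manipulate the same bits. A small sketch of how the mask is meant to be consumed (helper names are hypothetical):

    static inline bool nmi_watchdog_wanted(unsigned long enabled)
    {
        return enabled & NMI_WATCHDOG_ENABLED;
    }

    static inline unsigned long clear_nmi_watchdog(unsigned long enabled)
    {
        return enabled & ~NMI_WATCHDOG_ENABLED;   /* what hardlockup_detector_disable() does */
    }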
| @@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; | |||
| 58 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 83 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 59 | static int hardlockup_panic = | 84 | static int hardlockup_panic = |
| 60 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 85 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; |
| 61 | |||
| 62 | static bool hardlockup_detector_enabled = true; | ||
| 63 | /* | 86 | /* |
| 64 | * We may not want to enable hard lockup detection by default in all cases, | 87 | * We may not want to enable hard lockup detection by default in all cases, |
| 65 | * for example when running the kernel as a guest on a hypervisor. In these | 88 | * for example when running the kernel as a guest on a hypervisor. In these |
| @@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true; | |||
| 68 | * kernel command line parameters are parsed, because otherwise it is not | 91 | * kernel command line parameters are parsed, because otherwise it is not |
| 69 | * possible to override this in hardlockup_panic_setup(). | 92 | * possible to override this in hardlockup_panic_setup(). |
| 70 | */ | 93 | */ |
| 71 | void watchdog_enable_hardlockup_detector(bool val) | 94 | void hardlockup_detector_disable(void) |
| 72 | { | ||
| 73 | hardlockup_detector_enabled = val; | ||
| 74 | } | ||
| 75 | |||
| 76 | bool watchdog_hardlockup_detector_is_enabled(void) | ||
| 77 | { | 95 | { |
| 78 | return hardlockup_detector_enabled; | 96 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; |
| 79 | } | 97 | } |
| 80 | 98 | ||
| 81 | static int __init hardlockup_panic_setup(char *str) | 99 | static int __init hardlockup_panic_setup(char *str) |
| @@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str) | |||
| 85 | else if (!strncmp(str, "nopanic", 7)) | 103 | else if (!strncmp(str, "nopanic", 7)) |
| 86 | hardlockup_panic = 0; | 104 | hardlockup_panic = 0; |
| 87 | else if (!strncmp(str, "0", 1)) | 105 | else if (!strncmp(str, "0", 1)) |
| 88 | watchdog_user_enabled = 0; | 106 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; |
| 89 | else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { | 107 | else if (!strncmp(str, "1", 1)) |
| 90 | /* | 108 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; |
| 91 | * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) | ||
| 92 | * has the same effect. | ||
| 93 | */ | ||
| 94 | watchdog_user_enabled = 1; | ||
| 95 | watchdog_enable_hardlockup_detector(true); | ||
| 96 | } | ||
| 97 | return 1; | 109 | return 1; |
| 98 | } | 110 | } |
| 99 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 111 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
| @@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
| 112 | 124 | ||
| 113 | static int __init nowatchdog_setup(char *str) | 125 | static int __init nowatchdog_setup(char *str) |
| 114 | { | 126 | { |
| 115 | watchdog_user_enabled = 0; | 127 | watchdog_enabled = 0; |
| 116 | return 1; | 128 | return 1; |
| 117 | } | 129 | } |
| 118 | __setup("nowatchdog", nowatchdog_setup); | 130 | __setup("nowatchdog", nowatchdog_setup); |
| 119 | 131 | ||
| 120 | /* deprecated */ | ||
| 121 | static int __init nosoftlockup_setup(char *str) | 132 | static int __init nosoftlockup_setup(char *str) |
| 122 | { | 133 | { |
| 123 | watchdog_user_enabled = 0; | 134 | watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; |
| 124 | return 1; | 135 | return 1; |
| 125 | } | 136 | } |
| 126 | __setup("nosoftlockup", nosoftlockup_setup); | 137 | __setup("nosoftlockup", nosoftlockup_setup); |
| 127 | /* */ | 138 | |
| 128 | #ifdef CONFIG_SMP | 139 | #ifdef CONFIG_SMP |
| 129 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 140 | static int __init softlockup_all_cpu_backtrace_setup(char *str) |
| 130 | { | 141 | { |
| @@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 239 | { | 250 | { |
| 240 | unsigned long now = get_timestamp(); | 251 | unsigned long now = get_timestamp(); |
| 241 | 252 | ||
| 242 | /* Warn about unreasonable delays: */ | 253 | if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { |
| 243 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 254 | /* Warn about unreasonable delays. */ |
| 244 | return now - touch_ts; | 255 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
| 245 | 256 | return now - touch_ts; | |
| 257 | } | ||
| 246 | return 0; | 258 | return 0; |
| 247 | } | 259 | } |
| 248 | 260 | ||
| @@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu) | |||
| 477 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 489 | __this_cpu_write(soft_lockup_hrtimer_cnt, |
| 478 | __this_cpu_read(hrtimer_interrupts)); | 490 | __this_cpu_read(hrtimer_interrupts)); |
| 479 | __touch_watchdog(); | 491 | __touch_watchdog(); |
| 492 | |||
| 493 | /* | ||
| 494 | * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the | ||
| 495 | * failure path. Check for failures that can occur asynchronously - | ||
| 496 | * for example, when CPUs are on-lined - and shut down the hardware | ||
| 497 | * perf event on each CPU accordingly. | ||
| 498 | * | ||
| 499 | * The only non-obvious place this bit can be cleared is through | ||
| 500 | * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a | ||
| 501 | * pr_info here would be too noisy as it would result in a message | ||
| 502 | * every few seconds if the hardlockup was disabled but the softlockup | ||
| 503 | * enabled. | ||
| 504 | */ | ||
| 505 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
| 506 | watchdog_nmi_disable(cpu); | ||
| 480 | } | 507 | } |
| 481 | 508 | ||
| 482 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 509 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| @@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 492 | struct perf_event_attr *wd_attr; | 519 | struct perf_event_attr *wd_attr; |
| 493 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 520 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
| 494 | 521 | ||
| 495 | /* | 522 | /* nothing to do if the hard lockup detector is disabled */ |
| 496 | * Some kernels need to default hard lockup detection to | 523 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) |
| 497 | * 'disabled', for example a guest on a hypervisor. | 524 | goto out; |
| 498 | */ | ||
| 499 | if (!watchdog_hardlockup_detector_is_enabled()) { | ||
| 500 | event = ERR_PTR(-ENOENT); | ||
| 501 | goto handle_err; | ||
| 502 | } | ||
| 503 | 525 | ||
| 504 | /* is it already setup and enabled? */ | 526 | /* is it already setup and enabled? */ |
| 505 | if (event && event->state > PERF_EVENT_STATE_OFF) | 527 | if (event && event->state > PERF_EVENT_STATE_OFF) |
| @@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 515 | /* Try to register using hardware perf events */ | 537 | /* Try to register using hardware perf events */ |
| 516 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 538 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
| 517 | 539 | ||
| 518 | handle_err: | ||
| 519 | /* save cpu0 error for future comparision */ | 540 | /* save cpu0 error for future comparision */ |
| 520 | if (cpu == 0 && IS_ERR(event)) | 541 | if (cpu == 0 && IS_ERR(event)) |
| 521 | cpu0_err = PTR_ERR(event); | 542 | cpu0_err = PTR_ERR(event); |
| @@ -527,6 +548,18 @@ handle_err: | |||
| 527 | goto out_save; | 548 | goto out_save; |
| 528 | } | 549 | } |
| 529 | 550 | ||
| 551 | /* | ||
| 552 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
| 553 | * set up the hardware perf event. The watchdog() function checks | ||
| 554 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
| 555 | * | ||
| 556 | * The barriers are for syncing up watchdog_enabled across all the | ||
| 557 | * cpus, as clear_bit() does not use barriers. | ||
| 558 | */ | ||
| 559 | smp_mb__before_atomic(); | ||
| 560 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
| 561 | smp_mb__after_atomic(); | ||
| 562 | |||
| 530 | /* skip displaying the same error again */ | 563 | /* skip displaying the same error again */ |
| 531 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | 564 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) |
| 532 | return PTR_ERR(event); | 565 | return PTR_ERR(event); |
| @@ -540,6 +573,9 @@ handle_err: | |||
| 540 | else | 573 | else |
| 541 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 574 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |
| 542 | cpu, PTR_ERR(event)); | 575 | cpu, PTR_ERR(event)); |
| 576 | |||
| 577 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
| 578 | |||
| 543 | return PTR_ERR(event); | 579 | return PTR_ERR(event); |
| 544 | 580 | ||
| 545 | /* success path */ | 581 | /* success path */ |
| @@ -628,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info) | |||
| 628 | HRTIMER_MODE_REL_PINNED); | 664 | HRTIMER_MODE_REL_PINNED); |
| 629 | } | 665 | } |
| 630 | 666 | ||
| 631 | static void update_timers(int cpu) | 667 | static void update_watchdog(int cpu) |
| 632 | { | 668 | { |
| 633 | /* | 669 | /* |
| 634 | * Make sure that perf event counter will adopt to a new | 670 | * Make sure that perf event counter will adopt to a new |
| @@ -643,17 +679,17 @@ static void update_timers(int cpu) | |||
| 643 | watchdog_nmi_enable(cpu); | 679 | watchdog_nmi_enable(cpu); |
| 644 | } | 680 | } |
| 645 | 681 | ||
| 646 | static void update_timers_all_cpus(void) | 682 | static void update_watchdog_all_cpus(void) |
| 647 | { | 683 | { |
| 648 | int cpu; | 684 | int cpu; |
| 649 | 685 | ||
| 650 | get_online_cpus(); | 686 | get_online_cpus(); |
| 651 | for_each_online_cpu(cpu) | 687 | for_each_online_cpu(cpu) |
| 652 | update_timers(cpu); | 688 | update_watchdog(cpu); |
| 653 | put_online_cpus(); | 689 | put_online_cpus(); |
| 654 | } | 690 | } |
| 655 | 691 | ||
| 656 | static int watchdog_enable_all_cpus(bool sample_period_changed) | 692 | static int watchdog_enable_all_cpus(void) |
| 657 | { | 693 | { |
| 658 | int err = 0; | 694 | int err = 0; |
| 659 | 695 | ||
| @@ -663,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) | |||
| 663 | pr_err("Failed to create watchdog threads, disabled\n"); | 699 | pr_err("Failed to create watchdog threads, disabled\n"); |
| 664 | else | 700 | else |
| 665 | watchdog_running = 1; | 701 | watchdog_running = 1; |
| 666 | } else if (sample_period_changed) { | 702 | } else { |
| 667 | update_timers_all_cpus(); | 703 | /* |
| 704 | * Enable/disable the lockup detectors or | ||
| 705 | * change the sample period 'on the fly'. | ||
| 706 | */ | ||
| 707 | update_watchdog_all_cpus(); | ||
| 668 | } | 708 | } |
| 669 | 709 | ||
| 670 | return err; | 710 | return err; |
| @@ -682,48 +722,149 @@ static void watchdog_disable_all_cpus(void) | |||
| 682 | } | 722 | } |
| 683 | 723 | ||
| 684 | /* | 724 | /* |
| 685 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 725 | * Update the run state of the lockup detectors. |
| 726 | */ | ||
| 727 | static int proc_watchdog_update(void) | ||
| 728 | { | ||
| 729 | int err = 0; | ||
| 730 | |||
| 731 | /* | ||
| 732 | * Watchdog threads won't be started if they are already active. | ||
| 733 | * The 'watchdog_running' variable in watchdog_*_all_cpus() takes | ||
| 734 | * care of this. If those threads are already active, the sample | ||
| 735 | * period will be updated and the lockup detectors will be enabled | ||
| 736 | * or disabled 'on the fly'. | ||
| 737 | */ | ||
| 738 | if (watchdog_enabled && watchdog_thresh) | ||
| 739 | err = watchdog_enable_all_cpus(); | ||
| 740 | else | ||
| 741 | watchdog_disable_all_cpus(); | ||
| 742 | |||
| 743 | return err; | ||
| 744 | |||
| 745 | } | ||
| 746 | |||
| 747 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 748 | |||
| 749 | /* | ||
| 750 | * common function for watchdog, nmi_watchdog and soft_watchdog parameter | ||
| 751 | * | ||
| 752 | * caller | table->data points to | 'which' contains the flag(s) | ||
| 753 | * -------------------|-----------------------|----------------------------- | ||
| 754 | * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed | ||
| 755 | * | | with SOFT_WATCHDOG_ENABLED | ||
| 756 | * -------------------|-----------------------|----------------------------- | ||
| 757 | * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED | ||
| 758 | * -------------------|-----------------------|----------------------------- | ||
| 759 | * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED | ||
| 760 | */ | ||
| 761 | static int proc_watchdog_common(int which, struct ctl_table *table, int write, | ||
| 762 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 763 | { | ||
| 764 | int err, old, new; | ||
| 765 | int *watchdog_param = (int *)table->data; | ||
| 766 | |||
| 767 | mutex_lock(&watchdog_proc_mutex); | ||
| 768 | |||
| 769 | /* | ||
| 770 | * If the parameter is being read, return the state of the corresponding | ||
| 771 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the | ||
| 772 | * run state of the lockup detectors. | ||
| 773 | */ | ||
| 774 | if (!write) { | ||
| 775 | *watchdog_param = (watchdog_enabled & which) != 0; | ||
| 776 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 777 | } else { | ||
| 778 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 779 | if (err) | ||
| 780 | goto out; | ||
| 781 | |||
| 782 | /* | ||
| 783 | * There is a race window between fetching the current value | ||
| 784 | * from 'watchdog_enabled' and storing the new value. During | ||
| 785 | * this race window, watchdog_nmi_enable() can sneak in and | ||
| 786 | * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. | ||
| 787 | * The 'cmpxchg' detects this race and the loop retries. | ||
| 788 | */ | ||
| 789 | do { | ||
| 790 | old = watchdog_enabled; | ||
| 791 | /* | ||
| 792 | * If the parameter value is not zero set the | ||
| 793 | * corresponding bit(s), else clear it(them). | ||
| 794 | */ | ||
| 795 | if (*watchdog_param) | ||
| 796 | new = old | which; | ||
| 797 | else | ||
| 798 | new = old & ~which; | ||
| 799 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | ||
| 800 | |||
| 801 | /* | ||
| 802 | * Update the run state of the lockup detectors. | ||
| 803 | * Restore 'watchdog_enabled' on failure. | ||
| 804 | */ | ||
| 805 | err = proc_watchdog_update(); | ||
| 806 | if (err) | ||
| 807 | watchdog_enabled = old; | ||
| 808 | } | ||
| 809 | out: | ||
| 810 | mutex_unlock(&watchdog_proc_mutex); | ||
| 811 | return err; | ||
| 812 | } | ||
| 813 | |||
| 814 | /* | ||
| 815 | * /proc/sys/kernel/watchdog | ||
| 816 | */ | ||
| 817 | int proc_watchdog(struct ctl_table *table, int write, | ||
| 818 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 819 | { | ||
| 820 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, | ||
| 821 | table, write, buffer, lenp, ppos); | ||
| 822 | } | ||
| 823 | |||
| 824 | /* | ||
| 825 | * /proc/sys/kernel/nmi_watchdog | ||
| 686 | */ | 826 | */ |
| 827 | int proc_nmi_watchdog(struct ctl_table *table, int write, | ||
| 828 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 829 | { | ||
| 830 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED, | ||
| 831 | table, write, buffer, lenp, ppos); | ||
| 832 | } | ||
| 833 | |||
| 834 | /* | ||
| 835 | * /proc/sys/kernel/soft_watchdog | ||
| 836 | */ | ||
| 837 | int proc_soft_watchdog(struct ctl_table *table, int write, | ||
| 838 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 839 | { | ||
| 840 | return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, | ||
| 841 | table, write, buffer, lenp, ppos); | ||
| 842 | } | ||
| 687 | 843 | ||
| 688 | int proc_dowatchdog(struct ctl_table *table, int write, | 844 | /* |
| 689 | void __user *buffer, size_t *lenp, loff_t *ppos) | 845 | * /proc/sys/kernel/watchdog_thresh |
| 846 | */ | ||
| 847 | int proc_watchdog_thresh(struct ctl_table *table, int write, | ||
| 848 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 690 | { | 849 | { |
| 691 | int err, old_thresh, old_enabled; | 850 | int err, old; |
| 692 | bool old_hardlockup; | ||
| 693 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 694 | 851 | ||
| 695 | mutex_lock(&watchdog_proc_mutex); | 852 | mutex_lock(&watchdog_proc_mutex); |
| 696 | old_thresh = ACCESS_ONCE(watchdog_thresh); | ||
| 697 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | ||
| 698 | old_hardlockup = watchdog_hardlockup_detector_is_enabled(); | ||
| 699 | 853 | ||
| 854 | old = ACCESS_ONCE(watchdog_thresh); | ||
| 700 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 855 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 856 | |||
| 701 | if (err || !write) | 857 | if (err || !write) |
| 702 | goto out; | 858 | goto out; |
| 703 | 859 | ||
| 704 | set_sample_period(); | ||
| 705 | /* | 860 | /* |
| 706 | * Watchdog threads shouldn't be enabled if they are | 861 | * Update the sample period. |
| 707 | * disabled. The 'watchdog_running' variable check in | 862 | * Restore 'watchdog_thresh' on failure. |
| 708 | * watchdog_*_all_cpus() function takes care of this. | ||
| 709 | */ | 863 | */ |
| 710 | if (watchdog_user_enabled && watchdog_thresh) { | 864 | set_sample_period(); |
| 711 | /* | 865 | err = proc_watchdog_update(); |
| 712 | * Prevent a change in watchdog_thresh accidentally overriding | 866 | if (err) |
| 713 | * the enablement of the hardlockup detector. | 867 | watchdog_thresh = old; |
| 714 | */ | ||
| 715 | if (watchdog_user_enabled != old_enabled) | ||
| 716 | watchdog_enable_hardlockup_detector(true); | ||
| 717 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); | ||
| 718 | } else | ||
| 719 | watchdog_disable_all_cpus(); | ||
| 720 | |||
| 721 | /* Restore old values on failure */ | ||
| 722 | if (err) { | ||
| 723 | watchdog_thresh = old_thresh; | ||
| 724 | watchdog_user_enabled = old_enabled; | ||
| 725 | watchdog_enable_hardlockup_detector(old_hardlockup); | ||
| 726 | } | ||
| 727 | out: | 868 | out: |
| 728 | mutex_unlock(&watchdog_proc_mutex); | 869 | mutex_unlock(&watchdog_proc_mutex); |
| 729 | return err; | 870 | return err; |
| @@ -734,6 +875,6 @@ void __init lockup_detector_init(void) | |||
| 734 | { | 875 | { |
| 735 | set_sample_period(); | 876 | set_sample_period(); |
| 736 | 877 | ||
| 737 | if (watchdog_user_enabled) | 878 | if (watchdog_enabled) |
| 738 | watchdog_enable_all_cpus(false); | 879 | watchdog_enable_all_cpus(); |
| 739 | } | 880 | } |
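The cmpxchg() retry loop in proc_watchdog_common() above is the usual lock-free read-modify-write: snapshot the word, derive the new value, and retry if a concurrent writer (such as watchdog_nmi_enable() clearing NMI_WATCHDOG_ENABLED) slipped in between the load and the store. A minimal stand-alone sketch of the same pattern follows; the helper name is made up for illustration and is not part of the patch:

    /* illustrative only: set or clear flag bits in a shared word
     * without losing a concurrent update; retries on a lost race */
    static void update_flag_bits(unsigned long *word, unsigned long which,
                                 bool enable)
    {
            unsigned long old, new;

            do {
                    old = *word;                    /* snapshot           */
                    new = enable ? (old | which)    /* set the bit(s)     */
                                 : (old & ~which);  /* or clear them      */
            } while (cmpxchg(word, old, new) != old);
    }

In the patch itself the word is watchdog_enabled, and 'which' carries the NMI_WATCHDOG_ENABLED and/or SOFT_WATCHDOG_ENABLED bits handed in by the three proc handlers.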
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 93967e634a1e..17670573dda8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
| @@ -1760,6 +1760,18 @@ config TEST_UDELAY | |||
| 1760 | 1760 | ||
| 1761 | If unsure, say N. | 1761 | If unsure, say N. |
| 1762 | 1762 | ||
| 1763 | config MEMTEST | ||
| 1764 | bool "Memtest" | ||
| 1765 | depends on HAVE_MEMBLOCK | ||
| 1766 | ---help--- | ||
| 1767 | This option adds a kernel parameter 'memtest', which allows the | ||
| 1768 | number of memtest passes to be set. | ||
| 1769 | memtest=0, means disabled (default); | ||
| 1770 | memtest=1, means do 1 test pattern; | ||
| 1771 | ... | ||
| 1772 | memtest=17, means do 17 test patterns. | ||
| 1773 | If you are unsure how to answer this question, answer N. | ||
| 1774 | |||
| 1763 | source "samples/Kconfig" | 1775 | source "samples/Kconfig" |
| 1764 | 1776 | ||
| 1765 | source "lib/Kconfig.kgdb" | 1777 | source "lib/Kconfig.kgdb" |
diff --git a/lib/ioremap.c b/lib/ioremap.c index 0c9216c48762..86c8911b0e3a 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c | |||
| @@ -13,6 +13,43 @@ | |||
| 13 | #include <asm/cacheflush.h> | 13 | #include <asm/cacheflush.h> |
| 14 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
| 15 | 15 | ||
| 16 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP | ||
| 17 | static int __read_mostly ioremap_pud_capable; | ||
| 18 | static int __read_mostly ioremap_pmd_capable; | ||
| 19 | static int __read_mostly ioremap_huge_disabled; | ||
| 20 | |||
| 21 | static int __init set_nohugeiomap(char *str) | ||
| 22 | { | ||
| 23 | ioremap_huge_disabled = 1; | ||
| 24 | return 0; | ||
| 25 | } | ||
| 26 | early_param("nohugeiomap", set_nohugeiomap); | ||
| 27 | |||
| 28 | void __init ioremap_huge_init(void) | ||
| 29 | { | ||
| 30 | if (!ioremap_huge_disabled) { | ||
| 31 | if (arch_ioremap_pud_supported()) | ||
| 32 | ioremap_pud_capable = 1; | ||
| 33 | if (arch_ioremap_pmd_supported()) | ||
| 34 | ioremap_pmd_capable = 1; | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | static inline int ioremap_pud_enabled(void) | ||
| 39 | { | ||
| 40 | return ioremap_pud_capable; | ||
| 41 | } | ||
| 42 | |||
| 43 | static inline int ioremap_pmd_enabled(void) | ||
| 44 | { | ||
| 45 | return ioremap_pmd_capable; | ||
| 46 | } | ||
| 47 | |||
| 48 | #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ | ||
| 49 | static inline int ioremap_pud_enabled(void) { return 0; } | ||
| 50 | static inline int ioremap_pmd_enabled(void) { return 0; } | ||
| 51 | #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ | ||
| 52 | |||
| 16 | static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, | 53 | static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, |
| 17 | unsigned long end, phys_addr_t phys_addr, pgprot_t prot) | 54 | unsigned long end, phys_addr_t phys_addr, pgprot_t prot) |
| 18 | { | 55 | { |
| @@ -43,6 +80,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, | |||
| 43 | return -ENOMEM; | 80 | return -ENOMEM; |
| 44 | do { | 81 | do { |
| 45 | next = pmd_addr_end(addr, end); | 82 | next = pmd_addr_end(addr, end); |
| 83 | |||
| 84 | if (ioremap_pmd_enabled() && | ||
| 85 | ((next - addr) == PMD_SIZE) && | ||
| 86 | IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { | ||
| 87 | if (pmd_set_huge(pmd, phys_addr + addr, prot)) | ||
| 88 | continue; | ||
| 89 | } | ||
| 90 | |||
| 46 | if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) | 91 | if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) |
| 47 | return -ENOMEM; | 92 | return -ENOMEM; |
| 48 | } while (pmd++, addr = next, addr != end); | 93 | } while (pmd++, addr = next, addr != end); |
| @@ -61,6 +106,14 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, | |||
| 61 | return -ENOMEM; | 106 | return -ENOMEM; |
| 62 | do { | 107 | do { |
| 63 | next = pud_addr_end(addr, end); | 108 | next = pud_addr_end(addr, end); |
| 109 | |||
| 110 | if (ioremap_pud_enabled() && | ||
| 111 | ((next - addr) == PUD_SIZE) && | ||
| 112 | IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { | ||
| 113 | if (pud_set_huge(pud, phys_addr + addr, prot)) | ||
| 114 | continue; | ||
| 115 | } | ||
| 116 | |||
| 64 | if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) | 117 | if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) |
| 65 | return -ENOMEM; | 118 | return -ENOMEM; |
| 66 | } while (pud++, addr = next, addr != end); | 119 | } while (pud++, addr = next, addr != end); |
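Both ioremap hunks apply the same gate before trying a huge entry: the remaining virtual range must cover exactly one PMD (or PUD) and the physical address being mapped must have matching alignment; otherwise the walk falls through to the next level as before. A sketch of the PMD-level test pulled out into a helper; the helper itself is illustrative and not something the patch adds ('phys' stands in for the patch's 'phys_addr + addr'):

    /* illustrative helper: should this [addr, next) chunk be mapped
     * with a single huge PMD entry instead of a PTE page? */
    static bool want_pmd_mapping(unsigned long addr, unsigned long next,
                                 phys_addr_t phys)
    {
            return ioremap_pmd_enabled() &&         /* arch support, no nohugeiomap */
                   (next - addr) == PMD_SIZE &&     /* a full PMD worth of VA       */
                   IS_ALIGNED(phys, PMD_SIZE);      /* PA aligned the same way      */
    }

The PUD-level case is identical with PUD_SIZE, and pmd_set_huge()/pud_set_huge() may still decline, in which case the code continues with the regular page-table population.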
diff --git a/mm/Kconfig b/mm/Kconfig index a03131b6ba8e..390214da4546 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
| @@ -517,6 +517,12 @@ config CMA_DEBUG | |||
| 517 | processing calls such as dma_alloc_from_contiguous(). | 517 | processing calls such as dma_alloc_from_contiguous(). |
| 518 | This option does not affect warning and error messages. | 518 | This option does not affect warning and error messages. |
| 519 | 519 | ||
| 520 | config CMA_DEBUGFS | ||
| 521 | bool "CMA debugfs interface" | ||
| 522 | depends on CMA && DEBUG_FS | ||
| 523 | help | ||
| 524 | Turns on the DebugFS interface for CMA. | ||
| 525 | |||
| 520 | config CMA_AREAS | 526 | config CMA_AREAS |
| 521 | int "Maximum count of the CMA areas" | 527 | int "Maximum count of the CMA areas" |
| 522 | depends on CMA | 528 | depends on CMA |
diff --git a/mm/Makefile b/mm/Makefile index 15dbe9903c27..98c4eaeabdcb 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
| @@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | |||
| 55 | obj-$(CONFIG_KASAN) += kasan/ | 55 | obj-$(CONFIG_KASAN) += kasan/ |
| 56 | obj-$(CONFIG_FAILSLAB) += failslab.o | 56 | obj-$(CONFIG_FAILSLAB) += failslab.o |
| 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 58 | obj-$(CONFIG_MEMTEST) += memtest.o | ||
| 58 | obj-$(CONFIG_MIGRATION) += migrate.o | 59 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 59 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 60 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 60 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 61 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
| @@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | |||
| 76 | obj-$(CONFIG_CMA) += cma.o | 77 | obj-$(CONFIG_CMA) += cma.o |
| 77 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | 78 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o |
| 78 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o | 79 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o |
| 80 | obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o | ||
diff --git a/mm/cleancache.c b/mm/cleancache.c index 053bcd8f12fb..8fc50811119b 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | #include <linux/cleancache.h> | 19 | #include <linux/cleancache.h> |
| 20 | 20 | ||
| 21 | /* | 21 | /* |
| 22 | * cleancache_ops is set by cleancache_ops_register to contain the pointers | 22 | * cleancache_ops is set by cleancache_register_ops to contain the pointers |
| 23 | * to the cleancache "backend" implementation functions. | 23 | * to the cleancache "backend" implementation functions. |
| 24 | */ | 24 | */ |
| 25 | static struct cleancache_ops *cleancache_ops __read_mostly; | 25 | static struct cleancache_ops *cleancache_ops __read_mostly; |
| @@ -34,145 +34,107 @@ static u64 cleancache_failed_gets; | |||
| 34 | static u64 cleancache_puts; | 34 | static u64 cleancache_puts; |
| 35 | static u64 cleancache_invalidates; | 35 | static u64 cleancache_invalidates; |
| 36 | 36 | ||
| 37 | /* | 37 | static void cleancache_register_ops_sb(struct super_block *sb, void *unused) |
| 38 | * When no backend is registered all calls to init_fs and init_shared_fs | 38 | { |
| 39 | * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or | 39 | switch (sb->cleancache_poolid) { |
| 40 | * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array | 40 | case CLEANCACHE_NO_BACKEND: |
| 41 | * [shared_|]fs_poolid_map) are given to the respective super block | 41 | __cleancache_init_fs(sb); |
| 42 | * (sb->cleancache_poolid) and no tmem_pools are created. When a backend | 42 | break; |
| 43 | * registers with cleancache the previous calls to init_fs and init_shared_fs | 43 | case CLEANCACHE_NO_BACKEND_SHARED: |
| 44 | * are executed to create tmem_pools and set the respective poolids. While no | 44 | __cleancache_init_shared_fs(sb); |
| 45 | * backend is registered all "puts", "gets" and "flushes" are ignored or failed. | 45 | break; |
| 46 | */ | 46 | } |
| 47 | #define MAX_INITIALIZABLE_FS 32 | 47 | } |
| 48 | #define FAKE_FS_POOLID_OFFSET 1000 | ||
| 49 | #define FAKE_SHARED_FS_POOLID_OFFSET 2000 | ||
| 50 | |||
| 51 | #define FS_NO_BACKEND (-1) | ||
| 52 | #define FS_UNKNOWN (-2) | ||
| 53 | static int fs_poolid_map[MAX_INITIALIZABLE_FS]; | ||
| 54 | static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; | ||
| 55 | static char *uuids[MAX_INITIALIZABLE_FS]; | ||
| 56 | /* | ||
| 57 | * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads | ||
| 58 | * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple | ||
| 59 | * threads calling mount (and ending up in __cleancache_init_[shared|]fs). | ||
| 60 | */ | ||
| 61 | static DEFINE_MUTEX(poolid_mutex); | ||
| 62 | /* | ||
| 63 | * When set to false (default) all calls to the cleancache functions, except | ||
| 64 | * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded | ||
| 65 | * by the if (!cleancache_ops) return. This means multiple threads (from | ||
| 66 | * different filesystems) will be checking cleancache_ops. The usage of a | ||
| 67 | * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are | ||
| 68 | * OK if the time between the backend's have been initialized (and | ||
| 69 | * cleancache_ops has been set to not NULL) and when the filesystems start | ||
| 70 | * actually calling the backends. The inverse (when unloading) is obviously | ||
| 71 | * not good - but this shim does not do that (yet). | ||
| 72 | */ | ||
| 73 | |||
| 74 | /* | ||
| 75 | * The backends and filesystems work all asynchronously. This is b/c the | ||
| 76 | * backends can be built as modules. | ||
| 77 | * The usual sequence of events is: | ||
| 78 | * a) mount / -> __cleancache_init_fs is called. We set the | ||
| 79 | * [shared_|]fs_poolid_map and uuids for. | ||
| 80 | * | ||
| 81 | * b). user does I/Os -> we call the rest of __cleancache_* functions | ||
| 82 | * which return immediately as cleancache_ops is false. | ||
| 83 | * | ||
| 84 | * c). modprobe zcache -> cleancache_register_ops. We init the backend | ||
| 85 | * and set cleancache_ops to true, and for any fs_poolid_map | ||
| 86 | * (which is set by __cleancache_init_fs) we initialize the poolid. | ||
| 87 | * | ||
| 88 | * d). user does I/Os -> now that cleancache_ops is true all the | ||
| 89 | * __cleancache_* functions can call the backend. They all check | ||
| 90 | * that fs_poolid_map is valid and if so invoke the backend. | ||
| 91 | * | ||
| 92 | * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is | ||
| 93 | * reset (which is the second check in the __cleancache_* ops | ||
| 94 | * to call the backend). | ||
| 95 | * | ||
| 96 | * The sequence of event could also be c), followed by a), and d). and e). The | ||
| 97 | * c) would not happen anymore. There is also the chance of c), and one thread | ||
| 98 | * doing a) + d), and another doing e). For that case we depend on the | ||
| 99 | * filesystem calling __cleancache_invalidate_fs in the proper sequence (so | ||
| 100 | * that it handles all I/Os before it invalidates the fs (which is last part | ||
| 101 | * of unmounting process). | ||
| 102 | * | ||
| 103 | * Note: The acute reader will notice that there is no "rmmod zcache" case. | ||
| 104 | * This is b/c the functionality for that is not yet implemented and when | ||
| 105 | * done, will require some extra locking not yet devised. | ||
| 106 | */ | ||
| 107 | 48 | ||
| 108 | /* | 49 | /* |
| 109 | * Register operations for cleancache, returning previous thus allowing | 50 | * Register operations for cleancache. Returns 0 on success. |
| 110 | * detection of multiple backends and possible nesting. | ||
| 111 | */ | 51 | */ |
| 112 | struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) | 52 | int cleancache_register_ops(struct cleancache_ops *ops) |
| 113 | { | 53 | { |
| 114 | struct cleancache_ops *old = cleancache_ops; | 54 | if (cmpxchg(&cleancache_ops, NULL, ops)) |
| 115 | int i; | 55 | return -EBUSY; |
| 116 | 56 | ||
| 117 | mutex_lock(&poolid_mutex); | ||
| 118 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | ||
| 119 | if (fs_poolid_map[i] == FS_NO_BACKEND) | ||
| 120 | fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); | ||
| 121 | if (shared_fs_poolid_map[i] == FS_NO_BACKEND) | ||
| 122 | shared_fs_poolid_map[i] = ops->init_shared_fs | ||
| 123 | (uuids[i], PAGE_SIZE); | ||
| 124 | } | ||
| 125 | /* | 57 | /* |
| 126 | * We MUST set cleancache_ops _after_ we have called the backends | 58 | * A cleancache backend can be built as a module and hence loaded after |
| 127 | * init_fs or init_shared_fs functions. Otherwise the compiler might | 59 | * a cleancache enabled filesystem has called cleancache_init_fs. To |
| 128 | * re-order where cleancache_ops is set in this function. | 60 | * handle such a scenario, here we call ->init_fs or ->init_shared_fs |
| 61 | * for each active super block. To differentiate between local and | ||
| 62 | * shared filesystems, we temporarily initialize sb->cleancache_poolid | ||
| 63 | * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED | ||
| 64 | * respectively in case there is no backend registered at the time | ||
| 65 | * cleancache_init_fs or cleancache_init_shared_fs is called. | ||
| 66 | * | ||
| 67 | * Since filesystems can be mounted concurrently with cleancache | ||
| 68 | * backend registration, we have to be careful to guarantee that all | ||
| 69 | * cleancache enabled filesystems that has been mounted by the time | ||
| 70 | * cleancache_register_ops is called has got and all mounted later will | ||
| 71 | * get cleancache_poolid. This is assured by the following statements | ||
| 72 | * tied together: | ||
| 73 | * | ||
| 74 | * a) iterate_supers skips only those super blocks that have started | ||
| 75 | * ->kill_sb | ||
| 76 | * | ||
| 77 | * b) if iterate_supers encounters a super block that has not finished | ||
| 78 | * ->mount yet, it waits until it is finished | ||
| 79 | * | ||
| 80 | * c) cleancache_init_fs is called from ->mount and | ||
| 81 | * cleancache_invalidate_fs is called from ->kill_sb | ||
| 82 | * | ||
| 83 | * d) we call iterate_supers after cleancache_ops has been set | ||
| 84 | * | ||
| 85 | * From a) it follows that if iterate_supers skips a super block, then | ||
| 86 | * either the super block is already dead, in which case we do not need | ||
| 87 | * to bother initializing cleancache for it, or it was mounted after we | ||
| 88 | * initiated iterate_supers. In the latter case, it must have seen | ||
| 89 | * cleancache_ops set according to d) and initialized cleancache from | ||
| 90 | * ->mount by itself according to c). This proves that we call | ||
| 91 | * ->init_fs at least once for each active super block. | ||
| 92 | * | ||
| 93 | * From b) and c) it follows that if iterate_supers encounters a super | ||
| 94 | * block that has already started ->init_fs, it will wait until ->mount | ||
| 95 | * and hence ->init_fs has finished, then check cleancache_poolid, see | ||
| 96 | * that it has already been set and therefore do nothing. This proves | ||
| 97 | * that we call ->init_fs no more than once for each super block. | ||
| 98 | * | ||
| 99 | * Combined together, the last two paragraphs prove the function | ||
| 100 | * correctness. | ||
| 101 | * | ||
| 102 | * Note that various cleancache callbacks may proceed before this | ||
| 103 | * function is called or even concurrently with it, but since | ||
| 104 | * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop | ||
| 105 | * until the corresponding ->init_fs has been actually called and | ||
| 106 | * cleancache_ops has been set. | ||
| 129 | */ | 107 | */ |
| 130 | barrier(); | 108 | iterate_supers(cleancache_register_ops_sb, NULL); |
| 131 | cleancache_ops = ops; | 109 | return 0; |
| 132 | mutex_unlock(&poolid_mutex); | ||
| 133 | return old; | ||
| 134 | } | 110 | } |
| 135 | EXPORT_SYMBOL(cleancache_register_ops); | 111 | EXPORT_SYMBOL(cleancache_register_ops); |
| 136 | 112 | ||
| 137 | /* Called by a cleancache-enabled filesystem at time of mount */ | 113 | /* Called by a cleancache-enabled filesystem at time of mount */ |
| 138 | void __cleancache_init_fs(struct super_block *sb) | 114 | void __cleancache_init_fs(struct super_block *sb) |
| 139 | { | 115 | { |
| 140 | int i; | 116 | int pool_id = CLEANCACHE_NO_BACKEND; |
| 141 | 117 | ||
| 142 | mutex_lock(&poolid_mutex); | 118 | if (cleancache_ops) { |
| 143 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | 119 | pool_id = cleancache_ops->init_fs(PAGE_SIZE); |
| 144 | if (fs_poolid_map[i] == FS_UNKNOWN) { | 120 | if (pool_id < 0) |
| 145 | sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; | 121 | pool_id = CLEANCACHE_NO_POOL; |
| 146 | if (cleancache_ops) | ||
| 147 | fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); | ||
| 148 | else | ||
| 149 | fs_poolid_map[i] = FS_NO_BACKEND; | ||
| 150 | break; | ||
| 151 | } | ||
| 152 | } | 122 | } |
| 153 | mutex_unlock(&poolid_mutex); | 123 | sb->cleancache_poolid = pool_id; |
| 154 | } | 124 | } |
| 155 | EXPORT_SYMBOL(__cleancache_init_fs); | 125 | EXPORT_SYMBOL(__cleancache_init_fs); |
| 156 | 126 | ||
| 157 | /* Called by a cleancache-enabled clustered filesystem at time of mount */ | 127 | /* Called by a cleancache-enabled clustered filesystem at time of mount */ |
| 158 | void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) | 128 | void __cleancache_init_shared_fs(struct super_block *sb) |
| 159 | { | 129 | { |
| 160 | int i; | 130 | int pool_id = CLEANCACHE_NO_BACKEND_SHARED; |
| 161 | 131 | ||
| 162 | mutex_lock(&poolid_mutex); | 132 | if (cleancache_ops) { |
| 163 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | 133 | pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); |
| 164 | if (shared_fs_poolid_map[i] == FS_UNKNOWN) { | 134 | if (pool_id < 0) |
| 165 | sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; | 135 | pool_id = CLEANCACHE_NO_POOL; |
| 166 | uuids[i] = uuid; | ||
| 167 | if (cleancache_ops) | ||
| 168 | shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs | ||
| 169 | (uuid, PAGE_SIZE); | ||
| 170 | else | ||
| 171 | shared_fs_poolid_map[i] = FS_NO_BACKEND; | ||
| 172 | break; | ||
| 173 | } | ||
| 174 | } | 136 | } |
| 175 | mutex_unlock(&poolid_mutex); | 137 | sb->cleancache_poolid = pool_id; |
| 176 | } | 138 | } |
| 177 | EXPORT_SYMBOL(__cleancache_init_shared_fs); | 139 | EXPORT_SYMBOL(__cleancache_init_shared_fs); |
| 178 | 140 | ||
| @@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode, | |||
| 202 | } | 164 | } |
| 203 | 165 | ||
| 204 | /* | 166 | /* |
| 205 | * Returns a pool_id that is associated with a given fake poolid. | ||
| 206 | */ | ||
| 207 | static int get_poolid_from_fake(int fake_pool_id) | ||
| 208 | { | ||
| 209 | if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) | ||
| 210 | return shared_fs_poolid_map[fake_pool_id - | ||
| 211 | FAKE_SHARED_FS_POOLID_OFFSET]; | ||
| 212 | else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) | ||
| 213 | return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; | ||
| 214 | return FS_NO_BACKEND; | ||
| 215 | } | ||
| 216 | |||
| 217 | /* | ||
| 218 | * "Get" data from cleancache associated with the poolid/inode/index | 167 | * "Get" data from cleancache associated with the poolid/inode/index |
| 219 | * that were specified when the data was put to cleanache and, if | 168 | * that were specified when the data was put to cleanache and, if |
| 220 | * successful, use it to fill the specified page with data and return 0. | 169 | * successful, use it to fill the specified page with data and return 0. |
| @@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page) | |||
| 229 | { | 178 | { |
| 230 | int ret = -1; | 179 | int ret = -1; |
| 231 | int pool_id; | 180 | int pool_id; |
| 232 | int fake_pool_id; | ||
| 233 | struct cleancache_filekey key = { .u.key = { 0 } }; | 181 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 234 | 182 | ||
| 235 | if (!cleancache_ops) { | 183 | if (!cleancache_ops) { |
| @@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page) | |||
| 238 | } | 186 | } |
| 239 | 187 | ||
| 240 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 188 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 241 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; | 189 | pool_id = page->mapping->host->i_sb->cleancache_poolid; |
| 242 | if (fake_pool_id < 0) | 190 | if (pool_id < 0) |
| 243 | goto out; | 191 | goto out; |
| 244 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 245 | 192 | ||
| 246 | if (cleancache_get_key(page->mapping->host, &key) < 0) | 193 | if (cleancache_get_key(page->mapping->host, &key) < 0) |
| 247 | goto out; | 194 | goto out; |
| 248 | 195 | ||
| 249 | if (pool_id >= 0) | 196 | ret = cleancache_ops->get_page(pool_id, key, page->index, page); |
| 250 | ret = cleancache_ops->get_page(pool_id, | ||
| 251 | key, page->index, page); | ||
| 252 | if (ret == 0) | 197 | if (ret == 0) |
| 253 | cleancache_succ_gets++; | 198 | cleancache_succ_gets++; |
| 254 | else | 199 | else |
| @@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); | |||
| 271 | void __cleancache_put_page(struct page *page) | 216 | void __cleancache_put_page(struct page *page) |
| 272 | { | 217 | { |
| 273 | int pool_id; | 218 | int pool_id; |
| 274 | int fake_pool_id; | ||
| 275 | struct cleancache_filekey key = { .u.key = { 0 } }; | 219 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 276 | 220 | ||
| 277 | if (!cleancache_ops) { | 221 | if (!cleancache_ops) { |
| @@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page) | |||
| 280 | } | 224 | } |
| 281 | 225 | ||
| 282 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 226 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 283 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; | 227 | pool_id = page->mapping->host->i_sb->cleancache_poolid; |
| 284 | if (fake_pool_id < 0) | ||
| 285 | return; | ||
| 286 | |||
| 287 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 288 | |||
| 289 | if (pool_id >= 0 && | 228 | if (pool_id >= 0 && |
| 290 | cleancache_get_key(page->mapping->host, &key) >= 0) { | 229 | cleancache_get_key(page->mapping->host, &key) >= 0) { |
| 291 | cleancache_ops->put_page(pool_id, key, page->index, page); | 230 | cleancache_ops->put_page(pool_id, key, page->index, page); |
| @@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, | |||
| 306 | struct page *page) | 245 | struct page *page) |
| 307 | { | 246 | { |
| 308 | /* careful... page->mapping is NULL sometimes when this is called */ | 247 | /* careful... page->mapping is NULL sometimes when this is called */ |
| 309 | int pool_id; | 248 | int pool_id = mapping->host->i_sb->cleancache_poolid; |
| 310 | int fake_pool_id = mapping->host->i_sb->cleancache_poolid; | ||
| 311 | struct cleancache_filekey key = { .u.key = { 0 } }; | 249 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 312 | 250 | ||
| 313 | if (!cleancache_ops) | 251 | if (!cleancache_ops) |
| 314 | return; | 252 | return; |
| 315 | 253 | ||
| 316 | if (fake_pool_id >= 0) { | 254 | if (pool_id >= 0) { |
| 317 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 318 | if (pool_id < 0) | ||
| 319 | return; | ||
| 320 | |||
| 321 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 255 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 322 | if (cleancache_get_key(mapping->host, &key) >= 0) { | 256 | if (cleancache_get_key(mapping->host, &key) >= 0) { |
| 323 | cleancache_ops->invalidate_page(pool_id, | 257 | cleancache_ops->invalidate_page(pool_id, |
| @@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); | |||
| 339 | */ | 273 | */ |
| 340 | void __cleancache_invalidate_inode(struct address_space *mapping) | 274 | void __cleancache_invalidate_inode(struct address_space *mapping) |
| 341 | { | 275 | { |
| 342 | int pool_id; | 276 | int pool_id = mapping->host->i_sb->cleancache_poolid; |
| 343 | int fake_pool_id = mapping->host->i_sb->cleancache_poolid; | ||
| 344 | struct cleancache_filekey key = { .u.key = { 0 } }; | 277 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 345 | 278 | ||
| 346 | if (!cleancache_ops) | 279 | if (!cleancache_ops) |
| 347 | return; | 280 | return; |
| 348 | 281 | ||
| 349 | if (fake_pool_id < 0) | ||
| 350 | return; | ||
| 351 | |||
| 352 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 353 | |||
| 354 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) | 282 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) |
| 355 | cleancache_ops->invalidate_inode(pool_id, key); | 283 | cleancache_ops->invalidate_inode(pool_id, key); |
| 356 | } | 284 | } |
| @@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); | |||
| 363 | */ | 291 | */ |
| 364 | void __cleancache_invalidate_fs(struct super_block *sb) | 292 | void __cleancache_invalidate_fs(struct super_block *sb) |
| 365 | { | 293 | { |
| 366 | int index; | 294 | int pool_id; |
| 367 | int fake_pool_id = sb->cleancache_poolid; | ||
| 368 | int old_poolid = fake_pool_id; | ||
| 369 | 295 | ||
| 370 | mutex_lock(&poolid_mutex); | 296 | pool_id = sb->cleancache_poolid; |
| 371 | if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { | 297 | sb->cleancache_poolid = CLEANCACHE_NO_POOL; |
| 372 | index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; | 298 | |
| 373 | old_poolid = shared_fs_poolid_map[index]; | 299 | if (cleancache_ops && pool_id >= 0) |
| 374 | shared_fs_poolid_map[index] = FS_UNKNOWN; | 300 | cleancache_ops->invalidate_fs(pool_id); |
| 375 | uuids[index] = NULL; | ||
| 376 | } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { | ||
| 377 | index = fake_pool_id - FAKE_FS_POOLID_OFFSET; | ||
| 378 | old_poolid = fs_poolid_map[index]; | ||
| 379 | fs_poolid_map[index] = FS_UNKNOWN; | ||
| 380 | } | ||
| 381 | sb->cleancache_poolid = -1; | ||
| 382 | if (cleancache_ops) | ||
| 383 | cleancache_ops->invalidate_fs(old_poolid); | ||
| 384 | mutex_unlock(&poolid_mutex); | ||
| 385 | } | 301 | } |
| 386 | EXPORT_SYMBOL(__cleancache_invalidate_fs); | 302 | EXPORT_SYMBOL(__cleancache_invalidate_fs); |
| 387 | 303 | ||
| 388 | static int __init init_cleancache(void) | 304 | static int __init init_cleancache(void) |
| 389 | { | 305 | { |
| 390 | int i; | ||
| 391 | |||
| 392 | #ifdef CONFIG_DEBUG_FS | 306 | #ifdef CONFIG_DEBUG_FS |
| 393 | struct dentry *root = debugfs_create_dir("cleancache", NULL); | 307 | struct dentry *root = debugfs_create_dir("cleancache", NULL); |
| 394 | if (root == NULL) | 308 | if (root == NULL) |
| @@ -400,10 +314,6 @@ static int __init init_cleancache(void) | |||
| 400 | debugfs_create_u64("invalidates", S_IRUGO, | 314 | debugfs_create_u64("invalidates", S_IRUGO, |
| 401 | root, &cleancache_invalidates); | 315 | root, &cleancache_invalidates); |
| 402 | #endif | 316 | #endif |
| 403 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | ||
| 404 | fs_poolid_map[i] = FS_UNKNOWN; | ||
| 405 | shared_fs_poolid_map[i] = FS_UNKNOWN; | ||
| 406 | } | ||
| 407 | return 0; | 317 | return 0; |
| 408 | } | 318 | } |
| 409 | module_init(init_cleancache) | 319 | module_init(init_cleancache) |
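With the fake-poolid indirection gone, a backend hands its ops to cleancache_register_ops() once and checks the return value; -EBUSY now means another backend already won the cmpxchg(). A hypothetical backend module's init path might look like the sketch below (my_backend_ops and the module name are stand-ins, not part of this patch):

    /* hypothetical backend: my_backend_ops is a fully populated
     * struct cleancache_ops defined elsewhere in the module */
    static int __init my_backend_init(void)
    {
            int err;

            err = cleancache_register_ops(&my_backend_ops);
            if (err) {
                    pr_err("my_backend: a cleancache backend is already registered (%d)\n",
                           err);
                    return err;
            }

            /*
             * By the time this returns, the core has walked the active
             * super blocks via iterate_supers() and issued ->init_fs /
             * ->init_shared_fs for filesystems mounted before the module
             * was loaded.
             */
            return 0;
    }
    module_init(my_backend_init);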
| @@ -35,29 +35,24 @@ | |||
| 35 | #include <linux/highmem.h> | 35 | #include <linux/highmem.h> |
| 36 | #include <linux/io.h> | 36 | #include <linux/io.h> |
| 37 | 37 | ||
| 38 | struct cma { | 38 | #include "cma.h" |
| 39 | unsigned long base_pfn; | 39 | |
| 40 | unsigned long count; | 40 | struct cma cma_areas[MAX_CMA_AREAS]; |
| 41 | unsigned long *bitmap; | 41 | unsigned cma_area_count; |
| 42 | unsigned int order_per_bit; /* Order of pages represented by one bit */ | ||
| 43 | struct mutex lock; | ||
| 44 | }; | ||
| 45 | |||
| 46 | static struct cma cma_areas[MAX_CMA_AREAS]; | ||
| 47 | static unsigned cma_area_count; | ||
| 48 | static DEFINE_MUTEX(cma_mutex); | 42 | static DEFINE_MUTEX(cma_mutex); |
| 49 | 43 | ||
| 50 | phys_addr_t cma_get_base(struct cma *cma) | 44 | phys_addr_t cma_get_base(const struct cma *cma) |
| 51 | { | 45 | { |
| 52 | return PFN_PHYS(cma->base_pfn); | 46 | return PFN_PHYS(cma->base_pfn); |
| 53 | } | 47 | } |
| 54 | 48 | ||
| 55 | unsigned long cma_get_size(struct cma *cma) | 49 | unsigned long cma_get_size(const struct cma *cma) |
| 56 | { | 50 | { |
| 57 | return cma->count << PAGE_SHIFT; | 51 | return cma->count << PAGE_SHIFT; |
| 58 | } | 52 | } |
| 59 | 53 | ||
| 60 | static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) | 54 | static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, |
| 55 | int align_order) | ||
| 61 | { | 56 | { |
| 62 | if (align_order <= cma->order_per_bit) | 57 | if (align_order <= cma->order_per_bit) |
| 63 | return 0; | 58 | return 0; |
| @@ -68,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) | |||
| 68 | * Find a PFN aligned to the specified order and return an offset represented in | 63 | * Find a PFN aligned to the specified order and return an offset represented in |
| 69 | * order_per_bits. | 64 | * order_per_bits. |
| 70 | */ | 65 | */ |
| 71 | static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) | 66 | static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, |
| 67 | int align_order) | ||
| 72 | { | 68 | { |
| 73 | if (align_order <= cma->order_per_bit) | 69 | if (align_order <= cma->order_per_bit) |
| 74 | return 0; | 70 | return 0; |
| @@ -77,18 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) | |||
| 77 | - cma->base_pfn) >> cma->order_per_bit; | 73 | - cma->base_pfn) >> cma->order_per_bit; |
| 78 | } | 74 | } |
| 79 | 75 | ||
| 80 | static unsigned long cma_bitmap_maxno(struct cma *cma) | 76 | static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, |
| 81 | { | 77 | unsigned long pages) |
| 82 | return cma->count >> cma->order_per_bit; | ||
| 83 | } | ||
| 84 | |||
| 85 | static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, | ||
| 86 | unsigned long pages) | ||
| 87 | { | 78 | { |
| 88 | return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; | 79 | return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; |
| 89 | } | 80 | } |
| 90 | 81 | ||
| 91 | static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) | 82 | static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, |
| 83 | unsigned int count) | ||
| 92 | { | 84 | { |
| 93 | unsigned long bitmap_no, bitmap_count; | 85 | unsigned long bitmap_no, bitmap_count; |
| 94 | 86 | ||
| @@ -134,6 +126,12 @@ static int __init cma_activate_area(struct cma *cma) | |||
| 134 | } while (--i); | 126 | } while (--i); |
| 135 | 127 | ||
| 136 | mutex_init(&cma->lock); | 128 | mutex_init(&cma->lock); |
| 129 | |||
| 130 | #ifdef CONFIG_CMA_DEBUGFS | ||
| 131 | INIT_HLIST_HEAD(&cma->mem_head); | ||
| 132 | spin_lock_init(&cma->mem_head_lock); | ||
| 133 | #endif | ||
| 134 | |||
| 137 | return 0; | 135 | return 0; |
| 138 | 136 | ||
| 139 | err: | 137 | err: |
| @@ -167,7 +165,8 @@ core_initcall(cma_init_reserved_areas); | |||
| 167 | * This function creates custom contiguous area from already reserved memory. | 165 | * This function creates custom contiguous area from already reserved memory. |
| 168 | */ | 166 | */ |
| 169 | int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | 167 | int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, |
| 170 | int order_per_bit, struct cma **res_cma) | 168 | unsigned int order_per_bit, |
| 169 | struct cma **res_cma) | ||
| 171 | { | 170 | { |
| 172 | struct cma *cma; | 171 | struct cma *cma; |
| 173 | phys_addr_t alignment; | 172 | phys_addr_t alignment; |
| @@ -358,7 +357,7 @@ err: | |||
| 358 | * This function allocates part of contiguous memory on specific | 357 | * This function allocates part of contiguous memory on specific |
| 359 | * contiguous memory area. | 358 | * contiguous memory area. |
| 360 | */ | 359 | */ |
| 361 | struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | 360 | struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) |
| 362 | { | 361 | { |
| 363 | unsigned long mask, offset, pfn, start = 0; | 362 | unsigned long mask, offset, pfn, start = 0; |
| 364 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; | 363 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; |
| @@ -429,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | |||
| 429 | * It returns false when provided pages do not belong to contiguous area and | 428 | * It returns false when provided pages do not belong to contiguous area and |
| 430 | * true otherwise. | 429 | * true otherwise. |
| 431 | */ | 430 | */ |
| 432 | bool cma_release(struct cma *cma, struct page *pages, int count) | 431 | bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) |
| 433 | { | 432 | { |
| 434 | unsigned long pfn; | 433 | unsigned long pfn; |
| 435 | 434 | ||
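The cma.c changes above are type tightening (const pointers, unsigned counts) rather than behavioural; callers use the API exactly as before. A driver-side sketch against the new prototypes; the area pointer, sizes and the smoke-test framing are hypothetical:

    /* illustrative caller: grab 16 contiguous pages from a CMA area
     * (alignment given as a page order) and release them again */
    static int cma_smoke_test(struct cma *area)
    {
            const unsigned int nr_pages = 16;
            struct page *pages;

            pages = cma_alloc(area, nr_pages, get_order(SZ_64K));
            if (!pages)
                    return -ENOMEM;

            /* ... use the physically contiguous buffer ... */

            if (!cma_release(area, pages, nr_pages))
                    pr_warn("cma_smoke_test: pages were not from this area\n");

            return 0;
    }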
diff --git a/mm/cma.h b/mm/cma.h new file mode 100644 index 000000000000..1132d733556d --- /dev/null +++ b/mm/cma.h | |||
| @@ -0,0 +1,24 @@ | |||
| 1 | #ifndef __MM_CMA_H__ | ||
| 2 | #define __MM_CMA_H__ | ||
| 3 | |||
| 4 | struct cma { | ||
| 5 | unsigned long base_pfn; | ||
| 6 | unsigned long count; | ||
| 7 | unsigned long *bitmap; | ||
| 8 | unsigned int order_per_bit; /* Order of pages represented by one bit */ | ||
| 9 | struct mutex lock; | ||
| 10 | #ifdef CONFIG_CMA_DEBUGFS | ||
| 11 | struct hlist_head mem_head; | ||
| 12 | spinlock_t mem_head_lock; | ||
| 13 | #endif | ||
| 14 | }; | ||
| 15 | |||
| 16 | extern struct cma cma_areas[MAX_CMA_AREAS]; | ||
| 17 | extern unsigned cma_area_count; | ||
| 18 | |||
| 19 | static unsigned long cma_bitmap_maxno(struct cma *cma) | ||
| 20 | { | ||
| 21 | return cma->count >> cma->order_per_bit; | ||
| 22 | } | ||
| 23 | |||
| 24 | #endif | ||
diff --git a/mm/cma_debug.c b/mm/cma_debug.c new file mode 100644 index 000000000000..0b377536ccde --- /dev/null +++ b/mm/cma_debug.c | |||
| @@ -0,0 +1,170 @@ | |||
| 1 | /* | ||
| 2 | * CMA DebugFS Interface | ||
| 3 | * | ||
| 4 | * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com> | ||
| 5 | */ | ||
| 6 | |||
| 7 | |||
| 8 | #include <linux/debugfs.h> | ||
| 9 | #include <linux/cma.h> | ||
| 10 | #include <linux/list.h> | ||
| 11 | #include <linux/kernel.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/mm_types.h> | ||
| 14 | |||
| 15 | #include "cma.h" | ||
| 16 | |||
| 17 | struct cma_mem { | ||
| 18 | struct hlist_node node; | ||
| 19 | struct page *p; | ||
| 20 | unsigned long n; | ||
| 21 | }; | ||
| 22 | |||
| 23 | static struct dentry *cma_debugfs_root; | ||
| 24 | |||
| 25 | static int cma_debugfs_get(void *data, u64 *val) | ||
| 26 | { | ||
| 27 | unsigned long *p = data; | ||
| 28 | |||
| 29 | *val = *p; | ||
| 30 | |||
| 31 | return 0; | ||
| 32 | } | ||
| 33 | |||
| 34 | DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); | ||
| 35 | |||
| 36 | static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) | ||
| 37 | { | ||
| 38 | spin_lock(&cma->mem_head_lock); | ||
| 39 | hlist_add_head(&mem->node, &cma->mem_head); | ||
| 40 | spin_unlock(&cma->mem_head_lock); | ||
| 41 | } | ||
| 42 | |||
| 43 | static struct cma_mem *cma_get_entry_from_list(struct cma *cma) | ||
| 44 | { | ||
| 45 | struct cma_mem *mem = NULL; | ||
| 46 | |||
| 47 | spin_lock(&cma->mem_head_lock); | ||
| 48 | if (!hlist_empty(&cma->mem_head)) { | ||
| 49 | mem = hlist_entry(cma->mem_head.first, struct cma_mem, node); | ||
| 50 | hlist_del_init(&mem->node); | ||
| 51 | } | ||
| 52 | spin_unlock(&cma->mem_head_lock); | ||
| 53 | |||
| 54 | return mem; | ||
| 55 | } | ||
| 56 | |||
| 57 | static int cma_free_mem(struct cma *cma, int count) | ||
| 58 | { | ||
| 59 | struct cma_mem *mem = NULL; | ||
| 60 | |||
| 61 | while (count) { | ||
| 62 | mem = cma_get_entry_from_list(cma); | ||
| 63 | if (mem == NULL) | ||
| 64 | return 0; | ||
| 65 | |||
| 66 | if (mem->n <= count) { | ||
| 67 | cma_release(cma, mem->p, mem->n); | ||
| 68 | count -= mem->n; | ||
| 69 | kfree(mem); | ||
| 70 | } else if (cma->order_per_bit == 0) { | ||
| 71 | cma_release(cma, mem->p, count); | ||
| 72 | mem->p += count; | ||
| 73 | mem->n -= count; | ||
| 74 | count = 0; | ||
| 75 | cma_add_to_cma_mem_list(cma, mem); | ||
| 76 | } else { | ||
| 77 | pr_debug("cma: cannot release partial block when order_per_bit != 0\n"); | ||
| 78 | cma_add_to_cma_mem_list(cma, mem); | ||
| 79 | break; | ||
| 80 | } | ||
| 81 | } | ||
| 82 | |||
| 83 | return 0; | ||
| 84 | |||
| 85 | } | ||
| 86 | |||
| 87 | static int cma_free_write(void *data, u64 val) | ||
| 88 | { | ||
| 89 | int pages = val; | ||
| 90 | struct cma *cma = data; | ||
| 91 | |||
| 92 | return cma_free_mem(cma, pages); | ||
| 93 | } | ||
| 94 | |||
| 95 | DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); | ||
| 96 | |||
| 97 | static int cma_alloc_mem(struct cma *cma, int count) | ||
| 98 | { | ||
| 99 | struct cma_mem *mem; | ||
| 100 | struct page *p; | ||
| 101 | |||
| 102 | mem = kzalloc(sizeof(*mem), GFP_KERNEL); | ||
| 103 | if (!mem) | ||
| 104 | return -ENOMEM; | ||
| 105 | |||
| 106 | p = cma_alloc(cma, count, 0); | ||
| 107 | if (!p) { | ||
| 108 | kfree(mem); | ||
| 109 | return -ENOMEM; | ||
| 110 | } | ||
| 111 | |||
| 112 | mem->p = p; | ||
| 113 | mem->n = count; | ||
| 114 | |||
| 115 | cma_add_to_cma_mem_list(cma, mem); | ||
| 116 | |||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | |||
| 120 | static int cma_alloc_write(void *data, u64 val) | ||
| 121 | { | ||
| 122 | int pages = val; | ||
| 123 | struct cma *cma = data; | ||
| 124 | |||
| 125 | return cma_alloc_mem(cma, pages); | ||
| 126 | } | ||
| 127 | |||
| 128 | DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); | ||
| 129 | |||
| 130 | static void cma_debugfs_add_one(struct cma *cma, int idx) | ||
| 131 | { | ||
| 132 | struct dentry *tmp; | ||
| 133 | char name[16]; | ||
| 134 | int u32s; | ||
| 135 | |||
| 136 | sprintf(name, "cma-%d", idx); | ||
| 137 | |||
| 138 | tmp = debugfs_create_dir(name, cma_debugfs_root); | ||
| 139 | |||
| 140 | debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, | ||
| 141 | &cma_alloc_fops); | ||
| 142 | |||
| 143 | debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma, | ||
| 144 | &cma_free_fops); | ||
| 145 | |||
| 146 | debugfs_create_file("base_pfn", S_IRUGO, tmp, | ||
| 147 | &cma->base_pfn, &cma_debugfs_fops); | ||
| 148 | debugfs_create_file("count", S_IRUGO, tmp, | ||
| 149 | &cma->count, &cma_debugfs_fops); | ||
| 150 | debugfs_create_file("order_per_bit", S_IRUGO, tmp, | ||
| 151 | &cma->order_per_bit, &cma_debugfs_fops); | ||
| 152 | |||
| 153 | u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); | ||
| 154 | debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); | ||
| 155 | } | ||
| 156 | |||
| 157 | static int __init cma_debugfs_init(void) | ||
| 158 | { | ||
| 159 | int i; | ||
| 160 | |||
| 161 | cma_debugfs_root = debugfs_create_dir("cma", NULL); | ||
| 162 | if (!cma_debugfs_root) | ||
| 163 | return -ENOMEM; | ||
| 164 | |||
| 165 | for (i = 0; i < cma_area_count; i++) | ||
| 166 | cma_debugfs_add_one(&cma_areas[i], i); | ||
| 167 | |||
| 168 | return 0; | ||
| 169 | } | ||
| 170 | late_initcall(cma_debugfs_init); | ||
diff --git a/mm/compaction.c b/mm/compaction.c index 8c0d9459b54a..a18201a8124e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
| @@ -1174,13 +1174,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, | |||
| 1174 | /* Direct compactor: Is a suitable page free? */ | 1174 | /* Direct compactor: Is a suitable page free? */ |
| 1175 | for (order = cc->order; order < MAX_ORDER; order++) { | 1175 | for (order = cc->order; order < MAX_ORDER; order++) { |
| 1176 | struct free_area *area = &zone->free_area[order]; | 1176 | struct free_area *area = &zone->free_area[order]; |
| 1177 | bool can_steal; | ||
| 1177 | 1178 | ||
| 1178 | /* Job done if page is free of the right migratetype */ | 1179 | /* Job done if page is free of the right migratetype */ |
| 1179 | if (!list_empty(&area->free_list[migratetype])) | 1180 | if (!list_empty(&area->free_list[migratetype])) |
| 1180 | return COMPACT_PARTIAL; | 1181 | return COMPACT_PARTIAL; |
| 1181 | 1182 | ||
| 1182 | /* Job done if allocation would set block type */ | 1183 | #ifdef CONFIG_CMA |
| 1183 | if (order >= pageblock_order && area->nr_free) | 1184 | /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ |
| 1185 | if (migratetype == MIGRATE_MOVABLE && | ||
| 1186 | !list_empty(&area->free_list[MIGRATE_CMA])) | ||
| 1187 | return COMPACT_PARTIAL; | ||
| 1188 | #endif | ||
| 1189 | /* | ||
| 1190 | * Job done if allocation would steal freepages from | ||
| 1191 | * other migratetype buddy lists. | ||
| 1192 | */ | ||
| 1193 | if (find_suitable_fallback(area, order, migratetype, | ||
| 1194 | true, &can_steal) != -1) | ||
| 1184 | return COMPACT_PARTIAL; | 1195 | return COMPACT_PARTIAL; |
| 1185 | } | 1196 | } |
| 1186 | 1197 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index 876f4e6f3ed6..12548d03c11d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -202,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
| 202 | BUG_ON(page_mapped(page)); | 202 | BUG_ON(page_mapped(page)); |
| 203 | 203 | ||
| 204 | /* | 204 | /* |
| 205 | * Some filesystems seem to re-dirty the page even after | 205 | * At this point page must be either written or cleaned by truncate. |
| 206 | * the VM has canceled the dirty bit (eg ext3 journaling). | 206 | * Dirty page here signals a bug and loss of unwritten data. |
| 207 | * | 207 | * |
| 208 | * Fix it up by doing a final dirty accounting check after | 208 | * This fixes dirty accounting after removing the page entirely but |
| 209 | * having removed the page entirely. | 209 | * leaves PageDirty set: it has no effect for truncated page and |
| 210 | * anyway will be cleared before returning page into buddy allocator. | ||
| 210 | */ | 211 | */ |
| 211 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { | 212 | if (WARN_ON_ONCE(PageDirty(page))) |
| 212 | dec_zone_page_state(page, NR_FILE_DIRTY); | 213 | account_page_cleaned(page, mapping); |
| 213 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); | ||
| 214 | } | ||
| 215 | } | 214 | } |
| 216 | 215 | ||
| 217 | /** | 216 | /** |
| @@ -92,7 +92,7 @@ retry: | |||
| 92 | */ | 92 | */ |
| 93 | mark_page_accessed(page); | 93 | mark_page_accessed(page); |
| 94 | } | 94 | } |
| 95 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 95 | if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { |
| 96 | /* | 96 | /* |
| 97 | * The preliminary mapping check is mainly to avoid the | 97 | * The preliminary mapping check is mainly to avoid the |
| 98 | * pointless overhead of lock_page on the ZERO_PAGE | 98 | * pointless overhead of lock_page on the ZERO_PAGE |
| @@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | |||
| 265 | unsigned int fault_flags = 0; | 265 | unsigned int fault_flags = 0; |
| 266 | int ret; | 266 | int ret; |
| 267 | 267 | ||
| 268 | /* For mlock, just skip the stack guard page. */ | 268 | /* For mm_populate(), just skip the stack guard page. */ |
| 269 | if ((*flags & FOLL_MLOCK) && | 269 | if ((*flags & FOLL_POPULATE) && |
| 270 | (stack_guard_page_start(vma, address) || | 270 | (stack_guard_page_start(vma, address) || |
| 271 | stack_guard_page_end(vma, address + PAGE_SIZE))) | 271 | stack_guard_page_end(vma, address + PAGE_SIZE))) |
| 272 | return -ENOENT; | 272 | return -ENOENT; |
| @@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 819 | EXPORT_SYMBOL(get_user_pages); | 819 | EXPORT_SYMBOL(get_user_pages); |
| 820 | 820 | ||
| 821 | /** | 821 | /** |
| 822 | * populate_vma_page_range() - populate a range of pages in the vma. | ||
| 823 | * @vma: target vma | ||
| 824 | * @start: start address | ||
| 825 | * @end: end address | ||
| 826 | * @nonblocking: | ||
| 827 | * | ||
| 828 | * This takes care of mlocking the pages too if VM_LOCKED is set. | ||
| 829 | * | ||
| 830 | * return 0 on success, negative error code on error. | ||
| 831 | * | ||
| 832 | * vma->vm_mm->mmap_sem must be held. | ||
| 833 | * | ||
| 834 | * If @nonblocking is NULL, it may be held for read or write and will | ||
| 835 | * be unperturbed. | ||
| 836 | * | ||
| 837 | * If @nonblocking is non-NULL, it must be held for read only and may be | ||
| 838 | * released. If it's released, *@nonblocking will be set to 0. | ||
| 839 | */ | ||
| 840 | long populate_vma_page_range(struct vm_area_struct *vma, | ||
| 841 | unsigned long start, unsigned long end, int *nonblocking) | ||
| 842 | { | ||
| 843 | struct mm_struct *mm = vma->vm_mm; | ||
| 844 | unsigned long nr_pages = (end - start) / PAGE_SIZE; | ||
| 845 | int gup_flags; | ||
| 846 | |||
| 847 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 848 | VM_BUG_ON(end & ~PAGE_MASK); | ||
| 849 | VM_BUG_ON_VMA(start < vma->vm_start, vma); | ||
| 850 | VM_BUG_ON_VMA(end > vma->vm_end, vma); | ||
| 851 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
| 852 | |||
| 853 | gup_flags = FOLL_TOUCH | FOLL_POPULATE; | ||
| 854 | /* | ||
| 855 | * We want to touch writable mappings with a write fault in order | ||
| 856 | * to break COW, except for shared mappings because these don't COW | ||
| 857 | * and we would not want to dirty them for nothing. | ||
| 858 | */ | ||
| 859 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
| 860 | gup_flags |= FOLL_WRITE; | ||
| 861 | |||
| 862 | /* | ||
| 863 | * We want mlock to succeed for regions that have any permissions | ||
| 864 | * other than PROT_NONE. | ||
| 865 | */ | ||
| 866 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | ||
| 867 | gup_flags |= FOLL_FORCE; | ||
| 868 | |||
| 869 | /* | ||
| 870 | * We made sure addr is within a VMA, so the following will | ||
| 871 | * not result in a stack expansion that recurses back here. | ||
| 872 | */ | ||
| 873 | return __get_user_pages(current, mm, start, nr_pages, gup_flags, | ||
| 874 | NULL, NULL, nonblocking); | ||
| 875 | } | ||
| 876 | |||
| 877 | /* | ||
| 878 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
| 879 | * | ||
| 880 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
| 881 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
| 882 | * mmap_sem must not be held. | ||
| 883 | */ | ||
| 884 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
| 885 | { | ||
| 886 | struct mm_struct *mm = current->mm; | ||
| 887 | unsigned long end, nstart, nend; | ||
| 888 | struct vm_area_struct *vma = NULL; | ||
| 889 | int locked = 0; | ||
| 890 | long ret = 0; | ||
| 891 | |||
| 892 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 893 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
| 894 | end = start + len; | ||
| 895 | |||
| 896 | for (nstart = start; nstart < end; nstart = nend) { | ||
| 897 | /* | ||
| 898 | * We want to fault in pages for [nstart; end) address range. | ||
| 899 | * Find first corresponding VMA. | ||
| 900 | */ | ||
| 901 | if (!locked) { | ||
| 902 | locked = 1; | ||
| 903 | down_read(&mm->mmap_sem); | ||
| 904 | vma = find_vma(mm, nstart); | ||
| 905 | } else if (nstart >= vma->vm_end) | ||
| 906 | vma = vma->vm_next; | ||
| 907 | if (!vma || vma->vm_start >= end) | ||
| 908 | break; | ||
| 909 | /* | ||
| 910 | * Set [nstart; nend) to intersection of desired address | ||
| 911 | * range with the first VMA. Also, skip undesirable VMA types. | ||
| 912 | */ | ||
| 913 | nend = min(end, vma->vm_end); | ||
| 914 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 915 | continue; | ||
| 916 | if (nstart < vma->vm_start) | ||
| 917 | nstart = vma->vm_start; | ||
| 918 | /* | ||
| 919 | * Now fault in a range of pages. populate_vma_page_range() | ||
| 920 | * double checks the vma flags, so that it won't mlock pages | ||
| 921 | * if the vma was already munlocked. | ||
| 922 | */ | ||
| 923 | ret = populate_vma_page_range(vma, nstart, nend, &locked); | ||
| 924 | if (ret < 0) { | ||
| 925 | if (ignore_errors) { | ||
| 926 | ret = 0; | ||
| 927 | continue; /* continue at next VMA */ | ||
| 928 | } | ||
| 929 | break; | ||
| 930 | } | ||
| 931 | nend = nstart + ret * PAGE_SIZE; | ||
| 932 | ret = 0; | ||
| 933 | } | ||
| 934 | if (locked) | ||
| 935 | up_read(&mm->mmap_sem); | ||
| 936 | return ret; /* 0 or negative error code */ | ||
| 937 | } | ||
| 938 | |||
| 939 | /** | ||
| 822 | * get_dump_page() - pin user page in memory while writing it to core dump | 940 | * get_dump_page() - pin user page in memory while writing it to core dump |
| 823 | * @addr: user address | 941 | * @addr: user address |
| 824 | * | 942 | * |
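
The two helpers above replace the old __mlock_vma_pages_range() path: populate_vma_page_range() faults in one VMA's slice and __mm_populate() walks a whole address range, dropping and retaking mmap_sem as needed. A minimal caller sketch, not part of this patch (the real callers are mlock() and the MAP_POPULATE/MAP_LOCKED mmap paths); the function name here is hypothetical and the range is assumed to be already marked with the desired vm_flags:

    #include <linux/mm.h>

    /* Sketch: fault in an already-flagged range, ignoring per-VMA errors the
     * way MAP_POPULATE does. mmap_sem must not be held by the caller;
     * __mm_populate() takes it for read itself and calls
     * populate_vma_page_range() on each intersecting VMA. */
    static int example_populate_range(unsigned long start, size_t len)
    {
            len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
            start &= PAGE_MASK;

            return __mm_populate(start, len, 1 /* ignore_errors */);
    }
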
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6817b0350c71..3afb5cbe1312 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -1231,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
| 1231 | pmd, _pmd, 1)) | 1231 | pmd, _pmd, 1)) |
| 1232 | update_mmu_cache_pmd(vma, addr, pmd); | 1232 | update_mmu_cache_pmd(vma, addr, pmd); |
| 1233 | } | 1233 | } |
| 1234 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 1234 | if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { |
| 1235 | if (page->mapping && trylock_page(page)) { | 1235 | if (page->mapping && trylock_page(page)) { |
| 1236 | lru_add_drain(); | 1236 | lru_add_drain(); |
| 1237 | if (page->mapping) | 1237 | if (page->mapping) |
| @@ -2109,7 +2109,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) | |||
| 2109 | { | 2109 | { |
| 2110 | while (--_pte >= pte) { | 2110 | while (--_pte >= pte) { |
| 2111 | pte_t pteval = *_pte; | 2111 | pte_t pteval = *_pte; |
| 2112 | if (!pte_none(pteval)) | 2112 | if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) |
| 2113 | release_pte_page(pte_page(pteval)); | 2113 | release_pte_page(pte_page(pteval)); |
| 2114 | } | 2114 | } |
| 2115 | } | 2115 | } |
| @@ -2120,13 +2120,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2120 | { | 2120 | { |
| 2121 | struct page *page; | 2121 | struct page *page; |
| 2122 | pte_t *_pte; | 2122 | pte_t *_pte; |
| 2123 | int none = 0; | 2123 | int none_or_zero = 0; |
| 2124 | bool referenced = false, writable = false; | 2124 | bool referenced = false, writable = false; |
| 2125 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2125 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
| 2126 | _pte++, address += PAGE_SIZE) { | 2126 | _pte++, address += PAGE_SIZE) { |
| 2127 | pte_t pteval = *_pte; | 2127 | pte_t pteval = *_pte; |
| 2128 | if (pte_none(pteval)) { | 2128 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
| 2129 | if (++none <= khugepaged_max_ptes_none) | 2129 | if (++none_or_zero <= khugepaged_max_ptes_none) |
| 2130 | continue; | 2130 | continue; |
| 2131 | else | 2131 | else |
| 2132 | goto out; | 2132 | goto out; |
| @@ -2207,9 +2207,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
| 2207 | pte_t pteval = *_pte; | 2207 | pte_t pteval = *_pte; |
| 2208 | struct page *src_page; | 2208 | struct page *src_page; |
| 2209 | 2209 | ||
| 2210 | if (pte_none(pteval)) { | 2210 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
| 2211 | clear_user_highpage(page, address); | 2211 | clear_user_highpage(page, address); |
| 2212 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | 2212 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); |
| 2213 | if (is_zero_pfn(pte_pfn(pteval))) { | ||
| 2214 | /* | ||
| 2215 | * ptl mostly unnecessary. | ||
| 2216 | */ | ||
| 2217 | spin_lock(ptl); | ||
| 2218 | /* | ||
| 2219 | * paravirt calls inside pte_clear here are | ||
| 2220 | * superfluous. | ||
| 2221 | */ | ||
| 2222 | pte_clear(vma->vm_mm, address, _pte); | ||
| 2223 | spin_unlock(ptl); | ||
| 2224 | } | ||
| 2213 | } else { | 2225 | } else { |
| 2214 | src_page = pte_page(pteval); | 2226 | src_page = pte_page(pteval); |
| 2215 | copy_user_highpage(page, src_page, address, vma); | 2227 | copy_user_highpage(page, src_page, address, vma); |
| @@ -2316,8 +2328,14 @@ static struct page | |||
| 2316 | struct vm_area_struct *vma, unsigned long address, | 2328 | struct vm_area_struct *vma, unsigned long address, |
| 2317 | int node) | 2329 | int node) |
| 2318 | { | 2330 | { |
| 2331 | gfp_t flags; | ||
| 2332 | |||
| 2319 | VM_BUG_ON_PAGE(*hpage, *hpage); | 2333 | VM_BUG_ON_PAGE(*hpage, *hpage); |
| 2320 | 2334 | ||
| 2335 | /* Only allocate from the target node */ | ||
| 2336 | flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | | ||
| 2337 | __GFP_THISNODE; | ||
| 2338 | |||
| 2321 | /* | 2339 | /* |
| 2322 | * Before allocating the hugepage, release the mmap_sem read lock. | 2340 | * Before allocating the hugepage, release the mmap_sem read lock. |
| 2323 | * The allocation can take potentially a long time if it involves | 2341 | * The allocation can take potentially a long time if it involves |
| @@ -2326,8 +2344,7 @@ static struct page | |||
| 2326 | */ | 2344 | */ |
| 2327 | up_read(&mm->mmap_sem); | 2345 | up_read(&mm->mmap_sem); |
| 2328 | 2346 | ||
| 2329 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( | 2347 | *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); |
| 2330 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); | ||
| 2331 | if (unlikely(!*hpage)) { | 2348 | if (unlikely(!*hpage)) { |
| 2332 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2349 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
| 2333 | *hpage = ERR_PTR(-ENOMEM); | 2350 | *hpage = ERR_PTR(-ENOMEM); |
| @@ -2543,7 +2560,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2543 | { | 2560 | { |
| 2544 | pmd_t *pmd; | 2561 | pmd_t *pmd; |
| 2545 | pte_t *pte, *_pte; | 2562 | pte_t *pte, *_pte; |
| 2546 | int ret = 0, none = 0; | 2563 | int ret = 0, none_or_zero = 0; |
| 2547 | struct page *page; | 2564 | struct page *page; |
| 2548 | unsigned long _address; | 2565 | unsigned long _address; |
| 2549 | spinlock_t *ptl; | 2566 | spinlock_t *ptl; |
| @@ -2561,8 +2578,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2561 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2578 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
| 2562 | _pte++, _address += PAGE_SIZE) { | 2579 | _pte++, _address += PAGE_SIZE) { |
| 2563 | pte_t pteval = *_pte; | 2580 | pte_t pteval = *_pte; |
| 2564 | if (pte_none(pteval)) { | 2581 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
| 2565 | if (++none <= khugepaged_max_ptes_none) | 2582 | if (++none_or_zero <= khugepaged_max_ptes_none) |
| 2566 | continue; | 2583 | continue; |
| 2567 | else | 2584 | else |
| 2568 | goto out_unmap; | 2585 | goto out_unmap; |
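
The khugepaged hunks above apply the same new test in three places (__collapse_huge_page_isolate(), __collapse_huge_page_copy() and khugepaged_scan_pmd()): a PTE that maps the shared zero page is now treated like an empty PTE and counted against khugepaged_max_ptes_none. As an illustrative sketch only (this helper is not part of the patch), the rule factored into one predicate:

    #include <linux/mm.h>
    #include <asm/pgtable.h>

    /* Sketch: "none or zero" means the PTE is empty or maps the zero page. */
    static inline bool pte_none_or_zero(pte_t pteval)
    {
            return pte_none(pteval) || is_zero_pfn(pte_pfn(pteval));
    }
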
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c41b2a0ee273..8874c8ad55aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -3278,6 +3278,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3278 | struct page *page; | 3278 | struct page *page; |
| 3279 | 3279 | ||
| 3280 | /* | 3280 | /* |
| 3281 | * If we have a pending SIGKILL, don't keep faulting pages and | ||
| 3282 | * potentially allocating memory. | ||
| 3283 | */ | ||
| 3284 | if (unlikely(fatal_signal_pending(current))) { | ||
| 3285 | remainder = 0; | ||
| 3286 | break; | ||
| 3287 | } | ||
| 3288 | |||
| 3289 | /* | ||
| 3281 | * Some archs (sparc64, sh*) have multiple pte_ts to | 3290 | * Some archs (sparc64, sh*) have multiple pte_ts to |
| 3282 | * each hugepage. We have to make sure we get the | 3291 | * each hugepage. We have to make sure we get the |
| 3283 | * first, for the page indexing below to work. | 3292 | * first, for the page indexing below to work. |
| @@ -3735,8 +3744,7 @@ retry: | |||
| 3735 | if (!pmd_huge(*pmd)) | 3744 | if (!pmd_huge(*pmd)) |
| 3736 | goto out; | 3745 | goto out; |
| 3737 | if (pmd_present(*pmd)) { | 3746 | if (pmd_present(*pmd)) { |
| 3738 | page = pte_page(*(pte_t *)pmd) + | 3747 | page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); |
| 3739 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 3740 | if (flags & FOLL_GET) | 3748 | if (flags & FOLL_GET) |
| 3741 | get_page(page); | 3749 | get_page(page); |
| 3742 | } else { | 3750 | } else { |
diff --git a/mm/internal.h b/mm/internal.h index a96da5b0029d..edaab69a9c35 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc, | |||
| 200 | unsigned long | 200 | unsigned long |
| 201 | isolate_migratepages_range(struct compact_control *cc, | 201 | isolate_migratepages_range(struct compact_control *cc, |
| 202 | unsigned long low_pfn, unsigned long end_pfn); | 202 | unsigned long low_pfn, unsigned long end_pfn); |
| 203 | int find_suitable_fallback(struct free_area *area, unsigned int order, | ||
| 204 | int migratetype, bool only_stealable, bool *can_steal); | ||
| 203 | 205 | ||
| 204 | #endif | 206 | #endif |
| 205 | 207 | ||
| @@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 240 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 242 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
| 241 | 243 | ||
| 242 | #ifdef CONFIG_MMU | 244 | #ifdef CONFIG_MMU |
| 243 | extern long __mlock_vma_pages_range(struct vm_area_struct *vma, | 245 | extern long populate_vma_page_range(struct vm_area_struct *vma, |
| 244 | unsigned long start, unsigned long end, int *nonblocking); | 246 | unsigned long start, unsigned long end, int *nonblocking); |
| 245 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 247 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
| 246 | unsigned long start, unsigned long end); | 248 | unsigned long start, unsigned long end); |
diff --git a/mm/memblock.c b/mm/memblock.c index 252b77bdf65e..3f37a0bca5d5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -699,14 +699,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, | |||
| 699 | int nid, | 699 | int nid, |
| 700 | unsigned long flags) | 700 | unsigned long flags) |
| 701 | { | 701 | { |
| 702 | struct memblock_type *_rgn = &memblock.reserved; | 702 | struct memblock_type *type = &memblock.reserved; |
| 703 | 703 | ||
| 704 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", | 704 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
| 705 | (unsigned long long)base, | 705 | (unsigned long long)base, |
| 706 | (unsigned long long)base + size - 1, | 706 | (unsigned long long)base + size - 1, |
| 707 | flags, (void *)_RET_IP_); | 707 | flags, (void *)_RET_IP_); |
| 708 | 708 | ||
| 709 | return memblock_add_range(_rgn, base, size, nid, flags); | 709 | return memblock_add_range(type, base, size, nid, flags); |
| 710 | } | 710 | } |
| 711 | 711 | ||
| 712 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 712 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b34ef4a32a3b..c3f09b2dda5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -14,6 +14,12 @@ | |||
| 14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. | 14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. |
| 15 | * Authors: Glauber Costa and Suleiman Souhlal | 15 | * Authors: Glauber Costa and Suleiman Souhlal |
| 16 | * | 16 | * |
| 17 | * Native page reclaim | ||
| 18 | * Charge lifetime sanitation | ||
| 19 | * Lockless page tracking & accounting | ||
| 20 | * Unified hierarchy configuration model | ||
| 21 | * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner | ||
| 22 | * | ||
| 17 | * This program is free software; you can redistribute it and/or modify | 23 | * This program is free software; you can redistribute it and/or modify |
| 18 | * it under the terms of the GNU General Public License as published by | 24 | * it under the terms of the GNU General Public License as published by |
| 19 | * the Free Software Foundation; either version 2 of the License, or | 25 | * the Free Software Foundation; either version 2 of the License, or |
| @@ -1436,15 +1442,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1436 | struct mem_cgroup *iter; | 1442 | struct mem_cgroup *iter; |
| 1437 | unsigned int i; | 1443 | unsigned int i; |
| 1438 | 1444 | ||
| 1439 | if (!p) | ||
| 1440 | return; | ||
| 1441 | |||
| 1442 | mutex_lock(&oom_info_lock); | 1445 | mutex_lock(&oom_info_lock); |
| 1443 | rcu_read_lock(); | 1446 | rcu_read_lock(); |
| 1444 | 1447 | ||
| 1445 | pr_info("Task in "); | 1448 | if (p) { |
| 1446 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); | 1449 | pr_info("Task in "); |
| 1447 | pr_cont(" killed as a result of limit of "); | 1450 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); |
| 1451 | pr_cont(" killed as a result of limit of "); | ||
| 1452 | } else { | ||
| 1453 | pr_info("Memory limit reached of cgroup "); | ||
| 1454 | } | ||
| 1455 | |||
| 1448 | pr_cont_cgroup_path(memcg->css.cgroup); | 1456 | pr_cont_cgroup_path(memcg->css.cgroup); |
| 1449 | pr_cont("\n"); | 1457 | pr_cont("\n"); |
| 1450 | 1458 | ||
| @@ -1531,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1531 | return; | 1539 | return; |
| 1532 | } | 1540 | } |
| 1533 | 1541 | ||
| 1534 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 1542 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); |
| 1535 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; | 1543 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
| 1536 | for_each_mem_cgroup_tree(iter, memcg) { | 1544 | for_each_mem_cgroup_tree(iter, memcg) { |
| 1537 | struct css_task_iter it; | 1545 | struct css_task_iter it; |
| @@ -2779,92 +2787,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
| 2779 | } | 2787 | } |
| 2780 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 2788 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
| 2781 | 2789 | ||
| 2782 | /** | ||
| 2783 | * mem_cgroup_move_account - move account of the page | ||
| 2784 | * @page: the page | ||
| 2785 | * @nr_pages: number of regular pages (>1 for huge pages) | ||
| 2786 | * @from: mem_cgroup which the page is moved from. | ||
| 2787 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
| 2788 | * | ||
| 2789 | * The caller must confirm following. | ||
| 2790 | * - page is not on LRU (isolate_page() is useful.) | ||
| 2791 | * - compound_lock is held when nr_pages > 1 | ||
| 2792 | * | ||
| 2793 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | ||
| 2794 | * from old cgroup. | ||
| 2795 | */ | ||
| 2796 | static int mem_cgroup_move_account(struct page *page, | ||
| 2797 | unsigned int nr_pages, | ||
| 2798 | struct mem_cgroup *from, | ||
| 2799 | struct mem_cgroup *to) | ||
| 2800 | { | ||
| 2801 | unsigned long flags; | ||
| 2802 | int ret; | ||
| 2803 | |||
| 2804 | VM_BUG_ON(from == to); | ||
| 2805 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
| 2806 | /* | ||
| 2807 | * The page is isolated from LRU. So, collapse function | ||
| 2808 | * will not handle this page. But page splitting can happen. | ||
| 2809 | * Do this check under compound_page_lock(). The caller should | ||
| 2810 | * hold it. | ||
| 2811 | */ | ||
| 2812 | ret = -EBUSY; | ||
| 2813 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
| 2814 | goto out; | ||
| 2815 | |||
| 2816 | /* | ||
| 2817 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup | ||
| 2818 | * of its source page while we change it: page migration takes | ||
| 2819 | * both pages off the LRU, but page cache replacement doesn't. | ||
| 2820 | */ | ||
| 2821 | if (!trylock_page(page)) | ||
| 2822 | goto out; | ||
| 2823 | |||
| 2824 | ret = -EINVAL; | ||
| 2825 | if (page->mem_cgroup != from) | ||
| 2826 | goto out_unlock; | ||
| 2827 | |||
| 2828 | spin_lock_irqsave(&from->move_lock, flags); | ||
| 2829 | |||
| 2830 | if (!PageAnon(page) && page_mapped(page)) { | ||
| 2831 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 2832 | nr_pages); | ||
| 2833 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 2834 | nr_pages); | ||
| 2835 | } | ||
| 2836 | |||
| 2837 | if (PageWriteback(page)) { | ||
| 2838 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 2839 | nr_pages); | ||
| 2840 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 2841 | nr_pages); | ||
| 2842 | } | ||
| 2843 | |||
| 2844 | /* | ||
| 2845 | * It is safe to change page->mem_cgroup here because the page | ||
| 2846 | * is referenced, charged, and isolated - we can't race with | ||
| 2847 | * uncharging, charging, migration, or LRU putback. | ||
| 2848 | */ | ||
| 2849 | |||
| 2850 | /* caller should have done css_get */ | ||
| 2851 | page->mem_cgroup = to; | ||
| 2852 | spin_unlock_irqrestore(&from->move_lock, flags); | ||
| 2853 | |||
| 2854 | ret = 0; | ||
| 2855 | |||
| 2856 | local_irq_disable(); | ||
| 2857 | mem_cgroup_charge_statistics(to, page, nr_pages); | ||
| 2858 | memcg_check_events(to, page); | ||
| 2859 | mem_cgroup_charge_statistics(from, page, -nr_pages); | ||
| 2860 | memcg_check_events(from, page); | ||
| 2861 | local_irq_enable(); | ||
| 2862 | out_unlock: | ||
| 2863 | unlock_page(page); | ||
| 2864 | out: | ||
| 2865 | return ret; | ||
| 2866 | } | ||
| 2867 | |||
| 2868 | #ifdef CONFIG_MEMCG_SWAP | 2790 | #ifdef CONFIG_MEMCG_SWAP |
| 2869 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | 2791 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
| 2870 | bool charge) | 2792 | bool charge) |
| @@ -4816,6 +4738,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
| 4816 | return page; | 4738 | return page; |
| 4817 | } | 4739 | } |
| 4818 | 4740 | ||
| 4741 | /** | ||
| 4742 | * mem_cgroup_move_account - move account of the page | ||
| 4743 | * @page: the page | ||
| 4744 | * @nr_pages: number of regular pages (>1 for huge pages) | ||
| 4745 | * @from: mem_cgroup which the page is moved from. | ||
| 4746 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
| 4747 | * | ||
| 4748 | * The caller must confirm the following. | ||
| 4749 | * - page is not on LRU (isolate_page() is useful.) | ||
| 4750 | * - compound_lock is held when nr_pages > 1 | ||
| 4751 | * | ||
| 4752 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | ||
| 4753 | * from old cgroup. | ||
| 4754 | */ | ||
| 4755 | static int mem_cgroup_move_account(struct page *page, | ||
| 4756 | unsigned int nr_pages, | ||
| 4757 | struct mem_cgroup *from, | ||
| 4758 | struct mem_cgroup *to) | ||
| 4759 | { | ||
| 4760 | unsigned long flags; | ||
| 4761 | int ret; | ||
| 4762 | |||
| 4763 | VM_BUG_ON(from == to); | ||
| 4764 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
| 4765 | /* | ||
| 4766 | * The page is isolated from LRU. So, collapse function | ||
| 4767 | * will not handle this page. But page splitting can happen. | ||
| 4768 | * Do this check under compound_page_lock(). The caller should | ||
| 4769 | * hold it. | ||
| 4770 | */ | ||
| 4771 | ret = -EBUSY; | ||
| 4772 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
| 4773 | goto out; | ||
| 4774 | |||
| 4775 | /* | ||
| 4776 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup | ||
| 4777 | * of its source page while we change it: page migration takes | ||
| 4778 | * both pages off the LRU, but page cache replacement doesn't. | ||
| 4779 | */ | ||
| 4780 | if (!trylock_page(page)) | ||
| 4781 | goto out; | ||
| 4782 | |||
| 4783 | ret = -EINVAL; | ||
| 4784 | if (page->mem_cgroup != from) | ||
| 4785 | goto out_unlock; | ||
| 4786 | |||
| 4787 | spin_lock_irqsave(&from->move_lock, flags); | ||
| 4788 | |||
| 4789 | if (!PageAnon(page) && page_mapped(page)) { | ||
| 4790 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 4791 | nr_pages); | ||
| 4792 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 4793 | nr_pages); | ||
| 4794 | } | ||
| 4795 | |||
| 4796 | if (PageWriteback(page)) { | ||
| 4797 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 4798 | nr_pages); | ||
| 4799 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 4800 | nr_pages); | ||
| 4801 | } | ||
| 4802 | |||
| 4803 | /* | ||
| 4804 | * It is safe to change page->mem_cgroup here because the page | ||
| 4805 | * is referenced, charged, and isolated - we can't race with | ||
| 4806 | * uncharging, charging, migration, or LRU putback. | ||
| 4807 | */ | ||
| 4808 | |||
| 4809 | /* caller should have done css_get */ | ||
| 4810 | page->mem_cgroup = to; | ||
| 4811 | spin_unlock_irqrestore(&from->move_lock, flags); | ||
| 4812 | |||
| 4813 | ret = 0; | ||
| 4814 | |||
| 4815 | local_irq_disable(); | ||
| 4816 | mem_cgroup_charge_statistics(to, page, nr_pages); | ||
| 4817 | memcg_check_events(to, page); | ||
| 4818 | mem_cgroup_charge_statistics(from, page, -nr_pages); | ||
| 4819 | memcg_check_events(from, page); | ||
| 4820 | local_irq_enable(); | ||
| 4821 | out_unlock: | ||
| 4822 | unlock_page(page); | ||
| 4823 | out: | ||
| 4824 | return ret; | ||
| 4825 | } | ||
| 4826 | |||
| 4819 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | 4827 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, |
| 4820 | unsigned long addr, pte_t ptent, union mc_target *target) | 4828 | unsigned long addr, pte_t ptent, union mc_target *target) |
| 4821 | { | 4829 | { |
diff --git a/mm/memory.c b/mm/memory.c index 97839f5c8c30..ac20b2a6a0c3 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
| 1983 | } | 1983 | } |
| 1984 | 1984 | ||
| 1985 | /* | 1985 | /* |
| 1986 | * This routine handles present pages, when users try to write | 1986 | * Handle write page faults for pages that can be reused in the current vma |
| 1987 | * to a shared page. It is done by copying the page to a new address | ||
| 1988 | * and decrementing the shared-page counter for the old page. | ||
| 1989 | * | ||
| 1990 | * Note that this routine assumes that the protection checks have been | ||
| 1991 | * done by the caller (the low-level page fault routine in most cases). | ||
| 1992 | * Thus we can safely just mark it writable once we've done any necessary | ||
| 1993 | * COW. | ||
| 1994 | * | 1987 | * |
| 1995 | * We also mark the page dirty at this point even though the page will | 1988 | * This can happen either due to the mapping being with the VM_SHARED flag, |
| 1996 | * change only once the write actually happens. This avoids a few races, | 1989 | * or due to us being the last reference standing to the page. In either |
| 1997 | * and potentially makes it more efficient. | 1990 | * case, all we need to do here is to mark the page as writable and update |
| 1998 | * | 1991 | * any related book-keeping. |
| 1999 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2000 | * but allow concurrent faults), with pte both mapped and locked. | ||
| 2001 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2002 | */ | 1992 | */ |
| 2003 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1993 | static inline int wp_page_reuse(struct mm_struct *mm, |
| 2004 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 1994 | struct vm_area_struct *vma, unsigned long address, |
| 2005 | spinlock_t *ptl, pte_t orig_pte) | 1995 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, |
| 1996 | struct page *page, int page_mkwrite, | ||
| 1997 | int dirty_shared) | ||
| 2006 | __releases(ptl) | 1998 | __releases(ptl) |
| 2007 | { | 1999 | { |
| 2008 | struct page *old_page, *new_page = NULL; | ||
| 2009 | pte_t entry; | 2000 | pte_t entry; |
| 2010 | int ret = 0; | ||
| 2011 | int page_mkwrite = 0; | ||
| 2012 | bool dirty_shared = false; | ||
| 2013 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | ||
| 2014 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | ||
| 2015 | struct mem_cgroup *memcg; | ||
| 2016 | |||
| 2017 | old_page = vm_normal_page(vma, address, orig_pte); | ||
| 2018 | if (!old_page) { | ||
| 2019 | /* | ||
| 2020 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | ||
| 2021 | * VM_PFNMAP VMA. | ||
| 2022 | * | ||
| 2023 | * We should not cow pages in a shared writeable mapping. | ||
| 2024 | * Just mark the pages writable as we can't do any dirty | ||
| 2025 | * accounting on raw pfn maps. | ||
| 2026 | */ | ||
| 2027 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2028 | (VM_WRITE|VM_SHARED)) | ||
| 2029 | goto reuse; | ||
| 2030 | goto gotten; | ||
| 2031 | } | ||
| 2032 | |||
| 2033 | /* | 2001 | /* |
| 2034 | * Take out anonymous pages first, anonymous shared vmas are | 2002 | * Clear the pages cpupid information as the existing |
| 2035 | * not dirty accountable. | 2003 | * information potentially belongs to a now completely |
| 2004 | * unrelated process. | ||
| 2036 | */ | 2005 | */ |
| 2037 | if (PageAnon(old_page) && !PageKsm(old_page)) { | 2006 | if (page) |
| 2038 | if (!trylock_page(old_page)) { | 2007 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
| 2039 | page_cache_get(old_page); | ||
| 2040 | pte_unmap_unlock(page_table, ptl); | ||
| 2041 | lock_page(old_page); | ||
| 2042 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2043 | &ptl); | ||
| 2044 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2045 | unlock_page(old_page); | ||
| 2046 | goto unlock; | ||
| 2047 | } | ||
| 2048 | page_cache_release(old_page); | ||
| 2049 | } | ||
| 2050 | if (reuse_swap_page(old_page)) { | ||
| 2051 | /* | ||
| 2052 | * The page is all ours. Move it to our anon_vma so | ||
| 2053 | * the rmap code will not search our parent or siblings. | ||
| 2054 | * Protected against the rmap code by the page lock. | ||
| 2055 | */ | ||
| 2056 | page_move_anon_rmap(old_page, vma, address); | ||
| 2057 | unlock_page(old_page); | ||
| 2058 | goto reuse; | ||
| 2059 | } | ||
| 2060 | unlock_page(old_page); | ||
| 2061 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2062 | (VM_WRITE|VM_SHARED))) { | ||
| 2063 | page_cache_get(old_page); | ||
| 2064 | /* | ||
| 2065 | * Only catch write-faults on shared writable pages, | ||
| 2066 | * read-only shared pages can get COWed by | ||
| 2067 | * get_user_pages(.write=1, .force=1). | ||
| 2068 | */ | ||
| 2069 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | ||
| 2070 | int tmp; | ||
| 2071 | |||
| 2072 | pte_unmap_unlock(page_table, ptl); | ||
| 2073 | tmp = do_page_mkwrite(vma, old_page, address); | ||
| 2074 | if (unlikely(!tmp || (tmp & | ||
| 2075 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | ||
| 2076 | page_cache_release(old_page); | ||
| 2077 | return tmp; | ||
| 2078 | } | ||
| 2079 | /* | ||
| 2080 | * Since we dropped the lock we need to revalidate | ||
| 2081 | * the PTE as someone else may have changed it. If | ||
| 2082 | * they did, we just return, as we can count on the | ||
| 2083 | * MMU to tell us if they didn't also make it writable. | ||
| 2084 | */ | ||
| 2085 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2086 | &ptl); | ||
| 2087 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2088 | unlock_page(old_page); | ||
| 2089 | goto unlock; | ||
| 2090 | } | ||
| 2091 | page_mkwrite = 1; | ||
| 2092 | } | ||
| 2093 | |||
| 2094 | dirty_shared = true; | ||
| 2095 | |||
| 2096 | reuse: | ||
| 2097 | /* | ||
| 2098 | * Clear the pages cpupid information as the existing | ||
| 2099 | * information potentially belongs to a now completely | ||
| 2100 | * unrelated process. | ||
| 2101 | */ | ||
| 2102 | if (old_page) | ||
| 2103 | page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); | ||
| 2104 | |||
| 2105 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | ||
| 2106 | entry = pte_mkyoung(orig_pte); | ||
| 2107 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2108 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | ||
| 2109 | update_mmu_cache(vma, address, page_table); | ||
| 2110 | pte_unmap_unlock(page_table, ptl); | ||
| 2111 | ret |= VM_FAULT_WRITE; | ||
| 2112 | 2008 | ||
| 2113 | if (dirty_shared) { | 2009 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2114 | struct address_space *mapping; | 2010 | entry = pte_mkyoung(orig_pte); |
| 2115 | int dirtied; | 2011 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2012 | if (ptep_set_access_flags(vma, address, page_table, entry, 1)) | ||
| 2013 | update_mmu_cache(vma, address, page_table); | ||
| 2014 | pte_unmap_unlock(page_table, ptl); | ||
| 2116 | 2015 | ||
| 2117 | if (!page_mkwrite) | 2016 | if (dirty_shared) { |
| 2118 | lock_page(old_page); | 2017 | struct address_space *mapping; |
| 2018 | int dirtied; | ||
| 2119 | 2019 | ||
| 2120 | dirtied = set_page_dirty(old_page); | 2020 | if (!page_mkwrite) |
| 2121 | VM_BUG_ON_PAGE(PageAnon(old_page), old_page); | 2021 | lock_page(page); |
| 2122 | mapping = old_page->mapping; | ||
| 2123 | unlock_page(old_page); | ||
| 2124 | page_cache_release(old_page); | ||
| 2125 | 2022 | ||
| 2126 | if ((dirtied || page_mkwrite) && mapping) { | 2023 | dirtied = set_page_dirty(page); |
| 2127 | /* | 2024 | VM_BUG_ON_PAGE(PageAnon(page), page); |
| 2128 | * Some device drivers do not set page.mapping | 2025 | mapping = page->mapping; |
| 2129 | * but still dirty their pages | 2026 | unlock_page(page); |
| 2130 | */ | 2027 | page_cache_release(page); |
| 2131 | balance_dirty_pages_ratelimited(mapping); | ||
| 2132 | } | ||
| 2133 | 2028 | ||
| 2134 | if (!page_mkwrite) | 2029 | if ((dirtied || page_mkwrite) && mapping) { |
| 2135 | file_update_time(vma->vm_file); | 2030 | /* |
| 2031 | * Some device drivers do not set page.mapping | ||
| 2032 | * but still dirty their pages | ||
| 2033 | */ | ||
| 2034 | balance_dirty_pages_ratelimited(mapping); | ||
| 2136 | } | 2035 | } |
| 2137 | 2036 | ||
| 2138 | return ret; | 2037 | if (!page_mkwrite) |
| 2038 | file_update_time(vma->vm_file); | ||
| 2139 | } | 2039 | } |
| 2140 | 2040 | ||
| 2141 | /* | 2041 | return VM_FAULT_WRITE; |
| 2142 | * Ok, we need to copy. Oh, well.. | 2042 | } |
| 2143 | */ | 2043 | |
| 2144 | page_cache_get(old_page); | 2044 | /* |
| 2145 | gotten: | 2045 | * Handle the case of a page which we actually need to copy to a new page. |
| 2146 | pte_unmap_unlock(page_table, ptl); | 2046 | * |
| 2047 | * Called with mmap_sem locked and the old page referenced, but | ||
| 2048 | * without the ptl held. | ||
| 2049 | * | ||
| 2050 | * High level logic flow: | ||
| 2051 | * | ||
| 2052 | * - Allocate a page, copy the content of the old page to the new one. | ||
| 2053 | * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. | ||
| 2054 | * - Take the PTL. If the pte changed, bail out and release the allocated page | ||
| 2055 | * - If the pte is still the way we remember it, update the page table and all | ||
| 2056 | * relevant references. This includes dropping the reference the page-table | ||
| 2057 | * held to the old page, as well as updating the rmap. | ||
| 2058 | * - In any case, unlock the PTL and drop the reference we took to the old page. | ||
| 2059 | */ | ||
| 2060 | static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2061 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2062 | pte_t orig_pte, struct page *old_page) | ||
| 2063 | { | ||
| 2064 | struct page *new_page = NULL; | ||
| 2065 | spinlock_t *ptl = NULL; | ||
| 2066 | pte_t entry; | ||
| 2067 | int page_copied = 0; | ||
| 2068 | const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ | ||
| 2069 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ | ||
| 2070 | struct mem_cgroup *memcg; | ||
| 2147 | 2071 | ||
| 2148 | if (unlikely(anon_vma_prepare(vma))) | 2072 | if (unlikely(anon_vma_prepare(vma))) |
| 2149 | goto oom; | 2073 | goto oom; |
| @@ -2163,8 +2087,6 @@ gotten: | |||
| 2163 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) | 2087 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) |
| 2164 | goto oom_free_new; | 2088 | goto oom_free_new; |
| 2165 | 2089 | ||
| 2166 | mmun_start = address & PAGE_MASK; | ||
| 2167 | mmun_end = mmun_start + PAGE_SIZE; | ||
| 2168 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2090 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 2169 | 2091 | ||
| 2170 | /* | 2092 | /* |
| @@ -2177,8 +2099,9 @@ gotten: | |||
| 2177 | dec_mm_counter_fast(mm, MM_FILEPAGES); | 2099 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
| 2178 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2100 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2179 | } | 2101 | } |
| 2180 | } else | 2102 | } else { |
| 2181 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2103 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2104 | } | ||
| 2182 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2105 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2183 | entry = mk_pte(new_page, vma->vm_page_prot); | 2106 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 2184 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2107 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| @@ -2227,29 +2150,29 @@ gotten: | |||
| 2227 | 2150 | ||
| 2228 | /* Free the old page.. */ | 2151 | /* Free the old page.. */ |
| 2229 | new_page = old_page; | 2152 | new_page = old_page; |
| 2230 | ret |= VM_FAULT_WRITE; | 2153 | page_copied = 1; |
| 2231 | } else | 2154 | } else { |
| 2232 | mem_cgroup_cancel_charge(new_page, memcg); | 2155 | mem_cgroup_cancel_charge(new_page, memcg); |
| 2156 | } | ||
| 2233 | 2157 | ||
| 2234 | if (new_page) | 2158 | if (new_page) |
| 2235 | page_cache_release(new_page); | 2159 | page_cache_release(new_page); |
| 2236 | unlock: | 2160 | |
| 2237 | pte_unmap_unlock(page_table, ptl); | 2161 | pte_unmap_unlock(page_table, ptl); |
| 2238 | if (mmun_end > mmun_start) | 2162 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 2239 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 2240 | if (old_page) { | 2163 | if (old_page) { |
| 2241 | /* | 2164 | /* |
| 2242 | * Don't let another task, with possibly unlocked vma, | 2165 | * Don't let another task, with possibly unlocked vma, |
| 2243 | * keep the mlocked page. | 2166 | * keep the mlocked page. |
| 2244 | */ | 2167 | */ |
| 2245 | if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { | 2168 | if (page_copied && (vma->vm_flags & VM_LOCKED)) { |
| 2246 | lock_page(old_page); /* LRU manipulation */ | 2169 | lock_page(old_page); /* LRU manipulation */ |
| 2247 | munlock_vma_page(old_page); | 2170 | munlock_vma_page(old_page); |
| 2248 | unlock_page(old_page); | 2171 | unlock_page(old_page); |
| 2249 | } | 2172 | } |
| 2250 | page_cache_release(old_page); | 2173 | page_cache_release(old_page); |
| 2251 | } | 2174 | } |
| 2252 | return ret; | 2175 | return page_copied ? VM_FAULT_WRITE : 0; |
| 2253 | oom_free_new: | 2176 | oom_free_new: |
| 2254 | page_cache_release(new_page); | 2177 | page_cache_release(new_page); |
| 2255 | oom: | 2178 | oom: |
| @@ -2258,6 +2181,144 @@ oom: | |||
| 2258 | return VM_FAULT_OOM; | 2181 | return VM_FAULT_OOM; |
| 2259 | } | 2182 | } |
| 2260 | 2183 | ||
| 2184 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2185 | unsigned long address, pte_t *page_table, | ||
| 2186 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | ||
| 2187 | struct page *old_page) | ||
| 2188 | __releases(ptl) | ||
| 2189 | { | ||
| 2190 | int page_mkwrite = 0; | ||
| 2191 | |||
| 2192 | page_cache_get(old_page); | ||
| 2193 | |||
| 2194 | /* | ||
| 2195 | * Only catch write-faults on shared writable pages, | ||
| 2196 | * read-only shared pages can get COWed by | ||
| 2197 | * get_user_pages(.write=1, .force=1). | ||
| 2198 | */ | ||
| 2199 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | ||
| 2200 | int tmp; | ||
| 2201 | |||
| 2202 | pte_unmap_unlock(page_table, ptl); | ||
| 2203 | tmp = do_page_mkwrite(vma, old_page, address); | ||
| 2204 | if (unlikely(!tmp || (tmp & | ||
| 2205 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | ||
| 2206 | page_cache_release(old_page); | ||
| 2207 | return tmp; | ||
| 2208 | } | ||
| 2209 | /* | ||
| 2210 | * Since we dropped the lock we need to revalidate | ||
| 2211 | * the PTE as someone else may have changed it. If | ||
| 2212 | * they did, we just return, as we can count on the | ||
| 2213 | * MMU to tell us if they didn't also make it writable. | ||
| 2214 | */ | ||
| 2215 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2216 | &ptl); | ||
| 2217 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2218 | unlock_page(old_page); | ||
| 2219 | pte_unmap_unlock(page_table, ptl); | ||
| 2220 | page_cache_release(old_page); | ||
| 2221 | return 0; | ||
| 2222 | } | ||
| 2223 | page_mkwrite = 1; | ||
| 2224 | } | ||
| 2225 | |||
| 2226 | return wp_page_reuse(mm, vma, address, page_table, ptl, | ||
| 2227 | orig_pte, old_page, page_mkwrite, 1); | ||
| 2228 | } | ||
| 2229 | |||
| 2230 | /* | ||
| 2231 | * This routine handles present pages, when users try to write | ||
| 2232 | * to a shared page. It is done by copying the page to a new address | ||
| 2233 | * and decrementing the shared-page counter for the old page. | ||
| 2234 | * | ||
| 2235 | * Note that this routine assumes that the protection checks have been | ||
| 2236 | * done by the caller (the low-level page fault routine in most cases). | ||
| 2237 | * Thus we can safely just mark it writable once we've done any necessary | ||
| 2238 | * COW. | ||
| 2239 | * | ||
| 2240 | * We also mark the page dirty at this point even though the page will | ||
| 2241 | * change only once the write actually happens. This avoids a few races, | ||
| 2242 | * and potentially makes it more efficient. | ||
| 2243 | * | ||
| 2244 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2245 | * but allow concurrent faults), with pte both mapped and locked. | ||
| 2246 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2247 | */ | ||
| 2248 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2249 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2250 | spinlock_t *ptl, pte_t orig_pte) | ||
| 2251 | __releases(ptl) | ||
| 2252 | { | ||
| 2253 | struct page *old_page; | ||
| 2254 | |||
| 2255 | old_page = vm_normal_page(vma, address, orig_pte); | ||
| 2256 | if (!old_page) { | ||
| 2257 | /* | ||
| 2258 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | ||
| 2259 | * VM_PFNMAP VMA. | ||
| 2260 | * | ||
| 2261 | * We should not cow pages in a shared writeable mapping. | ||
| 2262 | * Just mark the pages writable as we can't do any dirty | ||
| 2263 | * accounting on raw pfn maps. | ||
| 2264 | */ | ||
| 2265 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2266 | (VM_WRITE|VM_SHARED)) | ||
| 2267 | return wp_page_reuse(mm, vma, address, page_table, ptl, | ||
| 2268 | orig_pte, old_page, 0, 0); | ||
| 2269 | |||
| 2270 | pte_unmap_unlock(page_table, ptl); | ||
| 2271 | return wp_page_copy(mm, vma, address, page_table, pmd, | ||
| 2272 | orig_pte, old_page); | ||
| 2273 | } | ||
| 2274 | |||
| 2275 | /* | ||
| 2276 | * Take out anonymous pages first, anonymous shared vmas are | ||
| 2277 | * not dirty accountable. | ||
| 2278 | */ | ||
| 2279 | if (PageAnon(old_page) && !PageKsm(old_page)) { | ||
| 2280 | if (!trylock_page(old_page)) { | ||
| 2281 | page_cache_get(old_page); | ||
| 2282 | pte_unmap_unlock(page_table, ptl); | ||
| 2283 | lock_page(old_page); | ||
| 2284 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2285 | &ptl); | ||
| 2286 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2287 | unlock_page(old_page); | ||
| 2288 | pte_unmap_unlock(page_table, ptl); | ||
| 2289 | page_cache_release(old_page); | ||
| 2290 | return 0; | ||
| 2291 | } | ||
| 2292 | page_cache_release(old_page); | ||
| 2293 | } | ||
| 2294 | if (reuse_swap_page(old_page)) { | ||
| 2295 | /* | ||
| 2296 | * The page is all ours. Move it to our anon_vma so | ||
| 2297 | * the rmap code will not search our parent or siblings. | ||
| 2298 | * Protected against the rmap code by the page lock. | ||
| 2299 | */ | ||
| 2300 | page_move_anon_rmap(old_page, vma, address); | ||
| 2301 | unlock_page(old_page); | ||
| 2302 | return wp_page_reuse(mm, vma, address, page_table, ptl, | ||
| 2303 | orig_pte, old_page, 0, 0); | ||
| 2304 | } | ||
| 2305 | unlock_page(old_page); | ||
| 2306 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2307 | (VM_WRITE|VM_SHARED))) { | ||
| 2308 | return wp_page_shared(mm, vma, address, page_table, pmd, | ||
| 2309 | ptl, orig_pte, old_page); | ||
| 2310 | } | ||
| 2311 | |||
| 2312 | /* | ||
| 2313 | * Ok, we need to copy. Oh, well.. | ||
| 2314 | */ | ||
| 2315 | page_cache_get(old_page); | ||
| 2316 | |||
| 2317 | pte_unmap_unlock(page_table, ptl); | ||
| 2318 | return wp_page_copy(mm, vma, address, page_table, pmd, | ||
| 2319 | orig_pte, old_page); | ||
| 2320 | } | ||
| 2321 | |||
| 2261 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2322 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
| 2262 | unsigned long start_addr, unsigned long end_addr, | 2323 | unsigned long start_addr, unsigned long end_addr, |
| 2263 | struct zap_details *details) | 2324 | struct zap_details *details) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 65842d688b7c..e2e8014fb755 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -104,7 +104,7 @@ void put_online_mems(void) | |||
| 104 | 104 | ||
| 105 | } | 105 | } |
| 106 | 106 | ||
| 107 | static void mem_hotplug_begin(void) | 107 | void mem_hotplug_begin(void) |
| 108 | { | 108 | { |
| 109 | mem_hotplug.active_writer = current; | 109 | mem_hotplug.active_writer = current; |
| 110 | 110 | ||
| @@ -119,7 +119,7 @@ static void mem_hotplug_begin(void) | |||
| 119 | } | 119 | } |
| 120 | } | 120 | } |
| 121 | 121 | ||
| 122 | static void mem_hotplug_done(void) | 122 | void mem_hotplug_done(void) |
| 123 | { | 123 | { |
| 124 | mem_hotplug.active_writer = NULL; | 124 | mem_hotplug.active_writer = NULL; |
| 125 | mutex_unlock(&mem_hotplug.lock); | 125 | mutex_unlock(&mem_hotplug.lock); |
| @@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
| 502 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 502 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
| 503 | 503 | ||
| 504 | for (i = start_sec; i <= end_sec; i++) { | 504 | for (i = start_sec; i <= end_sec; i++) { |
| 505 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); | 505 | err = __add_section(nid, zone, section_nr_to_pfn(i)); |
| 506 | 506 | ||
| 507 | /* | 507 | /* |
| 508 | * EEXIST is finally dealt with by ioresource collision | 508 | * EEXIST is finally dealt with by ioresource collision |
| @@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
| 959 | } | 959 | } |
| 960 | 960 | ||
| 961 | 961 | ||
| 962 | /* Must be protected by mem_hotplug_begin() */ | ||
| 962 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | 963 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) |
| 963 | { | 964 | { |
| 964 | unsigned long flags; | 965 | unsigned long flags; |
| @@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 969 | int ret; | 970 | int ret; |
| 970 | struct memory_notify arg; | 971 | struct memory_notify arg; |
| 971 | 972 | ||
| 972 | mem_hotplug_begin(); | ||
| 973 | /* | 973 | /* |
| 974 | * This doesn't need a lock to do pfn_to_page(). | 974 | * This doesn't need a lock to do pfn_to_page(). |
| 975 | * The section can't be removed here because of the | 975 | * The section can't be removed here because of the |
| @@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 977 | */ | 977 | */ |
| 978 | zone = page_zone(pfn_to_page(pfn)); | 978 | zone = page_zone(pfn_to_page(pfn)); |
| 979 | 979 | ||
| 980 | ret = -EINVAL; | ||
| 981 | if ((zone_idx(zone) > ZONE_NORMAL || | 980 | if ((zone_idx(zone) > ZONE_NORMAL || |
| 982 | online_type == MMOP_ONLINE_MOVABLE) && | 981 | online_type == MMOP_ONLINE_MOVABLE) && |
| 983 | !can_online_high_movable(zone)) | 982 | !can_online_high_movable(zone)) |
| 984 | goto out; | 983 | return -EINVAL; |
| 985 | 984 | ||
| 986 | if (online_type == MMOP_ONLINE_KERNEL && | 985 | if (online_type == MMOP_ONLINE_KERNEL && |
| 987 | zone_idx(zone) == ZONE_MOVABLE) { | 986 | zone_idx(zone) == ZONE_MOVABLE) { |
| 988 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) | 987 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) |
| 989 | goto out; | 988 | return -EINVAL; |
| 990 | } | 989 | } |
| 991 | if (online_type == MMOP_ONLINE_MOVABLE && | 990 | if (online_type == MMOP_ONLINE_MOVABLE && |
| 992 | zone_idx(zone) == ZONE_MOVABLE - 1) { | 991 | zone_idx(zone) == ZONE_MOVABLE - 1) { |
| 993 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) | 992 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) |
| 994 | goto out; | 993 | return -EINVAL; |
| 995 | } | 994 | } |
| 996 | 995 | ||
| 997 | /* Previous code may changed the zone of the pfn range */ | 996 | /* Previous code may changed the zone of the pfn range */ |
| @@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 1007 | ret = notifier_to_errno(ret); | 1006 | ret = notifier_to_errno(ret); |
| 1008 | if (ret) { | 1007 | if (ret) { |
| 1009 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1008 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
| 1010 | goto out; | 1009 | return ret; |
| 1011 | } | 1010 | } |
| 1012 | /* | 1011 | /* |
| 1013 | * If this zone is not populated, then it is not in zonelist. | 1012 | * If this zone is not populated, then it is not in zonelist. |
| @@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 1031 | (((unsigned long long) pfn + nr_pages) | 1030 | (((unsigned long long) pfn + nr_pages) |
| 1032 | << PAGE_SHIFT) - 1); | 1031 | << PAGE_SHIFT) - 1); |
| 1033 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1032 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
| 1034 | goto out; | 1033 | return ret; |
| 1035 | } | 1034 | } |
| 1036 | 1035 | ||
| 1037 | zone->present_pages += onlined_pages; | 1036 | zone->present_pages += onlined_pages; |
| @@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 1061 | 1060 | ||
| 1062 | if (onlined_pages) | 1061 | if (onlined_pages) |
| 1063 | memory_notify(MEM_ONLINE, &arg); | 1062 | memory_notify(MEM_ONLINE, &arg); |
| 1064 | out: | 1063 | return 0; |
| 1065 | mem_hotplug_done(); | ||
| 1066 | return ret; | ||
| 1067 | } | 1064 | } |
| 1068 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 1065 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
| 1069 | 1066 | ||
| @@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
| 1688 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 1685 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
| 1689 | return -EINVAL; | 1686 | return -EINVAL; |
| 1690 | 1687 | ||
| 1691 | mem_hotplug_begin(); | ||
| 1692 | |||
| 1693 | zone = page_zone(pfn_to_page(start_pfn)); | 1688 | zone = page_zone(pfn_to_page(start_pfn)); |
| 1694 | node = zone_to_nid(zone); | 1689 | node = zone_to_nid(zone); |
| 1695 | nr_pages = end_pfn - start_pfn; | 1690 | nr_pages = end_pfn - start_pfn; |
| 1696 | 1691 | ||
| 1697 | ret = -EINVAL; | ||
| 1698 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) | 1692 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) |
| 1699 | goto out; | 1693 | return -EINVAL; |
| 1700 | 1694 | ||
| 1701 | /* set above range as isolated */ | 1695 | /* set above range as isolated */ |
| 1702 | ret = start_isolate_page_range(start_pfn, end_pfn, | 1696 | ret = start_isolate_page_range(start_pfn, end_pfn, |
| 1703 | MIGRATE_MOVABLE, true); | 1697 | MIGRATE_MOVABLE, true); |
| 1704 | if (ret) | 1698 | if (ret) |
| 1705 | goto out; | 1699 | return ret; |
| 1706 | 1700 | ||
| 1707 | arg.start_pfn = start_pfn; | 1701 | arg.start_pfn = start_pfn; |
| 1708 | arg.nr_pages = nr_pages; | 1702 | arg.nr_pages = nr_pages; |
| @@ -1795,7 +1789,6 @@ repeat: | |||
| 1795 | writeback_set_ratelimit(); | 1789 | writeback_set_ratelimit(); |
| 1796 | 1790 | ||
| 1797 | memory_notify(MEM_OFFLINE, &arg); | 1791 | memory_notify(MEM_OFFLINE, &arg); |
| 1798 | mem_hotplug_done(); | ||
| 1799 | return 0; | 1792 | return 0; |
| 1800 | 1793 | ||
| 1801 | failed_removal: | 1794 | failed_removal: |
| @@ -1805,12 +1798,10 @@ failed_removal: | |||
| 1805 | memory_notify(MEM_CANCEL_OFFLINE, &arg); | 1798 | memory_notify(MEM_CANCEL_OFFLINE, &arg); |
| 1806 | /* pushback to free area */ | 1799 | /* pushback to free area */ |
| 1807 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1800 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
| 1808 | |||
| 1809 | out: | ||
| 1810 | mem_hotplug_done(); | ||
| 1811 | return ret; | 1801 | return ret; |
| 1812 | } | 1802 | } |
| 1813 | 1803 | ||
| 1804 | /* Must be protected by mem_hotplug_begin() */ | ||
| 1814 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1805 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
| 1815 | { | 1806 | { |
| 1816 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1807 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
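
Since online_pages() and offline_pages() no longer take the hotplug lock themselves, and mem_hotplug_begin()/mem_hotplug_done() are now non-static, callers are expected to hold the lock around these calls. A minimal caller sketch under that assumption (the function name is hypothetical; the real caller is the memory_block device code):

    #include <linux/memory_hotplug.h>

    static int example_online_range(unsigned long start_pfn, unsigned long nr_pages)
    {
            int ret;

            mem_hotplug_begin();
            ret = online_pages(start_pfn, nr_pages, MMOP_ONLINE_KERNEL);
            mem_hotplug_done();

            return ret;
    }
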
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4721046a134a..ede26291d4aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
| 945 | return alloc_huge_page_node(page_hstate(compound_head(page)), | 945 | return alloc_huge_page_node(page_hstate(compound_head(page)), |
| 946 | node); | 946 | node); |
| 947 | else | 947 | else |
| 948 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); | 948 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | |
| 949 | __GFP_THISNODE, 0); | ||
| 949 | } | 950 | } |
| 950 | 951 | ||
| 951 | /* | 952 | /* |
| @@ -1985,7 +1986,8 @@ retry_cpuset: | |||
| 1985 | nmask = policy_nodemask(gfp, pol); | 1986 | nmask = policy_nodemask(gfp, pol); |
| 1986 | if (!nmask || node_isset(node, *nmask)) { | 1987 | if (!nmask || node_isset(node, *nmask)) { |
| 1987 | mpol_cond_put(pol); | 1988 | mpol_cond_put(pol); |
| 1988 | page = alloc_pages_exact_node(node, gfp, order); | 1989 | page = alloc_pages_exact_node(node, |
| 1990 | gfp | __GFP_THISNODE, order); | ||
| 1989 | goto out; | 1991 | goto out; |
| 1990 | } | 1992 | } |
| 1991 | } | 1993 | } |
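
Both mempolicy call sites above add __GFP_THISNODE so the page must come from the requested node instead of falling back through the zonelist. The pattern as a stand-alone sketch (hypothetical wrapper, not part of the patch):

    #include <linux/gfp.h>

    /* Sketch: allocate one page strictly on @nid; fail rather than fall back
     * to a different node. */
    static struct page *example_alloc_on_node(int nid, gfp_t gfp)
    {
            return alloc_pages_exact_node(nid, gfp | __GFP_THISNODE, 0);
    }
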
diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..949970db2874 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node); | |||
| 113 | * mempool_create(). | 113 | * mempool_create(). |
| 114 | * @new_min_nr: the new minimum number of elements guaranteed to be | 114 | * @new_min_nr: the new minimum number of elements guaranteed to be |
| 115 | * allocated for this pool. | 115 | * allocated for this pool. |
| 116 | * @gfp_mask: the usual allocation bitmask. | ||
| 117 | * | 116 | * |
| 118 | * This function shrinks/grows the pool. In the case of growing, | 117 | * This function shrinks/grows the pool. In the case of growing, |
| 119 | * it cannot be guaranteed that the pool will be grown to the new | 118 | * it cannot be guaranteed that the pool will be grown to the new |
| 120 | * size immediately, but new mempool_free() calls will refill it. | 119 | * size immediately, but new mempool_free() calls will refill it. |
| 120 | * This function may sleep. | ||
| 121 | * | 121 | * |
| 122 | * Note, the caller must guarantee that no mempool_destroy is called | 122 | * Note, the caller must guarantee that no mempool_destroy is called |
| 123 | * while this function is running. mempool_alloc() & mempool_free() | 123 | * while this function is running. mempool_alloc() & mempool_free() |
| 124 | * might be called (eg. from IRQ contexts) while this function executes. | 124 | * might be called (eg. from IRQ contexts) while this function executes. |
| 125 | */ | 125 | */ |
| 126 | int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) | 126 | int mempool_resize(mempool_t *pool, int new_min_nr) |
| 127 | { | 127 | { |
| 128 | void *element; | 128 | void *element; |
| 129 | void **new_elements; | 129 | void **new_elements; |
| 130 | unsigned long flags; | 130 | unsigned long flags; |
| 131 | 131 | ||
| 132 | BUG_ON(new_min_nr <= 0); | 132 | BUG_ON(new_min_nr <= 0); |
| 133 | might_sleep(); | ||
| 133 | 134 | ||
| 134 | spin_lock_irqsave(&pool->lock, flags); | 135 | spin_lock_irqsave(&pool->lock, flags); |
| 135 | if (new_min_nr <= pool->min_nr) { | 136 | if (new_min_nr <= pool->min_nr) { |
| @@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) | |||
| 145 | spin_unlock_irqrestore(&pool->lock, flags); | 146 | spin_unlock_irqrestore(&pool->lock, flags); |
| 146 | 147 | ||
| 147 | /* Grow the pool */ | 148 | /* Grow the pool */ |
| 148 | new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); | 149 | new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), |
| 150 | GFP_KERNEL); | ||
| 149 | if (!new_elements) | 151 | if (!new_elements) |
| 150 | return -ENOMEM; | 152 | return -ENOMEM; |
| 151 | 153 | ||
| @@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) | |||
| 164 | 166 | ||
| 165 | while (pool->curr_nr < pool->min_nr) { | 167 | while (pool->curr_nr < pool->min_nr) { |
| 166 | spin_unlock_irqrestore(&pool->lock, flags); | 168 | spin_unlock_irqrestore(&pool->lock, flags); |
| 167 | element = pool->alloc(gfp_mask, pool->pool_data); | 169 | element = pool->alloc(GFP_KERNEL, pool->pool_data); |
| 168 | if (!element) | 170 | if (!element) |
| 169 | goto out; | 171 | goto out; |
| 170 | spin_lock_irqsave(&pool->lock, flags); | 172 | spin_lock_irqsave(&pool->lock, flags); |
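
With the gfp_mask argument gone, mempool_resize() always allocates with GFP_KERNEL and may sleep, so it can only be called from process context. A caller sketch under that assumption (the function name is made up):

    #include <linux/mempool.h>

    /* Sketch: grow a pool with the new two-argument mempool_resize().
     * Old form: mempool_resize(pool, new_min_nr, GFP_KERNEL); */
    static int example_grow_pool(mempool_t *pool, int new_min_nr)
    {
            return mempool_resize(pool, new_min_nr);
    }
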
diff --git a/arch/x86/mm/memtest.c b/mm/memtest.c index 1e9da795767a..1997d934b13b 100644 --- a/arch/x86/mm/memtest.c +++ b/mm/memtest.c | |||
| @@ -29,7 +29,7 @@ static u64 patterns[] __initdata = { | |||
| 29 | 0x7a6c7258554e494cULL, /* yeah ;-) */ | 29 | 0x7a6c7258554e494cULL, /* yeah ;-) */ |
| 30 | }; | 30 | }; |
| 31 | 31 | ||
| 32 | static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) | 32 | static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) |
| 33 | { | 33 | { |
| 34 | printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", | 34 | printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", |
| 35 | (unsigned long long) pattern, | 35 | (unsigned long long) pattern, |
| @@ -38,11 +38,11 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) | |||
| 38 | memblock_reserve(start_bad, end_bad - start_bad); | 38 | memblock_reserve(start_bad, end_bad - start_bad); |
| 39 | } | 39 | } |
| 40 | 40 | ||
| 41 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) | 41 | static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) |
| 42 | { | 42 | { |
| 43 | u64 *p, *start, *end; | 43 | u64 *p, *start, *end; |
| 44 | u64 start_bad, last_bad; | 44 | phys_addr_t start_bad, last_bad; |
| 45 | u64 start_phys_aligned; | 45 | phys_addr_t start_phys_aligned; |
| 46 | const size_t incr = sizeof(pattern); | 46 | const size_t incr = sizeof(pattern); |
| 47 | 47 | ||
| 48 | start_phys_aligned = ALIGN(start_phys, incr); | 48 | start_phys_aligned = ALIGN(start_phys, incr); |
| @@ -69,14 +69,14 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) | |||
| 69 | reserve_bad_mem(pattern, start_bad, last_bad + incr); | 69 | reserve_bad_mem(pattern, start_bad, last_bad + incr); |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | static void __init do_one_pass(u64 pattern, u64 start, u64 end) | 72 | static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) |
| 73 | { | 73 | { |
| 74 | u64 i; | 74 | u64 i; |
| 75 | phys_addr_t this_start, this_end; | 75 | phys_addr_t this_start, this_end; |
| 76 | 76 | ||
| 77 | for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { | 77 | for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { |
| 78 | this_start = clamp_t(phys_addr_t, this_start, start, end); | 78 | this_start = clamp(this_start, start, end); |
| 79 | this_end = clamp_t(phys_addr_t, this_end, start, end); | 79 | this_end = clamp(this_end, start, end); |
| 80 | if (this_start < this_end) { | 80 | if (this_start < this_end) { |
| 81 | printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", | 81 | printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", |
| 82 | (unsigned long long)this_start, | 82 | (unsigned long long)this_start, |
| @@ -102,7 +102,7 @@ static int __init parse_memtest(char *arg) | |||
| 102 | 102 | ||
| 103 | early_param("memtest", parse_memtest); | 103 | early_param("memtest", parse_memtest); |
| 104 | 104 | ||
| 105 | void __init early_memtest(unsigned long start, unsigned long end) | 105 | void __init early_memtest(phys_addr_t start, phys_addr_t end) |
| 106 | { | 106 | { |
| 107 | unsigned int i; | 107 | unsigned int i; |
| 108 | unsigned int idx = 0; | 108 | unsigned int idx = 0; |
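Besides moving arch/x86/mm/memtest.c to mm/memtest.c, the hunks above widen the address parameters from u64/unsigned long to phys_addr_t, so the shared code also covers configurations where physical addresses are wider than unsigned long. The test itself is unchanged; conceptually each pass does the following (a simplified sketch of the existing logic, not new code):

    /*
     * Sketch of one memtest pass over an already-mapped region; the real
     * code walks free memblock ranges and reserves the words that fail.
     */
    static void __init memtest_pass_sketch(u64 pattern, u64 *start, u64 *end)
    {
            u64 *p;

            for (p = start; p < end; p++)
                    *p = pattern;
            for (p = start; p < end; p++)
                    if (*p != pattern)
                            pr_info("bad mem word at %p\n", p);
    }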
diff --git a/mm/migrate.c b/mm/migrate.c index 85e042686031..a65ff72ab739 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -901,12 +901,23 @@ out: | |||
| 901 | } | 901 | } |
| 902 | 902 | ||
| 903 | /* | 903 | /* |
| 904 | * gcc 4.7 and 4.8 on arm get ICEs when inlining unmap_and_move(). Work | ||
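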
| 905 | * around it. | ||
| 906 | */ | ||
| 907 | #if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) | ||
| 908 | #define ICE_noinline noinline | ||
| 909 | #else | ||
| 910 | #define ICE_noinline | ||
| 911 | #endif | ||
| 912 | |||
| 913 | /* | ||
| 904 | * Obtain the lock on page, remove all ptes and migrate the page | 914 | * Obtain the lock on page, remove all ptes and migrate the page |
| 905 | * to the newly allocated page in newpage. | 915 | * to the newly allocated page in newpage. |
| 906 | */ | 916 | */ |
| 907 | static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, | 917 | static ICE_noinline int unmap_and_move(new_page_t get_new_page, |
| 908 | unsigned long private, struct page *page, int force, | 918 | free_page_t put_new_page, |
| 909 | enum migrate_mode mode) | 919 | unsigned long private, struct page *page, |
| 920 | int force, enum migrate_mode mode) | ||
| 910 | { | 921 | { |
| 911 | int rc = 0; | 922 | int rc = 0; |
| 912 | int *result = NULL; | 923 | int *result = NULL; |
| @@ -1554,30 +1565,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
| 1554 | * page migration rate limiting control. | 1565 | * page migration rate limiting control. |
| 1555 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs | 1566 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs |
| 1556 | * window of time. Default here says do not migrate more than 1280M per second. | 1567 | * window of time. Default here says do not migrate more than 1280M per second. |
| 1557 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However | ||
| 1558 | * as it is faults that reset the window, pte updates will happen unconditionally | ||
| 1559 | * if there has not been a fault since @pteupdate_interval_millisecs after the | ||
| 1560 | * throttle window closed. | ||
| 1561 | */ | 1568 | */ |
| 1562 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | 1569 | static unsigned int migrate_interval_millisecs __read_mostly = 100; |
| 1563 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
| 1564 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | 1570 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); |
| 1565 | 1571 | ||
| 1566 | /* Returns true if NUMA migration is currently rate limited */ | ||
| 1567 | bool migrate_ratelimited(int node) | ||
| 1568 | { | ||
| 1569 | pg_data_t *pgdat = NODE_DATA(node); | ||
| 1570 | |||
| 1571 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
| 1572 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
| 1573 | return false; | ||
| 1574 | |||
| 1575 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
| 1576 | return false; | ||
| 1577 | |||
| 1578 | return true; | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | /* Returns true if the node is migrate rate-limited after the update */ | 1572 | /* Returns true if the node is migrate rate-limited after the update */ |
| 1582 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, | 1573 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
| 1583 | unsigned long nr_pages) | 1574 | unsigned long nr_pages) |
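The ICE_noinline macro introduced above applies the workaround only where the compiler bug exists. GCC_VERSION (from <linux/compiler-gcc.h>) is __GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__, so for example:

    /* gcc 4.8.2 on ARM: GCC_VERSION == 40802, inside [40700, 40900), so
     * unmap_and_move() is marked noinline to dodge the ICE.
     * gcc 4.9.0: GCC_VERSION == 40900, outside the range, the macro is
     * empty and the compiler may inline as before.
     */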
diff --git a/mm/mlock.c b/mm/mlock.c index 8a54cd214925..6fd2cf15e868 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -205,62 +205,6 @@ out: | |||
| 205 | return nr_pages - 1; | 205 | return nr_pages - 1; |
| 206 | } | 206 | } |
| 207 | 207 | ||
| 208 | /** | ||
| 209 | * __mlock_vma_pages_range() - mlock a range of pages in the vma. | ||
| 210 | * @vma: target vma | ||
| 211 | * @start: start address | ||
| 212 | * @end: end address | ||
| 213 | * @nonblocking: | ||
| 214 | * | ||
| 215 | * This takes care of making the pages present too. | ||
| 216 | * | ||
| 217 | * return 0 on success, negative error code on error. | ||
| 218 | * | ||
| 219 | * vma->vm_mm->mmap_sem must be held. | ||
| 220 | * | ||
| 221 | * If @nonblocking is NULL, it may be held for read or write and will | ||
| 222 | * be unperturbed. | ||
| 223 | * | ||
| 224 | * If @nonblocking is non-NULL, it must held for read only and may be | ||
| 225 | * released. If it's released, *@nonblocking will be set to 0. | ||
| 226 | */ | ||
| 227 | long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 228 | unsigned long start, unsigned long end, int *nonblocking) | ||
| 229 | { | ||
| 230 | struct mm_struct *mm = vma->vm_mm; | ||
| 231 | unsigned long nr_pages = (end - start) / PAGE_SIZE; | ||
| 232 | int gup_flags; | ||
| 233 | |||
| 234 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 235 | VM_BUG_ON(end & ~PAGE_MASK); | ||
| 236 | VM_BUG_ON_VMA(start < vma->vm_start, vma); | ||
| 237 | VM_BUG_ON_VMA(end > vma->vm_end, vma); | ||
| 238 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
| 239 | |||
| 240 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; | ||
| 241 | /* | ||
| 242 | * We want to touch writable mappings with a write fault in order | ||
| 243 | * to break COW, except for shared mappings because these don't COW | ||
| 244 | * and we would not want to dirty them for nothing. | ||
| 245 | */ | ||
| 246 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
| 247 | gup_flags |= FOLL_WRITE; | ||
| 248 | |||
| 249 | /* | ||
| 250 | * We want mlock to succeed for regions that have any permissions | ||
| 251 | * other than PROT_NONE. | ||
| 252 | */ | ||
| 253 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | ||
| 254 | gup_flags |= FOLL_FORCE; | ||
| 255 | |||
| 256 | /* | ||
| 257 | * We made sure addr is within a VMA, so the following will | ||
| 258 | * not result in a stack expansion that recurses back here. | ||
| 259 | */ | ||
| 260 | return __get_user_pages(current, mm, start, nr_pages, gup_flags, | ||
| 261 | NULL, NULL, nonblocking); | ||
| 262 | } | ||
| 263 | |||
| 264 | /* | 208 | /* |
| 265 | * convert get_user_pages() return value to posix mlock() error | 209 | * convert get_user_pages() return value to posix mlock() error |
| 266 | */ | 210 | */ |
| @@ -596,7 +540,7 @@ success: | |||
| 596 | /* | 540 | /* |
| 597 | * vm_flags is protected by the mmap_sem held in write mode. | 541 | * vm_flags is protected by the mmap_sem held in write mode. |
| 598 | * It's okay if try_to_unmap_one unmaps a page just after we | 542 | * It's okay if try_to_unmap_one unmaps a page just after we |
| 599 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. | 543 | * set VM_LOCKED, populate_vma_page_range will bring it back. |
| 600 | */ | 544 | */ |
| 601 | 545 | ||
| 602 | if (lock) | 546 | if (lock) |
| @@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
| 660 | return error; | 604 | return error; |
| 661 | } | 605 | } |
| 662 | 606 | ||
| 663 | /* | ||
| 664 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
| 665 | * | ||
| 666 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
| 667 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
| 668 | * mmap_sem must not be held. | ||
| 669 | */ | ||
| 670 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
| 671 | { | ||
| 672 | struct mm_struct *mm = current->mm; | ||
| 673 | unsigned long end, nstart, nend; | ||
| 674 | struct vm_area_struct *vma = NULL; | ||
| 675 | int locked = 0; | ||
| 676 | long ret = 0; | ||
| 677 | |||
| 678 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 679 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
| 680 | end = start + len; | ||
| 681 | |||
| 682 | for (nstart = start; nstart < end; nstart = nend) { | ||
| 683 | /* | ||
| 684 | * We want to fault in pages for [nstart; end) address range. | ||
| 685 | * Find first corresponding VMA. | ||
| 686 | */ | ||
| 687 | if (!locked) { | ||
| 688 | locked = 1; | ||
| 689 | down_read(&mm->mmap_sem); | ||
| 690 | vma = find_vma(mm, nstart); | ||
| 691 | } else if (nstart >= vma->vm_end) | ||
| 692 | vma = vma->vm_next; | ||
| 693 | if (!vma || vma->vm_start >= end) | ||
| 694 | break; | ||
| 695 | /* | ||
| 696 | * Set [nstart; nend) to intersection of desired address | ||
| 697 | * range with the first VMA. Also, skip undesirable VMA types. | ||
| 698 | */ | ||
| 699 | nend = min(end, vma->vm_end); | ||
| 700 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 701 | continue; | ||
| 702 | if (nstart < vma->vm_start) | ||
| 703 | nstart = vma->vm_start; | ||
| 704 | /* | ||
| 705 | * Now fault in a range of pages. __mlock_vma_pages_range() | ||
| 706 | * double checks the vma flags, so that it won't mlock pages | ||
| 707 | * if the vma was already munlocked. | ||
| 708 | */ | ||
| 709 | ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); | ||
| 710 | if (ret < 0) { | ||
| 711 | if (ignore_errors) { | ||
| 712 | ret = 0; | ||
| 713 | continue; /* continue at next VMA */ | ||
| 714 | } | ||
| 715 | ret = __mlock_posix_error_return(ret); | ||
| 716 | break; | ||
| 717 | } | ||
| 718 | nend = nstart + ret * PAGE_SIZE; | ||
| 719 | ret = 0; | ||
| 720 | } | ||
| 721 | if (locked) | ||
| 722 | up_read(&mm->mmap_sem); | ||
| 723 | return ret; /* 0 or negative error code */ | ||
| 724 | } | ||
| 725 | |||
| 726 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | 607 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) |
| 727 | { | 608 | { |
| 728 | unsigned long locked; | 609 | unsigned long locked; |
| @@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 750 | error = do_mlock(start, len, 1); | 631 | error = do_mlock(start, len, 1); |
| 751 | 632 | ||
| 752 | up_write(¤t->mm->mmap_sem); | 633 | up_write(¤t->mm->mmap_sem); |
| 753 | if (!error) | 634 | if (error) |
| 754 | error = __mm_populate(start, len, 0); | 635 | return error; |
| 755 | return error; | 636 | |
| 637 | error = __mm_populate(start, len, 0); | ||
| 638 | if (error) | ||
| 639 | return __mlock_posix_error_return(error); | ||
| 640 | return 0; | ||
| 756 | } | 641 | } |
| 757 | 642 | ||
| 758 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | 643 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) |
| @@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 2316 | if (!prev || expand_stack(prev, addr)) | 2316 | if (!prev || expand_stack(prev, addr)) |
| 2317 | return NULL; | 2317 | return NULL; |
| 2318 | if (prev->vm_flags & VM_LOCKED) | 2318 | if (prev->vm_flags & VM_LOCKED) |
| 2319 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); | 2319 | populate_vma_page_range(prev, addr, prev->vm_end, NULL); |
| 2320 | return prev; | 2320 | return prev; |
| 2321 | } | 2321 | } |
| 2322 | #else | 2322 | #else |
| @@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 2351 | if (expand_stack(vma, addr)) | 2351 | if (expand_stack(vma, addr)) |
| 2352 | return NULL; | 2352 | return NULL; |
| 2353 | if (vma->vm_flags & VM_LOCKED) | 2353 | if (vma->vm_flags & VM_LOCKED) |
| 2354 | __mlock_vma_pages_range(vma, addr, start, NULL); | 2354 | populate_vma_page_range(vma, addr, start, NULL); |
| 2355 | return vma; | 2355 | return vma; |
| 2356 | } | 2356 | } |
| 2357 | #endif | 2357 | #endif |
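With __mlock_vma_pages_range() and __mm_populate() gone from mm/mlock.c, the population helper survives as populate_vma_page_range() (used by the find_extend_vma() hunks above), and sys_mlock() now applies __mlock_posix_error_return() to the __mm_populate() result itself; the removed helper used to do that conversion internally, as the deleted lines show. The caller-visible behaviour is unchanged: population failures still surface as the usual POSIX mlock() errors, e.g. (illustrative userspace fragment):

    if (mlock(addr, len) != 0)
            perror("mlock");        /* e.g. ENOMEM or EAGAIN */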
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 642f38cb175a..52628c819bf7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 612 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 612 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
| 613 | */ | 613 | */ |
| 614 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 614 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
| 615 | int order, const nodemask_t *nodemask) | 615 | int order, const nodemask_t *nodemask, |
| 616 | struct mem_cgroup *memcg) | ||
| 616 | { | 617 | { |
| 617 | if (likely(!sysctl_panic_on_oom)) | 618 | if (likely(!sysctl_panic_on_oom)) |
| 618 | return; | 619 | return; |
| @@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
| 625 | if (constraint != CONSTRAINT_NONE) | 626 | if (constraint != CONSTRAINT_NONE) |
| 626 | return; | 627 | return; |
| 627 | } | 628 | } |
| 628 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 629 | dump_header(NULL, gfp_mask, order, memcg, nodemask); |
| 629 | panic("Out of memory: %s panic_on_oom is enabled\n", | 630 | panic("Out of memory: %s panic_on_oom is enabled\n", |
| 630 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 631 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
| 631 | } | 632 | } |
| @@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 740 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask, | 741 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask, |
| 741 | &totalpages); | 742 | &totalpages); |
| 742 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 743 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
| 743 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 744 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); |
| 744 | 745 | ||
| 745 | if (sysctl_oom_kill_allocating_task && current->mm && | 746 | if (sysctl_oom_kill_allocating_task && current->mm && |
| 746 | !oom_unkillable_task(current, NULL, nodemask) && | 747 | !oom_unkillable_task(current, NULL, nodemask) && |
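check_panic_on_oom() now takes a struct mem_cgroup * so that a panic triggered from a memcg OOM can include the cgroup's state in dump_header(); the global path above simply passes NULL. A hypothetical memcg-side caller under that assumption (the memcg hunk is not part of this excerpt):

    /* Hypothetical memcg OOM path: hand the memcg to check_panic_on_oom()
     * so a panic report can show the cgroup's usage via dump_header(). */
    check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);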
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 644bcb665773..0372411f38fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -2111,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
| 2111 | EXPORT_SYMBOL(account_page_dirtied); | 2111 | EXPORT_SYMBOL(account_page_dirtied); |
| 2112 | 2112 | ||
| 2113 | /* | 2113 | /* |
| 2114 | * Helper function for de-accounting a dirty page without writeback. | ||
| 2115 | * | ||
| 2116 | * Doing this should *normally* only ever be done when a page | ||
| 2117 | * is truncated, and is not actually mapped anywhere at all. However, | ||
| 2118 | * fs/buffer.c does this when it notices that somebody has cleaned | ||
| 2119 | * out all the buffers on a page without actually doing it through | ||
| 2120 | * the VM. Can you say "ext3 is horribly ugly"? Thought you could. | ||
| 2121 | */ | ||
| 2122 | void account_page_cleaned(struct page *page, struct address_space *mapping) | ||
| 2123 | { | ||
| 2124 | if (mapping_cap_account_dirty(mapping)) { | ||
| 2125 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
| 2126 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); | ||
| 2127 | task_io_account_cancelled_write(PAGE_CACHE_SIZE); | ||
| 2128 | } | ||
| 2129 | } | ||
| 2130 | EXPORT_SYMBOL(account_page_cleaned); | ||
| 2131 | |||
| 2132 | /* | ||
| 2114 | * For address_spaces which do not use buffers. Just tag the page as dirty in | 2133 | * For address_spaces which do not use buffers. Just tag the page as dirty in |
| 2115 | * its radix tree. | 2134 | * its radix tree. |
| 2116 | * | 2135 | * |
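account_page_cleaned() takes over the dirty-accounting rollback that used to live in cancel_dirty_page() (removed from mm/truncate.c further down); callers are now expected to clear the dirty bit themselves first. The caller pattern, exactly as the truncate_complete_page() hunk later in this diff uses it:

    /* Clear the dirty bit, then undo the dirty accounting without writeback. */
    if (TestClearPageDirty(page))
            account_page_cleaned(page, mapping);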
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40e29429e7b0..1b849500640c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
| 1032 | static int fallbacks[MIGRATE_TYPES][4] = { | 1032 | static int fallbacks[MIGRATE_TYPES][4] = { |
| 1033 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 1033 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
| 1034 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 1034 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
| 1035 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 1035 | #ifdef CONFIG_CMA | 1036 | #ifdef CONFIG_CMA |
| 1036 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 1037 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | 1037 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ |
| 1038 | #else | ||
| 1039 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 1040 | #endif | 1038 | #endif |
| 1041 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 1039 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
| 1042 | #ifdef CONFIG_MEMORY_ISOLATION | 1040 | #ifdef CONFIG_MEMORY_ISOLATION |
| @@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
| 1044 | #endif | 1042 | #endif |
| 1045 | }; | 1043 | }; |
| 1046 | 1044 | ||
| 1045 | #ifdef CONFIG_CMA | ||
| 1046 | static struct page *__rmqueue_cma_fallback(struct zone *zone, | ||
| 1047 | unsigned int order) | ||
| 1048 | { | ||
| 1049 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); | ||
| 1050 | } | ||
| 1051 | #else | ||
| 1052 | static inline struct page *__rmqueue_cma_fallback(struct zone *zone, | ||
| 1053 | unsigned int order) { return NULL; } | ||
| 1054 | #endif | ||
| 1055 | |||
| 1047 | /* | 1056 | /* |
| 1048 | * Move the free pages in a range to the free lists of the requested type. | 1057 | * Move the free pages in a range to the free lists of the requested type. |
| 1049 | * Note that start_page and end_pages are not aligned on a pageblock | 1058 | * Note that start_page and end_pages are not aligned on a pageblock |
| @@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
| 1136 | * as fragmentation caused by those allocations polluting movable pageblocks | 1145 | * as fragmentation caused by those allocations polluting movable pageblocks |
| 1137 | * is worse than movable allocations stealing from unmovable and reclaimable | 1146 | * is worse than movable allocations stealing from unmovable and reclaimable |
| 1138 | * pageblocks. | 1147 | * pageblocks. |
| 1139 | * | ||
| 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype | ||
| 1141 | * as well. | ||
| 1142 | */ | 1148 | */ |
| 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, | 1149 | static bool can_steal_fallback(unsigned int order, int start_mt) |
| 1144 | int start_type, int fallback_type) | 1150 | { |
| 1151 | /* | ||
| 1152 | * Leaving this order check here is intentional, even though the | ||
| 1153 | * check below is a more relaxed version of it. The reason is that | ||
| 1154 | * we can actually steal the whole pageblock if this condition is | ||
| 1155 | * met, but the check below does not guarantee that and is only a | ||
| 1156 | * heuristic, so it could be changed at any time. | ||
| 1157 | */ | ||
| 1158 | if (order >= pageblock_order) | ||
| 1159 | return true; | ||
| 1160 | |||
| 1161 | if (order >= pageblock_order / 2 || | ||
| 1162 | start_mt == MIGRATE_RECLAIMABLE || | ||
| 1163 | start_mt == MIGRATE_UNMOVABLE || | ||
| 1164 | page_group_by_mobility_disabled) | ||
| 1165 | return true; | ||
| 1166 | |||
| 1167 | return false; | ||
| 1168 | } | ||
| 1169 | |||
| 1170 | /* | ||
| 1171 | * This function implements the actual steal behaviour. If the order is | ||
| 1172 | * large enough, we can steal the whole pageblock. If not, we first move | ||
| 1173 | * the free pages in this pageblock and check whether at least half of | ||
| 1174 | * them were moved. If so, we can change the migratetype of the pageblock | ||
| 1175 | * and permanently use its pages as the requested migratetype in the future. | ||
| 1176 | */ | ||
| 1177 | static void steal_suitable_fallback(struct zone *zone, struct page *page, | ||
| 1178 | int start_type) | ||
| 1145 | { | 1179 | { |
| 1146 | int current_order = page_order(page); | 1180 | int current_order = page_order(page); |
| 1181 | int pages; | ||
| 1147 | 1182 | ||
| 1148 | /* Take ownership for orders >= pageblock_order */ | 1183 | /* Take ownership for orders >= pageblock_order */ |
| 1149 | if (current_order >= pageblock_order) { | 1184 | if (current_order >= pageblock_order) { |
| @@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page, | |||
| 1151 | return; | 1186 | return; |
| 1152 | } | 1187 | } |
| 1153 | 1188 | ||
| 1154 | if (current_order >= pageblock_order / 2 || | 1189 | pages = move_freepages_block(zone, page, start_type); |
| 1155 | start_type == MIGRATE_RECLAIMABLE || | 1190 | |
| 1156 | start_type == MIGRATE_UNMOVABLE || | 1191 | /* Claim the whole block if over half of it is free */ |
| 1157 | page_group_by_mobility_disabled) { | 1192 | if (pages >= (1 << (pageblock_order-1)) || |
| 1158 | int pages; | 1193 | page_group_by_mobility_disabled) |
| 1194 | set_pageblock_migratetype(page, start_type); | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | /* | ||
| 1198 | * Check whether there is a suitable fallback freepage of the requested order. | ||
| 1199 | * If only_stealable is true, this function returns fallback_mt only if | ||
| 1200 | * we can steal the other freepages altogether. This helps to reduce | ||
| 1201 | * fragmentation due to mixed migratetype pages in one pageblock. | ||
| 1202 | */ | ||
| 1203 | int find_suitable_fallback(struct free_area *area, unsigned int order, | ||
| 1204 | int migratetype, bool only_stealable, bool *can_steal) | ||
| 1205 | { | ||
| 1206 | int i; | ||
| 1207 | int fallback_mt; | ||
| 1208 | |||
| 1209 | if (area->nr_free == 0) | ||
| 1210 | return -1; | ||
| 1211 | |||
| 1212 | *can_steal = false; | ||
| 1213 | for (i = 0;; i++) { | ||
| 1214 | fallback_mt = fallbacks[migratetype][i]; | ||
| 1215 | if (fallback_mt == MIGRATE_RESERVE) | ||
| 1216 | break; | ||
| 1217 | |||
| 1218 | if (list_empty(&area->free_list[fallback_mt])) | ||
| 1219 | continue; | ||
| 1159 | 1220 | ||
| 1160 | pages = move_freepages_block(zone, page, start_type); | 1221 | if (can_steal_fallback(order, migratetype)) |
| 1222 | *can_steal = true; | ||
| 1161 | 1223 | ||
| 1162 | /* Claim the whole block if over half of it is free */ | 1224 | if (!only_stealable) |
| 1163 | if (pages >= (1 << (pageblock_order-1)) || | 1225 | return fallback_mt; |
| 1164 | page_group_by_mobility_disabled) | 1226 | |
| 1165 | set_pageblock_migratetype(page, start_type); | 1227 | if (*can_steal) |
| 1228 | return fallback_mt; | ||
| 1166 | } | 1229 | } |
| 1230 | |||
| 1231 | return -1; | ||
| 1167 | } | 1232 | } |
| 1168 | 1233 | ||
| 1169 | /* Remove an element from the buddy allocator from the fallback list */ | 1234 | /* Remove an element from the buddy allocator from the fallback list */ |
| @@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1173 | struct free_area *area; | 1238 | struct free_area *area; |
| 1174 | unsigned int current_order; | 1239 | unsigned int current_order; |
| 1175 | struct page *page; | 1240 | struct page *page; |
| 1241 | int fallback_mt; | ||
| 1242 | bool can_steal; | ||
| 1176 | 1243 | ||
| 1177 | /* Find the largest possible block of pages in the other list */ | 1244 | /* Find the largest possible block of pages in the other list */ |
| 1178 | for (current_order = MAX_ORDER-1; | 1245 | for (current_order = MAX_ORDER-1; |
| 1179 | current_order >= order && current_order <= MAX_ORDER-1; | 1246 | current_order >= order && current_order <= MAX_ORDER-1; |
| 1180 | --current_order) { | 1247 | --current_order) { |
| 1181 | int i; | 1248 | area = &(zone->free_area[current_order]); |
| 1182 | for (i = 0;; i++) { | 1249 | fallback_mt = find_suitable_fallback(area, current_order, |
| 1183 | int migratetype = fallbacks[start_migratetype][i]; | 1250 | start_migratetype, false, &can_steal); |
| 1184 | int buddy_type = start_migratetype; | 1251 | if (fallback_mt == -1) |
| 1185 | 1252 | continue; | |
| 1186 | /* MIGRATE_RESERVE handled later if necessary */ | ||
| 1187 | if (migratetype == MIGRATE_RESERVE) | ||
| 1188 | break; | ||
| 1189 | |||
| 1190 | area = &(zone->free_area[current_order]); | ||
| 1191 | if (list_empty(&area->free_list[migratetype])) | ||
| 1192 | continue; | ||
| 1193 | |||
| 1194 | page = list_entry(area->free_list[migratetype].next, | ||
| 1195 | struct page, lru); | ||
| 1196 | area->nr_free--; | ||
| 1197 | |||
| 1198 | if (!is_migrate_cma(migratetype)) { | ||
| 1199 | try_to_steal_freepages(zone, page, | ||
| 1200 | start_migratetype, | ||
| 1201 | migratetype); | ||
| 1202 | } else { | ||
| 1203 | /* | ||
| 1204 | * When borrowing from MIGRATE_CMA, we need to | ||
| 1205 | * release the excess buddy pages to CMA | ||
| 1206 | * itself, and we do not try to steal extra | ||
| 1207 | * free pages. | ||
| 1208 | */ | ||
| 1209 | buddy_type = migratetype; | ||
| 1210 | } | ||
| 1211 | 1253 | ||
| 1212 | /* Remove the page from the freelists */ | 1254 | page = list_entry(area->free_list[fallback_mt].next, |
| 1213 | list_del(&page->lru); | 1255 | struct page, lru); |
| 1214 | rmv_page_order(page); | 1256 | if (can_steal) |
| 1257 | steal_suitable_fallback(zone, page, start_migratetype); | ||
| 1215 | 1258 | ||
| 1216 | expand(zone, page, order, current_order, area, | 1259 | /* Remove the page from the freelists */ |
| 1217 | buddy_type); | 1260 | area->nr_free--; |
| 1261 | list_del(&page->lru); | ||
| 1262 | rmv_page_order(page); | ||
| 1218 | 1263 | ||
| 1219 | /* | 1264 | expand(zone, page, order, current_order, area, |
| 1220 | * The freepage_migratetype may differ from pageblock's | 1265 | start_migratetype); |
| 1221 | * migratetype depending on the decisions in | 1266 | /* |
| 1222 | * try_to_steal_freepages(). This is OK as long as it | 1267 | * The freepage_migratetype may differ from pageblock's |
| 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA | 1268 | * migratetype depending on the decisions in |
| 1224 | * we need to make sure unallocated pages flushed from | 1269 | * try_to_steal_freepages(). This is OK as long as it |
| 1225 | * pcp lists are returned to the correct freelist. | 1270 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
| 1226 | */ | 1271 | * we need to make sure unallocated pages flushed from |
| 1227 | set_freepage_migratetype(page, buddy_type); | 1272 | * pcp lists are returned to the correct freelist. |
| 1273 | */ | ||
| 1274 | set_freepage_migratetype(page, start_migratetype); | ||
| 1228 | 1275 | ||
| 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1276 | trace_mm_page_alloc_extfrag(page, order, current_order, |
| 1230 | start_migratetype, migratetype); | 1277 | start_migratetype, fallback_mt); |
| 1231 | 1278 | ||
| 1232 | return page; | 1279 | return page; |
| 1233 | } | ||
| 1234 | } | 1280 | } |
| 1235 | 1281 | ||
| 1236 | return NULL; | 1282 | return NULL; |
| @@ -1249,7 +1295,11 @@ retry_reserve: | |||
| 1249 | page = __rmqueue_smallest(zone, order, migratetype); | 1295 | page = __rmqueue_smallest(zone, order, migratetype); |
| 1250 | 1296 | ||
| 1251 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { | 1297 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
| 1252 | page = __rmqueue_fallback(zone, order, migratetype); | 1298 | if (migratetype == MIGRATE_MOVABLE) |
| 1299 | page = __rmqueue_cma_fallback(zone, order); | ||
| 1300 | |||
| 1301 | if (!page) | ||
| 1302 | page = __rmqueue_fallback(zone, order, migratetype); | ||
| 1253 | 1303 | ||
| 1254 | /* | 1304 | /* |
| 1255 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | 1305 | * Use MIGRATE_RESERVE rather than fail an allocation. goto |
| @@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2362 | *did_some_progress = 1; | 2412 | *did_some_progress = 1; |
| 2363 | goto out; | 2413 | goto out; |
| 2364 | } | 2414 | } |
| 2365 | /* | 2415 | /* The OOM killer may not free memory on a specific node */ |
| 2366 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | ||
| 2367 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | ||
| 2368 | * The caller should handle page allocation failure by itself if | ||
| 2369 | * it specifies __GFP_THISNODE. | ||
| 2370 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | ||
| 2371 | */ | ||
| 2372 | if (gfp_mask & __GFP_THISNODE) | 2416 | if (gfp_mask & __GFP_THISNODE) |
| 2373 | goto out; | 2417 | goto out; |
| 2374 | } | 2418 | } |
| @@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 2623 | } | 2667 | } |
| 2624 | 2668 | ||
| 2625 | /* | 2669 | /* |
| 2626 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 2670 | * If this allocation cannot block and it is for a specific node, then |
| 2627 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | 2671 | * fail early. There's no need to wakeup kswapd or retry for a |
| 2628 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | 2672 | * speculative node-specific allocation. |
| 2629 | * using a larger set of nodes after it has established that the | ||
| 2630 | * allowed per node queues are empty and that nodes are | ||
| 2631 | * over allocated. | ||
| 2632 | */ | 2673 | */ |
| 2633 | if (IS_ENABLED(CONFIG_NUMA) && | 2674 | if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) |
| 2634 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
| 2635 | goto nopage; | 2675 | goto nopage; |
| 2636 | 2676 | ||
| 2637 | retry: | 2677 | retry: |
| @@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
| 2824 | /* | 2864 | /* |
| 2825 | * Check the zones suitable for the gfp_mask contain at least one | 2865 | * Check the zones suitable for the gfp_mask contain at least one |
| 2826 | * valid zone. It's possible to have an empty zonelist as a result | 2866 | * valid zone. It's possible to have an empty zonelist as a result |
| 2827 | * of GFP_THISNODE and a memoryless node | 2867 | * of __GFP_THISNODE and a memoryless node |
| 2828 | */ | 2868 | */ |
| 2829 | if (unlikely(!zonelist->_zonerefs->zone)) | 2869 | if (unlikely(!zonelist->_zonerefs->zone)) |
| 2830 | return NULL; | 2870 | return NULL; |
| @@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type) | |||
| 3201 | * Show free area list (used inside shift_scroll-lock stuff) | 3241 | * Show free area list (used inside shift_scroll-lock stuff) |
| 3202 | * We also calculate the percentage fragmentation. We do this by counting the | 3242 | * We also calculate the percentage fragmentation. We do this by counting the |
| 3203 | * memory on each free list with the exception of the first item on the list. | 3243 | * memory on each free list with the exception of the first item on the list. |
| 3204 | * Suppresses nodes that are not allowed by current's cpuset if | 3244 | * |
| 3205 | * SHOW_MEM_FILTER_NODES is passed. | 3245 | * Bits in @filter: |
| 3246 | * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's | ||
| 3247 | * cpuset. | ||
| 3206 | */ | 3248 | */ |
| 3207 | void show_free_areas(unsigned int filter) | 3249 | void show_free_areas(unsigned int filter) |
| 3208 | { | 3250 | { |
| 3251 | unsigned long free_pcp = 0; | ||
| 3209 | int cpu; | 3252 | int cpu; |
| 3210 | struct zone *zone; | 3253 | struct zone *zone; |
| 3211 | 3254 | ||
| 3212 | for_each_populated_zone(zone) { | 3255 | for_each_populated_zone(zone) { |
| 3213 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3256 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
| 3214 | continue; | 3257 | continue; |
| 3215 | show_node(zone); | ||
| 3216 | printk("%s per-cpu:\n", zone->name); | ||
| 3217 | 3258 | ||
| 3218 | for_each_online_cpu(cpu) { | 3259 | for_each_online_cpu(cpu) |
| 3219 | struct per_cpu_pageset *pageset; | 3260 | free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; |
| 3220 | |||
| 3221 | pageset = per_cpu_ptr(zone->pageset, cpu); | ||
| 3222 | |||
| 3223 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | ||
| 3224 | cpu, pageset->pcp.high, | ||
| 3225 | pageset->pcp.batch, pageset->pcp.count); | ||
| 3226 | } | ||
| 3227 | } | 3261 | } |
| 3228 | 3262 | ||
| 3229 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" | 3263 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
| 3230 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" | 3264 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
| 3231 | " unevictable:%lu" | 3265 | " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
| 3232 | " dirty:%lu writeback:%lu unstable:%lu\n" | 3266 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
| 3233 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | ||
| 3234 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" | 3267 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
| 3235 | " free_cma:%lu\n", | 3268 | " free:%lu free_pcp:%lu free_cma:%lu\n", |
| 3236 | global_page_state(NR_ACTIVE_ANON), | 3269 | global_page_state(NR_ACTIVE_ANON), |
| 3237 | global_page_state(NR_INACTIVE_ANON), | 3270 | global_page_state(NR_INACTIVE_ANON), |
| 3238 | global_page_state(NR_ISOLATED_ANON), | 3271 | global_page_state(NR_ISOLATED_ANON), |
| @@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter) | |||
| 3243 | global_page_state(NR_FILE_DIRTY), | 3276 | global_page_state(NR_FILE_DIRTY), |
| 3244 | global_page_state(NR_WRITEBACK), | 3277 | global_page_state(NR_WRITEBACK), |
| 3245 | global_page_state(NR_UNSTABLE_NFS), | 3278 | global_page_state(NR_UNSTABLE_NFS), |
| 3246 | global_page_state(NR_FREE_PAGES), | ||
| 3247 | global_page_state(NR_SLAB_RECLAIMABLE), | 3279 | global_page_state(NR_SLAB_RECLAIMABLE), |
| 3248 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 3280 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
| 3249 | global_page_state(NR_FILE_MAPPED), | 3281 | global_page_state(NR_FILE_MAPPED), |
| 3250 | global_page_state(NR_SHMEM), | 3282 | global_page_state(NR_SHMEM), |
| 3251 | global_page_state(NR_PAGETABLE), | 3283 | global_page_state(NR_PAGETABLE), |
| 3252 | global_page_state(NR_BOUNCE), | 3284 | global_page_state(NR_BOUNCE), |
| 3285 | global_page_state(NR_FREE_PAGES), | ||
| 3286 | free_pcp, | ||
| 3253 | global_page_state(NR_FREE_CMA_PAGES)); | 3287 | global_page_state(NR_FREE_CMA_PAGES)); |
| 3254 | 3288 | ||
| 3255 | for_each_populated_zone(zone) { | 3289 | for_each_populated_zone(zone) { |
| @@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter) | |||
| 3257 | 3291 | ||
| 3258 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3292 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
| 3259 | continue; | 3293 | continue; |
| 3294 | |||
| 3295 | free_pcp = 0; | ||
| 3296 | for_each_online_cpu(cpu) | ||
| 3297 | free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; | ||
| 3298 | |||
| 3260 | show_node(zone); | 3299 | show_node(zone); |
| 3261 | printk("%s" | 3300 | printk("%s" |
| 3262 | " free:%lukB" | 3301 | " free:%lukB" |
| @@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter) | |||
| 3283 | " pagetables:%lukB" | 3322 | " pagetables:%lukB" |
| 3284 | " unstable:%lukB" | 3323 | " unstable:%lukB" |
| 3285 | " bounce:%lukB" | 3324 | " bounce:%lukB" |
| 3325 | " free_pcp:%lukB" | ||
| 3326 | " local_pcp:%ukB" | ||
| 3286 | " free_cma:%lukB" | 3327 | " free_cma:%lukB" |
| 3287 | " writeback_tmp:%lukB" | 3328 | " writeback_tmp:%lukB" |
| 3288 | " pages_scanned:%lu" | 3329 | " pages_scanned:%lu" |
| @@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter) | |||
| 3314 | K(zone_page_state(zone, NR_PAGETABLE)), | 3355 | K(zone_page_state(zone, NR_PAGETABLE)), |
| 3315 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 3356 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
| 3316 | K(zone_page_state(zone, NR_BOUNCE)), | 3357 | K(zone_page_state(zone, NR_BOUNCE)), |
| 3358 | K(free_pcp), | ||
| 3359 | K(this_cpu_read(zone->pageset->pcp.count)), | ||
| 3317 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | 3360 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), |
| 3318 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 3361 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
| 3319 | K(zone_page_state(zone, NR_PAGES_SCANNED)), | 3362 | K(zone_page_state(zone, NR_PAGES_SCANNED)), |
| @@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void) | |||
| 5717 | * value here. | 5760 | * value here. |
| 5718 | * | 5761 | * |
| 5719 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) | 5762 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
| 5720 | * deltas controls asynch page reclaim, and so should | 5763 | * deltas control asynch page reclaim, and so should |
| 5721 | * not be capped for highmem. | 5764 | * not be capped for highmem. |
| 5722 | */ | 5765 | */ |
| 5723 | unsigned long min_pages; | 5766 | unsigned long min_pages; |
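Taken together, the page_alloc.c hunks split the old try_to_steal_freepages() into a policy check (can_steal_fallback()), the stealing itself (steal_suitable_fallback()) and a search helper (find_suitable_fallback()), and drop MIGRATE_CMA from the generic fallback table: movable allocations now try CMA explicitly before falling back. The resulting order in __rmqueue(), restated from the hunks above:

    /* Outline of the new __rmqueue() fallback sequence. */
    page = __rmqueue_smallest(zone, order, migratetype);
    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
            if (migratetype == MIGRATE_MOVABLE)
                    page = __rmqueue_cma_fallback(zone, order);
            if (!page)
                    page = __rmqueue_fallback(zone, order, migratetype);
    }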
| @@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, | |||
| 857 | return NULL; | 857 | return NULL; |
| 858 | } | 858 | } |
| 859 | 859 | ||
| 860 | static inline gfp_t gfp_exact_node(gfp_t flags) | ||
| 861 | { | ||
| 862 | return flags; | ||
| 863 | } | ||
| 864 | |||
| 860 | #else /* CONFIG_NUMA */ | 865 | #else /* CONFIG_NUMA */ |
| 861 | 866 | ||
| 862 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 867 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
| @@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
| 1023 | 1028 | ||
| 1024 | return __cache_free_alien(cachep, objp, node, page_node); | 1029 | return __cache_free_alien(cachep, objp, node, page_node); |
| 1025 | } | 1030 | } |
| 1031 | |||
| 1032 | /* | ||
| 1033 | * Construct gfp mask to allocate from a specific node but do not invoke reclaim | ||
| 1034 | * or warn about failures. | ||
| 1035 | */ | ||
| 1036 | static inline gfp_t gfp_exact_node(gfp_t flags) | ||
| 1037 | { | ||
| 1038 | return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; | ||
| 1039 | } | ||
| 1026 | #endif | 1040 | #endif |
| 1027 | 1041 | ||
| 1028 | /* | 1042 | /* |
| @@ -2825,7 +2839,7 @@ alloc_done: | |||
| 2825 | if (unlikely(!ac->avail)) { | 2839 | if (unlikely(!ac->avail)) { |
| 2826 | int x; | 2840 | int x; |
| 2827 | force_grow: | 2841 | force_grow: |
| 2828 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 2842 | x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); |
| 2829 | 2843 | ||
| 2830 | /* cache_grow can reenable interrupts, then ac could change. */ | 2844 | /* cache_grow can reenable interrupts, then ac could change. */ |
| 2831 | ac = cpu_cache_get(cachep); | 2845 | ac = cpu_cache_get(cachep); |
| @@ -3019,7 +3033,7 @@ retry: | |||
| 3019 | get_node(cache, nid) && | 3033 | get_node(cache, nid) && |
| 3020 | get_node(cache, nid)->free_objects) { | 3034 | get_node(cache, nid)->free_objects) { |
| 3021 | obj = ____cache_alloc_node(cache, | 3035 | obj = ____cache_alloc_node(cache, |
| 3022 | flags | GFP_THISNODE, nid); | 3036 | gfp_exact_node(flags), nid); |
| 3023 | if (obj) | 3037 | if (obj) |
| 3024 | break; | 3038 | break; |
| 3025 | } | 3039 | } |
| @@ -3047,7 +3061,7 @@ retry: | |||
| 3047 | nid = page_to_nid(page); | 3061 | nid = page_to_nid(page); |
| 3048 | if (cache_grow(cache, flags, nid, page)) { | 3062 | if (cache_grow(cache, flags, nid, page)) { |
| 3049 | obj = ____cache_alloc_node(cache, | 3063 | obj = ____cache_alloc_node(cache, |
| 3050 | flags | GFP_THISNODE, nid); | 3064 | gfp_exact_node(flags), nid); |
| 3051 | if (!obj) | 3065 | if (!obj) |
| 3052 | /* | 3066 | /* |
| 3053 | * Another processor may allocate the | 3067 | * Another processor may allocate the |
| @@ -3118,7 +3132,7 @@ retry: | |||
| 3118 | 3132 | ||
| 3119 | must_grow: | 3133 | must_grow: |
| 3120 | spin_unlock(&n->list_lock); | 3134 | spin_unlock(&n->list_lock); |
| 3121 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); | 3135 | x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); |
| 3122 | if (x) | 3136 | if (x) |
| 3123 | goto retry; | 3137 | goto retry; |
| 3124 | 3138 | ||
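gfp_exact_node() gives mm/slab.c one place that turns a caller's flags into a strictly node-local, non-warning, non-sleeping mask for cache_grow() and ____cache_alloc_node(); the !CONFIG_NUMA stub returns the flags untouched. As flag arithmetic, assuming GFP_KERNEL's definition in this era (__GFP_WAIT | __GFP_IO | __GFP_FS):

    /* On a NUMA build:
     *   gfp_exact_node(GFP_KERNEL)
     *     == (GFP_KERNEL | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT
     *     ==  __GFP_IO | __GFP_FS | __GFP_THISNODE | __GFP_NOWARN
     * i.e. node-exact, silent on failure, and never sleeping.
     */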
| @@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | |||
| 532 | return 0; | 532 | return 0; |
| 533 | } | 533 | } |
| 534 | 534 | ||
| 535 | void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | 535 | static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) |
| 536 | { | 536 | { |
| 537 | void *b; | 537 | void *b; |
| 538 | 538 | ||
| @@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
| 558 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); | 558 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); |
| 559 | return b; | 559 | return b; |
| 560 | } | 560 | } |
| 561 | EXPORT_SYMBOL(slob_alloc_node); | ||
| 562 | 561 | ||
| 563 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 562 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
| 564 | { | 563 | { |
| @@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 374 | if (cmpxchg_double(&page->freelist, &page->counters, | 374 | if (cmpxchg_double(&page->freelist, &page->counters, |
| 375 | freelist_old, counters_old, | 375 | freelist_old, counters_old, |
| 376 | freelist_new, counters_new)) | 376 | freelist_new, counters_new)) |
| 377 | return 1; | 377 | return true; |
| 378 | } else | 378 | } else |
| 379 | #endif | 379 | #endif |
| 380 | { | 380 | { |
| @@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 384 | page->freelist = freelist_new; | 384 | page->freelist = freelist_new; |
| 385 | set_page_slub_counters(page, counters_new); | 385 | set_page_slub_counters(page, counters_new); |
| 386 | slab_unlock(page); | 386 | slab_unlock(page); |
| 387 | return 1; | 387 | return true; |
| 388 | } | 388 | } |
| 389 | slab_unlock(page); | 389 | slab_unlock(page); |
| 390 | } | 390 | } |
| @@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 396 | pr_info("%s %s: cmpxchg double redo ", n, s->name); | 396 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
| 397 | #endif | 397 | #endif |
| 398 | 398 | ||
| 399 | return 0; | 399 | return false; |
| 400 | } | 400 | } |
| 401 | 401 | ||
| 402 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | 402 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, |
| @@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 410 | if (cmpxchg_double(&page->freelist, &page->counters, | 410 | if (cmpxchg_double(&page->freelist, &page->counters, |
| 411 | freelist_old, counters_old, | 411 | freelist_old, counters_old, |
| 412 | freelist_new, counters_new)) | 412 | freelist_new, counters_new)) |
| 413 | return 1; | 413 | return true; |
| 414 | } else | 414 | } else |
| 415 | #endif | 415 | #endif |
| 416 | { | 416 | { |
| @@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 424 | set_page_slub_counters(page, counters_new); | 424 | set_page_slub_counters(page, counters_new); |
| 425 | slab_unlock(page); | 425 | slab_unlock(page); |
| 426 | local_irq_restore(flags); | 426 | local_irq_restore(flags); |
| 427 | return 1; | 427 | return true; |
| 428 | } | 428 | } |
| 429 | slab_unlock(page); | 429 | slab_unlock(page); |
| 430 | local_irq_restore(flags); | 430 | local_irq_restore(flags); |
| @@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 437 | pr_info("%s %s: cmpxchg double redo ", n, s->name); | 437 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
| 438 | #endif | 438 | #endif |
| 439 | 439 | ||
| 440 | return 0; | 440 | return false; |
| 441 | } | 441 | } |
| 442 | 442 | ||
| 443 | #ifdef CONFIG_SLUB_DEBUG | 443 | #ifdef CONFIG_SLUB_DEBUG |
| @@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str) | |||
| 1137 | */ | 1137 | */ |
| 1138 | goto check_slabs; | 1138 | goto check_slabs; |
| 1139 | 1139 | ||
| 1140 | if (tolower(*str) == 'o') { | ||
| 1141 | /* | ||
| 1142 | * Avoid enabling debugging on caches if its minimum order | ||
| 1143 | * would increase as a result. | ||
| 1144 | */ | ||
| 1145 | disable_higher_order_debug = 1; | ||
| 1146 | goto out; | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | slub_debug = 0; | 1140 | slub_debug = 0; |
| 1150 | if (*str == '-') | 1141 | if (*str == '-') |
| 1151 | /* | 1142 | /* |
| @@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str) | |||
| 1176 | case 'a': | 1167 | case 'a': |
| 1177 | slub_debug |= SLAB_FAILSLAB; | 1168 | slub_debug |= SLAB_FAILSLAB; |
| 1178 | break; | 1169 | break; |
| 1170 | case 'o': | ||
| 1171 | /* | ||
| 1172 | * Avoid enabling debugging on caches if its minimum | ||
| 1173 | * order would increase as a result. | ||
| 1174 | */ | ||
| 1175 | disable_higher_order_debug = 1; | ||
| 1176 | break; | ||
| 1179 | default: | 1177 | default: |
| 1180 | pr_err("slub_debug option '%c' unknown. skipped\n", | 1178 | pr_err("slub_debug option '%c' unknown. skipped\n", |
| 1181 | *str); | 1179 | *str); |
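Previously the 'o'/'O' modifier was only honoured when it was the leading character of the slub_debug= string, and it aborted the rest of the parse (the removed goto out); as a switch case it can now be combined with the other flag characters. An illustrative boot parameter, assuming the usual flag letters from Documentation/vm/slub.txt (F sanity checks, Z red zoning, P poisoning):

    slub_debug=FZPO
        Enable sanity checks, red zoning and poisoning, but skip any cache
        whose minimum slab order would have to grow as a result.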
diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..7a9d8a3cb143 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, | |||
| 93 | } | 93 | } |
| 94 | 94 | ||
| 95 | /* | 95 | /* |
| 96 | * This cancels just the dirty bit on the kernel page itself, it | ||
| 97 | * does NOT actually remove dirty bits on any mmap's that may be | ||
| 98 | * around. It also leaves the page tagged dirty, so any sync | ||
| 99 | * activity will still find it on the dirty lists, and in particular, | ||
| 100 | * clear_page_dirty_for_io() will still look at the dirty bits in | ||
| 101 | * the VM. | ||
| 102 | * | ||
| 103 | * Doing this should *normally* only ever be done when a page | ||
| 104 | * is truncated, and is not actually mapped anywhere at all. However, | ||
| 105 | * fs/buffer.c does this when it notices that somebody has cleaned | ||
| 106 | * out all the buffers on a page without actually doing it through | ||
| 107 | * the VM. Can you say "ext3 is horribly ugly"? Tought you could. | ||
| 108 | */ | ||
| 109 | void cancel_dirty_page(struct page *page, unsigned int account_size) | ||
| 110 | { | ||
| 111 | if (TestClearPageDirty(page)) { | ||
| 112 | struct address_space *mapping = page->mapping; | ||
| 113 | if (mapping && mapping_cap_account_dirty(mapping)) { | ||
| 114 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
| 115 | dec_bdi_stat(inode_to_bdi(mapping->host), | ||
| 116 | BDI_RECLAIMABLE); | ||
| 117 | if (account_size) | ||
| 118 | task_io_account_cancelled_write(account_size); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | } | ||
| 122 | EXPORT_SYMBOL(cancel_dirty_page); | ||
| 123 | |||
| 124 | /* | ||
| 125 | * If truncate cannot remove the fs-private metadata from the page, the page | 96 | * If truncate cannot remove the fs-private metadata from the page, the page |
| 126 | * becomes orphaned. It will be left on the LRU and may even be mapped into | 97 | * becomes orphaned. It will be left on the LRU and may even be mapped into |
| 127 | * user pagetables if we're racing with filemap_fault(). | 98 | * user pagetables if we're racing with filemap_fault(). |
| @@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 140 | if (page_has_private(page)) | 111 | if (page_has_private(page)) |
| 141 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); | 112 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 142 | 113 | ||
| 143 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 114 | /* |
| 115 | * Some filesystems seem to re-dirty the page even after | ||
| 116 | * the VM has canceled the dirty bit (eg ext3 journaling). | ||
| 117 | * Hence the dirty accounting check is placed after the invalidation. | ||
| 118 | */ | ||
| 119 | if (TestClearPageDirty(page)) | ||
| 120 | account_page_cleaned(page, mapping); | ||
| 144 | 121 | ||
| 145 | ClearPageMappedToDisk(page); | 122 | ClearPageMappedToDisk(page); |
| 146 | delete_from_page_cache(page); | 123 | delete_from_page_cache(page); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 49abccf29a29..a5bbdd3b5d67 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <linux/atomic.h> | 29 | #include <linux/atomic.h> |
| 30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> |
| 31 | #include <linux/llist.h> | 31 | #include <linux/llist.h> |
| 32 | #include <linux/bitops.h> | ||
| 32 | 33 | ||
| 33 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
| 34 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
| @@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) | |||
| 74 | pmd = pmd_offset(pud, addr); | 75 | pmd = pmd_offset(pud, addr); |
| 75 | do { | 76 | do { |
| 76 | next = pmd_addr_end(addr, end); | 77 | next = pmd_addr_end(addr, end); |
| 78 | if (pmd_clear_huge(pmd)) | ||
| 79 | continue; | ||
| 77 | if (pmd_none_or_clear_bad(pmd)) | 80 | if (pmd_none_or_clear_bad(pmd)) |
| 78 | continue; | 81 | continue; |
| 79 | vunmap_pte_range(pmd, addr, next); | 82 | vunmap_pte_range(pmd, addr, next); |
| @@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) | |||
| 88 | pud = pud_offset(pgd, addr); | 91 | pud = pud_offset(pgd, addr); |
| 89 | do { | 92 | do { |
| 90 | next = pud_addr_end(addr, end); | 93 | next = pud_addr_end(addr, end); |
| 94 | if (pud_clear_huge(pud)) | ||
| 95 | continue; | ||
| 91 | if (pud_none_or_clear_bad(pud)) | 96 | if (pud_none_or_clear_bad(pud)) |
| 92 | continue; | 97 | continue; |
| 93 | vunmap_pmd_range(pud, addr, next); | 98 | vunmap_pmd_range(pud, addr, next); |
| @@ -1314,7 +1319,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1314 | 1319 | ||
| 1315 | BUG_ON(in_interrupt()); | 1320 | BUG_ON(in_interrupt()); |
| 1316 | if (flags & VM_IOREMAP) | 1321 | if (flags & VM_IOREMAP) |
| 1317 | align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); | 1322 | align = 1ul << clamp_t(int, fls_long(size), |
| 1323 | PAGE_SHIFT, IOREMAP_MAX_ORDER); | ||
| 1318 | 1324 | ||
| 1319 | size = PAGE_ALIGN(size); | 1325 | size = PAGE_ALIGN(size); |
| 1320 | if (unlikely(!size)) | 1326 | if (unlikely(!size)) |
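The vmalloc.c hunks let vunmap tear down huge PMD/PUD mappings created for I/O ranges (pmd_clear_huge()/pud_clear_huge()), and make the VM_IOREMAP alignment computation safe for sizes that do not fit in an int by using fls_long() with clamp_t(). A worked example of the alignment, assuming 4 KiB pages:

    /* size = 24 KiB = 0x6000, fls_long(0x6000) == 15, so
     * align = 1ul << clamp_t(int, 15, PAGE_SHIFT, IOREMAP_MAX_ORDER) == 32 KiB.
     * fls() would give the same answer here, but it takes an int and so
     * mishandled sizes of 4 GiB and above on 64-bit kernels.
     */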
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 50ec42f170a0..2dacc7b5af23 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c | |||
| @@ -100,7 +100,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, | |||
| 100 | 100 | ||
| 101 | new_stats = | 101 | new_stats = |
| 102 | kmem_cache_alloc_node(flow_stats_cache, | 102 | kmem_cache_alloc_node(flow_stats_cache, |
| 103 | GFP_THISNODE | | 103 | GFP_NOWAIT | |
| 104 | __GFP_THISNODE | | ||
| 105 | __GFP_NOWARN | | ||
| 104 | __GFP_NOMEMALLOC, | 106 | __GFP_NOMEMALLOC, |
| 105 | node); | 107 | node); |
| 106 | if (likely(new_stats)) { | 108 | if (likely(new_stats)) { |
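The flow-stats allocation stops using the GFP_THISNODE composite, which on NUMA builds historically expanded to __GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY, and instead spells out the flags it wants; this matches the broader move in this series away from GFP_THISNODE outside the slab allocator. As a before/after comparison:

    /* Old mask:  GFP_THISNODE | __GFP_NOMEMALLOC
     * New mask:  GFP_NOWAIT | __GFP_THISNODE | __GFP_NOWARN | __GFP_NOMEMALLOC
     * Neither mask allows sleeping; the new one simply says so explicitly
     * instead of relying on the composite's historical contents.
     */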
diff --git a/scripts/coccinelle/misc/bugon.cocci b/scripts/coccinelle/misc/bugon.cocci index 3b7eec24fb5a..27c97f1f2767 100644 --- a/scripts/coccinelle/misc/bugon.cocci +++ b/scripts/coccinelle/misc/bugon.cocci | |||
| @@ -57,6 +57,6 @@ coccilib.org.print_todo(p[0], "WARNING use BUG_ON") | |||
| 57 | p << r.p; | 57 | p << r.p; |
| 58 | @@ | 58 | @@ |
| 59 | 59 | ||
| 60 | msg="WARNING: Use BUG_ON" | 60 | msg="WARNING: Use BUG_ON instead of if condition followed by BUG.\nPlease make sure the condition has no side effects (see conditional BUG_ON definition in include/asm-generic/bug.h)" |
| 61 | coccilib.report.print_report(p[0], msg) | 61 | coccilib.report.print_report(p[0], msg) |
| 62 | 62 | ||
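The longer report message spells out the caveat from the conditional BUG_ON() definition in include/asm-generic/bug.h: the rewrite is only valid when the condition has no side effects. The shape of the transformation the rule flags is:

    /* Reported pattern */
    if (err < 0)
            BUG();

    /* Suggested replacement -- safe only because the condition is side-effect free */
    BUG_ON(err < 0);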
