author		Linus Torvalds <torvalds@linux-foundation.org>	2017-08-18 19:06:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-08-18 19:06:33 -0400
commit		58d4e450a490d5f02183f6834c12550ba26d3b47 (patch)
tree		4c1ada2d3cb98f5fb8c546a0c95fc28a3f733b83
parent		cc28fcdc017e553375c999ca12107ceb27f34ab3 (diff)
parent		c715b72c1ba406f133217b509044c38d8e714a37 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "14 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes
  mm/vmalloc.c: don't unconditonally use __GFP_HIGHMEM
  mm/mempolicy: fix use after free when calling get_mempolicy
  mm/cma_debug.c: fix stack corruption due to sprintf usage
  signal: don't remove SIGNAL_UNKILLABLE for traced tasks.
  mm, oom: fix potential data corruption when oom_reaper races with writer
  mm: fix double mmap_sem unlock on MMF_UNSTABLE enforced SIGBUS
  slub: fix per memcg cache leak on css offline
  mm: discard memblock data later
  test_kmod: fix description for -s -and -c parameters
  kmod: fix wait on recursive loop
  wait: add wait_event_killable_timeout()
  kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog
  mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
-rw-r--r--  arch/arm64/include/asm/elf.h | 4
-rw-r--r--  arch/powerpc/Kconfig | 2
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/include/asm/elf.h | 4
-rw-r--r--  include/linux/memblock.h | 6
-rw-r--r--  include/linux/memcontrol.h | 10
-rw-r--r--  include/linux/oom.h | 22
-rw-r--r--  include/linux/wait.h | 37
-rw-r--r--  kernel/kmod.c | 25
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  mm/cma_debug.c | 2
-rw-r--r--  mm/huge_memory.c | 30
-rw-r--r--  mm/memblock.c | 38
-rw-r--r--  mm/memcontrol.c | 43
-rw-r--r--  mm/memory.c | 36
-rw-r--r--  mm/mempolicy.c | 5
-rw-r--r--  mm/nobootmem.c | 16
-rw-r--r--  mm/page-writeback.c | 15
-rw-r--r--  mm/page_alloc.c | 4
-rw-r--r--  mm/slub.c | 3
-rw-r--r--  mm/vmalloc.c | 13
-rwxr-xr-x  tools/testing/selftests/kmod/kmod.sh | 4
22 files changed, 224 insertions, 103 deletions
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index acae781f7359..3288c2b36731 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -114,10 +114,10 @@
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
-#define ELF_ET_DYN_BASE		0x100000000UL
+#define ELF_ET_DYN_BASE		(2 * TASK_SIZE_64 / 3)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 36f858c37ca7..81b0031f909f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -199,7 +199,7 @@ config PPC
 	select HAVE_OPTPROBES			if PPC64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI		if PPC64
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..29a1bf85e507 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -163,7 +163,7 @@ config X86
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1c18d83d3f09..9aeb91935ce0 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -247,11 +247,11 @@ extern int force_personality32;
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
 #define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
-						  0x100000000UL)
+						  (TASK_SIZE / 3 * 2))
 
 /* This yields a mask that user programs can use to figure out what
    instruction set this CPU supports. This could be done in user space,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..bae11c7e7bf3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
 					    int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
 				phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3914e3dd6168..9b15a4bcfa77 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
@@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void lock_page_memcg(struct page *page)
+static inline struct mem_cgroup *lock_page_memcg(struct page *page)
+{
+	return NULL;
+}
+
+static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
 }
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8a266e2be5a6..76aac4ce39bc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
+#include <linux/sched/coredump.h> /* MMF_* */
+#include <linux/mm.h> /* VM_FAULT* */
 
 struct zonelist;
 struct notifier_block;
@@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
+/*
+ * Checks whether a page fault on the given mm is still reliable.
+ * This is no longer true if the oom reaper started to reap the
+ * address space which is reflected by MMF_UNSTABLE flag set in
+ * the mm. At that moment any !shared mapping would lose the content
+ * and could cause a memory corruption (zero pages instead of the
+ * original content).
+ *
+ * User should call this before establishing a page table entry for
+ * a !shared mapping and under the proper page table lock.
+ *
+ * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
+ */
+static inline int check_stable_address_space(struct mm_struct *mm)
+{
+	if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+		return VM_FAULT_SIGBUS;
+	return 0;
+}
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
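Since check_stable_address_space() is meant to be called from the fault paths touched later in this series, a rough usage sketch may help. The helper below is hypothetical and only illustrates the convention stated in the comment (private mapping, page table lock held); it is not part of the patch.

#include <linux/mm.h>
#include <linux/oom.h>

/* Hypothetical fault-path fragment: only install a PTE for a private
 * (anonymous/CoW) mapping while the mm is still stable, with the page
 * table lock held, as the comment above requires. */
static int example_install_private_pte(struct vm_fault *vmf, pte_t entry)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)	/* VM_FAULT_SIGBUS: the oom reaper already unmapped this mm */
		goto unlock;
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}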
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5b74e36c0ca8..dc19880c02f5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
 	__ret;								\
 })
 
+#define __wait_event_killable_timeout(wq_head, condition, timeout)	\
+	___wait_event(wq_head, ___wait_cond_timeout(condition),		\
+		      TASK_KILLABLE, 0, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_KILLABLE) until the
+ * @condition evaluates to true or a kill signal is received.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
+ * interrupted by a kill signal.
+ *
+ * Only kill signals interrupt this process.
+ */
+#define wait_event_killable_timeout(wq_head, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_event_killable_timeout(wq_head,		\
+						condition, timeout);	\
+	__ret;								\
+})
+
 
 #define __wait_event_lock_irq(wq_head, condition, lock, cmd)		\
 	(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
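A minimal usage sketch of the new macro, with a hypothetical waitqueue and condition (the real in-tree user is the kernel/kmod.c hunk below); the caller tells timeout, success and a fatal signal apart by the documented return value.

#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_ready;

/* Hypothetical caller: sleep up to 5 seconds for example_ready, letting
 * only fatal (kill) signals interrupt the wait. */
static int example_wait_for_ready(void)
{
	long ret = wait_event_killable_timeout(example_wq, example_ready, 5 * HZ);

	if (!ret)			/* timed out, condition still false */
		return -ETIME;
	if (ret == -ERESTARTSYS)	/* SIGKILL arrived while sleeping */
		return ret;
	return 0;			/* condition became true in time */
}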
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6d016c5d97c8..2f37acde640b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
 static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
 
 /*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
+/*
 	modprobe_path is set via /proc/sys.
 */
 char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
 		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
 				    atomic_read(&kmod_concurrent_max),
 				    MAX_KMOD_CONCURRENT, module_name);
-		wait_event_interruptible(kmod_wq,
-					 atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
+		ret = wait_event_killable_timeout(kmod_wq,
+						  atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+						  MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+		if (!ret) {
+			pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+					    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+			return -ETIME;
+		} else if (ret == -ERESTARTSYS) {
+			pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+			return ret;
+		}
 	}
 
 	trace_module_request(module_name, wait, _RET_IP_);
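For request_module() callers the visible change is two new failure modes; a hedged sketch of how a caller might treat them (the module name and helper below are made up for illustration):

#include <linux/kmod.h>
#include <linux/errno.h>
#include <linux/printk.h>

/* Hypothetical caller: after this patch, request_module() may also fail
 * with -ETIME (all MAX_KMOD_CONCURRENT slots stayed busy for the whole
 * timeout, suggesting a recursive dependency loop) or -ERESTARTSYS (a
 * fatal signal arrived while throttled). Neither is worth retrying. */
static int example_request_feature_module(void)
{
	int ret = request_module("example-feature-mod");

	if (ret)
		pr_warn("example: loading example-feature-mod failed: %d\n", ret);
	return ret;
}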
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e33f8c583e6..ed804a470dcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 			recalc_sigpending_and_wake(t);
 		}
 	}
-	if (action->sa.sa_handler == SIG_DFL)
+	/*
+	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+	 * debugging to leave init killable.
+	 */
+	if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
 		t->signal->flags &= ~SIGNAL_UNKILLABLE;
 	ret = specific_send_sig_info(sig, info, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 595b757bef72..c03ccbc405a0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
 	char name[16];
 	int u32s;
 
-	sprintf(name, "cma-%s", cma->name);
+	scnprintf(name, sizeof(name), "cma-%s", cma->name);
 
 	tmp = debugfs_create_dir(name, cma_debugfs_root);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 216114f6ef0b..90731e3b7e58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/oom.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	int ret = 0;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
@@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	pgtable = pte_alloc_one(vma->vm_mm, haddr);
 	if (unlikely(!pgtable)) {
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		return VM_FAULT_OOM;
+		ret = VM_FAULT_OOM;
+		goto release;
 	}
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_none(*vmf->pmd))) {
-		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		pte_free(vma->vm_mm, pgtable);
+		goto unlock_release;
 	} else {
 		pmd_t entry;
 
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock_release;
+
 		/* Deliver the page fault to userland */
 		if (userfaultfd_missing(vma)) {
 			int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	}
 
 	return 0;
+unlock_release:
+	spin_unlock(vmf->ptl);
+release:
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+	mem_cgroup_cancel_charge(page, memcg, true);
+	put_page(page);
+	return ret;
+
 }
 
 /*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		ret = 0;
 		set = false;
 		if (pmd_none(*vmf->pmd)) {
-			if (userfaultfd_missing(vma)) {
+			ret = check_stable_address_space(vma->vm_mm);
+			if (ret) {
+				spin_unlock(vmf->ptl);
+			} else if (userfaultfd_missing(vma)) {
 				spin_unlock(vmf->ptl);
 				ret = handle_userfault(vmf, VM_UFFD_MISSING);
 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..bf14aea6ab70 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-					phys_addr_t *addr)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	*addr = __pa(memblock.reserved.regions);
-
-	return PAGE_ALIGN(sizeof(struct memblock_region) *
-			  memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-					phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
 {
-	if (memblock.memory.regions == memblock_memory_init_regions)
-		return 0;
+	phys_addr_t addr, size;
 
-	*addr = __pa(memblock.memory.regions);
+	if (memblock.reserved.regions != memblock_reserved_init_regions) {
+		addr = __pa(memblock.reserved.regions);
+		size = PAGE_ALIGN(sizeof(struct memblock_region) *
+				  memblock.reserved.max);
+		__memblock_free_late(addr, size);
+	}
 
-	return PAGE_ALIGN(sizeof(struct memblock_region) *
-			  memblock.memory.max);
+	if (memblock.memory.regions == memblock_memory_init_regions) {
+		addr = __pa(memblock.memory.regions);
+		size = PAGE_ALIGN(sizeof(struct memblock_region) *
+				  memblock.memory.max);
+		__memblock_free_late(addr, size);
+	}
 }
-
 #endif
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..e09741af816f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1611,9 +1611,13 @@ cleanup:
  * @page: the page
  *
  * This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
  */
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
 {
 	struct mem_cgroup *memcg;
 	unsigned long flags;
@@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page)
 	 * The RCU lock is held throughout the transaction. The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
-	 */
+	 *
+	 * The RCU lock also protects the memcg from being freed when
+	 * the page state that is going to change is the only thing
+	 * preventing the page itself from being freed. E.g. writeback
+	 * doesn't hold a page reference and relies on PG_writeback to
+	 * keep off truncation, migration and so forth.
+	 */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
-		return;
+		return NULL;
 again:
 	memcg = page->mem_cgroup;
 	if (unlikely(!memcg))
-		return;
+		return NULL;
 
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
+		return memcg;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
 	if (memcg != page->mem_cgroup) {
@@ -1649,18 +1659,18 @@ again:
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
 
-	return;
+	return memcg;
 }
 EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
  */
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = page->mem_cgroup;
-
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
 
@@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page)
 
 	rcu_read_unlock();
 }
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+	__unlock_page_memcg(page->mem_cgroup);
+}
 EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
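A minimal sketch of the new calling convention (hypothetical caller; the mm/page-writeback.c hunk further down is the real user this was added for): keep the memcg returned by lock_page_memcg() and unlock through it when the locked section may drop the last thing keeping the page alive.

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Hypothetical illustration: clear per-page state that may be the only
 * thing pinning the page, then unlock via the memcg pointer because the
 * page itself must not be touched any more. */
static void example_clear_page_state(struct page *page)
{
	struct mem_cgroup *memcg = lock_page_memcg(page);

	/* ... clear the state; a concurrent truncation may free the page
	 * as soon as that happens ... */

	__unlock_page_memcg(memcg);	/* safe even if 'page' is gone */
}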
diff --git a/mm/memory.c b/mm/memory.c
index e158f7ac6730..fe2fba27ded2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
+#include <linux/oom.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct mem_cgroup *memcg;
 	struct page *page;
+	int ret = 0;
 	pte_t entry;
 
 	/* File mapping without ->vm_ops ? */
@@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
 				vmf->address, &vmf->ptl);
 		if (!pte_none(*vmf->pte))
 			goto unlock;
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock;
 		/* Deliver the page fault to userland, check inside PT lock */
 		if (userfaultfd_missing(vma)) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	if (!pte_none(*vmf->pte))
 		goto release;
 
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2978,7 +2987,7 @@ setpte:
 	update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return 0;
+	return ret;
 release:
 	mem_cgroup_cancel_charge(page, memcg, false);
 	put_page(page);
@@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 int finish_fault(struct vm_fault *vmf)
 {
 	struct page *page;
-	int ret;
+	int ret = 0;
 
 	/* Did we COW the page? */
 	if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf)
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
-	ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+	/*
+	 * check even for read faults because we might have lost our CoWed
+	 * page
+	 */
+	if (!(vmf->vma->vm_flags & VM_SHARED))
+		ret = check_stable_address_space(vmf->vma->vm_mm);
+	if (!ret)
+		ret = alloc_set_pte(vmf, vmf->memcg, page);
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
@@ -3900,19 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		mem_cgroup_oom_synchronize(false);
 	}
 
-	/*
-	 * This mm has been already reaped by the oom reaper and so the
-	 * refault cannot be trusted in general. Anonymous refaults would
-	 * lose data and give a zero page instead e.g. This is especially
-	 * problem for use_mm() because regular tasks will just die and
-	 * the corrupted data will not be visible anywhere while kthread
-	 * will outlive the oom victim and potentially propagate the date
-	 * further.
-	 */
-	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
-		ret = VM_FAULT_SIGBUS;
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d911fa5cb2a7..618ab125228b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			*policy |= (pol->flags & MPOL_MODE_FLAGS);
 	}
 
-	if (vma) {
-		up_read(&current->mm->mmap_sem);
-		vma = NULL;
-	}
-
 	err = 0;
 	if (nmask) {
 		if (mpol_store_user_nodemask(pol)) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 36454d0f96ee..3637809a18d0 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void)
 				NULL)
 		count += __free_memory_core(start, end);
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-	{
-		phys_addr_t size;
-
-		/* Free memblock.reserved array if it was allocated */
-		size = get_allocated_memblock_reserved_regions_info(&start);
-		if (size)
-			count += __free_memory_core(start, start + size);
-
-		/* Free memblock.memory array if it was allocated */
-		size = get_allocated_memblock_memory_regions_info(&start);
-		if (size)
-			count += __free_memory_core(start, start + size);
-	}
-#endif
-
 	return count;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96e93b214d31..bf050ab025b7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 	int ret;
 
-	lock_page_memcg(page);
+	memcg = lock_page_memcg(page);
+	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 	if (mapping && mapping_use_writeback_tags(mapping)) {
 		struct inode *inode = mapping->host;
 		struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page)
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
+	/*
+	 * NOTE: Page might be free now! Writeback doesn't hold a page
+	 * reference on its own, it relies on truncation to wait for
+	 * the clearing of PG_writeback. The below can only access
+	 * page state that is static across allocation cycles.
+	 */
 	if (ret) {
-		dec_lruvec_page_state(page, NR_WRITEBACK);
+		dec_lruvec_state(lruvec, NR_WRITEBACK);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
 	}
-	unlock_page_memcg(page);
+	__unlock_page_memcg(memcg);
 	return ret;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d00f746c2fd..1bad301820c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void)
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
 #endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+	/* Discard memblock private memory */
+	memblock_discard();
+#endif
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
diff --git a/mm/slub.c b/mm/slub.c
index 1d3f9835f4ea..e8b4e31162ca 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
 		 * A cache is never shut down before deactivation is
 		 * complete, so no need to worry about synchronization.
 		 */
-		return;
+		goto out;
 
 #ifdef CONFIG_MEMCG
 	kset_unregister(s->memcg_kset);
 #endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
 	kobject_del(&s->kobj);
+out:
 	kobject_put(&s->kobj);
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8698c1c86c4d..a47e3894c775 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
+	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
+					0 :
+					__GFP_HIGHMEM;
 
 	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
@@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
-		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
+		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
 				PAGE_KERNEL, node, area->caller);
 	} else {
 		pages = kmalloc_node(array_size, nested_gfp, node);
@@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	}
 
 	if (node == NUMA_NO_NODE)
-		page = alloc_page(alloc_mask);
+		page = alloc_page(alloc_mask|highmem_mask);
 	else
-		page = alloc_pages_node(node, alloc_mask, 0);
+		page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
 
 	if (unlikely(!page)) {
 		/* Successfully allocated i pages, free them in __vunmap() */
@@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			goto fail;
 		}
 		area->pages[i] = page;
-		if (gfpflags_allow_blocking(gfp_mask))
+		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
 			cond_resched();
 	}
 
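For callers the effect is that zone modifiers passed to __vmalloc() are now honoured instead of being overridden by an implicit __GFP_HIGHMEM. A hedged sketch using the 4.13-era three-argument __vmalloc() (hypothetical helper):

#include <linux/vmalloc.h>
#include <linux/gfp.h>

/* Hypothetical caller: a virtually contiguous buffer backed by ZONE_DMA32
 * pages. Before this fix the implicit __GFP_HIGHMEM conflicted with the
 * GFP_DMA32 zone modifier; now the modifier is respected. */
static void *example_alloc_dma32_buffer(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | GFP_DMA32, PAGE_KERNEL);
}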
diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh
index 8cecae9a8bca..7956ea3be667 100755
--- a/tools/testing/selftests/kmod/kmod.sh
+++ b/tools/testing/selftests/kmod/kmod.sh
@@ -473,8 +473,8 @@ usage()
473 echo " all Runs all tests (default)" 473 echo " all Runs all tests (default)"
474 echo " -t Run test ID the number amount of times is recommended" 474 echo " -t Run test ID the number amount of times is recommended"
475 echo " -w Watch test ID run until it runs into an error" 475 echo " -w Watch test ID run until it runs into an error"
476 echo " -c Run test ID once" 476 echo " -s Run test ID once"
477 echo " -s Run test ID x test-count number of times" 477 echo " -c Run test ID x test-count number of times"
478 echo " -l List all test ID list" 478 echo " -l List all test ID list"
479 echo " -h|--help Help" 479 echo " -h|--help Help"
480 echo 480 echo