author		Linus Torvalds <torvalds@linux-foundation.org>	2017-08-18 19:06:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-08-18 19:06:33 -0400
commit		58d4e450a490d5f02183f6834c12550ba26d3b47 (patch)
tree		4c1ada2d3cb98f5fb8c546a0c95fc28a3f733b83
parent		cc28fcdc017e553375c999ca12107ceb27f34ab3 (diff)
parent		c715b72c1ba406f133217b509044c38d8e714a37 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "14 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes
  mm/vmalloc.c: don't unconditonally use __GFP_HIGHMEM
  mm/mempolicy: fix use after free when calling get_mempolicy
  mm/cma_debug.c: fix stack corruption due to sprintf usage
  signal: don't remove SIGNAL_UNKILLABLE for traced tasks.
  mm, oom: fix potential data corruption when oom_reaper races with writer
  mm: fix double mmap_sem unlock on MMF_UNSTABLE enforced SIGBUS
  slub: fix per memcg cache leak on css offline
  mm: discard memblock data later
  test_kmod: fix description for -s -and -c parameters
  kmod: fix wait on recursive loop
  wait: add wait_event_killable_timeout()
  kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog
  mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
-rw-r--r--  arch/arm64/include/asm/elf.h | 4
-rw-r--r--  arch/powerpc/Kconfig | 2
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/include/asm/elf.h | 4
-rw-r--r--  include/linux/memblock.h | 6
-rw-r--r--  include/linux/memcontrol.h | 10
-rw-r--r--  include/linux/oom.h | 22
-rw-r--r--  include/linux/wait.h | 37
-rw-r--r--  kernel/kmod.c | 25
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  mm/cma_debug.c | 2
-rw-r--r--  mm/huge_memory.c | 30
-rw-r--r--  mm/memblock.c | 38
-rw-r--r--  mm/memcontrol.c | 43
-rw-r--r--  mm/memory.c | 36
-rw-r--r--  mm/mempolicy.c | 5
-rw-r--r--  mm/nobootmem.c | 16
-rw-r--r--  mm/page-writeback.c | 15
-rw-r--r--  mm/page_alloc.c | 4
-rw-r--r--  mm/slub.c | 3
-rw-r--r--  mm/vmalloc.c | 13
-rwxr-xr-x  tools/testing/selftests/kmod/kmod.sh | 4
22 files changed, 224 insertions, 103 deletions
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index acae781f7359..3288c2b36731 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -114,10 +114,10 @@
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
-#define ELF_ET_DYN_BASE		0x100000000UL
+#define ELF_ET_DYN_BASE		(2 * TASK_SIZE_64 / 3)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 36f858c37ca7..81b0031f909f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -199,7 +199,7 @@ config PPC
 	select HAVE_OPTPROBES			if PPC64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI		if PPC64
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..29a1bf85e507 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -163,7 +163,7 @@ config X86
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1c18d83d3f09..9aeb91935ce0 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -247,11 +247,11 @@ extern int force_personality32;
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
 #define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
-						  0x100000000UL)
+						  (TASK_SIZE / 3 * 2))
 
 /* This yields a mask that user programs can use to figure out what
    instruction set this CPU supports. This could be done in user space,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..bae11c7e7bf3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
 					    int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
 				phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3914e3dd6168..9b15a4bcfa77 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
@@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void lock_page_memcg(struct page *page)
+static inline struct mem_cgroup *lock_page_memcg(struct page *page)
+{
+	return NULL;
+}
+
+static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
 }
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8a266e2be5a6..76aac4ce39bc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
+#include <linux/sched/coredump.h> /* MMF_* */
+#include <linux/mm.h> /* VM_FAULT* */
 
 struct zonelist;
 struct notifier_block;
@@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
+/*
+ * Checks whether a page fault on the given mm is still reliable.
+ * This is no longer true if the oom reaper started to reap the
+ * address space which is reflected by MMF_UNSTABLE flag set in
+ * the mm. At that moment any !shared mapping would lose the content
+ * and could cause a memory corruption (zero pages instead of the
+ * original content).
+ *
+ * User should call this before establishing a page table entry for
+ * a !shared mapping and under the proper page table lock.
+ *
+ * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
+ */
+static inline int check_stable_address_space(struct mm_struct *mm)
+{
+	if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+		return VM_FAULT_SIGBUS;
+	return 0;
+}
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
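Since check_stable_address_space() is meant to be called from the fault paths touched later in this series, a rough usage sketch may help. The helper below is hypothetical and only illustrates the convention stated in the comment (private mapping, page table lock held); it is not part of the patch.

#include <linux/mm.h>
#include <linux/oom.h>

/* Hypothetical fault-path fragment: only install a PTE for a private
 * (anonymous/CoW) mapping while the mm is still stable, with the page
 * table lock held, as the comment above requires. */
static int example_install_private_pte(struct vm_fault *vmf, pte_t entry)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)	/* VM_FAULT_SIGBUS: the oom reaper already unmapped this mm */
		goto unlock;
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}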
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5b74e36c0ca8..dc19880c02f5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
 	__ret;								\
 })
 
+#define __wait_event_killable_timeout(wq_head, condition, timeout)	\
+	___wait_event(wq_head, ___wait_cond_timeout(condition),		\
+		      TASK_KILLABLE, 0, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_KILLABLE) until the
+ * @condition evaluates to true or a kill signal is received.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
+ * interrupted by a kill signal.
+ *
+ * Only kill signals interrupt this process.
+ */
+#define wait_event_killable_timeout(wq_head, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_event_killable_timeout(wq_head,		\
+						condition, timeout);	\
+	__ret;								\
+})
+
 
 #define __wait_event_lock_irq(wq_head, condition, lock, cmd)		\
 	(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
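A minimal usage sketch of the new macro, with a hypothetical waitqueue and condition (the real in-tree user is the kernel/kmod.c hunk below); the caller tells timeout, success and a fatal signal apart by the documented return value.

#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_ready;

/* Hypothetical caller: sleep up to 5 seconds for example_ready, letting
 * only fatal (kill) signals interrupt the wait. */
static int example_wait_for_ready(void)
{
	long ret = wait_event_killable_timeout(example_wq, example_ready, 5 * HZ);

	if (!ret)			/* timed out, condition still false */
		return -ETIME;
	if (ret == -ERESTARTSYS)	/* SIGKILL arrived while sleeping */
		return ret;
	return 0;			/* condition became true in time */
}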
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6d016c5d97c8..2f37acde640b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
 static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
 
 /*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
+/*
 	modprobe_path is set via /proc/sys.
 */
 char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
 		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
 				    atomic_read(&kmod_concurrent_max),
 				    MAX_KMOD_CONCURRENT, module_name);
-		wait_event_interruptible(kmod_wq,
-					 atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
+		ret = wait_event_killable_timeout(kmod_wq,
+						  atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+						  MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+		if (!ret) {
+			pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+					    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+			return -ETIME;
+		} else if (ret == -ERESTARTSYS) {
+			pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+			return ret;
+		}
 	}
 
 	trace_module_request(module_name, wait, _RET_IP_);
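For request_module() callers the visible change is two new failure modes; a hedged sketch of how a caller might treat them (the module name and helper below are made up for illustration):

#include <linux/kmod.h>
#include <linux/errno.h>
#include <linux/printk.h>

/* Hypothetical caller: after this patch, request_module() may also fail
 * with -ETIME (all MAX_KMOD_CONCURRENT slots stayed busy for the whole
 * timeout, suggesting a recursive dependency loop) or -ERESTARTSYS (a
 * fatal signal arrived while throttled). Neither is worth retrying. */
static int example_request_feature_module(void)
{
	int ret = request_module("example-feature-mod");

	if (ret)
		pr_warn("example: loading example-feature-mod failed: %d\n", ret);
	return ret;
}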
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e33f8c583e6..ed804a470dcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 			recalc_sigpending_and_wake(t);
 		}
 	}
-	if (action->sa.sa_handler == SIG_DFL)
+	/*
+	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+	 * debugging to leave init killable.
+	 */
+	if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
 		t->signal->flags &= ~SIGNAL_UNKILLABLE;
 	ret = specific_send_sig_info(sig, info, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 595b757bef72..c03ccbc405a0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
 	char name[16];
 	int u32s;
 
-	sprintf(name, "cma-%s", cma->name);
+	scnprintf(name, sizeof(name), "cma-%s", cma->name);
 
 	tmp = debugfs_create_dir(name, cma_debugfs_root);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 216114f6ef0b..90731e3b7e58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/oom.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	int ret = 0;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
@@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	pgtable = pte_alloc_one(vma->vm_mm, haddr);
 	if (unlikely(!pgtable)) {
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		return VM_FAULT_OOM;
+		ret = VM_FAULT_OOM;
+		goto release;
 	}
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_none(*vmf->pmd))) {
-		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		pte_free(vma->vm_mm, pgtable);
+		goto unlock_release;
 	} else {
 		pmd_t entry;
 
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock_release;
+
 		/* Deliver the page fault to userland */
 		if (userfaultfd_missing(vma)) {
 			int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	}
 
 	return 0;
+unlock_release:
+	spin_unlock(vmf->ptl);
+release:
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+	mem_cgroup_cancel_charge(page, memcg, true);
+	put_page(page);
+	return ret;
+
 }
 
 /*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		ret = 0;
 		set = false;
 		if (pmd_none(*vmf->pmd)) {
-			if (userfaultfd_missing(vma)) {
+			ret = check_stable_address_space(vma->vm_mm);
+			if (ret) {
+				spin_unlock(vmf->ptl);
+			} else if (userfaultfd_missing(vma)) {
 				spin_unlock(vmf->ptl);
 				ret = handle_userfault(vmf, VM_UFFD_MISSING);
 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..bf14aea6ab70 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-					phys_addr_t *addr)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	*addr = __pa(memblock.reserved.regions);
-
-	return PAGE_ALIGN(sizeof(struct memblock_region) *
-			  memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-					phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
 {
-	if (memblock.memory.regions == memblock_memory_init_regions)
-		return 0;
+	phys_addr_t addr, size;
 
-	*addr = __pa(memblock.memory.regions);
+	if (memblock.reserved.regions != memblock_reserved_init_regions) {
+		addr = __pa(memblock.reserved.regions);
+		size = PAGE_ALIGN(sizeof(struct memblock_region) *
+				  memblock.reserved.max);
+		__memblock_free_late(addr, size);
+	}
 
-	return PAGE_ALIGN(sizeof(struct memblock_region) *
-			  memblock.memory.max);
+	if (memblock.memory.regions == memblock_memory_init_regions) {
+		addr = __pa(memblock.memory.regions);
+		size = PAGE_ALIGN(sizeof(struct memblock_region) *
+				  memblock.memory.max);
+		__memblock_free_late(addr, size);
+	}
 }
-
 #endif
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..e09741af816f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1611,9 +1611,13 @@ cleanup:
  * @page: the page
  *
  * This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
  */
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
 {
 	struct mem_cgroup *memcg;
 	unsigned long flags;
@@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page)
 	 * The RCU lock is held throughout the transaction. The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
-	 */
+	 *
+	 * The RCU lock also protects the memcg from being freed when
+	 * the page state that is going to change is the only thing
+	 * preventing the page itself from being freed. E.g. writeback
+	 * doesn't hold a page reference and relies on PG_writeback to
+	 * keep off truncation, migration and so forth.
+	 */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
-		return;
+		return NULL;
 again:
 	memcg = page->mem_cgroup;
 	if (unlikely(!memcg))
-		return;
+		return NULL;
 
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
+		return memcg;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
 	if (memcg != page->mem_cgroup) {
@@ -1649,18 +1659,18 @@ again:
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
 
-	return;
+	return memcg;
 }
 EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
  */
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = page->mem_cgroup;
-
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
 
@@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page)
 
 	rcu_read_unlock();
 }
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+	__unlock_page_memcg(page->mem_cgroup);
+}
 EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
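A minimal sketch of the new calling convention (hypothetical caller; the mm/page-writeback.c hunk further down is the real user this was added for): keep the memcg returned by lock_page_memcg() and unlock through it when the locked section may drop the last thing keeping the page alive.

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Hypothetical illustration: clear per-page state that may be the only
 * thing pinning the page, then unlock via the memcg pointer because the
 * page itself must not be touched any more. */
static void example_clear_page_state(struct page *page)
{
	struct mem_cgroup *memcg = lock_page_memcg(page);

	/* ... clear the state; a concurrent truncation may free the page
	 * as soon as that happens ... */

	__unlock_page_memcg(memcg);	/* safe even if 'page' is gone */
}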
diff --git a/mm/memory.c b/mm/memory.c
index e158f7ac6730..fe2fba27ded2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
+#include <linux/oom.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct mem_cgroup *memcg;
 	struct page *page;
+	int ret = 0;
 	pte_t entry;
 
 	/* File mapping without ->vm_ops ? */
@@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
 				vmf->address, &vmf->ptl);
 		if (!pte_none(*vmf->pte))
 			goto unlock;
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock;
 		/* Deliver the page fault to userland, check inside PT lock */
 		if (userfaultfd_missing(vma)) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	if (!pte_none(*vmf->pte))
 		goto release;
 
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2978,7 +2987,7 @@ setpte:
 	update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return 0;
+	return ret;
 release:
 	mem_cgroup_cancel_charge(page, memcg, false);
 	put_page(page);
@@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 int finish_fault(struct vm_fault *vmf)
 {
 	struct page *page;
-	int ret;
+	int ret = 0;
 
 	/* Did we COW the page? */
 	if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf)
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
-	ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+	/*
+	 * check even for read faults because we might have lost our CoWed
+	 * page
+	 */
+	if (!(vmf->vma->vm_flags & VM_SHARED))
+		ret = check_stable_address_space(vmf->vma->vm_mm);
+	if (!ret)
+		ret = alloc_set_pte(vmf, vmf->memcg, page);
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
@@ -3900,19 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		mem_cgroup_oom_synchronize(false);
 	}
 
-	/*
-	 * This mm has been already reaped by the oom reaper and so the
-	 * refault cannot be trusted in general. Anonymous refaults would
-	 * lose data and give a zero page instead e.g. This is especially
-	 * problem for use_mm() because regular tasks will just die and
-	 * the corrupted data will not be visible anywhere while kthread
-	 * will outlive the oom victim and potentially propagate the date
-	 * further.
-	 */
-	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
-		ret = VM_FAULT_SIGBUS;
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d911fa5cb2a7..618ab125228b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			*policy |= (pol->flags & MPOL_MODE_FLAGS);
 	}
 
-	if (vma) {
-		up_read(&current->mm->mmap_sem);
-		vma = NULL;
-	}
-
 	err = 0;
 	if (nmask) {
 		if (mpol_store_user_nodemask(pol)) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 36454d0f96ee..3637809a18d0 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void)
 				NULL)
 		count += __free_memory_core(start, end);
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-	{
-		phys_addr_t size;
-
-		/* Free memblock.reserved array if it was allocated */
-		size = get_allocated_memblock_reserved_regions_info(&start);
-		if (size)
-			count += __free_memory_core(start, start + size);
-
-		/* Free memblock.memory array if it was allocated */
-		size = get_allocated_memblock_memory_regions_info(&start);
-		if (size)
-			count += __free_memory_core(start, start + size);
-	}
-#endif
-
 	return count;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96e93b214d31..bf050ab025b7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 	int ret;
 
-	lock_page_memcg(page);
+	memcg = lock_page_memcg(page);
+	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 	if (mapping && mapping_use_writeback_tags(mapping)) {
 		struct inode *inode = mapping->host;
 		struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page)
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
+	/*
+	 * NOTE: Page might be free now! Writeback doesn't hold a page
+	 * reference on its own, it relies on truncation to wait for
+	 * the clearing of PG_writeback. The below can only access
+	 * page state that is static across allocation cycles.
+	 */
 	if (ret) {
-		dec_lruvec_page_state(page, NR_WRITEBACK);
+		dec_lruvec_state(lruvec, NR_WRITEBACK);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
 	}
-	unlock_page_memcg(page);
+	__unlock_page_memcg(memcg);
 	return ret;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d00f746c2fd..1bad301820c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void)
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
 #endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+	/* Discard memblock private memory */
+	memblock_discard();
+#endif
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
diff --git a/mm/slub.c b/mm/slub.c
index 1d3f9835f4ea..e8b4e31162ca 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
 		 * A cache is never shut down before deactivation is
 		 * complete, so no need to worry about synchronization.
 		 */
-		return;
+		goto out;
 
 #ifdef CONFIG_MEMCG
 	kset_unregister(s->memcg_kset);
 #endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
 	kobject_del(&s->kobj);
+out:
 	kobject_put(&s->kobj);
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8698c1c86c4d..a47e3894c775 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
+	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
+					0 :
+					__GFP_HIGHMEM;
 
 	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
@@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
-		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
+		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
 				PAGE_KERNEL, node, area->caller);
 	} else {
 		pages = kmalloc_node(array_size, nested_gfp, node);
@@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	}
 
 	if (node == NUMA_NO_NODE)
-		page = alloc_page(alloc_mask);
+		page = alloc_page(alloc_mask|highmem_mask);
 	else
-		page = alloc_pages_node(node, alloc_mask, 0);
+		page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
 
 	if (unlikely(!page)) {
 		/* Successfully allocated i pages, free them in __vunmap() */
@@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			goto fail;
 		}
 		area->pages[i] = page;
-		if (gfpflags_allow_blocking(gfp_mask))
+		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
 			cond_resched();
 	}
 
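For callers the effect is that zone modifiers passed to __vmalloc() are now honoured instead of being overridden by an implicit __GFP_HIGHMEM. A hedged sketch using the 4.13-era three-argument __vmalloc() (hypothetical helper):

#include <linux/vmalloc.h>
#include <linux/gfp.h>

/* Hypothetical caller: a virtually contiguous buffer backed by ZONE_DMA32
 * pages. Before this fix the implicit __GFP_HIGHMEM conflicted with the
 * GFP_DMA32 zone modifier; now the modifier is respected. */
static void *example_alloc_dma32_buffer(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | GFP_DMA32, PAGE_KERNEL);
}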
diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh
index 8cecae9a8bca..7956ea3be667 100755
--- a/tools/testing/selftests/kmod/kmod.sh
+++ b/tools/testing/selftests/kmod/kmod.sh
@@ -473,8 +473,8 @@ usage()
473 echo " all Runs all tests (default)" 473 echo " all Runs all tests (default)"
474 echo " -t Run test ID the number amount of times is recommended" 474 echo " -t Run test ID the number amount of times is recommended"
475 echo " -w Watch test ID run until it runs into an error" 475 echo " -w Watch test ID run until it runs into an error"
476 echo " -c Run test ID once" 476 echo " -s Run test ID once"
477 echo " -s Run test ID x test-count number of times" 477 echo " -c Run test ID x test-count number of times"
478 echo " -l List all test ID list" 478 echo " -l List all test ID list"
479 echo " -h|--help Help" 479 echo " -h|--help Help"
480 echo 480 echo