author     Linus Torvalds <torvalds@linux-foundation.org>  2019-05-06 19:13:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-05-06 19:13:31 -0400
commit     0bc40e549aeea2de20fc571749de9bbfc099fb34 (patch)
tree       d18f3339bd383a17431fca23b6c5f3e54c93cf2f
parent     e913c4a4c21cd83317fafe63bfdc9d34d2910114 (diff)
parent     caa841360134f863987f2d4f77b8dc2fbb7596f8 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "The changes in here are:

  - text_poke() fixes and an extensive set of executability lockdowns, to
    (hopefully) eliminate the last residual circumstances under which we
    are using W|X mappings even temporarily on x86 kernels. This required
    a broad range of surgery in text patching facilities, module loading,
    trampoline handling and other bits.

  - tweak page fault messages to be more informative and more structured.

  - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the default.

  - reduce KASLR granularity on 5-level paging kernels from 512 GB to 1 GB.

  - misc other changes and updates"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  x86/mm: Initialize PGD cache during mm initialization
  x86/alternatives: Add comment about module removal races
  x86/kprobes: Use vmalloc special flag
  x86/ftrace: Use vmalloc special flag
  bpf: Use vmalloc special flag
  modules: Use vmalloc special flag
  mm/vmalloc: Add flag for freeing of special permsissions
  mm/hibernation: Make hibernation handle unmapped pages
  x86/mm/cpa: Add set_direct_map_*() functions
  x86/alternatives: Remove the return value of text_poke_*()
  x86/jump-label: Remove support for custom text poker
  x86/modules: Avoid breaking W^X while loading modules
  x86/kprobes: Set instruction page as executable
  x86/ftrace: Set trampoline pages as executable
  x86/kgdb: Avoid redundant comparison of patched code
  x86/alternatives: Use temporary mm for text poking
  x86/alternatives: Initialize temporary mm for patching
  fork: Provide a function for copying init_mm
  uprobes: Initialize uprobes earlier
  x86/mm: Save debug registers when loading a temporary mm
  ...
 Documentation/x86/x86_64/mm.txt      |   6
 arch/Kconfig                         |   4
 arch/x86/Kconfig                     |  11
 arch/x86/include/asm/fixmap.h        |   2
 arch/x86/include/asm/mmu_context.h   |  56
 arch/x86/include/asm/pgtable.h       |   3
 arch/x86/include/asm/set_memory.h    |   3
 arch/x86/include/asm/text-patching.h |   7
 arch/x86/include/asm/tlbflush.h      |   2
 arch/x86/kernel/alternative.c        | 201
 arch/x86/kernel/ftrace.c             |  22
 arch/x86/kernel/jump_label.c         |  21
 arch/x86/kernel/kgdb.c               |  25
 arch/x86/kernel/kprobes/core.c       |  19
 arch/x86/kernel/module.c             |   2
 arch/x86/kernel/vmlinux.lds.S        |   6
 arch/x86/mm/fault.c                  |  55
 arch/x86/mm/init.c                   |  37
 arch/x86/mm/kaslr.c                  |  94
 arch/x86/mm/pageattr.c               |  16
 arch/x86/mm/pgtable.c                |  10
 arch/x86/mm/tlb.c                    | 116
 arch/x86/xen/mmu_pv.c                |   2
 include/asm-generic/pgtable.h        |   2
 include/asm-generic/tlb.h            |   9
 include/linux/filter.h               |  18
 include/linux/mm.h                   |  18
 include/linux/sched/task.h           |   1
 include/linux/set_memory.h           |  11
 include/linux/uprobes.h              |   5
 include/linux/vmalloc.h              |  15
 init/main.c                          |   6
 kernel/bpf/core.c                    |   1
 kernel/events/uprobes.c              |   8
 kernel/fork.c                        |  25
 kernel/module.c                      |  82
 kernel/power/snapshot.c              |   5
 kernel/trace/bpf_trace.c             |   8
 mm/page_alloc.c                      |   7
 mm/vmalloc.c                         | 113
 40 files changed, 711 insertions, 343 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 804f9426ed17..6cbe652d7a49 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables
 Notes:
 
  - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
-   from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
+   from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
    offset and many of the regions expand to support the much larger physical
    memory supported.
 
@@ -83,7 +83,7 @@ Notes:
  0000000000000000 |    0       | 00ffffffffffffff |   64 PB | user-space virtual memory, different per mm
 __________________|____________|__________________|_________|___________________________________________________________
                   |            |                  |         |
- 0000800000000000 |  +64    PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
+ 0100000000000000 |  +64    PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
                   |            |                  |         |     virtual memory addresses up to the -64 PB
                   |            |                  |         |     starting offset of kernel mappings.
 __________________|____________|__________________|_________|___________________________________________________________
@@ -99,7 +99,7 @@ ____________________________________________________________|___________________
  ffd2000000000000 | -11.5   PB | ffd3ffffffffffff |  0.5 PB | ... unused hole
  ffd4000000000000 |  -11    PB | ffd5ffffffffffff |  0.5 PB | virtual memory map (vmemmap_base)
  ffd6000000000000 | -10.5   PB | ffdeffffffffffff | 2.25 PB | ... unused hole
- ffdf000000000000 | -8.25   PB | fffffdffffffffff |   ~8 PB | KASAN shadow memory
+ ffdf000000000000 | -8.25   PB | fffffbffffffffff |   ~8 PB | KASAN shadow memory
 __________________|____________|__________________|_________|____________________________________________________________
                   |
                   | Identical layout to the 47-bit one from here on:
diff --git a/arch/Kconfig b/arch/Kconfig
index 3ab446bd12ef..5e43fcbad4ca 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE
 config ARCH_HAS_SET_MEMORY
         bool
 
+# Select if arch has all set_direct_map_invalid/default() functions
+config ARCH_HAS_SET_DIRECT_MAP
+        bool
+
 # Select if arch init_task must go in the __init_task_data section
 config ARCH_TASK_STRUCT_ON_STACK
         bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db95da6d644d..9fc73ca17844 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -65,6 +65,7 @@ config X86
         select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
         select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE
         select ARCH_HAS_SET_MEMORY
+        select ARCH_HAS_SET_DIRECT_MAP
         select ARCH_HAS_STRICT_KERNEL_RWX
         select ARCH_HAS_STRICT_MODULE_RWX
         select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
@@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE
         depends on X86_32 && !NUMA
 
 config ARCH_DISCONTIGMEM_ENABLE
-        def_bool y
-        depends on NUMA && X86_32
-
-config ARCH_DISCONTIGMEM_DEFAULT
-        def_bool y
+        def_bool n
         depends on NUMA && X86_32
+        depends on BROKEN
 
 config ARCH_SPARSEMEM_ENABLE
         def_bool y
@@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE
         select SPARSEMEM_VMEMMAP_ENABLE if X86_64
 
 config ARCH_SPARSEMEM_DEFAULT
-        def_bool y
-        depends on X86_64
+        def_bool X86_64 || (NUMA && X86_32)
 
 config ARCH_SELECT_MEMORY_MODEL
         def_bool y
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 50ba74a34a37..9da8cccdf3fb 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -103,8 +103,6 @@ enum fixed_addresses {
 #ifdef CONFIG_PARAVIRT
         FIX_PARAVIRT_BOOTMAP,
 #endif
-        FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
-        FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
 #ifdef CONFIG_X86_INTEL_MID
         FIX_LNW_VRTC,
 #endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 19d18fae6ec6..93dff1963337 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+#include <asm/debugreg.h>
 
 extern atomic64_t last_mm_ctx_id;
 
@@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void)
         return cr3;
 }
 
+typedef struct {
+        struct mm_struct *mm;
+} temp_mm_state_t;
+
+/*
+ * Using a temporary mm allows to set temporary mappings that are not accessible
+ * by other CPUs. Such mappings are needed to perform sensitive memory writes
+ * that override the kernel memory protections (e.g., W^X), without exposing the
+ * temporary page-table mappings that are required for these write operations to
+ * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
+ * mapping is torn down.
+ *
+ * Context: The temporary mm needs to be used exclusively by a single core. To
+ *          harden security IRQs must be disabled while the temporary mm is
+ *          loaded, thereby preventing interrupt handler bugs from overriding
+ *          the kernel memory protection.
+ */
+static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
+{
+        temp_mm_state_t temp_state;
+
+        lockdep_assert_irqs_disabled();
+        temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+        switch_mm_irqs_off(NULL, mm, current);
+
+        /*
+         * If breakpoints are enabled, disable them while the temporary mm is
+         * used. Userspace might set up watchpoints on addresses that are used
+         * in the temporary mm, which would lead to wrong signals being sent or
+         * crashes.
+         *
+         * Note that breakpoints are not disabled selectively, which also causes
+         * kernel breakpoints (e.g., perf's) to be disabled. This might be
+         * undesirable, but still seems reasonable as the code that runs in the
+         * temporary mm should be short.
+         */
+        if (hw_breakpoint_active())
+                hw_breakpoint_disable();
+
+        return temp_state;
+}
+
+static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
+{
+        lockdep_assert_irqs_disabled();
+        switch_mm_irqs_off(NULL, prev_state.mm, current);
+
+        /*
+         * Restore the breakpoints if they were disabled before the temporary mm
+         * was loaded.
+         */
+        if (hw_breakpoint_active())
+                hw_breakpoint_restore();
+}
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
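
The two helpers added above are only safe with IRQs disabled and with a dedicated mm that maps the data being written. A minimal usage sketch, assuming poking_mm has already been set up by poking_init() and that dst is mapped inside it; this mirrors what __text_poke() in arch/x86/kernel/alternative.c (further down in this diff) does and is an illustration, not part of the patch:

static void example_poke(void *dst, const void *src, size_t len)
{
        temp_mm_state_t prev;
        unsigned long flags;

        local_irq_save(flags);                  /* the helpers assert IRQs are off */
        prev = use_temporary_mm(poking_mm);     /* switch this CPU to the private mm */
        memcpy(dst, src, len);                  /* dst must be mapped in poking_mm */
        unuse_temporary_mm(prev);               /* restore the previously loaded mm */
        local_irq_restore(flags);
}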
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 50b3e2d963c9..3a221942f805 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
         /* Default trampoline pgd value */
         trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
 }
+
+void __init poking_init(void);
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 07a25753e85c..ae7b909dc242 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages);
 int set_pages_ro(struct page *page, int numpages);
 int set_pages_rw(struct page *page, int numpages);
 
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+
 extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..c90678fd391a 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
 #define __parainstructions_end NULL
 #endif
 
-extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+extern void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
  * Clear and restore the kernel write-protection flag on the local CPU.
@@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
  * inconsistent instruction while you patch.
  */
 extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
-extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
 extern int after_bootmem;
+extern __ro_after_init struct mm_struct *poking_mm;
+extern __ro_after_init unsigned long poking_addr;
 
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 90926e8dd1f8..dee375831962 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void)
         return true;
 }
 
+#define nmi_uaccess_okay nmi_uaccess_okay
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a79c7808f9c..7b9b49dfc05a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
+#include <linux/mmu_context.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-void *text_poke_early(void *addr, const void *opcode, size_t len);
+void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
  * Are we looking at a near JMP with a 1 or 4-byte displacement.
@@ -666,16 +667,136 @@ void __init alternative_instructions(void)
  * instructions. And on the local CPU you need to be protected again NMI or MCE
  * handlers seeing an inconsistent instruction while you patch.
  */
-void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void __init_or_module text_poke_early(void *addr, const void *opcode,
                                        size_t len)
 {
         unsigned long flags;
+
+        if (boot_cpu_has(X86_FEATURE_NX) &&
+            is_module_text_address((unsigned long)addr)) {
+                /*
+                 * Modules text is marked initially as non-executable, so the
+                 * code cannot be running and speculative code-fetches are
+                 * prevented. Just change the code.
+                 */
+                memcpy(addr, opcode, len);
+        } else {
+                local_irq_save(flags);
+                memcpy(addr, opcode, len);
+                local_irq_restore(flags);
+                sync_core();
+
+                /*
+                 * Could also do a CLFLUSH here to speed up CPU recovery; but
+                 * that causes hangs on some VIA CPUs.
+                 */
+        }
+}
+
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
+static void *__text_poke(void *addr, const void *opcode, size_t len)
+{
+        bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+        struct page *pages[2] = {NULL};
+        temp_mm_state_t prev;
+        unsigned long flags;
+        pte_t pte, *ptep;
+        spinlock_t *ptl;
+        pgprot_t pgprot;
+
+        /*
+         * While boot memory allocator is running we cannot use struct pages as
+         * they are not yet initialized. There is no way to recover.
+         */
+        BUG_ON(!after_bootmem);
+
+        if (!core_kernel_text((unsigned long)addr)) {
+                pages[0] = vmalloc_to_page(addr);
+                if (cross_page_boundary)
+                        pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+        } else {
+                pages[0] = virt_to_page(addr);
+                WARN_ON(!PageReserved(pages[0]));
+                if (cross_page_boundary)
+                        pages[1] = virt_to_page(addr + PAGE_SIZE);
+        }
+        /*
+         * If something went wrong, crash and burn since recovery paths are not
+         * implemented.
+         */
+        BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
         local_irq_save(flags);
-        memcpy(addr, opcode, len);
+
+        /*
+         * Map the page without the global bit, as TLB flushing is done with
+         * flush_tlb_mm_range(), which is intended for non-global PTEs.
+         */
+        pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+        /*
+         * The lock is not really needed, but this allows to avoid open-coding.
+         */
+        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+
+        /*
+         * This must not fail; preallocated in poking_init().
+         */
+        VM_BUG_ON(!ptep);
+
+        pte = mk_pte(pages[0], pgprot);
+        set_pte_at(poking_mm, poking_addr, ptep, pte);
+
+        if (cross_page_boundary) {
+                pte = mk_pte(pages[1], pgprot);
+                set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+        }
+
+        /*
+         * Loading the temporary mm behaves as a compiler barrier, which
+         * guarantees that the PTE will be set at the time memcpy() is done.
+         */
+        prev = use_temporary_mm(poking_mm);
+
+        kasan_disable_current();
+        memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+        kasan_enable_current();
+
+        /*
+         * Ensure that the PTE is only cleared after the instructions of memcpy
+         * were issued by using a compiler barrier.
+         */
+        barrier();
+
+        pte_clear(poking_mm, poking_addr, ptep);
+        if (cross_page_boundary)
+                pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+
+        /*
+         * Loading the previous page-table hierarchy requires a serializing
+         * instruction that already allows the core to see the updated version.
+         * Xen-PV is assumed to serialize execution in a similar manner.
+         */
+        unuse_temporary_mm(prev);
+
+        /*
+         * Flushing the TLB might involve IPIs, which would require enabled
+         * IRQs, but not if the mm is not used, as it is in this point.
+         */
+        flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+                           (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+                           PAGE_SHIFT, false);
+
+        /*
+         * If the text does not match what we just wrote then something is
+         * fundamentally screwy; there's nothing we can really do about that.
+         */
+        BUG_ON(memcmp(addr, opcode, len));
+
+        pte_unmap_unlock(ptep, ptl);
         local_irq_restore(flags);
-        sync_core();
-        /* Could also do a CLFLUSH here to speed up CPU recovery; but
-           that causes hangs on some VIA CPUs. */
         return addr;
 }
 
@@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
  * It means the size must be writable atomically and the address must be aligned
  * in a way that permits an atomic write. It also makes sure we fit on a single
  * page.
+ *
+ * Note that the caller must ensure that if the modified code is part of a
+ * module, the module would not be removed during poking. This can be achieved
+ * by registering a module notifier, and ordering module removal and patching
+ * trough a mutex.
  */
 void *text_poke(void *addr, const void *opcode, size_t len)
 {
-        unsigned long flags;
-        char *vaddr;
-        struct page *pages[2];
-        int i;
-
-        /*
-         * While boot memory allocator is runnig we cannot use struct
-         * pages as they are not yet initialized.
-         */
-        BUG_ON(!after_bootmem);
-
         lockdep_assert_held(&text_mutex);
 
-        if (!core_kernel_text((unsigned long)addr)) {
-                pages[0] = vmalloc_to_page(addr);
-                pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
-        } else {
-                pages[0] = virt_to_page(addr);
-                WARN_ON(!PageReserved(pages[0]));
-                pages[1] = virt_to_page(addr + PAGE_SIZE);
-        }
-        BUG_ON(!pages[0]);
-        local_irq_save(flags);
-        set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
-        if (pages[1])
-                set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
-        vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
-        memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-        clear_fixmap(FIX_TEXT_POKE0);
-        if (pages[1])
-                clear_fixmap(FIX_TEXT_POKE1);
-        local_flush_tlb();
-        sync_core();
-        /* Could also do a CLFLUSH here to speed up CPU recovery; but
-           that causes hangs on some VIA CPUs. */
-        for (i = 0; i < len; i++)
-                BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
-        local_irq_restore(flags);
-        return addr;
+        return __text_poke(addr, opcode, len);
+}
+
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ *          despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+        return __text_poke(addr, opcode, len);
 }
 
 static void do_sync_core(void *info)
@@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
  * replacing opcode
  * - sync cores
  */
-void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 {
         unsigned char int3 = 0xcc;
 
@@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
          * the writing of the new instruction.
          */
         bp_patching_in_progress = false;
-
-        return addr;
 }
 
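
Taken together, the alternative.c changes above route both text_poke() and the new text_poke_kgdb() through __text_poke(), and tighten the contract on ordinary callers: hold text_mutex for the duration of the patch and, for module text, keep the module from being unloaded. A hedged sketch of a conforming caller; the function name and one-byte payload are made up for illustration:

static void patch_one_byte(void *addr, u8 opcode)
{
        mutex_lock(&text_mutex);                /* required by text_poke() */
        text_poke(addr, &opcode, sizeof(opcode));
        mutex_unlock(&text_mutex);
}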
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..0caf8122d680 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size)
 {
         return module_alloc(size);
 }
-static inline void tramp_free(void *tramp, int size)
+static inline void tramp_free(void *tramp)
 {
-        int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-        set_memory_nx((unsigned long)tramp, npages);
-        set_memory_rw((unsigned long)tramp, npages);
         module_memfree(tramp);
 }
 #else
@@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size)
 {
         return NULL;
 }
-static inline void tramp_free(void *tramp, int size) { }
+static inline void tramp_free(void *tramp) { }
 #endif
 
 /* Defined as markers to the end of the ftrace default trampolines */
@@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
         unsigned long end_offset;
         unsigned long op_offset;
         unsigned long offset;
+        unsigned long npages;
         unsigned long size;
         unsigned long retq;
         unsigned long *ptr;
@@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
                 return 0;
 
         *tramp_size = size + RET_SIZE + sizeof(void *);
+        npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
 
         /* Copy ftrace_caller onto the trampoline memory */
         ret = probe_kernel_read(trampoline, (void *)start_offset, size);
@@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
         /* ALLOC_TRAMP flags lets us know we created it */
         ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
 
+        set_vm_flush_reset_perms(trampoline);
+
+        /*
+         * Module allocation needs to be completed by making the page
+         * executable. The page is still writable, which is a security hazard,
+         * but anyhow ftrace breaks W^X completely.
+         */
+        set_memory_x((unsigned long)trampoline, npages);
         return (unsigned long)trampoline;
 fail:
-        tramp_free(trampoline, *tramp_size);
+        tramp_free(trampoline);
         return 0;
 }
 
@@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
         if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
                 return;
 
-        tramp_free((void *)ops->trampoline, ops->trampoline_size);
+        tramp_free((void *)ops->trampoline);
         ops->trampoline = 0;
 }
 
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f99bd26bd3f1..e631c358f7f4 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
 
 static void __ref __jump_label_transform(struct jump_entry *entry,
                                          enum jump_label_type type,
-                                         void *(*poker)(void *, const void *, size_t),
                                          int init)
 {
         union jump_code_union jmp;
@@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
         jmp.offset = jump_entry_target(entry) -
                      (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
 
-        if (early_boot_irqs_disabled)
-                poker = text_poke_early;
-
         if (type == JUMP_LABEL_JMP) {
                 if (init) {
                         expect = default_nop; line = __LINE__;
@@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
                 bug_at((void *)jump_entry_code(entry), line);
 
         /*
-         * Make text_poke_bp() a default fallback poker.
+         * As long as only a single processor is running and the code is still
+         * not marked as RO, text_poke_early() can be used; Checking that
+         * system_state is SYSTEM_BOOTING guarantees it. It will be set to
+         * SYSTEM_SCHEDULING before other cores are awaken and before the
+         * code is write-protected.
          *
          * At the time the change is being done, just ignore whether we
          * are doing nop -> jump or jump -> nop transition, and assume
          * always nop being the 'currently valid' instruction
-         *
          */
-        if (poker) {
-                (*poker)((void *)jump_entry_code(entry), code,
+        if (init || system_state == SYSTEM_BOOTING) {
+                text_poke_early((void *)jump_entry_code(entry), code,
                          JUMP_LABEL_NOP_SIZE);
                 return;
         }
 
@@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
                                enum jump_label_type type)
 {
         mutex_lock(&text_mutex);
-        __jump_label_transform(entry, type, NULL, 0);
+        __jump_label_transform(entry, type, 0);
         mutex_unlock(&text_mutex);
 }
 
@@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
                 jlstate = JL_STATE_NO_UPDATE;
         }
         if (jlstate == JL_STATE_UPDATE)
-                __jump_label_transform(entry, type, text_poke_early, 1);
+                __jump_label_transform(entry, type, 1);
 }
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4ff6b4cdb941..13b13311b792 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 {
         int err;
-        char opc[BREAK_INSTR_SIZE];
 
         bpt->type = BP_BREAKPOINT;
         err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
         if (!err)
                 return err;
         /*
-         * It is safe to call text_poke() because normal kernel execution
+         * It is safe to call text_poke_kgdb() because normal kernel execution
          * is stopped on all cores, so long as the text_mutex is not locked.
          */
         if (mutex_is_locked(&text_mutex))
                 return -EBUSY;
-        text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+        text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
                   BREAK_INSTR_SIZE);
-        err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
-        if (err)
-                return err;
-        if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
-                return -EINVAL;
         bpt->type = BP_POKE_BREAKPOINT;
 
         return err;
@@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 
 int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 {
-        int err;
-        char opc[BREAK_INSTR_SIZE];
-
         if (bpt->type != BP_POKE_BREAKPOINT)
                 goto knl_write;
         /*
-         * It is safe to call text_poke() because normal kernel execution
+         * It is safe to call text_poke_kgdb() because normal kernel execution
          * is stopped on all cores, so long as the text_mutex is not locked.
          */
         if (mutex_is_locked(&text_mutex))
                 goto knl_write;
-        text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
-        err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
-        if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
-                goto knl_write;
-        return err;
+        text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
+                       BREAK_INSTR_SIZE);
+        return 0;
 
 knl_write:
         return probe_kernel_write((char *)bpt->bpt_addr,
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 122548ad5c2e..cf52ee0d8711 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -431,8 +431,21 @@ void *alloc_insn_page(void)
         void *page;
 
         page = module_alloc(PAGE_SIZE);
-        if (page)
-                set_memory_ro((unsigned long)page & PAGE_MASK, 1);
+        if (!page)
+                return NULL;
+
+        set_vm_flush_reset_perms(page);
+        /*
+         * First make the page read-only, and only then make it executable to
+         * prevent it from being W+X in between.
+         */
+        set_memory_ro((unsigned long)page, 1);
+
+        /*
+         * TODO: Once additional kernel code protection mechanisms are set, ensure
+         * that the page was not maliciously altered and it is still zeroed.
+         */
+        set_memory_x((unsigned long)page, 1);
 
         return page;
 }
@@ -440,8 +453,6 @@ void *alloc_insn_page(void)
 /* Recover page to RW mode before releasing it */
 void free_insn_page(void *page)
 {
-        set_memory_nx((unsigned long)page & PAGE_MASK, 1);
-        set_memory_rw((unsigned long)page & PAGE_MASK, 1);
         module_memfree(page);
 }
 
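
The kprobes hunk above shows the allocation pattern this series converges on: allocate with module_alloc() (which the arch/x86/kernel/module.c change below makes non-executable by default), tag the vmalloc area with set_vm_flush_reset_perms() so permissions are reset when it is freed, and flip the page read-only before making it executable so there is never a W+X window. As a standalone, illustrative sketch (not code from the patch):

static void *alloc_exec_page(void)
{
        void *page = module_alloc(PAGE_SIZE);

        if (!page)
                return NULL;

        set_vm_flush_reset_perms(page);         /* free path restores default perms */
        set_memory_ro((unsigned long)page, 1);  /* drop write before granting exec */
        set_memory_x((unsigned long)page, 1);   /* never writable+executable at once */

        return page;
}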
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..cfa3106faee4 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
         p = __vmalloc_node_range(size, MODULE_ALIGN,
                                  MODULES_VADDR + get_module_load_offset(),
                                  MODULES_END, GFP_KERNEL,
-                                 PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+                                 PAGE_KERNEL, 0, NUMA_NO_NODE,
                                  __builtin_return_address(0));
         if (p && (kasan_module_alloc(p, size) < 0)) {
                 vfree(p);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4d1517022a14..0850b5149345 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -141,11 +141,11 @@ SECTIONS
                 *(.text.__x86.indirect_thunk)
                 __indirect_thunk_end = .;
 #endif
-
-                /* End of text section */
-                _etext = .;
         } :text = 0x9090
 
+        /* End of text section */
+        _etext = .;
+
         NOTES :text :note
 
         EXCEPTION_TABLE(16) :text = 0x9090
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 06c089513d39..46df4c6aae46 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address)
         if (!(address >= VMALLOC_START && address < VMALLOC_END))
                 return -1;
 
-        WARN_ON_ONCE(in_nmi());
-
         /*
          * Copy kernel mappings over when needed. This can also
          * happen within a race in page table update. In the later
@@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
                  name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
 }
 
-/*
- * This helper function transforms the #PF error_code bits into
- * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
- */
-static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
-{
-        if (error_code & mask) {
-                if (buf[0])
-                        strcat(buf, " ");
-                strcat(buf, txt);
-        }
-}
-
 static void
 show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-        char err_txt[64];
-
         if (!oops_may_print())
                 return;
 
@@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
                          from_kuid(&init_user_ns, current_uid()));
         }
 
-        pr_alert("BUG: unable to handle kernel %s at %px\n",
-                 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
-                 (void *)address);
-
-        err_txt[0] = 0;
-
-        /*
-         * Note: length of these appended strings including the separation space and the
-         * zero delimiter must fit into err_txt[].
-         */
-        err_str_append(error_code, err_txt, X86_PF_PROT,  "[PROT]" );
-        err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
-        err_str_append(error_code, err_txt, X86_PF_USER,  "[USER]" );
-        err_str_append(error_code, err_txt, X86_PF_RSVD,  "[RSVD]" );
-        err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
-        err_str_append(error_code, err_txt, X86_PF_PK,    "[PK]" );
-
-        pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+        if (address < PAGE_SIZE && !user_mode(regs))
+                pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
+                        (void *)address);
+        else
+                pr_alert("BUG: unable to handle page fault for address: %px\n",
+                        (void *)address);
+
+        pr_alert("#PF: %s %s in %s mode\n",
+                 (error_code & X86_PF_USER)  ? "user" : "supervisor",
+                 (error_code & X86_PF_INSTR) ? "instruction fetch" :
+                 (error_code & X86_PF_WRITE) ? "write access" :
+                                               "read access",
+                 user_mode(regs) ? "user" : "kernel");
+        pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
+                 !(error_code & X86_PF_PROT) ? "not-present page" :
+                 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
+                 (error_code & X86_PF_PK)    ? "protection keys violation" :
+                                               "permissions violation");
 
         if (!(error_code & X86_PF_USER) && user_mode(regs)) {
                 struct desc_ptr idt, gdt;
                 u16 ldtr, tr;
 
-                pr_alert("This was a system access from user code\n");
-
                 /*
                  * This can happen for quite a few reasons. The more obvious
                  * ones are faults accessing the GDT, or LDT. Perhaps
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8dacdb96899e..fd10d91a6115 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -6,6 +6,7 @@
 #include <linux/swapfile.h>
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
+#include <linux/sched/task.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -23,6 +24,7 @@
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
 #include <asm/pti.h>
+#include <asm/text-patching.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -702,6 +704,41 @@ void __init init_mem_mapping(void)
 }
 
 /*
+ * Initialize an mm_struct to be used during poking and a pointer to be used
+ * during patching.
+ */
+void __init poking_init(void)
+{
+        spinlock_t *ptl;
+        pte_t *ptep;
+
+        poking_mm = copy_init_mm();
+        BUG_ON(!poking_mm);
+
+        /*
+         * Randomize the poking address, but make sure that the following page
+         * will be mapped at the same PMD. We need 2 pages, so find space for 3,
+         * and adjust the address if the PMD ends after the first one.
+         */
+        poking_addr = TASK_UNMAPPED_BASE;
+        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+                poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+                        (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
+
+        if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+                poking_addr += PAGE_SIZE;
+
+        /*
+         * We need to trigger the allocation of the page-tables that will be
+         * needed for poking now. Later, poking may be performed in an atomic
+         * section, which might cause allocation to fail.
+         */
+        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+        BUG_ON(!ptep);
+        pte_unmap_unlock(ptep, ptl);
+}
+
+/*
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
  * is valid. The argument is a physical page number.
  *
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index d669c5e797e0..dc3f058bdf9b 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void)
          */
         entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
         prandom_bytes_state(&rand_state, &rand, sizeof(rand));
-        if (pgtable_l5_enabled())
-                entropy = (rand % (entropy + 1)) & P4D_MASK;
-        else
-                entropy = (rand % (entropy + 1)) & PUD_MASK;
+        entropy = (rand % (entropy + 1)) & PUD_MASK;
         vaddr += entropy;
         *kaslr_regions[i].base = vaddr;
 
@@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void)
          * randomization alignment.
          */
         vaddr += get_padding(&kaslr_regions[i]);
-        if (pgtable_l5_enabled())
-                vaddr = round_up(vaddr + 1, P4D_SIZE);
-        else
-                vaddr = round_up(vaddr + 1, PUD_SIZE);
+        vaddr = round_up(vaddr + 1, PUD_SIZE);
         remain_entropy -= entropy;
         }
 }
 
 static void __meminit init_trampoline_pud(void)
 {
-        unsigned long paddr, paddr_next;
+        pud_t *pud_page_tramp, *pud, *pud_tramp;
+        p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
+        unsigned long paddr, vaddr;
         pgd_t *pgd;
-        pud_t *pud_page, *pud_page_tramp;
-        int i;
 
         pud_page_tramp = alloc_low_page();
 
+        /*
+         * There are two mappings for the low 1MB area, the direct mapping
+         * and the 1:1 mapping for the real mode trampoline:
+         *
+         * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET
+         * 1:1 mapping:    virt_addr = phys_addr
+         */
         paddr = 0;
-        pgd = pgd_offset_k((unsigned long)__va(paddr));
-        pud_page = (pud_t *) pgd_page_vaddr(*pgd);
-
-        for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
-                pud_t *pud, *pud_tramp;
-                unsigned long vaddr = (unsigned long)__va(paddr);
+        vaddr = (unsigned long)__va(paddr);
+        pgd = pgd_offset_k(vaddr);
 
-                pud_tramp = pud_page_tramp + pud_index(paddr);
-                pud = pud_page + pud_index(vaddr);
-                paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
-
-                *pud_tramp = *pud;
-        }
+        p4d = p4d_offset(pgd, vaddr);
+        pud = pud_offset(p4d, vaddr);
 
-        set_pgd(&trampoline_pgd_entry,
-                __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
-}
-
-static void __meminit init_trampoline_p4d(void)
-{
-        unsigned long paddr, paddr_next;
-        pgd_t *pgd;
-        p4d_t *p4d_page, *p4d_page_tramp;
-        int i;
+        pud_tramp = pud_page_tramp + pud_index(paddr);
+        *pud_tramp = *pud;
 
-        p4d_page_tramp = alloc_low_page();
-
-        paddr = 0;
-        pgd = pgd_offset_k((unsigned long)__va(paddr));
-        p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
-
-        for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
-                p4d_t *p4d, *p4d_tramp;
-                unsigned long vaddr = (unsigned long)__va(paddr);
+        if (pgtable_l5_enabled()) {
+                p4d_page_tramp = alloc_low_page();
 
-                p4d_tramp = p4d_page_tramp + p4d_index(paddr);
-                p4d = p4d_page + p4d_index(vaddr);
-                paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+                p4d_tramp = p4d_page_tramp + p4d_index(paddr);
 
-                *p4d_tramp = *p4d;
-        }
+                set_p4d(p4d_tramp,
+                        __p4d(_KERNPG_TABLE | __pa(pud_page_tramp)));
 
-        set_pgd(&trampoline_pgd_entry,
-                __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+                set_pgd(&trampoline_pgd_entry,
+                        __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+        } else {
+                set_pgd(&trampoline_pgd_entry,
+                        __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+        }
 }
 
 /*
- * Create PGD aligned trampoline table to allow real mode initialization
- * of additional CPUs. Consume only 1 low memory page.
+ * The real mode trampoline, which is required for bootstrapping CPUs
+ * occupies only a small area under the low 1MB. See reserve_real_mode()
+ * for details.
+ *
+ * If KASLR is disabled the first PGD entry of the direct mapping is copied
+ * to map the real mode trampoline.
+ *
+ * If KASLR is enabled, copy only the PUD which covers the low 1MB
+ * area. This limits the randomization granularity to 1GB for both 4-level
+ * and 5-level paging.
  */
 void __meminit init_trampoline(void)
 {
-
         if (!kaslr_memory_enabled()) {
                 init_trampoline_default();
                 return;
         }
 
-        if (pgtable_l5_enabled())
-                init_trampoline_p4d();
-        else
-                init_trampoline_pud();
+        init_trampoline_pud();
 }
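
The kaslr.c change above is where the pull request's "512 GB to 1 GB" figure comes from; the arithmetic behind it, for reference (page-table geometry, not taken from this diff):

/*
 * A PUD entry maps 1UL << 30 bytes (1 GB) on both 4-level and 5-level
 * paging, while a P4D entry maps 1UL << 39 bytes (512 GB) on 5-level
 * paging. Rounding region bases to PUD_SIZE instead of P4D_SIZE is what
 * shrinks the KASLR randomization step on 5-level kernels to 1 GB.
 */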
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4c570612e24e..daf4d645e537 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages)
         return set_memory_rw(addr, numpages);
 }
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-
 static int __set_pages_p(struct page *page, int numpages)
 {
         unsigned long tempaddr = (unsigned long) page_address(page);
@@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages)
         return __change_page_attr_set_clr(&cpa, 0);
 }
 
+int set_direct_map_invalid_noflush(struct page *page)
+{
+        return __set_pages_np(page, 1);
+}
+
+int set_direct_map_default_noflush(struct page *page)
+{
+        return __set_pages_p(page, 1);
+}
+
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
         if (PageHighMem(page))
@@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
 }
 
 #ifdef CONFIG_HIBERNATION
-
 bool kernel_page_present(struct page *page)
 {
         unsigned int level;
@@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page)
         pte = lookup_address((unsigned long)page_address(page), &level);
         return (pte_val(*pte) & _PAGE_PRESENT);
 }
-
 #endif /* CONFIG_HIBERNATION */
 
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
 int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
                                    unsigned numpages, unsigned long page_flags)
 {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3dbf440d4114..1f67b1e15bf6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
 
 static struct kmem_cache *pgd_cache;
 
-static int __init pgd_cache_init(void)
+void __init pgd_cache_init(void)
 {
         /*
          * When PAE kernel is running as a Xen domain, it does not use
          * shared kernel pmd. And this requires a whole page for pgd.
          */
         if (!SHARED_KERNEL_PMD)
-                return 0;
+                return;
 
         /*
          * when PAE kernel is not running as a Xen domain, it uses
@@ -390,9 +390,7 @@ static int __init pgd_cache_init(void)
          */
         pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
                                       SLAB_PANIC, NULL);
-        return 0;
 }
-core_initcall(pgd_cache_init);
 
 static inline pgd_t *_pgd_alloc(void)
 {
@@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd)
 }
 #else
 
+void __init pgd_cache_init(void)
+{
+}
+
 static inline pgd_t *_pgd_alloc(void)
 {
         return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 487b8474c01c..7f61431c75fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
634 this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); 634 this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
635} 635}
636 636
637static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) 637static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
638{ 638{
639 const struct flush_tlb_info *f = info; 639 const struct flush_tlb_info *f = info;
640 640
@@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
722 */ 722 */
723unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; 723unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
724 724
725static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
726
727#ifdef CONFIG_DEBUG_VM
728static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
729#endif
730
731static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
732 unsigned long start, unsigned long end,
733 unsigned int stride_shift, bool freed_tables,
734 u64 new_tlb_gen)
735{
736 struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
737
738#ifdef CONFIG_DEBUG_VM
739 /*
740 * Ensure that the following code is non-reentrant and flush_tlb_info
741 * is not overwritten. This means no TLB flushing is initiated by
742 * interrupt handlers and machine-check exception handlers.
743 */
744 BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
745#endif
746
747 info->start = start;
748 info->end = end;
749 info->mm = mm;
750 info->stride_shift = stride_shift;
751 info->freed_tables = freed_tables;
752 info->new_tlb_gen = new_tlb_gen;
753
754 return info;
755}
756
757static inline void put_flush_tlb_info(void)
758{
759#ifdef CONFIG_DEBUG_VM
760 /* Complete reentrency prevention checks */
761 barrier();
762 this_cpu_dec(flush_tlb_info_idx);
763#endif
764}
765
725void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 766void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
726 unsigned long end, unsigned int stride_shift, 767 unsigned long end, unsigned int stride_shift,
727 bool freed_tables) 768 bool freed_tables)
728{ 769{
770 struct flush_tlb_info *info;
771 u64 new_tlb_gen;
729 int cpu; 772 int cpu;
730 773
731 struct flush_tlb_info info = {
732 .mm = mm,
733 .stride_shift = stride_shift,
734 .freed_tables = freed_tables,
735 };
736
737 cpu = get_cpu(); 774 cpu = get_cpu();
738 775
739 /* This is also a barrier that synchronizes with switch_mm(). */
740 info.new_tlb_gen = inc_mm_tlb_gen(mm);
741
742 /* Should we flush just the requested range? */ 776 /* Should we flush just the requested range? */
743 if ((end != TLB_FLUSH_ALL) && 777 if ((end == TLB_FLUSH_ALL) ||
744 ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { 778 ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
745 info.start = start; 779 start = 0;
746 info.end = end; 780 end = TLB_FLUSH_ALL;
747 } else {
748 info.start = 0UL;
749 info.end = TLB_FLUSH_ALL;
750 } 781 }
751 782
783 /* This is also a barrier that synchronizes with switch_mm(). */
784 new_tlb_gen = inc_mm_tlb_gen(mm);
785
786 info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
787 new_tlb_gen);
788
752 if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { 789 if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
753 VM_WARN_ON(irqs_disabled()); 790 lockdep_assert_irqs_enabled();
754 local_irq_disable(); 791 local_irq_disable();
755 flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); 792 flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
756 local_irq_enable(); 793 local_irq_enable();
757 } 794 }
758 795
759 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) 796 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
760 flush_tlb_others(mm_cpumask(mm), &info); 797 flush_tlb_others(mm_cpumask(mm), info);
761 798
799 put_flush_tlb_info();
762 put_cpu(); 800 put_cpu();
763} 801}
764 802
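The hunks above replace the on-stack flush_tlb_info with a per-CPU slot. The slot returned by get_flush_tlb_info() is only valid while preemption is disabled and until the matching put_flush_tlb_info(); under CONFIG_DEBUG_VM the flush_tlb_info_idx counter catches reentrant use from interrupt or machine-check context. A condensed, illustrative sketch of the calling convention inside tlb.c follows (the wrapper name is made up; the full sequence, including the single-page/full-flush heuristic, is flush_tlb_mm_range() above):

/* Illustrative only: a trimmed-down caller following the same
 * get/put bracket as flush_tlb_mm_range() above. */
static void example_flush_range(struct mm_struct *mm,
                                unsigned long start, unsigned long end)
{
        struct flush_tlb_info *info;
        u64 new_tlb_gen;
        int cpu;

        cpu = get_cpu();                        /* pins the per-CPU slot */

        /* Also a barrier that synchronizes with switch_mm(). */
        new_tlb_gen = inc_mm_tlb_gen(mm);

        info = get_flush_tlb_info(mm, start, end, PAGE_SHIFT, false,
                                  new_tlb_gen);

        if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
                lockdep_assert_irqs_enabled();
                local_irq_disable();
                flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), info);

        put_flush_tlb_info();                   /* re-arms the DEBUG_VM check */
        put_cpu();
}

For scale: with the default tlb_single_page_flush_ceiling of 33 and 4 KiB pages, flush_tlb_mm_range() flushes ranges of up to 33 pages (132 KiB) page by page and promotes anything larger, or TLB_FLUSH_ALL, to a full flush before the per-CPU info is filled in.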
@@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info)
787 825
788void flush_tlb_kernel_range(unsigned long start, unsigned long end) 826void flush_tlb_kernel_range(unsigned long start, unsigned long end)
789{ 827{
790
791 /* Balance as user space task's flush, a bit conservative */ 828 /* Balance as user space task's flush, a bit conservative */
792 if (end == TLB_FLUSH_ALL || 829 if (end == TLB_FLUSH_ALL ||
793 (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { 830 (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
794 on_each_cpu(do_flush_tlb_all, NULL, 1); 831 on_each_cpu(do_flush_tlb_all, NULL, 1);
795 } else { 832 } else {
796 struct flush_tlb_info info; 833 struct flush_tlb_info *info;
797 info.start = start; 834
798 info.end = end; 835 preempt_disable();
799 on_each_cpu(do_kernel_range_flush, &info, 1); 836 info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
837
838 on_each_cpu(do_kernel_range_flush, info, 1);
839
840 put_flush_tlb_info();
841 preempt_enable();
800 } 842 }
801} 843}
802 844
845/*
846 * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
847 * This means that the 'struct flush_tlb_info' that describes which mappings to
848 * flush is actually fixed. We therefore set a single fixed struct and use it in
849 * arch_tlbbatch_flush().
850 */
851static const struct flush_tlb_info full_flush_tlb_info = {
852 .mm = NULL,
853 .start = 0,
854 .end = TLB_FLUSH_ALL,
855};
856
803void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) 857void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
804{ 858{
805 struct flush_tlb_info info = {
806 .mm = NULL,
807 .start = 0UL,
808 .end = TLB_FLUSH_ALL,
809 };
810
811 int cpu = get_cpu(); 859 int cpu = get_cpu();
812 860
813 if (cpumask_test_cpu(cpu, &batch->cpumask)) { 861 if (cpumask_test_cpu(cpu, &batch->cpumask)) {
814 VM_WARN_ON(irqs_disabled()); 862 lockdep_assert_irqs_enabled();
815 local_irq_disable(); 863 local_irq_disable();
816 flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); 864 flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
817 local_irq_enable(); 865 local_irq_enable();
818 } 866 }
819 867
820 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) 868 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
821 flush_tlb_others(&batch->cpumask, &info); 869 flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
822 870
823 cpumask_clear(&batch->cpumask); 871 cpumask_clear(&batch->cpumask);
824 872
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index a21e1734fc1f..beb44e22afdf 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2318#elif defined(CONFIG_X86_VSYSCALL_EMULATION) 2318#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
2319 case VSYSCALL_PAGE: 2319 case VSYSCALL_PAGE:
2320#endif 2320#endif
2321 case FIX_TEXT_POKE0:
2322 case FIX_TEXT_POKE1:
2323 /* All local page mappings */ 2321 /* All local page mappings */
2324 pte = pfn_pte(phys, prot); 2322 pte = pfn_pte(phys, prot);
2325 break; 2323 break;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index fa782fba51ee..75d9d68a6de7 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
1126static inline void init_espfix_bsp(void) { } 1126static inline void init_espfix_bsp(void) { }
1127#endif 1127#endif
1128 1128
1129extern void __init pgd_cache_init(void);
1130
1129#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED 1131#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
1130static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) 1132static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
1131{ 1133{
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b9edc7608d90..480e5b2a5748 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -21,6 +21,15 @@
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
23 23
24/*
25 * Blindly accessing user memory from NMI context can be dangerous
26 * if we're in the middle of switching the current user task or switching
27 * the loaded mm.
28 */
29#ifndef nmi_uaccess_okay
30# define nmi_uaccess_okay() true
31#endif
32
24#ifdef CONFIG_MMU 33#ifdef CONFIG_MMU
25 34
26/* 35/*
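The #ifndef fallback above gives architectures that do not track mm-switch state a permissive default. An architecture that does track it overrides the hook by defining the macro in its own headers before <asm-generic/tlb.h> is included. A hypothetical sketch of such an override (example_tlb_state and its field are illustrative, not the actual x86 implementation):

/* In an arch header pulled in ahead of <asm-generic/tlb.h>. */
struct example_tlb_state {
        struct mm_struct *loaded_mm;    /* mm whose page tables are loaded */
};
DECLARE_PER_CPU(struct example_tlb_state, example_tlb_state);

static inline bool nmi_uaccess_okay(void)
{
        /*
         * User memory is only safe to touch when the loaded page tables
         * belong to current->mm; during a temporary mm switch they do not.
         */
        return this_cpu_read(example_tlb_state.loaded_mm) == current->mm;
}
#define nmi_uaccess_okay nmi_uaccess_okay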
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6074aa064b54..7d3abde3f183 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,6 +20,7 @@
20#include <linux/set_memory.h> 20#include <linux/set_memory.h>
21#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
22#include <linux/if_vlan.h> 22#include <linux/if_vlan.h>
23#include <linux/vmalloc.h>
23 24
24#include <net/sch_generic.h> 25#include <net/sch_generic.h>
25 26
@@ -503,7 +504,6 @@ struct bpf_prog {
503 u16 pages; /* Number of allocated pages */ 504 u16 pages; /* Number of allocated pages */
504 u16 jited:1, /* Is our filter JIT'ed? */ 505 u16 jited:1, /* Is our filter JIT'ed? */
505 jit_requested:1,/* archs need to JIT the prog */ 506 jit_requested:1,/* archs need to JIT the prog */
506 undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
507 gpl_compatible:1, /* Is filter GPL compatible? */ 507 gpl_compatible:1, /* Is filter GPL compatible? */
508 cb_access:1, /* Is control block accessed? */ 508 cb_access:1, /* Is control block accessed? */
509 dst_needed:1, /* Do we need dst entry? */ 509 dst_needed:1, /* Do we need dst entry? */
@@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
733 733
734static inline void bpf_prog_lock_ro(struct bpf_prog *fp) 734static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
735{ 735{
736 fp->undo_set_mem = 1; 736 set_vm_flush_reset_perms(fp);
737 set_memory_ro((unsigned long)fp, fp->pages); 737 set_memory_ro((unsigned long)fp, fp->pages);
738} 738}
739 739
740static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
741{
742 if (fp->undo_set_mem)
743 set_memory_rw((unsigned long)fp, fp->pages);
744}
745
746static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) 740static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
747{ 741{
742 set_vm_flush_reset_perms(hdr);
748 set_memory_ro((unsigned long)hdr, hdr->pages); 743 set_memory_ro((unsigned long)hdr, hdr->pages);
749} 744 set_memory_x((unsigned long)hdr, hdr->pages);
750
751static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
752{
753 set_memory_rw((unsigned long)hdr, hdr->pages);
754} 745}
755 746
756static inline struct bpf_binary_header * 747static inline struct bpf_binary_header *
@@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp);
788 779
789static inline void bpf_prog_unlock_free(struct bpf_prog *fp) 780static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
790{ 781{
791 bpf_prog_unlock_ro(fp);
792 __bpf_prog_free(fp); 782 __bpf_prog_free(fp);
793} 783}
794 784
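With undo_set_mem and the unlock helpers gone, sealing a JIT image becomes a one-way operation: the vm area is tagged with VM_FLUSH_RESET_PERMS up front, and vmalloc restores the direct map and flushes TLBs when the image is eventually freed. A sketch of the resulting lifecycle, condensed from the helpers above (error handling omitted):

/* Condensed from bpf_jit_binary_lock_ro() above. */
static void example_seal_jit_image(struct bpf_binary_header *hdr)
{
        set_vm_flush_reset_perms(hdr);                  /* tag the vm area */
        set_memory_ro((unsigned long)hdr, hdr->pages);  /* no more writes */
        set_memory_x((unsigned long)hdr, hdr->pages);   /* now executable */
}

/*
 * Teardown needs no counterpart to set_memory_rw(): freeing the image
 * (bpf_jit_binary_free() -> vfree()) resets permissions because of
 * VM_FLUSH_RESET_PERMS, which is why bpf_jit_binary_unlock_ro() and
 * bpf_prog_unlock_ro() could be deleted.
 */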
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6b10c21630f5..083d7b4863ed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
2610 int enable) { } 2610 int enable) { }
2611#endif 2611#endif
2612 2612
2613#ifdef CONFIG_DEBUG_PAGEALLOC
2614extern bool _debug_pagealloc_enabled; 2613extern bool _debug_pagealloc_enabled;
2615extern void __kernel_map_pages(struct page *page, int numpages, int enable);
2616 2614
2617static inline bool debug_pagealloc_enabled(void) 2615static inline bool debug_pagealloc_enabled(void)
2618{ 2616{
2619 return _debug_pagealloc_enabled; 2617 return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled;
2620} 2618}
2621 2619
2620#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
2621extern void __kernel_map_pages(struct page *page, int numpages, int enable);
2622
2622static inline void 2623static inline void
2623kernel_map_pages(struct page *page, int numpages, int enable) 2624kernel_map_pages(struct page *page, int numpages, int enable)
2624{ 2625{
2625 if (!debug_pagealloc_enabled())
2626 return;
2627
2628 __kernel_map_pages(page, numpages, enable); 2626 __kernel_map_pages(page, numpages, enable);
2629} 2627}
2630#ifdef CONFIG_HIBERNATION 2628#ifdef CONFIG_HIBERNATION
2631extern bool kernel_page_present(struct page *page); 2629extern bool kernel_page_present(struct page *page);
2632#endif /* CONFIG_HIBERNATION */ 2630#endif /* CONFIG_HIBERNATION */
2633#else /* CONFIG_DEBUG_PAGEALLOC */ 2631#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
2634static inline void 2632static inline void
2635kernel_map_pages(struct page *page, int numpages, int enable) {} 2633kernel_map_pages(struct page *page, int numpages, int enable) {}
2636#ifdef CONFIG_HIBERNATION 2634#ifdef CONFIG_HIBERNATION
2637static inline bool kernel_page_present(struct page *page) { return true; } 2635static inline bool kernel_page_present(struct page *page) { return true; }
2638#endif /* CONFIG_HIBERNATION */ 2636#endif /* CONFIG_HIBERNATION */
2639static inline bool debug_pagealloc_enabled(void) 2637#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
2640{
2641 return false;
2642}
2643#endif /* CONFIG_DEBUG_PAGEALLOC */
2644 2638
2645#ifdef __HAVE_ARCH_GATE_AREA 2639#ifdef __HAVE_ARCH_GATE_AREA
2646extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); 2640extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
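debug_pagealloc_enabled() is now defined, and compile-time false, even without CONFIG_DEBUG_PAGEALLOC, while kernel_map_pages() no longer performs the check itself; hot-path callers are expected to test it before calling, as the mm/page_alloc.c hunks further down do. A minimal sketch of the new convention:

/* Mirrors the free_pages_prepare()/post_alloc_hook() hunks below. */
static inline void example_unmap_freed_pages(struct page *page, int order)
{
        /*
         * With CONFIG_DEBUG_PAGEALLOC=n, IS_ENABLED() turns this test
         * into a compile-time constant and the branch is eliminated.
         */
        if (debug_pagealloc_enabled())
                kernel_map_pages(page, 1 << order, 0);
}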
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 2e97a2227045..f1227f2c38a4 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *);
76extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); 76extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
77extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); 77extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
78struct task_struct *fork_idle(int); 78struct task_struct *fork_idle(int);
79struct mm_struct *copy_init_mm(void);
79extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); 80extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
80extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); 81extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
81 82
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 2a986d282a97..b5071497b8cb 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
17static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } 17static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
18#endif 18#endif
19 19
20#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP
21static inline int set_direct_map_invalid_noflush(struct page *page)
22{
23 return 0;
24}
25static inline int set_direct_map_default_noflush(struct page *page)
26{
27 return 0;
28}
29#endif
30
20#ifndef set_mce_nospec 31#ifndef set_mce_nospec
21static inline int set_mce_nospec(unsigned long pfn) 32static inline int set_mce_nospec(unsigned long pfn)
22{ 33{
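These no-op stubs let generic code call the direct-map helpers unconditionally: on architectures without CONFIG_ARCH_HAS_SET_DIRECT_MAP they return 0 and nothing happens, so callers need no #ifdefs of their own. A hypothetical caller sketch (the function name is illustrative; the real consumer in this series is set_area_direct_map() in the mm/vmalloc.c hunk below):

#include <linux/set_memory.h>
#include <linux/mm.h>
#include <asm/tlbflush.h>

/* Illustrative: drop a page from the kernel direct map before handing
 * it to something that must not see a stale writable alias. */
static void example_hide_from_direct_map(struct page *page)
{
        unsigned long addr = (unsigned long)page_address(page);

        if (!addr)              /* e.g. highmem: not in the direct map */
                return;

        set_direct_map_invalid_noflush(page);   /* no-op stub if unsupported */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
}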
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 103a48a48872..12bf0b68ed92 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -115,6 +115,7 @@ struct uprobes_state {
115 struct xol_area *xol_area; 115 struct xol_area *xol_area;
116}; 116};
117 117
118extern void __init uprobes_init(void);
118extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); 119extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
119extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); 120extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
120extern bool is_swbp_insn(uprobe_opcode_t *insn); 121extern bool is_swbp_insn(uprobe_opcode_t *insn);
@@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
154struct uprobes_state { 155struct uprobes_state {
155}; 156};
156 157
158static inline void uprobes_init(void)
159{
160}
161
157#define uprobe_get_trap_addr(regs) instruction_pointer(regs) 162#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
158 163
159static inline int 164static inline int
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 398e9c95cd61..c6eebb839552 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */
21#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ 21#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
22#define VM_NO_GUARD 0x00000040 /* don't add guard page */ 22#define VM_NO_GUARD 0x00000040 /* don't add guard page */
23#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ 23#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
24/*
25 * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
26 * vfree_atomic().
27 */
28#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */
24/* bits [20..32] reserved for arch specific ioremap internals */ 29/* bits [20..32] reserved for arch specific ioremap internals */
25 30
26/* 31/*
@@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
142 pgprot_t prot, struct page **pages); 147 pgprot_t prot, struct page **pages);
143extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); 148extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
144extern void unmap_kernel_range(unsigned long addr, unsigned long size); 149extern void unmap_kernel_range(unsigned long addr, unsigned long size);
150static inline void set_vm_flush_reset_perms(void *addr)
151{
152 struct vm_struct *vm = find_vm_area(addr);
153
154 if (vm)
155 vm->flags |= VM_FLUSH_RESET_PERMS;
156}
145#else 157#else
146static inline int 158static inline int
147map_kernel_range_noflush(unsigned long start, unsigned long size, 159map_kernel_range_noflush(unsigned long start, unsigned long size,
@@ -157,6 +169,9 @@ static inline void
157unmap_kernel_range(unsigned long addr, unsigned long size) 169unmap_kernel_range(unsigned long addr, unsigned long size)
158{ 170{
159} 171}
172static inline void set_vm_flush_reset_perms(void *addr)
173{
174}
160#endif 175#endif
161 176
162/* Allocate/destroy a 'vmalloc' VM area. */ 177/* Allocate/destroy a 'vmalloc' VM area. */
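Taken together, VM_FLUSH_RESET_PERMS and set_vm_flush_reset_perms() let callers allocate memory whose permissions will be tightened later without having to undo anything before freeing; the only rule is that such areas are freed from process context, not from an interrupt or via vfree_atomic(). A sketch of an allocation that opts in at creation time, mirroring the vmalloc_exec() change at the end of this diff (the wrapper name is illustrative):

#include <linux/vmalloc.h>

/* Illustrative wrapper: executable-capable memory whose direct-map
 * permissions are reset and TLBs flushed automatically on vfree(). */
static void *example_alloc_exec(unsigned long size)
{
        return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
                                    GFP_KERNEL, PAGE_KERNEL_EXEC,
                                    VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}

Existing allocations can opt in after the fact with set_vm_flush_reset_perms(addr), as the module and BPF hunks in this series do.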
diff --git a/init/main.c b/init/main.c
index 7d4025d665eb..9dc2f3b4f753 100644
--- a/init/main.c
+++ b/init/main.c
@@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void)
504 504
505void __init __weak mem_encrypt_init(void) { } 505void __init __weak mem_encrypt_init(void) { }
506 506
507void __init __weak poking_init(void) { }
508
509void __init __weak pgd_cache_init(void) { }
510
507bool initcall_debug; 511bool initcall_debug;
508core_param(initcall_debug, initcall_debug, bool, 0644); 512core_param(initcall_debug, initcall_debug, bool, 0644);
509 513
@@ -535,6 +539,7 @@ static void __init mm_init(void)
535 init_espfix_bsp(); 539 init_espfix_bsp();
536 /* Should be run after espfix64 is set up. */ 540 /* Should be run after espfix64 is set up. */
537 pti_init(); 541 pti_init();
542 pgd_cache_init();
538} 543}
539 544
540void __init __weak arch_call_rest_init(void) 545void __init __weak arch_call_rest_init(void)
@@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void)
737 taskstats_init_early(); 742 taskstats_init_early();
738 delayacct_init(); 743 delayacct_init();
739 744
745 poking_init();
740 check_bugs(); 746 check_bugs();
741 747
742 acpi_subsystem_init(); 748 acpi_subsystem_init();
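poking_init() and pgd_cache_init() are wired up as __weak no-ops so that start_kernel() and mm_init() can call them unconditionally; an architecture participates simply by providing a strong definition, no registration needed. A minimal sketch of the override (the body is illustrative; the real x86 poking_init() sets up the temporary patching mm elsewhere in this series):

#include <linux/init.h>
#include <linux/printk.h>

/* In arch code: this strong definition replaces the __weak stub above
 * at link time. */
void __init poking_init(void)
{
        /* illustrative body: prepare whatever text_poke() will need */
        pr_info("example: text-poking state initialized\n");
}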
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ff09d32a8a1b..c605397c79f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
848 if (fp->jited) { 848 if (fp->jited) {
849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); 849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
850 850
851 bpf_jit_binary_unlock_ro(hdr);
852 bpf_jit_binary_free(hdr); 851 bpf_jit_binary_free(hdr);
853 852
854 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); 853 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c5cde87329c7..e6a0d6be87e3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */ 2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
2295}; 2295};
2296 2296
2297static int __init init_uprobes(void) 2297void __init uprobes_init(void)
2298{ 2298{
2299 int i; 2299 int i;
2300 2300
2301 for (i = 0; i < UPROBES_HASH_SZ; i++) 2301 for (i = 0; i < UPROBES_HASH_SZ; i++)
2302 mutex_init(&uprobes_mmap_mutex[i]); 2302 mutex_init(&uprobes_mmap_mutex[i]);
2303 2303
2304 if (percpu_init_rwsem(&dup_mmap_sem)) 2304 BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
2305 return -ENOMEM;
2306 2305
2307 return register_die_notifier(&uprobe_exception_nb); 2306 BUG_ON(register_die_notifier(&uprobe_exception_nb));
2308} 2307}
2309__initcall(init_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..fbe9dfcd8680 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ void __init fork_init(void)
815#endif 815#endif
816 816
817 lockdep_init_task(&init_task); 817 lockdep_init_task(&init_task);
818 uprobes_init();
818} 819}
819 820
820int __weak arch_dup_task_struct(struct task_struct *dst, 821int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1298 complete_vfork_done(tsk); 1299 complete_vfork_done(tsk);
1299} 1300}
1300 1301
1301/* 1302/**
1302 * Allocate a new mm structure and copy contents from the 1303 * dup_mm() - duplicates an existing mm structure
1303 * mm structure of the passed in task structure. 1304 * @tsk: the task_struct with which the new mm will be associated.
1305 * @oldmm: the mm to duplicate.
1306 *
1307 * Allocates a new mm structure and duplicates the provided @oldmm structure
1308 * content into it.
1309 *
1310 * Return: the duplicated mm or NULL on failure.
1304 */ 1311 */
1305static struct mm_struct *dup_mm(struct task_struct *tsk) 1312static struct mm_struct *dup_mm(struct task_struct *tsk,
1313 struct mm_struct *oldmm)
1306{ 1314{
1307 struct mm_struct *mm, *oldmm = current->mm; 1315 struct mm_struct *mm;
1308 int err; 1316 int err;
1309 1317
1310 mm = allocate_mm(); 1318 mm = allocate_mm();
@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1371 } 1379 }
1372 1380
1373 retval = -ENOMEM; 1381 retval = -ENOMEM;
1374 mm = dup_mm(tsk); 1382 mm = dup_mm(tsk, current->mm);
1375 if (!mm) 1383 if (!mm)
1376 goto fail_nomem; 1384 goto fail_nomem;
1377 1385
@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
2186 return task; 2194 return task;
2187} 2195}
2188 2196
2197struct mm_struct *copy_init_mm(void)
2198{
2199 return dup_mm(NULL, &init_mm);
2200}
2201
2189/* 2202/*
2190 * Ok, this is the main fork-routine. 2203 * Ok, this is the main fork-routine.
2191 * 2204 *
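copy_init_mm() hands callers a private mm that starts as a duplicate of init_mm, which is what the x86 text-poking code elsewhere in this series uses to build its temporary patching address space. A hedged sketch of such a consumer (simplified and renamed; the real user is poking_init() in arch/x86/kernel/alternative.c):

#include <linux/sched/task.h>
#include <linux/mm_types.h>

static struct mm_struct *example_patching_mm;

/* Sketch: create a kernel-only mm for short-lived, CPU-local mappings. */
static int __init example_patching_init(void)
{
        example_patching_mm = copy_init_mm();
        if (!example_patching_mm)
                return -ENOMEM;         /* dup_mm() returned NULL */

        /* pages are mapped into this mm only while patching runs */
        return 0;
}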
diff --git a/kernel/module.c b/kernel/module.c
index 0b9aa8ab89f0..a9020bdd4cf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
98EXPORT_SYMBOL_GPL(module_mutex); 98EXPORT_SYMBOL_GPL(module_mutex);
99static LIST_HEAD(modules); 99static LIST_HEAD(modules);
100 100
101/* Work queue for freeing init sections in success case */
102static struct work_struct init_free_wq;
103static struct llist_head init_free_list;
104
101#ifdef CONFIG_MODULES_TREE_LOOKUP 105#ifdef CONFIG_MODULES_TREE_LOOKUP
102 106
103/* 107/*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
1949 if (!rodata_enabled) 1953 if (!rodata_enabled)
1950 return; 1954 return;
1951 1955
1956 set_vm_flush_reset_perms(mod->core_layout.base);
1957 set_vm_flush_reset_perms(mod->init_layout.base);
1952 frob_text(&mod->core_layout, set_memory_ro); 1958 frob_text(&mod->core_layout, set_memory_ro);
1959 frob_text(&mod->core_layout, set_memory_x);
1960
1953 frob_rodata(&mod->core_layout, set_memory_ro); 1961 frob_rodata(&mod->core_layout, set_memory_ro);
1962
1954 frob_text(&mod->init_layout, set_memory_ro); 1963 frob_text(&mod->init_layout, set_memory_ro);
1964 frob_text(&mod->init_layout, set_memory_x);
1965
1955 frob_rodata(&mod->init_layout, set_memory_ro); 1966 frob_rodata(&mod->init_layout, set_memory_ro);
1956 1967
1957 if (after_init) 1968 if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
1967 frob_writable_data(&mod->init_layout, set_memory_nx); 1978 frob_writable_data(&mod->init_layout, set_memory_nx);
1968} 1979}
1969 1980
1970static void module_disable_nx(const struct module *mod)
1971{
1972 frob_rodata(&mod->core_layout, set_memory_x);
1973 frob_ro_after_init(&mod->core_layout, set_memory_x);
1974 frob_writable_data(&mod->core_layout, set_memory_x);
1975 frob_rodata(&mod->init_layout, set_memory_x);
1976 frob_writable_data(&mod->init_layout, set_memory_x);
1977}
1978
1979/* Iterate through all modules and set each module's text as RW */ 1981/* Iterate through all modules and set each module's text as RW */
1980void set_all_modules_text_rw(void) 1982void set_all_modules_text_rw(void)
1981{ 1983{
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
2019 } 2021 }
2020 mutex_unlock(&module_mutex); 2022 mutex_unlock(&module_mutex);
2021} 2023}
2022
2023static void disable_ro_nx(const struct module_layout *layout)
2024{
2025 if (rodata_enabled) {
2026 frob_text(layout, set_memory_rw);
2027 frob_rodata(layout, set_memory_rw);
2028 frob_ro_after_init(layout, set_memory_rw);
2029 }
2030 frob_rodata(layout, set_memory_x);
2031 frob_ro_after_init(layout, set_memory_x);
2032 frob_writable_data(layout, set_memory_x);
2033}
2034
2035#else 2024#else
2036static void disable_ro_nx(const struct module_layout *layout) { }
2037static void module_enable_nx(const struct module *mod) { } 2025static void module_enable_nx(const struct module *mod) { }
2038static void module_disable_nx(const struct module *mod) { }
2039#endif 2026#endif
2040 2027
2041#ifdef CONFIG_LIVEPATCH 2028#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
2115 2102
2116void __weak module_memfree(void *module_region) 2103void __weak module_memfree(void *module_region)
2117{ 2104{
2105 /*
2106 * This memory may be RO, and freeing RO memory in an interrupt is not
2107 * supported by vmalloc.
2108 */
2109 WARN_ON(in_interrupt());
2118 vfree(module_region); 2110 vfree(module_region);
2119} 2111}
2120 2112
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
2166 mutex_unlock(&module_mutex); 2158 mutex_unlock(&module_mutex);
2167 2159
2168 /* This may be empty, but that's OK */ 2160 /* This may be empty, but that's OK */
2169 disable_ro_nx(&mod->init_layout);
2170 module_arch_freeing_init(mod); 2161 module_arch_freeing_init(mod);
2171 module_memfree(mod->init_layout.base); 2162 module_memfree(mod->init_layout.base);
2172 kfree(mod->args); 2163 kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
2176 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); 2167 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
2177 2168
2178 /* Finally, free the core (containing the module structure) */ 2169 /* Finally, free the core (containing the module structure) */
2179 disable_ro_nx(&mod->core_layout);
2180 module_memfree(mod->core_layout.base); 2170 module_memfree(mod->core_layout.base);
2181} 2171}
2182 2172
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
3415 3405
3416/* For freeing module_init on success, in case kallsyms is traversing it */ 3406/* For freeing module_init on success, in case kallsyms is traversing it */
3417struct mod_initfree { 3407struct mod_initfree {
3418 struct rcu_head rcu; 3408 struct llist_node node;
3419 void *module_init; 3409 void *module_init;
3420}; 3410};
3421 3411
3422static void do_free_init(struct rcu_head *head) 3412static void do_free_init(struct work_struct *w)
3423{ 3413{
3424 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); 3414 struct llist_node *pos, *n, *list;
3425 module_memfree(m->module_init); 3415 struct mod_initfree *initfree;
3426 kfree(m); 3416
3417 list = llist_del_all(&init_free_list);
3418
3419 synchronize_rcu();
3420
3421 llist_for_each_safe(pos, n, list) {
3422 initfree = container_of(pos, struct mod_initfree, node);
3423 module_memfree(initfree->module_init);
3424 kfree(initfree);
3425 }
3427} 3426}
3428 3427
3428static int __init modules_wq_init(void)
3429{
3430 INIT_WORK(&init_free_wq, do_free_init);
3431 init_llist_head(&init_free_list);
3432 return 0;
3433}
3434module_init(modules_wq_init);
3435
3429/* 3436/*
3430 * This is where the real work happens. 3437 * This is where the real work happens.
3431 * 3438 *
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
3502#endif 3509#endif
3503 module_enable_ro(mod, true); 3510 module_enable_ro(mod, true);
3504 mod_tree_remove_init(mod); 3511 mod_tree_remove_init(mod);
3505 disable_ro_nx(&mod->init_layout);
3506 module_arch_freeing_init(mod); 3512 module_arch_freeing_init(mod);
3507 mod->init_layout.base = NULL; 3513 mod->init_layout.base = NULL;
3508 mod->init_layout.size = 0; 3514 mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
3513 * We want to free module_init, but be aware that kallsyms may be 3519 * We want to free module_init, but be aware that kallsyms may be
3514 * walking this with preempt disabled. In all the failure paths, we 3520 * walking this with preempt disabled. In all the failure paths, we
3515 * call synchronize_rcu(), but we don't want to slow down the success 3521 * call synchronize_rcu(), but we don't want to slow down the success
3516 * path, so use actual RCU here. 3522 * path. module_memfree() cannot be called in an interrupt, so do the
3523 * work and call synchronize_rcu() in a work queue.
3524 *
3517 * Note that module_alloc() on most architectures creates W+X page 3525 * Note that module_alloc() on most architectures creates W+X page
3518 * mappings which won't be cleaned up until do_free_init() runs. Any 3526 * mappings which won't be cleaned up until do_free_init() runs. Any
3519 * code such as mark_rodata_ro() which depends on those mappings to 3527 * code such as mark_rodata_ro() which depends on those mappings to
3520 * be cleaned up needs to sync with the queued work - ie 3528 * be cleaned up needs to sync with the queued work - ie
3521 * rcu_barrier() 3529 * rcu_barrier()
3522 */ 3530 */
3523 call_rcu(&freeinit->rcu, do_free_init); 3531 if (llist_add(&freeinit->node, &init_free_list))
3532 schedule_work(&init_free_wq);
3533
3524 mutex_unlock(&module_mutex); 3534 mutex_unlock(&module_mutex);
3525 wake_up_all(&module_wq); 3535 wake_up_all(&module_wq);
3526 3536
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
3817 module_bug_cleanup(mod); 3827 module_bug_cleanup(mod);
3818 mutex_unlock(&module_mutex); 3828 mutex_unlock(&module_mutex);
3819 3829
3820 /* we can't deallocate the module until we clear memory protection */
3821 module_disable_ro(mod);
3822 module_disable_nx(mod);
3823
3824 ddebug_cleanup: 3830 ddebug_cleanup:
3825 ftrace_release_mod(mod); 3831 ftrace_release_mod(mod);
3826 dynamic_debug_remove(mod, info->debug); 3832 dynamic_debug_remove(mod, info->debug);
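The do_free_init() rework above is an instance of a reusable pattern: producers push nodes onto a lock-free llist from whatever context they are in, and a single work item drains the list in process context, where synchronize_rcu() and module_memfree()/vfree() are allowed. An isolated sketch with illustrative names (initialization mirrors modules_wq_init() above):

#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

struct example_deferred_buf {
        struct llist_node node;
        void *mem;                      /* vmalloc'ed region to release */
};

static LLIST_HEAD(example_free_list);
static struct work_struct example_free_wq;

static void example_do_free(struct work_struct *w)
{
        struct llist_node *pos, *n, *list;
        struct example_deferred_buf *buf;

        list = llist_del_all(&example_free_list);

        /* wait out lockless readers first, as do_free_init() does */
        synchronize_rcu();

        llist_for_each_safe(pos, n, list) {
                buf = llist_entry(pos, struct example_deferred_buf, node);
                vfree(buf->mem);
                kfree(buf);
        }
}

/* INIT_WORK(&example_free_wq, example_do_free) must run once at init,
 * just as modules_wq_init() does above. */
static void example_queue_free(struct example_deferred_buf *buf)
{
        /* llist_add() returns true only when the list was empty */
        if (llist_add(&buf->node, &example_free_list))
                schedule_work(&example_free_wq);
}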
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f08a1e4ee1d4..bc9558ab1e5b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
1342 * safe_copy_page - Copy a page in a safe way. 1342 * safe_copy_page - Copy a page in a safe way.
1343 * 1343 *
1344 * Check if the page we are going to copy is marked as present in the kernel 1344 * Check if the page we are going to copy is marked as present in the kernel
1345 * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set 1345 * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
1346 * and in that case kernel_page_present() always returns 'true'). 1346 * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
1347 * always returns 'true'.
1347 */ 1348 */
1348static void safe_copy_page(void *dst, struct page *s_page) 1349static void safe_copy_page(void *dst, struct page *s_page)
1349{ 1350{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..94b0e37d90ef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/error-injection.h> 15#include <linux/error-injection.h>
16 16
17#include <asm/tlb.h>
18
17#include "trace_probe.h" 19#include "trace_probe.h"
18#include "trace.h" 20#include "trace.h"
19 21
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
163 * access_ok() should prevent writing to non-user memory, but in 165 * access_ok() should prevent writing to non-user memory, but in
164 * some situations (nommu, temporary switch, etc) access_ok() does 166 * some situations (nommu, temporary switch, etc) access_ok() does
165 * not provide enough validation, hence the check on KERNEL_DS. 167 * not provide enough validation, hence the check on KERNEL_DS.
168 *
169 * nmi_uaccess_okay() ensures the probe is not run in an interim
170 * state, when the task or mm are switched. This is specifically
171 * required to prevent the use of temporary mm.
166 */ 172 */
167 173
168 if (unlikely(in_interrupt() || 174 if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
170 return -EPERM; 176 return -EPERM;
171 if (unlikely(uaccess_kernel())) 177 if (unlikely(uaccess_kernel()))
172 return -EPERM; 178 return -EPERM;
179 if (unlikely(!nmi_uaccess_okay()))
180 return -EPERM;
173 if (!access_ok(unsafe_ptr, size)) 181 if (!access_ok(unsafe_ptr, size))
174 return -EPERM; 182 return -EPERM;
175 183
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c02cff1ed56e..59661106da16 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
1144 } 1144 }
1145 arch_free_page(page, order); 1145 arch_free_page(page, order);
1146 kernel_poison_pages(page, 1 << order, 0); 1146 kernel_poison_pages(page, 1 << order, 0);
1147 kernel_map_pages(page, 1 << order, 0); 1147 if (debug_pagealloc_enabled())
1148 kernel_map_pages(page, 1 << order, 0);
1149
1148 kasan_free_nondeferred_pages(page, order); 1150 kasan_free_nondeferred_pages(page, order);
1149 1151
1150 return true; 1152 return true;
@@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
2014 set_page_refcounted(page); 2016 set_page_refcounted(page);
2015 2017
2016 arch_alloc_page(page, order); 2018 arch_alloc_page(page, order);
2017 kernel_map_pages(page, 1 << order, 1); 2019 if (debug_pagealloc_enabled())
2020 kernel_map_pages(page, 1 << order, 1);
2018 kasan_alloc_pages(page, order); 2021 kasan_alloc_pages(page, order);
2019 kernel_poison_pages(page, 1 << order, 1); 2022 kernel_poison_pages(page, 1 << order, 1);
2020 set_page_owner(page, order, gfp_flags); 2023 set_page_owner(page, order, gfp_flags);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e86ba6e74b50..e5e9e1fcac01 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -18,6 +18,7 @@
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19#include <linux/proc_fs.h> 19#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/set_memory.h>
21#include <linux/debugobjects.h> 22#include <linux/debugobjects.h>
22#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
23#include <linux/list.h> 24#include <linux/list.h>
@@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size)
1059 spin_unlock(&vb->lock); 1060 spin_unlock(&vb->lock);
1060} 1061}
1061 1062
1062/** 1063static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
1063 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1064 *
1065 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1066 * to amortize TLB flushing overheads. What this means is that any page you
1067 * have now, may, in a former life, have been mapped into kernel virtual
1068 * address by the vmap layer and so there might be some CPUs with TLB entries
1069 * still referencing that page (additional to the regular 1:1 kernel mapping).
1070 *
1071 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1072 * be sure that none of the pages we have control over will have any aliases
1073 * from the vmap layer.
1074 */
1075void vm_unmap_aliases(void)
1076{ 1064{
1077 unsigned long start = ULONG_MAX, end = 0;
1078 int cpu; 1065 int cpu;
1079 int flush = 0;
1080 1066
1081 if (unlikely(!vmap_initialized)) 1067 if (unlikely(!vmap_initialized))
1082 return; 1068 return;
@@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void)
1113 flush_tlb_kernel_range(start, end); 1099 flush_tlb_kernel_range(start, end);
1114 mutex_unlock(&vmap_purge_lock); 1100 mutex_unlock(&vmap_purge_lock);
1115} 1101}
1102
1103/**
1104 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1105 *
1106 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1107 * to amortize TLB flushing overheads. What this means is that any page you
1108 * have now, may, in a former life, have been mapped into kernel virtual
1109 * address by the vmap layer and so there might be some CPUs with TLB entries
1110 * still referencing that page (additional to the regular 1:1 kernel mapping).
1111 *
1112 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1113 * be sure that none of the pages we have control over will have any aliases
1114 * from the vmap layer.
1115 */
1116void vm_unmap_aliases(void)
1117{
1118 unsigned long start = ULONG_MAX, end = 0;
1119 int flush = 0;
1120
1121 _vm_unmap_aliases(start, end, flush);
1122}
1116EXPORT_SYMBOL_GPL(vm_unmap_aliases); 1123EXPORT_SYMBOL_GPL(vm_unmap_aliases);
1117 1124
1118/** 1125/**
@@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr)
1505 return NULL; 1512 return NULL;
1506} 1513}
1507 1514
1515static inline void set_area_direct_map(const struct vm_struct *area,
1516 int (*set_direct_map)(struct page *page))
1517{
1518 int i;
1519
1520 for (i = 0; i < area->nr_pages; i++)
1521 if (page_address(area->pages[i]))
1522 set_direct_map(area->pages[i]);
1523}
1524
1525/* Handle removing and resetting vm mappings related to the vm_struct. */
1526static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
1527{
1528 unsigned long addr = (unsigned long)area->addr;
1529 unsigned long start = ULONG_MAX, end = 0;
1530 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
1531 int i;
1532
1533 /*
1534 * The below block can be removed when all architectures that have
1535 * direct map permissions also have set_direct_map_() implementations.
1537	 * This is concerned with resetting the direct map for any vm alias with
1537 * execute permissions, without leaving a RW+X window.
1538 */
1539 if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
1540 set_memory_nx(addr, area->nr_pages);
1541 set_memory_rw(addr, area->nr_pages);
1542 }
1543
1544 remove_vm_area(area->addr);
1545
1546 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
1547 if (!flush_reset)
1548 return;
1549
1550 /*
1551 * If not deallocating pages, just do the flush of the VM area and
1552 * return.
1553 */
1554 if (!deallocate_pages) {
1555 vm_unmap_aliases();
1556 return;
1557 }
1558
1559 /*
1560 * If execution gets here, flush the vm mapping and reset the direct
1561 * map. Find the start and end range of the direct mappings to make sure
1562 * the vm_unmap_aliases() flush includes the direct map.
1563 */
1564 for (i = 0; i < area->nr_pages; i++) {
1565 if (page_address(area->pages[i])) {
1566 start = min(addr, start);
1567 end = max(addr, end);
1568 }
1569 }
1570
1571 /*
1572 * Set direct map to something invalid so that it won't be cached if
1573 * there are any accesses after the TLB flush, then flush the TLB and
1574 * reset the direct map permissions to the default.
1575 */
1576 set_area_direct_map(area, set_direct_map_invalid_noflush);
1577 _vm_unmap_aliases(start, end, 1);
1578 set_area_direct_map(area, set_direct_map_default_noflush);
1579}
1580
1508static void __vunmap(const void *addr, int deallocate_pages) 1581static void __vunmap(const void *addr, int deallocate_pages)
1509{ 1582{
1510 struct vm_struct *area; 1583 struct vm_struct *area;
@@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
1526 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 1599 debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
1527 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 1600 debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
1528 1601
1529 remove_vm_area(addr); 1602 vm_remove_mappings(area, deallocate_pages);
1603
1530 if (deallocate_pages) { 1604 if (deallocate_pages) {
1531 int i; 1605 int i;
1532 1606
@@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node);
1961 */ 2035 */
1962void *vmalloc_exec(unsigned long size) 2036void *vmalloc_exec(unsigned long size)
1963{ 2037{
1964 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, 2038 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
1965 NUMA_NO_NODE, __builtin_return_address(0)); 2039 GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
2040 NUMA_NO_NODE, __builtin_return_address(0));
1966} 2041}
1967 2042
1968#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 2043#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)