author     Linus Torvalds <torvalds@linux-foundation.org>  2019-05-06 19:13:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-05-06 19:13:31 -0400
commit     0bc40e549aeea2de20fc571749de9bbfc099fb34
tree       d18f3339bd383a17431fca23b6c5f3e54c93cf2f
parent     e913c4a4c21cd83317fafe63bfdc9d34d2910114
parent     caa841360134f863987f2d4f77b8dc2fbb7596f8

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The changes in here are:
- text_poke() fixes and an extensive set of executability lockdowns,
to (hopefully) eliminate the last residual circumstances under
which we are using W|X mappings even temporarily on x86 kernels.
This required a broad range of surgery in text patching facilities,
module loading, trampoline handling and other bits.
- tweak page fault messages to be more informative and more
structured.
- remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
default.
- reduce KASLR granularity on 5-level paging kernels from 512 GB to
1 GB.
- misc other changes and updates"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
x86/mm: Initialize PGD cache during mm initialization
x86/alternatives: Add comment about module removal races
x86/kprobes: Use vmalloc special flag
x86/ftrace: Use vmalloc special flag
bpf: Use vmalloc special flag
modules: Use vmalloc special flag
mm/vmalloc: Add flag for freeing of special permsissions
mm/hibernation: Make hibernation handle unmapped pages
x86/mm/cpa: Add set_direct_map_*() functions
x86/alternatives: Remove the return value of text_poke_*()
x86/jump-label: Remove support for custom text poker
x86/modules: Avoid breaking W^X while loading modules
x86/kprobes: Set instruction page as executable
x86/ftrace: Set trampoline pages as executable
x86/kgdb: Avoid redundant comparison of patched code
x86/alternatives: Use temporary mm for text poking
x86/alternatives: Initialize temporary mm for patching
fork: Provide a function for copying init_mm
uprobes: Initialize uprobes earlier
x86/mm: Save debug registers when loading a temporary mm
...
40 files changed, 711 insertions(+), 343 deletions(-)
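
Before reading the diff itself, the following is a condensed sketch of the new patching flow that the text_poke() rework below introduces: the target page is aliased, writable, at a per-boot random address (poking_addr) inside a dedicated mm (poking_mm), the CPU switches to that mm only for the duration of the memcpy(), and the alias is then torn down. This is simplified from the arch/x86/kernel/alternative.c changes in this diff, not the verbatim kernel code; the helper name text_poke_sketch is made up for illustration, and the cross-page case, KASAN/hardware-breakpoint handling and the sanity checks are omitted.

    /*
     * Simplified sketch of the __text_poke() flow added below; single-page
     * case only, no error handling.
     */
    static void *text_poke_sketch(void *addr, const void *opcode, size_t len)
    {
        struct page *page = core_kernel_text((unsigned long)addr) ?
                            virt_to_page(addr) : vmalloc_to_page(addr);
        pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
        temp_mm_state_t prev;
        unsigned long flags;
        spinlock_t *ptl;
        pte_t *ptep;

        local_irq_save(flags);

        /* Create a writable alias of the target page, visible only in poking_mm. */
        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
        set_pte_at(poking_mm, poking_addr, ptep, mk_pte(page, prot));

        /* Switch this CPU to poking_mm; other CPUs never load it. */
        prev = use_temporary_mm(poking_mm);
        memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
        unuse_temporary_mm(prev);

        /* Tear the alias down; the flush stays local since poking_mm is unused. */
        pte_clear(poking_mm, poking_addr, ptep);
        flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + PAGE_SIZE,
                           PAGE_SHIFT, false);
        pte_unmap_unlock(ptep, ptl);

        local_irq_restore(flags);
        return addr;
    }

Contrast this with the old text_poke() removed below, which mapped the target page through the FIX_TEXT_POKE0/1 fixmap slots in the shared kernel mapping, so the writable alias was in principle reachable from every CPU while patching was in progress.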
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 804f9426ed17..6cbe652d7a49 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables
72 | Notes: | 72 | Notes: |
73 | 73 | ||
74 | - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, | 74 | - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, |
75 | from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting | 75 | from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting |
76 | offset and many of the regions expand to support the much larger physical | 76 | offset and many of the regions expand to support the much larger physical |
77 | memory supported. | 77 | memory supported. |
78 | 78 | ||
@@ -83,7 +83,7 @@ Notes:
83 | 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm | 83 | 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm |
84 | __________________|____________|__________________|_________|___________________________________________________________ | 84 | __________________|____________|__________________|_________|___________________________________________________________ |
85 | | | | | | 85 | | | | | |
86 | 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical | 86 | 0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical |
87 | | | | | virtual memory addresses up to the -64 PB | 87 | | | | | virtual memory addresses up to the -64 PB |
88 | | | | | starting offset of kernel mappings. | 88 | | | | | starting offset of kernel mappings. |
89 | __________________|____________|__________________|_________|___________________________________________________________ | 89 | __________________|____________|__________________|_________|___________________________________________________________ |
@@ -99,7 +99,7 @@ ____________________________________________________________|___________________
99 | ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole | 99 | ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole |
100 | ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) | 100 | ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) |
101 | ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole | 101 | ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole |
102 | ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory | 102 | ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory |
103 | __________________|____________|__________________|_________|____________________________________________________________ | 103 | __________________|____________|__________________|_________|____________________________________________________________ |
104 | | | 104 | | |
105 | | Identical layout to the 47-bit one from here on: | 105 | | Identical layout to the 47-bit one from here on: |
diff --git a/arch/Kconfig b/arch/Kconfig
index 3ab446bd12ef..5e43fcbad4ca 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE
249 | config ARCH_HAS_SET_MEMORY | 249 | config ARCH_HAS_SET_MEMORY |
250 | bool | 250 | bool |
251 | 251 | ||
252 | # Select if arch has all set_direct_map_invalid/default() functions | ||
253 | config ARCH_HAS_SET_DIRECT_MAP | ||
254 | bool | ||
255 | |||
252 | # Select if arch init_task must go in the __init_task_data section | 256 | # Select if arch init_task must go in the __init_task_data section |
253 | config ARCH_TASK_STRUCT_ON_STACK | 257 | config ARCH_TASK_STRUCT_ON_STACK |
254 | bool | 258 | bool |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db95da6d644d..9fc73ca17844 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -65,6 +65,7 @@ config X86
65 | select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 | 65 | select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 |
66 | select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE | 66 | select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE |
67 | select ARCH_HAS_SET_MEMORY | 67 | select ARCH_HAS_SET_MEMORY |
68 | select ARCH_HAS_SET_DIRECT_MAP | ||
68 | select ARCH_HAS_STRICT_KERNEL_RWX | 69 | select ARCH_HAS_STRICT_KERNEL_RWX |
69 | select ARCH_HAS_STRICT_MODULE_RWX | 70 | select ARCH_HAS_STRICT_MODULE_RWX |
70 | select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE | 71 | select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE |
@@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE
1592 | depends on X86_32 && !NUMA | 1593 | depends on X86_32 && !NUMA |
1593 | 1594 | ||
1594 | config ARCH_DISCONTIGMEM_ENABLE | 1595 | config ARCH_DISCONTIGMEM_ENABLE |
1595 | def_bool y | 1596 | def_bool n |
1596 | depends on NUMA && X86_32 | ||
1597 | |||
1598 | config ARCH_DISCONTIGMEM_DEFAULT | ||
1599 | def_bool y | ||
1600 | depends on NUMA && X86_32 | 1597 | depends on NUMA && X86_32 |
1598 | depends on BROKEN | ||
1601 | 1599 | ||
1602 | config ARCH_SPARSEMEM_ENABLE | 1600 | config ARCH_SPARSEMEM_ENABLE |
1603 | def_bool y | 1601 | def_bool y |
@@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE
1606 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 | 1604 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 |
1607 | 1605 | ||
1608 | config ARCH_SPARSEMEM_DEFAULT | 1606 | config ARCH_SPARSEMEM_DEFAULT |
1609 | def_bool y | 1607 | def_bool X86_64 || (NUMA && X86_32) |
1610 | depends on X86_64 | ||
1611 | 1608 | ||
1612 | config ARCH_SELECT_MEMORY_MODEL | 1609 | config ARCH_SELECT_MEMORY_MODEL |
1613 | def_bool y | 1610 | def_bool y |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 50ba74a34a37..9da8cccdf3fb 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -103,8 +103,6 @@ enum fixed_addresses {
103 | #ifdef CONFIG_PARAVIRT | 103 | #ifdef CONFIG_PARAVIRT |
104 | FIX_PARAVIRT_BOOTMAP, | 104 | FIX_PARAVIRT_BOOTMAP, |
105 | #endif | 105 | #endif |
106 | FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ | ||
107 | FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ | ||
108 | #ifdef CONFIG_X86_INTEL_MID | 106 | #ifdef CONFIG_X86_INTEL_MID |
109 | FIX_LNW_VRTC, | 107 | FIX_LNW_VRTC, |
110 | #endif | 108 | #endif |
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 19d18fae6ec6..93dff1963337 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
13 | #include <asm/tlbflush.h> | 13 | #include <asm/tlbflush.h> |
14 | #include <asm/paravirt.h> | 14 | #include <asm/paravirt.h> |
15 | #include <asm/mpx.h> | 15 | #include <asm/mpx.h> |
16 | #include <asm/debugreg.h> | ||
16 | 17 | ||
17 | extern atomic64_t last_mm_ctx_id; | 18 | extern atomic64_t last_mm_ctx_id; |
18 | 19 | ||
@@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void)
356 | return cr3; | 357 | return cr3; |
357 | } | 358 | } |
358 | 359 | ||
360 | typedef struct { | ||
361 | struct mm_struct *mm; | ||
362 | } temp_mm_state_t; | ||
363 | |||
364 | /* | ||
365 | * Using a temporary mm allows to set temporary mappings that are not accessible | ||
366 | * by other CPUs. Such mappings are needed to perform sensitive memory writes | ||
367 | * that override the kernel memory protections (e.g., W^X), without exposing the | ||
368 | * temporary page-table mappings that are required for these write operations to | ||
369 | * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the | ||
370 | * mapping is torn down. | ||
371 | * | ||
372 | * Context: The temporary mm needs to be used exclusively by a single core. To | ||
373 | * harden security IRQs must be disabled while the temporary mm is | ||
374 | * loaded, thereby preventing interrupt handler bugs from overriding | ||
375 | * the kernel memory protection. | ||
376 | */ | ||
377 | static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) | ||
378 | { | ||
379 | temp_mm_state_t temp_state; | ||
380 | |||
381 | lockdep_assert_irqs_disabled(); | ||
382 | temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); | ||
383 | switch_mm_irqs_off(NULL, mm, current); | ||
384 | |||
385 | /* | ||
386 | * If breakpoints are enabled, disable them while the temporary mm is | ||
387 | * used. Userspace might set up watchpoints on addresses that are used | ||
388 | * in the temporary mm, which would lead to wrong signals being sent or | ||
389 | * crashes. | ||
390 | * | ||
391 | * Note that breakpoints are not disabled selectively, which also causes | ||
392 | * kernel breakpoints (e.g., perf's) to be disabled. This might be | ||
393 | * undesirable, but still seems reasonable as the code that runs in the | ||
394 | * temporary mm should be short. | ||
395 | */ | ||
396 | if (hw_breakpoint_active()) | ||
397 | hw_breakpoint_disable(); | ||
398 | |||
399 | return temp_state; | ||
400 | } | ||
401 | |||
402 | static inline void unuse_temporary_mm(temp_mm_state_t prev_state) | ||
403 | { | ||
404 | lockdep_assert_irqs_disabled(); | ||
405 | switch_mm_irqs_off(NULL, prev_state.mm, current); | ||
406 | |||
407 | /* | ||
408 | * Restore the breakpoints if they were disabled before the temporary mm | ||
409 | * was loaded. | ||
410 | */ | ||
411 | if (hw_breakpoint_active()) | ||
412 | hw_breakpoint_restore(); | ||
413 | } | ||
414 | |||
359 | #endif /* _ASM_X86_MMU_CONTEXT_H */ | 415 | #endif /* _ASM_X86_MMU_CONTEXT_H */ |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 50b3e2d963c9..3a221942f805 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
1021 | /* Default trampoline pgd value */ | 1021 | /* Default trampoline pgd value */ |
1022 | trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; | 1022 | trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; |
1023 | } | 1023 | } |
1024 | |||
1025 | void __init poking_init(void); | ||
1026 | |||
1024 | # ifdef CONFIG_RANDOMIZE_MEMORY | 1027 | # ifdef CONFIG_RANDOMIZE_MEMORY |
1025 | void __meminit init_trampoline(void); | 1028 | void __meminit init_trampoline(void); |
1026 | # else | 1029 | # else |
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 07a25753e85c..ae7b909dc242 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages);
85 | int set_pages_ro(struct page *page, int numpages); | 85 | int set_pages_ro(struct page *page, int numpages); |
86 | int set_pages_rw(struct page *page, int numpages); | 86 | int set_pages_rw(struct page *page, int numpages); |
87 | 87 | ||
88 | int set_direct_map_invalid_noflush(struct page *page); | ||
89 | int set_direct_map_default_noflush(struct page *page); | ||
90 | |||
88 | extern int kernel_set_to_readonly; | 91 | extern int kernel_set_to_readonly; |
89 | void set_kernel_text_rw(void); | 92 | void set_kernel_text_rw(void); |
90 | void set_kernel_text_ro(void); | 93 | void set_kernel_text_ro(void); |
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..c90678fd391a 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
18 | #define __parainstructions_end NULL | 18 | #define __parainstructions_end NULL |
19 | #endif | 19 | #endif |
20 | 20 | ||
21 | extern void *text_poke_early(void *addr, const void *opcode, size_t len); | 21 | extern void text_poke_early(void *addr, const void *opcode, size_t len); |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * Clear and restore the kernel write-protection flag on the local CPU. | 24 | * Clear and restore the kernel write-protection flag on the local CPU. |
@@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
35 | * inconsistent instruction while you patch. | 35 | * inconsistent instruction while you patch. |
36 | */ | 36 | */ |
37 | extern void *text_poke(void *addr, const void *opcode, size_t len); | 37 | extern void *text_poke(void *addr, const void *opcode, size_t len); |
38 | extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); | ||
38 | extern int poke_int3_handler(struct pt_regs *regs); | 39 | extern int poke_int3_handler(struct pt_regs *regs); |
39 | extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); | 40 | extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); |
40 | extern int after_bootmem; | 41 | extern int after_bootmem; |
42 | extern __ro_after_init struct mm_struct *poking_mm; | ||
43 | extern __ro_after_init unsigned long poking_addr; | ||
41 | 44 | ||
42 | #endif /* _ASM_X86_TEXT_PATCHING_H */ | 45 | #endif /* _ASM_X86_TEXT_PATCHING_H */ |
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 90926e8dd1f8..dee375831962 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void)
274 | return true; | 274 | return true; |
275 | } | 275 | } |
276 | 276 | ||
277 | #define nmi_uaccess_okay nmi_uaccess_okay | ||
278 | |||
277 | /* Initialize cr4 shadow for this CPU. */ | 279 | /* Initialize cr4 shadow for this CPU. */ |
278 | static inline void cr4_init_shadow(void) | 280 | static inline void cr4_init_shadow(void) |
279 | { | 281 | { |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a79c7808f9c..7b9b49dfc05a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -12,6 +12,7 @@
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/kdebug.h> | 13 | #include <linux/kdebug.h> |
14 | #include <linux/kprobes.h> | 14 | #include <linux/kprobes.h> |
15 | #include <linux/mmu_context.h> | ||
15 | #include <asm/text-patching.h> | 16 | #include <asm/text-patching.h> |
16 | #include <asm/alternative.h> | 17 | #include <asm/alternative.h> |
17 | #include <asm/sections.h> | 18 | #include <asm/sections.h> |
@@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
264 | 265 | ||
265 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | 266 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; |
266 | extern s32 __smp_locks[], __smp_locks_end[]; | 267 | extern s32 __smp_locks[], __smp_locks_end[]; |
267 | void *text_poke_early(void *addr, const void *opcode, size_t len); | 268 | void text_poke_early(void *addr, const void *opcode, size_t len); |
268 | 269 | ||
269 | /* | 270 | /* |
270 | * Are we looking at a near JMP with a 1 or 4-byte displacement. | 271 | * Are we looking at a near JMP with a 1 or 4-byte displacement. |
@@ -666,16 +667,136 @@ void __init alternative_instructions(void)
666 | * instructions. And on the local CPU you need to be protected again NMI or MCE | 667 | * instructions. And on the local CPU you need to be protected again NMI or MCE |
667 | * handlers seeing an inconsistent instruction while you patch. | 668 | * handlers seeing an inconsistent instruction while you patch. |
668 | */ | 669 | */ |
669 | void *__init_or_module text_poke_early(void *addr, const void *opcode, | 670 | void __init_or_module text_poke_early(void *addr, const void *opcode, |
670 | size_t len) | 671 | size_t len) |
671 | { | 672 | { |
672 | unsigned long flags; | 673 | unsigned long flags; |
674 | |||
675 | if (boot_cpu_has(X86_FEATURE_NX) && | ||
676 | is_module_text_address((unsigned long)addr)) { | ||
677 | /* | ||
678 | * Modules text is marked initially as non-executable, so the | ||
679 | * code cannot be running and speculative code-fetches are | ||
680 | * prevented. Just change the code. | ||
681 | */ | ||
682 | memcpy(addr, opcode, len); | ||
683 | } else { | ||
684 | local_irq_save(flags); | ||
685 | memcpy(addr, opcode, len); | ||
686 | local_irq_restore(flags); | ||
687 | sync_core(); | ||
688 | |||
689 | /* | ||
690 | * Could also do a CLFLUSH here to speed up CPU recovery; but | ||
691 | * that causes hangs on some VIA CPUs. | ||
692 | */ | ||
693 | } | ||
694 | } | ||
695 | |||
696 | __ro_after_init struct mm_struct *poking_mm; | ||
697 | __ro_after_init unsigned long poking_addr; | ||
698 | |||
699 | static void *__text_poke(void *addr, const void *opcode, size_t len) | ||
700 | { | ||
701 | bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; | ||
702 | struct page *pages[2] = {NULL}; | ||
703 | temp_mm_state_t prev; | ||
704 | unsigned long flags; | ||
705 | pte_t pte, *ptep; | ||
706 | spinlock_t *ptl; | ||
707 | pgprot_t pgprot; | ||
708 | |||
709 | /* | ||
710 | * While boot memory allocator is running we cannot use struct pages as | ||
711 | * they are not yet initialized. There is no way to recover. | ||
712 | */ | ||
713 | BUG_ON(!after_bootmem); | ||
714 | |||
715 | if (!core_kernel_text((unsigned long)addr)) { | ||
716 | pages[0] = vmalloc_to_page(addr); | ||
717 | if (cross_page_boundary) | ||
718 | pages[1] = vmalloc_to_page(addr + PAGE_SIZE); | ||
719 | } else { | ||
720 | pages[0] = virt_to_page(addr); | ||
721 | WARN_ON(!PageReserved(pages[0])); | ||
722 | if (cross_page_boundary) | ||
723 | pages[1] = virt_to_page(addr + PAGE_SIZE); | ||
724 | } | ||
725 | /* | ||
726 | * If something went wrong, crash and burn since recovery paths are not | ||
727 | * implemented. | ||
728 | */ | ||
729 | BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); | ||
730 | |||
673 | local_irq_save(flags); | 731 | local_irq_save(flags); |
674 | memcpy(addr, opcode, len); | 732 | |
733 | /* | ||
734 | * Map the page without the global bit, as TLB flushing is done with | ||
735 | * flush_tlb_mm_range(), which is intended for non-global PTEs. | ||
736 | */ | ||
737 | pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); | ||
738 | |||
739 | /* | ||
740 | * The lock is not really needed, but this allows to avoid open-coding. | ||
741 | */ | ||
742 | ptep = get_locked_pte(poking_mm, poking_addr, &ptl); | ||
743 | |||
744 | /* | ||
745 | * This must not fail; preallocated in poking_init(). | ||
746 | */ | ||
747 | VM_BUG_ON(!ptep); | ||
748 | |||
749 | pte = mk_pte(pages[0], pgprot); | ||
750 | set_pte_at(poking_mm, poking_addr, ptep, pte); | ||
751 | |||
752 | if (cross_page_boundary) { | ||
753 | pte = mk_pte(pages[1], pgprot); | ||
754 | set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Loading the temporary mm behaves as a compiler barrier, which | ||
759 | * guarantees that the PTE will be set at the time memcpy() is done. | ||
760 | */ | ||
761 | prev = use_temporary_mm(poking_mm); | ||
762 | |||
763 | kasan_disable_current(); | ||
764 | memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); | ||
765 | kasan_enable_current(); | ||
766 | |||
767 | /* | ||
768 | * Ensure that the PTE is only cleared after the instructions of memcpy | ||
769 | * were issued by using a compiler barrier. | ||
770 | */ | ||
771 | barrier(); | ||
772 | |||
773 | pte_clear(poking_mm, poking_addr, ptep); | ||
774 | if (cross_page_boundary) | ||
775 | pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); | ||
776 | |||
777 | /* | ||
778 | * Loading the previous page-table hierarchy requires a serializing | ||
779 | * instruction that already allows the core to see the updated version. | ||
780 | * Xen-PV is assumed to serialize execution in a similar manner. | ||
781 | */ | ||
782 | unuse_temporary_mm(prev); | ||
783 | |||
784 | /* | ||
785 | * Flushing the TLB might involve IPIs, which would require enabled | ||
786 | * IRQs, but not if the mm is not used, as it is in this point. | ||
787 | */ | ||
788 | flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + | ||
789 | (cross_page_boundary ? 2 : 1) * PAGE_SIZE, | ||
790 | PAGE_SHIFT, false); | ||
791 | |||
792 | /* | ||
793 | * If the text does not match what we just wrote then something is | ||
794 | * fundamentally screwy; there's nothing we can really do about that. | ||
795 | */ | ||
796 | BUG_ON(memcmp(addr, opcode, len)); | ||
797 | |||
798 | pte_unmap_unlock(ptep, ptl); | ||
675 | local_irq_restore(flags); | 799 | local_irq_restore(flags); |
676 | sync_core(); | ||
677 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | ||
678 | that causes hangs on some VIA CPUs. */ | ||
679 | return addr; | 800 | return addr; |
680 | } | 801 | } |
681 | 802 | ||
@@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
689 | * It means the size must be writable atomically and the address must be aligned | 810 | * It means the size must be writable atomically and the address must be aligned |
690 | * in a way that permits an atomic write. It also makes sure we fit on a single | 811 | * in a way that permits an atomic write. It also makes sure we fit on a single |
691 | * page. | 812 | * page. |
813 | * | ||
814 | * Note that the caller must ensure that if the modified code is part of a | ||
815 | * module, the module would not be removed during poking. This can be achieved | ||
816 | * by registering a module notifier, and ordering module removal and patching | ||
817 | * trough a mutex. | ||
692 | */ | 818 | */ |
693 | void *text_poke(void *addr, const void *opcode, size_t len) | 819 | void *text_poke(void *addr, const void *opcode, size_t len) |
694 | { | 820 | { |
695 | unsigned long flags; | ||
696 | char *vaddr; | ||
697 | struct page *pages[2]; | ||
698 | int i; | ||
699 | |||
700 | /* | ||
701 | * While boot memory allocator is runnig we cannot use struct | ||
702 | * pages as they are not yet initialized. | ||
703 | */ | ||
704 | BUG_ON(!after_bootmem); | ||
705 | |||
706 | lockdep_assert_held(&text_mutex); | 821 | lockdep_assert_held(&text_mutex); |
707 | 822 | ||
708 | if (!core_kernel_text((unsigned long)addr)) { | 823 | return __text_poke(addr, opcode, len); |
709 | pages[0] = vmalloc_to_page(addr); | 824 | } |
710 | pages[1] = vmalloc_to_page(addr + PAGE_SIZE); | 825 | |
711 | } else { | 826 | /** |
712 | pages[0] = virt_to_page(addr); | 827 | * text_poke_kgdb - Update instructions on a live kernel by kgdb |
713 | WARN_ON(!PageReserved(pages[0])); | 828 | * @addr: address to modify |
714 | pages[1] = virt_to_page(addr + PAGE_SIZE); | 829 | * @opcode: source of the copy |
715 | } | 830 | * @len: length to copy |
716 | BUG_ON(!pages[0]); | 831 | * |
717 | local_irq_save(flags); | 832 | * Only atomic text poke/set should be allowed when not doing early patching. |
718 | set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); | 833 | * It means the size must be writable atomically and the address must be aligned |
719 | if (pages[1]) | 834 | * in a way that permits an atomic write. It also makes sure we fit on a single |
720 | set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); | 835 | * page. |
721 | vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); | 836 | * |
722 | memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); | 837 | * Context: should only be used by kgdb, which ensures no other core is running, |
723 | clear_fixmap(FIX_TEXT_POKE0); | 838 | * despite the fact it does not hold the text_mutex. |
724 | if (pages[1]) | 839 | */ |
725 | clear_fixmap(FIX_TEXT_POKE1); | 840 | void *text_poke_kgdb(void *addr, const void *opcode, size_t len) |
726 | local_flush_tlb(); | 841 | { |
727 | sync_core(); | 842 | return __text_poke(addr, opcode, len); |
728 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | ||
729 | that causes hangs on some VIA CPUs. */ | ||
730 | for (i = 0; i < len; i++) | ||
731 | BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); | ||
732 | local_irq_restore(flags); | ||
733 | return addr; | ||
734 | } | 843 | } |
735 | 844 | ||
736 | static void do_sync_core(void *info) | 845 | static void do_sync_core(void *info) |
@@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
788 | * replacing opcode | 897 | * replacing opcode |
789 | * - sync cores | 898 | * - sync cores |
790 | */ | 899 | */ |
791 | void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) | 900 | void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) |
792 | { | 901 | { |
793 | unsigned char int3 = 0xcc; | 902 | unsigned char int3 = 0xcc; |
794 | 903 | ||
@@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
830 | * the writing of the new instruction. | 939 | * the writing of the new instruction. |
831 | */ | 940 | */ |
832 | bp_patching_in_progress = false; | 941 | bp_patching_in_progress = false; |
833 | |||
834 | return addr; | ||
835 | } | 942 | } |
836 | 943 | ||
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..0caf8122d680 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size)
678 | { | 678 | { |
679 | return module_alloc(size); | 679 | return module_alloc(size); |
680 | } | 680 | } |
681 | static inline void tramp_free(void *tramp, int size) | 681 | static inline void tramp_free(void *tramp) |
682 | { | 682 | { |
683 | int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
684 | |||
685 | set_memory_nx((unsigned long)tramp, npages); | ||
686 | set_memory_rw((unsigned long)tramp, npages); | ||
687 | module_memfree(tramp); | 683 | module_memfree(tramp); |
688 | } | 684 | } |
689 | #else | 685 | #else |
@@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size)
692 | { | 688 | { |
693 | return NULL; | 689 | return NULL; |
694 | } | 690 | } |
695 | static inline void tramp_free(void *tramp, int size) { } | 691 | static inline void tramp_free(void *tramp) { } |
696 | #endif | 692 | #endif |
697 | 693 | ||
698 | /* Defined as markers to the end of the ftrace default trampolines */ | 694 | /* Defined as markers to the end of the ftrace default trampolines */ |
@@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
730 | unsigned long end_offset; | 726 | unsigned long end_offset; |
731 | unsigned long op_offset; | 727 | unsigned long op_offset; |
732 | unsigned long offset; | 728 | unsigned long offset; |
729 | unsigned long npages; | ||
733 | unsigned long size; | 730 | unsigned long size; |
734 | unsigned long retq; | 731 | unsigned long retq; |
735 | unsigned long *ptr; | 732 | unsigned long *ptr; |
@@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
762 | return 0; | 759 | return 0; |
763 | 760 | ||
764 | *tramp_size = size + RET_SIZE + sizeof(void *); | 761 | *tramp_size = size + RET_SIZE + sizeof(void *); |
762 | npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); | ||
765 | 763 | ||
766 | /* Copy ftrace_caller onto the trampoline memory */ | 764 | /* Copy ftrace_caller onto the trampoline memory */ |
767 | ret = probe_kernel_read(trampoline, (void *)start_offset, size); | 765 | ret = probe_kernel_read(trampoline, (void *)start_offset, size); |
@@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
806 | /* ALLOC_TRAMP flags lets us know we created it */ | 804 | /* ALLOC_TRAMP flags lets us know we created it */ |
807 | ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; | 805 | ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; |
808 | 806 | ||
807 | set_vm_flush_reset_perms(trampoline); | ||
808 | |||
809 | /* | ||
810 | * Module allocation needs to be completed by making the page | ||
811 | * executable. The page is still writable, which is a security hazard, | ||
812 | * but anyhow ftrace breaks W^X completely. | ||
813 | */ | ||
814 | set_memory_x((unsigned long)trampoline, npages); | ||
809 | return (unsigned long)trampoline; | 815 | return (unsigned long)trampoline; |
810 | fail: | 816 | fail: |
811 | tramp_free(trampoline, *tramp_size); | 817 | tramp_free(trampoline); |
812 | return 0; | 818 | return 0; |
813 | } | 819 | } |
814 | 820 | ||
@@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
939 | if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) | 945 | if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) |
940 | return; | 946 | return; |
941 | 947 | ||
942 | tramp_free((void *)ops->trampoline, ops->trampoline_size); | 948 | tramp_free((void *)ops->trampoline); |
943 | ops->trampoline = 0; | 949 | ops->trampoline = 0; |
944 | } | 950 | } |
945 | 951 | ||
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f99bd26bd3f1..e631c358f7f4 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
37 | 37 | ||
38 | static void __ref __jump_label_transform(struct jump_entry *entry, | 38 | static void __ref __jump_label_transform(struct jump_entry *entry, |
39 | enum jump_label_type type, | 39 | enum jump_label_type type, |
40 | void *(*poker)(void *, const void *, size_t), | ||
41 | int init) | 40 | int init) |
42 | { | 41 | { |
43 | union jump_code_union jmp; | 42 | union jump_code_union jmp; |
@@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
50 | jmp.offset = jump_entry_target(entry) - | 49 | jmp.offset = jump_entry_target(entry) - |
51 | (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); | 50 | (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); |
52 | 51 | ||
53 | if (early_boot_irqs_disabled) | ||
54 | poker = text_poke_early; | ||
55 | |||
56 | if (type == JUMP_LABEL_JMP) { | 52 | if (type == JUMP_LABEL_JMP) { |
57 | if (init) { | 53 | if (init) { |
58 | expect = default_nop; line = __LINE__; | 54 | expect = default_nop; line = __LINE__; |
@@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
75 | bug_at((void *)jump_entry_code(entry), line); | 71 | bug_at((void *)jump_entry_code(entry), line); |
76 | 72 | ||
77 | /* | 73 | /* |
78 | * Make text_poke_bp() a default fallback poker. | 74 | * As long as only a single processor is running and the code is still |
75 | * not marked as RO, text_poke_early() can be used; Checking that | ||
76 | * system_state is SYSTEM_BOOTING guarantees it. It will be set to | ||
77 | * SYSTEM_SCHEDULING before other cores are awaken and before the | ||
78 | * code is write-protected. | ||
79 | * | 79 | * |
80 | * At the time the change is being done, just ignore whether we | 80 | * At the time the change is being done, just ignore whether we |
81 | * are doing nop -> jump or jump -> nop transition, and assume | 81 | * are doing nop -> jump or jump -> nop transition, and assume |
82 | * always nop being the 'currently valid' instruction | 82 | * always nop being the 'currently valid' instruction |
83 | * | ||
84 | */ | 83 | */ |
85 | if (poker) { | 84 | if (init || system_state == SYSTEM_BOOTING) { |
86 | (*poker)((void *)jump_entry_code(entry), code, | 85 | text_poke_early((void *)jump_entry_code(entry), code, |
87 | JUMP_LABEL_NOP_SIZE); | 86 | JUMP_LABEL_NOP_SIZE); |
88 | return; | 87 | return; |
89 | } | 88 | } |
90 | 89 | ||
@@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
96 | enum jump_label_type type) | 95 | enum jump_label_type type) |
97 | { | 96 | { |
98 | mutex_lock(&text_mutex); | 97 | mutex_lock(&text_mutex); |
99 | __jump_label_transform(entry, type, NULL, 0); | 98 | __jump_label_transform(entry, type, 0); |
100 | mutex_unlock(&text_mutex); | 99 | mutex_unlock(&text_mutex); |
101 | } | 100 | } |
102 | 101 | ||
@@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
126 | jlstate = JL_STATE_NO_UPDATE; | 125 | jlstate = JL_STATE_NO_UPDATE; |
127 | } | 126 | } |
128 | if (jlstate == JL_STATE_UPDATE) | 127 | if (jlstate == JL_STATE_UPDATE) |
129 | __jump_label_transform(entry, type, text_poke_early, 1); | 128 | __jump_label_transform(entry, type, 1); |
130 | } | 129 | } |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4ff6b4cdb941..13b13311b792 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
747 | int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) | 747 | int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) |
748 | { | 748 | { |
749 | int err; | 749 | int err; |
750 | char opc[BREAK_INSTR_SIZE]; | ||
751 | 750 | ||
752 | bpt->type = BP_BREAKPOINT; | 751 | bpt->type = BP_BREAKPOINT; |
753 | err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, | 752 | err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, |
@@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
759 | if (!err) | 758 | if (!err) |
760 | return err; | 759 | return err; |
761 | /* | 760 | /* |
762 | * It is safe to call text_poke() because normal kernel execution | 761 | * It is safe to call text_poke_kgdb() because normal kernel execution |
763 | * is stopped on all cores, so long as the text_mutex is not locked. | 762 | * is stopped on all cores, so long as the text_mutex is not locked. |
764 | */ | 763 | */ |
765 | if (mutex_is_locked(&text_mutex)) | 764 | if (mutex_is_locked(&text_mutex)) |
766 | return -EBUSY; | 765 | return -EBUSY; |
767 | text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, | 766 | text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, |
768 | BREAK_INSTR_SIZE); | 767 | BREAK_INSTR_SIZE); |
769 | err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); | ||
770 | if (err) | ||
771 | return err; | ||
772 | if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) | ||
773 | return -EINVAL; | ||
774 | bpt->type = BP_POKE_BREAKPOINT; | 768 | bpt->type = BP_POKE_BREAKPOINT; |
775 | 769 | ||
776 | return err; | 770 | return err; |
@@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
778 | 772 | ||
779 | int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) | 773 | int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) |
780 | { | 774 | { |
781 | int err; | ||
782 | char opc[BREAK_INSTR_SIZE]; | ||
783 | |||
784 | if (bpt->type != BP_POKE_BREAKPOINT) | 775 | if (bpt->type != BP_POKE_BREAKPOINT) |
785 | goto knl_write; | 776 | goto knl_write; |
786 | /* | 777 | /* |
787 | * It is safe to call text_poke() because normal kernel execution | 778 | * It is safe to call text_poke_kgdb() because normal kernel execution |
788 | * is stopped on all cores, so long as the text_mutex is not locked. | 779 | * is stopped on all cores, so long as the text_mutex is not locked. |
789 | */ | 780 | */ |
790 | if (mutex_is_locked(&text_mutex)) | 781 | if (mutex_is_locked(&text_mutex)) |
791 | goto knl_write; | 782 | goto knl_write; |
792 | text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); | 783 | text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, |
793 | err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); | 784 | BREAK_INSTR_SIZE); |
794 | if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) | 785 | return 0; |
795 | goto knl_write; | ||
796 | return err; | ||
797 | 786 | ||
798 | knl_write: | 787 | knl_write: |
799 | return probe_kernel_write((char *)bpt->bpt_addr, | 788 | return probe_kernel_write((char *)bpt->bpt_addr, |
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 122548ad5c2e..cf52ee0d8711 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -431,8 +431,21 @@ void *alloc_insn_page(void)
431 | void *page; | 431 | void *page; |
432 | 432 | ||
433 | page = module_alloc(PAGE_SIZE); | 433 | page = module_alloc(PAGE_SIZE); |
434 | if (page) | 434 | if (!page) |
435 | set_memory_ro((unsigned long)page & PAGE_MASK, 1); | 435 | return NULL; |
436 | |||
437 | set_vm_flush_reset_perms(page); | ||
438 | /* | ||
439 | * First make the page read-only, and only then make it executable to | ||
440 | * prevent it from being W+X in between. | ||
441 | */ | ||
442 | set_memory_ro((unsigned long)page, 1); | ||
443 | |||
444 | /* | ||
445 | * TODO: Once additional kernel code protection mechanisms are set, ensure | ||
446 | * that the page was not maliciously altered and it is still zeroed. | ||
447 | */ | ||
448 | set_memory_x((unsigned long)page, 1); | ||
436 | 449 | ||
437 | return page; | 450 | return page; |
438 | } | 451 | } |
@@ -440,8 +453,6 @@ void *alloc_insn_page(void)
440 | /* Recover page to RW mode before releasing it */ | 453 | /* Recover page to RW mode before releasing it */ |
441 | void free_insn_page(void *page) | 454 | void free_insn_page(void *page) |
442 | { | 455 | { |
443 | set_memory_nx((unsigned long)page & PAGE_MASK, 1); | ||
444 | set_memory_rw((unsigned long)page & PAGE_MASK, 1); | ||
445 | module_memfree(page); | 456 | module_memfree(page); |
446 | } | 457 | } |
447 | 458 | ||
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..cfa3106faee4 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
87 | p = __vmalloc_node_range(size, MODULE_ALIGN, | 87 | p = __vmalloc_node_range(size, MODULE_ALIGN, |
88 | MODULES_VADDR + get_module_load_offset(), | 88 | MODULES_VADDR + get_module_load_offset(), |
89 | MODULES_END, GFP_KERNEL, | 89 | MODULES_END, GFP_KERNEL, |
90 | PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, | 90 | PAGE_KERNEL, 0, NUMA_NO_NODE, |
91 | __builtin_return_address(0)); | 91 | __builtin_return_address(0)); |
92 | if (p && (kasan_module_alloc(p, size) < 0)) { | 92 | if (p && (kasan_module_alloc(p, size) < 0)) { |
93 | vfree(p); | 93 | vfree(p); |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4d1517022a14..0850b5149345 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -141,11 +141,11 @@ SECTIONS
141 | *(.text.__x86.indirect_thunk) | 141 | *(.text.__x86.indirect_thunk) |
142 | __indirect_thunk_end = .; | 142 | __indirect_thunk_end = .; |
143 | #endif | 143 | #endif |
144 | |||
145 | /* End of text section */ | ||
146 | _etext = .; | ||
147 | } :text = 0x9090 | 144 | } :text = 0x9090 |
148 | 145 | ||
146 | /* End of text section */ | ||
147 | _etext = .; | ||
148 | |||
149 | NOTES :text :note | 149 | NOTES :text :note |
150 | 150 | ||
151 | EXCEPTION_TABLE(16) :text = 0x9090 | 151 | EXCEPTION_TABLE(16) :text = 0x9090 |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 06c089513d39..46df4c6aae46 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address)
360 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 360 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
361 | return -1; | 361 | return -1; |
362 | 362 | ||
363 | WARN_ON_ONCE(in_nmi()); | ||
364 | |||
365 | /* | 363 | /* |
366 | * Copy kernel mappings over when needed. This can also | 364 | * Copy kernel mappings over when needed. This can also |
367 | * happen within a race in page table update. In the later | 365 | * happen within a race in page table update. In the later |
@@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
604 | name, index, addr, (desc.limit0 | (desc.limit1 << 16))); | 602 | name, index, addr, (desc.limit0 | (desc.limit1 << 16))); |
605 | } | 603 | } |
606 | 604 | ||
607 | /* | ||
608 | * This helper function transforms the #PF error_code bits into | ||
609 | * "[PROT] [USER]" type of descriptive, almost human-readable error strings: | ||
610 | */ | ||
611 | static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) | ||
612 | { | ||
613 | if (error_code & mask) { | ||
614 | if (buf[0]) | ||
615 | strcat(buf, " "); | ||
616 | strcat(buf, txt); | ||
617 | } | ||
618 | } | ||
619 | |||
620 | static void | 605 | static void |
621 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) | 606 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) |
622 | { | 607 | { |
623 | char err_txt[64]; | ||
624 | |||
625 | if (!oops_may_print()) | 608 | if (!oops_may_print()) |
626 | return; | 609 | return; |
627 | 610 | ||
@@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
645 | from_kuid(&init_user_ns, current_uid())); | 628 | from_kuid(&init_user_ns, current_uid())); |
646 | } | 629 | } |
647 | 630 | ||
648 | pr_alert("BUG: unable to handle kernel %s at %px\n", | 631 | if (address < PAGE_SIZE && !user_mode(regs)) |
649 | address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", | 632 | pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", |
650 | (void *)address); | 633 | (void *)address); |
651 | 634 | else | |
652 | err_txt[0] = 0; | 635 | pr_alert("BUG: unable to handle page fault for address: %px\n", |
653 | 636 | (void *)address); | |
654 | /* | 637 | |
655 | * Note: length of these appended strings including the separation space and the | 638 | pr_alert("#PF: %s %s in %s mode\n", |
656 | * zero delimiter must fit into err_txt[]. | 639 | (error_code & X86_PF_USER) ? "user" : "supervisor", |
657 | */ | 640 | (error_code & X86_PF_INSTR) ? "instruction fetch" : |
658 | err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); | 641 | (error_code & X86_PF_WRITE) ? "write access" : |
659 | err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); | 642 | "read access", |
660 | err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); | 643 | user_mode(regs) ? "user" : "kernel"); |
661 | err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); | 644 | pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, |
662 | err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); | 645 | !(error_code & X86_PF_PROT) ? "not-present page" : |
663 | err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); | 646 | (error_code & X86_PF_RSVD) ? "reserved bit violation" : |
664 | 647 | (error_code & X86_PF_PK) ? "protection keys violation" : | |
665 | pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); | 648 | "permissions violation"); |
666 | 649 | ||
667 | if (!(error_code & X86_PF_USER) && user_mode(regs)) { | 650 | if (!(error_code & X86_PF_USER) && user_mode(regs)) { |
668 | struct desc_ptr idt, gdt; | 651 | struct desc_ptr idt, gdt; |
669 | u16 ldtr, tr; | 652 | u16 ldtr, tr; |
670 | 653 | ||
671 | pr_alert("This was a system access from user code\n"); | ||
672 | |||
673 | /* | 654 | /* |
674 | * This can happen for quite a few reasons. The more obvious | 655 | * This can happen for quite a few reasons. The more obvious |
675 | * ones are faults accessing the GDT, or LDT. Perhaps | 656 | * ones are faults accessing the GDT, or LDT. Perhaps |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8dacdb96899e..fd10d91a6115 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -6,6 +6,7 @@
6 | #include <linux/swapfile.h> | 6 | #include <linux/swapfile.h> |
7 | #include <linux/swapops.h> | 7 | #include <linux/swapops.h> |
8 | #include <linux/kmemleak.h> | 8 | #include <linux/kmemleak.h> |
9 | #include <linux/sched/task.h> | ||
9 | 10 | ||
10 | #include <asm/set_memory.h> | 11 | #include <asm/set_memory.h> |
11 | #include <asm/e820/api.h> | 12 | #include <asm/e820/api.h> |
@@ -23,6 +24,7 @@
23 | #include <asm/hypervisor.h> | 24 | #include <asm/hypervisor.h> |
24 | #include <asm/cpufeature.h> | 25 | #include <asm/cpufeature.h> |
25 | #include <asm/pti.h> | 26 | #include <asm/pti.h> |
27 | #include <asm/text-patching.h> | ||
26 | 28 | ||
27 | /* | 29 | /* |
28 | * We need to define the tracepoints somewhere, and tlb.c | 30 | * We need to define the tracepoints somewhere, and tlb.c |
@@ -702,6 +704,41 @@ void __init init_mem_mapping(void)
702 | } | 704 | } |
703 | 705 | ||
704 | /* | 706 | /* |
707 | * Initialize an mm_struct to be used during poking and a pointer to be used | ||
708 | * during patching. | ||
709 | */ | ||
710 | void __init poking_init(void) | ||
711 | { | ||
712 | spinlock_t *ptl; | ||
713 | pte_t *ptep; | ||
714 | |||
715 | poking_mm = copy_init_mm(); | ||
716 | BUG_ON(!poking_mm); | ||
717 | |||
718 | /* | ||
719 | * Randomize the poking address, but make sure that the following page | ||
720 | * will be mapped at the same PMD. We need 2 pages, so find space for 3, | ||
721 | * and adjust the address if the PMD ends after the first one. | ||
722 | */ | ||
723 | poking_addr = TASK_UNMAPPED_BASE; | ||
724 | if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) | ||
725 | poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % | ||
726 | (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); | ||
727 | |||
728 | if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) | ||
729 | poking_addr += PAGE_SIZE; | ||
730 | |||
731 | /* | ||
732 | * We need to trigger the allocation of the page-tables that will be | ||
733 | * needed for poking now. Later, poking may be performed in an atomic | ||
734 | * section, which might cause allocation to fail. | ||
735 | */ | ||
736 | ptep = get_locked_pte(poking_mm, poking_addr, &ptl); | ||
737 | BUG_ON(!ptep); | ||
738 | pte_unmap_unlock(ptep, ptl); | ||
739 | } | ||
740 | |||
741 | /* | ||
705 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | 742 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address |
706 | * is valid. The argument is a physical page number. | 743 | * is valid. The argument is a physical page number. |
707 | * | 744 | * |
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index d669c5e797e0..dc3f058bdf9b 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void)
125 | */ | 125 | */ |
126 | entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); | 126 | entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); |
127 | prandom_bytes_state(&rand_state, &rand, sizeof(rand)); | 127 | prandom_bytes_state(&rand_state, &rand, sizeof(rand)); |
128 | if (pgtable_l5_enabled()) | 128 | entropy = (rand % (entropy + 1)) & PUD_MASK; |
129 | entropy = (rand % (entropy + 1)) & P4D_MASK; | ||
130 | else | ||
131 | entropy = (rand % (entropy + 1)) & PUD_MASK; | ||
132 | vaddr += entropy; | 129 | vaddr += entropy; |
133 | *kaslr_regions[i].base = vaddr; | 130 | *kaslr_regions[i].base = vaddr; |
134 | 131 | ||
@@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void)
137 | * randomization alignment. | 134 | * randomization alignment. |
138 | */ | 135 | */ |
139 | vaddr += get_padding(&kaslr_regions[i]); | 136 | vaddr += get_padding(&kaslr_regions[i]); |
140 | if (pgtable_l5_enabled()) | 137 | vaddr = round_up(vaddr + 1, PUD_SIZE); |
141 | vaddr = round_up(vaddr + 1, P4D_SIZE); | ||
142 | else | ||
143 | vaddr = round_up(vaddr + 1, PUD_SIZE); | ||
144 | remain_entropy -= entropy; | 138 | remain_entropy -= entropy; |
145 | } | 139 | } |
146 | } | 140 | } |
147 | 141 | ||
148 | static void __meminit init_trampoline_pud(void) | 142 | static void __meminit init_trampoline_pud(void) |
149 | { | 143 | { |
150 | unsigned long paddr, paddr_next; | 144 | pud_t *pud_page_tramp, *pud, *pud_tramp; |
145 | p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; | ||
146 | unsigned long paddr, vaddr; | ||
151 | pgd_t *pgd; | 147 | pgd_t *pgd; |
152 | pud_t *pud_page, *pud_page_tramp; | ||
153 | int i; | ||
154 | 148 | ||
155 | pud_page_tramp = alloc_low_page(); | 149 | pud_page_tramp = alloc_low_page(); |
156 | 150 | ||
151 | /* | ||
152 | * There are two mappings for the low 1MB area, the direct mapping | ||
153 | * and the 1:1 mapping for the real mode trampoline: | ||
154 | * | ||
155 | * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET | ||
156 | * 1:1 mapping: virt_addr = phys_addr | ||
157 | */ | ||
157 | paddr = 0; | 158 | paddr = 0; |
158 | pgd = pgd_offset_k((unsigned long)__va(paddr)); | 159 | vaddr = (unsigned long)__va(paddr); |
159 | pud_page = (pud_t *) pgd_page_vaddr(*pgd); | 160 | pgd = pgd_offset_k(vaddr); |
160 | |||
161 | for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { | ||
162 | pud_t *pud, *pud_tramp; | ||
163 | unsigned long vaddr = (unsigned long)__va(paddr); | ||
164 | 161 | ||
165 | pud_tramp = pud_page_tramp + pud_index(paddr); | 162 | p4d = p4d_offset(pgd, vaddr); |
166 | pud = pud_page + pud_index(vaddr); | 163 | pud = pud_offset(p4d, vaddr); |
167 | paddr_next = (paddr & PUD_MASK) + PUD_SIZE; | ||
168 | |||
169 | *pud_tramp = *pud; | ||
170 | } | ||
171 | 164 | ||
172 | set_pgd(&trampoline_pgd_entry, | 165 | pud_tramp = pud_page_tramp + pud_index(paddr); |
173 | __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); | 166 | *pud_tramp = *pud; |
174 | } | ||
175 | |||
176 | static void __meminit init_trampoline_p4d(void) | ||
177 | { | ||
178 | unsigned long paddr, paddr_next; | ||
179 | pgd_t *pgd; | ||
180 | p4d_t *p4d_page, *p4d_page_tramp; | ||
181 | int i; | ||
182 | 167 | ||
183 | p4d_page_tramp = alloc_low_page(); | 168 | if (pgtable_l5_enabled()) { |
184 | 169 | p4d_page_tramp = alloc_low_page(); | |
185 | paddr = 0; | ||
186 | pgd = pgd_offset_k((unsigned long)__va(paddr)); | ||
187 | p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); | ||
188 | |||
189 | for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { | ||
190 | p4d_t *p4d, *p4d_tramp; | ||
191 | unsigned long vaddr = (unsigned long)__va(paddr); | ||
192 | 170 | ||
193 | p4d_tramp = p4d_page_tramp + p4d_index(paddr); | 171 | p4d_tramp = p4d_page_tramp + p4d_index(paddr); |
194 | p4d = p4d_page + p4d_index(vaddr); | ||
195 | paddr_next = (paddr & P4D_MASK) + P4D_SIZE; | ||
196 | 172 | ||
197 | *p4d_tramp = *p4d; | 173 | set_p4d(p4d_tramp, |
198 | } | 174 | __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); |
199 | 175 | ||
200 | set_pgd(&trampoline_pgd_entry, | 176 | set_pgd(&trampoline_pgd_entry, |
201 | __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); | 177 | __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); |
178 | } else { | ||
179 | set_pgd(&trampoline_pgd_entry, | ||
180 | __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); | ||
181 | } | ||
202 | } | 182 | } |
203 | 183 | ||
204 | /* | 184 | /* |
205 | * Create PGD aligned trampoline table to allow real mode initialization | 185 | * The real mode trampoline, which is required for bootstrapping CPUs |
206 | * of additional CPUs. Consume only 1 low memory page. | 186 | * occupies only a small area under the low 1MB. See reserve_real_mode() |
187 | * for details. | ||
188 | * | ||
189 | * If KASLR is disabled the first PGD entry of the direct mapping is copied | ||
190 | * to map the real mode trampoline. | ||
191 | * | ||
192 | * If KASLR is enabled, copy only the PUD which covers the low 1MB | ||
193 | * area. This limits the randomization granularity to 1GB for both 4-level | ||
194 | * and 5-level paging. | ||
207 | */ | 195 | */ |
208 | void __meminit init_trampoline(void) | 196 | void __meminit init_trampoline(void) |
209 | { | 197 | { |
210 | |||
211 | if (!kaslr_memory_enabled()) { | 198 | if (!kaslr_memory_enabled()) { |
212 | init_trampoline_default(); | 199 | init_trampoline_default(); |
213 | return; | 200 | return; |
214 | } | 201 | } |
215 | 202 | ||
216 | if (pgtable_l5_enabled()) | 203 | init_trampoline_pud(); |
217 | init_trampoline_p4d(); | ||
218 | else | ||
219 | init_trampoline_pud(); | ||
220 | } | 204 | } |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4c570612e24e..daf4d645e537 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages)
2209 | return set_memory_rw(addr, numpages); | 2209 | return set_memory_rw(addr, numpages); |
2210 | } | 2210 | } |
2211 | 2211 | ||
2212 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
2213 | |||
2214 | static int __set_pages_p(struct page *page, int numpages) | 2212 | static int __set_pages_p(struct page *page, int numpages) |
2215 | { | 2213 | { |
2216 | unsigned long tempaddr = (unsigned long) page_address(page); | 2214 | unsigned long tempaddr = (unsigned long) page_address(page); |
@@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages)
2249 | return __change_page_attr_set_clr(&cpa, 0); | 2247 | return __change_page_attr_set_clr(&cpa, 0); |
2250 | } | 2248 | } |
2251 | 2249 | ||
2250 | int set_direct_map_invalid_noflush(struct page *page) | ||
2251 | { | ||
2252 | return __set_pages_np(page, 1); | ||
2253 | } | ||
2254 | |||
2255 | int set_direct_map_default_noflush(struct page *page) | ||
2256 | { | ||
2257 | return __set_pages_p(page, 1); | ||
2258 | } | ||
2259 | |||
2252 | void __kernel_map_pages(struct page *page, int numpages, int enable) | 2260 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
2253 | { | 2261 | { |
2254 | if (PageHighMem(page)) | 2262 | if (PageHighMem(page)) |
@@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
2282 | } | 2290 | } |
2283 | 2291 | ||
2284 | #ifdef CONFIG_HIBERNATION | 2292 | #ifdef CONFIG_HIBERNATION |
2285 | |||
2286 | bool kernel_page_present(struct page *page) | 2293 | bool kernel_page_present(struct page *page) |
2287 | { | 2294 | { |
2288 | unsigned int level; | 2295 | unsigned int level; |
@@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page)
2294 | pte = lookup_address((unsigned long)page_address(page), &level); | 2301 | pte = lookup_address((unsigned long)page_address(page), &level); |
2295 | return (pte_val(*pte) & _PAGE_PRESENT); | 2302 | return (pte_val(*pte) & _PAGE_PRESENT); |
2296 | } | 2303 | } |
2297 | |||
2298 | #endif /* CONFIG_HIBERNATION */ | 2304 | #endif /* CONFIG_HIBERNATION */ |
2299 | 2305 | ||
2300 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | ||
2301 | |||
2302 | int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, | 2306 | int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, |
2303 | unsigned numpages, unsigned long page_flags) | 2307 | unsigned numpages, unsigned long page_flags) |
2304 | { | 2308 | { |
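The two helpers added above wrap the existing CPA machinery to mark a single direct-map page not-present (__set_pages_np) or present again (__set_pages_p), deliberately without a TLB flush so callers can batch the flush themselves. A minimal usage sketch mirroring the reset sequence that mm/vmalloc.c performs later in this diff; page and addr are placeholders, not names from this patch:

    /* Drop the page from the direct map, flush once, then restore defaults. */
    set_direct_map_invalid_noflush(page);
    flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
    set_direct_map_default_noflush(page);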
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3dbf440d4114..1f67b1e15bf6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, | |||
373 | 373 | ||
374 | static struct kmem_cache *pgd_cache; | 374 | static struct kmem_cache *pgd_cache; |
375 | 375 | ||
376 | static int __init pgd_cache_init(void) | 376 | void __init pgd_cache_init(void) |
377 | { | 377 | { |
378 | /* | 378 | /* |
379 | * When PAE kernel is running as a Xen domain, it does not use | 379 | * When PAE kernel is running as a Xen domain, it does not use |
380 | * shared kernel pmd. And this requires a whole page for pgd. | 380 | * shared kernel pmd. And this requires a whole page for pgd. |
381 | */ | 381 | */ |
382 | if (!SHARED_KERNEL_PMD) | 382 | if (!SHARED_KERNEL_PMD) |
383 | return 0; | 383 | return; |
384 | 384 | ||
385 | /* | 385 | /* |
386 | * when PAE kernel is not running as a Xen domain, it uses | 386 | * when PAE kernel is not running as a Xen domain, it uses |
@@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) | |||
390 | */ | 390 | */ |
391 | pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, | 391 | pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, |
392 | SLAB_PANIC, NULL); | 392 | SLAB_PANIC, NULL); |
393 | return 0; | ||
394 | } | 393 | } |
395 | core_initcall(pgd_cache_init); | ||
396 | 394 | ||
397 | static inline pgd_t *_pgd_alloc(void) | 395 | static inline pgd_t *_pgd_alloc(void) |
398 | { | 396 | { |
@@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) | |||
420 | } | 418 | } |
421 | #else | 419 | #else |
422 | 420 | ||
421 | void __init pgd_cache_init(void) | ||
422 | { | ||
423 | } | ||
424 | |||
423 | static inline pgd_t *_pgd_alloc(void) | 425 | static inline pgd_t *_pgd_alloc(void) |
424 | { | 426 | { |
425 | return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); | 427 | return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 487b8474c01c..7f61431c75fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, | |||
634 | this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); | 634 | this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); |
635 | } | 635 | } |
636 | 636 | ||
637 | static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) | 637 | static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) |
638 | { | 638 | { |
639 | const struct flush_tlb_info *f = info; | 639 | const struct flush_tlb_info *f = info; |
640 | 640 | ||
@@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
722 | */ | 722 | */ |
723 | unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; | 723 | unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; |
724 | 724 | ||
725 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); | ||
726 | |||
727 | #ifdef CONFIG_DEBUG_VM | ||
728 | static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); | ||
729 | #endif | ||
730 | |||
731 | static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, | ||
732 | unsigned long start, unsigned long end, | ||
733 | unsigned int stride_shift, bool freed_tables, | ||
734 | u64 new_tlb_gen) | ||
735 | { | ||
736 | struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); | ||
737 | |||
738 | #ifdef CONFIG_DEBUG_VM | ||
739 | /* | ||
740 | * Ensure that the following code is non-reentrant and flush_tlb_info | ||
741 | * is not overwritten. This means no TLB flushing is initiated by | ||
742 | * interrupt handlers and machine-check exception handlers. | ||
743 | */ | ||
744 | BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); | ||
745 | #endif | ||
746 | |||
747 | info->start = start; | ||
748 | info->end = end; | ||
749 | info->mm = mm; | ||
750 | info->stride_shift = stride_shift; | ||
751 | info->freed_tables = freed_tables; | ||
752 | info->new_tlb_gen = new_tlb_gen; | ||
753 | |||
754 | return info; | ||
755 | } | ||
756 | |||
757 | static inline void put_flush_tlb_info(void) | ||
758 | { | ||
759 | #ifdef CONFIG_DEBUG_VM | ||
760 | /* Complete reentrancy prevention checks */ | ||
761 | barrier(); | ||
762 | this_cpu_dec(flush_tlb_info_idx); | ||
763 | #endif | ||
764 | } | ||
765 | |||
725 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 766 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
726 | unsigned long end, unsigned int stride_shift, | 767 | unsigned long end, unsigned int stride_shift, |
727 | bool freed_tables) | 768 | bool freed_tables) |
728 | { | 769 | { |
770 | struct flush_tlb_info *info; | ||
771 | u64 new_tlb_gen; | ||
729 | int cpu; | 772 | int cpu; |
730 | 773 | ||
731 | struct flush_tlb_info info = { | ||
732 | .mm = mm, | ||
733 | .stride_shift = stride_shift, | ||
734 | .freed_tables = freed_tables, | ||
735 | }; | ||
736 | |||
737 | cpu = get_cpu(); | 774 | cpu = get_cpu(); |
738 | 775 | ||
739 | /* This is also a barrier that synchronizes with switch_mm(). */ | ||
740 | info.new_tlb_gen = inc_mm_tlb_gen(mm); | ||
741 | |||
742 | /* Should we flush just the requested range? */ | 776 | /* Should we flush just the requested range? */ |
743 | if ((end != TLB_FLUSH_ALL) && | 777 | if ((end == TLB_FLUSH_ALL) || |
744 | ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { | 778 | ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { |
745 | info.start = start; | 779 | start = 0; |
746 | info.end = end; | 780 | end = TLB_FLUSH_ALL; |
747 | } else { | ||
748 | info.start = 0UL; | ||
749 | info.end = TLB_FLUSH_ALL; | ||
750 | } | 781 | } |
751 | 782 | ||
783 | /* This is also a barrier that synchronizes with switch_mm(). */ | ||
784 | new_tlb_gen = inc_mm_tlb_gen(mm); | ||
785 | |||
786 | info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, | ||
787 | new_tlb_gen); | ||
788 | |||
752 | if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { | 789 | if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { |
753 | VM_WARN_ON(irqs_disabled()); | 790 | lockdep_assert_irqs_enabled(); |
754 | local_irq_disable(); | 791 | local_irq_disable(); |
755 | flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); | 792 | flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); |
756 | local_irq_enable(); | 793 | local_irq_enable(); |
757 | } | 794 | } |
758 | 795 | ||
759 | if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) | 796 | if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) |
760 | flush_tlb_others(mm_cpumask(mm), &info); | 797 | flush_tlb_others(mm_cpumask(mm), info); |
761 | 798 | ||
799 | put_flush_tlb_info(); | ||
762 | put_cpu(); | 800 | put_cpu(); |
763 | } | 801 | } |
764 | 802 | ||
@@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info) | |||
787 | 825 | ||
788 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) | 826 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) |
789 | { | 827 | { |
790 | |||
791 | /* Balance as user space task's flush, a bit conservative */ | 828 | /* Balance as user space task's flush, a bit conservative */ |
792 | if (end == TLB_FLUSH_ALL || | 829 | if (end == TLB_FLUSH_ALL || |
793 | (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { | 830 | (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { |
794 | on_each_cpu(do_flush_tlb_all, NULL, 1); | 831 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
795 | } else { | 832 | } else { |
796 | struct flush_tlb_info info; | 833 | struct flush_tlb_info *info; |
797 | info.start = start; | 834 | |
798 | info.end = end; | 835 | preempt_disable(); |
799 | on_each_cpu(do_kernel_range_flush, &info, 1); | 836 | info = get_flush_tlb_info(NULL, start, end, 0, false, 0); |
837 | |||
838 | on_each_cpu(do_kernel_range_flush, info, 1); | ||
839 | |||
840 | put_flush_tlb_info(); | ||
841 | preempt_enable(); | ||
800 | } | 842 | } |
801 | } | 843 | } |
802 | 844 | ||
845 | /* | ||
846 | * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. | ||
847 | * This means that the 'struct flush_tlb_info' that describes which mappings to | ||
848 | * flush is actually fixed. We therefore set a single fixed struct and use it in | ||
849 | * arch_tlbbatch_flush(). | ||
850 | */ | ||
851 | static const struct flush_tlb_info full_flush_tlb_info = { | ||
852 | .mm = NULL, | ||
853 | .start = 0, | ||
854 | .end = TLB_FLUSH_ALL, | ||
855 | }; | ||
856 | |||
803 | void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) | 857 | void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) |
804 | { | 858 | { |
805 | struct flush_tlb_info info = { | ||
806 | .mm = NULL, | ||
807 | .start = 0UL, | ||
808 | .end = TLB_FLUSH_ALL, | ||
809 | }; | ||
810 | |||
811 | int cpu = get_cpu(); | 859 | int cpu = get_cpu(); |
812 | 860 | ||
813 | if (cpumask_test_cpu(cpu, &batch->cpumask)) { | 861 | if (cpumask_test_cpu(cpu, &batch->cpumask)) { |
814 | VM_WARN_ON(irqs_disabled()); | 862 | lockdep_assert_irqs_enabled(); |
815 | local_irq_disable(); | 863 | local_irq_disable(); |
816 | flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); | 864 | flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); |
817 | local_irq_enable(); | 865 | local_irq_enable(); |
818 | } | 866 | } |
819 | 867 | ||
820 | if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) | 868 | if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) |
821 | flush_tlb_others(&batch->cpumask, &info); | 869 | flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); |
822 | 870 | ||
823 | cpumask_clear(&batch->cpumask); | 871 | cpumask_clear(&batch->cpumask); |
824 | 872 | ||
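Taken together, these tlb.c changes move flush_tlb_info off the stack into a single per-CPU slot bracketed by get_flush_tlb_info()/put_flush_tlb_info(), with a DEBUG_VM counter catching reentrant use. Preemption must stay disabled for the whole sequence; a condensed sketch of the pattern as used by flush_tlb_mm_range() above:

    cpu = get_cpu();                               /* pins the per-CPU slot */
    info = get_flush_tlb_info(mm, start, end, stride_shift,
                              freed_tables, new_tlb_gen);
    if (mm == this_cpu_read(cpu_tlbstate.loaded_mm))
            flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);  /* IRQs off in the real code */
    if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
            flush_tlb_others(mm_cpumask(mm), info);
    put_flush_tlb_info();                          /* slot may be reused only after this */
    put_cpu();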
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index a21e1734fc1f..beb44e22afdf 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c | |||
@@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
2318 | #elif defined(CONFIG_X86_VSYSCALL_EMULATION) | 2318 | #elif defined(CONFIG_X86_VSYSCALL_EMULATION) |
2319 | case VSYSCALL_PAGE: | 2319 | case VSYSCALL_PAGE: |
2320 | #endif | 2320 | #endif |
2321 | case FIX_TEXT_POKE0: | ||
2322 | case FIX_TEXT_POKE1: | ||
2323 | /* All local page mappings */ | 2321 | /* All local page mappings */ |
2324 | pte = pfn_pte(phys, prot); | 2322 | pte = pfn_pte(phys, prot); |
2325 | break; | 2323 | break; |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index fa782fba51ee..75d9d68a6de7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
1126 | static inline void init_espfix_bsp(void) { } | 1126 | static inline void init_espfix_bsp(void) { } |
1127 | #endif | 1127 | #endif |
1128 | 1128 | ||
1129 | extern void __init pgd_cache_init(void); | ||
1130 | |||
1129 | #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED | 1131 | #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED |
1130 | static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) | 1132 | static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) |
1131 | { | 1133 | { |
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b9edc7608d90..480e5b2a5748 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h | |||
@@ -21,6 +21,15 @@ | |||
21 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
22 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
23 | 23 | ||
24 | /* | ||
25 | * Blindly accessing user memory from NMI context can be dangerous | ||
26 | * if we're in the middle of switching the current user task or switching | ||
27 | * the loaded mm. | ||
28 | */ | ||
29 | #ifndef nmi_uaccess_okay | ||
30 | # define nmi_uaccess_okay() true | ||
31 | #endif | ||
32 | |||
24 | #ifdef CONFIG_MMU | 33 | #ifdef CONFIG_MMU |
25 | 34 | ||
26 | /* | 35 | /* |
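The #ifndef above is the usual self-named-macro override hook: the generic default is a no-op 'true', and an architecture that can be caught mid-switch of the loaded mm (x86 in this series) supplies a real check and defines the macro to its own name so the default is skipped. A sketch of the shape such an override takes; the body is an assumption, not the exact x86 code:

    static inline bool nmi_uaccess_okay(void)
    {
            /* refuse user accesses while a foreign or temporary mm is loaded */
            return this_cpu_read(cpu_tlbstate.loaded_mm) == current->mm;
    }
    #define nmi_uaccess_okay nmi_uaccess_okay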
diff --git a/include/linux/filter.h b/include/linux/filter.h index 6074aa064b54..7d3abde3f183 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/set_memory.h> | 20 | #include <linux/set_memory.h> |
21 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
22 | #include <linux/if_vlan.h> | 22 | #include <linux/if_vlan.h> |
23 | #include <linux/vmalloc.h> | ||
23 | 24 | ||
24 | #include <net/sch_generic.h> | 25 | #include <net/sch_generic.h> |
25 | 26 | ||
@@ -503,7 +504,6 @@ struct bpf_prog { | |||
503 | u16 pages; /* Number of allocated pages */ | 504 | u16 pages; /* Number of allocated pages */ |
504 | u16 jited:1, /* Is our filter JIT'ed? */ | 505 | u16 jited:1, /* Is our filter JIT'ed? */ |
505 | jit_requested:1,/* archs need to JIT the prog */ | 506 | jit_requested:1,/* archs need to JIT the prog */ |
506 | undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ | ||
507 | gpl_compatible:1, /* Is filter GPL compatible? */ | 507 | gpl_compatible:1, /* Is filter GPL compatible? */ |
508 | cb_access:1, /* Is control block accessed? */ | 508 | cb_access:1, /* Is control block accessed? */ |
509 | dst_needed:1, /* Do we need dst entry? */ | 509 | dst_needed:1, /* Do we need dst entry? */ |
@@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) | |||
733 | 733 | ||
734 | static inline void bpf_prog_lock_ro(struct bpf_prog *fp) | 734 | static inline void bpf_prog_lock_ro(struct bpf_prog *fp) |
735 | { | 735 | { |
736 | fp->undo_set_mem = 1; | 736 | set_vm_flush_reset_perms(fp); |
737 | set_memory_ro((unsigned long)fp, fp->pages); | 737 | set_memory_ro((unsigned long)fp, fp->pages); |
738 | } | 738 | } |
739 | 739 | ||
740 | static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) | ||
741 | { | ||
742 | if (fp->undo_set_mem) | ||
743 | set_memory_rw((unsigned long)fp, fp->pages); | ||
744 | } | ||
745 | |||
746 | static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) | 740 | static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) |
747 | { | 741 | { |
742 | set_vm_flush_reset_perms(hdr); | ||
748 | set_memory_ro((unsigned long)hdr, hdr->pages); | 743 | set_memory_ro((unsigned long)hdr, hdr->pages); |
749 | } | 744 | set_memory_x((unsigned long)hdr, hdr->pages); |
750 | |||
751 | static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) | ||
752 | { | ||
753 | set_memory_rw((unsigned long)hdr, hdr->pages); | ||
754 | } | 745 | } |
755 | 746 | ||
756 | static inline struct bpf_binary_header * | 747 | static inline struct bpf_binary_header * |
@@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp); | |||
788 | 779 | ||
789 | static inline void bpf_prog_unlock_free(struct bpf_prog *fp) | 780 | static inline void bpf_prog_unlock_free(struct bpf_prog *fp) |
790 | { | 781 | { |
791 | bpf_prog_unlock_ro(fp); | ||
792 | __bpf_prog_free(fp); | 782 | __bpf_prog_free(fp); |
793 | } | 783 | } |
794 | 784 | ||
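With the JIT image tagged VM_FLUSH_RESET_PERMS, the dropped bpf_prog_unlock_ro()/bpf_jit_binary_unlock_ro() helpers are no longer needed: a plain free of such an area resets the direct map and flushes the TLB (see the mm/vmalloc.c hunks below). The new lifecycle, condensed from the helpers above; in the real code the final free happens indirectly via bpf_jit_binary_free():

    set_vm_flush_reset_perms(hdr);                   /* tag the vmalloc area */
    set_memory_ro((unsigned long)hdr, hdr->pages);
    set_memory_x((unsigned long)hdr, hdr->pages);
    /* ... attach and run the JITed program ... */
    bpf_jit_binary_free(hdr);                        /* no set_memory_rw() beforehand */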
diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b10c21630f5..083d7b4863ed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages, | |||
2610 | int enable) { } | 2610 | int enable) { } |
2611 | #endif | 2611 | #endif |
2612 | 2612 | ||
2613 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
2614 | extern bool _debug_pagealloc_enabled; | 2613 | extern bool _debug_pagealloc_enabled; |
2615 | extern void __kernel_map_pages(struct page *page, int numpages, int enable); | ||
2616 | 2614 | ||
2617 | static inline bool debug_pagealloc_enabled(void) | 2615 | static inline bool debug_pagealloc_enabled(void) |
2618 | { | 2616 | { |
2619 | return _debug_pagealloc_enabled; | 2617 | return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; |
2620 | } | 2618 | } |
2621 | 2619 | ||
2620 | #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) | ||
2621 | extern void __kernel_map_pages(struct page *page, int numpages, int enable); | ||
2622 | |||
2622 | static inline void | 2623 | static inline void |
2623 | kernel_map_pages(struct page *page, int numpages, int enable) | 2624 | kernel_map_pages(struct page *page, int numpages, int enable) |
2624 | { | 2625 | { |
2625 | if (!debug_pagealloc_enabled()) | ||
2626 | return; | ||
2627 | |||
2628 | __kernel_map_pages(page, numpages, enable); | 2626 | __kernel_map_pages(page, numpages, enable); |
2629 | } | 2627 | } |
2630 | #ifdef CONFIG_HIBERNATION | 2628 | #ifdef CONFIG_HIBERNATION |
2631 | extern bool kernel_page_present(struct page *page); | 2629 | extern bool kernel_page_present(struct page *page); |
2632 | #endif /* CONFIG_HIBERNATION */ | 2630 | #endif /* CONFIG_HIBERNATION */ |
2633 | #else /* CONFIG_DEBUG_PAGEALLOC */ | 2631 | #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ |
2634 | static inline void | 2632 | static inline void |
2635 | kernel_map_pages(struct page *page, int numpages, int enable) {} | 2633 | kernel_map_pages(struct page *page, int numpages, int enable) {} |
2636 | #ifdef CONFIG_HIBERNATION | 2634 | #ifdef CONFIG_HIBERNATION |
2637 | static inline bool kernel_page_present(struct page *page) { return true; } | 2635 | static inline bool kernel_page_present(struct page *page) { return true; } |
2638 | #endif /* CONFIG_HIBERNATION */ | 2636 | #endif /* CONFIG_HIBERNATION */ |
2639 | static inline bool debug_pagealloc_enabled(void) | 2637 | #endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ |
2640 | { | ||
2641 | return false; | ||
2642 | } | ||
2643 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | ||
2644 | 2638 | ||
2645 | #ifdef __HAVE_ARCH_GATE_AREA | 2639 | #ifdef __HAVE_ARCH_GATE_AREA |
2646 | extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); | 2640 | extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); |
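Note that kernel_map_pages() no longer hides the debug_pagealloc_enabled() test, because it is now also built for CONFIG_ARCH_HAS_SET_DIRECT_MAP kernels where debug page-alloc may be off. Callers that only want the debugging behaviour add the guard themselves, as the mm/page_alloc.c hunks further down do; page and order are placeholders:

    if (debug_pagealloc_enabled())          /* compile-time false when the option is off */
            kernel_map_pages(page, 1 << order, 0);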
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 2e97a2227045..f1227f2c38a4 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h | |||
@@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *); | |||
76 | extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); | 76 | extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); |
77 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); | 77 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); |
78 | struct task_struct *fork_idle(int); | 78 | struct task_struct *fork_idle(int); |
79 | struct mm_struct *copy_init_mm(void); | ||
79 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); | 80 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); |
80 | extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); | 81 | extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); |
81 | 82 | ||
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 2a986d282a97..b5071497b8cb 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h | |||
@@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } | |||
17 | static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } | 17 | static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } |
18 | #endif | 18 | #endif |
19 | 19 | ||
20 | #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP | ||
21 | static inline int set_direct_map_invalid_noflush(struct page *page) | ||
22 | { | ||
23 | return 0; | ||
24 | } | ||
25 | static inline int set_direct_map_default_noflush(struct page *page) | ||
26 | { | ||
27 | return 0; | ||
28 | } | ||
29 | #endif | ||
30 | |||
20 | #ifndef set_mce_nospec | 31 | #ifndef set_mce_nospec |
21 | static inline int set_mce_nospec(unsigned long pfn) | 32 | static inline int set_mce_nospec(unsigned long pfn) |
22 | { | 33 | { |
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 103a48a48872..12bf0b68ed92 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h | |||
@@ -115,6 +115,7 @@ struct uprobes_state { | |||
115 | struct xol_area *xol_area; | 115 | struct xol_area *xol_area; |
116 | }; | 116 | }; |
117 | 117 | ||
118 | extern void __init uprobes_init(void); | ||
118 | extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); | 119 | extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); |
119 | extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); | 120 | extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); |
120 | extern bool is_swbp_insn(uprobe_opcode_t *insn); | 121 | extern bool is_swbp_insn(uprobe_opcode_t *insn); |
@@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, | |||
154 | struct uprobes_state { | 155 | struct uprobes_state { |
155 | }; | 156 | }; |
156 | 157 | ||
158 | static inline void uprobes_init(void) | ||
159 | { | ||
160 | } | ||
161 | |||
157 | #define uprobe_get_trap_addr(regs) instruction_pointer(regs) | 162 | #define uprobe_get_trap_addr(regs) instruction_pointer(regs) |
158 | 163 | ||
159 | static inline int | 164 | static inline int |
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 398e9c95cd61..c6eebb839552 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h | |||
@@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */ | |||
21 | #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ | 21 | #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ |
22 | #define VM_NO_GUARD 0x00000040 /* don't add guard page */ | 22 | #define VM_NO_GUARD 0x00000040 /* don't add guard page */ |
23 | #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ | 23 | #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ |
24 | /* | ||
25 | * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with | ||
26 | * vfree_atomic(). | ||
27 | */ | ||
28 | #define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */ | ||
24 | /* bits [20..32] reserved for arch specific ioremap internals */ | 29 | /* bits [20..32] reserved for arch specific ioremap internals */ |
25 | 30 | ||
26 | /* | 31 | /* |
@@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size, | |||
142 | pgprot_t prot, struct page **pages); | 147 | pgprot_t prot, struct page **pages); |
143 | extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); | 148 | extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); |
144 | extern void unmap_kernel_range(unsigned long addr, unsigned long size); | 149 | extern void unmap_kernel_range(unsigned long addr, unsigned long size); |
150 | static inline void set_vm_flush_reset_perms(void *addr) | ||
151 | { | ||
152 | struct vm_struct *vm = find_vm_area(addr); | ||
153 | |||
154 | if (vm) | ||
155 | vm->flags |= VM_FLUSH_RESET_PERMS; | ||
156 | } | ||
145 | #else | 157 | #else |
146 | static inline int | 158 | static inline int |
147 | map_kernel_range_noflush(unsigned long start, unsigned long size, | 159 | map_kernel_range_noflush(unsigned long start, unsigned long size, |
@@ -157,6 +169,9 @@ static inline void | |||
157 | unmap_kernel_range(unsigned long addr, unsigned long size) | 169 | unmap_kernel_range(unsigned long addr, unsigned long size) |
158 | { | 170 | { |
159 | } | 171 | } |
172 | static inline void set_vm_flush_reset_perms(void *addr) | ||
173 | { | ||
174 | } | ||
160 | #endif | 175 | #endif |
161 | 176 | ||
162 | /* Allocate/destroy a 'vmalloc' VM area. */ | 177 | /* Allocate/destroy a 'vmalloc' VM area. */ |
diff --git a/init/main.c b/init/main.c index 7d4025d665eb..9dc2f3b4f753 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void) | |||
504 | 504 | ||
505 | void __init __weak mem_encrypt_init(void) { } | 505 | void __init __weak mem_encrypt_init(void) { } |
506 | 506 | ||
507 | void __init __weak poking_init(void) { } | ||
508 | |||
509 | void __init __weak pgd_cache_init(void) { } | ||
510 | |||
507 | bool initcall_debug; | 511 | bool initcall_debug; |
508 | core_param(initcall_debug, initcall_debug, bool, 0644); | 512 | core_param(initcall_debug, initcall_debug, bool, 0644); |
509 | 513 | ||
@@ -535,6 +539,7 @@ static void __init mm_init(void) | |||
535 | init_espfix_bsp(); | 539 | init_espfix_bsp(); |
536 | /* Should be run after espfix64 is set up. */ | 540 | /* Should be run after espfix64 is set up. */ |
537 | pti_init(); | 541 | pti_init(); |
542 | pgd_cache_init(); | ||
538 | } | 543 | } |
539 | 544 | ||
540 | void __init __weak arch_call_rest_init(void) | 545 | void __init __weak arch_call_rest_init(void) |
@@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
737 | taskstats_init_early(); | 742 | taskstats_init_early(); |
738 | delayacct_init(); | 743 | delayacct_init(); |
739 | 744 | ||
745 | poking_init(); | ||
740 | check_bugs(); | 746 | check_bugs(); |
741 | 747 | ||
742 | acpi_subsystem_init(); | 748 | acpi_subsystem_init(); |
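poking_init() and pgd_cache_init() use the same convention here: init/main.c supplies an empty __weak stub so generic boot code can call them unconditionally, and an architecture overrides the symbol with a strong definition when it has work to do, as arch/x86/mm/pgtable.c does above. In shape:

    /* generic fallback, init/main.c */
    void __init __weak pgd_cache_init(void) { }

    /* strong definition, arch/x86/mm/pgtable.c -- wins at link time */
    void __init pgd_cache_init(void)
    {
            /* set up the pgd kmem_cache (see the hunk above) */
    }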
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..c605397c79f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) | |||
848 | if (fp->jited) { | 848 | if (fp->jited) { |
849 | struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); | 849 | struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); |
850 | 850 | ||
851 | bpf_jit_binary_unlock_ro(hdr); | ||
852 | bpf_jit_binary_free(hdr); | 851 | bpf_jit_binary_free(hdr); |
853 | 852 | ||
854 | WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); | 853 | WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..e6a0d6be87e3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = { | |||
2294 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ | 2294 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ |
2295 | }; | 2295 | }; |
2296 | 2296 | ||
2297 | static int __init init_uprobes(void) | 2297 | void __init uprobes_init(void) |
2298 | { | 2298 | { |
2299 | int i; | 2299 | int i; |
2300 | 2300 | ||
2301 | for (i = 0; i < UPROBES_HASH_SZ; i++) | 2301 | for (i = 0; i < UPROBES_HASH_SZ; i++) |
2302 | mutex_init(&uprobes_mmap_mutex[i]); | 2302 | mutex_init(&uprobes_mmap_mutex[i]); |
2303 | 2303 | ||
2304 | if (percpu_init_rwsem(&dup_mmap_sem)) | 2304 | BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); |
2305 | return -ENOMEM; | ||
2306 | 2305 | ||
2307 | return register_die_notifier(&uprobe_exception_nb); | 2306 | BUG_ON(register_die_notifier(&uprobe_exception_nb)); |
2308 | } | 2307 | } |
2309 | __initcall(init_uprobes); | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..fbe9dfcd8680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -815,6 +815,7 @@ void __init fork_init(void) | |||
815 | #endif | 815 | #endif |
816 | 816 | ||
817 | lockdep_init_task(&init_task); | 817 | lockdep_init_task(&init_task); |
818 | uprobes_init(); | ||
818 | } | 819 | } |
819 | 820 | ||
820 | int __weak arch_dup_task_struct(struct task_struct *dst, | 821 | int __weak arch_dup_task_struct(struct task_struct *dst, |
@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
1298 | complete_vfork_done(tsk); | 1299 | complete_vfork_done(tsk); |
1299 | } | 1300 | } |
1300 | 1301 | ||
1301 | /* | 1302 | /** |
1302 | * Allocate a new mm structure and copy contents from the | 1303 | * dup_mm() - duplicates an existing mm structure |
1303 | * mm structure of the passed in task structure. | 1304 | * @tsk: the task_struct with which the new mm will be associated. |
1305 | * @oldmm: the mm to duplicate. | ||
1306 | * | ||
1307 | * Allocates a new mm structure and duplicates the provided @oldmm structure | ||
1308 | * content into it. | ||
1309 | * | ||
1310 | * Return: the duplicated mm or NULL on failure. | ||
1304 | */ | 1311 | */ |
1305 | static struct mm_struct *dup_mm(struct task_struct *tsk) | 1312 | static struct mm_struct *dup_mm(struct task_struct *tsk, |
1313 | struct mm_struct *oldmm) | ||
1306 | { | 1314 | { |
1307 | struct mm_struct *mm, *oldmm = current->mm; | 1315 | struct mm_struct *mm; |
1308 | int err; | 1316 | int err; |
1309 | 1317 | ||
1310 | mm = allocate_mm(); | 1318 | mm = allocate_mm(); |
@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
1371 | } | 1379 | } |
1372 | 1380 | ||
1373 | retval = -ENOMEM; | 1381 | retval = -ENOMEM; |
1374 | mm = dup_mm(tsk); | 1382 | mm = dup_mm(tsk, current->mm); |
1375 | if (!mm) | 1383 | if (!mm) |
1376 | goto fail_nomem; | 1384 | goto fail_nomem; |
1377 | 1385 | ||
@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu) | |||
2186 | return task; | 2194 | return task; |
2187 | } | 2195 | } |
2188 | 2196 | ||
2197 | struct mm_struct *copy_init_mm(void) | ||
2198 | { | ||
2199 | return dup_mm(NULL, &init_mm); | ||
2200 | } | ||
2201 | |||
2189 | /* | 2202 | /* |
2190 | * Ok, this is the main fork-routine. | 2203 | * Ok, this is the main fork-routine. |
2191 | * | 2204 | * |
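copy_init_mm() is just dup_mm(NULL, &init_mm): it hands back a private mm that starts out as a copy of init_mm, presumably feeding the temporary text-poking mm set up elsewhere in this merge. A hypothetical caller sketch; the error handling is an assumption:

    struct mm_struct *mm = copy_init_mm();

    if (!mm)
            return -ENOMEM;         /* dup_mm() returns NULL on allocation failure */
    /* ... use mm as a scratch kernel address space ... */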
diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); | |||
98 | EXPORT_SYMBOL_GPL(module_mutex); | 98 | EXPORT_SYMBOL_GPL(module_mutex); |
99 | static LIST_HEAD(modules); | 99 | static LIST_HEAD(modules); |
100 | 100 | ||
101 | /* Work queue for freeing init sections in success case */ | ||
102 | static struct work_struct init_free_wq; | ||
103 | static struct llist_head init_free_list; | ||
104 | |||
101 | #ifdef CONFIG_MODULES_TREE_LOOKUP | 105 | #ifdef CONFIG_MODULES_TREE_LOOKUP |
102 | 106 | ||
103 | /* | 107 | /* |
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init) | |||
1949 | if (!rodata_enabled) | 1953 | if (!rodata_enabled) |
1950 | return; | 1954 | return; |
1951 | 1955 | ||
1956 | set_vm_flush_reset_perms(mod->core_layout.base); | ||
1957 | set_vm_flush_reset_perms(mod->init_layout.base); | ||
1952 | frob_text(&mod->core_layout, set_memory_ro); | 1958 | frob_text(&mod->core_layout, set_memory_ro); |
1959 | frob_text(&mod->core_layout, set_memory_x); | ||
1960 | |||
1953 | frob_rodata(&mod->core_layout, set_memory_ro); | 1961 | frob_rodata(&mod->core_layout, set_memory_ro); |
1962 | |||
1954 | frob_text(&mod->init_layout, set_memory_ro); | 1963 | frob_text(&mod->init_layout, set_memory_ro); |
1964 | frob_text(&mod->init_layout, set_memory_x); | ||
1965 | |||
1955 | frob_rodata(&mod->init_layout, set_memory_ro); | 1966 | frob_rodata(&mod->init_layout, set_memory_ro); |
1956 | 1967 | ||
1957 | if (after_init) | 1968 | if (after_init) |
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) | |||
1967 | frob_writable_data(&mod->init_layout, set_memory_nx); | 1978 | frob_writable_data(&mod->init_layout, set_memory_nx); |
1968 | } | 1979 | } |
1969 | 1980 | ||
1970 | static void module_disable_nx(const struct module *mod) | ||
1971 | { | ||
1972 | frob_rodata(&mod->core_layout, set_memory_x); | ||
1973 | frob_ro_after_init(&mod->core_layout, set_memory_x); | ||
1974 | frob_writable_data(&mod->core_layout, set_memory_x); | ||
1975 | frob_rodata(&mod->init_layout, set_memory_x); | ||
1976 | frob_writable_data(&mod->init_layout, set_memory_x); | ||
1977 | } | ||
1978 | |||
1979 | /* Iterate through all modules and set each module's text as RW */ | 1981 | /* Iterate through all modules and set each module's text as RW */ |
1980 | void set_all_modules_text_rw(void) | 1982 | void set_all_modules_text_rw(void) |
1981 | { | 1983 | { |
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void) | |||
2019 | } | 2021 | } |
2020 | mutex_unlock(&module_mutex); | 2022 | mutex_unlock(&module_mutex); |
2021 | } | 2023 | } |
2022 | |||
2023 | static void disable_ro_nx(const struct module_layout *layout) | ||
2024 | { | ||
2025 | if (rodata_enabled) { | ||
2026 | frob_text(layout, set_memory_rw); | ||
2027 | frob_rodata(layout, set_memory_rw); | ||
2028 | frob_ro_after_init(layout, set_memory_rw); | ||
2029 | } | ||
2030 | frob_rodata(layout, set_memory_x); | ||
2031 | frob_ro_after_init(layout, set_memory_x); | ||
2032 | frob_writable_data(layout, set_memory_x); | ||
2033 | } | ||
2034 | |||
2035 | #else | 2024 | #else |
2036 | static void disable_ro_nx(const struct module_layout *layout) { } | ||
2037 | static void module_enable_nx(const struct module *mod) { } | 2025 | static void module_enable_nx(const struct module *mod) { } |
2038 | static void module_disable_nx(const struct module *mod) { } | ||
2039 | #endif | 2026 | #endif |
2040 | 2027 | ||
2041 | #ifdef CONFIG_LIVEPATCH | 2028 | #ifdef CONFIG_LIVEPATCH |
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod) | |||
2115 | 2102 | ||
2116 | void __weak module_memfree(void *module_region) | 2103 | void __weak module_memfree(void *module_region) |
2117 | { | 2104 | { |
2105 | /* | ||
2106 | * This memory may be RO, and freeing RO memory in an interrupt is not | ||
2107 | * supported by vmalloc. | ||
2108 | */ | ||
2109 | WARN_ON(in_interrupt()); | ||
2118 | vfree(module_region); | 2110 | vfree(module_region); |
2119 | } | 2111 | } |
2120 | 2112 | ||
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod) | |||
2166 | mutex_unlock(&module_mutex); | 2158 | mutex_unlock(&module_mutex); |
2167 | 2159 | ||
2168 | /* This may be empty, but that's OK */ | 2160 | /* This may be empty, but that's OK */ |
2169 | disable_ro_nx(&mod->init_layout); | ||
2170 | module_arch_freeing_init(mod); | 2161 | module_arch_freeing_init(mod); |
2171 | module_memfree(mod->init_layout.base); | 2162 | module_memfree(mod->init_layout.base); |
2172 | kfree(mod->args); | 2163 | kfree(mod->args); |
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod) | |||
2176 | lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); | 2167 | lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); |
2177 | 2168 | ||
2178 | /* Finally, free the core (containing the module structure) */ | 2169 | /* Finally, free the core (containing the module structure) */ |
2179 | disable_ro_nx(&mod->core_layout); | ||
2180 | module_memfree(mod->core_layout.base); | 2170 | module_memfree(mod->core_layout.base); |
2181 | } | 2171 | } |
2182 | 2172 | ||
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) | |||
3415 | 3405 | ||
3416 | /* For freeing module_init on success, in case kallsyms traversing */ | 3406 | /* For freeing module_init on success, in case kallsyms traversing */ |
3417 | struct mod_initfree { | 3407 | struct mod_initfree { |
3418 | struct rcu_head rcu; | 3408 | struct llist_node node; |
3419 | void *module_init; | 3409 | void *module_init; |
3420 | }; | 3410 | }; |
3421 | 3411 | ||
3422 | static void do_free_init(struct rcu_head *head) | 3412 | static void do_free_init(struct work_struct *w) |
3423 | { | 3413 | { |
3424 | struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); | 3414 | struct llist_node *pos, *n, *list; |
3425 | module_memfree(m->module_init); | 3415 | struct mod_initfree *initfree; |
3426 | kfree(m); | 3416 | |
3417 | list = llist_del_all(&init_free_list); | ||
3418 | |||
3419 | synchronize_rcu(); | ||
3420 | |||
3421 | llist_for_each_safe(pos, n, list) { | ||
3422 | initfree = container_of(pos, struct mod_initfree, node); | ||
3423 | module_memfree(initfree->module_init); | ||
3424 | kfree(initfree); | ||
3425 | } | ||
3427 | } | 3426 | } |
3428 | 3427 | ||
3428 | static int __init modules_wq_init(void) | ||
3429 | { | ||
3430 | INIT_WORK(&init_free_wq, do_free_init); | ||
3431 | init_llist_head(&init_free_list); | ||
3432 | return 0; | ||
3433 | } | ||
3434 | module_init(modules_wq_init); | ||
3435 | |||
3429 | /* | 3436 | /* |
3430 | * This is where the real work happens. | 3437 | * This is where the real work happens. |
3431 | * | 3438 | * |
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) | |||
3502 | #endif | 3509 | #endif |
3503 | module_enable_ro(mod, true); | 3510 | module_enable_ro(mod, true); |
3504 | mod_tree_remove_init(mod); | 3511 | mod_tree_remove_init(mod); |
3505 | disable_ro_nx(&mod->init_layout); | ||
3506 | module_arch_freeing_init(mod); | 3512 | module_arch_freeing_init(mod); |
3507 | mod->init_layout.base = NULL; | 3513 | mod->init_layout.base = NULL; |
3508 | mod->init_layout.size = 0; | 3514 | mod->init_layout.size = 0; |
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) | |||
3513 | * We want to free module_init, but be aware that kallsyms may be | 3519 | * We want to free module_init, but be aware that kallsyms may be |
3514 | * walking this with preempt disabled. In all the failure paths, we | 3520 | * walking this with preempt disabled. In all the failure paths, we |
3515 | * call synchronize_rcu(), but we don't want to slow down the success | 3521 | * call synchronize_rcu(), but we don't want to slow down the success |
3516 | * path, so use actual RCU here. | 3522 | * path. module_memfree() cannot be called in an interrupt, so do the |
3523 | * work and call synchronize_rcu() in a work queue. | ||
3524 | * | ||
3517 | * Note that module_alloc() on most architectures creates W+X page | 3525 | * Note that module_alloc() on most architectures creates W+X page |
3518 | * mappings which won't be cleaned up until do_free_init() runs. Any | 3526 | * mappings which won't be cleaned up until do_free_init() runs. Any |
3519 | * code such as mark_rodata_ro() which depends on those mappings to | 3527 | * code such as mark_rodata_ro() which depends on those mappings to |
3520 | * be cleaned up needs to sync with the queued work - ie | 3528 | * be cleaned up needs to sync with the queued work - ie |
3521 | * rcu_barrier() | 3529 | * rcu_barrier() |
3522 | */ | 3530 | */ |
3523 | call_rcu(&freeinit->rcu, do_free_init); | 3531 | if (llist_add(&freeinit->node, &init_free_list)) |
3532 | schedule_work(&init_free_wq); | ||
3533 | |||
3524 | mutex_unlock(&module_mutex); | 3534 | mutex_unlock(&module_mutex); |
3525 | wake_up_all(&module_wq); | 3535 | wake_up_all(&module_wq); |
3526 | 3536 | ||
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3817 | module_bug_cleanup(mod); | 3827 | module_bug_cleanup(mod); |
3818 | mutex_unlock(&module_mutex); | 3828 | mutex_unlock(&module_mutex); |
3819 | 3829 | ||
3820 | /* we can't deallocate the module until we clear memory protection */ | ||
3821 | module_disable_ro(mod); | ||
3822 | module_disable_nx(mod); | ||
3823 | |||
3824 | ddebug_cleanup: | 3830 | ddebug_cleanup: |
3825 | ftrace_release_mod(mod); | 3831 | ftrace_release_mod(mod); |
3826 | dynamic_debug_remove(mod, info->debug); | 3832 | dynamic_debug_remove(mod, info->debug); |
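The reworked init freeing relies on llist_add() returning true only when the node went onto an empty list, so schedule_work() fires once per batch; do_free_init() then drains the whole batch with llist_del_all() and pays for a single synchronize_rcu() before freeing. Producer side, condensed from do_init_module() above:

    freeinit->module_init = mod->init_layout.base;
    if (llist_add(&freeinit->node, &init_free_list))   /* list was empty? */
            schedule_work(&init_free_wq);              /* one kick per batch */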
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) | |||
1342 | * safe_copy_page - Copy a page in a safe way. | 1342 | * safe_copy_page - Copy a page in a safe way. |
1343 | * | 1343 | * |
1344 | * Check if the page we are going to copy is marked as present in the kernel | 1344 | * Check if the page we are going to copy is marked as present in the kernel |
1345 | * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set | 1345 | * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or |
1346 | * and in that case kernel_page_present() always returns 'true'). | 1346 | * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() |
1347 | * always returns 'true'. | ||
1347 | */ | 1348 | */ |
1348 | static void safe_copy_page(void *dst, struct page *s_page) | 1349 | static void safe_copy_page(void *dst, struct page *s_page) |
1349 | { | 1350 | { |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/error-injection.h> | 15 | #include <linux/error-injection.h> |
16 | 16 | ||
17 | #include <asm/tlb.h> | ||
18 | |||
17 | #include "trace_probe.h" | 19 | #include "trace_probe.h" |
18 | #include "trace.h" | 20 | #include "trace.h" |
19 | 21 | ||
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
163 | * access_ok() should prevent writing to non-user memory, but in | 165 | * access_ok() should prevent writing to non-user memory, but in |
164 | * some situations (nommu, temporary switch, etc) access_ok() does | 166 | * some situations (nommu, temporary switch, etc) access_ok() does |
165 | * not provide enough validation, hence the check on KERNEL_DS. | 167 | * not provide enough validation, hence the check on KERNEL_DS. |
168 | * | ||
169 | * nmi_uaccess_okay() ensures the probe is not run in an interim | ||
170 | * state, when the task or mm are switched. This is specifically | ||
171 | * required to prevent the use of temporary mm. | ||
166 | */ | 172 | */ |
167 | 173 | ||
168 | if (unlikely(in_interrupt() || | 174 | if (unlikely(in_interrupt() || |
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
170 | return -EPERM; | 176 | return -EPERM; |
171 | if (unlikely(uaccess_kernel())) | 177 | if (unlikely(uaccess_kernel())) |
172 | return -EPERM; | 178 | return -EPERM; |
179 | if (unlikely(!nmi_uaccess_okay())) | ||
180 | return -EPERM; | ||
173 | if (!access_ok(unsafe_ptr, size)) | 181 | if (!access_ok(unsafe_ptr, size)) |
174 | return -EPERM; | 182 | return -EPERM; |
175 | 183 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c02cff1ed56e..59661106da16 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
1144 | } | 1144 | } |
1145 | arch_free_page(page, order); | 1145 | arch_free_page(page, order); |
1146 | kernel_poison_pages(page, 1 << order, 0); | 1146 | kernel_poison_pages(page, 1 << order, 0); |
1147 | kernel_map_pages(page, 1 << order, 0); | 1147 | if (debug_pagealloc_enabled()) |
1148 | kernel_map_pages(page, 1 << order, 0); | ||
1149 | |||
1148 | kasan_free_nondeferred_pages(page, order); | 1150 | kasan_free_nondeferred_pages(page, order); |
1149 | 1151 | ||
1150 | return true; | 1152 | return true; |
@@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, | |||
2014 | set_page_refcounted(page); | 2016 | set_page_refcounted(page); |
2015 | 2017 | ||
2016 | arch_alloc_page(page, order); | 2018 | arch_alloc_page(page, order); |
2017 | kernel_map_pages(page, 1 << order, 1); | 2019 | if (debug_pagealloc_enabled()) |
2020 | kernel_map_pages(page, 1 << order, 1); | ||
2018 | kasan_alloc_pages(page, order); | 2021 | kasan_alloc_pages(page, order); |
2019 | kernel_poison_pages(page, 1 << order, 1); | 2022 | kernel_poison_pages(page, 1 << order, 1); |
2020 | set_page_owner(page, order, gfp_flags); | 2023 | set_page_owner(page, order, gfp_flags); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e86ba6e74b50..e5e9e1fcac01 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
19 | #include <linux/proc_fs.h> | 19 | #include <linux/proc_fs.h> |
20 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
21 | #include <linux/set_memory.h> | ||
21 | #include <linux/debugobjects.h> | 22 | #include <linux/debugobjects.h> |
22 | #include <linux/kallsyms.h> | 23 | #include <linux/kallsyms.h> |
23 | #include <linux/list.h> | 24 | #include <linux/list.h> |
@@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size) | |||
1059 | spin_unlock(&vb->lock); | 1060 | spin_unlock(&vb->lock); |
1060 | } | 1061 | } |
1061 | 1062 | ||
1062 | /** | 1063 | static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) |
1063 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
1064 | * | ||
1065 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
1066 | * to amortize TLB flushing overheads. What this means is that any page you | ||
1067 | * have now, may, in a former life, have been mapped into kernel virtual | ||
1068 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
1069 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
1070 | * | ||
1071 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
1072 | * be sure that none of the pages we have control over will have any aliases | ||
1073 | * from the vmap layer. | ||
1074 | */ | ||
1075 | void vm_unmap_aliases(void) | ||
1076 | { | 1064 | { |
1077 | unsigned long start = ULONG_MAX, end = 0; | ||
1078 | int cpu; | 1065 | int cpu; |
1079 | int flush = 0; | ||
1080 | 1066 | ||
1081 | if (unlikely(!vmap_initialized)) | 1067 | if (unlikely(!vmap_initialized)) |
1082 | return; | 1068 | return; |
@@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void) | |||
1113 | flush_tlb_kernel_range(start, end); | 1099 | flush_tlb_kernel_range(start, end); |
1114 | mutex_unlock(&vmap_purge_lock); | 1100 | mutex_unlock(&vmap_purge_lock); |
1115 | } | 1101 | } |
1102 | |||
1103 | /** | ||
1104 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
1105 | * | ||
1106 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
1107 | * to amortize TLB flushing overheads. What this means is that any page you | ||
1108 | * have now, may, in a former life, have been mapped into kernel virtual | ||
1109 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
1110 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
1111 | * | ||
1112 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
1113 | * be sure that none of the pages we have control over will have any aliases | ||
1114 | * from the vmap layer. | ||
1115 | */ | ||
1116 | void vm_unmap_aliases(void) | ||
1117 | { | ||
1118 | unsigned long start = ULONG_MAX, end = 0; | ||
1119 | int flush = 0; | ||
1120 | |||
1121 | _vm_unmap_aliases(start, end, flush); | ||
1122 | } | ||
1116 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | 1123 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); |
1117 | 1124 | ||
1118 | /** | 1125 | /** |
@@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1505 | return NULL; | 1512 | return NULL; |
1506 | } | 1513 | } |
1507 | 1514 | ||
1515 | static inline void set_area_direct_map(const struct vm_struct *area, | ||
1516 | int (*set_direct_map)(struct page *page)) | ||
1517 | { | ||
1518 | int i; | ||
1519 | |||
1520 | for (i = 0; i < area->nr_pages; i++) | ||
1521 | if (page_address(area->pages[i])) | ||
1522 | set_direct_map(area->pages[i]); | ||
1523 | } | ||
1524 | |||
1525 | /* Handle removing and resetting vm mappings related to the vm_struct. */ | ||
1526 | static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) | ||
1527 | { | ||
1528 | unsigned long addr = (unsigned long)area->addr; | ||
1529 | unsigned long start = ULONG_MAX, end = 0; | ||
1530 | int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; | ||
1531 | int i; | ||
1532 | |||
1533 | /* | ||
1534 | * The below block can be removed when all architectures that have | ||
1535 | * direct map permissions also have set_direct_map_() implementations. | ||
1536 | * This is concerned with resetting the direct map of any vm alias with | ||
1537 | * execute permissions, without leaving a RW+X window. | ||
1538 | */ | ||
1539 | if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { | ||
1540 | set_memory_nx(addr, area->nr_pages); | ||
1541 | set_memory_rw(addr, area->nr_pages); | ||
1542 | } | ||
1543 | |||
1544 | remove_vm_area(area->addr); | ||
1545 | |||
1546 | /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ | ||
1547 | if (!flush_reset) | ||
1548 | return; | ||
1549 | |||
1550 | /* | ||
1551 | * If not deallocating pages, just do the flush of the VM area and | ||
1552 | * return. | ||
1553 | */ | ||
1554 | if (!deallocate_pages) { | ||
1555 | vm_unmap_aliases(); | ||
1556 | return; | ||
1557 | } | ||
1558 | |||
1559 | /* | ||
1560 | * If execution gets here, flush the vm mapping and reset the direct | ||
1561 | * map. Find the start and end range of the direct mappings to make sure | ||
1562 | * the vm_unmap_aliases() flush includes the direct map. | ||
1563 | */ | ||
1564 | for (i = 0; i < area->nr_pages; i++) { | ||
1565 | if (page_address(area->pages[i])) { | ||
1566 | start = min(addr, start); | ||
1567 | end = max(addr, end); | ||
1568 | } | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * Set direct map to something invalid so that it won't be cached if | ||
1573 | * there are any accesses after the TLB flush, then flush the TLB and | ||
1574 | * reset the direct map permissions to the default. | ||
1575 | */ | ||
1576 | set_area_direct_map(area, set_direct_map_invalid_noflush); | ||
1577 | _vm_unmap_aliases(start, end, 1); | ||
1578 | set_area_direct_map(area, set_direct_map_default_noflush); | ||
1579 | } | ||
1580 | |||
1508 | static void __vunmap(const void *addr, int deallocate_pages) | 1581 | static void __vunmap(const void *addr, int deallocate_pages) |
1509 | { | 1582 | { |
1510 | struct vm_struct *area; | 1583 | struct vm_struct *area; |
@@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1526 | debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); | 1599 | debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); |
1527 | debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); | 1600 | debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); |
1528 | 1601 | ||
1529 | remove_vm_area(addr); | 1602 | vm_remove_mappings(area, deallocate_pages); |
1603 | |||
1530 | if (deallocate_pages) { | 1604 | if (deallocate_pages) { |
1531 | int i; | 1605 | int i; |
1532 | 1606 | ||
@@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node); | |||
1961 | */ | 2035 | */ |
1962 | void *vmalloc_exec(unsigned long size) | 2036 | void *vmalloc_exec(unsigned long size) |
1963 | { | 2037 | { |
1964 | return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, | 2038 | return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, |
1965 | NUMA_NO_NODE, __builtin_return_address(0)); | 2039 | GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, |
2040 | NUMA_NO_NODE, __builtin_return_address(0)); | ||
1966 | } | 2041 | } |
1967 | 2042 | ||
1968 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 2043 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |