| | | |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-06 19:13:31 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-06 19:13:31 -0400 |
| commit | 0bc40e549aeea2de20fc571749de9bbfc099fb34 | |
| tree | d18f3339bd383a17431fca23b6c5f3e54c93cf2f | |
| parent | e913c4a4c21cd83317fafe63bfdc9d34d2910114 | |
| parent | caa841360134f863987f2d4f77b8dc2fbb7596f8 | |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The changes in here are:
- text_poke() fixes and an extensive set of executability lockdowns,
to (hopefully) eliminate the last residual circumstances under
which we are using W|X mappings even temporarily on x86 kernels.
This required a broad range of surgery in text patching facilities,
module loading, trampoline handling and other bits.
- tweak page fault messages to be more informative and more
structured.
- remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
default.
- reduce KASLR granularity on 5-level paging kernels from 512 GB to
1 GB.
- misc other changes and updates"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
x86/mm: Initialize PGD cache during mm initialization
x86/alternatives: Add comment about module removal races
x86/kprobes: Use vmalloc special flag
x86/ftrace: Use vmalloc special flag
bpf: Use vmalloc special flag
modules: Use vmalloc special flag
mm/vmalloc: Add flag for freeing of special permissions
mm/hibernation: Make hibernation handle unmapped pages
x86/mm/cpa: Add set_direct_map_*() functions
x86/alternatives: Remove the return value of text_poke_*()
x86/jump-label: Remove support for custom text poker
x86/modules: Avoid breaking W^X while loading modules
x86/kprobes: Set instruction page as executable
x86/ftrace: Set trampoline pages as executable
x86/kgdb: Avoid redundant comparison of patched code
x86/alternatives: Use temporary mm for text poking
x86/alternatives: Initialize temporary mm for patching
fork: Provide a function for copying init_mm
uprobes: Initialize uprobes earlier
x86/mm: Save debug registers when loading a temporary mm
...
40 files changed, 711 insertions, 343 deletions
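Most of the text_poke() rework summarized in the first item of the pull message lands in arch/x86/kernel/alternative.c further down. As a reading aid before the per-file diffs, here is a condensed, single-page-only paraphrase of the new __text_poke() flow; the helper name is mine, and error handling, the cross-page case, KASAN toggling and the final memcmp() sanity check are omitted — the hunk below is authoritative.

```c
#include <linux/kernel.h>		/* core_kernel_text() */
#include <linux/mm.h>			/* get_locked_pte(), pte helpers */
#include <linux/mmu_context.h>		/* use_temporary_mm(), as included by alternative.c */
#include <linux/vmalloc.h>		/* vmalloc_to_page() */
#include <asm/text-patching.h>		/* poking_mm, poking_addr */
#include <asm/tlbflush.h>		/* flush_tlb_mm_range() */

/* Illustrative condensation of __text_poke(); not the kernel's actual helper. */
static void *poke_one_page(void *addr, const void *opcode, size_t len)
{
	/* Resolve the page backing the instructions (core kernel vs. module/vmalloc text). */
	struct page *page = core_kernel_text((unsigned long)addr) ?
			    virt_to_page(addr) : vmalloc_to_page(addr);
	/* Non-global PTE, so the flush_tlb_mm_range() below is sufficient. */
	pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
	temp_mm_state_t prev;
	unsigned long flags;
	spinlock_t *ptl;
	pte_t *ptep;

	local_irq_save(flags);

	/* Map the target page at poking_addr, visible only through poking_mm. */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
	set_pte_at(poking_mm, poking_addr, ptep, mk_pte(page, prot));

	/* Switch to the temporary mm, write the new opcode, tear the mapping down. */
	prev = use_temporary_mm(poking_mm);
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	pte_clear(poking_mm, poking_addr, ptep);
	unuse_temporary_mm(prev);

	/* The stale TLB entry belongs to poking_mm only, so no IPIs are needed. */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + PAGE_SIZE,
			   PAGE_SHIFT, false);

	pte_unmap_unlock(ptep, ptl);
	local_irq_restore(flags);

	return addr;
}
```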
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 804f9426ed17..6cbe652d7a49 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
| @@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables | |||
| 72 | Notes: | 72 | Notes: |
| 73 | 73 | ||
| 74 | - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, | 74 | - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, |
| 75 | from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting | 75 | from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting |
| 76 | offset and many of the regions expand to support the much larger physical | 76 | offset and many of the regions expand to support the much larger physical |
| 77 | memory supported. | 77 | memory supported. |
| 78 | 78 | ||
| @@ -83,7 +83,7 @@ Notes: | |||
| 83 | 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm | 83 | 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm |
| 84 | __________________|____________|__________________|_________|___________________________________________________________ | 84 | __________________|____________|__________________|_________|___________________________________________________________ |
| 85 | | | | | | 85 | | | | | |
| 86 | 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical | 86 | 0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical |
| 87 | | | | | virtual memory addresses up to the -64 PB | 87 | | | | | virtual memory addresses up to the -64 PB |
| 88 | | | | | starting offset of kernel mappings. | 88 | | | | | starting offset of kernel mappings. |
| 89 | __________________|____________|__________________|_________|___________________________________________________________ | 89 | __________________|____________|__________________|_________|___________________________________________________________ |
| @@ -99,7 +99,7 @@ ____________________________________________________________|___________________ | |||
| 99 | ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole | 99 | ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole |
| 100 | ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) | 100 | ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) |
| 101 | ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole | 101 | ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole |
| 102 | ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory | 102 | ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory |
| 103 | __________________|____________|__________________|_________|____________________________________________________________ | 103 | __________________|____________|__________________|_________|____________________________________________________________ |
| 104 | | | 104 | | |
| 105 | | Identical layout to the 47-bit one from here on: | 105 | | Identical layout to the 47-bit one from here on: |
diff --git a/arch/Kconfig b/arch/Kconfig
index 3ab446bd12ef..5e43fcbad4ca 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
| @@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE | |||
| 249 | config ARCH_HAS_SET_MEMORY | 249 | config ARCH_HAS_SET_MEMORY |
| 250 | bool | 250 | bool |
| 251 | 251 | ||
| 252 | # Select if arch has all set_direct_map_invalid/default() functions | ||
| 253 | config ARCH_HAS_SET_DIRECT_MAP | ||
| 254 | bool | ||
| 255 | |||
| 252 | # Select if arch init_task must go in the __init_task_data section | 256 | # Select if arch init_task must go in the __init_task_data section |
| 253 | config ARCH_TASK_STRUCT_ON_STACK | 257 | config ARCH_TASK_STRUCT_ON_STACK |
| 254 | bool | 258 | bool |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db95da6d644d..9fc73ca17844 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
| @@ -65,6 +65,7 @@ config X86 | |||
| 65 | select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 | 65 | select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 |
| 66 | select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE | 66 | select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE |
| 67 | select ARCH_HAS_SET_MEMORY | 67 | select ARCH_HAS_SET_MEMORY |
| 68 | select ARCH_HAS_SET_DIRECT_MAP | ||
| 68 | select ARCH_HAS_STRICT_KERNEL_RWX | 69 | select ARCH_HAS_STRICT_KERNEL_RWX |
| 69 | select ARCH_HAS_STRICT_MODULE_RWX | 70 | select ARCH_HAS_STRICT_MODULE_RWX |
| 70 | select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE | 71 | select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE |
| @@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE | |||
| 1592 | depends on X86_32 && !NUMA | 1593 | depends on X86_32 && !NUMA |
| 1593 | 1594 | ||
| 1594 | config ARCH_DISCONTIGMEM_ENABLE | 1595 | config ARCH_DISCONTIGMEM_ENABLE |
| 1595 | def_bool y | 1596 | def_bool n |
| 1596 | depends on NUMA && X86_32 | ||
| 1597 | |||
| 1598 | config ARCH_DISCONTIGMEM_DEFAULT | ||
| 1599 | def_bool y | ||
| 1600 | depends on NUMA && X86_32 | 1597 | depends on NUMA && X86_32 |
| 1598 | depends on BROKEN | ||
| 1601 | 1599 | ||
| 1602 | config ARCH_SPARSEMEM_ENABLE | 1600 | config ARCH_SPARSEMEM_ENABLE |
| 1603 | def_bool y | 1601 | def_bool y |
| @@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE | |||
| 1606 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 | 1604 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 |
| 1607 | 1605 | ||
| 1608 | config ARCH_SPARSEMEM_DEFAULT | 1606 | config ARCH_SPARSEMEM_DEFAULT |
| 1609 | def_bool y | 1607 | def_bool X86_64 || (NUMA && X86_32) |
| 1610 | depends on X86_64 | ||
| 1611 | 1608 | ||
| 1612 | config ARCH_SELECT_MEMORY_MODEL | 1609 | config ARCH_SELECT_MEMORY_MODEL |
| 1613 | def_bool y | 1610 | def_bool y |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 50ba74a34a37..9da8cccdf3fb 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
| @@ -103,8 +103,6 @@ enum fixed_addresses { | |||
| 103 | #ifdef CONFIG_PARAVIRT | 103 | #ifdef CONFIG_PARAVIRT |
| 104 | FIX_PARAVIRT_BOOTMAP, | 104 | FIX_PARAVIRT_BOOTMAP, |
| 105 | #endif | 105 | #endif |
| 106 | FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ | ||
| 107 | FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ | ||
| 108 | #ifdef CONFIG_X86_INTEL_MID | 106 | #ifdef CONFIG_X86_INTEL_MID |
| 109 | FIX_LNW_VRTC, | 107 | FIX_LNW_VRTC, |
| 110 | #endif | 108 | #endif |
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 19d18fae6ec6..93dff1963337 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <asm/tlbflush.h> | 13 | #include <asm/tlbflush.h> |
| 14 | #include <asm/paravirt.h> | 14 | #include <asm/paravirt.h> |
| 15 | #include <asm/mpx.h> | 15 | #include <asm/mpx.h> |
| 16 | #include <asm/debugreg.h> | ||
| 16 | 17 | ||
| 17 | extern atomic64_t last_mm_ctx_id; | 18 | extern atomic64_t last_mm_ctx_id; |
| 18 | 19 | ||
| @@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void) | |||
| 356 | return cr3; | 357 | return cr3; |
| 357 | } | 358 | } |
| 358 | 359 | ||
| 360 | typedef struct { | ||
| 361 | struct mm_struct *mm; | ||
| 362 | } temp_mm_state_t; | ||
| 363 | |||
| 364 | /* | ||
| 365 | * Using a temporary mm allows to set temporary mappings that are not accessible | ||
| 366 | * by other CPUs. Such mappings are needed to perform sensitive memory writes | ||
| 367 | * that override the kernel memory protections (e.g., W^X), without exposing the | ||
| 368 | * temporary page-table mappings that are required for these write operations to | ||
| 369 | * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the | ||
| 370 | * mapping is torn down. | ||
| 371 | * | ||
| 372 | * Context: The temporary mm needs to be used exclusively by a single core. To | ||
| 373 | * harden security IRQs must be disabled while the temporary mm is | ||
| 374 | * loaded, thereby preventing interrupt handler bugs from overriding | ||
| 375 | * the kernel memory protection. | ||
| 376 | */ | ||
| 377 | static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) | ||
| 378 | { | ||
| 379 | temp_mm_state_t temp_state; | ||
| 380 | |||
| 381 | lockdep_assert_irqs_disabled(); | ||
| 382 | temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); | ||
| 383 | switch_mm_irqs_off(NULL, mm, current); | ||
| 384 | |||
| 385 | /* | ||
| 386 | * If breakpoints are enabled, disable them while the temporary mm is | ||
| 387 | * used. Userspace might set up watchpoints on addresses that are used | ||
| 388 | * in the temporary mm, which would lead to wrong signals being sent or | ||
| 389 | * crashes. | ||
| 390 | * | ||
| 391 | * Note that breakpoints are not disabled selectively, which also causes | ||
| 392 | * kernel breakpoints (e.g., perf's) to be disabled. This might be | ||
| 393 | * undesirable, but still seems reasonable as the code that runs in the | ||
| 394 | * temporary mm should be short. | ||
| 395 | */ | ||
| 396 | if (hw_breakpoint_active()) | ||
| 397 | hw_breakpoint_disable(); | ||
| 398 | |||
| 399 | return temp_state; | ||
| 400 | } | ||
| 401 | |||
| 402 | static inline void unuse_temporary_mm(temp_mm_state_t prev_state) | ||
| 403 | { | ||
| 404 | lockdep_assert_irqs_disabled(); | ||
| 405 | switch_mm_irqs_off(NULL, prev_state.mm, current); | ||
| 406 | |||
| 407 | /* | ||
| 408 | * Restore the breakpoints if they were disabled before the temporary mm | ||
| 409 | * was loaded. | ||
| 410 | */ | ||
| 411 | if (hw_breakpoint_active()) | ||
| 412 | hw_breakpoint_restore(); | ||
| 413 | } | ||
| 414 | |||
| 359 | #endif /* _ASM_X86_MMU_CONTEXT_H */ | 415 | #endif /* _ASM_X86_MMU_CONTEXT_H */ |
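A minimal usage sketch of the pair added above, following the contract spelled out in the comment (exclusive to one CPU, IRQs disabled around the switch); the real user is __text_poke() in the alternative.c hunk further down, and the wrapper name here is purely illustrative.

```c
#include <linux/mmu_context.h>		/* use_temporary_mm() / unuse_temporary_mm() */
#include <asm/text-patching.h>		/* poking_mm */

/* Run fn() with poking_mm loaded; illustrative wrapper, not a kernel API. */
static void with_temporary_mm(void (*fn)(void))
{
	temp_mm_state_t prev;
	unsigned long flags;

	local_irq_save(flags);			/* both helpers lockdep-assert IRQs are off */
	prev = use_temporary_mm(poking_mm);	/* load the private mm, remember the old one */

	fn();					/* touch mappings that exist only in poking_mm */

	unuse_temporary_mm(prev);		/* restore the previous mm */
	local_irq_restore(flags);
}
```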
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 50b3e2d963c9..3a221942f805 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
| @@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void) | |||
| 1021 | /* Default trampoline pgd value */ | 1021 | /* Default trampoline pgd value */ |
| 1022 | trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; | 1022 | trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; |
| 1023 | } | 1023 | } |
| 1024 | |||
| 1025 | void __init poking_init(void); | ||
| 1026 | |||
| 1024 | # ifdef CONFIG_RANDOMIZE_MEMORY | 1027 | # ifdef CONFIG_RANDOMIZE_MEMORY |
| 1025 | void __meminit init_trampoline(void); | 1028 | void __meminit init_trampoline(void); |
| 1026 | # else | 1029 | # else |
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 07a25753e85c..ae7b909dc242 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
| @@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages); | |||
| 85 | int set_pages_ro(struct page *page, int numpages); | 85 | int set_pages_ro(struct page *page, int numpages); |
| 86 | int set_pages_rw(struct page *page, int numpages); | 86 | int set_pages_rw(struct page *page, int numpages); |
| 87 | 87 | ||
| 88 | int set_direct_map_invalid_noflush(struct page *page); | ||
| 89 | int set_direct_map_default_noflush(struct page *page); | ||
| 90 | |||
| 88 | extern int kernel_set_to_readonly; | 91 | extern int kernel_set_to_readonly; |
| 89 | void set_kernel_text_rw(void); | 92 | void set_kernel_text_rw(void); |
| 90 | void set_kernel_text_ro(void); | 93 | void set_kernel_text_ro(void); |
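The two declarations added above form the x86 backend of the new ARCH_HAS_SET_DIRECT_MAP hook (selected in the arch/x86/Kconfig hunk earlier and wired to __set_pages_np()/__set_pages_p() in the pageattr.c hunk near the end). A hedged sketch of a caller follows — the helper names are illustrative, and the explicit flush_tlb_kernel_range() stands in for whatever TLB-flush batching a real user such as vmalloc or hibernation would do:

```c
#include <linux/mm.h>			/* page_address() */
#include <asm/set_memory.h>		/* set_direct_map_*_noflush() */
#include <asm/tlbflush.h>		/* flush_tlb_kernel_range() */

/* Drop a page's direct-map alias; illustrative helper, not a kernel API. */
static int hide_page_from_direct_map(struct page *page)
{
	unsigned long addr = (unsigned long)page_address(page);
	int ret;

	/* Mark the direct-map PTE not-present; the caller owns the TLB flush. */
	ret = set_direct_map_invalid_noflush(page);
	if (ret)
		return ret;

	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
	return 0;
}

/* Put the default (present, RW) direct-map entry back before freeing the page. */
static void restore_direct_map(struct page *page)
{
	set_direct_map_default_noflush(page);
}
```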
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..c90678fd391a 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
| @@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, | |||
| 18 | #define __parainstructions_end NULL | 18 | #define __parainstructions_end NULL |
| 19 | #endif | 19 | #endif |
| 20 | 20 | ||
| 21 | extern void *text_poke_early(void *addr, const void *opcode, size_t len); | 21 | extern void text_poke_early(void *addr, const void *opcode, size_t len); |
| 22 | 22 | ||
| 23 | /* | 23 | /* |
| 24 | * Clear and restore the kernel write-protection flag on the local CPU. | 24 | * Clear and restore the kernel write-protection flag on the local CPU. |
| @@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); | |||
| 35 | * inconsistent instruction while you patch. | 35 | * inconsistent instruction while you patch. |
| 36 | */ | 36 | */ |
| 37 | extern void *text_poke(void *addr, const void *opcode, size_t len); | 37 | extern void *text_poke(void *addr, const void *opcode, size_t len); |
| 38 | extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); | ||
| 38 | extern int poke_int3_handler(struct pt_regs *regs); | 39 | extern int poke_int3_handler(struct pt_regs *regs); |
| 39 | extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); | 40 | extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); |
| 40 | extern int after_bootmem; | 41 | extern int after_bootmem; |
| 42 | extern __ro_after_init struct mm_struct *poking_mm; | ||
| 43 | extern __ro_after_init unsigned long poking_addr; | ||
| 41 | 44 | ||
| 42 | #endif /* _ASM_X86_TEXT_PATCHING_H */ | 45 | #endif /* _ASM_X86_TEXT_PATCHING_H */ |
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 90926e8dd1f8..dee375831962 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
| @@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void) | |||
| 274 | return true; | 274 | return true; |
| 275 | } | 275 | } |
| 276 | 276 | ||
| 277 | #define nmi_uaccess_okay nmi_uaccess_okay | ||
| 278 | |||
| 277 | /* Initialize cr4 shadow for this CPU. */ | 279 | /* Initialize cr4 shadow for this CPU. */ |
| 278 | static inline void cr4_init_shadow(void) | 280 | static inline void cr4_init_shadow(void) |
| 279 | { | 281 | { |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a79c7808f9c..7b9b49dfc05a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 13 | #include <linux/kdebug.h> | 13 | #include <linux/kdebug.h> |
| 14 | #include <linux/kprobes.h> | 14 | #include <linux/kprobes.h> |
| 15 | #include <linux/mmu_context.h> | ||
| 15 | #include <asm/text-patching.h> | 16 | #include <asm/text-patching.h> |
| 16 | #include <asm/alternative.h> | 17 | #include <asm/alternative.h> |
| 17 | #include <asm/sections.h> | 18 | #include <asm/sections.h> |
| @@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) | |||
| 264 | 265 | ||
| 265 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | 266 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; |
| 266 | extern s32 __smp_locks[], __smp_locks_end[]; | 267 | extern s32 __smp_locks[], __smp_locks_end[]; |
| 267 | void *text_poke_early(void *addr, const void *opcode, size_t len); | 268 | void text_poke_early(void *addr, const void *opcode, size_t len); |
| 268 | 269 | ||
| 269 | /* | 270 | /* |
| 270 | * Are we looking at a near JMP with a 1 or 4-byte displacement. | 271 | * Are we looking at a near JMP with a 1 or 4-byte displacement. |
| @@ -666,16 +667,136 @@ void __init alternative_instructions(void) | |||
| 666 | * instructions. And on the local CPU you need to be protected again NMI or MCE | 667 | * instructions. And on the local CPU you need to be protected again NMI or MCE |
| 667 | * handlers seeing an inconsistent instruction while you patch. | 668 | * handlers seeing an inconsistent instruction while you patch. |
| 668 | */ | 669 | */ |
| 669 | void *__init_or_module text_poke_early(void *addr, const void *opcode, | 670 | void __init_or_module text_poke_early(void *addr, const void *opcode, |
| 670 | size_t len) | 671 | size_t len) |
| 671 | { | 672 | { |
| 672 | unsigned long flags; | 673 | unsigned long flags; |
| 674 | |||
| 675 | if (boot_cpu_has(X86_FEATURE_NX) && | ||
| 676 | is_module_text_address((unsigned long)addr)) { | ||
| 677 | /* | ||
| 678 | * Modules text is marked initially as non-executable, so the | ||
| 679 | * code cannot be running and speculative code-fetches are | ||
| 680 | * prevented. Just change the code. | ||
| 681 | */ | ||
| 682 | memcpy(addr, opcode, len); | ||
| 683 | } else { | ||
| 684 | local_irq_save(flags); | ||
| 685 | memcpy(addr, opcode, len); | ||
| 686 | local_irq_restore(flags); | ||
| 687 | sync_core(); | ||
| 688 | |||
| 689 | /* | ||
| 690 | * Could also do a CLFLUSH here to speed up CPU recovery; but | ||
| 691 | * that causes hangs on some VIA CPUs. | ||
| 692 | */ | ||
| 693 | } | ||
| 694 | } | ||
| 695 | |||
| 696 | __ro_after_init struct mm_struct *poking_mm; | ||
| 697 | __ro_after_init unsigned long poking_addr; | ||
| 698 | |||
| 699 | static void *__text_poke(void *addr, const void *opcode, size_t len) | ||
| 700 | { | ||
| 701 | bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; | ||
| 702 | struct page *pages[2] = {NULL}; | ||
| 703 | temp_mm_state_t prev; | ||
| 704 | unsigned long flags; | ||
| 705 | pte_t pte, *ptep; | ||
| 706 | spinlock_t *ptl; | ||
| 707 | pgprot_t pgprot; | ||
| 708 | |||
| 709 | /* | ||
| 710 | * While boot memory allocator is running we cannot use struct pages as | ||
| 711 | * they are not yet initialized. There is no way to recover. | ||
| 712 | */ | ||
| 713 | BUG_ON(!after_bootmem); | ||
| 714 | |||
| 715 | if (!core_kernel_text((unsigned long)addr)) { | ||
| 716 | pages[0] = vmalloc_to_page(addr); | ||
| 717 | if (cross_page_boundary) | ||
| 718 | pages[1] = vmalloc_to_page(addr + PAGE_SIZE); | ||
| 719 | } else { | ||
| 720 | pages[0] = virt_to_page(addr); | ||
| 721 | WARN_ON(!PageReserved(pages[0])); | ||
| 722 | if (cross_page_boundary) | ||
| 723 | pages[1] = virt_to_page(addr + PAGE_SIZE); | ||
| 724 | } | ||
| 725 | /* | ||
| 726 | * If something went wrong, crash and burn since recovery paths are not | ||
| 727 | * implemented. | ||
| 728 | */ | ||
| 729 | BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); | ||
| 730 | |||
| 673 | local_irq_save(flags); | 731 | local_irq_save(flags); |
| 674 | memcpy(addr, opcode, len); | 732 | |
| 733 | /* | ||
| 734 | * Map the page without the global bit, as TLB flushing is done with | ||
| 735 | * flush_tlb_mm_range(), which is intended for non-global PTEs. | ||
| 736 | */ | ||
| 737 | pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); | ||
| 738 | |||
| 739 | /* | ||
| 740 | * The lock is not really needed, but this allows to avoid open-coding. | ||
| 741 | */ | ||
| 742 | ptep = get_locked_pte(poking_mm, poking_addr, &ptl); | ||
| 743 | |||
| 744 | /* | ||
| 745 | * This must not fail; preallocated in poking_init(). | ||
| 746 | */ | ||
| 747 | VM_BUG_ON(!ptep); | ||
| 748 | |||
| 749 | pte = mk_pte(pages[0], pgprot); | ||
| 750 | set_pte_at(poking_mm, poking_addr, ptep, pte); | ||
| 751 | |||
| 752 | if (cross_page_boundary) { | ||
| 753 | pte = mk_pte(pages[1], pgprot); | ||
| 754 | set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); | ||
| 755 | } | ||
| 756 | |||
| 757 | /* | ||
| 758 | * Loading the temporary mm behaves as a compiler barrier, which | ||
| 759 | * guarantees that the PTE will be set at the time memcpy() is done. | ||
| 760 | */ | ||
| 761 | prev = use_temporary_mm(poking_mm); | ||
| 762 | |||
| 763 | kasan_disable_current(); | ||
| 764 | memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); | ||
| 765 | kasan_enable_current(); | ||
| 766 | |||
| 767 | /* | ||
| 768 | * Ensure that the PTE is only cleared after the instructions of memcpy | ||
| 769 | * were issued by using a compiler barrier. | ||
| 770 | */ | ||
| 771 | barrier(); | ||
| 772 | |||
| 773 | pte_clear(poking_mm, poking_addr, ptep); | ||
| 774 | if (cross_page_boundary) | ||
| 775 | pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); | ||
| 776 | |||
| 777 | /* | ||
| 778 | * Loading the previous page-table hierarchy requires a serializing | ||
| 779 | * instruction that already allows the core to see the updated version. | ||
| 780 | * Xen-PV is assumed to serialize execution in a similar manner. | ||
| 781 | */ | ||
| 782 | unuse_temporary_mm(prev); | ||
| 783 | |||
| 784 | /* | ||
| 785 | * Flushing the TLB might involve IPIs, which would require enabled | ||
| 786 | * IRQs, but not if the mm is not used, as it is in this point. | ||
| 787 | */ | ||
| 788 | flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + | ||
| 789 | (cross_page_boundary ? 2 : 1) * PAGE_SIZE, | ||
| 790 | PAGE_SHIFT, false); | ||
| 791 | |||
| 792 | /* | ||
| 793 | * If the text does not match what we just wrote then something is | ||
| 794 | * fundamentally screwy; there's nothing we can really do about that. | ||
| 795 | */ | ||
| 796 | BUG_ON(memcmp(addr, opcode, len)); | ||
| 797 | |||
| 798 | pte_unmap_unlock(ptep, ptl); | ||
| 675 | local_irq_restore(flags); | 799 | local_irq_restore(flags); |
| 676 | sync_core(); | ||
| 677 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | ||
| 678 | that causes hangs on some VIA CPUs. */ | ||
| 679 | return addr; | 800 | return addr; |
| 680 | } | 801 | } |
| 681 | 802 | ||
| @@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, | |||
| 689 | * It means the size must be writable atomically and the address must be aligned | 810 | * It means the size must be writable atomically and the address must be aligned |
| 690 | * in a way that permits an atomic write. It also makes sure we fit on a single | 811 | * in a way that permits an atomic write. It also makes sure we fit on a single |
| 691 | * page. | 812 | * page. |
| 813 | * | ||
| 814 | * Note that the caller must ensure that if the modified code is part of a | ||
| 815 | * module, the module would not be removed during poking. This can be achieved | ||
| 816 | * by registering a module notifier, and ordering module removal and patching | ||
| 817 | * trough a mutex. | ||
| 692 | */ | 818 | */ |
| 693 | void *text_poke(void *addr, const void *opcode, size_t len) | 819 | void *text_poke(void *addr, const void *opcode, size_t len) |
| 694 | { | 820 | { |
| 695 | unsigned long flags; | ||
| 696 | char *vaddr; | ||
| 697 | struct page *pages[2]; | ||
| 698 | int i; | ||
| 699 | |||
| 700 | /* | ||
| 701 | * While boot memory allocator is runnig we cannot use struct | ||
| 702 | * pages as they are not yet initialized. | ||
| 703 | */ | ||
| 704 | BUG_ON(!after_bootmem); | ||
| 705 | |||
| 706 | lockdep_assert_held(&text_mutex); | 821 | lockdep_assert_held(&text_mutex); |
| 707 | 822 | ||
| 708 | if (!core_kernel_text((unsigned long)addr)) { | 823 | return __text_poke(addr, opcode, len); |
| 709 | pages[0] = vmalloc_to_page(addr); | 824 | } |
| 710 | pages[1] = vmalloc_to_page(addr + PAGE_SIZE); | 825 | |
| 711 | } else { | 826 | /** |
| 712 | pages[0] = virt_to_page(addr); | 827 | * text_poke_kgdb - Update instructions on a live kernel by kgdb |
| 713 | WARN_ON(!PageReserved(pages[0])); | 828 | * @addr: address to modify |
| 714 | pages[1] = virt_to_page(addr + PAGE_SIZE); | 829 | * @opcode: source of the copy |
| 715 | } | 830 | * @len: length to copy |
| 716 | BUG_ON(!pages[0]); | 831 | * |
| 717 | local_irq_save(flags); | 832 | * Only atomic text poke/set should be allowed when not doing early patching. |
| 718 | set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); | 833 | * It means the size must be writable atomically and the address must be aligned |
| 719 | if (pages[1]) | 834 | * in a way that permits an atomic write. It also makes sure we fit on a single |
| 720 | set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); | 835 | * page. |
| 721 | vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); | 836 | * |
| 722 | memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); | 837 | * Context: should only be used by kgdb, which ensures no other core is running, |
| 723 | clear_fixmap(FIX_TEXT_POKE0); | 838 | * despite the fact it does not hold the text_mutex. |
| 724 | if (pages[1]) | 839 | */ |
| 725 | clear_fixmap(FIX_TEXT_POKE1); | 840 | void *text_poke_kgdb(void *addr, const void *opcode, size_t len) |
| 726 | local_flush_tlb(); | 841 | { |
| 727 | sync_core(); | 842 | return __text_poke(addr, opcode, len); |
| 728 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | ||
| 729 | that causes hangs on some VIA CPUs. */ | ||
| 730 | for (i = 0; i < len; i++) | ||
| 731 | BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); | ||
| 732 | local_irq_restore(flags); | ||
| 733 | return addr; | ||
| 734 | } | 843 | } |
| 735 | 844 | ||
| 736 | static void do_sync_core(void *info) | 845 | static void do_sync_core(void *info) |
| @@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); | |||
| 788 | * replacing opcode | 897 | * replacing opcode |
| 789 | * - sync cores | 898 | * - sync cores |
| 790 | */ | 899 | */ |
| 791 | void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) | 900 | void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) |
| 792 | { | 901 | { |
| 793 | unsigned char int3 = 0xcc; | 902 | unsigned char int3 = 0xcc; |
| 794 | 903 | ||
| @@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) | |||
| 830 | * the writing of the new instruction. | 939 | * the writing of the new instruction. |
| 831 | */ | 940 | */ |
| 832 | bp_patching_in_progress = false; | 941 | bp_patching_in_progress = false; |
| 833 | |||
| 834 | return addr; | ||
| 835 | } | 942 | } |
| 836 | 943 | ||
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..0caf8122d680 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
| @@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size) | |||
| 678 | { | 678 | { |
| 679 | return module_alloc(size); | 679 | return module_alloc(size); |
| 680 | } | 680 | } |
| 681 | static inline void tramp_free(void *tramp, int size) | 681 | static inline void tramp_free(void *tramp) |
| 682 | { | 682 | { |
| 683 | int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
| 684 | |||
| 685 | set_memory_nx((unsigned long)tramp, npages); | ||
| 686 | set_memory_rw((unsigned long)tramp, npages); | ||
| 687 | module_memfree(tramp); | 683 | module_memfree(tramp); |
| 688 | } | 684 | } |
| 689 | #else | 685 | #else |
| @@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size) | |||
| 692 | { | 688 | { |
| 693 | return NULL; | 689 | return NULL; |
| 694 | } | 690 | } |
| 695 | static inline void tramp_free(void *tramp, int size) { } | 691 | static inline void tramp_free(void *tramp) { } |
| 696 | #endif | 692 | #endif |
| 697 | 693 | ||
| 698 | /* Defined as markers to the end of the ftrace default trampolines */ | 694 | /* Defined as markers to the end of the ftrace default trampolines */ |
| @@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) | |||
| 730 | unsigned long end_offset; | 726 | unsigned long end_offset; |
| 731 | unsigned long op_offset; | 727 | unsigned long op_offset; |
| 732 | unsigned long offset; | 728 | unsigned long offset; |
| 729 | unsigned long npages; | ||
| 733 | unsigned long size; | 730 | unsigned long size; |
| 734 | unsigned long retq; | 731 | unsigned long retq; |
| 735 | unsigned long *ptr; | 732 | unsigned long *ptr; |
| @@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) | |||
| 762 | return 0; | 759 | return 0; |
| 763 | 760 | ||
| 764 | *tramp_size = size + RET_SIZE + sizeof(void *); | 761 | *tramp_size = size + RET_SIZE + sizeof(void *); |
| 762 | npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); | ||
| 765 | 763 | ||
| 766 | /* Copy ftrace_caller onto the trampoline memory */ | 764 | /* Copy ftrace_caller onto the trampoline memory */ |
| 767 | ret = probe_kernel_read(trampoline, (void *)start_offset, size); | 765 | ret = probe_kernel_read(trampoline, (void *)start_offset, size); |
| @@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) | |||
| 806 | /* ALLOC_TRAMP flags lets us know we created it */ | 804 | /* ALLOC_TRAMP flags lets us know we created it */ |
| 807 | ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; | 805 | ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; |
| 808 | 806 | ||
| 807 | set_vm_flush_reset_perms(trampoline); | ||
| 808 | |||
| 809 | /* | ||
| 810 | * Module allocation needs to be completed by making the page | ||
| 811 | * executable. The page is still writable, which is a security hazard, | ||
| 812 | * but anyhow ftrace breaks W^X completely. | ||
| 813 | */ | ||
| 814 | set_memory_x((unsigned long)trampoline, npages); | ||
| 809 | return (unsigned long)trampoline; | 815 | return (unsigned long)trampoline; |
| 810 | fail: | 816 | fail: |
| 811 | tramp_free(trampoline, *tramp_size); | 817 | tramp_free(trampoline); |
| 812 | return 0; | 818 | return 0; |
| 813 | } | 819 | } |
| 814 | 820 | ||
| @@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) | |||
| 939 | if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) | 945 | if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) |
| 940 | return; | 946 | return; |
| 941 | 947 | ||
| 942 | tramp_free((void *)ops->trampoline, ops->trampoline_size); | 948 | tramp_free((void *)ops->trampoline); |
| 943 | ops->trampoline = 0; | 949 | ops->trampoline = 0; |
| 944 | } | 950 | } |
| 945 | 951 | ||
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f99bd26bd3f1..e631c358f7f4 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
| @@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line) | |||
| 37 | 37 | ||
| 38 | static void __ref __jump_label_transform(struct jump_entry *entry, | 38 | static void __ref __jump_label_transform(struct jump_entry *entry, |
| 39 | enum jump_label_type type, | 39 | enum jump_label_type type, |
| 40 | void *(*poker)(void *, const void *, size_t), | ||
| 41 | int init) | 40 | int init) |
| 42 | { | 41 | { |
| 43 | union jump_code_union jmp; | 42 | union jump_code_union jmp; |
| @@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry, | |||
| 50 | jmp.offset = jump_entry_target(entry) - | 49 | jmp.offset = jump_entry_target(entry) - |
| 51 | (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); | 50 | (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); |
| 52 | 51 | ||
| 53 | if (early_boot_irqs_disabled) | ||
| 54 | poker = text_poke_early; | ||
| 55 | |||
| 56 | if (type == JUMP_LABEL_JMP) { | 52 | if (type == JUMP_LABEL_JMP) { |
| 57 | if (init) { | 53 | if (init) { |
| 58 | expect = default_nop; line = __LINE__; | 54 | expect = default_nop; line = __LINE__; |
| @@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry, | |||
| 75 | bug_at((void *)jump_entry_code(entry), line); | 71 | bug_at((void *)jump_entry_code(entry), line); |
| 76 | 72 | ||
| 77 | /* | 73 | /* |
| 78 | * Make text_poke_bp() a default fallback poker. | 74 | * As long as only a single processor is running and the code is still |
| 75 | * not marked as RO, text_poke_early() can be used; Checking that | ||
| 76 | * system_state is SYSTEM_BOOTING guarantees it. It will be set to | ||
| 77 | * SYSTEM_SCHEDULING before other cores are awaken and before the | ||
| 78 | * code is write-protected. | ||
| 79 | * | 79 | * |
| 80 | * At the time the change is being done, just ignore whether we | 80 | * At the time the change is being done, just ignore whether we |
| 81 | * are doing nop -> jump or jump -> nop transition, and assume | 81 | * are doing nop -> jump or jump -> nop transition, and assume |
| 82 | * always nop being the 'currently valid' instruction | 82 | * always nop being the 'currently valid' instruction |
| 83 | * | ||
| 84 | */ | 83 | */ |
| 85 | if (poker) { | 84 | if (init || system_state == SYSTEM_BOOTING) { |
| 86 | (*poker)((void *)jump_entry_code(entry), code, | 85 | text_poke_early((void *)jump_entry_code(entry), code, |
| 87 | JUMP_LABEL_NOP_SIZE); | 86 | JUMP_LABEL_NOP_SIZE); |
| 88 | return; | 87 | return; |
| 89 | } | 88 | } |
| 90 | 89 | ||
| @@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry, | |||
| 96 | enum jump_label_type type) | 95 | enum jump_label_type type) |
| 97 | { | 96 | { |
| 98 | mutex_lock(&text_mutex); | 97 | mutex_lock(&text_mutex); |
| 99 | __jump_label_transform(entry, type, NULL, 0); | 98 | __jump_label_transform(entry, type, 0); |
| 100 | mutex_unlock(&text_mutex); | 99 | mutex_unlock(&text_mutex); |
| 101 | } | 100 | } |
| 102 | 101 | ||
| @@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, | |||
| 126 | jlstate = JL_STATE_NO_UPDATE; | 125 | jlstate = JL_STATE_NO_UPDATE; |
| 127 | } | 126 | } |
| 128 | if (jlstate == JL_STATE_UPDATE) | 127 | if (jlstate == JL_STATE_UPDATE) |
| 129 | __jump_label_transform(entry, type, text_poke_early, 1); | 128 | __jump_label_transform(entry, type, 1); |
| 130 | } | 129 | } |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4ff6b4cdb941..13b13311b792 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
| @@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) | |||
| 747 | int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) | 747 | int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) |
| 748 | { | 748 | { |
| 749 | int err; | 749 | int err; |
| 750 | char opc[BREAK_INSTR_SIZE]; | ||
| 751 | 750 | ||
| 752 | bpt->type = BP_BREAKPOINT; | 751 | bpt->type = BP_BREAKPOINT; |
| 753 | err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, | 752 | err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, |
| @@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) | |||
| 759 | if (!err) | 758 | if (!err) |
| 760 | return err; | 759 | return err; |
| 761 | /* | 760 | /* |
| 762 | * It is safe to call text_poke() because normal kernel execution | 761 | * It is safe to call text_poke_kgdb() because normal kernel execution |
| 763 | * is stopped on all cores, so long as the text_mutex is not locked. | 762 | * is stopped on all cores, so long as the text_mutex is not locked. |
| 764 | */ | 763 | */ |
| 765 | if (mutex_is_locked(&text_mutex)) | 764 | if (mutex_is_locked(&text_mutex)) |
| 766 | return -EBUSY; | 765 | return -EBUSY; |
| 767 | text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, | 766 | text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, |
| 768 | BREAK_INSTR_SIZE); | 767 | BREAK_INSTR_SIZE); |
| 769 | err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); | ||
| 770 | if (err) | ||
| 771 | return err; | ||
| 772 | if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) | ||
| 773 | return -EINVAL; | ||
| 774 | bpt->type = BP_POKE_BREAKPOINT; | 768 | bpt->type = BP_POKE_BREAKPOINT; |
| 775 | 769 | ||
| 776 | return err; | 770 | return err; |
| @@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) | |||
| 778 | 772 | ||
| 779 | int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) | 773 | int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) |
| 780 | { | 774 | { |
| 781 | int err; | ||
| 782 | char opc[BREAK_INSTR_SIZE]; | ||
| 783 | |||
| 784 | if (bpt->type != BP_POKE_BREAKPOINT) | 775 | if (bpt->type != BP_POKE_BREAKPOINT) |
| 785 | goto knl_write; | 776 | goto knl_write; |
| 786 | /* | 777 | /* |
| 787 | * It is safe to call text_poke() because normal kernel execution | 778 | * It is safe to call text_poke_kgdb() because normal kernel execution |
| 788 | * is stopped on all cores, so long as the text_mutex is not locked. | 779 | * is stopped on all cores, so long as the text_mutex is not locked. |
| 789 | */ | 780 | */ |
| 790 | if (mutex_is_locked(&text_mutex)) | 781 | if (mutex_is_locked(&text_mutex)) |
| 791 | goto knl_write; | 782 | goto knl_write; |
| 792 | text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); | 783 | text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, |
| 793 | err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); | 784 | BREAK_INSTR_SIZE); |
| 794 | if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) | 785 | return 0; |
| 795 | goto knl_write; | ||
| 796 | return err; | ||
| 797 | 786 | ||
| 798 | knl_write: | 787 | knl_write: |
| 799 | return probe_kernel_write((char *)bpt->bpt_addr, | 788 | return probe_kernel_write((char *)bpt->bpt_addr, |
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 122548ad5c2e..cf52ee0d8711 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
| @@ -431,8 +431,21 @@ void *alloc_insn_page(void) | |||
| 431 | void *page; | 431 | void *page; |
| 432 | 432 | ||
| 433 | page = module_alloc(PAGE_SIZE); | 433 | page = module_alloc(PAGE_SIZE); |
| 434 | if (page) | 434 | if (!page) |
| 435 | set_memory_ro((unsigned long)page & PAGE_MASK, 1); | 435 | return NULL; |
| 436 | |||
| 437 | set_vm_flush_reset_perms(page); | ||
| 438 | /* | ||
| 439 | * First make the page read-only, and only then make it executable to | ||
| 440 | * prevent it from being W+X in between. | ||
| 441 | */ | ||
| 442 | set_memory_ro((unsigned long)page, 1); | ||
| 443 | |||
| 444 | /* | ||
| 445 | * TODO: Once additional kernel code protection mechanisms are set, ensure | ||
| 446 | * that the page was not maliciously altered and it is still zeroed. | ||
| 447 | */ | ||
| 448 | set_memory_x((unsigned long)page, 1); | ||
| 436 | 449 | ||
| 437 | return page; | 450 | return page; |
| 438 | } | 451 | } |
| @@ -440,8 +453,6 @@ void *alloc_insn_page(void) | |||
| 440 | /* Recover page to RW mode before releasing it */ | 453 | /* Recover page to RW mode before releasing it */ |
| 441 | void free_insn_page(void *page) | 454 | void free_insn_page(void *page) |
| 442 | { | 455 | { |
| 443 | set_memory_nx((unsigned long)page & PAGE_MASK, 1); | ||
| 444 | set_memory_rw((unsigned long)page & PAGE_MASK, 1); | ||
| 445 | module_memfree(page); | 456 | module_memfree(page); |
| 446 | } | 457 | } |
| 447 | 458 | ||
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..cfa3106faee4 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
| @@ -87,7 +87,7 @@ void *module_alloc(unsigned long size) | |||
| 87 | p = __vmalloc_node_range(size, MODULE_ALIGN, | 87 | p = __vmalloc_node_range(size, MODULE_ALIGN, |
| 88 | MODULES_VADDR + get_module_load_offset(), | 88 | MODULES_VADDR + get_module_load_offset(), |
| 89 | MODULES_END, GFP_KERNEL, | 89 | MODULES_END, GFP_KERNEL, |
| 90 | PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, | 90 | PAGE_KERNEL, 0, NUMA_NO_NODE, |
| 91 | __builtin_return_address(0)); | 91 | __builtin_return_address(0)); |
| 92 | if (p && (kasan_module_alloc(p, size) < 0)) { | 92 | if (p && (kasan_module_alloc(p, size) < 0)) { |
| 93 | vfree(p); | 93 | vfree(p); |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4d1517022a14..0850b5149345 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
| @@ -141,11 +141,11 @@ SECTIONS | |||
| 141 | *(.text.__x86.indirect_thunk) | 141 | *(.text.__x86.indirect_thunk) |
| 142 | __indirect_thunk_end = .; | 142 | __indirect_thunk_end = .; |
| 143 | #endif | 143 | #endif |
| 144 | |||
| 145 | /* End of text section */ | ||
| 146 | _etext = .; | ||
| 147 | } :text = 0x9090 | 144 | } :text = 0x9090 |
| 148 | 145 | ||
| 146 | /* End of text section */ | ||
| 147 | _etext = .; | ||
| 148 | |||
| 149 | NOTES :text :note | 149 | NOTES :text :note |
| 150 | 150 | ||
| 151 | EXCEPTION_TABLE(16) :text = 0x9090 | 151 | EXCEPTION_TABLE(16) :text = 0x9090 |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 06c089513d39..46df4c6aae46 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
| @@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address) | |||
| 360 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 360 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
| 361 | return -1; | 361 | return -1; |
| 362 | 362 | ||
| 363 | WARN_ON_ONCE(in_nmi()); | ||
| 364 | |||
| 365 | /* | 363 | /* |
| 366 | * Copy kernel mappings over when needed. This can also | 364 | * Copy kernel mappings over when needed. This can also |
| 367 | * happen within a race in page table update. In the later | 365 | * happen within a race in page table update. In the later |
| @@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) | |||
| 604 | name, index, addr, (desc.limit0 | (desc.limit1 << 16))); | 602 | name, index, addr, (desc.limit0 | (desc.limit1 << 16))); |
| 605 | } | 603 | } |
| 606 | 604 | ||
| 607 | /* | ||
| 608 | * This helper function transforms the #PF error_code bits into | ||
| 609 | * "[PROT] [USER]" type of descriptive, almost human-readable error strings: | ||
| 610 | */ | ||
| 611 | static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) | ||
| 612 | { | ||
| 613 | if (error_code & mask) { | ||
| 614 | if (buf[0]) | ||
| 615 | strcat(buf, " "); | ||
| 616 | strcat(buf, txt); | ||
| 617 | } | ||
| 618 | } | ||
| 619 | |||
| 620 | static void | 605 | static void |
| 621 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) | 606 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) |
| 622 | { | 607 | { |
| 623 | char err_txt[64]; | ||
| 624 | |||
| 625 | if (!oops_may_print()) | 608 | if (!oops_may_print()) |
| 626 | return; | 609 | return; |
| 627 | 610 | ||
| @@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad | |||
| 645 | from_kuid(&init_user_ns, current_uid())); | 628 | from_kuid(&init_user_ns, current_uid())); |
| 646 | } | 629 | } |
| 647 | 630 | ||
| 648 | pr_alert("BUG: unable to handle kernel %s at %px\n", | 631 | if (address < PAGE_SIZE && !user_mode(regs)) |
| 649 | address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", | 632 | pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", |
| 650 | (void *)address); | 633 | (void *)address); |
| 651 | 634 | else | |
| 652 | err_txt[0] = 0; | 635 | pr_alert("BUG: unable to handle page fault for address: %px\n", |
| 653 | 636 | (void *)address); | |
| 654 | /* | 637 | |
| 655 | * Note: length of these appended strings including the separation space and the | 638 | pr_alert("#PF: %s %s in %s mode\n", |
| 656 | * zero delimiter must fit into err_txt[]. | 639 | (error_code & X86_PF_USER) ? "user" : "supervisor", |
| 657 | */ | 640 | (error_code & X86_PF_INSTR) ? "instruction fetch" : |
| 658 | err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); | 641 | (error_code & X86_PF_WRITE) ? "write access" : |
| 659 | err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); | 642 | "read access", |
| 660 | err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); | 643 | user_mode(regs) ? "user" : "kernel"); |
| 661 | err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); | 644 | pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, |
| 662 | err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); | 645 | !(error_code & X86_PF_PROT) ? "not-present page" : |
| 663 | err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); | 646 | (error_code & X86_PF_RSVD) ? "reserved bit violation" : |
| 664 | 647 | (error_code & X86_PF_PK) ? "protection keys violation" : | |
| 665 | pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); | 648 | "permissions violation"); |
| 666 | 649 | ||
| 667 | if (!(error_code & X86_PF_USER) && user_mode(regs)) { | 650 | if (!(error_code & X86_PF_USER) && user_mode(regs)) { |
| 668 | struct desc_ptr idt, gdt; | 651 | struct desc_ptr idt, gdt; |
| 669 | u16 ldtr, tr; | 652 | u16 ldtr, tr; |
| 670 | 653 | ||
| 671 | pr_alert("This was a system access from user code\n"); | ||
| 672 | |||
| 673 | /* | 654 | /* |
| 674 | * This can happen for quite a few reasons. The more obvious | 655 | * This can happen for quite a few reasons. The more obvious |
| 675 | * ones are faults accessing the GDT, or LDT. Perhaps | 656 | * ones are faults accessing the GDT, or LDT. Perhaps |
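For comparison with the removed err_str_append() format, this is roughly what the reworked report would read like, assembled from the pr_alert() format strings above with illustrative values (a kernel write through a NULL pointer, i.e. only X86_PF_WRITE set in error_code):

```
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
```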
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8dacdb96899e..fd10d91a6115 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/swapfile.h> | 6 | #include <linux/swapfile.h> |
| 7 | #include <linux/swapops.h> | 7 | #include <linux/swapops.h> |
| 8 | #include <linux/kmemleak.h> | 8 | #include <linux/kmemleak.h> |
| 9 | #include <linux/sched/task.h> | ||
| 9 | 10 | ||
| 10 | #include <asm/set_memory.h> | 11 | #include <asm/set_memory.h> |
| 11 | #include <asm/e820/api.h> | 12 | #include <asm/e820/api.h> |
| @@ -23,6 +24,7 @@ | |||
| 23 | #include <asm/hypervisor.h> | 24 | #include <asm/hypervisor.h> |
| 24 | #include <asm/cpufeature.h> | 25 | #include <asm/cpufeature.h> |
| 25 | #include <asm/pti.h> | 26 | #include <asm/pti.h> |
| 27 | #include <asm/text-patching.h> | ||
| 26 | 28 | ||
| 27 | /* | 29 | /* |
| 28 | * We need to define the tracepoints somewhere, and tlb.c | 30 | * We need to define the tracepoints somewhere, and tlb.c |
| @@ -702,6 +704,41 @@ void __init init_mem_mapping(void) | |||
| 702 | } | 704 | } |
| 703 | 705 | ||
| 704 | /* | 706 | /* |
| 707 | * Initialize an mm_struct to be used during poking and a pointer to be used | ||
| 708 | * during patching. | ||
| 709 | */ | ||
| 710 | void __init poking_init(void) | ||
| 711 | { | ||
| 712 | spinlock_t *ptl; | ||
| 713 | pte_t *ptep; | ||
| 714 | |||
| 715 | poking_mm = copy_init_mm(); | ||
| 716 | BUG_ON(!poking_mm); | ||
| 717 | |||
| 718 | /* | ||
| 719 | * Randomize the poking address, but make sure that the following page | ||
| 720 | * will be mapped at the same PMD. We need 2 pages, so find space for 3, | ||
| 721 | * and adjust the address if the PMD ends after the first one. | ||
| 722 | */ | ||
| 723 | poking_addr = TASK_UNMAPPED_BASE; | ||
| 724 | if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) | ||
| 725 | poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % | ||
| 726 | (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); | ||
| 727 | |||
| 728 | if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) | ||
| 729 | poking_addr += PAGE_SIZE; | ||
| 730 | |||
| 731 | /* | ||
| 732 | * We need to trigger the allocation of the page-tables that will be | ||
| 733 | * needed for poking now. Later, poking may be performed in an atomic | ||
| 734 | * section, which might cause allocation to fail. | ||
| 735 | */ | ||
| 736 | ptep = get_locked_pte(poking_mm, poking_addr, &ptl); | ||
| 737 | BUG_ON(!ptep); | ||
| 738 | pte_unmap_unlock(ptep, ptl); | ||
| 739 | } | ||
| 740 | |||
| 741 | /* | ||
| 705 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | 742 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address |
| 706 | * is valid. The argument is a physical page number. | 743 | * is valid. The argument is a physical page number. |
| 707 | * | 744 | * |
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index d669c5e797e0..dc3f058bdf9b 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
| @@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void) | |||
| 125 | */ | 125 | */ |
| 126 | entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); | 126 | entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); |
| 127 | prandom_bytes_state(&rand_state, &rand, sizeof(rand)); | 127 | prandom_bytes_state(&rand_state, &rand, sizeof(rand)); |
| 128 | if (pgtable_l5_enabled()) | 128 | entropy = (rand % (entropy + 1)) & PUD_MASK; |
| 129 | entropy = (rand % (entropy + 1)) & P4D_MASK; | ||
| 130 | else | ||
| 131 | entropy = (rand % (entropy + 1)) & PUD_MASK; | ||
| 132 | vaddr += entropy; | 129 | vaddr += entropy; |
| 133 | *kaslr_regions[i].base = vaddr; | 130 | *kaslr_regions[i].base = vaddr; |
| 134 | 131 | ||
| @@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void) | |||
| 137 | * randomization alignment. | 134 | * randomization alignment. |
| 138 | */ | 135 | */ |
| 139 | vaddr += get_padding(&kaslr_regions[i]); | 136 | vaddr += get_padding(&kaslr_regions[i]); |
| 140 | if (pgtable_l5_enabled()) | 137 | vaddr = round_up(vaddr + 1, PUD_SIZE); |
| 141 | vaddr = round_up(vaddr + 1, P4D_SIZE); | ||
| 142 | else | ||
| 143 | vaddr = round_up(vaddr + 1, PUD_SIZE); | ||
| 144 | remain_entropy -= entropy; | 138 | remain_entropy -= entropy; |
| 145 | } | 139 | } |
| 146 | } | 140 | } |
| 147 | 141 | ||
| 148 | static void __meminit init_trampoline_pud(void) | 142 | static void __meminit init_trampoline_pud(void) |
| 149 | { | 143 | { |
| 150 | unsigned long paddr, paddr_next; | 144 | pud_t *pud_page_tramp, *pud, *pud_tramp; |
| 145 | p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; | ||
| 146 | unsigned long paddr, vaddr; | ||
| 151 | pgd_t *pgd; | 147 | pgd_t *pgd; |
| 152 | pud_t *pud_page, *pud_page_tramp; | ||
| 153 | int i; | ||
| 154 | 148 | ||
| 155 | pud_page_tramp = alloc_low_page(); | 149 | pud_page_tramp = alloc_low_page(); |
| 156 | 150 | ||
| 151 | /* | ||
| 152 | * There are two mappings for the low 1MB area, the direct mapping | ||
| 153 | * and the 1:1 mapping for the real mode trampoline: | ||
| 154 | * | ||
| 155 | * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET | ||
| 156 | * 1:1 mapping: virt_addr = phys_addr | ||
| 157 | */ | ||
| 157 | paddr = 0; | 158 | paddr = 0; |
| 158 | pgd = pgd_offset_k((unsigned long)__va(paddr)); | 159 | vaddr = (unsigned long)__va(paddr); |
| 159 | pud_page = (pud_t *) pgd_page_vaddr(*pgd); | 160 | pgd = pgd_offset_k(vaddr); |
| 160 | |||
| 161 | for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { | ||
| 162 | pud_t *pud, *pud_tramp; | ||
| 163 | unsigned long vaddr = (unsigned long)__va(paddr); | ||
| 164 | 161 | ||
| 165 | pud_tramp = pud_page_tramp + pud_index(paddr); | 162 | p4d = p4d_offset(pgd, vaddr); |
| 166 | pud = pud_page + pud_index(vaddr); | 163 | pud = pud_offset(p4d, vaddr); |
| 167 | paddr_next = (paddr & PUD_MASK) + PUD_SIZE; | ||
| 168 | |||
| 169 | *pud_tramp = *pud; | ||
| 170 | } | ||
| 171 | 164 | ||
| 172 | set_pgd(&trampoline_pgd_entry, | 165 | pud_tramp = pud_page_tramp + pud_index(paddr); |
| 173 | __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); | 166 | *pud_tramp = *pud; |
| 174 | } | ||
| 175 | |||
| 176 | static void __meminit init_trampoline_p4d(void) | ||
| 177 | { | ||
| 178 | unsigned long paddr, paddr_next; | ||
| 179 | pgd_t *pgd; | ||
| 180 | p4d_t *p4d_page, *p4d_page_tramp; | ||
| 181 | int i; | ||
| 182 | 167 | ||
| 183 | p4d_page_tramp = alloc_low_page(); | 168 | if (pgtable_l5_enabled()) { |
| 184 | 169 | p4d_page_tramp = alloc_low_page(); | |
| 185 | paddr = 0; | ||
| 186 | pgd = pgd_offset_k((unsigned long)__va(paddr)); | ||
| 187 | p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); | ||
| 188 | |||
| 189 | for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { | ||
| 190 | p4d_t *p4d, *p4d_tramp; | ||
| 191 | unsigned long vaddr = (unsigned long)__va(paddr); | ||
| 192 | 170 | ||
| 193 | p4d_tramp = p4d_page_tramp + p4d_index(paddr); | 171 | p4d_tramp = p4d_page_tramp + p4d_index(paddr); |
| 194 | p4d = p4d_page + p4d_index(vaddr); | ||
| 195 | paddr_next = (paddr & P4D_MASK) + P4D_SIZE; | ||
| 196 | 172 | ||
| 197 | *p4d_tramp = *p4d; | 173 | set_p4d(p4d_tramp, |
| 198 | } | 174 | __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); |
| 199 | 175 | ||
| 200 | set_pgd(&trampoline_pgd_entry, | 176 | set_pgd(&trampoline_pgd_entry, |
| 201 | __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); | 177 | __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); |
| 178 | } else { | ||
| 179 | set_pgd(&trampoline_pgd_entry, | ||
| 180 | __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); | ||
| 181 | } | ||
| 202 | } | 182 | } |
| 203 | 183 | ||
| 204 | /* | 184 | /* |
| 205 | * Create PGD aligned trampoline table to allow real mode initialization | 185 | * The real mode trampoline, which is required for bootstrapping CPUs |
| 206 | * of additional CPUs. Consume only 1 low memory page. | 186 | * occupies only a small area under the low 1MB. See reserve_real_mode() |
| 187 | * for details. | ||
| 188 | * | ||
| 189 | * If KASLR is disabled the first PGD entry of the direct mapping is copied | ||
| 190 | * to map the real mode trampoline. | ||
| 191 | * | ||
| 192 | * If KASLR is enabled, copy only the PUD which covers the low 1MB | ||
| 193 | * area. This limits the randomization granularity to 1GB for both 4-level | ||
| 194 | * and 5-level paging. | ||
| 207 | */ | 195 | */ |
| 208 | void __meminit init_trampoline(void) | 196 | void __meminit init_trampoline(void) |
| 209 | { | 197 | { |
| 210 | |||
| 211 | if (!kaslr_memory_enabled()) { | 198 | if (!kaslr_memory_enabled()) { |
| 212 | init_trampoline_default(); | 199 | init_trampoline_default(); |
| 213 | return; | 200 | return; |
| 214 | } | 201 | } |
| 215 | 202 | ||
| 216 | if (pgtable_l5_enabled()) | 203 | init_trampoline_pud(); |
| 217 | init_trampoline_p4d(); | ||
| 218 | else | ||
| 219 | init_trampoline_pud(); | ||
| 220 | } | 204 | } |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4c570612e24e..daf4d645e537 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
| @@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages) | |||
| 2209 | return set_memory_rw(addr, numpages); | 2209 | return set_memory_rw(addr, numpages); |
| 2210 | } | 2210 | } |
| 2211 | 2211 | ||
| 2212 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 2213 | |||
| 2214 | static int __set_pages_p(struct page *page, int numpages) | 2212 | static int __set_pages_p(struct page *page, int numpages) |
| 2215 | { | 2213 | { |
| 2216 | unsigned long tempaddr = (unsigned long) page_address(page); | 2214 | unsigned long tempaddr = (unsigned long) page_address(page); |
| @@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages) | |||
| 2249 | return __change_page_attr_set_clr(&cpa, 0); | 2247 | return __change_page_attr_set_clr(&cpa, 0); |
| 2250 | } | 2248 | } |
| 2251 | 2249 | ||
| 2250 | int set_direct_map_invalid_noflush(struct page *page) | ||
| 2251 | { | ||
| 2252 | return __set_pages_np(page, 1); | ||
| 2253 | } | ||
| 2254 | |||
| 2255 | int set_direct_map_default_noflush(struct page *page) | ||
| 2256 | { | ||
| 2257 | return __set_pages_p(page, 1); | ||
| 2258 | } | ||
| 2259 | |||
| 2252 | void __kernel_map_pages(struct page *page, int numpages, int enable) | 2260 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
| 2253 | { | 2261 | { |
| 2254 | if (PageHighMem(page)) | 2262 | if (PageHighMem(page)) |
| @@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) | |||
| 2282 | } | 2290 | } |
| 2283 | 2291 | ||
| 2284 | #ifdef CONFIG_HIBERNATION | 2292 | #ifdef CONFIG_HIBERNATION |
| 2285 | |||
| 2286 | bool kernel_page_present(struct page *page) | 2293 | bool kernel_page_present(struct page *page) |
| 2287 | { | 2294 | { |
| 2288 | unsigned int level; | 2295 | unsigned int level; |
| @@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page) | |||
| 2294 | pte = lookup_address((unsigned long)page_address(page), &level); | 2301 | pte = lookup_address((unsigned long)page_address(page), &level); |
| 2295 | return (pte_val(*pte) & _PAGE_PRESENT); | 2302 | return (pte_val(*pte) & _PAGE_PRESENT); |
| 2296 | } | 2303 | } |
| 2297 | |||
| 2298 | #endif /* CONFIG_HIBERNATION */ | 2304 | #endif /* CONFIG_HIBERNATION */ |
| 2299 | 2305 | ||
| 2300 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | ||
| 2301 | |||
| 2302 | int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, | 2306 | int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, |
| 2303 | unsigned numpages, unsigned long page_flags) | 2307 | unsigned numpages, unsigned long page_flags) |
| 2304 | { | 2308 | { |
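The new set_direct_map_invalid_noflush()/set_direct_map_default_noflush() pair deliberately leaves TLB flushing to the caller, so a whole batch of pages can be transitioned with a single flush. A hypothetical helper showing the intended call sequence (the real consumer is the vmalloc free path further down in this series; start/end are the direct-map addresses of the pages):

#include <linux/mm.h>
#include <linux/set_memory.h>
#include <asm/tlbflush.h>

static void reset_direct_map(struct page **pages, int nr,
                             unsigned long start, unsigned long end)
{
        int i;

        /* 1) Drop the direct-map PTEs; no per-page TLB flush yet. */
        for (i = 0; i < nr; i++)
                set_direct_map_invalid_noflush(pages[i]);

        /* 2) One flush covers every mapping changed above. */
        flush_tlb_kernel_range(start, end);

        /* 3) Restore the default direct-map permissions. */
        for (i = 0; i < nr; i++)
                set_direct_map_default_noflush(pages[i]);
}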
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3dbf440d4114..1f67b1e15bf6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
| @@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, | |||
| 373 | 373 | ||
| 374 | static struct kmem_cache *pgd_cache; | 374 | static struct kmem_cache *pgd_cache; |
| 375 | 375 | ||
| 376 | static int __init pgd_cache_init(void) | 376 | void __init pgd_cache_init(void) |
| 377 | { | 377 | { |
| 378 | /* | 378 | /* |
| 379 | * When PAE kernel is running as a Xen domain, it does not use | 379 | * When PAE kernel is running as a Xen domain, it does not use |
| 380 | * shared kernel pmd. And this requires a whole page for pgd. | 380 | * shared kernel pmd. And this requires a whole page for pgd. |
| 381 | */ | 381 | */ |
| 382 | if (!SHARED_KERNEL_PMD) | 382 | if (!SHARED_KERNEL_PMD) |
| 383 | return 0; | 383 | return; |
| 384 | 384 | ||
| 385 | /* | 385 | /* |
| 386 | * when PAE kernel is not running as a Xen domain, it uses | 386 | * when PAE kernel is not running as a Xen domain, it uses |
| @@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) | |||
| 390 | */ | 390 | */ |
| 391 | pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, | 391 | pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, |
| 392 | SLAB_PANIC, NULL); | 392 | SLAB_PANIC, NULL); |
| 393 | return 0; | ||
| 394 | } | 393 | } |
| 395 | core_initcall(pgd_cache_init); | ||
| 396 | 394 | ||
| 397 | static inline pgd_t *_pgd_alloc(void) | 395 | static inline pgd_t *_pgd_alloc(void) |
| 398 | { | 396 | { |
| @@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) | |||
| 420 | } | 418 | } |
| 421 | #else | 419 | #else |
| 422 | 420 | ||
| 421 | void __init pgd_cache_init(void) | ||
| 422 | { | ||
| 423 | } | ||
| 424 | |||
| 423 | static inline pgd_t *_pgd_alloc(void) | 425 | static inline pgd_t *_pgd_alloc(void) |
| 424 | { | 426 | { |
| 425 | return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); | 427 | return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 487b8474c01c..7f61431c75fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
| @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, | |||
| 634 | this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); | 634 | this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); |
| 635 | } | 635 | } |
| 636 | 636 | ||
| 637 | static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) | 637 | static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) |
| 638 | { | 638 | { |
| 639 | const struct flush_tlb_info *f = info; | 639 | const struct flush_tlb_info *f = info; |
| 640 | 640 | ||
| @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
| 722 | */ | 722 | */ |
| 723 | unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; | 723 | unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; |
| 724 | 724 | ||
| 725 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); | ||
| 726 | |||
| 727 | #ifdef CONFIG_DEBUG_VM | ||
| 728 | static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); | ||
| 729 | #endif | ||
| 730 | |||
| 731 | static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, | ||
| 732 | unsigned long start, unsigned long end, | ||
| 733 | unsigned int stride_shift, bool freed_tables, | ||
| 734 | u64 new_tlb_gen) | ||
| 735 | { | ||
| 736 | struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); | ||
| 737 | |||
| 738 | #ifdef CONFIG_DEBUG_VM | ||
| 739 | /* | ||
| 740 | * Ensure that the following code is non-reentrant and flush_tlb_info | ||
| 741 | * is not overwritten. This means no TLB flushing is initiated by | ||
| 742 | * interrupt handlers and machine-check exception handlers. | ||
| 743 | */ | ||
| 744 | BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); | ||
| 745 | #endif | ||
| 746 | |||
| 747 | info->start = start; | ||
| 748 | info->end = end; | ||
| 749 | info->mm = mm; | ||
| 750 | info->stride_shift = stride_shift; | ||
| 751 | info->freed_tables = freed_tables; | ||
| 752 | info->new_tlb_gen = new_tlb_gen; | ||
| 753 | |||
| 754 | return info; | ||
| 755 | } | ||
| 756 | |||
| 757 | static inline void put_flush_tlb_info(void) | ||
| 758 | { | ||
| 759 | #ifdef CONFIG_DEBUG_VM | ||
| 760 | /* Complete reentrancy prevention checks */ | ||
| 761 | barrier(); | ||
| 762 | this_cpu_dec(flush_tlb_info_idx); | ||
| 763 | #endif | ||
| 764 | } | ||
| 765 | |||
| 725 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 766 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
| 726 | unsigned long end, unsigned int stride_shift, | 767 | unsigned long end, unsigned int stride_shift, |
| 727 | bool freed_tables) | 768 | bool freed_tables) |
| 728 | { | 769 | { |
| 770 | struct flush_tlb_info *info; | ||
| 771 | u64 new_tlb_gen; | ||
| 729 | int cpu; | 772 | int cpu; |
| 730 | 773 | ||
| 731 | struct flush_tlb_info info = { | ||
| 732 | .mm = mm, | ||
| 733 | .stride_shift = stride_shift, | ||
| 734 | .freed_tables = freed_tables, | ||
| 735 | }; | ||
| 736 | |||
| 737 | cpu = get_cpu(); | 774 | cpu = get_cpu(); |
| 738 | 775 | ||
| 739 | /* This is also a barrier that synchronizes with switch_mm(). */ | ||
| 740 | info.new_tlb_gen = inc_mm_tlb_gen(mm); | ||
| 741 | |||
| 742 | /* Should we flush just the requested range? */ | 776 | /* Should we flush just the requested range? */ |
| 743 | if ((end != TLB_FLUSH_ALL) && | 777 | if ((end == TLB_FLUSH_ALL) || |
| 744 | ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { | 778 | ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { |
| 745 | info.start = start; | 779 | start = 0; |
| 746 | info.end = end; | 780 | end = TLB_FLUSH_ALL; |
| 747 | } else { | ||
| 748 | info.start = 0UL; | ||
| 749 | info.end = TLB_FLUSH_ALL; | ||
| 750 | } | 781 | } |
| 751 | 782 | ||
| 783 | /* This is also a barrier that synchronizes with switch_mm(). */ | ||
| 784 | new_tlb_gen = inc_mm_tlb_gen(mm); | ||
| 785 | |||
| 786 | info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, | ||
| 787 | new_tlb_gen); | ||
| 788 | |||
| 752 | if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { | 789 | if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { |
| 753 | VM_WARN_ON(irqs_disabled()); | 790 | lockdep_assert_irqs_enabled(); |
| 754 | local_irq_disable(); | 791 | local_irq_disable(); |
| 755 | flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); | 792 | flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); |
| 756 | local_irq_enable(); | 793 | local_irq_enable(); |
| 757 | } | 794 | } |
| 758 | 795 | ||
| 759 | if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) | 796 | if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) |
| 760 | flush_tlb_others(mm_cpumask(mm), &info); | 797 | flush_tlb_others(mm_cpumask(mm), info); |
| 761 | 798 | ||
| 799 | put_flush_tlb_info(); | ||
| 762 | put_cpu(); | 800 | put_cpu(); |
| 763 | } | 801 | } |
| 764 | 802 | ||
| @@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info) | |||
| 787 | 825 | ||
| 788 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) | 826 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) |
| 789 | { | 827 | { |
| 790 | |||
| 791 | /* Balance as user space task's flush, a bit conservative */ | 828 | /* Balance as user space task's flush, a bit conservative */ |
| 792 | if (end == TLB_FLUSH_ALL || | 829 | if (end == TLB_FLUSH_ALL || |
| 793 | (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { | 830 | (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { |
| 794 | on_each_cpu(do_flush_tlb_all, NULL, 1); | 831 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
| 795 | } else { | 832 | } else { |
| 796 | struct flush_tlb_info info; | 833 | struct flush_tlb_info *info; |
| 797 | info.start = start; | 834 | |
| 798 | info.end = end; | 835 | preempt_disable(); |
| 799 | on_each_cpu(do_kernel_range_flush, &info, 1); | 836 | info = get_flush_tlb_info(NULL, start, end, 0, false, 0); |
| 837 | |||
| 838 | on_each_cpu(do_kernel_range_flush, info, 1); | ||
| 839 | |||
| 840 | put_flush_tlb_info(); | ||
| 841 | preempt_enable(); | ||
| 800 | } | 842 | } |
| 801 | } | 843 | } |
| 802 | 844 | ||
| 845 | /* | ||
| 846 | * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. | ||
| 847 | * This means that the 'struct flush_tlb_info' that describes which mappings to | ||
| 848 | * flush is actually fixed. We therefore set a single fixed struct and use it in | ||
| 849 | * arch_tlbbatch_flush(). | ||
| 850 | */ | ||
| 851 | static const struct flush_tlb_info full_flush_tlb_info = { | ||
| 852 | .mm = NULL, | ||
| 853 | .start = 0, | ||
| 854 | .end = TLB_FLUSH_ALL, | ||
| 855 | }; | ||
| 856 | |||
| 803 | void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) | 857 | void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) |
| 804 | { | 858 | { |
| 805 | struct flush_tlb_info info = { | ||
| 806 | .mm = NULL, | ||
| 807 | .start = 0UL, | ||
| 808 | .end = TLB_FLUSH_ALL, | ||
| 809 | }; | ||
| 810 | |||
| 811 | int cpu = get_cpu(); | 859 | int cpu = get_cpu(); |
| 812 | 860 | ||
| 813 | if (cpumask_test_cpu(cpu, &batch->cpumask)) { | 861 | if (cpumask_test_cpu(cpu, &batch->cpumask)) { |
| 814 | VM_WARN_ON(irqs_disabled()); | 862 | lockdep_assert_irqs_enabled(); |
| 815 | local_irq_disable(); | 863 | local_irq_disable(); |
| 816 | flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); | 864 | flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); |
| 817 | local_irq_enable(); | 865 | local_irq_enable(); |
| 818 | } | 866 | } |
| 819 | 867 | ||
| 820 | if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) | 868 | if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) |
| 821 | flush_tlb_others(&batch->cpumask, &info); | 869 | flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); |
| 822 | 870 | ||
| 823 | cpumask_clear(&batch->cpumask); | 871 | cpumask_clear(&batch->cpumask); |
| 824 | 872 | ||
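The on-stack flush_tlb_info is replaced by one cacheline-aligned per-CPU instance, handed out by get_flush_tlb_info() under disabled preemption and, with CONFIG_DEBUG_VM, guarded against nesting. Stripped of the TLB specifics, the pattern looks like the sketch below; the names are hypothetical and it only illustrates why get/put must bracket a non-preemptible, non-reentrant region:

#include <linux/percpu.h>
#include <linux/bug.h>

struct scratch {                        /* stand-in for struct flush_tlb_info */
        unsigned long start, end;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct scratch, scratch);
#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, scratch_idx);
#endif

/* Caller must have preemption disabled (get_cpu() or preempt_disable()). */
static struct scratch *get_scratch(unsigned long start, unsigned long end)
{
        struct scratch *s = this_cpu_ptr(&scratch);

#ifdef CONFIG_DEBUG_VM
        /* One object per CPU: a nested user would silently overwrite it. */
        BUG_ON(this_cpu_inc_return(scratch_idx) != 1);
#endif
        s->start = start;
        s->end = end;
        return s;
}

static void put_scratch(void)
{
#ifdef CONFIG_DEBUG_VM
        barrier();              /* order against the last use of the object */
        this_cpu_dec(scratch_idx);
#endif
}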
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index a21e1734fc1f..beb44e22afdf 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c | |||
| @@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
| 2318 | #elif defined(CONFIG_X86_VSYSCALL_EMULATION) | 2318 | #elif defined(CONFIG_X86_VSYSCALL_EMULATION) |
| 2319 | case VSYSCALL_PAGE: | 2319 | case VSYSCALL_PAGE: |
| 2320 | #endif | 2320 | #endif |
| 2321 | case FIX_TEXT_POKE0: | ||
| 2322 | case FIX_TEXT_POKE1: | ||
| 2323 | /* All local page mappings */ | 2321 | /* All local page mappings */ |
| 2324 | pte = pfn_pte(phys, prot); | 2322 | pte = pfn_pte(phys, prot); |
| 2325 | break; | 2323 | break; |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index fa782fba51ee..75d9d68a6de7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
| @@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
| 1126 | static inline void init_espfix_bsp(void) { } | 1126 | static inline void init_espfix_bsp(void) { } |
| 1127 | #endif | 1127 | #endif |
| 1128 | 1128 | ||
| 1129 | extern void __init pgd_cache_init(void); | ||
| 1130 | |||
| 1129 | #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED | 1131 | #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED |
| 1130 | static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) | 1132 | static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) |
| 1131 | { | 1133 | { |
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b9edc7608d90..480e5b2a5748 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h | |||
| @@ -21,6 +21,15 @@ | |||
| 21 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
| 22 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
| 23 | 23 | ||
| 24 | /* | ||
| 25 | * Blindly accessing user memory from NMI context can be dangerous | ||
| 26 | * if we're in the middle of switching the current user task or switching | ||
| 27 | * the loaded mm. | ||
| 28 | */ | ||
| 29 | #ifndef nmi_uaccess_okay | ||
| 30 | # define nmi_uaccess_okay() true | ||
| 31 | #endif | ||
| 32 | |||
| 24 | #ifdef CONFIG_MMU | 33 | #ifdef CONFIG_MMU |
| 25 | 34 | ||
| 26 | /* | 35 | /* |
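The nmi_uaccess_okay() fallback added here defaults to true; x86 overrides it so that probes running while a task or mm switch is in flight (including a switch to the temporary patching mm) refuse to touch user memory. A hedged sketch of the call pattern, mirroring the bpf_probe_write_user() change later in this diff; the helper name is illustrative:

#include <linux/types.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>            /* nmi_uaccess_okay() */

static long probe_write_user(void __user *dst, const void *src, u32 size)
{
        /* Refuse if the loaded mm may not match current->mm right now. */
        if (unlikely(!nmi_uaccess_okay()))
                return -EPERM;
        if (!access_ok(dst, size))
                return -EPERM;

        return copy_to_user(dst, src, size) ? -EFAULT : 0;
}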
diff --git a/include/linux/filter.h b/include/linux/filter.h index 6074aa064b54..7d3abde3f183 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/set_memory.h> | 20 | #include <linux/set_memory.h> |
| 21 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
| 22 | #include <linux/if_vlan.h> | 22 | #include <linux/if_vlan.h> |
| 23 | #include <linux/vmalloc.h> | ||
| 23 | 24 | ||
| 24 | #include <net/sch_generic.h> | 25 | #include <net/sch_generic.h> |
| 25 | 26 | ||
| @@ -503,7 +504,6 @@ struct bpf_prog { | |||
| 503 | u16 pages; /* Number of allocated pages */ | 504 | u16 pages; /* Number of allocated pages */ |
| 504 | u16 jited:1, /* Is our filter JIT'ed? */ | 505 | u16 jited:1, /* Is our filter JIT'ed? */ |
| 505 | jit_requested:1,/* archs need to JIT the prog */ | 506 | jit_requested:1,/* archs need to JIT the prog */ |
| 506 | undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ | ||
| 507 | gpl_compatible:1, /* Is filter GPL compatible? */ | 507 | gpl_compatible:1, /* Is filter GPL compatible? */ |
| 508 | cb_access:1, /* Is control block accessed? */ | 508 | cb_access:1, /* Is control block accessed? */ |
| 509 | dst_needed:1, /* Do we need dst entry? */ | 509 | dst_needed:1, /* Do we need dst entry? */ |
| @@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) | |||
| 733 | 733 | ||
| 734 | static inline void bpf_prog_lock_ro(struct bpf_prog *fp) | 734 | static inline void bpf_prog_lock_ro(struct bpf_prog *fp) |
| 735 | { | 735 | { |
| 736 | fp->undo_set_mem = 1; | 736 | set_vm_flush_reset_perms(fp); |
| 737 | set_memory_ro((unsigned long)fp, fp->pages); | 737 | set_memory_ro((unsigned long)fp, fp->pages); |
| 738 | } | 738 | } |
| 739 | 739 | ||
| 740 | static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) | ||
| 741 | { | ||
| 742 | if (fp->undo_set_mem) | ||
| 743 | set_memory_rw((unsigned long)fp, fp->pages); | ||
| 744 | } | ||
| 745 | |||
| 746 | static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) | 740 | static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) |
| 747 | { | 741 | { |
| 742 | set_vm_flush_reset_perms(hdr); | ||
| 748 | set_memory_ro((unsigned long)hdr, hdr->pages); | 743 | set_memory_ro((unsigned long)hdr, hdr->pages); |
| 749 | } | 744 | set_memory_x((unsigned long)hdr, hdr->pages); |
| 750 | |||
| 751 | static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) | ||
| 752 | { | ||
| 753 | set_memory_rw((unsigned long)hdr, hdr->pages); | ||
| 754 | } | 745 | } |
| 755 | 746 | ||
| 756 | static inline struct bpf_binary_header * | 747 | static inline struct bpf_binary_header * |
| @@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp); | |||
| 788 | 779 | ||
| 789 | static inline void bpf_prog_unlock_free(struct bpf_prog *fp) | 780 | static inline void bpf_prog_unlock_free(struct bpf_prog *fp) |
| 790 | { | 781 | { |
| 791 | bpf_prog_unlock_ro(fp); | ||
| 792 | __bpf_prog_free(fp); | 782 | __bpf_prog_free(fp); |
| 793 | } | 783 | } |
| 794 | 784 | ||
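With set_vm_flush_reset_perms() applied before the permission change, the undo_set_mem bookkeeping and the *_unlock_ro() helpers become unnecessary: freeing the image resets both the vmalloc mapping and the direct map. A sketch of the resulting JIT life cycle; the two wrapper functions are assumed caller code, only the helpers shown above are real:

#include <linux/filter.h>

static void jit_publish(struct bpf_binary_header *hdr, struct bpf_prog *fp)
{
        /* Marks the area VM_FLUSH_RESET_PERMS, then makes it RO + X. */
        bpf_jit_binary_lock_ro(hdr);
        fp->jited = 1;
}

static void jit_destroy(struct bpf_binary_header *hdr)
{
        /*
         * No unlock step any more: freeing the image (ultimately vfree())
         * clears the permissions and flushes the stale W/X aliases.
         */
        bpf_jit_binary_free(hdr);
}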
diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b10c21630f5..083d7b4863ed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages, | |||
| 2610 | int enable) { } | 2610 | int enable) { } |
| 2611 | #endif | 2611 | #endif |
| 2612 | 2612 | ||
| 2613 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 2614 | extern bool _debug_pagealloc_enabled; | 2613 | extern bool _debug_pagealloc_enabled; |
| 2615 | extern void __kernel_map_pages(struct page *page, int numpages, int enable); | ||
| 2616 | 2614 | ||
| 2617 | static inline bool debug_pagealloc_enabled(void) | 2615 | static inline bool debug_pagealloc_enabled(void) |
| 2618 | { | 2616 | { |
| 2619 | return _debug_pagealloc_enabled; | 2617 | return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; |
| 2620 | } | 2618 | } |
| 2621 | 2619 | ||
| 2620 | #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) | ||
| 2621 | extern void __kernel_map_pages(struct page *page, int numpages, int enable); | ||
| 2622 | |||
| 2622 | static inline void | 2623 | static inline void |
| 2623 | kernel_map_pages(struct page *page, int numpages, int enable) | 2624 | kernel_map_pages(struct page *page, int numpages, int enable) |
| 2624 | { | 2625 | { |
| 2625 | if (!debug_pagealloc_enabled()) | ||
| 2626 | return; | ||
| 2627 | |||
| 2628 | __kernel_map_pages(page, numpages, enable); | 2626 | __kernel_map_pages(page, numpages, enable); |
| 2629 | } | 2627 | } |
| 2630 | #ifdef CONFIG_HIBERNATION | 2628 | #ifdef CONFIG_HIBERNATION |
| 2631 | extern bool kernel_page_present(struct page *page); | 2629 | extern bool kernel_page_present(struct page *page); |
| 2632 | #endif /* CONFIG_HIBERNATION */ | 2630 | #endif /* CONFIG_HIBERNATION */ |
| 2633 | #else /* CONFIG_DEBUG_PAGEALLOC */ | 2631 | #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ |
| 2634 | static inline void | 2632 | static inline void |
| 2635 | kernel_map_pages(struct page *page, int numpages, int enable) {} | 2633 | kernel_map_pages(struct page *page, int numpages, int enable) {} |
| 2636 | #ifdef CONFIG_HIBERNATION | 2634 | #ifdef CONFIG_HIBERNATION |
| 2637 | static inline bool kernel_page_present(struct page *page) { return true; } | 2635 | static inline bool kernel_page_present(struct page *page) { return true; } |
| 2638 | #endif /* CONFIG_HIBERNATION */ | 2636 | #endif /* CONFIG_HIBERNATION */ |
| 2639 | static inline bool debug_pagealloc_enabled(void) | 2637 | #endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ |
| 2640 | { | ||
| 2641 | return false; | ||
| 2642 | } | ||
| 2643 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | ||
| 2644 | 2638 | ||
| 2645 | #ifdef __HAVE_ARCH_GATE_AREA | 2639 | #ifdef __HAVE_ARCH_GATE_AREA |
| 2646 | extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); | 2640 | extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); |
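Because debug_pagealloc_enabled() now folds in IS_ENABLED(CONFIG_DEBUG_PAGEALLOC), a guarded call site costs nothing when the option is compiled out, while __kernel_map_pages() stays visible to architectures that provide set_direct_map_*(). A hypothetical call site showing the shape the page allocator adopts later in this diff:

#include <linux/mm.h>

static inline void debug_unmap_pages(struct page *page, int order)
{
        /* Constant-folds away when CONFIG_DEBUG_PAGEALLOC is not set. */
        if (debug_pagealloc_enabled())
                kernel_map_pages(page, 1 << order, 0);
}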
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 2e97a2227045..f1227f2c38a4 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h | |||
| @@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *); | |||
| 76 | extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); | 76 | extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); |
| 77 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); | 77 | extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); |
| 78 | struct task_struct *fork_idle(int); | 78 | struct task_struct *fork_idle(int); |
| 79 | struct mm_struct *copy_init_mm(void); | ||
| 79 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); | 80 | extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); |
| 80 | extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); | 81 | extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); |
| 81 | 82 | ||
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 2a986d282a97..b5071497b8cb 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h | |||
| @@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } | |||
| 17 | static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } | 17 | static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } |
| 18 | #endif | 18 | #endif |
| 19 | 19 | ||
| 20 | #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP | ||
| 21 | static inline int set_direct_map_invalid_noflush(struct page *page) | ||
| 22 | { | ||
| 23 | return 0; | ||
| 24 | } | ||
| 25 | static inline int set_direct_map_default_noflush(struct page *page) | ||
| 26 | { | ||
| 27 | return 0; | ||
| 28 | } | ||
| 29 | #endif | ||
| 30 | |||
| 20 | #ifndef set_mce_nospec | 31 | #ifndef set_mce_nospec |
| 21 | static inline int set_mce_nospec(unsigned long pfn) | 32 | static inline int set_mce_nospec(unsigned long pfn) |
| 22 | { | 33 | { |
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 103a48a48872..12bf0b68ed92 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h | |||
| @@ -115,6 +115,7 @@ struct uprobes_state { | |||
| 115 | struct xol_area *xol_area; | 115 | struct xol_area *xol_area; |
| 116 | }; | 116 | }; |
| 117 | 117 | ||
| 118 | extern void __init uprobes_init(void); | ||
| 118 | extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); | 119 | extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); |
| 119 | extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); | 120 | extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); |
| 120 | extern bool is_swbp_insn(uprobe_opcode_t *insn); | 121 | extern bool is_swbp_insn(uprobe_opcode_t *insn); |
| @@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, | |||
| 154 | struct uprobes_state { | 155 | struct uprobes_state { |
| 155 | }; | 156 | }; |
| 156 | 157 | ||
| 158 | static inline void uprobes_init(void) | ||
| 159 | { | ||
| 160 | } | ||
| 161 | |||
| 157 | #define uprobe_get_trap_addr(regs) instruction_pointer(regs) | 162 | #define uprobe_get_trap_addr(regs) instruction_pointer(regs) |
| 158 | 163 | ||
| 159 | static inline int | 164 | static inline int |
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 398e9c95cd61..c6eebb839552 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h | |||
| @@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */ | |||
| 21 | #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ | 21 | #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ |
| 22 | #define VM_NO_GUARD 0x00000040 /* don't add guard page */ | 22 | #define VM_NO_GUARD 0x00000040 /* don't add guard page */ |
| 23 | #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ | 23 | #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ |
| 24 | /* | ||
| 25 | * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with | ||
| 26 | * vfree_atomic(). | ||
| 27 | */ | ||
| 28 | #define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */ | ||
| 24 | /* bits [20..32] reserved for arch specific ioremap internals */ | 29 | /* bits [20..32] reserved for arch specific ioremap internals */ |
| 25 | 30 | ||
| 26 | /* | 31 | /* |
| @@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size, | |||
| 142 | pgprot_t prot, struct page **pages); | 147 | pgprot_t prot, struct page **pages); |
| 143 | extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); | 148 | extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); |
| 144 | extern void unmap_kernel_range(unsigned long addr, unsigned long size); | 149 | extern void unmap_kernel_range(unsigned long addr, unsigned long size); |
| 150 | static inline void set_vm_flush_reset_perms(void *addr) | ||
| 151 | { | ||
| 152 | struct vm_struct *vm = find_vm_area(addr); | ||
| 153 | |||
| 154 | if (vm) | ||
| 155 | vm->flags |= VM_FLUSH_RESET_PERMS; | ||
| 156 | } | ||
| 145 | #else | 157 | #else |
| 146 | static inline int | 158 | static inline int |
| 147 | map_kernel_range_noflush(unsigned long start, unsigned long size, | 159 | map_kernel_range_noflush(unsigned long start, unsigned long size, |
| @@ -157,6 +169,9 @@ static inline void | |||
| 157 | unmap_kernel_range(unsigned long addr, unsigned long size) | 169 | unmap_kernel_range(unsigned long addr, unsigned long size) |
| 158 | { | 170 | { |
| 159 | } | 171 | } |
| 172 | static inline void set_vm_flush_reset_perms(void *addr) | ||
| 173 | { | ||
| 174 | } | ||
| 160 | #endif | 175 | #endif |
| 161 | 176 | ||
| 162 | /* Allocate/destroy a 'vmalloc' VM area. */ | 177 | /* Allocate/destroy a 'vmalloc' VM area. */ |
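VM_FLUSH_RESET_PERMS is meant to be set, either at allocation time or via set_vm_flush_reset_perms(), before an area's protections are tightened, so that a plain vfree() can later undo RO/X safely. A sketch of an executable-image allocator using it (illustrative names; the constraint from the comment above applies, so such memory must not be freed from interrupt context or with vfree_atomic()):

#include <linux/vmalloc.h>
#include <linux/set_memory.h>

static void *exec_image_alloc(unsigned long size)
{
        return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
                                    GFP_KERNEL, PAGE_KERNEL,
                                    VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}

static void exec_image_protect(void *image, unsigned int npages)
{
        /* Lock the image down: drop W, grant X. */
        set_memory_ro((unsigned long)image, npages);
        set_memory_x((unsigned long)image, npages);
}

/* Teardown is simply vfree(image): permissions and the direct map are reset. */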
diff --git a/init/main.c b/init/main.c index 7d4025d665eb..9dc2f3b4f753 100644 --- a/init/main.c +++ b/init/main.c | |||
| @@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void) | |||
| 504 | 504 | ||
| 505 | void __init __weak mem_encrypt_init(void) { } | 505 | void __init __weak mem_encrypt_init(void) { } |
| 506 | 506 | ||
| 507 | void __init __weak poking_init(void) { } | ||
| 508 | |||
| 509 | void __init __weak pgd_cache_init(void) { } | ||
| 510 | |||
| 507 | bool initcall_debug; | 511 | bool initcall_debug; |
| 508 | core_param(initcall_debug, initcall_debug, bool, 0644); | 512 | core_param(initcall_debug, initcall_debug, bool, 0644); |
| 509 | 513 | ||
| @@ -535,6 +539,7 @@ static void __init mm_init(void) | |||
| 535 | init_espfix_bsp(); | 539 | init_espfix_bsp(); |
| 536 | /* Should be run after espfix64 is set up. */ | 540 | /* Should be run after espfix64 is set up. */ |
| 537 | pti_init(); | 541 | pti_init(); |
| 542 | pgd_cache_init(); | ||
| 538 | } | 543 | } |
| 539 | 544 | ||
| 540 | void __init __weak arch_call_rest_init(void) | 545 | void __init __weak arch_call_rest_init(void) |
| @@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
| 737 | taskstats_init_early(); | 742 | taskstats_init_early(); |
| 738 | delayacct_init(); | 743 | delayacct_init(); |
| 739 | 744 | ||
| 745 | poking_init(); | ||
| 740 | check_bugs(); | 746 | check_bugs(); |
| 741 | 747 | ||
| 742 | acpi_subsystem_init(); | 748 | acpi_subsystem_init(); |
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..c605397c79f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) | |||
| 848 | if (fp->jited) { | 848 | if (fp->jited) { |
| 849 | struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); | 849 | struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); |
| 850 | 850 | ||
| 851 | bpf_jit_binary_unlock_ro(hdr); | ||
| 852 | bpf_jit_binary_free(hdr); | 851 | bpf_jit_binary_free(hdr); |
| 853 | 852 | ||
| 854 | WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); | 853 | WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..e6a0d6be87e3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = { | |||
| 2294 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ | 2294 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ |
| 2295 | }; | 2295 | }; |
| 2296 | 2296 | ||
| 2297 | static int __init init_uprobes(void) | 2297 | void __init uprobes_init(void) |
| 2298 | { | 2298 | { |
| 2299 | int i; | 2299 | int i; |
| 2300 | 2300 | ||
| 2301 | for (i = 0; i < UPROBES_HASH_SZ; i++) | 2301 | for (i = 0; i < UPROBES_HASH_SZ; i++) |
| 2302 | mutex_init(&uprobes_mmap_mutex[i]); | 2302 | mutex_init(&uprobes_mmap_mutex[i]); |
| 2303 | 2303 | ||
| 2304 | if (percpu_init_rwsem(&dup_mmap_sem)) | 2304 | BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); |
| 2305 | return -ENOMEM; | ||
| 2306 | 2305 | ||
| 2307 | return register_die_notifier(&uprobe_exception_nb); | 2306 | BUG_ON(register_die_notifier(&uprobe_exception_nb)); |
| 2308 | } | 2307 | } |
| 2309 | __initcall(init_uprobes); | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..fbe9dfcd8680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -815,6 +815,7 @@ void __init fork_init(void) | |||
| 815 | #endif | 815 | #endif |
| 816 | 816 | ||
| 817 | lockdep_init_task(&init_task); | 817 | lockdep_init_task(&init_task); |
| 818 | uprobes_init(); | ||
| 818 | } | 819 | } |
| 819 | 820 | ||
| 820 | int __weak arch_dup_task_struct(struct task_struct *dst, | 821 | int __weak arch_dup_task_struct(struct task_struct *dst, |
| @@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
| 1298 | complete_vfork_done(tsk); | 1299 | complete_vfork_done(tsk); |
| 1299 | } | 1300 | } |
| 1300 | 1301 | ||
| 1301 | /* | 1302 | /** |
| 1302 | * Allocate a new mm structure and copy contents from the | 1303 | * dup_mm() - duplicates an existing mm structure |
| 1303 | * mm structure of the passed in task structure. | 1304 | * @tsk: the task_struct with which the new mm will be associated. |
| 1305 | * @oldmm: the mm to duplicate. | ||
| 1306 | * | ||
| 1307 | * Allocates a new mm structure and duplicates the provided @oldmm structure | ||
| 1308 | * content into it. | ||
| 1309 | * | ||
| 1310 | * Return: the duplicated mm or NULL on failure. | ||
| 1304 | */ | 1311 | */ |
| 1305 | static struct mm_struct *dup_mm(struct task_struct *tsk) | 1312 | static struct mm_struct *dup_mm(struct task_struct *tsk, |
| 1313 | struct mm_struct *oldmm) | ||
| 1306 | { | 1314 | { |
| 1307 | struct mm_struct *mm, *oldmm = current->mm; | 1315 | struct mm_struct *mm; |
| 1308 | int err; | 1316 | int err; |
| 1309 | 1317 | ||
| 1310 | mm = allocate_mm(); | 1318 | mm = allocate_mm(); |
| @@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1371 | } | 1379 | } |
| 1372 | 1380 | ||
| 1373 | retval = -ENOMEM; | 1381 | retval = -ENOMEM; |
| 1374 | mm = dup_mm(tsk); | 1382 | mm = dup_mm(tsk, current->mm); |
| 1375 | if (!mm) | 1383 | if (!mm) |
| 1376 | goto fail_nomem; | 1384 | goto fail_nomem; |
| 1377 | 1385 | ||
| @@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu) | |||
| 2186 | return task; | 2194 | return task; |
| 2187 | } | 2195 | } |
| 2188 | 2196 | ||
| 2197 | struct mm_struct *copy_init_mm(void) | ||
| 2198 | { | ||
| 2199 | return dup_mm(NULL, &init_mm); | ||
| 2200 | } | ||
| 2201 | |||
| 2189 | /* | 2202 | /* |
| 2190 | * Ok, this is the main fork-routine. | 2203 | * Ok, this is the main fork-routine. |
| 2191 | * | 2204 | * |
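copy_init_mm() gives architectures a way to build a private clone of init_mm at boot. One expected consumer is text patching through a temporary mm; the sketch below is only a guess at the shape of such a user and is not part of the hunks shown here:

#include <linux/init.h>
#include <linux/cache.h>
#include <linux/bug.h>
#include <linux/sched/task.h>

static struct mm_struct *patching_mm __ro_after_init;

void __init poking_init(void)
{
        /* A kernel-only mm, never exposed through user page tables. */
        patching_mm = copy_init_mm();
        BUG_ON(!patching_mm);
}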
diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); | |||
| 98 | EXPORT_SYMBOL_GPL(module_mutex); | 98 | EXPORT_SYMBOL_GPL(module_mutex); |
| 99 | static LIST_HEAD(modules); | 99 | static LIST_HEAD(modules); |
| 100 | 100 | ||
| 101 | /* Work queue for freeing init sections in success case */ | ||
| 102 | static struct work_struct init_free_wq; | ||
| 103 | static struct llist_head init_free_list; | ||
| 104 | |||
| 101 | #ifdef CONFIG_MODULES_TREE_LOOKUP | 105 | #ifdef CONFIG_MODULES_TREE_LOOKUP |
| 102 | 106 | ||
| 103 | /* | 107 | /* |
| @@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init) | |||
| 1949 | if (!rodata_enabled) | 1953 | if (!rodata_enabled) |
| 1950 | return; | 1954 | return; |
| 1951 | 1955 | ||
| 1956 | set_vm_flush_reset_perms(mod->core_layout.base); | ||
| 1957 | set_vm_flush_reset_perms(mod->init_layout.base); | ||
| 1952 | frob_text(&mod->core_layout, set_memory_ro); | 1958 | frob_text(&mod->core_layout, set_memory_ro); |
| 1959 | frob_text(&mod->core_layout, set_memory_x); | ||
| 1960 | |||
| 1953 | frob_rodata(&mod->core_layout, set_memory_ro); | 1961 | frob_rodata(&mod->core_layout, set_memory_ro); |
| 1962 | |||
| 1954 | frob_text(&mod->init_layout, set_memory_ro); | 1963 | frob_text(&mod->init_layout, set_memory_ro); |
| 1964 | frob_text(&mod->init_layout, set_memory_x); | ||
| 1965 | |||
| 1955 | frob_rodata(&mod->init_layout, set_memory_ro); | 1966 | frob_rodata(&mod->init_layout, set_memory_ro); |
| 1956 | 1967 | ||
| 1957 | if (after_init) | 1968 | if (after_init) |
| @@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) | |||
| 1967 | frob_writable_data(&mod->init_layout, set_memory_nx); | 1978 | frob_writable_data(&mod->init_layout, set_memory_nx); |
| 1968 | } | 1979 | } |
| 1969 | 1980 | ||
| 1970 | static void module_disable_nx(const struct module *mod) | ||
| 1971 | { | ||
| 1972 | frob_rodata(&mod->core_layout, set_memory_x); | ||
| 1973 | frob_ro_after_init(&mod->core_layout, set_memory_x); | ||
| 1974 | frob_writable_data(&mod->core_layout, set_memory_x); | ||
| 1975 | frob_rodata(&mod->init_layout, set_memory_x); | ||
| 1976 | frob_writable_data(&mod->init_layout, set_memory_x); | ||
| 1977 | } | ||
| 1978 | |||
| 1979 | /* Iterate through all modules and set each module's text as RW */ | 1981 | /* Iterate through all modules and set each module's text as RW */ |
| 1980 | void set_all_modules_text_rw(void) | 1982 | void set_all_modules_text_rw(void) |
| 1981 | { | 1983 | { |
| @@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void) | |||
| 2019 | } | 2021 | } |
| 2020 | mutex_unlock(&module_mutex); | 2022 | mutex_unlock(&module_mutex); |
| 2021 | } | 2023 | } |
| 2022 | |||
| 2023 | static void disable_ro_nx(const struct module_layout *layout) | ||
| 2024 | { | ||
| 2025 | if (rodata_enabled) { | ||
| 2026 | frob_text(layout, set_memory_rw); | ||
| 2027 | frob_rodata(layout, set_memory_rw); | ||
| 2028 | frob_ro_after_init(layout, set_memory_rw); | ||
| 2029 | } | ||
| 2030 | frob_rodata(layout, set_memory_x); | ||
| 2031 | frob_ro_after_init(layout, set_memory_x); | ||
| 2032 | frob_writable_data(layout, set_memory_x); | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | #else | 2024 | #else |
| 2036 | static void disable_ro_nx(const struct module_layout *layout) { } | ||
| 2037 | static void module_enable_nx(const struct module *mod) { } | 2025 | static void module_enable_nx(const struct module *mod) { } |
| 2038 | static void module_disable_nx(const struct module *mod) { } | ||
| 2039 | #endif | 2026 | #endif |
| 2040 | 2027 | ||
| 2041 | #ifdef CONFIG_LIVEPATCH | 2028 | #ifdef CONFIG_LIVEPATCH |
| @@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod) | |||
| 2115 | 2102 | ||
| 2116 | void __weak module_memfree(void *module_region) | 2103 | void __weak module_memfree(void *module_region) |
| 2117 | { | 2104 | { |
| 2105 | /* | ||
| 2106 | * This memory may be RO, and freeing RO memory in an interrupt is not | ||
| 2107 | * supported by vmalloc. | ||
| 2108 | */ | ||
| 2109 | WARN_ON(in_interrupt()); | ||
| 2118 | vfree(module_region); | 2110 | vfree(module_region); |
| 2119 | } | 2111 | } |
| 2120 | 2112 | ||
| @@ -2166,7 +2158,6 @@ static void free_module(struct module *mod) | |||
| 2166 | mutex_unlock(&module_mutex); | 2158 | mutex_unlock(&module_mutex); |
| 2167 | 2159 | ||
| 2168 | /* This may be empty, but that's OK */ | 2160 | /* This may be empty, but that's OK */ |
| 2169 | disable_ro_nx(&mod->init_layout); | ||
| 2170 | module_arch_freeing_init(mod); | 2161 | module_arch_freeing_init(mod); |
| 2171 | module_memfree(mod->init_layout.base); | 2162 | module_memfree(mod->init_layout.base); |
| 2172 | kfree(mod->args); | 2163 | kfree(mod->args); |
| @@ -2176,7 +2167,6 @@ static void free_module(struct module *mod) | |||
| 2176 | lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); | 2167 | lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); |
| 2177 | 2168 | ||
| 2178 | /* Finally, free the core (containing the module structure) */ | 2169 | /* Finally, free the core (containing the module structure) */ |
| 2179 | disable_ro_nx(&mod->core_layout); | ||
| 2180 | module_memfree(mod->core_layout.base); | 2170 | module_memfree(mod->core_layout.base); |
| 2181 | } | 2171 | } |
| 2182 | 2172 | ||
| @@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) | |||
| 3415 | 3405 | ||
| 3416 | /* For freeing module_init on success, in case kallsyms traversing */ | 3406 | /* For freeing module_init on success, in case kallsyms traversing */ |
| 3417 | struct mod_initfree { | 3407 | struct mod_initfree { |
| 3418 | struct rcu_head rcu; | 3408 | struct llist_node node; |
| 3419 | void *module_init; | 3409 | void *module_init; |
| 3420 | }; | 3410 | }; |
| 3421 | 3411 | ||
| 3422 | static void do_free_init(struct rcu_head *head) | 3412 | static void do_free_init(struct work_struct *w) |
| 3423 | { | 3413 | { |
| 3424 | struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); | 3414 | struct llist_node *pos, *n, *list; |
| 3425 | module_memfree(m->module_init); | 3415 | struct mod_initfree *initfree; |
| 3426 | kfree(m); | 3416 | |
| 3417 | list = llist_del_all(&init_free_list); | ||
| 3418 | |||
| 3419 | synchronize_rcu(); | ||
| 3420 | |||
| 3421 | llist_for_each_safe(pos, n, list) { | ||
| 3422 | initfree = container_of(pos, struct mod_initfree, node); | ||
| 3423 | module_memfree(initfree->module_init); | ||
| 3424 | kfree(initfree); | ||
| 3425 | } | ||
| 3427 | } | 3426 | } |
| 3428 | 3427 | ||
| 3428 | static int __init modules_wq_init(void) | ||
| 3429 | { | ||
| 3430 | INIT_WORK(&init_free_wq, do_free_init); | ||
| 3431 | init_llist_head(&init_free_list); | ||
| 3432 | return 0; | ||
| 3433 | } | ||
| 3434 | module_init(modules_wq_init); | ||
| 3435 | |||
| 3429 | /* | 3436 | /* |
| 3430 | * This is where the real work happens. | 3437 | * This is where the real work happens. |
| 3431 | * | 3438 | * |
| @@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) | |||
| 3502 | #endif | 3509 | #endif |
| 3503 | module_enable_ro(mod, true); | 3510 | module_enable_ro(mod, true); |
| 3504 | mod_tree_remove_init(mod); | 3511 | mod_tree_remove_init(mod); |
| 3505 | disable_ro_nx(&mod->init_layout); | ||
| 3506 | module_arch_freeing_init(mod); | 3512 | module_arch_freeing_init(mod); |
| 3507 | mod->init_layout.base = NULL; | 3513 | mod->init_layout.base = NULL; |
| 3508 | mod->init_layout.size = 0; | 3514 | mod->init_layout.size = 0; |
| @@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) | |||
| 3513 | * We want to free module_init, but be aware that kallsyms may be | 3519 | * We want to free module_init, but be aware that kallsyms may be |
| 3514 | * walking this with preempt disabled. In all the failure paths, we | 3520 | * walking this with preempt disabled. In all the failure paths, we |
| 3515 | * call synchronize_rcu(), but we don't want to slow down the success | 3521 | * call synchronize_rcu(), but we don't want to slow down the success |
| 3516 | * path, so use actual RCU here. | 3522 | * path. module_memfree() cannot be called in an interrupt, so do the |
| 3523 | * work and call synchronize_rcu() in a work queue. | ||
| 3524 | * | ||
| 3517 | * Note that module_alloc() on most architectures creates W+X page | 3525 | * Note that module_alloc() on most architectures creates W+X page |
| 3518 | * mappings which won't be cleaned up until do_free_init() runs. Any | 3526 | * mappings which won't be cleaned up until do_free_init() runs. Any |
| 3519 | * code such as mark_rodata_ro() which depends on those mappings to | 3527 | * code such as mark_rodata_ro() which depends on those mappings to |
| 3520 | * be cleaned up needs to sync with the queued work - ie | 3528 | * be cleaned up needs to sync with the queued work - ie |
| 3521 | * rcu_barrier() | 3529 | * rcu_barrier() |
| 3522 | */ | 3530 | */ |
| 3523 | call_rcu(&freeinit->rcu, do_free_init); | 3531 | if (llist_add(&freeinit->node, &init_free_list)) |
| 3532 | schedule_work(&init_free_wq); | ||
| 3533 | |||
| 3524 | mutex_unlock(&module_mutex); | 3534 | mutex_unlock(&module_mutex); |
| 3525 | wake_up_all(&module_wq); | 3535 | wake_up_all(&module_wq); |
| 3526 | 3536 | ||
| @@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3817 | module_bug_cleanup(mod); | 3827 | module_bug_cleanup(mod); |
| 3818 | mutex_unlock(&module_mutex); | 3828 | mutex_unlock(&module_mutex); |
| 3819 | 3829 | ||
| 3820 | /* we can't deallocate the module until we clear memory protection */ | ||
| 3821 | module_disable_ro(mod); | ||
| 3822 | module_disable_nx(mod); | ||
| 3823 | |||
| 3824 | ddebug_cleanup: | 3830 | ddebug_cleanup: |
| 3825 | ftrace_release_mod(mod); | 3831 | ftrace_release_mod(mod); |
| 3826 | dynamic_debug_remove(mod, info->debug); | 3832 | dynamic_debug_remove(mod, info->debug); |
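Freeing the init sections moves from call_rcu() to an llist drained by a workqueue item, because module_memfree() may now have to undo RO mappings, which vmalloc cannot do from the RCU callback (interrupt) context; the synchronize_rcu() happens inside the worker instead. The same pattern in isolation, with hypothetical names:

#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

struct deferred_free {
        struct llist_node node;
        void *mem;                      /* e.g. a module init region */
};

static LLIST_HEAD(deferred_list);
static void deferred_free_work(struct work_struct *w);
static DECLARE_WORK(deferred_work, deferred_free_work);

static void deferred_free_work(struct work_struct *w)
{
        struct llist_node *pos, *n;
        struct llist_node *list = llist_del_all(&deferred_list);

        /* Wait for lockless walkers (kallsyms-style) before freeing. */
        synchronize_rcu();

        llist_for_each_safe(pos, n, list) {
                struct deferred_free *d =
                        container_of(pos, struct deferred_free, node);

                vfree(d->mem);          /* process context: RO reset is fine */
                kfree(d);
        }
}

static void queue_deferred_free(struct deferred_free *d)
{
        /* llist_add() returns true if the list was empty: kick the worker. */
        if (llist_add(&d->node, &deferred_list))
                schedule_work(&deferred_work);
}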
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) | |||
| 1342 | * safe_copy_page - Copy a page in a safe way. | 1342 | * safe_copy_page - Copy a page in a safe way. |
| 1343 | * | 1343 | * |
| 1344 | * Check if the page we are going to copy is marked as present in the kernel | 1344 | * Check if the page we are going to copy is marked as present in the kernel |
| 1345 | * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set | 1345 | * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or |
| 1346 | * and in that case kernel_page_present() always returns 'true'). | 1346 | * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() |
| 1347 | * always returns 'true'. | ||
| 1347 | */ | 1348 | */ |
| 1348 | static void safe_copy_page(void *dst, struct page *s_page) | 1349 | static void safe_copy_page(void *dst, struct page *s_page) |
| 1349 | { | 1350 | { |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -14,6 +14,8 @@ | |||
| 14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
| 15 | #include <linux/error-injection.h> | 15 | #include <linux/error-injection.h> |
| 16 | 16 | ||
| 17 | #include <asm/tlb.h> | ||
| 18 | |||
| 17 | #include "trace_probe.h" | 19 | #include "trace_probe.h" |
| 18 | #include "trace.h" | 20 | #include "trace.h" |
| 19 | 21 | ||
| @@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
| 163 | * access_ok() should prevent writing to non-user memory, but in | 165 | * access_ok() should prevent writing to non-user memory, but in |
| 164 | * some situations (nommu, temporary switch, etc) access_ok() does | 166 | * some situations (nommu, temporary switch, etc) access_ok() does |
| 165 | * not provide enough validation, hence the check on KERNEL_DS. | 167 | * not provide enough validation, hence the check on KERNEL_DS. |
| 168 | * | ||
| 169 | * nmi_uaccess_okay() ensures the probe is not run in an interim | ||
| 170 | * state, when the task or mm are switched. This is specifically | ||
| 171 | * required to prevent the use of temporary mm. | ||
| 166 | */ | 172 | */ |
| 167 | 173 | ||
| 168 | if (unlikely(in_interrupt() || | 174 | if (unlikely(in_interrupt() || |
| @@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
| 170 | return -EPERM; | 176 | return -EPERM; |
| 171 | if (unlikely(uaccess_kernel())) | 177 | if (unlikely(uaccess_kernel())) |
| 172 | return -EPERM; | 178 | return -EPERM; |
| 179 | if (unlikely(!nmi_uaccess_okay())) | ||
| 180 | return -EPERM; | ||
| 173 | if (!access_ok(unsafe_ptr, size)) | 181 | if (!access_ok(unsafe_ptr, size)) |
| 174 | return -EPERM; | 182 | return -EPERM; |
| 175 | 183 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c02cff1ed56e..59661106da16 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
| 1144 | } | 1144 | } |
| 1145 | arch_free_page(page, order); | 1145 | arch_free_page(page, order); |
| 1146 | kernel_poison_pages(page, 1 << order, 0); | 1146 | kernel_poison_pages(page, 1 << order, 0); |
| 1147 | kernel_map_pages(page, 1 << order, 0); | 1147 | if (debug_pagealloc_enabled()) |
| 1148 | kernel_map_pages(page, 1 << order, 0); | ||
| 1149 | |||
| 1148 | kasan_free_nondeferred_pages(page, order); | 1150 | kasan_free_nondeferred_pages(page, order); |
| 1149 | 1151 | ||
| 1150 | return true; | 1152 | return true; |
| @@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, | |||
| 2014 | set_page_refcounted(page); | 2016 | set_page_refcounted(page); |
| 2015 | 2017 | ||
| 2016 | arch_alloc_page(page, order); | 2018 | arch_alloc_page(page, order); |
| 2017 | kernel_map_pages(page, 1 << order, 1); | 2019 | if (debug_pagealloc_enabled()) |
| 2020 | kernel_map_pages(page, 1 << order, 1); | ||
| 2018 | kasan_alloc_pages(page, order); | 2021 | kasan_alloc_pages(page, order); |
| 2019 | kernel_poison_pages(page, 1 << order, 1); | 2022 | kernel_poison_pages(page, 1 << order, 1); |
| 2020 | set_page_owner(page, order, gfp_flags); | 2023 | set_page_owner(page, order, gfp_flags); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e86ba6e74b50..e5e9e1fcac01 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
| 19 | #include <linux/proc_fs.h> | 19 | #include <linux/proc_fs.h> |
| 20 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
| 21 | #include <linux/set_memory.h> | ||
| 21 | #include <linux/debugobjects.h> | 22 | #include <linux/debugobjects.h> |
| 22 | #include <linux/kallsyms.h> | 23 | #include <linux/kallsyms.h> |
| 23 | #include <linux/list.h> | 24 | #include <linux/list.h> |
| @@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 1059 | spin_unlock(&vb->lock); | 1060 | spin_unlock(&vb->lock); |
| 1060 | } | 1061 | } |
| 1061 | 1062 | ||
| 1062 | /** | 1063 | static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) |
| 1063 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
| 1064 | * | ||
| 1065 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
| 1066 | * to amortize TLB flushing overheads. What this means is that any page you | ||
| 1067 | * have now, may, in a former life, have been mapped into kernel virtual | ||
| 1068 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
| 1069 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
| 1070 | * | ||
| 1071 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
| 1072 | * be sure that none of the pages we have control over will have any aliases | ||
| 1073 | * from the vmap layer. | ||
| 1074 | */ | ||
| 1075 | void vm_unmap_aliases(void) | ||
| 1076 | { | 1064 | { |
| 1077 | unsigned long start = ULONG_MAX, end = 0; | ||
| 1078 | int cpu; | 1065 | int cpu; |
| 1079 | int flush = 0; | ||
| 1080 | 1066 | ||
| 1081 | if (unlikely(!vmap_initialized)) | 1067 | if (unlikely(!vmap_initialized)) |
| 1082 | return; | 1068 | return; |
| @@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void) | |||
| 1113 | flush_tlb_kernel_range(start, end); | 1099 | flush_tlb_kernel_range(start, end); |
| 1114 | mutex_unlock(&vmap_purge_lock); | 1100 | mutex_unlock(&vmap_purge_lock); |
| 1115 | } | 1101 | } |
| 1102 | |||
| 1103 | /** | ||
| 1104 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
| 1105 | * | ||
| 1106 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
| 1107 | * to amortize TLB flushing overheads. What this means is that any page you | ||
| 1108 | * have now, may, in a former life, have been mapped into kernel virtual | ||
| 1109 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
| 1110 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
| 1111 | * | ||
| 1112 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
| 1113 | * be sure that none of the pages we have control over will have any aliases | ||
| 1114 | * from the vmap layer. | ||
| 1115 | */ | ||
| 1116 | void vm_unmap_aliases(void) | ||
| 1117 | { | ||
| 1118 | unsigned long start = ULONG_MAX, end = 0; | ||
| 1119 | int flush = 0; | ||
| 1120 | |||
| 1121 | _vm_unmap_aliases(start, end, flush); | ||
| 1122 | } | ||
| 1116 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | 1123 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); |
| 1117 | 1124 | ||
| 1118 | /** | 1125 | /** |
| @@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
| 1505 | return NULL; | 1512 | return NULL; |
| 1506 | } | 1513 | } |
| 1507 | 1514 | ||
| 1515 | static inline void set_area_direct_map(const struct vm_struct *area, | ||
| 1516 | int (*set_direct_map)(struct page *page)) | ||
| 1517 | { | ||
| 1518 | int i; | ||
| 1519 | |||
| 1520 | for (i = 0; i < area->nr_pages; i++) | ||
| 1521 | if (page_address(area->pages[i])) | ||
| 1522 | set_direct_map(area->pages[i]); | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | /* Handle removing and resetting vm mappings related to the vm_struct. */ | ||
| 1526 | static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) | ||
| 1527 | { | ||
| 1528 | unsigned long addr = (unsigned long)area->addr; | ||
| 1529 | unsigned long start = ULONG_MAX, end = 0; | ||
| 1530 | int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; | ||
| 1531 | int i; | ||
| 1532 | |||
| 1533 | /* | ||
| 1534 | * The below block can be removed when all architectures that have | ||
| 1535 | * direct map permissions also have set_direct_map_() implementations. | ||
| 1536 | * This is concerned with resetting the direct map for any vm alias with | ||
| 1537 | * execute permissions, without leaving a RW+X window. | ||
| 1538 | */ | ||
| 1539 | if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { | ||
| 1540 | set_memory_nx(addr, area->nr_pages); | ||
| 1541 | set_memory_rw(addr, area->nr_pages); | ||
| 1542 | } | ||
| 1543 | |||
| 1544 | remove_vm_area(area->addr); | ||
| 1545 | |||
| 1546 | /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ | ||
| 1547 | if (!flush_reset) | ||
| 1548 | return; | ||
| 1549 | |||
| 1550 | /* | ||
| 1551 | * If not deallocating pages, just do the flush of the VM area and | ||
| 1552 | * return. | ||
| 1553 | */ | ||
| 1554 | if (!deallocate_pages) { | ||
| 1555 | vm_unmap_aliases(); | ||
| 1556 | return; | ||
| 1557 | } | ||
| 1558 | |||
| 1559 | /* | ||
| 1560 | * If execution gets here, flush the vm mapping and reset the direct | ||
| 1561 | * map. Find the start and end range of the direct mappings to make sure | ||
| 1562 | * the vm_unmap_aliases() flush includes the direct map. | ||
| 1563 | */ | ||
| 1564 | for (i = 0; i < area->nr_pages; i++) { | ||
| 1565 | if (page_address(area->pages[i])) { | ||
| 1566 | start = min(addr, start); | ||
| 1567 | end = max(addr, end); | ||
| 1568 | } | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | /* | ||
| 1572 | * Set direct map to something invalid so that it won't be cached if | ||
| 1573 | * there are any accesses after the TLB flush, then flush the TLB and | ||
| 1574 | * reset the direct map permissions to the default. | ||
| 1575 | */ | ||
| 1576 | set_area_direct_map(area, set_direct_map_invalid_noflush); | ||
| 1577 | _vm_unmap_aliases(start, end, 1); | ||
| 1578 | set_area_direct_map(area, set_direct_map_default_noflush); | ||
| 1579 | } | ||
| 1580 | |||
| 1508 | static void __vunmap(const void *addr, int deallocate_pages) | 1581 | static void __vunmap(const void *addr, int deallocate_pages) |
| 1509 | { | 1582 | { |
| 1510 | struct vm_struct *area; | 1583 | struct vm_struct *area; |
| @@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
| 1526 | debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); | 1599 | debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); |
| 1527 | debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); | 1600 | debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); |
| 1528 | 1601 | ||
| 1529 | remove_vm_area(addr); | 1602 | vm_remove_mappings(area, deallocate_pages); |
| 1603 | |||
| 1530 | if (deallocate_pages) { | 1604 | if (deallocate_pages) { |
| 1531 | int i; | 1605 | int i; |
| 1532 | 1606 | ||
| @@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node); | |||
| 1961 | */ | 2035 | */ |
| 1962 | void *vmalloc_exec(unsigned long size) | 2036 | void *vmalloc_exec(unsigned long size) |
| 1963 | { | 2037 | { |
| 1964 | return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, | 2038 | return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, |
| 1965 | NUMA_NO_NODE, __builtin_return_address(0)); | 2039 | GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, |
| 2040 | NUMA_NO_NODE, __builtin_return_address(0)); | ||
| 1966 | } | 2041 | } |
| 1967 | 2042 | ||
| 1968 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 2043 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
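One subtlety in vm_remove_mappings(): the flush that follows set_area_direct_map(..., set_direct_map_invalid_noflush) is meant to cover the direct-map aliases of the backing pages, so the range handed to _vm_unmap_aliases() has to be derived from page_address() of each page, not from the vmalloc address alone. A hypothetical helper making that range computation explicit (the loop inside vm_remove_mappings() above remains the authoritative version):

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void direct_map_range(const struct vm_struct *area,
                             unsigned long *start, unsigned long *end)
{
        int i;

        for (i = 0; i < area->nr_pages; i++) {
                unsigned long kaddr =
                        (unsigned long)page_address(area->pages[i]);

                if (!kaddr)
                        continue;       /* highmem: no direct mapping */
                *start = min(*start, kaddr);
                *end = max(*end, kaddr + PAGE_SIZE);
        }
}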
