author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-23 12:05:28 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-23 12:05:28 -0400
commit | 99792e0cea1ed733cdc8d0758677981e0cbebfed (patch)
tree | acf6868f48f687dd8667ee4f99c156415ea8ff7b
parent | 382d72a9aa525b56ab8453ce61751fa712414d3d (diff)
parent | 977e4be5eb714c48a67afc26a6c477f24130a1f2 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"Lots of changes in this cycle:
- Lots of CPA (change page attribute) optimizations and related
cleanups (Thomas Gleixner, Peter Zijlstra)
- Make lazy TLB mode even lazier (Rik van Riel)
- Fault handler cleanups and improvements (Dave Hansen)
- kdump, vmcore: Enable kdumping encrypted memory with AMD SME
enabled (Lianbo Jiang)
- Clean up VM layout documentation (Baoquan He, Ingo Molnar)
- ... plus misc other fixes and enhancements"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
x86/mm: Kill stray kernel fault handling comment
x86/mm: Do not warn about PCI BIOS W+X mappings
resource: Clean it up a bit
resource: Fix find_next_iomem_res() iteration issue
resource: Include resource end in walk_*() interfaces
x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
x86/mm: Remove spurious fault pkey check
x86/mm/vsyscall: Consider vsyscall page part of user address space
x86/mm: Add vsyscall address helper
x86/mm: Fix exception table comments
x86/mm: Add clarifying comments for user addr space
x86/mm: Break out user address space handling
x86/mm: Break out kernel address space handling
x86/mm: Clarify hardware vs. software "error_code"
x86/mm/tlb: Make lazy TLB mode lazier
x86/mm/tlb: Add freed_tables element to flush_tlb_info
x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
smp,cpumask: introduce on_each_cpu_cond_mask
smp: use __cpumask_set_cpu in on_each_cpu_cond
...
-rw-r--r-- | Documentation/x86/x86_64/mm.txt | 171
-rw-r--r-- | arch/x86/Kconfig | 8
-rw-r--r-- | arch/x86/include/asm/io.h | 3
-rw-r--r-- | arch/x86/include/asm/kexec.h | 2
-rw-r--r-- | arch/x86/include/asm/page_64_types.h | 15
-rw-r--r-- | arch/x86/include/asm/tlb.h | 21
-rw-r--r-- | arch/x86/include/asm/tlbflush.h | 33
-rw-r--r-- | arch/x86/kernel/crash_dump_64.c | 60
-rw-r--r-- | arch/x86/kernel/ldt.c | 2
-rw-r--r-- | arch/x86/kernel/vm86_32.c | 2
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 35
-rw-r--r-- | arch/x86/mm/fault.c | 288
-rw-r--r-- | arch/x86/mm/init_32.c | 23
-rw-r--r-- | arch/x86/mm/ioremap.c | 24
-rw-r--r-- | arch/x86/mm/pageattr.c | 627
-rw-r--r-- | arch/x86/mm/tlb.c | 167
-rw-r--r-- | arch/x86/xen/smp_pv.c | 2
-rw-r--r-- | drivers/iommu/amd_iommu_init.c | 14
-rw-r--r-- | fs/proc/vmcore.c | 34
-rw-r--r-- | include/linux/crash_dump.h | 4
-rw-r--r-- | include/linux/smp.h | 4
-rw-r--r-- | kernel/kexec_core.c | 6
-rw-r--r-- | kernel/resource.c | 141
-rw-r--r-- | kernel/sched/idle.c | 15
-rw-r--r-- | kernel/sched/sched.h | 1
-rw-r--r-- | kernel/smp.c | 19
-rw-r--r-- | kernel/up.c | 14
-rw-r--r-- | mm/pgtable-generic.c | 1
28 files changed, 1117 insertions, 619 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5432a96d31ff..702898633b00 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,55 +1,124 @@
1 | ==================================================== | ||
2 | Complete virtual memory map with 4-level page tables | ||
3 | ==================================================== | ||
1 | 4 | ||
2 | Virtual memory map with 4 level page tables: | 5 | Notes: |
3 | 6 | ||
4 | 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm | 7 | - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down |
5 | hole caused by [47:63] sign extension | 8 | from the top of the 64-bit address space. It's easier to understand the layout |
6 | ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor | 9 | when seen both in absolute addresses and in distance-from-top notation. |
7 | ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory | 10 | |
8 | ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole | 11 | For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the |
9 | ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space | 12 | 64-bit address space (ffffffffffffffff). |
10 | ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole | 13 | |
11 | ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) | 14 | Note that as we get closer to the top of the address space, the notation changes |
12 | ... unused hole ... | 15 | from TB to GB and then MB/KB. |
13 | ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) | 16 | |
14 | ... unused hole ... | 17 | - "16M TB" might look weird at first sight, but it's an easier to visualize size |
15 | vaddr_end for KASLR | 18 | notation than "16 EB", which few will recognize at first sight as 16 exabytes. |
16 | fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping | 19 | It also shows it nicely how incredibly large 64-bit address space is. |
17 | fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI | 20 | |
18 | ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks | 21 | ======================================================================================================================== |
19 | ... unused hole ... | 22 | Start addr | Offset | End addr | Size | VM area description |
20 | ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space | 23 | ======================================================================================================================== |
21 | ... unused hole ... | 24 | | | | | |
22 | ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 | 25 | 0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm |
23 | ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space | 26 | __________________|____________|__________________|_________|___________________________________________________________ |
24 | [fixmap start] - ffffffffff5fffff kernel-internal fixmap range | 27 | | | | | |
25 | ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI | 28 | 0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical |
26 | ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole | 29 | | | | | virtual memory addresses up to the -128 TB |
27 | 30 | | | | | starting offset of kernel mappings. | |
28 | Virtual memory map with 5 level page tables: | 31 | __________________|____________|__________________|_________|___________________________________________________________ |
29 | 32 | | | |
30 | 0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm | 33 | | Kernel-space virtual memory, shared between all processes: |
31 | hole caused by [56:63] sign extension | 34 | ____________________________________________________________|___________________________________________________________ |
32 | ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor | 35 | | | | | |
33 | ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory | 36 | ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor |
34 | ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI | 37 | ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base) |
35 | ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) | 38 | ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole |
36 | ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole | 39 | ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base) |
37 | ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) | 40 | ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole |
38 | ... unused hole ... | 41 | ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base) |
39 | ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) | 42 | ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole |
40 | ... unused hole ... | 43 | ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory |
41 | vaddr_end for KASLR | 44 | fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole |
42 | fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping | 45 | | | | | vaddr_end for KASLR |
43 | ... unused hole ... | 46 | fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping |
44 | ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks | 47 | fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI |
45 | ... unused hole ... | 48 | ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks |
46 | ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space | 49 | __________________|____________|__________________|_________|____________________________________________________________ |
47 | ... unused hole ... | 50 | | |
48 | ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 | 51 | | Identical layout to the 47-bit one from here on: |
49 | ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space | 52 | ____________________________________________________________|____________________________________________________________ |
50 | [fixmap start] - ffffffffff5fffff kernel-internal fixmap range | 53 | | | | | |
51 | ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI | 54 | ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole |
52 | ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole | 55 | ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space |
56 | ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole | ||
57 | ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0 | ||
58 | ffffffff80000000 |-2048 MB | | | | ||
59 | ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space | ||
60 | ffffffffff000000 | -16 MB | | | | ||
61 | FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset | ||
62 | ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI | ||
63 | ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole | ||
64 | __________________|____________|__________________|_________|___________________________________________________________ | ||
65 | |||
66 | |||
67 | ==================================================== | ||
68 | Complete virtual memory map with 5-level page tables | ||
69 | ==================================================== | ||
70 | |||
71 | Notes: | ||
72 | |||
73 | - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, | ||
74 | from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting | ||
75 | offset and many of the regions expand to support the much larger physical | ||
76 | memory supported. | ||
77 | |||
78 | ======================================================================================================================== | ||
79 | Start addr | Offset | End addr | Size | VM area description | ||
80 | ======================================================================================================================== | ||
81 | | | | | | ||
82 | 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm | ||
83 | __________________|____________|__________________|_________|___________________________________________________________ | ||
84 | | | | | | ||
85 | 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical | ||
86 | | | | | virtual memory addresses up to the -128 TB | ||
87 | | | | | starting offset of kernel mappings. | ||
88 | __________________|____________|__________________|_________|___________________________________________________________ | ||
89 | | | ||
90 | | Kernel-space virtual memory, shared between all processes: | ||
91 | ____________________________________________________________|___________________________________________________________ | ||
92 | | | | | | ||
93 | ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor | ||
94 | ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base) | ||
95 | ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI | ||
96 | ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base) | ||
97 | ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole | ||
98 | ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) | ||
99 | ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole | ||
100 | ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory | ||
101 | fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole | ||
102 | | | | | vaddr_end for KASLR | ||
103 | fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping | ||
104 | fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole | ||
105 | ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks | ||
106 | __________________|____________|__________________|_________|____________________________________________________________ | ||
107 | | | ||
108 | | Identical layout to the 47-bit one from here on: | ||
109 | ____________________________________________________________|____________________________________________________________ | ||
110 | | | | | | ||
111 | ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole | ||
112 | ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space | ||
113 | ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole | ||
114 | ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0 | ||
115 | ffffffff80000000 |-2048 MB | | | | ||
116 | ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space | ||
117 | ffffffffff000000 | -16 MB | | | | ||
118 | FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset | ||
119 | ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI | ||
120 | ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole | ||
121 | __________________|____________|__________________|_________|___________________________________________________________ | ||
53 | 122 | ||
54 | Architecture defines a 64-bit virtual address. Implementations can support | 123 | Architecture defines a 64-bit virtual address. Implementations can support |
55 | less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 | 124 | less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 |
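[Editor's note] The distance-from-top notation explained in the mm.txt notes above is easy to sanity-check by hand. A minimal user-space sketch (not part of the patch) that converts a kernel virtual address into the "-N TB" form used in the tables:

#include <stdio.h>

int main(void)
{
	/* Example address from the table: start of the unused hole below vmalloc. */
	unsigned long long addr = 0xffffe90000000000ULL;

	/* Distance below the top of the 64-bit address space, i.e. 2^64 - addr. */
	unsigned long long dist = ~addr + 1;

	/* Prints "0xffffe90000000000 is -23 TB", matching the table entry. */
	printf("%#llx is -%llu TB\n", addr, dist >> 40);
	return 0;
}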
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8282985d438a..ff425a2d286c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
1487 | supports them), so don't confuse the user by printing | 1487 | supports them), so don't confuse the user by printing |
1488 | that we have them enabled. | 1488 | that we have them enabled. |
1489 | 1489 | ||
1490 | config X86_CPA_STATISTICS | ||
1491 | bool "Enable statistic for Change Page Attribute" | ||
1492 | depends on DEBUG_FS | ||
1493 | ---help--- | ||
1494 | Expose statistics about the Change Page Attribute mechanims, which | ||
1495 | helps to determine the effectivness of preserving large and huge | ||
1496 | page mappings when mapping protections are changed. | ||
1497 | |||
1490 | config ARCH_HAS_MEM_ENCRYPT | 1498 | config ARCH_HAS_MEM_ENCRYPT |
1491 | def_bool y | 1499 | def_bool y |
1492 | 1500 | ||
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 9a92a3ac2ac5..832da8229cc7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
187 | #define ioremap_nocache ioremap_nocache | 187 | #define ioremap_nocache ioremap_nocache |
188 | extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); | 188 | extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); |
189 | #define ioremap_uc ioremap_uc | 189 | #define ioremap_uc ioremap_uc |
190 | |||
191 | extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); | 190 | extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); |
192 | #define ioremap_cache ioremap_cache | 191 | #define ioremap_cache ioremap_cache |
193 | extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); | 192 | extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); |
194 | #define ioremap_prot ioremap_prot | 193 | #define ioremap_prot ioremap_prot |
194 | extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); | ||
195 | #define ioremap_encrypted ioremap_encrypted | ||
195 | 196 | ||
196 | /** | 197 | /** |
197 | * ioremap - map bus memory into CPU space | 198 | * ioremap - map bus memory into CPU space |
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f327236f0fa7..5125fca472bb 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -67,7 +67,7 @@ struct kimage;
67 | 67 | ||
68 | /* Memory to backup during crash kdump */ | 68 | /* Memory to backup during crash kdump */ |
69 | #define KEXEC_BACKUP_SRC_START (0UL) | 69 | #define KEXEC_BACKUP_SRC_START (0UL) |
70 | #define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ | 70 | #define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * CPU does not save ss and sp on stack if execution is already | 73 | * CPU does not save ss and sp on stack if execution is already |
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6afac386a434..cd0cf1c568b4 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -59,13 +59,16 @@
59 | #endif | 59 | #endif |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * Kernel image size is limited to 1GiB due to the fixmap living in the | 62 | * Maximum kernel image size is limited to 1 GiB, due to the fixmap living |
63 | * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use | 63 | * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). |
64 | * 512MiB by default, leaving 1.5GiB for modules once the page tables | 64 | * |
65 | * are fully set up. If kernel ASLR is configured, it can extend the | 65 | * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the |
66 | * kernel page table mapping, reducing the size of the modules area. | 66 | * page tables are fully set up. |
67 | * | ||
68 | * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size | ||
69 | * of the modules area to 1.5 GiB. | ||
67 | */ | 70 | */ |
68 | #if defined(CONFIG_RANDOMIZE_BASE) | 71 | #ifdef CONFIG_RANDOMIZE_BASE |
69 | #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) | 72 | #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) |
70 | #else | 73 | #else |
71 | #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) | 74 | #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) |
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index cb0a1f470980..404b8b1d44f5 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,16 +6,23 @@
6 | #define tlb_end_vma(tlb, vma) do { } while (0) | 6 | #define tlb_end_vma(tlb, vma) do { } while (0) |
7 | #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) | 7 | #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) |
8 | 8 | ||
9 | #define tlb_flush(tlb) \ | 9 | static inline void tlb_flush(struct mmu_gather *tlb); |
10 | { \ | ||
11 | if (!tlb->fullmm && !tlb->need_flush_all) \ | ||
12 | flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \ | ||
13 | else \ | ||
14 | flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \ | ||
15 | } | ||
16 | 10 | ||
17 | #include <asm-generic/tlb.h> | 11 | #include <asm-generic/tlb.h> |
18 | 12 | ||
13 | static inline void tlb_flush(struct mmu_gather *tlb) | ||
14 | { | ||
15 | unsigned long start = 0UL, end = TLB_FLUSH_ALL; | ||
16 | unsigned int stride_shift = tlb_get_unmap_shift(tlb); | ||
17 | |||
18 | if (!tlb->fullmm && !tlb->need_flush_all) { | ||
19 | start = tlb->start; | ||
20 | end = tlb->end; | ||
21 | } | ||
22 | |||
23 | flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); | ||
24 | } | ||
25 | |||
19 | /* | 26 | /* |
20 | * While x86 architecture in general requires an IPI to perform TLB | 27 | * While x86 architecture in general requires an IPI to perform TLB |
21 | * shootdown, enablement code for several hypervisors overrides | 28 | * shootdown, enablement code for several hypervisors overrides |
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 58ce5288878e..323a313947e0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
148 | #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) | 148 | #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) |
149 | #endif | 149 | #endif |
150 | 150 | ||
151 | static inline bool tlb_defer_switch_to_init_mm(void) | ||
152 | { | ||
153 | /* | ||
154 | * If we have PCID, then switching to init_mm is reasonably | ||
155 | * fast. If we don't have PCID, then switching to init_mm is | ||
156 | * quite slow, so we try to defer it in the hopes that we can | ||
157 | * avoid it entirely. The latter approach runs the risk of | ||
158 | * receiving otherwise unnecessary IPIs. | ||
159 | * | ||
160 | * This choice is just a heuristic. The tlb code can handle this | ||
161 | * function returning true or false regardless of whether we have | ||
162 | * PCID. | ||
163 | */ | ||
164 | return !static_cpu_has(X86_FEATURE_PCID); | ||
165 | } | ||
166 | |||
167 | struct tlb_context { | 151 | struct tlb_context { |
168 | u64 ctx_id; | 152 | u64 ctx_id; |
169 | u64 tlb_gen; | 153 | u64 tlb_gen; |
@@ -547,23 +531,30 @@ struct flush_tlb_info {
547 | unsigned long start; | 531 | unsigned long start; |
548 | unsigned long end; | 532 | unsigned long end; |
549 | u64 new_tlb_gen; | 533 | u64 new_tlb_gen; |
534 | unsigned int stride_shift; | ||
535 | bool freed_tables; | ||
550 | }; | 536 | }; |
551 | 537 | ||
552 | #define local_flush_tlb() __flush_tlb() | 538 | #define local_flush_tlb() __flush_tlb() |
553 | 539 | ||
554 | #define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) | 540 | #define flush_tlb_mm(mm) \ |
541 | flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) | ||
555 | 542 | ||
556 | #define flush_tlb_range(vma, start, end) \ | 543 | #define flush_tlb_range(vma, start, end) \ |
557 | flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) | 544 | flush_tlb_mm_range((vma)->vm_mm, start, end, \ |
545 | ((vma)->vm_flags & VM_HUGETLB) \ | ||
546 | ? huge_page_shift(hstate_vma(vma)) \ | ||
547 | : PAGE_SHIFT, false) | ||
558 | 548 | ||
559 | extern void flush_tlb_all(void); | 549 | extern void flush_tlb_all(void); |
560 | extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 550 | extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
561 | unsigned long end, unsigned long vmflag); | 551 | unsigned long end, unsigned int stride_shift, |
552 | bool freed_tables); | ||
562 | extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); | 553 | extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); |
563 | 554 | ||
564 | static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) | 555 | static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) |
565 | { | 556 | { |
566 | flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); | 557 | flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); |
567 | } | 558 | } |
568 | 559 | ||
569 | void native_flush_tlb_others(const struct cpumask *cpumask, | 560 | void native_flush_tlb_others(const struct cpumask *cpumask, |
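[Editor's note] To illustrate the new flush_tlb_mm_range() signature shown above: callers now pass the stride of the entries they invalidated plus a flag saying whether page tables were freed, instead of the old vmflag argument. A hypothetical caller (not from this series) that just unmapped a single 2 MB huge-page entry might look like this sketch:

#include <asm/tlbflush.h>

/* Hypothetical helper: flush one 2 MB mapping; no page tables were freed,
 * so the flush can use a 2 MB stride and lazy CPUs need not be interrupted. */
static void flush_one_pmd_mapping(struct mm_struct *mm, unsigned long addr)
{
	flush_tlb_mm_range(mm, addr, addr + PMD_SIZE, PMD_SHIFT, false);
}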
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..eb8ab3915268 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -11,40 +11,62 @@
11 | #include <linux/uaccess.h> | 11 | #include <linux/uaccess.h> |
12 | #include <linux/io.h> | 12 | #include <linux/io.h> |
13 | 13 | ||
14 | /** | 14 | static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, |
15 | * copy_oldmem_page - copy one page from "oldmem" | 15 | unsigned long offset, int userbuf, |
16 | * @pfn: page frame number to be copied | 16 | bool encrypted) |
17 | * @buf: target memory address for the copy; this can be in kernel address | ||
18 | * space or user address space (see @userbuf) | ||
19 | * @csize: number of bytes to copy | ||
20 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
21 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
22 | * otherwise @buf is in kernel address space, use memcpy(). | ||
23 | * | ||
24 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
25 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
26 | */ | ||
27 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
28 | size_t csize, unsigned long offset, int userbuf) | ||
29 | { | 17 | { |
30 | void *vaddr; | 18 | void *vaddr; |
31 | 19 | ||
32 | if (!csize) | 20 | if (!csize) |
33 | return 0; | 21 | return 0; |
34 | 22 | ||
35 | vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); | 23 | if (encrypted) |
24 | vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE); | ||
25 | else | ||
26 | vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); | ||
27 | |||
36 | if (!vaddr) | 28 | if (!vaddr) |
37 | return -ENOMEM; | 29 | return -ENOMEM; |
38 | 30 | ||
39 | if (userbuf) { | 31 | if (userbuf) { |
40 | if (copy_to_user(buf, vaddr + offset, csize)) { | 32 | if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { |
41 | iounmap(vaddr); | 33 | iounmap((void __iomem *)vaddr); |
42 | return -EFAULT; | 34 | return -EFAULT; |
43 | } | 35 | } |
44 | } else | 36 | } else |
45 | memcpy(buf, vaddr + offset, csize); | 37 | memcpy(buf, vaddr + offset, csize); |
46 | 38 | ||
47 | set_iounmap_nonlazy(); | 39 | set_iounmap_nonlazy(); |
48 | iounmap(vaddr); | 40 | iounmap((void __iomem *)vaddr); |
49 | return csize; | 41 | return csize; |
50 | } | 42 | } |
43 | |||
44 | /** | ||
45 | * copy_oldmem_page - copy one page of memory | ||
46 | * @pfn: page frame number to be copied | ||
47 | * @buf: target memory address for the copy; this can be in kernel address | ||
48 | * space or user address space (see @userbuf) | ||
49 | * @csize: number of bytes to copy | ||
50 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
51 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
52 | * otherwise @buf is in kernel address space, use memcpy(). | ||
53 | * | ||
54 | * Copy a page from the old kernel's memory. For this page, there is no pte | ||
55 | * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
56 | */ | ||
57 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, | ||
58 | unsigned long offset, int userbuf) | ||
59 | { | ||
60 | return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the | ||
65 | * memory with the encryption mask set to accomodate kdump on SME-enabled | ||
66 | * machines. | ||
67 | */ | ||
68 | ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, | ||
69 | unsigned long offset, int userbuf) | ||
70 | { | ||
71 | return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); | ||
72 | } | ||
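[Editor's note] The split above gives kdump two entry points; the dispatch between them lives in the fs/proc/vmcore.c changes not shown in this excerpt. A hypothetical caller choosing between the two based on whether the old kernel's memory was SME-encrypted could look like this illustrative sketch:

#include <linux/crash_dump.h>

/* Illustrative only: use the encrypted helper when the oldmem pages were
 * mapped with the SME encryption mask, otherwise the plain copy suffices. */
static ssize_t copy_oldmem_pfn(unsigned long pfn, char *buf, size_t csize,
			       unsigned long offset, int userbuf, bool encrypted)
{
	if (encrypted)
		return copy_oldmem_page_encrypted(pfn, buf, csize, offset, userbuf);

	return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
}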
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 733e6ace0fa4..ab18e0884dc6 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
273 | map_ldt_struct_to_user(mm); | 273 | map_ldt_struct_to_user(mm); |
274 | 274 | ||
275 | va = (unsigned long)ldt_slot_va(slot); | 275 | va = (unsigned long)ldt_slot_va(slot); |
276 | flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); | 276 | flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false); |
277 | 277 | ||
278 | ldt->slot = slot; | 278 | ldt->slot = slot; |
279 | return 0; | 279 | return 0; |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1c03e4aa6474..c2fd39752da8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
200 | out: | 200 | out: |
201 | up_write(&mm->mmap_sem); | 201 | up_write(&mm->mmap_sem); |
202 | flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); | 202 | flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); |
203 | } | 203 | } |
204 | 204 | ||
205 | 205 | ||
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a12afff146d1..fc37bbd23eb8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -19,7 +19,9 @@
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
22 | #include <linux/pci.h> | ||
22 | 23 | ||
24 | #include <asm/e820/types.h> | ||
23 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
24 | 26 | ||
25 | /* | 27 | /* |
@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
241 | return (signed long)(u << shift) >> shift; | 243 | return (signed long)(u << shift) >> shift; |
242 | } | 244 | } |
243 | 245 | ||
246 | static void note_wx(struct pg_state *st) | ||
247 | { | ||
248 | unsigned long npages; | ||
249 | |||
250 | npages = (st->current_address - st->start_address) / PAGE_SIZE; | ||
251 | |||
252 | #ifdef CONFIG_PCI_BIOS | ||
253 | /* | ||
254 | * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. | ||
255 | * Inform about it, but avoid the warning. | ||
256 | */ | ||
257 | if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && | ||
258 | st->current_address <= PAGE_OFFSET + BIOS_END) { | ||
259 | pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); | ||
260 | return; | ||
261 | } | ||
262 | #endif | ||
263 | /* Account the WX pages */ | ||
264 | st->wx_pages += npages; | ||
265 | WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", | ||
266 | (void *)st->start_address); | ||
267 | } | ||
268 | |||
244 | /* | 269 | /* |
245 | * This function gets called on a break in a continuous series | 270 | * This function gets called on a break in a continuous series |
246 | * of PTE entries; the next one is different so we need to | 271 | * of PTE entries; the next one is different so we need to |
@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
276 | unsigned long delta; | 301 | unsigned long delta; |
277 | int width = sizeof(unsigned long) * 2; | 302 | int width = sizeof(unsigned long) * 2; |
278 | 303 | ||
279 | if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { | 304 | if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) |
280 | WARN_ONCE(1, | 305 | note_wx(st); |
281 | "x86/mm: Found insecure W+X mapping at address %p/%pS\n", | ||
282 | (void *)st->start_address, | ||
283 | (void *)st->start_address); | ||
284 | st->wx_pages += (st->current_address - | ||
285 | st->start_address) / PAGE_SIZE; | ||
286 | } | ||
287 | 306 | ||
288 | /* | 307 | /* |
289 | * Now print the actual finished series | 308 | * Now print the actual finished series |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d45f6debb3a..2b1519bc5381 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
851 | show_opcodes(regs, loglvl); | 851 | show_opcodes(regs, loglvl); |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * The (legacy) vsyscall page is the long page in the kernel portion | ||
856 | * of the address space that has user-accessible permissions. | ||
857 | */ | ||
858 | static bool is_vsyscall_vaddr(unsigned long vaddr) | ||
859 | { | ||
860 | return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); | ||
861 | } | ||
862 | |||
854 | static void | 863 | static void |
855 | __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | 864 | __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, |
856 | unsigned long address, u32 *pkey, int si_code) | 865 | unsigned long address, u32 *pkey, int si_code) |
@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
874 | if (is_errata100(regs, address)) | 883 | if (is_errata100(regs, address)) |
875 | return; | 884 | return; |
876 | 885 | ||
877 | #ifdef CONFIG_X86_64 | ||
878 | /* | ||
879 | * Instruction fetch faults in the vsyscall page might need | ||
880 | * emulation. | ||
881 | */ | ||
882 | if (unlikely((error_code & X86_PF_INSTR) && | ||
883 | ((address & ~0xfff) == VSYSCALL_ADDR))) { | ||
884 | if (emulate_vsyscall(regs, address)) | ||
885 | return; | ||
886 | } | ||
887 | #endif | ||
888 | |||
889 | /* | 886 | /* |
890 | * To avoid leaking information about the kernel page table | 887 | * To avoid leaking information about the kernel page table |
891 | * layout, pretend that user-mode accesses to kernel addresses | 888 | * layout, pretend that user-mode accesses to kernel addresses |
@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
1043 | } | 1040 | } |
1044 | } | 1041 | } |
1045 | 1042 | ||
1046 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) | 1043 | static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) |
1047 | { | 1044 | { |
1048 | if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) | 1045 | if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) |
1049 | return 0; | 1046 | return 0; |
1050 | 1047 | ||
1051 | if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) | 1048 | if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) |
1052 | return 0; | 1049 | return 0; |
1053 | /* | ||
1054 | * Note: We do not do lazy flushing on protection key | ||
1055 | * changes, so no spurious fault will ever set X86_PF_PK. | ||
1056 | */ | ||
1057 | if ((error_code & X86_PF_PK)) | ||
1058 | return 1; | ||
1059 | 1050 | ||
1060 | return 1; | 1051 | return 1; |
1061 | } | 1052 | } |
@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
1082 | * (Optional Invalidation). | 1073 | * (Optional Invalidation). |
1083 | */ | 1074 | */ |
1084 | static noinline int | 1075 | static noinline int |
1085 | spurious_fault(unsigned long error_code, unsigned long address) | 1076 | spurious_kernel_fault(unsigned long error_code, unsigned long address) |
1086 | { | 1077 | { |
1087 | pgd_t *pgd; | 1078 | pgd_t *pgd; |
1088 | p4d_t *p4d; | 1079 | p4d_t *p4d; |
@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
1113 | return 0; | 1104 | return 0; |
1114 | 1105 | ||
1115 | if (p4d_large(*p4d)) | 1106 | if (p4d_large(*p4d)) |
1116 | return spurious_fault_check(error_code, (pte_t *) p4d); | 1107 | return spurious_kernel_fault_check(error_code, (pte_t *) p4d); |
1117 | 1108 | ||
1118 | pud = pud_offset(p4d, address); | 1109 | pud = pud_offset(p4d, address); |
1119 | if (!pud_present(*pud)) | 1110 | if (!pud_present(*pud)) |
1120 | return 0; | 1111 | return 0; |
1121 | 1112 | ||
1122 | if (pud_large(*pud)) | 1113 | if (pud_large(*pud)) |
1123 | return spurious_fault_check(error_code, (pte_t *) pud); | 1114 | return spurious_kernel_fault_check(error_code, (pte_t *) pud); |
1124 | 1115 | ||
1125 | pmd = pmd_offset(pud, address); | 1116 | pmd = pmd_offset(pud, address); |
1126 | if (!pmd_present(*pmd)) | 1117 | if (!pmd_present(*pmd)) |
1127 | return 0; | 1118 | return 0; |
1128 | 1119 | ||
1129 | if (pmd_large(*pmd)) | 1120 | if (pmd_large(*pmd)) |
1130 | return spurious_fault_check(error_code, (pte_t *) pmd); | 1121 | return spurious_kernel_fault_check(error_code, (pte_t *) pmd); |
1131 | 1122 | ||
1132 | pte = pte_offset_kernel(pmd, address); | 1123 | pte = pte_offset_kernel(pmd, address); |
1133 | if (!pte_present(*pte)) | 1124 | if (!pte_present(*pte)) |
1134 | return 0; | 1125 | return 0; |
1135 | 1126 | ||
1136 | ret = spurious_fault_check(error_code, pte); | 1127 | ret = spurious_kernel_fault_check(error_code, pte); |
1137 | if (!ret) | 1128 | if (!ret) |
1138 | return 0; | 1129 | return 0; |
1139 | 1130 | ||
@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
1141 | * Make sure we have permissions in PMD. | 1132 | * Make sure we have permissions in PMD. |
1142 | * If not, then there's a bug in the page tables: | 1133 | * If not, then there's a bug in the page tables: |
1143 | */ | 1134 | */ |
1144 | ret = spurious_fault_check(error_code, (pte_t *) pmd); | 1135 | ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); |
1145 | WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); | 1136 | WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); |
1146 | 1137 | ||
1147 | return ret; | 1138 | return ret; |
1148 | } | 1139 | } |
1149 | NOKPROBE_SYMBOL(spurious_fault); | 1140 | NOKPROBE_SYMBOL(spurious_kernel_fault); |
1150 | 1141 | ||
1151 | int show_unhandled_signals = 1; | 1142 | int show_unhandled_signals = 1; |
1152 | 1143 | ||
@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
1193 | 1184 | ||
1194 | static int fault_in_kernel_space(unsigned long address) | 1185 | static int fault_in_kernel_space(unsigned long address) |
1195 | { | 1186 | { |
1187 | /* | ||
1188 | * On 64-bit systems, the vsyscall page is at an address above | ||
1189 | * TASK_SIZE_MAX, but is not considered part of the kernel | ||
1190 | * address space. | ||
1191 | */ | ||
1192 | if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) | ||
1193 | return false; | ||
1194 | |||
1196 | return address >= TASK_SIZE_MAX; | 1195 | return address >= TASK_SIZE_MAX; |
1197 | } | 1196 | } |
1198 | 1197 | ||
@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1214 | } | 1213 | } |
1215 | 1214 | ||
1216 | /* | 1215 | /* |
1217 | * This routine handles page faults. It determines the address, | 1216 | * Called for all faults where 'address' is part of the kernel address |
1218 | * and the problem, and then passes it off to one of the appropriate | 1217 | * space. Might get called for faults that originate from *code* that |
1219 | * routines. | 1218 | * ran in userspace or the kernel. |
1220 | */ | 1219 | */ |
1221 | static noinline void | 1220 | static void |
1222 | __do_page_fault(struct pt_regs *regs, unsigned long error_code, | 1221 | do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, |
1223 | unsigned long address) | 1222 | unsigned long address) |
1224 | { | 1223 | { |
1225 | struct vm_area_struct *vma; | 1224 | /* |
1226 | struct task_struct *tsk; | 1225 | * Protection keys exceptions only happen on user pages. We |
1227 | struct mm_struct *mm; | 1226 | * have no user pages in the kernel portion of the address |
1228 | vm_fault_t fault, major = 0; | 1227 | * space, so do not expect them here. |
1229 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; | 1228 | */ |
1230 | u32 pkey; | 1229 | WARN_ON_ONCE(hw_error_code & X86_PF_PK); |
1231 | |||
1232 | tsk = current; | ||
1233 | mm = tsk->mm; | ||
1234 | |||
1235 | prefetchw(&mm->mmap_sem); | ||
1236 | |||
1237 | if (unlikely(kmmio_fault(regs, address))) | ||
1238 | return; | ||
1239 | 1230 | ||
1240 | /* | 1231 | /* |
1241 | * We fault-in kernel-space virtual memory on-demand. The | 1232 | * We can fault-in kernel-space virtual memory on-demand. The |
1242 | * 'reference' page table is init_mm.pgd. | 1233 | * 'reference' page table is init_mm.pgd. |
1243 | * | 1234 | * |
1244 | * NOTE! We MUST NOT take any locks for this case. We may | 1235 | * NOTE! We MUST NOT take any locks for this case. We may |
@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1246 | * only copy the information from the master page table, | 1237 | * only copy the information from the master page table, |
1247 | * nothing more. | 1238 | * nothing more. |
1248 | * | 1239 | * |
1249 | * This verifies that the fault happens in kernel space | 1240 | * Before doing this on-demand faulting, ensure that the |
1250 | * (error_code & 4) == 0, and that the fault was not a | 1241 | * fault is not any of the following: |
1251 | * protection error (error_code & 9) == 0. | 1242 | * 1. A fault on a PTE with a reserved bit set. |
1243 | * 2. A fault caused by a user-mode access. (Do not demand- | ||
1244 | * fault kernel memory due to user-mode accesses). | ||
1245 | * 3. A fault caused by a page-level protection violation. | ||
1246 | * (A demand fault would be on a non-present page which | ||
1247 | * would have X86_PF_PROT==0). | ||
1252 | */ | 1248 | */ |
1253 | if (unlikely(fault_in_kernel_space(address))) { | 1249 | if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { |
1254 | if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { | 1250 | if (vmalloc_fault(address) >= 0) |
1255 | if (vmalloc_fault(address) >= 0) | ||
1256 | return; | ||
1257 | } | ||
1258 | |||
1259 | /* Can handle a stale RO->RW TLB: */ | ||
1260 | if (spurious_fault(error_code, address)) | ||
1261 | return; | 1251 | return; |
1252 | } | ||
1262 | 1253 | ||
1263 | /* kprobes don't want to hook the spurious faults: */ | 1254 | /* Was the fault spurious, caused by lazy TLB invalidation? */ |
1264 | if (kprobes_fault(regs)) | 1255 | if (spurious_kernel_fault(hw_error_code, address)) |
1265 | return; | 1256 | return; |
1266 | /* | ||
1267 | * Don't take the mm semaphore here. If we fixup a prefetch | ||
1268 | * fault we could otherwise deadlock: | ||
1269 | */ | ||
1270 | bad_area_nosemaphore(regs, error_code, address, NULL); | ||
1271 | 1257 | ||
1258 | /* kprobes don't want to hook the spurious faults: */ | ||
1259 | if (kprobes_fault(regs)) | ||
1272 | return; | 1260 | return; |
1273 | } | 1261 | |
1262 | /* | ||
1263 | * Note, despite being a "bad area", there are quite a few | ||
1264 | * acceptable reasons to get here, such as erratum fixups | ||
1265 | * and handling kernel code that can fault, like get_user(). | ||
1266 | * | ||
1267 | * Don't take the mm semaphore here. If we fixup a prefetch | ||
1268 | * fault we could otherwise deadlock: | ||
1269 | */ | ||
1270 | bad_area_nosemaphore(regs, hw_error_code, address, NULL); | ||
1271 | } | ||
1272 | NOKPROBE_SYMBOL(do_kern_addr_fault); | ||
1273 | |||
1274 | /* Handle faults in the user portion of the address space */ | ||
1275 | static inline | ||
1276 | void do_user_addr_fault(struct pt_regs *regs, | ||
1277 | unsigned long hw_error_code, | ||
1278 | unsigned long address) | ||
1279 | { | ||
1280 | unsigned long sw_error_code; | ||
1281 | struct vm_area_struct *vma; | ||
1282 | struct task_struct *tsk; | ||
1283 | struct mm_struct *mm; | ||
1284 | vm_fault_t fault, major = 0; | ||
1285 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; | ||
1286 | u32 pkey; | ||
1287 | |||
1288 | tsk = current; | ||
1289 | mm = tsk->mm; | ||
1274 | 1290 | ||
1275 | /* kprobes don't want to hook the spurious faults: */ | 1291 | /* kprobes don't want to hook the spurious faults: */ |
1276 | if (unlikely(kprobes_fault(regs))) | 1292 | if (unlikely(kprobes_fault(regs))) |
1277 | return; | 1293 | return; |
1278 | 1294 | ||
1279 | if (unlikely(error_code & X86_PF_RSVD)) | 1295 | /* |
1280 | pgtable_bad(regs, error_code, address); | 1296 | * Reserved bits are never expected to be set on |
1297 | * entries in the user portion of the page tables. | ||
1298 | */ | ||
1299 | if (unlikely(hw_error_code & X86_PF_RSVD)) | ||
1300 | pgtable_bad(regs, hw_error_code, address); | ||
1281 | 1301 | ||
1282 | if (unlikely(smap_violation(error_code, regs))) { | 1302 | /* |
1283 | bad_area_nosemaphore(regs, error_code, address, NULL); | 1303 | * Check for invalid kernel (supervisor) access to user |
1304 | * pages in the user address space. | ||
1305 | */ | ||
1306 | if (unlikely(smap_violation(hw_error_code, regs))) { | ||
1307 | bad_area_nosemaphore(regs, hw_error_code, address, NULL); | ||
1284 | return; | 1308 | return; |
1285 | } | 1309 | } |
1286 | 1310 | ||
@@ -1289,11 +1313,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1289 | * in a region with pagefaults disabled then we must not take the fault | 1313 | * in a region with pagefaults disabled then we must not take the fault |
1290 | */ | 1314 | */ |
1291 | if (unlikely(faulthandler_disabled() || !mm)) { | 1315 | if (unlikely(faulthandler_disabled() || !mm)) { |
1292 | bad_area_nosemaphore(regs, error_code, address, NULL); | 1316 | bad_area_nosemaphore(regs, hw_error_code, address, NULL); |
1293 | return; | 1317 | return; |
1294 | } | 1318 | } |
1295 | 1319 | ||
1296 | /* | 1320 | /* |
1321 | * hw_error_code is literally the "page fault error code" passed to | ||
1322 | * the kernel directly from the hardware. But, we will shortly be | ||
1323 | * modifying it in software, so give it a new name. | ||
1324 | */ | ||
1325 | sw_error_code = hw_error_code; | ||
1326 | |||
1327 | /* | ||
1297 | * It's safe to allow irq's after cr2 has been saved and the | 1328 | * It's safe to allow irq's after cr2 has been saved and the |
1298 | * vmalloc fault has been handled. | 1329 | * vmalloc fault has been handled. |
1299 | * | 1330 | * |
@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1302 | */ | 1333 | */ |
1303 | if (user_mode(regs)) { | 1334 | if (user_mode(regs)) { |
1304 | local_irq_enable(); | 1335 | local_irq_enable(); |
1305 | error_code |= X86_PF_USER; | 1336 | /* |
1337 | * Up to this point, X86_PF_USER set in hw_error_code | ||
1338 | * indicated a user-mode access. But, after this, | ||
1339 | * X86_PF_USER in sw_error_code will indicate either | ||
1340 | * that, *or* an implicit kernel(supervisor)-mode access | ||
1341 | * which originated from user mode. | ||
1342 | */ | ||
1343 | if (!(hw_error_code & X86_PF_USER)) { | ||
1344 | /* | ||
1345 | * The CPU was in user mode, but the CPU says | ||
1346 | * the fault was not a user-mode access. | ||
1347 | * Must be an implicit kernel-mode access, | ||
1348 | * which we do not expect to happen in the | ||
1349 | * user address space. | ||
1350 | */ | ||
1351 | pr_warn_once("kernel-mode error from user-mode: %lx\n", | ||
1352 | hw_error_code); | ||
1353 | |||
1354 | sw_error_code |= X86_PF_USER; | ||
1355 | } | ||
1306 | flags |= FAULT_FLAG_USER; | 1356 | flags |= FAULT_FLAG_USER; |
1307 | } else { | 1357 | } else { |
1308 | if (regs->flags & X86_EFLAGS_IF) | 1358 | if (regs->flags & X86_EFLAGS_IF) |
@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1311 | 1361 | ||
1312 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); | 1362 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); |
1313 | 1363 | ||
1314 | if (error_code & X86_PF_WRITE) | 1364 | if (sw_error_code & X86_PF_WRITE) |
1315 | flags |= FAULT_FLAG_WRITE; | 1365 | flags |= FAULT_FLAG_WRITE; |
1316 | if (error_code & X86_PF_INSTR) | 1366 | if (sw_error_code & X86_PF_INSTR) |
1317 | flags |= FAULT_FLAG_INSTRUCTION; | 1367 | flags |= FAULT_FLAG_INSTRUCTION; |
1318 | 1368 | ||
1369 | #ifdef CONFIG_X86_64 | ||
1319 | /* | 1370 | /* |
1320 | * When running in the kernel we expect faults to occur only to | 1371 | * Instruction fetch faults in the vsyscall page might need |
1321 | * addresses in user space. All other faults represent errors in | 1372 | * emulation. The vsyscall page is at a high address |
1322 | * the kernel and should generate an OOPS. Unfortunately, in the | 1373 | * (>PAGE_OFFSET), but is considered to be part of the user |
1323 | * case of an erroneous fault occurring in a code path which already | 1374 | * address space. |
1324 | * holds mmap_sem we will deadlock attempting to validate the fault | ||
1325 | * against the address space. Luckily the kernel only validly | ||
1326 | * references user space from well defined areas of code, which are | ||
1327 | * listed in the exceptions table. | ||
1328 | * | 1375 | * |
1329 | * As the vast majority of faults will be valid we will only perform | 1376 | * The vsyscall page does not have a "real" VMA, so do this |
1330 | * the source reference check when there is a possibility of a | 1377 | * emulation before we go searching for VMAs. |
1331 | * deadlock. Attempt to lock the address space, if we cannot we then | 1378 | */ |
1332 | * validate the source. If this is invalid we can skip the address | 1379 | if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { |
1333 | * space check, thus avoiding the deadlock: | 1380 | if (emulate_vsyscall(regs, address)) |
1381 | return; | ||
1382 | } | ||
1383 | #endif | ||
1384 | |||
1385 | /* | ||
1386 | * Kernel-mode access to the user address space should only occur | ||
1387 | * on well-defined single instructions listed in the exception | ||
1388 | * tables. But, an erroneous kernel fault occurring outside one of | ||
1389 | * those areas which also holds mmap_sem might deadlock attempting | ||
1390 | * to validate the fault against the address space. | ||
1391 | * | ||
1392 | * Only do the expensive exception table search when we might be at | ||
1393 | * risk of a deadlock. This happens if we | ||
1394 | * 1. Failed to acquire mmap_sem, and | ||
1395 | * 2. The access did not originate in userspace. Note: either the | ||
1396 | * hardware or earlier page fault code may set X86_PF_USER | ||
1397 | * in sw_error_code. | ||
1334 | */ | 1398 | */ |
1335 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | 1399 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
1336 | if (!(error_code & X86_PF_USER) && | 1400 | if (!(sw_error_code & X86_PF_USER) && |
1337 | !search_exception_tables(regs->ip)) { | 1401 | !search_exception_tables(regs->ip)) { |
1338 | bad_area_nosemaphore(regs, error_code, address, NULL); | 1402 | /* |
1403 | * Fault from code in kernel from | ||
1404 | * which we do not expect faults. | ||
1405 | */ | ||
1406 | bad_area_nosemaphore(regs, sw_error_code, address, NULL); | ||
1339 | return; | 1407 | return; |
1340 | } | 1408 | } |
1341 | retry: | 1409 | retry: |
@@ -1351,16 +1419,16 @@ retry:
1351 | 1419 | ||
1352 | vma = find_vma(mm, address); | 1420 | vma = find_vma(mm, address); |
1353 | if (unlikely(!vma)) { | 1421 | if (unlikely(!vma)) { |
1354 | bad_area(regs, error_code, address); | 1422 | bad_area(regs, sw_error_code, address); |
1355 | return; | 1423 | return; |
1356 | } | 1424 | } |
1357 | if (likely(vma->vm_start <= address)) | 1425 | if (likely(vma->vm_start <= address)) |
1358 | goto good_area; | 1426 | goto good_area; |
1359 | if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { | 1427 | if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { |
1360 | bad_area(regs, error_code, address); | 1428 | bad_area(regs, sw_error_code, address); |
1361 | return; | 1429 | return; |
1362 | } | 1430 | } |
1363 | if (error_code & X86_PF_USER) { | 1431 | if (sw_error_code & X86_PF_USER) { |
1364 | /* | 1432 | /* |
1365 | * Accessing the stack below %sp is always a bug. | 1433 | * Accessing the stack below %sp is always a bug. |
1366 | * The large cushion allows instructions like enter | 1434 | * The large cushion allows instructions like enter |
@@ -1368,12 +1436,12 @@ retry:
1368 | * 32 pointers and then decrements %sp by 65535.) | 1436 | * 32 pointers and then decrements %sp by 65535.) |
1369 | */ | 1437 | */ |
1370 | if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { | 1438 | if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { |
1371 | bad_area(regs, error_code, address); | 1439 | bad_area(regs, sw_error_code, address); |
1372 | return; | 1440 | return; |
1373 | } | 1441 | } |
1374 | } | 1442 | } |
1375 | if (unlikely(expand_stack(vma, address))) { | 1443 | if (unlikely(expand_stack(vma, address))) { |
1376 | bad_area(regs, error_code, address); | 1444 | bad_area(regs, sw_error_code, address); |
1377 | return; | 1445 | return; |
1378 | } | 1446 | } |
1379 | 1447 | ||
@@ -1382,8 +1450,8 @@ good_area:
1382 | * we can handle it.. | 1450 | * we can handle it.. |
1383 | */ | 1451 | */ |
1384 | good_area: | 1452 | good_area: |
1385 | if (unlikely(access_error(error_code, vma))) { | 1453 | if (unlikely(access_error(sw_error_code, vma))) { |
1386 | bad_area_access_error(regs, error_code, address, vma); | 1454 | bad_area_access_error(regs, sw_error_code, address, vma); |
1387 | return; | 1455 | return; |
1388 | } | 1456 | } |
1389 | 1457 | ||
@@ -1425,13 +1493,13 @@ good_area:
1425 | return; | 1493 | return; |
1426 | 1494 | ||
1427 | /* Not returning to user mode? Handle exceptions or die: */ | 1495 | /* Not returning to user mode? Handle exceptions or die: */ |
1428 | no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); | 1496 | no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); |
1429 | return; | 1497 | return; |
1430 | } | 1498 | } |
1431 | 1499 | ||
1432 | up_read(&mm->mmap_sem); | 1500 | up_read(&mm->mmap_sem); |
1433 | if (unlikely(fault & VM_FAULT_ERROR)) { | 1501 | if (unlikely(fault & VM_FAULT_ERROR)) { |
1434 | mm_fault_error(regs, error_code, address, &pkey, fault); | 1502 | mm_fault_error(regs, sw_error_code, address, &pkey, fault); |
1435 | return; | 1503 | return; |
1436 | } | 1504 | } |
1437 | 1505 | ||
@@ -1449,6 +1517,28 @@ good_area:
1449 | 1517 | ||
1450 | check_v8086_mode(regs, address, tsk); | 1518 | check_v8086_mode(regs, address, tsk); |
1451 | } | 1519 | } |
1520 | NOKPROBE_SYMBOL(do_user_addr_fault); | ||
1521 | |||
1522 | /* | ||
1523 | * This routine handles page faults. It determines the address, | ||
1524 | * and the problem, and then passes it off to one of the appropriate | ||
1525 | * routines. | ||
1526 | */ | ||
1527 | static noinline void | ||
1528 | __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, | ||
1529 | unsigned long address) | ||
1530 | { | ||
1531 | prefetchw(¤t->mm->mmap_sem); | ||
1532 | |||
1533 | if (unlikely(kmmio_fault(regs, address))) | ||
1534 | return; | ||
1535 | |||
1536 | /* Was the fault on kernel-controlled part of the address space? */ | ||
1537 | if (unlikely(fault_in_kernel_space(address))) | ||
1538 | do_kern_addr_fault(regs, hw_error_code, address); | ||
1539 | else | ||
1540 | do_user_addr_fault(regs, hw_error_code, address); | ||
1541 | } | ||
1452 | NOKPROBE_SYMBOL(__do_page_fault); | 1542 | NOKPROBE_SYMBOL(__do_page_fault); |
1453 | 1543 | ||
1454 | static nokprobe_inline void | 1544 | static nokprobe_inline void |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 979e0a02cbe1..142c7d9f89cc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
923 | void mark_rodata_ro(void) | 923 | void mark_rodata_ro(void) |
924 | { | 924 | { |
925 | unsigned long start = PFN_ALIGN(_text); | 925 | unsigned long start = PFN_ALIGN(_text); |
926 | unsigned long size = PFN_ALIGN(_etext) - start; | 926 | unsigned long size = (unsigned long)__end_rodata - start; |
927 | 927 | ||
928 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 928 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
929 | printk(KERN_INFO "Write protecting the kernel text: %luk\n", | 929 | pr_info("Write protecting kernel text and read-only data: %luk\n", |
930 | size >> 10); | 930 | size >> 10); |
931 | 931 | ||
932 | kernel_set_to_readonly = 1; | 932 | kernel_set_to_readonly = 1; |
933 | 933 | ||
934 | #ifdef CONFIG_CPA_DEBUG | 934 | #ifdef CONFIG_CPA_DEBUG |
935 | printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", | 935 | pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size); |
936 | start, start+size); | ||
937 | set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); | ||
938 | |||
939 | printk(KERN_INFO "Testing CPA: write protecting again\n"); | ||
940 | set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); | ||
941 | #endif | ||
942 | |||
943 | start += size; | ||
944 | size = (unsigned long)__end_rodata - start; | ||
945 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | ||
946 | printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", | ||
947 | size >> 10); | ||
948 | |||
949 | #ifdef CONFIG_CPA_DEBUG | ||
950 | printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); | ||
951 | set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); | 936 | set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); |
952 | 937 | ||
953 | printk(KERN_INFO "Testing CPA: write protecting again\n"); | 938 | pr_info("Testing CPA: write protecting again\n"); |
954 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 939 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
955 | #endif | 940 | #endif |
956 | mark_nxdata_nx(); | 941 | mark_nxdata_nx(); |
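
The init_32.c hunk above collapses the two write-protect passes (text, then rodata) into a single set_pages_ro() call spanning _text through __end_rodata. The arithmetic is simple; a hedged standalone illustration with made-up symbol addresses (PAGE_SHIFT assumed to be 12):

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* Hypothetical layout: _text at 16 MiB, __end_rodata 10 MiB later. */
	unsigned long text_start = 0x1000000UL;   /* stand-in for _text */
	unsigned long end_rodata = 0x1a00000UL;   /* stand-in for __end_rodata */

	unsigned long start = PFN_ALIGN(text_start);
	unsigned long size  = end_rodata - start;

	printf("Write protecting kernel text and read-only data: %luk\n",
	       size >> 10);
	printf("pages passed to set_pages_ro(): %lu\n", size >> PAGE_SHIFT);
	return 0;
}
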
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c63a545ec199..24e0920a9b25 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, | |||
131 | * caller shouldn't need to know that small detail. | 131 | * caller shouldn't need to know that small detail. |
132 | */ | 132 | */ |
133 | static void __iomem *__ioremap_caller(resource_size_t phys_addr, | 133 | static void __iomem *__ioremap_caller(resource_size_t phys_addr, |
134 | unsigned long size, enum page_cache_mode pcm, void *caller) | 134 | unsigned long size, enum page_cache_mode pcm, |
135 | void *caller, bool encrypted) | ||
135 | { | 136 | { |
136 | unsigned long offset, vaddr; | 137 | unsigned long offset, vaddr; |
137 | resource_size_t last_addr; | 138 | resource_size_t last_addr; |
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
199 | * resulting mapping. | 200 | * resulting mapping. |
200 | */ | 201 | */ |
201 | prot = PAGE_KERNEL_IO; | 202 | prot = PAGE_KERNEL_IO; |
202 | if (sev_active() && mem_flags.desc_other) | 203 | if ((sev_active() && mem_flags.desc_other) || encrypted) |
203 | prot = pgprot_encrypted(prot); | 204 | prot = pgprot_encrypted(prot); |
204 | 205 | ||
205 | switch (pcm) { | 206 | switch (pcm) { |
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) | |||
291 | enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; | 292 | enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; |
292 | 293 | ||
293 | return __ioremap_caller(phys_addr, size, pcm, | 294 | return __ioremap_caller(phys_addr, size, pcm, |
294 | __builtin_return_address(0)); | 295 | __builtin_return_address(0), false); |
295 | } | 296 | } |
296 | EXPORT_SYMBOL(ioremap_nocache); | 297 | EXPORT_SYMBOL(ioremap_nocache); |
297 | 298 | ||
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) | |||
324 | enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; | 325 | enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; |
325 | 326 | ||
326 | return __ioremap_caller(phys_addr, size, pcm, | 327 | return __ioremap_caller(phys_addr, size, pcm, |
327 | __builtin_return_address(0)); | 328 | __builtin_return_address(0), false); |
328 | } | 329 | } |
329 | EXPORT_SYMBOL_GPL(ioremap_uc); | 330 | EXPORT_SYMBOL_GPL(ioremap_uc); |
330 | 331 | ||
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); | |||
341 | void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) | 342 | void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) |
342 | { | 343 | { |
343 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, | 344 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, |
344 | __builtin_return_address(0)); | 345 | __builtin_return_address(0), false); |
345 | } | 346 | } |
346 | EXPORT_SYMBOL(ioremap_wc); | 347 | EXPORT_SYMBOL(ioremap_wc); |
347 | 348 | ||
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc); | |||
358 | void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) | 359 | void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) |
359 | { | 360 | { |
360 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, | 361 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, |
361 | __builtin_return_address(0)); | 362 | __builtin_return_address(0), false); |
362 | } | 363 | } |
363 | EXPORT_SYMBOL(ioremap_wt); | 364 | EXPORT_SYMBOL(ioremap_wt); |
364 | 365 | ||
366 | void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size) | ||
367 | { | ||
368 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, | ||
369 | __builtin_return_address(0), true); | ||
370 | } | ||
371 | EXPORT_SYMBOL(ioremap_encrypted); | ||
372 | |||
365 | void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) | 373 | void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) |
366 | { | 374 | { |
367 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, | 375 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, |
368 | __builtin_return_address(0)); | 376 | __builtin_return_address(0), false); |
369 | } | 377 | } |
370 | EXPORT_SYMBOL(ioremap_cache); | 378 | EXPORT_SYMBOL(ioremap_cache); |
371 | 379 | ||
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, | |||
374 | { | 382 | { |
375 | return __ioremap_caller(phys_addr, size, | 383 | return __ioremap_caller(phys_addr, size, |
376 | pgprot2cachemode(__pgprot(prot_val)), | 384 | pgprot2cachemode(__pgprot(prot_val)), |
377 | __builtin_return_address(0)); | 385 | __builtin_return_address(0), false); |
378 | } | 386 | } |
379 | EXPORT_SYMBOL(ioremap_prot); | 387 | EXPORT_SYMBOL(ioremap_prot); |
380 | 388 | ||
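
ioremap_encrypted() is the only new entry point in this file: passing encrypted=true forces pgprot_encrypted() on the WB-cached mapping regardless of SEV state, so that e.g. a kdump kernel can read the SME-encrypted memory of the crashed kernel. A hedged, kernel-style sketch of a caller follows; the helper name and error handling are illustrative, not taken from this diff.

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/io.h>

/* Illustrative helper: copy part of one page of the old (encrypted)
 * kernel's memory into a buffer via the new ioremap_encrypted(). */
static ssize_t copy_encrypted_oldmem(unsigned long pfn, char *buf,
				     size_t csize, unsigned long offset)
{
	void __iomem *vaddr;

	vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
	if (!vaddr)
		return -ENOMEM;

	memcpy_fromio(buf, vaddr + offset, csize);
	iounmap(vaddr);
	return csize;
}
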
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 51a5a69ecac9..62bb30b4bd2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -37,11 +37,20 @@ struct cpa_data { | |||
37 | unsigned long numpages; | 37 | unsigned long numpages; |
38 | int flags; | 38 | int flags; |
39 | unsigned long pfn; | 39 | unsigned long pfn; |
40 | unsigned force_split : 1; | 40 | unsigned force_split : 1, |
41 | force_static_prot : 1; | ||
41 | int curpage; | 42 | int curpage; |
42 | struct page **pages; | 43 | struct page **pages; |
43 | }; | 44 | }; |
44 | 45 | ||
46 | enum cpa_warn { | ||
47 | CPA_CONFLICT, | ||
48 | CPA_PROTECT, | ||
49 | CPA_DETECT, | ||
50 | }; | ||
51 | |||
52 | static const int cpa_warn_level = CPA_PROTECT; | ||
53 | |||
45 | /* | 54 | /* |
46 | * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) | 55 | * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) |
47 | * using cpa_lock. So that we don't allow any other cpu, with stale large tlb | 56 | * using cpa_lock. So that we don't allow any other cpu, with stale large tlb |
@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m) | |||
94 | static inline void split_page_count(int level) { } | 103 | static inline void split_page_count(int level) { } |
95 | #endif | 104 | #endif |
96 | 105 | ||
106 | #ifdef CONFIG_X86_CPA_STATISTICS | ||
107 | |||
108 | static unsigned long cpa_1g_checked; | ||
109 | static unsigned long cpa_1g_sameprot; | ||
110 | static unsigned long cpa_1g_preserved; | ||
111 | static unsigned long cpa_2m_checked; | ||
112 | static unsigned long cpa_2m_sameprot; | ||
113 | static unsigned long cpa_2m_preserved; | ||
114 | static unsigned long cpa_4k_install; | ||
115 | |||
116 | static inline void cpa_inc_1g_checked(void) | ||
117 | { | ||
118 | cpa_1g_checked++; | ||
119 | } | ||
120 | |||
121 | static inline void cpa_inc_2m_checked(void) | ||
122 | { | ||
123 | cpa_2m_checked++; | ||
124 | } | ||
125 | |||
126 | static inline void cpa_inc_4k_install(void) | ||
127 | { | ||
128 | cpa_4k_install++; | ||
129 | } | ||
130 | |||
131 | static inline void cpa_inc_lp_sameprot(int level) | ||
132 | { | ||
133 | if (level == PG_LEVEL_1G) | ||
134 | cpa_1g_sameprot++; | ||
135 | else | ||
136 | cpa_2m_sameprot++; | ||
137 | } | ||
138 | |||
139 | static inline void cpa_inc_lp_preserved(int level) | ||
140 | { | ||
141 | if (level == PG_LEVEL_1G) | ||
142 | cpa_1g_preserved++; | ||
143 | else | ||
144 | cpa_2m_preserved++; | ||
145 | } | ||
146 | |||
147 | static int cpastats_show(struct seq_file *m, void *p) | ||
148 | { | ||
149 | seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); | ||
150 | seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); | ||
151 | seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); | ||
152 | seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); | ||
153 | seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); | ||
154 | seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); | ||
155 | seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static int cpastats_open(struct inode *inode, struct file *file) | ||
160 | { | ||
161 | return single_open(file, cpastats_show, NULL); | ||
162 | } | ||
163 | |||
164 | static const struct file_operations cpastats_fops = { | ||
165 | .open = cpastats_open, | ||
166 | .read = seq_read, | ||
167 | .llseek = seq_lseek, | ||
168 | .release = single_release, | ||
169 | }; | ||
170 | |||
171 | static int __init cpa_stats_init(void) | ||
172 | { | ||
173 | debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, | ||
174 | &cpastats_fops); | ||
175 | return 0; | ||
176 | } | ||
177 | late_initcall(cpa_stats_init); | ||
178 | #else | ||
179 | static inline void cpa_inc_1g_checked(void) { } | ||
180 | static inline void cpa_inc_2m_checked(void) { } | ||
181 | static inline void cpa_inc_4k_install(void) { } | ||
182 | static inline void cpa_inc_lp_sameprot(int level) { } | ||
183 | static inline void cpa_inc_lp_preserved(int level) { } | ||
184 | #endif | ||
185 | |||
186 | |||
97 | static inline int | 187 | static inline int |
98 | within(unsigned long addr, unsigned long start, unsigned long end) | 188 | within(unsigned long addr, unsigned long start, unsigned long end) |
99 | { | 189 | { |
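
With CONFIG_X86_CPA_STATISTICS enabled, the counters above surface in debugfs under the x86 directory as cpa_stats (root-readable, per the S_IRUSR mode). A minimal userspace reader, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	/* Path assumes debugfs at /sys/kernel/debug; adjust if mounted elsewhere. */
	const char *path = "/sys/kernel/debug/x86/cpa_stats";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
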
@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache) | |||
195 | on_each_cpu(__cpa_flush_all, (void *) cache, 1); | 285 | on_each_cpu(__cpa_flush_all, (void *) cache, 1); |
196 | } | 286 | } |
197 | 287 | ||
198 | static void __cpa_flush_range(void *arg) | 288 | static bool __cpa_flush_range(unsigned long start, int numpages, int cache) |
199 | { | 289 | { |
200 | /* | 290 | BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); |
201 | * We could optimize that further and do individual per page | 291 | |
202 | * tlb invalidates for a low number of pages. Caveat: we must | 292 | WARN_ON(PAGE_ALIGN(start) != start); |
203 | * flush the high aliases on 64bit as well. | 293 | |
204 | */ | 294 | if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { |
205 | __flush_tlb_all(); | 295 | cpa_flush_all(cache); |
296 | return true; | ||
297 | } | ||
298 | |||
299 | flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); | ||
300 | |||
301 | return !cache; | ||
206 | } | 302 | } |
207 | 303 | ||
208 | static void cpa_flush_range(unsigned long start, int numpages, int cache) | 304 | static void cpa_flush_range(unsigned long start, int numpages, int cache) |
@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) | |||
210 | unsigned int i, level; | 306 | unsigned int i, level; |
211 | unsigned long addr; | 307 | unsigned long addr; |
212 | 308 | ||
213 | BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); | 309 | if (__cpa_flush_range(start, numpages, cache)) |
214 | WARN_ON(PAGE_ALIGN(start) != start); | ||
215 | |||
216 | on_each_cpu(__cpa_flush_range, NULL, 1); | ||
217 | |||
218 | if (!cache) | ||
219 | return; | 310 | return; |
220 | 311 | ||
221 | /* | 312 | /* |
@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) | |||
235 | } | 326 | } |
236 | } | 327 | } |
237 | 328 | ||
238 | static void cpa_flush_array(unsigned long *start, int numpages, int cache, | 329 | static void cpa_flush_array(unsigned long baddr, unsigned long *start, |
330 | int numpages, int cache, | ||
239 | int in_flags, struct page **pages) | 331 | int in_flags, struct page **pages) |
240 | { | 332 | { |
241 | unsigned int i, level; | 333 | unsigned int i, level; |
242 | #ifdef CONFIG_PREEMPT | ||
243 | /* | ||
244 | * Avoid wbinvd() because it causes latencies on all CPUs, | ||
245 | * regardless of any CPU isolation that may be in effect. | ||
246 | * | ||
247 | * This should be extended for CAT enabled systems independent of | ||
248 | * PREEMPT because wbinvd() does not respect the CAT partitions and | ||
249 | * this is exposed to unpriviledged users through the graphics | ||
250 | * subsystem. | ||
251 | */ | ||
252 | unsigned long do_wbinvd = 0; | ||
253 | #else | ||
254 | unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ | ||
255 | #endif | ||
256 | |||
257 | BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); | ||
258 | 334 | ||
259 | on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); | 335 | if (__cpa_flush_range(baddr, numpages, cache)) |
260 | |||
261 | if (!cache || do_wbinvd) | ||
262 | return; | 336 | return; |
263 | 337 | ||
264 | /* | 338 | /* |
@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, | |||
286 | } | 360 | } |
287 | } | 361 | } |
288 | 362 | ||
289 | /* | 363 | static bool overlaps(unsigned long r1_start, unsigned long r1_end, |
290 | * Certain areas of memory on x86 require very specific protection flags, | 364 | unsigned long r2_start, unsigned long r2_end) |
291 | * for example the BIOS area or kernel text. Callers don't always get this | ||
292 | * right (again, ioremap() on BIOS memory is not uncommon) so this function | ||
293 | * checks and fixes these known static required protection bits. | ||
294 | */ | ||
295 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | ||
296 | unsigned long pfn) | ||
297 | { | 365 | { |
298 | pgprot_t forbidden = __pgprot(0); | 366 | return (r1_start <= r2_end && r1_end >= r2_start) || |
367 | (r2_start <= r1_end && r2_end >= r1_start); | ||
368 | } | ||
299 | 369 | ||
300 | /* | ||
301 | * The BIOS area between 640k and 1Mb needs to be executable for | ||
302 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | ||
303 | */ | ||
304 | #ifdef CONFIG_PCI_BIOS | 370 | #ifdef CONFIG_PCI_BIOS |
305 | if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | 371 | /* |
306 | pgprot_val(forbidden) |= _PAGE_NX; | 372 | * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS |
373 | * based config access (CONFIG_PCI_GOBIOS) support. | ||
374 | */ | ||
375 | #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) | ||
376 | #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) | ||
377 | |||
378 | static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) | ||
379 | { | ||
380 | if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) | ||
381 | return _PAGE_NX; | ||
382 | return 0; | ||
383 | } | ||
384 | #else | ||
385 | static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) | ||
386 | { | ||
387 | return 0; | ||
388 | } | ||
307 | #endif | 389 | #endif |
308 | 390 | ||
309 | /* | 391 | /* |
310 | * The kernel text needs to be executable for obvious reasons | 392 | * The .rodata section needs to be read-only. Using the pfn catches all |
311 | * Does not cover __inittext since that is gone later on. On | 393 | * aliases. This also includes __ro_after_init, so do not enforce until |
312 | * 64bit we do not enforce !NX on the low mapping | 394 | * kernel_set_to_readonly is true. |
313 | */ | 395 | */ |
314 | if (within(address, (unsigned long)_text, (unsigned long)_etext)) | 396 | static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) |
315 | pgprot_val(forbidden) |= _PAGE_NX; | 397 | { |
398 | unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); | ||
316 | 399 | ||
317 | /* | 400 | /* |
318 | * The .rodata section needs to be read-only. Using the pfn | 401 | * Note: __end_rodata is at page aligned and not inclusive, so |
319 | * catches all aliases. This also includes __ro_after_init, | 402 | * subtract 1 to get the last enforced PFN in the rodata area. |
320 | * so do not enforce until kernel_set_to_readonly is true. | ||
321 | */ | 403 | */ |
322 | if (kernel_set_to_readonly && | 404 | epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; |
323 | within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, | 405 | |
324 | __pa_symbol(__end_rodata) >> PAGE_SHIFT)) | 406 | if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) |
325 | pgprot_val(forbidden) |= _PAGE_RW; | 407 | return _PAGE_RW; |
408 | return 0; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * Protect kernel text against becoming non executable by forbidding | ||
413 | * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) | ||
414 | * out of which the kernel actually executes. Do not protect the low | ||
415 | * mapping. | ||
416 | * | ||
417 | * This does not cover __inittext since that is gone after boot. | ||
418 | */ | ||
419 | static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) | ||
420 | { | ||
421 | unsigned long t_end = (unsigned long)_etext - 1; | ||
422 | unsigned long t_start = (unsigned long)_text; | ||
423 | |||
424 | if (overlaps(start, end, t_start, t_end)) | ||
425 | return _PAGE_NX; | ||
426 | return 0; | ||
427 | } | ||
326 | 428 | ||
327 | #if defined(CONFIG_X86_64) | 429 | #if defined(CONFIG_X86_64) |
430 | /* | ||
431 | * Once the kernel maps the text as RO (kernel_set_to_readonly is set), | ||
432 | * kernel text mappings for the large page aligned text, rodata sections | ||
433 | * will be always read-only. For the kernel identity mappings covering the | ||
434 | * holes caused by this alignment can be anything that user asks. | ||
435 | * | ||
436 | * This will preserve the large page mappings for kernel text/data at no | ||
437 | * extra cost. | ||
438 | */ | ||
439 | static pgprotval_t protect_kernel_text_ro(unsigned long start, | ||
440 | unsigned long end) | ||
441 | { | ||
442 | unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; | ||
443 | unsigned long t_start = (unsigned long)_text; | ||
444 | unsigned int level; | ||
445 | |||
446 | if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) | ||
447 | return 0; | ||
328 | /* | 448 | /* |
329 | * Once the kernel maps the text as RO (kernel_set_to_readonly is set), | 449 | * Don't enforce the !RW mapping for the kernel text mapping, if |
330 | * kernel text mappings for the large page aligned text, rodata sections | 450 | * the current mapping is already using small page mapping. No |
331 | * will be always read-only. For the kernel identity mappings covering | 451 | * need to work hard to preserve large page mappings in this case. |
332 | * the holes caused by this alignment can be anything that user asks. | ||
333 | * | 452 | * |
334 | * This will preserve the large page mappings for kernel text/data | 453 | * This also fixes the Linux Xen paravirt guest boot failure caused |
335 | * at no extra cost. | 454 | * by unexpected read-only mappings for kernel identity |
455 | * mappings. In this paravirt guest case, the kernel text mapping | ||
456 | * and the kernel identity mapping share the same page-table pages, | ||
457 | * so the protections for kernel text and identity mappings have to | ||
458 | * be the same. | ||
336 | */ | 459 | */ |
337 | if (kernel_set_to_readonly && | 460 | if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) |
338 | within(address, (unsigned long)_text, | 461 | return _PAGE_RW; |
339 | (unsigned long)__end_rodata_hpage_align)) { | 462 | return 0; |
340 | unsigned int level; | 463 | } |
341 | 464 | #else | |
342 | /* | 465 | static pgprotval_t protect_kernel_text_ro(unsigned long start, |
343 | * Don't enforce the !RW mapping for the kernel text mapping, | 466 | unsigned long end) |
344 | * if the current mapping is already using small page mapping. | 467 | { |
345 | * No need to work hard to preserve large page mappings in this | 468 | return 0; |
346 | * case. | 469 | } |
347 | * | ||
348 | * This also fixes the Linux Xen paravirt guest boot failure | ||
349 | * (because of unexpected read-only mappings for kernel identity | ||
350 | * mappings). In this paravirt guest case, the kernel text | ||
351 | * mapping and the kernel identity mapping share the same | ||
352 | * page-table pages. Thus we can't really use different | ||
353 | * protections for the kernel text and identity mappings. Also, | ||
354 | * these shared mappings are made of small page mappings. | ||
355 | * Thus this don't enforce !RW mapping for small page kernel | ||
356 | * text mapping logic will help Linux Xen parvirt guest boot | ||
357 | * as well. | ||
358 | */ | ||
359 | if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) | ||
360 | pgprot_val(forbidden) |= _PAGE_RW; | ||
361 | } | ||
362 | #endif | 470 | #endif |
363 | 471 | ||
364 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | 472 | static inline bool conflicts(pgprot_t prot, pgprotval_t val) |
473 | { | ||
474 | return (pgprot_val(prot) & ~val) != pgprot_val(prot); | ||
475 | } | ||
365 | 476 | ||
366 | return prot; | 477 | static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, |
478 | unsigned long start, unsigned long end, | ||
479 | unsigned long pfn, const char *txt) | ||
480 | { | ||
481 | static const char *lvltxt[] = { | ||
482 | [CPA_CONFLICT] = "conflict", | ||
483 | [CPA_PROTECT] = "protect", | ||
484 | [CPA_DETECT] = "detect", | ||
485 | }; | ||
486 | |||
487 | if (warnlvl > cpa_warn_level || !conflicts(prot, val)) | ||
488 | return; | ||
489 | |||
490 | pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", | ||
491 | lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), | ||
492 | (unsigned long long)val); | ||
493 | } | ||
494 | |||
495 | /* | ||
496 | * Certain areas of memory on x86 require very specific protection flags, | ||
497 | * for example the BIOS area or kernel text. Callers don't always get this | ||
498 | * right (again, ioremap() on BIOS memory is not uncommon) so this function | ||
499 | * checks and fixes these known static required protection bits. | ||
500 | */ | ||
501 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, | ||
502 | unsigned long pfn, unsigned long npg, | ||
503 | int warnlvl) | ||
504 | { | ||
505 | pgprotval_t forbidden, res; | ||
506 | unsigned long end; | ||
507 | |||
508 | /* | ||
509 | * There is no point in checking RW/NX conflicts when the requested | ||
510 | * mapping is setting the page !PRESENT. | ||
511 | */ | ||
512 | if (!(pgprot_val(prot) & _PAGE_PRESENT)) | ||
513 | return prot; | ||
514 | |||
515 | /* Operate on the virtual address */ | ||
516 | end = start + npg * PAGE_SIZE - 1; | ||
517 | |||
518 | res = protect_kernel_text(start, end); | ||
519 | check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); | ||
520 | forbidden = res; | ||
521 | |||
522 | res = protect_kernel_text_ro(start, end); | ||
523 | check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); | ||
524 | forbidden |= res; | ||
525 | |||
526 | /* Check the PFN directly */ | ||
527 | res = protect_pci_bios(pfn, pfn + npg - 1); | ||
528 | check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); | ||
529 | forbidden |= res; | ||
530 | |||
531 | res = protect_rodata(pfn, pfn + npg - 1); | ||
532 | check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); | ||
533 | forbidden |= res; | ||
534 | |||
535 | return __pgprot(pgprot_val(prot) & ~forbidden); | ||
367 | } | 536 | } |
368 | 537 | ||
369 | /* | 538 | /* |
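
The rework above replaces the per-4K within() checks with inclusive-range overlap checks, so static_protections() can evaluate a whole span (including a full large page) in one call. A tiny standalone demo of the overlaps() predicate on inclusive PFN ranges, with arbitrary example values:

#include <stdbool.h>
#include <stdio.h>

/* Same inclusive-range overlap test as the patch introduces. */
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
		     unsigned long r2_start, unsigned long r2_end)
{
	return (r1_start <= r2_end && r1_end >= r2_start) ||
	       (r2_start <= r1_end && r2_end >= r1_start);
}

int main(void)
{
	/* A request spanning PFNs 0x9f..0x105 against the PCI BIOS window
	 * 0xa0..0xff (inclusive end PFNs, as in the patch). */
	printf("request vs BIOS window: %d\n", overlaps(0x9f, 0x105, 0xa0, 0xff));
	printf("disjoint ranges:        %d\n", overlaps(0x10, 0x1f, 0x40, 0x7f));
	return 0;
}
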
@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, | |||
421 | */ | 590 | */ |
422 | pte_t *lookup_address(unsigned long address, unsigned int *level) | 591 | pte_t *lookup_address(unsigned long address, unsigned int *level) |
423 | { | 592 | { |
424 | return lookup_address_in_pgd(pgd_offset_k(address), address, level); | 593 | return lookup_address_in_pgd(pgd_offset_k(address), address, level); |
425 | } | 594 | } |
426 | EXPORT_SYMBOL_GPL(lookup_address); | 595 | EXPORT_SYMBOL_GPL(lookup_address); |
427 | 596 | ||
428 | static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, | 597 | static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, |
429 | unsigned int *level) | 598 | unsigned int *level) |
430 | { | 599 | { |
431 | if (cpa->pgd) | 600 | if (cpa->pgd) |
432 | return lookup_address_in_pgd(cpa->pgd + pgd_index(address), | 601 | return lookup_address_in_pgd(cpa->pgd + pgd_index(address), |
433 | address, level); | 602 | address, level); |
434 | 603 | ||
435 | return lookup_address(address, level); | 604 | return lookup_address(address, level); |
436 | } | 605 | } |
437 | 606 | ||
438 | /* | 607 | /* |
@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) | |||
549 | return prot; | 718 | return prot; |
550 | } | 719 | } |
551 | 720 | ||
552 | static int | 721 | static int __should_split_large_page(pte_t *kpte, unsigned long address, |
553 | try_preserve_large_page(pte_t *kpte, unsigned long address, | 722 | struct cpa_data *cpa) |
554 | struct cpa_data *cpa) | ||
555 | { | 723 | { |
556 | unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; | 724 | unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; |
725 | pgprot_t old_prot, new_prot, req_prot, chk_prot; | ||
557 | pte_t new_pte, old_pte, *tmp; | 726 | pte_t new_pte, old_pte, *tmp; |
558 | pgprot_t old_prot, new_prot, req_prot; | ||
559 | int i, do_split = 1; | ||
560 | enum pg_level level; | 727 | enum pg_level level; |
561 | 728 | ||
562 | if (cpa->force_split) | ||
563 | return 1; | ||
564 | |||
565 | spin_lock(&pgd_lock); | ||
566 | /* | 729 | /* |
567 | * Check for races, another CPU might have split this page | 730 | * Check for races, another CPU might have split this page |
568 | * up already: | 731 | * up already: |
569 | */ | 732 | */ |
570 | tmp = _lookup_address_cpa(cpa, address, &level); | 733 | tmp = _lookup_address_cpa(cpa, address, &level); |
571 | if (tmp != kpte) | 734 | if (tmp != kpte) |
572 | goto out_unlock; | 735 | return 1; |
573 | 736 | ||
574 | switch (level) { | 737 | switch (level) { |
575 | case PG_LEVEL_2M: | 738 | case PG_LEVEL_2M: |
576 | old_prot = pmd_pgprot(*(pmd_t *)kpte); | 739 | old_prot = pmd_pgprot(*(pmd_t *)kpte); |
577 | old_pfn = pmd_pfn(*(pmd_t *)kpte); | 740 | old_pfn = pmd_pfn(*(pmd_t *)kpte); |
741 | cpa_inc_2m_checked(); | ||
578 | break; | 742 | break; |
579 | case PG_LEVEL_1G: | 743 | case PG_LEVEL_1G: |
580 | old_prot = pud_pgprot(*(pud_t *)kpte); | 744 | old_prot = pud_pgprot(*(pud_t *)kpte); |
581 | old_pfn = pud_pfn(*(pud_t *)kpte); | 745 | old_pfn = pud_pfn(*(pud_t *)kpte); |
746 | cpa_inc_1g_checked(); | ||
582 | break; | 747 | break; |
583 | default: | 748 | default: |
584 | do_split = -EINVAL; | 749 | return -EINVAL; |
585 | goto out_unlock; | ||
586 | } | 750 | } |
587 | 751 | ||
588 | psize = page_level_size(level); | 752 | psize = page_level_size(level); |
@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
592 | * Calculate the number of pages, which fit into this large | 756 | * Calculate the number of pages, which fit into this large |
593 | * page starting at address: | 757 | * page starting at address: |
594 | */ | 758 | */ |
595 | nextpage_addr = (address + psize) & pmask; | 759 | lpaddr = (address + psize) & pmask; |
596 | numpages = (nextpage_addr - address) >> PAGE_SHIFT; | 760 | numpages = (lpaddr - address) >> PAGE_SHIFT; |
597 | if (numpages < cpa->numpages) | 761 | if (numpages < cpa->numpages) |
598 | cpa->numpages = numpages; | 762 | cpa->numpages = numpages; |
599 | 763 | ||
@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
620 | pgprot_val(req_prot) |= _PAGE_PSE; | 784 | pgprot_val(req_prot) |= _PAGE_PSE; |
621 | 785 | ||
622 | /* | 786 | /* |
623 | * old_pfn points to the large page base pfn. So we need | 787 | * old_pfn points to the large page base pfn. So we need to add the |
624 | * to add the offset of the virtual address: | 788 | * offset of the virtual address: |
625 | */ | 789 | */ |
626 | pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); | 790 | pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); |
627 | cpa->pfn = pfn; | 791 | cpa->pfn = pfn; |
628 | 792 | ||
629 | new_prot = static_protections(req_prot, address, pfn); | 793 | /* |
794 | * Calculate the large page base address and the number of 4K pages | ||
795 | * in the large page | ||
796 | */ | ||
797 | lpaddr = address & pmask; | ||
798 | numpages = psize >> PAGE_SHIFT; | ||
630 | 799 | ||
631 | /* | 800 | /* |
632 | * We need to check the full range, whether | 801 | * Sanity check that the existing mapping is correct versus the static |
633 | * static_protection() requires a different pgprot for one of | 802 | * protections. static_protections() guards against !PRESENT, so no |
634 | * the pages in the range we try to preserve: | 803 | * extra conditional required here. |
635 | */ | 804 | */ |
636 | addr = address & pmask; | 805 | chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, |
637 | pfn = old_pfn; | 806 | CPA_CONFLICT); |
638 | for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { | ||
639 | pgprot_t chk_prot = static_protections(req_prot, addr, pfn); | ||
640 | 807 | ||
641 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) | 808 | if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { |
642 | goto out_unlock; | 809 | /* |
810 | * Split the large page and tell the split code to | ||
811 | * enforce static protections. | ||
812 | */ | ||
813 | cpa->force_static_prot = 1; | ||
814 | return 1; | ||
643 | } | 815 | } |
644 | 816 | ||
645 | /* | 817 | /* |
646 | * If there are no changes, return. maxpages has been updated | 818 | * Optimization: If the requested pgprot is the same as the current |
647 | * above: | 819 | * pgprot, then the large page can be preserved and no updates are |
820 | * required independent of alignment and length of the requested | ||
821 | * range. The above already established that the current pgprot is | ||
822 | * correct, which in consequence makes the requested pgprot correct | ||
823 | * as well if it is the same. The static protection scan below will | ||
824 | * not come to a different conclusion. | ||
648 | */ | 825 | */ |
649 | if (pgprot_val(new_prot) == pgprot_val(old_prot)) { | 826 | if (pgprot_val(req_prot) == pgprot_val(old_prot)) { |
650 | do_split = 0; | 827 | cpa_inc_lp_sameprot(level); |
651 | goto out_unlock; | 828 | return 0; |
652 | } | 829 | } |
653 | 830 | ||
654 | /* | 831 | /* |
655 | * We need to change the attributes. Check, whether we can | 832 | * If the requested range does not cover the full page, split it up |
656 | * change the large page in one go. We request a split, when | ||
657 | * the address is not aligned and the number of pages is | ||
658 | * smaller than the number of pages in the large page. Note | ||
659 | * that we limited the number of possible pages already to | ||
660 | * the number of pages in the large page. | ||
661 | */ | 833 | */ |
662 | if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { | 834 | if (address != lpaddr || cpa->numpages != numpages) |
663 | /* | 835 | return 1; |
664 | * The address is aligned and the number of pages | ||
665 | * covers the full page. | ||
666 | */ | ||
667 | new_pte = pfn_pte(old_pfn, new_prot); | ||
668 | __set_pmd_pte(kpte, address, new_pte); | ||
669 | cpa->flags |= CPA_FLUSHTLB; | ||
670 | do_split = 0; | ||
671 | } | ||
672 | 836 | ||
673 | out_unlock: | 837 | /* |
838 | * Check whether the requested pgprot is conflicting with a static | ||
839 | * protection requirement in the large page. | ||
840 | */ | ||
841 | new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, | ||
842 | CPA_DETECT); | ||
843 | |||
844 | /* | ||
845 | * If there is a conflict, split the large page. | ||
846 | * | ||
847 | * There used to be a 4k wise evaluation trying really hard to | ||
848 | * preserve the large pages, but experimentation has shown, that this | ||
849 | * does not help at all. There might be corner cases which would | ||
850 | * preserve one large page occasionally, but it's really not worth the | ||
851 | * extra code and cycles for the common case. | ||
852 | */ | ||
853 | if (pgprot_val(req_prot) != pgprot_val(new_prot)) | ||
854 | return 1; | ||
855 | |||
856 | /* All checks passed. Update the large page mapping. */ | ||
857 | new_pte = pfn_pte(old_pfn, new_prot); | ||
858 | __set_pmd_pte(kpte, address, new_pte); | ||
859 | cpa->flags |= CPA_FLUSHTLB; | ||
860 | cpa_inc_lp_preserved(level); | ||
861 | return 0; | ||
862 | } | ||
863 | |||
864 | static int should_split_large_page(pte_t *kpte, unsigned long address, | ||
865 | struct cpa_data *cpa) | ||
866 | { | ||
867 | int do_split; | ||
868 | |||
869 | if (cpa->force_split) | ||
870 | return 1; | ||
871 | |||
872 | spin_lock(&pgd_lock); | ||
873 | do_split = __should_split_large_page(kpte, address, cpa); | ||
674 | spin_unlock(&pgd_lock); | 874 | spin_unlock(&pgd_lock); |
675 | 875 | ||
676 | return do_split; | 876 | return do_split; |
677 | } | 877 | } |
678 | 878 | ||
879 | static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, | ||
880 | pgprot_t ref_prot, unsigned long address, | ||
881 | unsigned long size) | ||
882 | { | ||
883 | unsigned int npg = PFN_DOWN(size); | ||
884 | pgprot_t prot; | ||
885 | |||
886 | /* | ||
887 | * If should_split_large_page() discovered an inconsistent mapping, | ||
888 | * remove the invalid protection in the split mapping. | ||
889 | */ | ||
890 | if (!cpa->force_static_prot) | ||
891 | goto set; | ||
892 | |||
893 | prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT); | ||
894 | |||
895 | if (pgprot_val(prot) == pgprot_val(ref_prot)) | ||
896 | goto set; | ||
897 | |||
898 | /* | ||
899 | * If this is splitting a PMD, fix it up. PUD splits cannot be | ||
900 | * fixed trivially as that would require to rescan the newly | ||
901 | * installed PMD mappings after returning from split_large_page() | ||
902 | * so an eventual further split can allocate the necessary PTE | ||
903 | * pages. Warn for now and revisit it in case this actually | ||
904 | * happens. | ||
905 | */ | ||
906 | if (size == PAGE_SIZE) | ||
907 | ref_prot = prot; | ||
908 | else | ||
909 | pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); | ||
910 | set: | ||
911 | set_pte(pte, pfn_pte(pfn, ref_prot)); | ||
912 | } | ||
913 | |||
679 | static int | 914 | static int |
680 | __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, | 915 | __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, |
681 | struct page *base) | 916 | struct page *base) |
682 | { | 917 | { |
918 | unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; | ||
683 | pte_t *pbase = (pte_t *)page_address(base); | 919 | pte_t *pbase = (pte_t *)page_address(base); |
684 | unsigned long ref_pfn, pfn, pfninc = 1; | ||
685 | unsigned int i, level; | 920 | unsigned int i, level; |
686 | pte_t *tmp; | ||
687 | pgprot_t ref_prot; | 921 | pgprot_t ref_prot; |
922 | pte_t *tmp; | ||
688 | 923 | ||
689 | spin_lock(&pgd_lock); | 924 | spin_lock(&pgd_lock); |
690 | /* | 925 | /* |
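
should_split_large_page() now returns a simple verdict: 1 to split, 0 to preserve (after updating the large entry and setting CPA_FLUSHTLB), negative on error. Stripped of locking, statistics and the static-protection scans, the decision reduces to the model below; the prot values and the conflict flag are placeholders, not real pgprot encodings.

#include <stdio.h>

#define PAGE_SHIFT 12UL

/* Simplified verdict: 1 = split, 0 = preserve. 'conflict' stands in for
 * static_protections() altering the requested pgprot. */
static int should_split_model(unsigned long address, unsigned long numpages,
			      unsigned long psize, unsigned long req_prot,
			      unsigned long old_prot, int conflict)
{
	unsigned long lpaddr = address & ~(psize - 1);

	if (req_prot == old_prot)	/* same prot: always preserve */
		return 0;
	if (address != lpaddr || numpages != (psize >> PAGE_SHIFT))
		return 1;		/* partial coverage: split */
	if (conflict)
		return 1;		/* static protections object: split */
	return 0;			/* full, conflict-free change: preserve */
}

int main(void)
{
	unsigned long two_mb = 2UL << 20;

	/* Same prot over a whole aligned 2M page: preserved. */
	printf("%d\n", should_split_model(0x40000000UL, 512, two_mb, 0x63, 0x63, 0));
	/* Different prot over a misaligned 16-page slice: split. */
	printf("%d\n", should_split_model(0x40001000UL, 16, two_mb, 0x63, 0xe3, 0));
	return 0;
}
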
@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, | |||
707 | * PAT bit to correct position. | 942 | * PAT bit to correct position. |
708 | */ | 943 | */ |
709 | ref_prot = pgprot_large_2_4k(ref_prot); | 944 | ref_prot = pgprot_large_2_4k(ref_prot); |
710 | |||
711 | ref_pfn = pmd_pfn(*(pmd_t *)kpte); | 945 | ref_pfn = pmd_pfn(*(pmd_t *)kpte); |
946 | lpaddr = address & PMD_MASK; | ||
947 | lpinc = PAGE_SIZE; | ||
712 | break; | 948 | break; |
713 | 949 | ||
714 | case PG_LEVEL_1G: | 950 | case PG_LEVEL_1G: |
715 | ref_prot = pud_pgprot(*(pud_t *)kpte); | 951 | ref_prot = pud_pgprot(*(pud_t *)kpte); |
716 | ref_pfn = pud_pfn(*(pud_t *)kpte); | 952 | ref_pfn = pud_pfn(*(pud_t *)kpte); |
717 | pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; | 953 | pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; |
718 | 954 | lpaddr = address & PUD_MASK; | |
955 | lpinc = PMD_SIZE; | ||
719 | /* | 956 | /* |
720 | * Clear the PSE flags if the PRESENT flag is not set | 957 | * Clear the PSE flags if the PRESENT flag is not set |
721 | * otherwise pmd_present/pmd_huge will return true | 958 | * otherwise pmd_present/pmd_huge will return true |
@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, | |||
736 | * Get the target pfn from the original entry: | 973 | * Get the target pfn from the original entry: |
737 | */ | 974 | */ |
738 | pfn = ref_pfn; | 975 | pfn = ref_pfn; |
739 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) | 976 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) |
740 | set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); | 977 | split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); |
741 | 978 | ||
742 | if (virt_addr_valid(address)) { | 979 | if (virt_addr_valid(address)) { |
743 | unsigned long pfn = PFN_DOWN(__pa(address)); | 980 | unsigned long pfn = PFN_DOWN(__pa(address)); |
@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, | |||
756 | __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); | 993 | __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); |
757 | 994 | ||
758 | /* | 995 | /* |
759 | * Intel Atom errata AAH41 workaround. | 996 | * Do a global flush tlb after splitting the large page |
997 | * and before we do the actual change page attribute in the PTE. | ||
998 | * | ||
999 | * Without this, we violate the TLB application note, that says: | ||
1000 | * "The TLBs may contain both ordinary and large-page | ||
1001 | * translations for a 4-KByte range of linear addresses. This | ||
1002 | * may occur if software modifies the paging structures so that | ||
1003 | * the page size used for the address range changes. If the two | ||
1004 | * translations differ with respect to page frame or attributes | ||
1005 | * (e.g., permissions), processor behavior is undefined and may | ||
1006 | * be implementation-specific." | ||
760 | * | 1007 | * |
761 | * The real fix should be in hw or in a microcode update, but | 1008 | * We do this global tlb flush inside the cpa_lock, so that we |
762 | * we also probabilistically try to reduce the window of having | 1009 | * don't allow any other cpu, with stale tlb entries change the |
763 | * a large TLB mixed with 4K TLBs while instruction fetches are | 1010 | * page attribute in parallel, that also falls into the |
764 | * going on. | 1011 | * just split large page entry. |
765 | */ | 1012 | */ |
766 | __flush_tlb_all(); | 1013 | flush_tlb_all(); |
767 | spin_unlock(&pgd_lock); | 1014 | spin_unlock(&pgd_lock); |
768 | 1015 | ||
769 | return 0; | 1016 | return 0; |
@@ -1247,7 +1494,9 @@ repeat: | |||
1247 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | 1494 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); |
1248 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | 1495 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); |
1249 | 1496 | ||
1250 | new_prot = static_protections(new_prot, address, pfn); | 1497 | cpa_inc_4k_install(); |
1498 | new_prot = static_protections(new_prot, address, pfn, 1, | ||
1499 | CPA_PROTECT); | ||
1251 | 1500 | ||
1252 | new_prot = pgprot_clear_protnone_bits(new_prot); | 1501 | new_prot = pgprot_clear_protnone_bits(new_prot); |
1253 | 1502 | ||
@@ -1273,7 +1522,7 @@ repeat: | |||
1273 | * Check, whether we can keep the large page intact | 1522 | * Check, whether we can keep the large page intact |
1274 | * and just change the pte: | 1523 | * and just change the pte: |
1275 | */ | 1524 | */ |
1276 | do_split = try_preserve_large_page(kpte, address, cpa); | 1525 | do_split = should_split_large_page(kpte, address, cpa); |
1277 | /* | 1526 | /* |
1278 | * When the range fits into the existing large page, | 1527 | * When the range fits into the existing large page, |
1279 | * return. cp->numpages and cpa->tlbflush have been updated in | 1528 | * return. cp->numpages and cpa->tlbflush have been updated in |
@@ -1286,28 +1535,8 @@ repeat: | |||
1286 | * We have to split the large page: | 1535 | * We have to split the large page: |
1287 | */ | 1536 | */ |
1288 | err = split_large_page(cpa, kpte, address); | 1537 | err = split_large_page(cpa, kpte, address); |
1289 | if (!err) { | 1538 | if (!err) |
1290 | /* | ||
1291 | * Do a global flush tlb after splitting the large page | ||
1292 | * and before we do the actual change page attribute in the PTE. | ||
1293 | * | ||
1294 | * With out this, we violate the TLB application note, that says | ||
1295 | * "The TLBs may contain both ordinary and large-page | ||
1296 | * translations for a 4-KByte range of linear addresses. This | ||
1297 | * may occur if software modifies the paging structures so that | ||
1298 | * the page size used for the address range changes. If the two | ||
1299 | * translations differ with respect to page frame or attributes | ||
1300 | * (e.g., permissions), processor behavior is undefined and may | ||
1301 | * be implementation-specific." | ||
1302 | * | ||
1303 | * We do this global tlb flush inside the cpa_lock, so that we | ||
1304 | * don't allow any other cpu, with stale tlb entries change the | ||
1305 | * page attribute in parallel, that also falls into the | ||
1306 | * just split large page entry. | ||
1307 | */ | ||
1308 | flush_tlb_all(); | ||
1309 | goto repeat; | 1539 | goto repeat; |
1310 | } | ||
1311 | 1540 | ||
1312 | return err; | 1541 | return err; |
1313 | } | 1542 | } |
@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, | |||
1529 | cache = !!pgprot2cachemode(mask_set); | 1758 | cache = !!pgprot2cachemode(mask_set); |
1530 | 1759 | ||
1531 | /* | 1760 | /* |
1532 | * On success we use CLFLUSH, when the CPU supports it to | 1761 | * On error; flush everything to be sure. |
1533 | * avoid the WBINVD. If the CPU does not support it and in the | ||
1534 | * error case we fall back to cpa_flush_all (which uses | ||
1535 | * WBINVD): | ||
1536 | */ | 1762 | */ |
1537 | if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { | 1763 | if (ret) { |
1538 | if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { | ||
1539 | cpa_flush_array(addr, numpages, cache, | ||
1540 | cpa.flags, pages); | ||
1541 | } else | ||
1542 | cpa_flush_range(baddr, numpages, cache); | ||
1543 | } else | ||
1544 | cpa_flush_all(cache); | 1764 | cpa_flush_all(cache); |
1765 | goto out; | ||
1766 | } | ||
1767 | |||
1768 | if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { | ||
1769 | cpa_flush_array(baddr, addr, numpages, cache, | ||
1770 | cpa.flags, pages); | ||
1771 | } else { | ||
1772 | cpa_flush_range(baddr, numpages, cache); | ||
1773 | } | ||
1545 | 1774 | ||
1546 | out: | 1775 | out: |
1547 | return ret; | 1776 | return ret; |
@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) | |||
1856 | /* | 2085 | /* |
1857 | * Before changing the encryption attribute, we need to flush caches. | 2086 | * Before changing the encryption attribute, we need to flush caches. |
1858 | */ | 2087 | */ |
1859 | if (static_cpu_has(X86_FEATURE_CLFLUSH)) | 2088 | cpa_flush_range(start, numpages, 1); |
1860 | cpa_flush_range(start, numpages, 1); | ||
1861 | else | ||
1862 | cpa_flush_all(1); | ||
1863 | 2089 | ||
1864 | ret = __change_page_attr_set_clr(&cpa, 1); | 2090 | ret = __change_page_attr_set_clr(&cpa, 1); |
1865 | 2091 | ||
@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) | |||
1870 | * in case TLB flushing gets optimized in the cpa_flush_range() | 2096 | * in case TLB flushing gets optimized in the cpa_flush_range() |
1871 | * path use the same logic as above. | 2097 | * path use the same logic as above. |
1872 | */ | 2098 | */ |
1873 | if (static_cpu_has(X86_FEATURE_CLFLUSH)) | 2099 | cpa_flush_range(start, numpages, 0); |
1874 | cpa_flush_range(start, numpages, 0); | ||
1875 | else | ||
1876 | cpa_flush_all(0); | ||
1877 | 2100 | ||
1878 | return ret; | 2101 | return ret; |
1879 | } | 2102 | } |
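
The flush path is reorganized as well: __cpa_flush_range() falls back to cpa_flush_all() (which may use WBINVD) only when a cache flush is needed on a CPU without CLFLUSH, and it returns true when the caller can skip its per-page clflush loop. A hedged standalone model of that return-value contract:

#include <stdbool.h>
#include <stdio.h>

/* Model of the helper's contract: true means the caller must NOT walk the
 * range with clflush (nothing cached needs flushing, or the wbinvd
 * fallback already handled it). */
static bool cpa_flush_range_model(bool cache, bool cpu_has_clflush)
{
	if (cache && !cpu_has_clflush) {
		/* cpa_flush_all(cache): global flush, wbinvd on every CPU */
		return true;
	}
	/* flush_tlb_kernel_range(start, end) happens here in the kernel */
	return !cache;
}

int main(void)
{
	printf("cache + clflush   -> skip loop? %d\n", cpa_flush_range_model(true, true));
	printf("cache, no clflush -> skip loop? %d\n", cpa_flush_range_model(true, false));
	printf("no cache flush    -> skip loop? %d\n", cpa_flush_range_model(false, true));
	return 0;
}
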
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e96b99eb800c..7d68489cfdb1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | |||
185 | { | 185 | { |
186 | struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); | 186 | struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); |
187 | u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); | 187 | u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); |
188 | bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); | ||
188 | unsigned cpu = smp_processor_id(); | 189 | unsigned cpu = smp_processor_id(); |
189 | u64 next_tlb_gen; | 190 | u64 next_tlb_gen; |
191 | bool need_flush; | ||
192 | u16 new_asid; | ||
190 | 193 | ||
191 | /* | 194 | /* |
192 | * NB: The scheduler will call us with prev == next when switching | 195 | * NB: The scheduler will call us with prev == next when switching |
@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | |||
240 | next->context.ctx_id); | 243 | next->context.ctx_id); |
241 | 244 | ||
242 | /* | 245 | /* |
243 | * We don't currently support having a real mm loaded without | 246 | * Even in lazy TLB mode, the CPU should stay set in the |
244 | * our cpu set in mm_cpumask(). We have all the bookkeeping | 247 | * mm_cpumask. The TLB shootdown code can figure out from |
245 | * in place to figure out whether we would need to flush | 248 | * from cpu_tlbstate.is_lazy whether or not to send an IPI. |
246 | * if our cpu were cleared in mm_cpumask(), but we don't | ||
247 | * currently use it. | ||
248 | */ | 249 | */ |
249 | if (WARN_ON_ONCE(real_prev != &init_mm && | 250 | if (WARN_ON_ONCE(real_prev != &init_mm && |
250 | !cpumask_test_cpu(cpu, mm_cpumask(next)))) | 251 | !cpumask_test_cpu(cpu, mm_cpumask(next)))) |
251 | cpumask_set_cpu(cpu, mm_cpumask(next)); | 252 | cpumask_set_cpu(cpu, mm_cpumask(next)); |
252 | 253 | ||
253 | return; | 254 | /* |
255 | * If the CPU is not in lazy TLB mode, we are just switching | ||
256 | * from one thread in a process to another thread in the same | ||
257 | * process. No TLB flush required. | ||
258 | */ | ||
259 | if (!was_lazy) | ||
260 | return; | ||
261 | |||
262 | /* | ||
263 | * Read the tlb_gen to check whether a flush is needed. | ||
264 | * If the TLB is up to date, just use it. | ||
265 | * The barrier synchronizes with the tlb_gen increment in | ||
266 | * the TLB shootdown code. | ||
267 | */ | ||
268 | smp_mb(); | ||
269 | next_tlb_gen = atomic64_read(&next->context.tlb_gen); | ||
270 | if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == | ||
271 | next_tlb_gen) | ||
272 | return; | ||
273 | |||
274 | /* | ||
275 | * TLB contents went out of date while we were in lazy | ||
276 | * mode. Fall through to the TLB switching code below. | ||
277 | */ | ||
278 | new_asid = prev_asid; | ||
279 | need_flush = true; | ||
254 | } else { | 280 | } else { |
255 | u16 new_asid; | ||
256 | bool need_flush; | ||
257 | u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); | 281 | u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); |
258 | 282 | ||
259 | /* | 283 | /* |
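
The lazy-TLB fast path above boils down to: if this CPU stayed on the same mm and was not lazy, do nothing; if it was lazy, compare the ASID's cached tlb_gen with the mm's current tlb_gen and only fall through to a flush when they differ. A hedged standalone model of that check (the atomics, barriers and per-CPU state are simulated with plain variables):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Decide whether switching back to the same mm needs a TLB flush. */
static bool lazy_switch_needs_flush(bool was_lazy, uint64_t cpu_tlb_gen,
				    uint64_t mm_tlb_gen)
{
	if (!was_lazy)			/* plain thread switch: nothing to do */
		return false;
	return cpu_tlb_gen != mm_tlb_gen; /* missed flushes while lazy? */
}

int main(void)
{
	printf("%d\n", lazy_switch_needs_flush(false, 5, 7)); /* 0: not lazy */
	printf("%d\n", lazy_switch_needs_flush(true, 7, 7));  /* 0: up to date */
	printf("%d\n", lazy_switch_needs_flush(true, 5, 7));  /* 1: must flush */
	return 0;
}
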
@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | |||
308 | /* Let nmi_uaccess_okay() know that we're changing CR3. */ | 332 | /* Let nmi_uaccess_okay() know that we're changing CR3. */ |
309 | this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); | 333 | this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); |
310 | barrier(); | 334 | barrier(); |
335 | } | ||
311 | 336 | ||
312 | if (need_flush) { | 337 | if (need_flush) { |
313 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); | 338 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); |
314 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); | 339 | this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); |
315 | load_new_mm_cr3(next->pgd, new_asid, true); | 340 | load_new_mm_cr3(next->pgd, new_asid, true); |
316 | |||
317 | /* | ||
318 | * NB: This gets called via leave_mm() in the idle path | ||
319 | * where RCU functions differently. Tracing normally | ||
320 | * uses RCU, so we need to use the _rcuidle variant. | ||
321 | * | ||
322 | * (There is no good reason for this. The idle code should | ||
323 | * be rearranged to call this before rcu_idle_enter().) | ||
324 | */ | ||
325 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
326 | } else { | ||
327 | /* The new ASID is already up to date. */ | ||
328 | load_new_mm_cr3(next->pgd, new_asid, false); | ||
329 | |||
330 | /* See above wrt _rcuidle. */ | ||
331 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); | ||
332 | } | ||
333 | 341 | ||
334 | /* | 342 | /* |
335 | * Record last user mm's context id, so we can avoid | 343 | * NB: This gets called via leave_mm() in the idle path |
336 | * flushing branch buffer with IBPB if we switch back | 344 | * where RCU functions differently. Tracing normally |
337 | * to the same user. | 345 | * uses RCU, so we need to use the _rcuidle variant. |
346 | * | ||
347 | * (There is no good reason for this. The idle code should | ||
348 | * be rearranged to call this before rcu_idle_enter().) | ||
338 | */ | 349 | */ |
339 | if (next != &init_mm) | 350 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
340 | this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); | 351 | } else { |
341 | 352 | /* The new ASID is already up to date. */ | |
342 | /* Make sure we write CR3 before loaded_mm. */ | 353 | load_new_mm_cr3(next->pgd, new_asid, false); |
343 | barrier(); | ||
344 | 354 | ||
345 | this_cpu_write(cpu_tlbstate.loaded_mm, next); | 355 | /* See above wrt _rcuidle. */ |
346 | this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); | 356 | trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); |
347 | } | 357 | } |
348 | 358 | ||
349 | load_mm_cr4(next); | 359 | /* |
350 | switch_ldt(real_prev, next); | 360 | * Record last user mm's context id, so we can avoid |
361 | * flushing branch buffer with IBPB if we switch back | ||
362 | * to the same user. | ||
363 | */ | ||
364 | if (next != &init_mm) | ||
365 | this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); | ||
366 | |||
367 | /* Make sure we write CR3 before loaded_mm. */ | ||
368 | barrier(); | ||
369 | |||
370 | this_cpu_write(cpu_tlbstate.loaded_mm, next); | ||
371 | this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); | ||
372 | |||
373 | if (next != real_prev) { | ||
374 | load_mm_cr4(next); | ||
375 | switch_ldt(real_prev, next); | ||
376 | } | ||
351 | } | 377 | } |
352 | 378 | ||
353 | /* | 379 | /* |
@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) | |||
368 | if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) | 394 | if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) |
369 | return; | 395 | return; |
370 | 396 | ||
371 | if (tlb_defer_switch_to_init_mm()) { | 397 | this_cpu_write(cpu_tlbstate.is_lazy, true); |
372 | /* | ||
373 | * There's a significant optimization that may be possible | ||
374 | * here. We have accurate enough TLB flush tracking that we | ||
375 | * don't need to maintain coherence of TLB per se when we're | ||
376 | * lazy. We do, however, need to maintain coherence of | ||
377 | * paging-structure caches. We could, in principle, leave our | ||
378 | * old mm loaded and only switch to init_mm when | ||
379 | * tlb_remove_page() happens. | ||
380 | */ | ||
381 | this_cpu_write(cpu_tlbstate.is_lazy, true); | ||
382 | } else { | ||
383 | switch_mm(NULL, &init_mm, NULL); | ||
384 | } | ||
385 | } | 398 | } |
386 | 399 | ||
387 | /* | 400 | /* |
@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, | |||
468 | * paging-structure cache to avoid speculatively reading | 481 | * paging-structure cache to avoid speculatively reading |
469 | * garbage into our TLB. Since switching to init_mm is barely | 482 | * garbage into our TLB. Since switching to init_mm is barely |
470 | * slower than a minimal flush, just switch to init_mm. | 483 | * slower than a minimal flush, just switch to init_mm. |
484 | * | ||
485 | * This should be rare, with native_flush_tlb_others skipping | ||
486 | * IPIs to lazy TLB mode CPUs. | ||
471 | */ | 487 | */ |
472 | switch_mm_irqs_off(NULL, &init_mm, NULL); | 488 | switch_mm_irqs_off(NULL, &init_mm, NULL); |
473 | return; | 489 | return; |
@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, | |||
528 | f->new_tlb_gen == local_tlb_gen + 1 && | 544 | f->new_tlb_gen == local_tlb_gen + 1 && |
529 | f->new_tlb_gen == mm_tlb_gen) { | 545 | f->new_tlb_gen == mm_tlb_gen) { |
530 | /* Partial flush */ | 546 | /* Partial flush */ |
531 | unsigned long addr; | 547 | unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; |
532 | unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; | 548 | unsigned long addr = f->start; |
533 | 549 | ||
534 | addr = f->start; | ||
535 | while (addr < f->end) { | 550 | while (addr < f->end) { |
536 | __flush_tlb_one_user(addr); | 551 | __flush_tlb_one_user(addr); |
537 | addr += PAGE_SIZE; | 552 | addr += 1UL << f->stride_shift; |
538 | } | 553 | } |
539 | if (local) | 554 | if (local) |
540 | count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); | 555 | count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); |
541 | trace_tlb_flush(reason, nr_pages); | 556 | trace_tlb_flush(reason, nr_invalidate); |
542 | } else { | 557 | } else { |
543 | /* Full flush. */ | 558 | /* Full flush. */ |
544 | local_flush_tlb(); | 559 | local_flush_tlb(); |
@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info) | |||
571 | flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); | 586 | flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); |
572 | } | 587 | } |
573 | 588 | ||
589 | static bool tlb_is_not_lazy(int cpu, void *data) | ||
590 | { | ||
591 | return !per_cpu(cpu_tlbstate.is_lazy, cpu); | ||
592 | } | ||
593 | |||
574 | void native_flush_tlb_others(const struct cpumask *cpumask, | 594 | void native_flush_tlb_others(const struct cpumask *cpumask, |
575 | const struct flush_tlb_info *info) | 595 | const struct flush_tlb_info *info) |
576 | { | 596 | { |
@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
606 | (void *)info, 1); | 626 | (void *)info, 1); |
607 | return; | 627 | return; |
608 | } | 628 | } |
609 | smp_call_function_many(cpumask, flush_tlb_func_remote, | 629 | |
630 | /* | ||
631 | * If no page tables were freed, we can skip sending IPIs to | ||
632 | * CPUs in lazy TLB mode. They will flush the CPU themselves | ||
633 | * at the next context switch. | ||
634 | * | ||
635 | * However, if page tables are getting freed, we need to send the | ||
636 | * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping | ||
637 | * up on the new contents of what used to be page tables, while | ||
638 | * doing a speculative memory access. | ||
639 | */ | ||
640 | if (info->freed_tables) | ||
641 | smp_call_function_many(cpumask, flush_tlb_func_remote, | ||
610 | (void *)info, 1); | 642 | (void *)info, 1); |
643 | else | ||
644 | on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, | ||
645 | (void *)info, 1, GFP_ATOMIC, cpumask); | ||
611 | } | 646 | } |
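The comment and branch above send the flush IPI to every target CPU only when page tables were freed; otherwise CPUs in lazy TLB mode are skipped and reconcile at their next context switch. A hedged sketch of that decision, using a toy per-CPU flag in place of cpu_tlbstate.is_lazy and looping over all CPUs as if they were all in the target mask.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Toy stand-in for the per-CPU cpu_tlbstate.is_lazy flag. */
static bool cpu_is_lazy[NR_CPUS] = { false, true, false, true };

/* Stand-in for the remote flush function: just record the IPI. */
static void flush_ipi(int cpu)
{
        printf("  IPI -> cpu %d\n", cpu);
}

/* Mirrors the freed_tables decision shown in the hunk above. */
static void flush_others(bool freed_tables)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!freed_tables && cpu_is_lazy[cpu])
                        continue;   /* lazy CPU catches up at its next switch_mm() */
                flush_ipi(cpu);
        }
}

int main(void)
{
        printf("range flush, no tables freed (lazy CPUs skipped):\n");
        flush_others(false);
        printf("page tables freed (every CPU gets the IPI):\n");
        flush_others(true);
        return 0;
}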
612 | 647 | ||
613 | /* | 648 | /* |
@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
623 | static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; | 658 | static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; |
624 | 659 | ||
625 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 660 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
626 | unsigned long end, unsigned long vmflag) | 661 | unsigned long end, unsigned int stride_shift, |
662 | bool freed_tables) | ||
627 | { | 663 | { |
628 | int cpu; | 664 | int cpu; |
629 | 665 | ||
630 | struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { | 666 | struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { |
631 | .mm = mm, | 667 | .mm = mm, |
668 | .stride_shift = stride_shift, | ||
669 | .freed_tables = freed_tables, | ||
632 | }; | 670 | }; |
633 | 671 | ||
634 | cpu = get_cpu(); | 672 | cpu = get_cpu(); |
@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | |||
638 | 676 | ||
639 | /* Should we flush just the requested range? */ | 677 | /* Should we flush just the requested range? */ |
640 | if ((end != TLB_FLUSH_ALL) && | 678 | if ((end != TLB_FLUSH_ALL) && |
641 | !(vmflag & VM_HUGETLB) && | 679 | ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { |
642 | ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { | ||
643 | info.start = start; | 680 | info.start = start; |
644 | info.end = end; | 681 | info.end = end; |
645 | } else { | 682 | } else { |
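With the VM_HUGETLB special case gone, the ranged-vs-full decision above depends only on how many stride-sized entries the range covers, compared against tlb_single_page_flush_ceiling (33 by default). A rough sketch of that heuristic; the end != TLB_FLUSH_ALL check is omitted and want_ranged_flush() is an invented name.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12u
#define PMD_SHIFT  21u

/* Default ceiling taken from the hunk above. */
static unsigned long tlb_single_page_flush_ceiling = 33;

/* True when flushing the range entry by entry beats a full TLB flush. */
static bool want_ranged_flush(unsigned long start, unsigned long end,
                              unsigned int stride_shift)
{
        return ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling;
}

int main(void)
{
        /* 256 KiB of 4 KiB pages: 64 entries, above the ceiling -> full flush. */
        printf("256 KiB, 4 KiB stride: %s\n",
               want_ranged_flush(0, 256 * 1024, PAGE_SHIFT) ? "ranged" : "full");

        /* 16 MiB of 2 MiB pages: 8 entries, under the ceiling -> ranged flush. */
        printf("16 MiB, 2 MiB stride: %s\n",
               want_ranged_flush(0, 16 * 1024 * 1024, PMD_SHIFT) ? "ranged" : "full");
        return 0;
}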
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index e3b18ad49889..145506f9fdbe 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/tick.h> | 22 | #include <linux/tick.h> |
23 | #include <linux/nmi.h> | 23 | #include <linux/nmi.h> |
24 | #include <linux/cpuhotplug.h> | 24 | #include <linux/cpuhotplug.h> |
25 | #include <linux/stackprotector.h> | ||
25 | 26 | ||
26 | #include <asm/paravirt.h> | 27 | #include <asm/paravirt.h> |
27 | #include <asm/desc.h> | 28 | #include <asm/desc.h> |
@@ -88,6 +89,7 @@ static void cpu_bringup(void) | |||
88 | asmlinkage __visible void cpu_bringup_and_idle(void) | 89 | asmlinkage __visible void cpu_bringup_and_idle(void) |
89 | { | 90 | { |
90 | cpu_bringup(); | 91 | cpu_bringup(); |
92 | boot_init_stack_canary(); | ||
91 | cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); | 93 | cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); |
92 | } | 94 | } |
93 | 95 | ||
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 84b3e4445d46..3931c7de7c69 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c | |||
@@ -902,12 +902,22 @@ static bool copy_device_table(void) | |||
902 | } | 902 | } |
903 | } | 903 | } |
904 | 904 | ||
905 | old_devtb_phys = entry & PAGE_MASK; | 905 | /* |
906 | * When SME is enabled in the first kernel, the entry includes the | ||
907 | * memory encryption mask (sme_me_mask); we must remove it to obtain | ||
908 | * the true physical address in the kdump kernel. | ||
909 | */ | ||
910 | old_devtb_phys = __sme_clr(entry) & PAGE_MASK; | ||
911 | |||
906 | if (old_devtb_phys >= 0x100000000ULL) { | 912 | if (old_devtb_phys >= 0x100000000ULL) { |
907 | pr_err("The address of old device table is above 4G, not trustworthy!\n"); | 913 | pr_err("The address of old device table is above 4G, not trustworthy!\n"); |
908 | return false; | 914 | return false; |
909 | } | 915 | } |
910 | old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); | 916 | old_devtb = (sme_active() && is_kdump_kernel()) |
917 | ? (__force void *)ioremap_encrypted(old_devtb_phys, | ||
918 | dev_table_size) | ||
919 | : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); | ||
920 | |||
911 | if (!old_devtb) | 921 | if (!old_devtb) |
912 | return false; | 922 | return false; |
913 | 923 | ||
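The hunk above strips the SME encryption mask from the saved device-table entry before using it as a physical address, and maps the old table encrypted when the kdump kernel runs with SME active. A tiny sketch of the mask-clearing step; the bit position used here (47) is purely hypothetical, since the real sme_me_mask is built at boot from a CPU-reported bit, and the sketch assumes a 64-bit unsigned long.

#include <stdio.h>

#define PAGE_MASK (~0xfffUL)

/* Hypothetical encryption bit; the real sme_me_mask is derived at boot from a
 * CPU-reported bit position. Assumes 64-bit unsigned long. */
static const unsigned long sme_me_mask = 1UL << 47;

/* Mirrors the idea of __sme_clr(): drop the encryption bit from an address. */
static unsigned long sme_clr(unsigned long paddr)
{
        return paddr & ~sme_me_mask;
}

int main(void)
{
        /* A device-table entry as the first kernel left it:
         * physical address | encryption bit | low flag bits. */
        unsigned long entry = (1UL << 47) | 0x12345000UL | 0x3UL;
        unsigned long old_devtb_phys = sme_clr(entry) & PAGE_MASK;

        printf("entry          = %#lx\n", entry);
        printf("old_devtb_phys = %#lx\n", old_devtb_phys);  /* 0x12345000 */
        return 0;
}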
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cbde728f8ac6..91ae16fbd7d5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c | |||
@@ -24,6 +24,8 @@ | |||
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <linux/uaccess.h> | 26 | #include <linux/uaccess.h> |
27 | #include <linux/mem_encrypt.h> | ||
28 | #include <asm/pgtable.h> | ||
27 | #include <asm/io.h> | 29 | #include <asm/io.h> |
28 | #include "internal.h" | 30 | #include "internal.h" |
29 | 31 | ||
@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn) | |||
98 | 100 | ||
99 | /* Reads a page from the oldmem device from given offset. */ | 101 | /* Reads a page from the oldmem device from given offset. */ |
100 | static ssize_t read_from_oldmem(char *buf, size_t count, | 102 | static ssize_t read_from_oldmem(char *buf, size_t count, |
101 | u64 *ppos, int userbuf) | 103 | u64 *ppos, int userbuf, |
104 | bool encrypted) | ||
102 | { | 105 | { |
103 | unsigned long pfn, offset; | 106 | unsigned long pfn, offset; |
104 | size_t nr_bytes; | 107 | size_t nr_bytes; |
@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, | |||
120 | if (pfn_is_ram(pfn) == 0) | 123 | if (pfn_is_ram(pfn) == 0) |
121 | memset(buf, 0, nr_bytes); | 124 | memset(buf, 0, nr_bytes); |
122 | else { | 125 | else { |
123 | tmp = copy_oldmem_page(pfn, buf, nr_bytes, | 126 | if (encrypted) |
124 | offset, userbuf); | 127 | tmp = copy_oldmem_page_encrypted(pfn, buf, |
128 | nr_bytes, | ||
129 | offset, | ||
130 | userbuf); | ||
131 | else | ||
132 | tmp = copy_oldmem_page(pfn, buf, nr_bytes, | ||
133 | offset, userbuf); | ||
134 | |||
125 | if (tmp < 0) | 135 | if (tmp < 0) |
126 | return tmp; | 136 | return tmp; |
127 | } | 137 | } |
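read_from_oldmem() now carries an encrypted flag down to the per-chunk copy, choosing copy_oldmem_page_encrypted() when the dump was written by an SME-encrypted kernel. Below is a rough user-space model of the page-wise split and dispatch; the copy helpers, the PAGE_SIZE value, and the clamping details are illustrative approximations, not the kernel's code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Stand-ins for copy_oldmem_page() / copy_oldmem_page_encrypted(). */
static long copy_plain(unsigned long pfn, size_t n, unsigned long off)
{
        printf("  plain copy:     pfn=%lu off=%lu len=%zu\n", pfn, off, n);
        return (long)n;
}

static long copy_encrypted(unsigned long pfn, size_t n, unsigned long off)
{
        printf("  encrypted copy: pfn=%lu off=%lu len=%zu\n", pfn, off, n);
        return (long)n;
}

/* Rough model of the chunking loop: split [pos, pos + count) on page
 * boundaries and dispatch on the encrypted flag for each chunk. */
static long read_oldmem(size_t count, unsigned long pos, bool encrypted)
{
        long read = 0;

        while (count) {
                unsigned long pfn = pos / PAGE_SIZE;
                unsigned long offset = pos % PAGE_SIZE;
                size_t nr_bytes = count > PAGE_SIZE - offset ? PAGE_SIZE - offset
                                                             : count;
                long tmp = encrypted ? copy_encrypted(pfn, nr_bytes, offset)
                                     : copy_plain(pfn, nr_bytes, offset);

                if (tmp < 0)
                        return tmp;
                read += tmp;
                pos += nr_bytes;
                count -= nr_bytes;
        }
        return read;
}

int main(void)
{
        /* 9000 bytes starting 100 bytes into page 1: three chunks. */
        printf("read %ld bytes\n", read_oldmem(9000, 4096 + 100, true));
        return 0;
}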
@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr) | |||
155 | */ | 165 | */ |
156 | ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) | 166 | ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) |
157 | { | 167 | { |
158 | return read_from_oldmem(buf, count, ppos, 0); | 168 | return read_from_oldmem(buf, count, ppos, 0, false); |
159 | } | 169 | } |
160 | 170 | ||
161 | /* | 171 | /* |
@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) | |||
163 | */ | 173 | */ |
164 | ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) | 174 | ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) |
165 | { | 175 | { |
166 | return read_from_oldmem(buf, count, ppos, 0); | 176 | return read_from_oldmem(buf, count, ppos, 0, sme_active()); |
167 | } | 177 | } |
168 | 178 | ||
169 | /* | 179 | /* |
@@ -173,10 +183,21 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, | |||
173 | unsigned long from, unsigned long pfn, | 183 | unsigned long from, unsigned long pfn, |
174 | unsigned long size, pgprot_t prot) | 184 | unsigned long size, pgprot_t prot) |
175 | { | 185 | { |
186 | prot = pgprot_encrypted(prot); | ||
176 | return remap_pfn_range(vma, from, pfn, size, prot); | 187 | return remap_pfn_range(vma, from, pfn, size, prot); |
177 | } | 188 | } |
178 | 189 | ||
179 | /* | 190 | /* |
191 | * Architectures which support memory encryption override this. | ||
192 | */ | ||
193 | ssize_t __weak | ||
194 | copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, | ||
195 | unsigned long offset, int userbuf) | ||
196 | { | ||
197 | return copy_oldmem_page(pfn, buf, csize, offset, userbuf); | ||
198 | } | ||
199 | |||
200 | /* | ||
180 | * Copy to either kernel or user space | 201 | * Copy to either kernel or user space |
181 | */ | 202 | */ |
182 | static int copy_to(void *target, void *src, size_t size, int userbuf) | 203 | static int copy_to(void *target, void *src, size_t size, int userbuf) |
@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, | |||
351 | m->offset + m->size - *fpos, | 372 | m->offset + m->size - *fpos, |
352 | buflen); | 373 | buflen); |
353 | start = m->paddr + *fpos - m->offset; | 374 | start = m->paddr + *fpos - m->offset; |
354 | tmp = read_from_oldmem(buffer, tsz, &start, userbuf); | 375 | tmp = read_from_oldmem(buffer, tsz, &start, |
376 | userbuf, sme_active()); | ||
355 | if (tmp < 0) | 377 | if (tmp < 0) |
356 | return tmp; | 378 | return tmp; |
357 | buflen -= tsz; | 379 | buflen -= tsz; |
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 3e4ba9d753c8..f774c5eb9e3c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h | |||
@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, | |||
26 | 26 | ||
27 | extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, | 27 | extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, |
28 | unsigned long, int); | 28 | unsigned long, int); |
29 | extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, | ||
30 | size_t csize, unsigned long offset, | ||
31 | int userbuf); | ||
32 | |||
29 | void vmcore_cleanup(void); | 33 | void vmcore_cleanup(void); |
30 | 34 | ||
31 | /* Architecture code defines this if there are other possible ELF | 35 | /* Architecture code defines this if there are other possible ELF |
diff --git a/include/linux/smp.h b/include/linux/smp.h index 9fb239e12b82..a56f08ff3097 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h | |||
@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
53 | smp_call_func_t func, void *info, bool wait, | 53 | smp_call_func_t func, void *info, bool wait, |
54 | gfp_t gfp_flags); | 54 | gfp_t gfp_flags); |
55 | 55 | ||
56 | void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), | ||
57 | smp_call_func_t func, void *info, bool wait, | ||
58 | gfp_t gfp_flags, const struct cpumask *mask); | ||
59 | |||
56 | int smp_call_function_single_async(int cpu, call_single_data_t *csd); | 60 | int smp_call_function_single_async(int cpu, call_single_data_t *csd); |
57 | 61 | ||
58 | #ifdef CONFIG_SMP | 62 | #ifdef CONFIG_SMP |
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 23a83a4da38a..86ef06d3dbe3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | |||
471 | } | 471 | } |
472 | } | 472 | } |
473 | 473 | ||
474 | /* Ensure that these pages are decrypted if SME is enabled. */ | ||
475 | if (pages) | ||
476 | arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); | ||
477 | |||
474 | return pages; | 478 | return pages; |
475 | } | 479 | } |
476 | 480 | ||
@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
867 | result = -ENOMEM; | 871 | result = -ENOMEM; |
868 | goto out; | 872 | goto out; |
869 | } | 873 | } |
874 | arch_kexec_post_alloc_pages(page_address(page), 1, 0); | ||
870 | ptr = kmap(page); | 875 | ptr = kmap(page); |
871 | ptr += maddr & ~PAGE_MASK; | 876 | ptr += maddr & ~PAGE_MASK; |
872 | mchunk = min_t(size_t, mbytes, | 877 | mchunk = min_t(size_t, mbytes, |
@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
884 | result = copy_from_user(ptr, buf, uchunk); | 889 | result = copy_from_user(ptr, buf, uchunk); |
885 | kexec_flush_icache_page(page); | 890 | kexec_flush_icache_page(page); |
886 | kunmap(page); | 891 | kunmap(page); |
892 | arch_kexec_pre_free_pages(page_address(page), 1); | ||
887 | if (result) { | 893 | if (result) { |
888 | result = -EFAULT; | 894 | result = -EFAULT; |
889 | goto out; | 895 | goto out; |
diff --git a/kernel/resource.c b/kernel/resource.c index 30e1bc68503b..b3a3a1fc499e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -318,33 +318,34 @@ int release_resource(struct resource *old) | |||
318 | 318 | ||
319 | EXPORT_SYMBOL(release_resource); | 319 | EXPORT_SYMBOL(release_resource); |
320 | 320 | ||
321 | /* | 321 | /** |
322 | * Finds the lowest iomem resource existing within [res->start.res->end). | 322 | * Finds the lowest iomem resource that covers part of [start..end]. The |
323 | * The caller must specify res->start, res->end, res->flags, and optionally | 323 | * caller must specify start, end, flags, and desc (which may be |
324 | * desc. If found, returns 0, res is overwritten, if not found, returns -1. | 324 | * IORES_DESC_NONE). |
325 | * This function walks the whole tree and not just first level children until | 325 | * |
326 | * and unless first_level_children_only is true. | 326 | * If a resource is found, returns 0 and *res is overwritten with the part |
327 | * of the resource that's within [start..end]; if none is found, returns | ||
328 | * -1. | ||
329 | * | ||
330 | * This function walks the whole tree and not just first level children | ||
331 | * unless @first_lvl is true. | ||
327 | */ | 332 | */ |
328 | static int find_next_iomem_res(struct resource *res, unsigned long desc, | 333 | static int find_next_iomem_res(resource_size_t start, resource_size_t end, |
329 | bool first_level_children_only) | 334 | unsigned long flags, unsigned long desc, |
335 | bool first_lvl, struct resource *res) | ||
330 | { | 336 | { |
331 | resource_size_t start, end; | ||
332 | struct resource *p; | 337 | struct resource *p; |
333 | bool sibling_only = false; | ||
334 | 338 | ||
335 | BUG_ON(!res); | 339 | if (!res) |
336 | 340 | return -EINVAL; | |
337 | start = res->start; | ||
338 | end = res->end; | ||
339 | BUG_ON(start >= end); | ||
340 | 341 | ||
341 | if (first_level_children_only) | 342 | if (start >= end) |
342 | sibling_only = true; | 343 | return -EINVAL; |
343 | 344 | ||
344 | read_lock(&resource_lock); | 345 | read_lock(&resource_lock); |
345 | 346 | ||
346 | for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { | 347 | for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) { |
347 | if ((p->flags & res->flags) != res->flags) | 348 | if ((p->flags & flags) != flags) |
348 | continue; | 349 | continue; |
349 | if ((desc != IORES_DESC_NONE) && (desc != p->desc)) | 350 | if ((desc != IORES_DESC_NONE) && (desc != p->desc)) |
350 | continue; | 351 | continue; |
@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, | |||
352 | p = NULL; | 353 | p = NULL; |
353 | break; | 354 | break; |
354 | } | 355 | } |
355 | if ((p->end >= start) && (p->start < end)) | 356 | if ((p->end >= start) && (p->start <= end)) |
356 | break; | 357 | break; |
357 | } | 358 | } |
358 | 359 | ||
359 | read_unlock(&resource_lock); | 360 | read_unlock(&resource_lock); |
360 | if (!p) | 361 | if (!p) |
361 | return -1; | 362 | return -1; |
363 | |||
362 | /* copy data */ | 364 | /* copy data */ |
363 | if (res->start < p->start) | 365 | res->start = max(start, p->start); |
364 | res->start = p->start; | 366 | res->end = min(end, p->end); |
365 | if (res->end > p->end) | ||
366 | res->end = p->end; | ||
367 | res->flags = p->flags; | 367 | res->flags = p->flags; |
368 | res->desc = p->desc; | 368 | res->desc = p->desc; |
369 | return 0; | 369 | return 0; |
370 | } | 370 | } |
371 | 371 | ||
372 | static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, | 372 | static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, |
373 | bool first_level_children_only, | 373 | unsigned long flags, unsigned long desc, |
374 | void *arg, | 374 | bool first_lvl, void *arg, |
375 | int (*func)(struct resource *, void *)) | 375 | int (*func)(struct resource *, void *)) |
376 | { | 376 | { |
377 | u64 orig_end = res->end; | 377 | struct resource res; |
378 | int ret = -1; | 378 | int ret = -1; |
379 | 379 | ||
380 | while ((res->start < res->end) && | 380 | while (start < end && |
381 | !find_next_iomem_res(res, desc, first_level_children_only)) { | 381 | !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { |
382 | ret = (*func)(res, arg); | 382 | ret = (*func)(&res, arg); |
383 | if (ret) | 383 | if (ret) |
384 | break; | 384 | break; |
385 | 385 | ||
386 | res->start = res->end + 1; | 386 | start = res.end + 1; |
387 | res->end = orig_end; | ||
388 | } | 387 | } |
389 | 388 | ||
390 | return ret; | 389 | return ret; |
391 | } | 390 | } |
392 | 391 | ||
393 | /* | 392 | /** |
394 | * Walks through iomem resources and calls func() with matching resource | 393 | * Walks through iomem resources and calls func() with matching resource |
395 | * ranges. This walks through whole tree and not just first level children. | 394 | * ranges. This walks through whole tree and not just first level children. |
396 | * All the memory ranges which overlap start,end and also match flags and | 395 | * All the memory ranges which overlap start,end and also match flags and |
@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, | |||
407 | int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, | 406 | int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, |
408 | u64 end, void *arg, int (*func)(struct resource *, void *)) | 407 | u64 end, void *arg, int (*func)(struct resource *, void *)) |
409 | { | 408 | { |
410 | struct resource res; | 409 | return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); |
411 | |||
412 | res.start = start; | ||
413 | res.end = end; | ||
414 | res.flags = flags; | ||
415 | |||
416 | return __walk_iomem_res_desc(&res, desc, false, arg, func); | ||
417 | } | 410 | } |
418 | EXPORT_SYMBOL_GPL(walk_iomem_res_desc); | 411 | EXPORT_SYMBOL_GPL(walk_iomem_res_desc); |
419 | 412 | ||
@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc); | |||
425 | * ranges. | 418 | * ranges. |
426 | */ | 419 | */ |
427 | int walk_system_ram_res(u64 start, u64 end, void *arg, | 420 | int walk_system_ram_res(u64 start, u64 end, void *arg, |
428 | int (*func)(struct resource *, void *)) | 421 | int (*func)(struct resource *, void *)) |
429 | { | 422 | { |
430 | struct resource res; | 423 | unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; |
431 | 424 | ||
432 | res.start = start; | 425 | return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, |
433 | res.end = end; | ||
434 | res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; | ||
435 | |||
436 | return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, | ||
437 | arg, func); | 426 | arg, func); |
438 | } | 427 | } |
439 | 428 | ||
@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, | |||
444 | int walk_mem_res(u64 start, u64 end, void *arg, | 433 | int walk_mem_res(u64 start, u64 end, void *arg, |
445 | int (*func)(struct resource *, void *)) | 434 | int (*func)(struct resource *, void *)) |
446 | { | 435 | { |
447 | struct resource res; | 436 | unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
448 | |||
449 | res.start = start; | ||
450 | res.end = end; | ||
451 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
452 | 437 | ||
453 | return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, | 438 | return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, |
454 | arg, func); | 439 | arg, func); |
455 | } | 440 | } |
456 | 441 | ||
@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg, | |||
462 | * It is to be used only for System RAM. | 447 | * It is to be used only for System RAM. |
463 | */ | 448 | */ |
464 | int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | 449 | int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, |
465 | void *arg, int (*func)(unsigned long, unsigned long, void *)) | 450 | void *arg, int (*func)(unsigned long, unsigned long, void *)) |
466 | { | 451 | { |
452 | resource_size_t start, end; | ||
453 | unsigned long flags; | ||
467 | struct resource res; | 454 | struct resource res; |
468 | unsigned long pfn, end_pfn; | 455 | unsigned long pfn, end_pfn; |
469 | u64 orig_end; | ||
470 | int ret = -1; | 456 | int ret = -1; |
471 | 457 | ||
472 | res.start = (u64) start_pfn << PAGE_SHIFT; | 458 | start = (u64) start_pfn << PAGE_SHIFT; |
473 | res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; | 459 | end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; |
474 | res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; | 460 | flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; |
475 | orig_end = res.end; | 461 | while (start < end && |
476 | while ((res.start < res.end) && | 462 | !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, |
477 | (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { | 463 | true, &res)) { |
478 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; | 464 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
479 | end_pfn = (res.end + 1) >> PAGE_SHIFT; | 465 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
480 | if (end_pfn > pfn) | 466 | if (end_pfn > pfn) |
481 | ret = (*func)(pfn, end_pfn - pfn, arg); | 467 | ret = (*func)(pfn, end_pfn - pfn, arg); |
482 | if (ret) | 468 | if (ret) |
483 | break; | 469 | break; |
484 | res.start = res.end + 1; | 470 | start = res.end + 1; |
485 | res.end = orig_end; | ||
486 | } | 471 | } |
487 | return ret; | 472 | return ret; |
488 | } | 473 | } |
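find_next_iomem_res() now takes the search window as explicit start/end arguments and hands back the matching resource clipped to that window, so the walkers above can simply advance with start = res.end + 1 instead of saving and restoring an orig_end. A compact user-space model of that clip-and-advance pattern follows; the toy resource table and the find_next_res() helper are invented for illustration.

#include <stdio.h>

struct res { unsigned long start, end; };   /* [start..end], inclusive */

/* Toy "iomem" tree flattened to an array, sorted by start. */
static const struct res iomem[] = {
        { 0x00001000, 0x0009ffff },
        { 0x00100000, 0x1fffffff },
        { 0x40000000, 0x7fffffff },
};

/* Find the lowest entry overlapping [start..end]; clip it into *out. */
static int find_next_res(unsigned long start, unsigned long end, struct res *out)
{
        for (size_t i = 0; i < sizeof(iomem) / sizeof(iomem[0]); i++) {
                const struct res *p = &iomem[i];

                if (p->start > end)
                        break;
                if (p->end >= start && p->start <= end) {
                        out->start = p->start > start ? p->start : start;
                        out->end   = p->end   < end   ? p->end   : end;
                        return 0;
                }
        }
        return -1;
}

int main(void)
{
        unsigned long start = 0x00050000, end = 0x4fffffff;
        struct res r;

        /* Same loop shape as __walk_iomem_res_desc(): clip, consume, advance. */
        while (start < end && !find_next_res(start, end, &r)) {
                printf("range %#lx-%#lx\n", r.start, r.end);
                start = r.end + 1;
        }
        return 0;
}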
@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new, | |||
658 | * @constraint: the size and alignment constraints to be met. | 643 | * @constraint: the size and alignment constraints to be met. |
659 | */ | 644 | */ |
660 | static int reallocate_resource(struct resource *root, struct resource *old, | 645 | static int reallocate_resource(struct resource *root, struct resource *old, |
661 | resource_size_t newsize, | 646 | resource_size_t newsize, |
662 | struct resource_constraint *constraint) | 647 | struct resource_constraint *constraint) |
663 | { | 648 | { |
664 | int err=0; | 649 | int err=0; |
665 | struct resource new = *old; | 650 | struct resource new = *old; |
@@ -972,7 +957,7 @@ skip: | |||
972 | * Existing children of the resource are assumed to be immutable. | 957 | * Existing children of the resource are assumed to be immutable. |
973 | */ | 958 | */ |
974 | int adjust_resource(struct resource *res, resource_size_t start, | 959 | int adjust_resource(struct resource *res, resource_size_t start, |
975 | resource_size_t size) | 960 | resource_size_t size) |
976 | { | 961 | { |
977 | int result; | 962 | int result; |
978 | 963 | ||
@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start, | |||
983 | } | 968 | } |
984 | EXPORT_SYMBOL(adjust_resource); | 969 | EXPORT_SYMBOL(adjust_resource); |
985 | 970 | ||
986 | static void __init __reserve_region_with_split(struct resource *root, | 971 | static void __init |
987 | resource_size_t start, resource_size_t end, | 972 | __reserve_region_with_split(struct resource *root, resource_size_t start, |
988 | const char *name) | 973 | resource_size_t end, const char *name) |
989 | { | 974 | { |
990 | struct resource *parent = root; | 975 | struct resource *parent = root; |
991 | struct resource *conflict; | 976 | struct resource *conflict; |
@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
1044 | 1029 | ||
1045 | } | 1030 | } |
1046 | 1031 | ||
1047 | void __init reserve_region_with_split(struct resource *root, | 1032 | void __init |
1048 | resource_size_t start, resource_size_t end, | 1033 | reserve_region_with_split(struct resource *root, resource_size_t start, |
1049 | const char *name) | 1034 | resource_size_t end, const char *name) |
1050 | { | 1035 | { |
1051 | int abort = 0; | 1036 | int abort = 0; |
1052 | 1037 | ||
@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region); | |||
1172 | * The described resource region must match a currently busy region. | 1157 | * The described resource region must match a currently busy region. |
1173 | */ | 1158 | */ |
1174 | void __release_region(struct resource *parent, resource_size_t start, | 1159 | void __release_region(struct resource *parent, resource_size_t start, |
1175 | resource_size_t n) | 1160 | resource_size_t n) |
1176 | { | 1161 | { |
1177 | struct resource **p; | 1162 | struct resource **p; |
1178 | resource_size_t end; | 1163 | resource_size_t end; |
@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region); | |||
1234 | * simplicity. Enhance this logic when necessary. | 1219 | * simplicity. Enhance this logic when necessary. |
1235 | */ | 1220 | */ |
1236 | int release_mem_region_adjustable(struct resource *parent, | 1221 | int release_mem_region_adjustable(struct resource *parent, |
1237 | resource_size_t start, resource_size_t size) | 1222 | resource_size_t start, resource_size_t size) |
1238 | { | 1223 | { |
1239 | struct resource **p; | 1224 | struct resource **p; |
1240 | struct resource *res; | 1225 | struct resource *res; |
@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data) | |||
1410 | this->start == match->start && this->n == match->n; | 1395 | this->start == match->start && this->n == match->n; |
1411 | } | 1396 | } |
1412 | 1397 | ||
1413 | struct resource * __devm_request_region(struct device *dev, | 1398 | struct resource * |
1414 | struct resource *parent, resource_size_t start, | 1399 | __devm_request_region(struct device *dev, struct resource *parent, |
1415 | resource_size_t n, const char *name) | 1400 | resource_size_t start, resource_size_t n, const char *name) |
1416 | { | 1401 | { |
1417 | struct region_devres *dr = NULL; | 1402 | struct region_devres *dr = NULL; |
1418 | struct resource *res; | 1403 | struct resource *res; |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 16f84142f2f4..f5516bae0c1b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle); | |||
347 | 347 | ||
348 | void cpu_startup_entry(enum cpuhp_state state) | 348 | void cpu_startup_entry(enum cpuhp_state state) |
349 | { | 349 | { |
350 | /* | ||
351 | * This #ifdef needs to die, but it's too late in the cycle to | ||
352 | * make this generic (ARM and SH have never invoked the canary | ||
353 | * init for the non boot CPUs!). Will be fixed in 3.11 | ||
354 | */ | ||
355 | #ifdef CONFIG_X86 | ||
356 | /* | ||
357 | * If we're the non-boot CPU, nothing set the stack canary up | ||
358 | * for us. The boot CPU already has it initialized but no harm | ||
359 | * in doing it again. This is a good place for updating it, as | ||
360 | * we wont ever return from this function (so the invalid | ||
361 | * canaries already on the stack wont ever trigger). | ||
362 | */ | ||
363 | boot_init_stack_canary(); | ||
364 | #endif | ||
365 | arch_cpu_idle_prepare(); | 350 | arch_cpu_idle_prepare(); |
366 | cpuhp_online_idle(state); | 351 | cpuhp_online_idle(state); |
367 | while (1) | 352 | while (1) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc24f2b8c646..b8c007713b3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -56,7 +56,6 @@ | |||
56 | #include <linux/profile.h> | 56 | #include <linux/profile.h> |
57 | #include <linux/rcupdate_wait.h> | 57 | #include <linux/rcupdate_wait.h> |
58 | #include <linux/security.h> | 58 | #include <linux/security.h> |
59 | #include <linux/stackprotector.h> | ||
60 | #include <linux/stop_machine.h> | 59 | #include <linux/stop_machine.h> |
61 | #include <linux/suspend.h> | 60 | #include <linux/suspend.h> |
62 | #include <linux/swait.h> | 61 | #include <linux/swait.h> |
diff --git a/kernel/smp.c b/kernel/smp.c index d86eec5f51c1..163c451af42e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); | |||
669 | * You must not call this function with disabled interrupts or | 669 | * You must not call this function with disabled interrupts or |
670 | * from a hardware interrupt handler or from a bottom half handler. | 670 | * from a hardware interrupt handler or from a bottom half handler. |
671 | */ | 671 | */ |
672 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | 672 | void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), |
673 | smp_call_func_t func, void *info, bool wait, | 673 | smp_call_func_t func, void *info, bool wait, |
674 | gfp_t gfp_flags) | 674 | gfp_t gfp_flags, const struct cpumask *mask) |
675 | { | 675 | { |
676 | cpumask_var_t cpus; | 676 | cpumask_var_t cpus; |
677 | int cpu, ret; | 677 | int cpu, ret; |
@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
680 | 680 | ||
681 | if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { | 681 | if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { |
682 | preempt_disable(); | 682 | preempt_disable(); |
683 | for_each_online_cpu(cpu) | 683 | for_each_cpu(cpu, mask) |
684 | if (cond_func(cpu, info)) | 684 | if (cond_func(cpu, info)) |
685 | cpumask_set_cpu(cpu, cpus); | 685 | __cpumask_set_cpu(cpu, cpus); |
686 | on_each_cpu_mask(cpus, func, info, wait); | 686 | on_each_cpu_mask(cpus, func, info, wait); |
687 | preempt_enable(); | 687 | preempt_enable(); |
688 | free_cpumask_var(cpus); | 688 | free_cpumask_var(cpus); |
@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
692 | * just have to IPI them one by one. | 692 | * just have to IPI them one by one. |
693 | */ | 693 | */ |
694 | preempt_disable(); | 694 | preempt_disable(); |
695 | for_each_online_cpu(cpu) | 695 | for_each_cpu(cpu, mask) |
696 | if (cond_func(cpu, info)) { | 696 | if (cond_func(cpu, info)) { |
697 | ret = smp_call_function_single(cpu, func, | 697 | ret = smp_call_function_single(cpu, func, |
698 | info, wait); | 698 | info, wait); |
@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
701 | preempt_enable(); | 701 | preempt_enable(); |
702 | } | 702 | } |
703 | } | 703 | } |
704 | EXPORT_SYMBOL(on_each_cpu_cond_mask); | ||
705 | |||
706 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | ||
707 | smp_call_func_t func, void *info, bool wait, | ||
708 | gfp_t gfp_flags) | ||
709 | { | ||
710 | on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, | ||
711 | cpu_online_mask); | ||
712 | } | ||
704 | EXPORT_SYMBOL(on_each_cpu_cond); | 713 | EXPORT_SYMBOL(on_each_cpu_cond); |
705 | 714 | ||
706 | static void do_nothing(void *unused) | 715 | static void do_nothing(void *unused) |
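on_each_cpu_cond_mask() is the mask-taking generalization introduced here, with on_each_cpu_cond() reduced to a wrapper that passes cpu_online_mask; the TLB code earlier in this series uses it to IPI only non-lazy CPUs. A small user-space sketch of the predicate-then-call pattern; the flag-array cpumask, the cond_mask_call() name, and both callbacks are invented stand-ins.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Call func() "on" every CPU that is both in the mask and passes cond_func(). */
static void cond_mask_call(bool (*cond_func)(int cpu, void *info),
                           void (*func)(int cpu, void *info),
                           void *info, const bool *mask)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (mask[cpu] && cond_func(cpu, info))
                        func(cpu, info);   /* stands in for the cross-CPU call */
}

/* Invented predicate and callback in the spirit of tlb_is_not_lazy() and
 * flush_tlb_func_remote(). */
static bool is_not_lazy(int cpu, void *info)
{
        static const bool lazy[NR_CPUS] = { false, true, false, true };

        (void)info;
        return !lazy[cpu];
}

static void do_flush(int cpu, void *info)
{
        (void)info;
        printf("flush on cpu %d\n", cpu);
}

int main(void)
{
        bool mask[NR_CPUS] = { true, true, true, false };  /* target CPUs 0-2 */

        cond_mask_call(is_not_lazy, do_flush, NULL, mask);
        /* prints cpu 0 and cpu 2: cpu 1 is lazy, cpu 3 is outside the mask */
        return 0;
}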
diff --git a/kernel/up.c b/kernel/up.c index 42c46bf3e0a5..ff536f9cc8a2 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); | |||
68 | * Preemption is disabled here to make sure the cond_func is called under the | 68 | * Preemption is disabled here to make sure the cond_func is called under the |
69 | * same conditions in UP and SMP. | 69 | * same conditions in UP and SMP. |
70 | */ | 70 | */ |
71 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | 71 | void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), |
72 | smp_call_func_t func, void *info, bool wait, | 72 | smp_call_func_t func, void *info, bool wait, |
73 | gfp_t gfp_flags) | 73 | gfp_t gfp_flags, const struct cpumask *mask) |
74 | { | 74 | { |
75 | unsigned long flags; | 75 | unsigned long flags; |
76 | 76 | ||
@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
82 | } | 82 | } |
83 | preempt_enable(); | 83 | preempt_enable(); |
84 | } | 84 | } |
85 | EXPORT_SYMBOL(on_each_cpu_cond_mask); | ||
86 | |||
87 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | ||
88 | smp_call_func_t func, void *info, bool wait, | ||
89 | gfp_t gfp_flags) | ||
90 | { | ||
91 | on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); | ||
92 | } | ||
85 | EXPORT_SYMBOL(on_each_cpu_cond); | 93 | EXPORT_SYMBOL(on_each_cpu_cond); |
86 | 94 | ||
87 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | 95 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index cf2af04b34b9..532c29276fce 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/pagemap.h> | 10 | #include <linux/pagemap.h> |
11 | #include <linux/hugetlb.h> | ||
11 | #include <asm/tlb.h> | 12 | #include <asm/tlb.h> |
12 | #include <asm-generic/pgtable.h> | 13 | #include <asm-generic/pgtable.h> |
13 | 14 | ||