35 files changed, 1591 insertions, 47 deletions
diff --git a/Documentation/x86/intel_mpx.txt b/Documentation/x86/intel_mpx.txt new file mode 100644 index 000000000000..4472ed2ad921 --- /dev/null +++ b/Documentation/x86/intel_mpx.txt | |||
@@ -0,0 +1,234 @@ | |||
1 | 1. Intel(R) MPX Overview | ||
2 | ======================== | ||
3 | |||
4 | Intel(R) Memory Protection Extensions (Intel(R) MPX) is a new capability | ||
5 | introduced into Intel Architecture. Intel MPX provides hardware features | ||
6 | that can be used in conjunction with compiler changes to check memory | ||
7 | references, catching those references whose compile-time intent is | ||
8 | usurped at runtime due to buffer overflow or underflow. | ||
9 | |||
10 | For more information, please refer to Intel(R) Architecture Instruction | ||
11 | Set Extensions Programming Reference, Chapter 9: Intel(R) Memory Protection | ||
12 | Extensions. | ||
13 | |||
14 | Note: Currently no hardware with MPX ISA is available but it is always | ||
15 | possible to use SDE (Intel(R) Software Development Emulator) instead, which | ||
16 | can be downloaded from | ||
17 | http://software.intel.com/en-us/articles/intel-software-development-emulator | ||
18 | |||
19 | |||
20 | 2. How to get the advantage of MPX | ||
21 | ================================== | ||
22 | |||
23 | For MPX to work, changes are required in the kernel, binutils and compiler. | ||
24 | No source changes are required for applications, just a recompile. | ||
25 | |||
26 | There are a lot of moving parts that all have to work right. The following | ||
27 | is how we expect the compiler, application and kernel to work together. | ||
28 | |||
29 | 1) Application developer compiles with -fmpx. The compiler will add the | ||
30 | instrumentation as well as some setup code called early after the app | ||
31 | starts. New instruction prefixes are noops for old CPUs. | ||
32 | 2) That setup code allocates (virtual) space for the "bounds directory", | ||
33 | points the "bndcfgu" register at the directory and notifies the kernel | ||
34 | (via the new prctl(PR_MPX_ENABLE_MANAGEMENT)) that the app will be using | ||
35 | MPX. A minimal sketch of this setup appears after this list. | ||
36 | 3) The kernel detects that the CPU has MPX, allows the new prctl() to | ||
37 | succeed, and notes the location of the bounds directory. Userspace is | ||
38 | expected to keep the bounds directory at that location. We note it | ||
39 | instead of reading it each time because the 'xsave' operation needed | ||
40 | to access the bounds directory register is an expensive operation. | ||
41 | 4) If the application needs to spill bounds out of the 4 registers, it | ||
42 | issues a bndstx instruction. Since the bounds directory is empty at | ||
43 | this point, a bounds fault (#BR) is raised, the kernel allocates a | ||
44 | bounds table (in the user address space) and makes the relevant entry | ||
45 | in the bounds directory point to the new table. | ||
46 | 5) If the application violates the bounds specified in the bounds registers, | ||
47 | a separate kind of #BR is raised which will deliver a signal with | ||
48 | information about the violation in the 'struct siginfo'. | ||
49 | 6) Whenever memory is freed, we know that it can no longer contain valid | ||
50 | pointers, and we attempt to free the associated space in the bounds | ||
51 | tables. If an entire table becomes unused, we will attempt to free | ||
52 | the table and remove the entry in the directory. | ||
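
To make step 2's kernel notification concrete, here is a minimal, hypothetical userspace sketch. It assumes the PR_MPX_ENABLE_MANAGEMENT value given later in this document (43) and that the unused prctl() arguments must be zero; allocating the bounds directory and loading BNDCFGU are done by the GCC MPX runtime and are not shown:

    #include <stdio.h>
    #include <sys/prctl.h>

    /* Value from section 3 of this document; older headers may lack it. */
    #ifndef PR_MPX_ENABLE_MANAGEMENT
    # define PR_MPX_ENABLE_MANAGEMENT 43
    #endif

    int main(void)
    {
        /*
         * In a real MPX runtime this is called only after the bounds
         * directory has been allocated and BNDCFGU points at it.
         */
        if (prctl(PR_MPX_ENABLE_MANAGEMENT, 0, 0, 0, 0) == -1) {
            perror("PR_MPX_ENABLE_MANAGEMENT");
            return 1;
        }
        /* ... run with kernel-managed bounds tables ... */
        return 0;
    }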
53 | |||
54 | To summarize, there are essentially three things interacting here: | ||
55 | |||
56 | GCC with -fmpx: | ||
57 | * enables annotation of code with MPX instructions and prefixes | ||
58 | * inserts code early in the application to call in to the "gcc runtime" | ||
59 | GCC MPX Runtime: | ||
60 | * Checks for hardware MPX support in cpuid leaf | ||
61 | * allocates virtual space for the bounds directory (malloc() essentially) | ||
62 | * points the hardware BNDCFGU register at the directory | ||
63 | * calls a new prctl(PR_MPX_ENABLE_MANAGEMENT) to notify the kernel to | ||
64 | start managing the bounds directories | ||
65 | Kernel MPX Code: | ||
66 | * Checks for hardware MPX support in cpuid leaf | ||
67 | * Handles #BR exceptions and sends SIGSEGV to the app when it violates | ||
68 | bounds, like during a buffer overflow. | ||
69 | * When bounds are spilled in to an unallocated bounds table, the kernel | ||
70 | notices in the #BR exception, allocates the virtual space, then | ||
71 | updates the bounds directory to point to the new table. It keeps | ||
72 | special track of the memory with a VM_MPX flag. | ||
73 | * Frees unused bounds tables at the time that the memory they described | ||
74 | is unmapped. | ||
75 | |||
76 | |||
77 | 3. How does MPX kernel code work | ||
78 | ================================ | ||
79 | |||
80 | Handling #BR faults caused by MPX | ||
81 | --------------------------------- | ||
82 | |||
83 | When MPX is enabled, there are two new situations that can generate | ||
84 | #BR faults: | ||
85 | * a new bounds table (BT) needs to be allocated to save bounds. | ||
86 | * a bounds violation is caused by MPX instructions. | ||
87 |||
88 | We hook the #BR handler to handle these two new situations. | ||
89 | |||
90 | On-demand kernel allocation of bounds tables | ||
91 | -------------------------------------------- | ||
92 | |||
93 | MPX only has 4 hardware registers for storing bounds information. If | ||
94 | MPX-enabled code needs more than these 4 registers, it needs to spill | ||
95 | them somewhere. It has two special instructions for this which allow | ||
96 | the bounds to be moved between the bounds registers and some new "bounds | ||
97 | tables". | ||
98 | |||
99 | #BR exceptions are a new class of exceptions just for MPX. They are | ||
100 | similar conceptually to a page fault and will be raised by the MPX | ||
101 | hardware both on bounds violations and when the tables are not | ||
102 | present. The kernel handles those #BR exceptions for not-present tables | ||
103 | by carving the space out of the normal process's address space and then | ||
104 | pointing the bounds-directory over to it. | ||
105 | |||
106 | The tables need to be accessed and controlled by userspace because | ||
107 | the instructions for moving bounds in and out of them are extremely | ||
108 | frequent. They potentially happen every time a register points to | ||
109 | memory. Any direct kernel involvement (like a syscall) to access the | ||
110 | tables would obviously destroy performance. | ||
111 | |||
112 | Why not do this in userspace? MPX does not strictly require anything in | ||
113 | the kernel. It can theoretically be done completely from userspace. Here | ||
114 | are a few ways this could be done. We don't think any of them are practical | ||
115 | in the real world, but here they are. | ||
116 | |||
117 | Q: Can virtual space simply be reserved for the bounds tables so that we | ||
118 | never have to allocate them? | ||
119 | A: An MPX-enabled application will possibly create a lot of bounds tables in | ||
120 | process address space to save bounds information. These tables can take | ||
121 | up huge swaths of memory (as much as 80% of the memory on the system) | ||
122 | even if we clean them up aggressively. In the worst-case scenario, the | ||
123 | tables can be 4x the size of the data structure being tracked. IOW, a | ||
124 | 1-page structure can require 4 bounds-table pages. An X-GB virtual | ||
125 | area needs 4*X GB of virtual space, plus 2GB for the bounds directory. | ||
126 | If we were to preallocate them for the 128TB of user virtual address | ||
127 | space, we would need to reserve 512TB+2GB, which is larger than the | ||
128 | entire virtual address space today. This means they cannot be reserved | ||
129 | ahead of time. Also, a single process's pre-populated bounds directory | ||
130 | consumes 2GB of virtual *AND* physical memory. IOW, it's completely | ||
131 | infeasible to prepopulate bounds directories. | ||
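
For reference, the worst-case arithmetic above can be reproduced with a small, hypothetical userspace sketch; the MPX_* values are the 64-bit constants from the asm/mpx.h header introduced later in this series:

    #include <stdio.h>

    /* 64-bit constants from arch/x86/include/asm/mpx.h */
    #define MPX_BD_ENTRY_OFFSET 28
    #define MPX_BD_ENTRY_SHIFT   3
    #define MPX_BT_ENTRY_OFFSET 17
    #define MPX_BT_ENTRY_SHIFT   5
    #define MPX_IGN_BITS         3

    int main(void)
    {
        unsigned long bd_size = 1UL << (MPX_BD_ENTRY_OFFSET + MPX_BD_ENTRY_SHIFT);
        unsigned long bt_size = 1UL << (MPX_BT_ENTRY_OFFSET + MPX_BT_ENTRY_SHIFT);
        /* address range covered by one table: index bits [19:3] plus 3 ignored bits */
        unsigned long bt_covers = 1UL << (MPX_BT_ENTRY_OFFSET + MPX_IGN_BITS);

        printf("bounds directory: %lu GB\n", bd_size >> 30);   /* 2 GB             */
        printf("one bounds table: %lu MB\n", bt_size >> 20);   /* 4 MB             */
        printf("data it covers:   %lu MB\n", bt_covers >> 20); /* 1 MB => 4x cost  */
        return 0;
    }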
132 | |||
133 | Q: Can we preallocate bounds table space at the same time memory is | ||
134 | allocated which might contain pointers that might eventually need | ||
135 | bounds tables? | ||
136 | A: This would work if we could hook the site of each and every memory | ||
137 | allocation syscall. This can be done for small, constrained applications. | ||
138 | But, it isn't practical at a larger scale since a given app has no | ||
139 | way of controlling how all the parts of the app might allocate memory | ||
140 | (think libraries). The kernel is really the only place to intercept | ||
141 | these calls. | ||
142 | |||
143 | Q: Could a bounds fault be handed to userspace and the tables allocated | ||
144 | there in a signal handler instead of in the kernel? | ||
145 | A: mmap() is not on the list of async-signal-safe functions, and even | ||
146 | if it were, it would still require locking or nasty tricks to | ||
147 | keep track of the allocation state there. | ||
148 | |||
149 | Having ruled out all of the userspace-only approaches for managing | ||
150 | bounds tables that we could think of, we create them on demand in | ||
151 | the kernel. | ||
152 | |||
153 | Decoding MPX instructions | ||
154 | ------------------------- | ||
155 | |||
156 | If a #BR is generated due to a bounds violation caused by MPX, | ||
157 | we need to decode the MPX instruction to get the violation address and | ||
158 | set this address into the extended struct siginfo. | ||
159 | |||
160 | The _sigfault field of struct siginfo is extended as follows: | ||
161 | |||
162 | 87 /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ | ||
163 | 88 struct { | ||
164 | 89 void __user *_addr; /* faulting insn/memory ref. */ | ||
165 | 90 #ifdef __ARCH_SI_TRAPNO | ||
166 | 91 int _trapno; /* TRAP # which caused the signal */ | ||
167 | 92 #endif | ||
168 | 93 short _addr_lsb; /* LSB of the reported address */ | ||
169 | 94 struct { | ||
170 | 95 void __user *_lower; | ||
171 | 96 void __user *_upper; | ||
172 | 97 } _addr_bnd; | ||
173 | 98 } _sigfault; | ||
174 | |||
175 | The '_addr' field refers to the violation address, and the new '_addr_bnd' | ||
176 | field holds the lower/upper bounds in effect when the #BR was raised. | ||
177 | |||
178 | Glibc will also be updated to support this new siginfo, so users | ||
179 | can get the violation address and bounds when bounds violations occur. | ||
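
As an illustration only, a handler consuming this new information might look like the sketch below. The si_lower/si_upper accessors and the SEGV_BNDERR si_code value are assumptions about how glibc will eventually expose the '_addr_bnd' fields, and printf-family calls are not async-signal-safe (fine for a demo, not for production):

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>

    #ifndef SEGV_BNDERR
    # define SEGV_BNDERR 3   /* assumed si_code for an MPX bounds violation */
    #endif

    static void br_handler(int sig, siginfo_t *si, void *ctx)
    {
        /* si_lower/si_upper assume an updated glibc exposing _addr_bnd */
        if (si->si_code == SEGV_BNDERR)
            fprintf(stderr, "bounds violation at %p, bounds [%p, %p]\n",
                    si->si_addr, si->si_lower, si->si_upper);
        exit(1);
    }

    int main(void)
    {
        struct sigaction sa = {
            .sa_sigaction = br_handler,
            .sa_flags     = SA_SIGINFO,
        };

        sigaction(SIGSEGV, &sa, NULL);
        /* ... MPX-instrumented code would run here ... */
        return 0;
    }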
180 | |||
181 | Cleanup unused bounds tables | ||
182 | ---------------------------- | ||
183 | |||
184 | When a BNDSTX instruction attempts to save bounds to a bounds directory | ||
185 | entry marked as invalid, a #BR is generated. This is an indication that | ||
186 | no bounds table exists for this entry. In this case the fault handler | ||
187 | will allocate a new bounds table on demand. | ||
188 | |||
189 | Since the kernel allocates those tables on demand without userspace's | ||
190 | knowledge, it is also responsible for freeing them when the associated | ||
191 | mappings go away. | ||
192 | |||
193 | The solution is to hook do_munmap() and check whether the process | ||
194 | is MPX enabled. If it is, any bounds tables covering the virtual | ||
195 | address region being unmapped are freed as well. | ||
196 | |||
197 | Adding new prctl commands | ||
198 | ------------------------- | ||
199 | |||
200 | Two new prctl() commands are added to enable and disable MPX bounds-table | ||
201 | management in the kernel. | ||
202 | |||
203 | 155 #define PR_MPX_ENABLE_MANAGEMENT 43 | ||
204 | 156 #define PR_MPX_DISABLE_MANAGEMENT 44 | ||
205 | |||
206 | The userspace runtime library is responsible for allocating the bounds | ||
207 | directory, so the kernel has to use the XSAVE instruction to get the base | ||
208 | of the bounds directory from the BNDCFG register. | ||
209 |||
210 | But XSAVE is expected to be very expensive. As a performance | ||
211 | optimization, the kernel reads the base of the bounds directory once, | ||
212 | during PR_MPX_ENABLE_MANAGEMENT, and saves it in struct mm_struct for | ||
213 | future use. | ||
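
A simplified sketch of what the enable path has to do is shown below. It is not the exact mpx_enable_management() from this patch set (error handling and the prctl plumbing are abridged), but it only uses helpers that appear elsewhere in this series:

    static int mpx_enable_management_sketch(struct task_struct *tsk)
    {
        struct mm_struct *mm = tsk->mm;
        struct bndcsr *bndcsr;

        /* Flush MPX state into the xsave buffer so BNDCFGU can be read. */
        fpu_save_init(&tsk->thread.fpu);
        bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
        if (!bndcsr)
            return -ENXIO;

        /*
         * Cache the bounds directory base in the mm so that later #BR
         * handling does not need another expensive xsave.
         */
        down_write(&mm->mmap_sem);
        mm->bd_addr = (void __user *)(unsigned long)
                      (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
        up_write(&mm->mmap_sem);
        return 0;
    }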
214 | |||
215 | |||
216 | 4. Special rules | ||
217 | ================ | ||
218 | |||
219 | 1) If userspace is requesting help from the kernel to do the management | ||
220 | of bounds tables, it may not create or modify entries in the bounds directory. | ||
221 | |||
222 | Userspace can certainly allocate bounds tables and forcibly point the | ||
223 | bounds directory at them through the XSAVE instruction, and then set the | ||
224 | valid bit of a bounds directory entry. But the kernel will decline | ||
225 | to assist in managing these tables. | ||
226 | |||
227 | 2) Userspace may not take multiple bounds directory entries and point | ||
228 | them at the same bounds table. | ||
229 | |||
230 | This is allowed architecturally. For more information, see the "Intel(R) | ||
231 | Architecture Instruction Set Extensions Programming Reference" (9.3.4). | ||
232 | |||
233 | However, if userspace does this, the kernel might be fooled into unmapping | ||
234 | an in-use bounds table since it does not recognize sharing. | ||
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h index 4ea6225196bb..bce9bc1a66c4 100644 --- a/arch/ia64/include/uapi/asm/siginfo.h +++ b/arch/ia64/include/uapi/asm/siginfo.h | |||
@@ -63,6 +63,10 @@ typedef struct siginfo { | |||
63 | unsigned int _flags; /* see below */ | 63 | unsigned int _flags; /* see below */ |
64 | unsigned long _isr; /* isr */ | 64 | unsigned long _isr; /* isr */ |
65 | short _addr_lsb; /* lsb of faulting address */ | 65 | short _addr_lsb; /* lsb of faulting address */ |
66 | struct { | ||
67 | void __user *_lower; | ||
68 | void __user *_upper; | ||
69 | } _addr_bnd; | ||
66 | } _sigfault; | 70 | } _sigfault; |
67 | 71 | ||
68 | /* SIGPOLL */ | 72 | /* SIGPOLL */ |
@@ -110,9 +114,9 @@ typedef struct siginfo { | |||
110 | /* | 114 | /* |
111 | * SIGSEGV si_codes | 115 | * SIGSEGV si_codes |
112 | */ | 116 | */ |
113 | #define __SEGV_PSTKOVF (__SI_FAULT|3) /* paragraph stack overflow */ | 117 | #define __SEGV_PSTKOVF (__SI_FAULT|4) /* paragraph stack overflow */ |
114 | #undef NSIGSEGV | 118 | #undef NSIGSEGV |
115 | #define NSIGSEGV 3 | 119 | #define NSIGSEGV 4 |
116 | 120 | ||
117 | #undef NSIGTRAP | 121 | #undef NSIGTRAP |
118 | #define NSIGTRAP 4 | 122 | #define NSIGTRAP 4 |
diff --git a/arch/mips/include/uapi/asm/siginfo.h b/arch/mips/include/uapi/asm/siginfo.h index e81174432bab..d08f83f19db5 100644 --- a/arch/mips/include/uapi/asm/siginfo.h +++ b/arch/mips/include/uapi/asm/siginfo.h | |||
@@ -92,6 +92,10 @@ typedef struct siginfo { | |||
92 | int _trapno; /* TRAP # which caused the signal */ | 92 | int _trapno; /* TRAP # which caused the signal */ |
93 | #endif | 93 | #endif |
94 | short _addr_lsb; | 94 | short _addr_lsb; |
95 | struct { | ||
96 | void __user *_lower; | ||
97 | void __user *_upper; | ||
98 | } _addr_bnd; | ||
95 | } _sigfault; | 99 | } _sigfault; |
96 | 100 | ||
97 | /* SIGPOLL, SIGXFSZ (To do ...) */ | 101 | /* SIGPOLL, SIGXFSZ (To do ...) */ |
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 3815bfea1b2d..f49b71954654 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h | |||
@@ -120,4 +120,15 @@ static inline void arch_exit_mmap(struct mm_struct *mm) | |||
120 | { | 120 | { |
121 | } | 121 | } |
122 | 122 | ||
123 | static inline void arch_unmap(struct mm_struct *mm, | ||
124 | struct vm_area_struct *vma, | ||
125 | unsigned long start, unsigned long end) | ||
126 | { | ||
127 | } | ||
128 | |||
129 | static inline void arch_bprm_mm_init(struct mm_struct *mm, | ||
130 | struct vm_area_struct *vma) | ||
131 | { | ||
132 | } | ||
133 | |||
123 | #endif /* __S390_MMU_CONTEXT_H */ | 134 | #endif /* __S390_MMU_CONTEXT_H */ |
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index aa4a743dc4ab..941527e507f7 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h | |||
@@ -10,7 +10,26 @@ | |||
10 | #include <asm/mmu.h> | 10 | #include <asm/mmu.h> |
11 | 11 | ||
12 | extern void uml_setup_stubs(struct mm_struct *mm); | 12 | extern void uml_setup_stubs(struct mm_struct *mm); |
13 | /* | ||
14 | * Needed since we do not use the asm-generic/mm_hooks.h: | ||
15 | */ | ||
16 | static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | ||
17 | { | ||
18 | uml_setup_stubs(mm); | ||
19 | } | ||
13 | extern void arch_exit_mmap(struct mm_struct *mm); | 20 | extern void arch_exit_mmap(struct mm_struct *mm); |
21 | static inline void arch_unmap(struct mm_struct *mm, | ||
22 | struct vm_area_struct *vma, | ||
23 | unsigned long start, unsigned long end) | ||
24 | { | ||
25 | } | ||
26 | static inline void arch_bprm_mm_init(struct mm_struct *mm, | ||
27 | struct vm_area_struct *vma) | ||
28 | { | ||
29 | } | ||
30 | /* | ||
31 | * end asm-generic/mm_hooks.h functions | ||
32 | */ | ||
14 | 33 | ||
15 | #define deactivate_mm(tsk,mm) do { } while (0) | 34 | #define deactivate_mm(tsk,mm) do { } while (0) |
16 | 35 | ||
@@ -41,11 +60,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
41 | } | 60 | } |
42 | } | 61 | } |
43 | 62 | ||
44 | static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | ||
45 | { | ||
46 | uml_setup_stubs(mm); | ||
47 | } | ||
48 | |||
49 | static inline void enter_lazy_tlb(struct mm_struct *mm, | 63 | static inline void enter_lazy_tlb(struct mm_struct *mm, |
50 | struct task_struct *tsk) | 64 | struct task_struct *tsk) |
51 | { | 65 | { |
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index ef470a7a3d0f..1cb5220afaf9 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h | |||
@@ -86,4 +86,15 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm, | |||
86 | { | 86 | { |
87 | } | 87 | } |
88 | 88 | ||
89 | static inline void arch_unmap(struct mm_struct *mm, | ||
90 | struct vm_area_struct *vma, | ||
91 | unsigned long start, unsigned long end) | ||
92 | { | ||
93 | } | ||
94 | |||
95 | static inline void arch_bprm_mm_init(struct mm_struct *mm, | ||
96 | struct vm_area_struct *vma) | ||
97 | { | ||
98 | } | ||
99 | |||
89 | #endif | 100 | #endif |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 41a503c15862..666ac6651c17 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -248,6 +248,10 @@ config HAVE_INTEL_TXT | |||
248 | def_bool y | 248 | def_bool y |
249 | depends on INTEL_IOMMU && ACPI | 249 | depends on INTEL_IOMMU && ACPI |
250 | 250 | ||
251 | config X86_INTEL_MPX | ||
252 | def_bool y | ||
253 | depends on CPU_SUP_INTEL | ||
254 | |||
251 | config X86_32_SMP | 255 | config X86_32_SMP |
252 | def_bool y | 256 | def_bool y |
253 | depends on X86_32 && SMP | 257 | depends on X86_32 && SMP |
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 97534a7d38e3..f226df064660 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h | |||
@@ -10,6 +10,12 @@ | |||
10 | * cpu_feature_enabled(). | 10 | * cpu_feature_enabled(). |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #ifdef CONFIG_X86_INTEL_MPX | ||
14 | # define DISABLE_MPX 0 | ||
15 | #else | ||
16 | # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) | ||
17 | #endif | ||
18 | |||
13 | #ifdef CONFIG_X86_64 | 19 | #ifdef CONFIG_X86_64 |
14 | # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) | 20 | # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) |
15 | # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) | 21 | # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) |
@@ -34,6 +40,6 @@ | |||
34 | #define DISABLED_MASK6 0 | 40 | #define DISABLED_MASK6 0 |
35 | #define DISABLED_MASK7 0 | 41 | #define DISABLED_MASK7 0 |
36 | #define DISABLED_MASK8 0 | 42 | #define DISABLED_MASK8 0 |
37 | #define DISABLED_MASK9 0 | 43 | #define DISABLED_MASK9 (DISABLE_MPX) |
38 | 44 | ||
39 | #endif /* _ASM_X86_DISABLED_FEATURES_H */ | 45 | #endif /* _ASM_X86_DISABLED_FEATURES_H */ |
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 48eb30a86062..47f29b1d1846 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h | |||
@@ -65,6 +65,7 @@ struct insn { | |||
65 | unsigned char x86_64; | 65 | unsigned char x86_64; |
66 | 66 | ||
67 | const insn_byte_t *kaddr; /* kernel address of insn to analyze */ | 67 | const insn_byte_t *kaddr; /* kernel address of insn to analyze */ |
68 | const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ | ||
68 | const insn_byte_t *next_byte; | 69 | const insn_byte_t *next_byte; |
69 | }; | 70 | }; |
70 | 71 | ||
@@ -96,7 +97,7 @@ struct insn { | |||
96 | #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ | 97 | #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ |
97 | #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ | 98 | #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ |
98 | 99 | ||
99 | extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); | 100 | extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); |
100 | extern void insn_get_prefixes(struct insn *insn); | 101 | extern void insn_get_prefixes(struct insn *insn); |
101 | extern void insn_get_opcode(struct insn *insn); | 102 | extern void insn_get_opcode(struct insn *insn); |
102 | extern void insn_get_modrm(struct insn *insn); | 103 | extern void insn_get_modrm(struct insn *insn); |
@@ -115,12 +116,13 @@ static inline void insn_get_attribute(struct insn *insn) | |||
115 | extern int insn_rip_relative(struct insn *insn); | 116 | extern int insn_rip_relative(struct insn *insn); |
116 | 117 | ||
117 | /* Init insn for kernel text */ | 118 | /* Init insn for kernel text */ |
118 | static inline void kernel_insn_init(struct insn *insn, const void *kaddr) | 119 | static inline void kernel_insn_init(struct insn *insn, |
120 | const void *kaddr, int buf_len) | ||
119 | { | 121 | { |
120 | #ifdef CONFIG_X86_64 | 122 | #ifdef CONFIG_X86_64 |
121 | insn_init(insn, kaddr, 1); | 123 | insn_init(insn, kaddr, buf_len, 1); |
122 | #else /* CONFIG_X86_32 */ | 124 | #else /* CONFIG_X86_32 */ |
123 | insn_init(insn, kaddr, 0); | 125 | insn_init(insn, kaddr, buf_len, 0); |
124 | #endif | 126 | #endif |
125 | } | 127 | } |
126 | 128 | ||
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 166af2a8e865..be91d5736e08 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h | |||
@@ -10,9 +10,8 @@ | |||
10 | #include <asm/pgalloc.h> | 10 | #include <asm/pgalloc.h> |
11 | #include <asm/tlbflush.h> | 11 | #include <asm/tlbflush.h> |
12 | #include <asm/paravirt.h> | 12 | #include <asm/paravirt.h> |
13 | #include <asm/mpx.h> | ||
13 | #ifndef CONFIG_PARAVIRT | 14 | #ifndef CONFIG_PARAVIRT |
14 | #include <asm-generic/mm_hooks.h> | ||
15 | |||
16 | static inline void paravirt_activate_mm(struct mm_struct *prev, | 15 | static inline void paravirt_activate_mm(struct mm_struct *prev, |
17 | struct mm_struct *next) | 16 | struct mm_struct *next) |
18 | { | 17 | { |
@@ -102,4 +101,27 @@ do { \ | |||
102 | } while (0) | 101 | } while (0) |
103 | #endif | 102 | #endif |
104 | 103 | ||
104 | static inline void arch_dup_mmap(struct mm_struct *oldmm, | ||
105 | struct mm_struct *mm) | ||
106 | { | ||
107 | paravirt_arch_dup_mmap(oldmm, mm); | ||
108 | } | ||
109 | |||
110 | static inline void arch_exit_mmap(struct mm_struct *mm) | ||
111 | { | ||
112 | paravirt_arch_exit_mmap(mm); | ||
113 | } | ||
114 | |||
115 | static inline void arch_bprm_mm_init(struct mm_struct *mm, | ||
116 | struct vm_area_struct *vma) | ||
117 | { | ||
118 | mpx_mm_init(mm); | ||
119 | } | ||
120 | |||
121 | static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, | ||
122 | unsigned long start, unsigned long end) | ||
123 | { | ||
124 | mpx_notify_unmap(mm, vma, start, end); | ||
125 | } | ||
126 | |||
105 | #endif /* _ASM_X86_MMU_CONTEXT_H */ | 127 | #endif /* _ASM_X86_MMU_CONTEXT_H */ |
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h new file mode 100644 index 000000000000..a952a13d59a7 --- /dev/null +++ b/arch/x86/include/asm/mpx.h | |||
@@ -0,0 +1,103 @@ | |||
1 | #ifndef _ASM_X86_MPX_H | ||
2 | #define _ASM_X86_MPX_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <asm/ptrace.h> | ||
6 | #include <asm/insn.h> | ||
7 | |||
8 | /* | ||
9 | * NULL is theoretically a valid place to put the bounds | ||
10 | * directory, so point this at an invalid address. | ||
11 | */ | ||
12 | #define MPX_INVALID_BOUNDS_DIR ((void __user *)-1) | ||
13 | #define MPX_BNDCFG_ENABLE_FLAG 0x1 | ||
14 | #define MPX_BD_ENTRY_VALID_FLAG 0x1 | ||
15 | |||
16 | #ifdef CONFIG_X86_64 | ||
17 | |||
18 | /* upper 28 bits [47:20] of the virtual address in 64-bit used to | ||
19 | * index into bounds directory (BD). | ||
20 | */ | ||
21 | #define MPX_BD_ENTRY_OFFSET 28 | ||
22 | #define MPX_BD_ENTRY_SHIFT 3 | ||
23 | /* bits [19:3] of the virtual address in 64-bit used to index into | ||
24 | * bounds table (BT). | ||
25 | */ | ||
26 | #define MPX_BT_ENTRY_OFFSET 17 | ||
27 | #define MPX_BT_ENTRY_SHIFT 5 | ||
28 | #define MPX_IGN_BITS 3 | ||
29 | #define MPX_BD_ENTRY_TAIL 3 | ||
30 | |||
31 | #else | ||
32 | |||
33 | #define MPX_BD_ENTRY_OFFSET 20 | ||
34 | #define MPX_BD_ENTRY_SHIFT 2 | ||
35 | #define MPX_BT_ENTRY_OFFSET 10 | ||
36 | #define MPX_BT_ENTRY_SHIFT 4 | ||
37 | #define MPX_IGN_BITS 2 | ||
38 | #define MPX_BD_ENTRY_TAIL 2 | ||
39 | |||
40 | #endif | ||
41 | |||
42 | #define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) | ||
43 | #define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) | ||
44 | |||
45 | #define MPX_BNDSTA_TAIL 2 | ||
46 | #define MPX_BNDCFG_TAIL 12 | ||
47 | #define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1)) | ||
48 | #define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) | ||
49 | #define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1)) | ||
50 | |||
51 | #define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) | ||
52 | #define MPX_BNDSTA_ERROR_CODE 0x3 | ||
53 | |||
54 | #define MPX_BD_ENTRY_MASK ((1<<MPX_BD_ENTRY_OFFSET)-1) | ||
55 | #define MPX_BT_ENTRY_MASK ((1<<MPX_BT_ENTRY_OFFSET)-1) | ||
56 | #define MPX_GET_BD_ENTRY_OFFSET(addr) ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \ | ||
57 | MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) | ||
58 | #define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ | ||
59 | MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) | ||
60 | |||
61 | #ifdef CONFIG_X86_INTEL_MPX | ||
62 | siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, | ||
63 | struct xsave_struct *xsave_buf); | ||
64 | int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); | ||
65 | static inline int kernel_managing_mpx_tables(struct mm_struct *mm) | ||
66 | { | ||
67 | return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); | ||
68 | } | ||
69 | static inline void mpx_mm_init(struct mm_struct *mm) | ||
70 | { | ||
71 | /* | ||
72 | * NULL is theoretically a valid place to put the bounds | ||
73 | * directory, so point this at an invalid address. | ||
74 | */ | ||
75 | mm->bd_addr = MPX_INVALID_BOUNDS_DIR; | ||
76 | } | ||
77 | void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, | ||
78 | unsigned long start, unsigned long end); | ||
79 | #else | ||
80 | static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, | ||
81 | struct xsave_struct *xsave_buf) | ||
82 | { | ||
83 | return NULL; | ||
84 | } | ||
85 | static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) | ||
86 | { | ||
87 | return -EINVAL; | ||
88 | } | ||
89 | static inline int kernel_managing_mpx_tables(struct mm_struct *mm) | ||
90 | { | ||
91 | return 0; | ||
92 | } | ||
93 | static inline void mpx_mm_init(struct mm_struct *mm) | ||
94 | { | ||
95 | } | ||
96 | static inline void mpx_notify_unmap(struct mm_struct *mm, | ||
97 | struct vm_area_struct *vma, | ||
98 | unsigned long start, unsigned long end) | ||
99 | { | ||
100 | } | ||
101 | #endif /* CONFIG_X86_INTEL_MPX */ | ||
102 | |||
103 | #endif /* _ASM_X86_MPX_H */ | ||
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e1610e29e..32444ae939ca 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -330,13 +330,13 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, | |||
330 | PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); | 330 | PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); |
331 | } | 331 | } |
332 | 332 | ||
333 | static inline void arch_dup_mmap(struct mm_struct *oldmm, | 333 | static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, |
334 | struct mm_struct *mm) | 334 | struct mm_struct *mm) |
335 | { | 335 | { |
336 | PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); | 336 | PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); |
337 | } | 337 | } |
338 | 338 | ||
339 | static inline void arch_exit_mmap(struct mm_struct *mm) | 339 | static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) |
340 | { | 340 | { |
341 | PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); | 341 | PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); |
342 | } | 342 | } |
@@ -986,5 +986,15 @@ extern void default_banner(void); | |||
986 | #endif /* __ASSEMBLY__ */ | 986 | #endif /* __ASSEMBLY__ */ |
987 | #else /* CONFIG_PARAVIRT */ | 987 | #else /* CONFIG_PARAVIRT */ |
988 | # define default_banner x86_init_noop | 988 | # define default_banner x86_init_noop |
989 | #ifndef __ASSEMBLY__ | ||
990 | static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, | ||
991 | struct mm_struct *mm) | ||
992 | { | ||
993 | } | ||
994 | |||
995 | static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) | ||
996 | { | ||
997 | } | ||
998 | #endif /* __ASSEMBLY__ */ | ||
989 | #endif /* !CONFIG_PARAVIRT */ | 999 | #endif /* !CONFIG_PARAVIRT */ |
990 | #endif /* _ASM_X86_PARAVIRT_H */ | 1000 | #endif /* _ASM_X86_PARAVIRT_H */ |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eb71ec794732..9617a1716813 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -374,13 +374,14 @@ struct lwp_struct { | |||
374 | u8 reserved[128]; | 374 | u8 reserved[128]; |
375 | }; | 375 | }; |
376 | 376 | ||
377 | struct bndregs_struct { | 377 | struct bndreg { |
378 | u64 bndregs[8]; | 378 | u64 lower_bound; |
379 | u64 upper_bound; | ||
379 | } __packed; | 380 | } __packed; |
380 | 381 | ||
381 | struct bndcsr_struct { | 382 | struct bndcsr { |
382 | u64 cfg_reg_u; | 383 | u64 bndcfgu; |
383 | u64 status_reg; | 384 | u64 bndstatus; |
384 | } __packed; | 385 | } __packed; |
385 | 386 | ||
386 | struct xsave_hdr_struct { | 387 | struct xsave_hdr_struct { |
@@ -394,8 +395,8 @@ struct xsave_struct { | |||
394 | struct xsave_hdr_struct xsave_hdr; | 395 | struct xsave_hdr_struct xsave_hdr; |
395 | struct ymmh_struct ymmh; | 396 | struct ymmh_struct ymmh; |
396 | struct lwp_struct lwp; | 397 | struct lwp_struct lwp; |
397 | struct bndregs_struct bndregs; | 398 | struct bndreg bndreg[4]; |
398 | struct bndcsr_struct bndcsr; | 399 | struct bndcsr bndcsr; |
399 | /* new processor state extensions will go here */ | 400 | /* new processor state extensions will go here */ |
400 | } __attribute__ ((packed, aligned (64))); | 401 | } __attribute__ ((packed, aligned (64))); |
401 | 402 | ||
@@ -953,6 +954,24 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, | |||
953 | extern int get_tsc_mode(unsigned long adr); | 954 | extern int get_tsc_mode(unsigned long adr); |
954 | extern int set_tsc_mode(unsigned int val); | 955 | extern int set_tsc_mode(unsigned int val); |
955 | 956 | ||
957 | /* Register/unregister a process' MPX related resource */ | ||
958 | #define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) | ||
959 | #define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) | ||
960 | |||
961 | #ifdef CONFIG_X86_INTEL_MPX | ||
962 | extern int mpx_enable_management(struct task_struct *tsk); | ||
963 | extern int mpx_disable_management(struct task_struct *tsk); | ||
964 | #else | ||
965 | static inline int mpx_enable_management(struct task_struct *tsk) | ||
966 | { | ||
967 | return -EINVAL; | ||
968 | } | ||
969 | static inline int mpx_disable_management(struct task_struct *tsk) | ||
970 | { | ||
971 | return -EINVAL; | ||
972 | } | ||
973 | #endif /* CONFIG_X86_INTEL_MPX */ | ||
974 | |||
956 | extern u16 amd_get_nb_id(int cpu); | 975 | extern u16 amd_get_nb_id(int cpu); |
957 | 976 | ||
958 | static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) | 977 | static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 495ae9793628..3c895d480cd7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c | |||
@@ -724,6 +724,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) | |||
724 | unsigned long ip = regs->ip; | 724 | unsigned long ip = regs->ip; |
725 | int is_64bit = 0; | 725 | int is_64bit = 0; |
726 | void *kaddr; | 726 | void *kaddr; |
727 | int size; | ||
727 | 728 | ||
728 | /* | 729 | /* |
729 | * We don't need to fixup if the PEBS assist is fault like | 730 | * We don't need to fixup if the PEBS assist is fault like |
@@ -758,11 +759,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) | |||
758 | return 1; | 759 | return 1; |
759 | } | 760 | } |
760 | 761 | ||
762 | size = ip - to; | ||
761 | if (!kernel_ip(ip)) { | 763 | if (!kernel_ip(ip)) { |
762 | int size, bytes; | 764 | int bytes; |
763 | u8 *buf = this_cpu_read(insn_buffer); | 765 | u8 *buf = this_cpu_read(insn_buffer); |
764 | 766 | ||
765 | size = ip - to; /* Must fit our buffer, see above */ | 767 | /* 'size' must fit our buffer, see above */ |
766 | bytes = copy_from_user_nmi(buf, (void __user *)to, size); | 768 | bytes = copy_from_user_nmi(buf, (void __user *)to, size); |
767 | if (bytes != 0) | 769 | if (bytes != 0) |
768 | return 0; | 770 | return 0; |
@@ -780,11 +782,20 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) | |||
780 | #ifdef CONFIG_X86_64 | 782 | #ifdef CONFIG_X86_64 |
781 | is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); | 783 | is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); |
782 | #endif | 784 | #endif |
783 | insn_init(&insn, kaddr, is_64bit); | 785 | insn_init(&insn, kaddr, size, is_64bit); |
784 | insn_get_length(&insn); | 786 | insn_get_length(&insn); |
787 | /* | ||
788 | * Make sure there was not a problem decoding the | ||
789 | * instruction and getting the length. This is | ||
790 | * doubly important because we have an infinite | ||
791 | * loop if insn.length=0. | ||
792 | */ | ||
793 | if (!insn.length) | ||
794 | break; | ||
785 | 795 | ||
786 | to += insn.length; | 796 | to += insn.length; |
787 | kaddr += insn.length; | 797 | kaddr += insn.length; |
798 | size -= insn.length; | ||
788 | } while (to < ip); | 799 | } while (to < ip); |
789 | 800 | ||
790 | if (to == ip) { | 801 | if (to == ip) { |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 45fa730a5283..58f1a94beaf0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c | |||
@@ -465,7 +465,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) | |||
465 | { | 465 | { |
466 | struct insn insn; | 466 | struct insn insn; |
467 | void *addr; | 467 | void *addr; |
468 | int bytes, size = MAX_INSN_SIZE; | 468 | int bytes_read, bytes_left; |
469 | int ret = X86_BR_NONE; | 469 | int ret = X86_BR_NONE; |
470 | int ext, to_plm, from_plm; | 470 | int ext, to_plm, from_plm; |
471 | u8 buf[MAX_INSN_SIZE]; | 471 | u8 buf[MAX_INSN_SIZE]; |
@@ -493,8 +493,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) | |||
493 | return X86_BR_NONE; | 493 | return X86_BR_NONE; |
494 | 494 | ||
495 | /* may fail if text not present */ | 495 | /* may fail if text not present */ |
496 | bytes = copy_from_user_nmi(buf, (void __user *)from, size); | 496 | bytes_left = copy_from_user_nmi(buf, (void __user *)from, |
497 | if (bytes != 0) | 497 | MAX_INSN_SIZE); |
498 | bytes_read = MAX_INSN_SIZE - bytes_left; | ||
499 | if (!bytes_read) | ||
498 | return X86_BR_NONE; | 500 | return X86_BR_NONE; |
499 | 501 | ||
500 | addr = buf; | 502 | addr = buf; |
@@ -505,10 +507,19 @@ static int branch_type(unsigned long from, unsigned long to, int abort) | |||
505 | * Ensure we don't blindy read any address by validating it is | 507 | * Ensure we don't blindy read any address by validating it is |
506 | * a known text address. | 508 | * a known text address. |
507 | */ | 509 | */ |
508 | if (kernel_text_address(from)) | 510 | if (kernel_text_address(from)) { |
509 | addr = (void *)from; | 511 | addr = (void *)from; |
510 | else | 512 | /* |
513 | * Assume we can get the maximum possible size | ||
514 | * when grabbing kernel data. This is not | ||
515 | * _strictly_ true since we could possibly be | ||
516 | * executing up next to a memory hole, but | ||
517 | * it is very unlikely to be a problem. | ||
518 | */ | ||
519 | bytes_read = MAX_INSN_SIZE; | ||
520 | } else { | ||
511 | return X86_BR_NONE; | 521 | return X86_BR_NONE; |
522 | } | ||
512 | } | 523 | } |
513 | 524 | ||
514 | /* | 525 | /* |
@@ -518,8 +529,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) | |||
518 | #ifdef CONFIG_X86_64 | 529 | #ifdef CONFIG_X86_64 |
519 | is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); | 530 | is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); |
520 | #endif | 531 | #endif |
521 | insn_init(&insn, addr, is64); | 532 | insn_init(&insn, addr, bytes_read, is64); |
522 | insn_get_opcode(&insn); | 533 | insn_get_opcode(&insn); |
534 | if (!insn.opcode.got) | ||
535 | return X86_BR_ABORT; | ||
523 | 536 | ||
524 | switch (insn.opcode.bytes[0]) { | 537 | switch (insn.opcode.bytes[0]) { |
525 | case 0xf: | 538 | case 0xf: |
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 67e6d19ef1be..f7e3cd50ece0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c | |||
@@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr) | |||
285 | * normally used, we just go through if there is no kprobe. | 285 | * normally used, we just go through if there is no kprobe. |
286 | */ | 286 | */ |
287 | __addr = recover_probed_instruction(buf, addr); | 287 | __addr = recover_probed_instruction(buf, addr); |
288 | kernel_insn_init(&insn, (void *)__addr); | 288 | kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); |
289 | insn_get_length(&insn); | 289 | insn_get_length(&insn); |
290 | 290 | ||
291 | /* | 291 | /* |
@@ -330,8 +330,10 @@ int __copy_instruction(u8 *dest, u8 *src) | |||
330 | { | 330 | { |
331 | struct insn insn; | 331 | struct insn insn; |
332 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 332 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
333 | unsigned long recovered_insn = | ||
334 | recover_probed_instruction(buf, (unsigned long)src); | ||
333 | 335 | ||
334 | kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); | 336 | kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); |
335 | insn_get_length(&insn); | 337 | insn_get_length(&insn); |
336 | /* Another subsystem puts a breakpoint, failed to recover */ | 338 | /* Another subsystem puts a breakpoint, failed to recover */ |
337 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | 339 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) |
@@ -342,7 +344,7 @@ int __copy_instruction(u8 *dest, u8 *src) | |||
342 | if (insn_rip_relative(&insn)) { | 344 | if (insn_rip_relative(&insn)) { |
343 | s64 newdisp; | 345 | s64 newdisp; |
344 | u8 *disp; | 346 | u8 *disp; |
345 | kernel_insn_init(&insn, dest); | 347 | kernel_insn_init(&insn, dest, insn.length); |
346 | insn_get_displacement(&insn); | 348 | insn_get_displacement(&insn); |
347 | /* | 349 | /* |
348 | * The copied instruction uses the %rip-relative addressing | 350 | * The copied instruction uses the %rip-relative addressing |
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f1314d0bcf0a..7c523bbf3dc8 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c | |||
@@ -251,13 +251,15 @@ static int can_optimize(unsigned long paddr) | |||
251 | /* Decode instructions */ | 251 | /* Decode instructions */ |
252 | addr = paddr - offset; | 252 | addr = paddr - offset; |
253 | while (addr < paddr - offset + size) { /* Decode until function end */ | 253 | while (addr < paddr - offset + size) { /* Decode until function end */ |
254 | unsigned long recovered_insn; | ||
254 | if (search_exception_tables(addr)) | 255 | if (search_exception_tables(addr)) |
255 | /* | 256 | /* |
256 | * Since some fixup code will jumps into this function, | 257 | * Since some fixup code will jumps into this function, |
257 | * we can't optimize kprobe in this function. | 258 | * we can't optimize kprobe in this function. |
258 | */ | 259 | */ |
259 | return 0; | 260 | return 0; |
260 | kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); | 261 | recovered_insn = recover_probed_instruction(buf, addr); |
262 | kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); | ||
261 | insn_get_length(&insn); | 263 | insn_get_length(&insn); |
262 | /* Another subsystem puts a breakpoint */ | 264 | /* Another subsystem puts a breakpoint */ |
263 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | 265 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab08aa2276fb..214245d6b996 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p) | |||
960 | init_mm.end_data = (unsigned long) _edata; | 960 | init_mm.end_data = (unsigned long) _edata; |
961 | init_mm.brk = _brk_end; | 961 | init_mm.brk = _brk_end; |
962 | 962 | ||
963 | mpx_mm_init(&init_mm); | ||
964 | |||
963 | code_resource.start = __pa_symbol(_text); | 965 | code_resource.start = __pa_symbol(_text); |
964 | code_resource.end = __pa_symbol(_etext)-1; | 966 | code_resource.end = __pa_symbol(_etext)-1; |
965 | data_resource.start = __pa_symbol(_etext); | 967 | data_resource.start = __pa_symbol(_etext); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 07ab8e9733c5..a9ae20579895 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -60,6 +60,7 @@ | |||
60 | #include <asm/fixmap.h> | 60 | #include <asm/fixmap.h> |
61 | #include <asm/mach_traps.h> | 61 | #include <asm/mach_traps.h> |
62 | #include <asm/alternative.h> | 62 | #include <asm/alternative.h> |
63 | #include <asm/mpx.h> | ||
63 | 64 | ||
64 | #ifdef CONFIG_X86_64 | 65 | #ifdef CONFIG_X86_64 |
65 | #include <asm/x86_init.h> | 66 | #include <asm/x86_init.h> |
@@ -228,7 +229,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ | |||
228 | 229 | ||
229 | DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) | 230 | DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) |
230 | DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) | 231 | DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) |
231 | DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) | ||
232 | DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) | 232 | DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) |
233 | DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) | 233 | DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) |
234 | DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) | 234 | DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) |
@@ -286,6 +286,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
286 | } | 286 | } |
287 | #endif | 287 | #endif |
288 | 288 | ||
289 | dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) | ||
290 | { | ||
291 | struct task_struct *tsk = current; | ||
292 | struct xsave_struct *xsave_buf; | ||
293 | enum ctx_state prev_state; | ||
294 | struct bndcsr *bndcsr; | ||
295 | siginfo_t *info; | ||
296 | |||
297 | prev_state = exception_enter(); | ||
298 | if (notify_die(DIE_TRAP, "bounds", regs, error_code, | ||
299 | X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) | ||
300 | goto exit; | ||
301 | conditional_sti(regs); | ||
302 | |||
303 | if (!user_mode(regs)) | ||
304 | die("bounds", regs, error_code); | ||
305 | |||
306 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { | ||
307 | /* The exception is not from Intel MPX */ | ||
308 | goto exit_trap; | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * We need to look at BNDSTATUS to resolve this exception. | ||
313 | * It is not directly accessible, though, so we need to | ||
314 | * do an xsave and then pull it out of the xsave buffer. | ||
315 | */ | ||
316 | fpu_save_init(&tsk->thread.fpu); | ||
317 | xsave_buf = &(tsk->thread.fpu.state->xsave); | ||
318 | bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); | ||
319 | if (!bndcsr) | ||
320 | goto exit_trap; | ||
321 | |||
322 | /* | ||
323 | * The error code field of the BNDSTATUS register communicates status | ||
324 | * information of a bound range exception #BR or operation involving | ||
325 | * bound directory. | ||
326 | */ | ||
327 | switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { | ||
328 | case 2: /* Bound directory has invalid entry. */ | ||
329 | if (mpx_handle_bd_fault(xsave_buf)) | ||
330 | goto exit_trap; | ||
331 | break; /* Success, it was handled */ | ||
332 | case 1: /* Bound violation. */ | ||
333 | info = mpx_generate_siginfo(regs, xsave_buf); | ||
334 | if (PTR_ERR(info)) { | ||
335 | /* | ||
336 | * We failed to decode the MPX instruction. Act as if | ||
337 | * the exception was not caused by MPX. | ||
338 | */ | ||
339 | goto exit_trap; | ||
340 | } | ||
341 | /* | ||
342 | * Success, we decoded the instruction and retrieved | ||
343 | * an 'info' containing the address being accessed | ||
344 | * which caused the exception. This information | ||
345 | * allows an application to possibly handle the | ||
346 | * #BR exception itself. | ||
347 | */ | ||
348 | do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); | ||
349 | kfree(info); | ||
350 | break; | ||
351 | case 0: /* No exception caused by Intel MPX operations. */ | ||
352 | goto exit_trap; | ||
353 | default: | ||
354 | die("bounds", regs, error_code); | ||
355 | } | ||
356 | |||
357 | exit: | ||
358 | exception_exit(prev_state); | ||
359 | return; | ||
360 | exit_trap: | ||
361 | /* | ||
362 | * This path out is for all the cases where we could not | ||
363 | * handle the exception in some way (like allocating a | ||
364 | * table or telling userspace about it). We will also end | ||
365 | * up here if the kernel has MPX turned off at compile | ||
366 | * time. | ||
367 | */ | ||
368 | do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); | ||
369 | exception_exit(prev_state); | ||
370 | } | ||
371 | |||
289 | dotraplinkage void | 372 | dotraplinkage void |
290 | do_general_protection(struct pt_regs *regs, long error_code) | 373 | do_general_protection(struct pt_regs *regs, long error_code) |
291 | { | 374 | { |
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5d1cbfe4ae58..8b96a947021f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c | |||
@@ -219,7 +219,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool | |||
219 | { | 219 | { |
220 | u32 volatile *good_insns; | 220 | u32 volatile *good_insns; |
221 | 221 | ||
222 | insn_init(insn, auprobe->insn, x86_64); | 222 | insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); |
223 | /* has the side-effect of processing the entire instruction */ | 223 | /* has the side-effect of processing the entire instruction */ |
224 | insn_get_length(insn); | 224 | insn_get_length(insn); |
225 | if (WARN_ON_ONCE(!insn_complete(insn))) | 225 | if (WARN_ON_ONCE(!insn_complete(insn))) |
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 54fcffed28ed..2480978b31cc 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c | |||
@@ -28,7 +28,7 @@ | |||
28 | 28 | ||
29 | /* Verify next sizeof(t) bytes can be on the same instruction */ | 29 | /* Verify next sizeof(t) bytes can be on the same instruction */ |
30 | #define validate_next(t, insn, n) \ | 30 | #define validate_next(t, insn, n) \ |
31 | ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) | 31 | ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr) |
32 | 32 | ||
33 | #define __get_next(t, insn) \ | 33 | #define __get_next(t, insn) \ |
34 | ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) | 34 | ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) |
@@ -50,10 +50,11 @@ | |||
50 | * @kaddr: address (in kernel memory) of instruction (or copy thereof) | 50 | * @kaddr: address (in kernel memory) of instruction (or copy thereof) |
51 | * @x86_64: !0 for 64-bit kernel or 64-bit app | 51 | * @x86_64: !0 for 64-bit kernel or 64-bit app |
52 | */ | 52 | */ |
53 | void insn_init(struct insn *insn, const void *kaddr, int x86_64) | 53 | void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) |
54 | { | 54 | { |
55 | memset(insn, 0, sizeof(*insn)); | 55 | memset(insn, 0, sizeof(*insn)); |
56 | insn->kaddr = kaddr; | 56 | insn->kaddr = kaddr; |
57 | insn->end_kaddr = kaddr + buf_len; | ||
57 | insn->next_byte = kaddr; | 58 | insn->next_byte = kaddr; |
58 | insn->x86_64 = x86_64 ? 1 : 0; | 59 | insn->x86_64 = x86_64 ? 1 : 0; |
59 | insn->opnd_bytes = 4; | 60 | insn->opnd_bytes = 4; |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 6a19ad9f370d..ecfdc46a024a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -30,3 +30,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o | |||
30 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | 30 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
31 | 31 | ||
32 | obj-$(CONFIG_MEMTEST) += memtest.o | 32 | obj-$(CONFIG_MEMTEST) += memtest.o |
33 | |||
34 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o | ||
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c new file mode 100644 index 000000000000..67ebf5751222 --- /dev/null +++ b/arch/x86/mm/mpx.c | |||
@@ -0,0 +1,928 @@ | |||
1 | /* | ||
2 | * mpx.c - Memory Protection eXtensions | ||
3 | * | ||
4 | * Copyright (c) 2014, Intel Corporation. | ||
5 | * Qiaowei Ren <qiaowei.ren@intel.com> | ||
6 | * Dave Hansen <dave.hansen@intel.com> | ||
7 | */ | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/syscalls.h> | ||
11 | #include <linux/sched/sysctl.h> | ||
12 | |||
13 | #include <asm/i387.h> | ||
14 | #include <asm/insn.h> | ||
15 | #include <asm/mman.h> | ||
16 | #include <asm/mmu_context.h> | ||
17 | #include <asm/mpx.h> | ||
18 | #include <asm/processor.h> | ||
19 | #include <asm/fpu-internal.h> | ||
20 | |||
21 | static const char *mpx_mapping_name(struct vm_area_struct *vma) | ||
22 | { | ||
23 | return "[mpx]"; | ||
24 | } | ||
25 | |||
26 | static struct vm_operations_struct mpx_vma_ops = { | ||
27 | .name = mpx_mapping_name, | ||
28 | }; | ||
29 | |||
30 | static int is_mpx_vma(struct vm_area_struct *vma) | ||
31 | { | ||
32 | return (vma->vm_ops == &mpx_vma_ops); | ||
33 | } | ||
34 | |||
35 | /* | ||
36 | * This is really a simplified "vm_mmap". it only handles MPX | ||
37 | * bounds tables (the bounds directory is user-allocated). | ||
38 | * | ||
39 | * Later on, we use the vma->vm_ops to uniquely identify these | ||
40 | * VMAs. | ||
41 | */ | ||
42 | static unsigned long mpx_mmap(unsigned long len) | ||
43 | { | ||
44 | unsigned long ret; | ||
45 | unsigned long addr, pgoff; | ||
46 | struct mm_struct *mm = current->mm; | ||
47 | vm_flags_t vm_flags; | ||
48 | struct vm_area_struct *vma; | ||
49 | |||
50 | /* Only bounds table and bounds directory can be allocated here */ | ||
51 | if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) | ||
52 | return -EINVAL; | ||
53 | |||
54 | down_write(&mm->mmap_sem); | ||
55 | |||
56 | /* Too many mappings? */ | ||
57 | if (mm->map_count > sysctl_max_map_count) { | ||
58 | ret = -ENOMEM; | ||
59 | goto out; | ||
60 | } | ||
61 | |||
62 | /* Obtain the address to map to. we verify (or select) it and ensure | ||
63 | * that it represents a valid section of the address space. | ||
64 | */ | ||
65 | addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); | ||
66 | if (addr & ~PAGE_MASK) { | ||
67 | ret = addr; | ||
68 | goto out; | ||
69 | } | ||
70 | |||
71 | vm_flags = VM_READ | VM_WRITE | VM_MPX | | ||
72 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | ||
73 | |||
74 | /* Set pgoff according to addr for anon_vma */ | ||
75 | pgoff = addr >> PAGE_SHIFT; | ||
76 | |||
77 | ret = mmap_region(NULL, addr, len, vm_flags, pgoff); | ||
78 | if (IS_ERR_VALUE(ret)) | ||
79 | goto out; | ||
80 | |||
81 | vma = find_vma(mm, ret); | ||
82 | if (!vma) { | ||
83 | ret = -ENOMEM; | ||
84 | goto out; | ||
85 | } | ||
86 | vma->vm_ops = &mpx_vma_ops; | ||
87 | |||
88 | if (vm_flags & VM_LOCKED) { | ||
89 | up_write(&mm->mmap_sem); | ||
90 | mm_populate(ret, len); | ||
91 | return ret; | ||
92 | } | ||
93 | |||
94 | out: | ||
95 | up_write(&mm->mmap_sem); | ||
96 | return ret; | ||
97 | } | ||
98 | |||
99 | enum reg_type { | ||
100 | REG_TYPE_RM = 0, | ||
101 | REG_TYPE_INDEX, | ||
102 | REG_TYPE_BASE, | ||
103 | }; | ||
104 | |||
105 | static int get_reg_offset(struct insn *insn, struct pt_regs *regs, | ||
106 | enum reg_type type) | ||
107 | { | ||
108 | int regno = 0; | ||
109 | |||
110 | static const int regoff[] = { | ||
111 | offsetof(struct pt_regs, ax), | ||
112 | offsetof(struct pt_regs, cx), | ||
113 | offsetof(struct pt_regs, dx), | ||
114 | offsetof(struct pt_regs, bx), | ||
115 | offsetof(struct pt_regs, sp), | ||
116 | offsetof(struct pt_regs, bp), | ||
117 | offsetof(struct pt_regs, si), | ||
118 | offsetof(struct pt_regs, di), | ||
119 | #ifdef CONFIG_X86_64 | ||
120 | offsetof(struct pt_regs, r8), | ||
121 | offsetof(struct pt_regs, r9), | ||
122 | offsetof(struct pt_regs, r10), | ||
123 | offsetof(struct pt_regs, r11), | ||
124 | offsetof(struct pt_regs, r12), | ||
125 | offsetof(struct pt_regs, r13), | ||
126 | offsetof(struct pt_regs, r14), | ||
127 | offsetof(struct pt_regs, r15), | ||
128 | #endif | ||
129 | }; | ||
130 | int nr_registers = ARRAY_SIZE(regoff); | ||
131 | /* | ||
132 | * Don't possibly decode a 32-bit instructions as | ||
133 | * reading a 64-bit-only register. | ||
134 | */ | ||
135 | if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) | ||
136 | nr_registers -= 8; | ||
137 | |||
138 | switch (type) { | ||
139 | case REG_TYPE_RM: | ||
140 | regno = X86_MODRM_RM(insn->modrm.value); | ||
141 | if (X86_REX_B(insn->rex_prefix.value) == 1) | ||
142 | regno += 8; | ||
143 | break; | ||
144 | |||
145 | case REG_TYPE_INDEX: | ||
146 | regno = X86_SIB_INDEX(insn->sib.value); | ||
147 | if (X86_REX_X(insn->rex_prefix.value) == 1) | ||
148 | regno += 8; | ||
149 | break; | ||
150 | |||
151 | case REG_TYPE_BASE: | ||
152 | regno = X86_SIB_BASE(insn->sib.value); | ||
153 | if (X86_REX_B(insn->rex_prefix.value) == 1) | ||
154 | regno += 8; | ||
155 | break; | ||
156 | |||
157 | default: | ||
158 | pr_err("invalid register type"); | ||
159 | BUG(); | ||
160 | break; | ||
161 | } | ||
162 | |||
163 | if (regno > nr_registers) { | ||
164 | WARN_ONCE(1, "decoded an instruction with an invalid register"); | ||
165 | return -EINVAL; | ||
166 | } | ||
167 | return regoff[regno]; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * return the address being referenced by the instruction | ||
172 | * for rm=3 returning the content of the rm reg | ||
173 | * for rm!=3 calculates the address using SIB and Disp | ||
174 | */ | ||
175 | static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) | ||
176 | { | ||
177 | unsigned long addr, base, indx; | ||
178 | int addr_offset, base_offset, indx_offset; | ||
179 | insn_byte_t sib; | ||
180 | |||
181 | insn_get_modrm(insn); | ||
182 | insn_get_sib(insn); | ||
183 | sib = insn->sib.value; | ||
184 | |||
185 | if (X86_MODRM_MOD(insn->modrm.value) == 3) { | ||
186 | addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); | ||
187 | if (addr_offset < 0) | ||
188 | goto out_err; | ||
189 | addr = regs_get_register(regs, addr_offset); | ||
190 | } else { | ||
191 | if (insn->sib.nbytes) { | ||
192 | base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); | ||
193 | if (base_offset < 0) | ||
194 | goto out_err; | ||
195 | |||
196 | indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); | ||
197 | if (indx_offset < 0) | ||
198 | goto out_err; | ||
199 | |||
200 | base = regs_get_register(regs, base_offset); | ||
201 | indx = regs_get_register(regs, indx_offset); | ||
202 | addr = base + indx * (1 << X86_SIB_SCALE(sib)); | ||
203 | } else { | ||
204 | addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); | ||
205 | if (addr_offset < 0) | ||
206 | goto out_err; | ||
207 | addr = regs_get_register(regs, addr_offset); | ||
208 | } | ||
209 | addr += insn->displacement.value; | ||
210 | } | ||
211 | return (void __user *)addr; | ||
212 | out_err: | ||
213 | return (void __user *)-1; | ||
214 | } | ||
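For the SIB case above, the computed address follows the usual x86 effective-address formula, base + index * 2^scale + displacement. A minimal stand-alone sketch of that arithmetic with made-up register contents (illustrative values, not from the patch):

	unsigned long base = 0x7f0000001000UL;	/* contents of the base register   */
	unsigned long indx = 0x10UL;		/* contents of the index register  */
	int scale = 3;				/* SIB.scale: index is scaled by 8 */
	long disp = -16;			/* sign-extended displacement      */

	unsigned long addr = base + indx * (1UL << scale) + disp;
	/* addr == 0x7f0000001070UL */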
215 | |||
216 | static int mpx_insn_decode(struct insn *insn, | ||
217 | struct pt_regs *regs) | ||
218 | { | ||
219 | unsigned char buf[MAX_INSN_SIZE]; | ||
220 | int x86_64 = !test_thread_flag(TIF_IA32); | ||
221 | int not_copied; | ||
222 | int nr_copied; | ||
223 | |||
224 | not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); | ||
225 | nr_copied = sizeof(buf) - not_copied; | ||
226 | /* | ||
227 | * The decoder _should_ fail nicely if we pass it a short buffer. | ||
228 | * But, let's not depend on that implementation detail. If we | ||
229 | * did not get anything, just error out now. | ||
230 | */ | ||
231 | if (!nr_copied) | ||
232 | return -EFAULT; | ||
233 | insn_init(insn, buf, nr_copied, x86_64); | ||
234 | insn_get_length(insn); | ||
235 | /* | ||
236 | * copy_from_user() tries to get as many bytes as we could see in | ||
237 | * the largest possible instruction. If the instruction we are | ||
238 | * after is shorter than that _and_ we attempt to copy from | ||
239 | * something unreadable, we might get a short read. This is OK | ||
240 | * as long as the read did not stop in the middle of the | ||
241 | * instruction. Check to see if we got a partial instruction. | ||
242 | */ | ||
243 | if (nr_copied < insn->length) | ||
244 | return -EFAULT; | ||
245 | |||
246 | insn_get_opcode(insn); | ||
247 | /* | ||
248 | * We only _really_ need to decode bndcl/bndcn/bndcu. | ||
249 | * Error out on anything else. | ||
250 | */ | ||
251 | if (insn->opcode.bytes[0] != 0x0f) | ||
252 | goto bad_opcode; | ||
253 | if ((insn->opcode.bytes[1] != 0x1a) && | ||
254 | (insn->opcode.bytes[1] != 0x1b)) | ||
255 | goto bad_opcode; | ||
256 | |||
257 | return 0; | ||
258 | bad_opcode: | ||
259 | return -EINVAL; | ||
260 | } | ||
261 | |||
262 | /* | ||
263 | * If a bounds overflow occurs then a #BR is generated. This | ||
264 | * function decodes the MPX instruction to get the violation | ||
265 | * address and stores it in the extended struct siginfo. | ||
266 | * | ||
267 | * Note that this is not a super precise way of doing this. | ||
268 | * Userspace could have, by the time we get here, written | ||
269 | * anything it wants into the instructions. We cannot | ||
270 | * trust anything about them. They might not be valid | ||
271 | * instructions or might encode invalid registers, etc... | ||
272 | * | ||
273 | * The caller is expected to kfree() the returned siginfo_t. | ||
274 | */ | ||
275 | siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, | ||
276 | struct xsave_struct *xsave_buf) | ||
277 | { | ||
278 | struct bndreg *bndregs, *bndreg; | ||
279 | siginfo_t *info = NULL; | ||
280 | struct insn insn; | ||
281 | uint8_t bndregno; | ||
282 | int err; | ||
283 | |||
284 | err = mpx_insn_decode(&insn, regs); | ||
285 | if (err) | ||
286 | goto err_out; | ||
287 | |||
288 | /* | ||
289 | * We know at this point that we are only dealing with | ||
290 | * MPX instructions. | ||
291 | */ | ||
292 | insn_get_modrm(&insn); | ||
293 | bndregno = X86_MODRM_REG(insn.modrm.value); | ||
294 | if (bndregno > 3) { | ||
295 | err = -EINVAL; | ||
296 | goto err_out; | ||
297 | } | ||
298 | /* get the bndregs _area_ of the xsave structure */ | ||
299 | bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); | ||
300 | if (!bndregs) { | ||
301 | err = -EINVAL; | ||
302 | goto err_out; | ||
303 | } | ||
304 | /* now go select the individual register in the set of 4 */ | ||
305 | bndreg = &bndregs[bndregno]; | ||
306 | |||
307 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
308 | if (!info) { | ||
309 | err = -ENOMEM; | ||
310 | goto err_out; | ||
311 | } | ||
312 | /* | ||
313 | * The registers are always 64-bit, but the upper 32 | ||
314 | * bits are ignored in 32-bit mode. Also, note that the | ||
315 | * upper bounds are architecturally represented in 1's | ||
316 | * complement form. | ||
317 | * | ||
318 | * The 'unsigned long' cast is because the compiler | ||
319 | * complains when casting from integers to different-size | ||
320 | * pointers. | ||
321 | */ | ||
322 | info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; | ||
323 | info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; | ||
324 | info->si_addr_lsb = 0; | ||
325 | info->si_signo = SIGSEGV; | ||
326 | info->si_errno = 0; | ||
327 | info->si_code = SEGV_BNDERR; | ||
328 | info->si_addr = mpx_get_addr_ref(&insn, regs); | ||
329 | /* | ||
330 | * We were not able to extract an address from the instruction, | ||
331 | * probably because there was something invalid in it. | ||
332 | */ | ||
333 | if (info->si_addr == (void *)-1) { | ||
334 | err = -EINVAL; | ||
335 | goto err_out; | ||
336 | } | ||
337 | return info; | ||
338 | err_out: | ||
339 | /* info might be NULL, but kfree() handles that */ | ||
340 | kfree(info); | ||
341 | return ERR_PTR(err); | ||
342 | } | ||
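On the receiving side, the process sees this siginfo in a SIGSEGV handler with si_code == SEGV_BNDERR. A rough userspace sketch (it assumes a libc that already exposes the si_lower/si_upper fields added by this patch; fprintf() is for illustration only, as it is not async-signal-safe):

	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void bnd_handler(int sig, siginfo_t *si, void *ctx)
	{
		if (si->si_code == SEGV_BNDERR)
			fprintf(stderr, "bounds violation at %p, bounds [%p, %p]\n",
				si->si_addr, si->si_lower, si->si_upper);
		_exit(1);
	}

	int main(void)
	{
		struct sigaction sa = {
			.sa_sigaction	= bnd_handler,
			.sa_flags	= SA_SIGINFO,
		};

		sigaction(SIGSEGV, &sa, NULL);
		/* ... run MPX-instrumented code that triggers a #BR ... */
		return 0;
	}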
343 | |||
344 | static __user void *task_get_bounds_dir(struct task_struct *tsk) | ||
345 | { | ||
346 | struct bndcsr *bndcsr; | ||
347 | |||
348 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) | ||
349 | return MPX_INVALID_BOUNDS_DIR; | ||
350 | |||
351 | /* | ||
352 | * The bounds directory pointer is stored in a register | ||
353 | * only accessible if we first do an xsave. | ||
354 | */ | ||
355 | fpu_save_init(&tsk->thread.fpu); | ||
356 | bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); | ||
357 | if (!bndcsr) | ||
358 | return MPX_INVALID_BOUNDS_DIR; | ||
359 | |||
360 | /* | ||
361 | * Make sure the register looks valid by checking the | ||
362 | * enable bit. | ||
363 | */ | ||
364 | if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) | ||
365 | return MPX_INVALID_BOUNDS_DIR; | ||
366 | |||
367 | /* | ||
368 | * Lastly, mask off the low bits used for configuration | ||
369 | * flags, and return the address of the bounds directory. | ||
370 | */ | ||
371 | return (void __user *)(unsigned long) | ||
372 | (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); | ||
373 | } | ||
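The masking above follows the BNDCFGU layout described in the ISA extensions reference (an assumption taken from the spec, not something this patch defines): bit 0 is the enable bit, bit 1 the bounds-preserve bit, and bits 63:12 hold the 4KB-aligned bounds directory base. A stand-alone sketch of the same extraction:

	/* Extract the bounds directory base from a saved BNDCFGU value. */
	static inline unsigned long bndcfgu_bd_base(unsigned long bndcfgu)
	{
		if (!(bndcfgu & 0x1))		/* bit 0: MPX enabled for user space */
			return 0;
		return bndcfgu & ~0xfffUL;	/* bits 63:12: 4KB-aligned directory base */
	}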
374 | |||
375 | int mpx_enable_management(struct task_struct *tsk) | ||
376 | { | ||
377 | void __user *bd_base = MPX_INVALID_BOUNDS_DIR; | ||
378 | struct mm_struct *mm = tsk->mm; | ||
379 | int ret = 0; | ||
380 | |||
381 | /* | ||
382 | * The runtime in userspace is responsible for allocating the | ||
383 | * bounds directory. It will then save the base of the bounds | ||
384 | * directory into the XSAVE/XRSTOR Save Area and enable MPX | ||
385 | * through the XRSTOR instruction. | ||
386 | * | ||
387 | * fpu_xsave() is expected to be very expensive. Storing the bounds | ||
388 | * directory here means that we do not have to do xsave in the unmap | ||
389 | * path; we can just use mm->bd_addr instead. | ||
390 | */ | ||
391 | bd_base = task_get_bounds_dir(tsk); | ||
392 | down_write(&mm->mmap_sem); | ||
393 | mm->bd_addr = bd_base; | ||
394 | if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) | ||
395 | ret = -ENXIO; | ||
396 | |||
397 | up_write(&mm->mmap_sem); | ||
398 | return ret; | ||
399 | } | ||
400 | |||
401 | int mpx_disable_management(struct task_struct *tsk) | ||
402 | { | ||
403 | struct mm_struct *mm = current->mm; | ||
404 | |||
405 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) | ||
406 | return -ENXIO; | ||
407 | |||
408 | down_write(&mm->mmap_sem); | ||
409 | mm->bd_addr = MPX_INVALID_BOUNDS_DIR; | ||
410 | up_write(&mm->mmap_sem); | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | /* | ||
415 | * In 32-bit mode the bounds directory is 4MB and each bounds | ||
416 | * table (MPX_BT_SIZE_BYTES) is 16KB. In 64-bit mode the bounds | ||
417 | * directory is 2GB and each bounds table is 4MB. | ||
418 | */ | ||
419 | static int allocate_bt(long __user *bd_entry) | ||
420 | { | ||
421 | unsigned long expected_old_val = 0; | ||
422 | unsigned long actual_old_val = 0; | ||
423 | unsigned long bt_addr; | ||
424 | int ret = 0; | ||
425 | |||
426 | /* | ||
427 | * Carve the virtual space out of userspace for the new | ||
428 | * bounds table: | ||
429 | */ | ||
430 | bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); | ||
431 | if (IS_ERR((void *)bt_addr)) | ||
432 | return PTR_ERR((void *)bt_addr); | ||
433 | /* | ||
434 | * Set the valid flag (kinda like _PAGE_PRESENT in a pte) | ||
435 | */ | ||
436 | bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; | ||
437 | |||
438 | /* | ||
439 | * Go poke the address of the new bounds table in to the | ||
440 | * bounds directory entry out in userspace memory. Note: | ||
441 | * we may race with another CPU instantiating the same table. | ||
442 | * In that case the cmpxchg will see an unexpected | ||
443 | * 'actual_old_val'. | ||
444 | * | ||
445 | * This can fault, but that's OK because we do not hold | ||
446 | * mmap_sem at this point, unlike some of the other parts | ||
447 | * of the MPX code that have to pagefault_disable(). | ||
448 | */ | ||
449 | ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, | ||
450 | expected_old_val, bt_addr); | ||
451 | if (ret) | ||
452 | goto out_unmap; | ||
453 | |||
454 | /* | ||
455 | * The user_atomic_cmpxchg_inatomic() will only return nonzero | ||
456 | * for faults, *not* if the cmpxchg itself fails. Now we must | ||
457 | * verify that the cmpxchg itself completed successfully. | ||
458 | */ | ||
459 | /* | ||
460 | * We expected an empty 'expected_old_val', but instead found | ||
461 | * an apparently valid entry. Assume we raced with another | ||
462 | * thread to instantiate this table and declare success. | ||
463 | */ | ||
464 | if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { | ||
465 | ret = 0; | ||
466 | goto out_unmap; | ||
467 | } | ||
468 | /* | ||
469 | * We found a non-empty bd_entry but it did not have the | ||
470 | * VALID_FLAG set. Return an error which will result in | ||
471 | * a SEGV since this probably means that somebody scribbled | ||
472 | * some invalid data into the bounds directory. | ||
473 | */ | ||
474 | if (expected_old_val != actual_old_val) { | ||
475 | ret = -EINVAL; | ||
476 | goto out_unmap; | ||
477 | } | ||
478 | return 0; | ||
479 | out_unmap: | ||
480 | vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); | ||
481 | return ret; | ||
482 | } | ||
483 | |||
484 | /* | ||
485 | * When a BNDSTX instruction attempts to save bounds to a bounds | ||
486 | * table, it will first attempt to look up the table in the | ||
487 | * first-level bounds directory. If it does not find a table in | ||
488 | * the directory, a #BR is generated and we get here in order to | ||
489 | * allocate a new table. | ||
490 | * | ||
491 | * In 32-bit mode, the size of the bounds directory (BD) is 4MB | ||
492 | * and the size of each bounds table is 16KB. In 64-bit mode, the | ||
493 | * BD is 2GB and each bounds table is 4MB. | ||
494 | */ | ||
495 | static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) | ||
496 | { | ||
497 | unsigned long bd_entry, bd_base; | ||
498 | struct bndcsr *bndcsr; | ||
499 | |||
500 | bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); | ||
501 | if (!bndcsr) | ||
502 | return -EINVAL; | ||
503 | /* | ||
504 | * Mask off the preserve and enable bits | ||
505 | */ | ||
506 | bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; | ||
507 | /* | ||
508 | * The hardware provides the address of the missing or invalid | ||
509 | * entry via BNDSTATUS, so we don't have to go look it up. | ||
510 | */ | ||
511 | bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; | ||
512 | /* | ||
513 | * Make sure the directory entry is within where we think | ||
514 | * the directory is. | ||
515 | */ | ||
516 | if ((bd_entry < bd_base) || | ||
517 | (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) | ||
518 | return -EINVAL; | ||
519 | |||
520 | return allocate_bt((long __user *)bd_entry); | ||
521 | } | ||
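For reference, the directory and table sizes quoted in the comments fall out of the MPX layout in the ISA extensions reference (spec-derived numbers, not constants defined by this patch): the 64-bit bounds directory holds 2^28 8-byte entries and each bounds table 2^17 32-byte entries; in 32-bit mode it is 2^20 4-byte directory entries and 2^10 16-byte table entries:

	#define BD_SIZE_64	((1UL << 28) * 8)	/* 2GB bounds directory  */
	#define BT_SIZE_64	((1UL << 17) * 32)	/* 4MB per bounds table  */
	#define BD_SIZE_32	((1UL << 20) * 4)	/* 4MB bounds directory  */
	#define BT_SIZE_32	((1UL << 10) * 16)	/* 16KB per bounds table */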
522 | |||
523 | int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) | ||
524 | { | ||
525 | /* | ||
526 | * Userspace never asked us to manage the bounds tables, | ||
527 | * so refuse to help. | ||
528 | */ | ||
529 | if (!kernel_managing_mpx_tables(current->mm)) | ||
530 | return -EINVAL; | ||
531 | |||
532 | if (do_mpx_bt_fault(xsave_buf)) { | ||
533 | force_sig(SIGSEGV, current); | ||
534 | /* | ||
535 | * The force_sig() is essentially "handling" this | ||
536 | * exception, so we do not pass up the error | ||
537 | * from do_mpx_bt_fault(). | ||
538 | */ | ||
539 | } | ||
540 | return 0; | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * A thin wrapper around get_user_pages(). Returns 0 if the | ||
545 | * fault was resolved or -errno if not. | ||
546 | */ | ||
547 | static int mpx_resolve_fault(long __user *addr, int write) | ||
548 | { | ||
549 | long gup_ret; | ||
550 | int nr_pages = 1; | ||
551 | int force = 0; | ||
552 | |||
553 | gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, | ||
554 | nr_pages, write, force, NULL, NULL); | ||
555 | /* | ||
556 | * get_user_pages() returns the number of pages gotten. | ||
557 | * 0 means we failed to fault in and get anything, | ||
558 | * probably because 'addr' is bad. | ||
559 | */ | ||
560 | if (!gup_ret) | ||
561 | return -EFAULT; | ||
562 | /* Other error, return it */ | ||
563 | if (gup_ret < 0) | ||
564 | return gup_ret; | ||
565 | /* must have gup'd a page and gup_ret>0, success */ | ||
566 | return 0; | ||
567 | } | ||
568 | |||
569 | /* | ||
570 | * Get the base of the bounds table pointed to by a specific | ||
571 | * bounds directory entry. | ||
572 | */ | ||
573 | static int get_bt_addr(struct mm_struct *mm, | ||
574 | long __user *bd_entry, unsigned long *bt_addr) | ||
575 | { | ||
576 | int ret; | ||
577 | int valid_bit; | ||
578 | |||
579 | if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry))) | ||
580 | return -EFAULT; | ||
581 | |||
582 | while (1) { | ||
583 | int need_write = 0; | ||
584 | |||
585 | pagefault_disable(); | ||
586 | ret = get_user(*bt_addr, bd_entry); | ||
587 | pagefault_enable(); | ||
588 | if (!ret) | ||
589 | break; | ||
590 | if (ret == -EFAULT) | ||
591 | ret = mpx_resolve_fault(bd_entry, need_write); | ||
592 | /* | ||
593 | * If we could not resolve the fault, consider it | ||
594 | * userspace's fault and error out. | ||
595 | */ | ||
596 | if (ret) | ||
597 | return ret; | ||
598 | } | ||
599 | |||
600 | valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG; | ||
601 | *bt_addr &= MPX_BT_ADDR_MASK; | ||
602 | |||
603 | /* | ||
604 | * When the kernel is managing bounds tables, a bounds directory | ||
605 | * entry will either have a valid address (plus the valid bit) | ||
606 | * *OR* be completely empty. If we see a !valid entry *and* some | ||
607 | * data in the address field, we know something is wrong. This | ||
608 | * -EINVAL return will cause a SIGSEGV. | ||
609 | */ | ||
610 | if (!valid_bit && *bt_addr) | ||
611 | return -EINVAL; | ||
612 | /* | ||
613 | * Do we have a completely zeroed bt entry? That is OK. It | ||
614 | * just means there was no bounds table for this memory. Make | ||
615 | * sure to distinguish this from -EINVAL, which will cause | ||
616 | * a SEGV. | ||
617 | */ | ||
618 | if (!valid_bit) | ||
619 | return -ENOENT; | ||
620 | |||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | /* | ||
625 | * Free the backing physical pages of bounds table 'bt_addr'. | ||
626 | * Assume start...end is within that bounds table. | ||
627 | */ | ||
628 | static int zap_bt_entries(struct mm_struct *mm, | ||
629 | unsigned long bt_addr, | ||
630 | unsigned long start, unsigned long end) | ||
631 | { | ||
632 | struct vm_area_struct *vma; | ||
633 | unsigned long addr, len; | ||
634 | |||
635 | /* | ||
636 | * Find the first overlapping vma. If vma->vm_start > start, there | ||
637 | * will be a hole in the bounds table. This -EINVAL return will | ||
638 | * cause a SIGSEGV. | ||
639 | */ | ||
640 | vma = find_vma(mm, start); | ||
641 | if (!vma || vma->vm_start > start) | ||
642 | return -EINVAL; | ||
643 | |||
644 | /* | ||
645 | * A NUMA policy on a VM_MPX VMA could cause this bounds table to | ||
646 | * be split. So we need to look across the entire 'start -> end' | ||
647 | * range of this bounds table, find all of the VM_MPX VMAs, and | ||
648 | * zap only those. | ||
649 | */ | ||
650 | addr = start; | ||
651 | while (vma && vma->vm_start < end) { | ||
652 | /* | ||
653 | * We followed a bounds directory entry down | ||
654 | * here. If we find a non-MPX VMA, that's bad, | ||
655 | * so stop immediately and return an error. This | ||
656 | * probably results in a SIGSEGV. | ||
657 | */ | ||
658 | if (!is_mpx_vma(vma)) | ||
659 | return -EINVAL; | ||
660 | |||
661 | len = min(vma->vm_end, end) - addr; | ||
662 | zap_page_range(vma, addr, len, NULL); | ||
663 | |||
664 | vma = vma->vm_next; | ||
665 | addr = vma->vm_start; | ||
666 | } | ||
667 | |||
668 | return 0; | ||
669 | } | ||
670 | |||
671 | static int unmap_single_bt(struct mm_struct *mm, | ||
672 | long __user *bd_entry, unsigned long bt_addr) | ||
673 | { | ||
674 | unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; | ||
675 | unsigned long actual_old_val = 0; | ||
676 | int ret; | ||
677 | |||
678 | while (1) { | ||
679 | int need_write = 1; | ||
680 | |||
681 | pagefault_disable(); | ||
682 | ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, | ||
683 | expected_old_val, 0); | ||
684 | pagefault_enable(); | ||
685 | if (!ret) | ||
686 | break; | ||
687 | if (ret == -EFAULT) | ||
688 | ret = mpx_resolve_fault(bd_entry, need_write); | ||
689 | /* | ||
690 | * If we could not resolve the fault, consider it | ||
691 | * userspace's fault and error out. | ||
692 | */ | ||
693 | if (ret) | ||
694 | return ret; | ||
695 | } | ||
696 | /* | ||
697 | * The cmpxchg was performed, check the results. | ||
698 | */ | ||
699 | if (actual_old_val != expected_old_val) { | ||
700 | /* | ||
701 | * Someone else raced with us to unmap the table. | ||
702 | * There was no bounds table pointed to by the | ||
703 | * directory, so declare success. Somebody freed | ||
704 | * it. | ||
705 | */ | ||
706 | if (!actual_old_val) | ||
707 | return 0; | ||
708 | /* | ||
709 | * Something messed with the bounds directory | ||
710 | * entry. We hold mmap_sem for read or write | ||
711 | * here, so it could not be a _new_ bounds table | ||
712 | * that someone just allocated. Something is | ||
713 | * wrong, so pass up the error and SIGSEGV. | ||
714 | */ | ||
715 | return -EINVAL; | ||
716 | } | ||
717 | |||
718 | /* | ||
719 | * Note that we are likely being called under do_munmap() already. To | ||
720 | * avoid recursion, do_munmap() (via arch_unmap()) checks the VM_MPX | ||
721 | * flag and skips any VMA that is itself a bounds table. | ||
722 | */ | ||
723 | return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); | ||
724 | } | ||
725 | |||
726 | /* | ||
727 | * If the bounds table pointed to by the bounds directory entry | ||
728 | * 'bd_entry' is not shared, unmap this whole bounds table. Otherwise, | ||
729 | * free only the backing physical pages of the bounds table entries | ||
730 | * covered by the virtual address region start...end. | ||
731 | */ | ||
732 | static int unmap_shared_bt(struct mm_struct *mm, | ||
733 | long __user *bd_entry, unsigned long start, | ||
734 | unsigned long end, bool prev_shared, bool next_shared) | ||
735 | { | ||
736 | unsigned long bt_addr; | ||
737 | int ret; | ||
738 | |||
739 | ret = get_bt_addr(mm, bd_entry, &bt_addr); | ||
740 | /* | ||
741 | * We could see an "error" ret for not-present bounds | ||
742 | * tables (not really an error), or actual errors, but | ||
743 | * stop unmapping either way. | ||
744 | */ | ||
745 | if (ret) | ||
746 | return ret; | ||
747 | |||
748 | if (prev_shared && next_shared) | ||
749 | ret = zap_bt_entries(mm, bt_addr, | ||
750 | bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), | ||
751 | bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); | ||
752 | else if (prev_shared) | ||
753 | ret = zap_bt_entries(mm, bt_addr, | ||
754 | bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), | ||
755 | bt_addr+MPX_BT_SIZE_BYTES); | ||
756 | else if (next_shared) | ||
757 | ret = zap_bt_entries(mm, bt_addr, bt_addr, | ||
758 | bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); | ||
759 | else | ||
760 | ret = unmap_single_bt(mm, bd_entry, bt_addr); | ||
761 | |||
762 | return ret; | ||
763 | } | ||
764 | |||
765 | /* | ||
766 | * A virtual address region being munmap()ed might share a bounds | ||
767 | * table with adjacent VMAs. We only need to free the backing physical | ||
768 | * memory of the shared bounds table entries covered by this virtual | ||
769 | * address region. | ||
770 | */ | ||
771 | static int unmap_edge_bts(struct mm_struct *mm, | ||
772 | unsigned long start, unsigned long end) | ||
773 | { | ||
774 | int ret; | ||
775 | long __user *bde_start, *bde_end; | ||
776 | struct vm_area_struct *prev, *next; | ||
777 | bool prev_shared = false, next_shared = false; | ||
778 | |||
779 | bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); | ||
780 | bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); | ||
781 | |||
782 | /* | ||
783 | * Check whether bde_start and bde_end are shared with adjacent | ||
784 | * VMAs. | ||
785 | * | ||
786 | * We already unlinked the VMAs from the mm's rbtree so 'start' | ||
787 | * is guaranteed to be in a hole. This gets us the first VMA | ||
788 | * before the hole into 'prev' and the next VMA after the hole | ||
789 | * into 'next'. | ||
790 | */ | ||
791 | next = find_vma_prev(mm, start, &prev); | ||
792 | if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) | ||
793 | == bde_start) | ||
794 | prev_shared = true; | ||
795 | if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) | ||
796 | == bde_end) | ||
797 | next_shared = true; | ||
798 | |||
799 | /* | ||
800 | * This virtual address region being munmap()ed is only | ||
801 | * covered by one bounds table. | ||
802 | * | ||
803 | * In this case, if this table is also shared with adjacent | ||
804 | * VMAs, only part of the backing physical memory of the bounds | ||
805 | * table needs to be freed. Otherwise the whole bounds table needs | ||
806 | * to be unmapped. | ||
807 | */ | ||
808 | if (bde_start == bde_end) { | ||
809 | return unmap_shared_bt(mm, bde_start, start, end, | ||
810 | prev_shared, next_shared); | ||
811 | } | ||
812 | |||
813 | /* | ||
814 | * If more than one bounds table is covered by this virtual | ||
815 | * address region being munmap()ed, we need to separately check | ||
816 | * whether bde_start and bde_end are shared with adjacent VMAs. | ||
817 | */ | ||
818 | ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); | ||
819 | if (ret) | ||
820 | return ret; | ||
821 | ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); | ||
822 | if (ret) | ||
823 | return ret; | ||
824 | |||
825 | return 0; | ||
826 | } | ||
827 | |||
828 | static int mpx_unmap_tables(struct mm_struct *mm, | ||
829 | unsigned long start, unsigned long end) | ||
830 | { | ||
831 | int ret; | ||
832 | long __user *bd_entry, *bde_start, *bde_end; | ||
833 | unsigned long bt_addr; | ||
834 | |||
835 | /* | ||
836 | * "Edge" bounds tables are those which are being used by the region | ||
837 | * (start -> end), but that may be shared with adjacent areas. If they | ||
838 | * turn out to be completely unshared, they will be freed. If they are | ||
839 | * shared, we will free the backing store (like an MADV_DONTNEED) for | ||
840 | * areas used by this region. | ||
841 | */ | ||
842 | ret = unmap_edge_bts(mm, start, end); | ||
843 | switch (ret) { | ||
844 | /* non-present tables are OK */ | ||
845 | case 0: | ||
846 | case -ENOENT: | ||
847 | /* Success, or no tables to unmap */ | ||
848 | break; | ||
849 | case -EINVAL: | ||
850 | case -EFAULT: | ||
851 | default: | ||
852 | return ret; | ||
853 | } | ||
854 | |||
855 | /* | ||
856 | * Only unmap the bounds tables that are: | ||
857 | * 1. fully covered | ||
858 | * 2. not at the edges of the mapping, even if fully aligned | ||
859 | */ | ||
860 | bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); | ||
861 | bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); | ||
862 | for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { | ||
863 | ret = get_bt_addr(mm, bd_entry, &bt_addr); | ||
864 | switch (ret) { | ||
865 | case 0: | ||
866 | break; | ||
867 | case -ENOENT: | ||
868 | /* No table here, try the next one */ | ||
869 | continue; | ||
870 | case -EINVAL: | ||
871 | case -EFAULT: | ||
872 | default: | ||
873 | /* | ||
874 | * Note: we are being strict here. | ||
875 | * Any time we run into an issue | ||
876 | * unmapping tables, we stop and | ||
877 | * SIGSEGV. | ||
878 | */ | ||
879 | return ret; | ||
880 | } | ||
881 | |||
882 | ret = unmap_single_bt(mm, bd_entry, bt_addr); | ||
883 | if (ret) | ||
884 | return ret; | ||
885 | } | ||
886 | |||
887 | return 0; | ||
888 | } | ||
889 | |||
890 | /* | ||
891 | * Free unused bounds tables covered in a virtual address region being | ||
892 | * munmap()ed. Assume end > start. | ||
893 | * | ||
894 | * This function will be called by do_munmap(), and the VMAs covering | ||
895 | * the virtual address region start...end have already been split if | ||
896 | * necessary; 'vma' is the first vma in this range (start -> end). | ||
897 | */ | ||
898 | void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, | ||
899 | unsigned long start, unsigned long end) | ||
900 | { | ||
901 | int ret; | ||
902 | |||
903 | /* | ||
904 | * Refuse to do anything unless userspace has asked | ||
905 | * the kernel to help manage the bounds tables. | ||
906 | */ | ||
907 | if (!kernel_managing_mpx_tables(current->mm)) | ||
908 | return; | ||
909 | /* | ||
910 | * This will look across the entire 'start -> end' range | ||
911 | * for any VM_MPX VMAs. | ||
912 | * | ||
913 | * To avoid recursion, if a VM_MPX vma is found in the range | ||
914 | * (start->end), we will not continue the follow-up work. This | ||
915 | * recursion represents having bounds tables for bounds tables, | ||
916 | * which should not occur normally. Being strict about it here | ||
917 | * helps ensure that we do not have an exploitable stack overflow. | ||
918 | */ | ||
919 | do { | ||
920 | if (vma->vm_flags & VM_MPX) | ||
921 | return; | ||
922 | vma = vma->vm_next; | ||
923 | } while (vma && vma->vm_start < end); | ||
924 | |||
925 | ret = mpx_unmap_tables(mm, start, end); | ||
926 | if (ret) | ||
927 | force_sig(SIGSEGV, current); | ||
928 | } | ||
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 872eb60e7806..ba70ff232917 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c | |||
@@ -254,7 +254,7 @@ int main(int argc, char **argv) | |||
254 | continue; | 254 | continue; |
255 | 255 | ||
256 | /* Decode an instruction */ | 256 | /* Decode an instruction */ |
257 | insn_init(&insn, insn_buf, x86_64); | 257 | insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); |
258 | insn_get_length(&insn); | 258 | insn_get_length(&insn); |
259 | 259 | ||
260 | if (insn.next_byte <= insn.kaddr || | 260 | if (insn.next_byte <= insn.kaddr || |
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 13403fc95a96..56f04db0c9c0 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c | |||
@@ -149,7 +149,7 @@ int main(int argc, char **argv) | |||
149 | break; | 149 | break; |
150 | } | 150 | } |
151 | /* Decode an instruction */ | 151 | /* Decode an instruction */ |
152 | insn_init(&insn, insn_buf, x86_64); | 152 | insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); |
153 | insn_get_length(&insn); | 153 | insn_get_length(&insn); |
154 | if (insn.length != nb) { | 154 | if (insn.length != nb) { |
155 | warnings++; | 155 | warnings++; |
@@ -277,6 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) | |||
277 | goto err; | 277 | goto err; |
278 | 278 | ||
279 | mm->stack_vm = mm->total_vm = 1; | 279 | mm->stack_vm = mm->total_vm = 1; |
280 | arch_bprm_mm_init(mm, vma); | ||
280 | up_write(&mm->mmap_sem); | 281 | up_write(&mm->mmap_sem); |
281 | bprm->p = vma->vm_end - sizeof(void *); | 282 | bprm->p = vma->vm_end - sizeof(void *); |
282 | return 0; | 283 | return 0; |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4e0388cffe3d..f6734c6b66a6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -552,6 +552,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
552 | [ilog2(VM_GROWSDOWN)] = "gd", | 552 | [ilog2(VM_GROWSDOWN)] = "gd", |
553 | [ilog2(VM_PFNMAP)] = "pf", | 553 | [ilog2(VM_PFNMAP)] = "pf", |
554 | [ilog2(VM_DENYWRITE)] = "dw", | 554 | [ilog2(VM_DENYWRITE)] = "dw", |
555 | #ifdef CONFIG_X86_INTEL_MPX | ||
556 | [ilog2(VM_MPX)] = "mp", | ||
557 | #endif | ||
555 | [ilog2(VM_LOCKED)] = "lo", | 558 | [ilog2(VM_LOCKED)] = "lo", |
556 | [ilog2(VM_IO)] = "io", | 559 | [ilog2(VM_IO)] = "io", |
557 | [ilog2(VM_SEQ_READ)] = "sr", | 560 | [ilog2(VM_SEQ_READ)] = "sr", |
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index 67dea8123683..866aa461efa5 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Define generic no-op hooks for arch_dup_mmap and arch_exit_mmap, to | 2 | * Define generic no-op hooks for arch_dup_mmap, arch_exit_mmap |
3 | * be included in asm-FOO/mmu_context.h for any arch FOO which doesn't | 3 | * and arch_unmap to be included in asm-FOO/mmu_context.h for any |
4 | * need to hook these. | 4 | * arch FOO which doesn't need to hook these. |
5 | */ | 5 | */ |
6 | #ifndef _ASM_GENERIC_MM_HOOKS_H | 6 | #ifndef _ASM_GENERIC_MM_HOOKS_H |
7 | #define _ASM_GENERIC_MM_HOOKS_H | 7 | #define _ASM_GENERIC_MM_HOOKS_H |
@@ -15,4 +15,15 @@ static inline void arch_exit_mmap(struct mm_struct *mm) | |||
15 | { | 15 | { |
16 | } | 16 | } |
17 | 17 | ||
18 | static inline void arch_unmap(struct mm_struct *mm, | ||
19 | struct vm_area_struct *vma, | ||
20 | unsigned long start, unsigned long end) | ||
21 | { | ||
22 | } | ||
23 | |||
24 | static inline void arch_bprm_mm_init(struct mm_struct *mm, | ||
25 | struct vm_area_struct *vma) | ||
26 | { | ||
27 | } | ||
28 | |||
18 | #endif /* _ASM_GENERIC_MM_HOOKS_H */ | 29 | #endif /* _ASM_GENERIC_MM_HOOKS_H */ |
diff --git a/include/linux/mm.h b/include/linux/mm.h index b46461116cd2..f7606d3a0915 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -128,6 +128,7 @@ extern unsigned int kobjsize(const void *objp); | |||
128 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ | 128 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ |
129 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ | 129 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ |
130 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ | 130 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ |
131 | #define VM_ARCH_2 0x02000000 | ||
131 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ | 132 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ |
132 | 133 | ||
133 | #ifdef CONFIG_MEM_SOFT_DIRTY | 134 | #ifdef CONFIG_MEM_SOFT_DIRTY |
@@ -155,6 +156,11 @@ extern unsigned int kobjsize(const void *objp); | |||
155 | # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ | 156 | # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ |
156 | #endif | 157 | #endif |
157 | 158 | ||
159 | #if defined(CONFIG_X86) | ||
160 | /* MPX specific bounds table or bounds directory */ | ||
161 | # define VM_MPX VM_ARCH_2 | ||
162 | #endif | ||
163 | |||
158 | #ifndef VM_GROWSUP | 164 | #ifndef VM_GROWSUP |
159 | # define VM_GROWSUP VM_NONE | 165 | # define VM_GROWSUP VM_NONE |
160 | #endif | 166 | #endif |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e0b286649f1..004e9d17b47e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -454,6 +454,10 @@ struct mm_struct { | |||
454 | bool tlb_flush_pending; | 454 | bool tlb_flush_pending; |
455 | #endif | 455 | #endif |
456 | struct uprobes_state uprobes_state; | 456 | struct uprobes_state uprobes_state; |
457 | #ifdef CONFIG_X86_INTEL_MPX | ||
458 | /* address of the bounds directory */ | ||
459 | void __user *bd_addr; | ||
460 | #endif | ||
457 | }; | 461 | }; |
458 | 462 | ||
459 | static inline void mm_init_cpumask(struct mm_struct *mm) | 463 | static inline void mm_init_cpumask(struct mm_struct *mm) |
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index ba5be7fdbdfe..1e3552037a5a 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h | |||
@@ -91,6 +91,10 @@ typedef struct siginfo { | |||
91 | int _trapno; /* TRAP # which caused the signal */ | 91 | int _trapno; /* TRAP # which caused the signal */ |
92 | #endif | 92 | #endif |
93 | short _addr_lsb; /* LSB of the reported address */ | 93 | short _addr_lsb; /* LSB of the reported address */ |
94 | struct { | ||
95 | void __user *_lower; | ||
96 | void __user *_upper; | ||
97 | } _addr_bnd; | ||
94 | } _sigfault; | 98 | } _sigfault; |
95 | 99 | ||
96 | /* SIGPOLL */ | 100 | /* SIGPOLL */ |
@@ -131,6 +135,8 @@ typedef struct siginfo { | |||
131 | #define si_trapno _sifields._sigfault._trapno | 135 | #define si_trapno _sifields._sigfault._trapno |
132 | #endif | 136 | #endif |
133 | #define si_addr_lsb _sifields._sigfault._addr_lsb | 137 | #define si_addr_lsb _sifields._sigfault._addr_lsb |
138 | #define si_lower _sifields._sigfault._addr_bnd._lower | ||
139 | #define si_upper _sifields._sigfault._addr_bnd._upper | ||
134 | #define si_band _sifields._sigpoll._band | 140 | #define si_band _sifields._sigpoll._band |
135 | #define si_fd _sifields._sigpoll._fd | 141 | #define si_fd _sifields._sigpoll._fd |
136 | #ifdef __ARCH_SIGSYS | 142 | #ifdef __ARCH_SIGSYS |
@@ -199,7 +205,8 @@ typedef struct siginfo { | |||
199 | */ | 205 | */ |
200 | #define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ | 206 | #define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ |
201 | #define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ | 207 | #define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ |
202 | #define NSIGSEGV 2 | 208 | #define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ |
209 | #define NSIGSEGV 3 | ||
203 | 210 | ||
204 | /* | 211 | /* |
205 | * SIGBUS si_codes | 212 | * SIGBUS si_codes |
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 513df75d0fc9..89f63503f903 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h | |||
@@ -179,4 +179,10 @@ struct prctl_mm_map { | |||
179 | #define PR_SET_THP_DISABLE 41 | 179 | #define PR_SET_THP_DISABLE 41 |
180 | #define PR_GET_THP_DISABLE 42 | 180 | #define PR_GET_THP_DISABLE 42 |
181 | 181 | ||
182 | /* | ||
183 | * Tell the kernel to start/stop helping userspace manage bounds tables. | ||
184 | */ | ||
185 | #define PR_MPX_ENABLE_MANAGEMENT 43 | ||
186 | #define PR_MPX_DISABLE_MANAGEMENT 44 | ||
187 | |||
182 | #endif /* _LINUX_PRCTL_H */ | 188 | #endif /* _LINUX_PRCTL_H */ |
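With these constants in place, enabling kernel management from userspace is a single prctl() call, made once the runtime has saved the bounds directory base into the XSAVE area; a minimal sketch (the fallback defines are only for building against pre-patch headers):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_MPX_ENABLE_MANAGEMENT
	# define PR_MPX_ENABLE_MANAGEMENT	43
	# define PR_MPX_DISABLE_MANAGEMENT	44
	#endif

	int main(void)
	{
		/* Normally issued by the compiler's MPX runtime once it has
		 * allocated the bounds directory and enabled MPX via xrstor. */
		if (prctl(PR_MPX_ENABLE_MANAGEMENT, 0, 0, 0, 0))
			perror("PR_MPX_ENABLE_MANAGEMENT");

		/* ... run instrumented code ... */

		prctl(PR_MPX_DISABLE_MANAGEMENT, 0, 0, 0, 0);
		return 0;
	}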
diff --git a/kernel/signal.c b/kernel/signal.c index 19e35135fc60..16a305295256 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2756,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
2756 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | 2756 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) |
2757 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2757 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
2758 | #endif | 2758 | #endif |
2759 | #ifdef SEGV_BNDERR | ||
2760 | err |= __put_user(from->si_lower, &to->si_lower); | ||
2761 | err |= __put_user(from->si_upper, &to->si_upper); | ||
2762 | #endif | ||
2759 | break; | 2763 | break; |
2760 | case __SI_CHLD: | 2764 | case __SI_CHLD: |
2761 | err |= __put_user(from->si_pid, &to->si_pid); | 2765 | err |= __put_user(from->si_pid, &to->si_pid); |
diff --git a/kernel/sys.c b/kernel/sys.c index 1eaa2f0b0246..a8c9f5a7dda6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -91,6 +91,12 @@ | |||
91 | #ifndef SET_TSC_CTL | 91 | #ifndef SET_TSC_CTL |
92 | # define SET_TSC_CTL(a) (-EINVAL) | 92 | # define SET_TSC_CTL(a) (-EINVAL) |
93 | #endif | 93 | #endif |
94 | #ifndef MPX_ENABLE_MANAGEMENT | ||
95 | # define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) | ||
96 | #endif | ||
97 | #ifndef MPX_DISABLE_MANAGEMENT | ||
98 | # define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) | ||
99 | #endif | ||
94 | 100 | ||
95 | /* | 101 | /* |
96 | * this is where the system-wide overflow UID and GID are defined, for | 102 | * this is where the system-wide overflow UID and GID are defined, for |
@@ -2203,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2203 | me->mm->def_flags &= ~VM_NOHUGEPAGE; | 2209 | me->mm->def_flags &= ~VM_NOHUGEPAGE; |
2204 | up_write(&me->mm->mmap_sem); | 2210 | up_write(&me->mm->mmap_sem); |
2205 | break; | 2211 | break; |
2212 | case PR_MPX_ENABLE_MANAGEMENT: | ||
2213 | error = MPX_ENABLE_MANAGEMENT(me); | ||
2214 | break; | ||
2215 | case PR_MPX_DISABLE_MANAGEMENT: | ||
2216 | error = MPX_DISABLE_MANAGEMENT(me); | ||
2217 | break; | ||
2206 | default: | 2218 | default: |
2207 | error = -EINVAL; | 2219 | error = -EINVAL; |
2208 | break; | 2220 | break; |
@@ -2601,6 +2601,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2601 | detach_vmas_to_be_unmapped(mm, vma, prev, end); | 2601 | detach_vmas_to_be_unmapped(mm, vma, prev, end); |
2602 | unmap_region(mm, vma, prev, start, end); | 2602 | unmap_region(mm, vma, prev, start, end); |
2603 | 2603 | ||
2604 | arch_unmap(mm, vma, start, end); | ||
2605 | |||
2604 | /* Fix up all other VM information */ | 2606 | /* Fix up all other VM information */ |
2605 | remove_vma_list(mm, vma); | 2607 | remove_vma_list(mm, vma); |
2606 | 2608 | ||