121 files changed, 3006 insertions(+), 2026 deletions(-)
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index a75e3adaa39d..88b85899d309 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -406,6 +406,12 @@ Protocol: 2.00+
 	- If 0, the protected-mode code is loaded at 0x10000.
 	- If 1, the protected-mode code is loaded at 0x100000.
 
+  Bit 1 (kernel internal): KASLR_FLAG
+	- Used internally by the compressed kernel to communicate
+	  KASLR status to kernel proper.
+	  If 1, KASLR enabled.
+	  If 0, KASLR disabled.
+
   Bit 5 (write): QUIET_FLAG
 	- If 0, print early messages.
 	- If 1, suppress early messages.
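The new bit lets kernel proper tell "KASLR compiled in but not applied" apart from "kernel base actually randomized". A minimal sketch of how kernel proper can consume the bit -- assuming the KASLR_FLAG definition from asm/bootparam.h and the boot_params copy of the zero page; this mirrors the in-tree helper but is not a verbatim quote:

    #include <linux/types.h>
    #include <asm/setup.h>          /* boot_params */
    #include <asm/bootparam.h>      /* KASLR_FLAG */

    /*
     * Sketch: true only if the decompressor's choose_kernel_location()
     * set the bit, i.e. the kernel base really was randomized at boot.
     */
    static inline bool kaslr_active(void)
    {
            return !!(boot_params.hdr.loadflags & KASLR_FLAG);
    }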
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
 	return slots_fetch_random();
 }
 
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+				      unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
 	}
 #endif
 
+	boot_params->hdr.loadflags |= KASLR_FLAG;
+
 	/* Record the various known unsafe memory ranges. */
 	mem_avoid_init((unsigned long)input, input_size,
 		       (unsigned long)output, output_size);
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
 #include <asm/page_types.h>
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
 	__HEAD
 ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
 	 * us to not reload segments
 	 */
-	testb	$(1<<6), BP_loadflags(%esi)
+	testb	$KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz	1f
 
 	cli
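$(1<<6) and $KEEP_SEGMENTS are the same value; the change just switches to the symbolic name that the newly included asm/bootparam.h provides. For reference, the loadflags bits of the boot header are defined roughly as follows (per arch/x86/include/uapi/asm/bootparam.h at the time of this series; KASLR_FLAG is the bit this series adds):

    /* setup_header.loadflags bits: */
    #define LOADED_HIGH     (1<<0)  /* protected-mode code loaded at 0x100000 */
    #define KASLR_FLAG      (1<<1)  /* kernel internal: KASLR was applied */
    #define QUIET_FLAG      (1<<5)  /* suppress early boot messages */
    #define KEEP_SEGMENTS   (1<<6)  /* bootloader asks us not to reload segments */
    #define CAN_USE_HEAP    (1<<7)  /* heap_end_ptr is valid */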
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
 #include <asm/msr.h>
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
 	__HEAD
 	.code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
 	 * us to not reload segments
 	 */
-	testb	$(1<<6), BP_loadflags(%esi)
+	testb	$KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz	1f
 
 	cli
@@ -164,7 +165,7 @@ ENTRY(startup_32)
 	/* After gdt is loaded */
 	xorl	%eax, %eax
 	lldt	%ax
-	movl	$0x20, %eax
+	movl	$__BOOT_TSS, %eax
 	ltr	%ax
 
 	/*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 
 	real_mode = rmode;
 
+	/* Clear it for solely in-kernel use */
+	real_mode->hdr.loadflags &= ~KASLR_FLAG;
+
 	sanitize_boot_params(real_mode);
 
 	if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 	 * the entire decompressed kernel plus relocation table, or the
 	 * entire decompressed kernel plus .bss and .brk sections.
 	 */
-	output = choose_kernel_location(input_data, input_len, output,
+	output = choose_kernel_location(real_mode, input_data, input_len, output,
 					output_len > run_size ? output_len
 							      : run_size);
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
 
 #if CONFIG_RANDOMIZE_BASE
 /* aslr.c */
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+				      unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
 bool has_cpuflag(int flag);
 #else
 static inline
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+				      unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block:
 	## 2a) PROCESS FULL BLOCKS:
 	################################################################
 full_block:
-	movq	$128,%rax
+	movl	$128,%eax
 	lea	128*8*2(block_0), block_1
 	lea	128*8*3(block_0), block_2
 	add	$128*8*1, block_0
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a039d21986a2..a350c990dc86 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
 	movq	R1, 8(%rsi)
 
 	popq	R1
-	movq	$1,%rax
+	movl	$1,%eax
 	ret
 ENDPROC(twofish_enc_blk)
 
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
 	movq	R1, 8(%rsi)
 
 	popq	R1
-	movq	$1,%rax
+	movl	$1,%eax
 	ret
 ENDPROC(twofish_dec_blk)
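These twofish changes, like the crc32c one above, rely on the x86-64 rule that writing a 32-bit register zero-extends into the full 64-bit register: movl $1,%eax leaves %rax equal to 1 exactly as movq $1,%rax did, but with a shorter encoding (no REX.W prefix is needed). A small self-contained C analogue of the zero-extension rule:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t rax = UINT64_MAX;      /* pretend %rax holds garbage */

            /*
             * Assigning a 32-bit value clears the upper half, just as a
             * movl to %eax clears bits 63:32 of %rax in 64-bit mode.
             */
            rax = (uint32_t)1;

            printf("rax = %#llx\n", (unsigned long long)rax); /* rax = 0x1 */
            return 0;
    }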
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
 
 obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..c81d35e6c7f1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 }
 
 static int ia32_restore_sigcontext(struct pt_regs *regs,
-				   struct sigcontext_ia32 __user *sc,
-				   unsigned int *pax)
+				   struct sigcontext_ia32 __user *sc)
 {
 	unsigned int tmpflags, err = 0;
 	void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 		RELOAD_SEG(es);
 
 		COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
-		COPY(dx); COPY(cx); COPY(ip);
+		COPY(dx); COPY(cx); COPY(ip); COPY(ax);
 		/* Don't touch extended registers */
 
 		COPY_SEG_CPL3(cs);
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 
 		get_user_ex(tmp, &sc->fpstate);
 		buf = compat_ptr(tmp);
-
-		get_user_ex(*pax, &sc->ax);
 	} get_user_catch(err);
 
 	err |= restore_xstate_sig(buf, 1);
 
+	force_iret();
+
 	return err;
 }
 
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
 	struct pt_regs *regs = current_pt_regs();
 	struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
 	sigset_t set;
-	unsigned int ax;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
+	if (ia32_restore_sigcontext(regs, &frame->sc))
 		goto badframe;
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "32bit sigreturn");
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe_ia32 __user *frame;
 	sigset_t set;
-	unsigned int ax;
 
 	frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
 
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
 	if (compat_restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "32bit rt sigreturn");
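The unsigned int *pax out-parameter existed so the syscall return value would survive the restore; it is unnecessary now that COPY(ax) restores ax into pt_regs like any other register, force_iret() forces the exit path to reload the full register state, and both callers simply return regs->ax. A simplified sketch of the resulting flow (hypothetical helper names, not the kernel's exact get_user_try/catch macros):

    /* Sketch: restore one register slot from the 32-bit sigcontext. */
    #define COPY_REG(x)     (err |= __get_user(regs->x, &sc->x))

    static int restore_sigcontext_sketch(struct pt_regs *regs,
                                         struct sigcontext_ia32 __user *sc)
    {
            int err = 0;

            COPY_REG(ip);
            COPY_REG(ax);   /* ax is now just another register... */

            /*
             * ...because force_iret() makes the return-to-user path
             * reload every register from pt_regs, regs->ax survives all
             * the way back, and the sigreturn caller returns regs->ax.
             */
            force_iret();
            return err;
    }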
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..a821b1cd4fa7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
 
 	.section .entry.text, "ax"
 
-	.macro IA32_ARG_FIXUP noebp=0
-	movl	%edi,%r8d
-	.if \noebp
-	.else
-	movl	%ebp,%r9d
-	.endif
-	xchg	%ecx,%esi
-	movl	%ebx,%edi
-	movl	%edx,%edx	/* zero extension */
-	.endm
-
-	/* clobbers %eax */
-	.macro  CLEAR_RREGS offset=0, _r9=rax
+	/* clobbers %rax */
+	.macro  CLEAR_RREGS _r9=rax
 	xorl	%eax,%eax
-	movq	%rax,\offset+R11(%rsp)
-	movq	%rax,\offset+R10(%rsp)
-	movq	%\_r9,\offset+R9(%rsp)
-	movq	%rax,\offset+R8(%rsp)
+	movq	%rax,R11(%rsp)
+	movq	%rax,R10(%rsp)
+	movq	%\_r9,R9(%rsp)
+	movq	%rax,R8(%rsp)
 	.endm
 
 	/*
@@ -60,14 +49,14 @@
 	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
 	 * an appropriately invalid value.
 	 */
-	.macro LOAD_ARGS32 offset, _r9=0
+	.macro LOAD_ARGS32 _r9=0
 	.if \_r9
-	movl \offset+16(%rsp),%r9d
+	movl R9(%rsp),%r9d
 	.endif
-	movl \offset+40(%rsp),%ecx
-	movl \offset+48(%rsp),%edx
-	movl \offset+56(%rsp),%esi
-	movl \offset+64(%rsp),%edi
+	movl RCX(%rsp),%ecx
+	movl RDX(%rsp),%edx
+	movl RSI(%rsp),%esi
+	movl RDI(%rsp),%edi
 	movl %eax,%eax			/* zero extension */
 	.endm
 
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit)
 /*
  * 32bit SYSENTER instruction entry.
  *
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6
- *
- * Interrupts off.
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
  *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
  */
 ENTRY(ia32_sysenter_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rsp,rbp
-	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(kernel_stack), %rsp
-	addq	$(KERNEL_STACK_OFFSET),%rsp
+
 	/*
-	 * No need to follow this irqs on/off section: the syscall
-	 * disabled irqs, here we enable it straight after entry:
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
 	 */
+	SWAPGS_UNSAFE_STACK
+	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movl	%ebp,%ebp		/* zero extension */
-	pushq_cfi $__USER32_DS
-	/*CFI_REL_OFFSET ss,0*/
-	pushq_cfi %rbp
-	CFI_REL_OFFSET rsp,0
-	pushfq_cfi
-	/*CFI_REL_OFFSET rflags,0*/
-	movl	TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
-	CFI_REGISTER rip,r10
-	pushq_cfi $__USER32_CS
-	/*CFI_REL_OFFSET cs,0*/
+
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%ebp, %ebp
 	movl	%eax, %eax
-	pushq_cfi %r10
-	CFI_REL_OFFSET rip,0
-	pushq_cfi %rax
+
+	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
+	CFI_REGISTER rip,r10
+
+	/* Construct struct pt_regs on stack */
+	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
+	pushq_cfi	%rbp			/* pt_regs->sp */
+	CFI_REL_OFFSET	rsp,0
+	pushfq_cfi				/* pt_regs->flags */
+	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
+	pushq_cfi	%r10 /* pt_regs->ip = thread_info->sysenter_return */
+	CFI_REL_OFFSET	rip,0
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rcx			/* pt_regs->cx */
+	pushq_cfi_reg	rax			/* pt_regs->ax */
 	cld
-	SAVE_ARGS 0,1,0
-	/* no need to do an access_ok check here because rbp has been
-	   32bit zero extended */
+	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 10*8
+
+	/*
+	 * no need to do an access_ok check here because rbp has been
+	 * 32bit zero extended
+	 */
 	ASM_STAC
 1:	movl	(%rbp),%ebp
 	_ASM_EXTABLE(1b,ia32_badarg)
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target)
 	 * ourselves.  To save a few cycles, we can check whether
 	 * NT was set instead of doing an unconditional popfq.
 	 */
-	testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+	testl $X86_EFLAGS_NT,EFLAGS(%rsp)
 	jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	ia32_badsys
sysenter_do_call:
-	IA32_ARG_FIXUP
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl	%edi,%r8d	/* arg5 */
+	movl	%ebp,%r9d	/* arg6 */
+	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl	%ebx,%edi	/* arg1 */
+	movl	%edx,%edx	/* arg3 (zero extension) */
sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
-	movq	%rax,RAX-ARGOFFSET(%rsp)
+	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	sysexit_audit
sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	/* clear IF, that popfq doesn't enable interrupts early */
-	andl	$~0x200,EFLAGS-ARGOFFSET(%rsp)
-	movl	RIP-ARGOFFSET(%rsp),%edx		/* User %eip */
-	CFI_REGISTER rip,rdx
-	RESTORE_ARGS 0,24,0,0,0,0
+	/*
+	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
+	 * NMI between STI and SYSEXIT has poorly specified behavior,
+	 * and an NMI followed by an IRQ with usergs is fatal.  So we
+	 * just pretend we're using SYSEXIT but we really use
+	 * SYSRETL instead.
+	 *
+	 * This code path is still called 'sysexit' because it pairs
+	 * with 'sysenter' and it uses the SYSENTER calling convention.
+	 */
+	andl	$~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	movl	RIP(%rsp),%ecx		/* User %eip */
+	CFI_REGISTER rip,rcx
+	RESTORE_RSI_RDI
+	xorl	%edx,%edx		/* avoid info leaks */
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
-	xorq	%r11,%r11
-	popfq_cfi
+	movl	EFLAGS(%rsp),%r11d	/* User eflags */
 	/*CFI_RESTORE rflags*/
-	popq_cfi %rcx				/* User %esp */
-	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS_SYSEXIT32
+
+	/*
+	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
+	 * since it avoids a dicey window with interrupts enabled.
+	 */
+	movl	RSP(%rsp),%esp
+
+	/*
+	 * USERGS_SYSRET32 does:
+	 *  gsbase = user's gs base
+	 *  eip = ecx
+	 *  rflags = r11
+	 *  cs = __USER32_CS
+	 *  ss = __USER_DS
+	 *
+	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
+	 *
+	 *  pop %ebp
+	 *  pop %edx
+	 *  pop %ecx
+	 *
+	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
+	 * avoid info leaks.  R11 ends up with VDSO32_SYSENTER_RETURN's
+	 * address (already known to user code), and R12-R15 are
+	 * callee-saved and therefore don't contain any interesting
+	 * kernel data.
+	 */
+	USERGS_SYSRET32
 
 	CFI_RESTORE_STATE
@@ -205,18 +247,18 @@ sysexit_from_sys_call:
 	movl %ebx,%esi		/* 2nd arg: 1st syscall arg */
 	movl %eax,%edi		/* 1st arg: syscall number */
 	call __audit_syscall_entry
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
+	movl RAX(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
 	movl %ebx,%edi		/* reload 1st syscall arg */
-	movl RCX-ARGOFFSET(%rsp),%esi	/* reload 2nd syscall arg */
-	movl RDX-ARGOFFSET(%rsp),%edx	/* reload 3rd syscall arg */
-	movl RSI-ARGOFFSET(%rsp),%ecx	/* reload 4th syscall arg */
-	movl RDI-ARGOFFSET(%rsp),%r8d	/* reload 5th syscall arg */
+	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
+	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
+	movl RSI(%rsp),%ecx	/* reload 4th syscall arg */
+	movl RDI(%rsp),%r8d	/* reload 5th syscall arg */
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +269,13 @@ sysexit_from_sys_call:
1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
-	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
+	movq RAX(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz \exit
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_with_check
 	.endm
 
@@ -253,16 +295,16 @@ sysenter_fix_flags:
 
sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz	sysenter_auditsys
 #endif
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
+	RESTORE_EXTRA_REGS
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 	jmp	sysenter_do_call
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target)
 /*
  * 32bit SYSCALL instruction entry.
  *
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx return EIP
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
- * %esp user stack
- * 0(%esp) Arg6
- *
- * Interrupts off.
+ * eax  system call number
+ * ecx  return address
+ * ebx  arg1
+ * ebp  arg2	(note: not saved in the stack frame, should not be touched)
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * esp  user stack
+ * 0(%esp) arg6
  *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
  */
 ENTRY(ia32_cstar_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
+	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
+
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
 	SWAPGS_UNSAFE_STACK
 	movl	%esp,%r8d
 	CFI_REGISTER	rsp,r8
 	movq	PER_CPU_VAR(kernel_stack),%rsp
-	/*
-	 * No need to follow this irqs on/off section: the syscall
-	 * disabled irqs and here we enable it straight after entry:
-	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0,0
-	movl	%eax,%eax	/* zero extension */
-	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
-	movq	%rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	movq	%rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%eax,%eax
+
+	/* Construct struct pt_regs on stack */
+	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
+	pushq_cfi	%r8			/* pt_regs->sp */
+	CFI_REL_OFFSET rsp,0
+	pushq_cfi	%r11			/* pt_regs->flags */
+	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
+	pushq_cfi	%rcx			/* pt_regs->ip */
+	CFI_REL_OFFSET rip,0
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rbp			/* pt_regs->cx */
 	movl	%ebp,%ecx
-	movq	$__USER32_CS,CS-ARGOFFSET(%rsp)
-	movq	$__USER32_DS,SS-ARGOFFSET(%rsp)
-	movq	%r11,EFLAGS-ARGOFFSET(%rsp)
-	/*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
-	movq	%r8,RSP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-	/* no need to do an access_ok check here because r8 has been
-	   32bit zero extended */
-	/* hardware stack frame is complete now */
+	pushq_cfi_reg	rax			/* pt_regs->ax */
+	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 10*8
+
+	/*
+	 * no need to do an access_ok check here because r8 has been
+	 * 32bit zero extended
+	 */
 	ASM_STAC
1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq  $IA32_NR_syscalls-1,%rax
 	ja    ia32_badsys
cstar_do_call:
-	IA32_ARG_FIXUP 1
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl	%edi,%r8d	/* arg5 */
+	/* r9 already loaded */	/* arg6 */
+	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl	%ebx,%edi	/* arg1 */
+	movl	%edx,%edx	/* arg3 (zero extension) */
cstar_dispatch:
 	call *ia32_sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz sysretl_audit
sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	RESTORE_ARGS 0,-ARG_SKIP,0,0,0
-	movl RIP-ARGOFFSET(%rsp),%ecx
+	andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	RESTORE_RSI_RDI_RDX
+	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
-	movl EFLAGS-ARGOFFSET(%rsp),%r11d
+	movl EFLAGS(%rsp),%r11d
 	/*CFI_REGISTER rflags,r11*/
 	xorq	%r10,%r10
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
-	movl RSP-ARGOFFSET(%rsp),%esp
+	movl RSP(%rsp),%esp
 	CFI_RESTORE rsp
+	/*
+	 * 64bit->32bit SYSRET restores eip from ecx,
+	 * eflags from r11 (but RF and VM bits are forced to 0),
+	 * cs and ss are loaded from MSRs.
+	 * (Note: 32bit->32bit SYSRET is different: since r11
+	 * does not exist, it merely sets eflags.IF=1).
+	 */
 	USERGS_SYSRET32
 
 #ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
 	CFI_RESTORE_STATE
-	movl %r9d,R9-ARGOFFSET(%rsp)	/* register to be clobbered by call */
+	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
 	auditsys_entry_common
-	movl R9-ARGOFFSET(%rsp),%r9d	/* reload 6th syscall arg */
+	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
 	jmp cstar_dispatch
 
sysretl_audit:
@@ -368,17 +444,17 @@ sysretl_audit:
 
cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
-	SAVE_REST
-	CLEAR_RREGS 0, r9
+	SAVE_EXTRA_REGS
+	CLEAR_RREGS r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
+	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,78 +467,94 @@ ia32_badarg:
 	jmp ia32_sysret
 	CFI_ENDPROC
 
 /*
  * Emulated IA32 system calls via int 0x80.
  *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6	(note: not saved in the stack frame, should not be touched)
  *
  * Notes:
  * Uses the same stack frame as the x86-64 version.
- * All registers except %eax must be saved (but ptrace may violate that)
+ * All registers except eax must be saved (but ptrace may violate that).
  * Arguments are zero extended. For system calls that want sign extension and
  * take long arguments a wrapper is needed. Most calls can just be called
  * directly.
  * Assumes it is only called from user space and entered with interrupts off.
  */
 
 ENTRY(ia32_syscall)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-RIP
-	/*CFI_REL_OFFSET	ss,SS-RIP*/
-	CFI_REL_OFFSET	rsp,RSP-RIP
-	/*CFI_REL_OFFSET	rflags,EFLAGS-RIP*/
-	/*CFI_REL_OFFSET	cs,CS-RIP*/
-	CFI_REL_OFFSET	rip,RIP-RIP
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	SWAPGS
+	CFI_DEF_CFA	rsp,5*8
+	/*CFI_REL_OFFSET	ss,4*8 */
+	CFI_REL_OFFSET	rsp,3*8
+	/*CFI_REL_OFFSET	rflags,2*8 */
+	/*CFI_REL_OFFSET	cs,1*8 */
+	CFI_REL_OFFSET	rip,0*8
+
 	/*
-	 * No need to follow this irqs on/off section: the syscall
-	 * disabled irqs and here we enable it straight after entry:
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
 	 */
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	SWAPGS
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movl %eax,%eax
-	pushq_cfi %rax
+
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%eax,%eax
+
+	/* Construct struct pt_regs on stack (iret frame is already on stack) */
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rcx			/* pt_regs->cx */
+	pushq_cfi_reg	rax			/* pt_regs->ax */
 	cld
-	/* note the registers are not zero extended to the sf.
-	   this could be a problem. */
-	SAVE_ARGS 0,1,0
-	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 10*8
+
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
ia32_do_call:
-	IA32_ARG_FIXUP
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl %edi,%r8d	/* arg5 */
+	movl %ebp,%r9d	/* arg6 */
+	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl %ebx,%edi	/* arg1 */
+	movl %edx,%edx	/* arg3 (zero extension) */
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
ia32_ret_from_sys_call:
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_ret_from_sys_call
 
ia32_tracesys:
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
+	RESTORE_EXTRA_REGS
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
 	jmp ia32_do_call
 END(ia32_syscall)
 
ia32_badsys:
-	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $0,ORIG_RAX(%rsp)
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
@@ -479,8 +571,6 @@ GLOBAL(\label)
 
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn
-	PTREGSCALL stub32_execve, compat_sys_execve
-	PTREGSCALL stub32_execveat, compat_sys_execveat
 	PTREGSCALL stub32_fork, sys_fork
 	PTREGSCALL stub32_vfork, sys_vfork
 
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone)
 
 	ALIGN
ia32_ptregs_common:
-	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
-	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
-	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
-	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
-	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
-	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
-	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
-/*	CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
-	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
-/*	CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
-	SAVE_REST
+	CFI_DEF_CFA	rsp,SIZEOF_PTREGS
+	CFI_REL_OFFSET	rax,RAX
+	CFI_REL_OFFSET	rcx,RCX
+	CFI_REL_OFFSET	rdx,RDX
+	CFI_REL_OFFSET	rsi,RSI
+	CFI_REL_OFFSET	rdi,RDI
+	CFI_REL_OFFSET	rip,RIP
+/*	CFI_REL_OFFSET	cs,CS*/
+/*	CFI_REL_OFFSET	rflags,EFLAGS*/
+	CFI_REL_OFFSET	rsp,RSP
+/*	CFI_REL_OFFSET	ss,SS*/
+	SAVE_EXTRA_REGS 8
 	call *%rax
-	RESTORE_REST
-	jmp  ia32_sysret	/* misbalances the return cache */
+	RESTORE_EXTRA_REGS 8
+	ret
 	CFI_ENDPROC
 END(ia32_ptregs_common)
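All three entry points now open-code the 32-bit-to-64-bit argument shuffle that the deleted IA32_ARG_FIXUP macro used to hide. A purely illustrative C model of what the marshalling achieves (the real dispatch happens in assembly; the function type below is hypothetical):

    #include <stdint.h>

    /*
     * i386 syscall registers vs. the x86-64 C calling convention:
     *
     *   i386:   ebx   ecx   edx   esi   edi   ebp
     *   x86-64: rdi   rsi   rdx   rcx   r8    r9
     */
    typedef long (*ia32_sys_call_t)(unsigned long, unsigned long,
                                    unsigned long, unsigned long,
                                    unsigned long, unsigned long);

    static long ia32_dispatch_sketch(ia32_sys_call_t *table, unsigned int nr,
                                     uint32_t ebx, uint32_t ecx, uint32_t edx,
                                     uint32_t esi, uint32_t edi, uint32_t ebp)
    {
            /* arguments arrive zero-extended, never sign-extended */
            return table[nr](ebx, ecx, edx, esi, edi, ebp);
    }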
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-
-long compat_ni_syscall(void)
-{
-	return -ENOSYS;
-}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
 				advice);
 }
 
-long sys32_vm86_warning(void)
-{
-	struct task_struct *me = current;
-	static char lastcomm[sizeof(me->comm)];
-
-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		compat_printk(KERN_INFO
-			      "%s: vm86 mode not supported on 64 bit kernel\n",
-			      me->comm);
-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
-	}
-	return -ENOSYS;
-}
-
 asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
 				   size_t count)
 {
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0f5d9f..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* System call table for ia32 emulation. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void compat_ni_syscall(void);
-
-const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
-	/*
-	 * Smells like a compiler bug -- it doesn't work
-	 * when the & below is removed.
-	 */
-	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
-#include <asm/syscalls_32.h>
-};
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..bdf02eeee765 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,63 @@
 	.endm
 #endif
 
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
 	.word \feature
 	.byte \orig_len
 	.byte \alt_len
+	.byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr
+144:
+	.popsection
+.endm
+
+#define old_len			141b-140b
+#define new_len1		144f-143f
+#define new_len2		145f-144f
+
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ */
+#define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+	\oldinstr
+141:
+	.skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+		(alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr1
+144:
+	\newinstr2
+145:
+	.popsection
 .endm
 
 #endif  /*  __ASSEMBLY__  */
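alt_max_short() is a branchless maximum -- it has to be, since gas must fold the .skip padding expression to a constant without any conditionals. The bit trick is easy to sanity-check in C (note that C comparisons yield 0 or 1 while gas comparisons yield 0 or -1, which is why the assembler version carries an extra negation):

    #include <assert.h>

    /*
     * Branchless max: if a < b the mask is all-ones and the XOR swaps
     * in b; otherwise the mask is zero and a passes through unchanged.
     */
    static unsigned int alt_max_short(unsigned int a, unsigned int b)
    {
            return a ^ ((a ^ b) & -(unsigned int)(a < b));
    }

    int main(void)
    {
            assert(alt_max_short(3, 7) == 7);
            assert(alt_max_short(9, 2) == 9);
            assert(alt_max_short(5, 5) == 5);
            return 0;
    }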
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..ba32af062f61 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
@@ -48,8 +48,9 @@ struct alt_instr { | |||
48 | s32 repl_offset; /* offset to replacement instruction */ | 48 | s32 repl_offset; /* offset to replacement instruction */ |
49 | u16 cpuid; /* cpuid bit set for replacement */ | 49 | u16 cpuid; /* cpuid bit set for replacement */ |
50 | u8 instrlen; /* length of original instruction */ | 50 | u8 instrlen; /* length of original instruction */ |
51 | u8 replacementlen; /* length of new instruction, <= instrlen */ | 51 | u8 replacementlen; /* length of new instruction */ |
52 | }; | 52 | u8 padlen; /* length of build-time padding */ |
53 | } __packed; | ||
53 | 54 | ||
54 | extern void alternative_instructions(void); | 55 | extern void alternative_instructions(void); |
55 | extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); | 56 | extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); |
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end) | |||
76 | } | 77 | } |
77 | #endif /* CONFIG_SMP */ | 78 | #endif /* CONFIG_SMP */ |
78 | 79 | ||
79 | #define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" | 80 | #define b_replacement(num) "664"#num |
81 | #define e_replacement(num) "665"#num | ||
80 | 82 | ||
81 | #define b_replacement(number) "663"#number | 83 | #define alt_end_marker "663" |
82 | #define e_replacement(number) "664"#number | 84 | #define alt_slen "662b-661b" |
85 | #define alt_pad_len alt_end_marker"b-662b" | ||
86 | #define alt_total_slen alt_end_marker"b-661b" | ||
87 | #define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" | ||
83 | 88 | ||
84 | #define alt_slen "662b-661b" | 89 | #define __OLDINSTR(oldinstr, num) \ |
85 | #define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" | 90 | "661:\n\t" oldinstr "\n662:\n" \ |
91 | ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ | ||
92 | "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" | ||
86 | 93 | ||
87 | #define ALTINSTR_ENTRY(feature, number) \ | 94 | #define OLDINSTR(oldinstr, num) \ |
95 | __OLDINSTR(oldinstr, num) \ | ||
96 | alt_end_marker ":\n" | ||
97 | |||
98 | /* | ||
99 | * max without conditionals. Idea adapted from: | ||
100 | * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax | ||
101 | * | ||
102 | * The additional "-" is needed because gas works with s32s. | ||
103 | */ | ||
104 | #define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" | ||
105 | |||
106 | /* | ||
107 | * Pad the second replacement alternative with additional NOPs if it is | ||
108 | * additionally longer than the first replacement alternative. | ||
109 | */ | ||
110 | #define OLDINSTR_2(oldinstr, num1, num2) \ | ||
111 | "661:\n\t" oldinstr "\n662:\n" \ | ||
112 | ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ | ||
113 | "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ | ||
114 | alt_end_marker ":\n" | ||
115 | |||
116 | #define ALTINSTR_ENTRY(feature, num) \ | ||
88 | " .long 661b - .\n" /* label */ \ | 117 | " .long 661b - .\n" /* label */ \ |
89 | " .long " b_replacement(number)"f - .\n" /* new instruction */ \ | 118 | " .long " b_replacement(num)"f - .\n" /* new instruction */ \ |
90 | " .word " __stringify(feature) "\n" /* feature bit */ \ | 119 | " .word " __stringify(feature) "\n" /* feature bit */ \ |
91 | " .byte " alt_slen "\n" /* source len */ \ | 120 | " .byte " alt_total_slen "\n" /* source len */ \ |
92 | " .byte " alt_rlen(number) "\n" /* replacement len */ | 121 | " .byte " alt_rlen(num) "\n" /* replacement len */ \ |
93 | 122 | " .byte " alt_pad_len "\n" /* pad len */ | |
94 | #define DISCARD_ENTRY(number) /* rlen <= slen */ \ | ||
95 | " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" | ||
96 | 123 | ||
97 | #define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ | 124 | #define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ |
98 | b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" | 125 | b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" |
99 | 126 | ||
100 | /* alternative assembly primitive: */ | 127 | /* alternative assembly primitive: */ |
101 | #define ALTERNATIVE(oldinstr, newinstr, feature) \ | 128 | #define ALTERNATIVE(oldinstr, newinstr, feature) \ |
102 | OLDINSTR(oldinstr) \ | 129 | OLDINSTR(oldinstr, 1) \ |
103 | ".pushsection .altinstructions,\"a\"\n" \ | 130 | ".pushsection .altinstructions,\"a\"\n" \ |
104 | ALTINSTR_ENTRY(feature, 1) \ | 131 | ALTINSTR_ENTRY(feature, 1) \ |
105 | ".popsection\n" \ | 132 | ".popsection\n" \ |
106 | ".pushsection .discard,\"aw\",@progbits\n" \ | ||
107 | DISCARD_ENTRY(1) \ | ||
108 | ".popsection\n" \ | ||
109 | ".pushsection .altinstr_replacement, \"ax\"\n" \ | 133 | ".pushsection .altinstr_replacement, \"ax\"\n" \ |
110 | ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ | 134 | ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ |
111 | ".popsection" | 135 | ".popsection" |
112 | 136 | ||
113 | #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ | 137 | #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ |
114 | OLDINSTR(oldinstr) \ | 138 | OLDINSTR_2(oldinstr, 1, 2) \ |
115 | ".pushsection .altinstructions,\"a\"\n" \ | 139 | ".pushsection .altinstructions,\"a\"\n" \ |
116 | ALTINSTR_ENTRY(feature1, 1) \ | 140 | ALTINSTR_ENTRY(feature1, 1) \ |
117 | ALTINSTR_ENTRY(feature2, 2) \ | 141 | ALTINSTR_ENTRY(feature2, 2) \ |
118 | ".popsection\n" \ | 142 | ".popsection\n" \ |
119 | ".pushsection .discard,\"aw\",@progbits\n" \ | ||
120 | DISCARD_ENTRY(1) \ | ||
121 | DISCARD_ENTRY(2) \ | ||
122 | ".popsection\n" \ | ||
123 | ".pushsection .altinstr_replacement, \"ax\"\n" \ | 143 | ".pushsection .altinstr_replacement, \"ax\"\n" \ |
124 | ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ | 144 | ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ |
125 | ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ | 145 | ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ |
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end) | |||
146 | #define alternative(oldinstr, newinstr, feature) \ | 166 | #define alternative(oldinstr, newinstr, feature) \ |
147 | asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") | 167 | asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") |
148 | 168 | ||
169 | #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ | ||
170 | asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") | ||
171 | |||
149 | /* | 172 | /* |
150 | * Alternative inline assembly with input. | 173 | * Alternative inline assembly with input. |
151 | * | 174 | * |
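For orientation before the next file: each ALTINSTR_ENTRY invocation above emits one record into .altinstructions, consumed on the C side roughly as the struct below (a sketch mirroring struct alt_instr in this header after the change; the trailing pad-len byte is the new field):

	#include <linux/types.h>

	struct alt_instr {
		s32 instr_offset;	/* .long 661b - . : original insn */
		s32 repl_offset;	/* .long <repl> - . : new insn */
		u16 cpuid;		/* .word <feature bit> */
		u8  instrlen;		/* source len, incl. NOP padding */
		u8  replacementlen;	/* replacement len */
		u8  padlen;		/* pad len, new in this patch */
	} __packed;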
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 08f217354442..976b86a325e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) | |||
91 | { | 91 | { |
92 | volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); | 92 | volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); |
93 | 93 | ||
94 | alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, | 94 | alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, |
95 | ASM_OUTPUT2("=r" (v), "=m" (*addr)), | 95 | ASM_OUTPUT2("=r" (v), "=m" (*addr)), |
96 | ASM_OUTPUT2("0" (v), "m" (*addr))); | 96 | ASM_OUTPUT2("0" (v), "m" (*addr))); |
97 | } | 97 | } |
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb33106e..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h | |||
@@ -95,13 +95,11 @@ do { \ | |||
95 | * Stop RDTSC speculation. This is needed when you need to use RDTSC | 95 | * Stop RDTSC speculation. This is needed when you need to use RDTSC |
96 | * (or get_cycles or vread that possibly accesses the TSC) in a defined | 96 | * (or get_cycles or vread that possibly accesses the TSC) in a defined |
97 | * code region. | 97 | * code region. |
98 | * | ||
99 | * (Could use an alternative three way for this if there was one.) | ||
100 | */ | 98 | */ |
101 | static __always_inline void rdtsc_barrier(void) | 99 | static __always_inline void rdtsc_barrier(void) |
102 | { | 100 | { |
103 | alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); | 101 | alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, |
104 | alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); | 102 | "lfence", X86_FEATURE_LFENCE_RDTSC); |
105 | } | 103 | } |
106 | 104 | ||
107 | #endif /* _ASM_X86_BARRIER_H */ | 105 | #endif /* _ASM_X86_BARRIER_H */ |
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1f1297b46f83..1c8b50edb2db 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h | |||
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with | |||
55 | * for assembly code: | 55 | * for assembly code: |
56 | */ | 56 | */ |
57 | 57 | ||
58 | #define R15 0 | 58 | /* The layout forms the "struct pt_regs" on the stack: */ |
59 | #define R14 8 | 59 | /* |
60 | #define R13 16 | 60 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry |
61 | #define R12 24 | 61 | * unless syscall needs a complete, fully filled "struct pt_regs". |
62 | #define RBP 32 | 62 | */ |
63 | #define RBX 40 | 63 | #define R15 0*8 |
64 | 64 | #define R14 1*8 | |
65 | /* arguments: interrupts/non tracing syscalls only save up to here: */ | 65 | #define R13 2*8 |
66 | #define R11 48 | 66 | #define R12 3*8 |
67 | #define R10 56 | 67 | #define RBP 4*8 |
68 | #define R9 64 | 68 | #define RBX 5*8 |
69 | #define R8 72 | 69 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
70 | #define RAX 80 | 70 | #define R11 6*8 |
71 | #define RCX 88 | 71 | #define R10 7*8 |
72 | #define RDX 96 | 72 | #define R9 8*8 |
73 | #define RSI 104 | 73 | #define R8 9*8 |
74 | #define RDI 112 | 74 | #define RAX 10*8 |
75 | #define ORIG_RAX 120 /* + error_code */ | 75 | #define RCX 11*8 |
76 | /* end of arguments */ | 76 | #define RDX 12*8 |
77 | 77 | #define RSI 13*8 | |
78 | /* cpu exception frame or undefined in case of fast syscall: */ | 78 | #define RDI 14*8 |
79 | #define RIP 128 | 79 | /* |
80 | #define CS 136 | 80 | * On syscall entry, this is the syscall number. On a CPU exception, this is the error code. |
81 | #define EFLAGS 144 | 81 | * On a hw interrupt, it's the IRQ number: |
82 | #define RSP 152 | 82 | */ |
83 | #define SS 160 | 83 | #define ORIG_RAX 15*8 |
84 | 84 | /* Return frame for iretq */ | |
85 | #define ARGOFFSET R11 | 85 | #define RIP 16*8 |
86 | 86 | #define CS 17*8 | |
87 | .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 | 87 | #define EFLAGS 18*8 |
88 | subq $9*8+\addskip, %rsp | 88 | #define RSP 19*8 |
89 | CFI_ADJUST_CFA_OFFSET 9*8+\addskip | 89 | #define SS 20*8 |
90 | movq_cfi rdi, 8*8 | 90 | |
91 | movq_cfi rsi, 7*8 | 91 | #define SIZEOF_PTREGS 21*8 |
92 | movq_cfi rdx, 6*8 | 92 | |
93 | 93 | .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 | |
94 | .if \save_rcx | 94 | subq $15*8+\addskip, %rsp |
95 | movq_cfi rcx, 5*8 | 95 | CFI_ADJUST_CFA_OFFSET 15*8+\addskip |
96 | .endif | 96 | .endm |
97 | 97 | ||
98 | .if \rax_enosys | 98 | .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 |
99 | movq $-ENOSYS, 4*8(%rsp) | 99 | .if \r11 |
100 | .else | 100 | movq_cfi r11, 6*8+\offset |
101 | movq_cfi rax, 4*8 | ||
102 | .endif | 101 | .endif |
103 | 102 | .if \r8910 | |
104 | .if \save_r891011 | 103 | movq_cfi r10, 7*8+\offset |
105 | movq_cfi r8, 3*8 | 104 | movq_cfi r9, 8*8+\offset |
106 | movq_cfi r9, 2*8 | 105 | movq_cfi r8, 9*8+\offset |
107 | movq_cfi r10, 1*8 | 106 | .endif |
108 | movq_cfi r11, 0*8 | 107 | .if \rax |
108 | movq_cfi rax, 10*8+\offset | ||
109 | .endif | ||
110 | .if \rcx | ||
111 | movq_cfi rcx, 11*8+\offset | ||
109 | .endif | 112 | .endif |
113 | movq_cfi rdx, 12*8+\offset | ||
114 | movq_cfi rsi, 13*8+\offset | ||
115 | movq_cfi rdi, 14*8+\offset | ||
116 | .endm | ||
117 | .macro SAVE_C_REGS offset=0 | ||
118 | SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 | ||
119 | .endm | ||
120 | .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 | ||
121 | SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 | ||
122 | .endm | ||
123 | .macro SAVE_C_REGS_EXCEPT_R891011 | ||
124 | SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 | ||
125 | .endm | ||
126 | .macro SAVE_C_REGS_EXCEPT_RCX_R891011 | ||
127 | SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 | ||
128 | .endm | ||
129 | .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 | ||
130 | SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 | ||
131 | .endm | ||
132 | |||
133 | .macro SAVE_EXTRA_REGS offset=0 | ||
134 | movq_cfi r15, 0*8+\offset | ||
135 | movq_cfi r14, 1*8+\offset | ||
136 | movq_cfi r13, 2*8+\offset | ||
137 | movq_cfi r12, 3*8+\offset | ||
138 | movq_cfi rbp, 4*8+\offset | ||
139 | movq_cfi rbx, 5*8+\offset | ||
140 | .endm | ||
141 | .macro SAVE_EXTRA_REGS_RBP offset=0 | ||
142 | movq_cfi rbp, 4*8+\offset | ||
143 | .endm | ||
110 | 144 | ||
145 | .macro RESTORE_EXTRA_REGS offset=0 | ||
146 | movq_cfi_restore 0*8+\offset, r15 | ||
147 | movq_cfi_restore 1*8+\offset, r14 | ||
148 | movq_cfi_restore 2*8+\offset, r13 | ||
149 | movq_cfi_restore 3*8+\offset, r12 | ||
150 | movq_cfi_restore 4*8+\offset, rbp | ||
151 | movq_cfi_restore 5*8+\offset, rbx | ||
111 | .endm | 152 | .endm |
112 | 153 | ||
113 | #define ARG_SKIP (9*8) | 154 | .macro ZERO_EXTRA_REGS |
155 | xorl %r15d, %r15d | ||
156 | xorl %r14d, %r14d | ||
157 | xorl %r13d, %r13d | ||
158 | xorl %r12d, %r12d | ||
159 | xorl %ebp, %ebp | ||
160 | xorl %ebx, %ebx | ||
161 | .endm | ||
114 | 162 | ||
115 | .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ | 163 | .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 |
116 | rstor_r8910=1, rstor_rdx=1 | ||
117 | .if \rstor_r11 | 164 | .if \rstor_r11 |
118 | movq_cfi_restore 0*8, r11 | 165 | movq_cfi_restore 6*8, r11 |
119 | .endif | 166 | .endif |
120 | |||
121 | .if \rstor_r8910 | 167 | .if \rstor_r8910 |
122 | movq_cfi_restore 1*8, r10 | 168 | movq_cfi_restore 7*8, r10 |
123 | movq_cfi_restore 2*8, r9 | 169 | movq_cfi_restore 8*8, r9 |
124 | movq_cfi_restore 3*8, r8 | 170 | movq_cfi_restore 9*8, r8 |
125 | .endif | 171 | .endif |
126 | |||
127 | .if \rstor_rax | 172 | .if \rstor_rax |
128 | movq_cfi_restore 4*8, rax | 173 | movq_cfi_restore 10*8, rax |
129 | .endif | 174 | .endif |
130 | |||
131 | .if \rstor_rcx | 175 | .if \rstor_rcx |
132 | movq_cfi_restore 5*8, rcx | 176 | movq_cfi_restore 11*8, rcx |
133 | .endif | 177 | .endif |
134 | |||
135 | .if \rstor_rdx | 178 | .if \rstor_rdx |
136 | movq_cfi_restore 6*8, rdx | 179 | movq_cfi_restore 12*8, rdx |
137 | .endif | ||
138 | |||
139 | movq_cfi_restore 7*8, rsi | ||
140 | movq_cfi_restore 8*8, rdi | ||
141 | |||
142 | .if ARG_SKIP+\addskip > 0 | ||
143 | addq $ARG_SKIP+\addskip, %rsp | ||
144 | CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) | ||
145 | .endif | 180 | .endif |
181 | movq_cfi_restore 13*8, rsi | ||
182 | movq_cfi_restore 14*8, rdi | ||
146 | .endm | 183 | .endm |
147 | 184 | .macro RESTORE_C_REGS | |
148 | .macro LOAD_ARGS offset, skiprax=0 | 185 | RESTORE_C_REGS_HELPER 1,1,1,1,1 |
149 | movq \offset(%rsp), %r11 | ||
150 | movq \offset+8(%rsp), %r10 | ||
151 | movq \offset+16(%rsp), %r9 | ||
152 | movq \offset+24(%rsp), %r8 | ||
153 | movq \offset+40(%rsp), %rcx | ||
154 | movq \offset+48(%rsp), %rdx | ||
155 | movq \offset+56(%rsp), %rsi | ||
156 | movq \offset+64(%rsp), %rdi | ||
157 | .if \skiprax | ||
158 | .else | ||
159 | movq \offset+72(%rsp), %rax | ||
160 | .endif | ||
161 | .endm | 186 | .endm |
162 | 187 | .macro RESTORE_C_REGS_EXCEPT_RAX | |
163 | #define REST_SKIP (6*8) | 188 | RESTORE_C_REGS_HELPER 0,1,1,1,1 |
164 | |||
165 | .macro SAVE_REST | ||
166 | subq $REST_SKIP, %rsp | ||
167 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
168 | movq_cfi rbx, 5*8 | ||
169 | movq_cfi rbp, 4*8 | ||
170 | movq_cfi r12, 3*8 | ||
171 | movq_cfi r13, 2*8 | ||
172 | movq_cfi r14, 1*8 | ||
173 | movq_cfi r15, 0*8 | ||
174 | .endm | 189 | .endm |
175 | 190 | .macro RESTORE_C_REGS_EXCEPT_RCX | |
176 | .macro RESTORE_REST | 191 | RESTORE_C_REGS_HELPER 1,0,1,1,1 |
177 | movq_cfi_restore 0*8, r15 | ||
178 | movq_cfi_restore 1*8, r14 | ||
179 | movq_cfi_restore 2*8, r13 | ||
180 | movq_cfi_restore 3*8, r12 | ||
181 | movq_cfi_restore 4*8, rbp | ||
182 | movq_cfi_restore 5*8, rbx | ||
183 | addq $REST_SKIP, %rsp | ||
184 | CFI_ADJUST_CFA_OFFSET -(REST_SKIP) | ||
185 | .endm | 192 | .endm |
186 | 193 | .macro RESTORE_C_REGS_EXCEPT_R11 | |
187 | .macro SAVE_ALL | 194 | RESTORE_C_REGS_HELPER 1,1,0,1,1 |
188 | SAVE_ARGS | 195 | .endm |
189 | SAVE_REST | 196 | .macro RESTORE_C_REGS_EXCEPT_RCX_R11 |
197 | RESTORE_C_REGS_HELPER 1,0,0,1,1 | ||
198 | .endm | ||
199 | .macro RESTORE_RSI_RDI | ||
200 | RESTORE_C_REGS_HELPER 0,0,0,0,0 | ||
201 | .endm | ||
202 | .macro RESTORE_RSI_RDI_RDX | ||
203 | RESTORE_C_REGS_HELPER 0,0,0,0,1 | ||
190 | .endm | 204 | .endm |
191 | 205 | ||
192 | .macro RESTORE_ALL addskip=0 | 206 | .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 |
193 | RESTORE_REST | 207 | addq $15*8+\addskip, %rsp |
194 | RESTORE_ARGS 1, \addskip | 208 | CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) |
195 | .endm | 209 | .endm |
196 | 210 | ||
197 | .macro icebp | 211 | .macro icebp |
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with | |||
210 | */ | 224 | */ |
211 | 225 | ||
212 | .macro SAVE_ALL | 226 | .macro SAVE_ALL |
213 | pushl_cfi %eax | 227 | pushl_cfi_reg eax |
214 | CFI_REL_OFFSET eax, 0 | 228 | pushl_cfi_reg ebp |
215 | pushl_cfi %ebp | 229 | pushl_cfi_reg edi |
216 | CFI_REL_OFFSET ebp, 0 | 230 | pushl_cfi_reg esi |
217 | pushl_cfi %edi | 231 | pushl_cfi_reg edx |
218 | CFI_REL_OFFSET edi, 0 | 232 | pushl_cfi_reg ecx |
219 | pushl_cfi %esi | 233 | pushl_cfi_reg ebx |
220 | CFI_REL_OFFSET esi, 0 | ||
221 | pushl_cfi %edx | ||
222 | CFI_REL_OFFSET edx, 0 | ||
223 | pushl_cfi %ecx | ||
224 | CFI_REL_OFFSET ecx, 0 | ||
225 | pushl_cfi %ebx | ||
226 | CFI_REL_OFFSET ebx, 0 | ||
227 | .endm | 234 | .endm |
228 | 235 | ||
229 | .macro RESTORE_ALL | 236 | .macro RESTORE_ALL |
230 | popl_cfi %ebx | 237 | popl_cfi_reg ebx |
231 | CFI_RESTORE ebx | 238 | popl_cfi_reg ecx |
232 | popl_cfi %ecx | 239 | popl_cfi_reg edx |
233 | CFI_RESTORE ecx | 240 | popl_cfi_reg esi |
234 | popl_cfi %edx | 241 | popl_cfi_reg edi |
235 | CFI_RESTORE edx | 242 | popl_cfi_reg ebp |
236 | popl_cfi %esi | 243 | popl_cfi_reg eax |
237 | CFI_RESTORE esi | ||
238 | popl_cfi %edi | ||
239 | CFI_RESTORE edi | ||
240 | popl_cfi %ebp | ||
241 | CFI_RESTORE ebp | ||
242 | popl_cfi %eax | ||
243 | CFI_RESTORE eax | ||
244 | .endm | 244 | .endm |
245 | 245 | ||
246 | #endif /* CONFIG_X86_64 */ | 246 | #endif /* CONFIG_X86_64 */ |
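The R15..SS constants above are meant to stay in lockstep with struct pt_regs (see the ptrace.h hunk below). A build-time cross-check along these lines can catch drift (a sketch; CHECK_PT_REGS_OFF is a hypothetical helper, not part of this patch):

	#include <linux/bug.h>
	#include <linux/stddef.h>
	#include <asm/ptrace.h>

	#define CHECK_PT_REGS_OFF(reg, idx) \
		BUILD_BUG_ON(offsetof(struct pt_regs, reg) != (idx)*8)

	static inline void check_pt_regs_layout(void)
	{
		CHECK_PT_REGS_OFF(r15, 0);
		CHECK_PT_REGS_OFF(r11, 6);
		CHECK_PT_REGS_OFF(di, 14);
		CHECK_PT_REGS_OFF(orig_ax, 15);
		CHECK_PT_REGS_OFF(ip, 16);
		CHECK_PT_REGS_OFF(ss, 20);
		BUILD_BUG_ON(sizeof(struct pt_regs) != 21*8); /* SIZEOF_PTREGS */
	}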
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h | |||
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) | |||
301 | sp = task_pt_regs(current)->sp; | 301 | sp = task_pt_regs(current)->sp; |
302 | } else { | 302 | } else { |
303 | /* -128 for the x32 ABI redzone */ | 303 | /* -128 for the x32 ABI redzone */ |
304 | sp = this_cpu_read(old_rsp) - 128; | 304 | sp = task_pt_regs(current)->sp - 128; |
305 | } | 305 | } |
306 | 306 | ||
307 | return (void __user *)round_down(sp - len, 16); | 307 | return (void __user *)round_down(sp - len, 16); |
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..854c04b3c9c2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -231,7 +231,9 @@ | |||
231 | #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ | 231 | #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ |
232 | #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ | 232 | #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ |
233 | #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ | 233 | #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ |
234 | #define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ | ||
234 | #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ | 235 | #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ |
236 | #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ | ||
235 | #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ | 237 | #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ |
236 | #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ | 238 | #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ |
237 | #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ | 239 | #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ |
@@ -418,6 +420,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
418 | " .word %P0\n" /* 1: do replace */ | 420 | " .word %P0\n" /* 1: do replace */ |
419 | " .byte 2b - 1b\n" /* source len */ | 421 | " .byte 2b - 1b\n" /* source len */ |
420 | " .byte 0\n" /* replacement len */ | 422 | " .byte 0\n" /* replacement len */ |
423 | " .byte 0\n" /* pad len */ | ||
421 | ".previous\n" | 424 | ".previous\n" |
422 | /* skipping size check since replacement size = 0 */ | 425 | /* skipping size check since replacement size = 0 */ |
423 | : : "i" (X86_FEATURE_ALWAYS) : : t_warn); | 426 | : : "i" (X86_FEATURE_ALWAYS) : : t_warn); |
@@ -432,6 +435,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
432 | " .word %P0\n" /* feature bit */ | 435 | " .word %P0\n" /* feature bit */ |
433 | " .byte 2b - 1b\n" /* source len */ | 436 | " .byte 2b - 1b\n" /* source len */ |
434 | " .byte 0\n" /* replacement len */ | 437 | " .byte 0\n" /* replacement len */ |
438 | " .byte 0\n" /* pad len */ | ||
435 | ".previous\n" | 439 | ".previous\n" |
436 | /* skipping size check since replacement size = 0 */ | 440 | /* skipping size check since replacement size = 0 */ |
437 | : : "i" (bit) : : t_no); | 441 | : : "i" (bit) : : t_no); |
@@ -457,6 +461,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
457 | " .word %P1\n" /* feature bit */ | 461 | " .word %P1\n" /* feature bit */ |
458 | " .byte 2b - 1b\n" /* source len */ | 462 | " .byte 2b - 1b\n" /* source len */ |
459 | " .byte 4f - 3f\n" /* replacement len */ | 463 | " .byte 4f - 3f\n" /* replacement len */ |
464 | " .byte 0\n" /* pad len */ | ||
460 | ".previous\n" | 465 | ".previous\n" |
461 | ".section .discard,\"aw\",@progbits\n" | 466 | ".section .discard,\"aw\",@progbits\n" |
462 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ | 467 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ |
@@ -483,31 +488,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
483 | static __always_inline __pure bool _static_cpu_has_safe(u16 bit) | 488 | static __always_inline __pure bool _static_cpu_has_safe(u16 bit) |
484 | { | 489 | { |
485 | #ifdef CC_HAVE_ASM_GOTO | 490 | #ifdef CC_HAVE_ASM_GOTO |
486 | /* | 491 | asm_volatile_goto("1: jmp %l[t_dynamic]\n" |
487 | * We need to spell the jumps to the compiler because, depending on the offset, | ||
488 | * the replacement jump can be bigger than the original jump, and this we cannot | ||
489 | * have. Thus, we force the jump to the widest, 4-byte, signed relative | ||
490 | * offset even though the last would often fit in less bytes. | ||
491 | */ | ||
492 | asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" | ||
493 | "2:\n" | 492 | "2:\n" |
493 | ".skip -(((5f-4f) - (2b-1b)) > 0) * " | ||
494 | "((5f-4f) - (2b-1b)),0x90\n" | ||
495 | "3:\n" | ||
494 | ".section .altinstructions,\"a\"\n" | 496 | ".section .altinstructions,\"a\"\n" |
495 | " .long 1b - .\n" /* src offset */ | 497 | " .long 1b - .\n" /* src offset */ |
496 | " .long 3f - .\n" /* repl offset */ | 498 | " .long 4f - .\n" /* repl offset */ |
497 | " .word %P1\n" /* always replace */ | 499 | " .word %P1\n" /* always replace */ |
498 | " .byte 2b - 1b\n" /* src len */ | 500 | " .byte 3b - 1b\n" /* src len */ |
499 | " .byte 4f - 3f\n" /* repl len */ | 501 | " .byte 5f - 4f\n" /* repl len */ |
502 | " .byte 3b - 2b\n" /* pad len */ | ||
500 | ".previous\n" | 503 | ".previous\n" |
501 | ".section .altinstr_replacement,\"ax\"\n" | 504 | ".section .altinstr_replacement,\"ax\"\n" |
502 | "3: .byte 0xe9\n .long %l[t_no] - 2b\n" | 505 | "4: jmp %l[t_no]\n" |
503 | "4:\n" | 506 | "5:\n" |
504 | ".previous\n" | 507 | ".previous\n" |
505 | ".section .altinstructions,\"a\"\n" | 508 | ".section .altinstructions,\"a\"\n" |
506 | " .long 1b - .\n" /* src offset */ | 509 | " .long 1b - .\n" /* src offset */ |
507 | " .long 0\n" /* no replacement */ | 510 | " .long 0\n" /* no replacement */ |
508 | " .word %P0\n" /* feature bit */ | 511 | " .word %P0\n" /* feature bit */ |
509 | " .byte 2b - 1b\n" /* src len */ | 512 | " .byte 3b - 1b\n" /* src len */ |
510 | " .byte 0\n" /* repl len */ | 513 | " .byte 0\n" /* repl len */ |
514 | " .byte 0\n" /* pad len */ | ||
511 | ".previous\n" | 515 | ".previous\n" |
512 | : : "i" (bit), "i" (X86_FEATURE_ALWAYS) | 516 | : : "i" (bit), "i" (X86_FEATURE_ALWAYS) |
513 | : : t_dynamic, t_no); | 517 | : : t_dynamic, t_no); |
@@ -527,6 +531,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) | |||
527 | " .word %P2\n" /* always replace */ | 531 | " .word %P2\n" /* always replace */ |
528 | " .byte 2b - 1b\n" /* source len */ | 532 | " .byte 2b - 1b\n" /* source len */ |
529 | " .byte 4f - 3f\n" /* replacement len */ | 533 | " .byte 4f - 3f\n" /* replacement len */ |
534 | " .byte 0\n" /* pad len */ | ||
530 | ".previous\n" | 535 | ".previous\n" |
531 | ".section .discard,\"aw\",@progbits\n" | 536 | ".section .discard,\"aw\",@progbits\n" |
532 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ | 537 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ |
@@ -541,6 +546,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) | |||
541 | " .word %P1\n" /* feature bit */ | 546 | " .word %P1\n" /* feature bit */ |
542 | " .byte 4b - 3b\n" /* src len */ | 547 | " .byte 4b - 3b\n" /* src len */ |
543 | " .byte 6f - 5f\n" /* repl len */ | 548 | " .byte 6f - 5f\n" /* repl len */ |
549 | " .byte 0\n" /* pad len */ | ||
544 | ".previous\n" | 550 | ".previous\n" |
545 | ".section .discard,\"aw\",@progbits\n" | 551 | ".section .discard,\"aw\",@progbits\n" |
546 | " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ | 552 | " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ |
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a94b82e8f156..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h | |||
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, | |||
376 | * Pentium F0 0F bugfix can have resulted in the mapped | 376 | * Pentium F0 0F bugfix can have resulted in the mapped |
377 | * IDT being write-protected. | 377 | * IDT being write-protected. |
378 | */ | 378 | */ |
379 | #define set_intr_gate(n, addr) \ | 379 | #define set_intr_gate_notrace(n, addr) \ |
380 | do { \ | 380 | do { \ |
381 | BUG_ON((unsigned)n > 0xFF); \ | 381 | BUG_ON((unsigned)n > 0xFF); \ |
382 | _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ | 382 | _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ |
383 | __KERNEL_CS); \ | 383 | __KERNEL_CS); \ |
384 | } while (0) | ||
385 | |||
386 | #define set_intr_gate(n, addr) \ | ||
387 | do { \ | ||
388 | set_intr_gate_notrace(n, addr); \ | ||
384 | _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ | 389 | _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ |
385 | 0, 0, __KERNEL_CS); \ | 390 | 0, 0, __KERNEL_CS); \ |
386 | } while (0) | 391 | } while (0) |
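A hedged usage sketch of the split (following the pattern in arch/x86/kernel/traps.c, where early boot cannot yet reference the trace_* handlers):

	/* Early boot: install the plain handler only. */
	set_intr_gate_notrace(X86_TRAP_DB, debug);

	/* Later, once tracing is available, the traced companion
	 * gate can be installed as well: */
	set_intr_gate(X86_TRAP_DB, debug);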
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h | |||
@@ -86,11 +86,23 @@ | |||
86 | CFI_ADJUST_CFA_OFFSET 8 | 86 | CFI_ADJUST_CFA_OFFSET 8 |
87 | .endm | 87 | .endm |
88 | 88 | ||
89 | .macro pushq_cfi_reg reg | ||
90 | pushq %\reg | ||
91 | CFI_ADJUST_CFA_OFFSET 8 | ||
92 | CFI_REL_OFFSET \reg, 0 | ||
93 | .endm | ||
94 | |||
89 | .macro popq_cfi reg | 95 | .macro popq_cfi reg |
90 | popq \reg | 96 | popq \reg |
91 | CFI_ADJUST_CFA_OFFSET -8 | 97 | CFI_ADJUST_CFA_OFFSET -8 |
92 | .endm | 98 | .endm |
93 | 99 | ||
100 | .macro popq_cfi_reg reg | ||
101 | popq %\reg | ||
102 | CFI_ADJUST_CFA_OFFSET -8 | ||
103 | CFI_RESTORE \reg | ||
104 | .endm | ||
105 | |||
94 | .macro pushfq_cfi | 106 | .macro pushfq_cfi |
95 | pushfq | 107 | pushfq |
96 | CFI_ADJUST_CFA_OFFSET 8 | 108 | CFI_ADJUST_CFA_OFFSET 8 |
@@ -116,11 +128,23 @@ | |||
116 | CFI_ADJUST_CFA_OFFSET 4 | 128 | CFI_ADJUST_CFA_OFFSET 4 |
117 | .endm | 129 | .endm |
118 | 130 | ||
131 | .macro pushl_cfi_reg reg | ||
132 | pushl %\reg | ||
133 | CFI_ADJUST_CFA_OFFSET 4 | ||
134 | CFI_REL_OFFSET \reg, 0 | ||
135 | .endm | ||
136 | |||
119 | .macro popl_cfi reg | 137 | .macro popl_cfi reg |
120 | popl \reg | 138 | popl \reg |
121 | CFI_ADJUST_CFA_OFFSET -4 | 139 | CFI_ADJUST_CFA_OFFSET -4 |
122 | .endm | 140 | .endm |
123 | 141 | ||
142 | .macro popl_cfi_reg reg | ||
143 | popl %\reg | ||
144 | CFI_ADJUST_CFA_OFFSET -4 | ||
145 | CFI_RESTORE \reg | ||
146 | .endm | ||
147 | |||
124 | .macro pushfl_cfi | 148 | .macro pushfl_cfi |
125 | pushfl | 149 | pushfl |
126 | CFI_ADJUST_CFA_OFFSET 4 | 150 | CFI_ADJUST_CFA_OFFSET 4 |
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..3563107b5060 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h | |||
@@ -171,10 +171,11 @@ do { \ | |||
171 | static inline void elf_common_init(struct thread_struct *t, | 171 | static inline void elf_common_init(struct thread_struct *t, |
172 | struct pt_regs *regs, const u16 ds) | 172 | struct pt_regs *regs, const u16 ds) |
173 | { | 173 | { |
174 | regs->ax = regs->bx = regs->cx = regs->dx = 0; | 174 | /* Commented-out registers are cleared in stub_execve */ |
175 | regs->si = regs->di = regs->bp = 0; | 175 | /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; |
176 | regs->si = regs->di /*= regs->bp*/ = 0; | ||
176 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; | 177 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; |
177 | regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; | 178 | /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ |
178 | t->fs = t->gs = 0; | 179 | t->fs = t->gs = 0; |
179 | t->fsindex = t->gsindex = 0; | 180 | t->fsindex = t->gsindex = 0; |
180 | t->ds = t->es = ds; | 181 | t->ds = t->es = ds; |
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9662290e0b20..e9571ddabc4f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h | |||
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *); | |||
181 | extern __visible void smp_invalidate_interrupt(struct pt_regs *); | 181 | extern __visible void smp_invalidate_interrupt(struct pt_regs *); |
182 | #endif | 182 | #endif |
183 | 183 | ||
184 | extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR | 184 | extern char irq_entries_start[]; |
185 | - FIRST_EXTERNAL_VECTOR])(void); | ||
186 | #ifdef CONFIG_TRACING | 185 | #ifdef CONFIG_TRACING |
187 | #define trace_interrupt interrupt | 186 | #define trace_irq_entries_start irq_entries_start |
188 | #endif | 187 | #endif |
189 | 188 | ||
190 | #define VECTOR_UNDEFINED (-1) | 189 | #define VECTOR_UNDEFINED (-1) |
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 47f29b1d1846..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h | |||
@@ -69,7 +69,7 @@ struct insn { | |||
69 | const insn_byte_t *next_byte; | 69 | const insn_byte_t *next_byte; |
70 | }; | 70 | }; |
71 | 71 | ||
72 | #define MAX_INSN_SIZE 16 | 72 | #define MAX_INSN_SIZE 15 |
73 | 73 | ||
74 | #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) | 74 | #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) |
75 | #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) | 75 | #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) |
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..b77f5edb03b0 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h | |||
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void) | |||
136 | #define USERGS_SYSRET32 \ | 136 | #define USERGS_SYSRET32 \ |
137 | swapgs; \ | 137 | swapgs; \ |
138 | sysretl | 138 | sysretl |
139 | #define ENABLE_INTERRUPTS_SYSEXIT32 \ | ||
140 | swapgs; \ | ||
141 | sti; \ | ||
142 | sysexit | ||
143 | 139 | ||
144 | #else | 140 | #else |
145 | #define INTERRUPT_RETURN iret | 141 | #define INTERRUPT_RETURN iret |
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void) | |||
163 | 159 | ||
164 | return arch_irqs_disabled_flags(flags); | 160 | return arch_irqs_disabled_flags(flags); |
165 | } | 161 | } |
162 | #endif /* !__ASSEMBLY__ */ | ||
166 | 163 | ||
164 | #ifdef __ASSEMBLY__ | ||
165 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
166 | # define TRACE_IRQS_ON call trace_hardirqs_on_thunk; | ||
167 | # define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; | ||
167 | #else | 168 | #else |
168 | 169 | # define TRACE_IRQS_ON | |
169 | #ifdef CONFIG_X86_64 | 170 | # define TRACE_IRQS_OFF |
170 | #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk | 171 | #endif |
171 | #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ | 172 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
173 | # ifdef CONFIG_X86_64 | ||
174 | # define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk | ||
175 | # define LOCKDEP_SYS_EXIT_IRQ \ | ||
172 | TRACE_IRQS_ON; \ | 176 | TRACE_IRQS_ON; \ |
173 | sti; \ | 177 | sti; \ |
174 | SAVE_REST; \ | 178 | call lockdep_sys_exit_thunk; \ |
175 | LOCKDEP_SYS_EXIT; \ | ||
176 | RESTORE_REST; \ | ||
177 | cli; \ | 179 | cli; \ |
178 | TRACE_IRQS_OFF; | 180 | TRACE_IRQS_OFF; |
179 | 181 | # else | |
180 | #else | 182 | # define LOCKDEP_SYS_EXIT \ |
181 | #define ARCH_LOCKDEP_SYS_EXIT \ | ||
182 | pushl %eax; \ | 183 | pushl %eax; \ |
183 | pushl %ecx; \ | 184 | pushl %ecx; \ |
184 | pushl %edx; \ | 185 | pushl %edx; \ |
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void) | |||
186 | popl %edx; \ | 187 | popl %edx; \ |
187 | popl %ecx; \ | 188 | popl %ecx; \ |
188 | popl %eax; | 189 | popl %eax; |
189 | 190 | # define LOCKDEP_SYS_EXIT_IRQ | |
190 | #define ARCH_LOCKDEP_SYS_EXIT_IRQ | 191 | # endif |
191 | #endif | ||
192 | |||
193 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
194 | # define TRACE_IRQS_ON call trace_hardirqs_on_thunk; | ||
195 | # define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; | ||
196 | #else | 192 | #else |
197 | # define TRACE_IRQS_ON | ||
198 | # define TRACE_IRQS_OFF | ||
199 | #endif | ||
200 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
201 | # define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT | ||
202 | # define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ | ||
203 | # else | ||
204 | # define LOCKDEP_SYS_EXIT | 193 | # define LOCKDEP_SYS_EXIT |
205 | # define LOCKDEP_SYS_EXIT_IRQ | 194 | # define LOCKDEP_SYS_EXIT_IRQ |
206 | # endif | 195 | #endif |
207 | |||
208 | #endif /* __ASSEMBLY__ */ | 196 | #endif /* __ASSEMBLY__ */ |
197 | |||
209 | #endif | 198 | #endif |
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 965c47d254aa..5f6051d5d139 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -976,11 +976,6 @@ extern void default_banner(void); | |||
976 | PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ | 976 | PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ |
977 | CLBR_NONE, \ | 977 | CLBR_NONE, \ |
978 | jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) | 978 | jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) |
979 | |||
980 | #define ENABLE_INTERRUPTS_SYSEXIT32 \ | ||
981 | PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ | ||
982 | CLBR_NONE, \ | ||
983 | jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) | ||
984 | #endif /* CONFIG_X86_32 */ | 979 | #endif /* CONFIG_X86_32 */ |
985 | 980 | ||
986 | #endif /* __ASSEMBLY__ */ | 981 | #endif /* __ASSEMBLY__ */ |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..d2203b5d9538 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -210,8 +210,23 @@ struct x86_hw_tss { | |||
210 | unsigned long sp0; | 210 | unsigned long sp0; |
211 | unsigned short ss0, __ss0h; | 211 | unsigned short ss0, __ss0h; |
212 | unsigned long sp1; | 212 | unsigned long sp1; |
213 | /* ss1 caches MSR_IA32_SYSENTER_CS: */ | 213 | |
214 | unsigned short ss1, __ss1h; | 214 | /* |
215 | * We don't use ring 1, so ss1 is a convenient scratch space in | ||
216 | * the same cacheline as sp0. We use ss1 to cache the value in | ||
217 | * MSR_IA32_SYSENTER_CS. When we context switch | ||
218 | * MSR_IA32_SYSENTER_CS, we first check if the new value being | ||
219 | * written matches ss1, and, if it's not, then we wrmsr the new | ||
220 | * value and update ss1. | ||
221 | * | ||
222 | * The only reason we context switch MSR_IA32_SYSENTER_CS is | ||
223 | * that we set it to zero in vm86 tasks to avoid corrupting the | ||
224 | * stack if we were to go through the sysenter path from vm86 | ||
225 | * mode. | ||
226 | */ | ||
227 | unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ | ||
228 | |||
229 | unsigned short __ss1h; | ||
215 | unsigned long sp2; | 230 | unsigned long sp2; |
216 | unsigned short ss2, __ss2h; | 231 | unsigned short ss2, __ss2h; |
217 | unsigned long __cr3; | 232 | unsigned long __cr3; |
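The ss1-as-cache scheme described in the new comment amounts to the following (a sketch; the helper name is made up, the real check lives in the context-switch path):

	static inline void update_sysenter_cs(struct tss_struct *tss,
					      unsigned short new_cs)
	{
		if (tss->x86_tss.ss1 == new_cs)
			return;	/* cached value matches: skip the wrmsr */

		tss->x86_tss.ss1 = new_cs;
		wrmsr(MSR_IA32_SYSENTER_CS, new_cs, 0);
	}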
@@ -276,13 +291,17 @@ struct tss_struct { | |||
276 | unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | 291 | unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; |
277 | 292 | ||
278 | /* | 293 | /* |
279 | * .. and then another 0x100 bytes for the emergency kernel stack: | 294 | * Space for the temporary SYSENTER stack: |
280 | */ | 295 | */ |
281 | unsigned long stack[64]; | 296 | unsigned long SYSENTER_stack[64]; |
282 | 297 | ||
283 | } ____cacheline_aligned; | 298 | } ____cacheline_aligned; |
284 | 299 | ||
285 | DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); | 300 | DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); |
301 | |||
302 | #ifdef CONFIG_X86_32 | ||
303 | DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); | ||
304 | #endif | ||
286 | 305 | ||
287 | /* | 306 | /* |
288 | * Save the original ist values for checking stack pointers during debugging | 307 | * Save the original ist values for checking stack pointers during debugging |
@@ -474,7 +493,6 @@ struct thread_struct { | |||
474 | #ifdef CONFIG_X86_32 | 493 | #ifdef CONFIG_X86_32 |
475 | unsigned long sysenter_cs; | 494 | unsigned long sysenter_cs; |
476 | #else | 495 | #else |
477 | unsigned long usersp; /* Copy from PDA */ | ||
478 | unsigned short es; | 496 | unsigned short es; |
479 | unsigned short ds; | 497 | unsigned short ds; |
480 | unsigned short fsindex; | 498 | unsigned short fsindex; |
@@ -564,6 +582,16 @@ static inline void native_swapgs(void) | |||
564 | #endif | 582 | #endif |
565 | } | 583 | } |
566 | 584 | ||
585 | static inline unsigned long current_top_of_stack(void) | ||
586 | { | ||
587 | #ifdef CONFIG_X86_64 | ||
588 | return this_cpu_read_stable(cpu_tss.x86_tss.sp0); | ||
589 | #else | ||
590 | /* sp0 on x86_32 is special in and around vm86 mode. */ | ||
591 | return this_cpu_read_stable(cpu_current_top_of_stack); | ||
592 | #endif | ||
593 | } | ||
594 | |||
567 | #ifdef CONFIG_PARAVIRT | 595 | #ifdef CONFIG_PARAVIRT |
568 | #include <asm/paravirt.h> | 596 | #include <asm/paravirt.h> |
569 | #else | 597 | #else |
@@ -761,10 +789,10 @@ extern char ignore_fpu_irq; | |||
761 | #define ARCH_HAS_SPINLOCK_PREFETCH | 789 | #define ARCH_HAS_SPINLOCK_PREFETCH |
762 | 790 | ||
763 | #ifdef CONFIG_X86_32 | 791 | #ifdef CONFIG_X86_32 |
764 | # define BASE_PREFETCH ASM_NOP4 | 792 | # define BASE_PREFETCH "" |
765 | # define ARCH_HAS_PREFETCH | 793 | # define ARCH_HAS_PREFETCH |
766 | #else | 794 | #else |
767 | # define BASE_PREFETCH "prefetcht0 (%1)" | 795 | # define BASE_PREFETCH "prefetcht0 %P1" |
768 | #endif | 796 | #endif |
769 | 797 | ||
770 | /* | 798 | /* |
@@ -775,10 +803,9 @@ extern char ignore_fpu_irq; | |||
775 | */ | 803 | */ |
776 | static inline void prefetch(const void *x) | 804 | static inline void prefetch(const void *x) |
777 | { | 805 | { |
778 | alternative_input(BASE_PREFETCH, | 806 | alternative_input(BASE_PREFETCH, "prefetchnta %P1", |
779 | "prefetchnta (%1)", | ||
780 | X86_FEATURE_XMM, | 807 | X86_FEATURE_XMM, |
781 | "r" (x)); | 808 | "m" (*(const char *)x)); |
782 | } | 809 | } |
783 | 810 | ||
784 | /* | 811 | /* |
@@ -788,10 +815,9 @@ static inline void prefetch(const void *x) | |||
788 | */ | 815 | */ |
789 | static inline void prefetchw(const void *x) | 816 | static inline void prefetchw(const void *x) |
790 | { | 817 | { |
791 | alternative_input(BASE_PREFETCH, | 818 | alternative_input(BASE_PREFETCH, "prefetchw %P1", |
792 | "prefetchw (%1)", | 819 | X86_FEATURE_3DNOWPREFETCH, |
793 | X86_FEATURE_3DNOW, | 820 | "m" (*(const char *)x)); |
794 | "r" (x)); | ||
795 | } | 821 | } |
796 | 822 | ||
797 | static inline void spin_lock_prefetch(const void *x) | 823 | static inline void spin_lock_prefetch(const void *x) |
@@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x) | |||
799 | prefetchw(x); | 825 | prefetchw(x); |
800 | } | 826 | } |
801 | 827 | ||
828 | #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ | ||
829 | TOP_OF_KERNEL_STACK_PADDING) | ||
830 | |||
802 | #ifdef CONFIG_X86_32 | 831 | #ifdef CONFIG_X86_32 |
803 | /* | 832 | /* |
804 | * User space process size: 3GB (default). | 833 | * User space process size: 3GB (default). |
@@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x) | |||
809 | #define STACK_TOP_MAX STACK_TOP | 838 | #define STACK_TOP_MAX STACK_TOP |
810 | 839 | ||
811 | #define INIT_THREAD { \ | 840 | #define INIT_THREAD { \ |
812 | .sp0 = sizeof(init_stack) + (long)&init_stack, \ | 841 | .sp0 = TOP_OF_INIT_STACK, \ |
813 | .vm86_info = NULL, \ | 842 | .vm86_info = NULL, \ |
814 | .sysenter_cs = __KERNEL_CS, \ | 843 | .sysenter_cs = __KERNEL_CS, \ |
815 | .io_bitmap_ptr = NULL, \ | 844 | .io_bitmap_ptr = NULL, \ |
816 | } | 845 | } |
817 | 846 | ||
818 | /* | ||
819 | * Note that the .io_bitmap member must be extra-big. This is because | ||
820 | * the CPU will access an additional byte beyond the end of the IO | ||
821 | * permission bitmap. The extra byte must be all 1 bits, and must | ||
822 | * be within the limit. | ||
823 | */ | ||
824 | #define INIT_TSS { \ | ||
825 | .x86_tss = { \ | ||
826 | .sp0 = sizeof(init_stack) + (long)&init_stack, \ | ||
827 | .ss0 = __KERNEL_DS, \ | ||
828 | .ss1 = __KERNEL_CS, \ | ||
829 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ | ||
830 | }, \ | ||
831 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ | ||
832 | } | ||
833 | |||
834 | extern unsigned long thread_saved_pc(struct task_struct *tsk); | 847 | extern unsigned long thread_saved_pc(struct task_struct *tsk); |
835 | 848 | ||
836 | #define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) | ||
837 | #define KSTK_TOP(info) \ | ||
838 | ({ \ | ||
839 | unsigned long *__ptr = (unsigned long *)(info); \ | ||
840 | (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ | ||
841 | }) | ||
842 | |||
843 | /* | 849 | /* |
844 | * The below -8 is to reserve 8 bytes on top of the ring0 stack. | 850 | * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. |
845 | * This is necessary to guarantee that the entire "struct pt_regs" | 851 | * This is necessary to guarantee that the entire "struct pt_regs" |
846 | * is accessible even if the CPU hasn't stored the SS/ESP registers | 852 | * is accessible even if the CPU hasn't stored the SS/ESP registers |
847 | * on the stack (interrupt gate does not save these registers | 853 | * on the stack (interrupt gate does not save these registers |
@@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
850 | * "struct pt_regs" is possible, but they may contain the | 856 | * "struct pt_regs" is possible, but they may contain the |
851 | * completely wrong values. | 857 | * completely wrong values. |
852 | */ | 858 | */ |
853 | #define task_pt_regs(task) \ | 859 | #define task_pt_regs(task) \ |
854 | ({ \ | 860 | ({ \ |
855 | struct pt_regs *__regs__; \ | 861 | unsigned long __ptr = (unsigned long)task_stack_page(task); \ |
856 | __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ | 862 | __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ |
857 | __regs__ - 1; \ | 863 | ((struct pt_regs *)__ptr) - 1; \ |
858 | }) | 864 | }) |
859 | 865 | ||
860 | #define KSTK_ESP(task) (task_pt_regs(task)->sp) | 866 | #define KSTK_ESP(task) (task_pt_regs(task)->sp) |
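The same arithmetic with concrete numbers (a sketch; the base address and an 8 KiB THREAD_SIZE are illustrative, not the kernel's configuration):

	unsigned long base = 0xc1000000UL;	/* task_stack_page(task) */
	unsigned long top  = base + 0x2000 - 8;	/* minus padding: 0xc1001ff8 */
	struct pt_regs *regs = (struct pt_regs *)top - 1;

so the whole frame sits below the padding even when the CPU did not push SS/ESP.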
@@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
886 | #define STACK_TOP_MAX TASK_SIZE_MAX | 892 | #define STACK_TOP_MAX TASK_SIZE_MAX |
887 | 893 | ||
888 | #define INIT_THREAD { \ | 894 | #define INIT_THREAD { \ |
889 | .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | 895 | .sp0 = TOP_OF_INIT_STACK \ |
890 | } | ||
891 | |||
892 | #define INIT_TSS { \ | ||
893 | .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | ||
894 | } | 896 | } |
895 | 897 | ||
896 | /* | 898 | /* |
@@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
902 | #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) | 904 | #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) |
903 | extern unsigned long KSTK_ESP(struct task_struct *task); | 905 | extern unsigned long KSTK_ESP(struct task_struct *task); |
904 | 906 | ||
905 | /* | ||
906 | * User space RSP while inside the SYSCALL fast path | ||
907 | */ | ||
908 | DECLARE_PER_CPU(unsigned long, old_rsp); | ||
909 | |||
910 | #endif /* CONFIG_X86_64 */ | 907 | #endif /* CONFIG_X86_64 */ |
911 | 908 | ||
912 | extern void start_thread(struct pt_regs *regs, unsigned long new_ip, | 909 | extern void start_thread(struct pt_regs *regs, unsigned long new_ip, |
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..19507ffa5d28 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h | |||
@@ -31,13 +31,17 @@ struct pt_regs { | |||
31 | #else /* __i386__ */ | 31 | #else /* __i386__ */ |
32 | 32 | ||
33 | struct pt_regs { | 33 | struct pt_regs { |
34 | /* | ||
35 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry | ||
36 | * unless syscall needs a complete, fully filled "struct pt_regs". | ||
37 | */ | ||
34 | unsigned long r15; | 38 | unsigned long r15; |
35 | unsigned long r14; | 39 | unsigned long r14; |
36 | unsigned long r13; | 40 | unsigned long r13; |
37 | unsigned long r12; | 41 | unsigned long r12; |
38 | unsigned long bp; | 42 | unsigned long bp; |
39 | unsigned long bx; | 43 | unsigned long bx; |
40 | /* arguments: non interrupts/non tracing syscalls only save up to here*/ | 44 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
41 | unsigned long r11; | 45 | unsigned long r11; |
42 | unsigned long r10; | 46 | unsigned long r10; |
43 | unsigned long r9; | 47 | unsigned long r9; |
@@ -47,9 +51,12 @@ struct pt_regs { | |||
47 | unsigned long dx; | 51 | unsigned long dx; |
48 | unsigned long si; | 52 | unsigned long si; |
49 | unsigned long di; | 53 | unsigned long di; |
54 | /* | ||
55 | * On syscall entry, this is the syscall number. On a CPU exception, this is the error code. | ||
56 | * On a hw interrupt, it's the IRQ number: | ||
57 | */ | ||
50 | unsigned long orig_ax; | 58 | unsigned long orig_ax; |
51 | /* end of arguments */ | 59 | /* Return frame for iretq */ |
52 | /* cpu exception frame or undefined */ | ||
53 | unsigned long ip; | 60 | unsigned long ip; |
54 | unsigned long cs; | 61 | unsigned long cs; |
55 | unsigned long flags; | 62 | unsigned long flags; |
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) | |||
89 | } | 96 | } |
90 | 97 | ||
91 | /* | 98 | /* |
92 | * user_mode_vm(regs) determines whether a register set came from user mode. | 99 | * user_mode(regs) determines whether a register set came from user |
93 | * This is true if V8086 mode was enabled OR if the register set was from | 100 | * mode. On x86_32, this is true if V8086 mode was enabled OR if the |
94 | * protected mode with RPL-3 CS value. This tricky test checks that with | 101 | * register set was from protected mode with RPL-3 CS value. This |
95 | * one comparison. Many places in the kernel can bypass this full check | 102 | * tricky test checks that with one comparison. |
96 | * if they have already ruled out V8086 mode, so user_mode(regs) can be used. | 103 | * |
104 | * On x86_64, vm86 mode is mercifully nonexistent, and we don't need | ||
105 | * the extra check. | ||
97 | */ | 106 | */ |
98 | static inline int user_mode(struct pt_regs *regs) | 107 | static inline int user_mode(struct pt_regs *regs) |
99 | { | 108 | { |
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs) | |||
104 | #endif | 113 | #endif |
105 | } | 114 | } |
106 | 115 | ||
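Spelled out for the 32-bit case the comment above describes (a sketch: the CS RPL occupies bits 1:0 and the EFLAGS VM bit is bit 17, so OR-ing them makes any vm86 frame compare >= USER_RPL no matter what its CS holds):

	static inline int user_mode_32bit(unsigned long cs,
					  unsigned long flags)
	{
		return ((cs & SEGMENT_RPL_MASK) |
			(flags & X86_VM_MASK)) >= USER_RPL;
	}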
107 | static inline int user_mode_vm(struct pt_regs *regs) | ||
108 | { | ||
109 | #ifdef CONFIG_X86_32 | ||
110 | return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= | ||
111 | USER_RPL; | ||
112 | #else | ||
113 | return user_mode(regs); | ||
114 | #endif | ||
115 | } | ||
116 | |||
117 | static inline int v8086_mode(struct pt_regs *regs) | 116 | static inline int v8086_mode(struct pt_regs *regs) |
118 | { | 117 | { |
119 | #ifdef CONFIG_X86_32 | 118 | #ifdef CONFIG_X86_32 |
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) | |||
138 | #endif | 137 | #endif |
139 | } | 138 | } |
140 | 139 | ||
141 | #define current_user_stack_pointer() this_cpu_read(old_rsp) | 140 | #define current_user_stack_pointer() current_pt_regs()->sp |
142 | /* ia32 vs. x32 difference */ | 141 | #define compat_user_stack_pointer() current_pt_regs()->sp |
143 | #define compat_user_stack_pointer() \ | ||
144 | (test_thread_flag(TIF_IA32) \ | ||
145 | ? current_pt_regs()->sp \ | ||
146 | : this_cpu_read(old_rsp)) | ||
147 | #endif | 142 | #endif |
148 | 143 | ||
149 | #ifdef CONFIG_X86_32 | 144 | #ifdef CONFIG_X86_32 |
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, | |||
248 | */ | 243 | */ |
249 | #define arch_ptrace_stop_needed(code, info) \ | 244 | #define arch_ptrace_stop_needed(code, info) \ |
250 | ({ \ | 245 | ({ \ |
251 | set_thread_flag(TIF_NOTIFY_RESUME); \ | 246 | force_iret(); \ |
252 | false; \ | 247 | false; \ |
253 | }) | 248 | }) |
254 | 249 | ||
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index db257a58571f..5a9856eb12ba 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h | |||
@@ -3,8 +3,10 @@ | |||
3 | 3 | ||
4 | #include <linux/const.h> | 4 | #include <linux/const.h> |
5 | 5 | ||
6 | /* Constructor for a conventional segment GDT (or LDT) entry */ | 6 | /* |
7 | /* This is a macro so it can be used in initializers */ | 7 | * Constructor for a conventional segment GDT (or LDT) entry. |
8 | * This is a macro so it can be used in initializers. | ||
9 | */ | ||
8 | #define GDT_ENTRY(flags, base, limit) \ | 10 | #define GDT_ENTRY(flags, base, limit) \ |
9 | ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ | 11 | ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ |
10 | (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ | 12 | (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ |
@@ -12,198 +14,228 @@ | |||
12 | (((base) & _AC(0x00ffffff,ULL)) << 16) | \ | 14 | (((base) & _AC(0x00ffffff,ULL)) << 16) | \ |
13 | (((limit) & _AC(0x0000ffff,ULL)))) | 15 | (((limit) & _AC(0x0000ffff,ULL)))) |
14 | 16 | ||
15 | /* Simple and small GDT entries for booting only */ | 17 | /* Simple and small GDT entries for booting only: */ |
16 | 18 | ||
17 | #define GDT_ENTRY_BOOT_CS 2 | 19 | #define GDT_ENTRY_BOOT_CS 2 |
18 | #define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) | 20 | #define GDT_ENTRY_BOOT_DS 3 |
21 | #define GDT_ENTRY_BOOT_TSS 4 | ||
22 | #define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) | ||
23 | #define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) | ||
24 | #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) | ||
25 | |||
26 | /* | ||
27 | * Bottom two bits of selector give the ring | ||
28 | * privilege level | ||
29 | */ | ||
30 | #define SEGMENT_RPL_MASK 0x3 | ||
19 | 31 | ||
20 | #define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) | 32 | /* User mode is privilege level 3: */ |
21 | #define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) | 33 | #define USER_RPL 0x3 |
22 | 34 | ||
23 | #define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) | 35 | /* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ |
24 | #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) | 36 | #define SEGMENT_TI_MASK 0x4 |
37 | /* LDT segment has TI set ... */ | ||
38 | #define SEGMENT_LDT 0x4 | ||
39 | /* ... GDT has it cleared */ | ||
40 | #define SEGMENT_GDT 0x0 | ||
25 | 41 | ||
26 | #define SEGMENT_RPL_MASK 0x3 /* | 42 | #define GDT_ENTRY_INVALID_SEG 0 |
27 | * Bottom two bits of selector give the ring | ||
28 | * privilege level | ||
29 | */ | ||
30 | #define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ | ||
31 | #define USER_RPL 0x3 /* User mode is privilege level 3 */ | ||
32 | #define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ | ||
33 | #define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ | ||
34 | 43 | ||
35 | #ifdef CONFIG_X86_32 | 44 | #ifdef CONFIG_X86_32 |
36 | /* | 45 | /* |
37 | * The layout of the per-CPU GDT under Linux: | 46 | * The layout of the per-CPU GDT under Linux: |
38 | * | 47 | * |
39 | * 0 - null | 48 | * 0 - null <=== cacheline #1 |
40 | * 1 - reserved | 49 | * 1 - reserved |
41 | * 2 - reserved | 50 | * 2 - reserved |
42 | * 3 - reserved | 51 | * 3 - reserved |
43 | * | 52 | * |
44 | * 4 - unused <==== new cacheline | 53 | * 4 - unused <=== cacheline #2 |
45 | * 5 - unused | 54 | * 5 - unused |
46 | * | 55 | * |
47 | * ------- start of TLS (Thread-Local Storage) segments: | 56 | * ------- start of TLS (Thread-Local Storage) segments: |
48 | * | 57 | * |
49 | * 6 - TLS segment #1 [ glibc's TLS segment ] | 58 | * 6 - TLS segment #1 [ glibc's TLS segment ] |
50 | * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] | 59 | * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] |
51 | * 8 - TLS segment #3 | 60 | * 8 - TLS segment #3 <=== cacheline #3 |
52 | * 9 - reserved | 61 | * 9 - reserved |
53 | * 10 - reserved | 62 | * 10 - reserved |
54 | * 11 - reserved | 63 | * 11 - reserved |
55 | * | 64 | * |
56 | * ------- start of kernel segments: | 65 | * ------- start of kernel segments: |
57 | * | 66 | * |
58 | * 12 - kernel code segment <==== new cacheline | 67 | * 12 - kernel code segment <=== cacheline #4 |
59 | * 13 - kernel data segment | 68 | * 13 - kernel data segment |
60 | * 14 - default user CS | 69 | * 14 - default user CS |
61 | * 15 - default user DS | 70 | * 15 - default user DS |
62 | * 16 - TSS | 71 | * 16 - TSS <=== cacheline #5 |
63 | * 17 - LDT | 72 | * 17 - LDT |
64 | * 18 - PNPBIOS support (16->32 gate) | 73 | * 18 - PNPBIOS support (16->32 gate) |
65 | * 19 - PNPBIOS support | 74 | * 19 - PNPBIOS support |
66 | * 20 - PNPBIOS support | 75 | * 20 - PNPBIOS support <=== cacheline #6 |
67 | * 21 - PNPBIOS support | 76 | * 21 - PNPBIOS support |
68 | * 22 - PNPBIOS support | 77 | * 22 - PNPBIOS support |
69 | * 23 - APM BIOS support | 78 | * 23 - APM BIOS support |
70 | * 24 - APM BIOS support | 79 | * 24 - APM BIOS support <=== cacheline #7 |
71 | * 25 - APM BIOS support | 80 | * 25 - APM BIOS support |
72 | * | 81 | * |
73 | * 26 - ESPFIX small SS | 82 | * 26 - ESPFIX small SS |
74 | * 27 - per-cpu [ offset to per-cpu data area ] | 83 | * 27 - per-cpu [ offset to per-cpu data area ] |
75 | * 28 - stack_canary-20 [ for stack protector ] | 84 | * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 |
76 | * 29 - unused | 85 | * 29 - unused |
77 | * 30 - unused | 86 | * 30 - unused |
78 | * 31 - TSS for double fault handler | 87 | * 31 - TSS for double fault handler |
79 | */ | 88 | */ |
80 | #define GDT_ENTRY_TLS_MIN 6 | 89 | #define GDT_ENTRY_TLS_MIN 6 |
81 | #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) | 90 | #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) |
82 | 91 | ||
92 | #define GDT_ENTRY_KERNEL_CS 12 | ||
93 | #define GDT_ENTRY_KERNEL_DS 13 | ||
83 | #define GDT_ENTRY_DEFAULT_USER_CS 14 | 94 | #define GDT_ENTRY_DEFAULT_USER_CS 14 |
84 | |||
85 | #define GDT_ENTRY_DEFAULT_USER_DS 15 | 95 | #define GDT_ENTRY_DEFAULT_USER_DS 15 |
96 | #define GDT_ENTRY_TSS 16 | ||
97 | #define GDT_ENTRY_LDT 17 | ||
98 | #define GDT_ENTRY_PNPBIOS_CS32 18 | ||
99 | #define GDT_ENTRY_PNPBIOS_CS16 19 | ||
100 | #define GDT_ENTRY_PNPBIOS_DS 20 | ||
101 | #define GDT_ENTRY_PNPBIOS_TS1 21 | ||
102 | #define GDT_ENTRY_PNPBIOS_TS2 22 | ||
103 | #define GDT_ENTRY_APMBIOS_BASE 23 | ||
104 | |||
105 | #define GDT_ENTRY_ESPFIX_SS 26 | ||
106 | #define GDT_ENTRY_PERCPU 27 | ||
107 | #define GDT_ENTRY_STACK_CANARY 28 | ||
108 | |||
109 | #define GDT_ENTRY_DOUBLEFAULT_TSS 31 | ||
86 | 110 | ||
87 | #define GDT_ENTRY_KERNEL_BASE (12) | 111 | /* |
112 | * Number of entries in the GDT table: | ||
113 | */ | ||
114 | #define GDT_ENTRIES 32 | ||
88 | 115 | ||
89 | #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) | 116 | /* |
117 | * Segment selector values corresponding to the above entries: | ||
118 | */ | ||
90 | 119 | ||
91 | #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) | 120 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) |
121 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) | ||
122 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) | ||
123 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) | ||
124 | #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) | ||
92 | 125 | ||
93 | #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) | 126 | /* segment for calling fn: */ |
94 | #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) | 127 | #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) |
128 | /* code segment for BIOS: */ | ||
129 | #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) | ||
95 | 130 | ||
96 | #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) | 131 | /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ |
97 | #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) | 132 | #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) |
98 | 133 | ||
99 | #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) | 134 | /* data segment for BIOS: */ |
100 | #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) | 135 | #define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) |
136 | /* transfer data segment: */ | ||
137 | #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) | ||
138 | /* another data segment: */ | ||
139 | #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) | ||
101 | 140 | ||
102 | #define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) | ||
103 | #ifdef CONFIG_SMP | 141 | #ifdef CONFIG_SMP |
104 | #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) | 142 | # define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) |
105 | #else | 143 | #else |
106 | #define __KERNEL_PERCPU 0 | 144 | # define __KERNEL_PERCPU 0 |
107 | #endif | 145 | #endif |
108 | 146 | ||
109 | #define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) | ||
110 | #ifdef CONFIG_CC_STACKPROTECTOR | 147 | #ifdef CONFIG_CC_STACKPROTECTOR |
111 | #define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) | 148 | # define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) |
112 | #else | 149 | #else |
113 | #define __KERNEL_STACK_CANARY 0 | 150 | # define __KERNEL_STACK_CANARY 0 |
114 | #endif | 151 | #endif |
115 | 152 | ||
116 | #define GDT_ENTRY_DOUBLEFAULT_TSS 31 | 153 | #else /* 64-bit: */ |
117 | |||
118 | /* | ||
119 | * The GDT has 32 entries | ||
120 | */ | ||
121 | #define GDT_ENTRIES 32 | ||
122 | 154 | ||
123 | /* The PnP BIOS entries in the GDT */ | 155 | #include <asm/cache.h> |
124 | #define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) | ||
125 | #define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) | ||
126 | #define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) | ||
127 | #define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) | ||
128 | #define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) | ||
129 | |||
130 | /* The PnP BIOS selectors */ | ||
131 | #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ | ||
132 | #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ | ||
133 | #define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ | ||
134 | #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ | ||
135 | #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ | ||
136 | 156 | ||
157 | #define GDT_ENTRY_KERNEL32_CS 1 | ||
158 | #define GDT_ENTRY_KERNEL_CS 2 | ||
159 | #define GDT_ENTRY_KERNEL_DS 3 | ||
137 | 160 | ||
138 | /* | 161 | /* |
139 | * Matching rules for certain types of segments. | 162 | * We cannot use the same code segment descriptor for user and kernel mode, |
163 | * not even in long flat mode, because of different DPL. | ||
164 | * | ||
165 | * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes | ||
166 | * selectors: | ||
167 | * | ||
168 | * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, | ||
169 | * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, | ||
170 | * | ||
171 | * ss = STAR.SYSRET_CS+8 (in either case) | ||
172 | * | ||
173 | * thus USER_DS should be between 32-bit and 64-bit code selectors: | ||
140 | */ | 174 | */ |
175 | #define GDT_ENTRY_DEFAULT_USER32_CS 4 | ||
176 | #define GDT_ENTRY_DEFAULT_USER_DS 5 | ||
177 | #define GDT_ENTRY_DEFAULT_USER_CS 6 | ||
141 | 178 | ||
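Concretely, the SYSCALL MSR setup consumes this layout roughly as follows (a sketch of the wrmsrl in syscall_init(); bit positions per the SDM):

	/*
	 * STAR[47:32]: SYSCALL loads CS from here, SS from here + 8
	 *              -> __KERNEL_CS (2*8), __KERNEL_DS (3*8)
	 * STAR[63:48]: SYSRET to 32-bit user: CS = here (__USER32_CS)
	 *              SYSRET, either mode:   SS = here + 8 (__USER_DS)
	 *              SYSRET to 64-bit user: CS = here + 16 (__USER_CS)
	 */
	wrmsrl(MSR_STAR, ((u64)__USER32_CS << 48) | ((u64)__KERNEL_CS << 32));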
142 | /* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ | 179 | /* Needs two entries */ |
143 | #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) | 180 | #define GDT_ENTRY_TSS 8 |
144 | 181 | /* Needs two entries */ | |
182 | #define GDT_ENTRY_LDT 10 | ||
145 | 183 | ||
146 | #else | 184 | #define GDT_ENTRY_TLS_MIN 12 |
147 | #include <asm/cache.h> | 185 | #define GDT_ENTRY_TLS_MAX 14 |
148 | |||
149 | #define GDT_ENTRY_KERNEL32_CS 1 | ||
150 | #define GDT_ENTRY_KERNEL_CS 2 | ||
151 | #define GDT_ENTRY_KERNEL_DS 3 | ||
152 | 186 | ||
153 | #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) | 187 | /* Abused to load per CPU data from limit */ |
188 | #define GDT_ENTRY_PER_CPU 15 | ||
154 | 189 | ||
155 | /* | 190 | /* |
156 | * we cannot use the same code segment descriptor for user and kernel | 191 | * Number of entries in the GDT table: |
157 | * -- not even in the long flat mode, because of different DPL /kkeil | ||
158 | * The segment offset needs to contain a RPL. Grr. -AK | ||
159 | * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) | ||
160 | */ | 192 | */ |
161 | #define GDT_ENTRY_DEFAULT_USER32_CS 4 | 193 | #define GDT_ENTRIES 16 |
162 | #define GDT_ENTRY_DEFAULT_USER_DS 5 | ||
163 | #define GDT_ENTRY_DEFAULT_USER_CS 6 | ||
164 | #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) | ||
165 | #define __USER32_DS __USER_DS | ||
166 | |||
167 | #define GDT_ENTRY_TSS 8 /* needs two entries */ | ||
168 | #define GDT_ENTRY_LDT 10 /* needs two entries */ | ||
169 | #define GDT_ENTRY_TLS_MIN 12 | ||
170 | #define GDT_ENTRY_TLS_MAX 14 | ||
171 | |||
172 | #define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ | ||
173 | #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) | ||
174 | 194 | ||
175 | /* TLS indexes for 64bit - hardcoded in arch_prctl */ | 195 | /* |
176 | #define FS_TLS 0 | 196 | * Segment selector values corresponding to the above entries: |
177 | #define GS_TLS 1 | 197 | * |
178 | 198 | * Note, selectors also need to have a correct RPL, | |
179 | #define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) | 199 | * expressed with the +3 value for user-space selectors: |
180 | #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) | 200 | */ |
181 | 201 | #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) | |
182 | #define GDT_ENTRIES 16 | 202 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) |
203 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) | ||
204 | #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) | ||
205 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) | ||
206 | #define __USER32_DS __USER_DS | ||
207 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) | ||
208 | #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) | ||
209 | |||
210 | /* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ | ||
211 | #define FS_TLS 0 | ||
212 | #define GS_TLS 1 | ||
213 | |||
214 | #define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) | ||
215 | #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) | ||
183 | 216 | ||
184 | #endif | 217 | #endif |
185 | 218 | ||
186 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) | ||
187 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) | ||
188 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) | ||
189 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) | ||
190 | #ifndef CONFIG_PARAVIRT | 219 | #ifndef CONFIG_PARAVIRT |
191 | #define get_kernel_rpl() 0 | 220 | # define get_kernel_rpl() 0 |
192 | #endif | 221 | #endif |
193 | 222 | ||
194 | #define IDT_ENTRIES 256 | 223 | #define IDT_ENTRIES 256 |
195 | #define NUM_EXCEPTION_VECTORS 32 | 224 | #define NUM_EXCEPTION_VECTORS 32 |
196 | /* Bitmask of exception vectors which push an error code on the stack */ | 225 | |
197 | #define EXCEPTION_ERRCODE_MASK 0x00027d00 | 226 | /* Bitmask of exception vectors which push an error code on the stack: */ |
198 | #define GDT_SIZE (GDT_ENTRIES * 8) | 227 | #define EXCEPTION_ERRCODE_MASK 0x00027d00 |
199 | #define GDT_ENTRY_TLS_ENTRIES 3 | 228 | |
200 | #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) | 229 | #define GDT_SIZE (GDT_ENTRIES*8) |
230 | #define GDT_ENTRY_TLS_ENTRIES 3 | ||
231 | #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES*8) | ||
201 | 232 | ||
202 | #ifdef __KERNEL__ | 233 | #ifdef __KERNEL__ |
203 | #ifndef __ASSEMBLY__ | 234 | #ifndef __ASSEMBLY__ |
235 | |||
204 | extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; | 236 | extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; |
205 | #ifdef CONFIG_TRACING | 237 | #ifdef CONFIG_TRACING |
206 | #define trace_early_idt_handlers early_idt_handlers | 238 | # define trace_early_idt_handlers early_idt_handlers |
207 | #endif | 239 | #endif |
208 | 240 | ||
209 | /* | 241 | /* |
@@ -228,37 +260,30 @@ do { \ | |||
228 | } while (0) | 260 | } while (0) |
229 | 261 | ||
230 | /* | 262 | /* |
231 | * Save a segment register away | 263 | * Save a segment register away: |
232 | */ | 264 | */ |
233 | #define savesegment(seg, value) \ | 265 | #define savesegment(seg, value) \ |
234 | asm("mov %%" #seg ",%0":"=r" (value) : : "memory") | 266 | asm("mov %%" #seg ",%0":"=r" (value) : : "memory") |
235 | 267 | ||
236 | /* | 268 | /* |
237 | * x86_32 user gs accessors. | 269 | * x86-32 user GS accessors: |
238 | */ | 270 | */ |
239 | #ifdef CONFIG_X86_32 | 271 | #ifdef CONFIG_X86_32 |
240 | #ifdef CONFIG_X86_32_LAZY_GS | 272 | # ifdef CONFIG_X86_32_LAZY_GS |
241 | #define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) | 273 | # define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) |
242 | #define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) | 274 | # define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) |
243 | #define task_user_gs(tsk) ((tsk)->thread.gs) | 275 | # define task_user_gs(tsk) ((tsk)->thread.gs) |
244 | #define lazy_save_gs(v) savesegment(gs, (v)) | 276 | # define lazy_save_gs(v) savesegment(gs, (v)) |
245 | #define lazy_load_gs(v) loadsegment(gs, (v)) | 277 | # define lazy_load_gs(v) loadsegment(gs, (v)) |
246 | #else /* X86_32_LAZY_GS */ | 278 | # else /* X86_32_LAZY_GS */ |
247 | #define get_user_gs(regs) (u16)((regs)->gs) | 279 | # define get_user_gs(regs) (u16)((regs)->gs) |
248 | #define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) | 280 | # define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) |
249 | #define task_user_gs(tsk) (task_pt_regs(tsk)->gs) | 281 | # define task_user_gs(tsk) (task_pt_regs(tsk)->gs) |
250 | #define lazy_save_gs(v) do { } while (0) | 282 | # define lazy_save_gs(v) do { } while (0) |
251 | #define lazy_load_gs(v) do { } while (0) | 283 | # define lazy_load_gs(v) do { } while (0) |
252 | #endif /* X86_32_LAZY_GS */ | 284 | # endif /* X86_32_LAZY_GS */ |
253 | #endif /* X86_32 */ | 285 | #endif /* X86_32 */ |
254 | 286 | ||
255 | static inline unsigned long get_limit(unsigned long segment) | ||
256 | { | ||
257 | unsigned long __limit; | ||
258 | asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); | ||
259 | return __limit + 1; | ||
260 | } | ||
261 | |||
262 | #endif /* !__ASSEMBLY__ */ | 287 | #endif /* !__ASSEMBLY__ */ |
263 | #endif /* __KERNEL__ */ | 288 | #endif /* __KERNEL__ */ |
264 | 289 | ||
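The SYSRET layout constraint documented above can be checked numerically. A standalone sketch (selector values copied from the definitions above; everything else is plain C, nothing kernel-side assumed):

	#include <assert.h>

	#define GDT_ENTRY_DEFAULT_USER32_CS	4
	#define GDT_ENTRY_DEFAULT_USER_DS	5
	#define GDT_ENTRY_DEFAULT_USER_CS	6

	#define __USER32_CS	(GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
	#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
	#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8 + 3)

	int main(void)
	{
		/* SYSRET to 32-bit userspace: CS = STAR.SYSRET_CS, SS = CS + 8 */
		assert(__USER_DS == __USER32_CS + 8);
		/* SYSRET to 64-bit userspace: CS = STAR.SYSRET_CS + 16, same SS */
		assert(__USER_CS == __USER32_CS + 16);
		return 0;
	}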
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ff4e7b236e21..f69e06b283fb 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h | |||
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { } | |||
66 | */ | 66 | */ |
67 | extern struct boot_params boot_params; | 67 | extern struct boot_params boot_params; |
68 | 68 | ||
69 | static inline bool kaslr_enabled(void) | ||
70 | { | ||
71 | return !!(boot_params.hdr.loadflags & KASLR_FLAG); | ||
72 | } | ||
73 | |||
69 | /* | 74 | /* |
70 | * Do NOT EVER look at the BIOS memory size location. | 75 | * Do NOT EVER look at the BIOS memory size location. |
71 | * It does not work on many machines. | 76 | * It does not work on many machines. |
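A sketch of the kind of consumer kaslr_enabled() is meant for (hypothetical function; the notifier signature, _text and __START_KERNEL are assumed context, not part of this hunk):

	static int dump_kernel_offset(struct notifier_block *self,
				      unsigned long v, void *p)
	{
		if (kaslr_enabled())
			pr_emerg("Kernel Offset: 0x%lx\n",
				 (unsigned long)&_text - __START_KERNEL);
		else
			pr_emerg("Kernel Offset: disabled\n");

		return 0;
	}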
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h | |||
@@ -57,9 +57,9 @@ struct sigcontext { | |||
57 | unsigned long ip; | 57 | unsigned long ip; |
58 | unsigned long flags; | 58 | unsigned long flags; |
59 | unsigned short cs; | 59 | unsigned short cs; |
60 | unsigned short gs; | 60 | unsigned short __pad2; /* Was called gs, but was always zero. */ |
61 | unsigned short fs; | 61 | unsigned short __pad1; /* Was called fs, but was always zero. */ |
62 | unsigned short __pad0; | 62 | unsigned short ss; |
63 | unsigned long err; | 63 | unsigned long err; |
64 | unsigned long trapno; | 64 | unsigned long trapno; |
65 | unsigned long oldmask; | 65 | unsigned long oldmask; |
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 7a958164088c..89db46752a8f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h | |||
@@ -13,9 +13,7 @@ | |||
13 | X86_EFLAGS_CF | X86_EFLAGS_RF) | 13 | X86_EFLAGS_CF | X86_EFLAGS_RF) |
14 | 14 | ||
15 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where); | 15 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where); |
16 | 16 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); | |
17 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | ||
18 | unsigned long *pax); | ||
19 | int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, | 17 | int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, |
20 | struct pt_regs *regs, unsigned long mask); | 18 | struct pt_regs *regs, unsigned long mask); |
21 | 19 | ||
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h | |||
@@ -27,23 +27,11 @@ | |||
27 | 27 | ||
28 | #ifdef CONFIG_X86_SMAP | 28 | #ifdef CONFIG_X86_SMAP |
29 | 29 | ||
30 | #define ASM_CLAC \ | 30 | #define ASM_CLAC \ |
31 | 661: ASM_NOP3 ; \ | 31 | ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP |
32 | .pushsection .altinstr_replacement, "ax" ; \ | 32 | |
33 | 662: __ASM_CLAC ; \ | 33 | #define ASM_STAC \ |
34 | .popsection ; \ | 34 | ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP |
35 | .pushsection .altinstructions, "a" ; \ | ||
36 | altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ | ||
37 | .popsection | ||
38 | |||
39 | #define ASM_STAC \ | ||
40 | 661: ASM_NOP3 ; \ | ||
41 | .pushsection .altinstr_replacement, "ax" ; \ | ||
42 | 662: __ASM_STAC ; \ | ||
43 | .popsection ; \ | ||
44 | .pushsection .altinstructions, "a" ; \ | ||
45 | altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ | ||
46 | .popsection | ||
47 | 35 | ||
48 | #else /* CONFIG_X86_SMAP */ | 36 | #else /* CONFIG_X86_SMAP */ |
49 | 37 | ||
@@ -61,20 +49,20 @@ | |||
61 | static __always_inline void clac(void) | 49 | static __always_inline void clac(void) |
62 | { | 50 | { |
63 | /* Note: a barrier is implicit in alternative() */ | 51 | /* Note: a barrier is implicit in alternative() */ |
64 | alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); | 52 | alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); |
65 | } | 53 | } |
66 | 54 | ||
67 | static __always_inline void stac(void) | 55 | static __always_inline void stac(void) |
68 | { | 56 | { |
69 | /* Note: a barrier is implicit in alternative() */ | 57 | /* Note: a barrier is implicit in alternative() */ |
70 | alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); | 58 | alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); |
71 | } | 59 | } |
72 | 60 | ||
73 | /* These macros can be used in asm() statements */ | 61 | /* These macros can be used in asm() statements */ |
74 | #define ASM_CLAC \ | 62 | #define ASM_CLAC \ |
75 | ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) | 63 | ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) |
76 | #define ASM_STAC \ | 64 | #define ASM_STAC \ |
77 | ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) | 65 | ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) |
78 | 66 | ||
79 | #else /* CONFIG_X86_SMAP */ | 67 | #else /* CONFIG_X86_SMAP */ |
80 | 68 | ||
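For context, these helpers bracket user-memory accesses; a minimal sketch (hypothetical function, the real uaccess fixup machinery omitted). With the empty "" old-instruction, the bracket now costs only alternatives padding, which optimize_nops() later in this series compacts at boot:

	static __always_inline void user_access_sketch(void)
	{
		stac();	/* patched to STAC when X86_FEATURE_SMAP is set */
		/* ... the actual user-memory access (with fixups) goes here ... */
		clac();	/* patched to CLAC; otherwise left as optimized NOPs */
	}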
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..81d02fc7dafa 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h | |||
@@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu); | |||
154 | void native_smp_prepare_boot_cpu(void); | 154 | void native_smp_prepare_boot_cpu(void); |
155 | void native_smp_prepare_cpus(unsigned int max_cpus); | 155 | void native_smp_prepare_cpus(unsigned int max_cpus); |
156 | void native_smp_cpus_done(unsigned int max_cpus); | 156 | void native_smp_cpus_done(unsigned int max_cpus); |
157 | void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); | ||
157 | int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); | 158 | int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); |
158 | int native_cpu_disable(void); | 159 | int native_cpu_disable(void); |
159 | void native_cpu_die(unsigned int cpu); | 160 | void native_cpu_die(unsigned int cpu); |
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6a4b00fafb00..aeb4666e0c0a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h | |||
@@ -4,6 +4,8 @@ | |||
4 | 4 | ||
5 | #ifdef __KERNEL__ | 5 | #ifdef __KERNEL__ |
6 | 6 | ||
7 | #include <asm/nops.h> | ||
8 | |||
7 | static inline void native_clts(void) | 9 | static inline void native_clts(void) |
8 | { | 10 | { |
9 | asm volatile("clts"); | 11 | asm volatile("clts"); |
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p) | |||
199 | "+m" (*(volatile char __force *)__p)); | 201 | "+m" (*(volatile char __force *)__p)); |
200 | } | 202 | } |
201 | 203 | ||
204 | static inline void clwb(volatile void *__p) | ||
205 | { | ||
206 | volatile struct { char x[64]; } *p = __p; | ||
207 | |||
208 | asm volatile(ALTERNATIVE_2( | ||
209 | ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", | ||
210 | ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ | ||
211 | X86_FEATURE_CLFLUSHOPT, | ||
212 | ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ | ||
213 | X86_FEATURE_CLWB) | ||
214 | : [p] "+m" (*p) | ||
215 | : [pax] "a" (p)); | ||
216 | } | ||
217 | |||
218 | static inline void pcommit_sfence(void) | ||
219 | { | ||
220 | alternative(ASM_NOP7, | ||
221 | ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ | ||
222 | "sfence", | ||
223 | X86_FEATURE_PCOMMIT); | ||
224 | } | ||
225 | |||
202 | #define nop() asm volatile ("nop") | 226 | #define nop() asm volatile ("nop") |
203 | 227 | ||
204 | 228 | ||
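The two new helpers combine into the usual persistent-memory flush pattern; a minimal sketch, assuming 64-byte cache lines and a pmem-backed mapping (the function name is hypothetical):

	static void pmem_persist_sketch(void *addr, size_t size)
	{
		unsigned long p   = (unsigned long)addr & ~63UL;
		unsigned long end = (unsigned long)addr + size;

		for (; p < end; p += 64)
			clwb((void *)p);	/* write back; line may stay cached */

		pcommit_sfence();		/* order the stores, commit to media */
	}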
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..ea2dbe82cba3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -13,6 +13,33 @@ | |||
13 | #include <asm/types.h> | 13 | #include <asm/types.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we | ||
17 | * reserve at the top of the kernel stack. We do it because of a nasty | ||
18 | * 32-bit corner case. On x86_32, the hardware stack frame is | ||
19 | * variable-length. Except for vm86 mode, struct pt_regs assumes a | ||
20 | * maximum-length frame. If we enter from CPL 0, the top 8 bytes of | ||
21 | * pt_regs don't actually exist. Ordinarily this doesn't matter, but it | ||
22 | * does in at least one case: | ||
23 | * | ||
24 | * If we take an NMI early enough in SYSENTER, then we can end up with | ||
25 | * pt_regs that extends above sp0. On the way out, in the espfix code, | ||
26 | * we can read the saved SS value, but that value will be above sp0. | ||
27 | * Without this offset, that can result in a page fault. (We are | ||
28 | * careful that, in this case, the value we read doesn't matter.) | ||
29 | * | ||
30 | * In vm86 mode, the hardware frame is much longer still, but we neither | ||
31 | * access the extra members from NMI context, nor do we write such a | ||
32 | * frame at sp0 at all. | ||
33 | * | ||
34 | * x86_64 has a fixed-length stack frame. | ||
35 | */ | ||
36 | #ifdef CONFIG_X86_32 | ||
37 | # define TOP_OF_KERNEL_STACK_PADDING 8 | ||
38 | #else | ||
39 | # define TOP_OF_KERNEL_STACK_PADDING 0 | ||
40 | #endif | ||
41 | |||
42 | /* | ||
16 | * low level task data that entry.S needs immediate access to | 43 | * low level task data that entry.S needs immediate access to |
17 | * - this struct should fit entirely inside of one cache line | 44 | * - this struct should fit entirely inside of one cache line |
18 | * - this struct shares the supervisor stack pages | 45 | * - this struct shares the supervisor stack pages |
@@ -145,7 +172,6 @@ struct thread_info { | |||
145 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) | 172 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) |
146 | 173 | ||
147 | #define STACK_WARN (THREAD_SIZE/8) | 174 | #define STACK_WARN (THREAD_SIZE/8) |
148 | #define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) | ||
149 | 175 | ||
150 | /* | 176 | /* |
151 | * macros/functions for gaining access to the thread information structure | 177 | * macros/functions for gaining access to the thread information structure |
@@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); | |||
158 | 184 | ||
159 | static inline struct thread_info *current_thread_info(void) | 185 | static inline struct thread_info *current_thread_info(void) |
160 | { | 186 | { |
161 | struct thread_info *ti; | 187 | return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); |
162 | ti = (void *)(this_cpu_read_stable(kernel_stack) + | ||
163 | KERNEL_STACK_OFFSET - THREAD_SIZE); | ||
164 | return ti; | ||
165 | } | 188 | } |
166 | 189 | ||
167 | static inline unsigned long current_stack_pointer(void) | 190 | static inline unsigned long current_stack_pointer(void) |
@@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void) | |||
177 | 200 | ||
178 | #else /* !__ASSEMBLY__ */ | 201 | #else /* !__ASSEMBLY__ */ |
179 | 202 | ||
180 | /* how to get the thread information struct from ASM */ | 203 | /* Load thread_info address into "reg" */ |
181 | #define GET_THREAD_INFO(reg) \ | 204 | #define GET_THREAD_INFO(reg) \ |
182 | _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ | 205 | _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ |
183 | _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; | 206 | _ASM_SUB $(THREAD_SIZE),reg ; |
184 | 207 | ||
185 | /* | 208 | /* |
186 | * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in | 209 | * ASM operand which evaluates to a 'thread_info' address of |
187 | * a certain register (to be used in assembler memory operands). | 210 | * the current task, if it is known that "reg" is exactly "off" |
211 | * bytes below the top of the stack currently. | ||
212 | * | ||
213 | * ( The kernel stack's size is known at build time; it is usually | ||
214 | * 2 or 4 pages, and the bottom of the kernel stack contains | ||
215 | * the thread_info structure. So to access the thread_info very | ||
216 | * quickly from assembly code we can calculate down from the | ||
217 | * top of the kernel stack to the bottom, using constant, | ||
218 | * build-time calculations only. ) | ||
219 | * | ||
220 | * For example, to fetch the current thread_info->flags value into %eax | ||
221 | * on x86-64 defconfig kernels, in syscall entry code where RSP is | ||
222 | * currently at exactly SIZEOF_PTREGS bytes away from the top of the | ||
223 | * stack: | ||
224 | * | ||
225 | * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax | ||
226 | * | ||
227 | * will translate to: | ||
228 | * | ||
229 | * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax | ||
230 | * | ||
231 | * which is below the current RSP by almost 16K. | ||
188 | */ | 232 | */ |
189 | #define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) | 233 | #define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) |
190 | 234 | ||
191 | #endif | 235 | #endif |
192 | 236 | ||
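The -0x3f48 displacement in the comment above is reproducible from build-time constants; a standalone check (THREAD_SIZE and SIZEOF_PTREGS per x86-64 defconfig, TI_flags = 0x10 is an assumed thread_info offset):

	#include <assert.h>

	int main(void)
	{
		long THREAD_SIZE   = 0x4000;	/* 4 pages on x86-64 defconfig */
		long SIZEOF_PTREGS = 21 * 8;	/* 0xa8: the full pt_regs frame */
		long TI_flags      = 0x10;	/* assumed offset in thread_info */

		/* ASM_THREAD_INFO(field, reg, off) -> (field+off-THREAD_SIZE)(reg) */
		assert(TI_flags + SIZEOF_PTREGS - THREAD_SIZE == -0x3f48);
		return 0;
	}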
@@ -236,6 +280,16 @@ static inline bool is_ia32_task(void) | |||
236 | #endif | 280 | #endif |
237 | return false; | 281 | return false; |
238 | } | 282 | } |
283 | |||
284 | /* | ||
285 | * Force syscall return via IRET by making it look as if there was | ||
286 | * some work pending. IRET is our most capable (but slowest) syscall | ||
287 | * return path, which is able to restore modified SS, CS and certain | ||
288 | * EFLAGS values that other (fast) syscall return instructions | ||
289 | * are not able to restore properly. | ||
290 | */ | ||
291 | #define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) | ||
292 | |||
239 | #endif /* !__ASSEMBLY__ */ | 293 | #endif /* !__ASSEMBLY__ */ |
240 | 294 | ||
241 | #ifndef __ASSEMBLY__ | 295 | #ifndef __ASSEMBLY__ |
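A sketch of how force_iret() is meant to be called (hypothetical helper, mirroring what sigreturn-style paths that rewrite segment state must do):

	static void set_user_cs_sketch(struct pt_regs *regs, unsigned long sel)
	{
		regs->cs = sel | 3;	/* force user RPL; 'sel' is hypothetical */
		force_iret();		/* exit via IRET so the new CS is loaded */
	}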
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b0988043a..ab456dc233b5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h | |||
@@ -15,6 +15,7 @@ | |||
15 | 15 | ||
16 | /* loadflags */ | 16 | /* loadflags */ |
17 | #define LOADED_HIGH (1<<0) | 17 | #define LOADED_HIGH (1<<0) |
18 | #define KASLR_FLAG (1<<1) | ||
18 | #define QUIET_FLAG (1<<5) | 19 | #define QUIET_FLAG (1<<5) |
19 | #define KEEP_SEGMENTS (1<<6) | 20 | #define KEEP_SEGMENTS (1<<6) |
20 | #define CAN_USE_HEAP (1<<7) | 21 | #define CAN_USE_HEAP (1<<7) |
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a88851..580aee3072e0 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h | |||
@@ -25,13 +25,17 @@ | |||
25 | #else /* __i386__ */ | 25 | #else /* __i386__ */ |
26 | 26 | ||
27 | #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) | 27 | #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) |
28 | /* | ||
29 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry | ||
30 | * unless syscall needs a complete, fully filled "struct pt_regs". | ||
31 | */ | ||
28 | #define R15 0 | 32 | #define R15 0 |
29 | #define R14 8 | 33 | #define R14 8 |
30 | #define R13 16 | 34 | #define R13 16 |
31 | #define R12 24 | 35 | #define R12 24 |
32 | #define RBP 32 | 36 | #define RBP 32 |
33 | #define RBX 40 | 37 | #define RBX 40 |
34 | /* arguments: interrupts/non tracing syscalls only save up to here*/ | 38 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
35 | #define R11 48 | 39 | #define R11 48 |
36 | #define R10 56 | 40 | #define R10 56 |
37 | #define R9 64 | 41 | #define R9 64 |
@@ -41,15 +45,17 @@ | |||
41 | #define RDX 96 | 45 | #define RDX 96 |
42 | #define RSI 104 | 46 | #define RSI 104 |
43 | #define RDI 112 | 47 | #define RDI 112 |
44 | #define ORIG_RAX 120 /* = ERROR */ | 48 | /* |
45 | /* end of arguments */ | 49 | * On syscall entry, this is syscall#. On CPU exception, this is error code. |
46 | /* cpu exception frame or undefined in case of fast syscall. */ | 50 | * On hw interrupt, it's IRQ number: |
51 | */ | ||
52 | #define ORIG_RAX 120 | ||
53 | /* Return frame for iretq */ | ||
47 | #define RIP 128 | 54 | #define RIP 128 |
48 | #define CS 136 | 55 | #define CS 136 |
49 | #define EFLAGS 144 | 56 | #define EFLAGS 144 |
50 | #define RSP 152 | 57 | #define RSP 152 |
51 | #define SS 160 | 58 | #define SS 160 |
52 | #define ARGOFFSET R11 | ||
53 | #endif /* __ASSEMBLY__ */ | 59 | #endif /* __ASSEMBLY__ */ |
54 | 60 | ||
55 | /* top of stack page */ | 61 | /* top of stack page */ |
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index ac4b9aa4d999..bc16115af39b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h | |||
@@ -41,13 +41,17 @@ struct pt_regs { | |||
41 | #ifndef __KERNEL__ | 41 | #ifndef __KERNEL__ |
42 | 42 | ||
43 | struct pt_regs { | 43 | struct pt_regs { |
44 | /* | ||
45 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry | ||
46 | * unless syscall needs a complete, fully filled "struct pt_regs". | ||
47 | */ | ||
44 | unsigned long r15; | 48 | unsigned long r15; |
45 | unsigned long r14; | 49 | unsigned long r14; |
46 | unsigned long r13; | 50 | unsigned long r13; |
47 | unsigned long r12; | 51 | unsigned long r12; |
48 | unsigned long rbp; | 52 | unsigned long rbp; |
49 | unsigned long rbx; | 53 | unsigned long rbx; |
50 | /* arguments: non interrupts/non tracing syscalls only save up to here*/ | 54 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
51 | unsigned long r11; | 55 | unsigned long r11; |
52 | unsigned long r10; | 56 | unsigned long r10; |
53 | unsigned long r9; | 57 | unsigned long r9; |
@@ -57,9 +61,12 @@ struct pt_regs { | |||
57 | unsigned long rdx; | 61 | unsigned long rdx; |
58 | unsigned long rsi; | 62 | unsigned long rsi; |
59 | unsigned long rdi; | 63 | unsigned long rdi; |
64 | /* | ||
65 | * On syscall entry, this is syscall#. On CPU exception, this is error code. | ||
66 | * On hw interrupt, it's IRQ number: | ||
67 | */ | ||
60 | unsigned long orig_rax; | 68 | unsigned long orig_rax; |
61 | /* end of arguments */ | 69 | /* Return frame for iretq */ |
62 | /* cpu exception frame or undefined */ | ||
63 | unsigned long rip; | 70 | unsigned long rip; |
64 | unsigned long cs; | 71 | unsigned long cs; |
65 | unsigned long eflags; | 72 | unsigned long eflags; |
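From userspace, the clarified orig_rax semantics look like this; a sketch assuming the tracee is already stopped at syscall entry under PTRACE_SYSCALL:

	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>

	static void show_syscall(pid_t pid)
	{
		struct user_regs_struct regs;

		ptrace(PTRACE_GETREGS, pid, 0, &regs);
		/* orig_rax holds the syscall number; rax is -ENOSYS at entry */
		printf("tracee entered syscall #%llu\n",
		       (unsigned long long)regs.orig_rax);
	}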
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..16dc4e8a2cd3 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h | |||
@@ -177,9 +177,24 @@ struct sigcontext { | |||
177 | __u64 rip; | 177 | __u64 rip; |
178 | __u64 eflags; /* RFLAGS */ | 178 | __u64 eflags; /* RFLAGS */ |
179 | __u16 cs; | 179 | __u16 cs; |
180 | __u16 gs; | 180 | |
181 | __u16 fs; | 181 | /* |
182 | __u16 __pad0; | 182 | * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), |
183 | * Linux saved and restored fs and gs in these slots. This | ||
184 | * was counterproductive, as fsbase and gsbase were never | ||
185 | * saved, so arch_prctl was presumably unreliable. | ||
186 | * | ||
187 | * If these slots are ever needed for any other purpose, there | ||
188 | * is some risk that very old 64-bit binaries could get | ||
189 | * confused. I doubt that many such binaries still work, | ||
190 | * though, since the same patch in 2.5.64 also removed the | ||
191 | * 64-bit set_thread_area syscall, so it appears that there is | ||
192 | * no TLS API that works in both pre- and post-2.5.64 kernels. | ||
193 | */ | ||
194 | __u16 __pad2; /* Was gs. */ | ||
195 | __u16 __pad1; /* Was fs. */ | ||
196 | |||
197 | __u16 ss; | ||
183 | __u64 err; | 198 | __u64 err; |
184 | __u64 trapno; | 199 | __u64 trapno; |
185 | __u64 oldmask; | 200 | __u64 oldmask; |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cdb1b70ddad0..c887cd944f0c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o | |||
32 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 32 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
33 | obj-$(CONFIG_X86_64) += mcount_64.o | 33 | obj-$(CONFIG_X86_64) += mcount_64.o |
34 | obj-y += syscall_$(BITS).o vsyscall_gtod.o | 34 | obj-y += syscall_$(BITS).o vsyscall_gtod.o |
35 | obj-$(CONFIG_IA32_EMULATION) += syscall_32.o | ||
35 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o | 36 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o |
36 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o | 37 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o |
37 | obj-$(CONFIG_SYSFS) += ksysfs.o | 38 | obj-$(CONFIG_SYSFS) += ksysfs.o |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f469ec..aef653193160 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) | |||
52 | __setup("noreplace-paravirt", setup_noreplace_paravirt); | 52 | __setup("noreplace-paravirt", setup_noreplace_paravirt); |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #define DPRINTK(fmt, ...) \ | 55 | #define DPRINTK(fmt, args...) \ |
56 | do { \ | 56 | do { \ |
57 | if (debug_alternative) \ | 57 | if (debug_alternative) \ |
58 | printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ | 58 | printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ |
59 | } while (0) | ||
60 | |||
61 | #define DUMP_BYTES(buf, len, fmt, args...) \ | ||
62 | do { \ | ||
63 | if (unlikely(debug_alternative)) { \ | ||
64 | int j; \ | ||
65 | \ | ||
66 | if (!(len)) \ | ||
67 | break; \ | ||
68 | \ | ||
69 | printk(KERN_DEBUG fmt, ##args); \ | ||
70 | for (j = 0; j < (len) - 1; j++) \ | ||
71 | printk(KERN_CONT "%02hhx ", buf[j]); \ | ||
72 | printk(KERN_CONT "%02hhx\n", buf[j]); \ | ||
73 | } \ | ||
59 | } while (0) | 74 | } while (0) |
60 | 75 | ||
61 | /* | 76 | /* |
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | |||
243 | extern s32 __smp_locks[], __smp_locks_end[]; | 258 | extern s32 __smp_locks[], __smp_locks_end[]; |
244 | void *text_poke_early(void *addr, const void *opcode, size_t len); | 259 | void *text_poke_early(void *addr, const void *opcode, size_t len); |
245 | 260 | ||
246 | /* Replace instructions with better alternatives for this CPU type. | 261 | /* |
247 | This runs before SMP is initialized to avoid SMP problems with | 262 | * Are we looking at a near JMP with a 1- or 4-byte displacement? |
248 | self modifying code. This implies that asymmetric systems where | 263 | */ |
249 | APs have less capabilities than the boot processor are not handled. | 264 | static inline bool is_jmp(const u8 opcode) |
250 | Tough. Make sure you disable such features by hand. */ | 265 | { |
266 | return opcode == 0xeb || opcode == 0xe9; | ||
267 | } | ||
268 | |||
269 | static void __init_or_module | ||
270 | recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) | ||
271 | { | ||
272 | u8 *next_rip, *tgt_rip; | ||
273 | s32 n_dspl, o_dspl; | ||
274 | int repl_len; | ||
275 | |||
276 | if (a->replacementlen != 5) | ||
277 | return; | ||
278 | |||
279 | o_dspl = *(s32 *)(insnbuf + 1); | ||
280 | |||
281 | /* next_rip of the replacement JMP */ | ||
282 | next_rip = repl_insn + a->replacementlen; | ||
283 | /* target rip of the replacement JMP */ | ||
284 | tgt_rip = next_rip + o_dspl; | ||
285 | n_dspl = tgt_rip - orig_insn; | ||
286 | |||
287 | DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); | ||
288 | |||
289 | if (tgt_rip - orig_insn >= 0) { | ||
290 | if (n_dspl - 2 <= 127) | ||
291 | goto two_byte_jmp; | ||
292 | else | ||
293 | goto five_byte_jmp; | ||
294 | /* negative offset */ | ||
295 | } else { | ||
296 | if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) | ||
297 | goto two_byte_jmp; | ||
298 | else | ||
299 | goto five_byte_jmp; | ||
300 | } | ||
301 | |||
302 | two_byte_jmp: | ||
303 | n_dspl -= 2; | ||
304 | |||
305 | insnbuf[0] = 0xeb; | ||
306 | insnbuf[1] = (s8)n_dspl; | ||
307 | add_nops(insnbuf + 2, 3); | ||
308 | |||
309 | repl_len = 2; | ||
310 | goto done; | ||
311 | |||
312 | five_byte_jmp: | ||
313 | n_dspl -= 5; | ||
314 | |||
315 | insnbuf[0] = 0xe9; | ||
316 | *(s32 *)&insnbuf[1] = n_dspl; | ||
251 | 317 | ||
318 | repl_len = 5; | ||
319 | |||
320 | done: | ||
321 | |||
322 | DPRINTK("final displ: 0x%08x, JMP 0x%lx", | ||
323 | n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); | ||
324 | } | ||
325 | |||
326 | static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) | ||
327 | { | ||
328 | if (instr[0] != 0x90) | ||
329 | return; | ||
330 | |||
331 | add_nops(instr + (a->instrlen - a->padlen), a->padlen); | ||
332 | |||
333 | DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", | ||
334 | instr, a->instrlen - a->padlen, a->padlen); | ||
335 | } | ||
336 | |||
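A worked numeric example of the displacement fix-up in recompute_jump() above (all addresses hypothetical):

	#include <assert.h>

	int main(void)
	{
		long orig_insn = 0x1000;	/* patch site in the original code */
		long repl_insn = 0x2000;	/* copy in .altinstr_replacement */
		long o_dspl    = 0x0b;		/* rel32 read from the copied JMP */

		long next_rip = repl_insn + 5;		/* end of the 5-byte JMP */
		long tgt_rip  = next_rip + o_dspl;	/* absolute target: 0x2010 */
		long n_dspl   = tgt_rip - orig_insn;	/* 0x1010: too far for rel8 */

		/* five_byte_jmp path: n_dspl -= 5 is the rel32 from the new site */
		assert(n_dspl - 5 == tgt_rip - (orig_insn + 5));
		return 0;
	}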
337 | /* | ||
338 | * Replace instructions with better alternatives for this CPU type. This runs | ||
339 | * before SMP is initialized to avoid SMP problems with self-modifying code. | ||
340 | * This implies that asymmetric systems where APs have fewer capabilities than | ||
341 | * the boot processor are not handled. Tough. Make sure you disable such | ||
342 | * features by hand. | ||
343 | */ | ||
252 | void __init_or_module apply_alternatives(struct alt_instr *start, | 344 | void __init_or_module apply_alternatives(struct alt_instr *start, |
253 | struct alt_instr *end) | 345 | struct alt_instr *end) |
254 | { | 346 | { |
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
256 | u8 *instr, *replacement; | 348 | u8 *instr, *replacement; |
257 | u8 insnbuf[MAX_PATCH_LEN]; | 349 | u8 insnbuf[MAX_PATCH_LEN]; |
258 | 350 | ||
259 | DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); | 351 | DPRINTK("alt table %p -> %p", start, end); |
260 | /* | 352 | /* |
261 | * The scan order should be from start to end. A later scanned | 353 | * The scan order should be from start to end. A later scanned |
262 | * alternative code can overwrite a previous scanned alternative code. | 354 | * alternative code can overwrite previously scanned alternative code. |
263 | * Some kernel functions (e.g. memcpy, memset, etc) use this order to | 355 | * Some kernel functions (e.g. memcpy, memset, etc) use this order to |
264 | * patch code. | 356 | * patch code. |
265 | * | 357 | * |
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
267 | * order. | 359 | * order. |
268 | */ | 360 | */ |
269 | for (a = start; a < end; a++) { | 361 | for (a = start; a < end; a++) { |
362 | int insnbuf_sz = 0; | ||
363 | |||
270 | instr = (u8 *)&a->instr_offset + a->instr_offset; | 364 | instr = (u8 *)&a->instr_offset + a->instr_offset; |
271 | replacement = (u8 *)&a->repl_offset + a->repl_offset; | 365 | replacement = (u8 *)&a->repl_offset + a->repl_offset; |
272 | BUG_ON(a->replacementlen > a->instrlen); | ||
273 | BUG_ON(a->instrlen > sizeof(insnbuf)); | 366 | BUG_ON(a->instrlen > sizeof(insnbuf)); |
274 | BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); | 367 | BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); |
275 | if (!boot_cpu_has(a->cpuid)) | 368 | if (!boot_cpu_has(a->cpuid)) { |
369 | if (a->padlen > 1) | ||
370 | optimize_nops(a, instr); | ||
371 | |||
276 | continue; | 372 | continue; |
373 | } | ||
374 | |||
375 | DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", | ||
376 | a->cpuid >> 5, | ||
377 | a->cpuid & 0x1f, | ||
378 | instr, a->instrlen, | ||
379 | replacement, a->replacementlen, a->padlen); | ||
380 | |||
381 | DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); | ||
382 | DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); | ||
277 | 383 | ||
278 | memcpy(insnbuf, replacement, a->replacementlen); | 384 | memcpy(insnbuf, replacement, a->replacementlen); |
385 | insnbuf_sz = a->replacementlen; | ||
279 | 386 | ||
280 | /* 0xe8 is a relative jump; fix the offset. */ | 387 | /* 0xe8 is a relative jump; fix the offset. */ |
281 | if (*insnbuf == 0xe8 && a->replacementlen == 5) | 388 | if (*insnbuf == 0xe8 && a->replacementlen == 5) { |
282 | *(s32 *)(insnbuf + 1) += replacement - instr; | 389 | *(s32 *)(insnbuf + 1) += replacement - instr; |
390 | DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", | ||
391 | *(s32 *)(insnbuf + 1), | ||
392 | (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); | ||
393 | } | ||
394 | |||
395 | if (a->replacementlen && is_jmp(replacement[0])) | ||
396 | recompute_jump(a, instr, replacement, insnbuf); | ||
283 | 397 | ||
284 | add_nops(insnbuf + a->replacementlen, | 398 | if (a->instrlen > a->replacementlen) { |
285 | a->instrlen - a->replacementlen); | 399 | add_nops(insnbuf + a->replacementlen, |
400 | a->instrlen - a->replacementlen); | ||
401 | insnbuf_sz += a->instrlen - a->replacementlen; | ||
402 | } | ||
403 | DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); | ||
286 | 404 | ||
287 | text_poke_early(instr, insnbuf, a->instrlen); | 405 | text_poke_early(instr, insnbuf, insnbuf_sz); |
288 | } | 406 | } |
289 | } | 407 | } |
290 | 408 | ||
291 | #ifdef CONFIG_SMP | 409 | #ifdef CONFIG_SMP |
292 | |||
293 | static void alternatives_smp_lock(const s32 *start, const s32 *end, | 410 | static void alternatives_smp_lock(const s32 *start, const s32 *end, |
294 | u8 *text, u8 *text_end) | 411 | u8 *text, u8 *text_end) |
295 | { | 412 | { |
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, | |||
371 | smp->locks_end = locks_end; | 488 | smp->locks_end = locks_end; |
372 | smp->text = text; | 489 | smp->text = text; |
373 | smp->text_end = text_end; | 490 | smp->text_end = text_end; |
374 | DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", | 491 | DPRINTK("locks %p -> %p, text %p -> %p, name %s", |
375 | __func__, smp->locks, smp->locks_end, | 492 | smp->locks, smp->locks_end, |
376 | smp->text, smp->text_end, smp->name); | 493 | smp->text, smp->text_end, smp->name); |
377 | 494 | ||
378 | list_add_tail(&smp->next, &smp_alt_modules); | 495 | list_add_tail(&smp->next, &smp_alt_modules); |
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end) | |||
440 | 557 | ||
441 | return 0; | 558 | return 0; |
442 | } | 559 | } |
443 | #endif | 560 | #endif /* CONFIG_SMP */ |
444 | 561 | ||
445 | #ifdef CONFIG_PARAVIRT | 562 | #ifdef CONFIG_PARAVIRT |
446 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, | 563 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, |
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs) | |||
601 | if (likely(!bp_patching_in_progress)) | 718 | if (likely(!bp_patching_in_progress)) |
602 | return 0; | 719 | return 0; |
603 | 720 | ||
604 | if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) | 721 | if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) |
605 | return 0; | 722 | return 0; |
606 | 723 | ||
607 | /* set up the specified breakpoint handler */ | 724 | /* set up the specified breakpoint handler */ |
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d33ac1d..47703aed74cf 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -68,7 +68,7 @@ void foo(void) | |||
68 | 68 | ||
69 | /* Offset from the sysenter stack to tss.sp0 */ | 69 | /* Offset from the sysenter stack to tss.sp0 */ |
70 | DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - | 70 | DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - |
71 | sizeof(struct tss_struct)); | 71 | offsetofend(struct tss_struct, SYSENTER_stack)); |
72 | 72 | ||
73 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) | 73 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) |
74 | BLANK(); | 74 | BLANK(); |
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index fdcbb4d27c9f..5ce6f2da8763 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -81,6 +81,7 @@ int main(void) | |||
81 | #undef ENTRY | 81 | #undef ENTRY |
82 | 82 | ||
83 | OFFSET(TSS_ist, tss_struct, x86_tss.ist); | 83 | OFFSET(TSS_ist, tss_struct, x86_tss.ist); |
84 | OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); | ||
84 | BLANK(); | 85 | BLANK(); |
85 | 86 | ||
86 | DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); | 87 | DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239cea65..dd9e50500297 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c) | |||
711 | set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); | 711 | set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); |
712 | 712 | ||
713 | rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); | 713 | rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); |
714 | |||
715 | /* 3DNow or LM implies PREFETCHW */ | ||
716 | if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) | ||
717 | if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) | ||
718 | set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); | ||
714 | } | 719 | } |
715 | 720 | ||
716 | #ifdef CONFIG_X86_32 | 721 | #ifdef CONFIG_X86_32 |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2346c95c6ab1..3f70538012e2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -959,38 +959,37 @@ static void identify_cpu(struct cpuinfo_x86 *c) | |||
959 | #endif | 959 | #endif |
960 | } | 960 | } |
961 | 961 | ||
962 | #ifdef CONFIG_X86_64 | 962 | /* |
963 | #ifdef CONFIG_IA32_EMULATION | 963 | * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions |
964 | /* May not be __init: called during resume */ | 964 | * on 32-bit kernels: |
965 | static void syscall32_cpu_init(void) | 965 | */ |
966 | { | ||
967 | /* Load these always in case some future AMD CPU supports | ||
968 | SYSENTER from compat mode too. */ | ||
969 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
970 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
971 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
972 | |||
973 | wrmsrl(MSR_CSTAR, ia32_cstar_target); | ||
974 | } | ||
975 | #endif /* CONFIG_IA32_EMULATION */ | ||
976 | #endif /* CONFIG_X86_64 */ | ||
977 | |||
978 | #ifdef CONFIG_X86_32 | 966 | #ifdef CONFIG_X86_32 |
979 | void enable_sep_cpu(void) | 967 | void enable_sep_cpu(void) |
980 | { | 968 | { |
981 | int cpu = get_cpu(); | 969 | struct tss_struct *tss; |
982 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 970 | int cpu; |
983 | 971 | ||
984 | if (!boot_cpu_has(X86_FEATURE_SEP)) { | 972 | cpu = get_cpu(); |
985 | put_cpu(); | 973 | tss = &per_cpu(cpu_tss, cpu); |
986 | return; | 974 | |
987 | } | 975 | if (!boot_cpu_has(X86_FEATURE_SEP)) |
976 | goto out; | ||
977 | |||
978 | /* | ||
979 | * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- | ||
980 | * see the big comment in struct x86_hw_tss's definition. | ||
981 | */ | ||
988 | 982 | ||
989 | tss->x86_tss.ss1 = __KERNEL_CS; | 983 | tss->x86_tss.ss1 = __KERNEL_CS; |
990 | tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; | 984 | wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); |
991 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | 985 | |
992 | wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); | 986 | wrmsr(MSR_IA32_SYSENTER_ESP, |
993 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); | 987 | (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), |
988 | 0); | ||
989 | |||
990 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); | ||
991 | |||
992 | out: | ||
994 | put_cpu(); | 993 | put_cpu(); |
995 | } | 994 | } |
996 | #endif | 995 | #endif |
@@ -1118,7 +1117,7 @@ static __init int setup_disablecpuid(char *arg) | |||
1118 | __setup("clearcpuid=", setup_disablecpuid); | 1117 | __setup("clearcpuid=", setup_disablecpuid); |
1119 | 1118 | ||
1120 | DEFINE_PER_CPU(unsigned long, kernel_stack) = | 1119 | DEFINE_PER_CPU(unsigned long, kernel_stack) = |
1121 | (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; | 1120 | (unsigned long)&init_thread_union + THREAD_SIZE; |
1122 | EXPORT_PER_CPU_SYMBOL(kernel_stack); | 1121 | EXPORT_PER_CPU_SYMBOL(kernel_stack); |
1123 | 1122 | ||
1124 | #ifdef CONFIG_X86_64 | 1123 | #ifdef CONFIG_X86_64 |
@@ -1130,8 +1129,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, | |||
1130 | irq_stack_union) __aligned(PAGE_SIZE) __visible; | 1129 | irq_stack_union) __aligned(PAGE_SIZE) __visible; |
1131 | 1130 | ||
1132 | /* | 1131 | /* |
1133 | * The following four percpu variables are hot. Align current_task to | 1132 | * The following percpu variables are hot. Align current_task to |
1134 | * cacheline size such that all four fall in the same cacheline. | 1133 | * cacheline size such that they fall in the same cacheline. |
1135 | */ | 1134 | */ |
1136 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = | 1135 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = |
1137 | &init_task; | 1136 | &init_task; |
@@ -1171,10 +1170,23 @@ void syscall_init(void) | |||
1171 | */ | 1170 | */ |
1172 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | 1171 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); |
1173 | wrmsrl(MSR_LSTAR, system_call); | 1172 | wrmsrl(MSR_LSTAR, system_call); |
1174 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
1175 | 1173 | ||
1176 | #ifdef CONFIG_IA32_EMULATION | 1174 | #ifdef CONFIG_IA32_EMULATION |
1177 | syscall32_cpu_init(); | 1175 | wrmsrl(MSR_CSTAR, ia32_cstar_target); |
1176 | /* | ||
1177 | * This only works on Intel CPUs. | ||
1178 | * On AMD CPUs these MSRs are 32-bit, so the CPU truncates MSR_IA32_SYSENTER_EIP. | ||
1179 | * This does not cause SYSENTER to jump to the wrong location, because | ||
1180 | * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). | ||
1181 | */ | ||
1182 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
1183 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
1184 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
1185 | #else | ||
1186 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
1187 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); | ||
1188 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
1189 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); | ||
1178 | #endif | 1190 | #endif |
1179 | 1191 | ||
1180 | /* Flags to clear on syscall */ | 1192 | /* Flags to clear on syscall */ |
@@ -1226,6 +1238,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; | |||
1226 | EXPORT_PER_CPU_SYMBOL(__preempt_count); | 1238 | EXPORT_PER_CPU_SYMBOL(__preempt_count); |
1227 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | 1239 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); |
1228 | 1240 | ||
1241 | /* | ||
1242 | * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find | ||
1243 | * the top of the kernel stack. Use an extra percpu variable to track the | ||
1244 | * top of the kernel stack directly. | ||
1245 | */ | ||
1246 | DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = | ||
1247 | (unsigned long)&init_thread_union + THREAD_SIZE; | ||
1248 | EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); | ||
1249 | |||
1229 | #ifdef CONFIG_CC_STACKPROTECTOR | 1250 | #ifdef CONFIG_CC_STACKPROTECTOR |
1230 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); | 1251 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); |
1231 | #endif | 1252 | #endif |
@@ -1307,7 +1328,7 @@ void cpu_init(void) | |||
1307 | */ | 1328 | */ |
1308 | load_ucode_ap(); | 1329 | load_ucode_ap(); |
1309 | 1330 | ||
1310 | t = &per_cpu(init_tss, cpu); | 1331 | t = &per_cpu(cpu_tss, cpu); |
1311 | oist = &per_cpu(orig_ist, cpu); | 1332 | oist = &per_cpu(orig_ist, cpu); |
1312 | 1333 | ||
1313 | #ifdef CONFIG_NUMA | 1334 | #ifdef CONFIG_NUMA |
@@ -1391,7 +1412,7 @@ void cpu_init(void) | |||
1391 | { | 1412 | { |
1392 | int cpu = smp_processor_id(); | 1413 | int cpu = smp_processor_id(); |
1393 | struct task_struct *curr = current; | 1414 | struct task_struct *curr = current; |
1394 | struct tss_struct *t = &per_cpu(init_tss, cpu); | 1415 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); |
1395 | struct thread_struct *thread = &curr->thread; | 1416 | struct thread_struct *thread = &curr->thread; |
1396 | 1417 | ||
1397 | wait_for_master_cpu(cpu); | 1418 | wait_for_master_cpu(cpu); |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..e2888a3ad1e3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -2147,24 +2147,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) | |||
2147 | static unsigned long code_segment_base(struct pt_regs *regs) | 2147 | static unsigned long code_segment_base(struct pt_regs *regs) |
2148 | { | 2148 | { |
2149 | /* | 2149 | /* |
2150 | * For IA32 we look at the GDT/LDT segment base to convert the | ||
2151 | * effective IP to a linear address. | ||
2152 | */ | ||
2153 | |||
2154 | #ifdef CONFIG_X86_32 | ||
2155 | /* | ||
2150 | * If we are in VM86 mode, add the segment offset to convert to a | 2156 | * If we are in VM86 mode, add the segment offset to convert to a |
2151 | * linear address. | 2157 | * linear address. |
2152 | */ | 2158 | */ |
2153 | if (regs->flags & X86_VM_MASK) | 2159 | if (regs->flags & X86_VM_MASK) |
2154 | return 0x10 * regs->cs; | 2160 | return 0x10 * regs->cs; |
2155 | 2161 | ||
2156 | /* | ||
2157 | * For IA32 we look at the GDT/LDT segment base to convert the | ||
2158 | * effective IP to a linear address. | ||
2159 | */ | ||
2160 | #ifdef CONFIG_X86_32 | ||
2161 | if (user_mode(regs) && regs->cs != __USER_CS) | 2162 | if (user_mode(regs) && regs->cs != __USER_CS) |
2162 | return get_segment_base(regs->cs); | 2163 | return get_segment_base(regs->cs); |
2163 | #else | 2164 | #else |
2164 | if (test_thread_flag(TIF_IA32)) { | 2165 | if (user_mode(regs) && !user_64bit_mode(regs) && |
2165 | if (user_mode(regs) && regs->cs != __USER32_CS) | 2166 | regs->cs != __USER32_CS) |
2166 | return get_segment_base(regs->cs); | 2167 | return get_segment_base(regs->cs); |
2167 | } | ||
2168 | #endif | 2168 | #endif |
2169 | return 0; | 2169 | return 0; |
2170 | } | 2170 | } |
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index aceb2f90c716..c76d3e37c6e1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) | |||
105 | #ifdef CONFIG_X86_32 | 105 | #ifdef CONFIG_X86_32 |
106 | struct pt_regs fixed_regs; | 106 | struct pt_regs fixed_regs; |
107 | 107 | ||
108 | if (!user_mode_vm(regs)) { | 108 | if (!user_mode(regs)) { |
109 | crash_fixup_ss_esp(&fixed_regs, regs); | 109 | crash_fixup_ss_esp(&fixed_regs, regs); |
110 | regs = &fixed_regs; | 110 | regs = &fixed_regs; |
111 | } | 111 | } |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cf3df1d8d039..ab3b65639a3e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -278,7 +278,7 @@ int __die(const char *str, struct pt_regs *regs, long err) | |||
278 | print_modules(); | 278 | print_modules(); |
279 | show_regs(regs); | 279 | show_regs(regs); |
280 | #ifdef CONFIG_X86_32 | 280 | #ifdef CONFIG_X86_32 |
281 | if (user_mode_vm(regs)) { | 281 | if (user_mode(regs)) { |
282 | sp = regs->sp; | 282 | sp = regs->sp; |
283 | ss = regs->ss & 0xffff; | 283 | ss = regs->ss & 0xffff; |
284 | } else { | 284 | } else { |
@@ -307,7 +307,7 @@ void die(const char *str, struct pt_regs *regs, long err) | |||
307 | unsigned long flags = oops_begin(); | 307 | unsigned long flags = oops_begin(); |
308 | int sig = SIGSEGV; | 308 | int sig = SIGSEGV; |
309 | 309 | ||
310 | if (!user_mode_vm(regs)) | 310 | if (!user_mode(regs)) |
311 | report_bug(regs->ip, regs); | 311 | report_bug(regs->ip, regs); |
312 | 312 | ||
313 | if (__die(str, regs, err)) | 313 | if (__die(str, regs, err)) |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5abd4cd4230c..39891ff50d03 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -123,13 +123,13 @@ void show_regs(struct pt_regs *regs) | |||
123 | int i; | 123 | int i; |
124 | 124 | ||
125 | show_regs_print_info(KERN_EMERG); | 125 | show_regs_print_info(KERN_EMERG); |
126 | __show_regs(regs, !user_mode_vm(regs)); | 126 | __show_regs(regs, !user_mode(regs)); |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * When in-kernel, we also print out the stack and code at the | 129 | * When in-kernel, we also print out the stack and code at the |
130 | * time of the fault.. | 130 | * time of the fault.. |
131 | */ | 131 | */ |
132 | if (!user_mode_vm(regs)) { | 132 | if (!user_mode(regs)) { |
133 | unsigned int code_prologue = code_bytes * 43 / 64; | 133 | unsigned int code_prologue = code_bytes * 43 / 64; |
134 | unsigned int code_len = code_bytes; | 134 | unsigned int code_len = code_bytes; |
135 | unsigned char c; | 135 | unsigned char c; |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 31e2d5bf3e38..1c309763e321 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -395,10 +395,13 @@ sysenter_past_esp: | |||
395 | /*CFI_REL_OFFSET cs, 0*/ | 395 | /*CFI_REL_OFFSET cs, 0*/ |
396 | /* | 396 | /* |
397 | * Push current_thread_info()->sysenter_return to the stack. | 397 | * Push current_thread_info()->sysenter_return to the stack. |
398 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | 398 | * A tiny bit of offset fixup is necessary: TI_sysenter_return |
399 | * pushed above; +8 corresponds to copy_thread's esp0 setting. | 399 | * is relative to thread_info, which is at the bottom of the |
400 | * kernel stack page. 4*4 means the 4 words pushed above; | ||
401 | * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; | ||
402 | * and THREAD_SIZE takes us to the bottom. | ||
400 | */ | 403 | */ |
401 | pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) | 404 | pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) |
402 | CFI_REL_OFFSET eip, 0 | 405 | CFI_REL_OFFSET eip, 0 |
403 | 406 | ||
404 | pushl_cfi %eax | 407 | pushl_cfi %eax |
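The constant in that pushl is pure stack geometry; a standalone check with assumed x86-32 numbers (8K stacks; the TI_sysenter_return offset and the stack-top address are hypothetical):

	#include <assert.h>

	int main(void)
	{
		unsigned long THREAD_SIZE = 0x2000, PADDING = 8, PUSHED = 4 * 4;
		unsigned long TI_sysenter_return = 0x20;	/* assumed */
		unsigned long top = 0xc1002000UL;		/* hypothetical */

		unsigned long esp = top - PADDING - PUSHED;	/* after 4 pushes */
		unsigned long ti  = top - THREAD_SIZE;		/* thread_info */

		assert(esp + TI_sysenter_return - THREAD_SIZE + PADDING + PUSHED ==
		       ti + TI_sysenter_return);
		return 0;
	}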
@@ -432,7 +435,7 @@ sysenter_after_call: | |||
432 | TRACE_IRQS_OFF | 435 | TRACE_IRQS_OFF |
433 | movl TI_flags(%ebp), %ecx | 436 | movl TI_flags(%ebp), %ecx |
434 | testl $_TIF_ALLWORK_MASK, %ecx | 437 | testl $_TIF_ALLWORK_MASK, %ecx |
435 | jne sysexit_audit | 438 | jnz sysexit_audit |
436 | sysenter_exit: | 439 | sysenter_exit: |
437 | /* if something modifies registers it must also disable sysexit */ | 440 | /* if something modifies registers it must also disable sysexit */ |
438 | movl PT_EIP(%esp), %edx | 441 | movl PT_EIP(%esp), %edx |
@@ -460,7 +463,7 @@ sysenter_audit: | |||
460 | 463 | ||
461 | sysexit_audit: | 464 | sysexit_audit: |
462 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx | 465 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx |
463 | jne syscall_exit_work | 466 | jnz syscall_exit_work |
464 | TRACE_IRQS_ON | 467 | TRACE_IRQS_ON |
465 | ENABLE_INTERRUPTS(CLBR_ANY) | 468 | ENABLE_INTERRUPTS(CLBR_ANY) |
466 | movl %eax,%edx /* second arg, syscall return value */ | 469 | movl %eax,%edx /* second arg, syscall return value */ |
@@ -472,7 +475,7 @@ sysexit_audit: | |||
472 | TRACE_IRQS_OFF | 475 | TRACE_IRQS_OFF |
473 | movl TI_flags(%ebp), %ecx | 476 | movl TI_flags(%ebp), %ecx |
474 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx | 477 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx |
475 | jne syscall_exit_work | 478 | jnz syscall_exit_work |
476 | movl PT_EAX(%esp),%eax /* reload syscall return value */ | 479 | movl PT_EAX(%esp),%eax /* reload syscall return value */ |
477 | jmp sysenter_exit | 480 | jmp sysenter_exit |
478 | #endif | 481 | #endif |
@@ -510,7 +513,7 @@ syscall_exit: | |||
510 | TRACE_IRQS_OFF | 513 | TRACE_IRQS_OFF |
511 | movl TI_flags(%ebp), %ecx | 514 | movl TI_flags(%ebp), %ecx |
512 | testl $_TIF_ALLWORK_MASK, %ecx # current->work | 515 | testl $_TIF_ALLWORK_MASK, %ecx # current->work |
513 | jne syscall_exit_work | 516 | jnz syscall_exit_work |
514 | 517 | ||
515 | restore_all: | 518 | restore_all: |
516 | TRACE_IRQS_IRET | 519 | TRACE_IRQS_IRET |
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and | |||
612 | #ifdef CONFIG_VM86 | 615 | #ifdef CONFIG_VM86 |
613 | testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) | 616 | testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) |
614 | movl %esp, %eax | 617 | movl %esp, %eax |
615 | jne work_notifysig_v86 # returning to kernel-space or | 618 | jnz work_notifysig_v86 # returning to kernel-space or |
616 | # vm86-space | 619 | # vm86-space |
617 | 1: | 620 | 1: |
618 | #else | 621 | #else |
@@ -720,43 +723,22 @@ END(sysenter_badsys) | |||
720 | .endm | 723 | .endm |
721 | 724 | ||
722 | /* | 725 | /* |
723 | * Build the entry stubs and pointer table with some assembler magic. | 726 | * Build the entry stubs with some assembler magic. |
724 | * We pack 7 stubs into a single 32-byte chunk, which will fit in a | 727 | * We pack 1 stub into every 8-byte block. |
725 | * single cache line on all modern x86 implementations. | ||
726 | */ | 728 | */ |
727 | .section .init.rodata,"a" | 729 | .align 8 |
728 | ENTRY(interrupt) | ||
729 | .section .entry.text, "ax" | ||
730 | .p2align 5 | ||
731 | .p2align CONFIG_X86_L1_CACHE_SHIFT | ||
732 | ENTRY(irq_entries_start) | 730 | ENTRY(irq_entries_start) |
733 | RING0_INT_FRAME | 731 | RING0_INT_FRAME |
734 | vector=FIRST_EXTERNAL_VECTOR | 732 | vector=FIRST_EXTERNAL_VECTOR |
735 | .rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 | 733 | .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) |
736 | .balign 32 | 734 | pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
737 | .rept 7 | 735 | vector=vector+1 |
738 | .if vector < FIRST_SYSTEM_VECTOR | 736 | jmp common_interrupt |
739 | .if vector <> FIRST_EXTERNAL_VECTOR | ||
740 | CFI_ADJUST_CFA_OFFSET -4 | 737 | CFI_ADJUST_CFA_OFFSET -4 |
741 | .endif | 738 | .align 8 |
742 | 1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ | 739 | .endr |
743 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | ||
744 | jmp 2f | ||
745 | .endif | ||
746 | .previous | ||
747 | .long 1b | ||
748 | .section .entry.text, "ax" | ||
749 | vector=vector+1 | ||
750 | .endif | ||
751 | .endr | ||
752 | 2: jmp common_interrupt | ||
753 | .endr | ||
754 | END(irq_entries_start) | 740 | END(irq_entries_start) |
755 | 741 | ||
756 | .previous | ||
757 | END(interrupt) | ||
758 | .previous | ||
759 | |||
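The ~vector+0x80 trick is what keeps every stub down to a short push of an 8-bit immediate. A sketch verifying the three properties the comments rely on, with FIRST_EXTERNAL_VECTOR assumed to be 0x20 as on stock x86:

    #include <assert.h>

    #define FIRST_EXTERNAL_VECTOR 0x20  /* assumed, stock x86 value */
    #define NR_VECTORS            256

    int main(void)
    {
        for (int vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
            int enc = ~vector + 0x80;        /* what each stub pushes       */

            /* 1. fits a signed byte, so the push is a 2-byte "push imm8": */
            assert(enc >= -128 && enc <= 127);

            /* 2. common_interrupt's "add $-0x80" lands in [-256, -1]:     */
            int adjusted = enc - 0x80;
            assert(adjusted == ~vector && adjusted >= -256 && adjusted <= -1);

            /* 3. the C handler recovers the vector with a bitwise NOT:    */
            assert((~adjusted & 0xff) == vector);
        }
        return 0;
    }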
760 | /* | 742 | /* |
761 | * the CPU automatically disables interrupts when executing an IRQ vector, | 743 | * the CPU automatically disables interrupts when executing an IRQ vector, |
762 | * so IRQ-flags tracing has to follow that: | 744 | * so IRQ-flags tracing has to follow that: |
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error) | |||
816 | pushl_cfi $0 | 798 | pushl_cfi $0 |
817 | #ifdef CONFIG_X86_INVD_BUG | 799 | #ifdef CONFIG_X86_INVD_BUG |
818 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ | 800 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ |
819 | 661: pushl_cfi $do_general_protection | 801 | ALTERNATIVE "pushl_cfi $do_general_protection", \ |
820 | 662: | 802 | "pushl $do_simd_coprocessor_error", \ |
821 | .section .altinstructions,"a" | 803 | X86_FEATURE_XMM |
822 | altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f | ||
823 | .previous | ||
824 | .section .altinstr_replacement,"ax" | ||
825 | 663: pushl $do_simd_coprocessor_error | ||
826 | 664: | ||
827 | .previous | ||
828 | #else | 804 | #else |
829 | pushl_cfi $do_simd_coprocessor_error | 805 | pushl_cfi $do_simd_coprocessor_error |
830 | #endif | 806 | #endif |
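The ALTERNATIVE macro packages what the open-coded version above did by hand: emit the default instruction inline, plus a record that lets apply_alternatives() patch in the replacement at boot when the CPU feature is present. A loose userspace analogy (function-pointer patching standing in for in-place code rewriting; all names here are invented for illustration):

    #include <stdio.h>

    typedef void (*handler_t)(void);

    static void push_general_protection(void) { puts("#GP handler (no XMM)"); }
    static void push_simd_error(void)         { puts("SIMD error handler"); }

    struct alt_entry {
        handler_t *site;        /* location holding the default behavior */
        handler_t  replacement; /* installed when the CPU feature exists */
        int        feature;     /* required feature bit                  */
    };

    static handler_t exc19_handler = push_general_protection; /* default */

    static struct alt_entry alt_table[] = {
        { &exc19_handler, push_simd_error, 1 /* stand-in for X86_FEATURE_XMM */ },
    };

    static int cpu_has(int feature) { (void)feature; return 1; /* pretend SSE */ }

    int main(void)
    {
        /* one "apply_alternatives()" pass at boot, before first use: */
        for (unsigned i = 0; i < sizeof(alt_table) / sizeof(alt_table[0]); i++)
            if (cpu_has(alt_table[i].feature))
                *alt_table[i].site = alt_table[i].replacement;

        exc19_handler(); /* runs the SIMD handler on an XMM-capable "CPU" */
        return 0;
    }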
@@ -1240,20 +1216,13 @@ error_code: | |||
1240 | /*CFI_REL_OFFSET es, 0*/ | 1216 | /*CFI_REL_OFFSET es, 0*/ |
1241 | pushl_cfi %ds | 1217 | pushl_cfi %ds |
1242 | /*CFI_REL_OFFSET ds, 0*/ | 1218 | /*CFI_REL_OFFSET ds, 0*/ |
1243 | pushl_cfi %eax | 1219 | pushl_cfi_reg eax |
1244 | CFI_REL_OFFSET eax, 0 | 1220 | pushl_cfi_reg ebp |
1245 | pushl_cfi %ebp | 1221 | pushl_cfi_reg edi |
1246 | CFI_REL_OFFSET ebp, 0 | 1222 | pushl_cfi_reg esi |
1247 | pushl_cfi %edi | 1223 | pushl_cfi_reg edx |
1248 | CFI_REL_OFFSET edi, 0 | 1224 | pushl_cfi_reg ecx |
1249 | pushl_cfi %esi | 1225 | pushl_cfi_reg ebx |
1250 | CFI_REL_OFFSET esi, 0 | ||
1251 | pushl_cfi %edx | ||
1252 | CFI_REL_OFFSET edx, 0 | ||
1253 | pushl_cfi %ecx | ||
1254 | CFI_REL_OFFSET ecx, 0 | ||
1255 | pushl_cfi %ebx | ||
1256 | CFI_REL_OFFSET ebx, 0 | ||
1257 | cld | 1226 | cld |
1258 | movl $(__KERNEL_PERCPU), %ecx | 1227 | movl $(__KERNEL_PERCPU), %ecx |
1259 | movl %ecx, %fs | 1228 | movl %ecx, %fs |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f0095a76c182..c7b238494b31 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -14,27 +14,14 @@ | |||
14 | * NOTE: This code handles signal-recognition, which happens every time | 14 | * NOTE: This code handles signal-recognition, which happens every time |
15 | * after an interrupt and after each system call. | 15 | * after an interrupt and after each system call. |
16 | * | 16 | * |
17 | * Normal syscalls and interrupts don't save a full stack frame, this is | ||
18 | * only done for syscall tracing, signals or fork/exec et.al. | ||
19 | * | ||
20 | * A note on terminology: | 17 | * A note on terminology: |
21 | * - top of stack: Architecture defined interrupt frame from SS to RIP | 18 | * - iret frame: Architecture defined interrupt frame from SS to RIP |
22 | * at the top of the kernel process stack. | 19 | * at the top of the kernel process stack. |
23 | * - partial stack frame: partially saved registers up to R11. | ||
24 | * - full stack frame: Like partial stack frame, but all register saved. | ||
25 | * | 20 | * |
26 | * Some macro usage: | 21 | * Some macro usage: |
27 | * - CFI macros are used to generate dwarf2 unwind information for better | 22 | * - CFI macros are used to generate dwarf2 unwind information for better |
28 | * backtraces. They don't change any code. | 23 | * backtraces. They don't change any code. |
29 | * - SAVE_ALL/RESTORE_ALL - Save/restore all registers | ||
30 | * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. | ||
31 | * There are unfortunately lots of special cases where some registers | ||
32 | * not touched. The macro is a big mess that should be cleaned up. | ||
33 | * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. | ||
34 | * Gives a full stack frame. | ||
35 | * - ENTRY/END Define functions in the symbol table. | 24 | * - ENTRY/END Define functions in the symbol table. |
36 | * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack | ||
37 | * frame that is otherwise undefined after a SYSCALL | ||
38 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. | 25 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. |
39 | * - idtentry - Define exception entry points. | 26 | * - idtentry - Define exception entry points. |
40 | */ | 27 | */ |
@@ -70,10 +57,6 @@ | |||
70 | .section .entry.text, "ax" | 57 | .section .entry.text, "ax" |
71 | 58 | ||
72 | 59 | ||
73 | #ifndef CONFIG_PREEMPT | ||
74 | #define retint_kernel retint_restore_args | ||
75 | #endif | ||
76 | |||
77 | #ifdef CONFIG_PARAVIRT | 60 | #ifdef CONFIG_PARAVIRT |
78 | ENTRY(native_usergs_sysret64) | 61 | ENTRY(native_usergs_sysret64) |
79 | swapgs | 62 | swapgs |
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64) | |||
82 | #endif /* CONFIG_PARAVIRT */ | 65 | #endif /* CONFIG_PARAVIRT */ |
83 | 66 | ||
84 | 67 | ||
85 | .macro TRACE_IRQS_IRETQ offset=ARGOFFSET | 68 | .macro TRACE_IRQS_IRETQ |
86 | #ifdef CONFIG_TRACE_IRQFLAGS | 69 | #ifdef CONFIG_TRACE_IRQFLAGS |
87 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | 70 | bt $9,EFLAGS(%rsp) /* interrupts off? */ |
88 | jnc 1f | 71 | jnc 1f |
89 | TRACE_IRQS_ON | 72 | TRACE_IRQS_ON |
90 | 1: | 73 | 1: |
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64) | |||
116 | call debug_stack_reset | 99 | call debug_stack_reset |
117 | .endm | 100 | .endm |
118 | 101 | ||
119 | .macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET | 102 | .macro TRACE_IRQS_IRETQ_DEBUG |
120 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | 103 | bt $9,EFLAGS(%rsp) /* interrupts off? */ |
121 | jnc 1f | 104 | jnc 1f |
122 | TRACE_IRQS_ON_DEBUG | 105 | TRACE_IRQS_ON_DEBUG |
123 | 1: | 106 | 1: |
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64) | |||
130 | #endif | 113 | #endif |
131 | 114 | ||
132 | /* | 115 | /* |
133 | * C code is not supposed to know about undefined top of stack. Every time | 116 | * empty frame |
134 | * a C function with a pt_regs argument is called from the SYSCALL based | ||
135 | * fast path FIXUP_TOP_OF_STACK is needed. | ||
136 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
137 | * manipulation. | ||
138 | */ | ||
139 | |||
140 | /* %rsp:at FRAMEEND */ | ||
141 | .macro FIXUP_TOP_OF_STACK tmp offset=0 | ||
142 | movq PER_CPU_VAR(old_rsp),\tmp | ||
143 | movq \tmp,RSP+\offset(%rsp) | ||
144 | movq $__USER_DS,SS+\offset(%rsp) | ||
145 | movq $__USER_CS,CS+\offset(%rsp) | ||
146 | movq RIP+\offset(%rsp),\tmp /* get rip */ | ||
147 | movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ | ||
148 | movq R11+\offset(%rsp),\tmp /* get eflags */ | ||
149 | movq \tmp,EFLAGS+\offset(%rsp) | ||
150 | .endm | ||
151 | |||
152 | .macro RESTORE_TOP_OF_STACK tmp offset=0 | ||
153 | movq RSP+\offset(%rsp),\tmp | ||
154 | movq \tmp,PER_CPU_VAR(old_rsp) | ||
155 | movq EFLAGS+\offset(%rsp),\tmp | ||
156 | movq \tmp,R11+\offset(%rsp) | ||
157 | .endm | ||
158 | |||
159 | /* | ||
160 | * initial frame state for interrupts (and exceptions without error code) | ||
161 | */ | 117 | */ |
162 | .macro EMPTY_FRAME start=1 offset=0 | 118 | .macro EMPTY_FRAME start=1 offset=0 |
163 | .if \start | 119 | .if \start |
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64) | |||
173 | * initial frame state for interrupts (and exceptions without error code) | 129 | * initial frame state for interrupts (and exceptions without error code) |
174 | */ | 130 | */ |
175 | .macro INTR_FRAME start=1 offset=0 | 131 | .macro INTR_FRAME start=1 offset=0 |
176 | EMPTY_FRAME \start, SS+8+\offset-RIP | 132 | EMPTY_FRAME \start, 5*8+\offset |
177 | /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ | 133 | /*CFI_REL_OFFSET ss, 4*8+\offset*/ |
178 | CFI_REL_OFFSET rsp, RSP+\offset-RIP | 134 | CFI_REL_OFFSET rsp, 3*8+\offset |
179 | /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ | 135 | /*CFI_REL_OFFSET rflags, 2*8+\offset*/ |
180 | /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ | 136 | /*CFI_REL_OFFSET cs, 1*8+\offset*/ |
181 | CFI_REL_OFFSET rip, RIP+\offset-RIP | 137 | CFI_REL_OFFSET rip, 0*8+\offset |
182 | .endm | 138 | .endm |
183 | 139 | ||
184 | /* | 140 | /* |
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64) | |||
186 | * with vector already pushed) | 142 | * with vector already pushed) |
187 | */ | 143 | */ |
188 | .macro XCPT_FRAME start=1 offset=0 | 144 | .macro XCPT_FRAME start=1 offset=0 |
189 | INTR_FRAME \start, RIP+\offset-ORIG_RAX | 145 | INTR_FRAME \start, 1*8+\offset |
190 | .endm | ||
191 | |||
192 | /* | ||
193 | * frame that enables calling into C. | ||
194 | */ | ||
195 | .macro PARTIAL_FRAME start=1 offset=0 | ||
196 | XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET | ||
197 | CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET | ||
198 | CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET | ||
199 | CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET | ||
200 | CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET | ||
201 | CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET | ||
202 | CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET | ||
203 | CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET | ||
204 | CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET | ||
205 | CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET | ||
206 | .endm | 146 | .endm |
207 | 147 | ||
208 | /* | 148 | /* |
209 | * frame that enables passing a complete pt_regs to a C function. | 149 | * frame that enables passing a complete pt_regs to a C function. |
210 | */ | 150 | */ |
211 | .macro DEFAULT_FRAME start=1 offset=0 | 151 | .macro DEFAULT_FRAME start=1 offset=0 |
212 | PARTIAL_FRAME \start, R11+\offset-R15 | 152 | XCPT_FRAME \start, ORIG_RAX+\offset |
153 | CFI_REL_OFFSET rdi, RDI+\offset | ||
154 | CFI_REL_OFFSET rsi, RSI+\offset | ||
155 | CFI_REL_OFFSET rdx, RDX+\offset | ||
156 | CFI_REL_OFFSET rcx, RCX+\offset | ||
157 | CFI_REL_OFFSET rax, RAX+\offset | ||
158 | CFI_REL_OFFSET r8, R8+\offset | ||
159 | CFI_REL_OFFSET r9, R9+\offset | ||
160 | CFI_REL_OFFSET r10, R10+\offset | ||
161 | CFI_REL_OFFSET r11, R11+\offset | ||
213 | CFI_REL_OFFSET rbx, RBX+\offset | 162 | CFI_REL_OFFSET rbx, RBX+\offset |
214 | CFI_REL_OFFSET rbp, RBP+\offset | 163 | CFI_REL_OFFSET rbp, RBP+\offset |
215 | CFI_REL_OFFSET r12, R12+\offset | 164 | CFI_REL_OFFSET r12, R12+\offset |
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64) | |||
218 | CFI_REL_OFFSET r15, R15+\offset | 167 | CFI_REL_OFFSET r15, R15+\offset |
219 | .endm | 168 | .endm |
220 | 169 | ||
221 | ENTRY(save_paranoid) | ||
222 | XCPT_FRAME 1 RDI+8 | ||
223 | cld | ||
224 | movq %rdi, RDI+8(%rsp) | ||
225 | movq %rsi, RSI+8(%rsp) | ||
226 | movq_cfi rdx, RDX+8 | ||
227 | movq_cfi rcx, RCX+8 | ||
228 | movq_cfi rax, RAX+8 | ||
229 | movq %r8, R8+8(%rsp) | ||
230 | movq %r9, R9+8(%rsp) | ||
231 | movq %r10, R10+8(%rsp) | ||
232 | movq %r11, R11+8(%rsp) | ||
233 | movq_cfi rbx, RBX+8 | ||
234 | movq %rbp, RBP+8(%rsp) | ||
235 | movq %r12, R12+8(%rsp) | ||
236 | movq %r13, R13+8(%rsp) | ||
237 | movq %r14, R14+8(%rsp) | ||
238 | movq %r15, R15+8(%rsp) | ||
239 | movl $1,%ebx | ||
240 | movl $MSR_GS_BASE,%ecx | ||
241 | rdmsr | ||
242 | testl %edx,%edx | ||
243 | js 1f /* negative -> in kernel */ | ||
244 | SWAPGS | ||
245 | xorl %ebx,%ebx | ||
246 | 1: ret | ||
247 | CFI_ENDPROC | ||
248 | END(save_paranoid) | ||
249 | |||
250 | /* | 170 | /* |
251 | * A newly forked process directly context switches into this address. | 171 | * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. |
252 | * | 172 | * |
253 | * rdi: prev task we switched from | 173 | * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
254 | */ | 174 | * then loads new ss, cs, and rip from previously programmed MSRs. |
255 | ENTRY(ret_from_fork) | 175 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
256 | DEFAULT_FRAME | 176 | * are not needed). SYSCALL does not save anything on the stack |
257 | 177 | * and does not change rsp. | |
258 | LOCK ; btr $TIF_FORK,TI_flags(%r8) | ||
259 | |||
260 | pushq_cfi $0x0002 | ||
261 | popfq_cfi # reset kernel eflags | ||
262 | |||
263 | call schedule_tail # rdi: 'prev' task parameter | ||
264 | |||
265 | GET_THREAD_INFO(%rcx) | ||
266 | |||
267 | RESTORE_REST | ||
268 | |||
269 | testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
270 | jz 1f | ||
271 | |||
272 | /* | ||
273 | * By the time we get here, we have no idea whether our pt_regs, | ||
274 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | ||
275 | * the slow path, or one of the ia32entry paths. | ||
276 | * Use int_ret_from_sys_call to return, since it can safely handle | ||
277 | * all of the above. | ||
278 | */ | ||
279 | jmp int_ret_from_sys_call | ||
280 | |||
281 | 1: | ||
282 | subq $REST_SKIP, %rsp # leave space for volatiles | ||
283 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
284 | movq %rbp, %rdi | ||
285 | call *%rbx | ||
286 | movl $0, RAX(%rsp) | ||
287 | RESTORE_REST | ||
288 | jmp int_ret_from_sys_call | ||
289 | CFI_ENDPROC | ||
290 | END(ret_from_fork) | ||
291 | |||
292 | /* | ||
293 | * System call entry. Up to 6 arguments in registers are supported. | ||
294 | * | 178 | * |
295 | * SYSCALL does not save anything on the stack and does not change the | 179 | * Registers on entry: |
296 | * stack pointer. However, it does mask the flags register for us, so | ||
297 | * CLD and CLAC are not needed. | ||
298 | */ | ||
299 | |||
300 | /* | ||
301 | * Register setup: | ||
302 | * rax system call number | 180 | * rax system call number |
181 | * rcx return address | ||
182 | * r11 saved rflags (note: r11 is a callee-clobbered register in C ABI) | ||
303 | * rdi arg0 | 183 | * rdi arg0 |
304 | * rcx return address for syscall/sysret, C arg3 | ||
305 | * rsi arg1 | 184 | * rsi arg1 |
306 | * rdx arg2 | 185 | * rdx arg2 |
307 | * r10 arg3 (--> moved to rcx for C) | 186 | * r10 arg3 (needs to be moved to rcx to conform to C ABI) |
308 | * r8 arg4 | 187 | * r8 arg4 |
309 | * r9 arg5 | 188 | * r9 arg5 |
310 | * r11 eflags for syscall/sysret, temporary for C | 189 | * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) |
311 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
312 | * | 190 | * |
313 | * Interrupts are off on entry. | ||
314 | * Only called from user space. | 191 | * Only called from user space. |
315 | * | 192 | * |
316 | * XXX if we had a free scratch register we could save the RSP into the stack frame | 193 | * When user can change pt_regs->foo always force IRET. That is because |
317 | * and report it properly in ps. Unfortunately we haven't. | ||
318 | * | ||
319 | * When user can change the frames always force IRET. That is because | ||
320 | * it deals with uncanonical addresses better. SYSRET has trouble | 194 | * it deals with uncanonical addresses better. SYSRET has trouble |
321 | * with them due to bugs in both AMD and Intel CPUs. | 195 | * with them due to bugs in both AMD and Intel CPUs. |
322 | */ | 196 | */ |
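The register contract described above is visible from userspace as well: any raw SYSCALL invocation must treat rcx and r11 as clobbered, because the instruction itself overwrites them with the return rip and the saved rflags. A minimal sketch for x86-64 Linux (syscall number 39 assumed to be __NR_getpid):

    #include <stdio.h>

    /* Invoke a raw x86-64 Linux syscall honoring the contract above: the
     * number in rax, and rcx/r11 in the clobber list because SYSCALL
     * itself overwrites them with the return rip and the saved rflags. */
    static long raw_syscall0(long nr)
    {
        long ret;
        asm volatile("syscall"
                     : "=a"(ret)
                     : "a"(nr)
                     : "rcx", "r11", "memory");
        return ret;
    }

    int main(void)
    {
        printf("getpid() = %ld\n", raw_syscall0(39)); /* 39: __NR_getpid */
        return 0;
    }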
@@ -324,9 +198,15 @@ END(ret_from_fork) | |||
324 | ENTRY(system_call) | 198 | ENTRY(system_call) |
325 | CFI_STARTPROC simple | 199 | CFI_STARTPROC simple |
326 | CFI_SIGNAL_FRAME | 200 | CFI_SIGNAL_FRAME |
327 | CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET | 201 | CFI_DEF_CFA rsp,0 |
328 | CFI_REGISTER rip,rcx | 202 | CFI_REGISTER rip,rcx |
329 | /*CFI_REGISTER rflags,r11*/ | 203 | /*CFI_REGISTER rflags,r11*/ |
204 | |||
205 | /* | ||
206 | * Interrupts are off on entry. | ||
207 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
208 | * it is too small to ever cause noticeable irq latency. | ||
209 | */ | ||
330 | SWAPGS_UNSAFE_STACK | 210 | SWAPGS_UNSAFE_STACK |
331 | /* | 211 | /* |
332 | * A hypervisor implementation might want to use a label | 212 | * A hypervisor implementation might want to use a label |
@@ -335,18 +215,38 @@ ENTRY(system_call) | |||
335 | */ | 215 | */ |
336 | GLOBAL(system_call_after_swapgs) | 216 | GLOBAL(system_call_after_swapgs) |
337 | 217 | ||
338 | movq %rsp,PER_CPU_VAR(old_rsp) | 218 | movq %rsp,PER_CPU_VAR(rsp_scratch) |
339 | movq PER_CPU_VAR(kernel_stack),%rsp | 219 | movq PER_CPU_VAR(kernel_stack),%rsp |
220 | |||
221 | /* Construct struct pt_regs on stack */ | ||
222 | pushq_cfi $__USER_DS /* pt_regs->ss */ | ||
223 | pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ | ||
340 | /* | 224 | /* |
341 | * No need to follow this irqs off/on section - it's straight | 225 | * Re-enable interrupts. |
342 | * and short: | 226 | * We use 'rsp_scratch' as a scratch space, hence the irq-off block above |
227 | * must execute atomically in the face of possible interrupt-driven | ||
228 | * task preemption. We must enable interrupts only after we're done | ||
229 | * with using rsp_scratch: | ||
343 | */ | 230 | */ |
344 | ENABLE_INTERRUPTS(CLBR_NONE) | 231 | ENABLE_INTERRUPTS(CLBR_NONE) |
345 | SAVE_ARGS 8, 0, rax_enosys=1 | 232 | pushq_cfi %r11 /* pt_regs->flags */ |
346 | movq_cfi rax,(ORIG_RAX-ARGOFFSET) | 233 | pushq_cfi $__USER_CS /* pt_regs->cs */ |
347 | movq %rcx,RIP-ARGOFFSET(%rsp) | 234 | pushq_cfi %rcx /* pt_regs->ip */ |
348 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | 235 | CFI_REL_OFFSET rip,0 |
349 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 236 | pushq_cfi_reg rax /* pt_regs->orig_ax */ |
237 | pushq_cfi_reg rdi /* pt_regs->di */ | ||
238 | pushq_cfi_reg rsi /* pt_regs->si */ | ||
239 | pushq_cfi_reg rdx /* pt_regs->dx */ | ||
240 | pushq_cfi_reg rcx /* pt_regs->cx */ | ||
241 | pushq_cfi $-ENOSYS /* pt_regs->ax */ | ||
242 | pushq_cfi_reg r8 /* pt_regs->r8 */ | ||
243 | pushq_cfi_reg r9 /* pt_regs->r9 */ | ||
244 | pushq_cfi_reg r10 /* pt_regs->r10 */ | ||
245 | pushq_cfi_reg r11 /* pt_regs->r11 */ | ||
246 | sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ | ||
247 | CFI_ADJUST_CFA_OFFSET 6*8 | ||
248 | |||
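The push sequence builds struct pt_regs from its highest member (ss) down, and the final sub $(6*8),%rsp merely reserves the six slots for bp, bx and r12-r15 that the fast path never writes. A sketch checking the layout arithmetic against a local mirror of the structure (an assumption here; the authoritative definition lives in arch/x86/include/asm/ptrace.h):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Local mirror of the x86-64 struct pt_regs layout. */
    struct pt_regs_mirror {
        uint64_t r15, r14, r13, r12, bp, bx;           /* 6 "extra" slots  */
        uint64_t r11, r10, r9, r8, ax, cx, dx, si, di; /* C-clobbered regs */
        uint64_t orig_ax;                              /* nr / error code  */
        uint64_t ip, cs, flags, sp, ss;                /* hardware frame   */
    };

    int main(void)
    {
        /* SIZEOF_PTREGS, as used by ASM_THREAD_INFO(...) above: */
        assert(sizeof(struct pt_regs_mirror) == 21 * 8);

        /* The pushes run from ss down to r11; the six lowest slots
         * (r15..bx) are only reserved by "sub $(6*8),%rsp": */
        assert(offsetof(struct pt_regs_mirror, r11)     ==  6 * 8);
        assert(offsetof(struct pt_regs_mirror, orig_ax) == 15 * 8);
        assert(offsetof(struct pt_regs_mirror, ip)      == 16 * 8);
        return 0;
    }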
249 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) | ||
350 | jnz tracesys | 250 | jnz tracesys |
351 | system_call_fastpath: | 251 | system_call_fastpath: |
352 | #if __SYSCALL_MASK == ~0 | 252 | #if __SYSCALL_MASK == ~0 |
@@ -355,18 +255,21 @@ system_call_fastpath: | |||
355 | andl $__SYSCALL_MASK,%eax | 255 | andl $__SYSCALL_MASK,%eax |
356 | cmpl $__NR_syscall_max,%eax | 256 | cmpl $__NR_syscall_max,%eax |
357 | #endif | 257 | #endif |
358 | ja ret_from_sys_call /* and return regs->ax */ | 258 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
359 | movq %r10,%rcx | 259 | movq %r10,%rcx |
360 | call *sys_call_table(,%rax,8) # XXX: rip relative | 260 | call *sys_call_table(,%rax,8) |
361 | movq %rax,RAX-ARGOFFSET(%rsp) | 261 | movq %rax,RAX(%rsp) |
262 | 1: | ||
362 | /* | 263 | /* |
363 | * Syscall return path ending with SYSRET (fast path) | 264 | * Syscall return path ending with SYSRET (fast path). |
364 | * Has incomplete stack frame and undefined top of stack. | 265 | * Has incompletely filled pt_regs. |
365 | */ | 266 | */ |
366 | ret_from_sys_call: | ||
367 | LOCKDEP_SYS_EXIT | 267 | LOCKDEP_SYS_EXIT |
268 | /* | ||
269 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
270 | * it is too small to ever cause noticeable irq latency. | ||
271 | */ | ||
368 | DISABLE_INTERRUPTS(CLBR_NONE) | 272 | DISABLE_INTERRUPTS(CLBR_NONE) |
369 | TRACE_IRQS_OFF | ||
370 | 273 | ||
371 | /* | 274 | /* |
372 | * We must check ti flags with interrupts (or at least preemption) | 275 | * We must check ti flags with interrupts (or at least preemption) |
@@ -376,72 +279,73 @@ ret_from_sys_call: | |||
376 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is | 279 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is |
377 | * very bad. | 280 | * very bad. |
378 | */ | 281 | */ |
379 | testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 282 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
380 | jnz int_ret_from_sys_call_fixup /* Go the the slow path */ | 283 | jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ |
381 | 284 | ||
382 | CFI_REMEMBER_STATE | 285 | CFI_REMEMBER_STATE |
383 | /* | 286 | |
384 | * sysretq will re-enable interrupts: | 287 | RESTORE_C_REGS_EXCEPT_RCX_R11 |
385 | */ | 288 | movq RIP(%rsp),%rcx |
386 | TRACE_IRQS_ON | ||
387 | movq RIP-ARGOFFSET(%rsp),%rcx | ||
388 | CFI_REGISTER rip,rcx | 289 | CFI_REGISTER rip,rcx |
389 | RESTORE_ARGS 1,-ARG_SKIP,0 | 290 | movq EFLAGS(%rsp),%r11 |
390 | /*CFI_REGISTER rflags,r11*/ | 291 | /*CFI_REGISTER rflags,r11*/ |
391 | movq PER_CPU_VAR(old_rsp), %rsp | 292 | movq RSP(%rsp),%rsp |
293 | /* | ||
294 | * 64bit SYSRET restores rip from rcx, | ||
295 | * rflags from r11 (but RF and VM bits are forced to 0), | ||
296 | * cs and ss are loaded from MSRs. | ||
297 | * Restoration of rflags re-enables interrupts. | ||
298 | */ | ||
392 | USERGS_SYSRET64 | 299 | USERGS_SYSRET64 |
393 | 300 | ||
394 | CFI_RESTORE_STATE | 301 | CFI_RESTORE_STATE |
395 | 302 | ||
396 | int_ret_from_sys_call_fixup: | 303 | /* Do syscall entry tracing */ |
397 | FIXUP_TOP_OF_STACK %r11, -ARGOFFSET | ||
398 | jmp int_ret_from_sys_call_irqs_off | ||
399 | |||
400 | /* Do syscall tracing */ | ||
401 | tracesys: | 304 | tracesys: |
402 | leaq -REST_SKIP(%rsp), %rdi | 305 | movq %rsp, %rdi |
403 | movq $AUDIT_ARCH_X86_64, %rsi | 306 | movl $AUDIT_ARCH_X86_64, %esi |
404 | call syscall_trace_enter_phase1 | 307 | call syscall_trace_enter_phase1 |
405 | test %rax, %rax | 308 | test %rax, %rax |
406 | jnz tracesys_phase2 /* if needed, run the slow path */ | 309 | jnz tracesys_phase2 /* if needed, run the slow path */ |
407 | LOAD_ARGS 0 /* else restore clobbered regs */ | 310 | RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ |
311 | movq ORIG_RAX(%rsp), %rax | ||
408 | jmp system_call_fastpath /* and return to the fast path */ | 312 | jmp system_call_fastpath /* and return to the fast path */ |
409 | 313 | ||
410 | tracesys_phase2: | 314 | tracesys_phase2: |
411 | SAVE_REST | 315 | SAVE_EXTRA_REGS |
412 | FIXUP_TOP_OF_STACK %rdi | ||
413 | movq %rsp, %rdi | 316 | movq %rsp, %rdi |
414 | movq $AUDIT_ARCH_X86_64, %rsi | 317 | movl $AUDIT_ARCH_X86_64, %esi |
415 | movq %rax,%rdx | 318 | movq %rax,%rdx |
416 | call syscall_trace_enter_phase2 | 319 | call syscall_trace_enter_phase2 |
417 | 320 | ||
418 | /* | 321 | /* |
419 | * Reload arg registers from stack in case ptrace changed them. | 322 | * Reload registers from stack in case ptrace changed them. |
420 | * We don't reload %rax because syscall_trace_enter_phase2() returned | 323 | * We don't reload %rax because syscall_trace_enter_phase2() returned |
421 | * the value it wants us to use in the table lookup. | 324 | * the value it wants us to use in the table lookup. |
422 | */ | 325 | */ |
423 | LOAD_ARGS ARGOFFSET, 1 | 326 | RESTORE_C_REGS_EXCEPT_RAX |
424 | RESTORE_REST | 327 | RESTORE_EXTRA_REGS |
425 | #if __SYSCALL_MASK == ~0 | 328 | #if __SYSCALL_MASK == ~0 |
426 | cmpq $__NR_syscall_max,%rax | 329 | cmpq $__NR_syscall_max,%rax |
427 | #else | 330 | #else |
428 | andl $__SYSCALL_MASK,%eax | 331 | andl $__SYSCALL_MASK,%eax |
429 | cmpl $__NR_syscall_max,%eax | 332 | cmpl $__NR_syscall_max,%eax |
430 | #endif | 333 | #endif |
431 | ja int_ret_from_sys_call /* RAX(%rsp) is already set */ | 334 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
432 | movq %r10,%rcx /* fixup for C */ | 335 | movq %r10,%rcx /* fixup for C */ |
433 | call *sys_call_table(,%rax,8) | 336 | call *sys_call_table(,%rax,8) |
434 | movq %rax,RAX-ARGOFFSET(%rsp) | 337 | movq %rax,RAX(%rsp) |
435 | /* Use IRET because user could have changed frame */ | 338 | 1: |
339 | /* Use IRET because user could have changed pt_regs->foo */ | ||
436 | 340 | ||
437 | /* | 341 | /* |
438 | * Syscall return path ending with IRET. | 342 | * Syscall return path ending with IRET. |
439 | * Has correct top of stack, but partial stack frame. | 343 | * Has correct iret frame. |
440 | */ | 344 | */ |
441 | GLOBAL(int_ret_from_sys_call) | 345 | GLOBAL(int_ret_from_sys_call) |
442 | DISABLE_INTERRUPTS(CLBR_NONE) | 346 | DISABLE_INTERRUPTS(CLBR_NONE) |
347 | int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ | ||
443 | TRACE_IRQS_OFF | 348 | TRACE_IRQS_OFF |
444 | int_ret_from_sys_call_irqs_off: | ||
445 | movl $_TIF_ALLWORK_MASK,%edi | 349 | movl $_TIF_ALLWORK_MASK,%edi |
446 | /* edi: mask to check */ | 350 | /* edi: mask to check */ |
447 | GLOBAL(int_with_check) | 351 | GLOBAL(int_with_check) |
@@ -450,8 +354,8 @@ GLOBAL(int_with_check) | |||
450 | movl TI_flags(%rcx),%edx | 354 | movl TI_flags(%rcx),%edx |
451 | andl %edi,%edx | 355 | andl %edi,%edx |
452 | jnz int_careful | 356 | jnz int_careful |
453 | andl $~TS_COMPAT,TI_status(%rcx) | 357 | andl $~TS_COMPAT,TI_status(%rcx) |
454 | jmp retint_swapgs | 358 | jmp syscall_return |
455 | 359 | ||
456 | /* Either reschedule or signal or syscall exit tracking needed. */ | 360 | /* Either reschedule or signal or syscall exit tracking needed. */ |
457 | /* First do a reschedule test. */ | 361 | /* First do a reschedule test. */ |
@@ -468,12 +372,11 @@ int_careful: | |||
468 | TRACE_IRQS_OFF | 372 | TRACE_IRQS_OFF |
469 | jmp int_with_check | 373 | jmp int_with_check |
470 | 374 | ||
471 | /* handle signals and tracing -- both require a full stack frame */ | 375 | /* handle signals and tracing -- both require a full pt_regs */ |
472 | int_very_careful: | 376 | int_very_careful: |
473 | TRACE_IRQS_ON | 377 | TRACE_IRQS_ON |
474 | ENABLE_INTERRUPTS(CLBR_NONE) | 378 | ENABLE_INTERRUPTS(CLBR_NONE) |
475 | int_check_syscall_exit_work: | 379 | SAVE_EXTRA_REGS |
476 | SAVE_REST | ||
477 | /* Check for syscall exit trace */ | 380 | /* Check for syscall exit trace */ |
478 | testl $_TIF_WORK_SYSCALL_EXIT,%edx | 381 | testl $_TIF_WORK_SYSCALL_EXIT,%edx |
479 | jz int_signal | 382 | jz int_signal |
@@ -492,86 +395,192 @@ int_signal: | |||
492 | call do_notify_resume | 395 | call do_notify_resume |
493 | 1: movl $_TIF_WORK_MASK,%edi | 396 | 1: movl $_TIF_WORK_MASK,%edi |
494 | int_restore_rest: | 397 | int_restore_rest: |
495 | RESTORE_REST | 398 | RESTORE_EXTRA_REGS |
496 | DISABLE_INTERRUPTS(CLBR_NONE) | 399 | DISABLE_INTERRUPTS(CLBR_NONE) |
497 | TRACE_IRQS_OFF | 400 | TRACE_IRQS_OFF |
498 | jmp int_with_check | 401 | jmp int_with_check |
402 | |||
403 | syscall_return: | ||
404 | /* The IRETQ could re-enable interrupts: */ | ||
405 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
406 | TRACE_IRQS_IRETQ | ||
407 | |||
408 | /* | ||
409 | * Try to use SYSRET instead of IRET if we're returning to | ||
410 | * a completely clean 64-bit userspace context. | ||
411 | */ | ||
412 | movq RCX(%rsp),%rcx | ||
413 | cmpq %rcx,RIP(%rsp) /* RCX == RIP */ | ||
414 | jne opportunistic_sysret_failed | ||
415 | |||
416 | /* | ||
417 | * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP | ||
418 | * in kernel space. This essentially lets the user take over | ||
419 | * the kernel, since userspace controls RSP. It's not worth | ||
420 | * testing for canonicalness exactly -- this check detects any | ||
421 | * of the 17 high bits set, which is true for non-canonical | ||
422 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
423 | * Big deal.) | ||
424 | * | ||
425 | * If virtual addresses ever become wider, this will need | ||
426 | * to be updated to remain correct on both old and new CPUs. | ||
427 | */ | ||
428 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
429 | .error "virtual address width changed -- SYSRET checks need update" | ||
430 | .endif | ||
431 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
432 | jnz opportunistic_sysret_failed | ||
433 | |||
434 | cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ | ||
435 | jne opportunistic_sysret_failed | ||
436 | |||
437 | movq R11(%rsp),%r11 | ||
438 | cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ | ||
439 | jne opportunistic_sysret_failed | ||
440 | |||
441 | /* | ||
442 | * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, | ||
443 | * restoring TF results in a trap from userspace immediately after | ||
444 | * SYSRET. This would cause an infinite loop whenever #DB happens | ||
445 | * with register state that satisfies the opportunistic SYSRET | ||
446 | * conditions. For example, single-stepping this user code: | ||
447 | * | ||
448 | * movq $stuck_here,%rcx | ||
449 | * pushfq | ||
450 | * popq %r11 | ||
451 | * stuck_here: | ||
452 | * | ||
453 | * would never get past 'stuck_here'. | ||
454 | */ | ||
455 | testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 | ||
456 | jnz opportunistic_sysret_failed | ||
457 | |||
458 | /* nothing to check for RSP */ | ||
459 | |||
460 | cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ | ||
461 | jne opportunistic_sysret_failed | ||
462 | |||
463 | /* | ||
464 | * We win! This label is here just for ease of understanding | ||
465 | * perf profiles. Nothing jumps here. | ||
466 | */ | ||
467 | syscall_return_via_sysret: | ||
468 | CFI_REMEMBER_STATE | ||
469 | /* r11 is already restored (see code above) */ | ||
470 | RESTORE_C_REGS_EXCEPT_R11 | ||
471 | movq RSP(%rsp),%rsp | ||
472 | USERGS_SYSRET64 | ||
473 | CFI_RESTORE_STATE | ||
474 | |||
475 | opportunistic_sysret_failed: | ||
476 | SWAPGS | ||
477 | jmp restore_c_regs_and_iret | ||
499 | CFI_ENDPROC | 478 | CFI_ENDPROC |
500 | END(system_call) | 479 | END(system_call) |
501 | 480 | ||
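The chain of checks in the opportunistic-SYSRET path reduces to a single predicate over the saved registers. A hedged C restatement, with selector values and the 47-bit canonical test assumed for this era's x86-64:

    #include <stdint.h>

    #define __USER_CS            0x33    /* assumed selector values */
    #define __USER_DS            0x2b
    #define X86_EFLAGS_TF        0x100
    #define X86_EFLAGS_RF        0x10000
    #define __VIRTUAL_MASK_SHIFT 47      /* 48-bit virtual addresses */

    struct regs { uint64_t ip, cs, flags, ss, cx, r11; };

    /* Sketch of the eligibility test: nonzero means SYSRET may replace IRET. */
    static int sysret_ok(const struct regs *r)
    {
        if (r->cx != r->ip)                  /* SYSRET reloads rip from rcx */
            return 0;
        if (r->cx >> __VIRTUAL_MASK_SHIFT)   /* any of the 17 high bits set */
            return 0;                        /* -> non-canonical or kernel  */
        if (r->cs != __USER_CS || r->ss != __USER_DS)
            return 0;
        if (r->r11 != r->flags)              /* SYSRET reloads rflags, r11  */
            return 0;
        if (r->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF))
            return 0;                        /* TF would re-trap after SYSRET */
        return 1;
    }

    int main(void)
    {
        struct regs r = { .ip = 0x400000, .cx = 0x400000, .flags = 0x202,
                          .r11 = 0x202, .cs = __USER_CS, .ss = __USER_DS };
        return !sysret_ok(&r);  /* exits 0 when SYSRET is permitted */
    }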
481 | |||
502 | .macro FORK_LIKE func | 482 | .macro FORK_LIKE func |
503 | ENTRY(stub_\func) | 483 | ENTRY(stub_\func) |
504 | CFI_STARTPROC | 484 | CFI_STARTPROC |
505 | popq %r11 /* save return address */ | 485 | DEFAULT_FRAME 0, 8 /* offset 8: return address */ |
506 | PARTIAL_FRAME 0 | 486 | SAVE_EXTRA_REGS 8 |
507 | SAVE_REST | 487 | jmp sys_\func |
508 | pushq %r11 /* put it back on stack */ | ||
509 | FIXUP_TOP_OF_STACK %r11, 8 | ||
510 | DEFAULT_FRAME 0 8 /* offset 8: return address */ | ||
511 | call sys_\func | ||
512 | RESTORE_TOP_OF_STACK %r11, 8 | ||
513 | ret $REST_SKIP /* pop extended registers */ | ||
514 | CFI_ENDPROC | 488 | CFI_ENDPROC |
515 | END(stub_\func) | 489 | END(stub_\func) |
516 | .endm | 490 | .endm |
517 | 491 | ||
518 | .macro FIXED_FRAME label,func | ||
519 | ENTRY(\label) | ||
520 | CFI_STARTPROC | ||
521 | PARTIAL_FRAME 0 8 /* offset 8: return address */ | ||
522 | FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET | ||
523 | call \func | ||
524 | RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET | ||
525 | ret | ||
526 | CFI_ENDPROC | ||
527 | END(\label) | ||
528 | .endm | ||
529 | |||
530 | FORK_LIKE clone | 492 | FORK_LIKE clone |
531 | FORK_LIKE fork | 493 | FORK_LIKE fork |
532 | FORK_LIKE vfork | 494 | FORK_LIKE vfork |
533 | FIXED_FRAME stub_iopl, sys_iopl | ||
534 | 495 | ||
535 | ENTRY(stub_execve) | 496 | ENTRY(stub_execve) |
536 | CFI_STARTPROC | 497 | CFI_STARTPROC |
537 | addq $8, %rsp | 498 | DEFAULT_FRAME 0, 8 |
538 | PARTIAL_FRAME 0 | 499 | call sys_execve |
539 | SAVE_REST | 500 | return_from_execve: |
540 | FIXUP_TOP_OF_STACK %r11 | 501 | testl %eax, %eax |
541 | call sys_execve | 502 | jz 1f |
542 | movq %rax,RAX(%rsp) | 503 | /* exec failed, can use fast SYSRET code path in this case */ |
543 | RESTORE_REST | 504 | ret |
544 | jmp int_ret_from_sys_call | 505 | 1: |
506 | /* must use IRET code path (pt_regs->cs may have changed) */ | ||
507 | addq $8, %rsp | ||
508 | CFI_ADJUST_CFA_OFFSET -8 | ||
509 | ZERO_EXTRA_REGS | ||
510 | movq %rax,RAX(%rsp) | ||
511 | jmp int_ret_from_sys_call | ||
545 | CFI_ENDPROC | 512 | CFI_ENDPROC |
546 | END(stub_execve) | 513 | END(stub_execve) |
547 | 514 | /* | |
548 | ENTRY(stub_execveat) | 515 | * Remaining execve stubs are only 7 bytes long. |
516 | * ENTRY() often aligns to 16 bytes, which in this case has no benefits. | ||
517 | */ | ||
518 | .align 8 | ||
519 | GLOBAL(stub_execveat) | ||
549 | CFI_STARTPROC | 520 | CFI_STARTPROC |
550 | addq $8, %rsp | 521 | DEFAULT_FRAME 0, 8 |
551 | PARTIAL_FRAME 0 | 522 | call sys_execveat |
552 | SAVE_REST | 523 | jmp return_from_execve |
553 | FIXUP_TOP_OF_STACK %r11 | ||
554 | call sys_execveat | ||
555 | RESTORE_TOP_OF_STACK %r11 | ||
556 | movq %rax,RAX(%rsp) | ||
557 | RESTORE_REST | ||
558 | jmp int_ret_from_sys_call | ||
559 | CFI_ENDPROC | 524 | CFI_ENDPROC |
560 | END(stub_execveat) | 525 | END(stub_execveat) |
561 | 526 | ||
527 | #ifdef CONFIG_X86_X32_ABI | ||
528 | .align 8 | ||
529 | GLOBAL(stub_x32_execve) | ||
530 | CFI_STARTPROC | ||
531 | DEFAULT_FRAME 0, 8 | ||
532 | call compat_sys_execve | ||
533 | jmp return_from_execve | ||
534 | CFI_ENDPROC | ||
535 | END(stub_x32_execve) | ||
536 | .align 8 | ||
537 | GLOBAL(stub_x32_execveat) | ||
538 | CFI_STARTPROC | ||
539 | DEFAULT_FRAME 0, 8 | ||
540 | call compat_sys_execveat | ||
541 | jmp return_from_execve | ||
542 | CFI_ENDPROC | ||
543 | END(stub_x32_execveat) | ||
544 | #endif | ||
545 | |||
546 | #ifdef CONFIG_IA32_EMULATION | ||
547 | .align 8 | ||
548 | GLOBAL(stub32_execve) | ||
549 | CFI_STARTPROC | ||
550 | call compat_sys_execve | ||
551 | jmp return_from_execve | ||
552 | CFI_ENDPROC | ||
553 | END(stub32_execve) | ||
554 | .align 8 | ||
555 | GLOBAL(stub32_execveat) | ||
556 | CFI_STARTPROC | ||
557 | call compat_sys_execveat | ||
558 | jmp return_from_execve | ||
559 | CFI_ENDPROC | ||
560 | END(stub32_execveat) | ||
561 | #endif | ||
562 | |||
562 | /* | 563 | /* |
563 | * sigreturn is special because it needs to restore all registers on return. | 564 | * sigreturn is special because it needs to restore all registers on return. |
564 | * This cannot be done with SYSRET, so use the IRET return path instead. | 565 | * This cannot be done with SYSRET, so use the IRET return path instead. |
565 | */ | 566 | */ |
566 | ENTRY(stub_rt_sigreturn) | 567 | ENTRY(stub_rt_sigreturn) |
567 | CFI_STARTPROC | 568 | CFI_STARTPROC |
568 | addq $8, %rsp | 569 | DEFAULT_FRAME 0, 8 |
569 | PARTIAL_FRAME 0 | 570 | /* |
570 | SAVE_REST | 571 | * SAVE_EXTRA_REGS result is not normally needed: |
571 | FIXUP_TOP_OF_STACK %r11 | 572 | * sigreturn overwrites all pt_regs->GPREGS. |
573 | * But sigreturn can fail (!), and there is no easy way to detect that. | ||
574 | * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, | ||
575 | * we SAVE_EXTRA_REGS here. | ||
576 | */ | ||
577 | SAVE_EXTRA_REGS 8 | ||
572 | call sys_rt_sigreturn | 578 | call sys_rt_sigreturn |
573 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | 579 | return_from_stub: |
574 | RESTORE_REST | 580 | addq $8, %rsp |
581 | CFI_ADJUST_CFA_OFFSET -8 | ||
582 | RESTORE_EXTRA_REGS | ||
583 | movq %rax,RAX(%rsp) | ||
575 | jmp int_ret_from_sys_call | 584 | jmp int_ret_from_sys_call |
576 | CFI_ENDPROC | 585 | CFI_ENDPROC |
577 | END(stub_rt_sigreturn) | 586 | END(stub_rt_sigreturn) |
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn) | |||
579 | #ifdef CONFIG_X86_X32_ABI | 588 | #ifdef CONFIG_X86_X32_ABI |
580 | ENTRY(stub_x32_rt_sigreturn) | 589 | ENTRY(stub_x32_rt_sigreturn) |
581 | CFI_STARTPROC | 590 | CFI_STARTPROC |
582 | addq $8, %rsp | 591 | DEFAULT_FRAME 0, 8 |
583 | PARTIAL_FRAME 0 | 592 | SAVE_EXTRA_REGS 8 |
584 | SAVE_REST | ||
585 | FIXUP_TOP_OF_STACK %r11 | ||
586 | call sys32_x32_rt_sigreturn | 593 | call sys32_x32_rt_sigreturn |
587 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | 594 | jmp return_from_stub |
588 | RESTORE_REST | ||
589 | jmp int_ret_from_sys_call | ||
590 | CFI_ENDPROC | 595 | CFI_ENDPROC |
591 | END(stub_x32_rt_sigreturn) | 596 | END(stub_x32_rt_sigreturn) |
597 | #endif | ||
592 | 598 | ||
593 | ENTRY(stub_x32_execve) | 599 | /* |
594 | CFI_STARTPROC | 600 | * A newly forked process directly context switches into this address. |
595 | addq $8, %rsp | 601 | * |
596 | PARTIAL_FRAME 0 | 602 | * rdi: prev task we switched from |
597 | SAVE_REST | 603 | */ |
598 | FIXUP_TOP_OF_STACK %r11 | 604 | ENTRY(ret_from_fork) |
599 | call compat_sys_execve | 605 | DEFAULT_FRAME |
600 | RESTORE_TOP_OF_STACK %r11 | ||
601 | movq %rax,RAX(%rsp) | ||
602 | RESTORE_REST | ||
603 | jmp int_ret_from_sys_call | ||
604 | CFI_ENDPROC | ||
605 | END(stub_x32_execve) | ||
606 | 606 | ||
607 | ENTRY(stub_x32_execveat) | 607 | LOCK ; btr $TIF_FORK,TI_flags(%r8) |
608 | CFI_STARTPROC | 608 | |
609 | addq $8, %rsp | 609 | pushq_cfi $0x0002 |
610 | PARTIAL_FRAME 0 | 610 | popfq_cfi # reset kernel eflags |
611 | SAVE_REST | 611 | |
612 | FIXUP_TOP_OF_STACK %r11 | 612 | call schedule_tail # rdi: 'prev' task parameter |
613 | call compat_sys_execveat | 613 | |
614 | RESTORE_TOP_OF_STACK %r11 | 614 | RESTORE_EXTRA_REGS |
615 | movq %rax,RAX(%rsp) | 615 | |
616 | RESTORE_REST | 616 | testl $3,CS(%rsp) # from kernel_thread? |
617 | |||
618 | /* | ||
619 | * By the time we get here, we have no idea whether our pt_regs, | ||
620 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | ||
621 | * the slow path, or one of the ia32entry paths. | ||
622 | * Use IRET code path to return, since it can safely handle | ||
623 | * all of the above. | ||
624 | */ | ||
625 | jnz int_ret_from_sys_call | ||
626 | |||
627 | /* We came from kernel_thread */ | ||
628 | /* nb: we depend on RESTORE_EXTRA_REGS above */ | ||
629 | movq %rbp, %rdi | ||
630 | call *%rbx | ||
631 | movl $0, RAX(%rsp) | ||
632 | RESTORE_EXTRA_REGS | ||
617 | jmp int_ret_from_sys_call | 633 | jmp int_ret_from_sys_call |
618 | CFI_ENDPROC | 634 | CFI_ENDPROC |
619 | END(stub_x32_execveat) | 635 | END(ret_from_fork) |
620 | |||
621 | #endif | ||
622 | 636 | ||
623 | /* | 637 | /* |
624 | * Build the entry stubs and pointer table with some assembler magic. | 638 | * Build the entry stubs with some assembler magic. |
625 | * We pack 7 stubs into a single 32-byte chunk, which will fit in a | 639 | * We pack 1 stub into every 8-byte block. |
626 | * single cache line on all modern x86 implementations. | ||
627 | */ | 640 | */ |
628 | .section .init.rodata,"a" | 641 | .align 8 |
629 | ENTRY(interrupt) | ||
630 | .section .entry.text | ||
631 | .p2align 5 | ||
632 | .p2align CONFIG_X86_L1_CACHE_SHIFT | ||
633 | ENTRY(irq_entries_start) | 642 | ENTRY(irq_entries_start) |
634 | INTR_FRAME | 643 | INTR_FRAME |
635 | vector=FIRST_EXTERNAL_VECTOR | 644 | vector=FIRST_EXTERNAL_VECTOR |
636 | .rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 | 645 | .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) |
637 | .balign 32 | 646 | pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
638 | .rept 7 | 647 | vector=vector+1 |
639 | .if vector < FIRST_SYSTEM_VECTOR | 648 | jmp common_interrupt |
640 | .if vector <> FIRST_EXTERNAL_VECTOR | ||
641 | CFI_ADJUST_CFA_OFFSET -8 | 649 | CFI_ADJUST_CFA_OFFSET -8 |
642 | .endif | 650 | .align 8 |
643 | 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ | 651 | .endr |
644 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | ||
645 | jmp 2f | ||
646 | .endif | ||
647 | .previous | ||
648 | .quad 1b | ||
649 | .section .entry.text | ||
650 | vector=vector+1 | ||
651 | .endif | ||
652 | .endr | ||
653 | 2: jmp common_interrupt | ||
654 | .endr | ||
655 | CFI_ENDPROC | 652 | CFI_ENDPROC |
656 | END(irq_entries_start) | 653 | END(irq_entries_start) |
657 | 654 | ||
658 | .previous | ||
659 | END(interrupt) | ||
660 | .previous | ||
661 | |||
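With a fixed 8-byte stride, the IDT setup code can compute any stub's address arithmetically instead of reading the old interrupt[] pointer table that the removed .init.rodata section used to hold. A sketch of the addressing (stub size and vector bounds are assumptions for this kernel era):

    #include <assert.h>
    #include <stdint.h>

    #define FIRST_EXTERNAL_VECTOR 0x20  /* assumed */
    #define FIRST_SYSTEM_VECTOR   0xef  /* assumed for this kernel era */

    int main(void)
    {
        /* stand-in for the block of stubs emitted by the .rept loop above */
        uint8_t irq_entries_start[8 * (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)];

        /* Each stub is "push imm8" (2 bytes) plus "jmp rel32" (5 bytes),
         * 7 bytes total, so it fits the 8-byte slot that ".align 8"
         * enforces; any stub is then reachable by plain arithmetic: */
        for (int v = FIRST_EXTERNAL_VECTOR; v < FIRST_SYSTEM_VECTOR; v++) {
            uint8_t *stub = irq_entries_start + 8 * (v - FIRST_EXTERNAL_VECTOR);
            assert(stub + 7 <= irq_entries_start + sizeof(irq_entries_start));
        }
        return 0;
    }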
662 | /* | 655 | /* |
663 | * Interrupt entry/exit. | 656 | * Interrupt entry/exit. |
664 | * | 657 | * |
@@ -669,47 +662,45 @@ END(interrupt) | |||
669 | 662 | ||
670 | /* 0(%rsp): ~(interrupt number) */ | 663 | /* 0(%rsp): ~(interrupt number) */ |
671 | .macro interrupt func | 664 | .macro interrupt func |
672 | /* reserve pt_regs for scratch regs and rbp */ | ||
673 | subq $ORIG_RAX-RBP, %rsp | ||
674 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP | ||
675 | cld | 665 | cld |
676 | /* start from rbp in pt_regs and jump over */ | 666 | /* |
677 | movq_cfi rdi, (RDI-RBP) | 667 | * Since nothing in interrupt handling code touches r12...r15 members |
678 | movq_cfi rsi, (RSI-RBP) | 668 | * of "struct pt_regs", and since interrupts can nest, we can save |
679 | movq_cfi rdx, (RDX-RBP) | 669 | * four stack slots and simultaneously provide |
680 | movq_cfi rcx, (RCX-RBP) | 670 | * an unwind-friendly stack layout by saving "truncated" pt_regs |
681 | movq_cfi rax, (RAX-RBP) | 671 | * exactly up to rbp slot, without these members. |
682 | movq_cfi r8, (R8-RBP) | 672 | */ |
683 | movq_cfi r9, (R9-RBP) | 673 | ALLOC_PT_GPREGS_ON_STACK -RBP |
684 | movq_cfi r10, (R10-RBP) | 674 | SAVE_C_REGS -RBP |
685 | movq_cfi r11, (R11-RBP) | 675 | /* this goes to 0(%rsp) for unwinder, not for saving the value: */ |
686 | 676 | SAVE_EXTRA_REGS_RBP -RBP | |
687 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
688 | movq_cfi rbp, 0 | ||
689 | |||
690 | /* Save previous stack value */ | ||
691 | movq %rsp, %rsi | ||
692 | 677 | ||
693 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | 678 | leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ |
694 | testl $3, CS-RBP(%rsi) | 679 | |
680 | testl $3, CS-RBP(%rsp) | ||
695 | je 1f | 681 | je 1f |
696 | SWAPGS | 682 | SWAPGS |
683 | 1: | ||
697 | /* | 684 | /* |
685 | * Save previous stack pointer, optionally switch to interrupt stack. | ||
698 | * irq_count is used to check if a CPU is already on an interrupt stack | 686 | * irq_count is used to check if a CPU is already on an interrupt stack |
699 | * or not. While this is essentially redundant with preempt_count it is | 687 | * or not. While this is essentially redundant with preempt_count it is |
700 | * a little cheaper to use a separate counter in the PDA (short of | 688 | * a little cheaper to use a separate counter in the PDA (short of |
701 | * moving irq_enter into assembly, which would be too much work) | 689 | * moving irq_enter into assembly, which would be too much work) |
702 | */ | 690 | */ |
703 | 1: incl PER_CPU_VAR(irq_count) | 691 | movq %rsp, %rsi |
692 | incl PER_CPU_VAR(irq_count) | ||
704 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | 693 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp |
705 | CFI_DEF_CFA_REGISTER rsi | 694 | CFI_DEF_CFA_REGISTER rsi |
706 | |||
707 | /* Store previous stack value */ | ||
708 | pushq %rsi | 695 | pushq %rsi |
696 | /* | ||
697 | * For debugger: | ||
698 | * "CFA (Current Frame Address) is the value on stack + offset" | ||
699 | */ | ||
709 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | 700 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ |
710 | 0x77 /* DW_OP_breg7 */, 0, \ | 701 | 0x77 /* DW_OP_breg7 (rsp) */, 0, \ |
711 | 0x06 /* DW_OP_deref */, \ | 702 | 0x06 /* DW_OP_deref */, \ |
712 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | 703 | 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ |
713 | 0x22 /* DW_OP_plus */ | 704 | 0x22 /* DW_OP_plus */ |
714 | /* We entered an interrupt context - irqs are off: */ | 705 | /* We entered an interrupt context - irqs are off: */ |
715 | TRACE_IRQS_OFF | 706 | TRACE_IRQS_OFF |
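The CFI_ESCAPE bytes spell out the DWARF expression CFA = *(rsp + 0) + (SIZEOF_PTREGS - RBP): the %rsi saved at 0(%rsp) points at the truncated pt_regs, and the constant walks from there to the frame's CFA. A tiny evaluator for exactly this four-op expression (constants are assumptions; see the pt_regs sketch earlier):

    #include <assert.h>
    #include <stdint.h>

    #define SIZEOF_PTREGS (21 * 8) /* assumed; see the pt_regs sketch earlier */
    #define RBP_SLOT      (4 * 8)  /* assumed offset of rbp within pt_regs    */

    /* Evaluate the four-op DWARF expression emitted by CFI_ESCAPE above:
     *   DW_OP_breg7 0 ; DW_OP_deref ; DW_OP_const1u k ; DW_OP_plus
     * i.e.  CFA = *(rsp + 0) + k,  with k = SIZEOF_PTREGS - RBP. */
    static uintptr_t eval_cfa(uintptr_t rsp, uint8_t k)
    {
        uintptr_t v = rsp + 0;           /* DW_OP_breg7 (rsp), offset 0  */
        v = *(const uintptr_t *)v;       /* DW_OP_deref                  */
        return v + k;                    /* DW_OP_const1u k ; DW_OP_plus */
    }

    int main(void)
    {
        uintptr_t prev_stack = 0x1000;   /* pretend truncated pt_regs base */
        uintptr_t slot = (uintptr_t)&prev_stack; /* result of "pushq %rsi" */

        assert(eval_cfa(slot, SIZEOF_PTREGS - RBP_SLOT)
               == prev_stack + SIZEOF_PTREGS - RBP_SLOT);
        return 0;
    }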
@@ -727,7 +718,7 @@ common_interrupt: | |||
727 | ASM_CLAC | 718 | ASM_CLAC |
728 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ | 719 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ |
729 | interrupt do_IRQ | 720 | interrupt do_IRQ |
730 | /* 0(%rsp): old_rsp-ARGOFFSET */ | 721 | /* 0(%rsp): old RSP */ |
731 | ret_from_intr: | 722 | ret_from_intr: |
732 | DISABLE_INTERRUPTS(CLBR_NONE) | 723 | DISABLE_INTERRUPTS(CLBR_NONE) |
733 | TRACE_IRQS_OFF | 724 | TRACE_IRQS_OFF |
@@ -735,19 +726,18 @@ ret_from_intr: | |||
735 | 726 | ||
736 | /* Restore saved previous stack */ | 727 | /* Restore saved previous stack */ |
737 | popq %rsi | 728 | popq %rsi |
738 | CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ | 729 | CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ |
739 | leaq ARGOFFSET-RBP(%rsi), %rsp | 730 | /* return code expects complete pt_regs - adjust rsp accordingly: */ |
731 | leaq -RBP(%rsi),%rsp | ||
740 | CFI_DEF_CFA_REGISTER rsp | 732 | CFI_DEF_CFA_REGISTER rsp |
741 | CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET | 733 | CFI_ADJUST_CFA_OFFSET RBP |
742 | 734 | ||
743 | exit_intr: | 735 | testl $3,CS(%rsp) |
744 | GET_THREAD_INFO(%rcx) | ||
745 | testl $3,CS-ARGOFFSET(%rsp) | ||
746 | je retint_kernel | 736 | je retint_kernel |
747 | |||
748 | /* Interrupt came from user space */ | 737 | /* Interrupt came from user space */ |
738 | |||
739 | GET_THREAD_INFO(%rcx) | ||
749 | /* | 740 | /* |
750 | * Has a correct top of stack, but a partial stack frame | ||
751 | * %rcx: thread info. Interrupts off. | 741 | * %rcx: thread info. Interrupts off. |
752 | */ | 742 | */ |
753 | retint_with_reschedule: | 743 | retint_with_reschedule: |
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */ | |||
766 | DISABLE_INTERRUPTS(CLBR_ANY) | 756 | DISABLE_INTERRUPTS(CLBR_ANY) |
767 | TRACE_IRQS_IRETQ | 757 | TRACE_IRQS_IRETQ |
768 | 758 | ||
769 | /* | ||
770 | * Try to use SYSRET instead of IRET if we're returning to | ||
771 | * a completely clean 64-bit userspace context. | ||
772 | */ | ||
773 | movq (RCX-R11)(%rsp), %rcx | ||
774 | cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ | ||
775 | jne opportunistic_sysret_failed | ||
776 | |||
777 | /* | ||
778 | * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP | ||
779 | * in kernel space. This essentially lets the user take over | ||
780 | * the kernel, since userspace controls RSP. It's not worth | ||
781 | * testing for canonicalness exactly -- this check detects any | ||
782 | * of the 17 high bits set, which is true for non-canonical | ||
783 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
784 | * Big deal.) | ||
785 | * | ||
786 | * If virtual addresses ever become wider, this will need | ||
787 | * to be updated to remain correct on both old and new CPUs. | ||
788 | */ | ||
789 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
790 | .error "virtual address width changed -- sysret checks need update" | ||
791 | .endif | ||
792 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
793 | jnz opportunistic_sysret_failed | ||
794 | |||
795 | cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ | ||
796 | jne opportunistic_sysret_failed | ||
797 | |||
798 | movq (R11-ARGOFFSET)(%rsp), %r11 | ||
799 | cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ | ||
800 | jne opportunistic_sysret_failed | ||
801 | |||
802 | /* | ||
803 | * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, | ||
804 | * restoring TF results in a trap from userspace immediately after | ||
805 | * SYSRET. This would cause an infinite loop whenever #DB happens | ||
806 | * with register state that satisfies the opportunistic SYSRET | ||
807 | * conditions. For example, single-stepping this user code: | ||
808 | * | ||
809 | * movq $stuck_here,%rcx | ||
810 | * pushfq | ||
811 | * popq %r11 | ||
812 | * stuck_here: | ||
813 | * | ||
814 | * would never get past 'stuck_here'. | ||
815 | */ | ||
816 | testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 | ||
817 | jnz opportunistic_sysret_failed | ||
818 | |||
819 | /* nothing to check for RSP */ | ||
820 | |||
821 | cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ | ||
822 | jne opportunistic_sysret_failed | ||
823 | |||
824 | /* | ||
825 | * We win! This label is here just for ease of understanding | ||
826 | * perf profiles. Nothing jumps here. | ||
827 | */ | ||
828 | irq_return_via_sysret: | ||
829 | CFI_REMEMBER_STATE | ||
830 | RESTORE_ARGS 1,8,1 | ||
831 | movq (RSP-RIP)(%rsp),%rsp | ||
832 | USERGS_SYSRET64 | ||
833 | CFI_RESTORE_STATE | ||
834 | |||
835 | opportunistic_sysret_failed: | ||
836 | SWAPGS | 759 | SWAPGS |
837 | jmp restore_args | 760 | jmp restore_c_regs_and_iret |
838 | 761 | ||
839 | retint_restore_args: /* return to kernel space */ | 762 | /* Returning to kernel space */ |
840 | DISABLE_INTERRUPTS(CLBR_ANY) | 763 | retint_kernel: |
764 | #ifdef CONFIG_PREEMPT | ||
765 | /* Interrupts are off */ | ||
766 | /* Check if we need preemption */ | ||
767 | bt $9,EFLAGS(%rsp) /* interrupts were off? */ | ||
768 | jnc 1f | ||
769 | 0: cmpl $0,PER_CPU_VAR(__preempt_count) | ||
770 | jnz 1f | ||
771 | call preempt_schedule_irq | ||
772 | jmp 0b | ||
773 | 1: | ||
774 | #endif | ||
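The inlined retint_kernel sequence only preempts when the interrupted context had interrupts enabled, and it loops because the scheduler may re-arm the resched request before the iret. A hedged C restatement (in this era PREEMPT_NEED_RESCHED is folded into the per-cpu preempt count, so a zero count means both "preemptible" and "resched needed"):

    #include <stdio.h>

    /* Stand-ins for the per-cpu state the asm reads (assumptions): */
    static unsigned int preempt_count = 0; /* 0 => preemptible AND resched */
    static int irqs_were_enabled = 1;      /* EFLAGS.IF (bit 9) of the     */
                                           /* interrupted context          */
    static void preempt_schedule_irq(void)
    {
        puts("switching tasks from the IRQ-return path");
        preempt_count = 1;  /* pretend the resched request is gone */
    }

    int main(void)
    {
        if (irqs_were_enabled)             /* bt $9, EFLAGS(%rsp)      */
            while (preempt_count == 0)     /* cmpl $0, __preempt_count */
                preempt_schedule_irq();    /* loop: resched may re-arm */
        return 0;
    }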
841 | /* | 775 | /* |
842 | * The iretq could re-enable interrupts: | 776 | * The iretq could re-enable interrupts: |
843 | */ | 777 | */ |
844 | TRACE_IRQS_IRETQ | 778 | TRACE_IRQS_IRETQ |
845 | restore_args: | 779 | |
846 | RESTORE_ARGS 1,8,1 | 780 | /* |
781 | * At this label, code paths which return to kernel and to user, | ||
782 | * which come from interrupts/exception and from syscalls, merge. | ||
783 | */ | ||
784 | restore_c_regs_and_iret: | ||
785 | RESTORE_C_REGS | ||
786 | REMOVE_PT_GPREGS_FROM_STACK 8 | ||
847 | 787 | ||
848 | irq_return: | 788 | irq_return: |
849 | INTERRUPT_RETURN | 789 | INTERRUPT_RETURN |
@@ -914,28 +854,17 @@ retint_signal: | |||
914 | jz retint_swapgs | 854 | jz retint_swapgs |
915 | TRACE_IRQS_ON | 855 | TRACE_IRQS_ON |
916 | ENABLE_INTERRUPTS(CLBR_NONE) | 856 | ENABLE_INTERRUPTS(CLBR_NONE) |
917 | SAVE_REST | 857 | SAVE_EXTRA_REGS |
918 | movq $-1,ORIG_RAX(%rsp) | 858 | movq $-1,ORIG_RAX(%rsp) |
919 | xorl %esi,%esi # oldset | 859 | xorl %esi,%esi # oldset |
920 | movq %rsp,%rdi # &pt_regs | 860 | movq %rsp,%rdi # &pt_regs |
921 | call do_notify_resume | 861 | call do_notify_resume |
922 | RESTORE_REST | 862 | RESTORE_EXTRA_REGS |
923 | DISABLE_INTERRUPTS(CLBR_NONE) | 863 | DISABLE_INTERRUPTS(CLBR_NONE) |
924 | TRACE_IRQS_OFF | 864 | TRACE_IRQS_OFF |
925 | GET_THREAD_INFO(%rcx) | 865 | GET_THREAD_INFO(%rcx) |
926 | jmp retint_with_reschedule | 866 | jmp retint_with_reschedule |
927 | 867 | ||
928 | #ifdef CONFIG_PREEMPT | ||
929 | /* Returning to kernel space. Check if we need preemption */ | ||
930 | /* rcx: threadinfo. interrupts off. */ | ||
931 | ENTRY(retint_kernel) | ||
932 | cmpl $0,PER_CPU_VAR(__preempt_count) | ||
933 | jnz retint_restore_args | ||
934 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
935 | jnc retint_restore_args | ||
936 | call preempt_schedule_irq | ||
937 | jmp exit_intr | ||
938 | #endif | ||
939 | CFI_ENDPROC | 868 | CFI_ENDPROC |
940 | END(common_interrupt) | 869 | END(common_interrupt) |
941 | 870 | ||
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \ | |||
1024 | /* | 953 | /* |
1025 | * Exception entry points. | 954 | * Exception entry points. |
1026 | */ | 955 | */ |
1027 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | 956 | #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) |
1028 | 957 | ||
1029 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 | 958 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 |
1030 | ENTRY(\sym) | 959 | ENTRY(\sym) |
@@ -1046,8 +975,7 @@ ENTRY(\sym) | |||
1046 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 975 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1047 | .endif | 976 | .endif |
1048 | 977 | ||
1049 | subq $ORIG_RAX-R15, %rsp | 978 | ALLOC_PT_GPREGS_ON_STACK |
1050 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | ||
1051 | 979 | ||
1052 | .if \paranoid | 980 | .if \paranoid |
1053 | .if \paranoid == 1 | 981 | .if \paranoid == 1 |
@@ -1055,10 +983,11 @@ ENTRY(\sym) | |||
1055 | testl $3, CS(%rsp) /* If coming from userspace, switch */ | 983 | testl $3, CS(%rsp) /* If coming from userspace, switch */ |
1056 | jnz 1f /* stacks. */ | 984 | jnz 1f /* stacks. */ |
1057 | .endif | 985 | .endif |
1058 | call save_paranoid | 986 | call paranoid_entry |
1059 | .else | 987 | .else |
1060 | call error_entry | 988 | call error_entry |
1061 | .endif | 989 | .endif |
990 | /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ | ||
1062 | 991 | ||
1063 | DEFAULT_FRAME 0 | 992 | DEFAULT_FRAME 0 |
1064 | 993 | ||
@@ -1080,19 +1009,20 @@ ENTRY(\sym) | |||
1080 | .endif | 1009 | .endif |
1081 | 1010 | ||
1082 | .if \shift_ist != -1 | 1011 | .if \shift_ist != -1 |
1083 | subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) | 1012 | subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) |
1084 | .endif | 1013 | .endif |
1085 | 1014 | ||
1086 | call \do_sym | 1015 | call \do_sym |
1087 | 1016 | ||
1088 | .if \shift_ist != -1 | 1017 | .if \shift_ist != -1 |
1089 | addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) | 1018 | addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) |
1090 | .endif | 1019 | .endif |
1091 | 1020 | ||
1021 | /* these procedures expect "no swapgs" flag in ebx */ | ||
1092 | .if \paranoid | 1022 | .if \paranoid |
1093 | jmp paranoid_exit /* %ebx: no swapgs flag */ | 1023 | jmp paranoid_exit |
1094 | .else | 1024 | .else |
1095 | jmp error_exit /* %ebx: no swapgs flag */ | 1025 | jmp error_exit |
1096 | .endif | 1026 | .endif |
1097 | 1027 | ||
1098 | .if \paranoid == 1 | 1028 | .if \paranoid == 1 |
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback) | |||
1296 | addq $0x30,%rsp | 1226 | addq $0x30,%rsp |
1297 | CFI_ADJUST_CFA_OFFSET -0x30 | 1227 | CFI_ADJUST_CFA_OFFSET -0x30 |
1298 | pushq_cfi $-1 /* orig_ax = -1 => not a system call */ | 1228 | pushq_cfi $-1 /* orig_ax = -1 => not a system call */ |
1299 | SAVE_ALL | 1229 | ALLOC_PT_GPREGS_ON_STACK |
1230 | SAVE_C_REGS | ||
1231 | SAVE_EXTRA_REGS | ||
1300 | jmp error_exit | 1232 | jmp error_exit |
1301 | CFI_ENDPROC | 1233 | CFI_ENDPROC |
1302 | END(xen_failsafe_callback) | 1234 | END(xen_failsafe_callback) |
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 | |||
1328 | idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) | 1260 | idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) |
1329 | #endif | 1261 | #endif |
1330 | 1262 | ||
1331 | /* | 1263 | /* |
1332 | * "Paranoid" exit path from exception stack. This is invoked | 1264 | * Save all registers in pt_regs, and switch gs if needed. |
1333 | * only on return from non-NMI IST interrupts that came | 1265 | * Use slow, but surefire "are we in kernel?" check. |
1334 | * from kernel space. | 1266 | * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
1335 | * | 1267 | */ |
1336 | * We may be returning to very strange contexts (e.g. very early | 1268 | ENTRY(paranoid_entry) |
1337 | * in syscall entry), so checking for preemption here would | 1269 | XCPT_FRAME 1 15*8 |
1338 | * be complicated. Fortunately, there's no good reason | 1270 | cld |
1339 | * to try to handle preemption here. | 1271 | SAVE_C_REGS 8 |
1340 | */ | 1272 | SAVE_EXTRA_REGS 8 |
1273 | movl $1,%ebx | ||
1274 | movl $MSR_GS_BASE,%ecx | ||
1275 | rdmsr | ||
1276 | testl %edx,%edx | ||
1277 | js 1f /* negative -> in kernel */ | ||
1278 | SWAPGS | ||
1279 | xorl %ebx,%ebx | ||
1280 | 1: ret | ||
1281 | CFI_ENDPROC | ||
1282 | END(paranoid_entry) | ||
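paranoid_entry's "are we in kernel?" test is just a sign check on the high half of MSR_GS_BASE (%edx after rdmsr), since kernel addresses occupy the upper canonical range. A sketch with representative addresses:

    #include <assert.h>
    #include <stdint.h>

    /* paranoid_entry's test is a sign check on MSR_GS_BASE's high half
     * (%edx after rdmsr): kernel addresses sit in the upper canonical
     * range, so a negative high word means GS already holds the kernel
     * per-cpu base and no SWAPGS is needed. */
    static int gs_base_is_kernel(uint64_t gsbase)
    {
        return (int32_t)(gsbase >> 32) < 0;
    }

    int main(void)
    {
        assert( gs_base_is_kernel(0xffff880000000000ull)); /* kernel base */
        assert(!gs_base_is_kernel(0x00007f0000000000ull)); /* user TLS    */
        return 0;
    }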
1341 | 1283 | ||
1342 | /* ebx: no swapgs flag */ | 1284 | /* |
1285 | * "Paranoid" exit path from exception stack. This is invoked | ||
1286 | * only on return from non-NMI IST interrupts that came | ||
1287 | * from kernel space. | ||
1288 | * | ||
1289 | * We may be returning to very strange contexts (e.g. very early | ||
1290 | * in syscall entry), so checking for preemption here would | ||
1291 | * be complicated. Fortunately, there's no good reason | ||
1292 | * to try to handle preemption here. | ||
1293 | */ | ||
1294 | /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ | ||
1343 | ENTRY(paranoid_exit) | 1295 | ENTRY(paranoid_exit) |
1344 | DEFAULT_FRAME | 1296 | DEFAULT_FRAME |
1345 | DISABLE_INTERRUPTS(CLBR_NONE) | 1297 | DISABLE_INTERRUPTS(CLBR_NONE) |
1346 | TRACE_IRQS_OFF_DEBUG | 1298 | TRACE_IRQS_OFF_DEBUG |
1347 | testl %ebx,%ebx /* swapgs needed? */ | 1299 | testl %ebx,%ebx /* swapgs needed? */ |
1348 | jnz paranoid_restore | 1300 | jnz paranoid_exit_no_swapgs |
1349 | TRACE_IRQS_IRETQ 0 | 1301 | TRACE_IRQS_IRETQ |
1350 | SWAPGS_UNSAFE_STACK | 1302 | SWAPGS_UNSAFE_STACK |
1351 | RESTORE_ALL 8 | 1303 | jmp paranoid_exit_restore |
1352 | INTERRUPT_RETURN | 1304 | paranoid_exit_no_swapgs: |
1353 | paranoid_restore: | 1305 | TRACE_IRQS_IRETQ_DEBUG |
1354 | TRACE_IRQS_IRETQ_DEBUG 0 | 1306 | paranoid_exit_restore: |
1355 | RESTORE_ALL 8 | 1307 | RESTORE_EXTRA_REGS |
1308 | RESTORE_C_REGS | ||
1309 | REMOVE_PT_GPREGS_FROM_STACK 8 | ||
1356 | INTERRUPT_RETURN | 1310 | INTERRUPT_RETURN |
1357 | CFI_ENDPROC | 1311 | CFI_ENDPROC |
1358 | END(paranoid_exit) | 1312 | END(paranoid_exit) |
1359 | 1313 | ||
1360 | /* | 1314 | /* |
1361 | * Exception entry point. This expects an error code/orig_rax on the stack. | 1315 | * Save all registers in pt_regs, and switch gs if needed. |
1362 | * returns in "no swapgs flag" in %ebx. | 1316 | * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
1363 | */ | 1317 | */ |
1364 | ENTRY(error_entry) | 1318 | ENTRY(error_entry) |
1365 | XCPT_FRAME | 1319 | XCPT_FRAME 1 15*8 |
1366 | CFI_ADJUST_CFA_OFFSET 15*8 | ||
1367 | /* oldrax contains error code */ | ||
1368 | cld | 1320 | cld |
1369 | movq %rdi, RDI+8(%rsp) | 1321 | SAVE_C_REGS 8 |
1370 | movq %rsi, RSI+8(%rsp) | 1322 | SAVE_EXTRA_REGS 8 |
1371 | movq %rdx, RDX+8(%rsp) | ||
1372 | movq %rcx, RCX+8(%rsp) | ||
1373 | movq %rax, RAX+8(%rsp) | ||
1374 | movq %r8, R8+8(%rsp) | ||
1375 | movq %r9, R9+8(%rsp) | ||
1376 | movq %r10, R10+8(%rsp) | ||
1377 | movq %r11, R11+8(%rsp) | ||
1378 | movq_cfi rbx, RBX+8 | ||
1379 | movq %rbp, RBP+8(%rsp) | ||
1380 | movq %r12, R12+8(%rsp) | ||
1381 | movq %r13, R13+8(%rsp) | ||
1382 | movq %r14, R14+8(%rsp) | ||
1383 | movq %r15, R15+8(%rsp) | ||
1384 | xorl %ebx,%ebx | 1323 | xorl %ebx,%ebx |
1385 | testl $3,CS+8(%rsp) | 1324 | testl $3,CS+8(%rsp) |
1386 | je error_kernelspace | 1325 | je error_kernelspace |
@@ -1390,12 +1329,12 @@ error_sti: | |||
1390 | TRACE_IRQS_OFF | 1329 | TRACE_IRQS_OFF |
1391 | ret | 1330 | ret |
1392 | 1331 | ||
1393 | /* | 1332 | /* |
1394 | * There are two places in the kernel that can potentially fault with | 1333 | * There are two places in the kernel that can potentially fault with |
1395 | * usergs. Handle them here. B stepping K8s sometimes report a | 1334 | * usergs. Handle them here. B stepping K8s sometimes report a |
1396 | * truncated RIP for IRET exceptions returning to compat mode. Check | 1335 | * truncated RIP for IRET exceptions returning to compat mode. Check |
1397 | * for these here too. | 1336 | * for these here too. |
1398 | */ | 1337 | */ |
1399 | error_kernelspace: | 1338 | error_kernelspace: |
1400 | CFI_REL_OFFSET rcx, RCX+8 | 1339 | CFI_REL_OFFSET rcx, RCX+8 |
1401 | incl %ebx | 1340 | incl %ebx |
@@ -1425,11 +1364,11 @@ error_bad_iret: | |||
1425 | END(error_entry) | 1364 | END(error_entry) |
1426 | 1365 | ||
1427 | 1366 | ||
1428 | /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ | 1367 | /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ |
1429 | ENTRY(error_exit) | 1368 | ENTRY(error_exit) |
1430 | DEFAULT_FRAME | 1369 | DEFAULT_FRAME |
1431 | movl %ebx,%eax | 1370 | movl %ebx,%eax |
1432 | RESTORE_REST | 1371 | RESTORE_EXTRA_REGS |
1433 | DISABLE_INTERRUPTS(CLBR_NONE) | 1372 | DISABLE_INTERRUPTS(CLBR_NONE) |
1434 | TRACE_IRQS_OFF | 1373 | TRACE_IRQS_OFF |
1435 | GET_THREAD_INFO(%rcx) | 1374 | GET_THREAD_INFO(%rcx) |
@@ -1444,19 +1383,7 @@ ENTRY(error_exit) | |||
1444 | CFI_ENDPROC | 1383 | CFI_ENDPROC |
1445 | END(error_exit) | 1384 | END(error_exit) |
1446 | 1385 | ||
1447 | /* | 1386 | /* Runs on exception stack */ |
1448 | * Test if a given stack is an NMI stack or not. | ||
1449 | */ | ||
1450 | .macro test_in_nmi reg stack nmi_ret normal_ret | ||
1451 | cmpq %\reg, \stack | ||
1452 | ja \normal_ret | ||
1453 | subq $EXCEPTION_STKSZ, %\reg | ||
1454 | cmpq %\reg, \stack | ||
1455 | jb \normal_ret | ||
1456 | jmp \nmi_ret | ||
1457 | .endm | ||
1458 | |||
1459 | /* runs on exception stack */ | ||
1460 | ENTRY(nmi) | 1387 | ENTRY(nmi) |
1461 | INTR_FRAME | 1388 | INTR_FRAME |
1462 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1389 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
@@ -1492,7 +1419,7 @@ ENTRY(nmi) | |||
1492 | * NMI. | 1419 | * NMI. |
1493 | */ | 1420 | */ |
1494 | 1421 | ||
1495 | /* Use %rdx as out temp variable throughout */ | 1422 | /* Use %rdx as our temp variable throughout */ |
1496 | pushq_cfi %rdx | 1423 | pushq_cfi %rdx |
1497 | CFI_REL_OFFSET rdx, 0 | 1424 | CFI_REL_OFFSET rdx, 0 |
1498 | 1425 | ||
@@ -1517,8 +1444,17 @@ ENTRY(nmi) | |||
1517 | * We check the variable because the first NMI could be in a | 1444 | * We check the variable because the first NMI could be in a |
1518 | * breakpoint routine using a breakpoint stack. | 1445 | * breakpoint routine using a breakpoint stack. |
1519 | */ | 1446 | */ |
1520 | lea 6*8(%rsp), %rdx | 1447 | lea 6*8(%rsp), %rdx |
1521 | test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi | 1448 | /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ |
1449 | cmpq %rdx, 4*8(%rsp) | ||
1450 | /* If the stack pointer is above the NMI stack, this is a normal NMI */ | ||
1451 | ja first_nmi | ||
1452 | subq $EXCEPTION_STKSZ, %rdx | ||
1453 | cmpq %rdx, 4*8(%rsp) | ||
1454 | /* If it is below the NMI stack, it is a normal NMI */ | ||
1455 | jb first_nmi | ||
1456 | /* Ah, it is within the NMI stack; treat it as nested */ | ||
1457 | |||
1522 | CFI_REMEMBER_STATE | 1458 | CFI_REMEMBER_STATE |
1523 | 1459 | ||
1524 | nested_nmi: | 1460 | nested_nmi: |
@@ -1611,7 +1547,7 @@ first_nmi: | |||
1611 | .rept 5 | 1547 | .rept 5 |
1612 | pushq_cfi 11*8(%rsp) | 1548 | pushq_cfi 11*8(%rsp) |
1613 | .endr | 1549 | .endr |
1614 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1550 | CFI_DEF_CFA_OFFSET 5*8 |
1615 | 1551 | ||
1616 | /* Everything up to here is safe from nested NMIs */ | 1552 | /* Everything up to here is safe from nested NMIs */ |
1617 | 1553 | ||
@@ -1639,7 +1575,7 @@ repeat_nmi: | |||
1639 | pushq_cfi -6*8(%rsp) | 1575 | pushq_cfi -6*8(%rsp) |
1640 | .endr | 1576 | .endr |
1641 | subq $(5*8), %rsp | 1577 | subq $(5*8), %rsp |
1642 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1578 | CFI_DEF_CFA_OFFSET 5*8 |
1643 | end_repeat_nmi: | 1579 | end_repeat_nmi: |
1644 | 1580 | ||
1645 | /* | 1581 | /* |
@@ -1648,16 +1584,16 @@ end_repeat_nmi: | |||
1648 | * so that we repeat another NMI. | 1584 | * so that we repeat another NMI. |
1649 | */ | 1585 | */ |
1650 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1586 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1651 | subq $ORIG_RAX-R15, %rsp | 1587 | ALLOC_PT_GPREGS_ON_STACK |
1652 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1588 | |
1653 | /* | 1589 | /* |
1654 | * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit | 1590 | * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
1655 | * as we should not be calling schedule in NMI context. | 1591 | * as we should not be calling schedule in NMI context. |
1656 | * Even with normal interrupts enabled. An NMI should not be | 1592 | * Even with normal interrupts enabled. An NMI should not be |
1657 | * setting NEED_RESCHED or anything that normal interrupts and | 1593 | * setting NEED_RESCHED or anything that normal interrupts and |
1658 | * exceptions might do. | 1594 | * exceptions might do. |
1659 | */ | 1595 | */ |
1660 | call save_paranoid | 1596 | call paranoid_entry |
1661 | DEFAULT_FRAME 0 | 1597 | DEFAULT_FRAME 0 |
1662 | 1598 | ||
1663 | /* | 1599 | /* |
@@ -1688,8 +1624,10 @@ end_repeat_nmi: | |||
1688 | nmi_swapgs: | 1624 | nmi_swapgs: |
1689 | SWAPGS_UNSAFE_STACK | 1625 | SWAPGS_UNSAFE_STACK |
1690 | nmi_restore: | 1626 | nmi_restore: |
1627 | RESTORE_EXTRA_REGS | ||
1628 | RESTORE_C_REGS | ||
1691 | /* Pop the extra iret frame at once */ | 1629 | /* Pop the extra iret frame at once */ |
1692 | RESTORE_ALL 6*8 | 1630 | REMOVE_PT_GPREGS_FROM_STACK 6*8 |
1693 | 1631 | ||
1694 | /* Clear the NMI executing stack variable */ | 1632 | /* Clear the NMI executing stack variable */ |
1695 | movq $0, 5*8(%rsp) | 1633 | movq $0, 5*8(%rsp) |
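
The open-coded nested-NMI test above (replacing the old test_in_nmi macro) is a plain range check: the interrupted stack pointer counts as nested only if it falls within EXCEPTION_STKSZ bytes below the NMI IST stack top. As a C sketch (the constant is an illustrative stand-in; the kernel's value is configuration-dependent):

    #include <stdbool.h>
    #include <stdint.h>

    #define EXCEPTION_STKSZ 4096UL  /* illustrative stand-in */

    /* Mirrors the cmpq/ja ... subq ... cmpq/jb sequence in ENTRY(nmi). */
    static bool nmi_is_nested(uint64_t saved_sp, uint64_t nmi_stack_top)
    {
            if (saved_sp > nmi_stack_top)                   /* above the stack: first_nmi */
                    return false;
            if (saved_sp < nmi_stack_top - EXCEPTION_STKSZ) /* below the stack: first_nmi */
                    return false;
            return true;                                    /* within the window: nested */
    }
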
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..d031bad9e07e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/cpufeature.h> | 22 | #include <asm/cpufeature.h> |
23 | #include <asm/percpu.h> | 23 | #include <asm/percpu.h> |
24 | #include <asm/nops.h> | 24 | #include <asm/nops.h> |
25 | #include <asm/bootparam.h> | ||
25 | 26 | ||
26 | /* Physical address */ | 27 | /* Physical address */ |
27 | #define pa(X) ((X) - __PAGE_OFFSET) | 28 | #define pa(X) ((X) - __PAGE_OFFSET) |
@@ -90,7 +91,7 @@ ENTRY(startup_32) | |||
90 | 91 | ||
91 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking | 92 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking |
92 | us to not reload segments */ | 93 | us to not reload segments */ |
93 | testb $(1<<6), BP_loadflags(%esi) | 94 | testb $KEEP_SEGMENTS, BP_loadflags(%esi) |
94 | jnz 2f | 95 | jnz 2f |
95 | 96 | ||
96 | /* | 97 | /* |
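
Both loadflags hunks trade the magic constant for a name: (1<<6) is KEEP_SEGMENTS, now taken from <asm/bootparam.h>. The test itself is an ordinary bit check on the boot header's loadflags byte, sketched in C (struct abbreviated to the one field used here):

    #include <stdint.h>

    #define KEEP_SEGMENTS (1 << 6)  /* same bit the old testb $(1<<6) used */

    struct setup_header_stub { uint8_t loadflags; }; /* abbreviated stand-in */

    /* Equivalent of: testb $KEEP_SEGMENTS, BP_loadflags(%esi); jnz ... */
    static int keep_segments(const struct setup_header_stub *hdr)
    {
            return (hdr->loadflags & KEEP_SEGMENTS) != 0;
    }
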
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6fd514d9f69a..ae6588b301c2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | 2 | * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit |
3 | * | 3 | * |
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | 4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | 5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
@@ -56,7 +56,7 @@ startup_64: | |||
56 | * %rsi holds a physical pointer to real_mode_data. | 56 | * %rsi holds a physical pointer to real_mode_data. |
57 | * | 57 | * |
58 | * We come here either directly from a 64bit bootloader, or from | 58 | * We come here either directly from a 64bit bootloader, or from |
59 | * arch/x86_64/boot/compressed/head.S. | 59 | * arch/x86/boot/compressed/head_64.S. |
60 | * | 60 | * |
61 | * We only come here initially at boot; nothing else comes here. | 61 | * We only come here initially at boot; nothing else comes here. |
62 | * | 62 | * |
@@ -146,7 +146,7 @@ startup_64: | |||
146 | leaq level2_kernel_pgt(%rip), %rdi | 146 | leaq level2_kernel_pgt(%rip), %rdi |
147 | leaq 4096(%rdi), %r8 | 147 | leaq 4096(%rdi), %r8 |
148 | /* See if it is a valid page table entry */ | 148 | /* See if it is a valid page table entry */ |
149 | 1: testq $1, 0(%rdi) | 149 | 1: testb $1, 0(%rdi) |
150 | jz 2f | 150 | jz 2f |
151 | addq %rbp, 0(%rdi) | 151 | addq %rbp, 0(%rdi) |
152 | /* Go to the next page */ | 152 | /* Go to the next page */ |
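
The testq $1 -> testb $1 change is safe because the present bit of a page-table entry is bit 0, so testing only the low byte is equivalent and assembles to a shorter instruction. The fixup loop, roughly, in C (names illustrative):

    #include <stdint.h>

    #define PTE_PRESENT 0x1ULL

    /* Mirrors the startup_64 loop over level2_kernel_pgt: add the load
     * delta to every present entry in one 4096-byte page-table page. */
    static void relocate_pgt_page(uint64_t *pgt, uint64_t delta)
    {
            for (int i = 0; i < 512; i++) {      /* 4096 bytes / 8-byte entries */
                    if (pgt[i] & PTE_PRESENT)    /* testb $1, 0(%rdi) */
                            pgt[i] += delta;     /* addq %rbp, 0(%rdi) */
            }
    }
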
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5651fce0b71..29c740deafec 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void) | |||
68 | static inline bool interrupted_user_mode(void) | 68 | static inline bool interrupted_user_mode(void) |
69 | { | 69 | { |
70 | struct pt_regs *regs = get_irq_regs(); | 70 | struct pt_regs *regs = get_irq_regs(); |
71 | return regs && user_mode_vm(regs); | 71 | return regs && user_mode(regs); |
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c | |||
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |||
54 | * because the ->io_bitmap_max value must match the bitmap | 54 | * because the ->io_bitmap_max value must match the bitmap |
55 | * contents: | 55 | * contents: |
56 | */ | 56 | */ |
57 | tss = &per_cpu(init_tss, get_cpu()); | 57 | tss = &per_cpu(cpu_tss, get_cpu()); |
58 | 58 | ||
59 | if (turn_on) | 59 | if (turn_on) |
60 | bitmap_clear(t->io_bitmap_ptr, from, num); | 60 | bitmap_clear(t->io_bitmap_ptr, from, num); |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 28d28f5eb8f4..f9fd86a7fcc7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) | |||
165 | if (unlikely(!desc)) | 165 | if (unlikely(!desc)) |
166 | return false; | 166 | return false; |
167 | 167 | ||
168 | if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { | 168 | if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { |
169 | if (unlikely(overflow)) | 169 | if (unlikely(overflow)) |
170 | print_stack_overflow(); | 170 | print_stack_overflow(); |
171 | desc->handle_irq(irq, desc); | 171 | desc->handle_irq(irq, desc); |
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e4b503d5558c..394e643d7830 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) | |||
44 | u64 estack_top, estack_bottom; | 44 | u64 estack_top, estack_bottom; |
45 | u64 curbase = (u64)task_stack_page(current); | 45 | u64 curbase = (u64)task_stack_page(current); |
46 | 46 | ||
47 | if (user_mode_vm(regs)) | 47 | if (user_mode(regs)) |
48 | return; | 48 | return; |
49 | 49 | ||
50 | if (regs->sp >= curbase + sizeof(struct thread_info) + | 50 | if (regs->sp >= curbase + sizeof(struct thread_info) + |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 70e181ea1eac..cd10a6437264 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void) | |||
178 | #endif | 178 | #endif |
179 | for_each_clear_bit_from(i, used_vectors, first_system_vector) { | 179 | for_each_clear_bit_from(i, used_vectors, first_system_vector) { |
180 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ | 180 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ |
181 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 181 | set_intr_gate(i, irq_entries_start + |
182 | 8 * (i - FIRST_EXTERNAL_VECTOR)); | ||
182 | } | 183 | } |
183 | #ifdef CONFIG_X86_LOCAL_APIC | 184 | #ifdef CONFIG_X86_LOCAL_APIC |
184 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) | 185 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) |
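
The new set_intr_gate() call assumes the external-interrupt entry stubs are laid out as an array of fixed 8-byte slots starting at irq_entries_start, so the stub address for a vector is computable rather than looked up in the old interrupt[] table. Sketch (FIRST_EXTERNAL_VECTOR shown with its conventional 0x20 value):

    extern char irq_entries_start[];    /* first of the 8-byte entry stubs */

    #define FIRST_EXTERNAL_VECTOR 0x20  /* conventional x86 value */

    /* Entry-stub address for @vector, as computed in native_init_IRQ(). */
    static void *irq_entry_for_vector(unsigned int vector)
    {
            return irq_entries_start + 8 * (vector - FIRST_EXTERNAL_VECTOR);
    }
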
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 25ecd56cefa8..d6178d9791db 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) | |||
126 | #ifdef CONFIG_X86_32 | 126 | #ifdef CONFIG_X86_32 |
127 | switch (regno) { | 127 | switch (regno) { |
128 | case GDB_SS: | 128 | case GDB_SS: |
129 | if (!user_mode_vm(regs)) | 129 | if (!user_mode(regs)) |
130 | *(unsigned long *)mem = __KERNEL_DS; | 130 | *(unsigned long *)mem = __KERNEL_DS; |
131 | break; | 131 | break; |
132 | case GDB_SP: | 132 | case GDB_SP: |
133 | if (!user_mode_vm(regs)) | 133 | if (!user_mode(regs)) |
134 | *(unsigned long *)mem = kernel_stack_pointer(regs); | 134 | *(unsigned long *)mem = kernel_stack_pointer(regs); |
135 | break; | 135 | break; |
136 | case GDB_GS: | 136 | case GDB_GS: |
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4e3d5a9621fe..24d079604fd5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c | |||
@@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs) | |||
602 | struct kprobe *p; | 602 | struct kprobe *p; |
603 | struct kprobe_ctlblk *kcb; | 603 | struct kprobe_ctlblk *kcb; |
604 | 604 | ||
605 | if (user_mode_vm(regs)) | 605 | if (user_mode(regs)) |
606 | return 0; | 606 | return 0; |
607 | 607 | ||
608 | addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); | 608 | addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); |
@@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, | |||
1007 | struct die_args *args = data; | 1007 | struct die_args *args = data; |
1008 | int ret = NOTIFY_DONE; | 1008 | int ret = NOTIFY_DONE; |
1009 | 1009 | ||
1010 | if (args->regs && user_mode_vm(args->regs)) | 1010 | if (args->regs && user_mode(args->regs)) |
1011 | return ret; | 1011 | return ret; |
1012 | 1012 | ||
1013 | if (val == DIE_GPF) { | 1013 | if (val == DIE_GPF) { |
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d1ac80b72c72..005c03e93fc5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c | |||
@@ -33,6 +33,7 @@ | |||
33 | 33 | ||
34 | #include <asm/page.h> | 34 | #include <asm/page.h> |
35 | #include <asm/pgtable.h> | 35 | #include <asm/pgtable.h> |
36 | #include <asm/setup.h> | ||
36 | 37 | ||
37 | #if 0 | 38 | #if 0 |
38 | #define DEBUGP(fmt, ...) \ | 39 | #define DEBUGP(fmt, ...) \ |
@@ -47,21 +48,13 @@ do { \ | |||
47 | 48 | ||
48 | #ifdef CONFIG_RANDOMIZE_BASE | 49 | #ifdef CONFIG_RANDOMIZE_BASE |
49 | static unsigned long module_load_offset; | 50 | static unsigned long module_load_offset; |
50 | static int randomize_modules = 1; | ||
51 | 51 | ||
52 | /* Mutex protects the module_load_offset. */ | 52 | /* Mutex protects the module_load_offset. */ |
53 | static DEFINE_MUTEX(module_kaslr_mutex); | 53 | static DEFINE_MUTEX(module_kaslr_mutex); |
54 | 54 | ||
55 | static int __init parse_nokaslr(char *p) | ||
56 | { | ||
57 | randomize_modules = 0; | ||
58 | return 0; | ||
59 | } | ||
60 | early_param("nokaslr", parse_nokaslr); | ||
61 | |||
62 | static unsigned long int get_module_load_offset(void) | 55 | static unsigned long int get_module_load_offset(void) |
63 | { | 56 | { |
64 | if (randomize_modules) { | 57 | if (kaslr_enabled()) { |
65 | mutex_lock(&module_kaslr_mutex); | 58 | mutex_lock(&module_kaslr_mutex); |
66 | /* | 59 | /* |
67 | * Calculate the module_load_offset the first time this | 60 | * Calculate the module_load_offset the first time this |
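
With the private nokaslr parsing gone, get_module_load_offset() keys off the same kaslr_enabled() predicate the rest of the kernel sees, so kernel-image and module randomization can no longer disagree. The surrounding pattern is a lazily chosen, mutex-protected offset; a simplified single-threaded sketch (random_offset() stands in for the kernel's RNG call):

    #include <stdbool.h>

    extern bool kaslr_enabled(void);          /* the predicate the diff switches to */
    extern unsigned long random_offset(void); /* stand-in for the kernel's RNG */

    static unsigned long module_load_offset;
    static bool offset_chosen;                /* the kernel guards this with a mutex */

    static unsigned long get_module_load_offset_sketch(void)
    {
            if (!kaslr_enabled())
                    return 0;                 /* no randomization requested */
            if (!offset_chosen) {             /* choose once, reuse afterwards */
                    module_load_offset = random_offset();
                    offset_chosen = true;
            }
            return module_load_offset;
    }
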
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861cc5ee8..da8cb987b973 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c | |||
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
131 | } | 131 | } |
132 | 132 | ||
133 | /* | 133 | /* |
134 | * RIP, flags, and the argument registers are usually saved. | 134 | * These registers are always saved on 64-bit syscall entry. |
135 | * orig_ax is probably okay, too. | 135 | * On 32-bit entry points, they are saved too except r8..r11. |
136 | */ | 136 | */ |
137 | regs_user_copy->ip = user_regs->ip; | 137 | regs_user_copy->ip = user_regs->ip; |
138 | regs_user_copy->ax = user_regs->ax; | ||
138 | regs_user_copy->cx = user_regs->cx; | 139 | regs_user_copy->cx = user_regs->cx; |
139 | regs_user_copy->dx = user_regs->dx; | 140 | regs_user_copy->dx = user_regs->dx; |
140 | regs_user_copy->si = user_regs->si; | 141 | regs_user_copy->si = user_regs->si; |
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
145 | regs_user_copy->r11 = user_regs->r11; | 146 | regs_user_copy->r11 = user_regs->r11; |
146 | regs_user_copy->orig_ax = user_regs->orig_ax; | 147 | regs_user_copy->orig_ax = user_regs->orig_ax; |
147 | regs_user_copy->flags = user_regs->flags; | 148 | regs_user_copy->flags = user_regs->flags; |
149 | regs_user_copy->sp = user_regs->sp; | ||
150 | regs_user_copy->cs = user_regs->cs; | ||
151 | regs_user_copy->ss = user_regs->ss; | ||
148 | 152 | ||
149 | /* | 153 | /* |
150 | * Don't even try to report the "rest" regs. | 154 | * Most system calls don't save these registers, so don't report them. |
151 | */ | 155 | */ |
152 | regs_user_copy->bx = -1; | 156 | regs_user_copy->bx = -1; |
153 | regs_user_copy->bp = -1; | 157 | regs_user_copy->bp = -1; |
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
158 | 162 | ||
159 | /* | 163 | /* |
160 | * For this to be at all useful, we need a reasonable guess for | 164 | * For this to be at all useful, we need a reasonable guess for |
161 | * sp and the ABI. Be careful: we're in NMI context, and we're | 165 | * the ABI. Be careful: we're in NMI context, and we're |
162 | * considering current to be the current task, so we should | 166 | * considering current to be the current task, so we should |
163 | * be careful not to look at any other percpu variables that might | 167 | * be careful not to look at any other percpu variables that might |
164 | * change during context switches. | 168 | * change during context switches. |
165 | */ | 169 | */ |
166 | if (IS_ENABLED(CONFIG_IA32_EMULATION) && | 170 | regs_user->abi = user_64bit_mode(user_regs) ? |
167 | task_thread_info(current)->status & TS_COMPAT) { | 171 | PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; |
168 | /* Easy case: we're in a compat syscall. */ | ||
169 | regs_user->abi = PERF_SAMPLE_REGS_ABI_32; | ||
170 | regs_user_copy->sp = user_regs->sp; | ||
171 | regs_user_copy->cs = user_regs->cs; | ||
172 | regs_user_copy->ss = user_regs->ss; | ||
173 | } else if (user_regs->orig_ax != -1) { | ||
174 | /* | ||
175 | * We're probably in a 64-bit syscall. | ||
176 | * Warning: this code is severely racy. At least it's better | ||
177 | * than just blindly copying user_regs. | ||
178 | */ | ||
179 | regs_user->abi = PERF_SAMPLE_REGS_ABI_64; | ||
180 | regs_user_copy->sp = this_cpu_read(old_rsp); | ||
181 | regs_user_copy->cs = __USER_CS; | ||
182 | regs_user_copy->ss = __USER_DS; | ||
183 | regs_user_copy->cx = -1; /* usually contains garbage */ | ||
184 | } else { | ||
185 | /* We're probably in an interrupt or exception. */ | ||
186 | regs_user->abi = user_64bit_mode(user_regs) ? | ||
187 | PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; | ||
188 | regs_user_copy->sp = user_regs->sp; | ||
189 | regs_user_copy->cs = user_regs->cs; | ||
190 | regs_user_copy->ss = user_regs->ss; | ||
191 | } | ||
192 | 172 | ||
193 | regs_user->regs = regs_user_copy; | 173 | regs_user->regs = regs_user_copy; |
194 | } | 174 | } |
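
Because sp, cs and ss are now saved on every entry path, the ABI guess collapses to one question -- is the saved CS a 64-bit code segment? -- which is what user_64bit_mode() answers. With the perf UAPI values (ABI_32 = 1, ABI_64 = 2), the simplified tail is roughly:

    enum { PERF_SAMPLE_REGS_ABI_32 = 1, PERF_SAMPLE_REGS_ABI_64 = 2 };

    extern int user_64bit_mode_stub(const void *regs); /* stand-in for user_64bit_mode() */

    static int sample_regs_abi(const void *user_regs)
    {
            return user_64bit_mode_stub(user_regs) ? PERF_SAMPLE_REGS_ABI_64
                                                   : PERF_SAMPLE_REGS_ABI_32;
    }
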
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7af7b6478637..0c8992dbead5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -38,7 +38,26 @@ | |||
38 | * section. Since TSS's are completely CPU-local, we want them | 38 | * section. Since TSS's are completely CPU-local, we want them |
39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | 39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
40 | */ | 40 | */ |
41 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | 41 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { |
42 | .x86_tss = { | ||
43 | .sp0 = TOP_OF_INIT_STACK, | ||
44 | #ifdef CONFIG_X86_32 | ||
45 | .ss0 = __KERNEL_DS, | ||
46 | .ss1 = __KERNEL_CS, | ||
47 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, | ||
48 | #endif | ||
49 | }, | ||
50 | #ifdef CONFIG_X86_32 | ||
51 | /* | ||
52 | * Note that the .io_bitmap member must be extra-big. This is because | ||
53 | * the CPU will access an additional byte beyond the end of the IO | ||
54 | * permission bitmap. The extra byte must be all 1 bits, and must | ||
55 | * be within the limit. | ||
56 | */ | ||
57 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, | ||
58 | #endif | ||
59 | }; | ||
60 | EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); | ||
42 | 61 | ||
43 | #ifdef CONFIG_X86_64 | 62 | #ifdef CONFIG_X86_64 |
44 | static DEFINE_PER_CPU(unsigned char, is_idle); | 63 | static DEFINE_PER_CPU(unsigned char, is_idle); |
@@ -110,7 +129,7 @@ void exit_thread(void) | |||
110 | unsigned long *bp = t->io_bitmap_ptr; | 129 | unsigned long *bp = t->io_bitmap_ptr; |
111 | 130 | ||
112 | if (bp) { | 131 | if (bp) { |
113 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | 132 | struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); |
114 | 133 | ||
115 | t->io_bitmap_ptr = NULL; | 134 | t->io_bitmap_ptr = NULL; |
116 | clear_thread_flag(TIF_IO_BITMAP); | 135 | clear_thread_flag(TIF_IO_BITMAP); |
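
The cpu_tss initializer keeps a long-standing hardware quirk explicit: the I/O permission bitmap is declared one long larger than IO_BITMAP_LONGS and pre-filled with all ones, because the CPU reads one byte past the bitmap's nominal end and that byte must be all 1 bits. In miniature (sizes illustrative, assuming 64-bit longs):

    #include <stdint.h>

    #define IO_BITMAP_BITS  65536
    #define IO_BITMAP_LONGS (IO_BITMAP_BITS / 64)

    struct tss_stub {
            /* one extra long: the CPU touches a byte beyond the nominal end */
            uint64_t io_bitmap[IO_BITMAP_LONGS + 1];
    };

    static struct tss_stub tss = {
            /* GNU designated-range initializer, as used in the diff itself */
            .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0ULL },
    };
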
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..8ed2106b06da 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all) | |||
73 | unsigned long sp; | 73 | unsigned long sp; |
74 | unsigned short ss, gs; | 74 | unsigned short ss, gs; |
75 | 75 | ||
76 | if (user_mode_vm(regs)) { | 76 | if (user_mode(regs)) { |
77 | sp = regs->sp; | 77 | sp = regs->sp; |
78 | ss = regs->ss & 0xffff; | 78 | ss = regs->ss & 0xffff; |
79 | gs = get_user_gs(regs); | 79 | gs = get_user_gs(regs); |
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) | |||
206 | regs->ip = new_ip; | 206 | regs->ip = new_ip; |
207 | regs->sp = new_sp; | 207 | regs->sp = new_sp; |
208 | regs->flags = X86_EFLAGS_IF; | 208 | regs->flags = X86_EFLAGS_IF; |
209 | /* | 209 | force_iret(); |
210 | * force it to the iret return path by making it look as if there was | ||
211 | * some work pending. | ||
212 | */ | ||
213 | set_thread_flag(TIF_NOTIFY_RESUME); | ||
214 | } | 210 | } |
215 | EXPORT_SYMBOL_GPL(start_thread); | 211 | EXPORT_SYMBOL_GPL(start_thread); |
216 | 212 | ||
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
248 | struct thread_struct *prev = &prev_p->thread, | 244 | struct thread_struct *prev = &prev_p->thread, |
249 | *next = &next_p->thread; | 245 | *next = &next_p->thread; |
250 | int cpu = smp_processor_id(); | 246 | int cpu = smp_processor_id(); |
251 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 247 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); |
252 | fpu_switch_t fpu; | 248 | fpu_switch_t fpu; |
253 | 249 | ||
254 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 250 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
256 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); | 252 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
257 | 253 | ||
258 | /* | 254 | /* |
259 | * Reload esp0. | ||
260 | */ | ||
261 | load_sp0(tss, next); | ||
262 | |||
263 | /* | ||
264 | * Save away %gs. No need to save %fs, as it was saved on the | 255 | * Save away %gs. No need to save %fs, as it was saved on the |
265 | * stack on entry. No need to save %es and %ds, as those are | 256 | * stack on entry. No need to save %es and %ds, as those are |
266 | * always kernel segments while inside the kernel. Doing this | 257 | * always kernel segments while inside the kernel. Doing this |
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
310 | */ | 301 | */ |
311 | arch_end_context_switch(next_p); | 302 | arch_end_context_switch(next_p); |
312 | 303 | ||
304 | /* | ||
305 | * Reload esp0, kernel_stack, and current_top_of_stack. This changes | ||
306 | * current_thread_info(). | ||
307 | */ | ||
308 | load_sp0(tss, next); | ||
313 | this_cpu_write(kernel_stack, | 309 | this_cpu_write(kernel_stack, |
314 | (unsigned long)task_stack_page(next_p) + | 310 | (unsigned long)task_stack_page(next_p) + |
315 | THREAD_SIZE - KERNEL_STACK_OFFSET); | 311 | THREAD_SIZE); |
312 | this_cpu_write(cpu_current_top_of_stack, | ||
313 | (unsigned long)task_stack_page(next_p) + | ||
314 | THREAD_SIZE); | ||
316 | 315 | ||
317 | /* | 316 | /* |
318 | * Restore %gs if needed (which is common) | 317 | * Restore %gs if needed (which is common) |
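
Note the invariant the 32-bit switch path now maintains: kernel_stack and cpu_current_top_of_stack are written to the same value, the exact top of the incoming task's stack, with the old KERNEL_STACK_OFFSET bias gone. As a one-liner (THREAD_SIZE value illustrative):

    #include <stdint.h>

    #define THREAD_SIZE (2 * 4096UL)    /* illustrative; configuration-dependent */

    /* What __switch_to writes into both per-cpu variables for @stack_page. */
    static uint64_t top_of_stack(uint64_t stack_page)
    {
            return stack_page + THREAD_SIZE;
    }
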
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43577d2..4baaa972f52a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -52,7 +52,7 @@ | |||
52 | 52 | ||
53 | asmlinkage extern void ret_from_fork(void); | 53 | asmlinkage extern void ret_from_fork(void); |
54 | 54 | ||
55 | __visible DEFINE_PER_CPU(unsigned long, old_rsp); | 55 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); |
56 | 56 | ||
57 | /* Prints also some state that isn't saved in the pt_regs */ | 57 | /* Prints also some state that isn't saved in the pt_regs */ |
58 | void __show_regs(struct pt_regs *regs, int all) | 58 | void __show_regs(struct pt_regs *regs, int all) |
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
161 | p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; | 161 | p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; |
162 | childregs = task_pt_regs(p); | 162 | childregs = task_pt_regs(p); |
163 | p->thread.sp = (unsigned long) childregs; | 163 | p->thread.sp = (unsigned long) childregs; |
164 | p->thread.usersp = me->thread.usersp; | ||
165 | set_tsk_thread_flag(p, TIF_FORK); | 164 | set_tsk_thread_flag(p, TIF_FORK); |
166 | p->thread.io_bitmap_ptr = NULL; | 165 | p->thread.io_bitmap_ptr = NULL; |
167 | 166 | ||
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
207 | */ | 206 | */ |
208 | if (clone_flags & CLONE_SETTLS) { | 207 | if (clone_flags & CLONE_SETTLS) { |
209 | #ifdef CONFIG_IA32_EMULATION | 208 | #ifdef CONFIG_IA32_EMULATION |
210 | if (test_thread_flag(TIF_IA32)) | 209 | if (is_ia32_task()) |
211 | err = do_set_thread_area(p, -1, | 210 | err = do_set_thread_area(p, -1, |
212 | (struct user_desc __user *)childregs->si, 0); | 211 | (struct user_desc __user *)childregs->si, 0); |
213 | else | 212 | else |
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, | |||
235 | loadsegment(es, _ds); | 234 | loadsegment(es, _ds); |
236 | loadsegment(ds, _ds); | 235 | loadsegment(ds, _ds); |
237 | load_gs_index(0); | 236 | load_gs_index(0); |
238 | current->thread.usersp = new_sp; | ||
239 | regs->ip = new_ip; | 237 | regs->ip = new_ip; |
240 | regs->sp = new_sp; | 238 | regs->sp = new_sp; |
241 | this_cpu_write(old_rsp, new_sp); | ||
242 | regs->cs = _cs; | 239 | regs->cs = _cs; |
243 | regs->ss = _ss; | 240 | regs->ss = _ss; |
244 | regs->flags = X86_EFLAGS_IF; | 241 | regs->flags = X86_EFLAGS_IF; |
242 | force_iret(); | ||
245 | } | 243 | } |
246 | 244 | ||
247 | void | 245 | void |
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
277 | struct thread_struct *prev = &prev_p->thread; | 275 | struct thread_struct *prev = &prev_p->thread; |
278 | struct thread_struct *next = &next_p->thread; | 276 | struct thread_struct *next = &next_p->thread; |
279 | int cpu = smp_processor_id(); | 277 | int cpu = smp_processor_id(); |
280 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 278 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); |
281 | unsigned fsindex, gsindex; | 279 | unsigned fsindex, gsindex; |
282 | fpu_switch_t fpu; | 280 | fpu_switch_t fpu; |
283 | 281 | ||
284 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); | 282 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
285 | 283 | ||
286 | /* Reload esp0 and ss1. */ | ||
287 | load_sp0(tss, next); | ||
288 | |||
289 | /* We must save %fs and %gs before load_TLS() because | 284 | /* We must save %fs and %gs before load_TLS() because |
290 | * %fs and %gs may be cleared by load_TLS(). | 285 | * %fs and %gs may be cleared by load_TLS(). |
291 | * | 286 | * |
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
401 | /* | 396 | /* |
402 | * Switch the PDA and FPU contexts. | 397 | * Switch the PDA and FPU contexts. |
403 | */ | 398 | */ |
404 | prev->usersp = this_cpu_read(old_rsp); | ||
405 | this_cpu_write(old_rsp, next->usersp); | ||
406 | this_cpu_write(current_task, next_p); | 399 | this_cpu_write(current_task, next_p); |
407 | 400 | ||
408 | /* | 401 | /* |
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
413 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | 406 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); |
414 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | 407 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); |
415 | 408 | ||
409 | /* Reload esp0 and ss1. This changes current_thread_info(). */ | ||
410 | load_sp0(tss, next); | ||
411 | |||
416 | this_cpu_write(kernel_stack, | 412 | this_cpu_write(kernel_stack, |
417 | (unsigned long)task_stack_page(next_p) + | 413 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE); |
418 | THREAD_SIZE - KERNEL_STACK_OFFSET); | ||
419 | 414 | ||
420 | /* | 415 | /* |
421 | * Now maybe reload the debug registers and handle I/O bitmaps | 416 | * Now maybe reload the debug registers and handle I/O bitmaps |
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr) | |||
602 | 597 | ||
603 | unsigned long KSTK_ESP(struct task_struct *task) | 598 | unsigned long KSTK_ESP(struct task_struct *task) |
604 | { | 599 | { |
605 | return (test_tsk_thread_flag(task, TIF_IA32)) ? | 600 | return task_pt_regs(task)->sp; |
606 | (task_pt_regs(task)->sp) : ((task)->thread.usersp); | ||
607 | } | 601 | } |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618b2e91..a7bc79480719 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, | |||
364 | case offsetof(struct user_regs_struct,cs): | 364 | case offsetof(struct user_regs_struct,cs): |
365 | if (unlikely(value == 0)) | 365 | if (unlikely(value == 0)) |
366 | return -EIO; | 366 | return -EIO; |
367 | #ifdef CONFIG_IA32_EMULATION | 367 | task_pt_regs(task)->cs = value; |
368 | if (test_tsk_thread_flag(task, TIF_IA32)) | ||
369 | task_pt_regs(task)->cs = value; | ||
370 | #endif | ||
371 | break; | 368 | break; |
372 | case offsetof(struct user_regs_struct,ss): | 369 | case offsetof(struct user_regs_struct,ss): |
373 | if (unlikely(value == 0)) | 370 | if (unlikely(value == 0)) |
374 | return -EIO; | 371 | return -EIO; |
375 | #ifdef CONFIG_IA32_EMULATION | 372 | task_pt_regs(task)->ss = value; |
376 | if (test_tsk_thread_flag(task, TIF_IA32)) | ||
377 | task_pt_regs(task)->ss = value; | ||
378 | #endif | ||
379 | break; | 373 | break; |
380 | } | 374 | } |
381 | 375 | ||
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, | |||
1421 | memset(info, 0, sizeof(*info)); | 1415 | memset(info, 0, sizeof(*info)); |
1422 | info->si_signo = SIGTRAP; | 1416 | info->si_signo = SIGTRAP; |
1423 | info->si_code = si_code; | 1417 | info->si_code = si_code; |
1424 | info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; | 1418 | info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; |
1425 | } | 1419 | } |
1426 | 1420 | ||
1427 | void user_single_step_siginfo(struct task_struct *tsk, | 1421 | void user_single_step_siginfo(struct task_struct *tsk, |
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7c22a6..77630d57e7bf 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S | |||
@@ -226,23 +226,23 @@ swap_pages: | |||
226 | movl (%ebx), %ecx | 226 | movl (%ebx), %ecx |
227 | addl $4, %ebx | 227 | addl $4, %ebx |
228 | 1: | 228 | 1: |
229 | testl $0x1, %ecx /* is it a destination page */ | 229 | testb $0x1, %cl /* is it a destination page */ |
230 | jz 2f | 230 | jz 2f |
231 | movl %ecx, %edi | 231 | movl %ecx, %edi |
232 | andl $0xfffff000, %edi | 232 | andl $0xfffff000, %edi |
233 | jmp 0b | 233 | jmp 0b |
234 | 2: | 234 | 2: |
235 | testl $0x2, %ecx /* is it an indirection page */ | 235 | testb $0x2, %cl /* is it an indirection page */ |
236 | jz 2f | 236 | jz 2f |
237 | movl %ecx, %ebx | 237 | movl %ecx, %ebx |
238 | andl $0xfffff000, %ebx | 238 | andl $0xfffff000, %ebx |
239 | jmp 0b | 239 | jmp 0b |
240 | 2: | 240 | 2: |
241 | testl $0x4, %ecx /* is it the done indicator */ | 241 | testb $0x4, %cl /* is it the done indicator */ |
242 | jz 2f | 242 | jz 2f |
243 | jmp 3f | 243 | jmp 3f |
244 | 2: | 244 | 2: |
245 | testl $0x8, %ecx /* is it the source indicator */ | 245 | testb $0x8, %cl /* is it the source indicator */ |
246 | jz 0b /* Ignore it otherwise */ | 246 | jz 0b /* Ignore it otherwise */ |
247 | movl %ecx, %esi /* For every source page do a copy */ | 247 | movl %ecx, %esi /* For every source page do a copy */ |
248 | andl $0xfffff000, %esi | 248 | andl $0xfffff000, %esi |
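
The testl -> testb switches in swap_pages only shrink the encoding: the kexec indirection entries carry their type in the low four bits, so testing %cl is enough. Sketched as a C walker step (flag values as in the asm; the IND_* names follow the kexec convention):

    #include <stdint.h>

    #define IND_DESTINATION 0x1
    #define IND_INDIRECTION 0x2
    #define IND_DONE        0x4
    #define IND_SOURCE      0x8
    #define PAGE_MASK_4K    0xfffff000u

    /* One step of the swap_pages walk over a kexec indirection list. */
    static void handle_entry(uint32_t entry, uint32_t *dest,
                             const uint32_t **list, int *done)
    {
            if (entry & IND_DESTINATION)        /* testb $0x1, %cl */
                    *dest = entry & PAGE_MASK_4K;
            else if (entry & IND_INDIRECTION)   /* testb $0x2, %cl */
                    *list = (const uint32_t *)(uintptr_t)(entry & PAGE_MASK_4K);
            else if (entry & IND_DONE)          /* testb $0x4, %cl */
                    *done = 1;
            else if (entry & IND_SOURCE)        /* testb $0x8, %cl */
                    ; /* copy one page from (entry & PAGE_MASK_4K) to *dest */
    }
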
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c693e475..98111b38ebfd 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S | |||
@@ -123,7 +123,7 @@ identity_mapped: | |||
123 | * Set cr4 to a known state: | 123 | * Set cr4 to a known state: |
124 | * - physical address extension enabled | 124 | * - physical address extension enabled |
125 | */ | 125 | */ |
126 | movq $X86_CR4_PAE, %rax | 126 | movl $X86_CR4_PAE, %eax |
127 | movq %rax, %cr4 | 127 | movq %rax, %cr4 |
128 | 128 | ||
129 | jmp 1f | 129 | jmp 1f |
@@ -221,23 +221,23 @@ swap_pages: | |||
221 | movq (%rbx), %rcx | 221 | movq (%rbx), %rcx |
222 | addq $8, %rbx | 222 | addq $8, %rbx |
223 | 1: | 223 | 1: |
224 | testq $0x1, %rcx /* is it a destination page? */ | 224 | testb $0x1, %cl /* is it a destination page? */ |
225 | jz 2f | 225 | jz 2f |
226 | movq %rcx, %rdi | 226 | movq %rcx, %rdi |
227 | andq $0xfffffffffffff000, %rdi | 227 | andq $0xfffffffffffff000, %rdi |
228 | jmp 0b | 228 | jmp 0b |
229 | 2: | 229 | 2: |
230 | testq $0x2, %rcx /* is it an indirection page? */ | 230 | testb $0x2, %cl /* is it an indirection page? */ |
231 | jz 2f | 231 | jz 2f |
232 | movq %rcx, %rbx | 232 | movq %rcx, %rbx |
233 | andq $0xfffffffffffff000, %rbx | 233 | andq $0xfffffffffffff000, %rbx |
234 | jmp 0b | 234 | jmp 0b |
235 | 2: | 235 | 2: |
236 | testq $0x4, %rcx /* is it the done indicator? */ | 236 | testb $0x4, %cl /* is it the done indicator? */ |
237 | jz 2f | 237 | jz 2f |
238 | jmp 3f | 238 | jmp 3f |
239 | 2: | 239 | 2: |
240 | testq $0x8, %rcx /* is it the source indicator? */ | 240 | testb $0x8, %cl /* is it the source indicator? */ |
241 | jz 0b /* Ignore it otherwise */ | 241 | jz 0b /* Ignore it otherwise */ |
242 | movq %rcx, %rsi /* For ever source page do a copy */ | 242 | movq %rcx, %rsi /* For ever source page do a copy */ |
243 | andq $0xfffffffffffff000, %rsi | 243 | andq $0xfffffffffffff000, %rsi |
@@ -246,17 +246,17 @@ swap_pages: | |||
246 | movq %rsi, %rax | 246 | movq %rsi, %rax |
247 | 247 | ||
248 | movq %r10, %rdi | 248 | movq %r10, %rdi |
249 | movq $512, %rcx | 249 | movl $512, %ecx |
250 | rep ; movsq | 250 | rep ; movsq |
251 | 251 | ||
252 | movq %rax, %rdi | 252 | movq %rax, %rdi |
253 | movq %rdx, %rsi | 253 | movq %rdx, %rsi |
254 | movq $512, %rcx | 254 | movl $512, %ecx |
255 | rep ; movsq | 255 | rep ; movsq |
256 | 256 | ||
257 | movq %rdx, %rdi | 257 | movq %rdx, %rdi |
258 | movq %r10, %rsi | 258 | movq %r10, %rsi |
259 | movq $512, %rcx | 259 | movl $512, %ecx |
260 | rep ; movsq | 260 | rep ; movsq |
261 | 261 | ||
262 | lea PAGE_SIZE(%rax), %rsi | 262 | lea PAGE_SIZE(%rax), %rsi |
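
The movq $512 -> movl $512 changes are the same encoding trick again: a 32-bit immediate move zero-extends into the full %rcx, and 512 is exactly one 4 KiB page in quadwords (512 * 8 = 4096) for rep movsq. Equivalent C for one of the three page copies:

    #include <stdint.h>
    #include <string.h>

    /* rep movsq with %rcx = 512 moves 512 * 8 = 4096 bytes. */
    static void copy_page_quadwords(uint64_t *dst, const uint64_t *src)
    {
            memcpy(dst, src, 512 * sizeof(uint64_t));
    }
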
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a2421cca01f..014466b152b5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void) | |||
832 | static int | 832 | static int |
833 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) | 833 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) |
834 | { | 834 | { |
835 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx " | 835 | if (kaslr_enabled()) { |
836 | "(relocation range: 0x%lx-0x%lx)\n", | 836 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", |
837 | (unsigned long)&_text - __START_KERNEL, __START_KERNEL, | 837 | (unsigned long)&_text - __START_KERNEL, |
838 | __START_KERNEL_map, MODULES_VADDR-1); | 838 | __START_KERNEL, |
839 | __START_KERNEL_map, | ||
840 | MODULES_VADDR-1); | ||
841 | } else { | ||
842 | pr_emerg("Kernel Offset: disabled\n"); | ||
843 | } | ||
839 | 844 | ||
840 | return 0; | 845 | return 0; |
841 | } | 846 | } |
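
dump_kernel_offset() computes the slide as runtime _text minus link-time __START_KERNEL, and now reports the disabled case explicitly instead of printing an offset that means nothing. The arithmetic, isolated:

    extern char _text[];    /* first byte of the running kernel image */

    static unsigned long kernel_slide(unsigned long link_time_start)
    {
            /* runtime address minus link-time address = KASLR offset */
            return (unsigned long)_text - link_time_start;
    }
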
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..53cc4085c3d7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -61,8 +61,7 @@ | |||
61 | regs->seg = GET_SEG(seg) | 3; \ | 61 | regs->seg = GET_SEG(seg) | 3; \ |
62 | } while (0) | 62 | } while (0) |
63 | 63 | ||
64 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | 64 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) |
65 | unsigned long *pax) | ||
66 | { | 65 | { |
67 | void __user *buf; | 66 | void __user *buf; |
68 | unsigned int tmpflags; | 67 | unsigned int tmpflags; |
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
81 | #endif /* CONFIG_X86_32 */ | 80 | #endif /* CONFIG_X86_32 */ |
82 | 81 | ||
83 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | 82 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); |
84 | COPY(dx); COPY(cx); COPY(ip); | 83 | COPY(dx); COPY(cx); COPY(ip); COPY(ax); |
85 | 84 | ||
86 | #ifdef CONFIG_X86_64 | 85 | #ifdef CONFIG_X86_64 |
87 | COPY(r8); | 86 | COPY(r8); |
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
94 | COPY(r15); | 93 | COPY(r15); |
95 | #endif /* CONFIG_X86_64 */ | 94 | #endif /* CONFIG_X86_64 */ |
96 | 95 | ||
97 | #ifdef CONFIG_X86_32 | ||
98 | COPY_SEG_CPL3(cs); | 96 | COPY_SEG_CPL3(cs); |
99 | COPY_SEG_CPL3(ss); | 97 | COPY_SEG_CPL3(ss); |
100 | #else /* !CONFIG_X86_32 */ | ||
101 | /* Kernel saves and restores only the CS segment register on signals, | ||
102 | * which is the bare minimum needed to allow mixed 32/64-bit code. | ||
103 | * App's signal handler can save/restore other segments if needed. */ | ||
104 | COPY_SEG_CPL3(cs); | ||
105 | #endif /* CONFIG_X86_32 */ | ||
106 | 98 | ||
107 | get_user_ex(tmpflags, &sc->flags); | 99 | get_user_ex(tmpflags, &sc->flags); |
108 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); | 100 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); |
109 | regs->orig_ax = -1; /* disable syscall checks */ | 101 | regs->orig_ax = -1; /* disable syscall checks */ |
110 | 102 | ||
111 | get_user_ex(buf, &sc->fpstate); | 103 | get_user_ex(buf, &sc->fpstate); |
112 | |||
113 | get_user_ex(*pax, &sc->ax); | ||
114 | } get_user_catch(err); | 104 | } get_user_catch(err); |
115 | 105 | ||
116 | err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); | 106 | err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); |
117 | 107 | ||
108 | force_iret(); | ||
109 | |||
118 | return err; | 110 | return err; |
119 | } | 111 | } |
120 | 112 | ||
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, | |||
162 | #else /* !CONFIG_X86_32 */ | 154 | #else /* !CONFIG_X86_32 */ |
163 | put_user_ex(regs->flags, &sc->flags); | 155 | put_user_ex(regs->flags, &sc->flags); |
164 | put_user_ex(regs->cs, &sc->cs); | 156 | put_user_ex(regs->cs, &sc->cs); |
165 | put_user_ex(0, &sc->gs); | 157 | put_user_ex(0, &sc->__pad2); |
166 | put_user_ex(0, &sc->fs); | 158 | put_user_ex(0, &sc->__pad1); |
159 | put_user_ex(regs->ss, &sc->ss); | ||
167 | #endif /* CONFIG_X86_32 */ | 160 | #endif /* CONFIG_X86_32 */ |
168 | 161 | ||
169 | put_user_ex(fpstate, &sc->fpstate); | 162 | put_user_ex(fpstate, &sc->fpstate); |
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, | |||
457 | 450 | ||
458 | regs->sp = (unsigned long)frame; | 451 | regs->sp = (unsigned long)frame; |
459 | 452 | ||
460 | /* Set up the CS register to run signal handlers in 64-bit mode, | 453 | /* |
461 | even if the handler happens to be interrupting 32-bit code. */ | 454 | * Set up the CS and SS registers to run signal handlers in |
455 | * 64-bit mode, even if the handler happens to be interrupting | ||
456 | * 32-bit or 16-bit code. | ||
457 | * | ||
458 | * SS is subtle. In 64-bit mode, we don't need any particular | ||
459 | * SS descriptor, but we do need SS to be valid. It's possible | ||
460 | * that the old SS is entirely bogus -- this can happen if the | ||
461 | * signal we're trying to deliver is #GP or #SS caused by a bad | ||
462 | * SS value. | ||
463 | */ | ||
462 | regs->cs = __USER_CS; | 464 | regs->cs = __USER_CS; |
465 | regs->ss = __USER_DS; | ||
463 | 466 | ||
464 | return 0; | 467 | return 0; |
465 | } | 468 | } |
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void) | |||
539 | { | 542 | { |
540 | struct pt_regs *regs = current_pt_regs(); | 543 | struct pt_regs *regs = current_pt_regs(); |
541 | struct sigframe __user *frame; | 544 | struct sigframe __user *frame; |
542 | unsigned long ax; | ||
543 | sigset_t set; | 545 | sigset_t set; |
544 | 546 | ||
545 | frame = (struct sigframe __user *)(regs->sp - 8); | 547 | frame = (struct sigframe __user *)(regs->sp - 8); |
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void) | |||
553 | 555 | ||
554 | set_current_blocked(&set); | 556 | set_current_blocked(&set); |
555 | 557 | ||
556 | if (restore_sigcontext(regs, &frame->sc, &ax)) | 558 | if (restore_sigcontext(regs, &frame->sc)) |
557 | goto badframe; | 559 | goto badframe; |
558 | return ax; | 560 | return regs->ax; |
559 | 561 | ||
560 | badframe: | 562 | badframe: |
561 | signal_fault(regs, frame, "sigreturn"); | 563 | signal_fault(regs, frame, "sigreturn"); |
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void) | |||
568 | { | 570 | { |
569 | struct pt_regs *regs = current_pt_regs(); | 571 | struct pt_regs *regs = current_pt_regs(); |
570 | struct rt_sigframe __user *frame; | 572 | struct rt_sigframe __user *frame; |
571 | unsigned long ax; | ||
572 | sigset_t set; | 573 | sigset_t set; |
573 | 574 | ||
574 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); | 575 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); |
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void) | |||
579 | 580 | ||
580 | set_current_blocked(&set); | 581 | set_current_blocked(&set); |
581 | 582 | ||
582 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 583 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
583 | goto badframe; | 584 | goto badframe; |
584 | 585 | ||
585 | if (restore_altstack(&frame->uc.uc_stack)) | 586 | if (restore_altstack(&frame->uc.uc_stack)) |
586 | goto badframe; | 587 | goto badframe; |
587 | 588 | ||
588 | return ax; | 589 | return regs->ax; |
589 | 590 | ||
590 | badframe: | 591 | badframe: |
591 | signal_fault(regs, frame, "rt_sigreturn"); | 592 | signal_fault(regs, frame, "rt_sigreturn"); |
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void) | |||
780 | struct pt_regs *regs = current_pt_regs(); | 781 | struct pt_regs *regs = current_pt_regs(); |
781 | struct rt_sigframe_x32 __user *frame; | 782 | struct rt_sigframe_x32 __user *frame; |
782 | sigset_t set; | 783 | sigset_t set; |
783 | unsigned long ax; | ||
784 | 784 | ||
785 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); | 785 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); |
786 | 786 | ||
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void) | |||
791 | 791 | ||
792 | set_current_blocked(&set); | 792 | set_current_blocked(&set); |
793 | 793 | ||
794 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 794 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
795 | goto badframe; | 795 | goto badframe; |
796 | 796 | ||
797 | if (compat_restore_altstack(&frame->uc.uc_stack)) | 797 | if (compat_restore_altstack(&frame->uc.uc_stack)) |
798 | goto badframe; | 798 | goto badframe; |
799 | 799 | ||
800 | return ax; | 800 | return regs->ax; |
801 | 801 | ||
802 | badframe: | 802 | badframe: |
803 | signal_fault(regs, frame, "x32 rt_sigreturn"); | 803 | signal_fault(regs, frame, "x32 rt_sigreturn"); |
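
Once COPY(ax) joins restore_sigcontext(), the three sigreturn flavors stop smuggling the restored %rax out through a pointer: the syscall's return value and the restored register are the same pt_regs slot, so returning regs->ax is exact. The shape of each caller after the change, sketched:

    struct pt_regs_stub { unsigned long ax; /* ... other saved registers */ };

    extern int restore_sigcontext_stub(struct pt_regs_stub *regs);

    static long rt_sigreturn_sketch(struct pt_regs_stub *regs)
    {
            if (restore_sigcontext_stub(regs))
                    return 0;       /* badframe: the real code raises a signal here */
            return regs->ax;        /* the restored ax IS the syscall return value */
    }
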
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ddd2c0674cda..7035f6b21c3f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -779,6 +779,26 @@ out: | |||
779 | return boot_error; | 779 | return boot_error; |
780 | } | 780 | } |
781 | 781 | ||
782 | void common_cpu_up(unsigned int cpu, struct task_struct *idle) | ||
783 | { | ||
784 | /* Just in case we booted with a single CPU. */ | ||
785 | alternatives_enable_smp(); | ||
786 | |||
787 | per_cpu(current_task, cpu) = idle; | ||
788 | |||
789 | #ifdef CONFIG_X86_32 | ||
790 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
791 | irq_ctx_init(cpu); | ||
792 | per_cpu(cpu_current_top_of_stack, cpu) = | ||
793 | (unsigned long)task_stack_page(idle) + THREAD_SIZE; | ||
794 | #else | ||
795 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
796 | initial_gs = per_cpu_offset(cpu); | ||
797 | #endif | ||
798 | per_cpu(kernel_stack, cpu) = | ||
799 | (unsigned long)task_stack_page(idle) + THREAD_SIZE; | ||
800 | } | ||
801 | |||
782 | /* | 802 | /* |
783 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad | 803 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad |
784 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. | 804 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. |
@@ -796,23 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
796 | int cpu0_nmi_registered = 0; | 816 | int cpu0_nmi_registered = 0; |
797 | unsigned long timeout; | 817 | unsigned long timeout; |
798 | 818 | ||
799 | /* Just in case we booted with a single CPU. */ | ||
800 | alternatives_enable_smp(); | ||
801 | |||
802 | idle->thread.sp = (unsigned long) (((struct pt_regs *) | 819 | idle->thread.sp = (unsigned long) (((struct pt_regs *) |
803 | (THREAD_SIZE + task_stack_page(idle))) - 1); | 820 | (THREAD_SIZE + task_stack_page(idle))) - 1); |
804 | per_cpu(current_task, cpu) = idle; | ||
805 | 821 | ||
806 | #ifdef CONFIG_X86_32 | ||
807 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
808 | irq_ctx_init(cpu); | ||
809 | #else | ||
810 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
811 | initial_gs = per_cpu_offset(cpu); | ||
812 | #endif | ||
813 | per_cpu(kernel_stack, cpu) = | ||
814 | (unsigned long)task_stack_page(idle) - | ||
815 | KERNEL_STACK_OFFSET + THREAD_SIZE; | ||
816 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | 822 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); |
817 | initial_code = (unsigned long)start_secondary; | 823 | initial_code = (unsigned long)start_secondary; |
818 | stack_start = idle->thread.sp; | 824 | stack_start = idle->thread.sp; |
@@ -953,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) | |||
953 | /* the FPU context is blank, nobody can own it */ | 959 | /* the FPU context is blank, nobody can own it */ |
954 | __cpu_disable_lazy_restore(cpu); | 960 | __cpu_disable_lazy_restore(cpu); |
955 | 961 | ||
962 | common_cpu_up(cpu, tidle); | ||
963 | |||
956 | err = do_boot_cpu(apicid, cpu, tidle); | 964 | err = do_boot_cpu(apicid, cpu, tidle); |
957 | if (err) { | 965 | if (err) { |
958 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); | 966 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); |
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57d8a9e..3777189c4a19 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c | |||
@@ -5,21 +5,29 @@ | |||
5 | #include <linux/cache.h> | 5 | #include <linux/cache.h> |
6 | #include <asm/asm-offsets.h> | 6 | #include <asm/asm-offsets.h> |
7 | 7 | ||
8 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; | 8 | #ifdef CONFIG_IA32_EMULATION |
9 | #define SYM(sym, compat) compat | ||
10 | #else | ||
11 | #define SYM(sym, compat) sym | ||
12 | #define ia32_sys_call_table sys_call_table | ||
13 | #define __NR_ia32_syscall_max __NR_syscall_max | ||
14 | #endif | ||
15 | |||
16 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; | ||
9 | #include <asm/syscalls_32.h> | 17 | #include <asm/syscalls_32.h> |
10 | #undef __SYSCALL_I386 | 18 | #undef __SYSCALL_I386 |
11 | 19 | ||
12 | #define __SYSCALL_I386(nr, sym, compat) [nr] = sym, | 20 | #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), |
13 | 21 | ||
14 | typedef asmlinkage void (*sys_call_ptr_t)(void); | 22 | typedef asmlinkage void (*sys_call_ptr_t)(void); |
15 | 23 | ||
16 | extern asmlinkage void sys_ni_syscall(void); | 24 | extern asmlinkage void sys_ni_syscall(void); |
17 | 25 | ||
18 | __visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { | 26 | __visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { |
19 | /* | 27 | /* |
20 | * Smells like a compiler bug -- it doesn't work | 28 | * Smells like a compiler bug -- it doesn't work |
21 | * when the & below is removed. | 29 | * when the & below is removed. |
22 | */ | 30 | */ |
23 | [0 ... __NR_syscall_max] = &sys_ni_syscall, | 31 | [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, |
24 | #include <asm/syscalls_32.h> | 32 | #include <asm/syscalls_32.h> |
25 | }; | 33 | }; |
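
syscall_32.c builds its table with the double-include idiom: the same <asm/syscalls_32.h> list expands twice under different __SYSCALL_I386 definitions -- once to declare every entry point, once to emit the table -- and the new SYM() wrapper picks the compat symbol only under CONFIG_IA32_EMULATION. A toy, self-contained re-creation of the mechanism (all names invented):

    #include <stdio.h>

    #define SYSCALL_LIST(X)                 \
            X(0, native_zero, compat_zero)  \
            X(1, native_one,  compat_one)

    #define SYM(sym, compat) sym    /* a compat build would expand to `compat` */

    /* Pass 1: define every entry point the list names. */
    #define DEFINE_STUB(nr, sym, compat) \
            static void SYM(sym, compat)(void) { printf("syscall %d\n", nr); }
    SYSCALL_LIST(DEFINE_STUB)
    #undef DEFINE_STUB

    /* Pass 2: expand the same list again to populate the dispatch table. */
    typedef void (*sys_call_ptr_t)(void);
    #define TABLE_ENTRY(nr, sym, compat) [nr] = SYM(sym, compat),
    static const sys_call_ptr_t table[] = { SYSCALL_LIST(TABLE_ENTRY) };
    #undef TABLE_ENTRY

    int main(void) { table[1](); return 0; }
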
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25adc0e16eaa..d39c09119db6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c | |||
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
30 | { | 30 | { |
31 | unsigned long pc = instruction_pointer(regs); | 31 | unsigned long pc = instruction_pointer(regs); |
32 | 32 | ||
33 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { | 33 | if (!user_mode(regs) && in_lock_functions(pc)) { |
34 | #ifdef CONFIG_FRAME_POINTER | 34 | #ifdef CONFIG_FRAME_POINTER |
35 | return *(unsigned long *)(regs->bp + sizeof(long)); | 35 | return *(unsigned long *)(regs->bp + sizeof(long)); |
36 | #else | 36 | #else |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4ff5d162ff9f..6751c5c58eec 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) | |||
112 | { | 112 | { |
113 | enum ctx_state prev_state; | 113 | enum ctx_state prev_state; |
114 | 114 | ||
115 | if (user_mode_vm(regs)) { | 115 | if (user_mode(regs)) { |
116 | /* Other than that, we're just an exception. */ | 116 | /* Other than that, we're just an exception. */ |
117 | prev_state = exception_enter(); | 117 | prev_state = exception_enter(); |
118 | } else { | 118 | } else { |
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
146 | /* Must be before exception_exit. */ | 146 | /* Must be before exception_exit. */ |
147 | preempt_count_sub(HARDIRQ_OFFSET); | 147 | preempt_count_sub(HARDIRQ_OFFSET); |
148 | 148 | ||
149 | if (user_mode_vm(regs)) | 149 | if (user_mode(regs)) |
150 | return exception_exit(prev_state); | 150 | return exception_exit(prev_state); |
151 | else | 151 | else |
152 | rcu_nmi_exit(); | 152 | rcu_nmi_exit(); |
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
158 | * | 158 | * |
159 | * IST exception handlers normally cannot schedule. As a special | 159 | * IST exception handlers normally cannot schedule. As a special |
160 | * exception, if the exception interrupted userspace code (i.e. | 160 | * exception, if the exception interrupted userspace code (i.e. |
161 | * user_mode_vm(regs) would return true) and the exception was not | 161 | * user_mode(regs) would return true) and the exception was not |
162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() | 162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() |
163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. | 163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. |
164 | * Callers are responsible for enabling interrupts themselves inside | 164 | * Callers are responsible for enabling interrupts themselves inside |
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
167 | */ | 167 | */ |
168 | void ist_begin_non_atomic(struct pt_regs *regs) | 168 | void ist_begin_non_atomic(struct pt_regs *regs) |
169 | { | 169 | { |
170 | BUG_ON(!user_mode_vm(regs)); | 170 | BUG_ON(!user_mode(regs)); |
171 | 171 | ||
172 | /* | 172 | /* |
173 | * Sanity check: we need to be on the normal thread stack. This | 173 | * Sanity check: we need to be on the normal thread stack. This |
174 | * will catch asm bugs and any attempt to use ist_preempt_enable | 174 | * will catch asm bugs and any attempt to use ist_preempt_enable |
175 | * from double_fault. | 175 | * from double_fault. |
176 | */ | 176 | */ |
177 | BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) | 177 | BUG_ON((unsigned long)(current_top_of_stack() - |
178 | & ~(THREAD_SIZE - 1)) != 0); | 178 | current_stack_pointer()) >= THREAD_SIZE); |
179 | 179 | ||
180 | preempt_count_sub(HARDIRQ_OFFSET); | 180 | preempt_count_sub(HARDIRQ_OFFSET); |
181 | } | 181 | } |
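The old BUG_ON masked the stack pointer with ~(THREAD_SIZE - 1) and so leaned on the kernel stack being THREAD_SIZE-aligned; the new form only assumes current_top_of_stack() is accurate. Because the subtraction is unsigned, a single comparison covers both "above the top" and "below the bottom". A standalone sketch with hypothetical values:

    #include <assert.h>

    int main(void)
    {
            unsigned long top  = 0xffff880000008000UL;  /* hypothetical stack top */
            unsigned long size = 16384;                 /* THREAD_SIZE */

            /* sp is on the thread stack iff top - sp < THREAD_SIZE */
            assert(top - (top - 200)      <  size);     /* inside:  check passes  */
            assert(top - (top - size - 8) >= size);     /* outside: BUG_ON fires  */
            return 0;
    }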
@@ -194,8 +194,7 @@ static nokprobe_inline int | |||
194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | 194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, |
195 | struct pt_regs *regs, long error_code) | 195 | struct pt_regs *regs, long error_code) |
196 | { | 196 | { |
197 | #ifdef CONFIG_X86_32 | 197 | if (v8086_mode(regs)) { |
198 | if (regs->flags & X86_VM_MASK) { | ||
199 | /* | 198 | /* |
200 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. | 199 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. |
201 | * On nmi (interrupt 2), do_trap should not be called. | 200 | * On nmi (interrupt 2), do_trap should not be called. |
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | |||
207 | } | 206 | } |
208 | return -1; | 207 | return -1; |
209 | } | 208 | } |
210 | #endif | 209 | |
211 | if (!user_mode(regs)) { | 210 | if (!user_mode(regs)) { |
212 | if (!fixup_exception(regs)) { | 211 | if (!fixup_exception(regs)) { |
213 | tsk->thread.error_code = error_code; | 212 | tsk->thread.error_code = error_code; |
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) | |||
384 | goto exit; | 383 | goto exit; |
385 | conditional_sti(regs); | 384 | conditional_sti(regs); |
386 | 385 | ||
387 | if (!user_mode_vm(regs)) | 386 | if (!user_mode(regs)) |
388 | die("bounds", regs, error_code); | 387 | die("bounds", regs, error_code); |
389 | 388 | ||
390 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { | 389 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { |
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code) | |||
462 | prev_state = exception_enter(); | 461 | prev_state = exception_enter(); |
463 | conditional_sti(regs); | 462 | conditional_sti(regs); |
464 | 463 | ||
465 | #ifdef CONFIG_X86_32 | 464 | if (v8086_mode(regs)) { |
466 | if (regs->flags & X86_VM_MASK) { | ||
467 | local_irq_enable(); | 465 | local_irq_enable(); |
468 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | 466 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); |
469 | goto exit; | 467 | goto exit; |
470 | } | 468 | } |
471 | #endif | ||
472 | 469 | ||
473 | tsk = current; | 470 | tsk = current; |
474 | if (!user_mode(regs)) { | 471 | if (!user_mode(regs)) { |
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) | |||
587 | /* Copy the remainder of the stack from the current stack. */ | 584 | /* Copy the remainder of the stack from the current stack. */ |
588 | memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); | 585 | memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); |
589 | 586 | ||
590 | BUG_ON(!user_mode_vm(&new_stack->regs)); | 587 | BUG_ON(!user_mode(&new_stack->regs)); |
591 | return new_stack; | 588 | return new_stack; |
592 | } | 589 | } |
593 | NOKPROBE_SYMBOL(fixup_bad_iret); | 590 | NOKPROBE_SYMBOL(fixup_bad_iret); |
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
637 | * then it's very likely the result of an icebp/int01 trap. | 634 | * then it's very likely the result of an icebp/int01 trap. |
638 | * User wants a sigtrap for that. | 635 | * User wants a sigtrap for that. |
639 | */ | 636 | */ |
640 | if (!dr6 && user_mode_vm(regs)) | 637 | if (!dr6 && user_mode(regs)) |
641 | user_icebp = 1; | 638 | user_icebp = 1; |
642 | 639 | ||
643 | /* Catch kmemcheck conditions first of all! */ | 640 | /* Catch kmemcheck conditions first of all! */ |
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
673 | /* It's safe to allow irq's after DR6 has been saved */ | 670 | /* It's safe to allow irq's after DR6 has been saved */ |
674 | preempt_conditional_sti(regs); | 671 | preempt_conditional_sti(regs); |
675 | 672 | ||
676 | if (regs->flags & X86_VM_MASK) { | 673 | if (v8086_mode(regs)) { |
677 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, | 674 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, |
678 | X86_TRAP_DB); | 675 | X86_TRAP_DB); |
679 | preempt_conditional_cli(regs); | 676 | preempt_conditional_cli(regs); |
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) | |||
721 | return; | 718 | return; |
722 | conditional_sti(regs); | 719 | conditional_sti(regs); |
723 | 720 | ||
724 | if (!user_mode_vm(regs)) | 721 | if (!user_mode(regs)) |
725 | { | 722 | { |
726 | if (!fixup_exception(regs)) { | 723 | if (!fixup_exception(regs)) { |
727 | task->thread.error_code = error_code; | 724 | task->thread.error_code = error_code; |
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | |||
925 | /* Set of traps needed for early debugging. */ | 922 | /* Set of traps needed for early debugging. */ |
926 | void __init early_trap_init(void) | 923 | void __init early_trap_init(void) |
927 | { | 924 | { |
928 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | 925 | /* |
926 | * Don't use IST to set DEBUG_STACK as it doesn't work until TSS | ||
927 | * is ready in cpu_init(), which trap_init() calls. Before trap_init(), | ||
928 | * the CPU runs at ring 0, so it is impossible to hit an invalid | ||
929 | * stack. Using the original stack works well enough at this | ||
930 | * early stage. DEBUG_STACK will be set up after cpu_init(), in | ||
931 | * trap_init(). | ||
932 | * | ||
933 | * We don't need to set trace_idt_table like set_intr_gate(), | ||
934 | * since we don't have trace_debug and it will be reset to | ||
935 | * 'debug' in trap_init() by set_intr_gate_ist(). | ||
936 | */ | ||
937 | set_intr_gate_notrace(X86_TRAP_DB, debug); | ||
929 | /* int3 can be called from all */ | 938 | /* int3 can be called from all */ |
930 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | 939 | set_system_intr_gate(X86_TRAP_BP, &int3); |
931 | #ifdef CONFIG_X86_32 | 940 | #ifdef CONFIG_X86_32 |
932 | set_intr_gate(X86_TRAP_PF, page_fault); | 941 | set_intr_gate(X86_TRAP_PF, page_fault); |
933 | #endif | 942 | #endif |
@@ -1005,6 +1014,15 @@ void __init trap_init(void) | |||
1005 | */ | 1014 | */ |
1006 | cpu_init(); | 1015 | cpu_init(); |
1007 | 1016 | ||
1017 | /* | ||
1018 | * X86_TRAP_DB and X86_TRAP_BP have been set | ||
1019 | * in early_trap_init(). However, IST works only after | ||
1020 | * cpu_init() loads TSS. See comments in early_trap_init(). | ||
1021 | */ | ||
1022 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | ||
1023 | /* int3 can be called from all */ | ||
1024 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | ||
1025 | |||
1008 | x86_init.irqs.trap_init(); | 1026 | x86_init.irqs.trap_init(); |
1009 | 1027 | ||
1010 | #ifdef CONFIG_X86_64 | 1028 | #ifdef CONFIG_X86_64 |
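Taken together, the early_trap_init() and trap_init() hunks leave the boot flow installing the #DB and #BP gates twice; a condensed, kernel-context sketch using only names from this patch:

    /* Stage 1: early_trap_init() - TSS not loaded, no IST; run on the
     * current stack. The CPU is still at ring 0 here, so no invalid
     * stack can be hit.
     */
    set_intr_gate_notrace(X86_TRAP_DB, debug);
    set_system_intr_gate(X86_TRAP_BP, &int3);   /* int3 callable from user mode */

    /* Stage 2: trap_init() - cpu_init() has loaded the TSS, so the gates
     * can now be re-installed with the dedicated DEBUG_STACK IST.
     */
    set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
    set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);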
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 81f8adb0679e..0b81ad67da07 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c | |||
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, | |||
912 | int ret = NOTIFY_DONE; | 912 | int ret = NOTIFY_DONE; |
913 | 913 | ||
914 | /* We are only interested in userspace traps */ | 914 | /* We are only interested in userspace traps */ |
915 | if (regs && !user_mode_vm(regs)) | 915 | if (regs && !user_mode(regs)) |
916 | return NOTIFY_DONE; | 916 | return NOTIFY_DONE; |
917 | 917 | ||
918 | switch (val) { | 918 | switch (val) { |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) | |||
150 | do_exit(SIGSEGV); | 150 | do_exit(SIGSEGV); |
151 | } | 151 | } |
152 | 152 | ||
153 | tss = &per_cpu(init_tss, get_cpu()); | 153 | tss = &per_cpu(cpu_tss, get_cpu()); |
154 | current->thread.sp0 = current->thread.saved_sp0; | 154 | current->thread.sp0 = current->thread.saved_sp0; |
155 | current->thread.sysenter_cs = __KERNEL_CS; | 155 | current->thread.sysenter_cs = __KERNEL_CS; |
156 | load_sp0(tss, ¤t->thread); | 156 | load_sp0(tss, ¤t->thread); |
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
318 | tsk->thread.saved_fs = info->regs32->fs; | 318 | tsk->thread.saved_fs = info->regs32->fs; |
319 | tsk->thread.saved_gs = get_user_gs(info->regs32); | 319 | tsk->thread.saved_gs = get_user_gs(info->regs32); |
320 | 320 | ||
321 | tss = &per_cpu(init_tss, get_cpu()); | 321 | tss = &per_cpu(cpu_tss, get_cpu()); |
322 | tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; | 322 | tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; |
323 | if (cpu_has_sep) | 323 | if (cpu_has_sep) |
324 | tsk->thread.sysenter_cs = 0; | 324 | tsk->thread.sysenter_cs = 0; |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..717908b16037 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void) | |||
868 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ | 868 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
869 | __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); | 869 | __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); |
870 | if (i != SYSCALL_VECTOR) | 870 | if (i != SYSCALL_VECTOR) |
871 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 871 | set_intr_gate(i, irq_entries_start + |
872 | 8 * (i - FIRST_EXTERNAL_VECTOR)); | ||
872 | } | 873 | } |
873 | 874 | ||
874 | /* | 875 | /* |
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss, | |||
1076 | { | 1077 | { |
1077 | lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, | 1078 | lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, |
1078 | THREAD_SIZE / PAGE_SIZE); | 1079 | THREAD_SIZE / PAGE_SIZE); |
1080 | tss->x86_tss.sp0 = thread->sp0; | ||
1079 | } | 1081 | } |
1080 | 1082 | ||
1081 | /* Let's just say, I wouldn't do debugging under a Guest. */ | 1083 | /* Let's just say, I wouldn't do debugging under a Guest. */ |
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index f5cc9eb1d51b..082a85167a5b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S | |||
@@ -13,16 +13,6 @@ | |||
13 | #include <asm/alternative-asm.h> | 13 | #include <asm/alternative-asm.h> |
14 | #include <asm/dwarf2.h> | 14 | #include <asm/dwarf2.h> |
15 | 15 | ||
16 | .macro SAVE reg | ||
17 | pushl_cfi %\reg | ||
18 | CFI_REL_OFFSET \reg, 0 | ||
19 | .endm | ||
20 | |||
21 | .macro RESTORE reg | ||
22 | popl_cfi %\reg | ||
23 | CFI_RESTORE \reg | ||
24 | .endm | ||
25 | |||
26 | .macro read64 reg | 16 | .macro read64 reg |
27 | movl %ebx, %eax | 17 | movl %ebx, %eax |
28 | movl %ecx, %edx | 18 | movl %ecx, %edx |
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8) | |||
67 | .macro addsub_return func ins insc | 57 | .macro addsub_return func ins insc |
68 | ENTRY(atomic64_\func\()_return_cx8) | 58 | ENTRY(atomic64_\func\()_return_cx8) |
69 | CFI_STARTPROC | 59 | CFI_STARTPROC |
70 | SAVE ebp | 60 | pushl_cfi_reg ebp |
71 | SAVE ebx | 61 | pushl_cfi_reg ebx |
72 | SAVE esi | 62 | pushl_cfi_reg esi |
73 | SAVE edi | 63 | pushl_cfi_reg edi |
74 | 64 | ||
75 | movl %eax, %esi | 65 | movl %eax, %esi |
76 | movl %edx, %edi | 66 | movl %edx, %edi |
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8) | |||
89 | 10: | 79 | 10: |
90 | movl %ebx, %eax | 80 | movl %ebx, %eax |
91 | movl %ecx, %edx | 81 | movl %ecx, %edx |
92 | RESTORE edi | 82 | popl_cfi_reg edi |
93 | RESTORE esi | 83 | popl_cfi_reg esi |
94 | RESTORE ebx | 84 | popl_cfi_reg ebx |
95 | RESTORE ebp | 85 | popl_cfi_reg ebp |
96 | ret | 86 | ret |
97 | CFI_ENDPROC | 87 | CFI_ENDPROC |
98 | ENDPROC(atomic64_\func\()_return_cx8) | 88 | ENDPROC(atomic64_\func\()_return_cx8) |
@@ -104,7 +94,7 @@ addsub_return sub sub sbb | |||
104 | .macro incdec_return func ins insc | 94 | .macro incdec_return func ins insc |
105 | ENTRY(atomic64_\func\()_return_cx8) | 95 | ENTRY(atomic64_\func\()_return_cx8) |
106 | CFI_STARTPROC | 96 | CFI_STARTPROC |
107 | SAVE ebx | 97 | pushl_cfi_reg ebx |
108 | 98 | ||
109 | read64 %esi | 99 | read64 %esi |
110 | 1: | 100 | 1: |
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8) | |||
119 | 10: | 109 | 10: |
120 | movl %ebx, %eax | 110 | movl %ebx, %eax |
121 | movl %ecx, %edx | 111 | movl %ecx, %edx |
122 | RESTORE ebx | 112 | popl_cfi_reg ebx |
123 | ret | 113 | ret |
124 | CFI_ENDPROC | 114 | CFI_ENDPROC |
125 | ENDPROC(atomic64_\func\()_return_cx8) | 115 | ENDPROC(atomic64_\func\()_return_cx8) |
@@ -130,7 +120,7 @@ incdec_return dec sub sbb | |||
130 | 120 | ||
131 | ENTRY(atomic64_dec_if_positive_cx8) | 121 | ENTRY(atomic64_dec_if_positive_cx8) |
132 | CFI_STARTPROC | 122 | CFI_STARTPROC |
133 | SAVE ebx | 123 | pushl_cfi_reg ebx |
134 | 124 | ||
135 | read64 %esi | 125 | read64 %esi |
136 | 1: | 126 | 1: |
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8) | |||
146 | 2: | 136 | 2: |
147 | movl %ebx, %eax | 137 | movl %ebx, %eax |
148 | movl %ecx, %edx | 138 | movl %ecx, %edx |
149 | RESTORE ebx | 139 | popl_cfi_reg ebx |
150 | ret | 140 | ret |
151 | CFI_ENDPROC | 141 | CFI_ENDPROC |
152 | ENDPROC(atomic64_dec_if_positive_cx8) | 142 | ENDPROC(atomic64_dec_if_positive_cx8) |
153 | 143 | ||
154 | ENTRY(atomic64_add_unless_cx8) | 144 | ENTRY(atomic64_add_unless_cx8) |
155 | CFI_STARTPROC | 145 | CFI_STARTPROC |
156 | SAVE ebp | 146 | pushl_cfi_reg ebp |
157 | SAVE ebx | 147 | pushl_cfi_reg ebx |
158 | /* these just push these two parameters on the stack */ | 148 | /* these just push these two parameters on the stack */ |
159 | SAVE edi | 149 | pushl_cfi_reg edi |
160 | SAVE ecx | 150 | pushl_cfi_reg ecx |
161 | 151 | ||
162 | movl %eax, %ebp | 152 | movl %eax, %ebp |
163 | movl %edx, %edi | 153 | movl %edx, %edi |
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8) | |||
179 | 3: | 169 | 3: |
180 | addl $8, %esp | 170 | addl $8, %esp |
181 | CFI_ADJUST_CFA_OFFSET -8 | 171 | CFI_ADJUST_CFA_OFFSET -8 |
182 | RESTORE ebx | 172 | popl_cfi_reg ebx |
183 | RESTORE ebp | 173 | popl_cfi_reg ebp |
184 | ret | 174 | ret |
185 | 4: | 175 | 4: |
186 | cmpl %edx, 4(%esp) | 176 | cmpl %edx, 4(%esp) |
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8) | |||
192 | 182 | ||
193 | ENTRY(atomic64_inc_not_zero_cx8) | 183 | ENTRY(atomic64_inc_not_zero_cx8) |
194 | CFI_STARTPROC | 184 | CFI_STARTPROC |
195 | SAVE ebx | 185 | pushl_cfi_reg ebx |
196 | 186 | ||
197 | read64 %esi | 187 | read64 %esi |
198 | 1: | 188 | 1: |
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8) | |||
209 | 199 | ||
210 | movl $1, %eax | 200 | movl $1, %eax |
211 | 3: | 201 | 3: |
212 | RESTORE ebx | 202 | popl_cfi_reg ebx |
213 | ret | 203 | ret |
214 | CFI_ENDPROC | 204 | CFI_ENDPROC |
215 | ENDPROC(atomic64_inc_not_zero_cx8) | 205 | ENDPROC(atomic64_inc_not_zero_cx8) |
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index e78b8eee6615..9bc944a91274 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S | |||
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) | |||
51 | */ | 51 | */ |
52 | ENTRY(csum_partial) | 52 | ENTRY(csum_partial) |
53 | CFI_STARTPROC | 53 | CFI_STARTPROC |
54 | pushl_cfi %esi | 54 | pushl_cfi_reg esi |
55 | CFI_REL_OFFSET esi, 0 | 55 | pushl_cfi_reg ebx |
56 | pushl_cfi %ebx | ||
57 | CFI_REL_OFFSET ebx, 0 | ||
58 | movl 20(%esp),%eax # Function arg: unsigned int sum | 56 | movl 20(%esp),%eax # Function arg: unsigned int sum |
59 | movl 16(%esp),%ecx # Function arg: int len | 57 | movl 16(%esp),%ecx # Function arg: int len |
60 | movl 12(%esp),%esi # Function arg: unsigned char *buff | 58 | movl 12(%esp),%esi # Function arg: unsigned char *buff |
@@ -127,14 +125,12 @@ ENTRY(csum_partial) | |||
127 | 6: addl %ecx,%eax | 125 | 6: addl %ecx,%eax |
128 | adcl $0, %eax | 126 | adcl $0, %eax |
129 | 7: | 127 | 7: |
130 | testl $1, 12(%esp) | 128 | testb $1, 12(%esp) |
131 | jz 8f | 129 | jz 8f |
132 | roll $8, %eax | 130 | roll $8, %eax |
133 | 8: | 131 | 8: |
134 | popl_cfi %ebx | 132 | popl_cfi_reg ebx |
135 | CFI_RESTORE ebx | 133 | popl_cfi_reg esi |
136 | popl_cfi %esi | ||
137 | CFI_RESTORE esi | ||
138 | ret | 134 | ret |
139 | CFI_ENDPROC | 135 | CFI_ENDPROC |
140 | ENDPROC(csum_partial) | 136 | ENDPROC(csum_partial) |
@@ -145,10 +141,8 @@ ENDPROC(csum_partial) | |||
145 | 141 | ||
146 | ENTRY(csum_partial) | 142 | ENTRY(csum_partial) |
147 | CFI_STARTPROC | 143 | CFI_STARTPROC |
148 | pushl_cfi %esi | 144 | pushl_cfi_reg esi |
149 | CFI_REL_OFFSET esi, 0 | 145 | pushl_cfi_reg ebx |
150 | pushl_cfi %ebx | ||
151 | CFI_REL_OFFSET ebx, 0 | ||
152 | movl 20(%esp),%eax # Function arg: unsigned int sum | 146 | movl 20(%esp),%eax # Function arg: unsigned int sum |
153 | movl 16(%esp),%ecx # Function arg: int len | 147 | movl 16(%esp),%ecx # Function arg: int len |
154 | movl 12(%esp),%esi # Function arg: const unsigned char *buf | 148 | movl 12(%esp),%esi # Function arg: const unsigned char *buf |
@@ -251,14 +245,12 @@ ENTRY(csum_partial) | |||
251 | addl %ebx,%eax | 245 | addl %ebx,%eax |
252 | adcl $0,%eax | 246 | adcl $0,%eax |
253 | 80: | 247 | 80: |
254 | testl $1, 12(%esp) | 248 | testb $1, 12(%esp) |
255 | jz 90f | 249 | jz 90f |
256 | roll $8, %eax | 250 | roll $8, %eax |
257 | 90: | 251 | 90: |
258 | popl_cfi %ebx | 252 | popl_cfi_reg ebx |
259 | CFI_RESTORE ebx | 253 | popl_cfi_reg esi |
260 | popl_cfi %esi | ||
261 | CFI_RESTORE esi | ||
262 | ret | 254 | ret |
263 | CFI_ENDPROC | 255 | CFI_ENDPROC |
264 | ENDPROC(csum_partial) | 256 | ENDPROC(csum_partial) |
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic) | |||
298 | CFI_STARTPROC | 290 | CFI_STARTPROC |
299 | subl $4,%esp | 291 | subl $4,%esp |
300 | CFI_ADJUST_CFA_OFFSET 4 | 292 | CFI_ADJUST_CFA_OFFSET 4 |
301 | pushl_cfi %edi | 293 | pushl_cfi_reg edi |
302 | CFI_REL_OFFSET edi, 0 | 294 | pushl_cfi_reg esi |
303 | pushl_cfi %esi | 295 | pushl_cfi_reg ebx |
304 | CFI_REL_OFFSET esi, 0 | ||
305 | pushl_cfi %ebx | ||
306 | CFI_REL_OFFSET ebx, 0 | ||
307 | movl ARGBASE+16(%esp),%eax # sum | 296 | movl ARGBASE+16(%esp),%eax # sum |
308 | movl ARGBASE+12(%esp),%ecx # len | 297 | movl ARGBASE+12(%esp),%ecx # len |
309 | movl ARGBASE+4(%esp),%esi # src | 298 | movl ARGBASE+4(%esp),%esi # src |
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) ) | |||
412 | 401 | ||
413 | .previous | 402 | .previous |
414 | 403 | ||
415 | popl_cfi %ebx | 404 | popl_cfi_reg ebx |
416 | CFI_RESTORE ebx | 405 | popl_cfi_reg esi |
417 | popl_cfi %esi | 406 | popl_cfi_reg edi |
418 | CFI_RESTORE esi | ||
419 | popl_cfi %edi | ||
420 | CFI_RESTORE edi | ||
421 | popl_cfi %ecx # equivalent to addl $4,%esp | 407 | popl_cfi %ecx # equivalent to addl $4,%esp |
422 | ret | 408 | ret |
423 | CFI_ENDPROC | 409 | CFI_ENDPROC |
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic) | |||
441 | 427 | ||
442 | ENTRY(csum_partial_copy_generic) | 428 | ENTRY(csum_partial_copy_generic) |
443 | CFI_STARTPROC | 429 | CFI_STARTPROC |
444 | pushl_cfi %ebx | 430 | pushl_cfi_reg ebx |
445 | CFI_REL_OFFSET ebx, 0 | 431 | pushl_cfi_reg edi |
446 | pushl_cfi %edi | 432 | pushl_cfi_reg esi |
447 | CFI_REL_OFFSET edi, 0 | ||
448 | pushl_cfi %esi | ||
449 | CFI_REL_OFFSET esi, 0 | ||
450 | movl ARGBASE+4(%esp),%esi #src | 433 | movl ARGBASE+4(%esp),%esi #src |
451 | movl ARGBASE+8(%esp),%edi #dst | 434 | movl ARGBASE+8(%esp),%edi #dst |
452 | movl ARGBASE+12(%esp),%ecx #len | 435 | movl ARGBASE+12(%esp),%ecx #len |
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) ) | |||
506 | jmp 7b | 489 | jmp 7b |
507 | .previous | 490 | .previous |
508 | 491 | ||
509 | popl_cfi %esi | 492 | popl_cfi_reg esi |
510 | CFI_RESTORE esi | 493 | popl_cfi_reg edi |
511 | popl_cfi %edi | 494 | popl_cfi_reg ebx |
512 | CFI_RESTORE edi | ||
513 | popl_cfi %ebx | ||
514 | CFI_RESTORE ebx | ||
515 | ret | 495 | ret |
516 | CFI_ENDPROC | 496 | CFI_ENDPROC |
517 | ENDPROC(csum_partial_copy_generic) | 497 | ENDPROC(csum_partial_copy_generic) |
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..e67e579c93bd 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S | |||
@@ -1,31 +1,35 @@ | |||
1 | #include <linux/linkage.h> | 1 | #include <linux/linkage.h> |
2 | #include <asm/dwarf2.h> | 2 | #include <asm/dwarf2.h> |
3 | #include <asm/cpufeature.h> | ||
3 | #include <asm/alternative-asm.h> | 4 | #include <asm/alternative-asm.h> |
4 | 5 | ||
5 | /* | 6 | /* |
6 | * Zero a page. | 7 | * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is |
7 | * rdi page | 8 | * recommended to use them when possible, and we do by default. |
8 | */ | 9 | * If enhanced REP MOVSB/STOSB is not available, try to use fast string. |
9 | ENTRY(clear_page_c) | 10 | * Otherwise, use original. |
11 | */ | ||
12 | |||
13 | /* | ||
14 | * Zero a page. | ||
15 | * %rdi - page | ||
16 | */ | ||
17 | ENTRY(clear_page) | ||
10 | CFI_STARTPROC | 18 | CFI_STARTPROC |
19 | |||
20 | ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ | ||
21 | "jmp clear_page_c_e", X86_FEATURE_ERMS | ||
22 | |||
11 | movl $4096/8,%ecx | 23 | movl $4096/8,%ecx |
12 | xorl %eax,%eax | 24 | xorl %eax,%eax |
13 | rep stosq | 25 | rep stosq |
14 | ret | 26 | ret |
15 | CFI_ENDPROC | 27 | CFI_ENDPROC |
16 | ENDPROC(clear_page_c) | 28 | ENDPROC(clear_page) |
17 | 29 | ||
18 | ENTRY(clear_page_c_e) | 30 | ENTRY(clear_page_orig) |
19 | CFI_STARTPROC | 31 | CFI_STARTPROC |
20 | movl $4096,%ecx | ||
21 | xorl %eax,%eax | ||
22 | rep stosb | ||
23 | ret | ||
24 | CFI_ENDPROC | ||
25 | ENDPROC(clear_page_c_e) | ||
26 | 32 | ||
27 | ENTRY(clear_page) | ||
28 | CFI_STARTPROC | ||
29 | xorl %eax,%eax | 33 | xorl %eax,%eax |
30 | movl $4096/64,%ecx | 34 | movl $4096/64,%ecx |
31 | .p2align 4 | 35 | .p2align 4 |
@@ -45,29 +49,13 @@ ENTRY(clear_page) | |||
45 | nop | 49 | nop |
46 | ret | 50 | ret |
47 | CFI_ENDPROC | 51 | CFI_ENDPROC |
48 | .Lclear_page_end: | 52 | ENDPROC(clear_page_orig) |
49 | ENDPROC(clear_page) | ||
50 | |||
51 | /* | ||
52 | * Some CPUs support enhanced REP MOVSB/STOSB instructions. | ||
53 | * It is recommended to use this when possible. | ||
54 | * If enhanced REP MOVSB/STOSB is not available, try to use fast string. | ||
55 | * Otherwise, use original function. | ||
56 | * | ||
57 | */ | ||
58 | 53 | ||
59 | #include <asm/cpufeature.h> | 54 | ENTRY(clear_page_c_e) |
60 | 55 | CFI_STARTPROC | |
61 | .section .altinstr_replacement,"ax" | 56 | movl $4096,%ecx |
62 | 1: .byte 0xeb /* jmp <disp8> */ | 57 | xorl %eax,%eax |
63 | .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ | 58 | rep stosb |
64 | 2: .byte 0xeb /* jmp <disp8> */ | 59 | ret |
65 | .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ | 60 | CFI_ENDPROC |
66 | 3: | 61 | ENDPROC(clear_page_c_e) |
67 | .previous | ||
68 | .section .altinstructions,"a" | ||
69 | altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ | ||
70 | .Lclear_page_end-clear_page, 2b-1b | ||
71 | altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ | ||
72 | .Lclear_page_end-clear_page,3b-2b | ||
73 | .previous | ||
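This clear_page rework sets the pattern that the copy_page, copy_user, memcpy and memset hunks below repeat: the common entry point starts with a jump that the alternatives framework rewrites once at boot, replacing the hand-rolled .altinstr_replacement tables. The net effect is equivalent to this C-style dispatch (illustrative only; clear_page_rep_stosq is a hypothetical name for the inline REP STOSQ body):

    void clear_page_dispatch(void *page)
    {
            if (boot_cpu_has(X86_FEATURE_ERMS))
                    clear_page_c_e(page);        /* "jmp clear_page_c_e" patched in   */
            else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
                    clear_page_rep_stosq(page);  /* default body: the jmp NOPped out  */
            else
                    clear_page_orig(page);       /* unpatched "jmp clear_page_orig"   */
    }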
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca67212b..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S | |||
@@ -2,23 +2,26 @@ | |||
2 | 2 | ||
3 | #include <linux/linkage.h> | 3 | #include <linux/linkage.h> |
4 | #include <asm/dwarf2.h> | 4 | #include <asm/dwarf2.h> |
5 | #include <asm/cpufeature.h> | ||
5 | #include <asm/alternative-asm.h> | 6 | #include <asm/alternative-asm.h> |
6 | 7 | ||
8 | /* | ||
9 | * Some CPUs run faster using the string copy instructions (sane microcode). | ||
10 | * It is also a lot simpler. Use this when possible. But don't use streaming | ||
11 | * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the | ||
12 | * prefetch distance based on SMP/UP. | ||
13 | */ | ||
7 | ALIGN | 14 | ALIGN |
8 | copy_page_rep: | 15 | ENTRY(copy_page) |
9 | CFI_STARTPROC | 16 | CFI_STARTPROC |
17 | ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD | ||
10 | movl $4096/8, %ecx | 18 | movl $4096/8, %ecx |
11 | rep movsq | 19 | rep movsq |
12 | ret | 20 | ret |
13 | CFI_ENDPROC | 21 | CFI_ENDPROC |
14 | ENDPROC(copy_page_rep) | 22 | ENDPROC(copy_page) |
15 | |||
16 | /* | ||
17 | * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. | ||
18 | * Could vary the prefetch distance based on SMP/UP. | ||
19 | */ | ||
20 | 23 | ||
21 | ENTRY(copy_page) | 24 | ENTRY(copy_page_regs) |
22 | CFI_STARTPROC | 25 | CFI_STARTPROC |
23 | subq $2*8, %rsp | 26 | subq $2*8, %rsp |
24 | CFI_ADJUST_CFA_OFFSET 2*8 | 27 | CFI_ADJUST_CFA_OFFSET 2*8 |
@@ -90,21 +93,5 @@ ENTRY(copy_page) | |||
90 | addq $2*8, %rsp | 93 | addq $2*8, %rsp |
91 | CFI_ADJUST_CFA_OFFSET -2*8 | 94 | CFI_ADJUST_CFA_OFFSET -2*8 |
92 | ret | 95 | ret |
93 | .Lcopy_page_end: | ||
94 | CFI_ENDPROC | 96 | CFI_ENDPROC |
95 | ENDPROC(copy_page) | 97 | ENDPROC(copy_page_regs) |
96 | |||
97 | /* Some CPUs run faster using the string copy instructions. | ||
98 | It is also a lot simpler. Use this when possible */ | ||
99 | |||
100 | #include <asm/cpufeature.h> | ||
101 | |||
102 | .section .altinstr_replacement,"ax" | ||
103 | 1: .byte 0xeb /* jmp <disp8> */ | ||
104 | .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ | ||
105 | 2: | ||
106 | .previous | ||
107 | .section .altinstructions,"a" | ||
108 | altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ | ||
109 | .Lcopy_page_end-copy_page, 2b-1b | ||
110 | .previous | ||
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d55594..fa997dfaef24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S | |||
@@ -8,9 +8,6 @@ | |||
8 | 8 | ||
9 | #include <linux/linkage.h> | 9 | #include <linux/linkage.h> |
10 | #include <asm/dwarf2.h> | 10 | #include <asm/dwarf2.h> |
11 | |||
12 | #define FIX_ALIGNMENT 1 | ||
13 | |||
14 | #include <asm/current.h> | 11 | #include <asm/current.h> |
15 | #include <asm/asm-offsets.h> | 12 | #include <asm/asm-offsets.h> |
16 | #include <asm/thread_info.h> | 13 | #include <asm/thread_info.h> |
@@ -19,33 +16,7 @@ | |||
19 | #include <asm/asm.h> | 16 | #include <asm/asm.h> |
20 | #include <asm/smap.h> | 17 | #include <asm/smap.h> |
21 | 18 | ||
22 | /* | ||
23 | * By placing feature2 after feature1 in altinstructions section, we logically | ||
24 | * implement: | ||
25 | * If CPU has feature2, jmp to alt2 is used | ||
26 | * else if CPU has feature1, jmp to alt1 is used | ||
27 | * else jmp to orig is used. | ||
28 | */ | ||
29 | .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 | ||
30 | 0: | ||
31 | .byte 0xe9 /* 32bit jump */ | ||
32 | .long \orig-1f /* by default jump to orig */ | ||
33 | 1: | ||
34 | .section .altinstr_replacement,"ax" | ||
35 | 2: .byte 0xe9 /* near jump with 32bit immediate */ | ||
36 | .long \alt1-1b /* offset */ /* or alternatively to alt1 */ | ||
37 | 3: .byte 0xe9 /* near jump with 32bit immediate */ | ||
38 | .long \alt2-1b /* offset */ /* or alternatively to alt2 */ | ||
39 | .previous | ||
40 | |||
41 | .section .altinstructions,"a" | ||
42 | altinstruction_entry 0b,2b,\feature1,5,5 | ||
43 | altinstruction_entry 0b,3b,\feature2,5,5 | ||
44 | .previous | ||
45 | .endm | ||
46 | |||
47 | .macro ALIGN_DESTINATION | 19 | .macro ALIGN_DESTINATION |
48 | #ifdef FIX_ALIGNMENT | ||
49 | /* check for bad alignment of destination */ | 20 | /* check for bad alignment of destination */ |
50 | movl %edi,%ecx | 21 | movl %edi,%ecx |
51 | andl $7,%ecx | 22 | andl $7,%ecx |
@@ -67,7 +38,6 @@ | |||
67 | 38 | ||
68 | _ASM_EXTABLE(100b,103b) | 39 | _ASM_EXTABLE(100b,103b) |
69 | _ASM_EXTABLE(101b,103b) | 40 | _ASM_EXTABLE(101b,103b) |
70 | #endif | ||
71 | .endm | 41 | .endm |
72 | 42 | ||
73 | /* Standard copy_to_user with segment limit checking */ | 43 | /* Standard copy_to_user with segment limit checking */ |
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user) | |||
79 | jc bad_to_user | 49 | jc bad_to_user |
80 | cmpq TI_addr_limit(%rax),%rcx | 50 | cmpq TI_addr_limit(%rax),%rcx |
81 | ja bad_to_user | 51 | ja bad_to_user |
82 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ | 52 | ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ |
83 | copy_user_generic_unrolled,copy_user_generic_string, \ | 53 | "jmp copy_user_generic_string", \ |
84 | copy_user_enhanced_fast_string | 54 | X86_FEATURE_REP_GOOD, \ |
55 | "jmp copy_user_enhanced_fast_string", \ | ||
56 | X86_FEATURE_ERMS | ||
85 | CFI_ENDPROC | 57 | CFI_ENDPROC |
86 | ENDPROC(_copy_to_user) | 58 | ENDPROC(_copy_to_user) |
87 | 59 | ||
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user) | |||
94 | jc bad_from_user | 66 | jc bad_from_user |
95 | cmpq TI_addr_limit(%rax),%rcx | 67 | cmpq TI_addr_limit(%rax),%rcx |
96 | ja bad_from_user | 68 | ja bad_from_user |
97 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ | 69 | ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ |
98 | copy_user_generic_unrolled,copy_user_generic_string, \ | 70 | "jmp copy_user_generic_string", \ |
99 | copy_user_enhanced_fast_string | 71 | X86_FEATURE_REP_GOOD, \ |
72 | "jmp copy_user_enhanced_fast_string", \ | ||
73 | X86_FEATURE_ERMS | ||
100 | CFI_ENDPROC | 74 | CFI_ENDPROC |
101 | ENDPROC(_copy_from_user) | 75 | ENDPROC(_copy_from_user) |
102 | 76 | ||
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 2419d5fefae3..9734182966f3 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S | |||
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic) | |||
196 | 196 | ||
197 | /* handle last odd byte */ | 197 | /* handle last odd byte */ |
198 | .Lhandle_1: | 198 | .Lhandle_1: |
199 | testl $1, %r10d | 199 | testb $1, %r10b |
200 | jz .Lende | 200 | jz .Lende |
201 | xorl %ebx, %ebx | 201 | xorl %ebx, %ebx |
202 | source | 202 | source |
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 1313ae6b478b..8f72b334aea0 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c | |||
@@ -52,6 +52,13 @@ | |||
52 | */ | 52 | */ |
53 | void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) | 53 | void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) |
54 | { | 54 | { |
55 | /* | ||
56 | * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid | ||
57 | * even if the input buffer is long enough to hold them. | ||
58 | */ | ||
59 | if (buf_len > MAX_INSN_SIZE) | ||
60 | buf_len = MAX_INSN_SIZE; | ||
61 | |||
55 | memset(insn, 0, sizeof(*insn)); | 62 | memset(insn, 0, sizeof(*insn)); |
56 | insn->kaddr = kaddr; | 63 | insn->kaddr = kaddr; |
57 | insn->end_kaddr = kaddr + buf_len; | 64 | insn->end_kaddr = kaddr + buf_len; |
@@ -164,6 +171,12 @@ found: | |||
164 | /* VEX.W overrides opnd_size */ | 171 | /* VEX.W overrides opnd_size */ |
165 | insn->opnd_bytes = 8; | 172 | insn->opnd_bytes = 8; |
166 | } else { | 173 | } else { |
174 | /* | ||
175 | * For VEX2, fake VEX3-like byte#2. | ||
176 | * Makes it easier to decode vex.W, vex.vvvv, | ||
177 | * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. | ||
178 | */ | ||
179 | insn->vex_prefix.bytes[2] = b2 & 0x7f; | ||
167 | insn->vex_prefix.nbytes = 2; | 180 | insn->vex_prefix.nbytes = 2; |
168 | insn->next_byte += 2; | 181 | insn->next_byte += 2; |
169 | } | 182 | } |
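The clamp matters because callers often pass the number of bytes available in a buffer rather than a real instruction length, and x86 instructions are never longer than 15 bytes. A kernel-context usage sketch (the wrapper is hypothetical; insn_init() and insn_get_length() are the decoder's real entry points):

    #include <asm/insn.h>

    static int decoded_insn_length(const void *kaddr, int buf_len)
    {
            struct insn insn;

            /* buf_len beyond MAX_INSN_SIZE is now clamped inside insn_init() */
            insn_init(&insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64));
            insn_get_length(&insn);         /* walks prefixes, opcode, operands */
            return insn.length;             /* 0 if the decode failed */
    }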
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9968e7..b046664f5a1c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S | |||
@@ -1,12 +1,20 @@ | |||
1 | /* Copyright 2002 Andi Kleen */ | 1 | /* Copyright 2002 Andi Kleen */ |
2 | 2 | ||
3 | #include <linux/linkage.h> | 3 | #include <linux/linkage.h> |
4 | |||
5 | #include <asm/cpufeature.h> | 4 | #include <asm/cpufeature.h> |
6 | #include <asm/dwarf2.h> | 5 | #include <asm/dwarf2.h> |
7 | #include <asm/alternative-asm.h> | 6 | #include <asm/alternative-asm.h> |
8 | 7 | ||
9 | /* | 8 | /* |
9 | * We build a jump to memcpy_orig by default, which gets NOPped out on | ||
10 | * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs | ||
11 | * that have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs | ||
12 | * are changed to a jmp to memcpy_erms, which does the REP; MOVSB copy. | ||
13 | */ | ||
14 | |||
15 | .weak memcpy | ||
16 | |||
17 | /* | ||
10 | * memcpy - Copy a memory block. | 18 | * memcpy - Copy a memory block. |
11 | * | 19 | * |
12 | * Input: | 20 | * Input: |
@@ -17,15 +25,11 @@ | |||
17 | * Output: | 25 | * Output: |
18 | * rax original destination | 26 | * rax original destination |
19 | */ | 27 | */ |
28 | ENTRY(__memcpy) | ||
29 | ENTRY(memcpy) | ||
30 | ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ | ||
31 | "jmp memcpy_erms", X86_FEATURE_ERMS | ||
20 | 32 | ||
21 | /* | ||
22 | * memcpy_c() - fast string ops (REP MOVSQ) based variant. | ||
23 | * | ||
24 | * This gets patched over the unrolled variant (below) via the | ||
25 | * alternative instructions framework: | ||
26 | */ | ||
27 | .section .altinstr_replacement, "ax", @progbits | ||
28 | .Lmemcpy_c: | ||
29 | movq %rdi, %rax | 33 | movq %rdi, %rax |
30 | movq %rdx, %rcx | 34 | movq %rdx, %rcx |
31 | shrq $3, %rcx | 35 | shrq $3, %rcx |
@@ -34,29 +38,21 @@ | |||
34 | movl %edx, %ecx | 38 | movl %edx, %ecx |
35 | rep movsb | 39 | rep movsb |
36 | ret | 40 | ret |
37 | .Lmemcpy_e: | 41 | ENDPROC(memcpy) |
38 | .previous | 42 | ENDPROC(__memcpy) |
39 | 43 | ||
40 | /* | 44 | /* |
41 | * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than | 45 | * memcpy_erms() - enhanced fast string memcpy. This is faster and |
42 | * memcpy_c. Use memcpy_c_e when possible. | 46 | * simpler than memcpy. Use memcpy_erms when possible. |
43 | * | ||
44 | * This gets patched over the unrolled variant (below) via the | ||
45 | * alternative instructions framework: | ||
46 | */ | 47 | */ |
47 | .section .altinstr_replacement, "ax", @progbits | 48 | ENTRY(memcpy_erms) |
48 | .Lmemcpy_c_e: | ||
49 | movq %rdi, %rax | 49 | movq %rdi, %rax |
50 | movq %rdx, %rcx | 50 | movq %rdx, %rcx |
51 | rep movsb | 51 | rep movsb |
52 | ret | 52 | ret |
53 | .Lmemcpy_e_e: | 53 | ENDPROC(memcpy_erms) |
54 | .previous | ||
55 | |||
56 | .weak memcpy | ||
57 | 54 | ||
58 | ENTRY(__memcpy) | 55 | ENTRY(memcpy_orig) |
59 | ENTRY(memcpy) | ||
60 | CFI_STARTPROC | 56 | CFI_STARTPROC |
61 | movq %rdi, %rax | 57 | movq %rdi, %rax |
62 | 58 | ||
@@ -183,26 +179,4 @@ ENTRY(memcpy) | |||
183 | .Lend: | 179 | .Lend: |
184 | retq | 180 | retq |
185 | CFI_ENDPROC | 181 | CFI_ENDPROC |
186 | ENDPROC(memcpy) | 182 | ENDPROC(memcpy_orig) |
187 | ENDPROC(__memcpy) | ||
188 | |||
189 | /* | ||
190 | * Some CPUs are adding enhanced REP MOVSB/STOSB feature | ||
191 | * If the feature is supported, memcpy_c_e() is the first choice. | ||
192 | * If enhanced rep movsb copy is not available, use fast string copy | ||
193 | * memcpy_c() when possible. This is faster and code is simpler than | ||
194 | * original memcpy(). | ||
195 | * Otherwise, original memcpy() is used. | ||
196 | * In .altinstructions section, ERMS feature is placed after REG_GOOD | ||
197 | * feature to implement the right patch order. | ||
198 | * | ||
199 | * Replace only beginning, memcpy is used to apply alternatives, | ||
200 | * so it is silly to overwrite itself with nops - reboot is the | ||
201 | * only outcome... | ||
202 | */ | ||
203 | .section .altinstructions, "a" | ||
204 | altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ | ||
205 | .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c | ||
206 | altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ | ||
207 | .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e | ||
208 | .previous | ||
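Note that the .weak memcpy directive survives the reshuffle and moves to the top: it keeps this assembly implementation a weak default, so an instrumented build (KASAN, for instance) can supply a strong memcpy that silently wins at link time. The same effect in C, as a standalone sketch:

    #include <stddef.h>

    /* Weak default; any strong definition of xmemcpy elsewhere replaces it. */
    __attribute__((weak)) void *xmemcpy(void *dst, const void *src, size_t n)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;

            while (n--)
                    *d++ = *s++;
            return dst;
    }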
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 9c4b530575da..0f8a0d0331b9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S | |||
@@ -5,7 +5,6 @@ | |||
5 | * This assembly file is re-written from memmove_64.c file. | 5 | * This assembly file is re-written from memmove_64.c file. |
6 | * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> | 6 | * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> |
7 | */ | 7 | */ |
8 | #define _STRING_C | ||
9 | #include <linux/linkage.h> | 8 | #include <linux/linkage.h> |
10 | #include <asm/dwarf2.h> | 9 | #include <asm/dwarf2.h> |
11 | #include <asm/cpufeature.h> | 10 | #include <asm/cpufeature.h> |
@@ -44,6 +43,8 @@ ENTRY(__memmove) | |||
44 | jg 2f | 43 | jg 2f |
45 | 44 | ||
46 | .Lmemmove_begin_forward: | 45 | .Lmemmove_begin_forward: |
46 | ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS | ||
47 | |||
47 | /* | 48 | /* |
48 | * the movsq instruction has a high startup latency, | 49 | * the movsq instruction has a high startup latency, |
49 | * so we handle small sizes with general registers. | 50 | * so we handle small sizes with general registers. |
@@ -207,21 +208,5 @@ ENTRY(__memmove) | |||
207 | 13: | 208 | 13: |
208 | retq | 209 | retq |
209 | CFI_ENDPROC | 210 | CFI_ENDPROC |
210 | |||
211 | .section .altinstr_replacement,"ax" | ||
212 | .Lmemmove_begin_forward_efs: | ||
213 | /* Forward moving data. */ | ||
214 | movq %rdx, %rcx | ||
215 | rep movsb | ||
216 | retq | ||
217 | .Lmemmove_end_forward_efs: | ||
218 | .previous | ||
219 | |||
220 | .section .altinstructions,"a" | ||
221 | altinstruction_entry .Lmemmove_begin_forward, \ | ||
222 | .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ | ||
223 | .Lmemmove_end_forward-.Lmemmove_begin_forward, \ | ||
224 | .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs | ||
225 | .previous | ||
226 | ENDPROC(__memmove) | 211 | ENDPROC(__memmove) |
227 | ENDPROC(memmove) | 212 | ENDPROC(memmove) |
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935c6a60..93118fb23976 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S | |||
@@ -5,19 +5,30 @@ | |||
5 | #include <asm/cpufeature.h> | 5 | #include <asm/cpufeature.h> |
6 | #include <asm/alternative-asm.h> | 6 | #include <asm/alternative-asm.h> |
7 | 7 | ||
8 | .weak memset | ||
9 | |||
8 | /* | 10 | /* |
9 | * ISO C memset - set a memory block to a byte value. This function uses fast | 11 | * ISO C memset - set a memory block to a byte value. This function uses fast |
10 | * string to get better performance than the original function. The code is | 12 | * string to get better performance than the original function. The code is |
11 | * simpler and shorter than the original function as well. | 13 | * simpler and shorter than the original function as well. |
12 | * | 14 | * |
13 | * rdi destination | 15 | * rdi destination |
14 | * rsi value (char) | 16 | * rsi value (char) |
15 | * rdx count (bytes) | 17 | * rdx count (bytes) |
16 | * | 18 | * |
17 | * rax original destination | 19 | * rax original destination |
18 | */ | 20 | */ |
19 | .section .altinstr_replacement, "ax", @progbits | 21 | ENTRY(memset) |
20 | .Lmemset_c: | 22 | ENTRY(__memset) |
23 | /* | ||
24 | * Some CPUs support the enhanced REP MOVSB/STOSB feature. It is recommended | ||
25 | * to use it when possible. If not available, use fast string instructions. | ||
26 | * | ||
27 | * Otherwise, use original memset function. | ||
28 | */ | ||
29 | ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ | ||
30 | "jmp memset_erms", X86_FEATURE_ERMS | ||
31 | |||
21 | movq %rdi,%r9 | 32 | movq %rdi,%r9 |
22 | movq %rdx,%rcx | 33 | movq %rdx,%rcx |
23 | andl $7,%edx | 34 | andl $7,%edx |
@@ -31,8 +42,8 @@ | |||
31 | rep stosb | 42 | rep stosb |
32 | movq %r9,%rax | 43 | movq %r9,%rax |
33 | ret | 44 | ret |
34 | .Lmemset_e: | 45 | ENDPROC(memset) |
35 | .previous | 46 | ENDPROC(__memset) |
36 | 47 | ||
37 | /* | 48 | /* |
38 | * ISO C memset - set a memory block to a byte value. This function uses | 49 | * ISO C memset - set a memory block to a byte value. This function uses |
@@ -45,21 +56,16 @@ | |||
45 | * | 56 | * |
46 | * rax original destination | 57 | * rax original destination |
47 | */ | 58 | */ |
48 | .section .altinstr_replacement, "ax", @progbits | 59 | ENTRY(memset_erms) |
49 | .Lmemset_c_e: | ||
50 | movq %rdi,%r9 | 60 | movq %rdi,%r9 |
51 | movb %sil,%al | 61 | movb %sil,%al |
52 | movq %rdx,%rcx | 62 | movq %rdx,%rcx |
53 | rep stosb | 63 | rep stosb |
54 | movq %r9,%rax | 64 | movq %r9,%rax |
55 | ret | 65 | ret |
56 | .Lmemset_e_e: | 66 | ENDPROC(memset_erms) |
57 | .previous | ||
58 | |||
59 | .weak memset | ||
60 | 67 | ||
61 | ENTRY(memset) | 68 | ENTRY(memset_orig) |
62 | ENTRY(__memset) | ||
63 | CFI_STARTPROC | 69 | CFI_STARTPROC |
64 | movq %rdi,%r10 | 70 | movq %rdi,%r10 |
65 | 71 | ||
@@ -134,23 +140,4 @@ ENTRY(__memset) | |||
134 | jmp .Lafter_bad_alignment | 140 | jmp .Lafter_bad_alignment |
135 | .Lfinal: | 141 | .Lfinal: |
136 | CFI_ENDPROC | 142 | CFI_ENDPROC |
137 | ENDPROC(memset) | 143 | ENDPROC(memset_orig) |
138 | ENDPROC(__memset) | ||
139 | |||
140 | /* Some CPUs support enhanced REP MOVSB/STOSB feature. | ||
141 | * It is recommended to use this when possible. | ||
142 | * | ||
143 | * If enhanced REP MOVSB/STOSB feature is not available, use fast string | ||
144 | * instructions. | ||
145 | * | ||
146 | * Otherwise, use original memset function. | ||
147 | * | ||
148 | * In .altinstructions section, ERMS feature is placed after REG_GOOD | ||
149 | * feature to implement the right patch order. | ||
150 | */ | ||
151 | .section .altinstructions,"a" | ||
152 | altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ | ||
153 | .Lfinal-__memset,.Lmemset_e-.Lmemset_c | ||
154 | altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ | ||
155 | .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e | ||
156 | .previous | ||
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index f6d13eefad10..3ca5218fbece 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S | |||
@@ -14,8 +14,8 @@ | |||
14 | .macro op_safe_regs op | 14 | .macro op_safe_regs op |
15 | ENTRY(\op\()_safe_regs) | 15 | ENTRY(\op\()_safe_regs) |
16 | CFI_STARTPROC | 16 | CFI_STARTPROC |
17 | pushq_cfi %rbx | 17 | pushq_cfi_reg rbx |
18 | pushq_cfi %rbp | 18 | pushq_cfi_reg rbp |
19 | movq %rdi, %r10 /* Save pointer */ | 19 | movq %rdi, %r10 /* Save pointer */ |
20 | xorl %r11d, %r11d /* Return value */ | 20 | xorl %r11d, %r11d /* Return value */ |
21 | movl (%rdi), %eax | 21 | movl (%rdi), %eax |
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs) | |||
35 | movl %ebp, 20(%r10) | 35 | movl %ebp, 20(%r10) |
36 | movl %esi, 24(%r10) | 36 | movl %esi, 24(%r10) |
37 | movl %edi, 28(%r10) | 37 | movl %edi, 28(%r10) |
38 | popq_cfi %rbp | 38 | popq_cfi_reg rbp |
39 | popq_cfi %rbx | 39 | popq_cfi_reg rbx |
40 | ret | 40 | ret |
41 | 3: | 41 | 3: |
42 | CFI_RESTORE_STATE | 42 | CFI_RESTORE_STATE |
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs) | |||
53 | .macro op_safe_regs op | 53 | .macro op_safe_regs op |
54 | ENTRY(\op\()_safe_regs) | 54 | ENTRY(\op\()_safe_regs) |
55 | CFI_STARTPROC | 55 | CFI_STARTPROC |
56 | pushl_cfi %ebx | 56 | pushl_cfi_reg ebx |
57 | pushl_cfi %ebp | 57 | pushl_cfi_reg ebp |
58 | pushl_cfi %esi | 58 | pushl_cfi_reg esi |
59 | pushl_cfi %edi | 59 | pushl_cfi_reg edi |
60 | pushl_cfi $0 /* Return value */ | 60 | pushl_cfi $0 /* Return value */ |
61 | pushl_cfi %eax | 61 | pushl_cfi %eax |
62 | movl 4(%eax), %ecx | 62 | movl 4(%eax), %ecx |
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs) | |||
80 | movl %esi, 24(%eax) | 80 | movl %esi, 24(%eax) |
81 | movl %edi, 28(%eax) | 81 | movl %edi, 28(%eax) |
82 | popl_cfi %eax | 82 | popl_cfi %eax |
83 | popl_cfi %edi | 83 | popl_cfi_reg edi |
84 | popl_cfi %esi | 84 | popl_cfi_reg esi |
85 | popl_cfi %ebp | 85 | popl_cfi_reg ebp |
86 | popl_cfi %ebx | 86 | popl_cfi_reg ebx |
87 | ret | 87 | ret |
88 | 3: | 88 | 3: |
89 | CFI_RESTORE_STATE | 89 | CFI_RESTORE_STATE |
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 5dff5f042468..2322abe4da3b 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S | |||
@@ -34,10 +34,10 @@ | |||
34 | */ | 34 | */ |
35 | 35 | ||
36 | #define save_common_regs \ | 36 | #define save_common_regs \ |
37 | pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 | 37 | pushl_cfi_reg ecx |
38 | 38 | ||
39 | #define restore_common_regs \ | 39 | #define restore_common_regs \ |
40 | popl_cfi %ecx; CFI_RESTORE ecx | 40 | popl_cfi_reg ecx |
41 | 41 | ||
42 | /* Avoid uglifying the argument copying x86-64 needs to do. */ | 42 | /* Avoid uglifying the argument copying x86-64 needs to do. */ |
43 | .macro movq src, dst | 43 | .macro movq src, dst |
@@ -64,22 +64,22 @@ | |||
64 | */ | 64 | */ |
65 | 65 | ||
66 | #define save_common_regs \ | 66 | #define save_common_regs \ |
67 | pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ | 67 | pushq_cfi_reg rdi; \ |
68 | pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ | 68 | pushq_cfi_reg rsi; \ |
69 | pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ | 69 | pushq_cfi_reg rcx; \ |
70 | pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ | 70 | pushq_cfi_reg r8; \ |
71 | pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ | 71 | pushq_cfi_reg r9; \ |
72 | pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ | 72 | pushq_cfi_reg r10; \ |
73 | pushq_cfi %r11; CFI_REL_OFFSET r11, 0 | 73 | pushq_cfi_reg r11 |
74 | 74 | ||
75 | #define restore_common_regs \ | 75 | #define restore_common_regs \ |
76 | popq_cfi %r11; CFI_RESTORE r11; \ | 76 | popq_cfi_reg r11; \ |
77 | popq_cfi %r10; CFI_RESTORE r10; \ | 77 | popq_cfi_reg r10; \ |
78 | popq_cfi %r9; CFI_RESTORE r9; \ | 78 | popq_cfi_reg r9; \ |
79 | popq_cfi %r8; CFI_RESTORE r8; \ | 79 | popq_cfi_reg r8; \ |
80 | popq_cfi %rcx; CFI_RESTORE rcx; \ | 80 | popq_cfi_reg rcx; \ |
81 | popq_cfi %rsi; CFI_RESTORE rsi; \ | 81 | popq_cfi_reg rsi; \ |
82 | popq_cfi %rdi; CFI_RESTORE rdi | 82 | popq_cfi_reg rdi |
83 | 83 | ||
84 | #endif | 84 | #endif |
85 | 85 | ||
@@ -87,12 +87,10 @@ | |||
87 | ENTRY(call_rwsem_down_read_failed) | 87 | ENTRY(call_rwsem_down_read_failed) |
88 | CFI_STARTPROC | 88 | CFI_STARTPROC |
89 | save_common_regs | 89 | save_common_regs |
90 | __ASM_SIZE(push,_cfi) %__ASM_REG(dx) | 90 | __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) |
91 | CFI_REL_OFFSET __ASM_REG(dx), 0 | ||
92 | movq %rax,%rdi | 91 | movq %rax,%rdi |
93 | call rwsem_down_read_failed | 92 | call rwsem_down_read_failed |
94 | __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) | 93 | __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) |
95 | CFI_RESTORE __ASM_REG(dx) | ||
96 | restore_common_regs | 94 | restore_common_regs |
97 | ret | 95 | ret |
98 | CFI_ENDPROC | 96 | CFI_ENDPROC |
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake) | |||
124 | ENTRY(call_rwsem_downgrade_wake) | 122 | ENTRY(call_rwsem_downgrade_wake) |
125 | CFI_STARTPROC | 123 | CFI_STARTPROC |
126 | save_common_regs | 124 | save_common_regs |
127 | __ASM_SIZE(push,_cfi) %__ASM_REG(dx) | 125 | __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) |
128 | CFI_REL_OFFSET __ASM_REG(dx), 0 | ||
129 | movq %rax,%rdi | 126 | movq %rax,%rdi |
130 | call rwsem_downgrade_wake | 127 | call rwsem_downgrade_wake |
131 | __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) | 128 | __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) |
132 | CFI_RESTORE __ASM_REG(dx) | ||
133 | restore_common_regs | 129 | restore_common_regs |
134 | ret | 130 | ret |
135 | CFI_ENDPROC | 131 | CFI_ENDPROC |
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index e28cdaf5ac2c..5eb715087b80 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S | |||
@@ -13,12 +13,9 @@ | |||
13 | .globl \name | 13 | .globl \name |
14 | \name: | 14 | \name: |
15 | CFI_STARTPROC | 15 | CFI_STARTPROC |
16 | pushl_cfi %eax | 16 | pushl_cfi_reg eax |
17 | CFI_REL_OFFSET eax, 0 | 17 | pushl_cfi_reg ecx |
18 | pushl_cfi %ecx | 18 | pushl_cfi_reg edx |
19 | CFI_REL_OFFSET ecx, 0 | ||
20 | pushl_cfi %edx | ||
21 | CFI_REL_OFFSET edx, 0 | ||
22 | 19 | ||
23 | .if \put_ret_addr_in_eax | 20 | .if \put_ret_addr_in_eax |
24 | /* Place EIP in the arg1 */ | 21 | /* Place EIP in the arg1 */ |
@@ -26,12 +23,9 @@ | |||
26 | .endif | 23 | .endif |
27 | 24 | ||
28 | call \func | 25 | call \func |
29 | popl_cfi %edx | 26 | popl_cfi_reg edx |
30 | CFI_RESTORE edx | 27 | popl_cfi_reg ecx |
31 | popl_cfi %ecx | 28 | popl_cfi_reg eax |
32 | CFI_RESTORE ecx | ||
33 | popl_cfi %eax | ||
34 | CFI_RESTORE eax | ||
35 | ret | 29 | ret |
36 | CFI_ENDPROC | 30 | CFI_ENDPROC |
37 | _ASM_NOKPROBE(\name) | 31 | _ASM_NOKPROBE(\name) |
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index b30b5ebd614a..f89ba4e93025 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S | |||
@@ -17,9 +17,18 @@ | |||
17 | CFI_STARTPROC | 17 | CFI_STARTPROC |
18 | 18 | ||
19 | /* this one pushes 9 elems, the next one would be %rIP */ | 19 | /* this one pushes 9 elems, the next one would be %rIP */ |
20 | SAVE_ARGS | 20 | pushq_cfi_reg rdi |
21 | pushq_cfi_reg rsi | ||
22 | pushq_cfi_reg rdx | ||
23 | pushq_cfi_reg rcx | ||
24 | pushq_cfi_reg rax | ||
25 | pushq_cfi_reg r8 | ||
26 | pushq_cfi_reg r9 | ||
27 | pushq_cfi_reg r10 | ||
28 | pushq_cfi_reg r11 | ||
21 | 29 | ||
22 | .if \put_ret_addr_in_rdi | 30 | .if \put_ret_addr_in_rdi |
31 | /* 9*8(%rsp) is return addr on stack */ | ||
23 | movq_cfi_restore 9*8, rdi | 32 | movq_cfi_restore 9*8, rdi |
24 | .endif | 33 | .endif |
25 | 34 | ||
@@ -45,11 +54,22 @@ | |||
45 | #endif | 54 | #endif |
46 | #endif | 55 | #endif |
47 | 56 | ||
48 | /* SAVE_ARGS below is used only for the .cfi directives it contains. */ | 57 | #if defined(CONFIG_TRACE_IRQFLAGS) \ |
58 | || defined(CONFIG_DEBUG_LOCK_ALLOC) \ | ||
59 | || defined(CONFIG_PREEMPT) | ||
49 | CFI_STARTPROC | 60 | CFI_STARTPROC |
50 | SAVE_ARGS | 61 | CFI_ADJUST_CFA_OFFSET 9*8 |
51 | restore: | 62 | restore: |
52 | RESTORE_ARGS | 63 | popq_cfi_reg r11 |
64 | popq_cfi_reg r10 | ||
65 | popq_cfi_reg r9 | ||
66 | popq_cfi_reg r8 | ||
67 | popq_cfi_reg rax | ||
68 | popq_cfi_reg rcx | ||
69 | popq_cfi_reg rdx | ||
70 | popq_cfi_reg rsi | ||
71 | popq_cfi_reg rdi | ||
53 | ret | 72 | ret |
54 | CFI_ENDPROC | 73 | CFI_ENDPROC |
55 | _ASM_NOKPROBE(restore) | 74 | _ASM_NOKPROBE(restore) |
75 | #endif | ||
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1a2be7c6895d..816488c0b97e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt | |||
@@ -273,6 +273,9 @@ dd: ESC | |||
273 | de: ESC | 273 | de: ESC |
274 | df: ESC | 274 | df: ESC |
275 | # 0xe0 - 0xef | 275 | # 0xe0 - 0xef |
276 | # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix | ||
277 | # in 64-bit mode. AMD CPUs accept the 0x66 prefix; it causes RIP truncation | ||
278 | # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. | ||
276 | e0: LOOPNE/LOOPNZ Jb (f64) | 279 | e0: LOOPNE/LOOPNZ Jb (f64) |
277 | e1: LOOPE/LOOPZ Jb (f64) | 280 | e1: LOOPE/LOOPZ Jb (f64) |
278 | e2: LOOP Jb (f64) | 281 | e2: LOOP Jb (f64) |
@@ -281,6 +284,10 @@ e4: IN AL,Ib | |||
281 | e5: IN eAX,Ib | 284 | e5: IN eAX,Ib |
282 | e6: OUT Ib,AL | 285 | e6: OUT Ib,AL |
283 | e7: OUT Ib,eAX | 286 | e7: OUT Ib,eAX |
287 | # With a 0x66 prefix in 64-bit mode, on AMD CPUs the immediate offset | ||
288 | # in "near" jumps and calls is 16-bit. For CALL, the | ||
289 | # push of the return address is 16-bit wide and RSP is decremented by 2, | ||
290 | # but RSP, unlike RIP, is not truncated to 16 bits. | ||
284 | e8: CALL Jz (f64) | 291 | e8: CALL Jz (f64) |
285 | e9: JMP-near Jz (f64) | 292 | e9: JMP-near Jz (f64) |
286 | ea: JMP-far Ap (i64) | 293 | ea: JMP-far Ap (i64) |
@@ -456,6 +463,7 @@ AVXcode: 1 | |||
456 | 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) | 463 | 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) |
457 | 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) | 464 | 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) |
458 | # 0x0f 0x80-0x8f | 465 | # 0x0f 0x80-0x8f |
466 | # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). | ||
459 | 80: JO Jz (f64) | 467 | 80: JO Jz (f64) |
460 | 81: JNO Jz (f64) | 468 | 81: JNO Jz (f64) |
461 | 82: JB/JC/JNAE Jz (f64) | 469 | 82: JB/JC/JNAE Jz (f64) |
@@ -842,6 +850,7 @@ EndTable | |||
842 | GrpTable: Grp5 | 850 | GrpTable: Grp5 |
843 | 0: INC Ev | 851 | 0: INC Ev |
844 | 1: DEC Ev | 852 | 1: DEC Ev |
853 | # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). | ||
845 | 2: CALLN Ev (f64) | 854 | 2: CALLN Ev (f64) |
846 | 3: CALLF Ep | 855 | 3: CALLF Ep |
847 | 4: JMPN Ev (f64) | 856 | 4: JMPN Ev (f64) |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ede025fb46f1..181c53bac3a7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs) | |||
59 | int ret = 0; | 59 | int ret = 0; |
60 | 60 | ||
61 | /* kprobe_running() needs smp_processor_id() */ | 61 | /* kprobe_running() needs smp_processor_id() */ |
62 | if (kprobes_built_in() && !user_mode_vm(regs)) { | 62 | if (kprobes_built_in() && !user_mode(regs)) { |
63 | preempt_disable(); | 63 | preempt_disable(); |
64 | if (kprobe_running() && kprobe_fault_handler(regs, 14)) | 64 | if (kprobe_running() && kprobe_fault_handler(regs, 14)) |
65 | ret = 1; | 65 | ret = 1; |
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | |||
148 | instr = (void *)convert_ip_to_linear(current, regs); | 148 | instr = (void *)convert_ip_to_linear(current, regs); |
149 | max_instr = instr + 15; | 149 | max_instr = instr + 15; |
150 | 150 | ||
151 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | 151 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) |
152 | return 0; | 152 | return 0; |
153 | 153 | ||
154 | while (instr < max_instr) { | 154 | while (instr < max_instr) { |
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) | |||
1035 | if (error_code & PF_USER) | 1035 | if (error_code & PF_USER) |
1036 | return false; | 1036 | return false; |
1037 | 1037 | ||
1038 | if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) | 1038 | if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) |
1039 | return false; | 1039 | return false; |
1040 | 1040 | ||
1041 | return true; | 1041 | return true; |
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, | |||
1140 | * User-mode registers count as a user access even for any | 1140 | * User-mode registers count as a user access even for any |
1141 | * potential system fault or CPU buglet: | 1141 | * potential system fault or CPU buglet: |
1142 | */ | 1142 | */ |
1143 | if (user_mode_vm(regs)) { | 1143 | if (user_mode(regs)) { |
1144 | local_irq_enable(); | 1144 | local_irq_enable(); |
1145 | error_code |= PF_USER; | 1145 | error_code |= PF_USER; |
1146 | flags |= FAULT_FLAG_USER; | 1146 | flags |= FAULT_FLAG_USER; |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a110efca6d06..52417e771af9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -179,7 +179,8 @@ static void __init probe_page_size_mask(void) | |||
179 | if (cpu_has_pge) { | 179 | if (cpu_has_pge) { |
180 | cr4_set_bits_and_update_boot(X86_CR4_PGE); | 180 | cr4_set_bits_and_update_boot(X86_CR4_PGE); |
181 | __supported_pte_mask |= _PAGE_GLOBAL; | 181 | __supported_pte_mask |= _PAGE_GLOBAL; |
182 | } | 182 | } else |
183 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
183 | } | 184 | } |
184 | 185 | ||
185 | #ifdef CONFIG_X86_32 | 186 | #ifdef CONFIG_X86_32 |
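The new else branch makes the mask honest on CPUs without PGE: __supported_pte_mask is ANDed into page protections throughout the mm code, so the global bit has to be filtered out there rather than trusted to never be set. A kernel-context sketch of the consumer side:

    /* Protection bits are sanitized through the mask before reaching a PTE. */
    pteval_t flags = _PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL;

    flags &= __supported_pte_mask;  /* drops _PAGE_GLOBAL when PGE is absent */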
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 5d04be5efb64..4e664bdb535a 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c | |||
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) | |||
111 | { | 111 | { |
112 | struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); | 112 | struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); |
113 | 113 | ||
114 | if (!user_mode_vm(regs)) { | 114 | if (!user_mode(regs)) { |
115 | unsigned long stack = kernel_stack_pointer(regs); | 115 | unsigned long stack = kernel_stack_pointer(regs); |
116 | if (depth) | 116 | if (depth) |
117 | dump_trace(NULL, regs, (unsigned long *)stack, 0, | 117 | dump_trace(NULL, regs, (unsigned long *)stack, 0, |
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3e32ed5648a0..757678fb26e1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -134,7 +134,7 @@ static void do_fpu_end(void) | |||
134 | static void fix_processor_context(void) | 134 | static void fix_processor_context(void) |
135 | { | 135 | { |
136 | int cpu = smp_processor_id(); | 136 | int cpu = smp_processor_id(); |
137 | struct tss_struct *t = &per_cpu(init_tss, cpu); | 137 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); |
138 | #ifdef CONFIG_X86_64 | 138 | #ifdef CONFIG_X86_64 |
139 | struct desc_struct *desc = get_cpu_gdt_table(cpu); | 139 | struct desc_struct *desc = get_cpu_gdt_table(cpu); |
140 | tss_desc tss; | 140 | tss_desc tss; |
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index b3560ece1c9f..ef8187f9d28d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl | |||
@@ -119,7 +119,7 @@ | |||
119 | 110 i386 iopl sys_iopl | 119 | 110 i386 iopl sys_iopl |
120 | 111 i386 vhangup sys_vhangup | 120 | 111 i386 vhangup sys_vhangup |
121 | 112 i386 idle | 121 | 112 i386 idle |
122 | 113 i386 vm86old sys_vm86old sys32_vm86_warning | 122 | 113 i386 vm86old sys_vm86old sys_ni_syscall |
123 | 114 i386 wait4 sys_wait4 compat_sys_wait4 | 123 | 114 i386 wait4 sys_wait4 compat_sys_wait4 |
124 | 115 i386 swapoff sys_swapoff | 124 | 115 i386 swapoff sys_swapoff |
125 | 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo | 125 | 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo |
@@ -172,7 +172,7 @@ | |||
172 | 163 i386 mremap sys_mremap | 172 | 163 i386 mremap sys_mremap |
173 | 164 i386 setresuid sys_setresuid16 | 173 | 164 i386 setresuid sys_setresuid16 |
174 | 165 i386 getresuid sys_getresuid16 | 174 | 165 i386 getresuid sys_getresuid16 |
175 | 166 i386 vm86 sys_vm86 sys32_vm86_warning | 175 | 166 i386 vm86 sys_vm86 sys_ni_syscall |
176 | 167 i386 query_module | 176 | 167 i386 query_module |
177 | 168 i386 poll sys_poll | 177 | 168 i386 poll sys_poll |
178 | 169 i386 nfsservctl | 178 | 169 i386 nfsservctl |
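Pointing the vm86old/vm86 compat columns at sys_ni_syscall drops the old "warning" wrappers: a 64-bit kernel now simply fails these calls. sys_ni_syscall is the kernel's standard stub for unimplemented syscall slots, and amounts to (a sketch of its well-known behavior):

	/* Every unwired syscall slot lands here and fails cleanly. */
	asmlinkage long sys_ni_syscall(void)
	{
		return -ENOSYS;
	}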
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 8d656fbb57aa..9ef32d5f1b19 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl | |||
@@ -178,7 +178,7 @@ | |||
178 | 169 common reboot sys_reboot | 178 | 169 common reboot sys_reboot |
179 | 170 common sethostname sys_sethostname | 179 | 170 common sethostname sys_sethostname |
180 | 171 common setdomainname sys_setdomainname | 180 | 171 common setdomainname sys_setdomainname |
181 | 172 common iopl stub_iopl | 181 | 172 common iopl sys_iopl |
182 | 173 common ioperm sys_ioperm | 182 | 173 common ioperm sys_ioperm |
183 | 174 64 create_module | 183 | 174 64 create_module |
184 | 175 common init_module sys_init_module | 184 | 175 common init_module sys_init_module |
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1f5b53..8ffd2146fa6a 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h | |||
@@ -64,8 +64,8 @@ | |||
64 | */ | 64 | */ |
65 | static inline void rdtsc_barrier(void) | 65 | static inline void rdtsc_barrier(void) |
66 | { | 66 | { |
67 | alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); | 67 | alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, |
68 | alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); | 68 | "lfence", X86_FEATURE_LFENCE_RDTSC); |
69 | } | 69 | } |
70 | 70 | ||
71 | #endif | 71 | #endif |
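The barrier.h hunk folds two single-choice alternative() sites into one alternative_2() whose default body is empty instead of a 3-byte NOP, so CPUs that need no fence execute nothing at all. Conceptually, the patched code behaves like the following sketch (cpu_has_feature() is a stand-in name here, and the real mechanism is boot-time code patching, not a runtime branch):

	static inline void rdtsc_barrier_concept(void)
	{
		if (cpu_has_feature(X86_FEATURE_MFENCE_RDTSC))
			asm volatile("mfence" ::: "memory");
		else if (cpu_has_feature(X86_FEATURE_LFENCE_RDTSC))
			asm volatile("lfence" ::: "memory");
		/* else: no serialization needed; the "" default stays. */
	}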
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5cdfa9db2217..a75d8700472a 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c | |||
@@ -16,7 +16,7 @@ | |||
16 | */ | 16 | */ |
17 | 17 | ||
18 | /* Not going to be implemented by UML, since we have no hardware. */ | 18 | /* Not going to be implemented by UML, since we have no hardware. */ |
19 | #define stub_iopl sys_ni_syscall | 19 | #define sys_iopl sys_ni_syscall |
20 | #define sys_ioperm sys_ni_syscall | 20 | #define sys_ioperm sys_ni_syscall |
21 | 21 | ||
22 | /* | 22 | /* |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5240f563076d..81665c9f2132 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss, | |||
912 | mcs = xen_mc_entry(0); | 912 | mcs = xen_mc_entry(0); |
913 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); | 913 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); |
914 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 914 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
915 | tss->x86_tss.sp0 = thread->sp0; | ||
915 | } | 916 | } |
916 | 917 | ||
917 | static void xen_set_iopl_mask(unsigned mask) | 918 | static void xen_set_iopl_mask(unsigned mask) |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 08e8489c47f1..7413ee3706d0 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -445,15 +445,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) | |||
445 | { | 445 | { |
446 | int rc; | 446 | int rc; |
447 | 447 | ||
448 | per_cpu(current_task, cpu) = idle; | 448 | common_cpu_up(cpu, idle); |
449 | #ifdef CONFIG_X86_32 | ||
450 | irq_ctx_init(cpu); | ||
451 | #else | ||
452 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
453 | #endif | ||
454 | per_cpu(kernel_stack, cpu) = | ||
455 | (unsigned long)task_stack_page(idle) - | ||
456 | KERNEL_STACK_OFFSET + THREAD_SIZE; | ||
457 | 449 | ||
458 | xen_setup_runstate_info(cpu); | 450 | xen_setup_runstate_info(cpu); |
459 | xen_setup_timer(cpu); | 451 | xen_setup_timer(cpu); |
@@ -468,10 +460,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) | |||
468 | if (rc) | 460 | if (rc) |
469 | return rc; | 461 | return rc; |
470 | 462 | ||
471 | if (num_online_cpus() == 1) | ||
472 | /* Just in case we booted with a single CPU. */ | ||
473 | alternatives_enable_smp(); | ||
474 | |||
475 | rc = xen_smp_intr_init(cpu); | 463 | rc = xen_smp_intr_init(cpu); |
476 | if (rc) | 464 | if (rc) |
477 | return rc; | 465 | return rc; |
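The xen_cpu_up() hunks replace open-coded per-CPU bring-up with a shared common_cpu_up() helper. Judging only from the lines removed here (including the dropped alternatives_enable_smp() call), the helper presumably looks something like this hypothetical reconstruction; the real definition lives in arch/x86/kernel/smpboot.c and may differ:

	/* Hypothetical reconstruction from the removed lines above. */
	void common_cpu_up(unsigned int cpu, struct task_struct *idle)
	{
		alternatives_enable_smp();
		per_cpu(current_task, cpu) = idle;
	#ifdef CONFIG_X86_32
		irq_ctx_init(cpu);
	#else
		clear_tsk_thread_flag(idle, TIF_FORK);
	#endif
		per_cpu(kernel_stack, cpu) =
			(unsigned long)task_stack_page(idle) -
			KERNEL_STACK_OFFSET + THREAD_SIZE;
	}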
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 53adefda4275..985fc3ee0973 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S | |||
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64) | |||
68 | * We're already on the usermode stack at this point, but | 68 | * We're already on the usermode stack at this point, but |
69 | * still with the kernel gs, so we can easily switch back | 69 | * still with the kernel gs, so we can easily switch back |
70 | */ | 70 | */ |
71 | movq %rsp, PER_CPU_VAR(old_rsp) | 71 | movq %rsp, PER_CPU_VAR(rsp_scratch) |
72 | movq PER_CPU_VAR(kernel_stack), %rsp | 72 | movq PER_CPU_VAR(kernel_stack), %rsp |
73 | 73 | ||
74 | pushq $__USER_DS | 74 | pushq $__USER_DS |
75 | pushq PER_CPU_VAR(old_rsp) | 75 | pushq PER_CPU_VAR(rsp_scratch) |
76 | pushq %r11 | 76 | pushq %r11 |
77 | pushq $__USER_CS | 77 | pushq $__USER_CS |
78 | pushq %rcx | 78 | pushq %rcx |
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32) | |||
87 | * We're already on the usermode stack at this point, but | 87 | * We're already on the usermode stack at this point, but |
88 | * still with the kernel gs, so we can easily switch back | 88 | * still with the kernel gs, so we can easily switch back |
89 | */ | 89 | */ |
90 | movq %rsp, PER_CPU_VAR(old_rsp) | 90 | movq %rsp, PER_CPU_VAR(rsp_scratch) |
91 | movq PER_CPU_VAR(kernel_stack), %rsp | 91 | movq PER_CPU_VAR(kernel_stack), %rsp |
92 | 92 | ||
93 | pushq $__USER32_DS | 93 | pushq $__USER32_DS |
94 | pushq PER_CPU_VAR(old_rsp) | 94 | pushq PER_CPU_VAR(rsp_scratch) |
95 | pushq %r11 | 95 | pushq %r11 |
96 | pushq $__USER32_CS | 96 | pushq $__USER32_CS |
97 | pushq %rcx | 97 | pushq %rcx |
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c index 82dc5748f873..7f327121e6d7 100644 --- a/drivers/misc/sgi-xp/xpc_main.c +++ b/drivers/misc/sgi-xp/xpc_main.c | |||
@@ -1210,7 +1210,7 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args) | |||
1210 | 1210 | ||
1211 | if (((die_args->trapnr == X86_TRAP_MF) || | 1211 | if (((die_args->trapnr == X86_TRAP_MF) || |
1212 | (die_args->trapnr == X86_TRAP_XF)) && | 1212 | (die_args->trapnr == X86_TRAP_XF)) && |
1213 | !user_mode_vm(die_args->regs)) | 1213 | !user_mode(die_args->regs)) |
1214 | xpc_die_deactivate(); | 1214 | xpc_die_deactivate(); |
1215 | 1215 | ||
1216 | break; | 1216 | break; |
diff --git a/include/linux/stddef.h b/include/linux/stddef.h index f4aec0e75c3a..076af437284d 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h | |||
@@ -19,3 +19,12 @@ enum { | |||
19 | #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) | 19 | #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) |
20 | #endif | 20 | #endif |
21 | #endif | 21 | #endif |
22 | |||
23 | /** | ||
24 | * offsetofend(TYPE, MEMBER) | ||
25 | * | ||
26 | * @TYPE: The type of the structure | ||
27 | * @MEMBER: The member within the structure to get the end offset of | ||
28 | */ | ||
29 | #define offsetofend(TYPE, MEMBER) \ | ||
30 | (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) | ||
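Promoting offsetofend() from vfio.h (removed below) into stddef.h makes it generally available. Its typical use is validating variable-sized structures copied from user space: the caller reports its buffer size in an argsz field, and the kernel only reads fields the buffer actually covers. A small usage sketch; the struct and field names here are made up for illustration:

	#include <linux/errno.h>
	#include <linux/stddef.h>
	#include <linux/types.h>

	struct demo_arg {		/* hypothetical UAPI struct */
		__u32 argsz;		/* caller-reported size */
		__u32 flags;
		__u64 addr;		/* added in a later revision */
	};

	/* Accept only buffers that include 'flags'; report whether the
	 * newer 'addr' field is present as well. */
	static int demo_check(const struct demo_arg *arg, bool *has_addr)
	{
		if (arg->argsz < offsetofend(struct demo_arg, flags))
			return -EINVAL;
		*has_addr = arg->argsz >= offsetofend(struct demo_arg, addr);
		return 0;
	}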
diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 2d67b8998fd8..049b2f497bc7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h | |||
@@ -78,19 +78,6 @@ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); | |||
78 | extern void vfio_unregister_iommu_driver( | 78 | extern void vfio_unregister_iommu_driver( |
79 | const struct vfio_iommu_driver_ops *ops); | 79 | const struct vfio_iommu_driver_ops *ops); |
80 | 80 | ||
81 | /** | ||
82 | * offsetofend(TYPE, MEMBER) | ||
83 | * | ||
84 | * @TYPE: The type of the structure | ||
85 | * @MEMBER: The member within the structure to get the end offset of | ||
86 | * | ||
87 | * Simple helper macro for dealing with variable sized structures passed | ||
88 | * from user space. This allows us to easily determine if the provided | ||
89 | * structure is sized to include various fields. | ||
90 | */ | ||
91 | #define offsetofend(TYPE, MEMBER) \ | ||
92 | (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) | ||
93 | |||
94 | /* | 81 | /* |
95 | * External user API | 82 | * External user API |
96 | */ | 83 | */ |
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index d66ab799b35f..8c0c1a2770c8 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h | |||
@@ -1,12 +1,12 @@ | |||
1 | 1 | ||
2 | MEMCPY_FN(__memcpy, | 2 | MEMCPY_FN(memcpy_orig, |
3 | "x86-64-unrolled", | 3 | "x86-64-unrolled", |
4 | "unrolled memcpy() in arch/x86/lib/memcpy_64.S") | 4 | "unrolled memcpy() in arch/x86/lib/memcpy_64.S") |
5 | 5 | ||
6 | MEMCPY_FN(memcpy_c, | 6 | MEMCPY_FN(__memcpy, |
7 | "x86-64-movsq", | 7 | "x86-64-movsq", |
8 | "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") | 8 | "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") |
9 | 9 | ||
10 | MEMCPY_FN(memcpy_c_e, | 10 | MEMCPY_FN(memcpy_erms, |
11 | "x86-64-movsb", | 11 | "x86-64-movsb", |
12 | "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") | 12 | "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") |
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index fcd9cf00600a..e4c2c30143b9 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S | |||
@@ -1,8 +1,6 @@ | |||
1 | #define memcpy MEMCPY /* don't hide glibc's memcpy() */ | 1 | #define memcpy MEMCPY /* don't hide glibc's memcpy() */ |
2 | #define altinstr_replacement text | 2 | #define altinstr_replacement text |
3 | #define globl p2align 4; .globl | 3 | #define globl p2align 4; .globl |
4 | #define Lmemcpy_c globl memcpy_c; memcpy_c | ||
5 | #define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e | ||
6 | #include "../../../arch/x86/lib/memcpy_64.S" | 4 | #include "../../../arch/x86/lib/memcpy_64.S" |
7 | /* | 5 | /* |
8 | * We need to provide note.GNU-stack section, saying that we want | 6 | * We need to provide note.GNU-stack section, saying that we want |
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index db1d3a29d97f..d3dfb7936dcd 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c | |||
@@ -36,7 +36,7 @@ static const struct option options[] = { | |||
36 | "Specify length of memory to copy. " | 36 | "Specify length of memory to copy. " |
37 | "Available units: B, KB, MB, GB and TB (upper and lower)"), | 37 | "Available units: B, KB, MB, GB and TB (upper and lower)"), |
38 | OPT_STRING('r', "routine", &routine, "default", | 38 | OPT_STRING('r', "routine", &routine, "default", |
39 | "Specify routine to copy"), | 39 | "Specify routine to copy, \"all\" runs all available routines"), |
40 | OPT_INTEGER('i', "iterations", &iterations, | 40 | OPT_INTEGER('i', "iterations", &iterations, |
41 | "repeat memcpy() invocation this number of times"), | 41 | "repeat memcpy() invocation this number of times"), |
42 | OPT_BOOLEAN('c', "cycle", &use_cycle, | 42 | OPT_BOOLEAN('c', "cycle", &use_cycle, |
@@ -135,55 +135,16 @@ struct bench_mem_info { | |||
135 | const char *const *usage; | 135 | const char *const *usage; |
136 | }; | 136 | }; |
137 | 137 | ||
138 | static int bench_mem_common(int argc, const char **argv, | 138 | static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen) |
139 | const char *prefix __maybe_unused, | ||
140 | struct bench_mem_info *info) | ||
141 | { | 139 | { |
142 | int i; | 140 | const struct routine *r = &info->routines[r_idx]; |
143 | size_t len; | ||
144 | double totallen; | ||
145 | double result_bps[2]; | 141 | double result_bps[2]; |
146 | u64 result_cycle[2]; | 142 | u64 result_cycle[2]; |
147 | 143 | ||
148 | argc = parse_options(argc, argv, options, | ||
149 | info->usage, 0); | ||
150 | |||
151 | if (no_prefault && only_prefault) { | ||
152 | fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); | ||
153 | return 1; | ||
154 | } | ||
155 | |||
156 | if (use_cycle) | ||
157 | init_cycle(); | ||
158 | |||
159 | len = (size_t)perf_atoll((char *)length_str); | ||
160 | totallen = (double)len * iterations; | ||
161 | |||
162 | result_cycle[0] = result_cycle[1] = 0ULL; | 144 | result_cycle[0] = result_cycle[1] = 0ULL; |
163 | result_bps[0] = result_bps[1] = 0.0; | 145 | result_bps[0] = result_bps[1] = 0.0; |
164 | 146 | ||
165 | if ((s64)len <= 0) { | 147 | printf("Routine %s (%s)\n", r->name, r->desc); |
166 | fprintf(stderr, "Invalid length:%s\n", length_str); | ||
167 | return 1; | ||
168 | } | ||
169 | |||
170 | /* same to without specifying either of prefault and no-prefault */ | ||
171 | if (only_prefault && no_prefault) | ||
172 | only_prefault = no_prefault = false; | ||
173 | |||
174 | for (i = 0; info->routines[i].name; i++) { | ||
175 | if (!strcmp(info->routines[i].name, routine)) | ||
176 | break; | ||
177 | } | ||
178 | if (!info->routines[i].name) { | ||
179 | printf("Unknown routine:%s\n", routine); | ||
180 | printf("Available routines...\n"); | ||
181 | for (i = 0; info->routines[i].name; i++) { | ||
182 | printf("\t%s ... %s\n", | ||
183 | info->routines[i].name, info->routines[i].desc); | ||
184 | } | ||
185 | return 1; | ||
186 | } | ||
187 | 148 | ||
188 | if (bench_format == BENCH_FORMAT_DEFAULT) | 149 | if (bench_format == BENCH_FORMAT_DEFAULT) |
189 | printf("# Copying %s Bytes ...\n\n", length_str); | 150 | printf("# Copying %s Bytes ...\n\n", length_str); |
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv, | |||
191 | if (!only_prefault && !no_prefault) { | 152 | if (!only_prefault && !no_prefault) { |
192 | /* show both of results */ | 153 | /* show both of results */ |
193 | if (use_cycle) { | 154 | if (use_cycle) { |
194 | result_cycle[0] = | 155 | result_cycle[0] = info->do_cycle(r, len, false); |
195 | info->do_cycle(&info->routines[i], len, false); | 156 | result_cycle[1] = info->do_cycle(r, len, true); |
196 | result_cycle[1] = | ||
197 | info->do_cycle(&info->routines[i], len, true); | ||
198 | } else { | 157 | } else { |
199 | result_bps[0] = | 158 | result_bps[0] = info->do_gettimeofday(r, len, false); |
200 | info->do_gettimeofday(&info->routines[i], | 159 | result_bps[1] = info->do_gettimeofday(r, len, true); |
201 | len, false); | ||
202 | result_bps[1] = | ||
203 | info->do_gettimeofday(&info->routines[i], | ||
204 | len, true); | ||
205 | } | 160 | } |
206 | } else { | 161 | } else { |
207 | if (use_cycle) { | 162 | if (use_cycle) |
208 | result_cycle[pf] = | 163 | result_cycle[pf] = info->do_cycle(r, len, only_prefault); |
209 | info->do_cycle(&info->routines[i], | 164 | else |
210 | len, only_prefault); | 165 | result_bps[pf] = info->do_gettimeofday(r, len, only_prefault); |
211 | } else { | ||
212 | result_bps[pf] = | ||
213 | info->do_gettimeofday(&info->routines[i], | ||
214 | len, only_prefault); | ||
215 | } | ||
216 | } | 166 | } |
217 | 167 | ||
218 | switch (bench_format) { | 168 | switch (bench_format) { |
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv, | |||
265 | die("unknown format: %d\n", bench_format); | 215 | die("unknown format: %d\n", bench_format); |
266 | break; | 216 | break; |
267 | } | 217 | } |
218 | } | ||
219 | |||
220 | static int bench_mem_common(int argc, const char **argv, | ||
221 | const char *prefix __maybe_unused, | ||
222 | struct bench_mem_info *info) | ||
223 | { | ||
224 | int i; | ||
225 | size_t len; | ||
226 | double totallen; | ||
227 | |||
228 | argc = parse_options(argc, argv, options, | ||
229 | info->usage, 0); | ||
230 | |||
231 | if (no_prefault && only_prefault) { | ||
232 | fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); | ||
233 | return 1; | ||
234 | } | ||
235 | |||
236 | if (use_cycle) | ||
237 | init_cycle(); | ||
238 | |||
239 | len = (size_t)perf_atoll((char *)length_str); | ||
240 | totallen = (double)len * iterations; | ||
241 | |||
242 | if ((s64)len <= 0) { | ||
243 | fprintf(stderr, "Invalid length:%s\n", length_str); | ||
244 | return 1; | ||
245 | } | ||
246 | |||
247 | /* same as not specifying either prefault or no-prefault */ | ||
248 | if (only_prefault && no_prefault) | ||
249 | only_prefault = no_prefault = false; | ||
250 | |||
251 | if (!strncmp(routine, "all", 3)) { | ||
252 | for (i = 0; info->routines[i].name; i++) | ||
253 | __bench_mem_routine(info, i, len, totallen); | ||
254 | return 0; | ||
255 | } | ||
256 | |||
257 | for (i = 0; info->routines[i].name; i++) { | ||
258 | if (!strcmp(info->routines[i].name, routine)) | ||
259 | break; | ||
260 | } | ||
261 | if (!info->routines[i].name) { | ||
262 | printf("Unknown routine:%s\n", routine); | ||
263 | printf("Available routines...\n"); | ||
264 | for (i = 0; info->routines[i].name; i++) { | ||
265 | printf("\t%s ... %s\n", | ||
266 | info->routines[i].name, info->routines[i].desc); | ||
267 | } | ||
268 | return 1; | ||
269 | } | ||
270 | |||
271 | __bench_mem_routine(info, i, len, totallen); | ||
268 | 272 | ||
269 | return 0; | 273 | return 0; |
270 | } | 274 | } |
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index a71dff97c1f5..f02d028771d9 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h | |||
@@ -1,12 +1,12 @@ | |||
1 | 1 | ||
2 | MEMSET_FN(__memset, | 2 | MEMSET_FN(memset_orig, |
3 | "x86-64-unrolled", | 3 | "x86-64-unrolled", |
4 | "unrolled memset() in arch/x86/lib/memset_64.S") | 4 | "unrolled memset() in arch/x86/lib/memset_64.S") |
5 | 5 | ||
6 | MEMSET_FN(memset_c, | 6 | MEMSET_FN(__memset, |
7 | "x86-64-stosq", | 7 | "x86-64-stosq", |
8 | "movsq-based memset() in arch/x86/lib/memset_64.S") | 8 | "movsq-based memset() in arch/x86/lib/memset_64.S") |
9 | 9 | ||
10 | MEMSET_FN(memset_c_e, | 10 | MEMSET_FN(memset_erms, |
11 | "x86-64-stosb", | 11 | "x86-64-stosb", |
12 | "movsb-based memset() in arch/x86/lib/memset_64.S") | 12 | "movsb-based memset() in arch/x86/lib/memset_64.S") |
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index 9e5af89ed13a..de278784c866 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S | |||
@@ -1,8 +1,6 @@ | |||
1 | #define memset MEMSET /* don't hide glibc's memset() */ | 1 | #define memset MEMSET /* don't hide glibc's memset() */ |
2 | #define altinstr_replacement text | 2 | #define altinstr_replacement text |
3 | #define globl p2align 4; .globl | 3 | #define globl p2align 4; .globl |
4 | #define Lmemset_c globl memset_c; memset_c | ||
5 | #define Lmemset_c_e globl memset_c_e; memset_c_e | ||
6 | #include "../../../arch/x86/lib/memset_64.S" | 4 | #include "../../../arch/x86/lib/memset_64.S" |
7 | 5 | ||
8 | /* | 6 | /* |
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h index 6789d788d494..3a3a0f16456a 100644 --- a/tools/perf/util/include/asm/alternative-asm.h +++ b/tools/perf/util/include/asm/alternative-asm.h | |||
@@ -4,5 +4,6 @@ | |||
4 | /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ | 4 | /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ |
5 | 5 | ||
6 | #define altinstruction_entry # | 6 | #define altinstruction_entry # |
7 | #define ALTERNATIVE_2 # | ||
7 | 8 | ||
8 | #endif | 9 | #endif |
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d643d5242537..95abddcd7839 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile | |||
@@ -17,6 +17,7 @@ TARGETS += sysctl | |||
17 | TARGETS += timers | 17 | TARGETS += timers |
18 | TARGETS += user | 18 | TARGETS += user |
19 | TARGETS += vm | 19 | TARGETS += vm |
20 | TARGETS += x86 | ||
20 | #Please keep the TARGETS list alphabetically sorted | 21 | #Please keep the TARGETS list alphabetically sorted |
21 | 22 | ||
22 | TARGETS_HOTPLUG = cpu-hotplug | 23 | TARGETS_HOTPLUG = cpu-hotplug |
diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore new file mode 100644 index 000000000000..15034fef9698 --- /dev/null +++ b/tools/testing/selftests/x86/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | *_32 | ||
2 | *_64 | ||
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile new file mode 100644 index 000000000000..f0a7918178dd --- /dev/null +++ b/tools/testing/selftests/x86/Makefile | |||
@@ -0,0 +1,48 @@ | |||
1 | .PHONY: all all_32 all_64 check_build32 clean run_tests | ||
2 | |||
3 | TARGETS_C_BOTHBITS := sigreturn | ||
4 | |||
5 | BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32) | ||
6 | BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) | ||
7 | |||
8 | CFLAGS := -O2 -g -std=gnu99 -pthread -Wall | ||
9 | |||
10 | UNAME_M := $(shell uname -m) | ||
11 | |||
12 | # Always build 32-bit tests | ||
13 | all: all_32 | ||
14 | |||
15 | # If we're on a 64-bit host, build 64-bit tests as well | ||
16 | ifeq ($(UNAME_M),x86_64) | ||
17 | all: all_64 | ||
18 | endif | ||
19 | |||
20 | all_32: check_build32 $(BINARIES_32) | ||
21 | |||
22 | all_64: $(BINARIES_64) | ||
23 | |||
24 | clean: | ||
25 | $(RM) $(BINARIES_32) $(BINARIES_64) | ||
26 | |||
27 | run_tests: | ||
28 | ./run_x86_tests.sh | ||
29 | |||
30 | $(TARGETS_C_BOTHBITS:%=%_32): %_32: %.c | ||
31 | $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl | ||
32 | |||
33 | $(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c | ||
34 | $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl | ||
35 | |||
36 | check_build32: | ||
37 | @if ! $(CC) -m32 -o /dev/null trivial_32bit_program.c; then \ | ||
38 | echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ | ||
39 | echo "environment. If you are using a Debian-like"; \ | ||
40 | echo " distribution, try:"; \ | ||
41 | echo ""; \ | ||
42 | echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ | ||
43 | echo ""; \ | ||
44 | echo "If you are using a Fedora-like distribution, try:"; \ | ||
45 | echo ""; \ | ||
46 | echo " yum install glibc-devel.*i686"; \ | ||
47 | exit 1; \ | ||
48 | fi | ||
diff --git a/tools/testing/selftests/x86/run_x86_tests.sh b/tools/testing/selftests/x86/run_x86_tests.sh new file mode 100644 index 000000000000..3d3ec65f3e7c --- /dev/null +++ b/tools/testing/selftests/x86/run_x86_tests.sh | |||
@@ -0,0 +1,11 @@ | |||
1 | #!/bin/bash | ||
2 | |||
3 | # This is deliberately minimal. IMO kselftests should provide a standard | ||
4 | # script here. | ||
5 | ./sigreturn_32 || exit 1 | ||
6 | |||
7 | if [[ "$uname -p" -eq "x86_64" ]]; then | ||
8 | ./sigreturn_64 || exit 1 | ||
9 | fi | ||
10 | |||
11 | exit 0 | ||
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c new file mode 100644 index 000000000000..b5aa1bab7416 --- /dev/null +++ b/tools/testing/selftests/x86/sigreturn.c | |||
@@ -0,0 +1,684 @@ | |||
1 | /* | ||
2 | * sigreturn.c - tests for x86 sigreturn(2) and exit-to-userspace | ||
3 | * Copyright (c) 2014-2015 Andrew Lutomirski | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but | ||
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * This is a series of tests that exercises the sigreturn(2) syscall and | ||
15 | * the IRET / SYSRET paths in the kernel. | ||
16 | * | ||
17 | * For now, this focuses on the effects of unusual CS and SS values, | ||
18 | * and it has a bunch of tests to make sure that ESP/RSP is restored | ||
19 | * properly. | ||
20 | * | ||
21 | * The basic idea behind these tests is to raise(SIGUSR1) to create a | ||
22 | * sigcontext frame, plug in the values to be tested, and then return, | ||
23 | * which implicitly invokes sigreturn(2) and programs the user context | ||
24 | * as desired. | ||
25 | * | ||
26 | * For tests for which we expect sigreturn and the subsequent return to | ||
27 | * user mode to succeed, we return to a short trampoline that generates | ||
28 | * SIGTRAP so that the meat of the tests can be ordinary C code in a | ||
29 | * SIGTRAP handler. | ||
30 | * | ||
31 | * The inner workings of each test are documented below. | ||
32 | * | ||
33 | * Do not run this on outdated, unpatched kernels: you risk nasty crashes. | ||
34 | */ | ||
35 | |||
36 | #define _GNU_SOURCE | ||
37 | |||
38 | #include <sys/time.h> | ||
39 | #include <time.h> | ||
40 | #include <stdlib.h> | ||
41 | #include <sys/syscall.h> | ||
42 | #include <unistd.h> | ||
43 | #include <stdio.h> | ||
44 | #include <string.h> | ||
45 | #include <inttypes.h> | ||
46 | #include <sys/mman.h> | ||
47 | #include <sys/signal.h> | ||
48 | #include <sys/ucontext.h> | ||
49 | #include <asm/ldt.h> | ||
50 | #include <err.h> | ||
51 | #include <setjmp.h> | ||
52 | #include <stddef.h> | ||
53 | #include <stdbool.h> | ||
54 | #include <sys/ptrace.h> | ||
55 | #include <sys/user.h> | ||
56 | |||
57 | /* | ||
58 | * In principle, this test can run on Linux emulation layers (e.g. | ||
59 | * Illumos "LX branded zones"). Solaris-based kernels reserve LDT | ||
60 | * entries 0-5 for their own internal purposes, so start our LDT | ||
61 | * allocations above that reservation. (The tests don't pass on LX | ||
62 | * branded zones, but at least this lets them run.) | ||
63 | */ | ||
64 | #define LDT_OFFSET 6 | ||
65 | |||
66 | /* An aligned stack accessible through some of our segments. */ | ||
67 | static unsigned char stack16[65536] __attribute__((aligned(4096))); | ||
68 | |||
69 | /* | ||
70 | * An aligned int3 instruction used as a trampoline. Some of the tests | ||
71 | * want to fish out their ss values, so this trampoline copies ss to eax | ||
72 | * before the int3. | ||
73 | */ | ||
74 | asm (".pushsection .text\n\t" | ||
75 | ".type int3, @function\n\t" | ||
76 | ".align 4096\n\t" | ||
77 | "int3:\n\t" | ||
78 | "mov %ss,%eax\n\t" | ||
79 | "int3\n\t" | ||
80 | ".size int3, . - int3\n\t" | ||
81 | ".align 4096, 0xcc\n\t" | ||
82 | ".popsection"); | ||
83 | extern char int3[4096]; | ||
84 | |||
85 | /* | ||
86 | * At startup, we prepare: | ||
87 | * | ||
88 | * - ldt_nonexistent_sel: An LDT entry that doesn't exist (all-zero | ||
89 | * descriptor or out of bounds). | ||
90 | * - code16_sel: A 16-bit LDT code segment pointing to int3. | ||
91 | * - data16_sel: A 16-bit LDT data segment pointing to stack16. | ||
92 | * - npcode32_sel: A 32-bit not-present LDT code segment pointing to int3. | ||
93 | * - npdata32_sel: A 32-bit not-present LDT data segment pointing to stack16. | ||
94 | * - gdt_data16_idx: A 16-bit GDT data segment pointing to stack16. | ||
95 | * - gdt_npdata32_idx: A 32-bit not-present GDT data segment pointing to | ||
96 | * stack16. | ||
97 | * | ||
98 | * For no particularly good reason, xyz_sel is a selector value with the | ||
99 | * RPL and LDT bits filled in, whereas xyz_idx is just an index into the | ||
100 | * descriptor table. These variables will be zero if their respective | ||
101 | * segments could not be allocated. | ||
102 | */ | ||
103 | static unsigned short ldt_nonexistent_sel; | ||
104 | static unsigned short code16_sel, data16_sel, npcode32_sel, npdata32_sel; | ||
105 | |||
106 | static unsigned short gdt_data16_idx, gdt_npdata32_idx; | ||
107 | |||
108 | static unsigned short GDT3(int idx) | ||
109 | { | ||
110 | return (idx << 3) | 3; | ||
111 | } | ||
112 | |||
113 | static unsigned short LDT3(int idx) | ||
114 | { | ||
115 | return (idx << 3) | 7; | ||
116 | } | ||
117 | |||
118 | /* Our sigaltstack scratch space. */ | ||
119 | static char altstack_data[SIGSTKSZ]; | ||
120 | |||
121 | static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), | ||
122 | int flags) | ||
123 | { | ||
124 | struct sigaction sa; | ||
125 | memset(&sa, 0, sizeof(sa)); | ||
126 | sa.sa_sigaction = handler; | ||
127 | sa.sa_flags = SA_SIGINFO | flags; | ||
128 | sigemptyset(&sa.sa_mask); | ||
129 | if (sigaction(sig, &sa, 0)) | ||
130 | err(1, "sigaction"); | ||
131 | } | ||
132 | |||
133 | static void clearhandler(int sig) | ||
134 | { | ||
135 | struct sigaction sa; | ||
136 | memset(&sa, 0, sizeof(sa)); | ||
137 | sa.sa_handler = SIG_DFL; | ||
138 | sigemptyset(&sa.sa_mask); | ||
139 | if (sigaction(sig, &sa, 0)) | ||
140 | err(1, "sigaction"); | ||
141 | } | ||
142 | |||
143 | static void add_ldt(const struct user_desc *desc, unsigned short *var, | ||
144 | const char *name) | ||
145 | { | ||
146 | if (syscall(SYS_modify_ldt, 1, desc, sizeof(*desc)) == 0) { | ||
147 | *var = LDT3(desc->entry_number); | ||
148 | } else { | ||
149 | printf("[NOTE]\tFailed to create %s segment\n", name); | ||
150 | *var = 0; | ||
151 | } | ||
152 | } | ||
153 | |||
154 | static void setup_ldt(void) | ||
155 | { | ||
156 | if ((unsigned long)stack16 > (1ULL << 32) - sizeof(stack16)) | ||
157 | errx(1, "stack16 is too high\n"); | ||
158 | if ((unsigned long)int3 > (1ULL << 32) - sizeof(int3)) | ||
159 | errx(1, "int3 is too high\n"); | ||
160 | |||
161 | ldt_nonexistent_sel = LDT3(LDT_OFFSET + 2); | ||
162 | |||
163 | const struct user_desc code16_desc = { | ||
164 | .entry_number = LDT_OFFSET + 0, | ||
165 | .base_addr = (unsigned long)int3, | ||
166 | .limit = 4095, | ||
167 | .seg_32bit = 0, | ||
168 | .contents = 2, /* Code, not conforming */ | ||
169 | .read_exec_only = 0, | ||
170 | .limit_in_pages = 0, | ||
171 | .seg_not_present = 0, | ||
172 | .useable = 0 | ||
173 | }; | ||
174 | add_ldt(&code16_desc, &code16_sel, "code16"); | ||
175 | |||
176 | const struct user_desc data16_desc = { | ||
177 | .entry_number = LDT_OFFSET + 1, | ||
178 | .base_addr = (unsigned long)stack16, | ||
179 | .limit = 0xffff, | ||
180 | .seg_32bit = 0, | ||
181 | .contents = 0, /* Data, grow-up */ | ||
182 | .read_exec_only = 0, | ||
183 | .limit_in_pages = 0, | ||
184 | .seg_not_present = 0, | ||
185 | .useable = 0 | ||
186 | }; | ||
187 | add_ldt(&data16_desc, &data16_sel, "data16"); | ||
188 | |||
189 | const struct user_desc npcode32_desc = { | ||
190 | .entry_number = LDT_OFFSET + 3, | ||
191 | .base_addr = (unsigned long)int3, | ||
192 | .limit = 4095, | ||
193 | .seg_32bit = 1, | ||
194 | .contents = 2, /* Code, not conforming */ | ||
195 | .read_exec_only = 0, | ||
196 | .limit_in_pages = 0, | ||
197 | .seg_not_present = 1, | ||
198 | .useable = 0 | ||
199 | }; | ||
200 | add_ldt(&npcode32_desc, &npcode32_sel, "npcode32"); | ||
201 | |||
202 | const struct user_desc npdata32_desc = { | ||
203 | .entry_number = LDT_OFFSET + 4, | ||
204 | .base_addr = (unsigned long)stack16, | ||
205 | .limit = 0xffff, | ||
206 | .seg_32bit = 1, | ||
207 | .contents = 0, /* Data, grow-up */ | ||
208 | .read_exec_only = 0, | ||
209 | .limit_in_pages = 0, | ||
210 | .seg_not_present = 1, | ||
211 | .useable = 0 | ||
212 | }; | ||
213 | add_ldt(&npdata32_desc, &npdata32_sel, "npdata32"); | ||
214 | |||
215 | struct user_desc gdt_data16_desc = { | ||
216 | .entry_number = -1, | ||
217 | .base_addr = (unsigned long)stack16, | ||
218 | .limit = 0xffff, | ||
219 | .seg_32bit = 0, | ||
220 | .contents = 0, /* Data, grow-up */ | ||
221 | .read_exec_only = 0, | ||
222 | .limit_in_pages = 0, | ||
223 | .seg_not_present = 0, | ||
224 | .useable = 0 | ||
225 | }; | ||
226 | |||
227 | if (syscall(SYS_set_thread_area, &gdt_data16_desc) == 0) { | ||
228 | /* | ||
229 | * This probably indicates vulnerability to CVE-2014-8133. | ||
230 | * Merely getting here isn't definitive, though, and we'll | ||
231 | * diagnose the problem for real later on. | ||
232 | */ | ||
233 | printf("[WARN]\tset_thread_area allocated data16 at index %d\n", | ||
234 | gdt_data16_desc.entry_number); | ||
235 | gdt_data16_idx = gdt_data16_desc.entry_number; | ||
236 | } else { | ||
237 | printf("[OK]\tset_thread_area refused 16-bit data\n"); | ||
238 | } | ||
239 | |||
240 | struct user_desc gdt_npdata32_desc = { | ||
241 | .entry_number = -1, | ||
242 | .base_addr = (unsigned long)stack16, | ||
243 | .limit = 0xffff, | ||
244 | .seg_32bit = 1, | ||
245 | .contents = 0, /* Data, grow-up */ | ||
246 | .read_exec_only = 0, | ||
247 | .limit_in_pages = 0, | ||
248 | .seg_not_present = 1, | ||
249 | .useable = 0 | ||
250 | }; | ||
251 | |||
252 | if (syscall(SYS_set_thread_area, &gdt_npdata32_desc) == 0) { | ||
253 | /* | ||
254 | * As a hardening measure, newer kernels don't allow this. | ||
255 | */ | ||
256 | printf("[WARN]\tset_thread_area allocated npdata32 at index %d\n", | ||
257 | gdt_npdata32_desc.entry_number); | ||
258 | gdt_npdata32_idx = gdt_npdata32_desc.entry_number; | ||
259 | } else { | ||
260 | printf("[OK]\tset_thread_area refused 16-bit data\n"); | ||
261 | } | ||
262 | } | ||
263 | |||
264 | /* State used by our signal handlers. */ | ||
265 | static gregset_t initial_regs, requested_regs, resulting_regs; | ||
266 | |||
267 | /* Instructions for the SIGUSR1 handler. */ | ||
268 | static volatile unsigned short sig_cs, sig_ss; | ||
269 | static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno; | ||
270 | |||
271 | /* Abstractions for some 32-bit vs 64-bit differences. */ | ||
272 | #ifdef __x86_64__ | ||
273 | # define REG_IP REG_RIP | ||
274 | # define REG_SP REG_RSP | ||
275 | # define REG_AX REG_RAX | ||
276 | |||
277 | struct selectors { | ||
278 | unsigned short cs, gs, fs, ss; | ||
279 | }; | ||
280 | |||
281 | static unsigned short *ssptr(ucontext_t *ctx) | ||
282 | { | ||
283 | struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS]; | ||
284 | return &sels->ss; | ||
285 | } | ||
286 | |||
287 | static unsigned short *csptr(ucontext_t *ctx) | ||
288 | { | ||
289 | struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS]; | ||
290 | return &sels->cs; | ||
291 | } | ||
292 | #else | ||
293 | # define REG_IP REG_EIP | ||
294 | # define REG_SP REG_ESP | ||
295 | # define REG_AX REG_EAX | ||
296 | |||
297 | static greg_t *ssptr(ucontext_t *ctx) | ||
298 | { | ||
299 | return &ctx->uc_mcontext.gregs[REG_SS]; | ||
300 | } | ||
301 | |||
302 | static greg_t *csptr(ucontext_t *ctx) | ||
303 | { | ||
304 | return &ctx->uc_mcontext.gregs[REG_CS]; | ||
305 | } | ||
306 | #endif | ||
307 | |||
308 | /* Number of errors in the current test case. */ | ||
309 | static volatile sig_atomic_t nerrs; | ||
310 | |||
311 | /* | ||
312 | * SIGUSR1 handler. Sets CS and SS as requested and points IP to the | ||
313 | * int3 trampoline. Sets SP to a large known value so that we can see | ||
314 | * whether the value round-trips back to user mode correctly. | ||
315 | */ | ||
316 | static void sigusr1(int sig, siginfo_t *info, void *ctx_void) | ||
317 | { | ||
318 | ucontext_t *ctx = (ucontext_t*)ctx_void; | ||
319 | |||
320 | memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); | ||
321 | |||
322 | *csptr(ctx) = sig_cs; | ||
323 | *ssptr(ctx) = sig_ss; | ||
324 | |||
325 | ctx->uc_mcontext.gregs[REG_IP] = | ||
326 | sig_cs == code16_sel ? 0 : (unsigned long)&int3; | ||
327 | ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; | ||
328 | ctx->uc_mcontext.gregs[REG_AX] = 0; | ||
329 | |||
330 | memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); | ||
331 | requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */ | ||
332 | |||
333 | return; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Called after a successful sigreturn. Restores our state so that | ||
338 | * the original raise(SIGUSR1) returns. | ||
339 | */ | ||
340 | static void sigtrap(int sig, siginfo_t *info, void *ctx_void) | ||
341 | { | ||
342 | ucontext_t *ctx = (ucontext_t*)ctx_void; | ||
343 | |||
344 | sig_err = ctx->uc_mcontext.gregs[REG_ERR]; | ||
345 | sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; | ||
346 | |||
347 | unsigned short ss; | ||
348 | asm ("mov %%ss,%0" : "=r" (ss)); | ||
349 | |||
350 | greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX]; | ||
351 | if (asm_ss != sig_ss && sig == SIGTRAP) { | ||
352 | /* Sanity check failure. */ | ||
353 | printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n", | ||
354 | ss, *ssptr(ctx), (unsigned long long)asm_ss); | ||
355 | nerrs++; | ||
356 | } | ||
357 | |||
358 | memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); | ||
359 | memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); | ||
360 | |||
361 | sig_trapped = sig; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Checks a given selector for its code bitness or returns -1 if it's not | ||
366 | * a usable code segment selector. | ||
367 | */ | ||
368 | int cs_bitness(unsigned short cs) | ||
369 | { | ||
370 | uint32_t valid = 0, ar; | ||
371 | asm ("lar %[cs], %[ar]\n\t" | ||
372 | "jnz 1f\n\t" | ||
373 | "mov $1, %[valid]\n\t" | ||
374 | "1:" | ||
375 | : [ar] "=r" (ar), [valid] "+rm" (valid) | ||
376 | : [cs] "r" (cs)); | ||
377 | |||
378 | if (!valid) | ||
379 | return -1; | ||
380 | |||
381 | bool db = (ar & (1 << 22)); | ||
382 | bool l = (ar & (1 << 21)); | ||
383 | |||
384 | if (!(ar & (1<<11))) | ||
385 | return -1; /* Not code. */ | ||
386 | |||
387 | if (l && !db) | ||
388 | return 64; | ||
389 | else if (!l && db) | ||
390 | return 32; | ||
391 | else if (!l && !db) | ||
392 | return 16; | ||
393 | else | ||
394 | return -1; /* Unknown bitness. */ | ||
395 | } | ||
396 | |||
397 | /* Finds a usable code segment of the requested bitness. */ | ||
398 | int find_cs(int bitness) | ||
399 | { | ||
400 | unsigned short my_cs; | ||
401 | |||
402 | asm ("mov %%cs,%0" : "=r" (my_cs)); | ||
403 | |||
404 | if (cs_bitness(my_cs) == bitness) | ||
405 | return my_cs; | ||
406 | if (cs_bitness(my_cs + (2 << 3)) == bitness) | ||
407 | return my_cs + (2 << 3); | ||
408 | if (my_cs > (2<<3) && cs_bitness(my_cs - (2 << 3)) == bitness) | ||
409 | return my_cs - (2 << 3); | ||
410 | if (cs_bitness(code16_sel) == bitness) | ||
411 | return code16_sel; | ||
412 | |||
413 | printf("[WARN]\tCould not find %d-bit CS\n", bitness); | ||
414 | return -1; | ||
415 | } | ||
416 | |||
417 | static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) | ||
418 | { | ||
419 | int cs = find_cs(cs_bits); | ||
420 | if (cs == -1) { | ||
421 | printf("[SKIP]\tCode segment unavailable for %d-bit CS, %d-bit SS\n", | ||
422 | cs_bits, use_16bit_ss ? 16 : 32); | ||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | if (force_ss != -1) { | ||
427 | sig_ss = force_ss; | ||
428 | } else { | ||
429 | if (use_16bit_ss) { | ||
430 | if (!data16_sel) { | ||
431 | printf("[SKIP]\tData segment unavailable for %d-bit CS, 16-bit SS\n", | ||
432 | cs_bits); | ||
433 | return 0; | ||
434 | } | ||
435 | sig_ss = data16_sel; | ||
436 | } else { | ||
437 | asm volatile ("mov %%ss,%0" : "=r" (sig_ss)); | ||
438 | } | ||
439 | } | ||
440 | |||
441 | sig_cs = cs; | ||
442 | |||
443 | printf("[RUN]\tValid sigreturn: %d-bit CS (%hx), %d-bit SS (%hx%s)\n", | ||
444 | cs_bits, sig_cs, use_16bit_ss ? 16 : 32, sig_ss, | ||
445 | (sig_ss & 4) ? "" : ", GDT"); | ||
446 | |||
447 | raise(SIGUSR1); | ||
448 | |||
449 | nerrs = 0; | ||
450 | |||
451 | /* | ||
452 | * Check that each register had an acceptable value when the | ||
453 | * int3 trampoline was invoked. | ||
454 | */ | ||
455 | for (int i = 0; i < NGREG; i++) { | ||
456 | greg_t req = requested_regs[i], res = resulting_regs[i]; | ||
457 | if (i == REG_TRAPNO || i == REG_IP) | ||
458 | continue; /* don't care */ | ||
459 | if (i == REG_SP) { | ||
460 | printf("\tSP: %llx -> %llx\n", (unsigned long long)req, | ||
461 | (unsigned long long)res); | ||
462 | |||
463 | /* | ||
464 | * In many circumstances, the high 32 bits of rsp | ||
465 | * are zeroed. For example, we could be a real | ||
466 | * 32-bit program, or we could hit any of a number | ||
467 | * of poorly-documented IRET or segmented ESP | ||
468 | * oddities. If this happens, it's okay. | ||
469 | */ | ||
470 | if (res == (req & 0xFFFFFFFF)) | ||
471 | continue; /* OK; not expected to work */ | ||
472 | } | ||
473 | |||
474 | bool ignore_reg = false; | ||
475 | #if __i386__ | ||
476 | if (i == REG_UESP) | ||
477 | ignore_reg = true; | ||
478 | #else | ||
479 | if (i == REG_CSGSFS) { | ||
480 | struct selectors *req_sels = | ||
481 | (void *)&requested_regs[REG_CSGSFS]; | ||
482 | struct selectors *res_sels = | ||
483 | (void *)&resulting_regs[REG_CSGSFS]; | ||
484 | if (req_sels->cs != res_sels->cs) { | ||
485 | printf("[FAIL]\tCS mismatch: requested 0x%hx; got 0x%hx\n", | ||
486 | req_sels->cs, res_sels->cs); | ||
487 | nerrs++; | ||
488 | } | ||
489 | |||
490 | if (req_sels->ss != res_sels->ss) { | ||
491 | printf("[FAIL]\tSS mismatch: requested 0x%hx; got 0x%hx\n", | ||
492 | req_sels->ss, res_sels->ss); | ||
493 | nerrs++; | ||
494 | } | ||
495 | |||
496 | continue; | ||
497 | } | ||
498 | #endif | ||
499 | |||
500 | /* Sanity check on the kernel */ | ||
501 | if (i == REG_AX && requested_regs[i] != resulting_regs[i]) { | ||
502 | printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", | ||
503 | (unsigned long long)requested_regs[i], | ||
504 | (unsigned long long)resulting_regs[i]); | ||
505 | nerrs++; | ||
506 | continue; | ||
507 | } | ||
508 | |||
509 | if (requested_regs[i] != resulting_regs[i] && !ignore_reg) { | ||
510 | /* | ||
511 | * SP is particularly interesting here. The | ||
512 | * usual cause of failures is that we hit the | ||
513 | * nasty IRET case of returning to a 16-bit SS, | ||
514 | * in which case bits 16:31 of the *kernel* | ||
515 | * stack pointer persist in ESP. | ||
516 | */ | ||
517 | printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n", | ||
518 | i, (unsigned long long)requested_regs[i], | ||
519 | (unsigned long long)resulting_regs[i]); | ||
520 | nerrs++; | ||
521 | } | ||
522 | } | ||
523 | |||
524 | if (nerrs == 0) | ||
525 | printf("[OK]\tall registers okay\n"); | ||
526 | |||
527 | return nerrs; | ||
528 | } | ||
529 | |||
530 | static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs) | ||
531 | { | ||
532 | int cs = force_cs == -1 ? find_cs(cs_bits) : force_cs; | ||
533 | if (cs == -1) | ||
534 | return 0; | ||
535 | |||
536 | sig_cs = cs; | ||
537 | sig_ss = ss; | ||
538 | |||
539 | printf("[RUN]\t%d-bit CS (%hx), bogus SS (%hx)\n", | ||
540 | cs_bits, sig_cs, sig_ss); | ||
541 | |||
542 | sig_trapped = 0; | ||
543 | raise(SIGUSR1); | ||
544 | if (sig_trapped) { | ||
545 | char errdesc[32] = ""; | ||
546 | if (sig_err) { | ||
547 | const char *src = (sig_err & 1) ? " EXT" : ""; | ||
548 | const char *table; | ||
549 | if ((sig_err & 0x6) == 0x0) | ||
550 | table = "GDT"; | ||
551 | else if ((sig_err & 0x6) == 0x4) | ||
552 | table = "LDT"; | ||
553 | else if ((sig_err & 0x6) == 0x2) | ||
554 | table = "IDT"; | ||
555 | else | ||
556 | table = "???"; | ||
557 | |||
558 | sprintf(errdesc, "%s%s index %d, ", | ||
559 | table, src, sig_err >> 3); | ||
560 | } | ||
561 | |||
562 | char trapname[32]; | ||
563 | if (sig_trapno == 13) | ||
564 | strcpy(trapname, "GP"); | ||
565 | else if (sig_trapno == 11) | ||
566 | strcpy(trapname, "NP"); | ||
567 | else if (sig_trapno == 12) | ||
568 | strcpy(trapname, "SS"); | ||
569 | else if (sig_trapno == 32) | ||
570 | strcpy(trapname, "IRET"); /* X86_TRAP_IRET */ | ||
571 | else | ||
572 | sprintf(trapname, "%d", sig_trapno); | ||
573 | |||
574 | printf("[OK]\tGot #%s(0x%lx) (i.e. %s%s)\n", | ||
575 | trapname, (unsigned long)sig_err, | ||
576 | errdesc, strsignal(sig_trapped)); | ||
577 | return 0; | ||
578 | } else { | ||
579 | printf("[FAIL]\tDid not get SIGSEGV\n"); | ||
580 | return 1; | ||
581 | } | ||
582 | } | ||
583 | |||
584 | int main() | ||
585 | { | ||
586 | int total_nerrs = 0; | ||
587 | unsigned short my_cs, my_ss; | ||
588 | |||
589 | asm volatile ("mov %%cs,%0" : "=r" (my_cs)); | ||
590 | asm volatile ("mov %%ss,%0" : "=r" (my_ss)); | ||
591 | setup_ldt(); | ||
592 | |||
593 | stack_t stack = { | ||
594 | .ss_sp = altstack_data, | ||
595 | .ss_size = SIGSTKSZ, | ||
596 | }; | ||
597 | if (sigaltstack(&stack, NULL) != 0) | ||
598 | err(1, "sigaltstack"); | ||
599 | |||
600 | sethandler(SIGUSR1, sigusr1, 0); | ||
601 | sethandler(SIGTRAP, sigtrap, SA_ONSTACK); | ||
602 | |||
603 | /* Easy cases: return to a 32-bit SS in each possible CS bitness. */ | ||
604 | total_nerrs += test_valid_sigreturn(64, false, -1); | ||
605 | total_nerrs += test_valid_sigreturn(32, false, -1); | ||
606 | total_nerrs += test_valid_sigreturn(16, false, -1); | ||
607 | |||
608 | /* | ||
609 | * Test easy espfix cases: return to a 16-bit LDT SS in each possible | ||
610 | * CS bitness. NB: with a long mode CS, the SS bitness is irrelevant. | ||
611 | * | ||
612 | * This catches the original missing-espfix-on-64-bit-kernels issue | ||
613 | * as well as CVE-2014-8134. | ||
614 | */ | ||
615 | total_nerrs += test_valid_sigreturn(64, true, -1); | ||
616 | total_nerrs += test_valid_sigreturn(32, true, -1); | ||
617 | total_nerrs += test_valid_sigreturn(16, true, -1); | ||
618 | |||
619 | if (gdt_data16_idx) { | ||
620 | /* | ||
621 | * For performance reasons, Linux skips espfix if SS points | ||
622 | * to the GDT. If we were able to allocate a 16-bit SS in | ||
623 | * the GDT, see if it leaks parts of the kernel stack pointer. | ||
624 | * | ||
625 | * This tests for CVE-2014-8133. | ||
626 | */ | ||
627 | total_nerrs += test_valid_sigreturn(64, true, | ||
628 | GDT3(gdt_data16_idx)); | ||
629 | total_nerrs += test_valid_sigreturn(32, true, | ||
630 | GDT3(gdt_data16_idx)); | ||
631 | total_nerrs += test_valid_sigreturn(16, true, | ||
632 | GDT3(gdt_data16_idx)); | ||
633 | } | ||
634 | |||
635 | /* | ||
636 | * We're done testing valid sigreturn cases. Now we test states | ||
637 | * for which sigreturn itself will succeed but the subsequent | ||
638 | * entry to user mode will fail. | ||
639 | * | ||
640 | * Depending on the failure mode and the kernel bitness, these | ||
641 | * entry failures can generate SIGSEGV, SIGBUS, or SIGILL. | ||
642 | */ | ||
643 | clearhandler(SIGTRAP); | ||
644 | sethandler(SIGSEGV, sigtrap, SA_ONSTACK); | ||
645 | sethandler(SIGBUS, sigtrap, SA_ONSTACK); | ||
646 | sethandler(SIGILL, sigtrap, SA_ONSTACK); /* 32-bit kernels do this */ | ||
647 | |||
648 | /* Easy failures: invalid SS, resulting in #GP(0) */ | ||
649 | test_bad_iret(64, ldt_nonexistent_sel, -1); | ||
650 | test_bad_iret(32, ldt_nonexistent_sel, -1); | ||
651 | test_bad_iret(16, ldt_nonexistent_sel, -1); | ||
652 | |||
653 | /* These fail because SS isn't a data segment, resulting in #GP(SS) */ | ||
654 | test_bad_iret(64, my_cs, -1); | ||
655 | test_bad_iret(32, my_cs, -1); | ||
656 | test_bad_iret(16, my_cs, -1); | ||
657 | |||
658 | /* Try to return to a not-present code segment, triggering #NP(CS). */ | ||
659 | test_bad_iret(32, my_ss, npcode32_sel); | ||
660 | |||
661 | /* | ||
662 | * Try to return to a not-present but otherwise valid data segment. | ||
663 | * This will cause IRET to fail with #SS on the espfix stack. This | ||
664 | * exercises CVE-2014-9322. | ||
665 | * | ||
666 | * Note that, if espfix is enabled, 64-bit Linux will lose track | ||
667 | * of the actual cause of failure and report #GP(0) instead. | ||
668 | * This would be very difficult for Linux to avoid, because | ||
669 | * espfix64 causes IRET failures to be promoted to #DF, so the | ||
670 | * original exception frame is never pushed onto the stack. | ||
671 | */ | ||
672 | test_bad_iret(32, npdata32_sel, -1); | ||
673 | |||
674 | /* | ||
675 | * Try to return to a not-present but otherwise valid data | ||
676 | * segment without invoking espfix. Newer kernels don't allow | ||
677 | * this to happen in the first place. On older kernels, though, | ||
678 | * this can trigger CVE-2014-9322. | ||
679 | */ | ||
680 | if (gdt_npdata32_idx) | ||
681 | test_bad_iret(32, GDT3(gdt_npdata32_idx), -1); | ||
682 | |||
683 | return total_nerrs ? 1 : 0; | ||
684 | } | ||
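For readers unfamiliar with the trick sigreturn.c's header comment describes, this stripped-down, standalone sketch shows the core mechanism in isolation: raise a signal, edit the saved context inside the handler, and let the implicit sigreturn(2) install the edited state on return. This demo (not part of the patch) edits a harmless general-purpose register; the test above applies the same pattern to CS and SS:

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <ucontext.h>

	static void handler(int sig, siginfo_t *info, void *ctx_void)
	{
		ucontext_t *ctx = ctx_void;
	#ifdef __x86_64__
		/* Whatever we store here is what the kernel restores
		 * into RAX when sigreturn(2) resumes the program. */
		ctx->uc_mcontext.gregs[REG_RAX] = 0x1234;
	#else
		ctx->uc_mcontext.gregs[REG_EAX] = 0x1234;
	#endif
	}

	int main(void)
	{
		struct sigaction sa;
		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGUSR1, &sa, NULL);

		raise(SIGUSR1);		/* returns via sigreturn(2) */
		puts("resumed after sigreturn");
		return 0;
	}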
diff --git a/tools/testing/selftests/x86/trivial_32bit_program.c b/tools/testing/selftests/x86/trivial_32bit_program.c new file mode 100644 index 000000000000..2e231beb0a39 --- /dev/null +++ b/tools/testing/selftests/x86/trivial_32bit_program.c | |||
@@ -0,0 +1,14 @@ | |||
1 | /* | ||
2 | * Trivial program to check that we have a valid 32-bit build environment. | ||
3 | * Copyright (c) 2015 Andy Lutomirski | ||
4 | * GPL v2 | ||
5 | */ | ||
6 | |||
7 | #include <stdio.h> | ||
8 | |||
9 | int main() | ||
10 | { | ||
11 | printf("\n"); | ||
12 | |||
13 | return 0; | ||
14 | } | ||