-rw-r--r--  Documentation/x86/boot.txt | 6
-rw-r--r--  arch/x86/boot/compressed/aslr.c | 5
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 3
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 5
-rw-r--r--  arch/x86/boot/compressed/misc.h | 6
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/twofish-x86_64-asm_64.S | 4
-rw-r--r--  arch/x86/ia32/Makefile | 1
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 19
-rw-r--r--  arch/x86/ia32/ia32entry.S | 485
-rw-r--r--  arch/x86/ia32/nosyscall.c | 7
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 14
-rw-r--r--  arch/x86/ia32/syscall_ia32.c | 25
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 53
-rw-r--r--  arch/x86/include/asm/alternative.h | 73
-rw-r--r--  arch/x86/include/asm/apic.h | 2
-rw-r--r--  arch/x86/include/asm/barrier.h | 6
-rw-r--r--  arch/x86/include/asm/calling.h | 284
-rw-r--r--  arch/x86/include/asm/compat.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 32
-rw-r--r--  arch/x86/include/asm/desc.h | 7
-rw-r--r--  arch/x86/include/asm/dwarf2.h | 24
-rw-r--r--  arch/x86/include/asm/elf.h | 7
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 5
-rw-r--r--  arch/x86/include/asm/insn.h | 2
-rw-r--r--  arch/x86/include/asm/irqflags.h | 49
-rw-r--r--  arch/x86/include/asm/paravirt.h | 5
-rw-r--r--  arch/x86/include/asm/processor.h | 107
-rw-r--r--  arch/x86/include/asm/ptrace.h | 45
-rw-r--r--  arch/x86/include/asm/segment.h | 289
-rw-r--r--  arch/x86/include/asm/setup.h | 5
-rw-r--r--  arch/x86/include/asm/sigcontext.h | 6
-rw-r--r--  arch/x86/include/asm/sighandling.h | 4
-rw-r--r--  arch/x86/include/asm/smap.h | 30
-rw-r--r--  arch/x86/include/asm/smp.h | 1
-rw-r--r--  arch/x86/include/asm/special_insns.h | 24
-rw-r--r--  arch/x86/include/asm/thread_info.h | 74
-rw-r--r--  arch/x86/include/uapi/asm/bootparam.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/ptrace-abi.h | 16
-rw-r--r--  arch/x86/include/uapi/asm/ptrace.h | 13
-rw-r--r--  arch/x86/include/uapi/asm/sigcontext.h | 21
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/alternative.c | 163
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 5
-rw-r--r--  arch/x86/kernel/cpu/common.c | 87
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 18
-rw-r--r--  arch/x86/kernel/crash.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 4
-rw-r--r--  arch/x86/kernel/entry_32.S | 93
-rw-r--r--  arch/x86/kernel/entry_64.S | 978
-rw-r--r--  arch/x86/kernel/head_32.S | 3
-rw-r--r--  arch/x86/kernel/head_64.S | 6
-rw-r--r--  arch/x86/kernel/i387.c | 2
-rw-r--r--  arch/x86/kernel/ioport.c | 2
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 2
-rw-r--r--  arch/x86/kernel/irqinit.c | 3
-rw-r--r--  arch/x86/kernel/kgdb.c | 4
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 4
-rw-r--r--  arch/x86/kernel/module.c | 11
-rw-r--r--  arch/x86/kernel/perf_regs.c | 40
-rw-r--r--  arch/x86/kernel/process.c | 23
-rw-r--r--  arch/x86/kernel/process_32.c | 27
-rw-r--r--  arch/x86/kernel/process_64.c | 24
-rw-r--r--  arch/x86/kernel/ptrace.c | 12
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 8
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 16
-rw-r--r--  arch/x86/kernel/setup.c | 13
-rw-r--r--  arch/x86/kernel/signal.c | 50
-rw-r--r--  arch/x86/kernel/smpboot.c | 36
-rw-r--r--  arch/x86/kernel/syscall_32.c | 16
-rw-r--r--  arch/x86/kernel/time.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 56
-rw-r--r--  arch/x86/kernel/uprobes.c | 2
-rw-r--r--  arch/x86/kernel/vm86_32.c | 4
-rw-r--r--  arch/x86/lguest/boot.c | 4
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S | 50
-rw-r--r--  arch/x86/lib/checksum_32.S | 64
-rw-r--r--  arch/x86/lib/clear_page_64.S | 66
-rw-r--r--  arch/x86/lib/copy_page_64.S | 37
-rw-r--r--  arch/x86/lib/copy_user_64.S | 46
-rw-r--r--  arch/x86/lib/csum-copy_64.S | 2
-rw-r--r--  arch/x86/lib/insn.c | 13
-rw-r--r--  arch/x86/lib/memcpy_64.S | 68
-rw-r--r--  arch/x86/lib/memmove_64.S | 19
-rw-r--r--  arch/x86/lib/memset_64.S | 61
-rw-r--r--  arch/x86/lib/msr-reg.S | 24
-rw-r--r--  arch/x86/lib/rwsem.S | 44
-rw-r--r--  arch/x86/lib/thunk_32.S | 18
-rw-r--r--  arch/x86/lib/thunk_64.S | 28
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt | 9
-rw-r--r--  arch/x86/mm/fault.c | 8
-rw-r--r--  arch/x86/mm/init.c | 3
-rw-r--r--  arch/x86/oprofile/backtrace.c | 2
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl | 4
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/um/asm/barrier.h | 4
-rw-r--r--  arch/x86/um/sys_call_table_64.c | 2
-rw-r--r--  arch/x86/xen/enlighten.c | 1
-rw-r--r--  arch/x86/xen/smp.c | 14
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 8
-rw-r--r--  drivers/misc/sgi-xp/xpc_main.c | 2
-rw-r--r--  include/linux/stddef.h | 9
-rw-r--r--  include/linux/vfio.h | 13
-rw-r--r--  tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 6
-rw-r--r--  tools/perf/bench/mem-memcpy-x86-64-asm.S | 2
-rw-r--r--  tools/perf/bench/mem-memcpy.c | 128
-rw-r--r--  tools/perf/bench/mem-memset-x86-64-asm-def.h | 6
-rw-r--r--  tools/perf/bench/mem-memset-x86-64-asm.S | 2
-rw-r--r--  tools/perf/util/include/asm/alternative-asm.h | 1
-rw-r--r--  tools/testing/selftests/Makefile | 1
-rw-r--r--  tools/testing/selftests/x86/.gitignore | 2
-rw-r--r--  tools/testing/selftests/x86/Makefile | 48
-rw-r--r--  tools/testing/selftests/x86/run_x86_tests.sh | 11
-rw-r--r--  tools/testing/selftests/x86/sigreturn.c | 684
-rw-r--r--  tools/testing/selftests/x86/trivial_32bit_program.c | 14
121 files changed, 3006 insertions, 2026 deletions
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index a75e3adaa39d..88b85899d309 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -406,6 +406,12 @@ Protocol: 2.00+
406 - If 0, the protected-mode code is loaded at 0x10000. 406 - If 0, the protected-mode code is loaded at 0x10000.
407 - If 1, the protected-mode code is loaded at 0x100000. 407 - If 1, the protected-mode code is loaded at 0x100000.
408 408
409 Bit 1 (kernel internal): KASLR_FLAG
410 - Used internally by the compressed kernel to communicate
411 KASLR status to kernel proper.
412 If 1, KASLR enabled.
413 If 0, KASLR disabled.
414
409 Bit 5 (write): QUIET_FLAG 415 Bit 5 (write): QUIET_FLAG
410 - If 0, print early messages. 416 - If 0, print early messages.
411 - If 1, suppress early messages. 417 - If 1, suppress early messages.
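
For context, this new loadflags bit is meant to be read back by kernel proper after decompression. A minimal C sketch of such a check (the helper name is hypothetical; it assumes the KASLR_FLAG define added by the one-line change to arch/x86/include/uapi/asm/bootparam.h shown in the diffstat):

	#include <linux/types.h>	/* bool */
	#include <asm/bootparam.h>	/* struct boot_params, KASLR_FLAG */

	/* Sketch only, not part of this patch: test the bit that the
	 * compressed stage sets in boot_params.hdr.loadflags. */
	static inline bool kaslr_was_enabled(const struct boot_params *bp)
	{
		return !!(bp->hdr.loadflags & KASLR_FLAG);
	}
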
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
295 return slots_fetch_random(); 295 return slots_fetch_random();
296} 296}
297 297
298unsigned char *choose_kernel_location(unsigned char *input, 298unsigned char *choose_kernel_location(struct boot_params *boot_params,
299 unsigned char *input,
299 unsigned long input_size, 300 unsigned long input_size,
300 unsigned char *output, 301 unsigned char *output,
301 unsigned long output_size) 302 unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
315 } 316 }
316#endif 317#endif
317 318
319 boot_params->hdr.loadflags |= KASLR_FLAG;
320
318 /* Record the various known unsafe memory ranges. */ 321 /* Record the various known unsafe memory ranges. */
319 mem_avoid_init((unsigned long)input, input_size, 322 mem_avoid_init((unsigned long)input, input_size,
320 (unsigned long)output, output_size); 323 (unsigned long)output, output_size);
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
29#include <asm/page_types.h> 29#include <asm/page_types.h>
30#include <asm/boot.h> 30#include <asm/boot.h>
31#include <asm/asm-offsets.h> 31#include <asm/asm-offsets.h>
32#include <asm/bootparam.h>
32 33
33 __HEAD 34 __HEAD
34ENTRY(startup_32) 35ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
102 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 103 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
103 * us to not reload segments 104 * us to not reload segments
104 */ 105 */
105 testb $(1<<6), BP_loadflags(%esi) 106 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
106 jnz 1f 107 jnz 1f
107 108
108 cli 109 cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
31#include <asm/msr.h> 31#include <asm/msr.h>
32#include <asm/processor-flags.h> 32#include <asm/processor-flags.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/bootparam.h>
34 35
35 __HEAD 36 __HEAD
36 .code32 37 .code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
46 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 47 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
47 * us to not reload segments 48 * us to not reload segments
48 */ 49 */
49 testb $(1<<6), BP_loadflags(%esi) 50 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
50 jnz 1f 51 jnz 1f
51 52
52 cli 53 cli
@@ -164,7 +165,7 @@ ENTRY(startup_32)
164 /* After gdt is loaded */ 165 /* After gdt is loaded */
165 xorl %eax, %eax 166 xorl %eax, %eax
166 lldt %ax 167 lldt %ax
167 movl $0x20, %eax 168 movl $__BOOT_TSS, %eax
168 ltr %ax 169 ltr %ax
169 170
170 /* 171 /*
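
Both head_32.S and head_64.S hunks only replace magic numbers with symbolic names; the values are unchanged and can be read off the literals being replaced. As a C-style reference (values taken from the replaced literals above and the boot.txt hunk, not from this patch's header changes):

	#define KASLR_FLAG	(1 << 1)	/* loadflags bit 1, per the boot.txt hunk */
	#define KEEP_SEGMENTS	(1 << 6)	/* loadflags bit 6, replaces the "$(1<<6)" literal */
	#define __BOOT_TSS	0x20		/* boot GDT TSS selector, replaces the "$0x20" literal */
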
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
377 377
378 real_mode = rmode; 378 real_mode = rmode;
379 379
380 /* Clear it for solely in-kernel use */
381 real_mode->hdr.loadflags &= ~KASLR_FLAG;
382
380 sanitize_boot_params(real_mode); 383 sanitize_boot_params(real_mode);
381 384
382 if (real_mode->screen_info.orig_video_mode == 7) { 385 if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
401 * the entire decompressed kernel plus relocation table, or the 404 * the entire decompressed kernel plus relocation table, or the
402 * entire decompressed kernel plus .bss and .brk sections. 405 * entire decompressed kernel plus .bss and .brk sections.
403 */ 406 */
404 output = choose_kernel_location(input_data, input_len, output, 407 output = choose_kernel_location(real_mode, input_data, input_len, output,
405 output_len > run_size ? output_len 408 output_len > run_size ? output_len
406 : run_size); 409 : run_size);
407 410
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
57 57
58#if CONFIG_RANDOMIZE_BASE 58#if CONFIG_RANDOMIZE_BASE
59/* aslr.c */ 59/* aslr.c */
60unsigned char *choose_kernel_location(unsigned char *input, 60unsigned char *choose_kernel_location(struct boot_params *boot_params,
61 unsigned char *input,
61 unsigned long input_size, 62 unsigned long input_size,
62 unsigned char *output, 63 unsigned char *output,
63 unsigned long output_size); 64 unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
65bool has_cpuflag(int flag); 66bool has_cpuflag(int flag);
66#else 67#else
67static inline 68static inline
68unsigned char *choose_kernel_location(unsigned char *input, 69unsigned char *choose_kernel_location(struct boot_params *boot_params,
70 unsigned char *input,
69 unsigned long input_size, 71 unsigned long input_size,
70 unsigned char *output, 72 unsigned char *output,
71 unsigned long output_size) 73 unsigned long output_size)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block:
178 ## 2a) PROCESS FULL BLOCKS: 178 ## 2a) PROCESS FULL BLOCKS:
179 ################################################################ 179 ################################################################
180full_block: 180full_block:
181 movq $128,%rax 181 movl $128,%eax
182 lea 128*8*2(block_0), block_1 182 lea 128*8*2(block_0), block_1
183 lea 128*8*3(block_0), block_2 183 lea 128*8*3(block_0), block_2
184 add $128*8*1, block_0 184 add $128*8*1, block_0
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a039d21986a2..a350c990dc86 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
264 movq R1, 8(%rsi) 264 movq R1, 8(%rsi)
265 265
266 popq R1 266 popq R1
267 movq $1,%rax 267 movl $1,%eax
268 ret 268 ret
269ENDPROC(twofish_enc_blk) 269ENDPROC(twofish_enc_blk)
270 270
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
316 movq R1, 8(%rsi) 316 movq R1, 8(%rsi)
317 317
318 popq R1 318 popq R1
319 movq $1,%rax 319 movl $1,%eax
320 ret 320 ret
321ENDPROC(twofish_dec_blk) 321ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o 5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
6obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
7 6
8obj-$(CONFIG_IA32_AOUT) += ia32_aout.o 7obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
9 8
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..c81d35e6c7f1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
161} 161}
162 162
163static int ia32_restore_sigcontext(struct pt_regs *regs, 163static int ia32_restore_sigcontext(struct pt_regs *regs,
164 struct sigcontext_ia32 __user *sc, 164 struct sigcontext_ia32 __user *sc)
165 unsigned int *pax)
166{ 165{
167 unsigned int tmpflags, err = 0; 166 unsigned int tmpflags, err = 0;
168 void __user *buf; 167 void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
184 RELOAD_SEG(es); 183 RELOAD_SEG(es);
185 184
186 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 185 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
187 COPY(dx); COPY(cx); COPY(ip); 186 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
188 /* Don't touch extended registers */ 187 /* Don't touch extended registers */
189 188
190 COPY_SEG_CPL3(cs); 189 COPY_SEG_CPL3(cs);
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
197 196
198 get_user_ex(tmp, &sc->fpstate); 197 get_user_ex(tmp, &sc->fpstate);
199 buf = compat_ptr(tmp); 198 buf = compat_ptr(tmp);
200
201 get_user_ex(*pax, &sc->ax);
202 } get_user_catch(err); 199 } get_user_catch(err);
203 200
204 err |= restore_xstate_sig(buf, 1); 201 err |= restore_xstate_sig(buf, 1);
205 202
203 force_iret();
204
206 return err; 205 return err;
207} 206}
208 207
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
211 struct pt_regs *regs = current_pt_regs(); 210 struct pt_regs *regs = current_pt_regs();
212 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); 211 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
213 sigset_t set; 212 sigset_t set;
214 unsigned int ax;
215 213
216 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 214 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
217 goto badframe; 215 goto badframe;
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
224 222
225 set_current_blocked(&set); 223 set_current_blocked(&set);
226 224
227 if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) 225 if (ia32_restore_sigcontext(regs, &frame->sc))
228 goto badframe; 226 goto badframe;
229 return ax; 227 return regs->ax;
230 228
231badframe: 229badframe:
232 signal_fault(regs, frame, "32bit sigreturn"); 230 signal_fault(regs, frame, "32bit sigreturn");
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
238 struct pt_regs *regs = current_pt_regs(); 236 struct pt_regs *regs = current_pt_regs();
239 struct rt_sigframe_ia32 __user *frame; 237 struct rt_sigframe_ia32 __user *frame;
240 sigset_t set; 238 sigset_t set;
241 unsigned int ax;
242 239
243 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); 240 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
244 241
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
249 246
250 set_current_blocked(&set); 247 set_current_blocked(&set);
251 248
252 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 249 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
253 goto badframe; 250 goto badframe;
254 251
255 if (compat_restore_altstack(&frame->uc.uc_stack)) 252 if (compat_restore_altstack(&frame->uc.uc_stack))
256 goto badframe; 253 goto badframe;
257 254
258 return ax; 255 return regs->ax;
259 256
260badframe: 257badframe:
261 signal_fault(regs, frame, "32bit rt sigreturn"); 258 signal_fault(regs, frame, "32bit rt sigreturn");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..a821b1cd4fa7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
30 30
31 .section .entry.text, "ax" 31 .section .entry.text, "ax"
32 32
33 .macro IA32_ARG_FIXUP noebp=0 33 /* clobbers %rax */
34 movl %edi,%r8d 34 .macro CLEAR_RREGS _r9=rax
35 .if \noebp
36 .else
37 movl %ebp,%r9d
38 .endif
39 xchg %ecx,%esi
40 movl %ebx,%edi
41 movl %edx,%edx /* zero extension */
42 .endm
43
44 /* clobbers %eax */
45 .macro CLEAR_RREGS offset=0, _r9=rax
46 xorl %eax,%eax 35 xorl %eax,%eax
47 movq %rax,\offset+R11(%rsp) 36 movq %rax,R11(%rsp)
48 movq %rax,\offset+R10(%rsp) 37 movq %rax,R10(%rsp)
49 movq %\_r9,\offset+R9(%rsp) 38 movq %\_r9,R9(%rsp)
50 movq %rax,\offset+R8(%rsp) 39 movq %rax,R8(%rsp)
51 .endm 40 .endm
52 41
53 /* 42 /*
@@ -60,14 +49,14 @@
60 * If it's -1 to make us punt the syscall, then (u32)-1 is still 49 * If it's -1 to make us punt the syscall, then (u32)-1 is still
61 * an appropriately invalid value. 50 * an appropriately invalid value.
62 */ 51 */
63 .macro LOAD_ARGS32 offset, _r9=0 52 .macro LOAD_ARGS32 _r9=0
64 .if \_r9 53 .if \_r9
65 movl \offset+16(%rsp),%r9d 54 movl R9(%rsp),%r9d
66 .endif 55 .endif
67 movl \offset+40(%rsp),%ecx 56 movl RCX(%rsp),%ecx
68 movl \offset+48(%rsp),%edx 57 movl RDX(%rsp),%edx
69 movl \offset+56(%rsp),%esi 58 movl RSI(%rsp),%esi
70 movl \offset+64(%rsp),%edi 59 movl RDI(%rsp),%edi
71 movl %eax,%eax /* zero extension */ 60 movl %eax,%eax /* zero extension */
72 .endm 61 .endm
73 62
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit)
99/* 88/*
100 * 32bit SYSENTER instruction entry. 89 * 32bit SYSENTER instruction entry.
101 * 90 *
91 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
92 * IF and VM in rflags are cleared (IOW: interrupts are off).
93 * SYSENTER does not save anything on the stack,
94 * and does not save old rip (!!!) and rflags.
95 *
102 * Arguments: 96 * Arguments:
103 * %eax System call number. 97 * eax system call number
104 * %ebx Arg1 98 * ebx arg1
105 * %ecx Arg2 99 * ecx arg2
106 * %edx Arg3 100 * edx arg3
107 * %esi Arg4 101 * esi arg4
108 * %edi Arg5 102 * edi arg5
109 * %ebp user stack 103 * ebp user stack
110 * 0(%ebp) Arg6 104 * 0(%ebp) arg6
111 * 105 *
112 * Interrupts off.
113 *
114 * This is purely a fast path. For anything complicated we use the int 0x80 106 * This is purely a fast path. For anything complicated we use the int 0x80
115 * path below. Set up a complete hardware stack frame to share code 107 * path below. We set up a complete hardware stack frame to share code
116 * with the int 0x80 path. 108 * with the int 0x80 path.
117 */ 109 */
118ENTRY(ia32_sysenter_target) 110ENTRY(ia32_sysenter_target)
119 CFI_STARTPROC32 simple 111 CFI_STARTPROC32 simple
120 CFI_SIGNAL_FRAME 112 CFI_SIGNAL_FRAME
121 CFI_DEF_CFA rsp,0 113 CFI_DEF_CFA rsp,0
122 CFI_REGISTER rsp,rbp 114 CFI_REGISTER rsp,rbp
123 SWAPGS_UNSAFE_STACK 115
124 movq PER_CPU_VAR(kernel_stack), %rsp
125 addq $(KERNEL_STACK_OFFSET),%rsp
126 /* 116 /*
127 * No need to follow this irqs on/off section: the syscall 117 * Interrupts are off on entry.
128 * disabled irqs, here we enable it straight after entry: 118 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
119 * it is too small to ever cause noticeable irq latency.
129 */ 120 */
121 SWAPGS_UNSAFE_STACK
122 movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
130 ENABLE_INTERRUPTS(CLBR_NONE) 123 ENABLE_INTERRUPTS(CLBR_NONE)
131 movl %ebp,%ebp /* zero extension */ 124
132 pushq_cfi $__USER32_DS 125 /* Zero-extending 32-bit regs, do not remove */
133 /*CFI_REL_OFFSET ss,0*/ 126 movl %ebp, %ebp
134 pushq_cfi %rbp
135 CFI_REL_OFFSET rsp,0
136 pushfq_cfi
137 /*CFI_REL_OFFSET rflags,0*/
138 movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
139 CFI_REGISTER rip,r10
140 pushq_cfi $__USER32_CS
141 /*CFI_REL_OFFSET cs,0*/
142 movl %eax, %eax 127 movl %eax, %eax
143 pushq_cfi %r10 128
144 CFI_REL_OFFSET rip,0 129 movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
145 pushq_cfi %rax 130 CFI_REGISTER rip,r10
131
132 /* Construct struct pt_regs on stack */
133 pushq_cfi $__USER32_DS /* pt_regs->ss */
134 pushq_cfi %rbp /* pt_regs->sp */
135 CFI_REL_OFFSET rsp,0
136 pushfq_cfi /* pt_regs->flags */
137 pushq_cfi $__USER32_CS /* pt_regs->cs */
138 pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */
139 CFI_REL_OFFSET rip,0
140 pushq_cfi_reg rax /* pt_regs->orig_ax */
141 pushq_cfi_reg rdi /* pt_regs->di */
142 pushq_cfi_reg rsi /* pt_regs->si */
143 pushq_cfi_reg rdx /* pt_regs->dx */
144 pushq_cfi_reg rcx /* pt_regs->cx */
145 pushq_cfi_reg rax /* pt_regs->ax */
146 cld 146 cld
147 SAVE_ARGS 0,1,0 147 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
148 /* no need to do an access_ok check here because rbp has been 148 CFI_ADJUST_CFA_OFFSET 10*8
149 32bit zero extended */ 149
150 /*
151 * no need to do an access_ok check here because rbp has been
152 * 32bit zero extended
153 */
150 ASM_STAC 154 ASM_STAC
1511: movl (%rbp),%ebp 1551: movl (%rbp),%ebp
152 _ASM_EXTABLE(1b,ia32_badarg) 156 _ASM_EXTABLE(1b,ia32_badarg)
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target)
157 * ourselves. To save a few cycles, we can check whether 161 * ourselves. To save a few cycles, we can check whether
158 * NT was set instead of doing an unconditional popfq. 162 * NT was set instead of doing an unconditional popfq.
159 */ 163 */
160 testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) 164 testl $X86_EFLAGS_NT,EFLAGS(%rsp)
161 jnz sysenter_fix_flags 165 jnz sysenter_fix_flags
162sysenter_flags_fixed: 166sysenter_flags_fixed:
163 167
164 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 168 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
165 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 169 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
166 CFI_REMEMBER_STATE 170 CFI_REMEMBER_STATE
167 jnz sysenter_tracesys 171 jnz sysenter_tracesys
168 cmpq $(IA32_NR_syscalls-1),%rax 172 cmpq $(IA32_NR_syscalls-1),%rax
169 ja ia32_badsys 173 ja ia32_badsys
170sysenter_do_call: 174sysenter_do_call:
171 IA32_ARG_FIXUP 175 /* 32bit syscall -> 64bit C ABI argument conversion */
176 movl %edi,%r8d /* arg5 */
177 movl %ebp,%r9d /* arg6 */
178 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
179 movl %ebx,%edi /* arg1 */
180 movl %edx,%edx /* arg3 (zero extension) */
172sysenter_dispatch: 181sysenter_dispatch:
173 call *ia32_sys_call_table(,%rax,8) 182 call *ia32_sys_call_table(,%rax,8)
174 movq %rax,RAX-ARGOFFSET(%rsp) 183 movq %rax,RAX(%rsp)
175 DISABLE_INTERRUPTS(CLBR_NONE) 184 DISABLE_INTERRUPTS(CLBR_NONE)
176 TRACE_IRQS_OFF 185 TRACE_IRQS_OFF
177 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 186 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
178 jnz sysexit_audit 187 jnz sysexit_audit
179sysexit_from_sys_call: 188sysexit_from_sys_call:
180 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 189 /*
181 /* clear IF, that popfq doesn't enable interrupts early */ 190 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
182 andl $~0x200,EFLAGS-ARGOFFSET(%rsp) 192 * and an NMI followed by an IRQ with usergs is fatal. So
183 movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ 192 * and and NMI followed by an IRQ with usergs is fatal. So
184 CFI_REGISTER rip,rdx 193 * we just pretend we're using SYSEXIT but we really use
185 RESTORE_ARGS 0,24,0,0,0,0 194 * SYSRETL instead.
195 *
196 * This code path is still called 'sysexit' because it pairs
197 * with 'sysenter' and it uses the SYSENTER calling convention.
198 */
199 andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
200 movl RIP(%rsp),%ecx /* User %eip */
201 CFI_REGISTER rip,rcx
202 RESTORE_RSI_RDI
203 xorl %edx,%edx /* avoid info leaks */
186 xorq %r8,%r8 204 xorq %r8,%r8
187 xorq %r9,%r9 205 xorq %r9,%r9
188 xorq %r10,%r10 206 xorq %r10,%r10
189 xorq %r11,%r11 207 movl EFLAGS(%rsp),%r11d /* User eflags */
190 popfq_cfi
191 /*CFI_RESTORE rflags*/ 208 /*CFI_RESTORE rflags*/
192 popq_cfi %rcx /* User %esp */
193 CFI_REGISTER rsp,rcx
194 TRACE_IRQS_ON 209 TRACE_IRQS_ON
195 ENABLE_INTERRUPTS_SYSEXIT32 210
211 /*
212 * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
213 * since it avoids a dicey window with interrupts enabled.
214 */
215 movl RSP(%rsp),%esp
216
217 /*
218 * USERGS_SYSRET32 does:
219 * gsbase = user's gs base
220 * eip = ecx
221 * rflags = r11
222 * cs = __USER32_CS
223 * ss = __USER_DS
224 *
225 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
226 *
227 * pop %ebp
228 * pop %edx
229 * pop %ecx
230 *
231 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
232 * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
233 * address (already known to user code), and R12-R15 are
234 * callee-saved and therefore don't contain any interesting
235 * kernel data.
236 */
237 USERGS_SYSRET32
196 238
197 CFI_RESTORE_STATE 239 CFI_RESTORE_STATE
198 240
@@ -205,18 +247,18 @@ sysexit_from_sys_call:
205 movl %ebx,%esi /* 2nd arg: 1st syscall arg */ 247 movl %ebx,%esi /* 2nd arg: 1st syscall arg */
206 movl %eax,%edi /* 1st arg: syscall number */ 248 movl %eax,%edi /* 1st arg: syscall number */
207 call __audit_syscall_entry 249 call __audit_syscall_entry
208 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ 250 movl RAX(%rsp),%eax /* reload syscall number */
209 cmpq $(IA32_NR_syscalls-1),%rax 251 cmpq $(IA32_NR_syscalls-1),%rax
210 ja ia32_badsys 252 ja ia32_badsys
211 movl %ebx,%edi /* reload 1st syscall arg */ 253 movl %ebx,%edi /* reload 1st syscall arg */
212 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ 254 movl RCX(%rsp),%esi /* reload 2nd syscall arg */
213 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ 255 movl RDX(%rsp),%edx /* reload 3rd syscall arg */
214 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ 256 movl RSI(%rsp),%ecx /* reload 4th syscall arg */
215 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ 257 movl RDI(%rsp),%r8d /* reload 5th syscall arg */
216 .endm 258 .endm
217 259
218 .macro auditsys_exit exit 260 .macro auditsys_exit exit
219 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 261 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
220 jnz ia32_ret_from_sys_call 262 jnz ia32_ret_from_sys_call
221 TRACE_IRQS_ON 263 TRACE_IRQS_ON
222 ENABLE_INTERRUPTS(CLBR_NONE) 264 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +269,13 @@ sysexit_from_sys_call:
2271: setbe %al /* 1 if error, 0 if not */ 2691: setbe %al /* 1 if error, 0 if not */
228 movzbl %al,%edi /* zero-extend that into %edi */ 270 movzbl %al,%edi /* zero-extend that into %edi */
229 call __audit_syscall_exit 271 call __audit_syscall_exit
230 movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ 272 movq RAX(%rsp),%rax /* reload syscall return value */
231 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 273 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
232 DISABLE_INTERRUPTS(CLBR_NONE) 274 DISABLE_INTERRUPTS(CLBR_NONE)
233 TRACE_IRQS_OFF 275 TRACE_IRQS_OFF
234 testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 276 testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
235 jz \exit 277 jz \exit
236 CLEAR_RREGS -ARGOFFSET 278 CLEAR_RREGS
237 jmp int_with_check 279 jmp int_with_check
238 .endm 280 .endm
239 281
@@ -253,16 +295,16 @@ sysenter_fix_flags:
253 295
254sysenter_tracesys: 296sysenter_tracesys:
255#ifdef CONFIG_AUDITSYSCALL 297#ifdef CONFIG_AUDITSYSCALL
256 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 298 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
257 jz sysenter_auditsys 299 jz sysenter_auditsys
258#endif 300#endif
259 SAVE_REST 301 SAVE_EXTRA_REGS
260 CLEAR_RREGS 302 CLEAR_RREGS
261 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 303 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
262 movq %rsp,%rdi /* &pt_regs -> arg1 */ 304 movq %rsp,%rdi /* &pt_regs -> arg1 */
263 call syscall_trace_enter 305 call syscall_trace_enter
264 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 306 LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
265 RESTORE_REST 307 RESTORE_EXTRA_REGS
266 cmpq $(IA32_NR_syscalls-1),%rax 308 cmpq $(IA32_NR_syscalls-1),%rax
267 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 309 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
268 jmp sysenter_do_call 310 jmp sysenter_do_call
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target)
272/* 314/*
273 * 32bit SYSCALL instruction entry. 315 * 32bit SYSCALL instruction entry.
274 * 316 *
317 * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
318 * then loads new ss, cs, and rip from previously programmed MSRs.
319 * rflags gets masked by a value from another MSR (so CLD and CLAC
320 * are not needed). SYSCALL does not save anything on the stack
321 * and does not change rsp.
322 *
323 * Note: rflags saving+masking-with-MSR happens only in Long mode
324 * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
325 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
326 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
327 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
328 *
275 * Arguments: 329 * Arguments:
276 * %eax System call number. 330 * eax system call number
277 * %ebx Arg1 331 * ecx return address
278 * %ecx return EIP 332 * ebx arg1
279 * %edx Arg3 333 * ebp arg2 (note: not saved in the stack frame, should not be touched)
280 * %esi Arg4 334 * edx arg3
281 * %edi Arg5 335 * esi arg4
282 * %ebp Arg2 [note: not saved in the stack frame, should not be touched] 336 * edi arg5
283 * %esp user stack 337 * esp user stack
284 * 0(%esp) Arg6 338 * 0(%esp) arg6
285 * 339 *
286 * Interrupts off.
287 *
288 * This is purely a fast path. For anything complicated we use the int 0x80 340 * This is purely a fast path. For anything complicated we use the int 0x80
289 * path below. Set up a complete hardware stack frame to share code 341 * path below. We set up a complete hardware stack frame to share code
290 * with the int 0x80 path. 342 * with the int 0x80 path.
291 */ 343 */
292ENTRY(ia32_cstar_target) 344ENTRY(ia32_cstar_target)
293 CFI_STARTPROC32 simple 345 CFI_STARTPROC32 simple
294 CFI_SIGNAL_FRAME 346 CFI_SIGNAL_FRAME
295 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 347 CFI_DEF_CFA rsp,0
296 CFI_REGISTER rip,rcx 348 CFI_REGISTER rip,rcx
297 /*CFI_REGISTER rflags,r11*/ 349 /*CFI_REGISTER rflags,r11*/
350
351 /*
352 * Interrupts are off on entry.
353 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
354 * it is too small to ever cause noticeable irq latency.
355 */
298 SWAPGS_UNSAFE_STACK 356 SWAPGS_UNSAFE_STACK
299 movl %esp,%r8d 357 movl %esp,%r8d
300 CFI_REGISTER rsp,r8 358 CFI_REGISTER rsp,r8
301 movq PER_CPU_VAR(kernel_stack),%rsp 359 movq PER_CPU_VAR(kernel_stack),%rsp
302 /*
303 * No need to follow this irqs on/off section: the syscall
304 * disabled irqs and here we enable it straight after entry:
305 */
306 ENABLE_INTERRUPTS(CLBR_NONE) 360 ENABLE_INTERRUPTS(CLBR_NONE)
307 SAVE_ARGS 8,0,0 361
308 movl %eax,%eax /* zero extension */ 362 /* Zero-extending 32-bit regs, do not remove */
309 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 363 movl %eax,%eax
310 movq %rcx,RIP-ARGOFFSET(%rsp) 364
311 CFI_REL_OFFSET rip,RIP-ARGOFFSET 365 /* Construct struct pt_regs on stack */
312 movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ 366 pushq_cfi $__USER32_DS /* pt_regs->ss */
367 pushq_cfi %r8 /* pt_regs->sp */
368 CFI_REL_OFFSET rsp,0
369 pushq_cfi %r11 /* pt_regs->flags */
370 pushq_cfi $__USER32_CS /* pt_regs->cs */
371 pushq_cfi %rcx /* pt_regs->ip */
372 CFI_REL_OFFSET rip,0
373 pushq_cfi_reg rax /* pt_regs->orig_ax */
374 pushq_cfi_reg rdi /* pt_regs->di */
375 pushq_cfi_reg rsi /* pt_regs->si */
376 pushq_cfi_reg rdx /* pt_regs->dx */
377 pushq_cfi_reg rbp /* pt_regs->cx */
313 movl %ebp,%ecx 378 movl %ebp,%ecx
314 movq $__USER32_CS,CS-ARGOFFSET(%rsp) 379 pushq_cfi_reg rax /* pt_regs->ax */
315 movq $__USER32_DS,SS-ARGOFFSET(%rsp) 380 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
316 movq %r11,EFLAGS-ARGOFFSET(%rsp) 381 CFI_ADJUST_CFA_OFFSET 10*8
317 /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ 382
318 movq %r8,RSP-ARGOFFSET(%rsp) 383 /*
319 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 384 * no need to do an access_ok check here because r8 has been
320 /* no need to do an access_ok check here because r8 has been 385 * 32bit zero extended
321 32bit zero extended */ 386 */
322 /* hardware stack frame is complete now */
323 ASM_STAC 387 ASM_STAC
3241: movl (%r8),%r9d 3881: movl (%r8),%r9d
325 _ASM_EXTABLE(1b,ia32_badarg) 389 _ASM_EXTABLE(1b,ia32_badarg)
326 ASM_CLAC 390 ASM_CLAC
327 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 391 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
328 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 392 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
329 CFI_REMEMBER_STATE 393 CFI_REMEMBER_STATE
330 jnz cstar_tracesys 394 jnz cstar_tracesys
331 cmpq $IA32_NR_syscalls-1,%rax 395 cmpq $IA32_NR_syscalls-1,%rax
332 ja ia32_badsys 396 ja ia32_badsys
333cstar_do_call: 397cstar_do_call:
334 IA32_ARG_FIXUP 1 398 /* 32bit syscall -> 64bit C ABI argument conversion */
399 movl %edi,%r8d /* arg5 */
400 /* r9 already loaded */ /* arg6 */
401 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
402 movl %ebx,%edi /* arg1 */
403 movl %edx,%edx /* arg3 (zero extension) */
335cstar_dispatch: 404cstar_dispatch:
336 call *ia32_sys_call_table(,%rax,8) 405 call *ia32_sys_call_table(,%rax,8)
337 movq %rax,RAX-ARGOFFSET(%rsp) 406 movq %rax,RAX(%rsp)
338 DISABLE_INTERRUPTS(CLBR_NONE) 407 DISABLE_INTERRUPTS(CLBR_NONE)
339 TRACE_IRQS_OFF 408 TRACE_IRQS_OFF
340 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 409 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
341 jnz sysretl_audit 410 jnz sysretl_audit
342sysretl_from_sys_call: 411sysretl_from_sys_call:
343 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 412 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
344 RESTORE_ARGS 0,-ARG_SKIP,0,0,0 413 RESTORE_RSI_RDI_RDX
345 movl RIP-ARGOFFSET(%rsp),%ecx 414 movl RIP(%rsp),%ecx
346 CFI_REGISTER rip,rcx 415 CFI_REGISTER rip,rcx
347 movl EFLAGS-ARGOFFSET(%rsp),%r11d 416 movl EFLAGS(%rsp),%r11d
348 /*CFI_REGISTER rflags,r11*/ 417 /*CFI_REGISTER rflags,r11*/
349 xorq %r10,%r10 418 xorq %r10,%r10
350 xorq %r9,%r9 419 xorq %r9,%r9
351 xorq %r8,%r8 420 xorq %r8,%r8
352 TRACE_IRQS_ON 421 TRACE_IRQS_ON
353 movl RSP-ARGOFFSET(%rsp),%esp 422 movl RSP(%rsp),%esp
354 CFI_RESTORE rsp 423 CFI_RESTORE rsp
424 /*
425 * 64bit->32bit SYSRET restores eip from ecx,
426 * eflags from r11 (but RF and VM bits are forced to 0),
427 * cs and ss are loaded from MSRs.
428 * (Note: 32bit->32bit SYSRET is different: since r11
429 * does not exist, it merely sets eflags.IF=1).
430 */
355 USERGS_SYSRET32 431 USERGS_SYSRET32
356 432
357#ifdef CONFIG_AUDITSYSCALL 433#ifdef CONFIG_AUDITSYSCALL
358cstar_auditsys: 434cstar_auditsys:
359 CFI_RESTORE_STATE 435 CFI_RESTORE_STATE
360 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ 436 movl %r9d,R9(%rsp) /* register to be clobbered by call */
361 auditsys_entry_common 437 auditsys_entry_common
362 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ 438 movl R9(%rsp),%r9d /* reload 6th syscall arg */
363 jmp cstar_dispatch 439 jmp cstar_dispatch
364 440
365sysretl_audit: 441sysretl_audit:
@@ -368,17 +444,17 @@ sysretl_audit:
368 444
369cstar_tracesys: 445cstar_tracesys:
370#ifdef CONFIG_AUDITSYSCALL 446#ifdef CONFIG_AUDITSYSCALL
371 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 447 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
372 jz cstar_auditsys 448 jz cstar_auditsys
373#endif 449#endif
374 xchgl %r9d,%ebp 450 xchgl %r9d,%ebp
375 SAVE_REST 451 SAVE_EXTRA_REGS
376 CLEAR_RREGS 0, r9 452 CLEAR_RREGS r9
377 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 453 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
378 movq %rsp,%rdi /* &pt_regs -> arg1 */ 454 movq %rsp,%rdi /* &pt_regs -> arg1 */
379 call syscall_trace_enter 455 call syscall_trace_enter
380 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ 456 LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
381 RESTORE_REST 457 RESTORE_EXTRA_REGS
382 xchgl %ebp,%r9d 458 xchgl %ebp,%r9d
383 cmpq $(IA32_NR_syscalls-1),%rax 459 cmpq $(IA32_NR_syscalls-1),%rax
384 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 460 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,78 +467,94 @@ ia32_badarg:
391 jmp ia32_sysret 467 jmp ia32_sysret
392 CFI_ENDPROC 468 CFI_ENDPROC
393 469
394/* 470/*
395 * Emulated IA32 system calls via int 0x80. 471 * Emulated IA32 system calls via int 0x80.
396 * 472 *
397 * Arguments: 473 * Arguments:
398 * %eax System call number. 474 * eax system call number
399 * %ebx Arg1 475 * ebx arg1
400 * %ecx Arg2 476 * ecx arg2
401 * %edx Arg3 477 * edx arg3
402 * %esi Arg4 478 * esi arg4
403 * %edi Arg5 479 * edi arg5
404 * %ebp Arg6 [note: not saved in the stack frame, should not be touched] 480 * ebp arg6 (note: not saved in the stack frame, should not be touched)
405 * 481 *
406 * Notes: 482 * Notes:
407 * Uses the same stack frame as the x86-64 version. 483 * Uses the same stack frame as the x86-64 version.
408 * All registers except %eax must be saved (but ptrace may violate that) 484 * All registers except eax must be saved (but ptrace may violate that).
409 * Arguments are zero extended. For system calls that want sign extension and 485 * Arguments are zero extended. For system calls that want sign extension and
410 * take long arguments a wrapper is needed. Most calls can just be called 486 * take long arguments a wrapper is needed. Most calls can just be called
411 * directly. 487 * directly.
412 * Assumes it is only called from user space and entered with interrupts off. 488 * Assumes it is only called from user space and entered with interrupts off.
413 */ 489 */
414 490
415ENTRY(ia32_syscall) 491ENTRY(ia32_syscall)
416 CFI_STARTPROC32 simple 492 CFI_STARTPROC32 simple
417 CFI_SIGNAL_FRAME 493 CFI_SIGNAL_FRAME
418 CFI_DEF_CFA rsp,SS+8-RIP 494 CFI_DEF_CFA rsp,5*8
419 /*CFI_REL_OFFSET ss,SS-RIP*/ 495 /*CFI_REL_OFFSET ss,4*8 */
420 CFI_REL_OFFSET rsp,RSP-RIP 496 CFI_REL_OFFSET rsp,3*8
421 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 497 /*CFI_REL_OFFSET rflags,2*8 */
422 /*CFI_REL_OFFSET cs,CS-RIP*/ 498 /*CFI_REL_OFFSET cs,1*8 */
423 CFI_REL_OFFSET rip,RIP-RIP 499 CFI_REL_OFFSET rip,0*8
424 PARAVIRT_ADJUST_EXCEPTION_FRAME 500
425 SWAPGS
426 /* 501 /*
427 * No need to follow this irqs on/off section: the syscall 502 * Interrupts are off on entry.
428 * disabled irqs and here we enable it straight after entry: 503 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
504 * it is too small to ever cause noticeable irq latency.
429 */ 505 */
506 PARAVIRT_ADJUST_EXCEPTION_FRAME
507 SWAPGS
430 ENABLE_INTERRUPTS(CLBR_NONE) 508 ENABLE_INTERRUPTS(CLBR_NONE)
431 movl %eax,%eax 509
432 pushq_cfi %rax 510 /* Zero-extending 32-bit regs, do not remove */
511 movl %eax,%eax
512
513 /* Construct struct pt_regs on stack (iret frame is already on stack) */
514 pushq_cfi_reg rax /* pt_regs->orig_ax */
515 pushq_cfi_reg rdi /* pt_regs->di */
516 pushq_cfi_reg rsi /* pt_regs->si */
517 pushq_cfi_reg rdx /* pt_regs->dx */
518 pushq_cfi_reg rcx /* pt_regs->cx */
519 pushq_cfi_reg rax /* pt_regs->ax */
433 cld 520 cld
434 /* note the registers are not zero extended to the sf. 521 sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
435 this could be a problem. */ 522 CFI_ADJUST_CFA_OFFSET 10*8
436 SAVE_ARGS 0,1,0 523
437 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 524 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
438 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 525 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
439 jnz ia32_tracesys 526 jnz ia32_tracesys
440 cmpq $(IA32_NR_syscalls-1),%rax 527 cmpq $(IA32_NR_syscalls-1),%rax
441 ja ia32_badsys 528 ja ia32_badsys
442ia32_do_call: 529ia32_do_call:
443 IA32_ARG_FIXUP 530 /* 32bit syscall -> 64bit C ABI argument conversion */
531 movl %edi,%r8d /* arg5 */
532 movl %ebp,%r9d /* arg6 */
533 xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
534 movl %ebx,%edi /* arg1 */
535 movl %edx,%edx /* arg3 (zero extension) */
444 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative 536 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
445ia32_sysret: 537ia32_sysret:
446 movq %rax,RAX-ARGOFFSET(%rsp) 538 movq %rax,RAX(%rsp)
447ia32_ret_from_sys_call: 539ia32_ret_from_sys_call:
448 CLEAR_RREGS -ARGOFFSET 540 CLEAR_RREGS
449 jmp int_ret_from_sys_call 541 jmp int_ret_from_sys_call
450 542
451ia32_tracesys: 543ia32_tracesys:
452 SAVE_REST 544 SAVE_EXTRA_REGS
453 CLEAR_RREGS 545 CLEAR_RREGS
454 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 546 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
455 movq %rsp,%rdi /* &pt_regs -> arg1 */ 547 movq %rsp,%rdi /* &pt_regs -> arg1 */
456 call syscall_trace_enter 548 call syscall_trace_enter
457 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 549 LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
458 RESTORE_REST 550 RESTORE_EXTRA_REGS
459 cmpq $(IA32_NR_syscalls-1),%rax 551 cmpq $(IA32_NR_syscalls-1),%rax
460 ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ 552 ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
461 jmp ia32_do_call 553 jmp ia32_do_call
462END(ia32_syscall) 554END(ia32_syscall)
463 555
464ia32_badsys: 556ia32_badsys:
465 movq $0,ORIG_RAX-ARGOFFSET(%rsp) 557 movq $0,ORIG_RAX(%rsp)
466 movq $-ENOSYS,%rax 558 movq $-ENOSYS,%rax
467 jmp ia32_sysret 559 jmp ia32_sysret
468 560
@@ -479,8 +571,6 @@ GLOBAL(\label)
479 571
480 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn 572 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
481 PTREGSCALL stub32_sigreturn, sys32_sigreturn 573 PTREGSCALL stub32_sigreturn, sys32_sigreturn
482 PTREGSCALL stub32_execve, compat_sys_execve
483 PTREGSCALL stub32_execveat, compat_sys_execveat
484 PTREGSCALL stub32_fork, sys_fork 574 PTREGSCALL stub32_fork, sys_fork
485 PTREGSCALL stub32_vfork, sys_vfork 575 PTREGSCALL stub32_vfork, sys_vfork
486 576
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone)
492 582
493 ALIGN 583 ALIGN
494ia32_ptregs_common: 584ia32_ptregs_common:
495 popq %r11
496 CFI_ENDPROC 585 CFI_ENDPROC
497 CFI_STARTPROC32 simple 586 CFI_STARTPROC32 simple
498 CFI_SIGNAL_FRAME 587 CFI_SIGNAL_FRAME
499 CFI_DEF_CFA rsp,SS+8-ARGOFFSET 588 CFI_DEF_CFA rsp,SIZEOF_PTREGS
500 CFI_REL_OFFSET rax,RAX-ARGOFFSET 589 CFI_REL_OFFSET rax,RAX
501 CFI_REL_OFFSET rcx,RCX-ARGOFFSET 590 CFI_REL_OFFSET rcx,RCX
502 CFI_REL_OFFSET rdx,RDX-ARGOFFSET 591 CFI_REL_OFFSET rdx,RDX
503 CFI_REL_OFFSET rsi,RSI-ARGOFFSET 592 CFI_REL_OFFSET rsi,RSI
504 CFI_REL_OFFSET rdi,RDI-ARGOFFSET 593 CFI_REL_OFFSET rdi,RDI
505 CFI_REL_OFFSET rip,RIP-ARGOFFSET 594 CFI_REL_OFFSET rip,RIP
506/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ 595/* CFI_REL_OFFSET cs,CS*/
507/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ 596/* CFI_REL_OFFSET rflags,EFLAGS*/
508 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 597 CFI_REL_OFFSET rsp,RSP
509/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ 598/* CFI_REL_OFFSET ss,SS*/
510 SAVE_REST 599 SAVE_EXTRA_REGS 8
511 call *%rax 600 call *%rax
512 RESTORE_REST 601 RESTORE_EXTRA_REGS 8
513 jmp ia32_sysret /* misbalances the return cache */ 602 ret
514 CFI_ENDPROC 603 CFI_ENDPROC
515END(ia32_ptregs_common) 604END(ia32_ptregs_common)
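
The open-coded "32bit syscall -> 64bit C ABI argument conversion" sequences that replace IA32_ARG_FIXUP in the sysenter, cstar, and int 0x80 paths all implement the same fixed register mapping. Summarized from the hunks above, as a C comment for reference:

	/*
	 * i386 syscall ABI           x86-64 C ABI (what the handlers expect)
	 *   eax  syscall number  ->  rax  (index into ia32_sys_call_table)
	 *   ebx  arg1            ->  rdi
	 *   ecx  arg2            ->  rsi
	 *   edx  arg3            ->  rdx  (zero-extended in place)
	 *   esi  arg4            ->  rcx
	 *   edi  arg5            ->  r8
	 *   ebp / 0(user sp) arg6 -> r9   (preloaded into r9d in the cstar path)
	 */
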
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3
4long compat_ni_syscall(void)
5{
6 return -ENOSYS;
7}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
201 advice); 201 advice);
202} 202}
203 203
204long sys32_vm86_warning(void)
205{
206 struct task_struct *me = current;
207 static char lastcomm[sizeof(me->comm)];
208
209 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
210 compat_printk(KERN_INFO
211 "%s: vm86 mode not supported on 64 bit kernel\n",
212 me->comm);
213 strncpy(lastcomm, me->comm, sizeof(lastcomm));
214 }
215 return -ENOSYS;
216}
217
218asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, 204asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
219 size_t count) 205 size_t count)
220{ 206{
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0f5d9f..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
1/* System call table for ia32 emulation. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
13
14typedef void (*sys_call_ptr_t)(void);
15
16extern void compat_ni_syscall(void);
17
18const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
24#include <asm/syscalls_32.h>
25};
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..bdf02eeee765 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,63 @@
18 .endm 18 .endm
19#endif 19#endif
20 20
21.macro altinstruction_entry orig alt feature orig_len alt_len 21.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
22 .long \orig - . 22 .long \orig - .
23 .long \alt - . 23 .long \alt - .
24 .word \feature 24 .word \feature
25 .byte \orig_len 25 .byte \orig_len
26 .byte \alt_len 26 .byte \alt_len
27 .byte \pad_len
28.endm
29
30.macro ALTERNATIVE oldinstr, newinstr, feature
31140:
32 \oldinstr
33141:
34 .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
35142:
36
37 .pushsection .altinstructions,"a"
38 altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
39 .popsection
40
41 .pushsection .altinstr_replacement,"ax"
42143:
43 \newinstr
44144:
45 .popsection
46.endm
47
48#define old_len 141b-140b
49#define new_len1 144f-143f
50#define new_len2 145f-144f
51
52/*
53 * max without conditionals. Idea adapted from:
54 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
55 */
56#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
57
58.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
59140:
60 \oldinstr
61141:
62 .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
63 (alt_max_short(new_len1, new_len2) - (old_len)),0x90
64142:
65
66 .pushsection .altinstructions,"a"
67 altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
68 altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
69 .popsection
70
71 .pushsection .altinstr_replacement,"ax"
72143:
73 \newinstr1
74144:
75 \newinstr2
76145:
77 .popsection
27.endm 78.endm
28 79
29#endif /* __ASSEMBLY__ */ 80#endif /* __ASSEMBLY__ */
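
The new alt_max_short() helper is the branchless max adapted from the bithacks page cited in the comment. A quick standalone C sanity check of that identity (plain C, where a < b evaluates to 0 or 1; the assembler expression builds the same all-ones mask under gas's own expression rules):

	#include <assert.h>

	/* When a < b, -(a < b) is an all-ones mask and a ^ (a ^ b) selects b;
	 * otherwise the mask is zero and the result stays a. */
	static int branchless_max(int a, int b)
	{
		return a ^ ((a ^ b) & -(a < b));
	}

	int main(void)
	{
		assert(branchless_max(3, 5) == 5);
		assert(branchless_max(5, 3) == 5);
		assert(branchless_max(4, 4) == 4);
		return 0;
	}
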
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..ba32af062f61 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
48 s32 repl_offset; /* offset to replacement instruction */ 48 s32 repl_offset; /* offset to replacement instruction */
49 u16 cpuid; /* cpuid bit set for replacement */ 49 u16 cpuid; /* cpuid bit set for replacement */
50 u8 instrlen; /* length of original instruction */ 50 u8 instrlen; /* length of original instruction */
51 u8 replacementlen; /* length of new instruction, <= instrlen */ 51 u8 replacementlen; /* length of new instruction */
52}; 52 u8 padlen; /* length of build-time padding */
53} __packed;
53 54
54extern void alternative_instructions(void); 55extern void alternative_instructions(void);
55extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); 56extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end)
76} 77}
77#endif /* CONFIG_SMP */ 78#endif /* CONFIG_SMP */
78 79
79#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" 80#define b_replacement(num) "664"#num
81#define e_replacement(num) "665"#num
80 82
81#define b_replacement(number) "663"#number 83#define alt_end_marker "663"
82#define e_replacement(number) "664"#number 84#define alt_slen "662b-661b"
85#define alt_pad_len alt_end_marker"b-662b"
86#define alt_total_slen alt_end_marker"b-661b"
87#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
83 88
84#define alt_slen "662b-661b" 89#define __OLDINSTR(oldinstr, num) \
85#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" 90 "661:\n\t" oldinstr "\n662:\n" \
91 ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
92 "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
86 93
87#define ALTINSTR_ENTRY(feature, number) \ 94#define OLDINSTR(oldinstr, num) \
95 __OLDINSTR(oldinstr, num) \
96 alt_end_marker ":\n"
97
98/*
99 * max without conditionals. Idea adapted from:
100 * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
101 *
102 * The additional "-" is needed because gas works with s32s.
103 */
104#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
105
106/*
107 * Pad the second replacement alternative with additional NOPs if it is
108 * additionally longer than the first replacement alternative.
109 */
110#define OLDINSTR_2(oldinstr, num1, num2) \
111 "661:\n\t" oldinstr "\n662:\n" \
112 ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
113 "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
114 alt_end_marker ":\n"
115
116#define ALTINSTR_ENTRY(feature, num) \
88 " .long 661b - .\n" /* label */ \ 117 " .long 661b - .\n" /* label */ \
89 " .long " b_replacement(number)"f - .\n" /* new instruction */ \ 118 " .long " b_replacement(num)"f - .\n" /* new instruction */ \
90 " .word " __stringify(feature) "\n" /* feature bit */ \ 119 " .word " __stringify(feature) "\n" /* feature bit */ \
91 " .byte " alt_slen "\n" /* source len */ \ 120 " .byte " alt_total_slen "\n" /* source len */ \
92 " .byte " alt_rlen(number) "\n" /* replacement len */ 121 " .byte " alt_rlen(num) "\n" /* replacement len */ \
93 122 " .byte " alt_pad_len "\n" /* pad len */
94#define DISCARD_ENTRY(number) /* rlen <= slen */ \
95 " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
96 123
97#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ 124#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
98 b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" 125 b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
99 126
100/* alternative assembly primitive: */ 127/* alternative assembly primitive: */
101#define ALTERNATIVE(oldinstr, newinstr, feature) \ 128#define ALTERNATIVE(oldinstr, newinstr, feature) \
102 OLDINSTR(oldinstr) \ 129 OLDINSTR(oldinstr, 1) \
103 ".pushsection .altinstructions,\"a\"\n" \ 130 ".pushsection .altinstructions,\"a\"\n" \
104 ALTINSTR_ENTRY(feature, 1) \ 131 ALTINSTR_ENTRY(feature, 1) \
105 ".popsection\n" \ 132 ".popsection\n" \
106 ".pushsection .discard,\"aw\",@progbits\n" \
107 DISCARD_ENTRY(1) \
108 ".popsection\n" \
109 ".pushsection .altinstr_replacement, \"ax\"\n" \ 133 ".pushsection .altinstr_replacement, \"ax\"\n" \
110 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ 134 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
111 ".popsection" 135 ".popsection"
112 136
113#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ 137#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
114 OLDINSTR(oldinstr) \ 138 OLDINSTR_2(oldinstr, 1, 2) \
115 ".pushsection .altinstructions,\"a\"\n" \ 139 ".pushsection .altinstructions,\"a\"\n" \
116 ALTINSTR_ENTRY(feature1, 1) \ 140 ALTINSTR_ENTRY(feature1, 1) \
117 ALTINSTR_ENTRY(feature2, 2) \ 141 ALTINSTR_ENTRY(feature2, 2) \
118 ".popsection\n" \ 142 ".popsection\n" \
119 ".pushsection .discard,\"aw\",@progbits\n" \
120 DISCARD_ENTRY(1) \
121 DISCARD_ENTRY(2) \
122 ".popsection\n" \
123 ".pushsection .altinstr_replacement, \"ax\"\n" \ 143 ".pushsection .altinstr_replacement, \"ax\"\n" \
124 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ 144 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
125 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ 145 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
146#define alternative(oldinstr, newinstr, feature) \ 166#define alternative(oldinstr, newinstr, feature) \
147 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") 167 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
148 168
169#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
170 asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
171
149/* 172/*
150 * Alternative inline assembly with input. 173 * Alternative inline assembly with input.
151 * 174 *
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 08f217354442..976b86a325e5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
91{ 91{
92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); 92 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
93 93
94 alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, 94 alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
95 ASM_OUTPUT2("=r" (v), "=m" (*addr)), 95 ASM_OUTPUT2("=r" (v), "=m" (*addr)),
96 ASM_OUTPUT2("0" (v), "m" (*addr))); 96 ASM_OUTPUT2("0" (v), "m" (*addr)));
97} 97}
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
95 * Stop RDTSC speculation. This is needed when you need to use RDTSC 95 * Stop RDTSC speculation. This is needed when you need to use RDTSC
96 * (or get_cycles or vread that possibly accesses the TSC) in a defined 96 * (or get_cycles or vread that possibly accesses the TSC) in a defined
97 * code region. 97 * code region.
98 *
99 * (Could use an alternative three way for this if there was one.)
100 */ 98 */
101static __always_inline void rdtsc_barrier(void) 99static __always_inline void rdtsc_barrier(void)
102{ 100{
103 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 101 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
104 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 102 "lfence", X86_FEATURE_LFENCE_RDTSC);
105} 103}
106 104
107#endif /* _ASM_X86_BARRIER_H */ 105#endif /* _ASM_X86_BARRIER_H */
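
Note on the barrier.h hunk: rdtsc_barrier() now starts from an empty original instruction and lets alternative_2() patch in MFENCE or LFENCE depending on the CPU feature bits. A hedged user-space analogue that makes the same choice at run time with a CPUID vendor check (a stand-in for X86_FEATURE_MFENCE_RDTSC/X86_FEATURE_LFENCE_RDTSC, which the kernel sets per vendor during CPU setup); the kernel, by contrast, patches the fence in once at boot:

	/* Hedged analogue of the patched rdtsc_barrier(): serialize RDTSC with
	 * MFENCE on AMD and LFENCE on Intel.  Not kernel code. */
	#include <stdio.h>
	#include <string.h>
	#include <cpuid.h>
	#include <immintrin.h>

	static void rdtsc_fence(void)
	{
		unsigned int eax, ebx, ecx, edx;
		char vendor[13] = { 0 };

		if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
			return;
		memcpy(vendor + 0, &ebx, 4);
		memcpy(vendor + 4, &edx, 4);
		memcpy(vendor + 8, &ecx, 4);

		if (!strcmp(vendor, "AuthenticAMD"))
			_mm_mfence();
		else
			_mm_lfence();
	}

	static unsigned long long rdtsc_ordered(void)
	{
		unsigned int lo, hi;

		rdtsc_fence();
		asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
		return ((unsigned long long)hi << 32) | lo;
	}

	int main(void)
	{
		printf("tsc=%llu\n", rdtsc_ordered());
		return 0;
	}
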
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b46f83..1c8b50edb2db 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with
55 * for assembly code: 55 * for assembly code:
56 */ 56 */
57 57
58#define R15 0 58/* The layout forms the "struct pt_regs" on the stack: */
59#define R14 8 59/*
60#define R13 16 60 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
61#define R12 24 61 * unless syscall needs a complete, fully filled "struct pt_regs".
62#define RBP 32 62 */
63#define RBX 40 63#define R15 0*8
64 64#define R14 1*8
65/* arguments: interrupts/non tracing syscalls only save up to here: */ 65#define R13 2*8
66#define R11 48 66#define R12 3*8
67#define R10 56 67#define RBP 4*8
68#define R9 64 68#define RBX 5*8
69#define R8 72 69/* These regs are callee-clobbered. Always saved on kernel entry. */
70#define RAX 80 70#define R11 6*8
71#define RCX 88 71#define R10 7*8
72#define RDX 96 72#define R9 8*8
73#define RSI 104 73#define R8 9*8
74#define RDI 112 74#define RAX 10*8
75#define ORIG_RAX 120 /* + error_code */ 75#define RCX 11*8
76/* end of arguments */ 76#define RDX 12*8
77 77#define RSI 13*8
78/* cpu exception frame or undefined in case of fast syscall: */ 78#define RDI 14*8
79#define RIP 128 79/*
80#define CS 136 80 * On syscall entry, this is syscall#. On CPU exception, this is error code.
81#define EFLAGS 144 81 * On hw interrupt, it's IRQ number:
82#define RSP 152 82 */
83#define SS 160 83#define ORIG_RAX 15*8
84 84/* Return frame for iretq */
85#define ARGOFFSET R11 85#define RIP 16*8
86 86#define CS 17*8
87 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 87#define EFLAGS 18*8
88 subq $9*8+\addskip, %rsp 88#define RSP 19*8
89 CFI_ADJUST_CFA_OFFSET 9*8+\addskip 89#define SS 20*8
90 movq_cfi rdi, 8*8 90
91 movq_cfi rsi, 7*8 91#define SIZEOF_PTREGS 21*8
92 movq_cfi rdx, 6*8 92
93 93 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
94 .if \save_rcx 94 subq $15*8+\addskip, %rsp
95 movq_cfi rcx, 5*8 95 CFI_ADJUST_CFA_OFFSET 15*8+\addskip
96 .endif 96 .endm
97 97
98 .if \rax_enosys 98 .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
99 movq $-ENOSYS, 4*8(%rsp) 99 .if \r11
100 .else 100 movq_cfi r11, 6*8+\offset
101 movq_cfi rax, 4*8
102 .endif 101 .endif
103 102 .if \r8910
104 .if \save_r891011 103 movq_cfi r10, 7*8+\offset
105 movq_cfi r8, 3*8 104 movq_cfi r9, 8*8+\offset
106 movq_cfi r9, 2*8 105 movq_cfi r8, 9*8+\offset
107 movq_cfi r10, 1*8 106 .endif
108 movq_cfi r11, 0*8 107 .if \rax
108 movq_cfi rax, 10*8+\offset
109 .endif
110 .if \rcx
111 movq_cfi rcx, 11*8+\offset
109 .endif 112 .endif
113 movq_cfi rdx, 12*8+\offset
114 movq_cfi rsi, 13*8+\offset
115 movq_cfi rdi, 14*8+\offset
116 .endm
117 .macro SAVE_C_REGS offset=0
118 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
119 .endm
120 .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
121 SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
122 .endm
123 .macro SAVE_C_REGS_EXCEPT_R891011
124 SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
125 .endm
126 .macro SAVE_C_REGS_EXCEPT_RCX_R891011
127 SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
128 .endm
129 .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
130 SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
131 .endm
132
133 .macro SAVE_EXTRA_REGS offset=0
134 movq_cfi r15, 0*8+\offset
135 movq_cfi r14, 1*8+\offset
136 movq_cfi r13, 2*8+\offset
137 movq_cfi r12, 3*8+\offset
138 movq_cfi rbp, 4*8+\offset
139 movq_cfi rbx, 5*8+\offset
140 .endm
141 .macro SAVE_EXTRA_REGS_RBP offset=0
142 movq_cfi rbp, 4*8+\offset
143 .endm
110 144
145 .macro RESTORE_EXTRA_REGS offset=0
146 movq_cfi_restore 0*8+\offset, r15
147 movq_cfi_restore 1*8+\offset, r14
148 movq_cfi_restore 2*8+\offset, r13
149 movq_cfi_restore 3*8+\offset, r12
150 movq_cfi_restore 4*8+\offset, rbp
151 movq_cfi_restore 5*8+\offset, rbx
111 .endm 152 .endm
112 153
113#define ARG_SKIP (9*8) 154 .macro ZERO_EXTRA_REGS
155 xorl %r15d, %r15d
156 xorl %r14d, %r14d
157 xorl %r13d, %r13d
158 xorl %r12d, %r12d
159 xorl %ebp, %ebp
160 xorl %ebx, %ebx
161 .endm
114 162
115 .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ 163 .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
116 rstor_r8910=1, rstor_rdx=1
117 .if \rstor_r11 164 .if \rstor_r11
118 movq_cfi_restore 0*8, r11 165 movq_cfi_restore 6*8, r11
119 .endif 166 .endif
120
121 .if \rstor_r8910 167 .if \rstor_r8910
122 movq_cfi_restore 1*8, r10 168 movq_cfi_restore 7*8, r10
123 movq_cfi_restore 2*8, r9 169 movq_cfi_restore 8*8, r9
124 movq_cfi_restore 3*8, r8 170 movq_cfi_restore 9*8, r8
125 .endif 171 .endif
126
127 .if \rstor_rax 172 .if \rstor_rax
128 movq_cfi_restore 4*8, rax 173 movq_cfi_restore 10*8, rax
129 .endif 174 .endif
130
131 .if \rstor_rcx 175 .if \rstor_rcx
132 movq_cfi_restore 5*8, rcx 176 movq_cfi_restore 11*8, rcx
133 .endif 177 .endif
134
135 .if \rstor_rdx 178 .if \rstor_rdx
136 movq_cfi_restore 6*8, rdx 179 movq_cfi_restore 12*8, rdx
137 .endif
138
139 movq_cfi_restore 7*8, rsi
140 movq_cfi_restore 8*8, rdi
141
142 .if ARG_SKIP+\addskip > 0
143 addq $ARG_SKIP+\addskip, %rsp
144 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
145 .endif 180 .endif
181 movq_cfi_restore 13*8, rsi
182 movq_cfi_restore 14*8, rdi
146 .endm 183 .endm
147 184 .macro RESTORE_C_REGS
148 .macro LOAD_ARGS offset, skiprax=0 185 RESTORE_C_REGS_HELPER 1,1,1,1,1
149 movq \offset(%rsp), %r11
150 movq \offset+8(%rsp), %r10
151 movq \offset+16(%rsp), %r9
152 movq \offset+24(%rsp), %r8
153 movq \offset+40(%rsp), %rcx
154 movq \offset+48(%rsp), %rdx
155 movq \offset+56(%rsp), %rsi
156 movq \offset+64(%rsp), %rdi
157 .if \skiprax
158 .else
159 movq \offset+72(%rsp), %rax
160 .endif
161 .endm 186 .endm
162 187 .macro RESTORE_C_REGS_EXCEPT_RAX
163#define REST_SKIP (6*8) 188 RESTORE_C_REGS_HELPER 0,1,1,1,1
164
165 .macro SAVE_REST
166 subq $REST_SKIP, %rsp
167 CFI_ADJUST_CFA_OFFSET REST_SKIP
168 movq_cfi rbx, 5*8
169 movq_cfi rbp, 4*8
170 movq_cfi r12, 3*8
171 movq_cfi r13, 2*8
172 movq_cfi r14, 1*8
173 movq_cfi r15, 0*8
174 .endm 189 .endm
175 190 .macro RESTORE_C_REGS_EXCEPT_RCX
176 .macro RESTORE_REST 191 RESTORE_C_REGS_HELPER 1,0,1,1,1
177 movq_cfi_restore 0*8, r15
178 movq_cfi_restore 1*8, r14
179 movq_cfi_restore 2*8, r13
180 movq_cfi_restore 3*8, r12
181 movq_cfi_restore 4*8, rbp
182 movq_cfi_restore 5*8, rbx
183 addq $REST_SKIP, %rsp
184 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
185 .endm 192 .endm
186 193 .macro RESTORE_C_REGS_EXCEPT_R11
187 .macro SAVE_ALL 194 RESTORE_C_REGS_HELPER 1,1,0,1,1
188 SAVE_ARGS 195 .endm
189 SAVE_REST 196 .macro RESTORE_C_REGS_EXCEPT_RCX_R11
197 RESTORE_C_REGS_HELPER 1,0,0,1,1
198 .endm
199 .macro RESTORE_RSI_RDI
200 RESTORE_C_REGS_HELPER 0,0,0,0,0
201 .endm
202 .macro RESTORE_RSI_RDI_RDX
203 RESTORE_C_REGS_HELPER 0,0,0,0,1
190 .endm 204 .endm
191 205
192 .macro RESTORE_ALL addskip=0 206 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
193 RESTORE_REST 207 addq $15*8+\addskip, %rsp
194 RESTORE_ARGS 1, \addskip 208 CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
195 .endm 209 .endm
196 210
197 .macro icebp 211 .macro icebp
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with
210 */ 224 */
211 225
212 .macro SAVE_ALL 226 .macro SAVE_ALL
213 pushl_cfi %eax 227 pushl_cfi_reg eax
214 CFI_REL_OFFSET eax, 0 228 pushl_cfi_reg ebp
215 pushl_cfi %ebp 229 pushl_cfi_reg edi
216 CFI_REL_OFFSET ebp, 0 230 pushl_cfi_reg esi
217 pushl_cfi %edi 231 pushl_cfi_reg edx
218 CFI_REL_OFFSET edi, 0 232 pushl_cfi_reg ecx
219 pushl_cfi %esi 233 pushl_cfi_reg ebx
220 CFI_REL_OFFSET esi, 0
221 pushl_cfi %edx
222 CFI_REL_OFFSET edx, 0
223 pushl_cfi %ecx
224 CFI_REL_OFFSET ecx, 0
225 pushl_cfi %ebx
226 CFI_REL_OFFSET ebx, 0
227 .endm 234 .endm
228 235
229 .macro RESTORE_ALL 236 .macro RESTORE_ALL
230 popl_cfi %ebx 237 popl_cfi_reg ebx
231 CFI_RESTORE ebx 238 popl_cfi_reg ecx
232 popl_cfi %ecx 239 popl_cfi_reg edx
233 CFI_RESTORE ecx 240 popl_cfi_reg esi
234 popl_cfi %edx 241 popl_cfi_reg edi
235 CFI_RESTORE edx 242 popl_cfi_reg ebp
236 popl_cfi %esi 243 popl_cfi_reg eax
237 CFI_RESTORE esi
238 popl_cfi %edi
239 CFI_RESTORE edi
240 popl_cfi %ebp
241 CFI_RESTORE ebp
242 popl_cfi %eax
243 CFI_RESTORE eax
244 .endm 244 .endm
245 245
246#endif /* CONFIG_X86_64 */ 246#endif /* CONFIG_X86_64 */
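
Note on the calling.h hunk: the new byte offsets (R15 = 0*8 ... SS = 20*8, SIZEOF_PTREGS = 21*8) are meant to mirror struct pt_regs field for field; the matching layout comment is added to ptrace.h further down. A hedged sketch that checks the correspondence with offsetof() against a local mirror of the 64-bit layout (assumes an LP64 build; the struct below is a mock-up, not the kernel's definition):

	#include <stddef.h>
	#include <stdio.h>

	struct pt_regs_mirror {
		/* callee-preserved, only saved for a full pt_regs */
		unsigned long r15, r14, r13, r12, bp, bx;
		/* callee-clobbered, always saved on kernel entry */
		unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;
		/* syscall# / error code / IRQ number */
		unsigned long orig_ax;
		/* return frame for iretq */
		unsigned long ip, cs, flags, sp, ss;
	};

	int main(void)
	{
		_Static_assert(sizeof(unsigned long) == 8, "LP64 build assumed");
		_Static_assert(offsetof(struct pt_regs_mirror, r15)     ==  0 * 8, "R15");
		_Static_assert(offsetof(struct pt_regs_mirror, bx)      ==  5 * 8, "RBX");
		_Static_assert(offsetof(struct pt_regs_mirror, r11)     ==  6 * 8, "R11");
		_Static_assert(offsetof(struct pt_regs_mirror, di)      == 14 * 8, "RDI");
		_Static_assert(offsetof(struct pt_regs_mirror, orig_ax) == 15 * 8, "ORIG_RAX");
		_Static_assert(offsetof(struct pt_regs_mirror, ip)      == 16 * 8, "RIP");
		_Static_assert(offsetof(struct pt_regs_mirror, ss)      == 20 * 8, "SS");
		_Static_assert(sizeof(struct pt_regs_mirror)            == 21 * 8, "SIZEOF_PTREGS");

		puts("pt_regs offsets match the calling.h constants");
		return 0;
	}
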
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..acdee09228b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
301 sp = task_pt_regs(current)->sp; 301 sp = task_pt_regs(current)->sp;
302 } else { 302 } else {
303 /* -128 for the x32 ABI redzone */ 303 /* -128 for the x32 ABI redzone */
304 sp = this_cpu_read(old_rsp) - 128; 304 sp = task_pt_regs(current)->sp - 128;
305 } 305 }
306 306
307 return (void __user *)round_down(sp - len, 16); 307 return (void __user *)round_down(sp - len, 16);
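
Note on the compat.h hunk: with old_rsp gone, arch_compat_alloc_user_space() takes the user stack pointer from task_pt_regs() in both branches; the rest of the arithmetic (skip the 128-byte x32 redzone, round down to 16 bytes) is unchanged. A hedged sketch of that arithmetic with made-up values:

	#include <stdio.h>

	#define REDZONE_SIZE	128UL

	static unsigned long round_down16(unsigned long x)
	{
		return x & ~15UL;	/* what the kernel's round_down(x, 16) does */
	}

	static unsigned long alloc_user_space(unsigned long user_sp,
					      unsigned long len, int is_x32)
	{
		unsigned long sp = user_sp;

		if (is_x32)
			sp -= REDZONE_SIZE;	/* x32 follows the 64-bit ABI redzone */

		return round_down16(sp - len);
	}

	int main(void)
	{
		printf("%#lx\n", alloc_user_space(0x7fffffffe000UL, 200, 1));
		return 0;
	}
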
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..854c04b3c9c2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -231,7 +231,9 @@
231#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ 231#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
232#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ 232#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
233#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ 233#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
234#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
234#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 235#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
236#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
235#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ 237#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
236#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ 238#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
237#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ 239#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
@@ -418,6 +420,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
418 " .word %P0\n" /* 1: do replace */ 420 " .word %P0\n" /* 1: do replace */
419 " .byte 2b - 1b\n" /* source len */ 421 " .byte 2b - 1b\n" /* source len */
420 " .byte 0\n" /* replacement len */ 422 " .byte 0\n" /* replacement len */
423 " .byte 0\n" /* pad len */
421 ".previous\n" 424 ".previous\n"
422 /* skipping size check since replacement size = 0 */ 425 /* skipping size check since replacement size = 0 */
423 : : "i" (X86_FEATURE_ALWAYS) : : t_warn); 426 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +435,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
432 " .word %P0\n" /* feature bit */ 435 " .word %P0\n" /* feature bit */
433 " .byte 2b - 1b\n" /* source len */ 436 " .byte 2b - 1b\n" /* source len */
434 " .byte 0\n" /* replacement len */ 437 " .byte 0\n" /* replacement len */
438 " .byte 0\n" /* pad len */
435 ".previous\n" 439 ".previous\n"
436 /* skipping size check since replacement size = 0 */ 440 /* skipping size check since replacement size = 0 */
437 : : "i" (bit) : : t_no); 441 : : "i" (bit) : : t_no);
@@ -457,6 +461,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
457 " .word %P1\n" /* feature bit */ 461 " .word %P1\n" /* feature bit */
458 " .byte 2b - 1b\n" /* source len */ 462 " .byte 2b - 1b\n" /* source len */
459 " .byte 4f - 3f\n" /* replacement len */ 463 " .byte 4f - 3f\n" /* replacement len */
464 " .byte 0\n" /* pad len */
460 ".previous\n" 465 ".previous\n"
461 ".section .discard,\"aw\",@progbits\n" 466 ".section .discard,\"aw\",@progbits\n"
462 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 467 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -483,31 +488,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
483static __always_inline __pure bool _static_cpu_has_safe(u16 bit) 488static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
484{ 489{
485#ifdef CC_HAVE_ASM_GOTO 490#ifdef CC_HAVE_ASM_GOTO
486/* 491 asm_volatile_goto("1: jmp %l[t_dynamic]\n"
487 * We need to spell the jumps to the compiler because, depending on the offset,
488 * the replacement jump can be bigger than the original jump, and this we cannot
489 * have. Thus, we force the jump to the widest, 4-byte, signed relative
490 * offset even though the last would often fit in less bytes.
491 */
492 asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
493 "2:\n" 492 "2:\n"
493 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
494 "((5f-4f) - (2b-1b)),0x90\n"
495 "3:\n"
494 ".section .altinstructions,\"a\"\n" 496 ".section .altinstructions,\"a\"\n"
495 " .long 1b - .\n" /* src offset */ 497 " .long 1b - .\n" /* src offset */
496 " .long 3f - .\n" /* repl offset */ 498 " .long 4f - .\n" /* repl offset */
497 " .word %P1\n" /* always replace */ 499 " .word %P1\n" /* always replace */
498 " .byte 2b - 1b\n" /* src len */ 500 " .byte 3b - 1b\n" /* src len */
499 " .byte 4f - 3f\n" /* repl len */ 501 " .byte 5f - 4f\n" /* repl len */
502 " .byte 3b - 2b\n" /* pad len */
500 ".previous\n" 503 ".previous\n"
501 ".section .altinstr_replacement,\"ax\"\n" 504 ".section .altinstr_replacement,\"ax\"\n"
502 "3: .byte 0xe9\n .long %l[t_no] - 2b\n" 505 "4: jmp %l[t_no]\n"
503 "4:\n" 506 "5:\n"
504 ".previous\n" 507 ".previous\n"
505 ".section .altinstructions,\"a\"\n" 508 ".section .altinstructions,\"a\"\n"
506 " .long 1b - .\n" /* src offset */ 509 " .long 1b - .\n" /* src offset */
507 " .long 0\n" /* no replacement */ 510 " .long 0\n" /* no replacement */
508 " .word %P0\n" /* feature bit */ 511 " .word %P0\n" /* feature bit */
509 " .byte 2b - 1b\n" /* src len */ 512 " .byte 3b - 1b\n" /* src len */
510 " .byte 0\n" /* repl len */ 513 " .byte 0\n" /* repl len */
514 " .byte 0\n" /* pad len */
511 ".previous\n" 515 ".previous\n"
512 : : "i" (bit), "i" (X86_FEATURE_ALWAYS) 516 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
513 : : t_dynamic, t_no); 517 : : t_dynamic, t_no);
@@ -527,6 +531,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
527 " .word %P2\n" /* always replace */ 531 " .word %P2\n" /* always replace */
528 " .byte 2b - 1b\n" /* source len */ 532 " .byte 2b - 1b\n" /* source len */
529 " .byte 4f - 3f\n" /* replacement len */ 533 " .byte 4f - 3f\n" /* replacement len */
534 " .byte 0\n" /* pad len */
530 ".previous\n" 535 ".previous\n"
531 ".section .discard,\"aw\",@progbits\n" 536 ".section .discard,\"aw\",@progbits\n"
532 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ 537 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +546,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
541 " .word %P1\n" /* feature bit */ 546 " .word %P1\n" /* feature bit */
542 " .byte 4b - 3b\n" /* src len */ 547 " .byte 4b - 3b\n" /* src len */
543 " .byte 6f - 5f\n" /* repl len */ 548 " .byte 6f - 5f\n" /* repl len */
549 " .byte 0\n" /* pad len */
544 ".previous\n" 550 ".previous\n"
545 ".section .discard,\"aw\",@progbits\n" 551 ".section .discard,\"aw\",@progbits\n"
546 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ 552 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..a0bf89fd2647 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
376 * Pentium F0 0F bugfix can have resulted in the mapped 376 * Pentium F0 0F bugfix can have resulted in the mapped
377 * IDT being write-protected. 377 * IDT being write-protected.
378 */ 378 */
379#define set_intr_gate(n, addr) \ 379#define set_intr_gate_notrace(n, addr) \
380 do { \ 380 do { \
381 BUG_ON((unsigned)n > 0xFF); \ 381 BUG_ON((unsigned)n > 0xFF); \
382 _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ 382 _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \
383 __KERNEL_CS); \ 383 __KERNEL_CS); \
384 } while (0)
385
386#define set_intr_gate(n, addr) \
387 do { \
388 set_intr_gate_notrace(n, addr); \
384 _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 389 _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
385 0, 0, __KERNEL_CS); \ 390 0, 0, __KERNEL_CS); \
386 } while (0) 391 } while (0)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f15986df6c..de1cdaf4d743 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
86 CFI_ADJUST_CFA_OFFSET 8 86 CFI_ADJUST_CFA_OFFSET 8
87 .endm 87 .endm
88 88
89 .macro pushq_cfi_reg reg
90 pushq %\reg
91 CFI_ADJUST_CFA_OFFSET 8
92 CFI_REL_OFFSET \reg, 0
93 .endm
94
89 .macro popq_cfi reg 95 .macro popq_cfi reg
90 popq \reg 96 popq \reg
91 CFI_ADJUST_CFA_OFFSET -8 97 CFI_ADJUST_CFA_OFFSET -8
92 .endm 98 .endm
93 99
100 .macro popq_cfi_reg reg
101 popq %\reg
102 CFI_ADJUST_CFA_OFFSET -8
103 CFI_RESTORE \reg
104 .endm
105
94 .macro pushfq_cfi 106 .macro pushfq_cfi
95 pushfq 107 pushfq
96 CFI_ADJUST_CFA_OFFSET 8 108 CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
116 CFI_ADJUST_CFA_OFFSET 4 128 CFI_ADJUST_CFA_OFFSET 4
117 .endm 129 .endm
118 130
131 .macro pushl_cfi_reg reg
132 pushl %\reg
133 CFI_ADJUST_CFA_OFFSET 4
134 CFI_REL_OFFSET \reg, 0
135 .endm
136
119 .macro popl_cfi reg 137 .macro popl_cfi reg
120 popl \reg 138 popl \reg
121 CFI_ADJUST_CFA_OFFSET -4 139 CFI_ADJUST_CFA_OFFSET -4
122 .endm 140 .endm
123 141
142 .macro popl_cfi_reg reg
143 popl %\reg
144 CFI_ADJUST_CFA_OFFSET -4
145 CFI_RESTORE \reg
146 .endm
147
124 .macro pushfl_cfi 148 .macro pushfl_cfi
125 pushfl 149 pushfl
126 CFI_ADJUST_CFA_OFFSET 4 150 CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..3563107b5060 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -171,10 +171,11 @@ do { \
171static inline void elf_common_init(struct thread_struct *t, 171static inline void elf_common_init(struct thread_struct *t,
172 struct pt_regs *regs, const u16 ds) 172 struct pt_regs *regs, const u16 ds)
173{ 173{
174 regs->ax = regs->bx = regs->cx = regs->dx = 0; 174 /* Commented-out registers are cleared in stub_execve */
175 regs->si = regs->di = regs->bp = 0; 175 /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
176 regs->si = regs->di /*= regs->bp*/ = 0;
176 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; 177 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
177 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 178 /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
178 t->fs = t->gs = 0; 179 t->fs = t->gs = 0;
179 t->fsindex = t->gsindex = 0; 180 t->fsindex = t->gsindex = 0;
180 t->ds = t->es = ds; 181 t->ds = t->es = ds;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 9662290e0b20..e9571ddabc4f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
181extern __visible void smp_invalidate_interrupt(struct pt_regs *); 181extern __visible void smp_invalidate_interrupt(struct pt_regs *);
182#endif 182#endif
183 183
184extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR 184extern char irq_entries_start[];
185 - FIRST_EXTERNAL_VECTOR])(void);
186#ifdef CONFIG_TRACING 185#ifdef CONFIG_TRACING
187#define trace_interrupt interrupt 186#define trace_irq_entries_start irq_entries_start
188#endif 187#endif
189 188
190#define VECTOR_UNDEFINED (-1) 189#define VECTOR_UNDEFINED (-1)
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1d1846..e7814b74caf8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@ struct insn {
69 const insn_byte_t *next_byte; 69 const insn_byte_t *next_byte;
70}; 70};
71 71
72#define MAX_INSN_SIZE 16 72#define MAX_INSN_SIZE 15
73 73
74#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) 74#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
75#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) 75#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519226b8..b77f5edb03b0 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
136#define USERGS_SYSRET32 \ 136#define USERGS_SYSRET32 \
137 swapgs; \ 137 swapgs; \
138 sysretl 138 sysretl
139#define ENABLE_INTERRUPTS_SYSEXIT32 \
140 swapgs; \
141 sti; \
142 sysexit
143 139
144#else 140#else
145#define INTERRUPT_RETURN iret 141#define INTERRUPT_RETURN iret
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void)
163 159
164 return arch_irqs_disabled_flags(flags); 160 return arch_irqs_disabled_flags(flags);
165} 161}
162#endif /* !__ASSEMBLY__ */
166 163
164#ifdef __ASSEMBLY__
165#ifdef CONFIG_TRACE_IRQFLAGS
166# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
167# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
167#else 168#else
168 169# define TRACE_IRQS_ON
169#ifdef CONFIG_X86_64 170# define TRACE_IRQS_OFF
170#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk 171#endif
171#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ 172#ifdef CONFIG_DEBUG_LOCK_ALLOC
173# ifdef CONFIG_X86_64
174# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
175# define LOCKDEP_SYS_EXIT_IRQ \
172 TRACE_IRQS_ON; \ 176 TRACE_IRQS_ON; \
173 sti; \ 177 sti; \
174 SAVE_REST; \ 178 call lockdep_sys_exit_thunk; \
175 LOCKDEP_SYS_EXIT; \
176 RESTORE_REST; \
177 cli; \ 179 cli; \
178 TRACE_IRQS_OFF; 180 TRACE_IRQS_OFF;
179 181# else
180#else 182# define LOCKDEP_SYS_EXIT \
181#define ARCH_LOCKDEP_SYS_EXIT \
182 pushl %eax; \ 183 pushl %eax; \
183 pushl %ecx; \ 184 pushl %ecx; \
184 pushl %edx; \ 185 pushl %edx; \
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void)
186 popl %edx; \ 187 popl %edx; \
187 popl %ecx; \ 188 popl %ecx; \
188 popl %eax; 189 popl %eax;
189 190# define LOCKDEP_SYS_EXIT_IRQ
190#define ARCH_LOCKDEP_SYS_EXIT_IRQ 191# endif
191#endif
192
193#ifdef CONFIG_TRACE_IRQFLAGS
194# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
195# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
196#else 192#else
197# define TRACE_IRQS_ON
198# define TRACE_IRQS_OFF
199#endif
200#ifdef CONFIG_DEBUG_LOCK_ALLOC
201# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
202# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
203# else
204# define LOCKDEP_SYS_EXIT 193# define LOCKDEP_SYS_EXIT
205# define LOCKDEP_SYS_EXIT_IRQ 194# define LOCKDEP_SYS_EXIT_IRQ
206# endif 195#endif
207
208#endif /* __ASSEMBLY__ */ 196#endif /* __ASSEMBLY__ */
197
209#endif 198#endif
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 965c47d254aa..5f6051d5d139 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -976,11 +976,6 @@ extern void default_banner(void);
976 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ 976 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
977 CLBR_NONE, \ 977 CLBR_NONE, \
978 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) 978 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
979
980#define ENABLE_INTERRUPTS_SYSEXIT32 \
981 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
982 CLBR_NONE, \
983 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
984#endif /* CONFIG_X86_32 */ 979#endif /* CONFIG_X86_32 */
985 980
986#endif /* __ASSEMBLY__ */ 981#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..d2203b5d9538 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -210,8 +210,23 @@ struct x86_hw_tss {
210 unsigned long sp0; 210 unsigned long sp0;
211 unsigned short ss0, __ss0h; 211 unsigned short ss0, __ss0h;
212 unsigned long sp1; 212 unsigned long sp1;
213 /* ss1 caches MSR_IA32_SYSENTER_CS: */ 213
214 unsigned short ss1, __ss1h; 214 /*
215 * We don't use ring 1, so ss1 is a convenient scratch space in
216 * the same cacheline as sp0. We use ss1 to cache the value in
217 * MSR_IA32_SYSENTER_CS. When we context switch
218 * MSR_IA32_SYSENTER_CS, we first check if the new value being
219 * written matches ss1, and, if it's not, then we wrmsr the new
220 * value and update ss1.
221 *
222 * The only reason we context switch MSR_IA32_SYSENTER_CS is
223 * that we set it to zero in vm86 tasks to avoid corrupting the
224 * stack if we were to go through the sysenter path from vm86
225 * mode.
226 */
227 unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
228
229 unsigned short __ss1h;
215 unsigned long sp2; 230 unsigned long sp2;
216 unsigned short ss2, __ss2h; 231 unsigned short ss2, __ss2h;
217 unsigned long __cr3; 232 unsigned long __cr3;
@@ -276,13 +291,17 @@ struct tss_struct {
276 unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; 291 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
277 292
278 /* 293 /*
279 * .. and then another 0x100 bytes for the emergency kernel stack: 294 * Space for the temporary SYSENTER stack:
280 */ 295 */
281 unsigned long stack[64]; 296 unsigned long SYSENTER_stack[64];
282 297
283} ____cacheline_aligned; 298} ____cacheline_aligned;
284 299
285DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); 300DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
301
302#ifdef CONFIG_X86_32
303DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
304#endif
286 305
287/* 306/*
288 * Save the original ist values for checking stack pointers during debugging 307 * Save the original ist values for checking stack pointers during debugging
@@ -474,7 +493,6 @@ struct thread_struct {
474#ifdef CONFIG_X86_32 493#ifdef CONFIG_X86_32
475 unsigned long sysenter_cs; 494 unsigned long sysenter_cs;
476#else 495#else
477 unsigned long usersp; /* Copy from PDA */
478 unsigned short es; 496 unsigned short es;
479 unsigned short ds; 497 unsigned short ds;
480 unsigned short fsindex; 498 unsigned short fsindex;
@@ -564,6 +582,16 @@ static inline void native_swapgs(void)
564#endif 582#endif
565} 583}
566 584
585static inline unsigned long current_top_of_stack(void)
586{
587#ifdef CONFIG_X86_64
588 return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
589#else
590 /* sp0 on x86_32 is special in and around vm86 mode. */
591 return this_cpu_read_stable(cpu_current_top_of_stack);
592#endif
593}
594
567#ifdef CONFIG_PARAVIRT 595#ifdef CONFIG_PARAVIRT
568#include <asm/paravirt.h> 596#include <asm/paravirt.h>
569#else 597#else
@@ -761,10 +789,10 @@ extern char ignore_fpu_irq;
761#define ARCH_HAS_SPINLOCK_PREFETCH 789#define ARCH_HAS_SPINLOCK_PREFETCH
762 790
763#ifdef CONFIG_X86_32 791#ifdef CONFIG_X86_32
764# define BASE_PREFETCH ASM_NOP4 792# define BASE_PREFETCH ""
765# define ARCH_HAS_PREFETCH 793# define ARCH_HAS_PREFETCH
766#else 794#else
767# define BASE_PREFETCH "prefetcht0 (%1)" 795# define BASE_PREFETCH "prefetcht0 %P1"
768#endif 796#endif
769 797
770/* 798/*
@@ -775,10 +803,9 @@ extern char ignore_fpu_irq;
775 */ 803 */
776static inline void prefetch(const void *x) 804static inline void prefetch(const void *x)
777{ 805{
778 alternative_input(BASE_PREFETCH, 806 alternative_input(BASE_PREFETCH, "prefetchnta %P1",
779 "prefetchnta (%1)",
780 X86_FEATURE_XMM, 807 X86_FEATURE_XMM,
781 "r" (x)); 808 "m" (*(const char *)x));
782} 809}
783 810
784/* 811/*
@@ -788,10 +815,9 @@ static inline void prefetch(const void *x)
788 */ 815 */
789static inline void prefetchw(const void *x) 816static inline void prefetchw(const void *x)
790{ 817{
791 alternative_input(BASE_PREFETCH, 818 alternative_input(BASE_PREFETCH, "prefetchw %P1",
792 "prefetchw (%1)", 819 X86_FEATURE_3DNOWPREFETCH,
793 X86_FEATURE_3DNOW, 820 "m" (*(const char *)x));
794 "r" (x));
795} 821}
796 822
797static inline void spin_lock_prefetch(const void *x) 823static inline void spin_lock_prefetch(const void *x)
@@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x)
799 prefetchw(x); 825 prefetchw(x);
800} 826}
801 827
828#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
829 TOP_OF_KERNEL_STACK_PADDING)
830
802#ifdef CONFIG_X86_32 831#ifdef CONFIG_X86_32
803/* 832/*
804 * User space process size: 3GB (default). 833 * User space process size: 3GB (default).
@@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x)
809#define STACK_TOP_MAX STACK_TOP 838#define STACK_TOP_MAX STACK_TOP
810 839
811#define INIT_THREAD { \ 840#define INIT_THREAD { \
812 .sp0 = sizeof(init_stack) + (long)&init_stack, \ 841 .sp0 = TOP_OF_INIT_STACK, \
813 .vm86_info = NULL, \ 842 .vm86_info = NULL, \
814 .sysenter_cs = __KERNEL_CS, \ 843 .sysenter_cs = __KERNEL_CS, \
815 .io_bitmap_ptr = NULL, \ 844 .io_bitmap_ptr = NULL, \
816} 845}
817 846
818/*
819 * Note that the .io_bitmap member must be extra-big. This is because
820 * the CPU will access an additional byte beyond the end of the IO
821 * permission bitmap. The extra byte must be all 1 bits, and must
822 * be within the limit.
823 */
824#define INIT_TSS { \
825 .x86_tss = { \
826 .sp0 = sizeof(init_stack) + (long)&init_stack, \
827 .ss0 = __KERNEL_DS, \
828 .ss1 = __KERNEL_CS, \
829 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
830 }, \
831 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
832}
833
834extern unsigned long thread_saved_pc(struct task_struct *tsk); 847extern unsigned long thread_saved_pc(struct task_struct *tsk);
835 848
836#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
837#define KSTK_TOP(info) \
838({ \
839 unsigned long *__ptr = (unsigned long *)(info); \
840 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
841})
842
843/* 849/*
844 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 850 * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
845 * This is necessary to guarantee that the entire "struct pt_regs" 851 * This is necessary to guarantee that the entire "struct pt_regs"
846 * is accessible even if the CPU haven't stored the SS/ESP registers 852 * is accessible even if the CPU haven't stored the SS/ESP registers
847 * on the stack (interrupt gate does not save these registers 853 * on the stack (interrupt gate does not save these registers
@@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
850 * "struct pt_regs" is possible, but they may contain the 856 * "struct pt_regs" is possible, but they may contain the
851 * completely wrong values. 857 * completely wrong values.
852 */ 858 */
853#define task_pt_regs(task) \ 859#define task_pt_regs(task) \
854({ \ 860({ \
855 struct pt_regs *__regs__; \ 861 unsigned long __ptr = (unsigned long)task_stack_page(task); \
856 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ 862 __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
857 __regs__ - 1; \ 863 ((struct pt_regs *)__ptr) - 1; \
858}) 864})
859 865
860#define KSTK_ESP(task) (task_pt_regs(task)->sp) 866#define KSTK_ESP(task) (task_pt_regs(task)->sp)
@@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
886#define STACK_TOP_MAX TASK_SIZE_MAX 892#define STACK_TOP_MAX TASK_SIZE_MAX
887 893
888#define INIT_THREAD { \ 894#define INIT_THREAD { \
889 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ 895 .sp0 = TOP_OF_INIT_STACK \
890}
891
892#define INIT_TSS { \
893 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
894} 896}
895 897
896/* 898/*
@@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
902#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) 904#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
903extern unsigned long KSTK_ESP(struct task_struct *task); 905extern unsigned long KSTK_ESP(struct task_struct *task);
904 906
905/*
906 * User space RSP while inside the SYSCALL fast path
907 */
908DECLARE_PER_CPU(unsigned long, old_rsp);
909
910#endif /* CONFIG_X86_64 */ 907#endif /* CONFIG_X86_64 */
911 908
912extern void start_thread(struct pt_regs *regs, unsigned long new_ip, 909extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
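
Note on the processor.h hunk: on 32-bit, task_pt_regs() is now expressed through TOP_OF_KERNEL_STACK_PADDING instead of the old KSTK_TOP()-8 form. A hedged sketch of the pointer arithmetic; THREAD_SIZE, the padding and the pt_regs size below are stand-in values for illustration only:

	#include <stdio.h>
	#include <stdint.h>

	#define THREAD_SIZE			8192UL	/* stand-in value */
	#define TOP_OF_KERNEL_STACK_PADDING	8UL	/* stand-in, see thread_info.h */
	#define SIZEOF_PTREGS_32		(17 * 4) /* illustrative only */

	static uintptr_t task_pt_regs_addr(uintptr_t stack_page)
	{
		uintptr_t ptr = stack_page;

		/* top of the stack, minus the reserved padding ... */
		ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
		/* ... minus one pt_regs: ((struct pt_regs *)ptr) - 1 */
		return ptr - SIZEOF_PTREGS_32;
	}

	int main(void)
	{
		uintptr_t stack = 0xc1000000UL;	/* made-up stack page address */

		printf("pt_regs at %#lx (stack top %#lx)\n",
		       (unsigned long)task_pt_regs_addr(stack),
		       (unsigned long)(stack + THREAD_SIZE));
		return 0;
	}
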
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb82287..19507ffa5d28 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
31#else /* __i386__ */ 31#else /* __i386__ */
32 32
33struct pt_regs { 33struct pt_regs {
34/*
35 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
36 * unless syscall needs a complete, fully filled "struct pt_regs".
37 */
34 unsigned long r15; 38 unsigned long r15;
35 unsigned long r14; 39 unsigned long r14;
36 unsigned long r13; 40 unsigned long r13;
37 unsigned long r12; 41 unsigned long r12;
38 unsigned long bp; 42 unsigned long bp;
39 unsigned long bx; 43 unsigned long bx;
40/* arguments: non interrupts/non tracing syscalls only save up to here*/ 44/* These regs are callee-clobbered. Always saved on kernel entry. */
41 unsigned long r11; 45 unsigned long r11;
42 unsigned long r10; 46 unsigned long r10;
43 unsigned long r9; 47 unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
47 unsigned long dx; 51 unsigned long dx;
48 unsigned long si; 52 unsigned long si;
49 unsigned long di; 53 unsigned long di;
54/*
55 * On syscall entry, this is syscall#. On CPU exception, this is error code.
56 * On hw interrupt, it's IRQ number:
57 */
50 unsigned long orig_ax; 58 unsigned long orig_ax;
51/* end of arguments */ 59/* Return frame for iretq */
52/* cpu exception frame or undefined */
53 unsigned long ip; 60 unsigned long ip;
54 unsigned long cs; 61 unsigned long cs;
55 unsigned long flags; 62 unsigned long flags;
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
89} 96}
90 97
91/* 98/*
92 * user_mode_vm(regs) determines whether a register set came from user mode. 99 * user_mode(regs) determines whether a register set came from user
93 * This is true if V8086 mode was enabled OR if the register set was from 100 * mode. On x86_32, this is true if V8086 mode was enabled OR if the
94 * protected mode with RPL-3 CS value. This tricky test checks that with 101 * register set was from protected mode with RPL-3 CS value. This
95 * one comparison. Many places in the kernel can bypass this full check 102 * tricky test checks that with one comparison.
96 * if they have already ruled out V8086 mode, so user_mode(regs) can be used. 103 *
104 * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
105 * the extra check.
97 */ 106 */
98static inline int user_mode(struct pt_regs *regs) 107static inline int user_mode(struct pt_regs *regs)
99{ 108{
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
104#endif 113#endif
105} 114}
106 115
107static inline int user_mode_vm(struct pt_regs *regs)
108{
109#ifdef CONFIG_X86_32
110 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
111 USER_RPL;
112#else
113 return user_mode(regs);
114#endif
115}
116
117static inline int v8086_mode(struct pt_regs *regs) 116static inline int v8086_mode(struct pt_regs *regs)
118{ 117{
119#ifdef CONFIG_X86_32 118#ifdef CONFIG_X86_32
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
138#endif 137#endif
139} 138}
140 139
141#define current_user_stack_pointer() this_cpu_read(old_rsp) 140#define current_user_stack_pointer() current_pt_regs()->sp
142/* ia32 vs. x32 difference */ 141#define compat_user_stack_pointer() current_pt_regs()->sp
143#define compat_user_stack_pointer() \
144 (test_thread_flag(TIF_IA32) \
145 ? current_pt_regs()->sp \
146 : this_cpu_read(old_rsp))
147#endif 142#endif
148 143
149#ifdef CONFIG_X86_32 144#ifdef CONFIG_X86_32
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
248 */ 243 */
249#define arch_ptrace_stop_needed(code, info) \ 244#define arch_ptrace_stop_needed(code, info) \
250({ \ 245({ \
251 set_thread_flag(TIF_NOTIFY_RESUME); \ 246 force_iret(); \
252 false; \ 247 false; \
253}) 248})
254 249
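
Note on the ptrace.h hunk: user_mode_vm() is folded into user_mode(), and the surviving 32-bit test is the combined RPL/VM-flag comparison described in the updated comment. A hedged sketch of why one comparison is enough; the struct is a local mock-up, and the constants are taken from the x86 headers (X86_VM_MASK is the EFLAGS VM bit, bit 17):

	#include <stdio.h>

	#define SEGMENT_RPL_MASK	0x3
	#define USER_RPL		0x3
	#define X86_VM_MASK		0x00020000	/* EFLAGS.VM */

	struct regs { unsigned long cs, flags; };

	static int user_mode_32(const struct regs *regs)
	{
		/* RPL 3 or VM flag set both push the value to >= USER_RPL */
		return ((regs->cs & SEGMENT_RPL_MASK) |
			(regs->flags & X86_VM_MASK)) >= USER_RPL;
	}

	int main(void)
	{
		struct regs kernel = { .cs = 0x60,   .flags = 0x200 }; /* RPL 0 */
		struct regs user   = { .cs = 0x73,   .flags = 0x200 }; /* RPL 3 */
		struct regs vm86   = { .cs = 0x1000, .flags = 0x200 | X86_VM_MASK };

		printf("kernel: %d  user: %d  vm86: %d\n",
		       user_mode_32(&kernel), user_mode_32(&user),
		       user_mode_32(&vm86));
		return 0;
	}
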
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index db257a58571f..5a9856eb12ba 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -3,8 +3,10 @@
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5 5
6/* Constructor for a conventional segment GDT (or LDT) entry */ 6/*
7/* This is a macro so it can be used in initializers */ 7 * Constructor for a conventional segment GDT (or LDT) entry.
8 * This is a macro so it can be used in initializers.
9 */
8#define GDT_ENTRY(flags, base, limit) \ 10#define GDT_ENTRY(flags, base, limit) \
9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ 11 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ 12 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
@@ -12,198 +14,228 @@
12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \ 14 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
13 (((limit) & _AC(0x0000ffff,ULL)))) 15 (((limit) & _AC(0x0000ffff,ULL))))
14 16
15/* Simple and small GDT entries for booting only */ 17/* Simple and small GDT entries for booting only: */
16 18
17#define GDT_ENTRY_BOOT_CS 2 19#define GDT_ENTRY_BOOT_CS 2
18#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) 20#define GDT_ENTRY_BOOT_DS 3
21#define GDT_ENTRY_BOOT_TSS 4
22#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8)
23#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8)
24#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8)
25
26/*
27 * Bottom two bits of selector give the ring
28 * privilege level
29 */
30#define SEGMENT_RPL_MASK 0x3
19 31
20#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) 32/* User mode is privilege level 3: */
21#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) 33#define USER_RPL 0x3
22 34
23#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) 35/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
24#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) 36#define SEGMENT_TI_MASK 0x4
37/* LDT segment has TI set ... */
38#define SEGMENT_LDT 0x4
39/* ... GDT has it cleared */
40#define SEGMENT_GDT 0x0
25 41
26#define SEGMENT_RPL_MASK 0x3 /* 42#define GDT_ENTRY_INVALID_SEG 0
27 * Bottom two bits of selector give the ring
28 * privilege level
29 */
30#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */
31#define USER_RPL 0x3 /* User mode is privilege level 3 */
32#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */
33#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */
34 43
35#ifdef CONFIG_X86_32 44#ifdef CONFIG_X86_32
36/* 45/*
37 * The layout of the per-CPU GDT under Linux: 46 * The layout of the per-CPU GDT under Linux:
38 * 47 *
39 * 0 - null 48 * 0 - null <=== cacheline #1
40 * 1 - reserved 49 * 1 - reserved
41 * 2 - reserved 50 * 2 - reserved
42 * 3 - reserved 51 * 3 - reserved
43 * 52 *
44 * 4 - unused <==== new cacheline 53 * 4 - unused <=== cacheline #2
45 * 5 - unused 54 * 5 - unused
46 * 55 *
47 * ------- start of TLS (Thread-Local Storage) segments: 56 * ------- start of TLS (Thread-Local Storage) segments:
48 * 57 *
49 * 6 - TLS segment #1 [ glibc's TLS segment ] 58 * 6 - TLS segment #1 [ glibc's TLS segment ]
50 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] 59 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
51 * 8 - TLS segment #3 60 * 8 - TLS segment #3 <=== cacheline #3
52 * 9 - reserved 61 * 9 - reserved
53 * 10 - reserved 62 * 10 - reserved
54 * 11 - reserved 63 * 11 - reserved
55 * 64 *
56 * ------- start of kernel segments: 65 * ------- start of kernel segments:
57 * 66 *
58 * 12 - kernel code segment <==== new cacheline 67 * 12 - kernel code segment <=== cacheline #4
59 * 13 - kernel data segment 68 * 13 - kernel data segment
60 * 14 - default user CS 69 * 14 - default user CS
61 * 15 - default user DS 70 * 15 - default user DS
62 * 16 - TSS 71 * 16 - TSS <=== cacheline #5
63 * 17 - LDT 72 * 17 - LDT
64 * 18 - PNPBIOS support (16->32 gate) 73 * 18 - PNPBIOS support (16->32 gate)
65 * 19 - PNPBIOS support 74 * 19 - PNPBIOS support
66 * 20 - PNPBIOS support 75 * 20 - PNPBIOS support <=== cacheline #6
67 * 21 - PNPBIOS support 76 * 21 - PNPBIOS support
68 * 22 - PNPBIOS support 77 * 22 - PNPBIOS support
69 * 23 - APM BIOS support 78 * 23 - APM BIOS support
70 * 24 - APM BIOS support 79 * 24 - APM BIOS support <=== cacheline #7
71 * 25 - APM BIOS support 80 * 25 - APM BIOS support
72 * 81 *
73 * 26 - ESPFIX small SS 82 * 26 - ESPFIX small SS
74 * 27 - per-cpu [ offset to per-cpu data area ] 83 * 27 - per-cpu [ offset to per-cpu data area ]
75 * 28 - stack_canary-20 [ for stack protector ] 84 * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
76 * 29 - unused 85 * 29 - unused
77 * 30 - unused 86 * 30 - unused
78 * 31 - TSS for double fault handler 87 * 31 - TSS for double fault handler
79 */ 88 */
80#define GDT_ENTRY_TLS_MIN 6 89#define GDT_ENTRY_TLS_MIN 6
81#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) 90#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
82 91
92#define GDT_ENTRY_KERNEL_CS 12
93#define GDT_ENTRY_KERNEL_DS 13
83#define GDT_ENTRY_DEFAULT_USER_CS 14 94#define GDT_ENTRY_DEFAULT_USER_CS 14
84
85#define GDT_ENTRY_DEFAULT_USER_DS 15 95#define GDT_ENTRY_DEFAULT_USER_DS 15
96#define GDT_ENTRY_TSS 16
97#define GDT_ENTRY_LDT 17
98#define GDT_ENTRY_PNPBIOS_CS32 18
99#define GDT_ENTRY_PNPBIOS_CS16 19
100#define GDT_ENTRY_PNPBIOS_DS 20
101#define GDT_ENTRY_PNPBIOS_TS1 21
102#define GDT_ENTRY_PNPBIOS_TS2 22
103#define GDT_ENTRY_APMBIOS_BASE 23
104
105#define GDT_ENTRY_ESPFIX_SS 26
106#define GDT_ENTRY_PERCPU 27
107#define GDT_ENTRY_STACK_CANARY 28
108
109#define GDT_ENTRY_DOUBLEFAULT_TSS 31
86 110
87#define GDT_ENTRY_KERNEL_BASE (12) 111/*
112 * Number of entries in the GDT table:
113 */
114#define GDT_ENTRIES 32
88 115
89#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) 116/*
117 * Segment selector values corresponding to the above entries:
118 */
90 119
91#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) 120#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
121#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
122#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
123#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
124#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
92 125
93#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) 126/* segment for calling fn: */
94#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) 127#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8)
128/* code segment for BIOS: */
129#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8)
95 130
96#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) 131/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
97#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) 132#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32)
98 133
99#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) 134/* data segment for BIOS: */
100#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) 135#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8)
136/* transfer data segment: */
137#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8)
138/* another data segment: */
139#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8)
101 140
102#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
103#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
104#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) 142# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8)
105#else 143#else
106#define __KERNEL_PERCPU 0 144# define __KERNEL_PERCPU 0
107#endif 145#endif
108 146
109#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
110#ifdef CONFIG_CC_STACKPROTECTOR 147#ifdef CONFIG_CC_STACKPROTECTOR
111#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) 148# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
112#else 149#else
113#define __KERNEL_STACK_CANARY 0 150# define __KERNEL_STACK_CANARY 0
114#endif 151#endif
115 152
116#define GDT_ENTRY_DOUBLEFAULT_TSS 31 153#else /* 64-bit: */
117
118/*
119 * The GDT has 32 entries
120 */
121#define GDT_ENTRIES 32
122 154
123/* The PnP BIOS entries in the GDT */ 155#include <asm/cache.h>
124#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
125#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
126#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
127#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
128#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
129
130/* The PnP BIOS selectors */
131#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
132#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
133#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
134#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
135#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
136 156
157#define GDT_ENTRY_KERNEL32_CS 1
158#define GDT_ENTRY_KERNEL_CS 2
159#define GDT_ENTRY_KERNEL_DS 3
137 160
138/* 161/*
139 * Matching rules for certain types of segments. 162 * We cannot use the same code segment descriptor for user and kernel mode,
163 * not even in long flat mode, because of different DPL.
164 *
165 * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
166 * selectors:
167 *
168 * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
169 * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
170 *
171 * ss = STAR.SYSRET_CS+8 (in either case)
172 *
173 * thus USER_DS should be between 32-bit and 64-bit code selectors:
140 */ 174 */
175#define GDT_ENTRY_DEFAULT_USER32_CS 4
176#define GDT_ENTRY_DEFAULT_USER_DS 5
177#define GDT_ENTRY_DEFAULT_USER_CS 6
141 178
142/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ 179/* Needs two entries */
143#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) 180#define GDT_ENTRY_TSS 8
144 181/* Needs two entries */
182#define GDT_ENTRY_LDT 10
145 183
146#else 184#define GDT_ENTRY_TLS_MIN 12
147#include <asm/cache.h> 185#define GDT_ENTRY_TLS_MAX 14
148
149#define GDT_ENTRY_KERNEL32_CS 1
150#define GDT_ENTRY_KERNEL_CS 2
151#define GDT_ENTRY_KERNEL_DS 3
152 186
153#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) 187/* Abused to load per CPU data from limit */
188#define GDT_ENTRY_PER_CPU 15
154 189
155/* 190/*
156 * we cannot use the same code segment descriptor for user and kernel 191 * Number of entries in the GDT table:
157 * -- not even in the long flat mode, because of different DPL /kkeil
158 * The segment offset needs to contain a RPL. Grr. -AK
159 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
160 */ 192 */
161#define GDT_ENTRY_DEFAULT_USER32_CS 4 193#define GDT_ENTRIES 16
162#define GDT_ENTRY_DEFAULT_USER_DS 5
163#define GDT_ENTRY_DEFAULT_USER_CS 6
164#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
165#define __USER32_DS __USER_DS
166
167#define GDT_ENTRY_TSS 8 /* needs two entries */
168#define GDT_ENTRY_LDT 10 /* needs two entries */
169#define GDT_ENTRY_TLS_MIN 12
170#define GDT_ENTRY_TLS_MAX 14
171
172#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
173#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
174 194
175/* TLS indexes for 64bit - hardcoded in arch_prctl */ 195/*
176#define FS_TLS 0 196 * Segment selector values corresponding to the above entries:
177#define GS_TLS 1 197 *
178 198 * Note, selectors also need to have a correct RPL,
179#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) 199 * expressed with the +3 value for user-space selectors:
180#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) 200 */
181 201#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8)
182#define GDT_ENTRIES 16 202#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
203#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
204#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
205#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
206#define __USER32_DS __USER_DS
207#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
208#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
209
210/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
211#define FS_TLS 0
212#define GS_TLS 1
213
214#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
215#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
183 216
184#endif 217#endif
185 218
186#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
187#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
188#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
189#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
190#ifndef CONFIG_PARAVIRT 219#ifndef CONFIG_PARAVIRT
191#define get_kernel_rpl() 0 220# define get_kernel_rpl() 0
192#endif 221#endif
193 222
194#define IDT_ENTRIES 256 223#define IDT_ENTRIES 256
195#define NUM_EXCEPTION_VECTORS 32 224#define NUM_EXCEPTION_VECTORS 32
196/* Bitmask of exception vectors which push an error code on the stack */ 225
197#define EXCEPTION_ERRCODE_MASK 0x00027d00 226/* Bitmask of exception vectors which push an error code on the stack: */
198#define GDT_SIZE (GDT_ENTRIES * 8) 227#define EXCEPTION_ERRCODE_MASK 0x00027d00
199#define GDT_ENTRY_TLS_ENTRIES 3 228
200#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) 229#define GDT_SIZE (GDT_ENTRIES*8)
230#define GDT_ENTRY_TLS_ENTRIES 3
231#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
201 232
202#ifdef __KERNEL__ 233#ifdef __KERNEL__
203#ifndef __ASSEMBLY__ 234#ifndef __ASSEMBLY__
235
204extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; 236extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
205#ifdef CONFIG_TRACING 237#ifdef CONFIG_TRACING
206#define trace_early_idt_handlers early_idt_handlers 238# define trace_early_idt_handlers early_idt_handlers
207#endif 239#endif
208 240
209/* 241/*
@@ -228,37 +260,30 @@ do { \
228} while (0) 260} while (0)
229 261
230/* 262/*
231 * Save a segment register away 263 * Save a segment register away:
232 */ 264 */
233#define savesegment(seg, value) \ 265#define savesegment(seg, value) \
234 asm("mov %%" #seg ",%0":"=r" (value) : : "memory") 266 asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
235 267
236/* 268/*
237 * x86_32 user gs accessors. 269 * x86-32 user GS accessors:
238 */ 270 */
239#ifdef CONFIG_X86_32 271#ifdef CONFIG_X86_32
240#ifdef CONFIG_X86_32_LAZY_GS 272# ifdef CONFIG_X86_32_LAZY_GS
241#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) 273# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
242#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) 274# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
243#define task_user_gs(tsk) ((tsk)->thread.gs) 275# define task_user_gs(tsk) ((tsk)->thread.gs)
244#define lazy_save_gs(v) savesegment(gs, (v)) 276# define lazy_save_gs(v) savesegment(gs, (v))
245#define lazy_load_gs(v) loadsegment(gs, (v)) 277# define lazy_load_gs(v) loadsegment(gs, (v))
246#else /* X86_32_LAZY_GS */ 278# else /* X86_32_LAZY_GS */
247#define get_user_gs(regs) (u16)((regs)->gs) 279# define get_user_gs(regs) (u16)((regs)->gs)
248#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) 280# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
249#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) 281# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
250#define lazy_save_gs(v) do { } while (0) 282# define lazy_save_gs(v) do { } while (0)
251#define lazy_load_gs(v) do { } while (0) 283# define lazy_load_gs(v) do { } while (0)
252#endif /* X86_32_LAZY_GS */ 284# endif /* X86_32_LAZY_GS */
253#endif /* X86_32 */ 285#endif /* X86_32 */
254 286
255static inline unsigned long get_limit(unsigned long segment)
256{
257 unsigned long __limit;
258 asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
259 return __limit + 1;
260}
261
262#endif /* !__ASSEMBLY__ */ 287#endif /* !__ASSEMBLY__ */
263#endif /* __KERNEL__ */ 288#endif /* __KERNEL__ */
264 289
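
Note on the segment.h hunk: the reorganized 64-bit block keeps the SYSRET-mandated spacing spelled out in the new comment — the 32-bit user CS, user DS and 64-bit user CS must occupy consecutive GDT entries so that STAR.SYSRET_CS, +8 and +16 land on the right selectors. A hedged check of that arithmetic using the entry numbers from the hunk above:

	#include <stdio.h>

	#define GDT_ENTRY_DEFAULT_USER32_CS	4
	#define GDT_ENTRY_DEFAULT_USER_DS	5
	#define GDT_ENTRY_DEFAULT_USER_CS	6

	/* selector = GDT entry * 8, with RPL 3 for user-space selectors */
	#define __USER32_CS	(GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
	#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
	#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8 + 3)

	int main(void)
	{
		/* SYSRET to 32-bit: cs = SYSRET_CS,      ss = SYSRET_CS + 8 */
		/* SYSRET to 64-bit: cs = SYSRET_CS + 16, ss = SYSRET_CS + 8 */
		_Static_assert(__USER_DS == __USER32_CS + 8,  "ss spacing");
		_Static_assert(__USER_CS == __USER32_CS + 16, "64-bit cs spacing");

		printf("__USER32_CS=%#x __USER_DS=%#x __USER_CS=%#x\n",
		       __USER32_CS, __USER_DS, __USER_CS);
		return 0;
	}
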
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ff4e7b236e21..f69e06b283fb 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { }
66 */ 66 */
67extern struct boot_params boot_params; 67extern struct boot_params boot_params;
68 68
69static inline bool kaslr_enabled(void)
70{
71 return !!(boot_params.hdr.loadflags & KASLR_FLAG);
72}
73
69/* 74/*
70 * Do NOT EVER look at the BIOS memory size location. 75 * Do NOT EVER look at the BIOS memory size location.
71 * It does not work on many machines. 76 * It does not work on many machines.
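
Note on the setup.h hunk: kaslr_enabled() just tests a flag bit that the boot loader or decompressor leaves in boot_params.hdr.loadflags. A hedged sketch with local stub types; the KASLR_FLAG bit position below is an assumption, and the real definition lives in arch/x86/include/uapi/asm/bootparam.h:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define KASLR_FLAG	(1 << 1)	/* assumed bit position */

	struct setup_header_stub { uint8_t loadflags; };
	struct boot_params_stub  { struct setup_header_stub hdr; };

	static bool kaslr_enabled(const struct boot_params_stub *bp)
	{
		return !!(bp->hdr.loadflags & KASLR_FLAG);
	}

	int main(void)
	{
		struct boot_params_stub bp = { .hdr = { .loadflags = KASLR_FLAG } };

		printf("kaslr: %d\n", kaslr_enabled(&bp));
		return 0;
	}
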
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..6fe6b182c998 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
57 unsigned long ip; 57 unsigned long ip;
58 unsigned long flags; 58 unsigned long flags;
59 unsigned short cs; 59 unsigned short cs;
60 unsigned short gs; 60 unsigned short __pad2; /* Was called gs, but was always zero. */
61 unsigned short fs; 61 unsigned short __pad1; /* Was called fs, but was always zero. */
62 unsigned short __pad0; 62 unsigned short ss;
63 unsigned long err; 63 unsigned long err;
64 unsigned long trapno; 64 unsigned long trapno;
65 unsigned long oldmask; 65 unsigned long oldmask;
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 7a958164088c..89db46752a8f 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,9 +13,7 @@
13 X86_EFLAGS_CF | X86_EFLAGS_RF) 13 X86_EFLAGS_CF | X86_EFLAGS_RF)
14 14
15void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 15void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
16 16int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
17int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
18 unsigned long *pax);
19int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 17int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
20 struct pt_regs *regs, unsigned long mask); 18 struct pt_regs *regs, unsigned long mask);
21 19
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
27 27
28#ifdef CONFIG_X86_SMAP 28#ifdef CONFIG_X86_SMAP
29 29
30#define ASM_CLAC \ 30#define ASM_CLAC \
31 661: ASM_NOP3 ; \ 31 ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
32 .pushsection .altinstr_replacement, "ax" ; \ 32
33 662: __ASM_CLAC ; \ 33#define ASM_STAC \
34 .popsection ; \ 34 ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
35 .pushsection .altinstructions, "a" ; \
36 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
37 .popsection
38
39#define ASM_STAC \
40 661: ASM_NOP3 ; \
41 .pushsection .altinstr_replacement, "ax" ; \
42 662: __ASM_STAC ; \
43 .popsection ; \
44 .pushsection .altinstructions, "a" ; \
45 altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
46 .popsection
47 35
48#else /* CONFIG_X86_SMAP */ 36#else /* CONFIG_X86_SMAP */
49 37
@@ -61,20 +49,20 @@
61static __always_inline void clac(void) 49static __always_inline void clac(void)
62{ 50{
63 /* Note: a barrier is implicit in alternative() */ 51 /* Note: a barrier is implicit in alternative() */
64 alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); 52 alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
65} 53}
66 54
67static __always_inline void stac(void) 55static __always_inline void stac(void)
68{ 56{
69 /* Note: a barrier is implicit in alternative() */ 57 /* Note: a barrier is implicit in alternative() */
70 alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); 58 alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
71} 59}
72 60
73/* These macros can be used in asm() statements */ 61/* These macros can be used in asm() statements */
74#define ASM_CLAC \ 62#define ASM_CLAC \
75 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) 63 ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
76#define ASM_STAC \ 64#define ASM_STAC \
77 ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) 65 ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
78 66
79#else /* CONFIG_X86_SMAP */ 67#else /* CONFIG_X86_SMAP */
80 68
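
The new ASM_CLAC/ASM_STAC definitions rely on the simplified ALTERNATIVE form with an empty original instruction: the patch site is emitted as padding and only gains a real CLAC/STAC when the CPU advertises SMAP. A minimal user-space mock of that decision, assuming the usual 3-byte encodings (0f 1f 00 for a NOP, 0f 01 ca for CLAC); this illustrates the patching idea only and is not the kernel's apply_alternatives():

/*
 * Mock of how an ALTERNATIVE "", CLAC site ends up: the site starts out
 * as padding NOPs and is overwritten with the replacement only when the
 * feature is present. Opcode bytes are the common SDM encodings.
 */
#include <stdio.h>
#include <string.h>

static const unsigned char nop3[3] = { 0x0f, 0x1f, 0x00 };	/* 3-byte NOP (one common encoding) */
static const unsigned char clac[3] = { 0x0f, 0x01, 0xca };	/* CLAC */

static void patch_site(unsigned char *site, int cpu_has_smap)
{
	if (cpu_has_smap)
		memcpy(site, clac, sizeof(clac));	/* use the replacement */
	/* else: leave the padding NOPs, the "" oldinstr case */
}

int main(void)
{
	unsigned char site[3];
	int smap;

	for (smap = 0; smap <= 1; smap++) {
		memcpy(site, nop3, sizeof(site));
		patch_site(site, smap);
		printf("SMAP=%d -> %02x %02x %02x\n", smap, site[0], site[1], site[2]);
	}
	return 0;
}
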
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..81d02fc7dafa 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu);
154void native_smp_prepare_boot_cpu(void); 154void native_smp_prepare_boot_cpu(void);
155void native_smp_prepare_cpus(unsigned int max_cpus); 155void native_smp_prepare_cpus(unsigned int max_cpus);
156void native_smp_cpus_done(unsigned int max_cpus); 156void native_smp_cpus_done(unsigned int max_cpus);
157void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); 158int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
158int native_cpu_disable(void); 159int native_cpu_disable(void);
159void native_cpu_die(unsigned int cpu); 160void native_cpu_die(unsigned int cpu);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6a4b00fafb00..aeb4666e0c0a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
4 4
5#ifdef __KERNEL__ 5#ifdef __KERNEL__
6 6
7#include <asm/nops.h>
8
7static inline void native_clts(void) 9static inline void native_clts(void)
8{ 10{
9 asm volatile("clts"); 11 asm volatile("clts");
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p)
199 "+m" (*(volatile char __force *)__p)); 201 "+m" (*(volatile char __force *)__p));
200} 202}
201 203
204static inline void clwb(volatile void *__p)
205{
206 volatile struct { char x[64]; } *p = __p;
207
208 asm volatile(ALTERNATIVE_2(
209 ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
210 ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
211 X86_FEATURE_CLFLUSHOPT,
212 ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
213 X86_FEATURE_CLWB)
214 : [p] "+m" (*p)
215 : [pax] "a" (p));
216}
217
218static inline void pcommit_sfence(void)
219{
220 alternative(ASM_NOP7,
221 ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
222 "sfence",
223 X86_FEATURE_PCOMMIT);
224}
225
202#define nop() asm volatile ("nop") 226#define nop() asm volatile ("nop")
203 227
204 228
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..ea2dbe82cba3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,6 +13,33 @@
13#include <asm/types.h> 13#include <asm/types.h>
14 14
15/* 15/*
16 * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
17 * reserve at the top of the kernel stack. We do it because of a nasty
18 * 32-bit corner case. On x86_32, the hardware stack frame is
19 * variable-length. Except for vm86 mode, struct pt_regs assumes a
20 * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
21 * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
22 * does in at least one case:
23 *
24 * If we take an NMI early enough in SYSENTER, then we can end up with
25 * pt_regs that extends above sp0. On the way out, in the espfix code,
26 * we can read the saved SS value, but that value will be above sp0.
27 * Without this offset, that can result in a page fault. (We are
28 * careful that, in this case, the value we read doesn't matter.)
29 *
30 * In vm86 mode, the hardware frame is much longer still, but we neither
31 * access the extra members from NMI context, nor do we write such a
32 * frame at sp0 at all.
33 *
34 * x86_64 has a fixed-length stack frame.
35 */
36#ifdef CONFIG_X86_32
37# define TOP_OF_KERNEL_STACK_PADDING 8
38#else
39# define TOP_OF_KERNEL_STACK_PADDING 0
40#endif
41
42/*
16 * low level task data that entry.S needs immediate access to 43 * low level task data that entry.S needs immediate access to
17 * - this struct should fit entirely inside of one cache line 44 * - this struct should fit entirely inside of one cache line
18 * - this struct shares the supervisor stack pages 45 * - this struct shares the supervisor stack pages
@@ -145,7 +172,6 @@ struct thread_info {
145#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) 172#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
146 173
147#define STACK_WARN (THREAD_SIZE/8) 174#define STACK_WARN (THREAD_SIZE/8)
148#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
149 175
150/* 176/*
151 * macros/functions for gaining access to the thread information structure 177 * macros/functions for gaining access to the thread information structure
@@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
158 184
159static inline struct thread_info *current_thread_info(void) 185static inline struct thread_info *current_thread_info(void)
160{ 186{
161 struct thread_info *ti; 187 return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
162 ti = (void *)(this_cpu_read_stable(kernel_stack) +
163 KERNEL_STACK_OFFSET - THREAD_SIZE);
164 return ti;
165} 188}
166 189
167static inline unsigned long current_stack_pointer(void) 190static inline unsigned long current_stack_pointer(void)
@@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void)
177 200
178#else /* !__ASSEMBLY__ */ 201#else /* !__ASSEMBLY__ */
179 202
180/* how to get the thread information struct from ASM */ 203/* Load thread_info address into "reg" */
181#define GET_THREAD_INFO(reg) \ 204#define GET_THREAD_INFO(reg) \
182 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ 205 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
183 _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; 206 _ASM_SUB $(THREAD_SIZE),reg ;
184 207
185/* 208/*
186 * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in 209 * ASM operand which evaluates to a 'thread_info' address of
187 * a certain register (to be used in assembler memory operands). 210 * the current task, if it is known that "reg" is exactly "off"
211 * bytes below the top of the stack currently.
212 *
213 * ( The kernel stack's size is known at build time, it is usually
214 * 2 or 4 pages, and the bottom of the kernel stack contains
215 * the thread_info structure. So to access the thread_info very
216 * quickly from assembly code we can calculate down from the
217 * top of the kernel stack to the bottom, using constant,
218 * build-time calculations only. )
219 *
220 * For example, to fetch the current thread_info->flags value into %eax
221 * on x86-64 defconfig kernels, in syscall entry code where RSP is
222 * currently at exactly SIZEOF_PTREGS bytes away from the top of the
223 * stack:
224 *
225 * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
226 *
227 * will translate to:
228 *
229 * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
230 *
231 * which is below the current RSP by almost 16K.
188 */ 232 */
189#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) 233#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
190 234
191#endif 235#endif
192 236
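
The -0x3f48 displacement quoted in the ASM_THREAD_INFO() comment above can be reproduced from a few build-time constants. A throwaway check, assuming THREAD_SIZE = 16 KB and SIZEOF_PTREGS = 21*8 for x86-64 defconfig; TI_flags = 16 is back-derived from the quoted disassembly, not taken from this patch:

/* Sanity-check the ASM_THREAD_INFO() arithmetic from the comment above. */
#include <stdio.h>

int main(void)
{
	long THREAD_SIZE   = 16384;	/* 4 pages on x86-64 defconfig */
	long SIZEOF_PTREGS = 21 * 8;	/* 21 fields in struct pt_regs */
	long TI_flags      = 16;	/* offset of flags in thread_info (assumed) */

	/* ASM_THREAD_INFO(field, reg, off) expands to (field + off - THREAD_SIZE)(reg) */
	long disp = TI_flags + SIZEOF_PTREGS - THREAD_SIZE;

	printf("displacement = %ld (-0x%lx)\n", disp, (unsigned long)-disp);	/* -16200, -0x3f48 */
	return 0;
}
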
@@ -236,6 +280,16 @@ static inline bool is_ia32_task(void)
236#endif 280#endif
237 return false; 281 return false;
238} 282}
283
284/*
285 * Force syscall return via IRET by making it look as if there was
286 * some work pending. IRET is our most capable (but slowest) syscall
287 * return path, which is able to restore modified SS, CS and certain
288 * EFLAGS values that other (fast) syscall return instructions
289 * are not able to restore properly.
290 */
291#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
292
239#endif /* !__ASSEMBLY__ */ 293#endif /* !__ASSEMBLY__ */
240 294
241#ifndef __ASSEMBLY__ 295#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 225b0988043a..ab456dc233b5 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -15,6 +15,7 @@
15 15
16/* loadflags */ 16/* loadflags */
17#define LOADED_HIGH (1<<0) 17#define LOADED_HIGH (1<<0)
18#define KASLR_FLAG (1<<1)
18#define QUIET_FLAG (1<<5) 19#define QUIET_FLAG (1<<5)
19#define KEEP_SEGMENTS (1<<6) 20#define KEEP_SEGMENTS (1<<6)
20#define CAN_USE_HEAP (1<<7) 21#define CAN_USE_HEAP (1<<7)
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a88851..580aee3072e0 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
25#else /* __i386__ */ 25#else /* __i386__ */
26 26
27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
28/*
29 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
30 * unless syscall needs a complete, fully filled "struct pt_regs".
31 */
28#define R15 0 32#define R15 0
29#define R14 8 33#define R14 8
30#define R13 16 34#define R13 16
31#define R12 24 35#define R12 24
32#define RBP 32 36#define RBP 32
33#define RBX 40 37#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save up to here*/ 38/* These regs are callee-clobbered. Always saved on kernel entry. */
35#define R11 48 39#define R11 48
36#define R10 56 40#define R10 56
37#define R9 64 41#define R9 64
@@ -41,15 +45,17 @@
41#define RDX 96 45#define RDX 96
42#define RSI 104 46#define RSI 104
43#define RDI 112 47#define RDI 112
44#define ORIG_RAX 120 /* = ERROR */ 48/*
45/* end of arguments */ 49 * On syscall entry, this is syscall#. On CPU exception, this is error code.
46/* cpu exception frame or undefined in case of fast syscall. */ 50 * On hw interrupt, it's IRQ number:
51 */
52#define ORIG_RAX 120
53/* Return frame for iretq */
47#define RIP 128 54#define RIP 128
48#define CS 136 55#define CS 136
49#define EFLAGS 144 56#define EFLAGS 144
50#define RSP 152 57#define RSP 152
51#define SS 160 58#define SS 160
52#define ARGOFFSET R11
53#endif /* __ASSEMBLY__ */ 59#endif /* __ASSEMBLY__ */
54 60
55/* top of stack page */ 61/* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa4d999..bc16115af39b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
41#ifndef __KERNEL__ 41#ifndef __KERNEL__
42 42
43struct pt_regs { 43struct pt_regs {
44/*
45 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
46 * unless syscall needs a complete, fully filled "struct pt_regs".
47 */
44 unsigned long r15; 48 unsigned long r15;
45 unsigned long r14; 49 unsigned long r14;
46 unsigned long r13; 50 unsigned long r13;
47 unsigned long r12; 51 unsigned long r12;
48 unsigned long rbp; 52 unsigned long rbp;
49 unsigned long rbx; 53 unsigned long rbx;
50/* arguments: non interrupts/non tracing syscalls only save up to here*/ 54/* These regs are callee-clobbered. Always saved on kernel entry. */
51 unsigned long r11; 55 unsigned long r11;
52 unsigned long r10; 56 unsigned long r10;
53 unsigned long r9; 57 unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
57 unsigned long rdx; 61 unsigned long rdx;
58 unsigned long rsi; 62 unsigned long rsi;
59 unsigned long rdi; 63 unsigned long rdi;
64/*
65 * On syscall entry, this is syscall#. On CPU exception, this is error code.
66 * On hw interrupt, it's IRQ number:
67 */
60 unsigned long orig_rax; 68 unsigned long orig_rax;
61/* end of arguments */ 69/* Return frame for iretq */
62/* cpu exception frame or undefined */
63 unsigned long rip; 70 unsigned long rip;
64 unsigned long cs; 71 unsigned long cs;
65 unsigned long eflags; 72 unsigned long eflags;
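
The layout commented above is what produces the byte offsets in ptrace-abi.h (R15 = 0 through SS = 160). A quick user-space mirror of the 64-bit structure, purely to confirm the arithmetic; it is an illustrative copy, not the uapi header itself:

/* Mirror of the 64-bit pt_regs layout to check the ptrace-abi.h offsets. */
#include <stddef.h>
#include <stdio.h>

struct pt_regs_mirror {
	unsigned long r15, r14, r13, r12, rbp, rbx;		/* callee-preserved */
	unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi;/* callee-clobbered */
	unsigned long orig_rax;					/* syscall# / error code / IRQ */
	unsigned long rip, cs, eflags, rsp, ss;			/* iret frame */
};

int main(void)
{
	printf("RBX      = %zu\n", offsetof(struct pt_regs_mirror, rbx));	/* 40  */
	printf("ORIG_RAX = %zu\n", offsetof(struct pt_regs_mirror, orig_rax));	/* 120 */
	printf("RIP      = %zu\n", offsetof(struct pt_regs_mirror, rip));	/* 128 */
	printf("SS       = %zu\n", offsetof(struct pt_regs_mirror, ss));	/* 160 */
	return 0;
}
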
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f9081e86..16dc4e8a2cd3 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,9 +177,24 @@ struct sigcontext {
177 __u64 rip; 177 __u64 rip;
178 __u64 eflags; /* RFLAGS */ 178 __u64 eflags; /* RFLAGS */
179 __u16 cs; 179 __u16 cs;
180 __u16 gs; 180
181 __u16 fs; 181 /*
182 __u16 __pad0; 182 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
183 * Linux saved and restored fs and gs in these slots. This
184 * was counterproductive, as fsbase and gsbase were never
185 * saved, so arch_prctl was presumably unreliable.
186 *
187 * If these slots are ever needed for any other purpose, there
188 * is some risk that very old 64-bit binaries could get
189 * confused. I doubt that many such binaries still work,
190 * though, since the same patch in 2.5.64 also removed the
191 * 64-bit set_thread_area syscall, so it appears that there is
192 * no TLS API that works in both pre- and post-2.5.64 kernels.
193 */
194 __u16 __pad2; /* Was gs. */
195 __u16 __pad1; /* Was fs. */
196
197 __u16 ss;
183 __u64 err; 198 __u64 err;
184 __u64 trapno; 199 __u64 trapno;
185 __u64 oldmask; 200 __u64 oldmask;
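
The ABI-sensitive part of this hunk is that cs is still followed by exactly three 16-bit fields, so swapping gs/fs/__pad0 for __pad2/__pad1/ss cannot move err, trapno or anything below them. A small check of that claim with simplified stand-ins for the old and new tails (not the real uapi struct):

/* Both tails pack four 16-bit fields between eflags and err, so the
 * later offsets are identical. Simplified mirrors for illustration. */
#include <stddef.h>
#include <stdio.h>

struct tail_old {
	unsigned long long rip, eflags;
	unsigned short cs, gs, fs, __pad0;
	unsigned long long err, trapno, oldmask;
};

struct tail_new {
	unsigned long long rip, eflags;
	unsigned short cs, __pad2, __pad1, ss;
	unsigned long long err, trapno, oldmask;
};

int main(void)
{
	printf("err offset: old %zu, new %zu\n",
	       offsetof(struct tail_old, err), offsetof(struct tail_new, err));	/* both 24 */
	printf("ss sits where __pad0 was: %zu vs %zu\n",
	       offsetof(struct tail_new, ss), offsetof(struct tail_old, __pad0));	/* both 22 */
	return 0;
}
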
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..c887cd944f0c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
33obj-$(CONFIG_X86_64) += mcount_64.o 33obj-$(CONFIG_X86_64) += mcount_64.o
34obj-y += syscall_$(BITS).o vsyscall_gtod.o 34obj-y += syscall_$(BITS).o vsyscall_gtod.o
35obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
35obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o 36obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
36obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o 37obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
37obj-$(CONFIG_SYSFS) += ksysfs.o 38obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
52__setup("noreplace-paravirt", setup_noreplace_paravirt); 52__setup("noreplace-paravirt", setup_noreplace_paravirt);
53#endif 53#endif
54 54
55#define DPRINTK(fmt, ...) \ 55#define DPRINTK(fmt, args...) \
56do { \ 56do { \
57 if (debug_alternative) \ 57 if (debug_alternative) \
58 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ 58 printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
59} while (0)
60
61#define DUMP_BYTES(buf, len, fmt, args...) \
62do { \
63 if (unlikely(debug_alternative)) { \
64 int j; \
65 \
66 if (!(len)) \
67 break; \
68 \
69 printk(KERN_DEBUG fmt, ##args); \
70 for (j = 0; j < (len) - 1; j++) \
71 printk(KERN_CONT "%02hhx ", buf[j]); \
72 printk(KERN_CONT "%02hhx\n", buf[j]); \
73 } \
59} while (0) 74} while (0)
60 75
61/* 76/*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
243extern s32 __smp_locks[], __smp_locks_end[]; 258extern s32 __smp_locks[], __smp_locks_end[];
244void *text_poke_early(void *addr, const void *opcode, size_t len); 259void *text_poke_early(void *addr, const void *opcode, size_t len);
245 260
246/* Replace instructions with better alternatives for this CPU type. 261/*
247 This runs before SMP is initialized to avoid SMP problems with 262 * Are we looking at a near JMP with a 1 or 4-byte displacement.
248 self modifying code. This implies that asymmetric systems where 263 */
249 APs have less capabilities than the boot processor are not handled. 264static inline bool is_jmp(const u8 opcode)
250 Tough. Make sure you disable such features by hand. */ 265{
266 return opcode == 0xeb || opcode == 0xe9;
267}
268
269static void __init_or_module
270recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
271{
272 u8 *next_rip, *tgt_rip;
273 s32 n_dspl, o_dspl;
274 int repl_len;
275
276 if (a->replacementlen != 5)
277 return;
278
279 o_dspl = *(s32 *)(insnbuf + 1);
280
281 /* next_rip of the replacement JMP */
282 next_rip = repl_insn + a->replacementlen;
283 /* target rip of the replacement JMP */
284 tgt_rip = next_rip + o_dspl;
285 n_dspl = tgt_rip - orig_insn;
286
287 DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
288
289 if (tgt_rip - orig_insn >= 0) {
290 if (n_dspl - 2 <= 127)
291 goto two_byte_jmp;
292 else
293 goto five_byte_jmp;
294 /* negative offset */
295 } else {
296 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
297 goto two_byte_jmp;
298 else
299 goto five_byte_jmp;
300 }
301
302two_byte_jmp:
303 n_dspl -= 2;
304
305 insnbuf[0] = 0xeb;
306 insnbuf[1] = (s8)n_dspl;
307 add_nops(insnbuf + 2, 3);
308
309 repl_len = 2;
310 goto done;
311
312five_byte_jmp:
313 n_dspl -= 5;
314
315 insnbuf[0] = 0xe9;
316 *(s32 *)&insnbuf[1] = n_dspl;
251 317
318 repl_len = 5;
319
320done:
321
322 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
323 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
324}
325
326static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
327{
328 if (instr[0] != 0x90)
329 return;
330
331 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
332
333 DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
334 instr, a->instrlen - a->padlen, a->padlen);
335}
336
337/*
338 * Replace instructions with better alternatives for this CPU type. This runs
339 * before SMP is initialized to avoid SMP problems with self modifying code.
340 * This implies that asymmetric systems where APs have less capabilities than
341 * the boot processor are not handled. Tough. Make sure you disable such
342 * features by hand.
343 */
252void __init_or_module apply_alternatives(struct alt_instr *start, 344void __init_or_module apply_alternatives(struct alt_instr *start,
253 struct alt_instr *end) 345 struct alt_instr *end)
254{ 346{
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
256 u8 *instr, *replacement; 348 u8 *instr, *replacement;
257 u8 insnbuf[MAX_PATCH_LEN]; 349 u8 insnbuf[MAX_PATCH_LEN];
258 350
259 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 351 DPRINTK("alt table %p -> %p", start, end);
260 /* 352 /*
261 * The scan order should be from start to end. A later scanned 353 * The scan order should be from start to end. A later scanned
262 * alternative code can overwrite a previous scanned alternative code. 354 * alternative code can overwrite previously scanned alternative code.
263 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 355 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
264 * patch code. 356 * patch code.
265 * 357 *
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
267 * order. 359 * order.
268 */ 360 */
269 for (a = start; a < end; a++) { 361 for (a = start; a < end; a++) {
362 int insnbuf_sz = 0;
363
270 instr = (u8 *)&a->instr_offset + a->instr_offset; 364 instr = (u8 *)&a->instr_offset + a->instr_offset;
271 replacement = (u8 *)&a->repl_offset + a->repl_offset; 365 replacement = (u8 *)&a->repl_offset + a->repl_offset;
272 BUG_ON(a->replacementlen > a->instrlen);
273 BUG_ON(a->instrlen > sizeof(insnbuf)); 366 BUG_ON(a->instrlen > sizeof(insnbuf));
274 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 367 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
275 if (!boot_cpu_has(a->cpuid)) 368 if (!boot_cpu_has(a->cpuid)) {
369 if (a->padlen > 1)
370 optimize_nops(a, instr);
371
276 continue; 372 continue;
373 }
374
375 DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
376 a->cpuid >> 5,
377 a->cpuid & 0x1f,
378 instr, a->instrlen,
379 replacement, a->replacementlen, a->padlen);
380
381 DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
382 DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
277 383
278 memcpy(insnbuf, replacement, a->replacementlen); 384 memcpy(insnbuf, replacement, a->replacementlen);
385 insnbuf_sz = a->replacementlen;
279 386
280 /* 0xe8 is a relative jump; fix the offset. */ 387 /* 0xe8 is a relative jump; fix the offset. */
281 if (*insnbuf == 0xe8 && a->replacementlen == 5) 388 if (*insnbuf == 0xe8 && a->replacementlen == 5) {
282 *(s32 *)(insnbuf + 1) += replacement - instr; 389 *(s32 *)(insnbuf + 1) += replacement - instr;
390 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
391 *(s32 *)(insnbuf + 1),
392 (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
393 }
394
395 if (a->replacementlen && is_jmp(replacement[0]))
396 recompute_jump(a, instr, replacement, insnbuf);
283 397
284 add_nops(insnbuf + a->replacementlen, 398 if (a->instrlen > a->replacementlen) {
285 a->instrlen - a->replacementlen); 399 add_nops(insnbuf + a->replacementlen,
400 a->instrlen - a->replacementlen);
401 insnbuf_sz += a->instrlen - a->replacementlen;
402 }
403 DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
286 404
287 text_poke_early(instr, insnbuf, a->instrlen); 405 text_poke_early(instr, insnbuf, insnbuf_sz);
288 } 406 }
289} 407}
290 408
291#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
292
293static void alternatives_smp_lock(const s32 *start, const s32 *end, 410static void alternatives_smp_lock(const s32 *start, const s32 *end,
294 u8 *text, u8 *text_end) 411 u8 *text, u8 *text_end)
295{ 412{
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
371 smp->locks_end = locks_end; 488 smp->locks_end = locks_end;
372 smp->text = text; 489 smp->text = text;
373 smp->text_end = text_end; 490 smp->text_end = text_end;
374 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", 491 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
375 __func__, smp->locks, smp->locks_end, 492 smp->locks, smp->locks_end,
376 smp->text, smp->text_end, smp->name); 493 smp->text, smp->text_end, smp->name);
377 494
378 list_add_tail(&smp->next, &smp_alt_modules); 495 list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
440 557
441 return 0; 558 return 0;
442} 559}
443#endif 560#endif /* CONFIG_SMP */
444 561
445#ifdef CONFIG_PARAVIRT 562#ifdef CONFIG_PARAVIRT
446void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 563void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
601 if (likely(!bp_patching_in_progress)) 718 if (likely(!bp_patching_in_progress))
602 return 0; 719 return 0;
603 720
604 if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) 721 if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
605 return 0; 722 return 0;
606 723
607 /* set up the specified breakpoint handler */ 724 /* set up the specified breakpoint handler */
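
The recompute_jump() helper added earlier in this file relocates a 5-byte relative JMP from the replacement area to the patch site: it recovers the target from the old displacement, re-expresses it relative to the new location, and shrinks the instruction to the 2-byte form when the new displacement fits a signed byte. The same arithmetic as a standalone sketch with invented addresses (nothing below is kernel code):

/* Standalone sketch of the displacement math in recompute_jump(). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t repl_insn = 0xffffffff81e00100ULL;	/* JMP lives here (replacement section) */
	uint64_t orig_insn = 0xffffffff81000200ULL;	/* patch site it is copied to */
	int32_t  o_dspl    = 0x30;			/* rel32 of the original 5-byte JMP */

	/* Where the JMP really wants to go: next_rip + old displacement. */
	uint64_t tgt_rip = repl_insn + 5 + o_dspl;

	/* Same target, now expressed relative to the patch site. */
	int64_t n_dspl = (int64_t)(tgt_rip - orig_insn);

	if (n_dspl - 2 >= -128 && n_dspl - 2 <= 127)
		printf("2-byte JMP: eb %02x\n", (uint8_t)(n_dspl - 2));
	else
		printf("5-byte JMP: e9 rel32=0x%08x\n", (uint32_t)(n_dspl - 5));

	return 0;
}

The -2 and -5 adjustments account for the displacement being measured from the end of the JMP instruction in each encoding.
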
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
68 68
69 /* Offset from the sysenter stack to tss.sp0 */ 69 /* Offset from the sysenter stack to tss.sp0 */
70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 70 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
71 sizeof(struct tss_struct)); 71 offsetofend(struct tss_struct, SYSENTER_stack));
72 72
73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 73#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
74 BLANK(); 74 BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
81#undef ENTRY 81#undef ENTRY
82 82
83 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 83 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
84 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
84 BLANK(); 85 BLANK();
85 86
86 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); 87 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); 711 set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
712 712
713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 713 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
714
715 /* 3DNow or LM implies PREFETCHW */
716 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
717 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
718 set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
714} 719}
715 720
716#ifdef CONFIG_X86_32 721#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..3f70538012e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -959,38 +959,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
959#endif 959#endif
960} 960}
961 961
962#ifdef CONFIG_X86_64 962/*
963#ifdef CONFIG_IA32_EMULATION 963 * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
964/* May not be __init: called during resume */ 964 * on 32-bit kernels:
965static void syscall32_cpu_init(void) 965 */
966{
967 /* Load these always in case some future AMD CPU supports
968 SYSENTER from compat mode too. */
969 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
970 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
971 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
972
973 wrmsrl(MSR_CSTAR, ia32_cstar_target);
974}
975#endif /* CONFIG_IA32_EMULATION */
976#endif /* CONFIG_X86_64 */
977
978#ifdef CONFIG_X86_32 966#ifdef CONFIG_X86_32
979void enable_sep_cpu(void) 967void enable_sep_cpu(void)
980{ 968{
981 int cpu = get_cpu(); 969 struct tss_struct *tss;
982 struct tss_struct *tss = &per_cpu(init_tss, cpu); 970 int cpu;
983 971
984 if (!boot_cpu_has(X86_FEATURE_SEP)) { 972 cpu = get_cpu();
985 put_cpu(); 973 tss = &per_cpu(cpu_tss, cpu);
986 return; 974
987 } 975 if (!boot_cpu_has(X86_FEATURE_SEP))
976 goto out;
977
978 /*
979 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
980 * see the big comment in struct x86_hw_tss's definition.
981 */
988 982
989 tss->x86_tss.ss1 = __KERNEL_CS; 983 tss->x86_tss.ss1 = __KERNEL_CS;
990 tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; 984 wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
991 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 985
992 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); 986 wrmsr(MSR_IA32_SYSENTER_ESP,
993 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); 987 (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
988 0);
989
990 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
991
992out:
994 put_cpu(); 993 put_cpu();
995} 994}
996#endif 995#endif
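
enable_sep_cpu() now programs MSR_IA32_SYSENTER_ESP to the first byte after the TSS's SYSENTER_stack via offsetofend(). That helper is just offsetof() plus the member's size; a toy demonstration with a stand-in struct (the real tss_struct layout is not reproduced here):

/* offsetofend(TYPE, MEMBER) == offsetof(TYPE, MEMBER) + sizeof(MEMBER):
 * the first byte *after* the member, which is what the SYSENTER stack
 * pointer setup above wants. Toy struct, not the real tss_struct. */
#include <stddef.h>
#include <stdio.h>

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct toy_tss {
	unsigned long hw_tss[26];		/* stand-in for the hardware TSS */
	unsigned long SYSENTER_stack[64];	/* stand-in for the sysenter stack */
	unsigned char io_bitmap[8192];
};

int main(void)
{
	printf("SYSENTER_stack ends at offset %zu\n",
	       offsetofend(struct toy_tss, SYSENTER_stack));
	return 0;
}
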
@@ -1118,7 +1117,7 @@ static __init int setup_disablecpuid(char *arg)
1118__setup("clearcpuid=", setup_disablecpuid); 1117__setup("clearcpuid=", setup_disablecpuid);
1119 1118
1120DEFINE_PER_CPU(unsigned long, kernel_stack) = 1119DEFINE_PER_CPU(unsigned long, kernel_stack) =
1121 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; 1120 (unsigned long)&init_thread_union + THREAD_SIZE;
1122EXPORT_PER_CPU_SYMBOL(kernel_stack); 1121EXPORT_PER_CPU_SYMBOL(kernel_stack);
1123 1122
1124#ifdef CONFIG_X86_64 1123#ifdef CONFIG_X86_64
@@ -1130,8 +1129,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
1130 irq_stack_union) __aligned(PAGE_SIZE) __visible; 1129 irq_stack_union) __aligned(PAGE_SIZE) __visible;
1131 1130
1132/* 1131/*
1133 * The following four percpu variables are hot. Align current_task to 1132 * The following percpu variables are hot. Align current_task to
1134 * cacheline size such that all four fall in the same cacheline. 1133 * cacheline size such that they fall in the same cacheline.
1135 */ 1134 */
1136DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = 1135DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
1137 &init_task; 1136 &init_task;
@@ -1171,10 +1170,23 @@ void syscall_init(void)
1171 */ 1170 */
1172 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 1171 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1173 wrmsrl(MSR_LSTAR, system_call); 1172 wrmsrl(MSR_LSTAR, system_call);
1174 wrmsrl(MSR_CSTAR, ignore_sysret);
1175 1173
1176#ifdef CONFIG_IA32_EMULATION 1174#ifdef CONFIG_IA32_EMULATION
1177 syscall32_cpu_init(); 1175 wrmsrl(MSR_CSTAR, ia32_cstar_target);
1176 /*
1177 * This only works on Intel CPUs.
1178 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
1179 * This does not cause SYSENTER to jump to the wrong location, because
1180 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
1181 */
1182 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
1183 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1184 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
1185#else
1186 wrmsrl(MSR_CSTAR, ignore_sysret);
1187 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
1188 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1189 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
1178#endif 1190#endif
1179 1191
1180 /* Flags to clear on syscall */ 1192 /* Flags to clear on syscall */
@@ -1226,6 +1238,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
1226EXPORT_PER_CPU_SYMBOL(__preempt_count); 1238EXPORT_PER_CPU_SYMBOL(__preempt_count);
1227DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); 1239DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1228 1240
1241/*
1242 * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
1243 * the top of the kernel stack. Use an extra percpu variable to track the
1244 * top of the kernel stack directly.
1245 */
1246DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
1247 (unsigned long)&init_thread_union + THREAD_SIZE;
1248EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
1249
1229#ifdef CONFIG_CC_STACKPROTECTOR 1250#ifdef CONFIG_CC_STACKPROTECTOR
1230DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 1251DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
1231#endif 1252#endif
@@ -1307,7 +1328,7 @@ void cpu_init(void)
1307 */ 1328 */
1308 load_ucode_ap(); 1329 load_ucode_ap();
1309 1330
1310 t = &per_cpu(init_tss, cpu); 1331 t = &per_cpu(cpu_tss, cpu);
1311 oist = &per_cpu(orig_ist, cpu); 1332 oist = &per_cpu(orig_ist, cpu);
1312 1333
1313#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
@@ -1391,7 +1412,7 @@ void cpu_init(void)
1391{ 1412{
1392 int cpu = smp_processor_id(); 1413 int cpu = smp_processor_id();
1393 struct task_struct *curr = current; 1414 struct task_struct *curr = current;
1394 struct tss_struct *t = &per_cpu(init_tss, cpu); 1415 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
1395 struct thread_struct *thread = &curr->thread; 1416 struct thread_struct *thread = &curr->thread;
1396 1417
1397 wait_for_master_cpu(cpu); 1418 wait_for_master_cpu(cpu);
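
For reference, the MSR_STAR value written in the syscall_init() hunk above packs the SYSRET user code-segment base into bits 63:48 and the SYSCALL kernel CS into bits 47:32. A user-space sketch of that packing; the selector values are the conventional x86-64 GDT layout and are an assumption here, not something this patch defines:

/* Compose the MSR_STAR value the way syscall_init() does. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t kernel_cs = 0x10;	/* assumed __KERNEL_CS: GDT entry 2, RPL 0 */
	uint16_t user32_cs = 0x23;	/* assumed __USER32_CS: GDT entry 4, RPL 3 */

	uint64_t star = ((uint64_t)user32_cs << 48) | ((uint64_t)kernel_cs << 32);

	printf("MSR_STAR = 0x%016llx\n", (unsigned long long)star);
	return 0;
}
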
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..e2888a3ad1e3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2147,24 +2147,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
2147static unsigned long code_segment_base(struct pt_regs *regs) 2147static unsigned long code_segment_base(struct pt_regs *regs)
2148{ 2148{
2149 /* 2149 /*
2150 * For IA32 we look at the GDT/LDT segment base to convert the
2151 * effective IP to a linear address.
2152 */
2153
2154#ifdef CONFIG_X86_32
2155 /*
2150 * If we are in VM86 mode, add the segment offset to convert to a 2156 * If we are in VM86 mode, add the segment offset to convert to a
2151 * linear address. 2157 * linear address.
2152 */ 2158 */
2153 if (regs->flags & X86_VM_MASK) 2159 if (regs->flags & X86_VM_MASK)
2154 return 0x10 * regs->cs; 2160 return 0x10 * regs->cs;
2155 2161
2156 /*
2157 * For IA32 we look at the GDT/LDT segment base to convert the
2158 * effective IP to a linear address.
2159 */
2160#ifdef CONFIG_X86_32
2161 if (user_mode(regs) && regs->cs != __USER_CS) 2162 if (user_mode(regs) && regs->cs != __USER_CS)
2162 return get_segment_base(regs->cs); 2163 return get_segment_base(regs->cs);
2163#else 2164#else
2164 if (test_thread_flag(TIF_IA32)) { 2165 if (user_mode(regs) && !user_64bit_mode(regs) &&
2165 if (user_mode(regs) && regs->cs != __USER32_CS) 2166 regs->cs != __USER32_CS)
2166 return get_segment_base(regs->cs); 2167 return get_segment_base(regs->cs);
2167 }
2168#endif 2168#endif
2169 return 0; 2169 return 0;
2170} 2170}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
105#ifdef CONFIG_X86_32 105#ifdef CONFIG_X86_32
106 struct pt_regs fixed_regs; 106 struct pt_regs fixed_regs;
107 107
108 if (!user_mode_vm(regs)) { 108 if (!user_mode(regs)) {
109 crash_fixup_ss_esp(&fixed_regs, regs); 109 crash_fixup_ss_esp(&fixed_regs, regs);
110 regs = &fixed_regs; 110 regs = &fixed_regs;
111 } 111 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..ab3b65639a3e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -278,7 +278,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
278 print_modules(); 278 print_modules();
279 show_regs(regs); 279 show_regs(regs);
280#ifdef CONFIG_X86_32 280#ifdef CONFIG_X86_32
281 if (user_mode_vm(regs)) { 281 if (user_mode(regs)) {
282 sp = regs->sp; 282 sp = regs->sp;
283 ss = regs->ss & 0xffff; 283 ss = regs->ss & 0xffff;
284 } else { 284 } else {
@@ -307,7 +307,7 @@ void die(const char *str, struct pt_regs *regs, long err)
307 unsigned long flags = oops_begin(); 307 unsigned long flags = oops_begin();
308 int sig = SIGSEGV; 308 int sig = SIGSEGV;
309 309
310 if (!user_mode_vm(regs)) 310 if (!user_mode(regs))
311 report_bug(regs->ip, regs); 311 report_bug(regs->ip, regs);
312 312
313 if (__die(str, regs, err)) 313 if (__die(str, regs, err))
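
These call sites can switch from user_mode_vm() to user_mode() because, after this series, the plain helper also treats a vm86 frame as user mode. A rough standalone sketch of that kind of check, using the architectural RPL and EFLAGS.VM bits; the real helper lives in asm/ptrace.h and is not shown in this diff:

/* Sketch: "user mode" on 32-bit means CPL 3 in CS or EFLAGS.VM set. */
#include <stdio.h>

#define X86_VM_MASK	 0x00020000UL	/* EFLAGS.VM */
#define SEGMENT_RPL_MASK 0x3UL
#define USER_RPL	 0x3UL

static int user_mode_sketch(unsigned long cs, unsigned long flags)
{
	return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL;
}

int main(void)
{
	printf("kernel: %d\n", user_mode_sketch(0x10, 0x202));		/* 0 */
	printf("user:   %d\n", user_mode_sketch(0x23, 0x202));		/* 1 */
	printf("vm86:   %d\n", user_mode_sketch(0x0, 0x00020202));	/* 1 */
	return 0;
}
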
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..39891ff50d03 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -123,13 +123,13 @@ void show_regs(struct pt_regs *regs)
123 int i; 123 int i;
124 124
125 show_regs_print_info(KERN_EMERG); 125 show_regs_print_info(KERN_EMERG);
126 __show_regs(regs, !user_mode_vm(regs)); 126 __show_regs(regs, !user_mode(regs));
127 127
128 /* 128 /*
129 * When in-kernel, we also print out the stack and code at the 129 * When in-kernel, we also print out the stack and code at the
130 * time of the fault.. 130 * time of the fault..
131 */ 131 */
132 if (!user_mode_vm(regs)) { 132 if (!user_mode(regs)) {
133 unsigned int code_prologue = code_bytes * 43 / 64; 133 unsigned int code_prologue = code_bytes * 43 / 64;
134 unsigned int code_len = code_bytes; 134 unsigned int code_len = code_bytes;
135 unsigned char c; 135 unsigned char c;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
395 /*CFI_REL_OFFSET cs, 0*/ 395 /*CFI_REL_OFFSET cs, 0*/
396 /* 396 /*
397 * Push current_thread_info()->sysenter_return to the stack. 397 * Push current_thread_info()->sysenter_return to the stack.
398 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 398 * A tiny bit of offset fixup is necessary: TI_sysenter_return
399 * pushed above; +8 corresponds to copy_thread's esp0 setting. 399 * is relative to thread_info, which is at the bottom of the
400 * kernel stack page. 4*4 means the 4 words pushed above;
401 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
402 * and THREAD_SIZE takes us to the bottom.
400 */ 403 */
401 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) 404 pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
402 CFI_REL_OFFSET eip, 0 405 CFI_REL_OFFSET eip, 0
403 406
404 pushl_cfi %eax 407 pushl_cfi %eax
@@ -432,7 +435,7 @@ sysenter_after_call:
432 TRACE_IRQS_OFF 435 TRACE_IRQS_OFF
433 movl TI_flags(%ebp), %ecx 436 movl TI_flags(%ebp), %ecx
434 testl $_TIF_ALLWORK_MASK, %ecx 437 testl $_TIF_ALLWORK_MASK, %ecx
435 jne sysexit_audit 438 jnz sysexit_audit
436sysenter_exit: 439sysenter_exit:
437/* if something modifies registers it must also disable sysexit */ 440/* if something modifies registers it must also disable sysexit */
438 movl PT_EIP(%esp), %edx 441 movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
460 463
461sysexit_audit: 464sysexit_audit:
462 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 465 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
463 jne syscall_exit_work 466 jnz syscall_exit_work
464 TRACE_IRQS_ON 467 TRACE_IRQS_ON
465 ENABLE_INTERRUPTS(CLBR_ANY) 468 ENABLE_INTERRUPTS(CLBR_ANY)
466 movl %eax,%edx /* second arg, syscall return value */ 469 movl %eax,%edx /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
472 TRACE_IRQS_OFF 475 TRACE_IRQS_OFF
473 movl TI_flags(%ebp), %ecx 476 movl TI_flags(%ebp), %ecx
474 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx 477 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
475 jne syscall_exit_work 478 jnz syscall_exit_work
476 movl PT_EAX(%esp),%eax /* reload syscall return value */ 479 movl PT_EAX(%esp),%eax /* reload syscall return value */
477 jmp sysenter_exit 480 jmp sysenter_exit
478#endif 481#endif
@@ -510,7 +513,7 @@ syscall_exit:
510 TRACE_IRQS_OFF 513 TRACE_IRQS_OFF
511 movl TI_flags(%ebp), %ecx 514 movl TI_flags(%ebp), %ecx
512 testl $_TIF_ALLWORK_MASK, %ecx # current->work 515 testl $_TIF_ALLWORK_MASK, %ecx # current->work
513 jne syscall_exit_work 516 jnz syscall_exit_work
514 517
515restore_all: 518restore_all:
516 TRACE_IRQS_IRET 519 TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and
612#ifdef CONFIG_VM86 615#ifdef CONFIG_VM86
613 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 616 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
614 movl %esp, %eax 617 movl %esp, %eax
615 jne work_notifysig_v86 # returning to kernel-space or 618 jnz work_notifysig_v86 # returning to kernel-space or
616 # vm86-space 619 # vm86-space
6171: 6201:
618#else 621#else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
720.endm 723.endm
721 724
722/* 725/*
723 * Build the entry stubs and pointer table with some assembler magic. 726 * Build the entry stubs with some assembler magic.
724 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 727 * We pack 1 stub into every 8-byte block.
725 * single cache line on all modern x86 implementations.
726 */ 728 */
727.section .init.rodata,"a" 729 .align 8
728ENTRY(interrupt)
729.section .entry.text, "ax"
730 .p2align 5
731 .p2align CONFIG_X86_L1_CACHE_SHIFT
732ENTRY(irq_entries_start) 730ENTRY(irq_entries_start)
733 RING0_INT_FRAME 731 RING0_INT_FRAME
734vector=FIRST_EXTERNAL_VECTOR 732 vector=FIRST_EXTERNAL_VECTOR
735.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 733 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
736 .balign 32 734 pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
737 .rept 7 735 vector=vector+1
738 .if vector < FIRST_SYSTEM_VECTOR 736 jmp common_interrupt
739 .if vector <> FIRST_EXTERNAL_VECTOR
740 CFI_ADJUST_CFA_OFFSET -4 737 CFI_ADJUST_CFA_OFFSET -4
741 .endif 738 .align 8
7421: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ 739 .endr
743 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
744 jmp 2f
745 .endif
746 .previous
747 .long 1b
748 .section .entry.text, "ax"
749vector=vector+1
750 .endif
751 .endr
7522: jmp common_interrupt
753.endr
754END(irq_entries_start) 740END(irq_entries_start)
755 741
756.previous
757END(interrupt)
758.previous
759
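
The rewritten irq_entries_start packs one stub per 8 bytes by pushing $(~vector+0x80), which always fits a sign-extended byte, so every stub assembles to the same short push/jmp pair. A quick loop to confirm the range claim and that the vector is recoverable by undoing the same math (the actual recovery happens later in the interrupt path, outside this hunk):

/* Check the "$(~vector+0x80)" immediate used by the new IRQ stubs. */
#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20

int main(void)
{
	int vector, bad = 0;

	for (vector = FIRST_EXTERNAL_VECTOR; vector <= 0xff; vector++) {
		int pushed = ~vector + 0x80;		/* what the stub pushes */
		int recovered = ~(pushed - 0x80);	/* undo it: back to vector */

		if (pushed < -128 || pushed > 127 || recovered != vector)
			bad++;
	}
	printf("%s\n", bad ? "encoding broken" : "all vectors fit in a signed byte");
	return 0;
}

The fixed 8-byte stride is what makes each stub's address computable from the vector number alone.
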
760/* 742/*
761 * the CPU automatically disables interrupts when executing an IRQ vector, 743 * the CPU automatically disables interrupts when executing an IRQ vector,
762 * so IRQ-flags tracing has to follow that: 744 * so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
816 pushl_cfi $0 798 pushl_cfi $0
817#ifdef CONFIG_X86_INVD_BUG 799#ifdef CONFIG_X86_INVD_BUG
818 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 800 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
819661: pushl_cfi $do_general_protection 801 ALTERNATIVE "pushl_cfi $do_general_protection", \
820662: 802 "pushl $do_simd_coprocessor_error", \
821.section .altinstructions,"a" 803 X86_FEATURE_XMM
822 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
823.previous
824.section .altinstr_replacement,"ax"
825663: pushl $do_simd_coprocessor_error
826664:
827.previous
828#else 804#else
829 pushl_cfi $do_simd_coprocessor_error 805 pushl_cfi $do_simd_coprocessor_error
830#endif 806#endif
@@ -1240,20 +1216,13 @@ error_code:
1240 /*CFI_REL_OFFSET es, 0*/ 1216 /*CFI_REL_OFFSET es, 0*/
1241 pushl_cfi %ds 1217 pushl_cfi %ds
1242 /*CFI_REL_OFFSET ds, 0*/ 1218 /*CFI_REL_OFFSET ds, 0*/
1243 pushl_cfi %eax 1219 pushl_cfi_reg eax
1244 CFI_REL_OFFSET eax, 0 1220 pushl_cfi_reg ebp
1245 pushl_cfi %ebp 1221 pushl_cfi_reg edi
1246 CFI_REL_OFFSET ebp, 0 1222 pushl_cfi_reg esi
1247 pushl_cfi %edi 1223 pushl_cfi_reg edx
1248 CFI_REL_OFFSET edi, 0 1224 pushl_cfi_reg ecx
1249 pushl_cfi %esi 1225 pushl_cfi_reg ebx
1250 CFI_REL_OFFSET esi, 0
1251 pushl_cfi %edx
1252 CFI_REL_OFFSET edx, 0
1253 pushl_cfi %ecx
1254 CFI_REL_OFFSET ecx, 0
1255 pushl_cfi %ebx
1256 CFI_REL_OFFSET ebx, 0
1257 cld 1226 cld
1258 movl $(__KERNEL_PERCPU), %ecx 1227 movl $(__KERNEL_PERCPU), %ecx
1259 movl %ecx, %fs 1228 movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f0095a76c182..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
14 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
16 * 16 *
17 * Normal syscalls and interrupts don't save a full stack frame, this is
18 * only done for syscall tracing, signals or fork/exec et.al.
19 *
20 * A note on terminology: 17 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP 18 * - iret frame: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack. 19 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all register saved.
25 * 20 *
26 * Some macro usage: 21 * Some macro usage:
27 * - CFI macros are used to generate dwarf2 unwind information for better 22 * - CFI macros are used to generate dwarf2 unwind information for better
28 * backtraces. They don't change any code. 23 * backtraces. They don't change any code.
29 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
30 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
31 * There are unfortunately lots of special cases where some registers
32 * not touched. The macro is a big mess that should be cleaned up.
33 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
34 * Gives a full stack frame.
35 * - ENTRY/END Define functions in the symbol table. 24 * - ENTRY/END Define functions in the symbol table.
36 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
37 * frame that is otherwise undefined after a SYSCALL
38 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. 25 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
39 * - idtentry - Define exception entry points. 26 * - idtentry - Define exception entry points.
40 */ 27 */
@@ -70,10 +57,6 @@
70 .section .entry.text, "ax" 57 .section .entry.text, "ax"
71 58
72 59
73#ifndef CONFIG_PREEMPT
74#define retint_kernel retint_restore_args
75#endif
76
77#ifdef CONFIG_PARAVIRT 60#ifdef CONFIG_PARAVIRT
78ENTRY(native_usergs_sysret64) 61ENTRY(native_usergs_sysret64)
79 swapgs 62 swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
82#endif /* CONFIG_PARAVIRT */ 65#endif /* CONFIG_PARAVIRT */
83 66
84 67
85.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 68.macro TRACE_IRQS_IRETQ
86#ifdef CONFIG_TRACE_IRQFLAGS 69#ifdef CONFIG_TRACE_IRQFLAGS
87 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 70 bt $9,EFLAGS(%rsp) /* interrupts off? */
88 jnc 1f 71 jnc 1f
89 TRACE_IRQS_ON 72 TRACE_IRQS_ON
901: 731:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
116 call debug_stack_reset 99 call debug_stack_reset
117.endm 100.endm
118 101
119.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET 102.macro TRACE_IRQS_IRETQ_DEBUG
120 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ 103 bt $9,EFLAGS(%rsp) /* interrupts off? */
121 jnc 1f 104 jnc 1f
122 TRACE_IRQS_ON_DEBUG 105 TRACE_IRQS_ON_DEBUG
1231: 1061:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
130#endif 113#endif
131 114
132/* 115/*
133 * C code is not supposed to know about undefined top of stack. Every time 116 * empty frame
134 * a C function with an pt_regs argument is called from the SYSCALL based
135 * fast path FIXUP_TOP_OF_STACK is needed.
136 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
137 * manipulation.
138 */
139
140 /* %rsp:at FRAMEEND */
141 .macro FIXUP_TOP_OF_STACK tmp offset=0
142 movq PER_CPU_VAR(old_rsp),\tmp
143 movq \tmp,RSP+\offset(%rsp)
144 movq $__USER_DS,SS+\offset(%rsp)
145 movq $__USER_CS,CS+\offset(%rsp)
146 movq RIP+\offset(%rsp),\tmp /* get rip */
147 movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
148 movq R11+\offset(%rsp),\tmp /* get eflags */
149 movq \tmp,EFLAGS+\offset(%rsp)
150 .endm
151
152 .macro RESTORE_TOP_OF_STACK tmp offset=0
153 movq RSP+\offset(%rsp),\tmp
154 movq \tmp,PER_CPU_VAR(old_rsp)
155 movq EFLAGS+\offset(%rsp),\tmp
156 movq \tmp,R11+\offset(%rsp)
157 .endm
158
159/*
160 * initial frame state for interrupts (and exceptions without error code)
161 */ 117 */
162 .macro EMPTY_FRAME start=1 offset=0 118 .macro EMPTY_FRAME start=1 offset=0
163 .if \start 119 .if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
173 * initial frame state for interrupts (and exceptions without error code) 129 * initial frame state for interrupts (and exceptions without error code)
174 */ 130 */
175 .macro INTR_FRAME start=1 offset=0 131 .macro INTR_FRAME start=1 offset=0
176 EMPTY_FRAME \start, SS+8+\offset-RIP 132 EMPTY_FRAME \start, 5*8+\offset
177 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ 133 /*CFI_REL_OFFSET ss, 4*8+\offset*/
178 CFI_REL_OFFSET rsp, RSP+\offset-RIP 134 CFI_REL_OFFSET rsp, 3*8+\offset
179 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ 135 /*CFI_REL_OFFSET rflags, 2*8+\offset*/
180 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ 136 /*CFI_REL_OFFSET cs, 1*8+\offset*/
181 CFI_REL_OFFSET rip, RIP+\offset-RIP 137 CFI_REL_OFFSET rip, 0*8+\offset
182 .endm 138 .endm
183 139
184/* 140/*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
186 * with vector already pushed) 142 * with vector already pushed)
187 */ 143 */
188 .macro XCPT_FRAME start=1 offset=0 144 .macro XCPT_FRAME start=1 offset=0
189 INTR_FRAME \start, RIP+\offset-ORIG_RAX 145 INTR_FRAME \start, 1*8+\offset
190 .endm
191
192/*
193 * frame that enables calling into C.
194 */
195 .macro PARTIAL_FRAME start=1 offset=0
196 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
197 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
198 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
199 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
200 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
201 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
202 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
203 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
204 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
205 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
206 .endm 146 .endm
207 147
208/* 148/*
209 * frame that enables passing a complete pt_regs to a C function. 149 * frame that enables passing a complete pt_regs to a C function.
210 */ 150 */
211 .macro DEFAULT_FRAME start=1 offset=0 151 .macro DEFAULT_FRAME start=1 offset=0
212 PARTIAL_FRAME \start, R11+\offset-R15 152 XCPT_FRAME \start, ORIG_RAX+\offset
153 CFI_REL_OFFSET rdi, RDI+\offset
154 CFI_REL_OFFSET rsi, RSI+\offset
155 CFI_REL_OFFSET rdx, RDX+\offset
156 CFI_REL_OFFSET rcx, RCX+\offset
157 CFI_REL_OFFSET rax, RAX+\offset
158 CFI_REL_OFFSET r8, R8+\offset
159 CFI_REL_OFFSET r9, R9+\offset
160 CFI_REL_OFFSET r10, R10+\offset
161 CFI_REL_OFFSET r11, R11+\offset
213 CFI_REL_OFFSET rbx, RBX+\offset 162 CFI_REL_OFFSET rbx, RBX+\offset
214 CFI_REL_OFFSET rbp, RBP+\offset 163 CFI_REL_OFFSET rbp, RBP+\offset
215 CFI_REL_OFFSET r12, R12+\offset 164 CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
218 CFI_REL_OFFSET r15, R15+\offset 167 CFI_REL_OFFSET r15, R15+\offset
219 .endm 168 .endm
220 169
221ENTRY(save_paranoid)
222 XCPT_FRAME 1 RDI+8
223 cld
224 movq %rdi, RDI+8(%rsp)
225 movq %rsi, RSI+8(%rsp)
226 movq_cfi rdx, RDX+8
227 movq_cfi rcx, RCX+8
228 movq_cfi rax, RAX+8
229 movq %r8, R8+8(%rsp)
230 movq %r9, R9+8(%rsp)
231 movq %r10, R10+8(%rsp)
232 movq %r11, R11+8(%rsp)
233 movq_cfi rbx, RBX+8
234 movq %rbp, RBP+8(%rsp)
235 movq %r12, R12+8(%rsp)
236 movq %r13, R13+8(%rsp)
237 movq %r14, R14+8(%rsp)
238 movq %r15, R15+8(%rsp)
239 movl $1,%ebx
240 movl $MSR_GS_BASE,%ecx
241 rdmsr
242 testl %edx,%edx
243 js 1f /* negative -> in kernel */
244 SWAPGS
245 xorl %ebx,%ebx
2461: ret
247 CFI_ENDPROC
248END(save_paranoid)
249
250/* 170/*
251 * A newly forked process directly context switches into this address. 171 * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
252 * 172 *
253 * rdi: prev task we switched from 173 * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
254 */ 174 * then loads new ss, cs, and rip from previously programmed MSRs.
255ENTRY(ret_from_fork) 175 * rflags gets masked by a value from another MSR (so CLD and CLAC
256 DEFAULT_FRAME 176 * are not needed). SYSCALL does not save anything on the stack
257 177 * and does not change rsp.
258 LOCK ; btr $TIF_FORK,TI_flags(%r8)
259
260 pushq_cfi $0x0002
261 popfq_cfi # reset kernel eflags
262
263 call schedule_tail # rdi: 'prev' task parameter
264
265 GET_THREAD_INFO(%rcx)
266
267 RESTORE_REST
268
269 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
270 jz 1f
271
272 /*
273 * By the time we get here, we have no idea whether our pt_regs,
274 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
275 * the slow path, or one of the ia32entry paths.
276 * Use int_ret_from_sys_call to return, since it can safely handle
277 * all of the above.
278 */
279 jmp int_ret_from_sys_call
280
2811:
282 subq $REST_SKIP, %rsp # leave space for volatiles
283 CFI_ADJUST_CFA_OFFSET REST_SKIP
284 movq %rbp, %rdi
285 call *%rbx
286 movl $0, RAX(%rsp)
287 RESTORE_REST
288 jmp int_ret_from_sys_call
289 CFI_ENDPROC
290END(ret_from_fork)
291
292/*
293 * System call entry. Up to 6 arguments in registers are supported.
294 * 178 *
295 * SYSCALL does not save anything on the stack and does not change the 179 * Registers on entry:
296 * stack pointer. However, it does mask the flags register for us, so
297 * CLD and CLAC are not needed.
298 */
299
300/*
301 * Register setup:
302 * rax system call number 180 * rax system call number
181 * rcx return address
182 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
303 * rdi arg0 183 * rdi arg0
304 * rcx return address for syscall/sysret, C arg3
305 * rsi arg1 184 * rsi arg1
306 * rdx arg2 185 * rdx arg2
307 * r10 arg3 (--> moved to rcx for C) 186 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
308 * r8 arg4 187 * r8 arg4
309 * r9 arg5 188 * r9 arg5
310 * r11 eflags for syscall/sysret, temporary for C 189 * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
311 * r12-r15,rbp,rbx saved by C code, not touched.
312 * 190 *
313 * Interrupts are off on entry.
314 * Only called from user space. 191 * Only called from user space.
315 * 192 *
316 * XXX if we had a free scratch register we could save the RSP into the stack frame 193 * When user can change pt_regs->foo always force IRET. That is because
317 * and report it properly in ps. Unfortunately we haven't.
318 *
319 * When user can change the frames always force IRET. That is because
320 * it deals with uncanonical addresses better. SYSRET has trouble 194 * it deals with uncanonical addresses better. SYSRET has trouble
321 * with them due to bugs in both AMD and Intel CPUs. 195 * with them due to bugs in both AMD and Intel CPUs.
322 */ 196 */
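
The register convention documented above is exactly what user space must follow when issuing SYSCALL directly: number in rax, arguments in rdi/rsi/rdx/r10/r8/r9, with rcx and r11 clobbered by the instruction itself. A hedged user-space illustration making a raw write(2) call on Linux x86-64 (only three arguments, so r10 is not needed here):

/* Raw 64-bit SYSCALL from user space; __NR_write is 1 on x86-64. */
int main(void)
{
	const char msg[] = "hello via raw SYSCALL\n";
	long ret;

	asm volatile ("syscall"
		      : "=a" (ret)
		      : "a" (1L),			/* __NR_write        */
			"D" (1L),			/* arg0: fd = stdout */
			"S" (msg),			/* arg1: buffer      */
			"d" (sizeof(msg) - 1)		/* arg2: length      */
		      : "rcx", "r11", "memory");	/* clobbered by SYSCALL */

	return ret < 0 ? 1 : 0;
}

Because SYSCALL overwrites rcx with the return address, arg3 travels in r10 and the kernel moves it back to rcx before calling the C handler, which is what the "movq %r10,%rcx" in the fast path below does.
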
@@ -324,9 +198,15 @@ END(ret_from_fork)
324ENTRY(system_call) 198ENTRY(system_call)
325 CFI_STARTPROC simple 199 CFI_STARTPROC simple
326 CFI_SIGNAL_FRAME 200 CFI_SIGNAL_FRAME
327 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET 201 CFI_DEF_CFA rsp,0
328 CFI_REGISTER rip,rcx 202 CFI_REGISTER rip,rcx
329 /*CFI_REGISTER rflags,r11*/ 203 /*CFI_REGISTER rflags,r11*/
204
205 /*
206 * Interrupts are off on entry.
207 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
208 * it is too small to ever cause noticeable irq latency.
209 */
330 SWAPGS_UNSAFE_STACK 210 SWAPGS_UNSAFE_STACK
331 /* 211 /*
332 * A hypervisor implementation might want to use a label 212 * A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
335 */ 215 */
336GLOBAL(system_call_after_swapgs) 216GLOBAL(system_call_after_swapgs)
337 217
338 movq %rsp,PER_CPU_VAR(old_rsp) 218 movq %rsp,PER_CPU_VAR(rsp_scratch)
339 movq PER_CPU_VAR(kernel_stack),%rsp 219 movq PER_CPU_VAR(kernel_stack),%rsp
220
221 /* Construct struct pt_regs on stack */
222 pushq_cfi $__USER_DS /* pt_regs->ss */
223 pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
340 /* 224 /*
341 * No need to follow this irqs off/on section - it's straight 225 * Re-enable interrupts.
342 * and short: 226 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
227 * must execute atomically in the face of possible interrupt-driven
228 * task preemption. We must enable interrupts only after we're done
229 * with using rsp_scratch:
343 */ 230 */
344 ENABLE_INTERRUPTS(CLBR_NONE) 231 ENABLE_INTERRUPTS(CLBR_NONE)
345 SAVE_ARGS 8, 0, rax_enosys=1 232 pushq_cfi %r11 /* pt_regs->flags */
346 movq_cfi rax,(ORIG_RAX-ARGOFFSET) 233 pushq_cfi $__USER_CS /* pt_regs->cs */
347 movq %rcx,RIP-ARGOFFSET(%rsp) 234 pushq_cfi %rcx /* pt_regs->ip */
348 CFI_REL_OFFSET rip,RIP-ARGOFFSET 235 CFI_REL_OFFSET rip,0
349 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 236 pushq_cfi_reg rax /* pt_regs->orig_ax */
237 pushq_cfi_reg rdi /* pt_regs->di */
238 pushq_cfi_reg rsi /* pt_regs->si */
239 pushq_cfi_reg rdx /* pt_regs->dx */
240 pushq_cfi_reg rcx /* pt_regs->cx */
241 pushq_cfi $-ENOSYS /* pt_regs->ax */
242 pushq_cfi_reg r8 /* pt_regs->r8 */
243 pushq_cfi_reg r9 /* pt_regs->r9 */
244 pushq_cfi_reg r10 /* pt_regs->r10 */
245 pushq_cfi_reg r11 /* pt_regs->r11 */
246 sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
247 CFI_ADJUST_CFA_OFFSET 6*8
248
249 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
350 jnz tracesys 250 jnz tracesys
351system_call_fastpath: 251system_call_fastpath:
352#if __SYSCALL_MASK == ~0 252#if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
355 andl $__SYSCALL_MASK,%eax 255 andl $__SYSCALL_MASK,%eax
356 cmpl $__NR_syscall_max,%eax 256 cmpl $__NR_syscall_max,%eax
357#endif 257#endif
358 ja ret_from_sys_call /* and return regs->ax */ 258 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
359 movq %r10,%rcx 259 movq %r10,%rcx
360 call *sys_call_table(,%rax,8) # XXX: rip relative 260 call *sys_call_table(,%rax,8)
361 movq %rax,RAX-ARGOFFSET(%rsp) 261 movq %rax,RAX(%rsp)
2621:
362/* 263/*
363 * Syscall return path ending with SYSRET (fast path) 264 * Syscall return path ending with SYSRET (fast path).
364 * Has incomplete stack frame and undefined top of stack. 265 * Has incompletely filled pt_regs.
365 */ 266 */
366ret_from_sys_call:
367 LOCKDEP_SYS_EXIT 267 LOCKDEP_SYS_EXIT
268 /*
269 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
270 * it is too small to ever cause noticeable irq latency.
271 */
368 DISABLE_INTERRUPTS(CLBR_NONE) 272 DISABLE_INTERRUPTS(CLBR_NONE)
369 TRACE_IRQS_OFF
370 273
371 /* 274 /*
372 * We must check ti flags with interrupts (or at least preemption) 275 * We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
376 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is 279 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
377 * very bad. 280 * very bad.
378 */ 281 */
379 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 282 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
380 jnz int_ret_from_sys_call_fixup /* Go the the slow path */ 283 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
381 284
382 CFI_REMEMBER_STATE 285 CFI_REMEMBER_STATE
383 /* 286
384 * sysretq will re-enable interrupts: 287 RESTORE_C_REGS_EXCEPT_RCX_R11
385 */ 288 movq RIP(%rsp),%rcx
386 TRACE_IRQS_ON
387 movq RIP-ARGOFFSET(%rsp),%rcx
388 CFI_REGISTER rip,rcx 289 CFI_REGISTER rip,rcx
389 RESTORE_ARGS 1,-ARG_SKIP,0 290 movq EFLAGS(%rsp),%r11
390 /*CFI_REGISTER rflags,r11*/ 291 /*CFI_REGISTER rflags,r11*/
391 movq PER_CPU_VAR(old_rsp), %rsp 292 movq RSP(%rsp),%rsp
293 /*
294 * 64bit SYSRET restores rip from rcx,
295 * rflags from r11 (but RF and VM bits are forced to 0),
296 * cs and ss are loaded from MSRs.
297 * Restoration of rflags re-enables interrupts.
298 */
392 USERGS_SYSRET64 299 USERGS_SYSRET64
393 300
394 CFI_RESTORE_STATE 301 CFI_RESTORE_STATE
395 302
396int_ret_from_sys_call_fixup: 303 /* Do syscall entry tracing */
397 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
398 jmp int_ret_from_sys_call_irqs_off
399
400 /* Do syscall tracing */
401tracesys: 304tracesys:
402 leaq -REST_SKIP(%rsp), %rdi 305 movq %rsp, %rdi
403 movq $AUDIT_ARCH_X86_64, %rsi 306 movl $AUDIT_ARCH_X86_64, %esi
404 call syscall_trace_enter_phase1 307 call syscall_trace_enter_phase1
405 test %rax, %rax 308 test %rax, %rax
406 jnz tracesys_phase2 /* if needed, run the slow path */ 309 jnz tracesys_phase2 /* if needed, run the slow path */
407 LOAD_ARGS 0 /* else restore clobbered regs */ 310 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
311 movq ORIG_RAX(%rsp), %rax
408 jmp system_call_fastpath /* and return to the fast path */ 312 jmp system_call_fastpath /* and return to the fast path */
409 313
410tracesys_phase2: 314tracesys_phase2:
411 SAVE_REST 315 SAVE_EXTRA_REGS
412 FIXUP_TOP_OF_STACK %rdi
413 movq %rsp, %rdi 316 movq %rsp, %rdi
414 movq $AUDIT_ARCH_X86_64, %rsi 317 movl $AUDIT_ARCH_X86_64, %esi
415 movq %rax,%rdx 318 movq %rax,%rdx
416 call syscall_trace_enter_phase2 319 call syscall_trace_enter_phase2
417 320
418 /* 321 /*
419 * Reload arg registers from stack in case ptrace changed them. 322 * Reload registers from stack in case ptrace changed them.
420 * We don't reload %rax because syscall_trace_entry_phase2() returned 323 * We don't reload %rax because syscall_trace_entry_phase2() returned
421 * the value it wants us to use in the table lookup. 324 * the value it wants us to use in the table lookup.
422 */ 325 */
423 LOAD_ARGS ARGOFFSET, 1 326 RESTORE_C_REGS_EXCEPT_RAX
424 RESTORE_REST 327 RESTORE_EXTRA_REGS
425#if __SYSCALL_MASK == ~0 328#if __SYSCALL_MASK == ~0
426 cmpq $__NR_syscall_max,%rax 329 cmpq $__NR_syscall_max,%rax
427#else 330#else
428 andl $__SYSCALL_MASK,%eax 331 andl $__SYSCALL_MASK,%eax
429 cmpl $__NR_syscall_max,%eax 332 cmpl $__NR_syscall_max,%eax
430#endif 333#endif
431 ja int_ret_from_sys_call /* RAX(%rsp) is already set */ 334 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
432 movq %r10,%rcx /* fixup for C */ 335 movq %r10,%rcx /* fixup for C */
433 call *sys_call_table(,%rax,8) 336 call *sys_call_table(,%rax,8)
434 movq %rax,RAX-ARGOFFSET(%rsp) 337 movq %rax,RAX(%rsp)
435 /* Use IRET because user could have changed frame */ 3381:
339 /* Use IRET because user could have changed pt_regs->foo */
436 340
437/* 341/*
438 * Syscall return path ending with IRET. 342 * Syscall return path ending with IRET.
439 * Has correct top of stack, but partial stack frame. 343 * Has correct iret frame.
440 */ 344 */
441GLOBAL(int_ret_from_sys_call) 345GLOBAL(int_ret_from_sys_call)
442 DISABLE_INTERRUPTS(CLBR_NONE) 346 DISABLE_INTERRUPTS(CLBR_NONE)
347int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
443 TRACE_IRQS_OFF 348 TRACE_IRQS_OFF
444int_ret_from_sys_call_irqs_off:
445 movl $_TIF_ALLWORK_MASK,%edi 349 movl $_TIF_ALLWORK_MASK,%edi
446 /* edi: mask to check */ 350 /* edi: mask to check */
447GLOBAL(int_with_check) 351GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
450 movl TI_flags(%rcx),%edx 354 movl TI_flags(%rcx),%edx
451 andl %edi,%edx 355 andl %edi,%edx
452 jnz int_careful 356 jnz int_careful
453 andl $~TS_COMPAT,TI_status(%rcx) 357 andl $~TS_COMPAT,TI_status(%rcx)
454 jmp retint_swapgs 358 jmp syscall_return
455 359
456 /* Either reschedule or signal or syscall exit tracking needed. */ 360 /* Either reschedule or signal or syscall exit tracking needed. */
457 /* First do a reschedule test. */ 361 /* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
468 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
469 jmp int_with_check 373 jmp int_with_check
470 374
471 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full pt_regs */
472int_very_careful: 376int_very_careful:
473 TRACE_IRQS_ON 377 TRACE_IRQS_ON
474 ENABLE_INTERRUPTS(CLBR_NONE) 378 ENABLE_INTERRUPTS(CLBR_NONE)
475int_check_syscall_exit_work: 379 SAVE_EXTRA_REGS
476 SAVE_REST
477 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
478 testl $_TIF_WORK_SYSCALL_EXIT,%edx 381 testl $_TIF_WORK_SYSCALL_EXIT,%edx
479 jz int_signal 382 jz int_signal
@@ -492,86 +395,192 @@ int_signal:
492 call do_notify_resume 395 call do_notify_resume
4931: movl $_TIF_WORK_MASK,%edi 3961: movl $_TIF_WORK_MASK,%edi
494int_restore_rest: 397int_restore_rest:
495 RESTORE_REST 398 RESTORE_EXTRA_REGS
496 DISABLE_INTERRUPTS(CLBR_NONE) 399 DISABLE_INTERRUPTS(CLBR_NONE)
497 TRACE_IRQS_OFF 400 TRACE_IRQS_OFF
498 jmp int_with_check 401 jmp int_with_check
402
403syscall_return:
404 /* The IRETQ could re-enable interrupts: */
405 DISABLE_INTERRUPTS(CLBR_ANY)
406 TRACE_IRQS_IRETQ
407
408 /*
409 * Try to use SYSRET instead of IRET if we're returning to
410 * a completely clean 64-bit userspace context.
411 */
412 movq RCX(%rsp),%rcx
413 cmpq %rcx,RIP(%rsp) /* RCX == RIP */
414 jne opportunistic_sysret_failed
415
416 /*
417 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
418 * in kernel space. This essentially lets the user take over
419 * the kernel, since userspace controls RSP. It's not worth
420 * testing for canonicalness exactly -- this check detects any
421 * of the 17 high bits set, which is true for non-canonical
422 * or kernel addresses. (This will pessimize vsyscall=native.
423 * Big deal.)
424 *
425 * If virtual addresses ever become wider, this will need
426 * to be updated to remain correct on both old and new CPUs.
427 */
428 .ifne __VIRTUAL_MASK_SHIFT - 47
429 .error "virtual address width changed -- SYSRET checks need update"
430 .endif
431 shr $__VIRTUAL_MASK_SHIFT, %rcx
432 jnz opportunistic_sysret_failed
433
434 cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
435 jne opportunistic_sysret_failed
436
437 movq R11(%rsp),%r11
438 cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
439 jne opportunistic_sysret_failed
440
441 /*
442 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
443 * restoring TF results in a trap from userspace immediately after
444 * SYSRET. This would cause an infinite loop whenever #DB happens
445 * with register state that satisfies the opportunistic SYSRET
446 * conditions. For example, single-stepping this user code:
447 *
448 * movq $stuck_here,%rcx
449 * pushfq
450 * popq %r11
451 * stuck_here:
452 *
453 * would never get past 'stuck_here'.
454 */
455 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
456 jnz opportunistic_sysret_failed
457
458 /* nothing to check for RSP */
459
460 cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
461 jne opportunistic_sysret_failed
462
463 /*
464 * We win! This label is here just for ease of understanding
465 * perf profiles. Nothing jumps here.
466 */
467syscall_return_via_sysret:
468 CFI_REMEMBER_STATE
469 /* r11 is already restored (see code above) */
470 RESTORE_C_REGS_EXCEPT_R11
471 movq RSP(%rsp),%rsp
472 USERGS_SYSRET64
473 CFI_RESTORE_STATE
474
475opportunistic_sysret_failed:
476 SWAPGS
477 jmp restore_c_regs_and_iret
499 CFI_ENDPROC 478 CFI_ENDPROC
500END(system_call) 479END(system_call)
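The opportunistic-SYSRET path added above refuses to SYSRET whenever RCX/RIP has any of bits 63..47 set, which covers both non-canonical and kernel addresses. A runnable C illustration of that predicate, assuming the 48-bit virtual layout the patch asserts with its .ifne check (__VIRTUAL_MASK_SHIFT == 47):

    #include <stdio.h>
    #include <stdint.h>

    #define VIRTUAL_MASK_SHIFT 47   /* matches the .ifne assertion above */

    /* Mirrors "shr $__VIRTUAL_MASK_SHIFT, %rcx ; jnz opportunistic_sysret_failed" */
    static int sysret_rip_ok(uint64_t rip)
    {
            return (rip >> VIRTUAL_MASK_SHIFT) == 0;
    }

    int main(void)
    {
            printf("%d\n", sysret_rip_ok(0x00007fffffffe000ULL)); /* user address: 1  */
            printf("%d\n", sysret_rip_ok(0xffff880000000000ULL)); /* kernel address: 0 */
            printf("%d\n", sysret_rip_ok(0x0000800000000000ULL)); /* non-canonical: 0  */
            return 0;
    }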
501 480
481
502 .macro FORK_LIKE func 482 .macro FORK_LIKE func
503ENTRY(stub_\func) 483ENTRY(stub_\func)
504 CFI_STARTPROC 484 CFI_STARTPROC
505 popq %r11 /* save return address */ 485 DEFAULT_FRAME 0, 8 /* offset 8: return address */
506 PARTIAL_FRAME 0 486 SAVE_EXTRA_REGS 8
507 SAVE_REST 487 jmp sys_\func
508 pushq %r11 /* put it back on stack */
509 FIXUP_TOP_OF_STACK %r11, 8
510 DEFAULT_FRAME 0 8 /* offset 8: return address */
511 call sys_\func
512 RESTORE_TOP_OF_STACK %r11, 8
513 ret $REST_SKIP /* pop extended registers */
514 CFI_ENDPROC 488 CFI_ENDPROC
515END(stub_\func) 489END(stub_\func)
516 .endm 490 .endm
517 491
518 .macro FIXED_FRAME label,func
519ENTRY(\label)
520 CFI_STARTPROC
521 PARTIAL_FRAME 0 8 /* offset 8: return address */
522 FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
523 call \func
524 RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
525 ret
526 CFI_ENDPROC
527END(\label)
528 .endm
529
530 FORK_LIKE clone 492 FORK_LIKE clone
531 FORK_LIKE fork 493 FORK_LIKE fork
532 FORK_LIKE vfork 494 FORK_LIKE vfork
533 FIXED_FRAME stub_iopl, sys_iopl
534 495
535ENTRY(stub_execve) 496ENTRY(stub_execve)
536 CFI_STARTPROC 497 CFI_STARTPROC
537 addq $8, %rsp 498 DEFAULT_FRAME 0, 8
538 PARTIAL_FRAME 0 499 call sys_execve
539 SAVE_REST 500return_from_execve:
540 FIXUP_TOP_OF_STACK %r11 501 testl %eax, %eax
541 call sys_execve 502 jz 1f
542 movq %rax,RAX(%rsp) 503 /* exec failed, can use fast SYSRET code path in this case */
543 RESTORE_REST 504 ret
544 jmp int_ret_from_sys_call 5051:
506 /* must use IRET code path (pt_regs->cs may have changed) */
507 addq $8, %rsp
508 CFI_ADJUST_CFA_OFFSET -8
509 ZERO_EXTRA_REGS
510 movq %rax,RAX(%rsp)
511 jmp int_ret_from_sys_call
545 CFI_ENDPROC 512 CFI_ENDPROC
546END(stub_execve) 513END(stub_execve)
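stub_execve now picks its return path from the sys_execve result: a non-zero (failing) return leaves the saved registers untouched, so the plain "ret" drops back into the caller's SYSRET fast path, while success must go through int_ret_from_sys_call because the new program image rewrote pt_regs->cs/ss/ip. The decision, restated as a small illustrative C helper:

    /* Illustrative only: which exit path stub_execve takes after sys_execve. */
    enum exit_path { EXIT_VIA_SYSRET, EXIT_VIA_IRET };

    static enum exit_path execve_exit_path(long rax)
    {
            if (rax != 0)                   /* exec failed: negative errno in rax     */
                    return EXIT_VIA_SYSRET; /* regs unchanged, fast path still valid  */
            return EXIT_VIA_IRET;           /* new image: cs/ss/ip were rewritten     */
    }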
547 514/*
548ENTRY(stub_execveat) 515 * Remaining execve stubs are only 7 bytes long.
516 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
517 */
518 .align 8
519GLOBAL(stub_execveat)
549 CFI_STARTPROC 520 CFI_STARTPROC
550 addq $8, %rsp 521 DEFAULT_FRAME 0, 8
551 PARTIAL_FRAME 0 522 call sys_execveat
552 SAVE_REST 523 jmp return_from_execve
553 FIXUP_TOP_OF_STACK %r11
554 call sys_execveat
555 RESTORE_TOP_OF_STACK %r11
556 movq %rax,RAX(%rsp)
557 RESTORE_REST
558 jmp int_ret_from_sys_call
559 CFI_ENDPROC 524 CFI_ENDPROC
560END(stub_execveat) 525END(stub_execveat)
561 526
527#ifdef CONFIG_X86_X32_ABI
528 .align 8
529GLOBAL(stub_x32_execve)
530 CFI_STARTPROC
531 DEFAULT_FRAME 0, 8
532 call compat_sys_execve
533 jmp return_from_execve
534 CFI_ENDPROC
535END(stub_x32_execve)
536 .align 8
537GLOBAL(stub_x32_execveat)
538 CFI_STARTPROC
539 DEFAULT_FRAME 0, 8
540 call compat_sys_execveat
541 jmp return_from_execve
542 CFI_ENDPROC
543END(stub_x32_execveat)
544#endif
545
546#ifdef CONFIG_IA32_EMULATION
547 .align 8
548GLOBAL(stub32_execve)
549 CFI_STARTPROC
550 call compat_sys_execve
551 jmp return_from_execve
552 CFI_ENDPROC
553END(stub32_execve)
554 .align 8
555GLOBAL(stub32_execveat)
556 CFI_STARTPROC
557 call compat_sys_execveat
558 jmp return_from_execve
559 CFI_ENDPROC
560END(stub32_execveat)
561#endif
562
562/* 563/*
563 * sigreturn is special because it needs to restore all registers on return. 564 * sigreturn is special because it needs to restore all registers on return.
564 * This cannot be done with SYSRET, so use the IRET return path instead. 565 * This cannot be done with SYSRET, so use the IRET return path instead.
565 */ 566 */
566ENTRY(stub_rt_sigreturn) 567ENTRY(stub_rt_sigreturn)
567 CFI_STARTPROC 568 CFI_STARTPROC
568 addq $8, %rsp 569 DEFAULT_FRAME 0, 8
569 PARTIAL_FRAME 0 570 /*
570 SAVE_REST 571 * SAVE_EXTRA_REGS result is not normally needed:
571 FIXUP_TOP_OF_STACK %r11 572 * sigreturn overwrites all pt_regs->GPREGS.
573 * But sigreturn can fail (!), and there is no easy way to detect that.
574 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
575 * we SAVE_EXTRA_REGS here.
576 */
577 SAVE_EXTRA_REGS 8
572 call sys_rt_sigreturn 578 call sys_rt_sigreturn
573 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 579return_from_stub:
574 RESTORE_REST 580 addq $8, %rsp
581 CFI_ADJUST_CFA_OFFSET -8
582 RESTORE_EXTRA_REGS
583 movq %rax,RAX(%rsp)
575 jmp int_ret_from_sys_call 584 jmp int_ret_from_sys_call
576 CFI_ENDPROC 585 CFI_ENDPROC
577END(stub_rt_sigreturn) 586END(stub_rt_sigreturn)
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
579#ifdef CONFIG_X86_X32_ABI 588#ifdef CONFIG_X86_X32_ABI
580ENTRY(stub_x32_rt_sigreturn) 589ENTRY(stub_x32_rt_sigreturn)
581 CFI_STARTPROC 590 CFI_STARTPROC
582 addq $8, %rsp 591 DEFAULT_FRAME 0, 8
583 PARTIAL_FRAME 0 592 SAVE_EXTRA_REGS 8
584 SAVE_REST
585 FIXUP_TOP_OF_STACK %r11
586 call sys32_x32_rt_sigreturn 593 call sys32_x32_rt_sigreturn
587 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 594 jmp return_from_stub
588 RESTORE_REST
589 jmp int_ret_from_sys_call
590 CFI_ENDPROC 595 CFI_ENDPROC
591END(stub_x32_rt_sigreturn) 596END(stub_x32_rt_sigreturn)
597#endif
592 598
593ENTRY(stub_x32_execve) 599/*
594 CFI_STARTPROC 600 * A newly forked process directly context switches into this address.
595 addq $8, %rsp 601 *
596 PARTIAL_FRAME 0 602 * rdi: prev task we switched from
597 SAVE_REST 603 */
598 FIXUP_TOP_OF_STACK %r11 604ENTRY(ret_from_fork)
599 call compat_sys_execve 605 DEFAULT_FRAME
600 RESTORE_TOP_OF_STACK %r11
601 movq %rax,RAX(%rsp)
602 RESTORE_REST
603 jmp int_ret_from_sys_call
604 CFI_ENDPROC
605END(stub_x32_execve)
606 606
607ENTRY(stub_x32_execveat) 607 LOCK ; btr $TIF_FORK,TI_flags(%r8)
608 CFI_STARTPROC 608
609 addq $8, %rsp 609 pushq_cfi $0x0002
610 PARTIAL_FRAME 0 610 popfq_cfi # reset kernel eflags
611 SAVE_REST 611
612 FIXUP_TOP_OF_STACK %r11 612 call schedule_tail # rdi: 'prev' task parameter
613 call compat_sys_execveat 613
614 RESTORE_TOP_OF_STACK %r11 614 RESTORE_EXTRA_REGS
615 movq %rax,RAX(%rsp) 615
616 RESTORE_REST 616 testl $3,CS(%rsp) # from kernel_thread?
617
618 /*
619 * By the time we get here, we have no idea whether our pt_regs,
620 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
621 * the slow path, or one of the ia32entry paths.
622 * Use IRET code path to return, since it can safely handle
623 * all of the above.
624 */
625 jnz int_ret_from_sys_call
626
627 /* We came from kernel_thread */
628 /* nb: we depend on RESTORE_EXTRA_REGS above */
629 movq %rbp, %rdi
630 call *%rbx
631 movl $0, RAX(%rsp)
632 RESTORE_EXTRA_REGS
617 jmp int_ret_from_sys_call 633 jmp int_ret_from_sys_call
618 CFI_ENDPROC 634 CFI_ENDPROC
619END(stub_x32_execveat) 635END(ret_from_fork)
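The rewritten ret_from_fork distinguishes "return to user space" from "start a kernel thread" by testing the RPL bits of the saved CS: a user frame always carries RPL 3, while a kernel thread's forged frame has a kernel CS with RPL 0. A small C illustration (the selector values are the conventional x86-64 GDT ones, shown only for the bit pattern):

    #include <stdio.h>

    /* The low two bits of a segment selector are its privilege level,
     * which is what "testl $3,CS(%rsp)" inspects. */
    static int came_from_user(unsigned short cs)
    {
            return (cs & 3) != 0;
    }

    int main(void)
    {
            printf("kernel CS 0x10 -> %d\n", came_from_user(0x10)); /* kernel thread */
            printf("user   CS 0x33 -> %d\n", came_from_user(0x33)); /* user return   */
            return 0;
    }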
620
621#endif
622 636
623/* 637/*
624 * Build the entry stubs and pointer table with some assembler magic. 638 * Build the entry stubs with some assembler magic.
625 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 639 * We pack 1 stub into every 8-byte block.
626 * single cache line on all modern x86 implementations.
627 */ 640 */
628 .section .init.rodata,"a" 641 .align 8
629ENTRY(interrupt)
630 .section .entry.text
631 .p2align 5
632 .p2align CONFIG_X86_L1_CACHE_SHIFT
633ENTRY(irq_entries_start) 642ENTRY(irq_entries_start)
634 INTR_FRAME 643 INTR_FRAME
635vector=FIRST_EXTERNAL_VECTOR 644 vector=FIRST_EXTERNAL_VECTOR
636.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 645 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
637 .balign 32 646 pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
638 .rept 7 647 vector=vector+1
639 .if vector < FIRST_SYSTEM_VECTOR 648 jmp common_interrupt
640 .if vector <> FIRST_EXTERNAL_VECTOR
641 CFI_ADJUST_CFA_OFFSET -8 649 CFI_ADJUST_CFA_OFFSET -8
642 .endif 650 .align 8
6431: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ 651 .endr
644 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
645 jmp 2f
646 .endif
647 .previous
648 .quad 1b
649 .section .entry.text
650vector=vector+1
651 .endif
652 .endr
6532: jmp common_interrupt
654.endr
655 CFI_ENDPROC 652 CFI_ENDPROC
656END(irq_entries_start) 653END(irq_entries_start)
657 654
658.previous
659END(interrupt)
660.previous
661
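Each interrupt stub in the new irq_entries_start is exactly 8 bytes (one pushq of an immediate plus a jmp to common_interrupt), which is what lets the irqinit.c hunk later in this patch compute a stub address as irq_entries_start + 8 * (vector - FIRST_EXTERNAL_VECTOR). The pushed constant is ~vector + 0x80 so it always fits the sign-extended imm8 form of pushq; common_interrupt undoes the +0x80 and the C handler undoes the complement. A runnable round-trip check of that encoding:

    #include <assert.h>
    #include <stdio.h>

    /* "pushq $(~vector+0x80)" -- always in signed byte range */
    static int encode_vector(int vector) { return ~vector + 0x80; }

    /* common_interrupt does "addq $-0x80,(%rsp)" (yielding ~vector, a value
     * in [-256,-1]); the handler then recovers the vector with ~x. */
    static int decode_vector(int pushed) { return ~(pushed - 0x80); }

    int main(void)
    {
            for (int v = 0x20; v < 0x100; v++) {
                    int e = encode_vector(v);
                    assert(e >= -128 && e <= 127);
                    assert(decode_vector(e) == v);
            }
            printf("all vectors round-trip\n");
            return 0;
    }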
662/* 655/*
663 * Interrupt entry/exit. 656 * Interrupt entry/exit.
664 * 657 *
@@ -669,47 +662,45 @@ END(interrupt)
669 662
670/* 0(%rsp): ~(interrupt number) */ 663/* 0(%rsp): ~(interrupt number) */
671 .macro interrupt func 664 .macro interrupt func
672 /* reserve pt_regs for scratch regs and rbp */
673 subq $ORIG_RAX-RBP, %rsp
674 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
675 cld 665 cld
676 /* start from rbp in pt_regs and jump over */ 666 /*
677 movq_cfi rdi, (RDI-RBP) 667 * Since nothing in interrupt handling code touches r12...r15 members
678 movq_cfi rsi, (RSI-RBP) 668 * of "struct pt_regs", and since interrupts can nest, we can save
679 movq_cfi rdx, (RDX-RBP) 669 * four stack slots and simultaneously provide
680 movq_cfi rcx, (RCX-RBP) 670 * an unwind-friendly stack layout by saving "truncated" pt_regs
681 movq_cfi rax, (RAX-RBP) 671 * exactly up to rbp slot, without these members.
682 movq_cfi r8, (R8-RBP) 672 */
683 movq_cfi r9, (R9-RBP) 673 ALLOC_PT_GPREGS_ON_STACK -RBP
684 movq_cfi r10, (R10-RBP) 674 SAVE_C_REGS -RBP
685 movq_cfi r11, (R11-RBP) 675 /* this goes to 0(%rsp) for unwinder, not for saving the value: */
686 676 SAVE_EXTRA_REGS_RBP -RBP
687 /* Save rbp so that we can unwind from get_irq_regs() */
688 movq_cfi rbp, 0
689
690 /* Save previous stack value */
691 movq %rsp, %rsi
692 677
693 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 678 leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
694 testl $3, CS-RBP(%rsi) 679
680 testl $3, CS-RBP(%rsp)
695 je 1f 681 je 1f
696 SWAPGS 682 SWAPGS
6831:
697 /* 684 /*
685 * Save previous stack pointer, optionally switch to interrupt stack.
698 * irq_count is used to check if a CPU is already on an interrupt stack 686 * irq_count is used to check if a CPU is already on an interrupt stack
699 * or not. While this is essentially redundant with preempt_count it is 687 * or not. While this is essentially redundant with preempt_count it is
700 * a little cheaper to use a separate counter in the PDA (short of 688 * a little cheaper to use a separate counter in the PDA (short of
701 * moving irq_enter into assembly, which would be too much work) 689 * moving irq_enter into assembly, which would be too much work)
702 */ 690 */
7031: incl PER_CPU_VAR(irq_count) 691 movq %rsp, %rsi
692 incl PER_CPU_VAR(irq_count)
704 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 693 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
705 CFI_DEF_CFA_REGISTER rsi 694 CFI_DEF_CFA_REGISTER rsi
706
707 /* Store previous stack value */
708 pushq %rsi 695 pushq %rsi
696 /*
697 * For debugger:
698 * "CFA (Current Frame Address) is the value on stack + offset"
699 */
709 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 700 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
710 0x77 /* DW_OP_breg7 */, 0, \ 701 0x77 /* DW_OP_breg7 (rsp) */, 0, \
711 0x06 /* DW_OP_deref */, \ 702 0x06 /* DW_OP_deref */, \
712 0x08 /* DW_OP_const1u */, SS+8-RBP, \ 703 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
713 0x22 /* DW_OP_plus */ 704 0x22 /* DW_OP_plus */
714 /* We entered an interrupt context - irqs are off: */ 705 /* We entered an interrupt context - irqs are off: */
715 TRACE_IRQS_OFF 706 TRACE_IRQS_OFF
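The CFI_ESCAPE above hand-encodes a DW_CFA_def_cfa_expression telling the unwinder that the CFA is the old stack pointer saved at 0(%rsp), plus SIZEOF_PTREGS-RBP. The same bytes, annotated as a C array for reference (the opcode values are standard DWARF; the offset byte is whatever SIZEOF_PTREGS-RBP evaluates to at build time):

    /* Annotated copy of the CFI_ESCAPE operands above (illustrative). */
    #define CFA_OFFSET 0x00           /* placeholder for SIZEOF_PTREGS-RBP  */

    static const unsigned char cfa_expr[] = {
            0x0f,                     /* DW_CFA_def_cfa_expression          */
            6,                        /* expression length in bytes         */
            0x77, 0x00,               /* DW_OP_breg7 (rsp), offset 0        */
            0x06,                     /* DW_OP_deref: load the saved rsp    */
            0x08, CFA_OFFSET,         /* DW_OP_const1u SIZEOF_PTREGS-RBP    */
            0x22,                     /* DW_OP_plus                         */
    };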
@@ -727,7 +718,7 @@ common_interrupt:
727 ASM_CLAC 718 ASM_CLAC
728 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 719 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
729 interrupt do_IRQ 720 interrupt do_IRQ
730 /* 0(%rsp): old_rsp-ARGOFFSET */ 721 /* 0(%rsp): old RSP */
731ret_from_intr: 722ret_from_intr:
732 DISABLE_INTERRUPTS(CLBR_NONE) 723 DISABLE_INTERRUPTS(CLBR_NONE)
733 TRACE_IRQS_OFF 724 TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
735 726
736 /* Restore saved previous stack */ 727 /* Restore saved previous stack */
737 popq %rsi 728 popq %rsi
738 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ 729 CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
739 leaq ARGOFFSET-RBP(%rsi), %rsp 730 /* return code expects complete pt_regs - adjust rsp accordingly: */
731 leaq -RBP(%rsi),%rsp
740 CFI_DEF_CFA_REGISTER rsp 732 CFI_DEF_CFA_REGISTER rsp
741 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 733 CFI_ADJUST_CFA_OFFSET RBP
742 734
743exit_intr: 735 testl $3,CS(%rsp)
744 GET_THREAD_INFO(%rcx)
745 testl $3,CS-ARGOFFSET(%rsp)
746 je retint_kernel 736 je retint_kernel
747
748 /* Interrupt came from user space */ 737 /* Interrupt came from user space */
738
739 GET_THREAD_INFO(%rcx)
749 /* 740 /*
750 * Has a correct top of stack, but a partial stack frame
751 * %rcx: thread info. Interrupts off. 741 * %rcx: thread info. Interrupts off.
752 */ 742 */
753retint_with_reschedule: 743retint_with_reschedule:
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */
766 DISABLE_INTERRUPTS(CLBR_ANY) 756 DISABLE_INTERRUPTS(CLBR_ANY)
767 TRACE_IRQS_IRETQ 757 TRACE_IRQS_IRETQ
768 758
769 /*
770 * Try to use SYSRET instead of IRET if we're returning to
771 * a completely clean 64-bit userspace context.
772 */
773 movq (RCX-R11)(%rsp), %rcx
774 cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
775 jne opportunistic_sysret_failed
776
777 /*
778 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
779 * in kernel space. This essentially lets the user take over
780 * the kernel, since userspace controls RSP. It's not worth
781 * testing for canonicalness exactly -- this check detects any
782 * of the 17 high bits set, which is true for non-canonical
783 * or kernel addresses. (This will pessimize vsyscall=native.
784 * Big deal.)
785 *
786 * If virtual addresses ever become wider, this will need
787 * to be updated to remain correct on both old and new CPUs.
788 */
789 .ifne __VIRTUAL_MASK_SHIFT - 47
790 .error "virtual address width changed -- sysret checks need update"
791 .endif
792 shr $__VIRTUAL_MASK_SHIFT, %rcx
793 jnz opportunistic_sysret_failed
794
795 cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
796 jne opportunistic_sysret_failed
797
798 movq (R11-ARGOFFSET)(%rsp), %r11
799 cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
800 jne opportunistic_sysret_failed
801
802 /*
803 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
804 * restoring TF results in a trap from userspace immediately after
805 * SYSRET. This would cause an infinite loop whenever #DB happens
806 * with register state that satisfies the opportunistic SYSRET
807 * conditions. For example, single-stepping this user code:
808 *
809 * movq $stuck_here,%rcx
810 * pushfq
811 * popq %r11
812 * stuck_here:
813 *
814 * would never get past 'stuck_here'.
815 */
816 testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
817 jnz opportunistic_sysret_failed
818
819 /* nothing to check for RSP */
820
821 cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
822 jne opportunistic_sysret_failed
823
824 /*
825 * We win! This label is here just for ease of understanding
826 * perf profiles. Nothing jumps here.
827 */
828irq_return_via_sysret:
829 CFI_REMEMBER_STATE
830 RESTORE_ARGS 1,8,1
831 movq (RSP-RIP)(%rsp),%rsp
832 USERGS_SYSRET64
833 CFI_RESTORE_STATE
834
835opportunistic_sysret_failed:
836 SWAPGS 759 SWAPGS
837 jmp restore_args 760 jmp restore_c_regs_and_iret
838 761
839retint_restore_args: /* return to kernel space */ 762/* Returning to kernel space */
840 DISABLE_INTERRUPTS(CLBR_ANY) 763retint_kernel:
764#ifdef CONFIG_PREEMPT
765 /* Interrupts are off */
766 /* Check if we need preemption */
767 bt $9,EFLAGS(%rsp) /* interrupts were off? */
768 jnc 1f
7690: cmpl $0,PER_CPU_VAR(__preempt_count)
770 jnz 1f
771 call preempt_schedule_irq
772 jmp 0b
7731:
774#endif
841 /* 775 /*
842 * The iretq could re-enable interrupts: 776 * The iretq could re-enable interrupts:
843 */ 777 */
844 TRACE_IRQS_IRETQ 778 TRACE_IRQS_IRETQ
845restore_args: 779
846 RESTORE_ARGS 1,8,1 780/*
781 * At this label, code paths which return to kernel and to user,
782 * which come from interrupts/exception and from syscalls, merge.
783 */
784restore_c_regs_and_iret:
785 RESTORE_C_REGS
786 REMOVE_PT_GPREGS_FROM_STACK 8
847 787
848irq_return: 788irq_return:
849 INTERRUPT_RETURN 789 INTERRUPT_RETURN
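The in-lined retint_kernel (under CONFIG_PREEMPT) only preempts when the interrupted kernel context had interrupts enabled -- bit 9 of the saved EFLAGS, the IF flag, tested by "bt $9,EFLAGS(%rsp)" -- and the per-CPU preempt count is zero; it loops because preempt_schedule_irq can return with need-resched raised again. The condition, restated as a C sketch:

    #include <stdbool.h>

    #define X86_EFLAGS_IF (1UL << 9)   /* the bit tested by "bt $9,EFLAGS(%rsp)" */

    /* preempt_count stands in for PER_CPU_VAR(__preempt_count). */
    static bool should_preempt(unsigned long saved_eflags, int preempt_count)
    {
            return (saved_eflags & X86_EFLAGS_IF) && preempt_count == 0;
    }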
@@ -914,28 +854,17 @@ retint_signal:
914 jz retint_swapgs 854 jz retint_swapgs
915 TRACE_IRQS_ON 855 TRACE_IRQS_ON
916 ENABLE_INTERRUPTS(CLBR_NONE) 856 ENABLE_INTERRUPTS(CLBR_NONE)
917 SAVE_REST 857 SAVE_EXTRA_REGS
918 movq $-1,ORIG_RAX(%rsp) 858 movq $-1,ORIG_RAX(%rsp)
919 xorl %esi,%esi # oldset 859 xorl %esi,%esi # oldset
920 movq %rsp,%rdi # &pt_regs 860 movq %rsp,%rdi # &pt_regs
921 call do_notify_resume 861 call do_notify_resume
922 RESTORE_REST 862 RESTORE_EXTRA_REGS
923 DISABLE_INTERRUPTS(CLBR_NONE) 863 DISABLE_INTERRUPTS(CLBR_NONE)
924 TRACE_IRQS_OFF 864 TRACE_IRQS_OFF
925 GET_THREAD_INFO(%rcx) 865 GET_THREAD_INFO(%rcx)
926 jmp retint_with_reschedule 866 jmp retint_with_reschedule
927 867
928#ifdef CONFIG_PREEMPT
929 /* Returning to kernel space. Check if we need preemption */
930 /* rcx: threadinfo. interrupts off. */
931ENTRY(retint_kernel)
932 cmpl $0,PER_CPU_VAR(__preempt_count)
933 jnz retint_restore_args
934 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
935 jnc retint_restore_args
936 call preempt_schedule_irq
937 jmp exit_intr
938#endif
939 CFI_ENDPROC 868 CFI_ENDPROC
940END(common_interrupt) 869END(common_interrupt)
941 870
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
1024/* 953/*
1025 * Exception entry points. 954 * Exception entry points.
1026 */ 955 */
1027#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 956#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
1028 957
1029.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 958.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
1030ENTRY(\sym) 959ENTRY(\sym)
@@ -1046,8 +975,7 @@ ENTRY(\sym)
1046 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 975 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1047 .endif 976 .endif
1048 977
1049 subq $ORIG_RAX-R15, %rsp 978 ALLOC_PT_GPREGS_ON_STACK
1050 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1051 979
1052 .if \paranoid 980 .if \paranoid
1053 .if \paranoid == 1 981 .if \paranoid == 1
@@ -1055,10 +983,11 @@ ENTRY(\sym)
1055 testl $3, CS(%rsp) /* If coming from userspace, switch */ 983 testl $3, CS(%rsp) /* If coming from userspace, switch */
1056 jnz 1f /* stacks. */ 984 jnz 1f /* stacks. */
1057 .endif 985 .endif
1058 call save_paranoid 986 call paranoid_entry
1059 .else 987 .else
1060 call error_entry 988 call error_entry
1061 .endif 989 .endif
990 /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
1062 991
1063 DEFAULT_FRAME 0 992 DEFAULT_FRAME 0
1064 993
@@ -1080,19 +1009,20 @@ ENTRY(\sym)
1080 .endif 1009 .endif
1081 1010
1082 .if \shift_ist != -1 1011 .if \shift_ist != -1
1083 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1012 subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1084 .endif 1013 .endif
1085 1014
1086 call \do_sym 1015 call \do_sym
1087 1016
1088 .if \shift_ist != -1 1017 .if \shift_ist != -1
1089 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) 1018 addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1090 .endif 1019 .endif
1091 1020
1021 /* these procedures expect "no swapgs" flag in ebx */
1092 .if \paranoid 1022 .if \paranoid
1093 jmp paranoid_exit /* %ebx: no swapgs flag */ 1023 jmp paranoid_exit
1094 .else 1024 .else
1095 jmp error_exit /* %ebx: no swapgs flag */ 1025 jmp error_exit
1096 .endif 1026 .endif
1097 1027
1098 .if \paranoid == 1 1028 .if \paranoid == 1
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
1296 addq $0x30,%rsp 1226 addq $0x30,%rsp
1297 CFI_ADJUST_CFA_OFFSET -0x30 1227 CFI_ADJUST_CFA_OFFSET -0x30
1298 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ 1228 pushq_cfi $-1 /* orig_ax = -1 => not a system call */
1299 SAVE_ALL 1229 ALLOC_PT_GPREGS_ON_STACK
1230 SAVE_C_REGS
1231 SAVE_EXTRA_REGS
1300 jmp error_exit 1232 jmp error_exit
1301 CFI_ENDPROC 1233 CFI_ENDPROC
1302END(xen_failsafe_callback) 1234END(xen_failsafe_callback)
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
1328idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) 1260idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
1329#endif 1261#endif
1330 1262
1331 /* 1263/*
1332 * "Paranoid" exit path from exception stack. This is invoked 1264 * Save all registers in pt_regs, and switch gs if needed.
1333 * only on return from non-NMI IST interrupts that came 1265 * Use slow, but surefire "are we in kernel?" check.
1334 * from kernel space. 1266 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1335 * 1267 */
1336 * We may be returning to very strange contexts (e.g. very early 1268ENTRY(paranoid_entry)
1337 * in syscall entry), so checking for preemption here would 1269 XCPT_FRAME 1 15*8
1338 * be complicated. Fortunately, we there's no good reason 1270 cld
1339 * to try to handle preemption here. 1271 SAVE_C_REGS 8
1340 */ 1272 SAVE_EXTRA_REGS 8
1273 movl $1,%ebx
1274 movl $MSR_GS_BASE,%ecx
1275 rdmsr
1276 testl %edx,%edx
1277 js 1f /* negative -> in kernel */
1278 SWAPGS
1279 xorl %ebx,%ebx
12801: ret
1281 CFI_ENDPROC
1282END(paranoid_entry)
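paranoid_entry decides whether SWAPGS is needed by reading MSR_GS_BASE and testing the sign of its upper half: a kernel GS base lives in the upper canonical half of the address space, so bit 63 is set and %edx (bits 63..32 of the MSR) reads as negative. A small C sketch of that sign test (sample addresses are typical values, used only to show the bit pattern):

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors "rdmsr; testl %edx,%edx; js 1f": edx holds bits 63..32 of GS.base. */
    static int gsbase_is_kernel(uint64_t gs_base)
    {
            return (int32_t)(gs_base >> 32) < 0;   /* negative -> already kernel GS */
    }

    int main(void)
    {
            printf("%d\n", gsbase_is_kernel(0xffff88007fc00000ULL)); /* kernel: 1 */
            printf("%d\n", gsbase_is_kernel(0x00007f1234560000ULL)); /* user:   0 */
            return 0;
    }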
1341 1283
1342 /* ebx: no swapgs flag */ 1284/*
1285 * "Paranoid" exit path from exception stack. This is invoked
1286 * only on return from non-NMI IST interrupts that came
1287 * from kernel space.
1288 *
1289 * We may be returning to very strange contexts (e.g. very early
1290 * in syscall entry), so checking for preemption here would
 1291 * be complicated. Fortunately, there's no good reason
1292 * to try to handle preemption here.
1293 */
1294/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1343ENTRY(paranoid_exit) 1295ENTRY(paranoid_exit)
1344 DEFAULT_FRAME 1296 DEFAULT_FRAME
1345 DISABLE_INTERRUPTS(CLBR_NONE) 1297 DISABLE_INTERRUPTS(CLBR_NONE)
1346 TRACE_IRQS_OFF_DEBUG 1298 TRACE_IRQS_OFF_DEBUG
1347 testl %ebx,%ebx /* swapgs needed? */ 1299 testl %ebx,%ebx /* swapgs needed? */
1348 jnz paranoid_restore 1300 jnz paranoid_exit_no_swapgs
1349 TRACE_IRQS_IRETQ 0 1301 TRACE_IRQS_IRETQ
1350 SWAPGS_UNSAFE_STACK 1302 SWAPGS_UNSAFE_STACK
1351 RESTORE_ALL 8 1303 jmp paranoid_exit_restore
1352 INTERRUPT_RETURN 1304paranoid_exit_no_swapgs:
1353paranoid_restore: 1305 TRACE_IRQS_IRETQ_DEBUG
1354 TRACE_IRQS_IRETQ_DEBUG 0 1306paranoid_exit_restore:
1355 RESTORE_ALL 8 1307 RESTORE_EXTRA_REGS
1308 RESTORE_C_REGS
1309 REMOVE_PT_GPREGS_FROM_STACK 8
1356 INTERRUPT_RETURN 1310 INTERRUPT_RETURN
1357 CFI_ENDPROC 1311 CFI_ENDPROC
1358END(paranoid_exit) 1312END(paranoid_exit)
1359 1313
1360/* 1314/*
1361 * Exception entry point. This expects an error code/orig_rax on the stack. 1315 * Save all registers in pt_regs, and switch gs if needed.
1362 * returns in "no swapgs flag" in %ebx. 1316 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1363 */ 1317 */
1364ENTRY(error_entry) 1318ENTRY(error_entry)
1365 XCPT_FRAME 1319 XCPT_FRAME 1 15*8
1366 CFI_ADJUST_CFA_OFFSET 15*8
1367 /* oldrax contains error code */
1368 cld 1320 cld
1369 movq %rdi, RDI+8(%rsp) 1321 SAVE_C_REGS 8
1370 movq %rsi, RSI+8(%rsp) 1322 SAVE_EXTRA_REGS 8
1371 movq %rdx, RDX+8(%rsp)
1372 movq %rcx, RCX+8(%rsp)
1373 movq %rax, RAX+8(%rsp)
1374 movq %r8, R8+8(%rsp)
1375 movq %r9, R9+8(%rsp)
1376 movq %r10, R10+8(%rsp)
1377 movq %r11, R11+8(%rsp)
1378 movq_cfi rbx, RBX+8
1379 movq %rbp, RBP+8(%rsp)
1380 movq %r12, R12+8(%rsp)
1381 movq %r13, R13+8(%rsp)
1382 movq %r14, R14+8(%rsp)
1383 movq %r15, R15+8(%rsp)
1384 xorl %ebx,%ebx 1323 xorl %ebx,%ebx
1385 testl $3,CS+8(%rsp) 1324 testl $3,CS+8(%rsp)
1386 je error_kernelspace 1325 je error_kernelspace
@@ -1390,12 +1329,12 @@ error_sti:
1390 TRACE_IRQS_OFF 1329 TRACE_IRQS_OFF
1391 ret 1330 ret
1392 1331
1393/* 1332 /*
1394 * There are two places in the kernel that can potentially fault with 1333 * There are two places in the kernel that can potentially fault with
1395 * usergs. Handle them here. B stepping K8s sometimes report a 1334 * usergs. Handle them here. B stepping K8s sometimes report a
1396 * truncated RIP for IRET exceptions returning to compat mode. Check 1335 * truncated RIP for IRET exceptions returning to compat mode. Check
1397 * for these here too. 1336 * for these here too.
1398 */ 1337 */
1399error_kernelspace: 1338error_kernelspace:
1400 CFI_REL_OFFSET rcx, RCX+8 1339 CFI_REL_OFFSET rcx, RCX+8
1401 incl %ebx 1340 incl %ebx
@@ -1425,11 +1364,11 @@ error_bad_iret:
1425END(error_entry) 1364END(error_entry)
1426 1365
1427 1366
1428/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ 1367/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
1429ENTRY(error_exit) 1368ENTRY(error_exit)
1430 DEFAULT_FRAME 1369 DEFAULT_FRAME
1431 movl %ebx,%eax 1370 movl %ebx,%eax
1432 RESTORE_REST 1371 RESTORE_EXTRA_REGS
1433 DISABLE_INTERRUPTS(CLBR_NONE) 1372 DISABLE_INTERRUPTS(CLBR_NONE)
1434 TRACE_IRQS_OFF 1373 TRACE_IRQS_OFF
1435 GET_THREAD_INFO(%rcx) 1374 GET_THREAD_INFO(%rcx)
@@ -1444,19 +1383,7 @@ ENTRY(error_exit)
1444 CFI_ENDPROC 1383 CFI_ENDPROC
1445END(error_exit) 1384END(error_exit)
1446 1385
1447/* 1386/* Runs on exception stack */
1448 * Test if a given stack is an NMI stack or not.
1449 */
1450 .macro test_in_nmi reg stack nmi_ret normal_ret
1451 cmpq %\reg, \stack
1452 ja \normal_ret
1453 subq $EXCEPTION_STKSZ, %\reg
1454 cmpq %\reg, \stack
1455 jb \normal_ret
1456 jmp \nmi_ret
1457 .endm
1458
1459 /* runs on exception stack */
1460ENTRY(nmi) 1387ENTRY(nmi)
1461 INTR_FRAME 1388 INTR_FRAME
1462 PARAVIRT_ADJUST_EXCEPTION_FRAME 1389 PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1492,7 +1419,7 @@ ENTRY(nmi)
1492 * NMI. 1419 * NMI.
1493 */ 1420 */
1494 1421
1495 /* Use %rdx as out temp variable throughout */ 1422 /* Use %rdx as our temp variable throughout */
1496 pushq_cfi %rdx 1423 pushq_cfi %rdx
1497 CFI_REL_OFFSET rdx, 0 1424 CFI_REL_OFFSET rdx, 0
1498 1425
@@ -1517,8 +1444,17 @@ ENTRY(nmi)
1517 * We check the variable because the first NMI could be in a 1444 * We check the variable because the first NMI could be in a
1518 * breakpoint routine using a breakpoint stack. 1445 * breakpoint routine using a breakpoint stack.
1519 */ 1446 */
1520 lea 6*8(%rsp), %rdx 1447 lea 6*8(%rsp), %rdx
1521 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1448 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1449 cmpq %rdx, 4*8(%rsp)
1450 /* If the stack pointer is above the NMI stack, this is a normal NMI */
1451 ja first_nmi
1452 subq $EXCEPTION_STKSZ, %rdx
1453 cmpq %rdx, 4*8(%rsp)
1454 /* If it is below the NMI stack, it is a normal NMI */
1455 jb first_nmi
1456 /* Ah, it is within the NMI stack, treat it as nested */
1457
1522 CFI_REMEMBER_STATE 1458 CFI_REMEMBER_STATE
1523 1459
1524nested_nmi: 1460nested_nmi:
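The open-coded comparison above (replacing the old test_in_nmi macro) treats the incoming NMI as nested only when the interrupted stack pointer falls inside the NMI stack, i.e. within EXCEPTION_STKSZ bytes below its top. The same range check in C (the stack-size constant is an illustrative stand-in; the real value is configuration-dependent):

    #include <stdbool.h>

    #define EXCEPTION_STKSZ 4096UL   /* illustrative stand-in */

    /* Mirrors the two cmpq/ja/jb tests above: nested iff
     * nmi_stack_top - EXCEPTION_STKSZ <= sp <= nmi_stack_top. */
    static bool nmi_is_nested(unsigned long sp, unsigned long nmi_stack_top)
    {
            if (sp > nmi_stack_top)                     /* "ja first_nmi" */
                    return false;
            if (sp < nmi_stack_top - EXCEPTION_STKSZ)   /* "jb first_nmi" */
                    return false;
            return true;
    }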
@@ -1611,7 +1547,7 @@ first_nmi:
1611 .rept 5 1547 .rept 5
1612 pushq_cfi 11*8(%rsp) 1548 pushq_cfi 11*8(%rsp)
1613 .endr 1549 .endr
1614 CFI_DEF_CFA_OFFSET SS+8-RIP 1550 CFI_DEF_CFA_OFFSET 5*8
1615 1551
1616 /* Everything up to here is safe from nested NMIs */ 1552 /* Everything up to here is safe from nested NMIs */
1617 1553
@@ -1639,7 +1575,7 @@ repeat_nmi:
1639 pushq_cfi -6*8(%rsp) 1575 pushq_cfi -6*8(%rsp)
1640 .endr 1576 .endr
1641 subq $(5*8), %rsp 1577 subq $(5*8), %rsp
1642 CFI_DEF_CFA_OFFSET SS+8-RIP 1578 CFI_DEF_CFA_OFFSET 5*8
1643end_repeat_nmi: 1579end_repeat_nmi:
1644 1580
1645 /* 1581 /*
@@ -1648,16 +1584,16 @@ end_repeat_nmi:
1648 * so that we repeat another NMI. 1584 * so that we repeat another NMI.
1649 */ 1585 */
1650 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1586 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1651 subq $ORIG_RAX-R15, %rsp 1587 ALLOC_PT_GPREGS_ON_STACK
1652 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1588
1653 /* 1589 /*
1654 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit 1590 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1655 * as we should not be calling schedule in NMI context. 1591 * as we should not be calling schedule in NMI context.
1656 * Even with normal interrupts enabled. An NMI should not be 1592 * Even with normal interrupts enabled. An NMI should not be
1657 * setting NEED_RESCHED or anything that normal interrupts and 1593 * setting NEED_RESCHED or anything that normal interrupts and
1658 * exceptions might do. 1594 * exceptions might do.
1659 */ 1595 */
1660 call save_paranoid 1596 call paranoid_entry
1661 DEFAULT_FRAME 0 1597 DEFAULT_FRAME 0
1662 1598
1663 /* 1599 /*
@@ -1688,8 +1624,10 @@ end_repeat_nmi:
1688nmi_swapgs: 1624nmi_swapgs:
1689 SWAPGS_UNSAFE_STACK 1625 SWAPGS_UNSAFE_STACK
1690nmi_restore: 1626nmi_restore:
1627 RESTORE_EXTRA_REGS
1628 RESTORE_C_REGS
1691 /* Pop the extra iret frame at once */ 1629 /* Pop the extra iret frame at once */
1692 RESTORE_ALL 6*8 1630 REMOVE_PT_GPREGS_FROM_STACK 6*8
1693 1631
1694 /* Clear the NMI executing stack variable */ 1632 /* Clear the NMI executing stack variable */
1695 movq $0, 5*8(%rsp) 1633 movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h> 24#include <asm/nops.h>
25#include <asm/bootparam.h>
25 26
26/* Physical address */ 27/* Physical address */
27#define pa(X) ((X) - __PAGE_OFFSET) 28#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
90 91
91 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 92 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
92 us to not reload segments */ 93 us to not reload segments */
93 testb $(1<<6), BP_loadflags(%esi) 94 testb $KEEP_SEGMENTS, BP_loadflags(%esi)
94 jnz 2f 95 jnz 2f
95 96
96/* 97/*
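The head_32.S change swaps the bare (1<<6) for the KEEP_SEGMENTS constant now pulled in via <asm/bootparam.h>; the bit being tested is unchanged. A user-space-flavoured C restatement of the check (the flag value mirrors the bit the old code hard-coded):

    #include <stdbool.h>
    #include <stdint.h>

    #define KEEP_SEGMENTS (1 << 6)   /* same bit the old "testb $(1<<6)" used */

    /* Mirrors "testb $KEEP_SEGMENTS, BP_loadflags(%esi)". */
    static bool bootloader_keeps_segments(uint8_t loadflags)
    {
            return loadflags & KEEP_SEGMENTS;
    }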
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9f69a..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit 2 * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
56 * %rsi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
57 * 57 *
58 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
59 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86/boot/compressed/head_64.S.
60 * 60 *
61 * We only come here initially at boot, nothing else comes here. 61 * We only come here initially at boot, nothing else comes here.
62 * 62 *
@@ -146,7 +146,7 @@ startup_64:
146 leaq level2_kernel_pgt(%rip), %rdi 146 leaq level2_kernel_pgt(%rip), %rdi
147 leaq 4096(%rdi), %r8 147 leaq 4096(%rdi), %r8
148 /* See if it is a valid page table entry */ 148 /* See if it is a valid page table entry */
1491: testq $1, 0(%rdi) 1491: testb $1, 0(%rdi)
150 jz 2f 150 jz 2f
151 addq %rbp, 0(%rdi) 151 addq %rbp, 0(%rdi)
152 /* Go to the next page */ 152 /* Go to the next page */
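The head_64.S hunk narrows the page-table-entry test from testq to testb: only bit 0, the present bit, is being examined, and it sits in the low byte, so the byte-sized test is sufficient (and shorter to encode). The predicate in C:

    #include <stdbool.h>
    #include <stdint.h>

    #define PTE_PRESENT 0x1u   /* bit 0 of a page-table entry */

    /* Same check as "testb $1, 0(%rdi)": only the low byte matters. */
    static bool pte_is_present(uint64_t pte)
    {
            return (uint8_t)pte & PTE_PRESENT;
    }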
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fce0b71..29c740deafec 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
68static inline bool interrupted_user_mode(void) 68static inline bool interrupted_user_mode(void)
69{ 69{
70 struct pt_regs *regs = get_irq_regs(); 70 struct pt_regs *regs = get_irq_regs();
71 return regs && user_mode_vm(regs); 71 return regs && user_mode(regs);
72} 72}
73 73
74/* 74/*
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
54 * because the ->io_bitmap_max value must match the bitmap 54 * because the ->io_bitmap_max value must match the bitmap
55 * contents: 55 * contents:
56 */ 56 */
57 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(cpu_tss, get_cpu());
58 58
59 if (turn_on) 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num); 60 bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5eb8f4..f9fd86a7fcc7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
165 if (unlikely(!desc)) 165 if (unlikely(!desc))
166 return false; 166 return false;
167 167
168 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { 168 if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
169 if (unlikely(overflow)) 169 if (unlikely(overflow))
170 print_stack_overflow(); 170 print_stack_overflow();
171 desc->handle_irq(irq, desc); 171 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d5558c..394e643d7830 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
44 u64 estack_top, estack_bottom; 44 u64 estack_top, estack_bottom;
45 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
46 46
47 if (user_mode_vm(regs)) 47 if (user_mode(regs))
48 return; 48 return;
49 49
50 if (regs->sp >= curbase + sizeof(struct thread_info) + 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181ea1eac..cd10a6437264 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
178#endif 178#endif
179 for_each_clear_bit_from(i, used_vectors, first_system_vector) { 179 for_each_clear_bit_from(i, used_vectors, first_system_vector) {
180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 180 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
181 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 181 set_intr_gate(i, irq_entries_start +
182 8 * (i - FIRST_EXTERNAL_VECTOR));
182 } 183 }
183#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
184 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) 185 for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
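With the entry stubs packed one per 8 bytes (see the irq_entries_start hunk earlier in this patch), the gate for vector i is simply irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR), so the old interrupt[] pointer table can go away. A sketch of that address arithmetic (the vector base is the conventional 0x20; the start address is hypothetical):

    #include <stdio.h>
    #include <stdint.h>

    #define FIRST_EXTERNAL_VECTOR 0x20

    /* Address of the 8-byte entry stub for a given vector. */
    static uintptr_t stub_address(uintptr_t irq_entries_start, unsigned vector)
    {
            return irq_entries_start + 8u * (vector - FIRST_EXTERNAL_VECTOR);
    }

    int main(void)
    {
            uintptr_t base = 0x100000UL;   /* hypothetical start address */
            printf("vector 0x21 -> %#lx\n",
                   (unsigned long)stub_address(base, 0x21));
            return 0;
    }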
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 25ecd56cefa8..d6178d9791db 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
126#ifdef CONFIG_X86_32 126#ifdef CONFIG_X86_32
127 switch (regno) { 127 switch (regno) {
128 case GDB_SS: 128 case GDB_SS:
129 if (!user_mode_vm(regs)) 129 if (!user_mode(regs))
130 *(unsigned long *)mem = __KERNEL_DS; 130 *(unsigned long *)mem = __KERNEL_DS;
131 break; 131 break;
132 case GDB_SP: 132 case GDB_SP:
133 if (!user_mode_vm(regs)) 133 if (!user_mode(regs))
134 *(unsigned long *)mem = kernel_stack_pointer(regs); 134 *(unsigned long *)mem = kernel_stack_pointer(regs);
135 break; 135 break;
136 case GDB_GS: 136 case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..24d079604fd5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
602 struct kprobe *p; 602 struct kprobe *p;
603 struct kprobe_ctlblk *kcb; 603 struct kprobe_ctlblk *kcb;
604 604
605 if (user_mode_vm(regs)) 605 if (user_mode(regs))
606 return 0; 606 return 0;
607 607
608 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 608 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
1007 struct die_args *args = data; 1007 struct die_args *args = data;
1008 int ret = NOTIFY_DONE; 1008 int ret = NOTIFY_DONE;
1009 1009
1010 if (args->regs && user_mode_vm(args->regs)) 1010 if (args->regs && user_mode(args->regs))
1011 return ret; 1011 return ret;
1012 1012
1013 if (val == DIE_GPF) { 1013 if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b72c72..005c03e93fc5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
33 33
34#include <asm/page.h> 34#include <asm/page.h>
35#include <asm/pgtable.h> 35#include <asm/pgtable.h>
36#include <asm/setup.h>
36 37
37#if 0 38#if 0
38#define DEBUGP(fmt, ...) \ 39#define DEBUGP(fmt, ...) \
@@ -47,21 +48,13 @@ do { \
47 48
48#ifdef CONFIG_RANDOMIZE_BASE 49#ifdef CONFIG_RANDOMIZE_BASE
49static unsigned long module_load_offset; 50static unsigned long module_load_offset;
50static int randomize_modules = 1;
51 51
52/* Mutex protects the module_load_offset. */ 52/* Mutex protects the module_load_offset. */
53static DEFINE_MUTEX(module_kaslr_mutex); 53static DEFINE_MUTEX(module_kaslr_mutex);
54 54
55static int __init parse_nokaslr(char *p)
56{
57 randomize_modules = 0;
58 return 0;
59}
60early_param("nokaslr", parse_nokaslr);
61
62static unsigned long int get_module_load_offset(void) 55static unsigned long int get_module_load_offset(void)
63{ 56{
64 if (randomize_modules) { 57 if (kaslr_enabled()) {
65 mutex_lock(&module_kaslr_mutex); 58 mutex_lock(&module_kaslr_mutex);
66 /* 59 /*
67 * Calculate the module_load_offset the first time this 60 * Calculate the module_load_offset the first time this
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..da8cb987b973 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
131 } 131 }
132 132
133 /* 133 /*
134 * RIP, flags, and the argument registers are usually saved. 134 * These registers are always saved on 64-bit syscall entry.
135 * orig_ax is probably okay, too. 135 * On 32-bit entry points, they are saved too except r8..r11.
136 */ 136 */
137 regs_user_copy->ip = user_regs->ip; 137 regs_user_copy->ip = user_regs->ip;
138 regs_user_copy->ax = user_regs->ax;
138 regs_user_copy->cx = user_regs->cx; 139 regs_user_copy->cx = user_regs->cx;
139 regs_user_copy->dx = user_regs->dx; 140 regs_user_copy->dx = user_regs->dx;
140 regs_user_copy->si = user_regs->si; 141 regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
145 regs_user_copy->r11 = user_regs->r11; 146 regs_user_copy->r11 = user_regs->r11;
146 regs_user_copy->orig_ax = user_regs->orig_ax; 147 regs_user_copy->orig_ax = user_regs->orig_ax;
147 regs_user_copy->flags = user_regs->flags; 148 regs_user_copy->flags = user_regs->flags;
149 regs_user_copy->sp = user_regs->sp;
150 regs_user_copy->cs = user_regs->cs;
151 regs_user_copy->ss = user_regs->ss;
148 152
149 /* 153 /*
150 * Don't even try to report the "rest" regs. 154 * Most system calls don't save these registers, don't report them.
151 */ 155 */
152 regs_user_copy->bx = -1; 156 regs_user_copy->bx = -1;
153 regs_user_copy->bp = -1; 157 regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
158 162
159 /* 163 /*
160 * For this to be at all useful, we need a reasonable guess for 164 * For this to be at all useful, we need a reasonable guess for
161 * sp and the ABI. Be careful: we're in NMI context, and we're 165 * the ABI. Be careful: we're in NMI context, and we're
162 * considering current to be the current task, so we should 166 * considering current to be the current task, so we should
163 * be careful not to look at any other percpu variables that might 167 * be careful not to look at any other percpu variables that might
164 * change during context switches. 168 * change during context switches.
165 */ 169 */
166 if (IS_ENABLED(CONFIG_IA32_EMULATION) && 170 regs_user->abi = user_64bit_mode(user_regs) ?
167 task_thread_info(current)->status & TS_COMPAT) { 171 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
168 /* Easy case: we're in a compat syscall. */
169 regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
170 regs_user_copy->sp = user_regs->sp;
171 regs_user_copy->cs = user_regs->cs;
172 regs_user_copy->ss = user_regs->ss;
173 } else if (user_regs->orig_ax != -1) {
174 /*
175 * We're probably in a 64-bit syscall.
176 * Warning: this code is severely racy. At least it's better
177 * than just blindly copying user_regs.
178 */
179 regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
180 regs_user_copy->sp = this_cpu_read(old_rsp);
181 regs_user_copy->cs = __USER_CS;
182 regs_user_copy->ss = __USER_DS;
183 regs_user_copy->cx = -1; /* usually contains garbage */
184 } else {
185 /* We're probably in an interrupt or exception. */
186 regs_user->abi = user_64bit_mode(user_regs) ?
187 PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
188 regs_user_copy->sp = user_regs->sp;
189 regs_user_copy->cs = user_regs->cs;
190 regs_user_copy->ss = user_regs->ss;
191 }
192 172
193 regs_user->regs = regs_user_copy; 173 regs_user->regs = regs_user_copy;
194} 174}
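perf_get_regs_user can now copy sp/cs/ss straight from the saved pt_regs -- the rewritten entry code always fills them -- and derive the sample ABI from user_64bit_mode() alone, instead of guessing between the compat-syscall, 64-bit-syscall and interrupt cases. A hedged sketch of the ABI selection (the selector comparison is how user_64bit_mode() behaves on a non-paravirt kernel; 0x33 is the conventional 64-bit __USER_CS):

    /* Illustrative only; enum values follow the perf sample-regs ABI constants. */
    enum regs_abi { REGS_ABI_NONE = 0, REGS_ABI_32 = 1, REGS_ABI_64 = 2 };

    static enum regs_abi sample_abi(unsigned short user_cs)
    {
            return user_cs == 0x33 ? REGS_ABI_64 : REGS_ABI_32;
    }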
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7af7b6478637..0c8992dbead5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,7 +38,26 @@
38 * section. Since TSS's are completely CPU-local, we want them 38 * section. Since TSS's are completely CPU-local, we want them
39 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
40 */ 40 */
41__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; 41__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
42 .x86_tss = {
43 .sp0 = TOP_OF_INIT_STACK,
44#ifdef CONFIG_X86_32
45 .ss0 = __KERNEL_DS,
46 .ss1 = __KERNEL_CS,
47 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
48#endif
49 },
50#ifdef CONFIG_X86_32
51 /*
52 * Note that the .io_bitmap member must be extra-big. This is because
53 * the CPU will access an additional byte beyond the end of the IO
54 * permission bitmap. The extra byte must be all 1 bits, and must
55 * be within the limit.
56 */
57 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
58#endif
59};
60EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
42 61
43#ifdef CONFIG_X86_64 62#ifdef CONFIG_X86_64
44static DEFINE_PER_CPU(unsigned char, is_idle); 63static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -110,7 +129,7 @@ void exit_thread(void)
110 unsigned long *bp = t->io_bitmap_ptr; 129 unsigned long *bp = t->io_bitmap_ptr;
111 130
112 if (bp) { 131 if (bp) {
113 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 132 struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
114 133
115 t->io_bitmap_ptr = NULL; 134 t->io_bitmap_ptr = NULL;
116 clear_thread_flag(TIF_IO_BITMAP); 135 clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..8ed2106b06da 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
73 unsigned long sp; 73 unsigned long sp;
74 unsigned short ss, gs; 74 unsigned short ss, gs;
75 75
76 if (user_mode_vm(regs)) { 76 if (user_mode(regs)) {
77 sp = regs->sp; 77 sp = regs->sp;
78 ss = regs->ss & 0xffff; 78 ss = regs->ss & 0xffff;
79 gs = get_user_gs(regs); 79 gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
206 regs->ip = new_ip; 206 regs->ip = new_ip;
207 regs->sp = new_sp; 207 regs->sp = new_sp;
208 regs->flags = X86_EFLAGS_IF; 208 regs->flags = X86_EFLAGS_IF;
209 /* 209 force_iret();
210 * force it to the iret return path by making it look as if there was
211 * some work pending.
212 */
213 set_thread_flag(TIF_NOTIFY_RESUME);
214} 210}
215EXPORT_SYMBOL_GPL(start_thread); 211EXPORT_SYMBOL_GPL(start_thread);
216 212
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
248 struct thread_struct *prev = &prev_p->thread, 244 struct thread_struct *prev = &prev_p->thread,
249 *next = &next_p->thread; 245 *next = &next_p->thread;
250 int cpu = smp_processor_id(); 246 int cpu = smp_processor_id();
251 struct tss_struct *tss = &per_cpu(init_tss, cpu); 247 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
252 fpu_switch_t fpu; 248 fpu_switch_t fpu;
253 249
254 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 250 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
256 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 252 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
257 253
258 /* 254 /*
259 * Reload esp0.
260 */
261 load_sp0(tss, next);
262
263 /*
264 * Save away %gs. No need to save %fs, as it was saved on the 255 * Save away %gs. No need to save %fs, as it was saved on the
265 * stack on entry. No need to save %es and %ds, as those are 256 * stack on entry. No need to save %es and %ds, as those are
266 * always kernel segments while inside the kernel. Doing this 257 * always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
310 */ 301 */
311 arch_end_context_switch(next_p); 302 arch_end_context_switch(next_p);
312 303
304 /*
305 * Reload esp0, kernel_stack, and current_top_of_stack. This changes
306 * current_thread_info().
307 */
308 load_sp0(tss, next);
313 this_cpu_write(kernel_stack, 309 this_cpu_write(kernel_stack,
314 (unsigned long)task_stack_page(next_p) + 310 (unsigned long)task_stack_page(next_p) +
315 THREAD_SIZE - KERNEL_STACK_OFFSET); 311 THREAD_SIZE);
312 this_cpu_write(cpu_current_top_of_stack,
313 (unsigned long)task_stack_page(next_p) +
314 THREAD_SIZE);
316 315
317 /* 316 /*
318 * Restore %gs if needed (which is common) 317 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43577d2..4baaa972f52a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
52 52
53asmlinkage extern void ret_from_fork(void); 53asmlinkage extern void ret_from_fork(void);
54 54
55__visible DEFINE_PER_CPU(unsigned long, old_rsp); 55__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
56 56
57/* Prints also some state that isn't saved in the pt_regs */ 57/* Prints also some state that isn't saved in the pt_regs */
58void __show_regs(struct pt_regs *regs, int all) 58void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; 161 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
162 childregs = task_pt_regs(p); 162 childregs = task_pt_regs(p);
163 p->thread.sp = (unsigned long) childregs; 163 p->thread.sp = (unsigned long) childregs;
164 p->thread.usersp = me->thread.usersp;
165 set_tsk_thread_flag(p, TIF_FORK); 164 set_tsk_thread_flag(p, TIF_FORK);
166 p->thread.io_bitmap_ptr = NULL; 165 p->thread.io_bitmap_ptr = NULL;
167 166
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
207 */ 206 */
208 if (clone_flags & CLONE_SETTLS) { 207 if (clone_flags & CLONE_SETTLS) {
209#ifdef CONFIG_IA32_EMULATION 208#ifdef CONFIG_IA32_EMULATION
210 if (test_thread_flag(TIF_IA32)) 209 if (is_ia32_task())
211 err = do_set_thread_area(p, -1, 210 err = do_set_thread_area(p, -1,
212 (struct user_desc __user *)childregs->si, 0); 211 (struct user_desc __user *)childregs->si, 0);
213 else 212 else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
235 loadsegment(es, _ds); 234 loadsegment(es, _ds);
236 loadsegment(ds, _ds); 235 loadsegment(ds, _ds);
237 load_gs_index(0); 236 load_gs_index(0);
238 current->thread.usersp = new_sp;
239 regs->ip = new_ip; 237 regs->ip = new_ip;
240 regs->sp = new_sp; 238 regs->sp = new_sp;
241 this_cpu_write(old_rsp, new_sp);
242 regs->cs = _cs; 239 regs->cs = _cs;
243 regs->ss = _ss; 240 regs->ss = _ss;
244 regs->flags = X86_EFLAGS_IF; 241 regs->flags = X86_EFLAGS_IF;
242 force_iret();
245} 243}
246 244
247void 245void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
277 struct thread_struct *prev = &prev_p->thread; 275 struct thread_struct *prev = &prev_p->thread;
278 struct thread_struct *next = &next_p->thread; 276 struct thread_struct *next = &next_p->thread;
279 int cpu = smp_processor_id(); 277 int cpu = smp_processor_id();
280 struct tss_struct *tss = &per_cpu(init_tss, cpu); 278 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
281 unsigned fsindex, gsindex; 279 unsigned fsindex, gsindex;
282 fpu_switch_t fpu; 280 fpu_switch_t fpu;
283 281
284 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 282 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
285 283
286 /* Reload esp0 and ss1. */
287 load_sp0(tss, next);
288
289 /* We must save %fs and %gs before load_TLS() because 284 /* We must save %fs and %gs before load_TLS() because
290 * %fs and %gs may be cleared by load_TLS(). 285 * %fs and %gs may be cleared by load_TLS().
291 * 286 *
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
401 /* 396 /*
402 * Switch the PDA and FPU contexts. 397 * Switch the PDA and FPU contexts.
403 */ 398 */
404 prev->usersp = this_cpu_read(old_rsp);
405 this_cpu_write(old_rsp, next->usersp);
406 this_cpu_write(current_task, next_p); 399 this_cpu_write(current_task, next_p);
407 400
408 /* 401 /*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
413 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); 406 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
414 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); 407 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
415 408
409 /* Reload esp0 and ss1. This changes current_thread_info(). */
410 load_sp0(tss, next);
411
416 this_cpu_write(kernel_stack, 412 this_cpu_write(kernel_stack,
417 (unsigned long)task_stack_page(next_p) + 413 (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
418 THREAD_SIZE - KERNEL_STACK_OFFSET);
419 414
420 /* 415 /*
421 * Now maybe reload the debug registers and handle I/O bitmaps 416 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
602 597
603unsigned long KSTK_ESP(struct task_struct *task) 598unsigned long KSTK_ESP(struct task_struct *task)
604{ 599{
605 return (test_tsk_thread_flag(task, TIF_IA32)) ? 600 return task_pt_regs(task)->sp;
606 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
607} 601}
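(With the per-thread usersp cache removed in the process_64.c hunk above, KSTK_ESP() can read the saved user stack pointer straight out of pt_regs for every task. A small standalone C sketch of the before/after shape; the struct layout, field names and flag here are illustrative, not the kernel's.)

    #include <stdio.h>
    #include <stdbool.h>

    struct fake_pt_regs { unsigned long sp; };
    struct fake_task {
            struct fake_pt_regs regs;     /* stand-in for task_pt_regs(task) */
            unsigned long usersp_cache;   /* the old per-thread copy, now gone */
            bool is_ia32;
    };

    /* old shape: two sources of truth, selected by a per-task flag */
    static unsigned long kstk_esp_old(const struct fake_task *t)
    {
            return t->is_ia32 ? t->regs.sp : t->usersp_cache;
    }

    /* new shape: pt_regs is authoritative for every task */
    static unsigned long kstk_esp_new(const struct fake_task *t)
    {
            return t->regs.sp;
    }

    int main(void)
    {
            struct fake_task t = { .regs = { .sp = 0x7ffdf000 }, .usersp_cache = 0, .is_ia32 = false };
            printf("old: %#lx  new: %#lx\n", kstk_esp_old(&t), kstk_esp_new(&t));
            return 0;
    }
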
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..a7bc79480719 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
364 case offsetof(struct user_regs_struct,cs): 364 case offsetof(struct user_regs_struct,cs):
365 if (unlikely(value == 0)) 365 if (unlikely(value == 0))
366 return -EIO; 366 return -EIO;
367#ifdef CONFIG_IA32_EMULATION 367 task_pt_regs(task)->cs = value;
368 if (test_tsk_thread_flag(task, TIF_IA32))
369 task_pt_regs(task)->cs = value;
370#endif
371 break; 368 break;
372 case offsetof(struct user_regs_struct,ss): 369 case offsetof(struct user_regs_struct,ss):
373 if (unlikely(value == 0)) 370 if (unlikely(value == 0))
374 return -EIO; 371 return -EIO;
375#ifdef CONFIG_IA32_EMULATION 372 task_pt_regs(task)->ss = value;
376 if (test_tsk_thread_flag(task, TIF_IA32))
377 task_pt_regs(task)->ss = value;
378#endif
379 break; 373 break;
380 } 374 }
381 375
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
1421 memset(info, 0, sizeof(*info)); 1415 memset(info, 0, sizeof(*info));
1422 info->si_signo = SIGTRAP; 1416 info->si_signo = SIGTRAP;
1423 info->si_code = si_code; 1417 info->si_code = si_code;
1424 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; 1418 info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
1425} 1419}
1426 1420
1427void user_single_step_siginfo(struct task_struct *tsk, 1421void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
226 movl (%ebx), %ecx 226 movl (%ebx), %ecx
227 addl $4, %ebx 227 addl $4, %ebx
2281: 2281:
229 testl $0x1, %ecx /* is it a destination page */ 229 testb $0x1, %cl /* is it a destination page */
230 jz 2f 230 jz 2f
231 movl %ecx, %edi 231 movl %ecx, %edi
232 andl $0xfffff000, %edi 232 andl $0xfffff000, %edi
233 jmp 0b 233 jmp 0b
2342: 2342:
235 testl $0x2, %ecx /* is it an indirection page */ 235 testb $0x2, %cl /* is it an indirection page */
236 jz 2f 236 jz 2f
237 movl %ecx, %ebx 237 movl %ecx, %ebx
238 andl $0xfffff000, %ebx 238 andl $0xfffff000, %ebx
239 jmp 0b 239 jmp 0b
2402: 2402:
241 testl $0x4, %ecx /* is it the done indicator */ 241 testb $0x4, %cl /* is it the done indicator */
242 jz 2f 242 jz 2f
243 jmp 3f 243 jmp 3f
2442: 2442:
245 testl $0x8, %ecx /* is it the source indicator */ 245 testb $0x8, %cl /* is it the source indicator */
246 jz 0b /* Ignore it otherwise */ 246 jz 0b /* Ignore it otherwise */
247 movl %ecx, %esi /* For every source page do a copy */ 247 movl %ecx, %esi /* For every source page do a copy */
248 andl $0xfffff000, %esi 248 andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..98111b38ebfd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
123 * Set cr4 to a known state: 123 * Set cr4 to a known state:
124 * - physical address extension enabled 124 * - physical address extension enabled
125 */ 125 */
126 movq $X86_CR4_PAE, %rax 126 movl $X86_CR4_PAE, %eax
127 movq %rax, %cr4 127 movq %rax, %cr4
128 128
129 jmp 1f 129 jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
221 movq (%rbx), %rcx 221 movq (%rbx), %rcx
222 addq $8, %rbx 222 addq $8, %rbx
2231: 2231:
224 testq $0x1, %rcx /* is it a destination page? */ 224 testb $0x1, %cl /* is it a destination page? */
225 jz 2f 225 jz 2f
226 movq %rcx, %rdi 226 movq %rcx, %rdi
227 andq $0xfffffffffffff000, %rdi 227 andq $0xfffffffffffff000, %rdi
228 jmp 0b 228 jmp 0b
2292: 2292:
230 testq $0x2, %rcx /* is it an indirection page? */ 230 testb $0x2, %cl /* is it an indirection page? */
231 jz 2f 231 jz 2f
232 movq %rcx, %rbx 232 movq %rcx, %rbx
233 andq $0xfffffffffffff000, %rbx 233 andq $0xfffffffffffff000, %rbx
234 jmp 0b 234 jmp 0b
2352: 2352:
236 testq $0x4, %rcx /* is it the done indicator? */ 236 testb $0x4, %cl /* is it the done indicator? */
237 jz 2f 237 jz 2f
238 jmp 3f 238 jmp 3f
2392: 2392:
240 testq $0x8, %rcx /* is it the source indicator? */ 240 testb $0x8, %cl /* is it the source indicator? */
241 jz 0b /* Ignore it otherwise */ 241 jz 0b /* Ignore it otherwise */
 242 movq %rcx, %rsi /* For every source page do a copy */ 242 movq %rcx, %rsi /* For every source page do a copy */

243 andq $0xfffffffffffff000, %rsi 243 andq $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
246 movq %rsi, %rax 246 movq %rsi, %rax
247 247
248 movq %r10, %rdi 248 movq %r10, %rdi
249 movq $512, %rcx 249 movl $512, %ecx
250 rep ; movsq 250 rep ; movsq
251 251
252 movq %rax, %rdi 252 movq %rax, %rdi
253 movq %rdx, %rsi 253 movq %rdx, %rsi
254 movq $512, %rcx 254 movl $512, %ecx
255 rep ; movsq 255 rep ; movsq
256 256
257 movq %rdx, %rdi 257 movq %rdx, %rdi
258 movq %r10, %rsi 258 movq %r10, %rsi
259 movq $512, %rcx 259 movl $512, %ecx
260 rep ; movsq 260 rep ; movsq
261 261
262 lea PAGE_SIZE(%rax), %rsi 262 lea PAGE_SIZE(%rax), %rsi
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421cca01f..014466b152b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
832static int 832static int
833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) 833dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
834{ 834{
835 pr_emerg("Kernel Offset: 0x%lx from 0x%lx " 835 if (kaslr_enabled()) {
836 "(relocation range: 0x%lx-0x%lx)\n", 836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
837 (unsigned long)&_text - __START_KERNEL, __START_KERNEL, 837 (unsigned long)&_text - __START_KERNEL,
838 __START_KERNEL_map, MODULES_VADDR-1); 838 __START_KERNEL,
839 __START_KERNEL_map,
840 MODULES_VADDR-1);
841 } else {
842 pr_emerg("Kernel Offset: disabled\n");
843 }
839 844
840 return 0; 845 return 0;
841} 846}
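(The setup.c hunk above only reports a randomization offset when KASLR actually took effect, and says so otherwise. A hedged userspace C sketch of that reporting logic; kaslr_enabled() and every address below are faked purely for illustration.)

    #include <stdio.h>
    #include <stdbool.h>

    static bool kaslr_enabled(void) { return true; }    /* stand-in for the kernel's boot-time flag */

    static void dump_kernel_offset(unsigned long text, unsigned long start_kernel,
                                   unsigned long map_base, unsigned long modules_vaddr)
    {
            if (kaslr_enabled())
                    printf("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
                           text - start_kernel, start_kernel, map_base, modules_vaddr - 1);
            else
                    printf("Kernel Offset: disabled\n");
    }

    int main(void)
    {
            dump_kernel_offset(0xffffffff81200000UL, 0xffffffff81000000UL,
                               0xffffffff80000000UL, 0xffffffffc0000000UL);
            return 0;
    }
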
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..53cc4085c3d7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
61 regs->seg = GET_SEG(seg) | 3; \ 61 regs->seg = GET_SEG(seg) | 3; \
62} while (0) 62} while (0)
63 63
64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
65 unsigned long *pax)
66{ 65{
67 void __user *buf; 66 void __user *buf;
68 unsigned int tmpflags; 67 unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
81#endif /* CONFIG_X86_32 */ 80#endif /* CONFIG_X86_32 */
82 81
83 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 82 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
84 COPY(dx); COPY(cx); COPY(ip); 83 COPY(dx); COPY(cx); COPY(ip); COPY(ax);
85 84
86#ifdef CONFIG_X86_64 85#ifdef CONFIG_X86_64
87 COPY(r8); 86 COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
94 COPY(r15); 93 COPY(r15);
95#endif /* CONFIG_X86_64 */ 94#endif /* CONFIG_X86_64 */
96 95
97#ifdef CONFIG_X86_32
98 COPY_SEG_CPL3(cs); 96 COPY_SEG_CPL3(cs);
99 COPY_SEG_CPL3(ss); 97 COPY_SEG_CPL3(ss);
100#else /* !CONFIG_X86_32 */
101 /* Kernel saves and restores only the CS segment register on signals,
102 * which is the bare minimum needed to allow mixed 32/64-bit code.
103 * App's signal handler can save/restore other segments if needed. */
104 COPY_SEG_CPL3(cs);
105#endif /* CONFIG_X86_32 */
106 98
107 get_user_ex(tmpflags, &sc->flags); 99 get_user_ex(tmpflags, &sc->flags);
108 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 100 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
109 regs->orig_ax = -1; /* disable syscall checks */ 101 regs->orig_ax = -1; /* disable syscall checks */
110 102
111 get_user_ex(buf, &sc->fpstate); 103 get_user_ex(buf, &sc->fpstate);
112
113 get_user_ex(*pax, &sc->ax);
114 } get_user_catch(err); 104 } get_user_catch(err);
115 105
116 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); 106 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
117 107
108 force_iret();
109
118 return err; 110 return err;
119} 111}
120 112
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
162#else /* !CONFIG_X86_32 */ 154#else /* !CONFIG_X86_32 */
163 put_user_ex(regs->flags, &sc->flags); 155 put_user_ex(regs->flags, &sc->flags);
164 put_user_ex(regs->cs, &sc->cs); 156 put_user_ex(regs->cs, &sc->cs);
165 put_user_ex(0, &sc->gs); 157 put_user_ex(0, &sc->__pad2);
166 put_user_ex(0, &sc->fs); 158 put_user_ex(0, &sc->__pad1);
159 put_user_ex(regs->ss, &sc->ss);
167#endif /* CONFIG_X86_32 */ 160#endif /* CONFIG_X86_32 */
168 161
169 put_user_ex(fpstate, &sc->fpstate); 162 put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
457 450
458 regs->sp = (unsigned long)frame; 451 regs->sp = (unsigned long)frame;
459 452
460 /* Set up the CS register to run signal handlers in 64-bit mode, 453 /*
461 even if the handler happens to be interrupting 32-bit code. */ 454 * Set up the CS and SS registers to run signal handlers in
455 * 64-bit mode, even if the handler happens to be interrupting
456 * 32-bit or 16-bit code.
457 *
458 * SS is subtle. In 64-bit mode, we don't need any particular
459 * SS descriptor, but we do need SS to be valid. It's possible
460 * that the old SS is entirely bogus -- this can happen if the
461 * signal we're trying to deliver is #GP or #SS caused by a bad
462 * SS value.
463 */
462 regs->cs = __USER_CS; 464 regs->cs = __USER_CS;
465 regs->ss = __USER_DS;
463 466
464 return 0; 467 return 0;
465} 468}
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
539{ 542{
540 struct pt_regs *regs = current_pt_regs(); 543 struct pt_regs *regs = current_pt_regs();
541 struct sigframe __user *frame; 544 struct sigframe __user *frame;
542 unsigned long ax;
543 sigset_t set; 545 sigset_t set;
544 546
545 frame = (struct sigframe __user *)(regs->sp - 8); 547 frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
553 555
554 set_current_blocked(&set); 556 set_current_blocked(&set);
555 557
556 if (restore_sigcontext(regs, &frame->sc, &ax)) 558 if (restore_sigcontext(regs, &frame->sc))
557 goto badframe; 559 goto badframe;
558 return ax; 560 return regs->ax;
559 561
560badframe: 562badframe:
561 signal_fault(regs, frame, "sigreturn"); 563 signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
568{ 570{
569 struct pt_regs *regs = current_pt_regs(); 571 struct pt_regs *regs = current_pt_regs();
570 struct rt_sigframe __user *frame; 572 struct rt_sigframe __user *frame;
571 unsigned long ax;
572 sigset_t set; 573 sigset_t set;
573 574
574 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 575 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void)
579 580
580 set_current_blocked(&set); 581 set_current_blocked(&set);
581 582
582 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 583 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
583 goto badframe; 584 goto badframe;
584 585
585 if (restore_altstack(&frame->uc.uc_stack)) 586 if (restore_altstack(&frame->uc.uc_stack))
586 goto badframe; 587 goto badframe;
587 588
588 return ax; 589 return regs->ax;
589 590
590badframe: 591badframe:
591 signal_fault(regs, frame, "rt_sigreturn"); 592 signal_fault(regs, frame, "rt_sigreturn");
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
780 struct pt_regs *regs = current_pt_regs(); 781 struct pt_regs *regs = current_pt_regs();
781 struct rt_sigframe_x32 __user *frame; 782 struct rt_sigframe_x32 __user *frame;
782 sigset_t set; 783 sigset_t set;
783 unsigned long ax;
784 784
785 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 785 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
786 786
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
791 791
792 set_current_blocked(&set); 792 set_current_blocked(&set);
793 793
794 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 794 if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
795 goto badframe; 795 goto badframe;
796 796
797 if (compat_restore_altstack(&frame->uc.uc_stack)) 797 if (compat_restore_altstack(&frame->uc.uc_stack))
798 goto badframe; 798 goto badframe;
799 799
800 return ax; 800 return regs->ax;
801 801
802badframe: 802badframe:
803 signal_fault(regs, frame, "x32 rt_sigreturn"); 803 signal_fault(regs, frame, "x32 rt_sigreturn");
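(The signal.c changes drop the extra ax out-parameter: restore_sigcontext() now copies the saved %ax into the register frame along with the other registers, and each sigreturn variant simply returns regs->ax. A compact userspace C sketch of that flow; the struct names and fields are made up, only the control and data flow mirror the patch.)

    #include <stdio.h>

    struct fake_regs { unsigned long ax, ip, sp; };
    struct fake_sigcontext { unsigned long ax, ip, sp; };

    static int restore_sigcontext(struct fake_regs *regs, const struct fake_sigcontext *sc)
    {
            regs->ax = sc->ax;   /* COPY(ax) now happens with the other registers */
            regs->ip = sc->ip;
            regs->sp = sc->sp;
            return 0;
    }

    static long sys_sigreturn(struct fake_regs *regs, const struct fake_sigcontext *sc)
    {
            if (restore_sigcontext(regs, sc))
                    return -1;
            return regs->ax;     /* no separate "unsigned long ax" local needed */
    }

    int main(void)
    {
            struct fake_regs regs = { 0 };
            struct fake_sigcontext sc = { .ax = 42, .ip = 0x400000, .sp = 0x7ffd0000 };
            printf("sigreturn -> %ld\n", sys_sigreturn(&regs, &sc));
            return 0;
    }
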
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ddd2c0674cda..7035f6b21c3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -779,6 +779,26 @@ out:
779 return boot_error; 779 return boot_error;
780} 780}
781 781
782void common_cpu_up(unsigned int cpu, struct task_struct *idle)
783{
784 /* Just in case we booted with a single CPU. */
785 alternatives_enable_smp();
786
787 per_cpu(current_task, cpu) = idle;
788
789#ifdef CONFIG_X86_32
790 /* Stack for startup_32 can be just as for start_secondary onwards */
791 irq_ctx_init(cpu);
792 per_cpu(cpu_current_top_of_stack, cpu) =
793 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
794#else
795 clear_tsk_thread_flag(idle, TIF_FORK);
796 initial_gs = per_cpu_offset(cpu);
797#endif
798 per_cpu(kernel_stack, cpu) =
799 (unsigned long)task_stack_page(idle) + THREAD_SIZE;
800}
801
782/* 802/*
783 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 803 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
784 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 804 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
796 int cpu0_nmi_registered = 0; 816 int cpu0_nmi_registered = 0;
797 unsigned long timeout; 817 unsigned long timeout;
798 818
799 /* Just in case we booted with a single CPU. */
800 alternatives_enable_smp();
801
802 idle->thread.sp = (unsigned long) (((struct pt_regs *) 819 idle->thread.sp = (unsigned long) (((struct pt_regs *)
803 (THREAD_SIZE + task_stack_page(idle))) - 1); 820 (THREAD_SIZE + task_stack_page(idle))) - 1);
804 per_cpu(current_task, cpu) = idle;
805 821
806#ifdef CONFIG_X86_32
807 /* Stack for startup_32 can be just as for start_secondary onwards */
808 irq_ctx_init(cpu);
809#else
810 clear_tsk_thread_flag(idle, TIF_FORK);
811 initial_gs = per_cpu_offset(cpu);
812#endif
813 per_cpu(kernel_stack, cpu) =
814 (unsigned long)task_stack_page(idle) -
815 KERNEL_STACK_OFFSET + THREAD_SIZE;
816 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 822 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
817 initial_code = (unsigned long)start_secondary; 823 initial_code = (unsigned long)start_secondary;
818 stack_start = idle->thread.sp; 824 stack_start = idle->thread.sp;
@@ -953,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
953 /* the FPU context is blank, nobody can own it */ 959 /* the FPU context is blank, nobody can own it */
954 __cpu_disable_lazy_restore(cpu); 960 __cpu_disable_lazy_restore(cpu);
955 961
962 common_cpu_up(cpu, tidle);
963
956 err = do_boot_cpu(apicid, cpu, tidle); 964 err = do_boot_cpu(apicid, cpu, tidle);
957 if (err) { 965 if (err) {
958 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); 966 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; 8#ifdef CONFIG_IA32_EMULATION
9#define SYM(sym, compat) compat
10#else
11#define SYM(sym, compat) sym
12#define ia32_sys_call_table sys_call_table
13#define __NR_ia32_syscall_max __NR_syscall_max
14#endif
15
16#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
9#include <asm/syscalls_32.h> 17#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386 18#undef __SYSCALL_I386
11 19
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, 20#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
13 21
14typedef asmlinkage void (*sys_call_ptr_t)(void); 22typedef asmlinkage void (*sys_call_ptr_t)(void);
15 23
16extern asmlinkage void sys_ni_syscall(void); 24extern asmlinkage void sys_ni_syscall(void);
17 25
18__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 26__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
19 /* 27 /*
20 * Smells like a compiler bug -- it doesn't work 28 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed. 29 * when the & below is removed.
22 */ 30 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall, 31 [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h> 32#include <asm/syscalls_32.h>
25}; 33};
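(The syscall_32.c rework above builds the 32-bit table by including the generated syscall list twice and letting SYM() pick the compat entry point when IA32 emulation is enabled. A self-contained sketch of the same two-pass X-macro pattern, using the GNU C range designator the kernel also relies on; the two-entry MY_SYSCALL_LIST stands in for asm/syscalls_32.h and every name here is hypothetical.)

    #include <stdio.h>

    #define MY_SYSCALL_LIST(X)                      \
            X(0, sys_read, compat_sys_read)         \
            X(1, sys_write, compat_sys_write)

    #define IA32_EMULATION 1
    #if IA32_EMULATION
    # define SYM(sym, compat) compat
    #else
    # define SYM(sym, compat) sym
    #endif

    /* pass 1: define (in the kernel: declare) each selected handler */
    #define DEFINE_STUB(nr, sym, compat) \
            static void SYM(sym, compat)(void) { puts("syscall " #nr ": " #sym "/" #compat); }
    MY_SYSCALL_LIST(DEFINE_STUB)
    #undef DEFINE_STUB

    typedef void (*sys_call_ptr_t)(void);
    static void sys_ni_syscall(void) { puts("ni_syscall"); }

    /* pass 2: fill the table, defaulting every slot to sys_ni_syscall first (GNU C extension) */
    static const sys_call_ptr_t ia32_sys_call_table[] = {
            [0 ... 1] = sys_ni_syscall,
    #define TABLE_ENTRY(nr, sym, compat) [nr] = SYM(sym, compat),
            MY_SYSCALL_LIST(TABLE_ENTRY)
    #undef TABLE_ENTRY
    };

    int main(void)
    {
            ia32_sys_call_table[0]();
            ia32_sys_call_table[1]();
            return 0;
    }
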
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e16eaa..d39c09119db6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
30{ 30{
31 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
32 32
33 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 33 if (!user_mode(regs) && in_lock_functions(pc)) {
34#ifdef CONFIG_FRAME_POINTER 34#ifdef CONFIG_FRAME_POINTER
35 return *(unsigned long *)(regs->bp + sizeof(long)); 35 return *(unsigned long *)(regs->bp + sizeof(long));
36#else 36#else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d162ff9f..6751c5c58eec 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
112{ 112{
113 enum ctx_state prev_state; 113 enum ctx_state prev_state;
114 114
115 if (user_mode_vm(regs)) { 115 if (user_mode(regs)) {
116 /* Other than that, we're just an exception. */ 116 /* Other than that, we're just an exception. */
117 prev_state = exception_enter(); 117 prev_state = exception_enter();
118 } else { 118 } else {
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
146 /* Must be before exception_exit. */ 146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET); 147 preempt_count_sub(HARDIRQ_OFFSET);
148 148
149 if (user_mode_vm(regs)) 149 if (user_mode(regs))
150 return exception_exit(prev_state); 150 return exception_exit(prev_state);
151 else 151 else
152 rcu_nmi_exit(); 152 rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
158 * 158 *
159 * IST exception handlers normally cannot schedule. As a special 159 * IST exception handlers normally cannot schedule. As a special
160 * exception, if the exception interrupted userspace code (i.e. 160 * exception, if the exception interrupted userspace code (i.e.
161 * user_mode_vm(regs) would return true) and the exception was not 161 * user_mode(regs) would return true) and the exception was not
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic() 162 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 163 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside 164 * Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
167 */ 167 */
168void ist_begin_non_atomic(struct pt_regs *regs) 168void ist_begin_non_atomic(struct pt_regs *regs)
169{ 169{
170 BUG_ON(!user_mode_vm(regs)); 170 BUG_ON(!user_mode(regs));
171 171
172 /* 172 /*
173 * Sanity check: we need to be on the normal thread stack. This 173 * Sanity check: we need to be on the normal thread stack. This
174 * will catch asm bugs and any attempt to use ist_preempt_enable 174 * will catch asm bugs and any attempt to use ist_preempt_enable
175 * from double_fault. 175 * from double_fault.
176 */ 176 */
177 BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) 177 BUG_ON((unsigned long)(current_top_of_stack() -
178 & ~(THREAD_SIZE - 1)) != 0); 178 current_stack_pointer()) >= THREAD_SIZE);
179 179
180 preempt_count_sub(HARDIRQ_OFFSET); 180 preempt_count_sub(HARDIRQ_OFFSET);
181} 181}
@@ -194,8 +194,7 @@ static nokprobe_inline int
194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
195 struct pt_regs *regs, long error_code) 195 struct pt_regs *regs, long error_code)
196{ 196{
197#ifdef CONFIG_X86_32 197 if (v8086_mode(regs)) {
198 if (regs->flags & X86_VM_MASK) {
199 /* 198 /*
200 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 199 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
201 * On nmi (interrupt 2), do_trap should not be called. 200 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
207 } 206 }
208 return -1; 207 return -1;
209 } 208 }
210#endif 209
211 if (!user_mode(regs)) { 210 if (!user_mode(regs)) {
212 if (!fixup_exception(regs)) { 211 if (!fixup_exception(regs)) {
213 tsk->thread.error_code = error_code; 212 tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
384 goto exit; 383 goto exit;
385 conditional_sti(regs); 384 conditional_sti(regs);
386 385
387 if (!user_mode_vm(regs)) 386 if (!user_mode(regs))
388 die("bounds", regs, error_code); 387 die("bounds", regs, error_code);
389 388
390 if (!cpu_feature_enabled(X86_FEATURE_MPX)) { 389 if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
462 prev_state = exception_enter(); 461 prev_state = exception_enter();
463 conditional_sti(regs); 462 conditional_sti(regs);
464 463
465#ifdef CONFIG_X86_32 464 if (v8086_mode(regs)) {
466 if (regs->flags & X86_VM_MASK) {
467 local_irq_enable(); 465 local_irq_enable();
468 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 466 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
469 goto exit; 467 goto exit;
470 } 468 }
471#endif
472 469
473 tsk = current; 470 tsk = current;
474 if (!user_mode(regs)) { 471 if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
587 /* Copy the remainder of the stack from the current stack. */ 584 /* Copy the remainder of the stack from the current stack. */
588 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); 585 memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
589 586
590 BUG_ON(!user_mode_vm(&new_stack->regs)); 587 BUG_ON(!user_mode(&new_stack->regs));
591 return new_stack; 588 return new_stack;
592} 589}
593NOKPROBE_SYMBOL(fixup_bad_iret); 590NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
637 * then it's very likely the result of an icebp/int01 trap. 634 * then it's very likely the result of an icebp/int01 trap.
638 * User wants a sigtrap for that. 635 * User wants a sigtrap for that.
639 */ 636 */
640 if (!dr6 && user_mode_vm(regs)) 637 if (!dr6 && user_mode(regs))
641 user_icebp = 1; 638 user_icebp = 1;
642 639
643 /* Catch kmemcheck conditions first of all! */ 640 /* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
673 /* It's safe to allow irq's after DR6 has been saved */ 670 /* It's safe to allow irq's after DR6 has been saved */
674 preempt_conditional_sti(regs); 671 preempt_conditional_sti(regs);
675 672
676 if (regs->flags & X86_VM_MASK) { 673 if (v8086_mode(regs)) {
677 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 674 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
678 X86_TRAP_DB); 675 X86_TRAP_DB);
679 preempt_conditional_cli(regs); 676 preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
721 return; 718 return;
722 conditional_sti(regs); 719 conditional_sti(regs);
723 720
724 if (!user_mode_vm(regs)) 721 if (!user_mode(regs))
725 { 722 {
726 if (!fixup_exception(regs)) { 723 if (!fixup_exception(regs)) {
727 task->thread.error_code = error_code; 724 task->thread.error_code = error_code;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
925/* Set of traps needed for early debugging. */ 922/* Set of traps needed for early debugging. */
926void __init early_trap_init(void) 923void __init early_trap_init(void)
927{ 924{
928 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 925 /*
926 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
927 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
928 * CPU runs at ring 0 so it is impossible to hit an invalid
929 * stack. Using the original stack works well enough at this
930 * early stage. DEBUG_STACK will be equipped after cpu_init() in
931 * trap_init().
932 *
933 * We don't need to set trace_idt_table like set_intr_gate(),
934 * since we don't have trace_debug and it will be reset to
935 * 'debug' in trap_init() by set_intr_gate_ist().
936 */
937 set_intr_gate_notrace(X86_TRAP_DB, debug);
929 /* int3 can be called from all */ 938 /* int3 can be called from all */
930 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 939 set_system_intr_gate(X86_TRAP_BP, &int3);
931#ifdef CONFIG_X86_32 940#ifdef CONFIG_X86_32
932 set_intr_gate(X86_TRAP_PF, page_fault); 941 set_intr_gate(X86_TRAP_PF, page_fault);
933#endif 942#endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
1005 */ 1014 */
1006 cpu_init(); 1015 cpu_init();
1007 1016
1017 /*
1018 * X86_TRAP_DB and X86_TRAP_BP have been set
1019 * in early_trap_init(). However, ITS works only after
1020 * cpu_init() loads TSS. See comments in early_trap_init().
1021 */
1022 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
1023 /* int3 can be called from all */
1024 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
1025
1008 x86_init.irqs.trap_init(); 1026 x86_init.irqs.trap_init();
1009 1027
1010#ifdef CONFIG_X86_64 1028#ifdef CONFIG_X86_64
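(ist_begin_non_atomic() above now checks that the saved stack pointer lies within THREAD_SIZE of the recorded top of stack, instead of masking both values down to the same aligned block. A tiny userspace C sketch of that bounds check; the addresses and THREAD_SIZE below are invented for illustration.)

    #include <stdio.h>
    #include <stdbool.h>

    #define THREAD_SIZE (16UL * 1024)

    static bool on_thread_stack(unsigned long top_of_stack, unsigned long sp)
    {
            /* unsigned math: an sp above the top also yields a huge distance and fails */
            return (top_of_stack - sp) < THREAD_SIZE;
    }

    int main(void)
    {
            unsigned long top = 0xffff880012346000UL;                     /* task_stack_page + THREAD_SIZE */
            printf("%d\n", on_thread_stack(top, top - 0x100));            /* 1: on the thread stack */
            printf("%d\n", on_thread_stack(top, top - 5 * THREAD_SIZE));  /* 0: some other stack */
            return 0;
    }
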
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0679e..0b81ad67da07 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
912 int ret = NOTIFY_DONE; 912 int ret = NOTIFY_DONE;
913 913
914 /* We are only interested in userspace traps */ 914 /* We are only interested in userspace traps */
915 if (regs && !user_mode_vm(regs)) 915 if (regs && !user_mode(regs))
916 return NOTIFY_DONE; 916 return NOTIFY_DONE;
917 917
918 switch (val) { 918 switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
150 do_exit(SIGSEGV); 150 do_exit(SIGSEGV);
151 } 151 }
152 152
153 tss = &per_cpu(init_tss, get_cpu()); 153 tss = &per_cpu(cpu_tss, get_cpu());
154 current->thread.sp0 = current->thread.saved_sp0; 154 current->thread.sp0 = current->thread.saved_sp0;
155 current->thread.sysenter_cs = __KERNEL_CS; 155 current->thread.sysenter_cs = __KERNEL_CS;
156 load_sp0(tss, &current->thread); 156 load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 tsk->thread.saved_fs = info->regs32->fs; 318 tsk->thread.saved_fs = info->regs32->fs;
319 tsk->thread.saved_gs = get_user_gs(info->regs32); 319 tsk->thread.saved_gs = get_user_gs(info->regs32);
320 320
321 tss = &per_cpu(init_tss, get_cpu()); 321 tss = &per_cpu(cpu_tss, get_cpu());
322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
323 if (cpu_has_sep) 323 if (cpu_has_sep)
324 tsk->thread.sysenter_cs = 0; 324 tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..717908b16037 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void)
868 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 868 /* Some systems map "vectors" to interrupts weirdly. Not us! */
869 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); 869 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
870 if (i != SYSCALL_VECTOR) 870 if (i != SYSCALL_VECTOR)
871 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 871 set_intr_gate(i, irq_entries_start +
872 8 * (i - FIRST_EXTERNAL_VECTOR));
872 } 873 }
873 874
874 /* 875 /*
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
1076{ 1077{
1077 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, 1078 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
1078 THREAD_SIZE / PAGE_SIZE); 1079 THREAD_SIZE / PAGE_SIZE);
1080 tss->x86_tss.sp0 = thread->sp0;
1079} 1081}
1080 1082
1081/* Let's just say, I wouldn't do debugging under a Guest. */ 1083/* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb1d51b..082a85167a5b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
13#include <asm/alternative-asm.h> 13#include <asm/alternative-asm.h>
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg
17 pushl_cfi %\reg
18 CFI_REL_OFFSET \reg, 0
19.endm
20
21.macro RESTORE reg
22 popl_cfi %\reg
23 CFI_RESTORE \reg
24.endm
25
26.macro read64 reg 16.macro read64 reg
27 movl %ebx, %eax 17 movl %ebx, %eax
28 movl %ecx, %edx 18 movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
67.macro addsub_return func ins insc 57.macro addsub_return func ins insc
68ENTRY(atomic64_\func\()_return_cx8) 58ENTRY(atomic64_\func\()_return_cx8)
69 CFI_STARTPROC 59 CFI_STARTPROC
70 SAVE ebp 60 pushl_cfi_reg ebp
71 SAVE ebx 61 pushl_cfi_reg ebx
72 SAVE esi 62 pushl_cfi_reg esi
73 SAVE edi 63 pushl_cfi_reg edi
74 64
75 movl %eax, %esi 65 movl %eax, %esi
76 movl %edx, %edi 66 movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
8910: 7910:
90 movl %ebx, %eax 80 movl %ebx, %eax
91 movl %ecx, %edx 81 movl %ecx, %edx
92 RESTORE edi 82 popl_cfi_reg edi
93 RESTORE esi 83 popl_cfi_reg esi
94 RESTORE ebx 84 popl_cfi_reg ebx
95 RESTORE ebp 85 popl_cfi_reg ebp
96 ret 86 ret
97 CFI_ENDPROC 87 CFI_ENDPROC
98ENDPROC(atomic64_\func\()_return_cx8) 88ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
104.macro incdec_return func ins insc 94.macro incdec_return func ins insc
105ENTRY(atomic64_\func\()_return_cx8) 95ENTRY(atomic64_\func\()_return_cx8)
106 CFI_STARTPROC 96 CFI_STARTPROC
107 SAVE ebx 97 pushl_cfi_reg ebx
108 98
109 read64 %esi 99 read64 %esi
1101: 1001:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
11910: 10910:
120 movl %ebx, %eax 110 movl %ebx, %eax
121 movl %ecx, %edx 111 movl %ecx, %edx
122 RESTORE ebx 112 popl_cfi_reg ebx
123 ret 113 ret
124 CFI_ENDPROC 114 CFI_ENDPROC
125ENDPROC(atomic64_\func\()_return_cx8) 115ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
130 120
131ENTRY(atomic64_dec_if_positive_cx8) 121ENTRY(atomic64_dec_if_positive_cx8)
132 CFI_STARTPROC 122 CFI_STARTPROC
133 SAVE ebx 123 pushl_cfi_reg ebx
134 124
135 read64 %esi 125 read64 %esi
1361: 1261:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
1462: 1362:
147 movl %ebx, %eax 137 movl %ebx, %eax
148 movl %ecx, %edx 138 movl %ecx, %edx
149 RESTORE ebx 139 popl_cfi_reg ebx
150 ret 140 ret
151 CFI_ENDPROC 141 CFI_ENDPROC
152ENDPROC(atomic64_dec_if_positive_cx8) 142ENDPROC(atomic64_dec_if_positive_cx8)
153 143
154ENTRY(atomic64_add_unless_cx8) 144ENTRY(atomic64_add_unless_cx8)
155 CFI_STARTPROC 145 CFI_STARTPROC
156 SAVE ebp 146 pushl_cfi_reg ebp
157 SAVE ebx 147 pushl_cfi_reg ebx
158/* these just push these two parameters on the stack */ 148/* these just push these two parameters on the stack */
159 SAVE edi 149 pushl_cfi_reg edi
160 SAVE ecx 150 pushl_cfi_reg ecx
161 151
162 movl %eax, %ebp 152 movl %eax, %ebp
163 movl %edx, %edi 153 movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
1793: 1693:
180 addl $8, %esp 170 addl $8, %esp
181 CFI_ADJUST_CFA_OFFSET -8 171 CFI_ADJUST_CFA_OFFSET -8
182 RESTORE ebx 172 popl_cfi_reg ebx
183 RESTORE ebp 173 popl_cfi_reg ebp
184 ret 174 ret
1854: 1754:
186 cmpl %edx, 4(%esp) 176 cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
192 182
193ENTRY(atomic64_inc_not_zero_cx8) 183ENTRY(atomic64_inc_not_zero_cx8)
194 CFI_STARTPROC 184 CFI_STARTPROC
195 SAVE ebx 185 pushl_cfi_reg ebx
196 186
197 read64 %esi 187 read64 %esi
1981: 1881:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
209 199
210 movl $1, %eax 200 movl $1, %eax
2113: 2013:
212 RESTORE ebx 202 popl_cfi_reg ebx
213 ret 203 ret
214 CFI_ENDPROC 204 CFI_ENDPROC
215ENDPROC(atomic64_inc_not_zero_cx8) 205ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8eee6615..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
51 */ 51 */
52ENTRY(csum_partial) 52ENTRY(csum_partial)
53 CFI_STARTPROC 53 CFI_STARTPROC
54 pushl_cfi %esi 54 pushl_cfi_reg esi
55 CFI_REL_OFFSET esi, 0 55 pushl_cfi_reg ebx
56 pushl_cfi %ebx
57 CFI_REL_OFFSET ebx, 0
58 movl 20(%esp),%eax # Function arg: unsigned int sum 56 movl 20(%esp),%eax # Function arg: unsigned int sum
59 movl 16(%esp),%ecx # Function arg: int len 57 movl 16(%esp),%ecx # Function arg: int len
60 movl 12(%esp),%esi # Function arg: unsigned char *buff 58 movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -127,14 +125,12 @@ ENTRY(csum_partial)
1276: addl %ecx,%eax 1256: addl %ecx,%eax
128 adcl $0, %eax 126 adcl $0, %eax
1297: 1277:
130 testl $1, 12(%esp) 128 testb $1, 12(%esp)
131 jz 8f 129 jz 8f
132 roll $8, %eax 130 roll $8, %eax
1338: 1318:
134 popl_cfi %ebx 132 popl_cfi_reg ebx
135 CFI_RESTORE ebx 133 popl_cfi_reg esi
136 popl_cfi %esi
137 CFI_RESTORE esi
138 ret 134 ret
139 CFI_ENDPROC 135 CFI_ENDPROC
140ENDPROC(csum_partial) 136ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
145 141
146ENTRY(csum_partial) 142ENTRY(csum_partial)
147 CFI_STARTPROC 143 CFI_STARTPROC
148 pushl_cfi %esi 144 pushl_cfi_reg esi
149 CFI_REL_OFFSET esi, 0 145 pushl_cfi_reg ebx
150 pushl_cfi %ebx
151 CFI_REL_OFFSET ebx, 0
152 movl 20(%esp),%eax # Function arg: unsigned int sum 146 movl 20(%esp),%eax # Function arg: unsigned int sum
153 movl 16(%esp),%ecx # Function arg: int len 147 movl 16(%esp),%ecx # Function arg: int len
154 movl 12(%esp),%esi # Function arg: const unsigned char *buf 148 movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -251,14 +245,12 @@ ENTRY(csum_partial)
251 addl %ebx,%eax 245 addl %ebx,%eax
252 adcl $0,%eax 246 adcl $0,%eax
25380: 24780:
254 testl $1, 12(%esp) 248 testb $1, 12(%esp)
255 jz 90f 249 jz 90f
256 roll $8, %eax 250 roll $8, %eax
25790: 25190:
258 popl_cfi %ebx 252 popl_cfi_reg ebx
259 CFI_RESTORE ebx 253 popl_cfi_reg esi
260 popl_cfi %esi
261 CFI_RESTORE esi
262 ret 254 ret
263 CFI_ENDPROC 255 CFI_ENDPROC
264ENDPROC(csum_partial) 256ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
298 CFI_STARTPROC 290 CFI_STARTPROC
299 subl $4,%esp 291 subl $4,%esp
300 CFI_ADJUST_CFA_OFFSET 4 292 CFI_ADJUST_CFA_OFFSET 4
301 pushl_cfi %edi 293 pushl_cfi_reg edi
302 CFI_REL_OFFSET edi, 0 294 pushl_cfi_reg esi
303 pushl_cfi %esi 295 pushl_cfi_reg ebx
304 CFI_REL_OFFSET esi, 0
305 pushl_cfi %ebx
306 CFI_REL_OFFSET ebx, 0
307 movl ARGBASE+16(%esp),%eax # sum 296 movl ARGBASE+16(%esp),%eax # sum
308 movl ARGBASE+12(%esp),%ecx # len 297 movl ARGBASE+12(%esp),%ecx # len
309 movl ARGBASE+4(%esp),%esi # src 298 movl ARGBASE+4(%esp),%esi # src
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) )
412 401
413.previous 402.previous
414 403
415 popl_cfi %ebx 404 popl_cfi_reg ebx
416 CFI_RESTORE ebx 405 popl_cfi_reg esi
417 popl_cfi %esi 406 popl_cfi_reg edi
418 CFI_RESTORE esi
419 popl_cfi %edi
420 CFI_RESTORE edi
421 popl_cfi %ecx # equivalent to addl $4,%esp 407 popl_cfi %ecx # equivalent to addl $4,%esp
422 ret 408 ret
423 CFI_ENDPROC 409 CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
441 427
442ENTRY(csum_partial_copy_generic) 428ENTRY(csum_partial_copy_generic)
443 CFI_STARTPROC 429 CFI_STARTPROC
444 pushl_cfi %ebx 430 pushl_cfi_reg ebx
445 CFI_REL_OFFSET ebx, 0 431 pushl_cfi_reg edi
446 pushl_cfi %edi 432 pushl_cfi_reg esi
447 CFI_REL_OFFSET edi, 0
448 pushl_cfi %esi
449 CFI_REL_OFFSET esi, 0
450 movl ARGBASE+4(%esp),%esi #src 433 movl ARGBASE+4(%esp),%esi #src
451 movl ARGBASE+8(%esp),%edi #dst 434 movl ARGBASE+8(%esp),%edi #dst
452 movl ARGBASE+12(%esp),%ecx #len 435 movl ARGBASE+12(%esp),%ecx #len
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) )
506 jmp 7b 489 jmp 7b
507.previous 490.previous
508 491
509 popl_cfi %esi 492 popl_cfi_reg esi
510 CFI_RESTORE esi 493 popl_cfi_reg edi
511 popl_cfi %edi 494 popl_cfi_reg ebx
512 CFI_RESTORE edi
513 popl_cfi %ebx
514 CFI_RESTORE ebx
515 ret 495 ret
516 CFI_ENDPROC 496 CFI_ENDPROC
517ENDPROC(csum_partial_copy_generic) 497ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/cpufeature.h>
3#include <asm/alternative-asm.h> 4#include <asm/alternative-asm.h>
4 5
5/* 6/*
6 * Zero a page. 7 * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
7 * rdi page 8 * recommended to use this when possible and we do use them by default.
8 */ 9 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
9ENTRY(clear_page_c) 10 * Otherwise, use original.
11 */
12
13/*
14 * Zero a page.
15 * %rdi - page
16 */
17ENTRY(clear_page)
10 CFI_STARTPROC 18 CFI_STARTPROC
19
20 ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
21 "jmp clear_page_c_e", X86_FEATURE_ERMS
22
11 movl $4096/8,%ecx 23 movl $4096/8,%ecx
12 xorl %eax,%eax 24 xorl %eax,%eax
13 rep stosq 25 rep stosq
14 ret 26 ret
15 CFI_ENDPROC 27 CFI_ENDPROC
16ENDPROC(clear_page_c) 28ENDPROC(clear_page)
17 29
18ENTRY(clear_page_c_e) 30ENTRY(clear_page_orig)
19 CFI_STARTPROC 31 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26 32
27ENTRY(clear_page)
28 CFI_STARTPROC
29 xorl %eax,%eax 33 xorl %eax,%eax
30 movl $4096/64,%ecx 34 movl $4096/64,%ecx
31 .p2align 4 35 .p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
45 nop 49 nop
46 ret 50 ret
47 CFI_ENDPROC 51 CFI_ENDPROC
48.Lclear_page_end: 52ENDPROC(clear_page_orig)
49ENDPROC(clear_page)
50
51 /*
52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
58 53
59#include <asm/cpufeature.h> 54ENTRY(clear_page_c_e)
60 55 CFI_STARTPROC
61 .section .altinstr_replacement,"ax" 56 movl $4096,%ecx
621: .byte 0xeb /* jmp <disp8> */ 57 xorl %eax,%eax
63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 58 rep stosb
642: .byte 0xeb /* jmp <disp8> */ 59 ret
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ 60 CFI_ENDPROC
663: 61ENDPROC(clear_page_c_e)
67 .previous
68 .section .altinstructions,"a"
69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
70 .Lclear_page_end-clear_page, 2b-1b
71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
72 .Lclear_page_end-clear_page,3b-2b
73 .previous
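(The clear_page rewrite above keeps a single public entry point and lets ALTERNATIVE_2 patch its first instruction at boot: prefer the ERMS rep-stosb variant, fall back to rep stosq on REP_GOOD CPUs, and jump to the unrolled original otherwise. The real mechanism rewrites the jmp in place; the sketch below is only a runtime C analogy of that three-way selection, with made-up feature flags and memset standing in for the page-clearing loops.)

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    struct cpu_features { bool rep_good, erms; };

    static void clear_page_orig(void *p) { memset(p, 0, 4096); puts("unrolled loop"); }
    static void clear_page_rep(void *p)  { memset(p, 0, 4096); puts("rep stosq"); }
    static void clear_page_erms(void *p) { memset(p, 0, 4096); puts("rep stosb"); }

    static void clear_page(const struct cpu_features *c, void *p)
    {
            if (c->erms)
                    clear_page_erms(p);     /* best: enhanced REP STOSB */
            else if (c->rep_good)
                    clear_page_rep(p);      /* next: fast string rep stosq */
            else
                    clear_page_orig(p);     /* fallback: unrolled stores */
    }

    int main(void)
    {
            static char page[4096];
            struct cpu_features c = { .rep_good = true, .erms = true };
            clear_page(&c, page);
            return 0;
    }

The same ALTERNATIVE/ALTERNATIVE_2 pattern replaces the hand-rolled .altinstr_replacement sections in the copy_page, copy_user, memcpy, memmove and memset hunks that follow.
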
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
5#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
6 7
8/*
9 * Some CPUs run faster using the string copy instructions (sane microcode).
10 * It is also a lot simpler. Use this when possible. But, don't use streaming
11 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
12 * prefetch distance based on SMP/UP.
13 */
7 ALIGN 14 ALIGN
8copy_page_rep: 15ENTRY(copy_page)
9 CFI_STARTPROC 16 CFI_STARTPROC
17 ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
10 movl $4096/8, %ecx 18 movl $4096/8, %ecx
11 rep movsq 19 rep movsq
12 ret 20 ret
13 CFI_ENDPROC 21 CFI_ENDPROC
14ENDPROC(copy_page_rep) 22ENDPROC(copy_page)
15
16/*
17 * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
18 * Could vary the prefetch distance based on SMP/UP.
19*/
20 23
21ENTRY(copy_page) 24ENTRY(copy_page_regs)
22 CFI_STARTPROC 25 CFI_STARTPROC
23 subq $2*8, %rsp 26 subq $2*8, %rsp
24 CFI_ADJUST_CFA_OFFSET 2*8 27 CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
90 addq $2*8, %rsp 93 addq $2*8, %rsp
91 CFI_ADJUST_CFA_OFFSET -2*8 94 CFI_ADJUST_CFA_OFFSET -2*8
92 ret 95 ret
93.Lcopy_page_end:
94 CFI_ENDPROC 96 CFI_ENDPROC
95ENDPROC(copy_page) 97ENDPROC(copy_page_regs)
96
97 /* Some CPUs run faster using the string copy instructions.
98 It is also a lot simpler. Use this when possible */
99
100#include <asm/cpufeature.h>
101
102 .section .altinstr_replacement,"ax"
1031: .byte 0xeb /* jmp <disp8> */
104 .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
1052:
106 .previous
107 .section .altinstructions,"a"
108 altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
109 .Lcopy_page_end-copy_page, 2b-1b
110 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11
12#define FIX_ALIGNMENT 1
13
14#include <asm/current.h> 11#include <asm/current.h>
15#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 13#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
19#include <asm/asm.h> 16#include <asm/asm.h>
20#include <asm/smap.h> 17#include <asm/smap.h>
21 18
22/*
23 * By placing feature2 after feature1 in altinstructions section, we logically
24 * implement:
25 * If CPU has feature2, jmp to alt2 is used
26 * else if CPU has feature1, jmp to alt1 is used
27 * else jmp to orig is used.
28 */
29 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
300:
31 .byte 0xe9 /* 32bit jump */
32 .long \orig-1f /* by default jump to orig */
331:
34 .section .altinstr_replacement,"ax"
352: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
373: .byte 0xe9 /* near jump with 32bit immediate */
38 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
39 .previous
40
41 .section .altinstructions,"a"
42 altinstruction_entry 0b,2b,\feature1,5,5
43 altinstruction_entry 0b,3b,\feature2,5,5
44 .previous
45 .endm
46
47 .macro ALIGN_DESTINATION 19 .macro ALIGN_DESTINATION
48#ifdef FIX_ALIGNMENT
49 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
50 movl %edi,%ecx 21 movl %edi,%ecx
51 andl $7,%ecx 22 andl $7,%ecx
@@ -67,7 +38,6 @@
67 38
68 _ASM_EXTABLE(100b,103b) 39 _ASM_EXTABLE(100b,103b)
69 _ASM_EXTABLE(101b,103b) 40 _ASM_EXTABLE(101b,103b)
70#endif
71 .endm 41 .endm
72 42
73/* Standard copy_to_user with segment limit checking */ 43/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
79 jc bad_to_user 49 jc bad_to_user
80 cmpq TI_addr_limit(%rax),%rcx 50 cmpq TI_addr_limit(%rax),%rcx
81 ja bad_to_user 51 ja bad_to_user
82 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 52 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
83 copy_user_generic_unrolled,copy_user_generic_string, \ 53 "jmp copy_user_generic_string", \
84 copy_user_enhanced_fast_string 54 X86_FEATURE_REP_GOOD, \
55 "jmp copy_user_enhanced_fast_string", \
56 X86_FEATURE_ERMS
85 CFI_ENDPROC 57 CFI_ENDPROC
86ENDPROC(_copy_to_user) 58ENDPROC(_copy_to_user)
87 59
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
94 jc bad_from_user 66 jc bad_from_user
95 cmpq TI_addr_limit(%rax),%rcx 67 cmpq TI_addr_limit(%rax),%rcx
96 ja bad_from_user 68 ja bad_from_user
97 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ 69 ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
98 copy_user_generic_unrolled,copy_user_generic_string, \ 70 "jmp copy_user_generic_string", \
99 copy_user_enhanced_fast_string 71 X86_FEATURE_REP_GOOD, \
72 "jmp copy_user_enhanced_fast_string", \
73 X86_FEATURE_ERMS
100 CFI_ENDPROC 74 CFI_ENDPROC
101ENDPROC(_copy_from_user) 75ENDPROC(_copy_from_user)
102 76
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
196 196
197 /* handle last odd byte */ 197 /* handle last odd byte */
198.Lhandle_1: 198.Lhandle_1:
199 testl $1, %r10d 199 testb $1, %r10b
200 jz .Lende 200 jz .Lende
201 xorl %ebx, %ebx 201 xorl %ebx, %ebx
202 source 202 source
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6b478b..8f72b334aea0 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
52 */ 52 */
53void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) 53void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
54{ 54{
55 /*
56 * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
57 * even if the input buffer is long enough to hold them.
58 */
59 if (buf_len > MAX_INSN_SIZE)
60 buf_len = MAX_INSN_SIZE;
61
55 memset(insn, 0, sizeof(*insn)); 62 memset(insn, 0, sizeof(*insn));
56 insn->kaddr = kaddr; 63 insn->kaddr = kaddr;
57 insn->end_kaddr = kaddr + buf_len; 64 insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@ found:
164 /* VEX.W overrides opnd_size */ 171 /* VEX.W overrides opnd_size */
165 insn->opnd_bytes = 8; 172 insn->opnd_bytes = 8;
166 } else { 173 } else {
174 /*
175 * For VEX2, fake VEX3-like byte#2.
176 * Makes it easier to decode vex.W, vex.vvvv,
177 * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
178 */
179 insn->vex_prefix.bytes[2] = b2 & 0x7f;
167 insn->vex_prefix.nbytes = 2; 180 insn->vex_prefix.nbytes = 2;
168 insn->next_byte += 2; 181 insn->next_byte += 2;
169 } 182 }
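(The first insn.c hunk clamps the caller-supplied buffer length to the 15-byte architectural maximum before recording end_kaddr. A small standalone C sketch of just that clamp; the struct and function names are toy stand-ins, only MAX_INSN_SIZE mirrors the kernel constant.)

    #include <stdio.h>

    #define MAX_INSN_SIZE 15

    struct toy_insn {
            const void *kaddr;
            const void *end_kaddr;
    };

    static void toy_insn_init(struct toy_insn *insn, const void *kaddr, int buf_len)
    {
            if (buf_len > MAX_INSN_SIZE)
                    buf_len = MAX_INSN_SIZE;   /* anything longer cannot be a valid x86 instruction */
            insn->kaddr = kaddr;
            insn->end_kaddr = (const char *)kaddr + buf_len;
    }

    int main(void)
    {
            unsigned char buf[64] = { 0x90 /* nop */ };
            struct toy_insn insn;

            toy_insn_init(&insn, buf, sizeof(buf));
            printf("decode window = %td bytes\n",
                   (const char *)insn.end_kaddr - (const char *)insn.kaddr);
            return 0;
    }
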
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4
5#include <asm/cpufeature.h> 4#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
8 7
9/* 8/*
9 * We build a jump to memcpy_orig by default which gets NOPped out on
10 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
11 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
12 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
13 */
14
15.weak memcpy
16
17/*
10 * memcpy - Copy a memory block. 18 * memcpy - Copy a memory block.
11 * 19 *
12 * Input: 20 * Input:
@@ -17,15 +25,11 @@
17 * Output: 25 * Output:
18 * rax original destination 26 * rax original destination
19 */ 27 */
28ENTRY(__memcpy)
29ENTRY(memcpy)
30 ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
31 "jmp memcpy_erms", X86_FEATURE_ERMS
20 32
21/*
22 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
23 *
24 * This gets patched over the unrolled variant (below) via the
25 * alternative instructions framework:
26 */
27 .section .altinstr_replacement, "ax", @progbits
28.Lmemcpy_c:
29 movq %rdi, %rax 33 movq %rdi, %rax
30 movq %rdx, %rcx 34 movq %rdx, %rcx
31 shrq $3, %rcx 35 shrq $3, %rcx
@@ -34,29 +38,21 @@
34 movl %edx, %ecx 38 movl %edx, %ecx
35 rep movsb 39 rep movsb
36 ret 40 ret
37.Lmemcpy_e: 41ENDPROC(memcpy)
38 .previous 42ENDPROC(__memcpy)
39 43
40/* 44/*
41 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than 45 * memcpy_erms() - enhanced fast string memcpy. This is faster and
42 * memcpy_c. Use memcpy_c_e when possible. 46 * simpler than memcpy. Use memcpy_erms when possible.
43 *
44 * This gets patched over the unrolled variant (below) via the
45 * alternative instructions framework:
46 */ 47 */
47 .section .altinstr_replacement, "ax", @progbits 48ENTRY(memcpy_erms)
48.Lmemcpy_c_e:
49 movq %rdi, %rax 49 movq %rdi, %rax
50 movq %rdx, %rcx 50 movq %rdx, %rcx
51 rep movsb 51 rep movsb
52 ret 52 ret
53.Lmemcpy_e_e: 53ENDPROC(memcpy_erms)
54 .previous
55
56.weak memcpy
57 54
58ENTRY(__memcpy) 55ENTRY(memcpy_orig)
59ENTRY(memcpy)
60 CFI_STARTPROC 56 CFI_STARTPROC
61 movq %rdi, %rax 57 movq %rdi, %rax
62 58
@@ -183,26 +179,4 @@ ENTRY(memcpy)
183.Lend: 179.Lend:
184 retq 180 retq
185 CFI_ENDPROC 181 CFI_ENDPROC
186ENDPROC(memcpy) 182ENDPROC(memcpy_orig)
187ENDPROC(__memcpy)
188
189 /*
190 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
191 * If the feature is supported, memcpy_c_e() is the first choice.
192 * If enhanced rep movsb copy is not available, use fast string copy
193 * memcpy_c() when possible. This is faster and code is simpler than
194 * original memcpy().
195 * Otherwise, original memcpy() is used.
196 * In .altinstructions section, ERMS feature is placed after REG_GOOD
197 * feature to implement the right patch order.
198 *
199 * Replace only beginning, memcpy is used to apply alternatives,
200 * so it is silly to overwrite itself with nops - reboot is the
201 * only outcome...
202 */
203 .section .altinstructions, "a"
204 altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
205 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
206 altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
207 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
208 .previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
5 * This assembly file is re-written from memmove_64.c file. 5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> 6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */ 7 */
8#define _STRING_C
9#include <linux/linkage.h> 8#include <linux/linkage.h>
10#include <asm/dwarf2.h> 9#include <asm/dwarf2.h>
11#include <asm/cpufeature.h> 10#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
44 jg 2f 43 jg 2f
45 44
46.Lmemmove_begin_forward: 45.Lmemmove_begin_forward:
46 ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
47
47 /* 48 /*
48 * movsq instruction have many startup latency 49 * movsq instruction have many startup latency
49 * so we handle small size by general register. 50 * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
20713: 20813:
208 retq 209 retq
209 CFI_ENDPROC 210 CFI_ENDPROC
210
211 .section .altinstr_replacement,"ax"
212.Lmemmove_begin_forward_efs:
213 /* Forward moving data. */
214 movq %rdx, %rcx
215 rep movsb
216 retq
217.Lmemmove_end_forward_efs:
218 .previous
219
220 .section .altinstructions,"a"
221 altinstruction_entry .Lmemmove_begin_forward, \
222 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
223 .Lmemmove_end_forward-.Lmemmove_begin_forward, \
224 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
225 .previous
226ENDPROC(__memmove) 211ENDPROC(__memmove)
227ENDPROC(memmove) 212ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h> 6#include <asm/alternative-asm.h>
7 7
8.weak memset
9
8/* 10/*
9 * ISO C memset - set a memory block to a byte value. This function uses fast 11 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is 12 * string to get better performance than the original function. The code is
 11 * simpler and shorter than the original function as well. 13 * simpler and shorter than the original function as well.
12 * 14 *
13 * rdi destination 15 * rdi destination
14 * rsi value (char) 16 * rsi value (char)
15 * rdx count (bytes) 17 * rdx count (bytes)
16 * 18 *
17 * rax original destination 19 * rax original destination
18 */ 20 */
19 .section .altinstr_replacement, "ax", @progbits 21ENTRY(memset)
20.Lmemset_c: 22ENTRY(__memset)
23 /*
24 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
25 * to use it when possible. If not available, use fast string instructions.
26 *
27 * Otherwise, use original memset function.
28 */
29 ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
30 "jmp memset_erms", X86_FEATURE_ERMS
31
21 movq %rdi,%r9 32 movq %rdi,%r9
22 movq %rdx,%rcx 33 movq %rdx,%rcx
23 andl $7,%edx 34 andl $7,%edx
@@ -31,8 +42,8 @@
31 rep stosb 42 rep stosb
32 movq %r9,%rax 43 movq %r9,%rax
33 ret 44 ret
34.Lmemset_e: 45ENDPROC(memset)
35 .previous 46ENDPROC(__memset)
36 47
37/* 48/*
38 * ISO C memset - set a memory block to a byte value. This function uses 49 * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
45 * 56 *
46 * rax original destination 57 * rax original destination
47 */ 58 */
48 .section .altinstr_replacement, "ax", @progbits 59ENTRY(memset_erms)
49.Lmemset_c_e:
50 movq %rdi,%r9 60 movq %rdi,%r9
51 movb %sil,%al 61 movb %sil,%al
52 movq %rdx,%rcx 62 movq %rdx,%rcx
53 rep stosb 63 rep stosb
54 movq %r9,%rax 64 movq %r9,%rax
55 ret 65 ret
56.Lmemset_e_e: 66ENDPROC(memset_erms)
57 .previous
58
59.weak memset
60 67
61ENTRY(memset) 68ENTRY(memset_orig)
62ENTRY(__memset)
63 CFI_STARTPROC 69 CFI_STARTPROC
64 movq %rdi,%r10 70 movq %rdi,%r10
65 71
@@ -134,23 +140,4 @@ ENTRY(__memset)
134 jmp .Lafter_bad_alignment 140 jmp .Lafter_bad_alignment
135.Lfinal: 141.Lfinal:
136 CFI_ENDPROC 142 CFI_ENDPROC
137ENDPROC(memset) 143ENDPROC(memset_orig)
138ENDPROC(__memset)
139
140 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
141 * It is recommended to use this when possible.
142 *
143 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
144 * instructions.
145 *
146 * Otherwise, use original memset function.
147 *
148 * In .altinstructions section, ERMS feature is placed after REG_GOOD
149 * feature to implement the right patch order.
150 */
151 .section .altinstructions,"a"
152 altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
153 .Lfinal-__memset,.Lmemset_e-.Lmemset_c
154 altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
155 .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
156 .previous
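
With ALTERNATIVE_2, the two-entry .altinstructions scheme above collapses into a single three-way choice resolved at patch time. A plain C model of the resulting dispatch is sketched below; the feature booleans and the memset_fast_string name are stand-ins (the fast-string body is __memset itself), and the real kernel rewrites the jump once at boot, so no runtime branch remains.

    #include <stddef.h>
    #include <stdbool.h>

    /* Hypothetical stand-ins for the X86_FEATURE_* checks done at patch time. */
    extern bool cpu_has_erms, cpu_has_rep_good;

    void *memset_erms(void *s, int c, size_t n);        /* rep stosb         */
    void *memset_fast_string(void *s, int c, size_t n); /* rep stosq + tail  */
    void *memset_orig(void *s, int c, size_t n);        /* unrolled fallback */

    static void *memset_model(void *s, int c, size_t n)
    {
            if (cpu_has_erms)     /* listed last in ALTERNATIVE_2, so it wins */
                    return memset_erms(s, c, n);
            if (cpu_has_rep_good)
                    return memset_fast_string(s, c, n);
            return memset_orig(s, c, n);
    }
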
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13eefad10..3ca5218fbece 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
14.macro op_safe_regs op 14.macro op_safe_regs op
15ENTRY(\op\()_safe_regs) 15ENTRY(\op\()_safe_regs)
16 CFI_STARTPROC 16 CFI_STARTPROC
17 pushq_cfi %rbx 17 pushq_cfi_reg rbx
18 pushq_cfi %rbp 18 pushq_cfi_reg rbp
19 movq %rdi, %r10 /* Save pointer */ 19 movq %rdi, %r10 /* Save pointer */
20 xorl %r11d, %r11d /* Return value */ 20 xorl %r11d, %r11d /* Return value */
21 movl (%rdi), %eax 21 movl (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
35 movl %ebp, 20(%r10) 35 movl %ebp, 20(%r10)
36 movl %esi, 24(%r10) 36 movl %esi, 24(%r10)
37 movl %edi, 28(%r10) 37 movl %edi, 28(%r10)
38 popq_cfi %rbp 38 popq_cfi_reg rbp
39 popq_cfi %rbx 39 popq_cfi_reg rbx
40 ret 40 ret
413: 413:
42 CFI_RESTORE_STATE 42 CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
53.macro op_safe_regs op 53.macro op_safe_regs op
54ENTRY(\op\()_safe_regs) 54ENTRY(\op\()_safe_regs)
55 CFI_STARTPROC 55 CFI_STARTPROC
56 pushl_cfi %ebx 56 pushl_cfi_reg ebx
57 pushl_cfi %ebp 57 pushl_cfi_reg ebp
58 pushl_cfi %esi 58 pushl_cfi_reg esi
59 pushl_cfi %edi 59 pushl_cfi_reg edi
60 pushl_cfi $0 /* Return value */ 60 pushl_cfi $0 /* Return value */
61 pushl_cfi %eax 61 pushl_cfi %eax
62 movl 4(%eax), %ecx 62 movl 4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
80 movl %esi, 24(%eax) 80 movl %esi, 24(%eax)
81 movl %edi, 28(%eax) 81 movl %edi, 28(%eax)
82 popl_cfi %eax 82 popl_cfi %eax
83 popl_cfi %edi 83 popl_cfi_reg edi
84 popl_cfi %esi 84 popl_cfi_reg esi
85 popl_cfi %ebp 85 popl_cfi_reg ebp
86 popl_cfi %ebx 86 popl_cfi_reg ebx
87 ret 87 ret
883: 883:
89 CFI_RESTORE_STATE 89 CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f042468..2322abe4da3b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
34 */ 34 */
35 35
36#define save_common_regs \ 36#define save_common_regs \
37 pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 37 pushl_cfi_reg ecx
38 38
39#define restore_common_regs \ 39#define restore_common_regs \
40 popl_cfi %ecx; CFI_RESTORE ecx 40 popl_cfi_reg ecx
41 41
42 /* Avoid uglifying the argument copying x86-64 needs to do. */ 42 /* Avoid uglifying the argument copying x86-64 needs to do. */
43 .macro movq src, dst 43 .macro movq src, dst
@@ -64,22 +64,22 @@
64 */ 64 */
65 65
66#define save_common_regs \ 66#define save_common_regs \
67 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ 67 pushq_cfi_reg rdi; \
68 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ 68 pushq_cfi_reg rsi; \
69 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ 69 pushq_cfi_reg rcx; \
70 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ 70 pushq_cfi_reg r8; \
71 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ 71 pushq_cfi_reg r9; \
72 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ 72 pushq_cfi_reg r10; \
73 pushq_cfi %r11; CFI_REL_OFFSET r11, 0 73 pushq_cfi_reg r11
74 74
75#define restore_common_regs \ 75#define restore_common_regs \
76 popq_cfi %r11; CFI_RESTORE r11; \ 76 popq_cfi_reg r11; \
77 popq_cfi %r10; CFI_RESTORE r10; \ 77 popq_cfi_reg r10; \
78 popq_cfi %r9; CFI_RESTORE r9; \ 78 popq_cfi_reg r9; \
79 popq_cfi %r8; CFI_RESTORE r8; \ 79 popq_cfi_reg r8; \
80 popq_cfi %rcx; CFI_RESTORE rcx; \ 80 popq_cfi_reg rcx; \
81 popq_cfi %rsi; CFI_RESTORE rsi; \ 81 popq_cfi_reg rsi; \
82 popq_cfi %rdi; CFI_RESTORE rdi 82 popq_cfi_reg rdi
83 83
84#endif 84#endif
85 85
@@ -87,12 +87,10 @@
87ENTRY(call_rwsem_down_read_failed) 87ENTRY(call_rwsem_down_read_failed)
88 CFI_STARTPROC 88 CFI_STARTPROC
89 save_common_regs 89 save_common_regs
90 __ASM_SIZE(push,_cfi) %__ASM_REG(dx) 90 __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
91 CFI_REL_OFFSET __ASM_REG(dx), 0
92 movq %rax,%rdi 91 movq %rax,%rdi
93 call rwsem_down_read_failed 92 call rwsem_down_read_failed
94 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) 93 __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
95 CFI_RESTORE __ASM_REG(dx)
96 restore_common_regs 94 restore_common_regs
97 ret 95 ret
98 CFI_ENDPROC 96 CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
124ENTRY(call_rwsem_downgrade_wake) 122ENTRY(call_rwsem_downgrade_wake)
125 CFI_STARTPROC 123 CFI_STARTPROC
126 save_common_regs 124 save_common_regs
127 __ASM_SIZE(push,_cfi) %__ASM_REG(dx) 125 __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
128 CFI_REL_OFFSET __ASM_REG(dx), 0
129 movq %rax,%rdi 126 movq %rax,%rdi
130 call rwsem_downgrade_wake 127 call rwsem_downgrade_wake
131 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) 128 __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
132 CFI_RESTORE __ASM_REG(dx)
133 restore_common_regs 129 restore_common_regs
134 ret 130 ret
135 CFI_ENDPROC 131 CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf5ac2c..5eb715087b80 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
13 .globl \name 13 .globl \name
14\name: 14\name:
15 CFI_STARTPROC 15 CFI_STARTPROC
16 pushl_cfi %eax 16 pushl_cfi_reg eax
17 CFI_REL_OFFSET eax, 0 17 pushl_cfi_reg ecx
18 pushl_cfi %ecx 18 pushl_cfi_reg edx
19 CFI_REL_OFFSET ecx, 0
20 pushl_cfi %edx
21 CFI_REL_OFFSET edx, 0
22 19
23 .if \put_ret_addr_in_eax 20 .if \put_ret_addr_in_eax
24 /* Place EIP in the arg1 */ 21 /* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
26 .endif 23 .endif
27 24
28 call \func 25 call \func
29 popl_cfi %edx 26 popl_cfi_reg edx
30 CFI_RESTORE edx 27 popl_cfi_reg ecx
31 popl_cfi %ecx 28 popl_cfi_reg eax
32 CFI_RESTORE ecx
33 popl_cfi %eax
34 CFI_RESTORE eax
35 ret 29 ret
36 CFI_ENDPROC 30 CFI_ENDPROC
37 _ASM_NOKPROBE(\name) 31 _ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5ebd614a..f89ba4e93025 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,18 @@
17 CFI_STARTPROC 17 CFI_STARTPROC
18 18
19 /* this one pushes 9 elems, the next one would be %rIP */ 19 /* this one pushes 9 elems, the next one would be %rIP */
20 SAVE_ARGS 20 pushq_cfi_reg rdi
21 pushq_cfi_reg rsi
22 pushq_cfi_reg rdx
23 pushq_cfi_reg rcx
24 pushq_cfi_reg rax
25 pushq_cfi_reg r8
26 pushq_cfi_reg r9
27 pushq_cfi_reg r10
28 pushq_cfi_reg r11
21 29
22 .if \put_ret_addr_in_rdi 30 .if \put_ret_addr_in_rdi
31 /* 9*8(%rsp) is return addr on stack */
23 movq_cfi_restore 9*8, rdi 32 movq_cfi_restore 9*8, rdi
24 .endif 33 .endif
25 34
@@ -45,11 +54,22 @@
45#endif 54#endif
46#endif 55#endif
47 56
48 /* SAVE_ARGS below is used only for the .cfi directives it contains. */ 57#if defined(CONFIG_TRACE_IRQFLAGS) \
58 || defined(CONFIG_DEBUG_LOCK_ALLOC) \
59 || defined(CONFIG_PREEMPT)
49 CFI_STARTPROC 60 CFI_STARTPROC
50 SAVE_ARGS 61 CFI_ADJUST_CFA_OFFSET 9*8
51restore: 62restore:
52 RESTORE_ARGS 63 popq_cfi_reg r11
64 popq_cfi_reg r10
65 popq_cfi_reg r9
66 popq_cfi_reg r8
67 popq_cfi_reg rax
68 popq_cfi_reg rcx
69 popq_cfi_reg rdx
70 popq_cfi_reg rsi
71 popq_cfi_reg rdi
53 ret 72 ret
54 CFI_ENDPROC 73 CFI_ENDPROC
55 _ASM_NOKPROBE(restore) 74 _ASM_NOKPROBE(restore)
75#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c6895d..816488c0b97e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@ dd: ESC
273de: ESC 273de: ESC
274df: ESC 274df: ESC
275# 0xe0 - 0xef 275# 0xe0 - 0xef
276# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
277# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
278# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
276e0: LOOPNE/LOOPNZ Jb (f64) 279e0: LOOPNE/LOOPNZ Jb (f64)
277e1: LOOPE/LOOPZ Jb (f64) 280e1: LOOPE/LOOPZ Jb (f64)
278e2: LOOP Jb (f64) 281e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
281e5: IN eAX,Ib 284e5: IN eAX,Ib
282e6: OUT Ib,AL 285e6: OUT Ib,AL
283e7: OUT Ib,eAX 286e7: OUT Ib,eAX
287# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
288# in "near" jumps and calls is 16-bit. For CALL,
289# push of return address is 16-bit wide, RSP is decremented by 2
290# but is not truncated to 16 bits, unlike RIP.
284e8: CALL Jz (f64) 291e8: CALL Jz (f64)
285e9: JMP-near Jz (f64) 292e9: JMP-near Jz (f64)
286ea: JMP-far Ap (i64) 293ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
4567e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) 4637e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
4577f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) 4647f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
458# 0x0f 0x80-0x8f 465# 0x0f 0x80-0x8f
466# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
45980: JO Jz (f64) 46780: JO Jz (f64)
46081: JNO Jz (f64) 46881: JNO Jz (f64)
46182: JB/JC/JNAE Jz (f64) 46982: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
842GrpTable: Grp5 850GrpTable: Grp5
8430: INC Ev 8510: INC Ev
8441: DEC Ev 8521: DEC Ev
853# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
8452: CALLN Ev (f64) 8542: CALLN Ev (f64)
8463: CALLF Ep 8553: CALLF Ep
8474: JMPN Ev (f64) 8564: JMPN Ev (f64)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025fb46f1..181c53bac3a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
59 int ret = 0; 59 int ret = 0;
60 60
61 /* kprobe_running() needs smp_processor_id() */ 61 /* kprobe_running() needs smp_processor_id() */
62 if (kprobes_built_in() && !user_mode_vm(regs)) { 62 if (kprobes_built_in() && !user_mode(regs)) {
63 preempt_disable(); 63 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 64 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 65 ret = 1;
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
148 instr = (void *)convert_ip_to_linear(current, regs); 148 instr = (void *)convert_ip_to_linear(current, regs);
149 max_instr = instr + 15; 149 max_instr = instr + 15;
150 150
151 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 151 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
152 return 0; 152 return 0;
153 153
154 while (instr < max_instr) { 154 while (instr < max_instr) {
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1035 if (error_code & PF_USER) 1035 if (error_code & PF_USER)
1036 return false; 1036 return false;
1037 1037
1038 if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) 1038 if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1039 return false; 1039 return false;
1040 1040
1041 return true; 1041 return true;
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
1140 * User-mode registers count as a user access even for any 1140 * User-mode registers count as a user access even for any
1141 * potential system fault or CPU buglet: 1141 * potential system fault or CPU buglet:
1142 */ 1142 */
1143 if (user_mode_vm(regs)) { 1143 if (user_mode(regs)) {
1144 local_irq_enable(); 1144 local_irq_enable();
1145 error_code |= PF_USER; 1145 error_code |= PF_USER;
1146 flags |= FAULT_FLAG_USER; 1146 flags |= FAULT_FLAG_USER;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efca6d06..52417e771af9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -179,7 +179,8 @@ static void __init probe_page_size_mask(void)
179 if (cpu_has_pge) { 179 if (cpu_has_pge) {
180 cr4_set_bits_and_update_boot(X86_CR4_PGE); 180 cr4_set_bits_and_update_boot(X86_CR4_PGE);
181 __supported_pte_mask |= _PAGE_GLOBAL; 181 __supported_pte_mask |= _PAGE_GLOBAL;
182 } 182 } else
183 __supported_pte_mask &= ~_PAGE_GLOBAL;
183} 184}
184 185
185#ifdef CONFIG_X86_32 186#ifdef CONFIG_X86_32
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 5d04be5efb64..4e664bdb535a 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
111{ 111{
112 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); 112 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
113 113
114 if (!user_mode_vm(regs)) { 114 if (!user_mode(regs)) {
115 unsigned long stack = kernel_stack_pointer(regs); 115 unsigned long stack = kernel_stack_pointer(regs);
116 if (depth) 116 if (depth)
117 dump_trace(NULL, regs, (unsigned long *)stack, 0, 117 dump_trace(NULL, regs, (unsigned long *)stack, 0,
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5648a0..757678fb26e1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
134static void fix_processor_context(void) 134static void fix_processor_context(void)
135{ 135{
136 int cpu = smp_processor_id(); 136 int cpu = smp_processor_id();
137 struct tss_struct *t = &per_cpu(init_tss, cpu); 137 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
138#ifdef CONFIG_X86_64 138#ifdef CONFIG_X86_64
139 struct desc_struct *desc = get_cpu_gdt_table(cpu); 139 struct desc_struct *desc = get_cpu_gdt_table(cpu);
140 tss_desc tss; 140 tss_desc tss;
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ece1c9f..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
119110 i386 iopl sys_iopl 119110 i386 iopl sys_iopl
120111 i386 vhangup sys_vhangup 120111 i386 vhangup sys_vhangup
121112 i386 idle 121112 i386 idle
122113 i386 vm86old sys_vm86old sys32_vm86_warning 122113 i386 vm86old sys_vm86old sys_ni_syscall
123114 i386 wait4 sys_wait4 compat_sys_wait4 123114 i386 wait4 sys_wait4 compat_sys_wait4
124115 i386 swapoff sys_swapoff 124115 i386 swapoff sys_swapoff
125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 125116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
@@ -172,7 +172,7 @@
172163 i386 mremap sys_mremap 172163 i386 mremap sys_mremap
173164 i386 setresuid sys_setresuid16 173164 i386 setresuid sys_setresuid16
174165 i386 getresuid sys_getresuid16 174165 i386 getresuid sys_getresuid16
175166 i386 vm86 sys_vm86 sys32_vm86_warning 175166 i386 vm86 sys_vm86 sys_ni_syscall
176167 i386 query_module 176167 i386 query_module
177168 i386 poll sys_poll 177168 i386 poll sys_poll
178169 i386 nfsservctl 178169 i386 nfsservctl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fbb57aa..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
178169 common reboot sys_reboot 178169 common reboot sys_reboot
179170 common sethostname sys_sethostname 179170 common sethostname sys_sethostname
180171 common setdomainname sys_setdomainname 180171 common setdomainname sys_setdomainname
181172 common iopl stub_iopl 181172 common iopl sys_iopl
182173 common ioperm sys_ioperm 182173 common ioperm sys_ioperm
183174 64 create_module 183174 64 create_module
184175 common init_module sys_init_module 184175 common init_module sys_init_module
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
64 */ 64 */
65static inline void rdtsc_barrier(void) 65static inline void rdtsc_barrier(void)
66{ 66{
67 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); 67 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
68 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 68 "lfence", X86_FEATURE_LFENCE_RDTSC);
69} 69}
70 70
71#endif 71#endif
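
alternative_2() lets rdtsc_barrier() choose between mfence, lfence, or nothing at a single patch site instead of two back-to-back alternative() calls. The combined effect is sketched in C with inline assembly below, under the assumption of an LFENCE_RDTSC CPU (MFENCE_RDTSC parts get mfence patched in instead; the function name is made up):

    /* Fence, then read the TSC, so earlier loads cannot slip past RDTSC. */
    static inline unsigned long long rdtsc_after_fence(void)
    {
            unsigned int lo, hi;

            asm volatile("lfence\n\t"
                         "rdtsc"
                         : "=a" (lo), "=d" (hi));
            return ((unsigned long long)hi << 32) | lo;
    }
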
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9db2217..a75d8700472a 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
16 */ 16 */
17 17
18/* Not going to be implemented by UML, since we have no hardware. */ 18/* Not going to be implemented by UML, since we have no hardware. */
19#define stub_iopl sys_ni_syscall 19#define sys_iopl sys_ni_syscall
20#define sys_ioperm sys_ni_syscall 20#define sys_ioperm sys_ni_syscall
21 21
22/* 22/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..81665c9f2132 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
912 mcs = xen_mc_entry(0); 912 mcs = xen_mc_entry(0);
913 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 913 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
914 xen_mc_issue(PARAVIRT_LAZY_CPU); 914 xen_mc_issue(PARAVIRT_LAZY_CPU);
915 tss->x86_tss.sp0 = thread->sp0;
915} 916}
916 917
917static void xen_set_iopl_mask(unsigned mask) 918static void xen_set_iopl_mask(unsigned mask)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..7413ee3706d0 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -445,15 +445,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
445{ 445{
446 int rc; 446 int rc;
447 447
448 per_cpu(current_task, cpu) = idle; 448 common_cpu_up(cpu, idle);
449#ifdef CONFIG_X86_32
450 irq_ctx_init(cpu);
451#else
452 clear_tsk_thread_flag(idle, TIF_FORK);
453#endif
454 per_cpu(kernel_stack, cpu) =
455 (unsigned long)task_stack_page(idle) -
456 KERNEL_STACK_OFFSET + THREAD_SIZE;
457 449
458 xen_setup_runstate_info(cpu); 450 xen_setup_runstate_info(cpu);
459 xen_setup_timer(cpu); 451 xen_setup_timer(cpu);
@@ -468,10 +460,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
468 if (rc) 460 if (rc)
469 return rc; 461 return rc;
470 462
471 if (num_online_cpus() == 1)
472 /* Just in case we booted with a single CPU. */
473 alternatives_enable_smp();
474
475 rc = xen_smp_intr_init(cpu); 463 rc = xen_smp_intr_init(cpu);
476 if (rc) 464 if (rc)
477 return rc; 465 return rc;
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefda4275..985fc3ee0973 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
68 * We're already on the usermode stack at this point, but 68 * We're already on the usermode stack at this point, but
69 * still with the kernel gs, so we can easily switch back 69 * still with the kernel gs, so we can easily switch back
70 */ 70 */
71 movq %rsp, PER_CPU_VAR(old_rsp) 71 movq %rsp, PER_CPU_VAR(rsp_scratch)
72 movq PER_CPU_VAR(kernel_stack), %rsp 72 movq PER_CPU_VAR(kernel_stack), %rsp
73 73
74 pushq $__USER_DS 74 pushq $__USER_DS
75 pushq PER_CPU_VAR(old_rsp) 75 pushq PER_CPU_VAR(rsp_scratch)
76 pushq %r11 76 pushq %r11
77 pushq $__USER_CS 77 pushq $__USER_CS
78 pushq %rcx 78 pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
87 * We're already on the usermode stack at this point, but 87 * We're already on the usermode stack at this point, but
88 * still with the kernel gs, so we can easily switch back 88 * still with the kernel gs, so we can easily switch back
89 */ 89 */
90 movq %rsp, PER_CPU_VAR(old_rsp) 90 movq %rsp, PER_CPU_VAR(rsp_scratch)
91 movq PER_CPU_VAR(kernel_stack), %rsp 91 movq PER_CPU_VAR(kernel_stack), %rsp
92 92
93 pushq $__USER32_DS 93 pushq $__USER32_DS
94 pushq PER_CPU_VAR(old_rsp) 94 pushq PER_CPU_VAR(rsp_scratch)
95 pushq %r11 95 pushq %r11
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 82dc5748f873..7f327121e6d7 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -1210,7 +1210,7 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
1210 1210
1211 if (((die_args->trapnr == X86_TRAP_MF) || 1211 if (((die_args->trapnr == X86_TRAP_MF) ||
1212 (die_args->trapnr == X86_TRAP_XF)) && 1212 (die_args->trapnr == X86_TRAP_XF)) &&
1213 !user_mode_vm(die_args->regs)) 1213 !user_mode(die_args->regs))
1214 xpc_die_deactivate(); 1214 xpc_die_deactivate();
1215 1215
1216 break; 1216 break;
diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index f4aec0e75c3a..076af437284d 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -19,3 +19,12 @@ enum {
19#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) 19#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
20#endif 20#endif
21#endif 21#endif
22
23/**
24 * offsetofend(TYPE, MEMBER)
25 *
26 * @TYPE: The type of the structure
27 * @MEMBER: The member within the structure to get the end offset of
28 */
29#define offsetofend(TYPE, MEMBER) \
30 (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
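
offsetofend() moves here from the VFIO header so other code can use it. Its typical job is checking that a user-supplied, size-prefixed structure is large enough to contain a given field. A small standalone example, with a hypothetical struct in the style of the VFIO ioctl arguments:

    #include <stddef.h>

    #define offsetofend(TYPE, MEMBER) \
            (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

    /* Hypothetical size-prefixed argument block. */
    struct demo_arg {
            unsigned int argsz;
            unsigned int flags;
            unsigned long long data;
    };

    static int arg_covers_data(const struct demo_arg *arg)
    {
            /* Non-zero if argsz reaches at least the end of the .data field. */
            return arg->argsz >= offsetofend(struct demo_arg, data);
    }
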
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 2d67b8998fd8..049b2f497bc7 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -78,19 +78,6 @@ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
78extern void vfio_unregister_iommu_driver( 78extern void vfio_unregister_iommu_driver(
79 const struct vfio_iommu_driver_ops *ops); 79 const struct vfio_iommu_driver_ops *ops);
80 80
81/**
82 * offsetofend(TYPE, MEMBER)
83 *
84 * @TYPE: The type of the structure
85 * @MEMBER: The member within the structure to get the end offset of
86 *
87 * Simple helper macro for dealing with variable sized structures passed
88 * from user space. This allows us to easily determine if the provided
89 * structure is sized to include various fields.
90 */
91#define offsetofend(TYPE, MEMBER) \
92 (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
93
94/* 81/*
95 * External user API 82 * External user API
96 */ 83 */
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab799b35f..8c0c1a2770c8 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,12 @@
1 1
2MEMCPY_FN(__memcpy, 2MEMCPY_FN(memcpy_orig,
3 "x86-64-unrolled", 3 "x86-64-unrolled",
4 "unrolled memcpy() in arch/x86/lib/memcpy_64.S") 4 "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
5 5
6MEMCPY_FN(memcpy_c, 6MEMCPY_FN(__memcpy,
7 "x86-64-movsq", 7 "x86-64-movsq",
8 "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") 8 "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
9 9
10MEMCPY_FN(memcpy_c_e, 10MEMCPY_FN(memcpy_erms,
11 "x86-64-movsb", 11 "x86-64-movsb",
12 "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") 12 "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf00600a..e4c2c30143b9 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,8 +1,6 @@
1#define memcpy MEMCPY /* don't hide glibc's memcpy() */ 1#define memcpy MEMCPY /* don't hide glibc's memcpy() */
2#define altinstr_replacement text 2#define altinstr_replacement text
3#define globl p2align 4; .globl 3#define globl p2align 4; .globl
4#define Lmemcpy_c globl memcpy_c; memcpy_c
5#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
6#include "../../../arch/x86/lib/memcpy_64.S" 4#include "../../../arch/x86/lib/memcpy_64.S"
7/* 5/*
8 * We need to provide note.GNU-stack section, saying that we want 6 * We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db1d3a29d97f..d3dfb7936dcd 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -36,7 +36,7 @@ static const struct option options[] = {
36 "Specify length of memory to copy. " 36 "Specify length of memory to copy. "
37 "Available units: B, KB, MB, GB and TB (upper and lower)"), 37 "Available units: B, KB, MB, GB and TB (upper and lower)"),
38 OPT_STRING('r', "routine", &routine, "default", 38 OPT_STRING('r', "routine", &routine, "default",
39 "Specify routine to copy"), 39 "Specify routine to copy, \"all\" runs all available routines"),
40 OPT_INTEGER('i', "iterations", &iterations, 40 OPT_INTEGER('i', "iterations", &iterations,
41 "repeat memcpy() invocation this number of times"), 41 "repeat memcpy() invocation this number of times"),
42 OPT_BOOLEAN('c', "cycle", &use_cycle, 42 OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
135 const char *const *usage; 135 const char *const *usage;
136}; 136};
137 137
138static int bench_mem_common(int argc, const char **argv, 138static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
139 const char *prefix __maybe_unused,
140 struct bench_mem_info *info)
141{ 139{
142 int i; 140 const struct routine *r = &info->routines[r_idx];
143 size_t len;
144 double totallen;
145 double result_bps[2]; 141 double result_bps[2];
146 u64 result_cycle[2]; 142 u64 result_cycle[2];
147 143
148 argc = parse_options(argc, argv, options,
149 info->usage, 0);
150
151 if (no_prefault && only_prefault) {
152 fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
153 return 1;
154 }
155
156 if (use_cycle)
157 init_cycle();
158
159 len = (size_t)perf_atoll((char *)length_str);
160 totallen = (double)len * iterations;
161
162 result_cycle[0] = result_cycle[1] = 0ULL; 144 result_cycle[0] = result_cycle[1] = 0ULL;
163 result_bps[0] = result_bps[1] = 0.0; 145 result_bps[0] = result_bps[1] = 0.0;
164 146
165 if ((s64)len <= 0) { 147 printf("Routine %s (%s)\n", r->name, r->desc);
166 fprintf(stderr, "Invalid length:%s\n", length_str);
167 return 1;
168 }
169
170 /* same to without specifying either of prefault and no-prefault */
171 if (only_prefault && no_prefault)
172 only_prefault = no_prefault = false;
173
174 for (i = 0; info->routines[i].name; i++) {
175 if (!strcmp(info->routines[i].name, routine))
176 break;
177 }
178 if (!info->routines[i].name) {
179 printf("Unknown routine:%s\n", routine);
180 printf("Available routines...\n");
181 for (i = 0; info->routines[i].name; i++) {
182 printf("\t%s ... %s\n",
183 info->routines[i].name, info->routines[i].desc);
184 }
185 return 1;
186 }
187 148
188 if (bench_format == BENCH_FORMAT_DEFAULT) 149 if (bench_format == BENCH_FORMAT_DEFAULT)
189 printf("# Copying %s Bytes ...\n\n", length_str); 150 printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
191 if (!only_prefault && !no_prefault) { 152 if (!only_prefault && !no_prefault) {
192 /* show both of results */ 153 /* show both of results */
193 if (use_cycle) { 154 if (use_cycle) {
194 result_cycle[0] = 155 result_cycle[0] = info->do_cycle(r, len, false);
195 info->do_cycle(&info->routines[i], len, false); 156 result_cycle[1] = info->do_cycle(r, len, true);
196 result_cycle[1] =
197 info->do_cycle(&info->routines[i], len, true);
198 } else { 157 } else {
199 result_bps[0] = 158 result_bps[0] = info->do_gettimeofday(r, len, false);
200 info->do_gettimeofday(&info->routines[i], 159 result_bps[1] = info->do_gettimeofday(r, len, true);
201 len, false);
202 result_bps[1] =
203 info->do_gettimeofday(&info->routines[i],
204 len, true);
205 } 160 }
206 } else { 161 } else {
207 if (use_cycle) { 162 if (use_cycle)
208 result_cycle[pf] = 163 result_cycle[pf] = info->do_cycle(r, len, only_prefault);
209 info->do_cycle(&info->routines[i], 164 else
210 len, only_prefault); 165 result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
211 } else {
212 result_bps[pf] =
213 info->do_gettimeofday(&info->routines[i],
214 len, only_prefault);
215 }
216 } 166 }
217 167
218 switch (bench_format) { 168 switch (bench_format) {
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
265 die("unknown format: %d\n", bench_format); 215 die("unknown format: %d\n", bench_format);
266 break; 216 break;
267 } 217 }
218}
219
220static int bench_mem_common(int argc, const char **argv,
221 const char *prefix __maybe_unused,
222 struct bench_mem_info *info)
223{
224 int i;
225 size_t len;
226 double totallen;
227
228 argc = parse_options(argc, argv, options,
229 info->usage, 0);
230
231 if (no_prefault && only_prefault) {
232 fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
233 return 1;
234 }
235
236 if (use_cycle)
237 init_cycle();
238
239 len = (size_t)perf_atoll((char *)length_str);
240 totallen = (double)len * iterations;
241
242 if ((s64)len <= 0) {
243 fprintf(stderr, "Invalid length:%s\n", length_str);
244 return 1;
245 }
246
247 /* same to without specifying either of prefault and no-prefault */
248 if (only_prefault && no_prefault)
249 only_prefault = no_prefault = false;
250
251 if (!strncmp(routine, "all", 3)) {
252 for (i = 0; info->routines[i].name; i++)
253 __bench_mem_routine(info, i, len, totallen);
254 return 0;
255 }
256
257 for (i = 0; info->routines[i].name; i++) {
258 if (!strcmp(info->routines[i].name, routine))
259 break;
260 }
261 if (!info->routines[i].name) {
262 printf("Unknown routine:%s\n", routine);
263 printf("Available routines...\n");
264 for (i = 0; info->routines[i].name; i++) {
265 printf("\t%s ... %s\n",
266 info->routines[i].name, info->routines[i].desc);
267 }
268 return 1;
269 }
270
271 __bench_mem_routine(info, i, len, totallen);
268 272
269 return 0; 273 return 0;
270} 274}
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index a71dff97c1f5..f02d028771d9 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,12 +1,12 @@
1 1
2MEMSET_FN(__memset, 2MEMSET_FN(memset_orig,
3 "x86-64-unrolled", 3 "x86-64-unrolled",
4 "unrolled memset() in arch/x86/lib/memset_64.S") 4 "unrolled memset() in arch/x86/lib/memset_64.S")
5 5
6MEMSET_FN(memset_c, 6MEMSET_FN(__memset,
7 "x86-64-stosq", 7 "x86-64-stosq",
8 "movsq-based memset() in arch/x86/lib/memset_64.S") 8 "movsq-based memset() in arch/x86/lib/memset_64.S")
9 9
10MEMSET_FN(memset_c_e, 10MEMSET_FN(memset_erms,
11 "x86-64-stosb", 11 "x86-64-stosb",
12 "movsb-based memset() in arch/x86/lib/memset_64.S") 12 "movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index 9e5af89ed13a..de278784c866 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,8 +1,6 @@
1#define memset MEMSET /* don't hide glibc's memset() */ 1#define memset MEMSET /* don't hide glibc's memset() */
2#define altinstr_replacement text 2#define altinstr_replacement text
3#define globl p2align 4; .globl 3#define globl p2align 4; .globl
4#define Lmemset_c globl memset_c; memset_c
5#define Lmemset_c_e globl memset_c_e; memset_c_e
6#include "../../../arch/x86/lib/memset_64.S" 4#include "../../../arch/x86/lib/memset_64.S"
7 5
8/* 6/*
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h
index 6789d788d494..3a3a0f16456a 100644
--- a/tools/perf/util/include/asm/alternative-asm.h
+++ b/tools/perf/util/include/asm/alternative-asm.h
@@ -4,5 +4,6 @@
4/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ 4/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
5 5
6#define altinstruction_entry # 6#define altinstruction_entry #
7#define ALTERNATIVE_2 #
7 8
8#endif 9#endif
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d643d5242537..95abddcd7839 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -17,6 +17,7 @@ TARGETS += sysctl
17TARGETS += timers 17TARGETS += timers
18TARGETS += user 18TARGETS += user
19TARGETS += vm 19TARGETS += vm
20TARGETS += x86
20#Please keep the TARGETS list alphabetically sorted 21#Please keep the TARGETS list alphabetically sorted
21 22
22TARGETS_HOTPLUG = cpu-hotplug 23TARGETS_HOTPLUG = cpu-hotplug
diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore
new file mode 100644
index 000000000000..15034fef9698
--- /dev/null
+++ b/tools/testing/selftests/x86/.gitignore
@@ -0,0 +1,2 @@
1*_32
2*_64
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
new file mode 100644
index 000000000000..f0a7918178dd
--- /dev/null
+++ b/tools/testing/selftests/x86/Makefile
@@ -0,0 +1,48 @@
1.PHONY: all all_32 all_64 check_build32 clean run_tests
2
3TARGETS_C_BOTHBITS := sigreturn
4
5BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32)
6BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
7
8CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
9
10UNAME_P := $(shell uname -p)
11
12# Always build 32-bit tests
13all: all_32
14
15# If we're on a 64-bit host, build 64-bit tests as well
16ifeq ($(shell uname -p),x86_64)
17all: all_64
18endif
19
20all_32: check_build32 $(BINARIES_32)
21
22all_64: $(BINARIES_64)
23
24clean:
25 $(RM) $(BINARIES_32) $(BINARIES_64)
26
27run_tests:
28 ./run_x86_tests.sh
29
30$(TARGETS_C_BOTHBITS:%=%_32): %_32: %.c
31 $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
32
33$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
34 $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
35
36check_build32:
37 @if ! $(CC) -m32 -o /dev/null trivial_32bit_program.c; then \
38 echo "Warning: you seem to have a broken 32-bit build" 2>&1; \
39 echo "environment. If you are using a Debian-like"; \
40 echo " distribution, try:"; \
41 echo ""; \
42 echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \
43 echo ""; \
44 echo "If you are using a Fedora-like distribution, try:"; \
45 echo ""; \
46 echo " yum install glibc-devel.*i686"; \
47 exit 1; \
48 fi
diff --git a/tools/testing/selftests/x86/run_x86_tests.sh b/tools/testing/selftests/x86/run_x86_tests.sh
new file mode 100644
index 000000000000..3d3ec65f3e7c
--- /dev/null
+++ b/tools/testing/selftests/x86/run_x86_tests.sh
@@ -0,0 +1,11 @@
1#!/bin/bash
2
3# This is deliberately minimal. IMO kselftests should provide a standard
4# script here.
5./sigreturn_32 || exit 1
6
7if [[ "$uname -p" -eq "x86_64" ]]; then
8 ./sigreturn_64 || exit 1
9fi
10
11exit 0
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c
new file mode 100644
index 000000000000..b5aa1bab7416
--- /dev/null
+++ b/tools/testing/selftests/x86/sigreturn.c
@@ -0,0 +1,684 @@
1/*
2 * sigreturn.c - tests for x86 sigreturn(2) and exit-to-userspace
3 * Copyright (c) 2014-2015 Andrew Lutomirski
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * This is a series of tests that exercises the sigreturn(2) syscall and
15 * the IRET / SYSRET paths in the kernel.
16 *
17 * For now, this focuses on the effects of unusual CS and SS values,
18 * and it has a bunch of tests to make sure that ESP/RSP is restored
19 * properly.
20 *
21 * The basic idea behind these tests is to raise(SIGUSR1) to create a
22 * sigcontext frame, plug in the values to be tested, and then return,
23 * which implicitly invokes sigreturn(2) and programs the user context
24 * as desired.
25 *
26 * For tests for which we expect sigreturn and the subsequent return to
27 * user mode to succeed, we return to a short trampoline that generates
28 * SIGTRAP so that the meat of the tests can be ordinary C code in a
29 * SIGTRAP handler.
30 *
31 * The inner workings of each test are documented below.
32 *
33 * Do not run on outdated, unpatched kernels at risk of nasty crashes.
34 */
35
36#define _GNU_SOURCE
37
38#include <sys/time.h>
39#include <time.h>
40#include <stdlib.h>
41#include <sys/syscall.h>
42#include <unistd.h>
43#include <stdio.h>
44#include <string.h>
45#include <inttypes.h>
46#include <sys/mman.h>
47#include <sys/signal.h>
48#include <sys/ucontext.h>
49#include <asm/ldt.h>
50#include <err.h>
51#include <setjmp.h>
52#include <stddef.h>
53#include <stdbool.h>
54#include <sys/ptrace.h>
55#include <sys/user.h>
56
57/*
58 * In principle, this test can run on Linux emulation layers (e.g.
59 * Illumos "LX branded zones"). Solaris-based kernels reserve LDT
60 * entries 0-5 for their own internal purposes, so start our LDT
61 * allocations above that reservation. (The tests don't pass on LX
62 * branded zones, but at least this lets them run.)
63 */
64#define LDT_OFFSET 6
65
66/* An aligned stack accessible through some of our segments. */
67static unsigned char stack16[65536] __attribute__((aligned(4096)));
68
69/*
70 * An aligned int3 instruction used as a trampoline. Some of the tests
71 * want to fish out their ss values, so this trampoline copies ss to eax
72 * before the int3.
73 */
74asm (".pushsection .text\n\t"
75 ".type int3, @function\n\t"
76 ".align 4096\n\t"
77 "int3:\n\t"
78 "mov %ss,%eax\n\t"
79 "int3\n\t"
80 ".size int3, . - int3\n\t"
81 ".align 4096, 0xcc\n\t"
82 ".popsection");
83extern char int3[4096];
84
85/*
86 * At startup, we prepare:
87 *
88 * - ldt_nonexistent_sel: An LDT entry that doesn't exist (all-zero
89 * descriptor or out of bounds).
90 * - code16_sel: A 16-bit LDT code segment pointing to int3.
91 * - data16_sel: A 16-bit LDT data segment pointing to stack16.
92 * - npcode32_sel: A 32-bit not-present LDT code segment pointing to int3.
93 * - npdata32_sel: A 32-bit not-present LDT data segment pointing to stack16.
94 * - gdt_data16_idx: A 16-bit GDT data segment pointing to stack16.
95 * - gdt_npdata32_idx: A 32-bit not-present GDT data segment pointing to
96 * stack16.
97 *
98 * For no particularly good reason, xyz_sel is a selector value with the
99 * RPL and LDT bits filled in, whereas xyz_idx is just an index into the
100 * descriptor table. These variables will be zero if their respective
101 * segments could not be allocated.
102 */
103static unsigned short ldt_nonexistent_sel;
104static unsigned short code16_sel, data16_sel, npcode32_sel, npdata32_sel;
105
106static unsigned short gdt_data16_idx, gdt_npdata32_idx;
107
108static unsigned short GDT3(int idx)
109{
110 return (idx << 3) | 3;
111}
112
113static unsigned short LDT3(int idx)
114{
115 return (idx << 3) | 7;
116}
117
118/* Our sigaltstack scratch space. */
119static char altstack_data[SIGSTKSZ];
120
121static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
122 int flags)
123{
124 struct sigaction sa;
125 memset(&sa, 0, sizeof(sa));
126 sa.sa_sigaction = handler;
127 sa.sa_flags = SA_SIGINFO | flags;
128 sigemptyset(&sa.sa_mask);
129 if (sigaction(sig, &sa, 0))
130 err(1, "sigaction");
131}
132
133static void clearhandler(int sig)
134{
135 struct sigaction sa;
136 memset(&sa, 0, sizeof(sa));
137 sa.sa_handler = SIG_DFL;
138 sigemptyset(&sa.sa_mask);
139 if (sigaction(sig, &sa, 0))
140 err(1, "sigaction");
141}
142
143static void add_ldt(const struct user_desc *desc, unsigned short *var,
144 const char *name)
145{
146 if (syscall(SYS_modify_ldt, 1, desc, sizeof(*desc)) == 0) {
147 *var = LDT3(desc->entry_number);
148 } else {
149 printf("[NOTE]\tFailed to create %s segment\n", name);
150 *var = 0;
151 }
152}
153
154static void setup_ldt(void)
155{
156 if ((unsigned long)stack16 > (1ULL << 32) - sizeof(stack16))
157 errx(1, "stack16 is too high\n");
158 if ((unsigned long)int3 > (1ULL << 32) - sizeof(int3))
159 errx(1, "int3 is too high\n");
160
161 ldt_nonexistent_sel = LDT3(LDT_OFFSET + 2);
162
163 const struct user_desc code16_desc = {
164 .entry_number = LDT_OFFSET + 0,
165 .base_addr = (unsigned long)int3,
166 .limit = 4095,
167 .seg_32bit = 0,
168 .contents = 2, /* Code, not conforming */
169 .read_exec_only = 0,
170 .limit_in_pages = 0,
171 .seg_not_present = 0,
172 .useable = 0
173 };
174 add_ldt(&code16_desc, &code16_sel, "code16");
175
176 const struct user_desc data16_desc = {
177 .entry_number = LDT_OFFSET + 1,
178 .base_addr = (unsigned long)stack16,
179 .limit = 0xffff,
180 .seg_32bit = 0,
181 .contents = 0, /* Data, grow-up */
182 .read_exec_only = 0,
183 .limit_in_pages = 0,
184 .seg_not_present = 0,
185 .useable = 0
186 };
187 add_ldt(&data16_desc, &data16_sel, "data16");
188
189 const struct user_desc npcode32_desc = {
190 .entry_number = LDT_OFFSET + 3,
191 .base_addr = (unsigned long)int3,
192 .limit = 4095,
193 .seg_32bit = 1,
194 .contents = 2, /* Code, not conforming */
195 .read_exec_only = 0,
196 .limit_in_pages = 0,
197 .seg_not_present = 1,
198 .useable = 0
199 };
200 add_ldt(&npcode32_desc, &npcode32_sel, "npcode32");
201
202 const struct user_desc npdata32_desc = {
203 .entry_number = LDT_OFFSET + 4,
204 .base_addr = (unsigned long)stack16,
205 .limit = 0xffff,
206 .seg_32bit = 1,
207 .contents = 0, /* Data, grow-up */
208 .read_exec_only = 0,
209 .limit_in_pages = 0,
210 .seg_not_present = 1,
211 .useable = 0
212 };
213 add_ldt(&npdata32_desc, &npdata32_sel, "npdata32");
214
215 struct user_desc gdt_data16_desc = {
216 .entry_number = -1,
217 .base_addr = (unsigned long)stack16,
218 .limit = 0xffff,
219 .seg_32bit = 0,
220 .contents = 0, /* Data, grow-up */
221 .read_exec_only = 0,
222 .limit_in_pages = 0,
223 .seg_not_present = 0,
224 .useable = 0
225 };
226
227 if (syscall(SYS_set_thread_area, &gdt_data16_desc) == 0) {
228 /*
229 * This probably indicates vulnerability to CVE-2014-8133.
230 * Merely getting here isn't definitive, though, and we'll
231 * diagnose the problem for real later on.
232 */
233 printf("[WARN]\tset_thread_area allocated data16 at index %d\n",
234 gdt_data16_desc.entry_number);
235 gdt_data16_idx = gdt_data16_desc.entry_number;
236 } else {
237 printf("[OK]\tset_thread_area refused 16-bit data\n");
238 }
239
240 struct user_desc gdt_npdata32_desc = {
241 .entry_number = -1,
242 .base_addr = (unsigned long)stack16,
243 .limit = 0xffff,
244 .seg_32bit = 1,
245 .contents = 0, /* Data, grow-up */
246 .read_exec_only = 0,
247 .limit_in_pages = 0,
248 .seg_not_present = 1,
249 .useable = 0
250 };
251
252 if (syscall(SYS_set_thread_area, &gdt_npdata32_desc) == 0) {
253 /*
254 * As a hardening measure, newer kernels don't allow this.
255 */
256 printf("[WARN]\tset_thread_area allocated npdata32 at index %d\n",
257 gdt_npdata32_desc.entry_number);
258 gdt_npdata32_idx = gdt_npdata32_desc.entry_number;
259 } else {
260 printf("[OK]\tset_thread_area refused 16-bit data\n");
261 }
262}
263
264/* State used by our signal handlers. */
265static gregset_t initial_regs, requested_regs, resulting_regs;
266
267/* Instructions for the SIGUSR1 handler. */
268static volatile unsigned short sig_cs, sig_ss;
269static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
270
271/* Abstractions for some 32-bit vs 64-bit differences. */
272#ifdef __x86_64__
273# define REG_IP REG_RIP
274# define REG_SP REG_RSP
275# define REG_AX REG_RAX
276
277struct selectors {
278 unsigned short cs, gs, fs, ss;
279};
280
281static unsigned short *ssptr(ucontext_t *ctx)
282{
283 struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
284 return &sels->ss;
285}
286
287static unsigned short *csptr(ucontext_t *ctx)
288{
289 struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
290 return &sels->cs;
291}
292#else
293# define REG_IP REG_EIP
294# define REG_SP REG_ESP
295# define REG_AX REG_EAX
296
297static greg_t *ssptr(ucontext_t *ctx)
298{
299 return &ctx->uc_mcontext.gregs[REG_SS];
300}
301
302static greg_t *csptr(ucontext_t *ctx)
303{
304 return &ctx->uc_mcontext.gregs[REG_CS];
305}
306#endif
307
308/* Number of errors in the current test case. */
309static volatile sig_atomic_t nerrs;
310
311/*
312 * SIGUSR1 handler. Sets CS and SS as requested and points IP to the
313 * int3 trampoline. Sets SP to a large known value so that we can see
314 * whether the value round-trips back to user mode correctly.
315 */
316static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
317{
318 ucontext_t *ctx = (ucontext_t*)ctx_void;
319
320 memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
321
322 *csptr(ctx) = sig_cs;
323 *ssptr(ctx) = sig_ss;
324
325 ctx->uc_mcontext.gregs[REG_IP] =
326 sig_cs == code16_sel ? 0 : (unsigned long)&int3;
327 ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
328 ctx->uc_mcontext.gregs[REG_AX] = 0;
329
330 memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
331 requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */
332
333 return;
334}
335
336/*
337 * Called after a successful sigreturn. Restores our state so that
338 * the original raise(SIGUSR1) returns.
339 */
340static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
341{
342 ucontext_t *ctx = (ucontext_t*)ctx_void;
343
344 sig_err = ctx->uc_mcontext.gregs[REG_ERR];
345 sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
346
347 unsigned short ss;
348 asm ("mov %%ss,%0" : "=r" (ss));
349
350 greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX];
351 if (asm_ss != sig_ss && sig == SIGTRAP) {
352 /* Sanity check failure. */
353 printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n",
354 ss, *ssptr(ctx), (unsigned long long)asm_ss);
355 nerrs++;
356 }
357
358 memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
359 memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
360
361 sig_trapped = sig;
362}
363
364/*
365 * Checks a given selector for its code bitness or returns -1 if it's not
366 * a usable code segment selector.
367 */
368int cs_bitness(unsigned short cs)
369{
370 uint32_t valid = 0, ar;
371 asm ("lar %[cs], %[ar]\n\t"
372 "jnz 1f\n\t"
373 "mov $1, %[valid]\n\t"
374 "1:"
375 : [ar] "=r" (ar), [valid] "+rm" (valid)
376 : [cs] "r" (cs));
377
378 if (!valid)
379 return -1;
380
381 bool db = (ar & (1 << 22));
382 bool l = (ar & (1 << 21));
383
384 if (!(ar & (1<<11)))
385 return -1; /* Not code. */
386
387 if (l && !db)
388 return 64;
389 else if (!l && db)
390 return 32;
391 else if (!l && !db)
392 return 16;
393 else
394 return -1; /* Unknown bitness. */
395}
396
397/* Finds a usable code segment of the requested bitness. */
398int find_cs(int bitness)
399{
400 unsigned short my_cs;
401
402 asm ("mov %%cs,%0" : "=r" (my_cs));
403
404 if (cs_bitness(my_cs) == bitness)
405 return my_cs;
406 if (cs_bitness(my_cs + (2 << 3)) == bitness)
407 return my_cs + (2 << 3);
408 if (my_cs > (2<<3) && cs_bitness(my_cs - (2 << 3)) == bitness)
409 return my_cs - (2 << 3);
410 if (cs_bitness(code16_sel) == bitness)
411 return code16_sel;
412
413 printf("[WARN]\tCould not find %d-bit CS\n", bitness);
414 return -1;
415}
416
417static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss)
418{
419 int cs = find_cs(cs_bits);
420 if (cs == -1) {
421 printf("[SKIP]\tCode segment unavailable for %d-bit CS, %d-bit SS\n",
422 cs_bits, use_16bit_ss ? 16 : 32);
423 return 0;
424 }
425
426 if (force_ss != -1) {
427 sig_ss = force_ss;
428 } else {
429 if (use_16bit_ss) {
430 if (!data16_sel) {
431 printf("[SKIP]\tData segment unavailable for %d-bit CS, 16-bit SS\n",
432 cs_bits);
433 return 0;
434 }
435 sig_ss = data16_sel;
436 } else {
437 asm volatile ("mov %%ss,%0" : "=r" (sig_ss));
438 }
439 }
440
441 sig_cs = cs;
442
443 printf("[RUN]\tValid sigreturn: %d-bit CS (%hx), %d-bit SS (%hx%s)\n",
444 cs_bits, sig_cs, use_16bit_ss ? 16 : 32, sig_ss,
445 (sig_ss & 4) ? "" : ", GDT");
446
447 raise(SIGUSR1);
448
449 nerrs = 0;
450
451 /*
452 * Check that each register had an acceptable value when the
453 * int3 trampoline was invoked.
454 */
455 for (int i = 0; i < NGREG; i++) {
456 greg_t req = requested_regs[i], res = resulting_regs[i];
457 if (i == REG_TRAPNO || i == REG_IP)
458 continue; /* don't care */
459 if (i == REG_SP) {
460 printf("\tSP: %llx -> %llx\n", (unsigned long long)req,
461 (unsigned long long)res);
462
463 /*
464 * In many circumstances, the high 32 bits of rsp
465 * are zeroed. For example, we could be a real
466 * 32-bit program, or we could hit any of a number
467 * of poorly-documented IRET or segmented ESP
468 * oddities. If this happens, it's okay.
469 */
470 if (res == (req & 0xFFFFFFFF))
471 continue; /* OK; not expected to work */
472 }
473
474 bool ignore_reg = false;
475#if __i386__
476 if (i == REG_UESP)
477 ignore_reg = true;
478#else
479 if (i == REG_CSGSFS) {
480 struct selectors *req_sels =
481 (void *)&requested_regs[REG_CSGSFS];
482 struct selectors *res_sels =
483 (void *)&resulting_regs[REG_CSGSFS];
484 if (req_sels->cs != res_sels->cs) {
485 printf("[FAIL]\tCS mismatch: requested 0x%hx; got 0x%hx\n",
486 req_sels->cs, res_sels->cs);
487 nerrs++;
488 }
489
490 if (req_sels->ss != res_sels->ss) {
491 printf("[FAIL]\tSS mismatch: requested 0x%hx; got 0x%hx\n",
492 req_sels->ss, res_sels->ss);
493 nerrs++;
494 }
495
496 continue;
497 }
498#endif
499
500 /* Sanity check on the kernel */
501 if (i == REG_AX && requested_regs[i] != resulting_regs[i]) {
502 printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n",
503 (unsigned long long)requested_regs[i],
504 (unsigned long long)resulting_regs[i]);
505 nerrs++;
506 continue;
507 }
508
509 if (requested_regs[i] != resulting_regs[i] && !ignore_reg) {
510 /*
511 * SP is particularly interesting here. The
512 * usual cause of failures is that we hit the
513 * nasty IRET case of returning to a 16-bit SS,
514 * in which case bits 16:31 of the *kernel*
515 * stack pointer persist in ESP.
516 */
517 printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n",
518 i, (unsigned long long)requested_regs[i],
519 (unsigned long long)resulting_regs[i]);
520 nerrs++;
521 }
522 }
523
524 if (nerrs == 0)
525 printf("[OK]\tall registers okay\n");
526
527 return nerrs;
528}
529
530static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
531{
532 int cs = force_cs == -1 ? find_cs(cs_bits) : force_cs;
533 if (cs == -1)
534 return 0;
535
536 sig_cs = cs;
537 sig_ss = ss;
538
539 printf("[RUN]\t%d-bit CS (%hx), bogus SS (%hx)\n",
540 cs_bits, sig_cs, sig_ss);
541
542 sig_trapped = 0;
543 raise(SIGUSR1);
544 if (sig_trapped) {
545 char errdesc[32] = "";
546 if (sig_err) {
547 const char *src = (sig_err & 1) ? " EXT" : "";
548 const char *table;
549 if ((sig_err & 0x6) == 0x0)
550 table = "GDT";
551 else if ((sig_err & 0x6) == 0x4)
552 table = "LDT";
553 else if ((sig_err & 0x6) == 0x2)
554 table = "IDT";
555 else
556 table = "???";
557
558 sprintf(errdesc, "%s%s index %d, ",
559 table, src, sig_err >> 3);
560 }
561
562 char trapname[32];
563 if (sig_trapno == 13)
564 strcpy(trapname, "GP");
565 else if (sig_trapno == 11)
566 strcpy(trapname, "NP");
567 else if (sig_trapno == 12)
568 strcpy(trapname, "SS");
569 else if (sig_trapno == 32)
570 strcpy(trapname, "IRET"); /* X86_TRAP_IRET */
571 else
572 sprintf(trapname, "%d", sig_trapno);
573
574 printf("[OK]\tGot #%s(0x%lx) (i.e. %s%s)\n",
575 trapname, (unsigned long)sig_err,
576 errdesc, strsignal(sig_trapped));
577 return 0;
578 } else {
579 printf("[FAIL]\tDid not get SIGSEGV\n");
580 return 1;
581 }
582}
583
584int main()
585{
586 int total_nerrs = 0;
587 unsigned short my_cs, my_ss;
588
589 asm volatile ("mov %%cs,%0" : "=r" (my_cs));
590 asm volatile ("mov %%ss,%0" : "=r" (my_ss));
591 setup_ldt();
592
593 stack_t stack = {
594 .ss_sp = altstack_data,
595 .ss_size = SIGSTKSZ,
596 };
597 if (sigaltstack(&stack, NULL) != 0)
598 err(1, "sigaltstack");
599
600 sethandler(SIGUSR1, sigusr1, 0);
601 sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
602
603 /* Easy cases: return to a 32-bit SS in each possible CS bitness. */
604 total_nerrs += test_valid_sigreturn(64, false, -1);
605 total_nerrs += test_valid_sigreturn(32, false, -1);
606 total_nerrs += test_valid_sigreturn(16, false, -1);
607
608 /*
609 * Test easy espfix cases: return to a 16-bit LDT SS in each possible
610 * CS bitness. NB: with a long mode CS, the SS bitness is irrelevant.
611 *
612 * This catches the original missing-espfix-on-64-bit-kernels issue
613 * as well as CVE-2014-8134.
614 */
615 total_nerrs += test_valid_sigreturn(64, true, -1);
616 total_nerrs += test_valid_sigreturn(32, true, -1);
617 total_nerrs += test_valid_sigreturn(16, true, -1);
618
619 if (gdt_data16_idx) {
620 /*
621 * For performance reasons, Linux skips espfix if SS points
622 * to the GDT. If we were able to allocate a 16-bit SS in
623 * the GDT, see if it leaks parts of the kernel stack pointer.
624 *
625 * This tests for CVE-2014-8133.
626 */
627 total_nerrs += test_valid_sigreturn(64, true,
628 GDT3(gdt_data16_idx));
629 total_nerrs += test_valid_sigreturn(32, true,
630 GDT3(gdt_data16_idx));
631 total_nerrs += test_valid_sigreturn(16, true,
632 GDT3(gdt_data16_idx));
633 }
634
635 /*
636 * We're done testing valid sigreturn cases. Now we test states
637 * for which sigreturn itself will succeed but the subsequent
638 * entry to user mode will fail.
639 *
640 * Depending on the failure mode and the kernel bitness, these
641 * entry failures can generate SIGSEGV, SIGBUS, or SIGILL.
642 */
643 clearhandler(SIGTRAP);
644 sethandler(SIGSEGV, sigtrap, SA_ONSTACK);
645 sethandler(SIGBUS, sigtrap, SA_ONSTACK);
646 sethandler(SIGILL, sigtrap, SA_ONSTACK); /* 32-bit kernels do this */
647
648 /* Easy failures: invalid SS, resulting in #GP(0) */
649 test_bad_iret(64, ldt_nonexistent_sel, -1);
650 test_bad_iret(32, ldt_nonexistent_sel, -1);
651 test_bad_iret(16, ldt_nonexistent_sel, -1);
652
653 /* These fail because SS isn't a data segment, resulting in #GP(SS) */
654 test_bad_iret(64, my_cs, -1);
655 test_bad_iret(32, my_cs, -1);
656 test_bad_iret(16, my_cs, -1);
657
658 /* Try to return to a not-present code segment, triggering #NP(SS). */
659 test_bad_iret(32, my_ss, npcode32_sel);
660
661 /*
662 * Try to return to a not-present but otherwise valid data segment.
663 * This will cause IRET to fail with #SS on the espfix stack. This
664 * exercises CVE-2014-9322.
665 *
666 * Note that, if espfix is enabled, 64-bit Linux will lose track
667 * of the actual cause of failure and report #GP(0) instead.
668 * This would be very difficult for Linux to avoid, because
669 * espfix64 causes IRET failures to be promoted to #DF, so the
670 * original exception frame is never pushed onto the stack.
671 */
672 test_bad_iret(32, npdata32_sel, -1);
673
674 /*
675 * Try to return to a not-present but otherwise valid data
676 * segment without invoking espfix. Newer kernels don't allow
677 * this to happen in the first place. On older kernels, though,
678 * this can trigger CVE-2014-9322.
679 */
680 if (gdt_npdata32_idx)
681 test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
682
683 return total_nerrs ? 1 : 0;
684}
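
The GDT3()/LDT3() helpers in the test above build selectors by packing a descriptor index together with the table-indicator and RPL bits. When reading the test's output it can help to decode a selector back into those fields; a small standalone sketch (not part of the patch):

    #include <stdio.h>

    /* Decode an x86 segment selector into its three fields. */
    static void describe_selector(unsigned short sel)
    {
            unsigned int index = sel >> 3;        /* descriptor table index        */
            unsigned int ti    = (sel >> 2) & 1;  /* table indicator: 0=GDT, 1=LDT */
            unsigned int rpl   = sel & 3;         /* requested privilege level     */

            printf("selector 0x%hx: index %u in %s, RPL %u\n",
                   sel, index, ti ? "LDT" : "GDT", rpl);
    }
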
diff --git a/tools/testing/selftests/x86/trivial_32bit_program.c b/tools/testing/selftests/x86/trivial_32bit_program.c
new file mode 100644
index 000000000000..2e231beb0a39
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_32bit_program.c
@@ -0,0 +1,14 @@
1/*
2 * Trivial program to check that we have a valid 32-bit build environment.
3 * Copyright (c) 2015 Andy Lutomirski
4 * GPL v2
5 */
6
7#include <stdio.h>
8
9int main()
10{
11 printf("\n");
12
13 return 0;
14}