author     Linus Torvalds <torvalds@linux-foundation.org>  2015-09-01 11:40:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-09-01 11:40:25 -0400
commit     5778077d03cb25aac9b6a428e18970642fc019e3 (patch)
tree       2e3f3da1fb99c3646da5ed9a09644696ca5f2309
parent     65a99597f044c083983f4274ab049c9ec3b9d764 (diff)
parent     7e01ebffffedec22cea86ebe94802f909e4579ca (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm changes from Ingo Molnar:
 "The biggest changes in this cycle were:

   - Revamp, simplify (and in some cases fix) Time Stamp Counter (TSC)
     primitives. (Andy Lutomirski)

   - Add new, comprehensible entry and exit handlers written in C.
     (Andy Lutomirski)

   - vm86 mode cleanups and fixes. (Brian Gerst)

   - 32-bit compat code cleanups. (Brian Gerst)

  The amount of simplification in low level assembly code is already
  palpable:

     arch/x86/entry/entry_32.S | 130 +----
     arch/x86/entry/entry_64.S | 197 ++-----

  but more simplifications are planned.

  There's also the usual laundry mix of low level changes - see the
  changelog for details"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (83 commits)
  x86/asm: Drop repeated macro of X86_EFLAGS_AC definition
  x86/asm/msr: Make wrmsrl() a function
  x86/asm/delay: Introduce an MWAITX-based delay with a configurable timer
  x86/asm: Add MONITORX/MWAITX instruction support
  x86/traps: Weaken context tracking entry assertions
  x86/asm/tsc: Add rdtscll() merge helper
  selftests/x86: Add syscall_nt selftest
  selftests/x86: Disable sigreturn_64
  x86/vdso: Emit a GNU hash
  x86/entry: Remove do_notify_resume(), syscall_trace_leave(), and their TIF masks
  x86/entry/32: Migrate to C exit path
  x86/entry/32: Remove 32-bit syscall audit optimizations
  x86/vm86: Rename vm86->v86flags and v86mask
  x86/vm86: Rename vm86->vm86_info to user_vm86
  x86/vm86: Clean up vm86.h includes
  x86/vm86: Move the vm86 IRQ definitions to vm86.h
  x86/vm86: Use the normal pt_regs area for vm86
  x86/vm86: Eliminate 'struct kernel_vm86_struct'
  x86/vm86: Move fields from 'struct kernel_vm86_struct' to 'struct vm86'
  x86/vm86: Move vm86 fields out of 'thread_struct'
  ...
-rw-r--r--  arch/um/include/shared/kern_util.h  3
-rw-r--r--  arch/um/kernel/process.c  6
-rw-r--r--  arch/um/kernel/signal.c  8
-rw-r--r--  arch/um/kernel/tlb.c  2
-rw-r--r--  arch/um/kernel/trap.c  2
-rw-r--r--  arch/x86/Kconfig  60
-rw-r--r--  arch/x86/Makefile  13
-rw-r--r--  arch/x86/boot/compressed/aslr.c  2
-rw-r--r--  arch/x86/entry/Makefile  1
-rw-r--r--  arch/x86/entry/calling.h  9
-rw-r--r--  arch/x86/entry/common.c  318
-rw-r--r--  arch/x86/entry/entry_32.S  130
-rw-r--r--  arch/x86/entry/entry_64.S  197
-rw-r--r--  arch/x86/entry/entry_64_compat.S  61
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl  15
-rw-r--r--  arch/x86/entry/vdso/Makefile  8
-rw-r--r--  arch/x86/entry/vdso/vclock_gettime.c  16
-rw-r--r--  arch/x86/entry/vdso/vma.c  7
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_64.c  2
-rw-r--r--  arch/x86/ia32/ia32_signal.c  93
-rw-r--r--  arch/x86/include/asm/barrier.h  11
-rw-r--r--  arch/x86/include/asm/context_tracking.h  10
-rw-r--r--  arch/x86/include/asm/cpufeature.h  1
-rw-r--r--  arch/x86/include/asm/delay.h  1
-rw-r--r--  arch/x86/include/asm/elf.h  17
-rw-r--r--  arch/x86/include/asm/ia32.h  9
-rw-r--r--  arch/x86/include/asm/irq_vectors.h  10
-rw-r--r--  arch/x86/include/asm/math_emu.h  6
-rw-r--r--  arch/x86/include/asm/mmu.h  2
-rw-r--r--  arch/x86/include/asm/mmu_context.h  28
-rw-r--r--  arch/x86/include/asm/msr.h  70
-rw-r--r--  arch/x86/include/asm/mwait.h  45
-rw-r--r--  arch/x86/include/asm/paravirt.h  40
-rw-r--r--  arch/x86/include/asm/paravirt_types.h  2
-rw-r--r--  arch/x86/include/asm/processor.h  13
-rw-r--r--  arch/x86/include/asm/ptrace.h  1
-rw-r--r--  arch/x86/include/asm/pvclock.h  10
-rw-r--r--  arch/x86/include/asm/sigframe.h  10
-rw-r--r--  arch/x86/include/asm/signal.h  2
-rw-r--r--  arch/x86/include/asm/stackprotector.h  2
-rw-r--r--  arch/x86/include/asm/syscalls.h  1
-rw-r--r--  arch/x86/include/asm/thread_info.h  27
-rw-r--r--  arch/x86/include/asm/traps.h  4
-rw-r--r--  arch/x86/include/asm/tsc.h  18
-rw-r--r--  arch/x86/include/asm/vm86.h  57
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h  2
-rw-r--r--  arch/x86/kernel/Makefile  4
-rw-r--r--  arch/x86/kernel/apb_timer.c  8
-rw-r--r--  arch/x86/kernel/apic/apic.c  8
-rw-r--r--  arch/x86/kernel/cpu/amd.c  10
-rw-r--r--  arch/x86/kernel/cpu/common.c  6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c  9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c  5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c  4
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c  6
-rw-r--r--  arch/x86/kernel/espfix_64.c  2
-rw-r--r--  arch/x86/kernel/hpet.c  4
-rw-r--r--  arch/x86/kernel/irq.c  15
-rw-r--r--  arch/x86/kernel/nmi.c  10
-rw-r--r--  arch/x86/kernel/paravirt.c  2
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c  2
-rw-r--r--  arch/x86/kernel/process.c  3
-rw-r--r--  arch/x86/kernel/process_32.c  1
-rw-r--r--  arch/x86/kernel/process_64.c  6
-rw-r--r--  arch/x86/kernel/ptrace.c  340
-rw-r--r--  arch/x86/kernel/signal.c  33
-rw-r--r--  arch/x86/kernel/signal_compat.c  95
-rw-r--r--  arch/x86/kernel/step.c  2
-rw-r--r--  arch/x86/kernel/trace_clock.c  7
-rw-r--r--  arch/x86/kernel/traps.c  88
-rw-r--r--  arch/x86/kernel/tsc.c  12
-rw-r--r--  arch/x86/kernel/tsc_sync.c  14
-rw-r--r--  arch/x86/kernel/vm86_32.c  373
-rw-r--r--  arch/x86/kvm/lapic.c  4
-rw-r--r--  arch/x86/kvm/svm.c  4
-rw-r--r--  arch/x86/kvm/vmx.c  4
-rw-r--r--  arch/x86/kvm/x86.c  26
-rw-r--r--  arch/x86/lib/delay.c  60
-rw-r--r--  arch/x86/math-emu/get_address.c  1
-rw-r--r--  arch/x86/mm/fault.c  7
-rw-r--r--  arch/x86/um/asm/barrier.h  13
-rw-r--r--  arch/x86/xen/enlighten.c  3
-rw-r--r--  drivers/cpufreq/intel_pstate.c  2
-rw-r--r--  drivers/input/gameport/gameport.c  4
-rw-r--r--  drivers/input/joystick/analog.c  4
-rw-r--r--  drivers/net/hamradio/baycom_epp.c  2
-rw-r--r--  drivers/scsi/dpt_i2o.c  3
-rw-r--r--  drivers/staging/media/lirc/lirc_serial.c  63
-rw-r--r--  drivers/thermal/intel_powerclamp.c  4
-rw-r--r--  include/linux/context_tracking.h  15
-rw-r--r--  include/linux/context_tracking_state.h  1
-rw-r--r--  include/linux/spinlock.h  30
-rw-r--r--  kernel/notifier.c  2
-rw-r--r--  kernel/sys_ni.c  1
-rw-r--r--  tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c  4
-rw-r--r--  tools/testing/selftests/x86/Makefile  4
-rw-r--r--  tools/testing/selftests/x86/entry_from_vm86.c  139
-rw-r--r--  tools/testing/selftests/x86/ldt_gdt.c  576
-rw-r--r--  tools/testing/selftests/x86/syscall_arg_fault.c  130
-rw-r--r--  tools/testing/selftests/x86/syscall_nt.c  54
100 files changed, 2197 insertions, 1384 deletions
diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h
index 83a91f976330..35ab97e4bb9b 100644
--- a/arch/um/include/shared/kern_util.h
+++ b/arch/um/include/shared/kern_util.h
@@ -22,7 +22,8 @@ extern int kmalloc_ok;
22extern unsigned long alloc_stack(int order, int atomic); 22extern unsigned long alloc_stack(int order, int atomic);
23extern void free_stack(unsigned long stack, int order); 23extern void free_stack(unsigned long stack, int order);
24 24
25extern int do_signal(void); 25struct pt_regs;
26extern void do_signal(struct pt_regs *regs);
26extern void interrupt_end(void); 27extern void interrupt_end(void);
27extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); 28extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs);
28 29
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 68b9119841cd..a6d922672b9f 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -90,12 +90,14 @@ void *__switch_to(struct task_struct *from, struct task_struct *to)
90 90
91void interrupt_end(void) 91void interrupt_end(void)
92{ 92{
93 struct pt_regs *regs = &current->thread.regs;
94
93 if (need_resched()) 95 if (need_resched())
94 schedule(); 96 schedule();
95 if (test_thread_flag(TIF_SIGPENDING)) 97 if (test_thread_flag(TIF_SIGPENDING))
96 do_signal(); 98 do_signal(regs);
97 if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) 99 if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
98 tracehook_notify_resume(&current->thread.regs); 100 tracehook_notify_resume(regs);
99} 101}
100 102
101void exit_thread(void) 103void exit_thread(void)
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 4f60e4aad790..57acbd67d85d 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -64,7 +64,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
64 signal_setup_done(err, ksig, singlestep); 64 signal_setup_done(err, ksig, singlestep);
65} 65}
66 66
67static int kern_do_signal(struct pt_regs *regs) 67void do_signal(struct pt_regs *regs)
68{ 68{
69 struct ksignal ksig; 69 struct ksignal ksig;
70 int handled_sig = 0; 70 int handled_sig = 0;
@@ -110,10 +110,4 @@ static int kern_do_signal(struct pt_regs *regs)
110 */ 110 */
111 if (!handled_sig) 111 if (!handled_sig)
112 restore_saved_sigmask(); 112 restore_saved_sigmask();
113 return handled_sig;
114}
115
116int do_signal(void)
117{
118 return kern_do_signal(&current->thread.regs);
119} 113}
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index f1b3eb14b855..2077248e8a72 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -291,7 +291,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
291 /* We are under mmap_sem, release it such that current can terminate */ 291 /* We are under mmap_sem, release it such that current can terminate */
292 up_write(&current->mm->mmap_sem); 292 up_write(&current->mm->mmap_sem);
293 force_sig(SIGKILL, current); 293 force_sig(SIGKILL, current);
294 do_signal(); 294 do_signal(&current->thread.regs);
295 } 295 }
296} 296}
297 297
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 557232f758b6..d8a9fce6ee2e 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -173,7 +173,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
173void fatal_sigsegv(void) 173void fatal_sigsegv(void)
174{ 174{
175 force_sigsegv(SIGSEGV, current); 175 force_sigsegv(SIGSEGV, current);
176 do_signal(); 176 do_signal(&current->thread.regs);
177 /* 177 /*
178 * This is to tell gcc that we're not returning - do_signal 178 * This is to tell gcc that we're not returning - do_signal
179 * can, in general, return, but in this case, it's not, since 179 * can, in general, return, but in this case, it's not, since
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 06dbb5da90c6..48f7433dac6f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -133,7 +133,7 @@ config X86
133 select HAVE_PERF_USER_STACK_DUMP 133 select HAVE_PERF_USER_STACK_DUMP
134 select HAVE_REGS_AND_STACK_ACCESS_API 134 select HAVE_REGS_AND_STACK_ACCESS_API
135 select HAVE_SYSCALL_TRACEPOINTS 135 select HAVE_SYSCALL_TRACEPOINTS
136 select HAVE_UID16 if X86_32 136 select HAVE_UID16 if X86_32 || IA32_EMULATION
137 select HAVE_UNSTABLE_SCHED_CLOCK 137 select HAVE_UNSTABLE_SCHED_CLOCK
138 select HAVE_USER_RETURN_NOTIFIER 138 select HAVE_USER_RETURN_NOTIFIER
139 select IRQ_FORCED_THREADING 139 select IRQ_FORCED_THREADING
@@ -1003,19 +1003,41 @@ config X86_THERMAL_VECTOR
1003 def_bool y 1003 def_bool y
1004 depends on X86_MCE_INTEL 1004 depends on X86_MCE_INTEL
1005 1005
1006config VM86 1006config X86_LEGACY_VM86
1007 bool "Enable VM86 support" if EXPERT 1007 bool "Legacy VM86 support (obsolete)"
1008 default y 1008 default n
1009 depends on X86_32 1009 depends on X86_32
1010 ---help--- 1010 ---help---
1011 This option is required by programs like DOSEMU to run 1011 This option allows user programs to put the CPU into V8086
1012 16-bit real mode legacy code on x86 processors. It also may 1012 mode, which is an 80286-era approximation of 16-bit real mode.
1013 be needed by software like XFree86 to initialize some video 1013
1014 cards via BIOS. Disabling this option saves about 6K. 1014 Some very old versions of X and/or vbetool require this option
1015 for user mode setting. Similarly, DOSEMU will use it if
1016 available to accelerate real mode DOS programs. However, any
1017 recent version of DOSEMU, X, or vbetool should be fully
1018 functional even without kernel VM86 support, as they will all
1019 fall back to (pretty well performing) software emulation.
1020
1021 Anything that works on a 64-bit kernel is unlikely to need
1022 this option, as 64-bit kernels don't, and can't, support V8086
1023 mode. This option is also unrelated to 16-bit protected mode
1024 and is not needed to run most 16-bit programs under Wine.
1025
1026 Enabling this option adds considerable attack surface to the
1027 kernel and slows down system calls and exception handling.
1028
1029 Unless you use very old userspace or need the last drop of
1030 performance in your real mode DOS games and can't use KVM,
1031 say N here.
1032
1033config VM86
1034 bool
1035 default X86_LEGACY_VM86
1015 1036
1016config X86_16BIT 1037config X86_16BIT
1017 bool "Enable support for 16-bit segments" if EXPERT 1038 bool "Enable support for 16-bit segments" if EXPERT
1018 default y 1039 default y
1040 depends on MODIFY_LDT_SYSCALL
1019 ---help--- 1041 ---help---
1020 This option is required by programs like Wine to run 16-bit 1042 This option is required by programs like Wine to run 16-bit
1021 protected mode legacy code on x86 processors. Disabling 1043 protected mode legacy code on x86 processors. Disabling
@@ -1510,6 +1532,7 @@ config X86_RESERVE_LOW
1510 1532
1511config MATH_EMULATION 1533config MATH_EMULATION
1512 bool 1534 bool
1535 depends on MODIFY_LDT_SYSCALL
1513 prompt "Math emulation" if X86_32 1536 prompt "Math emulation" if X86_32
1514 ---help--- 1537 ---help---
1515 Linux can emulate a math coprocessor (used for floating point 1538 Linux can emulate a math coprocessor (used for floating point
@@ -2054,6 +2077,22 @@ config CMDLINE_OVERRIDE
2054 This is used to work around broken boot loaders. This should 2077 This is used to work around broken boot loaders. This should
2055 be set to 'N' under normal conditions. 2078 be set to 'N' under normal conditions.
2056 2079
2080config MODIFY_LDT_SYSCALL
2081 bool "Enable the LDT (local descriptor table)" if EXPERT
2082 default y
2083 ---help---
2084 Linux can allow user programs to install a per-process x86
2085 Local Descriptor Table (LDT) using the modify_ldt(2) system
2086 call. This is required to run 16-bit or segmented code such as
2087 DOSEMU or some Wine programs. It is also used by some very old
2088 threading libraries.
2089
2090 Enabling this feature adds a small amount of overhead to
2091 context switches and increases the low-level kernel attack
2092 surface. Disabling it removes the modify_ldt(2) system call.
2093
2094 Saying 'N' here may make sense for embedded or server kernels.
2095
2057source "kernel/livepatch/Kconfig" 2096source "kernel/livepatch/Kconfig"
2058 2097
2059endmenu 2098endmenu
@@ -2523,7 +2562,7 @@ config IA32_EMULATION
2523 depends on X86_64 2562 depends on X86_64
2524 select BINFMT_ELF 2563 select BINFMT_ELF
2525 select COMPAT_BINFMT_ELF 2564 select COMPAT_BINFMT_ELF
2526 select HAVE_UID16 2565 select ARCH_WANT_OLD_COMPAT_IPC
2527 ---help--- 2566 ---help---
2528 Include code to run legacy 32-bit programs under a 2567 Include code to run legacy 32-bit programs under a
2529 64-bit kernel. You should likely turn this on, unless you're 2568 64-bit kernel. You should likely turn this on, unless you're
@@ -2537,7 +2576,7 @@ config IA32_AOUT
2537 2576
2538config X86_X32 2577config X86_X32
2539 bool "x32 ABI for 64-bit mode" 2578 bool "x32 ABI for 64-bit mode"
2540 depends on X86_64 && IA32_EMULATION 2579 depends on X86_64
2541 ---help--- 2580 ---help---
2542 Include code to run binaries for the x32 native 32-bit ABI 2581 Include code to run binaries for the x32 native 32-bit ABI
2543 for 64-bit processors. An x32 process gets access to the 2582 for 64-bit processors. An x32 process gets access to the
@@ -2551,7 +2590,6 @@ config X86_X32
2551config COMPAT 2590config COMPAT
2552 def_bool y 2591 def_bool y
2553 depends on IA32_EMULATION || X86_X32 2592 depends on IA32_EMULATION || X86_X32
2554 select ARCH_WANT_OLD_COMPAT_IPC
2555 2593
2556if COMPAT 2594if COMPAT
2557config COMPAT_FOR_U64_ALIGNMENT 2595config COMPAT_FOR_U64_ALIGNMENT
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0f38418719ab..747860c696e1 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS
39 LDFLAGS_vmlinux := --emit-relocs 39 LDFLAGS_vmlinux := --emit-relocs
40endif 40endif
41 41
42#
43# Prevent GCC from generating any FP code by mistake.
44#
45# This must happen before we try the -mpreferred-stack-boundary, see:
46#
47# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
48#
49KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
50KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
51
42ifeq ($(CONFIG_X86_32),y) 52ifeq ($(CONFIG_X86_32),y)
43 BITS := 32 53 BITS := 32
44 UTS_MACHINE := i386 54 UTS_MACHINE := i386
@@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe
167KBUILD_CFLAGS += -Wno-sign-compare 177KBUILD_CFLAGS += -Wno-sign-compare
168# 178#
169KBUILD_CFLAGS += -fno-asynchronous-unwind-tables 179KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
170# prevent gcc from generating any FP code by mistake
171KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
172KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
173 180
174KBUILD_CFLAGS += $(mflags-y) 181KBUILD_CFLAGS += $(mflags-y)
175KBUILD_AFLAGS += $(mflags-y) 182KBUILD_AFLAGS += $(mflags-y)
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index d7b1f655b3ef..6a9b96b4624d 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
82 82
83 if (has_cpuflag(X86_FEATURE_TSC)) { 83 if (has_cpuflag(X86_FEATURE_TSC)) {
84 debug_putstr(" RDTSC"); 84 debug_putstr(" RDTSC");
85 rdtscll(raw); 85 raw = rdtsc();
86 86
87 random ^= raw; 87 random ^= raw;
88 use_i8254 = false; 88 use_i8254 = false;
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 7a144971db79..bd55dedd7614 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,6 +2,7 @@
2# Makefile for the x86 low level entry code 2# Makefile for the x86 low level entry code
3# 3#
4obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o 4obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
5obj-y += common.o
5 6
6obj-y += vdso/ 7obj-y += vdso/
7obj-y += vsyscall/ 8obj-y += vsyscall/
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4e6308c4200..3c71dd947c7b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with
135 movq %rbp, 4*8+\offset(%rsp) 135 movq %rbp, 4*8+\offset(%rsp)
136 movq %rbx, 5*8+\offset(%rsp) 136 movq %rbx, 5*8+\offset(%rsp)
137 .endm 137 .endm
138 .macro SAVE_EXTRA_REGS_RBP offset=0
139 movq %rbp, 4*8+\offset(%rsp)
140 .endm
141 138
142 .macro RESTORE_EXTRA_REGS offset=0 139 .macro RESTORE_EXTRA_REGS offset=0
143 movq 0*8+\offset(%rsp), %r15 140 movq 0*8+\offset(%rsp), %r15
@@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with
193 .macro RESTORE_C_REGS_EXCEPT_RCX_R11 190 .macro RESTORE_C_REGS_EXCEPT_RCX_R11
194 RESTORE_C_REGS_HELPER 1,0,0,1,1 191 RESTORE_C_REGS_HELPER 1,0,0,1,1
195 .endm 192 .endm
196 .macro RESTORE_RSI_RDI
197 RESTORE_C_REGS_HELPER 0,0,0,0,0
198 .endm
199 .macro RESTORE_RSI_RDI_RDX
200 RESTORE_C_REGS_HELPER 0,0,0,0,1
201 .endm
202 193
203 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 194 .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
204 subq $-(15*8+\addskip), %rsp 195 subq $-(15*8+\addskip), %rsp
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000000..80dcc9261ca3
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,318 @@
1/*
2 * common.c - C code for kernel entry and exit
3 * Copyright (c) 2015 Andrew Lutomirski
4 * GPL v2
5 *
6 * Based on asm and ptrace code by many authors. The code here originated
7 * in ptrace.c and signal.c.
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/tracehook.h>
17#include <linux/audit.h>
18#include <linux/seccomp.h>
19#include <linux/signal.h>
20#include <linux/export.h>
21#include <linux/context_tracking.h>
22#include <linux/user-return-notifier.h>
23#include <linux/uprobes.h>
24
25#include <asm/desc.h>
26#include <asm/traps.h>
27
28#define CREATE_TRACE_POINTS
29#include <trace/events/syscalls.h>
30
31#ifdef CONFIG_CONTEXT_TRACKING
32/* Called on entry from user mode with IRQs off. */
33__visible void enter_from_user_mode(void)
34{
35 CT_WARN_ON(ct_state() != CONTEXT_USER);
36 user_exit();
37}
38#endif
39
40static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
41{
42#ifdef CONFIG_X86_64
43 if (arch == AUDIT_ARCH_X86_64) {
44 audit_syscall_entry(regs->orig_ax, regs->di,
45 regs->si, regs->dx, regs->r10);
46 } else
47#endif
48 {
49 audit_syscall_entry(regs->orig_ax, regs->bx,
50 regs->cx, regs->dx, regs->si);
51 }
52}
53
54/*
55 * We can return 0 to resume the syscall or anything else to go to phase
56 * 2. If we resume the syscall, we need to put something appropriate in
57 * regs->orig_ax.
58 *
59 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
60 * are fully functional.
61 *
62 * For phase 2's benefit, our return value is:
63 * 0: resume the syscall
64 * 1: go to phase 2; no seccomp phase 2 needed
65 * anything else: go to phase 2; pass return value to seccomp
66 */
67unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
68{
69 unsigned long ret = 0;
70 u32 work;
71
72 BUG_ON(regs != task_pt_regs(current));
73
74 work = ACCESS_ONCE(current_thread_info()->flags) &
75 _TIF_WORK_SYSCALL_ENTRY;
76
77#ifdef CONFIG_CONTEXT_TRACKING
78 /*
79 * If TIF_NOHZ is set, we are required to call user_exit() before
80 * doing anything that could touch RCU.
81 */
82 if (work & _TIF_NOHZ) {
83 enter_from_user_mode();
84 work &= ~_TIF_NOHZ;
85 }
86#endif
87
88#ifdef CONFIG_SECCOMP
89 /*
90 * Do seccomp first -- it should minimize exposure of other
91 * code, and keeping seccomp fast is probably more valuable
92 * than the rest of this.
93 */
94 if (work & _TIF_SECCOMP) {
95 struct seccomp_data sd;
96
97 sd.arch = arch;
98 sd.nr = regs->orig_ax;
99 sd.instruction_pointer = regs->ip;
100#ifdef CONFIG_X86_64
101 if (arch == AUDIT_ARCH_X86_64) {
102 sd.args[0] = regs->di;
103 sd.args[1] = regs->si;
104 sd.args[2] = regs->dx;
105 sd.args[3] = regs->r10;
106 sd.args[4] = regs->r8;
107 sd.args[5] = regs->r9;
108 } else
109#endif
110 {
111 sd.args[0] = regs->bx;
112 sd.args[1] = regs->cx;
113 sd.args[2] = regs->dx;
114 sd.args[3] = regs->si;
115 sd.args[4] = regs->di;
116 sd.args[5] = regs->bp;
117 }
118
119 BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
120 BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
121
122 ret = seccomp_phase1(&sd);
123 if (ret == SECCOMP_PHASE1_SKIP) {
124 regs->orig_ax = -1;
125 ret = 0;
126 } else if (ret != SECCOMP_PHASE1_OK) {
127 return ret; /* Go directly to phase 2 */
128 }
129
130 work &= ~_TIF_SECCOMP;
131 }
132#endif
133
134 /* Do our best to finish without phase 2. */
135 if (work == 0)
136 return ret; /* seccomp and/or nohz only (ret == 0 here) */
137
138#ifdef CONFIG_AUDITSYSCALL
139 if (work == _TIF_SYSCALL_AUDIT) {
140 /*
141 * If there is no more work to be done except auditing,
142 * then audit in phase 1. Phase 2 always audits, so, if
143 * we audit here, then we can't go on to phase 2.
144 */
145 do_audit_syscall_entry(regs, arch);
146 return 0;
147 }
148#endif
149
150 return 1; /* Something is enabled that we can't handle in phase 1 */
151}
152
153/* Returns the syscall nr to run (which should match regs->orig_ax). */
154long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
155 unsigned long phase1_result)
156{
157 long ret = 0;
158 u32 work = ACCESS_ONCE(current_thread_info()->flags) &
159 _TIF_WORK_SYSCALL_ENTRY;
160
161 BUG_ON(regs != task_pt_regs(current));
162
163 /*
164 * If we stepped into a sysenter/syscall insn, it trapped in
165 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
166 * If user-mode had set TF itself, then it's still clear from
167 * do_debug() and we need to set it again to restore the user
168 * state. If we entered on the slow path, TF was already set.
169 */
170 if (work & _TIF_SINGLESTEP)
171 regs->flags |= X86_EFLAGS_TF;
172
173#ifdef CONFIG_SECCOMP
174 /*
175 * Call seccomp_phase2 before running the other hooks so that
176 * they can see any changes made by a seccomp tracer.
177 */
178 if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
179 /* seccomp failures shouldn't expose any additional code. */
180 return -1;
181 }
182#endif
183
184 if (unlikely(work & _TIF_SYSCALL_EMU))
185 ret = -1L;
186
187 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
188 tracehook_report_syscall_entry(regs))
189 ret = -1L;
190
191 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
192 trace_sys_enter(regs, regs->orig_ax);
193
194 do_audit_syscall_entry(regs, arch);
195
196 return ret ?: regs->orig_ax;
197}
198
199long syscall_trace_enter(struct pt_regs *regs)
200{
201 u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
202 unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
203
204 if (phase1_result == 0)
205 return regs->orig_ax;
206 else
207 return syscall_trace_enter_phase2(regs, arch, phase1_result);
208}
209
210static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
211{
212 unsigned long top_of_stack =
213 (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
214 return (struct thread_info *)(top_of_stack - THREAD_SIZE);
215}
216
217/* Called with IRQs disabled. */
218__visible void prepare_exit_to_usermode(struct pt_regs *regs)
219{
220 if (WARN_ON(!irqs_disabled()))
221 local_irq_disable();
222
223 /*
224 * In order to return to user mode, we need to have IRQs off with
225 * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
226 * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags
227 * can be set at any time on preemptable kernels if we have IRQs on,
228 * so we need to loop. Disabling preemption wouldn't help: doing the
229 * work to clear some of the flags can sleep.
230 */
231 while (true) {
232 u32 cached_flags =
233 READ_ONCE(pt_regs_to_thread_info(regs)->flags);
234
235 if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
236 _TIF_UPROBE | _TIF_NEED_RESCHED |
237 _TIF_USER_RETURN_NOTIFY)))
238 break;
239
240 /* We have work to do. */
241 local_irq_enable();
242
243 if (cached_flags & _TIF_NEED_RESCHED)
244 schedule();
245
246 if (cached_flags & _TIF_UPROBE)
247 uprobe_notify_resume(regs);
248
249 /* deal with pending signal delivery */
250 if (cached_flags & _TIF_SIGPENDING)
251 do_signal(regs);
252
253 if (cached_flags & _TIF_NOTIFY_RESUME) {
254 clear_thread_flag(TIF_NOTIFY_RESUME);
255 tracehook_notify_resume(regs);
256 }
257
258 if (cached_flags & _TIF_USER_RETURN_NOTIFY)
259 fire_user_return_notifiers();
260
261 /* Disable IRQs and retry */
262 local_irq_disable();
263 }
264
265 user_enter();
266}
267
268/*
269 * Called with IRQs on and fully valid regs. Returns with IRQs off in a
270 * state such that we can immediately switch to user mode.
271 */
272__visible void syscall_return_slowpath(struct pt_regs *regs)
273{
274 struct thread_info *ti = pt_regs_to_thread_info(regs);
275 u32 cached_flags = READ_ONCE(ti->flags);
276 bool step;
277
278 CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
279
280 if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
281 regs->orig_ax))
282 local_irq_enable();
283
284 /*
285 * First do one-time work. If these work items are enabled, we
286 * want to run them exactly once per syscall exit with IRQs on.
287 */
288 if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
289 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
290 audit_syscall_exit(regs);
291
292 if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
293 trace_sys_exit(regs, regs->ax);
294
295 /*
296 * If TIF_SYSCALL_EMU is set, we only get here because of
297 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
298 * We already reported this syscall instruction in
299 * syscall_trace_enter().
300 */
301 step = unlikely(
302 (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
303 == _TIF_SINGLESTEP);
304 if (step || cached_flags & _TIF_SYSCALL_TRACE)
305 tracehook_report_syscall_exit(regs, step);
306 }
307
308#ifdef CONFIG_COMPAT
309 /*
310 * Compat syscalls set TS_COMPAT. Make sure we clear it before
311 * returning to user mode.
312 */
313 ti->status &= ~TS_COMPAT;
314#endif
315
316 local_irq_disable();
317 prepare_exit_to_usermode(regs);
318}
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 21dc60a60b5f..b2909bf8cf70 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -45,16 +45,6 @@
45#include <asm/asm.h> 45#include <asm/asm.h>
46#include <asm/smap.h> 46#include <asm/smap.h>
47 47
48/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
49#include <linux/elf-em.h>
50#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
51#define __AUDIT_ARCH_LE 0x40000000
52
53#ifndef CONFIG_AUDITSYSCALL
54# define sysenter_audit syscall_trace_entry
55# define sysexit_audit syscall_exit_work
56#endif
57
58 .section .entry.text, "ax" 48 .section .entry.text, "ax"
59 49
60/* 50/*
@@ -266,14 +256,10 @@ ret_from_intr:
266 256
267ENTRY(resume_userspace) 257ENTRY(resume_userspace)
268 LOCKDEP_SYS_EXIT 258 LOCKDEP_SYS_EXIT
269 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 259 DISABLE_INTERRUPTS(CLBR_ANY)
270 # setting need_resched or sigpending
271 # between sampling and the iret
272 TRACE_IRQS_OFF 260 TRACE_IRQS_OFF
273 movl TI_flags(%ebp), %ecx 261 movl %esp, %eax
274 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on 262 call prepare_exit_to_usermode
275 # int/exception return?
276 jne work_pending
277 jmp restore_all 263 jmp restore_all
278END(ret_from_exception) 264END(ret_from_exception)
279 265
@@ -339,7 +325,7 @@ sysenter_past_esp:
339 GET_THREAD_INFO(%ebp) 325 GET_THREAD_INFO(%ebp)
340 326
341 testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) 327 testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
342 jnz sysenter_audit 328 jnz syscall_trace_entry
343sysenter_do_call: 329sysenter_do_call:
344 cmpl $(NR_syscalls), %eax 330 cmpl $(NR_syscalls), %eax
345 jae sysenter_badsys 331 jae sysenter_badsys
@@ -351,7 +337,7 @@ sysenter_after_call:
351 TRACE_IRQS_OFF 337 TRACE_IRQS_OFF
352 movl TI_flags(%ebp), %ecx 338 movl TI_flags(%ebp), %ecx
353 testl $_TIF_ALLWORK_MASK, %ecx 339 testl $_TIF_ALLWORK_MASK, %ecx
354 jnz sysexit_audit 340 jnz syscall_exit_work_irqs_off
355sysenter_exit: 341sysenter_exit:
356/* if something modifies registers it must also disable sysexit */ 342/* if something modifies registers it must also disable sysexit */
357 movl PT_EIP(%esp), %edx 343 movl PT_EIP(%esp), %edx
@@ -362,40 +348,6 @@ sysenter_exit:
362 PTGS_TO_GS 348 PTGS_TO_GS
363 ENABLE_INTERRUPTS_SYSEXIT 349 ENABLE_INTERRUPTS_SYSEXIT
364 350
365#ifdef CONFIG_AUDITSYSCALL
366sysenter_audit:
367 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
368 jnz syscall_trace_entry
369 /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
370 movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
371 /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
372 pushl PT_ESI(%esp) /* a3: 5th arg */
373 pushl PT_EDX+4(%esp) /* a2: 4th arg */
374 call __audit_syscall_entry
375 popl %ecx /* get that remapped edx off the stack */
376 popl %ecx /* get that remapped esi off the stack */
377 movl PT_EAX(%esp), %eax /* reload syscall number */
378 jmp sysenter_do_call
379
380sysexit_audit:
381 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
382 jnz syscall_exit_work
383 TRACE_IRQS_ON
384 ENABLE_INTERRUPTS(CLBR_ANY)
385 movl %eax, %edx /* second arg, syscall return value */
386 cmpl $-MAX_ERRNO, %eax /* is it an error ? */
387 setbe %al /* 1 if so, 0 if not */
388 movzbl %al, %eax /* zero-extend that */
389 call __audit_syscall_exit
390 DISABLE_INTERRUPTS(CLBR_ANY)
391 TRACE_IRQS_OFF
392 movl TI_flags(%ebp), %ecx
393 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
394 jnz syscall_exit_work
395 movl PT_EAX(%esp), %eax /* reload syscall return value */
396 jmp sysenter_exit
397#endif
398
399.pushsection .fixup, "ax" 351.pushsection .fixup, "ax"
4002: movl $0, PT_FS(%esp) 3522: movl $0, PT_FS(%esp)
401 jmp 1b 353 jmp 1b
@@ -421,13 +373,7 @@ syscall_after_call:
421 movl %eax, PT_EAX(%esp) # store the return value 373 movl %eax, PT_EAX(%esp) # store the return value
422syscall_exit: 374syscall_exit:
423 LOCKDEP_SYS_EXIT 375 LOCKDEP_SYS_EXIT
424 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 376 jmp syscall_exit_work
425 # setting need_resched or sigpending
426 # between sampling and the iret
427 TRACE_IRQS_OFF
428 movl TI_flags(%ebp), %ecx
429 testl $_TIF_ALLWORK_MASK, %ecx # current->work
430 jnz syscall_exit_work
431 377
432restore_all: 378restore_all:
433 TRACE_IRQS_IRET 379 TRACE_IRQS_IRET
@@ -504,57 +450,6 @@ ldt_ss:
504#endif 450#endif
505ENDPROC(entry_INT80_32) 451ENDPROC(entry_INT80_32)
506 452
507 # perform work that needs to be done immediately before resumption
508 ALIGN
509work_pending:
510 testb $_TIF_NEED_RESCHED, %cl
511 jz work_notifysig
512work_resched:
513 call schedule
514 LOCKDEP_SYS_EXIT
515 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
516 # setting need_resched or sigpending
517 # between sampling and the iret
518 TRACE_IRQS_OFF
519 movl TI_flags(%ebp), %ecx
520 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
521 # than syscall tracing?
522 jz restore_all
523 testb $_TIF_NEED_RESCHED, %cl
524 jnz work_resched
525
526work_notifysig: # deal with pending signals and
527 # notify-resume requests
528#ifdef CONFIG_VM86
529 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
530 movl %esp, %eax
531 jnz work_notifysig_v86 # returning to kernel-space or
532 # vm86-space
5331:
534#else
535 movl %esp, %eax
536#endif
537 TRACE_IRQS_ON
538 ENABLE_INTERRUPTS(CLBR_NONE)
539 movb PT_CS(%esp), %bl
540 andb $SEGMENT_RPL_MASK, %bl
541 cmpb $USER_RPL, %bl
542 jb resume_kernel
543 xorl %edx, %edx
544 call do_notify_resume
545 jmp resume_userspace
546
547#ifdef CONFIG_VM86
548 ALIGN
549work_notifysig_v86:
550 pushl %ecx # save ti_flags for do_notify_resume
551 call save_v86_state # %eax contains pt_regs pointer
552 popl %ecx
553 movl %eax, %esp
554 jmp 1b
555#endif
556END(work_pending)
557
558 # perform syscall exit tracing 453 # perform syscall exit tracing
559 ALIGN 454 ALIGN
560syscall_trace_entry: 455syscall_trace_entry:
@@ -569,15 +464,14 @@ END(syscall_trace_entry)
569 464
570 # perform syscall exit tracing 465 # perform syscall exit tracing
571 ALIGN 466 ALIGN
572syscall_exit_work: 467syscall_exit_work_irqs_off:
573 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
574 jz work_pending
575 TRACE_IRQS_ON 468 TRACE_IRQS_ON
576 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 469 ENABLE_INTERRUPTS(CLBR_ANY)
577 # schedule() instead 470
471syscall_exit_work:
578 movl %esp, %eax 472 movl %esp, %eax
579 call syscall_trace_leave 473 call syscall_return_slowpath
580 jmp resume_userspace 474 jmp restore_all
581END(syscall_exit_work) 475END(syscall_exit_work)
582 476
583syscall_fault: 477syscall_fault:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8cb3e438f21e..d3033183ed70 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -33,7 +33,6 @@
33#include <asm/paravirt.h> 33#include <asm/paravirt.h>
34#include <asm/percpu.h> 34#include <asm/percpu.h>
35#include <asm/asm.h> 35#include <asm/asm.h>
36#include <asm/context_tracking.h>
37#include <asm/smap.h> 36#include <asm/smap.h>
38#include <asm/pgtable_types.h> 37#include <asm/pgtable_types.h>
39#include <linux/err.h> 38#include <linux/err.h>
@@ -229,6 +228,11 @@ entry_SYSCALL_64_fastpath:
229 */ 228 */
230 USERGS_SYSRET64 229 USERGS_SYSRET64
231 230
231GLOBAL(int_ret_from_sys_call_irqs_off)
232 TRACE_IRQS_ON
233 ENABLE_INTERRUPTS(CLBR_NONE)
234 jmp int_ret_from_sys_call
235
232 /* Do syscall entry tracing */ 236 /* Do syscall entry tracing */
233tracesys: 237tracesys:
234 movq %rsp, %rdi 238 movq %rsp, %rdi
@@ -272,69 +276,11 @@ tracesys_phase2:
272 * Has correct iret frame. 276 * Has correct iret frame.
273 */ 277 */
274GLOBAL(int_ret_from_sys_call) 278GLOBAL(int_ret_from_sys_call)
275 DISABLE_INTERRUPTS(CLBR_NONE)
276int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
277 TRACE_IRQS_OFF
278 movl $_TIF_ALLWORK_MASK, %edi
279 /* edi: mask to check */
280GLOBAL(int_with_check)
281 LOCKDEP_SYS_EXIT_IRQ
282 GET_THREAD_INFO(%rcx)
283 movl TI_flags(%rcx), %edx
284 andl %edi, %edx
285 jnz int_careful
286 andl $~TS_COMPAT, TI_status(%rcx)
287 jmp syscall_return
288
289 /*
290 * Either reschedule or signal or syscall exit tracking needed.
291 * First do a reschedule test.
292 * edx: work, edi: workmask
293 */
294int_careful:
295 bt $TIF_NEED_RESCHED, %edx
296 jnc int_very_careful
297 TRACE_IRQS_ON
298 ENABLE_INTERRUPTS(CLBR_NONE)
299 pushq %rdi
300 SCHEDULE_USER
301 popq %rdi
302 DISABLE_INTERRUPTS(CLBR_NONE)
303 TRACE_IRQS_OFF
304 jmp int_with_check
305
306 /* handle signals and tracing -- both require a full pt_regs */
307int_very_careful:
308 TRACE_IRQS_ON
309 ENABLE_INTERRUPTS(CLBR_NONE)
310 SAVE_EXTRA_REGS 279 SAVE_EXTRA_REGS
311 /* Check for syscall exit trace */ 280 movq %rsp, %rdi
312 testl $_TIF_WORK_SYSCALL_EXIT, %edx 281 call syscall_return_slowpath /* returns with IRQs disabled */
313 jz int_signal
314 pushq %rdi
315 leaq 8(%rsp), %rdi /* &ptregs -> arg1 */
316 call syscall_trace_leave
317 popq %rdi
318 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
319 jmp int_restore_rest
320
321int_signal:
322 testl $_TIF_DO_NOTIFY_MASK, %edx
323 jz 1f
324 movq %rsp, %rdi /* &ptregs -> arg1 */
325 xorl %esi, %esi /* oldset -> arg2 */
326 call do_notify_resume
3271: movl $_TIF_WORK_MASK, %edi
328int_restore_rest:
329 RESTORE_EXTRA_REGS 282 RESTORE_EXTRA_REGS
330 DISABLE_INTERRUPTS(CLBR_NONE) 283 TRACE_IRQS_IRETQ /* we're about to change IF */
331 TRACE_IRQS_OFF
332 jmp int_with_check
333
334syscall_return:
335 /* The IRETQ could re-enable interrupts: */
336 DISABLE_INTERRUPTS(CLBR_ANY)
337 TRACE_IRQS_IRETQ
338 284
339 /* 285 /*
340 * Try to use SYSRET instead of IRET if we're returning to 286 * Try to use SYSRET instead of IRET if we're returning to
@@ -555,23 +501,22 @@ END(irq_entries_start)
555/* 0(%rsp): ~(interrupt number) */ 501/* 0(%rsp): ~(interrupt number) */
556 .macro interrupt func 502 .macro interrupt func
557 cld 503 cld
558 /* 504 ALLOC_PT_GPREGS_ON_STACK
559 * Since nothing in interrupt handling code touches r12...r15 members 505 SAVE_C_REGS
560 * of "struct pt_regs", and since interrupts can nest, we can save 506 SAVE_EXTRA_REGS
561 * four stack slots and simultaneously provide
562 * an unwind-friendly stack layout by saving "truncated" pt_regs
563 * exactly up to rbp slot, without these members.
564 */
565 ALLOC_PT_GPREGS_ON_STACK -RBP
566 SAVE_C_REGS -RBP
567 /* this goes to 0(%rsp) for unwinder, not for saving the value: */
568 SAVE_EXTRA_REGS_RBP -RBP
569
570 leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */
571 507
572 testb $3, CS-RBP(%rsp) 508 testb $3, CS(%rsp)
573 jz 1f 509 jz 1f
510
511 /*
512 * IRQ from user mode. Switch to kernel gsbase and inform context
513 * tracking that we're in kernel mode.
514 */
574 SWAPGS 515 SWAPGS
516#ifdef CONFIG_CONTEXT_TRACKING
517 call enter_from_user_mode
518#endif
519
5751: 5201:
576 /* 521 /*
577 * Save previous stack pointer, optionally switch to interrupt stack. 522 * Save previous stack pointer, optionally switch to interrupt stack.
@@ -580,14 +525,14 @@ END(irq_entries_start)
580 * a little cheaper to use a separate counter in the PDA (short of 525 * a little cheaper to use a separate counter in the PDA (short of
581 * moving irq_enter into assembly, which would be too much work) 526 * moving irq_enter into assembly, which would be too much work)
582 */ 527 */
583 movq %rsp, %rsi 528 movq %rsp, %rdi
584 incl PER_CPU_VAR(irq_count) 529 incl PER_CPU_VAR(irq_count)
585 cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp 530 cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
586 pushq %rsi 531 pushq %rdi
587 /* We entered an interrupt context - irqs are off: */ 532 /* We entered an interrupt context - irqs are off: */
588 TRACE_IRQS_OFF 533 TRACE_IRQS_OFF
589 534
590 call \func 535 call \func /* rdi points to pt_regs */
591 .endm 536 .endm
592 537
593 /* 538 /*
@@ -606,34 +551,19 @@ ret_from_intr:
606 decl PER_CPU_VAR(irq_count) 551 decl PER_CPU_VAR(irq_count)
607 552
608 /* Restore saved previous stack */ 553 /* Restore saved previous stack */
609 popq %rsi 554 popq %rsp
610 /* return code expects complete pt_regs - adjust rsp accordingly: */
611 leaq -RBP(%rsi), %rsp
612 555
613 testb $3, CS(%rsp) 556 testb $3, CS(%rsp)
614 jz retint_kernel 557 jz retint_kernel
615 /* Interrupt came from user space */
616retint_user:
617 GET_THREAD_INFO(%rcx)
618 558
619 /* %rcx: thread info. Interrupts are off. */ 559 /* Interrupt came from user space */
620retint_with_reschedule:
621 movl $_TIF_WORK_MASK, %edi
622retint_check:
623 LOCKDEP_SYS_EXIT_IRQ 560 LOCKDEP_SYS_EXIT_IRQ
624 movl TI_flags(%rcx), %edx 561GLOBAL(retint_user)
625 andl %edi, %edx 562 mov %rsp,%rdi
626 jnz retint_careful 563 call prepare_exit_to_usermode
627
628retint_swapgs: /* return to user-space */
629 /*
630 * The iretq could re-enable interrupts:
631 */
632 DISABLE_INTERRUPTS(CLBR_ANY)
633 TRACE_IRQS_IRETQ 564 TRACE_IRQS_IRETQ
634
635 SWAPGS 565 SWAPGS
636 jmp restore_c_regs_and_iret 566 jmp restore_regs_and_iret
637 567
638/* Returning to kernel space */ 568/* Returning to kernel space */
639retint_kernel: 569retint_kernel:
@@ -657,6 +587,8 @@ retint_kernel:
657 * At this label, code paths which return to kernel and to user, 587 * At this label, code paths which return to kernel and to user,
658 * which come from interrupts/exception and from syscalls, merge. 588 * which come from interrupts/exception and from syscalls, merge.
659 */ 589 */
590restore_regs_and_iret:
591 RESTORE_EXTRA_REGS
660restore_c_regs_and_iret: 592restore_c_regs_and_iret:
661 RESTORE_C_REGS 593 RESTORE_C_REGS
662 REMOVE_PT_GPREGS_FROM_STACK 8 594 REMOVE_PT_GPREGS_FROM_STACK 8
@@ -707,37 +639,6 @@ native_irq_return_ldt:
707 popq %rax 639 popq %rax
708 jmp native_irq_return_iret 640 jmp native_irq_return_iret
709#endif 641#endif
710
711 /* edi: workmask, edx: work */
712retint_careful:
713 bt $TIF_NEED_RESCHED, %edx
714 jnc retint_signal
715 TRACE_IRQS_ON
716 ENABLE_INTERRUPTS(CLBR_NONE)
717 pushq %rdi
718 SCHEDULE_USER
719 popq %rdi
720 GET_THREAD_INFO(%rcx)
721 DISABLE_INTERRUPTS(CLBR_NONE)
722 TRACE_IRQS_OFF
723 jmp retint_check
724
725retint_signal:
726 testl $_TIF_DO_NOTIFY_MASK, %edx
727 jz retint_swapgs
728 TRACE_IRQS_ON
729 ENABLE_INTERRUPTS(CLBR_NONE)
730 SAVE_EXTRA_REGS
731 movq $-1, ORIG_RAX(%rsp)
732 xorl %esi, %esi /* oldset */
733 movq %rsp, %rdi /* &pt_regs */
734 call do_notify_resume
735 RESTORE_EXTRA_REGS
736 DISABLE_INTERRUPTS(CLBR_NONE)
737 TRACE_IRQS_OFF
738 GET_THREAD_INFO(%rcx)
739 jmp retint_with_reschedule
740
741END(common_interrupt) 642END(common_interrupt)
742 643
743/* 644/*
@@ -1143,12 +1044,22 @@ ENTRY(error_entry)
1143 SAVE_EXTRA_REGS 8 1044 SAVE_EXTRA_REGS 8
1144 xorl %ebx, %ebx 1045 xorl %ebx, %ebx
1145 testb $3, CS+8(%rsp) 1046 testb $3, CS+8(%rsp)
1146 jz error_kernelspace 1047 jz .Lerror_kernelspace
1147 1048
1148 /* We entered from user mode */ 1049.Lerror_entry_from_usermode_swapgs:
1050 /*
1051 * We entered from user mode or we're pretending to have entered
1052 * from user mode due to an IRET fault.
1053 */
1149 SWAPGS 1054 SWAPGS
1150 1055
1151error_entry_done: 1056.Lerror_entry_from_usermode_after_swapgs:
1057#ifdef CONFIG_CONTEXT_TRACKING
1058 call enter_from_user_mode
1059#endif
1060
1061.Lerror_entry_done:
1062
1152 TRACE_IRQS_OFF 1063 TRACE_IRQS_OFF
1153 ret 1064 ret
1154 1065
@@ -1158,31 +1069,30 @@ error_entry_done:
1158 * truncated RIP for IRET exceptions returning to compat mode. Check 1069 * truncated RIP for IRET exceptions returning to compat mode. Check
1159 * for these here too. 1070 * for these here too.
1160 */ 1071 */
1161error_kernelspace: 1072.Lerror_kernelspace:
1162 incl %ebx 1073 incl %ebx
1163 leaq native_irq_return_iret(%rip), %rcx 1074 leaq native_irq_return_iret(%rip), %rcx
1164 cmpq %rcx, RIP+8(%rsp) 1075 cmpq %rcx, RIP+8(%rsp)
1165 je error_bad_iret 1076 je .Lerror_bad_iret
1166 movl %ecx, %eax /* zero extend */ 1077 movl %ecx, %eax /* zero extend */
1167 cmpq %rax, RIP+8(%rsp) 1078 cmpq %rax, RIP+8(%rsp)
1168 je bstep_iret 1079 je .Lbstep_iret
1169 cmpq $gs_change, RIP+8(%rsp) 1080 cmpq $gs_change, RIP+8(%rsp)
1170 jne error_entry_done 1081 jne .Lerror_entry_done
1171 1082
1172 /* 1083 /*
1173 * hack: gs_change can fail with user gsbase. If this happens, fix up 1084 * hack: gs_change can fail with user gsbase. If this happens, fix up
1174 * gsbase and proceed. We'll fix up the exception and land in 1085 * gsbase and proceed. We'll fix up the exception and land in
1175 * gs_change's error handler with kernel gsbase. 1086 * gs_change's error handler with kernel gsbase.
1176 */ 1087 */
1177 SWAPGS 1088 jmp .Lerror_entry_from_usermode_swapgs
1178 jmp error_entry_done
1179 1089
1180bstep_iret: 1090.Lbstep_iret:
1181 /* Fix truncated RIP */ 1091 /* Fix truncated RIP */
1182 movq %rcx, RIP+8(%rsp) 1092 movq %rcx, RIP+8(%rsp)
1183 /* fall through */ 1093 /* fall through */
1184 1094
1185error_bad_iret: 1095.Lerror_bad_iret:
1186 /* 1096 /*
1187 * We came from an IRET to user mode, so we have user gsbase. 1097 * We came from an IRET to user mode, so we have user gsbase.
1188 * Switch to kernel gsbase: 1098 * Switch to kernel gsbase:
@@ -1198,7 +1108,7 @@ error_bad_iret:
1198 call fixup_bad_iret 1108 call fixup_bad_iret
1199 mov %rax, %rsp 1109 mov %rax, %rsp
1200 decl %ebx 1110 decl %ebx
1201 jmp error_entry_done 1111 jmp .Lerror_entry_from_usermode_after_swapgs
1202END(error_entry) 1112END(error_entry)
1203 1113
1204 1114
@@ -1209,7 +1119,6 @@ END(error_entry)
1209 */ 1119 */
1210ENTRY(error_exit) 1120ENTRY(error_exit)
1211 movl %ebx, %eax 1121 movl %ebx, %eax
1212 RESTORE_EXTRA_REGS
1213 DISABLE_INTERRUPTS(CLBR_NONE) 1122 DISABLE_INTERRUPTS(CLBR_NONE)
1214 TRACE_IRQS_OFF 1123 TRACE_IRQS_OFF
1215 testl %eax, %eax 1124 testl %eax, %eax
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index a7e257d9cb90..a9360d40fb7f 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -22,8 +22,8 @@
22#define __AUDIT_ARCH_LE 0x40000000 22#define __AUDIT_ARCH_LE 0x40000000
23 23
24#ifndef CONFIG_AUDITSYSCALL 24#ifndef CONFIG_AUDITSYSCALL
25# define sysexit_audit ia32_ret_from_sys_call 25# define sysexit_audit ia32_ret_from_sys_call_irqs_off
26# define sysretl_audit ia32_ret_from_sys_call 26# define sysretl_audit ia32_ret_from_sys_call_irqs_off
27#endif 27#endif
28 28
29 .section .entry.text, "ax" 29 .section .entry.text, "ax"
@@ -141,7 +141,8 @@ sysexit_from_sys_call:
141 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 141 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
142 movl RIP(%rsp), %ecx /* User %eip */ 142 movl RIP(%rsp), %ecx /* User %eip */
143 movq RAX(%rsp), %rax 143 movq RAX(%rsp), %rax
144 RESTORE_RSI_RDI 144 movl RSI(%rsp), %esi
145 movl RDI(%rsp), %edi
145 xorl %edx, %edx /* Do not leak kernel information */ 146 xorl %edx, %edx /* Do not leak kernel information */
146 xorq %r8, %r8 147 xorq %r8, %r8
147 xorq %r9, %r9 148 xorq %r9, %r9
@@ -209,10 +210,10 @@ sysexit_from_sys_call:
209 .endm 210 .endm
210 211
211 .macro auditsys_exit exit 212 .macro auditsys_exit exit
212 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
213 jnz ia32_ret_from_sys_call
214 TRACE_IRQS_ON 213 TRACE_IRQS_ON
215 ENABLE_INTERRUPTS(CLBR_NONE) 214 ENABLE_INTERRUPTS(CLBR_NONE)
215 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
216 jnz ia32_ret_from_sys_call
216 movl %eax, %esi /* second arg, syscall return value */ 217 movl %eax, %esi /* second arg, syscall return value */
217 cmpl $-MAX_ERRNO, %eax /* is it an error ? */ 218 cmpl $-MAX_ERRNO, %eax /* is it an error ? */
218 jbe 1f 219 jbe 1f
@@ -230,7 +231,7 @@ sysexit_from_sys_call:
230 movq %rax, R10(%rsp) 231 movq %rax, R10(%rsp)
231 movq %rax, R9(%rsp) 232 movq %rax, R9(%rsp)
232 movq %rax, R8(%rsp) 233 movq %rax, R8(%rsp)
233 jmp int_with_check 234 jmp int_ret_from_sys_call_irqs_off
234 .endm 235 .endm
235 236
236sysenter_auditsys: 237sysenter_auditsys:
@@ -365,7 +366,9 @@ cstar_dispatch:
365 366
366sysretl_from_sys_call: 367sysretl_from_sys_call:
367 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) 368 andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
368 RESTORE_RSI_RDI_RDX 369 movl RDX(%rsp), %edx
370 movl RSI(%rsp), %esi
371 movl RDI(%rsp), %edi
369 movl RIP(%rsp), %ecx 372 movl RIP(%rsp), %ecx
370 movl EFLAGS(%rsp), %r11d 373 movl EFLAGS(%rsp), %r11d
371 movq RAX(%rsp), %rax 374 movq RAX(%rsp), %rax
@@ -430,8 +433,48 @@ cstar_tracesys:
430END(entry_SYSCALL_compat) 433END(entry_SYSCALL_compat)
431 434
432ia32_badarg: 435ia32_badarg:
433 ASM_CLAC 436 /*
434 movq $-EFAULT, RAX(%rsp) 437 * So far, we've entered kernel mode, set AC, turned on IRQs, and
438 * saved C regs except r8-r11. We haven't done any of the other
439 * standard entry work, though. We want to bail, but we shouldn't
440 * treat this as a syscall entry since we don't even know what the
441 * args are. Instead, treat this as a non-syscall entry, finish
442 * the entry work, and immediately exit after setting AX = -EFAULT.
443 *
444 * We're really just being polite here. Killing the task outright
445 * would be a reasonable action, too. Given that the only valid
446 * way to have gotten here is through the vDSO, and we already know
447 * that the stack pointer is bad, the task isn't going to survive
448 * for long no matter what we do.
449 */
450
451 ASM_CLAC /* undo STAC */
452 movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */
453
454 /* Fill in the rest of pt_regs */
455 xorl %eax, %eax
456 movq %rax, R11(%rsp)
457 movq %rax, R10(%rsp)
458 movq %rax, R9(%rsp)
459 movq %rax, R8(%rsp)
460 SAVE_EXTRA_REGS
461
462 /* Turn IRQs back off. */
463 DISABLE_INTERRUPTS(CLBR_NONE)
464 TRACE_IRQS_OFF
465
466 /* Now finish entering normal kernel mode. */
467#ifdef CONFIG_CONTEXT_TRACKING
468 call enter_from_user_mode
469#endif
470
471 /* And exit again. */
472 jmp retint_user
473
474ia32_ret_from_sys_call_irqs_off:
475 TRACE_IRQS_ON
476 ENABLE_INTERRUPTS(CLBR_NONE)
477
435ia32_ret_from_sys_call: 478ia32_ret_from_sys_call:
436 xorl %eax, %eax /* Do not leak kernel information */ 479 xorl %eax, %eax /* Do not leak kernel information */
437 movq %rax, R11(%rsp) 480 movq %rax, R11(%rsp)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f9d28d..25e3cf1cd8fd 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -365,3 +365,18 @@
365356 i386 memfd_create sys_memfd_create 365356 i386 memfd_create sys_memfd_create
366357 i386 bpf sys_bpf 366357 i386 bpf sys_bpf
367358 i386 execveat sys_execveat stub32_execveat 367358 i386 execveat sys_execveat stub32_execveat
368359 i386 socket sys_socket
369360 i386 socketpair sys_socketpair
370361 i386 bind sys_bind
371362 i386 connect sys_connect
372363 i386 listen sys_listen
373364 i386 accept4 sys_accept4
374365 i386 getsockopt sys_getsockopt compat_sys_getsockopt
375366 i386 setsockopt sys_setsockopt compat_sys_setsockopt
376367 i386 getsockname sys_getsockname
377368 i386 getpeername sys_getpeername
378369 i386 sendto sys_sendto
379370 i386 sendmsg sys_sendmsg compat_sys_sendmsg
380371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
381372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
382373 i386 shutdown sys_shutdown
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index e97032069f88..a3d0767a6b29 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -8,7 +8,7 @@ KASAN_SANITIZE := n
8VDSO64-$(CONFIG_X86_64) := y 8VDSO64-$(CONFIG_X86_64) := y
9VDSOX32-$(CONFIG_X86_X32_ABI) := y 9VDSOX32-$(CONFIG_X86_X32_ABI) := y
10VDSO32-$(CONFIG_X86_32) := y 10VDSO32-$(CONFIG_X86_32) := y
11VDSO32-$(CONFIG_COMPAT) := y 11VDSO32-$(CONFIG_IA32_EMULATION) := y
12 12
13# files to link into the vdso 13# files to link into the vdso
14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o 14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
@@ -20,7 +20,7 @@ obj-y += vma.o
20vdso_img-$(VDSO64-y) += 64 20vdso_img-$(VDSO64-y) += 64
21vdso_img-$(VDSOX32-y) += x32 21vdso_img-$(VDSOX32-y) += x32
22vdso_img-$(VDSO32-y) += 32-int80 22vdso_img-$(VDSO32-y) += 32-int80
23vdso_img-$(CONFIG_COMPAT) += 32-syscall 23vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall
24vdso_img-$(VDSO32-y) += 32-sysenter 24vdso_img-$(VDSO32-y) += 32-sysenter
25 25
26obj-$(VDSO32-y) += vdso32-setup.o 26obj-$(VDSO32-y) += vdso32-setup.o
@@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
126# Build multiple 32-bit vDSO images to choose from at boot time. 126# Build multiple 32-bit vDSO images to choose from at boot time.
127# 127#
128vdso32.so-$(VDSO32-y) += int80 128vdso32.so-$(VDSO32-y) += int80
129vdso32.so-$(CONFIG_COMPAT) += syscall 129vdso32.so-$(CONFIG_IA32_EMULATION) += syscall
130vdso32.so-$(VDSO32-y) += sysenter 130vdso32.so-$(VDSO32-y) += sysenter
131 131
132vdso32-images = $(vdso32.so-y:%=vdso32-%.so) 132vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
@@ -175,7 +175,7 @@ quiet_cmd_vdso = VDSO $@
175 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ 175 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
176 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' 176 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
177 177
178VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ 178VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
179 $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) 179 $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
180GCOV_PROFILE := n 180GCOV_PROFILE := n
181 181
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9793322751e0..ca94fa649251 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
175 175
176notrace static cycle_t vread_tsc(void) 176notrace static cycle_t vread_tsc(void)
177{ 177{
178 cycle_t ret; 178 cycle_t ret = (cycle_t)rdtsc_ordered();
179 u64 last; 179 u64 last = gtod->cycle_last;
180
181 /*
182 * Empirically, a fence (of type that depends on the CPU)
183 * before rdtsc is enough to ensure that rdtsc is ordered
184 * with respect to loads. The various CPU manuals are unclear
185 * as to whether rdtsc can be reordered with later loads,
186 * but no one has ever seen it happen.
187 */
188 rdtsc_barrier();
189 ret = (cycle_t)__native_read_tsc();
190
191 last = gtod->cycle_last;
192 180
193 if (likely(ret >= last)) 181 if (likely(ret >= last))
194 return ret; 182 return ret;
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1c9f750c3859..434543145d78 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -177,7 +177,7 @@ up_fail:
177 return ret; 177 return ret;
178} 178}
179 179
180#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 180#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
181static int load_vdso32(void) 181static int load_vdso32(void)
182{ 182{
183 int ret; 183 int ret;
@@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
219 return map_vdso(&vdso_image_x32, true); 219 return map_vdso(&vdso_image_x32, true);
220 } 220 }
221#endif 221#endif
222 222#ifdef CONFIG_IA32_EMULATION
223 return load_vdso32(); 223 return load_vdso32();
224#else
225 return 0;
226#endif
224} 227}
225#endif 228#endif
226#else 229#else
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 2dcc6ff6fdcc..26a46f44e298 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = {
290 290
291struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 291struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
292{ 292{
293#ifdef CONFIG_IA32_EMULATION 293#ifdef CONFIG_COMPAT
294 if (!mm || mm->context.ia32_compat) 294 if (!mm || mm->context.ia32_compat)
295 return NULL; 295 return NULL;
296#endif 296#endif
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index ae3a29ae875b..a0a19b7ba22d 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,99 +34,6 @@
34#include <asm/sys_ia32.h> 34#include <asm/sys_ia32.h>
35#include <asm/smap.h> 35#include <asm/smap.h>
36 36
37int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
38{
39 int err = 0;
40 bool ia32 = test_thread_flag(TIF_IA32);
41
42 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
43 return -EFAULT;
44
45 put_user_try {
46 /* If you change siginfo_t structure, please make sure that
47 this code is fixed accordingly.
48 It should never copy any pad contained in the structure
49 to avoid security leaks, but must copy the generic
50 3 ints plus the relevant union member. */
51 put_user_ex(from->si_signo, &to->si_signo);
52 put_user_ex(from->si_errno, &to->si_errno);
53 put_user_ex((short)from->si_code, &to->si_code);
54
55 if (from->si_code < 0) {
56 put_user_ex(from->si_pid, &to->si_pid);
57 put_user_ex(from->si_uid, &to->si_uid);
58 put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
59 } else {
60 /*
61 * First 32bits of unions are always present:
62 * si_pid === si_band === si_tid === si_addr(LS half)
63 */
64 put_user_ex(from->_sifields._pad[0],
65 &to->_sifields._pad[0]);
66 switch (from->si_code >> 16) {
67 case __SI_FAULT >> 16:
68 break;
69 case __SI_SYS >> 16:
70 put_user_ex(from->si_syscall, &to->si_syscall);
71 put_user_ex(from->si_arch, &to->si_arch);
72 break;
73 case __SI_CHLD >> 16:
74 if (ia32) {
75 put_user_ex(from->si_utime, &to->si_utime);
76 put_user_ex(from->si_stime, &to->si_stime);
77 } else {
78 put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
79 put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
80 }
81 put_user_ex(from->si_status, &to->si_status);
82 /* FALL THROUGH */
83 default:
84 case __SI_KILL >> 16:
85 put_user_ex(from->si_uid, &to->si_uid);
86 break;
87 case __SI_POLL >> 16:
88 put_user_ex(from->si_fd, &to->si_fd);
89 break;
90 case __SI_TIMER >> 16:
91 put_user_ex(from->si_overrun, &to->si_overrun);
92 put_user_ex(ptr_to_compat(from->si_ptr),
93 &to->si_ptr);
94 break;
95 /* This is not generated by the kernel as of now. */
96 case __SI_RT >> 16:
97 case __SI_MESGQ >> 16:
98 put_user_ex(from->si_uid, &to->si_uid);
99 put_user_ex(from->si_int, &to->si_int);
100 break;
101 }
102 }
103 } put_user_catch(err);
104
105 return err;
106}
107
108int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
109{
110 int err = 0;
111 u32 ptr32;
112
113 if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
114 return -EFAULT;
115
116 get_user_try {
117 get_user_ex(to->si_signo, &from->si_signo);
118 get_user_ex(to->si_errno, &from->si_errno);
119 get_user_ex(to->si_code, &from->si_code);
120
121 get_user_ex(to->si_pid, &from->si_pid);
122 get_user_ex(to->si_uid, &from->si_uid);
123 get_user_ex(ptr32, &from->si_ptr);
124 to->si_ptr = compat_ptr(ptr32);
125 } get_user_catch(err);
126
127 return err;
128}
129
130/* 37/*
131 * Do a signal return; undo the signal stack. 38 * Do a signal return; undo the signal stack.
132 */ 39 */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index e51a8f803f55..818cb8788225 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -91,15 +91,4 @@ do { \
91#define smp_mb__before_atomic() barrier() 91#define smp_mb__before_atomic() barrier()
92#define smp_mb__after_atomic() barrier() 92#define smp_mb__after_atomic() barrier()
93 93
94/*
95 * Stop RDTSC speculation. This is needed when you need to use RDTSC
96 * (or get_cycles or vread that possibly accesses the TSC) in a defined
97 * code region.
98 */
99static __always_inline void rdtsc_barrier(void)
100{
101 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
102 "lfence", X86_FEATURE_LFENCE_RDTSC);
103}
104
105#endif /* _ASM_X86_BARRIER_H */ 94#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h
deleted file mode 100644
index 1fe49704b146..000000000000
--- a/arch/x86/include/asm/context_tracking.h
+++ /dev/null
@@ -1,10 +0,0 @@
1#ifndef _ASM_X86_CONTEXT_TRACKING_H
2#define _ASM_X86_CONTEXT_TRACKING_H
3
4#ifdef CONFIG_CONTEXT_TRACKING
5# define SCHEDULE_USER call schedule_user
6#else
7# define SCHEDULE_USER call schedule
8#endif
9
10#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606fb97d0..a39e5708209b 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,6 +176,7 @@
176#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ 176#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
177#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ 177#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
178#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ 178#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
179#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
179 180
180/* 181/*
181 * Auxiliary flags: Linux defined - For features scattered in various 182 * Auxiliary flags: Linux defined - For features scattered in various
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9b3b4f2754c7..36a760bda462 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -4,5 +4,6 @@
4#include <asm-generic/delay.h> 4#include <asm-generic/delay.h>
5 5
6void use_tsc_delay(void); 6void use_tsc_delay(void);
7void use_mwaitx_delay(void);
7 8
8#endif /* _ASM_X86_DELAY_H */ 9#endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f161c189c27b..141c561f4664 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
78#ifdef CONFIG_X86_64 78#ifdef CONFIG_X86_64
79extern unsigned int vdso64_enabled; 79extern unsigned int vdso64_enabled;
80#endif 80#endif
81#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) 81#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
82extern unsigned int vdso32_enabled; 82extern unsigned int vdso32_enabled;
83#endif 83#endif
84 84
@@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
187#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 187#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
188 elf_common_init(&current->thread, regs, __USER_DS) 188 elf_common_init(&current->thread, regs, __USER_DS)
189 189
190void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); 190void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
191#define compat_start_thread start_thread_ia32 191#define compat_start_thread compat_start_thread
192 192
193void set_personality_ia32(bool); 193void set_personality_ia32(bool);
194#define COMPAT_SET_PERSONALITY(ex) \ 194#define COMPAT_SET_PERSONALITY(ex) \
@@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
344 */ 344 */
345static inline int mmap_is_ia32(void) 345static inline int mmap_is_ia32(void)
346{ 346{
347#ifdef CONFIG_X86_32 347 return config_enabled(CONFIG_X86_32) ||
348 return 1; 348 (config_enabled(CONFIG_COMPAT) &&
349#endif 349 test_thread_flag(TIF_ADDR32));
350#ifdef CONFIG_IA32_EMULATION
351 if (test_thread_flag(TIF_ADDR32))
352 return 1;
353#endif
354 return 0;
355} 350}
356 351
357/* Do not change the values. See get_align_mask() */ 352/* Do not change the values. See get_align_mask() */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index d0e8e0141041..28019765442e 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -22,15 +22,6 @@ struct ucontext_ia32 {
22 compat_sigset_t uc_sigmask; /* mask last for extensibility */ 22 compat_sigset_t uc_sigmask; /* mask last for extensibility */
23}; 23};
24 24
25struct ucontext_x32 {
26 unsigned int uc_flags;
27 unsigned int uc_link;
28 compat_stack_t uc_stack;
29 unsigned int uc__pad0; /* needed for alignment */
30 struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
31 compat_sigset_t uc_sigmask; /* mask last for extensibility */
32};
33
34/* This matches struct stat64 in glibc2.2, hence the absolutely 25/* This matches struct stat64 in glibc2.2, hence the absolutely
35 * insane amounts of padding around dev_t's. 26 * insane amounts of padding around dev_t's.
36 */ 27 */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4c2d2eb2060a..6ca9fd6234e1 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,16 +117,6 @@
117 117
118#define FPU_IRQ 13 118#define FPU_IRQ 13
119 119
120#define FIRST_VM86_IRQ 3
121#define LAST_VM86_IRQ 15
122
123#ifndef __ASSEMBLY__
124static inline int invalid_vm86_irq(int irq)
125{
126 return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
127}
128#endif
129
130/* 120/*
131 * Size the maximum number of interrupts. 121 * Size the maximum number of interrupts.
132 * 122 *
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 031f6266f425..0d9b14f60d2c 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_MATH_EMU_H 2#define _ASM_X86_MATH_EMU_H
3 3
4#include <asm/ptrace.h> 4#include <asm/ptrace.h>
5#include <asm/vm86.h>
6 5
7/* This structure matches the layout of the data saved to the stack 6/* This structure matches the layout of the data saved to the stack
8 following a device-not-present interrupt, part of it saved 7 following a device-not-present interrupt, part of it saved
@@ -10,9 +9,6 @@
10 */ 9 */
11struct math_emu_info { 10struct math_emu_info {
12 long ___orig_eip; 11 long ___orig_eip;
13 union { 12 struct pt_regs *regs;
14 struct pt_regs *regs;
15 struct kernel_vm86_regs *vm86;
16 };
17}; 13};
18#endif /* _ASM_X86_MATH_EMU_H */ 14#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 364d27481a52..55234d5e7160 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,7 +9,9 @@
9 * we put the segment information here. 9 * we put the segment information here.
10 */ 10 */
11typedef struct { 11typedef struct {
12#ifdef CONFIG_MODIFY_LDT_SYSCALL
12 struct ldt_struct *ldt; 13 struct ldt_struct *ldt;
14#endif
13 15
14#ifdef CONFIG_X86_64 16#ifdef CONFIG_X86_64
15 /* True if mm supports a task running in 32 bit compatibility mode. */ 17 /* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 984abfe47edc..379cd3658799 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
33static inline void load_mm_cr4(struct mm_struct *mm) {} 33static inline void load_mm_cr4(struct mm_struct *mm) {}
34#endif 34#endif
35 35
36#ifdef CONFIG_MODIFY_LDT_SYSCALL
36/* 37/*
37 * ldt_structs can be allocated, used, and freed, but they are never 38 * ldt_structs can be allocated, used, and freed, but they are never
38 * modified while live. 39 * modified while live.
@@ -48,8 +49,23 @@ struct ldt_struct {
48 int size; 49 int size;
49}; 50};
50 51
52/*
53 * Used for LDT copy/destruction.
54 */
55int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
56void destroy_context(struct mm_struct *mm);
57#else /* CONFIG_MODIFY_LDT_SYSCALL */
58static inline int init_new_context(struct task_struct *tsk,
59 struct mm_struct *mm)
60{
61 return 0;
62}
63static inline void destroy_context(struct mm_struct *mm) {}
64#endif
65
51static inline void load_mm_ldt(struct mm_struct *mm) 66static inline void load_mm_ldt(struct mm_struct *mm)
52{ 67{
68#ifdef CONFIG_MODIFY_LDT_SYSCALL
53 struct ldt_struct *ldt; 69 struct ldt_struct *ldt;
54 70
55 /* lockless_dereference synchronizes with smp_store_release */ 71 /* lockless_dereference synchronizes with smp_store_release */
@@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm)
73 set_ldt(ldt->entries, ldt->size); 89 set_ldt(ldt->entries, ldt->size);
74 else 90 else
75 clear_LDT(); 91 clear_LDT();
92#else
93 clear_LDT();
94#endif
76 95
77 DEBUG_LOCKS_WARN_ON(preemptible()); 96 DEBUG_LOCKS_WARN_ON(preemptible());
78} 97}
79 98
80/*
81 * Used for LDT copy/destruction.
82 */
83int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
84void destroy_context(struct mm_struct *mm);
85
86
87static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 99static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
88{ 100{
89#ifdef CONFIG_SMP 101#ifdef CONFIG_SMP
@@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
114 /* Load per-mm CR4 state */ 126 /* Load per-mm CR4 state */
115 load_mm_cr4(next); 127 load_mm_cr4(next);
116 128
129#ifdef CONFIG_MODIFY_LDT_SYSCALL
117 /* 130 /*
118 * Load the LDT, if the LDT is different. 131 * Load the LDT, if the LDT is different.
119 * 132 *
@@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
128 */ 141 */
129 if (unlikely(prev->context.ldt != next->context.ldt)) 142 if (unlikely(prev->context.ldt != next->context.ldt))
130 load_mm_ldt(next); 143 load_mm_ldt(next);
144#endif
131 } 145 }
132#ifdef CONFIG_SMP 146#ifdef CONFIG_SMP
133 else { 147 else {
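
The CONFIG_MODIFY_LDT_SYSCALL changes above use the usual header idiom: real prototypes when the option is enabled, empty static inline stubs otherwise, so callers stay free of #ifdefs. An illustrative, non-kernel sketch of the same idiom:

struct my_ctx;	/* hypothetical type, for illustration only */

#ifdef CONFIG_MY_OPTIONAL_FEATURE
int  my_feature_init(struct my_ctx *ctx);
void my_feature_destroy(struct my_ctx *ctx);
#else
static inline int  my_feature_init(struct my_ctx *ctx) { return 0; }
static inline void my_feature_destroy(struct my_ctx *ctx) { }
#endif
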
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e6a707eb5081..77d8b284e4a7 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
47 * it means rax *or* rdx. 47 * it means rax *or* rdx.
48 */ 48 */
49#ifdef CONFIG_X86_64 49#ifdef CONFIG_X86_64
50#define DECLARE_ARGS(val, low, high) unsigned low, high 50/* Using 64-bit values saves one instruction clearing the high half of low */
51#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) 51#define DECLARE_ARGS(val, low, high) unsigned long low, high
52#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) 52#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32)
53#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) 53#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
54#else 54#else
55#define DECLARE_ARGS(val, low, high) unsigned long long val 55#define DECLARE_ARGS(val, low, high) unsigned long long val
56#define EAX_EDX_VAL(val, low, high) (val) 56#define EAX_EDX_VAL(val, low, high) (val)
57#define EAX_EDX_ARGS(val, low, high) "A" (val)
58#define EAX_EDX_RET(val, low, high) "=A" (val) 57#define EAX_EDX_RET(val, low, high) "=A" (val)
59#endif 58#endif
60 59
@@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
106 return err; 105 return err;
107} 106}
108 107
109extern unsigned long long native_read_tsc(void);
110
111extern int rdmsr_safe_regs(u32 regs[8]); 108extern int rdmsr_safe_regs(u32 regs[8]);
112extern int wrmsr_safe_regs(u32 regs[8]); 109extern int wrmsr_safe_regs(u32 regs[8]);
113 110
114static __always_inline unsigned long long __native_read_tsc(void) 111/**
112 * rdtsc() - returns the current TSC without ordering constraints
113 *
114 * rdtsc() returns the result of RDTSC as a 64-bit integer. The
115 * only ordering constraint it supplies is the ordering implied by
116 * "asm volatile": it will put the RDTSC in the place you expect. The
117 * CPU can and will speculatively execute that RDTSC, though, so the
118 * results can be non-monotonic if compared on different CPUs.
119 */
120static __always_inline unsigned long long rdtsc(void)
115{ 121{
116 DECLARE_ARGS(val, low, high); 122 DECLARE_ARGS(val, low, high);
117 123
@@ -120,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void)
120 return EAX_EDX_VAL(val, low, high); 126 return EAX_EDX_VAL(val, low, high);
121} 127}
122 128
129/**
130 * rdtsc_ordered() - read the current TSC in program order
131 *
132 * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
133 * It is ordered like a load to a global in-memory counter. It should
134 * be impossible to observe non-monotonic rdtsc_unordered() behavior
135 * across multiple CPUs as long as the TSC is synced.
136 */
137static __always_inline unsigned long long rdtsc_ordered(void)
138{
139 /*
140 * The RDTSC instruction is not ordered relative to memory
141 * access. The Intel SDM and the AMD APM are both vague on this
142 * point, but empirically an RDTSC instruction can be
143 * speculatively executed before prior loads. An RDTSC
144 * immediately after an appropriate barrier appears to be
145 * ordered as a normal load, that is, it provides the same
146 * ordering guarantees as reading from a global memory location
147 * that some other imaginary CPU is updating continuously with a
148 * time stamp.
149 */
150 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
151 "lfence", X86_FEATURE_LFENCE_RDTSC);
152 return rdtsc();
153}
154
155/* Deprecated, keep it for a cycle for easier merging: */
156#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0)
157
123static inline unsigned long long native_read_pmc(int counter) 158static inline unsigned long long native_read_pmc(int counter)
124{ 159{
125 DECLARE_ARGS(val, low, high); 160 DECLARE_ARGS(val, low, high);
@@ -153,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
153#define rdmsrl(msr, val) \ 188#define rdmsrl(msr, val) \
154 ((val) = native_read_msr((msr))) 189 ((val) = native_read_msr((msr)))
155 190
156#define wrmsrl(msr, val) \ 191static inline void wrmsrl(unsigned msr, u64 val)
157 native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) 192{
193 native_write_msr(msr, (u32)val, (u32)(val >> 32));
194}
158 195
159/* wrmsr with exception handling */ 196/* wrmsr with exception handling */
160static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) 197static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
@@ -180,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
180 return err; 217 return err;
181} 218}
182 219
183#define rdtscl(low) \
184 ((low) = (u32)__native_read_tsc())
185
186#define rdtscll(val) \
187 ((val) = __native_read_tsc())
188
189#define rdpmc(counter, low, high) \ 220#define rdpmc(counter, low, high) \
190do { \ 221do { \
191 u64 _l = native_read_pmc((counter)); \ 222 u64 _l = native_read_pmc((counter)); \
@@ -195,15 +226,6 @@ do { \
195 226
196#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) 227#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
197 228
198#define rdtscp(low, high, aux) \
199do { \
200 unsigned long long _val = native_read_tscp(&(aux)); \
201 (low) = (u32)_val; \
202 (high) = (u32)(_val >> 32); \
203} while (0)
204
205#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
206
207#endif /* !CONFIG_PARAVIRT */ 229#endif /* !CONFIG_PARAVIRT */
208 230
209/* 231/*
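
Hedged usage sketch for the new msr.h helpers (assumes kernel context; time_region() is a made-up name): rdtsc_ordered() at both edges keeps the reads from migrating across the measured work, while plain rdtsc() is enough wherever ordering does not matter.

static u64 time_region(void (*work)(void))
{
	u64 t0, t1;

	t0 = rdtsc_ordered();	/* cannot float above earlier loads */
	work();
	t1 = rdtsc_ordered();	/* cannot sink below the measured work */

	return t1 - t0;		/* elapsed TSC cycles */
}
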
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 653dfa7662e1..c70689b5e5aa 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,9 @@
14#define CPUID5_ECX_INTERRUPT_BREAK 0x2 14#define CPUID5_ECX_INTERRUPT_BREAK 0x2
15 15
16#define MWAIT_ECX_INTERRUPT_BREAK 0x1 16#define MWAIT_ECX_INTERRUPT_BREAK 0x1
17#define MWAITX_ECX_TIMER_ENABLE BIT(1)
18#define MWAITX_MAX_LOOPS ((u32)-1)
19#define MWAITX_DISABLE_CSTATES 0xf
17 20
18static inline void __monitor(const void *eax, unsigned long ecx, 21static inline void __monitor(const void *eax, unsigned long ecx,
19 unsigned long edx) 22 unsigned long edx)
@@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx,
23 :: "a" (eax), "c" (ecx), "d"(edx)); 26 :: "a" (eax), "c" (ecx), "d"(edx));
24} 27}
25 28
29static inline void __monitorx(const void *eax, unsigned long ecx,
30 unsigned long edx)
31{
32 /* "monitorx %eax, %ecx, %edx;" */
33 asm volatile(".byte 0x0f, 0x01, 0xfa;"
34 :: "a" (eax), "c" (ecx), "d"(edx));
35}
36
26static inline void __mwait(unsigned long eax, unsigned long ecx) 37static inline void __mwait(unsigned long eax, unsigned long ecx)
27{ 38{
28 /* "mwait %eax, %ecx;" */ 39 /* "mwait %eax, %ecx;" */
@@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
30 :: "a" (eax), "c" (ecx)); 41 :: "a" (eax), "c" (ecx));
31} 42}
32 43
44/*
45 * MWAITX allows for a timer expiration to get the core out a wait state in
46 * addition to the default MWAIT exit condition of a store appearing at a
47 * monitored virtual address.
48 *
49 * Registers:
50 *
51 * MWAITX ECX[1]: enable timer if set
52 * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
53 * frequency is the same as the TSC frequency.
54 *
55 * Below is a comparison between MWAIT and MWAITX on AMD processors:
56 *
57 * MWAIT MWAITX
58 * opcode 0f 01 c9 | 0f 01 fb
59 * ECX[0] value of RFLAGS.IF seen by instruction
60 * ECX[1] unused/#GP if set | enable timer if set
61 * ECX[31:2] unused/#GP if set
62 * EAX unused (reserve for hint)
63 * EBX[31:0] unused | max wait time (P0 clocks)
64 *
65 * MONITOR MONITORX
66 * opcode 0f 01 c8 | 0f 01 fa
67 * EAX (logical) address to monitor
68 * ECX #GP if not zero
69 */
70static inline void __mwaitx(unsigned long eax, unsigned long ebx,
71 unsigned long ecx)
72{
73 /* "mwaitx %eax, %ebx, %ecx;" */
74 asm volatile(".byte 0x0f, 0x01, 0xfb;"
75 :: "a" (eax), "b" (ebx), "c" (ecx));
76}
77
33static inline void __sti_mwait(unsigned long eax, unsigned long ecx) 78static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
34{ 79{
35 trace_hardirqs_on(); 80 trace_hardirqs_on();
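
A rough sketch of the MWAITX-based delay these helpers make possible (an approximation under stated assumptions, not the lib/delay.c code added elsewhere in the series): arm the monitor on a dummy variable, then MWAITX with the timer bit set so the core wakes after at most 'loops' SW P0 clocks even if the monitored line is never written.

static void mwaitx_delay_sketch(u64 cycles)
{
	static char dummy;		/* line to monitor, never written */
	u64 end = rdtsc_ordered() + cycles;

	while (rdtsc_ordered() < end) {
		u64 left = end - rdtsc_ordered();
		u32 loops = left > MWAITX_MAX_LOOPS ? MWAITX_MAX_LOOPS : left;

		__monitorx(&dummy, 0, 0);
		__mwaitx(MWAITX_DISABLE_CSTATES, loops,
			 MWAITX_ECX_TIMER_ENABLE);
	}
}
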
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d143bfad45d7..10d0596433f8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -153,7 +153,11 @@ do { \
153 val = paravirt_read_msr(msr, &_err); \ 153 val = paravirt_read_msr(msr, &_err); \
154} while (0) 154} while (0)
155 155
156#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) 156static inline void wrmsrl(unsigned msr, u64 val)
157{
158 wrmsr(msr, (u32)val, (u32)(val>>32));
159}
160
157#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) 161#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
158 162
159/* rdmsr with exception handling */ 163/* rdmsr with exception handling */
@@ -174,19 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
174 return err; 178 return err;
175} 179}
176 180
177static inline u64 paravirt_read_tsc(void)
178{
179 return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
180}
181
182#define rdtscl(low) \
183do { \
184 u64 _l = paravirt_read_tsc(); \
185 low = (int)_l; \
186} while (0)
187
188#define rdtscll(val) (val = paravirt_read_tsc())
189
190static inline unsigned long long paravirt_sched_clock(void) 181static inline unsigned long long paravirt_sched_clock(void)
191{ 182{
192 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); 183 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@@ -215,27 +206,6 @@ do { \
215 206
216#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) 207#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
217 208
218static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
219{
220 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
221}
222
223#define rdtscp(low, high, aux) \
224do { \
225 int __aux; \
226 unsigned long __val = paravirt_rdtscp(&__aux); \
227 (low) = (u32)__val; \
228 (high) = (u32)(__val >> 32); \
229 (aux) = __aux; \
230} while (0)
231
232#define rdtscpll(val, aux) \
233do { \
234 unsigned long __aux; \
235 val = paravirt_rdtscp(&__aux); \
236 (aux) = __aux; \
237} while (0)
238
239static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) 209static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
240{ 210{
241 PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); 211 PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index a6b8f9fadb06..ce029e4fa7c6 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,9 +156,7 @@ struct pv_cpu_ops {
156 u64 (*read_msr)(unsigned int msr, int *err); 156 u64 (*read_msr)(unsigned int msr, int *err);
157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high); 157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
158 158
159 u64 (*read_tsc)(void);
160 u64 (*read_pmc)(int counter); 159 u64 (*read_pmc)(int counter);
161 unsigned long long (*read_tscp)(unsigned int *aux);
162 160
163#ifdef CONFIG_X86_32 161#ifdef CONFIG_X86_32
164 /* 162 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 944f1785ed0d..9615a4e2645e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -6,8 +6,8 @@
6/* Forward declaration, a strange C thing */ 6/* Forward declaration, a strange C thing */
7struct task_struct; 7struct task_struct;
8struct mm_struct; 8struct mm_struct;
9struct vm86;
9 10
10#include <asm/vm86.h>
11#include <asm/math_emu.h> 11#include <asm/math_emu.h>
12#include <asm/segment.h> 12#include <asm/segment.h>
13#include <asm/types.h> 13#include <asm/types.h>
@@ -400,15 +400,9 @@ struct thread_struct {
400 unsigned long cr2; 400 unsigned long cr2;
401 unsigned long trap_nr; 401 unsigned long trap_nr;
402 unsigned long error_code; 402 unsigned long error_code;
403#ifdef CONFIG_X86_32 403#ifdef CONFIG_VM86
404 /* Virtual 86 mode info */ 404 /* Virtual 86 mode info */
405 struct vm86_struct __user *vm86_info; 405 struct vm86 *vm86;
406 unsigned long screen_bitmap;
407 unsigned long v86flags;
408 unsigned long v86mask;
409 unsigned long saved_sp0;
410 unsigned int saved_fs;
411 unsigned int saved_gs;
412#endif 406#endif
413 /* IO permissions: */ 407 /* IO permissions: */
414 unsigned long *io_bitmap_ptr; 408 unsigned long *io_bitmap_ptr;
@@ -720,7 +714,6 @@ static inline void spin_lock_prefetch(const void *x)
720 714
721#define INIT_THREAD { \ 715#define INIT_THREAD { \
722 .sp0 = TOP_OF_INIT_STACK, \ 716 .sp0 = TOP_OF_INIT_STACK, \
723 .vm86_info = NULL, \
724 .sysenter_cs = __KERNEL_CS, \ 717 .sysenter_cs = __KERNEL_CS, \
725 .io_bitmap_ptr = NULL, \ 718 .io_bitmap_ptr = NULL, \
726} 719}
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5fabf1362942..6271281f947d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
88 unsigned long phase1_result); 88 unsigned long phase1_result);
89 89
90extern long syscall_trace_enter(struct pt_regs *); 90extern long syscall_trace_enter(struct pt_regs *);
91extern void syscall_trace_leave(struct pt_regs *);
92 91
93static inline unsigned long regs_return_value(struct pt_regs *regs) 92static inline unsigned long regs_return_value(struct pt_regs *regs)
94{ 93{
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 628954ceede1..7a6bed5c08bc 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
62static __always_inline 62static __always_inline
63u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) 63u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
64{ 64{
65 u64 delta = __native_read_tsc() - src->tsc_timestamp; 65 u64 delta = rdtsc_ordered() - src->tsc_timestamp;
66 return pvclock_scale_delta(delta, src->tsc_to_system_mul, 66 return pvclock_scale_delta(delta, src->tsc_to_system_mul,
67 src->tsc_shift); 67 src->tsc_shift);
68} 68}
@@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
76 u8 ret_flags; 76 u8 ret_flags;
77 77
78 version = src->version; 78 version = src->version;
79 /* Note: emulated platforms which do not advertise SSE2 support 79
80 * result in kvmclock not using the necessary RDTSC barriers.
81 * Without barriers, it is possible that RDTSC instruction reads from
82 * the time stamp counter outside rdtsc_barrier protected section
83 * below, resulting in violation of monotonicity.
84 */
85 rdtsc_barrier();
86 offset = pvclock_get_nsec_offset(src); 80 offset = pvclock_get_nsec_offset(src);
87 ret = src->system_time + offset; 81 ret = src->system_time + offset;
88 ret_flags = src->flags; 82 ret_flags = src->flags;
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 7c7c27c97daa..1f3175bb994e 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -4,6 +4,7 @@
4#include <asm/sigcontext.h> 4#include <asm/sigcontext.h>
5#include <asm/siginfo.h> 5#include <asm/siginfo.h>
6#include <asm/ucontext.h> 6#include <asm/ucontext.h>
7#include <linux/compat.h>
7 8
8#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
9#define sigframe_ia32 sigframe 10#define sigframe_ia32 sigframe
@@ -69,6 +70,15 @@ struct rt_sigframe {
69 70
70#ifdef CONFIG_X86_X32_ABI 71#ifdef CONFIG_X86_X32_ABI
71 72
73struct ucontext_x32 {
74 unsigned int uc_flags;
75 unsigned int uc_link;
76 compat_stack_t uc_stack;
77 unsigned int uc__pad0; /* needed for alignment */
78 struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
79 compat_sigset_t uc_sigmask; /* mask last for extensibility */
80};
81
72struct rt_sigframe_x32 { 82struct rt_sigframe_x32 {
73 u64 pretcode; 83 u64 pretcode;
74 struct ucontext_x32 uc; 84 struct ucontext_x32 uc;
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 31eab867e6d3..c481be78fcf1 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -30,7 +30,7 @@ typedef sigset_t compat_sigset_t;
30#endif /* __ASSEMBLY__ */ 30#endif /* __ASSEMBLY__ */
31#include <uapi/asm/signal.h> 31#include <uapi/asm/signal.h>
32#ifndef __ASSEMBLY__ 32#ifndef __ASSEMBLY__
33extern void do_notify_resume(struct pt_regs *, void *, __u32); 33extern void do_signal(struct pt_regs *regs);
34 34
35#define __ARCH_HAS_SA_RESTORER 35#define __ARCH_HAS_SA_RESTORER
36 36
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2e00bb2a136..58505f01962f 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
72 * on during the bootup the random pool has true entropy too. 72 * on during the bootup the random pool has true entropy too.
73 */ 73 */
74 get_random_bytes(&canary, sizeof(canary)); 74 get_random_bytes(&canary, sizeof(canary));
75 tsc = __native_read_tsc(); 75 tsc = rdtsc();
76 canary += tsc + (tsc << 32UL); 76 canary += tsc + (tsc << 32UL);
77 77
78 current->stack_canary = canary; 78 current->stack_canary = canary;
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 592a6a672e07..91dfcafe27a6 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
37asmlinkage unsigned long sys_sigreturn(void); 37asmlinkage unsigned long sys_sigreturn(void);
38 38
39/* kernel/vm86_32.c */ 39/* kernel/vm86_32.c */
40struct vm86_struct;
40asmlinkage long sys_vm86old(struct vm86_struct __user *); 41asmlinkage long sys_vm86old(struct vm86_struct __user *);
41asmlinkage long sys_vm86(unsigned long, unsigned long); 42asmlinkage long sys_vm86(unsigned long, unsigned long);
42 43
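
The added 'struct vm86_struct;' line is the standard way to cut a header dependency: a forward declaration suffices when only pointers cross the interface, so syscalls.h no longer needs asm/vm86.h at all. Illustrative sketch with made-up names:

/* widget.h is never included here; the pointer type alone is enough. */
struct widget;

long widget_submit(struct widget *w, unsigned long flags);
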
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..8afdc3e44247 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -27,14 +27,17 @@
27 * Without this offset, that can result in a page fault. (We are 27 * Without this offset, that can result in a page fault. (We are
28 * careful that, in this case, the value we read doesn't matter.) 28 * careful that, in this case, the value we read doesn't matter.)
29 * 29 *
30 * In vm86 mode, the hardware frame is much longer still, but we neither 30 * In vm86 mode, the hardware frame is much longer still, so add 16
31 * access the extra members from NMI context, nor do we write such a 31 * bytes to make room for the real-mode segments.
32 * frame at sp0 at all.
33 * 32 *
34 * x86_64 has a fixed-length stack frame. 33 * x86_64 has a fixed-length stack frame.
35 */ 34 */
36#ifdef CONFIG_X86_32 35#ifdef CONFIG_X86_32
37# define TOP_OF_KERNEL_STACK_PADDING 8 36# ifdef CONFIG_VM86
37# define TOP_OF_KERNEL_STACK_PADDING 16
38# else
39# define TOP_OF_KERNEL_STACK_PADDING 8
40# endif
38#else 41#else
39# define TOP_OF_KERNEL_STACK_PADDING 0 42# define TOP_OF_KERNEL_STACK_PADDING 0
40#endif 43#endif
@@ -140,27 +143,11 @@ struct thread_info {
140 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ 143 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
141 _TIF_NOHZ) 144 _TIF_NOHZ)
142 145
143/* work to do in syscall_trace_leave() */
144#define _TIF_WORK_SYSCALL_EXIT \
145 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
146 _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
147
148/* work to do on interrupt/exception return */
149#define _TIF_WORK_MASK \
150 (0x0000FFFF & \
151 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
152 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
153
154/* work to do on any return to user space */ 146/* work to do on any return to user space */
155#define _TIF_ALLWORK_MASK \ 147#define _TIF_ALLWORK_MASK \
156 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ 148 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
157 _TIF_NOHZ) 149 _TIF_NOHZ)
158 150
159/* Only used for 64 bit */
160#define _TIF_DO_NOTIFY_MASK \
161 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
162 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
163
164/* flags to check in __switch_to() */ 151/* flags to check in __switch_to() */
165#define _TIF_WORK_CTXSW \ 152#define _TIF_WORK_CTXSW \
166 (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) 153 (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
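
The 16 in TOP_OF_KERNEL_STACK_PADDING above comes from the vm86 hardware frame: leaving virtual-8086 mode pushes ES, DS, FS and GS on top of the normal frame, i.e. four extra 32-bit slots. Back-of-the-envelope sketch (illustrative macros, not in the patch):

#define VM86_EXTRA_SEGMENT_SLOTS	4	/* es, ds, fs, gs */
#define VM86_EXTRA_FRAME_BYTES		(VM86_EXTRA_SEGMENT_SLOTS * 4)	/* 16 */
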
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c5380bea2a36..c3496619740a 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
112asmlinkage void smp_deferred_error_interrupt(void); 112asmlinkage void smp_deferred_error_interrupt(void);
113#endif 113#endif
114 114
115extern enum ctx_state ist_enter(struct pt_regs *regs); 115extern void ist_enter(struct pt_regs *regs);
116extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); 116extern void ist_exit(struct pt_regs *regs);
117extern void ist_begin_non_atomic(struct pt_regs *regs); 117extern void ist_begin_non_atomic(struct pt_regs *regs);
118extern void ist_end_non_atomic(void); 118extern void ist_end_non_atomic(void);
119 119
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index aad56eb3bbe2..6d7c5479bcea 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -21,28 +21,12 @@ extern void disable_TSC(void);
21 21
22static inline cycles_t get_cycles(void) 22static inline cycles_t get_cycles(void)
23{ 23{
24 unsigned long long ret = 0;
25
26#ifndef CONFIG_X86_TSC 24#ifndef CONFIG_X86_TSC
27 if (!cpu_has_tsc) 25 if (!cpu_has_tsc)
28 return 0; 26 return 0;
29#endif 27#endif
30 rdtscll(ret);
31
32 return ret;
33}
34 28
35static __always_inline cycles_t vget_cycles(void) 29 return rdtsc();
36{
37 /*
38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe):
40 */
41#ifndef CONFIG_X86_TSC
42 if (!cpu_has_tsc)
43 return 0;
44#endif
45 return (cycles_t)__native_read_tsc();
46} 30}
47 31
48extern void tsc_init(void); 32extern void tsc_init(void);
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 1d8de3f3feca..1e491f3af317 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_VM86_H 1#ifndef _ASM_X86_VM86_H
2#define _ASM_X86_VM86_H 2#define _ASM_X86_VM86_H
3 3
4
5#include <asm/ptrace.h> 4#include <asm/ptrace.h>
6#include <uapi/asm/vm86.h> 5#include <uapi/asm/vm86.h>
7 6
@@ -28,43 +27,49 @@ struct kernel_vm86_regs {
28 unsigned short gs, __gsh; 27 unsigned short gs, __gsh;
29}; 28};
30 29
31struct kernel_vm86_struct { 30struct vm86 {
32 struct kernel_vm86_regs regs; 31 struct vm86plus_struct __user *user_vm86;
33/* 32 struct pt_regs regs32;
34 * the below part remains on the kernel stack while we are in VM86 mode. 33 unsigned long veflags;
35 * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we 34 unsigned long veflags_mask;
36 * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above 35 unsigned long saved_sp0;
37 * 'struct kernel_vm86_regs' with the then actual values. 36
38 * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
39 * in kernelspace, hence we need not reget the data from userspace.
40 */
41#define VM86_TSS_ESP0 flags
42 unsigned long flags; 37 unsigned long flags;
43 unsigned long screen_bitmap; 38 unsigned long screen_bitmap;
44 unsigned long cpu_type; 39 unsigned long cpu_type;
45 struct revectored_struct int_revectored; 40 struct revectored_struct int_revectored;
46 struct revectored_struct int21_revectored; 41 struct revectored_struct int21_revectored;
47 struct vm86plus_info_struct vm86plus; 42 struct vm86plus_info_struct vm86plus;
48 struct pt_regs *regs32; /* here we save the pointer to the old regs */
49/*
50 * The below is not part of the structure, but the stack layout continues
51 * this way. In front of 'return-eip' may be some data, depending on
52 * compilation, so we don't rely on this and save the pointer to 'oldregs'
53 * in 'regs32' above.
54 * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
55
56 long return-eip; from call to vm86()
57 struct pt_regs oldregs; user space registers as saved by syscall
58 */
59}; 43};
60 44
61#ifdef CONFIG_VM86 45#ifdef CONFIG_VM86
62 46
63void handle_vm86_fault(struct kernel_vm86_regs *, long); 47void handle_vm86_fault(struct kernel_vm86_regs *, long);
64int handle_vm86_trap(struct kernel_vm86_regs *, long, int); 48int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
65struct pt_regs *save_v86_state(struct kernel_vm86_regs *); 49void save_v86_state(struct kernel_vm86_regs *, int);
66 50
67struct task_struct; 51struct task_struct;
52
53#define free_vm86(t) do { \
54 struct thread_struct *__t = (t); \
55 if (__t->vm86 != NULL) { \
56 kfree(__t->vm86); \
57 __t->vm86 = NULL; \
58 } \
59} while (0)
60
61/*
62 * Support for VM86 programs to request interrupts for
63 * real mode hardware drivers:
64 */
65#define FIRST_VM86_IRQ 3
66#define LAST_VM86_IRQ 15
67
68static inline int invalid_vm86_irq(int irq)
69{
70 return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
71}
72
68void release_vm86_irqs(struct task_struct *); 73void release_vm86_irqs(struct task_struct *);
69 74
70#else 75#else
@@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
77 return 0; 82 return 0;
78} 83}
79 84
85static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
86
87#define free_vm86(t) do { } while(0)
88
80#endif /* CONFIG_VM86 */ 89#endif /* CONFIG_VM86 */
81 90
82#endif /* _ASM_X86_VM86_H */ 91#endif /* _ASM_X86_VM86_H */
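
A sketch of the lifecycle the new 'struct vm86' and free_vm86() imply (an approximation, not the exact vm86_32.c code): the state is allocated lazily on the first vm86 call and released from the thread-exit path, so a task that never enters vm86 mode carries nothing but a NULL pointer in thread_struct.

static int get_vm86_state(struct thread_struct *t)
{
	if (!t->vm86) {
		t->vm86 = kzalloc(sizeof(*t->vm86), GFP_KERNEL);
		if (!t->vm86)
			return -ENOMEM;
	}
	/* ... copy in user_vm86, save regs32, enter vm86 mode ... */
	return 0;
}

/* and on thread exit:  free_vm86(&tsk->thread);  */
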
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 180a0c3c224d..79887abcb5e1 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -37,8 +37,6 @@
37#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) 37#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT)
38#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ 38#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */
39#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) 39#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
40#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */
41#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
42#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ 40#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */
43#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) 41#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT)
44#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */ 42#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..514064897d55 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
23CFLAGS_irq.o := -I$(src)/../include/asm/trace 23CFLAGS_irq.o := -I$(src)/../include/asm/trace
24 24
25obj-y := process_$(BITS).o signal.o 25obj-y := process_$(BITS).o signal.o
26obj-$(CONFIG_COMPAT) += signal_compat.o
26obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
27obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o 28obj-y += time.o ioport.o dumpstack.o nmi.o
29obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
28obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 30obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
29obj-$(CONFIG_IRQ_WORK) += irq_work.o 31obj-$(CONFIG_IRQ_WORK) += irq_work.o
30obj-y += probe_roms.o 32obj-y += probe_roms.o
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index ede92c3364d3..222a57076039 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
263 263
264 /* Verify whether apbt counter works */ 264 /* Verify whether apbt counter works */
265 t1 = dw_apb_clocksource_read(clocksource_apbt); 265 t1 = dw_apb_clocksource_read(clocksource_apbt);
266 rdtscll(start); 266 start = rdtsc();
267 267
268 /* 268 /*
269 * We don't know the TSC frequency yet, but waiting for 269 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
273 */ 273 */
274 do { 274 do {
275 rep_nop(); 275 rep_nop();
276 rdtscll(now); 276 now = rdtsc();
277 } while ((now - start) < 200000UL); 277 } while ((now - start) < 200000UL);
278 278
279 /* APBT is the only always on clocksource, it has to work! */ 279 /* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
390 old = dw_apb_clocksource_read(clocksource_apbt); 390 old = dw_apb_clocksource_read(clocksource_apbt);
391 old += loop; 391 old += loop;
392 392
393 t1 = __native_read_tsc(); 393 t1 = rdtsc();
394 394
395 do { 395 do {
396 new = dw_apb_clocksource_read(clocksource_apbt); 396 new = dw_apb_clocksource_read(clocksource_apbt);
397 } while (new < old); 397 } while (new < old);
398 398
399 t2 = __native_read_tsc(); 399 t2 = rdtsc();
400 400
401 shift = 5; 401 shift = 5;
402 if (unlikely(loop >> shift == 0)) { 402 if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index cde732c1b495..5aba9220a5ac 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
457{ 457{
458 u64 tsc; 458 u64 tsc;
459 459
460 rdtscll(tsc); 460 tsc = rdtsc();
461 wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); 461 wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
462 return 0; 462 return 0;
463} 463}
@@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
592 unsigned long pm = acpi_pm_read_early(); 592 unsigned long pm = acpi_pm_read_early();
593 593
594 if (cpu_has_tsc) 594 if (cpu_has_tsc)
595 rdtscll(tsc); 595 tsc = rdtsc();
596 596
597 switch (lapic_cal_loops++) { 597 switch (lapic_cal_loops++) {
598 case 0: 598 case 0:
@@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
1209 long long max_loops = cpu_khz ? cpu_khz : 1000000; 1209 long long max_loops = cpu_khz ? cpu_khz : 1000000;
1210 1210
1211 if (cpu_has_tsc) 1211 if (cpu_has_tsc)
1212 rdtscll(tsc); 1212 tsc = rdtsc();
1213 1213
1214 if (disable_apic) { 1214 if (disable_apic) {
1215 disable_ioapic_support(); 1215 disable_ioapic_support();
@@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
1293 } 1293 }
1294 if (queued) { 1294 if (queued) {
1295 if (cpu_has_tsc && cpu_khz) { 1295 if (cpu_has_tsc && cpu_khz) {
1296 rdtscll(ntsc); 1296 ntsc = rdtsc();
1297 max_loops = (cpu_khz << 10) - (ntsc - tsc); 1297 max_loops = (cpu_khz << 10) - (ntsc - tsc);
1298 } else 1298 } else
1299 max_loops--; 1299 max_loops--;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd3a4baffe50..4a70fc6d400a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
11#include <asm/cpu.h> 11#include <asm/cpu.h>
12#include <asm/smp.h> 12#include <asm/smp.h>
13#include <asm/pci-direct.h> 13#include <asm/pci-direct.h>
14#include <asm/delay.h>
14 15
15#ifdef CONFIG_X86_64 16#ifdef CONFIG_X86_64
16# include <asm/mmconfig.h> 17# include <asm/mmconfig.h>
@@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
114 const int K6_BUG_LOOP = 1000000; 115 const int K6_BUG_LOOP = 1000000;
115 int n; 116 int n;
116 void (*f_vide)(void); 117 void (*f_vide)(void);
117 unsigned long d, d2; 118 u64 d, d2;
118 119
119 printk(KERN_INFO "AMD K6 stepping B detected - "); 120 printk(KERN_INFO "AMD K6 stepping B detected - ");
120 121
@@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
125 126
126 n = K6_BUG_LOOP; 127 n = K6_BUG_LOOP;
127 f_vide = vide; 128 f_vide = vide;
128 rdtscl(d); 129 d = rdtsc();
129 while (n--) 130 while (n--)
130 f_vide(); 131 f_vide();
131 rdtscl(d2); 132 d2 = rdtsc();
132 d = d2-d; 133 d = d2-d;
133 134
134 if (d > 20*K6_BUG_LOOP) 135 if (d > 20*K6_BUG_LOOP)
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
506 /* A random value per boot for bit slice [12:upper_bit) */ 507 /* A random value per boot for bit slice [12:upper_bit) */
507 va_align.bits = get_random_int() & va_align.mask; 508 va_align.bits = get_random_int() & va_align.mask;
508 } 509 }
510
511 if (cpu_has(c, X86_FEATURE_MWAITX))
512 use_mwaitx_delay();
509} 513}
510 514
511static void early_init_amd(struct cpuinfo_x86 *c) 515static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb9e5df42dd2..b128808853a2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1185,10 +1185,10 @@ void syscall_init(void)
1185 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. 1185 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1186 */ 1186 */
1187 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 1187 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1188 wrmsrl(MSR_LSTAR, entry_SYSCALL_64); 1188 wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
1189 1189
1190#ifdef CONFIG_IA32_EMULATION 1190#ifdef CONFIG_IA32_EMULATION
1191 wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); 1191 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
1192 /* 1192 /*
1193 * This only works on Intel CPUs. 1193 * This only works on Intel CPUs.
1194 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. 1194 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1199,7 +1199,7 @@ void syscall_init(void)
1199 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); 1199 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1200 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); 1200 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
1201#else 1201#else
1202 wrmsrl(MSR_CSTAR, ignore_sysret); 1202 wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
1203 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); 1203 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
1204 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); 1204 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
1205 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); 1205 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 0f8f21c8284a..9d014b82a124 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -127,7 +127,7 @@ void mce_setup(struct mce *m)
127{ 127{
128 memset(m, 0, sizeof(struct mce)); 128 memset(m, 0, sizeof(struct mce));
129 m->cpu = m->extcpu = smp_processor_id(); 129 m->cpu = m->extcpu = smp_processor_id();
130 rdtscll(m->tsc); 130 m->tsc = rdtsc();
131 /* We hope get_seconds stays lockless */ 131 /* We hope get_seconds stays lockless */
132 m->time = get_seconds(); 132 m->time = get_seconds();
133 m->cpuvendor = boot_cpu_data.x86_vendor; 133 m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -974,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
974{ 974{
975 struct mca_config *cfg = &mca_cfg; 975 struct mca_config *cfg = &mca_cfg;
976 struct mce m, *final; 976 struct mce m, *final;
977 enum ctx_state prev_state;
978 int i; 977 int i;
979 int worst = 0; 978 int worst = 0;
980 int severity; 979 int severity;
@@ -1000,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1000 int flags = MF_ACTION_REQUIRED; 999 int flags = MF_ACTION_REQUIRED;
1001 int lmce = 0; 1000 int lmce = 0;
1002 1001
1003 prev_state = ist_enter(regs); 1002 ist_enter(regs);
1004 1003
1005 this_cpu_inc(mce_exception_count); 1004 this_cpu_inc(mce_exception_count);
1006 1005
@@ -1166,7 +1165,7 @@ out:
1166 local_irq_disable(); 1165 local_irq_disable();
1167 ist_end_non_atomic(); 1166 ist_end_non_atomic();
1168done: 1167done:
1169 ist_exit(regs, prev_state); 1168 ist_exit(regs);
1170} 1169}
1171EXPORT_SYMBOL_GPL(do_machine_check); 1170EXPORT_SYMBOL_GPL(do_machine_check);
1172 1171
@@ -1754,7 +1753,7 @@ static void collect_tscs(void *data)
1754{ 1753{
1755 unsigned long *cpu_tsc = (unsigned long *)data; 1754 unsigned long *cpu_tsc = (unsigned long *)data;
1756 1755
1757 rdtscll(cpu_tsc[smp_processor_id()]); 1756 cpu_tsc[smp_processor_id()] = rdtsc();
1758} 1757}
1759 1758
1760static int mce_apei_read_done; 1759static int mce_apei_read_done;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 737b0ad4e61a..12402e10aeff 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
19/* Machine check handler for Pentium class Intel CPUs: */ 19/* Machine check handler for Pentium class Intel CPUs: */
20static void pentium_machine_check(struct pt_regs *regs, long error_code) 20static void pentium_machine_check(struct pt_regs *regs, long error_code)
21{ 21{
22 enum ctx_state prev_state;
23 u32 loaddr, hi, lotype; 22 u32 loaddr, hi, lotype;
24 23
25 prev_state = ist_enter(regs); 24 ist_enter(regs);
26 25
27 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 26 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
28 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 27 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
39 38
40 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 39 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
41 40
42 ist_exit(regs, prev_state); 41 ist_exit(regs);
43} 42}
44 43
45/* Set up machine check reporting for processors with Intel style MCE: */ 44/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 44f138296fbe..01dd8702880b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
15/* Machine check handler for WinChip C6: */ 15/* Machine check handler for WinChip C6: */
16static void winchip_machine_check(struct pt_regs *regs, long error_code) 16static void winchip_machine_check(struct pt_regs *regs, long error_code)
17{ 17{
18 enum ctx_state prev_state = ist_enter(regs); 18 ist_enter(regs);
19 19
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
21 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 21 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
22 22
23 ist_exit(regs, prev_state); 23 ist_exit(regs);
24} 24}
25 25
26/* Set up machine check reporting on the Winchip C6 series */ 26/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f56cf074d01a..66dd3fe99b82 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
2179 int idx = segment >> 3; 2179 int idx = segment >> 3;
2180 2180
2181 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { 2181 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2182#ifdef CONFIG_MODIFY_LDT_SYSCALL
2182 struct ldt_struct *ldt; 2183 struct ldt_struct *ldt;
2183 2184
2184 if (idx > LDT_ENTRIES) 2185 if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
2190 return 0; 2191 return 0;
2191 2192
2192 desc = &ldt->entries[idx]; 2193 desc = &ldt->entries[idx];
2194#else
2195 return 0;
2196#endif
2193 } else { 2197 } else {
2194 if (idx > GDT_ENTRIES) 2198 if (idx > GDT_ENTRIES)
2195 return 0; 2199 return 0;
@@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment)
2200 return get_desc_base(desc); 2204 return get_desc_base(desc);
2201} 2205}
2202 2206
2203#ifdef CONFIG_COMPAT 2207#ifdef CONFIG_IA32_EMULATION
2204 2208
2205#include <asm/compat.h> 2209#include <asm/compat.h>
2206 2210
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index ce95676abd60..4d38416e2a7f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
110 */ 110 */
111 if (!arch_get_random_long(&rand)) { 111 if (!arch_get_random_long(&rand)) {
112 /* The constant is an arbitrary large prime */ 112 /* The constant is an arbitrary large prime */
113 rdtscll(rand); 113 rand = rdtsc();
114 rand *= 0xc345c6b72fd16123UL; 114 rand *= 0xc345c6b72fd16123UL;
115 } 115 }
116 116
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0a3fcf..f75c5908c7a6 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
735 735
736 /* Verify whether hpet counter works */ 736 /* Verify whether hpet counter works */
737 t1 = hpet_readl(HPET_COUNTER); 737 t1 = hpet_readl(HPET_COUNTER);
738 rdtscll(start); 738 start = rdtsc();
739 739
740 /* 740 /*
741 * We don't know the TSC frequency yet, but waiting for 741 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
745 */ 745 */
746 do { 746 do {
747 rep_nop(); 747 rep_nop();
748 rdtscll(now); 748 now = rdtsc();
749 } while ((now - start) < 200000UL); 749 } while ((now - start) < 200000UL);
750 750
751 if (t1 == hpet_readl(HPET_COUNTER)) { 751 if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c7dfe1be784e..4616672a4049 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -216,8 +216,23 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
216 unsigned vector = ~regs->orig_ax; 216 unsigned vector = ~regs->orig_ax;
217 unsigned irq; 217 unsigned irq;
218 218
219 /*
220 * NB: Unlike exception entries, IRQ entries do not reliably
221 * handle context tracking in the low-level entry code. This is
222 * because syscall entries execute briefly with IRQs on before
223 * updating context tracking state, so we can take an IRQ from
224 * kernel mode with CONTEXT_USER. The low-level entry code only
225 * updates the context if we came from user mode, so we won't
226 * switch to CONTEXT_KERNEL. We'll fix that once the syscall
227 * code is cleaned up enough that we can cleanly defer enabling
228 * IRQs.
229 */
230
219 entering_irq(); 231 entering_irq();
220 232
233 /* entering_irq() tells RCU that we're not quiescent. Check it. */
234 RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
235
221 irq = __this_cpu_read(vector_irq[vector]); 236 irq = __this_cpu_read(vector_irq[vector]);
222 237
223 if (!handle_irq(irq, regs)) { 238 if (!handle_irq(irq, regs)) {
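
The new comment and RCU_LOCKDEP_WARN() above turn a missed context-tracking update into a visible lockdep warning instead of a silent inconsistency. The check is a warn-once style assertion; a stripped-down userspace sketch of that pattern (all names here are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Warn-once assertion in the spirit of RCU_LOCKDEP_WARN() (invented name). */
#define WARN_ONCE_IF(cond, msg)						\
	do {								\
		static bool warned;					\
		if ((cond) && !warned) {				\
			warned = true;					\
			fprintf(stderr, "WARNING: %s (%s:%d)\n",	\
				(msg), __FILE__, __LINE__);		\
		}							\
	} while (0)

static bool rcu_is_watching_stub(void)
{
	return true;	/* pretend the entry code did its job */
}

int main(void)
{
	WARN_ONCE_IF(!rcu_is_watching_stub(), "IRQ failed to wake up RCU");
	return 0;
}
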
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d05bd2e2ee91..697f90db0e37 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
110 a->handler, whole_msecs, decimal_msecs); 110 a->handler, whole_msecs, decimal_msecs);
111} 111}
112 112
113static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 113static int nmi_handle(unsigned int type, struct pt_regs *regs)
114{ 114{
115 struct nmi_desc *desc = nmi_to_desc(type); 115 struct nmi_desc *desc = nmi_to_desc(type);
116 struct nmiaction *a; 116 struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
213pci_serr_error(unsigned char reason, struct pt_regs *regs) 213pci_serr_error(unsigned char reason, struct pt_regs *regs)
214{ 214{
215 /* check to see if anyone registered against these types of errors */ 215 /* check to see if anyone registered against these types of errors */
216 if (nmi_handle(NMI_SERR, regs, false)) 216 if (nmi_handle(NMI_SERR, regs))
217 return; 217 return;
218 218
219 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", 219 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
247 unsigned long i; 247 unsigned long i;
248 248
249 /* check to see if anyone registered against these types of errors */ 249 /* check to see if anyone registered against these types of errors */
250 if (nmi_handle(NMI_IO_CHECK, regs, false)) 250 if (nmi_handle(NMI_IO_CHECK, regs))
251 return; 251 return;
252 252
253 pr_emerg( 253 pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
284 * as only the first one is ever run (unless it can actually determine 284 * as only the first one is ever run (unless it can actually determine
285 * if it caused the NMI) 285 * if it caused the NMI)
286 */ 286 */
287 handled = nmi_handle(NMI_UNKNOWN, regs, false); 287 handled = nmi_handle(NMI_UNKNOWN, regs);
288 if (handled) { 288 if (handled) {
289 __this_cpu_add(nmi_stats.unknown, handled); 289 __this_cpu_add(nmi_stats.unknown, handled);
290 return; 290 return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
332 332
333 __this_cpu_write(last_nmi_rip, regs->ip); 333 __this_cpu_write(last_nmi_rip, regs->ip);
334 334
335 handled = nmi_handle(NMI_LOCAL, regs, b2b); 335 handled = nmi_handle(NMI_LOCAL, regs);
336 __this_cpu_add(nmi_stats.normal, handled); 336 __this_cpu_add(nmi_stats.normal, handled);
337 if (handled) { 337 if (handled) {
338 /* 338 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 58bcfb67c01f..f68e48f5f6c2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
351 .wbinvd = native_wbinvd, 351 .wbinvd = native_wbinvd,
352 .read_msr = native_read_msr_safe, 352 .read_msr = native_read_msr_safe,
353 .write_msr = native_write_msr_safe, 353 .write_msr = native_write_msr_safe,
354 .read_tsc = native_read_tsc,
355 .read_pmc = native_read_pmc, 354 .read_pmc = native_read_pmc,
356 .read_tscp = native_read_tscp,
357 .load_tr_desc = native_load_tr_desc, 355 .load_tr_desc = native_load_tr_desc,
358 .set_ldt = native_set_ldt, 356 .set_ldt = native_set_ldt,
359 .load_gdt = native_load_gdt, 357 .load_gdt = native_load_gdt,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index e1b013696dde..c89f50a76e97 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); 10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); 11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts"); 12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14 13
15#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) 14#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
16DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); 15DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
52 PATCH_SITE(pv_mmu_ops, read_cr3); 51 PATCH_SITE(pv_mmu_ops, read_cr3);
53 PATCH_SITE(pv_mmu_ops, write_cr3); 52 PATCH_SITE(pv_mmu_ops, write_cr3);
54 PATCH_SITE(pv_cpu_ops, clts); 53 PATCH_SITE(pv_cpu_ops, clts);
55 PATCH_SITE(pv_cpu_ops, read_tsc);
56#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) 54#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
57 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): 55 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
58 if (pv_is_native_spin_unlock()) { 56 if (pv_is_native_spin_unlock()) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d83740ab85b0..6d0e62ae8516 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,6 +30,7 @@
30#include <asm/nmi.h> 30#include <asm/nmi.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/mce.h> 32#include <asm/mce.h>
33#include <asm/vm86.h>
33 34
34/* 35/*
35 * per-CPU TSS segments. Threads are completely 'soft' on Linux, 36 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -111,6 +112,8 @@ void exit_thread(void)
111 kfree(bp); 112 kfree(bp);
112 } 113 }
113 114
115 free_vm86(t);
116
114 fpu__drop(fpu); 117 fpu__drop(fpu);
115} 118}
116 119
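
exit_thread() above now also releases the vm86 state, which the vm86 patches later in this diff allocate lazily on first use. The idiom is the usual free-the-optional-member-and-clear-the-pointer; a minimal sketch under assumed names:

#include <stdlib.h>

struct vm86_state { int dummy; };	/* placeholder for the real struct vm86 */

struct thread {
	struct vm86_state *vm86;	/* NULL until vm86 mode is first entered */
	/* ... other per-thread state ... */
};

/* Analogous to free_vm86(): safe whether or not the state was ever allocated. */
static void free_vm86_state(struct thread *t)
{
	free(t->vm86);			/* free(NULL) is a no-op */
	t->vm86 = NULL;
}

int main(void)
{
	struct thread t = { .vm86 = malloc(sizeof(struct vm86_state)) };

	free_vm86_state(&t);		/* exit_thread() does this unconditionally */
	free_vm86_state(&t);		/* a second call is harmless */
	return 0;
}
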
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f73c962fe636..c13df2c735f8 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
53#include <asm/syscalls.h> 53#include <asm/syscalls.h>
54#include <asm/debugreg.h> 54#include <asm/debugreg.h>
55#include <asm/switch_to.h> 55#include <asm/switch_to.h>
56#include <asm/vm86.h>
56 57
57asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 58asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
58asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); 59asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f6b916387590..3c1bbcf12924 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
121void release_thread(struct task_struct *dead_task) 121void release_thread(struct task_struct *dead_task)
122{ 122{
123 if (dead_task->mm) { 123 if (dead_task->mm) {
124#ifdef CONFIG_MODIFY_LDT_SYSCALL
124 if (dead_task->mm->context.ldt) { 125 if (dead_task->mm->context.ldt) {
125 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 126 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
126 dead_task->comm, 127 dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
128 dead_task->mm->context.ldt->size); 129 dead_task->mm->context.ldt->size);
129 BUG(); 130 BUG();
130 } 131 }
132#endif
131 } 133 }
132} 134}
133 135
@@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
248 __USER_CS, __USER_DS, 0); 250 __USER_CS, __USER_DS, 0);
249} 251}
250 252
251#ifdef CONFIG_IA32_EMULATION 253#ifdef CONFIG_COMPAT
252void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) 254void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
253{ 255{
254 start_thread_common(regs, new_ip, new_sp, 256 start_thread_common(regs, new_ip, new_sp,
255 test_thread_flag(TIF_X32) 257 test_thread_flag(TIF_X32)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9be72bc3613f..558f50edebca 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -37,12 +37,10 @@
37#include <asm/proto.h> 37#include <asm/proto.h>
38#include <asm/hw_breakpoint.h> 38#include <asm/hw_breakpoint.h>
39#include <asm/traps.h> 39#include <asm/traps.h>
40#include <asm/syscall.h>
40 41
41#include "tls.h" 42#include "tls.h"
42 43
43#define CREATE_TRACE_POINTS
44#include <trace/events/syscalls.h>
45
46enum x86_regset { 44enum x86_regset {
47 REGSET_GENERAL, 45 REGSET_GENERAL,
48 REGSET_FP, 46 REGSET_FP,
@@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target,
1123 return ret; 1121 return ret;
1124} 1122}
1125 1123
1124static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
1125 compat_ulong_t caddr, compat_ulong_t cdata)
1126{
1127 unsigned long addr = caddr;
1128 unsigned long data = cdata;
1129 void __user *datap = compat_ptr(data);
1130 int ret;
1131 __u32 val;
1132
1133 switch (request) {
1134 case PTRACE_PEEKUSR:
1135 ret = getreg32(child, addr, &val);
1136 if (ret == 0)
1137 ret = put_user(val, (__u32 __user *)datap);
1138 break;
1139
1140 case PTRACE_POKEUSR:
1141 ret = putreg32(child, addr, data);
1142 break;
1143
1144 case PTRACE_GETREGS: /* Get all gp regs from the child. */
1145 return copy_regset_to_user(child, &user_x86_32_view,
1146 REGSET_GENERAL,
1147 0, sizeof(struct user_regs_struct32),
1148 datap);
1149
1150 case PTRACE_SETREGS: /* Set all gp regs in the child. */
1151 return copy_regset_from_user(child, &user_x86_32_view,
1152 REGSET_GENERAL, 0,
1153 sizeof(struct user_regs_struct32),
1154 datap);
1155
1156 case PTRACE_GETFPREGS: /* Get the child FPU state. */
1157 return copy_regset_to_user(child, &user_x86_32_view,
1158 REGSET_FP, 0,
1159 sizeof(struct user_i387_ia32_struct),
1160 datap);
1161
1162 case PTRACE_SETFPREGS: /* Set the child FPU state. */
1163 return copy_regset_from_user(
1164 child, &user_x86_32_view, REGSET_FP,
1165 0, sizeof(struct user_i387_ia32_struct), datap);
1166
1167 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
1168 return copy_regset_to_user(child, &user_x86_32_view,
1169 REGSET_XFP, 0,
1170 sizeof(struct user32_fxsr_struct),
1171 datap);
1172
1173 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
1174 return copy_regset_from_user(child, &user_x86_32_view,
1175 REGSET_XFP, 0,
1176 sizeof(struct user32_fxsr_struct),
1177 datap);
1178
1179 case PTRACE_GET_THREAD_AREA:
1180 case PTRACE_SET_THREAD_AREA:
1181 return arch_ptrace(child, request, addr, data);
1182
1183 default:
1184 return compat_ptrace_request(child, request, addr, data);
1185 }
1186
1187 return ret;
1188}
1189#endif /* CONFIG_IA32_EMULATION */
1190
1126#ifdef CONFIG_X86_X32_ABI 1191#ifdef CONFIG_X86_X32_ABI
1127static long x32_arch_ptrace(struct task_struct *child, 1192static long x32_arch_ptrace(struct task_struct *child,
1128 compat_long_t request, compat_ulong_t caddr, 1193 compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child,
1211} 1276}
1212#endif 1277#endif
1213 1278
1279#ifdef CONFIG_COMPAT
1214long compat_arch_ptrace(struct task_struct *child, compat_long_t request, 1280long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1215 compat_ulong_t caddr, compat_ulong_t cdata) 1281 compat_ulong_t caddr, compat_ulong_t cdata)
1216{ 1282{
1217 unsigned long addr = caddr;
1218 unsigned long data = cdata;
1219 void __user *datap = compat_ptr(data);
1220 int ret;
1221 __u32 val;
1222
1223#ifdef CONFIG_X86_X32_ABI 1283#ifdef CONFIG_X86_X32_ABI
1224 if (!is_ia32_task()) 1284 if (!is_ia32_task())
1225 return x32_arch_ptrace(child, request, caddr, cdata); 1285 return x32_arch_ptrace(child, request, caddr, cdata);
1226#endif 1286#endif
1227 1287#ifdef CONFIG_IA32_EMULATION
1228 switch (request) { 1288 return ia32_arch_ptrace(child, request, caddr, cdata);
1229 case PTRACE_PEEKUSR: 1289#else
1230 ret = getreg32(child, addr, &val); 1290 return 0;
1231 if (ret == 0) 1291#endif
1232 ret = put_user(val, (__u32 __user *)datap);
1233 break;
1234
1235 case PTRACE_POKEUSR:
1236 ret = putreg32(child, addr, data);
1237 break;
1238
1239 case PTRACE_GETREGS: /* Get all gp regs from the child. */
1240 return copy_regset_to_user(child, &user_x86_32_view,
1241 REGSET_GENERAL,
1242 0, sizeof(struct user_regs_struct32),
1243 datap);
1244
1245 case PTRACE_SETREGS: /* Set all gp regs in the child. */
1246 return copy_regset_from_user(child, &user_x86_32_view,
1247 REGSET_GENERAL, 0,
1248 sizeof(struct user_regs_struct32),
1249 datap);
1250
1251 case PTRACE_GETFPREGS: /* Get the child FPU state. */
1252 return copy_regset_to_user(child, &user_x86_32_view,
1253 REGSET_FP, 0,
1254 sizeof(struct user_i387_ia32_struct),
1255 datap);
1256
1257 case PTRACE_SETFPREGS: /* Set the child FPU state. */
1258 return copy_regset_from_user(
1259 child, &user_x86_32_view, REGSET_FP,
1260 0, sizeof(struct user_i387_ia32_struct), datap);
1261
1262 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
1263 return copy_regset_to_user(child, &user_x86_32_view,
1264 REGSET_XFP, 0,
1265 sizeof(struct user32_fxsr_struct),
1266 datap);
1267
1268 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
1269 return copy_regset_from_user(child, &user_x86_32_view,
1270 REGSET_XFP, 0,
1271 sizeof(struct user32_fxsr_struct),
1272 datap);
1273
1274 case PTRACE_GET_THREAD_AREA:
1275 case PTRACE_SET_THREAD_AREA:
1276 return arch_ptrace(child, request, addr, data);
1277
1278 default:
1279 return compat_ptrace_request(child, request, addr, data);
1280 }
1281
1282 return ret;
1283} 1292}
1284 1293#endif /* CONFIG_COMPAT */
1285#endif /* CONFIG_IA32_EMULATION */
1286 1294
1287#ifdef CONFIG_X86_64 1295#ifdef CONFIG_X86_64
1288 1296
@@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1434 /* Send us the fake SIGTRAP */ 1442 /* Send us the fake SIGTRAP */
1435 force_sig_info(SIGTRAP, &info, tsk); 1443 force_sig_info(SIGTRAP, &info, tsk);
1436} 1444}
1437
1438static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
1439{
1440#ifdef CONFIG_X86_64
1441 if (arch == AUDIT_ARCH_X86_64) {
1442 audit_syscall_entry(regs->orig_ax, regs->di,
1443 regs->si, regs->dx, regs->r10);
1444 } else
1445#endif
1446 {
1447 audit_syscall_entry(regs->orig_ax, regs->bx,
1448 regs->cx, regs->dx, regs->si);
1449 }
1450}
1451
1452/*
1453 * We can return 0 to resume the syscall or anything else to go to phase
1454 * 2. If we resume the syscall, we need to put something appropriate in
1455 * regs->orig_ax.
1456 *
1457 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
1458 * are fully functional.
1459 *
1460 * For phase 2's benefit, our return value is:
1461 * 0: resume the syscall
1462 * 1: go to phase 2; no seccomp phase 2 needed
1463 * anything else: go to phase 2; pass return value to seccomp
1464 */
1465unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
1466{
1467 unsigned long ret = 0;
1468 u32 work;
1469
1470 BUG_ON(regs != task_pt_regs(current));
1471
1472 work = ACCESS_ONCE(current_thread_info()->flags) &
1473 _TIF_WORK_SYSCALL_ENTRY;
1474
1475 /*
1476 * If TIF_NOHZ is set, we are required to call user_exit() before
1477 * doing anything that could touch RCU.
1478 */
1479 if (work & _TIF_NOHZ) {
1480 user_exit();
1481 work &= ~_TIF_NOHZ;
1482 }
1483
1484#ifdef CONFIG_SECCOMP
1485 /*
1486 * Do seccomp first -- it should minimize exposure of other
1487 * code, and keeping seccomp fast is probably more valuable
1488 * than the rest of this.
1489 */
1490 if (work & _TIF_SECCOMP) {
1491 struct seccomp_data sd;
1492
1493 sd.arch = arch;
1494 sd.nr = regs->orig_ax;
1495 sd.instruction_pointer = regs->ip;
1496#ifdef CONFIG_X86_64
1497 if (arch == AUDIT_ARCH_X86_64) {
1498 sd.args[0] = regs->di;
1499 sd.args[1] = regs->si;
1500 sd.args[2] = regs->dx;
1501 sd.args[3] = regs->r10;
1502 sd.args[4] = regs->r8;
1503 sd.args[5] = regs->r9;
1504 } else
1505#endif
1506 {
1507 sd.args[0] = regs->bx;
1508 sd.args[1] = regs->cx;
1509 sd.args[2] = regs->dx;
1510 sd.args[3] = regs->si;
1511 sd.args[4] = regs->di;
1512 sd.args[5] = regs->bp;
1513 }
1514
1515 BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
1516 BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
1517
1518 ret = seccomp_phase1(&sd);
1519 if (ret == SECCOMP_PHASE1_SKIP) {
1520 regs->orig_ax = -1;
1521 ret = 0;
1522 } else if (ret != SECCOMP_PHASE1_OK) {
1523 return ret; /* Go directly to phase 2 */
1524 }
1525
1526 work &= ~_TIF_SECCOMP;
1527 }
1528#endif
1529
1530 /* Do our best to finish without phase 2. */
1531 if (work == 0)
1532 return ret; /* seccomp and/or nohz only (ret == 0 here) */
1533
1534#ifdef CONFIG_AUDITSYSCALL
1535 if (work == _TIF_SYSCALL_AUDIT) {
1536 /*
1537 * If there is no more work to be done except auditing,
1538 * then audit in phase 1. Phase 2 always audits, so, if
1539 * we audit here, then we can't go on to phase 2.
1540 */
1541 do_audit_syscall_entry(regs, arch);
1542 return 0;
1543 }
1544#endif
1545
1546 return 1; /* Something is enabled that we can't handle in phase 1 */
1547}
1548
1549/* Returns the syscall nr to run (which should match regs->orig_ax). */
1550long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
1551 unsigned long phase1_result)
1552{
1553 long ret = 0;
1554 u32 work = ACCESS_ONCE(current_thread_info()->flags) &
1555 _TIF_WORK_SYSCALL_ENTRY;
1556
1557 BUG_ON(regs != task_pt_regs(current));
1558
1559 /*
1560 * If we stepped into a sysenter/syscall insn, it trapped in
1561 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1562 * If user-mode had set TF itself, then it's still clear from
1563 * do_debug() and we need to set it again to restore the user
1564 * state. If we entered on the slow path, TF was already set.
1565 */
1566 if (work & _TIF_SINGLESTEP)
1567 regs->flags |= X86_EFLAGS_TF;
1568
1569#ifdef CONFIG_SECCOMP
1570 /*
1571 * Call seccomp_phase2 before running the other hooks so that
1572 * they can see any changes made by a seccomp tracer.
1573 */
1574 if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
1575 /* seccomp failures shouldn't expose any additional code. */
1576 return -1;
1577 }
1578#endif
1579
1580 if (unlikely(work & _TIF_SYSCALL_EMU))
1581 ret = -1L;
1582
1583 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1584 tracehook_report_syscall_entry(regs))
1585 ret = -1L;
1586
1587 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1588 trace_sys_enter(regs, regs->orig_ax);
1589
1590 do_audit_syscall_entry(regs, arch);
1591
1592 return ret ?: regs->orig_ax;
1593}
1594
1595long syscall_trace_enter(struct pt_regs *regs)
1596{
1597 u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
1598 unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
1599
1600 if (phase1_result == 0)
1601 return regs->orig_ax;
1602 else
1603 return syscall_trace_enter_phase2(regs, arch, phase1_result);
1604}
1605
1606void syscall_trace_leave(struct pt_regs *regs)
1607{
1608 bool step;
1609
1610 /*
1611 * We may come here right after calling schedule_user()
1612 * or do_notify_resume(), in which case we can be in RCU
1613 * user mode.
1614 */
1615 user_exit();
1616
1617 audit_syscall_exit(regs);
1618
1619 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1620 trace_sys_exit(regs, regs->ax);
1621
1622 /*
1623 * If TIF_SYSCALL_EMU is set, we only get here because of
1624 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1625 * We already reported this syscall instruction in
1626 * syscall_trace_enter().
1627 */
1628 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1629 !test_thread_flag(TIF_SYSCALL_EMU);
1630 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1631 tracehook_report_syscall_exit(regs, step);
1632
1633 user_enter();
1634}
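
The two-phase syscall-entry work deleted above (phase 1 handles the cheap seccomp-only and audit-only cases, phase 2 does the full tracing) is not gone; it moves into the new C entry code in arch/x86/entry/common.c. The dispatch contract is easiest to see in isolation; a simplified sketch with stand-in functions (the seccomp result pass-through is omitted):

#include <stdio.h>

#define PHASE1_RESUME	0UL	/* run the syscall without further tracing */
#define PHASE1_SLOWPATH	1UL	/* something is enabled that needs phase 2 */

static unsigned long trace_enter_phase1(long nr)
{
	/* cheap checks only; anything complicated defers to phase 2 */
	return (nr & 1) ? PHASE1_SLOWPATH : PHASE1_RESUME;
}

static long trace_enter_phase2(long nr, unsigned long phase1_result)
{
	/* full tracing/audit work; may rewrite or veto the syscall number */
	printf("phase2 for syscall %ld (phase1=%lu)\n", nr, phase1_result);
	return nr;
}

static long trace_enter(long nr)
{
	unsigned long r = trace_enter_phase1(nr);

	return (r == PHASE1_RESUME) ? nr : trace_enter_phase2(nr, r);
}

int main(void)
{
	printf("-> %ld\n", trace_enter(42));	/* fast path: phase 1 only */
	printf("-> %ld\n", trace_enter(43));	/* slow path: falls to phase 2 */
	return 0;
}
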
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 71820c42b6ce..da52e6bb5c7f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -31,11 +31,11 @@
31#include <asm/vdso.h> 31#include <asm/vdso.h>
32#include <asm/mce.h> 32#include <asm/mce.h>
33#include <asm/sighandling.h> 33#include <asm/sighandling.h>
34#include <asm/vm86.h>
34 35
35#ifdef CONFIG_X86_64 36#ifdef CONFIG_X86_64
36#include <asm/proto.h> 37#include <asm/proto.h>
37#include <asm/ia32_unistd.h> 38#include <asm/ia32_unistd.h>
38#include <asm/sys_ia32.h>
39#endif /* CONFIG_X86_64 */ 39#endif /* CONFIG_X86_64 */
40 40
41#include <asm/syscall.h> 41#include <asm/syscall.h>
@@ -632,6 +632,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
632 bool stepping, failed; 632 bool stepping, failed;
633 struct fpu *fpu = &current->thread.fpu; 633 struct fpu *fpu = &current->thread.fpu;
634 634
635 if (v8086_mode(regs))
636 save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
637
635 /* Are we from a system call? */ 638 /* Are we from a system call? */
636 if (syscall_get_nr(current, regs) >= 0) { 639 if (syscall_get_nr(current, regs) >= 0) {
637 /* If so, check system call restarting.. */ 640 /* If so, check system call restarting.. */
@@ -697,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
697 * want to handle. Thus you cannot kill init even with a SIGKILL even by 700 * want to handle. Thus you cannot kill init even with a SIGKILL even by
698 * mistake. 701 * mistake.
699 */ 702 */
700static void do_signal(struct pt_regs *regs) 703void do_signal(struct pt_regs *regs)
701{ 704{
702 struct ksignal ksig; 705 struct ksignal ksig;
703 706
@@ -732,32 +735,6 @@ static void do_signal(struct pt_regs *regs)
732 restore_saved_sigmask(); 735 restore_saved_sigmask();
733} 736}
734 737
735/*
736 * notification of userspace execution resumption
737 * - triggered by the TIF_WORK_MASK flags
738 */
739__visible void
740do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
741{
742 user_exit();
743
744 if (thread_info_flags & _TIF_UPROBE)
745 uprobe_notify_resume(regs);
746
747 /* deal with pending signal delivery */
748 if (thread_info_flags & _TIF_SIGPENDING)
749 do_signal(regs);
750
751 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
752 clear_thread_flag(TIF_NOTIFY_RESUME);
753 tracehook_notify_resume(regs);
754 }
755 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
756 fire_user_return_notifiers();
757
758 user_enter();
759}
760
761void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 738void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
762{ 739{
763 struct task_struct *me = current; 740 struct task_struct *me = current;
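
do_notify_resume() above disappears because the exit-to-user work loop also moves into the C exit path, while handle_signal() gains an explicit save_v86_state() call when returning from vm86 mode. The flag-by-flag dispatch the removed function performed looks roughly like this sketch (flag values and handler names are invented):

#include <stdio.h>

#define TIF_UPROBE		(1u << 0)
#define TIF_SIGPENDING		(1u << 1)
#define TIF_NOTIFY_RESUME	(1u << 2)

static void handle_uprobe(void)   { puts("uprobe_notify_resume"); }
static void deliver_signals(void) { puts("do_signal"); }
static void notify_resume(void)   { puts("tracehook_notify_resume"); }

/* One pass of exit-to-user work for a given flag word. */
static void do_exit_work(unsigned int flags)
{
	if (flags & TIF_UPROBE)
		handle_uprobe();
	if (flags & TIF_SIGPENDING)
		deliver_signals();
	if (flags & TIF_NOTIFY_RESUME)
		notify_resume();
}

int main(void)
{
	do_exit_work(TIF_SIGPENDING | TIF_NOTIFY_RESUME);
	return 0;
}
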
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 000000000000..dc3c0b1c816f
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,95 @@
1#include <linux/compat.h>
2#include <linux/uaccess.h>
3
4int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
5{
6 int err = 0;
7 bool ia32 = test_thread_flag(TIF_IA32);
8
9 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
10 return -EFAULT;
11
12 put_user_try {
13 /* If you change siginfo_t structure, please make sure that
14 this code is fixed accordingly.
15 It should never copy any pad contained in the structure
16 to avoid security leaks, but must copy the generic
17 3 ints plus the relevant union member. */
18 put_user_ex(from->si_signo, &to->si_signo);
19 put_user_ex(from->si_errno, &to->si_errno);
20 put_user_ex((short)from->si_code, &to->si_code);
21
22 if (from->si_code < 0) {
23 put_user_ex(from->si_pid, &to->si_pid);
24 put_user_ex(from->si_uid, &to->si_uid);
25 put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
26 } else {
27 /*
28 * First 32bits of unions are always present:
29 * si_pid === si_band === si_tid === si_addr(LS half)
30 */
31 put_user_ex(from->_sifields._pad[0],
32 &to->_sifields._pad[0]);
33 switch (from->si_code >> 16) {
34 case __SI_FAULT >> 16:
35 break;
36 case __SI_SYS >> 16:
37 put_user_ex(from->si_syscall, &to->si_syscall);
38 put_user_ex(from->si_arch, &to->si_arch);
39 break;
40 case __SI_CHLD >> 16:
41 if (ia32) {
42 put_user_ex(from->si_utime, &to->si_utime);
43 put_user_ex(from->si_stime, &to->si_stime);
44 } else {
45 put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
46 put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
47 }
48 put_user_ex(from->si_status, &to->si_status);
49 /* FALL THROUGH */
50 default:
51 case __SI_KILL >> 16:
52 put_user_ex(from->si_uid, &to->si_uid);
53 break;
54 case __SI_POLL >> 16:
55 put_user_ex(from->si_fd, &to->si_fd);
56 break;
57 case __SI_TIMER >> 16:
58 put_user_ex(from->si_overrun, &to->si_overrun);
59 put_user_ex(ptr_to_compat(from->si_ptr),
60 &to->si_ptr);
61 break;
62 /* This is not generated by the kernel as of now. */
63 case __SI_RT >> 16:
64 case __SI_MESGQ >> 16:
65 put_user_ex(from->si_uid, &to->si_uid);
66 put_user_ex(from->si_int, &to->si_int);
67 break;
68 }
69 }
70 } put_user_catch(err);
71
72 return err;
73}
74
75int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
76{
77 int err = 0;
78 u32 ptr32;
79
80 if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
81 return -EFAULT;
82
83 get_user_try {
84 get_user_ex(to->si_signo, &from->si_signo);
85 get_user_ex(to->si_errno, &from->si_errno);
86 get_user_ex(to->si_code, &from->si_code);
87
88 get_user_ex(to->si_pid, &from->si_pid);
89 get_user_ex(to->si_uid, &from->si_uid);
90 get_user_ex(ptr32, &from->si_ptr);
91 to->si_ptr = compat_ptr(ptr32);
92 } get_user_catch(err);
93
94 return err;
95}
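
The new copy_siginfo_to_user32() keys the union copy on the top half of si_code (the __SI_* class), and __SI_CHLD deliberately falls through into the default/__SI_KILL case so si_uid is always written. A compact sketch of that dispatch shape with made-up class values:

#include <stdio.h>

enum { SI_CLASS_KILL = 0, SI_CLASS_POLL = 2, SI_CLASS_CHLD = 4 };	/* illustrative only */

static void copy_by_class(int si_code)
{
	switch (si_code >> 16) {
	case SI_CLASS_CHLD:
		puts("copy utime/stime/status");
		/* fall through: CHLD also carries the uid */
	default:
	case SI_CLASS_KILL:
		puts("copy uid");
		break;
	case SI_CLASS_POLL:
		puts("copy fd");
		break;
	}
}

int main(void)
{
	copy_by_class(SI_CLASS_CHLD << 16);	/* prints both lines */
	copy_by_class(SI_CLASS_POLL << 16);	/* prints only the fd line */
	return 0;
}
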
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 0ccb53a9fcd9..c9a073866ca7 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
18 return addr; 18 return addr;
19 } 19 }
20 20
21#ifdef CONFIG_MODIFY_LDT_SYSCALL
21 /* 22 /*
22 * We'll assume that the code segments in the GDT 23 * We'll assume that the code segments in the GDT
23 * are all zero-based. That is largely true: the 24 * are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
45 } 46 }
46 mutex_unlock(&child->mm->context.lock); 47 mutex_unlock(&child->mm->context.lock);
47 } 48 }
49#endif
48 50
49 return addr; 51 return addr;
50} 52}
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 25b993729f9b..80bb24d9b880 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -12,10 +12,5 @@
12 */ 12 */
13u64 notrace trace_clock_x86_tsc(void) 13u64 notrace trace_clock_x86_tsc(void)
14{ 14{
15 u64 ret; 15 return rdtsc_ordered();
16
17 rdtsc_barrier();
18 rdtscll(ret);
19
20 return ret;
21} 16}
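
trace_clock_x86_tsc() above collapses the rdtsc_barrier() + rdtscll() pair into a single rdtsc_ordered() call, which fences before reading the counter. A userspace approximation of the same ordering (this is not the kernel helper, and it assumes an LFENCE-serializing CPU):

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>		/* _mm_lfence(), __rdtsc() */

/* Fence, then read: earlier loads cannot be reordered past the counter read,
 * roughly the guarantee rdtsc_ordered() provides. */
static inline uint64_t rdtsc_ordered_user(void)
{
	_mm_lfence();
	return __rdtsc();
}

int main(void)
{
	uint64_t a = rdtsc_ordered_user();
	uint64_t b = rdtsc_ordered_user();

	printf("delta: %llu cycles\n", (unsigned long long)(b - a));
	return 0;
}
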
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c5a5231d1d11..346eec73f7db 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
62#include <asm/fpu/xstate.h> 62#include <asm/fpu/xstate.h>
63#include <asm/trace/mpx.h> 63#include <asm/trace/mpx.h>
64#include <asm/mpx.h> 64#include <asm/mpx.h>
65#include <asm/vm86.h>
65 66
66#ifdef CONFIG_X86_64 67#ifdef CONFIG_X86_64
67#include <asm/x86_init.h> 68#include <asm/x86_init.h>
@@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 preempt_count_dec(); 109 preempt_count_dec();
109} 110}
110 111
111enum ctx_state ist_enter(struct pt_regs *regs) 112void ist_enter(struct pt_regs *regs)
112{ 113{
113 enum ctx_state prev_state;
114
115 if (user_mode(regs)) { 114 if (user_mode(regs)) {
116 /* Other than that, we're just an exception. */ 115 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
117 prev_state = exception_enter();
118 } else { 116 } else {
119 /* 117 /*
120 * We might have interrupted pretty much anything. In 118 * We might have interrupted pretty much anything. In
@@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
123 * but we need to notify RCU. 121 * but we need to notify RCU.
124 */ 122 */
125 rcu_nmi_enter(); 123 rcu_nmi_enter();
126 prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
127 } 124 }
128 125
129 /* 126 /*
130 * We are atomic because we're on the IST stack (or we're on x86_32, 127 * We are atomic because we're on the IST stack; or we're on
131 * in which case we still shouldn't schedule). 128 * x86_32, in which case we still shouldn't schedule; or we're
132 * 129 * on x86_64 and entered from user mode, in which case we're
133 * This must be after exception_enter(), because exception_enter() 130 * still atomic unless ist_begin_non_atomic is called.
134 * won't do anything if in_interrupt() returns true.
135 */ 131 */
136 preempt_count_add(HARDIRQ_OFFSET); 132 preempt_count_add(HARDIRQ_OFFSET);
137 133
138 /* This code is a bit fragile. Test it. */ 134 /* This code is a bit fragile. Test it. */
139 RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); 135 RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
140
141 return prev_state;
142} 136}
143 137
144void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) 138void ist_exit(struct pt_regs *regs)
145{ 139{
146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET); 140 preempt_count_sub(HARDIRQ_OFFSET);
148 141
149 if (user_mode(regs)) 142 if (!user_mode(regs))
150 return exception_exit(prev_state);
151 else
152 rcu_nmi_exit(); 143 rcu_nmi_exit();
153} 144}
154 145
@@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic() 153 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region. 154 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside 155 * Callers are responsible for enabling interrupts themselves inside
165 * the non-atomic section, and callers must call is_end_non_atomic() 156 * the non-atomic section, and callers must call ist_end_non_atomic()
166 * before ist_exit(). 157 * before ist_exit().
167 */ 158 */
168void ist_begin_non_atomic(struct pt_regs *regs) 159void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap);
289static void do_error_trap(struct pt_regs *regs, long error_code, char *str, 280static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
290 unsigned long trapnr, int signr) 281 unsigned long trapnr, int signr)
291{ 282{
292 enum ctx_state prev_state = exception_enter();
293 siginfo_t info; 283 siginfo_t info;
294 284
285 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
286
295 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != 287 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
296 NOTIFY_STOP) { 288 NOTIFY_STOP) {
297 conditional_sti(regs); 289 conditional_sti(regs);
298 do_trap(trapnr, signr, str, regs, error_code, 290 do_trap(trapnr, signr, str, regs, error_code,
299 fill_trap_info(regs, signr, trapnr, &info)); 291 fill_trap_info(regs, signr, trapnr, &info));
300 } 292 }
301
302 exception_exit(prev_state);
303} 293}
304 294
305#define DO_ERROR(trapnr, signr, str, name) \ 295#define DO_ERROR(trapnr, signr, str, name) \
@@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
351 } 341 }
352#endif 342#endif
353 343
354 ist_enter(regs); /* Discard prev_state because we won't return. */ 344 ist_enter(regs);
355 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 345 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
356 346
357 tsk->thread.error_code = error_code; 347 tsk->thread.error_code = error_code;
@@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
371 361
372dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) 362dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
373{ 363{
374 enum ctx_state prev_state;
375 const struct bndcsr *bndcsr; 364 const struct bndcsr *bndcsr;
376 siginfo_t *info; 365 siginfo_t *info;
377 366
378 prev_state = exception_enter(); 367 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
379 if (notify_die(DIE_TRAP, "bounds", regs, error_code, 368 if (notify_die(DIE_TRAP, "bounds", regs, error_code,
380 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) 369 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
381 goto exit; 370 return;
382 conditional_sti(regs); 371 conditional_sti(regs);
383 372
384 if (!user_mode(regs)) 373 if (!user_mode(regs))
@@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
435 die("bounds", regs, error_code); 424 die("bounds", regs, error_code);
436 } 425 }
437 426
438exit:
439 exception_exit(prev_state);
440 return; 427 return;
428
441exit_trap: 429exit_trap:
442 /* 430 /*
443 * This path out is for all the cases where we could not 431 * This path out is for all the cases where we could not
@@ -447,35 +435,33 @@ exit_trap:
447 * time.. 435 * time..
448 */ 436 */
449 do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); 437 do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
450 exception_exit(prev_state);
451} 438}
452 439
453dotraplinkage void 440dotraplinkage void
454do_general_protection(struct pt_regs *regs, long error_code) 441do_general_protection(struct pt_regs *regs, long error_code)
455{ 442{
456 struct task_struct *tsk; 443 struct task_struct *tsk;
457 enum ctx_state prev_state;
458 444
459 prev_state = exception_enter(); 445 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
460 conditional_sti(regs); 446 conditional_sti(regs);
461 447
462 if (v8086_mode(regs)) { 448 if (v8086_mode(regs)) {
463 local_irq_enable(); 449 local_irq_enable();
464 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 450 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
465 goto exit; 451 return;
466 } 452 }
467 453
468 tsk = current; 454 tsk = current;
469 if (!user_mode(regs)) { 455 if (!user_mode(regs)) {
470 if (fixup_exception(regs)) 456 if (fixup_exception(regs))
471 goto exit; 457 return;
472 458
473 tsk->thread.error_code = error_code; 459 tsk->thread.error_code = error_code;
474 tsk->thread.trap_nr = X86_TRAP_GP; 460 tsk->thread.trap_nr = X86_TRAP_GP;
475 if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 461 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
476 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) 462 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
477 die("general protection fault", regs, error_code); 463 die("general protection fault", regs, error_code);
478 goto exit; 464 return;
479 } 465 }
480 466
481 tsk->thread.error_code = error_code; 467 tsk->thread.error_code = error_code;
@@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
491 } 477 }
492 478
493 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); 479 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
494exit:
495 exception_exit(prev_state);
496} 480}
497NOKPROBE_SYMBOL(do_general_protection); 481NOKPROBE_SYMBOL(do_general_protection);
498 482
499/* May run on IST stack. */ 483/* May run on IST stack. */
500dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) 484dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
501{ 485{
502 enum ctx_state prev_state;
503
504#ifdef CONFIG_DYNAMIC_FTRACE 486#ifdef CONFIG_DYNAMIC_FTRACE
505 /* 487 /*
506 * ftrace must be first, everything else may cause a recursive crash. 488 * ftrace must be first, everything else may cause a recursive crash.
@@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
513 if (poke_int3_handler(regs)) 495 if (poke_int3_handler(regs))
514 return; 496 return;
515 497
516 prev_state = ist_enter(regs); 498 ist_enter(regs);
499 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
517#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 500#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
518 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 501 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
519 SIGTRAP) == NOTIFY_STOP) 502 SIGTRAP) == NOTIFY_STOP)
@@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
539 preempt_conditional_cli(regs); 522 preempt_conditional_cli(regs);
540 debug_stack_usage_dec(); 523 debug_stack_usage_dec();
541exit: 524exit:
542 ist_exit(regs, prev_state); 525 ist_exit(regs);
543} 526}
544NOKPROBE_SYMBOL(do_int3); 527NOKPROBE_SYMBOL(do_int3);
545 528
@@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
615dotraplinkage void do_debug(struct pt_regs *regs, long error_code) 598dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
616{ 599{
617 struct task_struct *tsk = current; 600 struct task_struct *tsk = current;
618 enum ctx_state prev_state;
619 int user_icebp = 0; 601 int user_icebp = 0;
620 unsigned long dr6; 602 unsigned long dr6;
621 int si_code; 603 int si_code;
622 604
623 prev_state = ist_enter(regs); 605 ist_enter(regs);
624 606
625 get_debugreg(dr6, 6); 607 get_debugreg(dr6, 6);
626 608
@@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
695 debug_stack_usage_dec(); 677 debug_stack_usage_dec();
696 678
697exit: 679exit:
698 ist_exit(regs, prev_state); 680 ist_exit(regs);
699} 681}
700NOKPROBE_SYMBOL(do_debug); 682NOKPROBE_SYMBOL(do_debug);
701 683
@@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
747 729
748dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 730dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
749{ 731{
750 enum ctx_state prev_state; 732 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
751
752 prev_state = exception_enter();
753 math_error(regs, error_code, X86_TRAP_MF); 733 math_error(regs, error_code, X86_TRAP_MF);
754 exception_exit(prev_state);
755} 734}
756 735
757dotraplinkage void 736dotraplinkage void
758do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 737do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
759{ 738{
760 enum ctx_state prev_state; 739 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
761
762 prev_state = exception_enter();
763 math_error(regs, error_code, X86_TRAP_XF); 740 math_error(regs, error_code, X86_TRAP_XF);
764 exception_exit(prev_state);
765} 741}
766 742
767dotraplinkage void 743dotraplinkage void
@@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
773dotraplinkage void 749dotraplinkage void
774do_device_not_available(struct pt_regs *regs, long error_code) 750do_device_not_available(struct pt_regs *regs, long error_code)
775{ 751{
776 enum ctx_state prev_state; 752 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
777
778 prev_state = exception_enter();
779 BUG_ON(use_eager_fpu()); 753 BUG_ON(use_eager_fpu());
780 754
781#ifdef CONFIG_MATH_EMULATION 755#ifdef CONFIG_MATH_EMULATION
@@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
786 760
787 info.regs = regs; 761 info.regs = regs;
788 math_emulate(&info); 762 math_emulate(&info);
789 exception_exit(prev_state);
790 return; 763 return;
791 } 764 }
792#endif 765#endif
@@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
794#ifdef CONFIG_X86_32 767#ifdef CONFIG_X86_32
795 conditional_sti(regs); 768 conditional_sti(regs);
796#endif 769#endif
797 exception_exit(prev_state);
798} 770}
799NOKPROBE_SYMBOL(do_device_not_available); 771NOKPROBE_SYMBOL(do_device_not_available);
800 772
@@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available);
802dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 774dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
803{ 775{
804 siginfo_t info; 776 siginfo_t info;
805 enum ctx_state prev_state;
806 777
807 prev_state = exception_enter(); 778 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
808 local_irq_enable(); 779 local_irq_enable();
809 780
810 info.si_signo = SIGILL; 781 info.si_signo = SIGILL;
@@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
816 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 787 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
817 &info); 788 &info);
818 } 789 }
819 exception_exit(prev_state);
820} 790}
821#endif 791#endif
822 792
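
The traps.c changes above drop the enum ctx_state that every handler used to thread from ist_enter()/exception_enter() through to the matching exit: the C entry code now owns context tracking, so the handlers only assert that RCU is watching, and ist_exit() re-derives user_mode(regs) itself. The simplified pairing looks like this sketch (stubs stand in for rcu_nmi_enter/exit and the preempt count):

#include <assert.h>
#include <stdbool.h>

static int hardirq_depth;

static void rcu_nmi_enter_stub(void) {}
static void rcu_nmi_exit_stub(void)  {}

static void ist_enter_sketch(bool from_user)
{
	if (!from_user)
		rcu_nmi_enter_stub();	/* user entries already woke RCU in entry code */
	hardirq_depth++;		/* we are atomic while on the IST stack */
}

static void ist_exit_sketch(bool from_user)
{
	hardirq_depth--;
	if (!from_user)
		rcu_nmi_exit_stub();	/* mirror of the enter path, no saved state needed */
}

int main(void)
{
	ist_enter_sketch(false);
	ist_exit_sketch(false);
	assert(hardirq_depth == 0);
	return 0;
}
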
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 88e9a38c71a5..79055cf2c497 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
248 248
249 data = cyc2ns_write_begin(cpu); 249 data = cyc2ns_write_begin(cpu);
250 250
251 rdtscll(tsc_now); 251 tsc_now = rdtsc();
252 ns_now = cycles_2_ns(tsc_now); 252 ns_now = cycles_2_ns(tsc_now);
253 253
254 /* 254 /*
@@ -290,7 +290,7 @@ u64 native_sched_clock(void)
290 } 290 }
291 291
292 /* read the Time Stamp Counter: */ 292 /* read the Time Stamp Counter: */
293 rdtscll(tsc_now); 293 tsc_now = rdtsc();
294 294
295 /* return the value in ns */ 295 /* return the value in ns */
296 return cycles_2_ns(tsc_now); 296 return cycles_2_ns(tsc_now);
@@ -316,12 +316,6 @@ unsigned long long
316sched_clock(void) __attribute__((alias("native_sched_clock"))); 316sched_clock(void) __attribute__((alias("native_sched_clock")));
317#endif 317#endif
318 318
319unsigned long long native_read_tsc(void)
320{
321 return __native_read_tsc();
322}
323EXPORT_SYMBOL(native_read_tsc);
324
325int check_tsc_unstable(void) 319int check_tsc_unstable(void)
326{ 320{
327 return tsc_unstable; 321 return tsc_unstable;
@@ -984,7 +978,7 @@ static struct clocksource clocksource_tsc;
984 */ 978 */
985static cycle_t read_tsc(struct clocksource *cs) 979static cycle_t read_tsc(struct clocksource *cs)
986{ 980{
987 return (cycle_t)get_cycles(); 981 return (cycle_t)rdtsc_ordered();
988} 982}
989 983
990/* 984/*
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index dd8d0791dfb5..78083bf23ed1 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -39,16 +39,15 @@ static cycles_t max_warp;
39static int nr_warps; 39static int nr_warps;
40 40
41/* 41/*
42 * TSC-warp measurement loop running on both CPUs: 42 * TSC-warp measurement loop running on both CPUs. This is not called
43 * if there is no TSC.
43 */ 44 */
44static void check_tsc_warp(unsigned int timeout) 45static void check_tsc_warp(unsigned int timeout)
45{ 46{
46 cycles_t start, now, prev, end; 47 cycles_t start, now, prev, end;
47 int i; 48 int i;
48 49
49 rdtsc_barrier(); 50 start = rdtsc_ordered();
50 start = get_cycles();
51 rdtsc_barrier();
52 /* 51 /*
53 * The measurement runs for 'timeout' msecs: 52 * The measurement runs for 'timeout' msecs:
54 */ 53 */
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
63 */ 62 */
64 arch_spin_lock(&sync_lock); 63 arch_spin_lock(&sync_lock);
65 prev = last_tsc; 64 prev = last_tsc;
66 rdtsc_barrier(); 65 now = rdtsc_ordered();
67 now = get_cycles();
68 rdtsc_barrier();
69 last_tsc = now; 66 last_tsc = now;
70 arch_spin_unlock(&sync_lock); 67 arch_spin_unlock(&sync_lock);
71 68
@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
126 123
127 /* 124 /*
128 * No need to check if we already know that the TSC is not 125 * No need to check if we already know that the TSC is not
129 * synchronized: 126 * synchronized or if we have no TSC.
130 */ 127 */
131 if (unsynchronized_tsc()) 128 if (unsynchronized_tsc())
132 return; 129 return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
190{ 187{
191 int cpus = 2; 188 int cpus = 2;
192 189
190 /* Also aborts if there is no TSC. */
193 if (unsynchronized_tsc() || tsc_clocksource_reliable) 191 if (unsynchronized_tsc() || tsc_clocksource_reliable)
194 return; 192 return;
195 193
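
check_tsc_warp() above now takes ordered TSC reads directly and, under sync_lock, flags a warp whenever a later read on one CPU is smaller than the previous read on the other. A single-threaded sketch of just the comparison (the locking and cross-CPU handshake are omitted):

#include <stdint.h>
#include <stdio.h>

static uint64_t last_tsc;
static uint64_t max_warp;

/* Feed successive TSC samples; report when time appears to run backwards. */
static void note_sample(uint64_t now)
{
	if (last_tsc && now < last_tsc) {
		uint64_t warp = last_tsc - now;

		if (warp > max_warp)
			max_warp = warp;
		printf("TSC warp of %llu cycles\n", (unsigned long long)warp);
	}
	last_tsc = now;
}

int main(void)
{
	note_sample(1000);
	note_sample(1200);
	note_sample(1100);	/* a warp: this read is older than the previous one */
	return 0;
}
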
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..abd8b856bd2b 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -44,11 +44,14 @@
44#include <linux/ptrace.h> 44#include <linux/ptrace.h>
45#include <linux/audit.h> 45#include <linux/audit.h>
46#include <linux/stddef.h> 46#include <linux/stddef.h>
47#include <linux/slab.h>
47 48
48#include <asm/uaccess.h> 49#include <asm/uaccess.h>
49#include <asm/io.h> 50#include <asm/io.h>
50#include <asm/tlbflush.h> 51#include <asm/tlbflush.h>
51#include <asm/irq.h> 52#include <asm/irq.h>
53#include <asm/traps.h>
54#include <asm/vm86.h>
52 55
53/* 56/*
54 * Known problems: 57 * Known problems:
@@ -66,10 +69,6 @@
66 */ 69 */
67 70
68 71
69#define KVM86 ((struct kernel_vm86_struct *)regs)
70#define VMPI KVM86->vm86plus
71
72
73/* 72/*
74 * 8- and 16-bit register defines.. 73 * 8- and 16-bit register defines..
75 */ 74 */
@@ -81,8 +80,8 @@
81/* 80/*
82 * virtual flags (16 and 32-bit versions) 81 * virtual flags (16 and 32-bit versions)
83 */ 82 */
84#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) 83#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
85#define VEFLAGS (current->thread.v86flags) 84#define VEFLAGS (current->thread.vm86->veflags)
86 85
87#define set_flags(X, new, mask) \ 86#define set_flags(X, new, mask) \
88((X) = ((X) & ~(mask)) | ((new) & (mask))) 87((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -90,46 +89,13 @@
90#define SAFE_MASK (0xDD5) 89#define SAFE_MASK (0xDD5)
91#define RETURN_MASK (0xDFF) 90#define RETURN_MASK (0xDFF)
92 91
93/* convert kernel_vm86_regs to vm86_regs */ 92void save_v86_state(struct kernel_vm86_regs *regs, int retval)
94static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
95 const struct kernel_vm86_regs *regs)
96{
97 int ret = 0;
98
99 /*
100 * kernel_vm86_regs is missing gs, so copy everything up to
101 * (but not including) orig_eax, and then rest including orig_eax.
102 */
103 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
104 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
105 sizeof(struct kernel_vm86_regs) -
106 offsetof(struct kernel_vm86_regs, pt.orig_ax));
107
108 return ret;
109}
110
111/* convert vm86_regs to kernel_vm86_regs */
112static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
113 const struct vm86_regs __user *user,
114 unsigned extra)
115{
116 int ret = 0;
117
118 /* copy ax-fs inclusive */
119 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
120 /* copy orig_ax-__gsh+extra */
121 ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
122 sizeof(struct kernel_vm86_regs) -
123 offsetof(struct kernel_vm86_regs, pt.orig_ax) +
124 extra);
125 return ret;
126}
127
128struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
129{ 93{
130 struct tss_struct *tss; 94 struct tss_struct *tss;
131 struct pt_regs *ret; 95 struct task_struct *tsk = current;
132 unsigned long tmp; 96 struct vm86plus_struct __user *user;
97 struct vm86 *vm86 = current->thread.vm86;
98 long err = 0;
133 99
134 /* 100 /*
135 * This gets called from entry.S with interrupts disabled, but 101 * This gets called from entry.S with interrupts disabled, but
@@ -138,31 +104,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
138 */ 104 */
139 local_irq_enable(); 105 local_irq_enable();
140 106
141 if (!current->thread.vm86_info) { 107 if (!vm86 || !vm86->user_vm86) {
142 pr_alert("no vm86_info: BAD\n"); 108 pr_alert("no user_vm86: BAD\n");
143 do_exit(SIGSEGV); 109 do_exit(SIGSEGV);
144 } 110 }
145 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); 111 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
146 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); 112 user = vm86->user_vm86;
147 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); 113
148 if (tmp) { 114 if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
149 pr_alert("could not access userspace vm86_info\n"); 115 sizeof(struct vm86plus_struct) :
116 sizeof(struct vm86_struct))) {
117 pr_alert("could not access userspace vm86 info\n");
118 do_exit(SIGSEGV);
119 }
120
121 put_user_try {
122 put_user_ex(regs->pt.bx, &user->regs.ebx);
123 put_user_ex(regs->pt.cx, &user->regs.ecx);
124 put_user_ex(regs->pt.dx, &user->regs.edx);
125 put_user_ex(regs->pt.si, &user->regs.esi);
126 put_user_ex(regs->pt.di, &user->regs.edi);
127 put_user_ex(regs->pt.bp, &user->regs.ebp);
128 put_user_ex(regs->pt.ax, &user->regs.eax);
129 put_user_ex(regs->pt.ip, &user->regs.eip);
130 put_user_ex(regs->pt.cs, &user->regs.cs);
131 put_user_ex(regs->pt.flags, &user->regs.eflags);
132 put_user_ex(regs->pt.sp, &user->regs.esp);
133 put_user_ex(regs->pt.ss, &user->regs.ss);
134 put_user_ex(regs->es, &user->regs.es);
135 put_user_ex(regs->ds, &user->regs.ds);
136 put_user_ex(regs->fs, &user->regs.fs);
137 put_user_ex(regs->gs, &user->regs.gs);
138
139 put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
140 } put_user_catch(err);
141 if (err) {
142 pr_alert("could not access userspace vm86 info\n");
150 do_exit(SIGSEGV); 143 do_exit(SIGSEGV);
151 } 144 }
152 145
153 tss = &per_cpu(cpu_tss, get_cpu()); 146 tss = &per_cpu(cpu_tss, get_cpu());
154 current->thread.sp0 = current->thread.saved_sp0; 147 tsk->thread.sp0 = vm86->saved_sp0;
155 current->thread.sysenter_cs = __KERNEL_CS; 148 tsk->thread.sysenter_cs = __KERNEL_CS;
156 load_sp0(tss, &current->thread); 149 load_sp0(tss, &tsk->thread);
157 current->thread.saved_sp0 = 0; 150 vm86->saved_sp0 = 0;
158 put_cpu(); 151 put_cpu();
159 152
160 ret = KVM86->regs32; 153 memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
161 154
162 ret->fs = current->thread.saved_fs; 155 lazy_load_gs(vm86->regs32.gs);
163 set_user_gs(ret, current->thread.saved_gs);
164 156
165 return ret; 157 regs->pt.ax = retval;
166} 158}
167 159
168static void mark_screen_rdonly(struct mm_struct *mm) 160static void mark_screen_rdonly(struct mm_struct *mm)
@@ -200,45 +192,16 @@ out:
200 192
201 193
202static int do_vm86_irq_handling(int subfunction, int irqnumber); 194static int do_vm86_irq_handling(int subfunction, int irqnumber);
203static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 195static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
204 196
205SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) 197SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
206{ 198{
207 struct kernel_vm86_struct info; /* declare this _on top_, 199 return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
208 * this avoids wasting of stack space.
209 * This remains on the stack until we
210 * return to 32 bit user space.
211 */
212 struct task_struct *tsk = current;
213 int tmp;
214
215 if (tsk->thread.saved_sp0)
216 return -EPERM;
217 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
218 offsetof(struct kernel_vm86_struct, vm86plus) -
219 sizeof(info.regs));
220 if (tmp)
221 return -EFAULT;
222 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
223 info.regs32 = current_pt_regs();
224 tsk->thread.vm86_info = v86;
225 do_sys_vm86(&info, tsk);
226 return 0; /* we never return here */
227} 200}
228 201
229 202
230SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) 203SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
231{ 204{
232 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space.
234 * This remains on the stack until we
235 * return to 32 bit user space.
236 */
237 struct task_struct *tsk;
238 int tmp;
239 struct vm86plus_struct __user *v86;
240
241 tsk = current;
242 switch (cmd) { 205 switch (cmd) {
243 case VM86_REQUEST_IRQ: 206 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 207 case VM86_FREE_IRQ:
@@ -256,114 +219,133 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
256 } 219 }
257 220
258 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ 221 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
259 if (tsk->thread.saved_sp0) 222 return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
260 return -EPERM;
261 v86 = (struct vm86plus_struct __user *)arg;
262 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
263 offsetof(struct kernel_vm86_struct, regs32) -
264 sizeof(info.regs));
265 if (tmp)
266 return -EFAULT;
267 info.regs32 = current_pt_regs();
268 info.vm86plus.is_vm86pus = 1;
269 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
270 do_sys_vm86(&info, tsk);
271 return 0; /* we never return here */
272} 223}
273 224
274 225
275static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 226static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
276{ 227{
277 struct tss_struct *tss; 228 struct tss_struct *tss;
278/* 229 struct task_struct *tsk = current;
279 * make sure the vm86() system call doesn't try to do anything silly 230 struct vm86 *vm86 = tsk->thread.vm86;
280 */ 231 struct kernel_vm86_regs vm86regs;
281 info->regs.pt.ds = 0; 232 struct pt_regs *regs = current_pt_regs();
282 info->regs.pt.es = 0; 233 unsigned long err = 0;
283 info->regs.pt.fs = 0; 234
284#ifndef CONFIG_X86_32_LAZY_GS 235 if (!vm86) {
285 info->regs.pt.gs = 0; 236 if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
286#endif 237 return -ENOMEM;
238 tsk->thread.vm86 = vm86;
239 }
240 if (vm86->saved_sp0)
241 return -EPERM;
242
243 if (!access_ok(VERIFY_READ, user_vm86, plus ?
244 sizeof(struct vm86_struct) :
245 sizeof(struct vm86plus_struct)))
246 return -EFAULT;
247
248 memset(&vm86regs, 0, sizeof(vm86regs));
249 get_user_try {
250 unsigned short seg;
251 get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
252 get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
253 get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
254 get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
255 get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
256 get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
257 get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
258 get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
259 get_user_ex(seg, &user_vm86->regs.cs);
260 vm86regs.pt.cs = seg;
261 get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
262 get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
263 get_user_ex(seg, &user_vm86->regs.ss);
264 vm86regs.pt.ss = seg;
265 get_user_ex(vm86regs.es, &user_vm86->regs.es);
266 get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
267 get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
268 get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
269
270 get_user_ex(vm86->flags, &user_vm86->flags);
271 get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
272 get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
273 } get_user_catch(err);
274 if (err)
275 return err;
276
277 if (copy_from_user(&vm86->int_revectored,
278 &user_vm86->int_revectored,
279 sizeof(struct revectored_struct)))
280 return -EFAULT;
281 if (copy_from_user(&vm86->int21_revectored,
282 &user_vm86->int21_revectored,
283 sizeof(struct revectored_struct)))
284 return -EFAULT;
285 if (plus) {
286 if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
287 sizeof(struct vm86plus_info_struct)))
288 return -EFAULT;
289 vm86->vm86plus.is_vm86pus = 1;
290 } else
291 memset(&vm86->vm86plus, 0,
292 sizeof(struct vm86plus_info_struct));
293
294 memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
295 vm86->user_vm86 = user_vm86;
287 296
288/* 297/*
289 * The flags register is also special: we cannot trust that the user 298 * The flags register is also special: we cannot trust that the user
290 * has set it up safely, so this makes sure interrupt etc flags are 299 * has set it up safely, so this makes sure interrupt etc flags are
291 * inherited from protected mode. 300 * inherited from protected mode.
292 */ 301 */
293 VEFLAGS = info->regs.pt.flags; 302 VEFLAGS = vm86regs.pt.flags;
294 info->regs.pt.flags &= SAFE_MASK; 303 vm86regs.pt.flags &= SAFE_MASK;
295 info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; 304 vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
296 info->regs.pt.flags |= X86_VM_MASK; 305 vm86regs.pt.flags |= X86_VM_MASK;
306
307 vm86regs.pt.orig_ax = regs->orig_ax;
297 308
298 switch (info->cpu_type) { 309 switch (vm86->cpu_type) {
299 case CPU_286: 310 case CPU_286:
300 tsk->thread.v86mask = 0; 311 vm86->veflags_mask = 0;
301 break; 312 break;
302 case CPU_386: 313 case CPU_386:
303 tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; 314 vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
304 break; 315 break;
305 case CPU_486: 316 case CPU_486:
306 tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 317 vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
307 break; 318 break;
308 default: 319 default:
309 tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; 320 vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
310 break; 321 break;
311 } 322 }
312 323
313/* 324/*
314 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) 325 * Save old state
315 */ 326 */
316 info->regs32->ax = VM86_SIGNAL; 327 vm86->saved_sp0 = tsk->thread.sp0;
317 tsk->thread.saved_sp0 = tsk->thread.sp0; 328 lazy_save_gs(vm86->regs32.gs);
318 tsk->thread.saved_fs = info->regs32->fs;
319 tsk->thread.saved_gs = get_user_gs(info->regs32);
320 329
321 tss = &per_cpu(cpu_tss, get_cpu()); 330 tss = &per_cpu(cpu_tss, get_cpu());
322 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 331 /* make room for real-mode segments */
332 tsk->thread.sp0 += 16;
323 if (cpu_has_sep) 333 if (cpu_has_sep)
324 tsk->thread.sysenter_cs = 0; 334 tsk->thread.sysenter_cs = 0;
325 load_sp0(tss, &tsk->thread); 335 load_sp0(tss, &tsk->thread);
326 put_cpu(); 336 put_cpu();
327 337
328 tsk->thread.screen_bitmap = info->screen_bitmap; 338 if (vm86->flags & VM86_SCREEN_BITMAP)
329 if (info->flags & VM86_SCREEN_BITMAP)
330 mark_screen_rdonly(tsk->mm); 339 mark_screen_rdonly(tsk->mm);
331 340
332 /*call __audit_syscall_exit since we do not exit via the normal paths */ 341 memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
333#ifdef CONFIG_AUDITSYSCALL 342 force_iret();
334 if (unlikely(current->audit_context)) 343 return regs->ax;
335 __audit_syscall_exit(1, 0);
336#endif
337
338 __asm__ __volatile__(
339 "movl %0,%%esp\n\t"
340 "movl %1,%%ebp\n\t"
341#ifdef CONFIG_X86_32_LAZY_GS
342 "mov %2, %%gs\n\t"
343#endif
344 "jmp resume_userspace"
345 : /* no outputs */
346 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
347 /* we never return here */
348}
349
350static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
351{
352 struct pt_regs *regs32;
353
354 regs32 = save_v86_state(regs16);
355 regs32->ax = retval;
356 __asm__ __volatile__("movl %0,%%esp\n\t"
357 "movl %1,%%ebp\n\t"
358 "jmp resume_userspace"
359 : : "r" (regs32), "r" (current_thread_info()));
360} 344}
361 345
362static inline void set_IF(struct kernel_vm86_regs *regs) 346static inline void set_IF(struct kernel_vm86_regs *regs)
363{ 347{
364 VEFLAGS |= X86_EFLAGS_VIF; 348 VEFLAGS |= X86_EFLAGS_VIF;
365 if (VEFLAGS & X86_EFLAGS_VIP)
366 return_to_32bit(regs, VM86_STI);
367} 349}
368 350
369static inline void clear_IF(struct kernel_vm86_regs *regs) 351static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -395,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
395 377
396static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) 378static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
397{ 379{
398 set_flags(VEFLAGS, flags, current->thread.v86mask); 380 set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
399 set_flags(regs->pt.flags, flags, SAFE_MASK); 381 set_flags(regs->pt.flags, flags, SAFE_MASK);
400 if (flags & X86_EFLAGS_IF) 382 if (flags & X86_EFLAGS_IF)
401 set_IF(regs); 383 set_IF(regs);
@@ -405,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
405 387
406static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) 388static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
407{ 389{
408 set_flags(VFLAGS, flags, current->thread.v86mask); 390 set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
409 set_flags(regs->pt.flags, flags, SAFE_MASK); 391 set_flags(regs->pt.flags, flags, SAFE_MASK);
410 if (flags & X86_EFLAGS_IF) 392 if (flags & X86_EFLAGS_IF)
411 set_IF(regs); 393 set_IF(regs);
@@ -420,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
420 if (VEFLAGS & X86_EFLAGS_VIF) 402 if (VEFLAGS & X86_EFLAGS_VIF)
421 flags |= X86_EFLAGS_IF; 403 flags |= X86_EFLAGS_IF;
422 flags |= X86_EFLAGS_IOPL; 404 flags |= X86_EFLAGS_IOPL;
423 return flags | (VEFLAGS & current->thread.v86mask); 405 return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
424} 406}
425 407
426static inline int is_revectored(int nr, struct revectored_struct *bitmap) 408static inline int is_revectored(int nr, struct revectored_struct *bitmap)
@@ -518,12 +500,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
518{ 500{
519 unsigned long __user *intr_ptr; 501 unsigned long __user *intr_ptr;
520 unsigned long segoffs; 502 unsigned long segoffs;
503 struct vm86 *vm86 = current->thread.vm86;
521 504
522 if (regs->pt.cs == BIOSSEG) 505 if (regs->pt.cs == BIOSSEG)
523 goto cannot_handle; 506 goto cannot_handle;
524 if (is_revectored(i, &KVM86->int_revectored)) 507 if (is_revectored(i, &vm86->int_revectored))
525 goto cannot_handle; 508 goto cannot_handle;
526 if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) 509 if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
527 goto cannot_handle; 510 goto cannot_handle;
528 intr_ptr = (unsigned long __user *) (i << 2); 511 intr_ptr = (unsigned long __user *) (i << 2);
529 if (get_user(segoffs, intr_ptr)) 512 if (get_user(segoffs, intr_ptr))
@@ -542,18 +525,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
542 return; 525 return;
543 526
544cannot_handle: 527cannot_handle:
545 return_to_32bit(regs, VM86_INTx + (i << 8)); 528 save_v86_state(regs, VM86_INTx + (i << 8));
546} 529}
547 530
548int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 531int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
549{ 532{
550 if (VMPI.is_vm86pus) { 533 struct vm86 *vm86 = current->thread.vm86;
534
535 if (vm86->vm86plus.is_vm86pus) {
551 if ((trapno == 3) || (trapno == 1)) { 536 if ((trapno == 3) || (trapno == 1)) {
552 KVM86->regs32->ax = VM86_TRAP + (trapno << 8); 537 save_v86_state(regs, VM86_TRAP + (trapno << 8));
553 /* setting this flag forces the code in entry_32.S to
554 the path where we call save_v86_state() and change
555 the stack pointer to KVM86->regs32 */
556 set_thread_flag(TIF_NOTIFY_RESUME);
557 return 0; 538 return 0;
558 } 539 }
559 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 540 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -574,16 +555,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
574 unsigned char __user *ssp; 555 unsigned char __user *ssp;
575 unsigned short ip, sp, orig_flags; 556 unsigned short ip, sp, orig_flags;
576 int data32, pref_done; 557 int data32, pref_done;
558 struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
577 559
578#define CHECK_IF_IN_TRAP \ 560#define CHECK_IF_IN_TRAP \
579 if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ 561 if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
580 newflags |= X86_EFLAGS_TF 562 newflags |= X86_EFLAGS_TF
581#define VM86_FAULT_RETURN do { \
582 if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
583 return_to_32bit(regs, VM86_PICRETURN); \
584 if (orig_flags & X86_EFLAGS_TF) \
585 handle_vm86_trap(regs, 0, 1); \
586 return; } while (0)
587 563
588 orig_flags = *(unsigned short *)&regs->pt.flags; 564 orig_flags = *(unsigned short *)&regs->pt.flags;
589 565
@@ -622,7 +598,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
622 SP(regs) -= 2; 598 SP(regs) -= 2;
623 } 599 }
624 IP(regs) = ip; 600 IP(regs) = ip;
625 VM86_FAULT_RETURN; 601 goto vm86_fault_return;
626 602
627 /* popf */ 603 /* popf */
628 case 0x9d: 604 case 0x9d:
@@ -642,16 +618,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
642 else 618 else
643 set_vflags_short(newflags, regs); 619 set_vflags_short(newflags, regs);
644 620
645 VM86_FAULT_RETURN; 621 goto check_vip;
646 } 622 }
647 623
648 /* int xx */ 624 /* int xx */
649 case 0xcd: { 625 case 0xcd: {
650 int intno = popb(csp, ip, simulate_sigsegv); 626 int intno = popb(csp, ip, simulate_sigsegv);
651 IP(regs) = ip; 627 IP(regs) = ip;
652 if (VMPI.vm86dbg_active) { 628 if (vmpi->vm86dbg_active) {
653 if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) 629 if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
654 return_to_32bit(regs, VM86_INTx + (intno << 8)); 630 save_v86_state(regs, VM86_INTx + (intno << 8));
631 return;
632 }
655 } 633 }
656 do_int(regs, intno, ssp, sp); 634 do_int(regs, intno, ssp, sp);
657 return; 635 return;
@@ -682,14 +660,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
682 } else { 660 } else {
683 set_vflags_short(newflags, regs); 661 set_vflags_short(newflags, regs);
684 } 662 }
685 VM86_FAULT_RETURN; 663 goto check_vip;
686 } 664 }
687 665
688 /* cli */ 666 /* cli */
689 case 0xfa: 667 case 0xfa:
690 IP(regs) = ip; 668 IP(regs) = ip;
691 clear_IF(regs); 669 clear_IF(regs);
692 VM86_FAULT_RETURN; 670 goto vm86_fault_return;
693 671
694 /* sti */ 672 /* sti */
695 /* 673 /*
@@ -701,14 +679,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
701 case 0xfb: 679 case 0xfb:
702 IP(regs) = ip; 680 IP(regs) = ip;
703 set_IF(regs); 681 set_IF(regs);
704 VM86_FAULT_RETURN; 682 goto check_vip;
705 683
706 default: 684 default:
707 return_to_32bit(regs, VM86_UNKNOWN); 685 save_v86_state(regs, VM86_UNKNOWN);
708 } 686 }
709 687
710 return; 688 return;
711 689
690check_vip:
691 if (VEFLAGS & X86_EFLAGS_VIP) {
692 save_v86_state(regs, VM86_STI);
693 return;
694 }
695
696vm86_fault_return:
697 if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
698 save_v86_state(regs, VM86_PICRETURN);
699 return;
700 }
701 if (orig_flags & X86_EFLAGS_TF)
702 handle_vm86_trap(regs, 0, X86_TRAP_DB);
703 return;
704
712simulate_sigsegv: 705simulate_sigsegv:
713 /* FIXME: After a long discussion with Stas we finally 706 /* FIXME: After a long discussion with Stas we finally
714 * agreed, that this is wrong. Here we should 707 * agreed, that this is wrong. Here we should
@@ -720,7 +713,7 @@ simulate_sigsegv:
720 * should be a mixture of the two, but how do we 713 * should be a mixture of the two, but how do we
721 * get the information? [KD] 714 * get the information? [KD]
722 */ 715 */
723 return_to_32bit(regs, VM86_UNKNOWN); 716 save_v86_state(regs, VM86_UNKNOWN);
724} 717}
725 718
726/* ---------------- vm86 special IRQ passing stuff ----------------- */ 719/* ---------------- vm86 special IRQ passing stuff ----------------- */
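The hunks above funnel every vm86 exit through save_v86_state() while keeping the long-standing rule that a user-supplied EFLAGS image is only partially trusted: "safe" bits come from the user, IF/IOPL and friends are inherited from the protected-mode frame, and the VM bit is forced on. A minimal user-space sketch of that rule, with illustrative mask values rather than the kernel's exact SAFE_MASK/X86_VM_MASK definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the kernel derives these from its EFLAGS
 * bit definitions (SAFE_MASK covers CF/PF/AF/ZF/SF/TF/DF/OF/NT-style
 * bits, the VM flag is bit 17). */
#define DEMO_SAFE_MASK	0x00000DD5u
#define DEMO_VM_FLAG	(1u << 17)

static uint32_t build_vm86_eflags(uint32_t user_flags, uint32_t kernel_flags)
{
	uint32_t flags = user_flags & DEMO_SAFE_MASK;	/* bits the user may set */

	flags |= kernel_flags & ~DEMO_SAFE_MASK;	/* IF, IOPL, ... inherited */
	flags |= DEMO_VM_FLAG;				/* enter virtual-8086 mode */
	return flags;
}

int main(void)
{
	printf("0x%08x\n", build_vm86_eflags(0xFFFFFFFFu, 0x00000202u));
	return 0;
}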
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9a3e342e3cda..8d9013c5e1ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
1172 1172
1173 tsc_deadline = apic->lapic_timer.expired_tscdeadline; 1173 tsc_deadline = apic->lapic_timer.expired_tscdeadline;
1174 apic->lapic_timer.expired_tscdeadline = 0; 1174 apic->lapic_timer.expired_tscdeadline = 0;
1175 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1175 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
1176 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); 1176 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
1177 1177
1178 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ 1178 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
1240 local_irq_save(flags); 1240 local_irq_save(flags);
1241 1241
1242 now = apic->lapic_timer.timer.base->get_time(); 1242 now = apic->lapic_timer.timer.base->get_time();
1243 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1243 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
1244 if (likely(tscdeadline > guest_tsc)) { 1244 if (likely(tscdeadline > guest_tsc)) {
1245 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1245 ns = (tscdeadline - guest_tsc) * 1000000ULL;
1246 do_div(ns, this_tsc_khz); 1246 do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 74d825716f4f..fdb8cb63a6c0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1139,7 +1139,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1139{ 1139{
1140 u64 tsc; 1140 u64 tsc;
1141 1141
1142 tsc = svm_scale_tsc(vcpu, native_read_tsc()); 1142 tsc = svm_scale_tsc(vcpu, rdtsc());
1143 1143
1144 return target_tsc - tsc; 1144 return target_tsc - tsc;
1145} 1145}
@@ -3174,7 +3174,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3174 switch (msr_info->index) { 3174 switch (msr_info->index) {
3175 case MSR_IA32_TSC: { 3175 case MSR_IA32_TSC: {
3176 msr_info->data = svm->vmcb->control.tsc_offset + 3176 msr_info->data = svm->vmcb->control.tsc_offset +
3177 svm_scale_tsc(vcpu, native_read_tsc()); 3177 svm_scale_tsc(vcpu, rdtsc());
3178 3178
3179 break; 3179 break;
3180 } 3180 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da1590ea43fc..4a4eec30cc08 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
2236{ 2236{
2237 u64 host_tsc, tsc_offset; 2237 u64 host_tsc, tsc_offset;
2238 2238
2239 rdtscll(host_tsc); 2239 host_tsc = rdtsc();
2240 tsc_offset = vmcs_read64(TSC_OFFSET); 2240 tsc_offset = vmcs_read64(TSC_OFFSET);
2241 return host_tsc + tsc_offset; 2241 return host_tsc + tsc_offset;
2242} 2242}
@@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
2317 2317
2318static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2318static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2319{ 2319{
2320 return target_tsc - native_read_tsc(); 2320 return target_tsc - rdtsc();
2321} 2321}
2322 2322
2323static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2323static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4bbc2a1676c9..1e7e76e14e89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1441,20 +1441,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
1441 1441
1442static cycle_t read_tsc(void) 1442static cycle_t read_tsc(void)
1443{ 1443{
1444 cycle_t ret; 1444 cycle_t ret = (cycle_t)rdtsc_ordered();
1445 u64 last; 1445 u64 last = pvclock_gtod_data.clock.cycle_last;
1446
1447 /*
1448 * Empirically, a fence (of type that depends on the CPU)
1449 * before rdtsc is enough to ensure that rdtsc is ordered
1450 * with respect to loads. The various CPU manuals are unclear
1451 * as to whether rdtsc can be reordered with later loads,
1452 * but no one has ever seen it happen.
1453 */
1454 rdtsc_barrier();
1455 ret = (cycle_t)vget_cycles();
1456
1457 last = pvclock_gtod_data.clock.cycle_last;
1458 1446
1459 if (likely(ret >= last)) 1447 if (likely(ret >= last))
1460 return ret; 1448 return ret;
@@ -1643,7 +1631,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1643 return 1; 1631 return 1;
1644 } 1632 }
1645 if (!use_master_clock) { 1633 if (!use_master_clock) {
1646 host_tsc = native_read_tsc(); 1634 host_tsc = rdtsc();
1647 kernel_ns = get_kernel_ns(); 1635 kernel_ns = get_kernel_ns();
1648 } 1636 }
1649 1637
@@ -2620,7 +2608,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2620 2608
2621 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2609 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2622 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 2610 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2623 native_read_tsc() - vcpu->arch.last_host_tsc; 2611 rdtsc() - vcpu->arch.last_host_tsc;
2624 if (tsc_delta < 0) 2612 if (tsc_delta < 0)
2625 mark_tsc_unstable("KVM discovered backwards TSC"); 2613 mark_tsc_unstable("KVM discovered backwards TSC");
2626 if (check_tsc_unstable()) { 2614 if (check_tsc_unstable()) {
@@ -2648,7 +2636,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2648{ 2636{
2649 kvm_x86_ops->vcpu_put(vcpu); 2637 kvm_x86_ops->vcpu_put(vcpu);
2650 kvm_put_guest_fpu(vcpu); 2638 kvm_put_guest_fpu(vcpu);
2651 vcpu->arch.last_host_tsc = native_read_tsc(); 2639 vcpu->arch.last_host_tsc = rdtsc();
2652} 2640}
2653 2641
2654static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2642static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6387,7 +6375,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6387 hw_breakpoint_restore(); 6375 hw_breakpoint_restore();
6388 6376
6389 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 6377 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
6390 native_read_tsc()); 6378 rdtsc());
6391 6379
6392 vcpu->mode = OUTSIDE_GUEST_MODE; 6380 vcpu->mode = OUTSIDE_GUEST_MODE;
6393 smp_wmb(); 6381 smp_wmb();
@@ -7196,7 +7184,7 @@ int kvm_arch_hardware_enable(void)
7196 if (ret != 0) 7184 if (ret != 0)
7197 return ret; 7185 return ret;
7198 7186
7199 local_tsc = native_read_tsc(); 7187 local_tsc = rdtsc();
7200 stable = !check_tsc_unstable(); 7188 stable = !check_tsc_unstable();
7201 list_for_each_entry(kvm, &vm_list, vm_list) { 7189 list_for_each_entry(kvm, &vm_list, vm_list) {
7202 kvm_for_each_vcpu(i, vcpu, kvm) { 7190 kvm_for_each_vcpu(i, vcpu, kvm) {
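read_tsc() above now leans on rdtsc_ordered() and keeps only the monotonicity clamp against cycle_last. A stand-alone sketch of that clamp, with a hypothetical demo_cycle_last variable standing in for pvclock_gtod_data.clock.cycle_last:

#include <stdint.h>

/* Hypothetical stand-in for pvclock_gtod_data.clock.cycle_last. */
static uint64_t demo_cycle_last;

static uint64_t read_clamped(uint64_t tsc_sample)
{
	uint64_t last = demo_cycle_last;

	/* Normal case: time moved forward. */
	if (tsc_sample >= last)
		return tsc_sample;

	/*
	 * The sample is behind the last published value (e.g. tiny
	 * cross-CPU TSC skew); hand back the old value so the clock
	 * never appears to run backwards.
	 */
	return last;
}

int main(void)
{
	demo_cycle_last = 1000;
	return read_clamped(990) == 1000 ? 0 : 1;
}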
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 39d6a3db0b96..e912b2f6d36e 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -20,6 +20,7 @@
20#include <asm/processor.h> 20#include <asm/processor.h>
21#include <asm/delay.h> 21#include <asm/delay.h>
22#include <asm/timer.h> 22#include <asm/timer.h>
23#include <asm/mwait.h>
23 24
24#ifdef CONFIG_SMP 25#ifdef CONFIG_SMP
25# include <asm/smp.h> 26# include <asm/smp.h>
@@ -49,16 +50,14 @@ static void delay_loop(unsigned long loops)
49/* TSC based delay: */ 50/* TSC based delay: */
50static void delay_tsc(unsigned long __loops) 51static void delay_tsc(unsigned long __loops)
51{ 52{
52 u32 bclock, now, loops = __loops; 53 u64 bclock, now, loops = __loops;
53 int cpu; 54 int cpu;
54 55
55 preempt_disable(); 56 preempt_disable();
56 cpu = smp_processor_id(); 57 cpu = smp_processor_id();
57 rdtsc_barrier(); 58 bclock = rdtsc_ordered();
58 rdtscl(bclock);
59 for (;;) { 59 for (;;) {
60 rdtsc_barrier(); 60 now = rdtsc_ordered();
61 rdtscl(now);
62 if ((now - bclock) >= loops) 61 if ((now - bclock) >= loops)
63 break; 62 break;
64 63
@@ -79,14 +78,51 @@ static void delay_tsc(unsigned long __loops)
79 if (unlikely(cpu != smp_processor_id())) { 78 if (unlikely(cpu != smp_processor_id())) {
80 loops -= (now - bclock); 79 loops -= (now - bclock);
81 cpu = smp_processor_id(); 80 cpu = smp_processor_id();
82 rdtsc_barrier(); 81 bclock = rdtsc_ordered();
83 rdtscl(bclock);
84 } 82 }
85 } 83 }
86 preempt_enable(); 84 preempt_enable();
87} 85}
88 86
89/* 87/*
88 * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
89 * counts with TSC frequency. The input value is the loop of the
90 * counter, it will exit when the timer expires.
91 */
92static void delay_mwaitx(unsigned long __loops)
93{
94 u64 start, end, delay, loops = __loops;
95
96 start = rdtsc_ordered();
97
98 for (;;) {
99 delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
100
101 /*
102 * Use cpu_tss as a cacheline-aligned, seldomly
103 * accessed per-cpu variable as the monitor target.
104 */
105 __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
106
107 /*
108 * AMD, like Intel, supports the EAX hint and EAX=0xf
109 * means, do not enter any deep C-state and we use it
110 * here in delay() to minimize wakeup latency.
111 */
112 __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
113
114 end = rdtsc_ordered();
115
116 if (loops <= end - start)
117 break;
118
119 loops -= end - start;
120
121 start = end;
122 }
123}
124
125/*
90 * Since we calibrate only once at boot, this 126 * Since we calibrate only once at boot, this
91 * function should be set once at boot and not changed 127 * function should be set once at boot and not changed
92 */ 128 */
@@ -94,13 +130,19 @@ static void (*delay_fn)(unsigned long) = delay_loop;
94 130
95void use_tsc_delay(void) 131void use_tsc_delay(void)
96{ 132{
97 delay_fn = delay_tsc; 133 if (delay_fn == delay_loop)
134 delay_fn = delay_tsc;
135}
136
137void use_mwaitx_delay(void)
138{
139 delay_fn = delay_mwaitx;
98} 140}
99 141
100int read_current_timer(unsigned long *timer_val) 142int read_current_timer(unsigned long *timer_val)
101{ 143{
102 if (delay_fn == delay_tsc) { 144 if (delay_fn == delay_tsc) {
103 rdtscll(*timer_val); 145 *timer_val = rdtsc();
104 return 0; 146 return 0;
105 } 147 }
106 return -1; 148 return -1;
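delay_tsc() above replaces the rdtsc_barrier()/rdtscl() pair with rdtsc_ordered() and widens its bookkeeping to 64 bits. A rough user-space equivalent of that busy-wait, using compiler intrinsics as a stand-in for the kernel's barrier alternatives:

#include <stdint.h>
#include <x86intrin.h>

static inline uint64_t tsc_ordered(void)
{
	_mm_lfence();			/* keep the TSC read from moving earlier */
	return __rdtsc();
}

static void delay_cycles(uint64_t cycles)
{
	uint64_t start = tsc_ordered();

	while (tsc_ordered() - start < cycles)
		_mm_pause();		/* ease pressure on the sibling thread */
}

int main(void)
{
	delay_cycles(1000000);		/* spin for roughly 1M TSC cycles */
	return 0;
}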
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 8300db71c2a6..8db26591d91a 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -20,6 +20,7 @@
20#include <linux/stddef.h> 20#include <linux/stddef.h>
21 21
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23#include <asm/vm86.h>
23 24
24#include "fpu_system.h" 25#include "fpu_system.h"
25#include "exception.h" 26#include "exception.h"
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9dc909841739..eef44d9a3f77 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
20#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 20#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
21#include <asm/fixmap.h> /* VSYSCALL_ADDR */ 21#include <asm/fixmap.h> /* VSYSCALL_ADDR */
22#include <asm/vsyscall.h> /* emulate_vsyscall */ 22#include <asm/vsyscall.h> /* emulate_vsyscall */
23#include <asm/vm86.h> /* struct vm86 */
23 24
24#define CREATE_TRACE_POINTS 25#define CREATE_TRACE_POINTS
25#include <asm/trace/exceptions.h> 26#include <asm/trace/exceptions.h>
@@ -301,14 +302,16 @@ static inline void
301check_v8086_mode(struct pt_regs *regs, unsigned long address, 302check_v8086_mode(struct pt_regs *regs, unsigned long address,
302 struct task_struct *tsk) 303 struct task_struct *tsk)
303{ 304{
305#ifdef CONFIG_VM86
304 unsigned long bit; 306 unsigned long bit;
305 307
306 if (!v8086_mode(regs)) 308 if (!v8086_mode(regs) || !tsk->thread.vm86)
307 return; 309 return;
308 310
309 bit = (address - 0xA0000) >> PAGE_SHIFT; 311 bit = (address - 0xA0000) >> PAGE_SHIFT;
310 if (bit < 32) 312 if (bit < 32)
311 tsk->thread.screen_bitmap |= 1 << bit; 313 tsk->thread.vm86->screen_bitmap |= 1 << bit;
314#endif
312} 315}
313 316
314static bool low_pfn(unsigned long pfn) 317static bool low_pfn(unsigned long pfn)
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index b9531d343134..755481f14d90 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -45,17 +45,4 @@
45#define read_barrier_depends() do { } while (0) 45#define read_barrier_depends() do { } while (0)
46#define smp_read_barrier_depends() do { } while (0) 46#define smp_read_barrier_depends() do { } while (0)
47 47
48/*
49 * Stop RDTSC speculation. This is needed when you need to use RDTSC
50 * (or get_cycles or vread that possibly accesses the TSC) in a defined
51 * code region.
52 *
53 * (Could use an alternative three way for this if there was one.)
54 */
55static inline void rdtsc_barrier(void)
56{
57 alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
58 "lfence", X86_FEATURE_LFENCE_RDTSC);
59}
60
61#endif 48#endif
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 11d6fb4e8483..d9cfa452da9d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1215,11 +1215,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1215 .read_msr = xen_read_msr_safe, 1215 .read_msr = xen_read_msr_safe,
1216 .write_msr = xen_write_msr_safe, 1216 .write_msr = xen_write_msr_safe,
1217 1217
1218 .read_tsc = native_read_tsc,
1219 .read_pmc = native_read_pmc, 1218 .read_pmc = native_read_pmc,
1220 1219
1221 .read_tscp = native_read_tscp,
1222
1223 .iret = xen_iret, 1220 .iret = xen_iret,
1224#ifdef CONFIG_X86_64 1221#ifdef CONFIG_X86_64
1225 .usergs_sysret32 = xen_sysret32, 1222 .usergs_sysret32 = xen_sysret32,
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fcb929ec5304..7898de054f4e 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -766,7 +766,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
766 local_irq_save(flags); 766 local_irq_save(flags);
767 rdmsrl(MSR_IA32_APERF, aperf); 767 rdmsrl(MSR_IA32_APERF, aperf);
768 rdmsrl(MSR_IA32_MPERF, mperf); 768 rdmsrl(MSR_IA32_MPERF, mperf);
769 tsc = native_read_tsc(); 769 tsc = rdtsc();
770 local_irq_restore(flags); 770 local_irq_restore(flags);
771 771
772 cpu->last_sample_time = cpu->sample.time; 772 cpu->last_sample_time = cpu->sample.time;
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index e853a2134680..4a2a9e370be7 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport)
149 149
150 for(i = 0; i < 50; i++) { 150 for(i = 0; i < 50; i++) {
151 local_irq_save(flags); 151 local_irq_save(flags);
152 rdtscl(t1); 152 t1 = rdtsc();
153 for (t = 0; t < 50; t++) gameport_read(gameport); 153 for (t = 0; t < 50; t++) gameport_read(gameport);
154 rdtscl(t2); 154 t2 = rdtsc();
155 local_irq_restore(flags); 155 local_irq_restore(flags);
156 udelay(i * 10); 156 udelay(i * 10);
157 if (t2 - t1 < tx) tx = t2 - t1; 157 if (t2 - t1 < tx) tx = t2 - t1;
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 4284080e481d..6f8b084e13d0 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -143,7 +143,7 @@ struct analog_port {
143 143
144#include <linux/i8253.h> 144#include <linux/i8253.h>
145 145
146#define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0) 146#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0)
147#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) 147#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
148#define TIME_NAME (cpu_has_tsc?"TSC":"PIT") 148#define TIME_NAME (cpu_has_tsc?"TSC":"PIT")
149static unsigned int get_time_pit(void) 149static unsigned int get_time_pit(void)
@@ -160,7 +160,7 @@ static unsigned int get_time_pit(void)
160 return count; 160 return count;
161} 161}
162#elif defined(__x86_64__) 162#elif defined(__x86_64__)
163#define GET_TIME(x) rdtscl(x) 163#define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0)
164#define DELTA(x,y) ((y)-(x)) 164#define DELTA(x,y) ((y)-(x))
165#define TIME_NAME "TSC" 165#define TIME_NAME "TSC"
166#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE) 166#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
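The driver conversions above truncate rdtsc() to 32 bits where rdtscl() used to be, which is safe for the short intervals they measure: unsigned subtraction of two truncated samples still yields the right delta even across a 32-bit wrap. A tiny demonstration of that arithmetic:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint64_t t1 = 0xFFFFFFFF0ULL;		/* just before a 32-bit wrap */
	uint64_t t2 = t1 + 100;			/* 100 cycles later */
	uint32_t delta = (uint32_t)t2 - (uint32_t)t1;

	assert(delta == 100);			/* delta survives the wrap */
	return 0;
}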
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 83c7cce0d172..72c9f1f352b4 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt)
638#define GETTICK(x) \ 638#define GETTICK(x) \
639({ \ 639({ \
640 if (cpu_has_tsc) \ 640 if (cpu_has_tsc) \
641 rdtscl(x); \ 641 x = (unsigned int)rdtsc(); \
642}) 642})
643#else /* __i386__ */ 643#else /* __i386__ */
644#define GETTICK(x) 644#define GETTICK(x)
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index f35ed53adaac..d4cda5e9600e 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -1924,6 +1924,9 @@ static void adpt_alpha_info(sysInfo_S* si)
1924#endif 1924#endif
1925 1925
1926#if defined __i386__ 1926#if defined __i386__
1927
1928#include <uapi/asm/vm86.h>
1929
1927static void adpt_i386_info(sysInfo_S* si) 1930static void adpt_i386_info(sysInfo_S* si)
1928{ 1931{
1929 // This is all the info we need for now 1932 // This is all the info we need for now
diff --git a/drivers/staging/media/lirc/lirc_serial.c b/drivers/staging/media/lirc/lirc_serial.c
index dc7984455c3a..465796a686c4 100644
--- a/drivers/staging/media/lirc/lirc_serial.c
+++ b/drivers/staging/media/lirc/lirc_serial.c
@@ -327,9 +327,6 @@ static void safe_udelay(unsigned long usecs)
327 * time 327 * time
328 */ 328 */
329 329
330/* So send_pulse can quickly convert microseconds to clocks */
331static unsigned long conv_us_to_clocks;
332
333static int init_timing_params(unsigned int new_duty_cycle, 330static int init_timing_params(unsigned int new_duty_cycle,
334 unsigned int new_freq) 331 unsigned int new_freq)
335{ 332{
@@ -344,7 +341,6 @@ static int init_timing_params(unsigned int new_duty_cycle,
344 /* How many clocks in a microsecond?, avoiding long long divide */ 341 /* How many clocks in a microsecond?, avoiding long long divide */
345 work = loops_per_sec; 342 work = loops_per_sec;
346 work *= 4295; /* 4295 = 2^32 / 1e6 */ 343 work *= 4295; /* 4295 = 2^32 / 1e6 */
347 conv_us_to_clocks = work >> 32;
348 344
349 /* 345 /*
350 * Carrier period in clocks, approach good up to 32GHz clock, 346 * Carrier period in clocks, approach good up to 32GHz clock,
@@ -357,10 +353,9 @@ static int init_timing_params(unsigned int new_duty_cycle,
357 pulse_width = period * duty_cycle / 100; 353 pulse_width = period * duty_cycle / 100;
358 space_width = period - pulse_width; 354 space_width = period - pulse_width;
359 dprintk("in init_timing_params, freq=%d, duty_cycle=%d, " 355 dprintk("in init_timing_params, freq=%d, duty_cycle=%d, "
360 "clk/jiffy=%ld, pulse=%ld, space=%ld, " 356 "clk/jiffy=%ld, pulse=%ld, space=%ld\n",
361 "conv_us_to_clocks=%ld\n",
362 freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy), 357 freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy),
363 pulse_width, space_width, conv_us_to_clocks); 358 pulse_width, space_width);
364 return 0; 359 return 0;
365} 360}
366#else /* ! USE_RDTSC */ 361#else /* ! USE_RDTSC */
@@ -431,63 +426,14 @@ static long send_pulse_irdeo(unsigned long length)
431 return ret; 426 return ret;
432} 427}
433 428
434#ifdef USE_RDTSC
435/* Version that uses Pentium rdtsc instruction to measure clocks */
436
437/*
438 * This version does sub-microsecond timing using rdtsc instruction,
439 * and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY
440 * Implicitly i586 architecture... - Steve
441 */
442
443static long send_pulse_homebrew_softcarrier(unsigned long length)
444{
445 int flag;
446 unsigned long target, start, now;
447
448 /* Get going quick as we can */
449 rdtscl(start);
450 on();
451 /* Convert length from microseconds to clocks */
452 length *= conv_us_to_clocks;
453 /* And loop till time is up - flipping at right intervals */
454 now = start;
455 target = pulse_width;
456 flag = 1;
457 /*
458 * FIXME: This looks like a hard busy wait, without even an occasional,
459 * polite, cpu_relax() call. There's got to be a better way?
460 *
461 * The i2c code has the result of a lot of bit-banging work, I wonder if
462 * there's something there which could be helpful here.
463 */
464 while ((now - start) < length) {
465 /* Delay till flip time */
466 do {
467 rdtscl(now);
468 } while ((now - start) < target);
469
470 /* flip */
471 if (flag) {
472 rdtscl(now);
473 off();
474 target += space_width;
475 } else {
476 rdtscl(now); on();
477 target += pulse_width;
478 }
479 flag = !flag;
480 }
481 rdtscl(now);
482 return ((now - start) - length) / conv_us_to_clocks;
483}
484#else /* ! USE_RDTSC */
485/* Version using udelay() */ 429/* Version using udelay() */
486 430
487/* 431/*
488 * here we use fixed point arithmetic, with 8 432 * here we use fixed point arithmetic, with 8
489 * fractional bits. that gets us within 0.1% or so of the right average 433 * fractional bits. that gets us within 0.1% or so of the right average
490 * frequency, albeit with some jitter in pulse length - Steve 434 * frequency, albeit with some jitter in pulse length - Steve
435 *
436 * This should use ndelay instead.
491 */ 437 */
492 438
493/* To match 8 fractional bits used for pulse/space length */ 439/* To match 8 fractional bits used for pulse/space length */
@@ -520,7 +466,6 @@ static long send_pulse_homebrew_softcarrier(unsigned long length)
520 } 466 }
521 return (actual-length) >> 8; 467 return (actual-length) >> 8;
522} 468}
523#endif /* USE_RDTSC */
524 469
525static long send_pulse_homebrew(unsigned long length) 470static long send_pulse_homebrew(unsigned long length)
526{ 471{
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 5820e8513927..2ac0c704bcb8 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
340 340
341 /* check result for the last window */ 341 /* check result for the last window */
342 msr_now = pkg_state_counter(); 342 msr_now = pkg_state_counter();
343 rdtscll(tsc_now); 343 tsc_now = rdtsc();
344 344
345 /* calculate pkg cstate vs tsc ratio */ 345 /* calculate pkg cstate vs tsc ratio */
346 if (!msr_last || !tsc_last) 346 if (!msr_last || !tsc_last)
@@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
482 u64 val64; 482 u64 val64;
483 483
484 msr_now = pkg_state_counter(); 484 msr_now = pkg_state_counter();
485 rdtscll(tsc_now); 485 tsc_now = rdtsc();
486 jiffies_now = jiffies; 486 jiffies_now = jiffies;
487 487
488 /* calculate pkg cstate vs tsc ratio */ 488 /* calculate pkg cstate vs tsc ratio */
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index b96bd299966f..008fc67d0d96 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -49,13 +49,28 @@ static inline void exception_exit(enum ctx_state prev_ctx)
49 } 49 }
50} 50}
51 51
52
53/**
54 * ct_state() - return the current context tracking state if known
55 *
56 * Returns the current cpu's context tracking state if context tracking
57 * is enabled. If context tracking is disabled, returns
58 * CONTEXT_DISABLED. This should be used primarily for debugging.
59 */
60static inline enum ctx_state ct_state(void)
61{
62 return context_tracking_is_enabled() ?
63 this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
64}
52#else 65#else
53static inline void user_enter(void) { } 66static inline void user_enter(void) { }
54static inline void user_exit(void) { } 67static inline void user_exit(void) { }
55static inline enum ctx_state exception_enter(void) { return 0; } 68static inline enum ctx_state exception_enter(void) { return 0; }
56static inline void exception_exit(enum ctx_state prev_ctx) { } 69static inline void exception_exit(enum ctx_state prev_ctx) { }
70static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
57#endif /* !CONFIG_CONTEXT_TRACKING */ 71#endif /* !CONFIG_CONTEXT_TRACKING */
58 72
73#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
59 74
60#ifdef CONFIG_CONTEXT_TRACKING_FORCE 75#ifdef CONFIG_CONTEXT_TRACKING_FORCE
61extern void context_tracking_init(void); 76extern void context_tracking_init(void);
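ct_state() and CT_WARN_ON() above give the new C entry code a cheap way to assert which context it believes it is in; with context tracking compiled out, ct_state() returns CONTEXT_DISABLED and the warning cannot fire. A hypothetical kernel-side sketch of the intended call pattern (the helper name is illustrative and not taken from this diff):

#include <linux/context_tracking.h>

/* Hypothetical entry-path helper: complain if we were not really in
 * user context, then switch context tracking to kernel mode. */
static void demo_enter_from_user_mode(void)
{
	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit();
}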
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 678ecdf90cf6..ee956c528fab 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -14,6 +14,7 @@ struct context_tracking {
14 bool active; 14 bool active;
15 int recursion; 15 int recursion;
16 enum ctx_state { 16 enum ctx_state {
17 CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */
17 CONTEXT_KERNEL = 0, 18 CONTEXT_KERNEL = 0,
18 CONTEXT_USER, 19 CONTEXT_USER,
19 CONTEXT_GUEST, 20 CONTEXT_GUEST,
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 16c5ed5a627c..47dd0cebd204 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -286,7 +286,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
286 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n 286 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
287 */ 287 */
288 288
289static inline raw_spinlock_t *spinlock_check(spinlock_t *lock) 289static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
290{ 290{
291 return &lock->rlock; 291 return &lock->rlock;
292} 292}
@@ -297,17 +297,17 @@ do { \
297 raw_spin_lock_init(&(_lock)->rlock); \ 297 raw_spin_lock_init(&(_lock)->rlock); \
298} while (0) 298} while (0)
299 299
300static inline void spin_lock(spinlock_t *lock) 300static __always_inline void spin_lock(spinlock_t *lock)
301{ 301{
302 raw_spin_lock(&lock->rlock); 302 raw_spin_lock(&lock->rlock);
303} 303}
304 304
305static inline void spin_lock_bh(spinlock_t *lock) 305static __always_inline void spin_lock_bh(spinlock_t *lock)
306{ 306{
307 raw_spin_lock_bh(&lock->rlock); 307 raw_spin_lock_bh(&lock->rlock);
308} 308}
309 309
310static inline int spin_trylock(spinlock_t *lock) 310static __always_inline int spin_trylock(spinlock_t *lock)
311{ 311{
312 return raw_spin_trylock(&lock->rlock); 312 return raw_spin_trylock(&lock->rlock);
313} 313}
@@ -327,7 +327,7 @@ do { \
327 raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ 327 raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \
328} while (0) 328} while (0)
329 329
330static inline void spin_lock_irq(spinlock_t *lock) 330static __always_inline void spin_lock_irq(spinlock_t *lock)
331{ 331{
332 raw_spin_lock_irq(&lock->rlock); 332 raw_spin_lock_irq(&lock->rlock);
333} 333}
@@ -342,32 +342,32 @@ do { \
342 raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ 342 raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
343} while (0) 343} while (0)
344 344
345static inline void spin_unlock(spinlock_t *lock) 345static __always_inline void spin_unlock(spinlock_t *lock)
346{ 346{
347 raw_spin_unlock(&lock->rlock); 347 raw_spin_unlock(&lock->rlock);
348} 348}
349 349
350static inline void spin_unlock_bh(spinlock_t *lock) 350static __always_inline void spin_unlock_bh(spinlock_t *lock)
351{ 351{
352 raw_spin_unlock_bh(&lock->rlock); 352 raw_spin_unlock_bh(&lock->rlock);
353} 353}
354 354
355static inline void spin_unlock_irq(spinlock_t *lock) 355static __always_inline void spin_unlock_irq(spinlock_t *lock)
356{ 356{
357 raw_spin_unlock_irq(&lock->rlock); 357 raw_spin_unlock_irq(&lock->rlock);
358} 358}
359 359
360static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 360static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
361{ 361{
362 raw_spin_unlock_irqrestore(&lock->rlock, flags); 362 raw_spin_unlock_irqrestore(&lock->rlock, flags);
363} 363}
364 364
365static inline int spin_trylock_bh(spinlock_t *lock) 365static __always_inline int spin_trylock_bh(spinlock_t *lock)
366{ 366{
367 return raw_spin_trylock_bh(&lock->rlock); 367 return raw_spin_trylock_bh(&lock->rlock);
368} 368}
369 369
370static inline int spin_trylock_irq(spinlock_t *lock) 370static __always_inline int spin_trylock_irq(spinlock_t *lock)
371{ 371{
372 return raw_spin_trylock_irq(&lock->rlock); 372 return raw_spin_trylock_irq(&lock->rlock);
373} 373}
@@ -377,22 +377,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
377 raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ 377 raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
378}) 378})
379 379
380static inline void spin_unlock_wait(spinlock_t *lock) 380static __always_inline void spin_unlock_wait(spinlock_t *lock)
381{ 381{
382 raw_spin_unlock_wait(&lock->rlock); 382 raw_spin_unlock_wait(&lock->rlock);
383} 383}
384 384
385static inline int spin_is_locked(spinlock_t *lock) 385static __always_inline int spin_is_locked(spinlock_t *lock)
386{ 386{
387 return raw_spin_is_locked(&lock->rlock); 387 return raw_spin_is_locked(&lock->rlock);
388} 388}
389 389
390static inline int spin_is_contended(spinlock_t *lock) 390static __always_inline int spin_is_contended(spinlock_t *lock)
391{ 391{
392 return raw_spin_is_contended(&lock->rlock); 392 return raw_spin_is_contended(&lock->rlock);
393} 393}
394 394
395static inline int spin_can_lock(spinlock_t *lock) 395static __always_inline int spin_can_lock(spinlock_t *lock)
396{ 396{
397 return raw_spin_can_lock(&lock->rlock); 397 return raw_spin_can_lock(&lock->rlock);
398} 398}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7cc360e..fd2c9acbcc19 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
544 .signr = sig, 544 .signr = sig,
545 545
546 }; 546 };
547 RCU_LOCKDEP_WARN(!rcu_is_watching(),
548 "notify_die called but RCU thinks we're quiescent");
547 return atomic_notifier_call_chain(&die_chain, val, &args); 549 return atomic_notifier_call_chain(&die_chain, val, &args);
548} 550}
549NOKPROBE_SYMBOL(notify_die); 551NOKPROBE_SYMBOL(notify_die);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..ca7d84f438f1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
140cond_syscall(sys_ssetmask); 140cond_syscall(sys_ssetmask);
141cond_syscall(sys_vm86old); 141cond_syscall(sys_vm86old);
142cond_syscall(sys_vm86); 142cond_syscall(sys_vm86);
143cond_syscall(sys_modify_ldt);
143cond_syscall(sys_ipc); 144cond_syscall(sys_ipc);
144cond_syscall(compat_sys_ipc); 145cond_syscall(compat_sys_ipc);
145cond_syscall(compat_sys_sysctl); 146cond_syscall(compat_sys_sysctl);
diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
index 5224ee5b392d..6ff8383f2941 100644
--- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
+++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
@@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
81 81
82 printk(KERN_DEBUG "start--> \n"); 82 printk(KERN_DEBUG "start--> \n");
83 then = read_pmtmr(); 83 then = read_pmtmr();
84 rdtscll(then_tsc); 84 then_tsc = rdtsc();
85 for (i=0;i<20;i++) { 85 for (i=0;i<20;i++) {
86 mdelay(100); 86 mdelay(100);
87 now = read_pmtmr(); 87 now = read_pmtmr();
88 rdtscll(now_tsc); 88 now_tsc = rdtsc();
89 diff = (now - then) & 0xFFFFFF; 89 diff = (now - then) & 0xFFFFFF;
90 diff_tsc = now_tsc - then_tsc; 90 diff_tsc = now_tsc - then_tsc;
91 printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc); 91 printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index caa60d56d7d1..29089b24d18b 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,8 +4,8 @@ include ../lib.mk
4 4
5.PHONY: all all_32 all_64 warn_32bit_failure clean 5.PHONY: all all_32 all_64 warn_32bit_failure clean
6 6
7TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs 7TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt
8TARGETS_C_32BIT_ONLY := entry_from_vm86 8TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn
9 9
10TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) 10TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
11BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) 11BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c
index 5c38a187677b..9a43a59a9bb4 100644
--- a/tools/testing/selftests/x86/entry_from_vm86.c
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -28,6 +28,55 @@
28static unsigned long load_addr = 0x10000; 28static unsigned long load_addr = 0x10000;
29static int nerrs = 0; 29static int nerrs = 0;
30 30
31static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
32 int flags)
33{
34 struct sigaction sa;
35 memset(&sa, 0, sizeof(sa));
36 sa.sa_sigaction = handler;
37 sa.sa_flags = SA_SIGINFO | flags;
38 sigemptyset(&sa.sa_mask);
39 if (sigaction(sig, &sa, 0))
40 err(1, "sigaction");
41}
42
43static void clearhandler(int sig)
44{
45 struct sigaction sa;
46 memset(&sa, 0, sizeof(sa));
47 sa.sa_handler = SIG_DFL;
48 sigemptyset(&sa.sa_mask);
49 if (sigaction(sig, &sa, 0))
50 err(1, "sigaction");
51}
52
53static sig_atomic_t got_signal;
54
55static void sighandler(int sig, siginfo_t *info, void *ctx_void)
56{
57 ucontext_t *ctx = (ucontext_t*)ctx_void;
58
59 if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM ||
60 (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) {
61 printf("[FAIL]\tSignal frame should not reflect vm86 mode\n");
62 nerrs++;
63 }
64
65 const char *signame;
66 if (sig == SIGSEGV)
67 signame = "SIGSEGV";
68 else if (sig == SIGILL)
69 signame = "SIGILL";
70 else
71 signame = "unexpected signal";
72
73 printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame,
74 (unsigned long)ctx->uc_mcontext.gregs[REG_EFL],
75 (unsigned short)ctx->uc_mcontext.gregs[REG_CS]);
76
77 got_signal = 1;
78}
79
31asm ( 80asm (
32 ".pushsection .rodata\n\t" 81 ".pushsection .rodata\n\t"
33 ".type vmcode_bound, @object\n\t" 82 ".type vmcode_bound, @object\n\t"
@@ -38,6 +87,14 @@ asm (
38 "int3\n\t" 87 "int3\n\t"
39 "vmcode_sysenter:\n\t" 88 "vmcode_sysenter:\n\t"
40 "sysenter\n\t" 89 "sysenter\n\t"
90 "vmcode_syscall:\n\t"
91 "syscall\n\t"
92 "vmcode_sti:\n\t"
93 "sti\n\t"
94 "vmcode_int3:\n\t"
95 "int3\n\t"
96 "vmcode_int80:\n\t"
97 "int $0x80\n\t"
41 ".size vmcode, . - vmcode\n\t" 98 ".size vmcode, . - vmcode\n\t"
42 "end_vmcode:\n\t" 99 "end_vmcode:\n\t"
43 ".code32\n\t" 100 ".code32\n\t"
@@ -45,9 +102,12 @@ asm (
45 ); 102 );
46 103
47extern unsigned char vmcode[], end_vmcode[]; 104extern unsigned char vmcode[], end_vmcode[];
48extern unsigned char vmcode_bound[], vmcode_sysenter[]; 105extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
106 vmcode_sti[], vmcode_int3[], vmcode_int80[];
49 107
50static void do_test(struct vm86plus_struct *v86, unsigned long eip, 108/* Returns false if the test was skipped. */
109static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
110 unsigned int rettype, unsigned int retarg,
51 const char *text) 111 const char *text)
52{ 112{
53 long ret; 113 long ret;
@@ -58,7 +118,7 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
58 118
59 if (ret == -1 && errno == ENOSYS) { 119 if (ret == -1 && errno == ENOSYS) {
60 printf("[SKIP]\tvm86 not supported\n"); 120 printf("[SKIP]\tvm86 not supported\n");
61 return; 121 return false;
62 } 122 }
63 123
64 if (VM86_TYPE(ret) == VM86_INTx) { 124 if (VM86_TYPE(ret) == VM86_INTx) {
@@ -73,13 +133,30 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
73 else 133 else
74 sprintf(trapname, "%d", trapno); 134 sprintf(trapname, "%d", trapno);
75 135
76 printf("[OK]\tExited vm86 mode due to #%s\n", trapname); 136 printf("[INFO]\tExited vm86 mode due to #%s\n", trapname);
77 } else if (VM86_TYPE(ret) == VM86_UNKNOWN) { 137 } else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
78 printf("[OK]\tExited vm86 mode due to unhandled GP fault\n"); 138 printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n");
139 } else if (VM86_TYPE(ret) == VM86_TRAP) {
140 printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n",
141 VM86_ARG(ret));
142 } else if (VM86_TYPE(ret) == VM86_SIGNAL) {
143 printf("[INFO]\tExited vm86 mode due to a signal\n");
144 } else if (VM86_TYPE(ret) == VM86_STI) {
145 printf("[INFO]\tExited vm86 mode due to STI\n");
79 } else { 146 } else {
80 printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n", 147 printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n",
81 VM86_TYPE(ret), VM86_ARG(ret)); 148 VM86_TYPE(ret), VM86_ARG(ret));
82 } 149 }
150
151 if (rettype == -1 ||
152 (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) {
153 printf("[OK]\tReturned correctly\n");
154 } else {
155 printf("[FAIL]\tIncorrect return reason\n");
156 nerrs++;
157 }
158
159 return true;
83} 160}
84 161
85int main(void) 162int main(void)
@@ -105,10 +182,52 @@ int main(void)
105 assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */ 182 assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */
106 183
107 /* #BR -- should deliver SIG??? */ 184 /* #BR -- should deliver SIG??? */
108 do_test(&v86, vmcode_bound - vmcode, "#BR"); 185 do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR");
109 186
110 /* SYSENTER -- should cause #GP or #UD depending on CPU */ 187 /*
111 do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER"); 188 * SYSENTER -- should cause #GP or #UD depending on CPU.
189 * Expected return type -1 means that we shouldn't validate
190 * the vm86 return value. This will avoid problems on non-SEP
191 * CPUs.
192 */
193 sethandler(SIGILL, sighandler, 0);
194 do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER");
195 clearhandler(SIGILL);
196
197 /*
198 * SYSCALL would be a disaster in VM86 mode. Fortunately,
199 * there is no kernel that both enables SYSCALL and sets
200 * EFER.SCE, so it's #UD on all systems. But vm86 is
201 * buggy (or has a "feature"), so the SIGILL will actually
202 * be delivered.
203 */
204 sethandler(SIGILL, sighandler, 0);
205 do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL");
206 clearhandler(SIGILL);
207
208 /* STI with VIP set */
209 v86.regs.eflags |= X86_EFLAGS_VIP;
210 v86.regs.eflags &= ~X86_EFLAGS_IF;
211 do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set");
212
213 /* INT3 -- should cause #BP */
214 do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3");
215
216 /* INT80 -- should exit with "INTx 0x80" */
217 v86.regs.eax = (unsigned int)-1;
218 do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
219
220 /* Execute a null pointer */
221 v86.regs.cs = 0;
222 v86.regs.ss = 0;
223 sethandler(SIGSEGV, sighandler, 0);
224 got_signal = 0;
225 if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") &&
226 !got_signal) {
227 printf("[FAIL]\tDid not receive SIGSEGV\n");
228 nerrs++;
229 }
230 clearhandler(SIGSEGV);
112 231
113 return (nerrs == 0 ? 0 : 1); 232 return (nerrs == 0 ? 0 : 1);
114} 233}
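The stricter checks added to do_test() above rely on the vm86() return value packing an exit type in the low byte and an argument above it. A small stand-alone illustration of that decoding (macro bodies mirror the VM86_TYPE/VM86_ARG definitions in <asm/vm86.h>, reproduced here only for the example):

#include <stdio.h>

#define DEMO_VM86_TYPE(ret)	((ret) & 0xff)
#define DEMO_VM86_ARG(ret)	((ret) >> 8)

int main(void)
{
	long ret = (0x80 << 8) | 2;	/* VM86_INTx (type 2) for "int $0x80" */

	printf("type=%ld arg=0x%lx\n",
	       DEMO_VM86_TYPE(ret), (unsigned long)DEMO_VM86_ARG(ret));
	return 0;
}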
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
new file mode 100644
index 000000000000..31a3035cd4eb
--- /dev/null
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -0,0 +1,576 @@
1/*
2 * ldt_gdt.c - Test cases for LDT and GDT access
3 * Copyright (c) 2015 Andrew Lutomirski
4 */
5
6#define _GNU_SOURCE
7#include <err.h>
8#include <stdio.h>
9#include <stdint.h>
10#include <signal.h>
11#include <setjmp.h>
12#include <stdlib.h>
13#include <string.h>
14#include <errno.h>
15#include <unistd.h>
16#include <sys/syscall.h>
17#include <asm/ldt.h>
18#include <sys/types.h>
19#include <sys/wait.h>
20#include <stdbool.h>
21#include <pthread.h>
22#include <sched.h>
23#include <linux/futex.h>
24
25#define AR_ACCESSED (1<<8)
26
27#define AR_TYPE_RODATA (0 * (1<<9))
28#define AR_TYPE_RWDATA (1 * (1<<9))
29#define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9))
30#define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9))
31#define AR_TYPE_XOCODE (4 * (1<<9))
32#define AR_TYPE_XRCODE (5 * (1<<9))
33#define AR_TYPE_XOCODE_CONF (6 * (1<<9))
34#define AR_TYPE_XRCODE_CONF (7 * (1<<9))
35
36#define AR_DPL3 (3 * (1<<13))
37
38#define AR_S (1 << 12)
39#define AR_P (1 << 15)
40#define AR_AVL (1 << 20)
41#define AR_L (1 << 21)
42#define AR_DB (1 << 22)
43#define AR_G (1 << 23)
44
45static int nerrs;
46
47static void check_invalid_segment(uint16_t index, int ldt)
48{
49 uint32_t has_limit = 0, has_ar = 0, limit, ar;
50 uint32_t selector = (index << 3) | (ldt << 2) | 3;
51
52 asm ("lsl %[selector], %[limit]\n\t"
53 "jnz 1f\n\t"
54 "movl $1, %[has_limit]\n\t"
55 "1:"
56 : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
57 : [selector] "r" (selector));
58 asm ("larl %[selector], %[ar]\n\t"
59 "jnz 1f\n\t"
60 "movl $1, %[has_ar]\n\t"
61 "1:"
62 : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
63 : [selector] "r" (selector));
64
65 if (has_limit || has_ar) {
66 printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
67 (ldt ? "LDT" : "GDT"), index);
68 nerrs++;
69 } else {
70 printf("[OK]\t%s entry %hu is invalid\n",
71 (ldt ? "LDT" : "GDT"), index);
72 }
73}
74
75static void check_valid_segment(uint16_t index, int ldt,
76 uint32_t expected_ar, uint32_t expected_limit,
77 bool verbose)
78{
79 uint32_t has_limit = 0, has_ar = 0, limit, ar;
80 uint32_t selector = (index << 3) | (ldt << 2) | 3;
81
82 asm ("lsl %[selector], %[limit]\n\t"
83 "jnz 1f\n\t"
84 "movl $1, %[has_limit]\n\t"
85 "1:"
86 : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
87 : [selector] "r" (selector));
88 asm ("larl %[selector], %[ar]\n\t"
89 "jnz 1f\n\t"
90 "movl $1, %[has_ar]\n\t"
91 "1:"
92 : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
93 : [selector] "r" (selector));
94
95 if (!has_limit || !has_ar) {
96 printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
97 (ldt ? "LDT" : "GDT"), index);
98 nerrs++;
99 return;
100 }
101
102 if (ar != expected_ar) {
103 printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
104 (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
105 nerrs++;
106 } else if (limit != expected_limit) {
107 printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
108 (ldt ? "LDT" : "GDT"), index, limit, expected_limit);
109 nerrs++;
110 } else if (verbose) {
111 printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
112 (ldt ? "LDT" : "GDT"), index, ar, limit);
113 }
114}
115
116static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
117 bool oldmode)
118{
119 int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
120 desc, sizeof(*desc));
121 if (ret < -1)
122 errno = -ret;
123 if (ret == 0) {
124 uint32_t limit = desc->limit;
125 if (desc->limit_in_pages)
126 limit = (limit << 12) + 4095;
127 check_valid_segment(desc->entry_number, 1, ar, limit, true);
128 return true;
129 } else if (errno == ENOSYS) {
130 printf("[OK]\tmodify_ldt returned -ENOSYS\n");
131 return false;
132 } else {
133 if (desc->seg_32bit) {
134 printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
135 errno);
136 nerrs++;
137 return false;
138 } else {
139 printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
140 return false;
141 }
142 }
143}
144
145static bool install_valid(const struct user_desc *desc, uint32_t ar)
146{
147 return install_valid_mode(desc, ar, false);
148}
149
150static void install_invalid(const struct user_desc *desc, bool oldmode)
151{
152 int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
153 desc, sizeof(*desc));
154 if (ret < -1)
155 errno = -ret;
156 if (ret == 0) {
157 check_invalid_segment(desc->entry_number, 1);
158 } else if (errno == ENOSYS) {
159 printf("[OK]\tmodify_ldt returned -ENOSYS\n");
160 } else {
161 if (desc->seg_32bit) {
162 printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
163 errno);
164 nerrs++;
165 } else {
166 printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
167 }
168 }
169}
170
171static int safe_modify_ldt(int func, struct user_desc *ptr,
172 unsigned long bytecount)
173{
174 int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount);
175 if (ret < -1)
176 errno = -ret;
177 return ret;
178}
179
180static void fail_install(struct user_desc *desc)
181{
182 if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) {
183 printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
184 nerrs++;
185 } else if (errno == ENOSYS) {
186 printf("[OK]\tmodify_ldt returned -ENOSYS\n");
187 } else {
188 printf("[OK]\tmodify_ldt failure %d\n", errno);
189 }
190}
191
192static void do_simple_tests(void)
193{
194 struct user_desc desc = {
195 .entry_number = 0,
196 .base_addr = 0,
197 .limit = 10,
198 .seg_32bit = 1,
199 .contents = 2, /* Code, not conforming */
200 .read_exec_only = 0,
201 .limit_in_pages = 0,
202 .seg_not_present = 0,
203 .useable = 0
204 };
205 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
206
207 desc.limit_in_pages = 1;
208 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
209 AR_S | AR_P | AR_DB | AR_G);
210
211 check_invalid_segment(1, 1);
212
213 desc.entry_number = 2;
214 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
215 AR_S | AR_P | AR_DB | AR_G);
216
217 check_invalid_segment(1, 1);
218
219 desc.base_addr = 0xf0000000;
220 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
221 AR_S | AR_P | AR_DB | AR_G);
222
223 desc.useable = 1;
224 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
225 AR_S | AR_P | AR_DB | AR_G | AR_AVL);
226
227 desc.seg_not_present = 1;
228 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
229 AR_S | AR_DB | AR_G | AR_AVL);
230
231 desc.seg_32bit = 0;
232 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
233 AR_S | AR_G | AR_AVL);
234
235 desc.seg_32bit = 1;
236 desc.contents = 0;
237 install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
238 AR_S | AR_DB | AR_G | AR_AVL);
239
240 desc.read_exec_only = 1;
241 install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
242 AR_S | AR_DB | AR_G | AR_AVL);
243
244 desc.contents = 1;
245 install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
246 AR_S | AR_DB | AR_G | AR_AVL);
247
248 desc.read_exec_only = 0;
249 desc.limit_in_pages = 0;
250 install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
251 AR_S | AR_DB | AR_AVL);
252
253 desc.contents = 3;
254 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
255 AR_S | AR_DB | AR_AVL);
256
257 desc.read_exec_only = 1;
258 install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
259 AR_S | AR_DB | AR_AVL);
260
261 desc.read_exec_only = 0;
262 desc.contents = 2;
263 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
264 AR_S | AR_DB | AR_AVL);
265
266 desc.read_exec_only = 1;
267
268#ifdef __x86_64__
269 desc.lm = 1;
270 install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
271 AR_S | AR_DB | AR_AVL);
272 desc.lm = 0;
273#endif
274
275 bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
276 AR_S | AR_DB | AR_AVL);
277
278 if (entry1_okay) {
279 printf("[RUN]\tTest fork\n");
280 pid_t child = fork();
281 if (child == 0) {
282 nerrs = 0;
283 check_valid_segment(desc.entry_number, 1,
284 AR_DPL3 | AR_TYPE_XOCODE |
285 AR_S | AR_DB | AR_AVL, desc.limit,
286 true);
287 check_invalid_segment(1, 1);
288 exit(nerrs ? 1 : 0);
289 } else {
290 int status;
291 if (waitpid(child, &status, 0) != child ||
292 !WIFEXITED(status)) {
293 printf("[FAIL]\tChild died\n");
294 nerrs++;
295 } else if (WEXITSTATUS(status) != 0) {
296 printf("[FAIL]\tChild failed\n");
297 nerrs++;
298 } else {
299 printf("[OK]\tChild succeeded\n");
300 }
301 }
302
303 printf("[RUN]\tTest size\n");
304 int i;
305 for (i = 0; i < 8192; i++) {
306 desc.entry_number = i;
307 desc.limit = i;
308 if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
309 printf("[FAIL]\tFailed to install entry %d\n", i);
310 nerrs++;
311 break;
312 }
313 }
314 for (int j = 0; j < i; j++) {
315 check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE |
316 AR_S | AR_DB | AR_AVL, j, false);
317 }
318 printf("[DONE]\tSize test\n");
319 } else {
320 printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n");
321 }
322
323 /* Test entry_number too high. */
324 desc.entry_number = 8192;
325 fail_install(&desc);
326
327	/* Test deletion and actions mistakable for deletion. */
328 memset(&desc, 0, sizeof(desc));
329 install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);
330
331 desc.seg_not_present = 1;
332 install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
333
334 desc.seg_not_present = 0;
335 desc.read_exec_only = 1;
336 install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);
337
338 desc.read_exec_only = 0;
339 desc.seg_not_present = 1;
340 install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
341
342 desc.read_exec_only = 1;
343 desc.limit = 1;
344 install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
345
346 desc.limit = 0;
347 desc.base_addr = 1;
348 install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
349
350 desc.base_addr = 0;
351 install_invalid(&desc, false);
352
353 desc.seg_not_present = 0;
354 desc.read_exec_only = 0;
355 desc.seg_32bit = 1;
356 install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
357 install_invalid(&desc, true);
358}
359
360/*
361 * 0: thread is idle
362 * 1: thread armed
363 * 2: thread should clear LDT entry 0
364 * 3 or more: thread should exit (the main thread sets 100)
365 */
366static volatile unsigned int ftx;
367
368static void *threadproc(void *ctx)
369{
370 cpu_set_t cpuset;
371 CPU_ZERO(&cpuset);
372 CPU_SET(1, &cpuset);
373 if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
374 err(1, "sched_setaffinity to CPU 1"); /* should never fail */
375
376 while (1) {
377 syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
378 while (ftx != 2) {
379 if (ftx >= 3)
380 return NULL;
381 }
382
383 /* clear LDT entry 0 */
384 const struct user_desc desc = {};
385 if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
386 err(1, "modify_ldt");
387
388 /* If ftx == 2, set it to zero. If ftx == 100, quit. */
389 unsigned int x = -2;
390 asm volatile ("lock xaddl %[x], %[ftx]" :
391 [x] "+r" (x), [ftx] "+m" (ftx));
392 if (x != 2)
393 return NULL;
394 }
395}
396
397static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
398 int flags)
399{
400 struct sigaction sa;
401 memset(&sa, 0, sizeof(sa));
402 sa.sa_sigaction = handler;
403 sa.sa_flags = SA_SIGINFO | flags;
404 sigemptyset(&sa.sa_mask);
405 if (sigaction(sig, &sa, 0))
406 err(1, "sigaction");
407
408}
409
410static jmp_buf jmpbuf;
411
412static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
413{
414 siglongjmp(jmpbuf, 1);
415}
416
417static void do_multicpu_tests(void)
418{
419 cpu_set_t cpuset;
420 pthread_t thread;
421 int failures = 0, iters = 5, i;
422 unsigned short orig_ss;
423
424 CPU_ZERO(&cpuset);
425 CPU_SET(1, &cpuset);
426 if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
427 printf("[SKIP]\tCannot set affinity to CPU 1\n");
428 return;
429 }
430
431 CPU_ZERO(&cpuset);
432 CPU_SET(0, &cpuset);
433 if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
434 printf("[SKIP]\tCannot set affinity to CPU 0\n");
435 return;
436 }
437
438 sethandler(SIGSEGV, sigsegv, 0);
439#ifdef __i386__
440 /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */
441 sethandler(SIGILL, sigsegv, 0);
442#endif
443
444 printf("[RUN]\tCross-CPU LDT invalidation\n");
445
446 if (pthread_create(&thread, 0, threadproc, 0) != 0)
447 err(1, "pthread_create");
448
449 asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));
450
451	for (i = 0; i < iters; i++) {
452 if (sigsetjmp(jmpbuf, 1) != 0)
453 continue;
454
455 /* Make sure the thread is ready after the last test. */
456 while (ftx != 0)
457 ;
458
459 struct user_desc desc = {
460 .entry_number = 0,
461 .base_addr = 0,
462 .limit = 0xfffff,
463 .seg_32bit = 1,
464 .contents = 0, /* Data */
465 .read_exec_only = 0,
466 .limit_in_pages = 1,
467 .seg_not_present = 0,
468 .useable = 0
469 };
470
471 if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
472 if (errno != ENOSYS)
473 err(1, "modify_ldt");
474 printf("[SKIP]\tmodify_ldt unavailable\n");
475 break;
476 }
477
478 /* Arm the thread. */
479 ftx = 1;
480 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
481
482 asm volatile ("mov %0, %%ss" : : "r" (0x7));
483
484 /* Go! */
485 ftx = 2;
486
487 while (ftx != 0)
488 ;
489
490 /*
491 * On success, modify_ldt will segfault us synchronously,
492 * and we'll escape via siglongjmp.
493 */
494
495 failures++;
496 asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
497	}
498
499 ftx = 100; /* Kill the thread. */
500 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
501
502 if (pthread_join(thread, NULL) != 0)
503 err(1, "pthread_join");
504
505 if (failures) {
506 printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
507 nerrs++;
508 } else {
509 printf("[OK]\tAll %d iterations succeeded\n", iters);
510 }
511}
512
513static int finish_exec_test(void)
514{
515 /*
516 * In a sensible world, this would be check_invalid_segment(0, 1);
517 * For better or for worse, though, the LDT is inherited across exec.
518 * We can probably change this safely, but for now we test it.
519 */
520 check_valid_segment(0, 1,
521 AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB,
522 42, true);
523
524 return nerrs ? 1 : 0;
525}
526
527static void do_exec_test(void)
528{
529 printf("[RUN]\tTest exec\n");
530
531 struct user_desc desc = {
532 .entry_number = 0,
533 .base_addr = 0,
534 .limit = 42,
535 .seg_32bit = 1,
536 .contents = 2, /* Code, not conforming */
537 .read_exec_only = 0,
538 .limit_in_pages = 0,
539 .seg_not_present = 0,
540 .useable = 0
541 };
542 install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
543
544 pid_t child = fork();
545 if (child == 0) {
546 execl("/proc/self/exe", "ldt_gdt_test_exec", NULL);
547 printf("[FAIL]\tCould not exec self\n");
548 exit(1); /* exec failed */
549 } else {
550 int status;
551 if (waitpid(child, &status, 0) != child ||
552 !WIFEXITED(status)) {
553 printf("[FAIL]\tChild died\n");
554 nerrs++;
555 } else if (WEXITSTATUS(status) != 0) {
556 printf("[FAIL]\tChild failed\n");
557 nerrs++;
558 } else {
559 printf("[OK]\tChild succeeded\n");
560 }
561 }
562}
563
564int main(int argc, char **argv)
565{
566 if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
567 return finish_exec_test();
568
569 do_simple_tests();
570
571 do_multicpu_tests();
572
573 do_exec_test();
574
575 return nerrs ? 1 : 0;
576}
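A minimal standalone sketch of the probe pattern the test above is built on (not part of the patch): install a single LDT entry with modify_ldt() and read its access rights back with LAR, the same kind of access-rights probe behind check_valid_segment(). Function code 0x11 and struct user_desc come from <asm/ldt.h>; selector 0x7 is LDT entry 0 at RPL 3, the value the test later loads into %ss.

	/* Hedged sketch, assuming glibc's syscall() wrapper and <asm/ldt.h>. */
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/ldt.h>

	int main(void)
	{
		struct user_desc desc;
		uint32_t sel = (0 << 3) | 4 | 3;	/* LDT entry 0, RPL 3 == 0x7 */
		uint32_t ar = 0, valid = 0;

		memset(&desc, 0, sizeof(desc));
		desc.entry_number = 0;
		desc.limit = 10;
		desc.seg_32bit = 1;
		desc.contents = 0;			/* read/write data */

		/* 0x11: write an LDT entry using the modern descriptor format. */
		if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0) {
			perror("modify_ldt");
			return 1;
		}

		/* LAR sets ZF and loads the access-rights word on success. */
		asm ("larl %[sel], %[ar]\n\t"
		     "jnz 1f\n\t"
		     "movl $1, %[valid]\n\t"
		     "1:"
		     : [ar] "=r" (ar), [valid] "+rm" (valid)
		     : [sel] "r" (sel));

		if (valid)
			printf("LDT entry 0 installed, AR = 0x%08x\n", ar);
		else
			printf("LDT entry 0 not usable\n");
		return 0;
	}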
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
new file mode 100644
index 000000000000..7db4fc9fa09f
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -0,0 +1,130 @@
1/*
2 * syscall_arg_fault.c - tests faults in 32-bit fast syscall stack args
3 * Copyright (c) 2015 Andrew Lutomirski
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 */
14
15#define _GNU_SOURCE
16
17#include <stdlib.h>
18#include <stdio.h>
19#include <string.h>
20#include <sys/signal.h>
21#include <sys/ucontext.h>
22#include <err.h>
23#include <setjmp.h>
24#include <errno.h>
25
26/* Our sigaltstack scratch space. */
27static unsigned char altstack_data[SIGSTKSZ];
28
29static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
30 int flags)
31{
32 struct sigaction sa;
33 memset(&sa, 0, sizeof(sa));
34 sa.sa_sigaction = handler;
35 sa.sa_flags = SA_SIGINFO | flags;
36 sigemptyset(&sa.sa_mask);
37 if (sigaction(sig, &sa, 0))
38 err(1, "sigaction");
39}
40
41static volatile sig_atomic_t sig_traps;
42static sigjmp_buf jmpbuf;
43
44static volatile sig_atomic_t n_errs;
45
46static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
47{
48 ucontext_t *ctx = (ucontext_t*)ctx_void;
49
50 if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
51 printf("[FAIL]\tAX had the wrong value: 0x%x\n",
52 ctx->uc_mcontext.gregs[REG_EAX]);
53 n_errs++;
54 } else {
55 printf("[OK]\tSeems okay\n");
56 }
57
58 siglongjmp(jmpbuf, 1);
59}
60
61static void sigill(int sig, siginfo_t *info, void *ctx_void)
62{
63 printf("[SKIP]\tIllegal instruction\n");
64 siglongjmp(jmpbuf, 1);
65}
66
67int main()
68{
69 stack_t stack = {
70 .ss_sp = altstack_data,
71 .ss_size = SIGSTKSZ,
72 };
73 if (sigaltstack(&stack, NULL) != 0)
74 err(1, "sigaltstack");
75
76 sethandler(SIGSEGV, sigsegv, SA_ONSTACK);
77 sethandler(SIGILL, sigill, SA_ONSTACK);
78
79 /*
80 * Exercise another nasty special case. The 32-bit SYSCALL
81 * and SYSENTER instructions (even in compat mode) each
82 * clobber one register. A Linux system call has a syscall
83 * number and six arguments, and the user stack pointer
84 * needs to live in some register on return. That means
85 * that we need eight registers, but SYSCALL and SYSENTER
86 * only preserve seven registers. As a result, one argument
87 * ends up on the stack. The stack is user memory, which
88 * means that the kernel can fail to read it.
89 *
90 * The 32-bit fast system calls don't have a defined ABI:
91 * we're supposed to invoke them through the vDSO. So we'll
92 * fudge it: we set all regs to invalid pointer values and
93 * invoke the entry instruction. The return will fail no
94 * matter what, and we completely lose our program state,
95 * but we can fix it up with a signal handler.
96 */
97
98 printf("[RUN]\tSYSENTER with invalid state\n");
99 if (sigsetjmp(jmpbuf, 1) == 0) {
100 asm volatile (
101 "movl $-1, %%eax\n\t"
102 "movl $-1, %%ebx\n\t"
103 "movl $-1, %%ecx\n\t"
104 "movl $-1, %%edx\n\t"
105 "movl $-1, %%esi\n\t"
106 "movl $-1, %%edi\n\t"
107 "movl $-1, %%ebp\n\t"
108 "movl $-1, %%esp\n\t"
109 "sysenter"
110 : : : "memory", "flags");
111 }
112
113 printf("[RUN]\tSYSCALL with invalid state\n");
114 if (sigsetjmp(jmpbuf, 1) == 0) {
115 asm volatile (
116 "movl $-1, %%eax\n\t"
117 "movl $-1, %%ebx\n\t"
118 "movl $-1, %%ecx\n\t"
119 "movl $-1, %%edx\n\t"
120 "movl $-1, %%esi\n\t"
121 "movl $-1, %%edi\n\t"
122 "movl $-1, %%ebp\n\t"
123 "movl $-1, %%esp\n\t"
124 "syscall\n\t"
125 "pushl $0" /* make sure we segfault cleanly */
126 : : : "memory", "flags");
127 }
128
129 return 0;
130}
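The comment in the test above notes that the 32-bit fast system calls have no defined ABI and are meant to be reached through the vDSO. For contrast with the deliberately broken invocation the test performs, here is a hedged sketch (not part of the patch) of the supported path: look up __kernel_vsyscall through the AT_SYSINFO auxv entry and call it with the usual int $0x80 register convention. This assumes a 32-bit build and glibc's getauxval(); on a 64-bit build AT_SYSINFO is absent and the sketch simply bails out.

	#include <stdio.h>
	#include <sys/auxv.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* __kernel_vsyscall entry point exported by the 32-bit vDSO. */
		unsigned long vsyscall = getauxval(AT_SYSINFO);
		long ret;

		if (!vsyscall) {
			printf("no AT_SYSINFO; not a 32-bit vDSO environment\n");
			return 0;
		}

		/* eax = syscall number; args, if any, go in ebx/ecx/edx/esi/edi/ebp. */
		asm volatile ("call *%[entry]"
			      : "=a" (ret)
			      : "a" (SYS_getpid), [entry] "rm" (vsyscall)
			      : "memory", "cc");
		printf("getpid() via __kernel_vsyscall: %ld\n", ret);
		return 0;
	}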
diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c
new file mode 100644
index 000000000000..60c06af4646a
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -0,0 +1,54 @@
1/*
2 * syscall_nt.c - checks syscalls with NT set
3 * Copyright (c) 2014-2015 Andrew Lutomirski
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * Some obscure user-space code requires the ability to make system calls
15 * with FLAGS.NT set. Make sure it works.
16 */
17
18#include <stdio.h>
19#include <unistd.h>
20#include <sys/syscall.h>
21#include <asm/processor-flags.h>
22
23#ifdef __x86_64__
24# define WIDTH "q"
25#else
26# define WIDTH "l"
27#endif
28
29static unsigned long get_eflags(void)
30{
31 unsigned long eflags;
32 asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
33 return eflags;
34}
35
36static void set_eflags(unsigned long eflags)
37{
38 asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
39 : : "rm" (eflags) : "flags");
40}
41
42int main()
43{
44 printf("[RUN]\tSet NT and issue a syscall\n");
45 set_eflags(get_eflags() | X86_EFLAGS_NT);
46 syscall(SYS_getpid);
47 if (get_eflags() & X86_EFLAGS_NT) {
48 printf("[OK]\tThe syscall worked and NT is still set\n");
49 return 0;
50 } else {
51 printf("[FAIL]\tThe syscall worked but NT was cleared\n");
52 return 1;
53 }
54}
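A hedged extension sketch (not part of the patch): the pushf/popf helpers above generalize to any caller-owned EFLAGS bit, so the same structure can check that, say, AC (alignment check) also survives a system call. The extra bits are cleared again before printing so the C library never runs with AC set; X86_EFLAGS_AC comes from the <asm/processor-flags.h> header the test already includes.

	/* Assumes the get_eflags()/set_eflags() helpers defined in syscall_nt.c above. */
	static int check_flags_survive(unsigned long extra)
	{
		unsigned long flags;

		set_eflags(get_eflags() | extra);
		syscall(SYS_getpid);
		flags = get_eflags();
		set_eflags(flags & ~extra);	/* don't printf with AC still set */

		if ((flags & extra) == extra) {
			printf("[OK]\tFlags 0x%lx survived the syscall\n", extra);
			return 0;
		}
		printf("[FAIL]\tFlags 0x%lx were cleared\n", extra & ~flags);
		return 1;
	}

	/* Usage: check_flags_survive(X86_EFLAGS_NT | X86_EFLAGS_AC); */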