aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig6
-rw-r--r--arch/x86/boot/Makefile8
-rw-r--r--arch/x86/boot/boot.h28
-rw-r--r--arch/x86/boot/cmdline.c6
-rw-r--r--arch/x86/boot/compressed/Makefile4
-rw-r--r--arch/x86/boot/compressed/cmdline.c21
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c5
-rw-r--r--arch/x86/boot/compressed/head_32.S13
-rw-r--r--arch/x86/boot/compressed/head_64.S13
-rw-r--r--arch/x86/boot/compressed/misc.c56
-rw-r--r--arch/x86/boot/compressed/misc.h39
-rw-r--r--arch/x86/boot/compressed/string.c2
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S6
-rw-r--r--arch/x86/boot/ctype.h21
-rw-r--r--arch/x86/boot/early_serial_console.c139
-rw-r--r--arch/x86/boot/main.c9
-rw-r--r--arch/x86/boot/printf.c4
-rw-r--r--arch/x86/boot/string.c63
-rw-r--r--arch/x86/boot/tty.c37
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/apb_timer.h1
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h198
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h83
-rw-r--r--arch/x86/include/asm/cpufeature.h29
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h2
-rw-r--r--arch/x86/include/asm/hypervisor.h1
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h20
-rw-r--r--arch/x86/include/asm/irq_vectors.h3
-rw-r--r--arch/x86/include/asm/kgdb.h20
-rw-r--r--arch/x86/include/asm/kvm.h22
-rw-r--r--arch/x86/include/asm/kvm_emulate.h30
-rw-r--r--arch/x86/include/asm/kvm_host.h70
-rw-r--r--arch/x86/include/asm/local64.h1
-rw-r--r--arch/x86/include/asm/mrst.h26
-rw-r--r--arch/x86/include/asm/msr-index.h23
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h1
-rw-r--r--arch/x86/include/asm/perf_event.h18
-rw-r--r--arch/x86/include/asm/perf_event_p4.h99
-rw-r--r--arch/x86/include/asm/processor.h21
-rw-r--r--arch/x86/include/asm/required-features.h2
-rw-r--r--arch/x86/include/asm/rwsem.h21
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/stacktrace.h49
-rw-r--r--arch/x86/include/asm/system.h7
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xsave.h6
-rw-r--r--arch/x86/kernel/acpi/cstate.c9
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S2
-rw-r--r--arch/x86/kernel/acpi/sleep.c11
-rw-r--r--arch/x86/kernel/amd_iommu.c8
-rw-r--r--arch/x86/kernel/apb_timer.c37
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile7
-rw-r--r--arch/x86/kernel/apic/apic.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c1
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c107
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/nmi.c7
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c77
-rw-r--r--arch/x86/kernel/cpu/common.c12
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h26
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c19
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c108
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c206
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c56
-rw-r--r--arch/x86/kernel/cpu/perf_event.c62
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c156
-rw-r--r--arch/x86/kernel/cpu/scattered.c63
-rw-r--r--arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c)58
-rw-r--r--arch/x86/kernel/cpu/vmware.c9
-rw-r--r--arch/x86/kernel/dumpstack.c1
-rw-r--r--arch/x86/kernel/dumpstack.h56
-rw-r--r--arch/x86/kernel/dumpstack_32.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/entry_32.S14
-rw-r--r--arch/x86/kernel/entry_64.S13
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head_64.S5
-rw-r--r--arch/x86/kernel/hpet.c15
-rw-r--r--arch/x86/kernel/hw_breakpoint.c51
-rw-r--r--arch/x86/kernel/i387.c3
-rw-r--r--arch/x86/kernel/i8259.c25
-rw-r--r--arch/x86/kernel/kgdb.c198
-rw-r--r--arch/x86/kernel/kprobes.c35
-rw-r--r--arch/x86/kernel/mrst.c112
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/process.c54
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c5
-rw-r--r--arch/x86/kernel/quirks.c5
-rw-r--r--arch/x86/kernel/setup_percpu.c17
-rw-r--r--arch/x86/kernel/smpboot.c7
-rw-r--r--arch/x86/kernel/stacktrace.c31
-rw-r--r--arch/x86/kernel/traps.c7
-rw-r--r--arch/x86/kernel/tsc.c5
-rw-r--r--arch/x86/kernel/verify_cpu_64.S3
-rw-r--r--arch/x86/kernel/vsyscall_64.c17
-rw-r--r--arch/x86/kernel/x86_init.c7
-rw-r--r--arch/x86/kernel/xsave.c13
-rw-r--r--arch/x86/kvm/emulate.c749
-rw-r--r--arch/x86/kvm/i8254.c146
-rw-r--r--arch/x86/kvm/i8254.h4
-rw-r--r--arch/x86/kvm/i8259.c48
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h8
-rw-r--r--arch/x86/kvm/lapic.c17
-rw-r--r--arch/x86/kvm/mmu.c809
-rw-r--r--arch/x86/kvm/mmutrace.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h251
-rw-r--r--arch/x86/kvm/svm.c147
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/vmx.c268
-rw-r--r--arch/x86/kvm/x86.c1180
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/cmpxchg.c (renamed from arch/x86/kernel/cpu/cmpxchg.c)18
-rw-r--r--arch/x86/mm/dump_pagetables.c32
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/kmmio.c16
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/mm/pat_rbtree.c34
-rw-r--r--arch/x86/mm/pf_in.c30
-rw-r--r--arch/x86/mm/testmmiotrace.c22
-rw-r--r--arch/x86/mm/tlb.c4
-rw-r--r--arch/x86/oprofile/nmi_int.c16
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/i386.c1
-rw-r--r--arch/x86/pci/irq.c6
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/mrst.c7
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/vdso/Makefile3
-rwxr-xr-xarch/x86/vdso/checkundef.sh10
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c3
-rw-r--r--arch/x86/xen/Kconfig5
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c197
-rw-r--r--arch/x86/xen/mmu.c35
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/platform-pci-unplug.c137
-rw-r--r--arch/x86/xen/setup.c72
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--arch/x86/xen/suspend.c12
-rw-r--r--arch/x86/xen/time.c96
-rw-r--r--arch/x86/xen/xen-ops.h13
172 files changed, 4965 insertions, 2672 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dcb0593b4a66..baa34e510222 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
55 select HAVE_HW_BREAKPOINT 55 select HAVE_HW_BREAKPOINT
56 select HAVE_MIXED_BREAKPOINTS_REGS 56 select HAVE_MIXED_BREAKPOINTS_REGS
57 select PERF_EVENTS 57 select PERF_EVENTS
58 select HAVE_PERF_EVENTS_NMI
58 select ANON_INODES 59 select ANON_INODES
59 select HAVE_ARCH_KMEMCHECK 60 select HAVE_ARCH_KMEMCHECK
60 select HAVE_USER_RETURN_NOTIFIER 61 select HAVE_USER_RETURN_NOTIFIER
@@ -72,9 +73,6 @@ config ARCH_DEFCONFIG
72 default "arch/x86/configs/i386_defconfig" if X86_32 73 default "arch/x86/configs/i386_defconfig" if X86_32
73 default "arch/x86/configs/x86_64_defconfig" if X86_64 74 default "arch/x86/configs/x86_64_defconfig" if X86_64
74 75
75config GENERIC_TIME
76 def_bool y
77
78config GENERIC_CMOS_UPDATE 76config GENERIC_CMOS_UPDATE
79 def_bool y 77 def_bool y
80 78
@@ -2046,7 +2044,7 @@ config SCx200
2046 2044
2047config SCx200HR_TIMER 2045config SCx200HR_TIMER
2048 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" 2046 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
2049 depends on SCx200 && GENERIC_TIME 2047 depends on SCx200
2050 default y 2048 default y
2051 ---help--- 2049 ---help---
2052 This driver provides a clocksource built upon the on-chip 2050 This driver provides a clocksource built upon the on-chip
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index ec749c2bfdd7..f7cb086b4add 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o 31setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
32setup-y += version.o 32setup-y += video-mode.o version.o
33setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
34 34
35# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 98239d2658f2..c7093bd9f2d3 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -28,6 +28,7 @@
28#include "bitops.h" 28#include "bitops.h"
29#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h> 30#include <asm/processor-flags.h>
31#include "ctype.h"
31 32
32/* Useful macros */ 33/* Useful macros */
33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 34#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -37,6 +38,8 @@
37extern struct setup_header hdr; 38extern struct setup_header hdr;
38extern struct boot_params boot_params; 39extern struct boot_params boot_params;
39 40
41#define cpu_relax() asm volatile("rep; nop")
42
40/* Basic port I/O */ 43/* Basic port I/O */
41static inline void outb(u8 v, u16 port) 44static inline void outb(u8 v, u16 port)
42{ 45{
@@ -198,11 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
198 return diff; 201 return diff;
199} 202}
200 203
201static inline int isdigit(int ch)
202{
203 return (ch >= '0') && (ch <= '9');
204}
205
206/* Heap -- available for dynamic lists. */ 204/* Heap -- available for dynamic lists. */
207extern char _end[]; 205extern char _end[];
208extern char *HEAP; 206extern char *HEAP;
@@ -287,8 +285,18 @@ struct biosregs {
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); 285void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288 286
289/* cmdline.c */ 287/* cmdline.c */
290int cmdline_find_option(const char *option, char *buffer, int bufsize); 288int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
291int cmdline_find_option_bool(const char *option); 289int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
290static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
291{
292 return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
293}
294
295static inline int cmdline_find_option_bool(const char *option)
296{
297 return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
298}
299
292 300
293/* cpu.c, cpucheck.c */ 301/* cpu.c, cpucheck.c */
294struct cpu_features { 302struct cpu_features {
@@ -300,6 +308,10 @@ extern struct cpu_features cpu;
300int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 308int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
301int validate_cpu(void); 309int validate_cpu(void);
302 310
311/* early_serial_console.c */
312extern int early_serial_base;
313void console_init(void);
314
303/* edd.c */ 315/* edd.c */
304void query_edd(void); 316void query_edd(void);
305 317
@@ -329,8 +341,10 @@ void initregs(struct biosregs *regs);
329 341
330/* string.c */ 342/* string.c */
331int strcmp(const char *str1, const char *str2); 343int strcmp(const char *str1, const char *str2);
344int strncmp(const char *cs, const char *ct, size_t count);
332size_t strnlen(const char *s, size_t maxlen); 345size_t strnlen(const char *s, size_t maxlen);
333unsigned int atou(const char *s); 346unsigned int atou(const char *s);
347unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
334 348
335/* tty.c */ 349/* tty.c */
336void puts(const char *); 350void puts(const char *);
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index a1d35634bce0..6b3b6f708c04 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,9 +27,8 @@ static inline int myisspace(u8 c)
27 * Returns the length of the argument (regardless of if it was 27 * Returns the length of the argument (regardless of if it was
28 * truncated to fit in the buffer), or -1 on not found. 28 * truncated to fit in the buffer), or -1 on not found.
29 */ 29 */
30int cmdline_find_option(const char *option, char *buffer, int bufsize) 30int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
31{ 31{
32 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
33 addr_t cptr; 32 addr_t cptr;
34 char c; 33 char c;
35 int len = -1; 34 int len = -1;
@@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
100 * Returns the position of that option (starts counting with 1) 99 * Returns the position of that option (starts counting with 1)
101 * or 0 on not found 100 * or 0 on not found
102 */ 101 */
103int cmdline_find_option_bool(const char *option) 102int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
104{ 103{
105 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
106 addr_t cptr; 104 addr_t cptr;
107 char c; 105 char c;
108 int pos = 0, wstart = 0; 106 int pos = 0, wstart = 0;
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fbb47daf2459..0c229551eead 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T
23 23
24hostprogs-y := mkpiggy 24hostprogs-y := mkpiggy
25 25
26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
27 $(call if_changed,ld) 27 $(call if_changed,ld)
28 @: 28 @:
29 29
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 000000000000..cb62f786990d
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,21 @@
1#include "misc.h"
2
/*
 * Flat-memory versions of the segment helpers: set_fs() records a linear
 * base derived from a segment value and rdfs8() reads one byte at an
 * offset from that base.
 * NOTE(review): presumably these exist for the ../cmdline.c that is
 * included right after them - confirm against that file.
 */
typedef unsigned long addr_t;

static unsigned long fs;	/* linear base, i.e. segment value * 16 */

static inline void set_fs(unsigned long seg)
{
	/* turn the segment number back into a byte address */
	fs = seg << 4;
}

static inline char rdfs8(addr_t addr)
{
	char *byte = (char *)(fs + addr);

	return *byte;
}
13#include "../cmdline.c"
14int cmdline_find_option(const char *option, char *buffer, int bufsize)
15{
16 return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
17}
18int cmdline_find_option_bool(const char *option)
19{
20 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
21}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 000000000000..261e81fb9582
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,5 @@
1#include "misc.h"
2
3int early_serial_base;
4
5#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f543b70ffae2..67a655a39ce4 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -124,6 +124,19 @@ relocated:
124 rep stosl 124 rep stosl
125 125
126/* 126/*
127 * Adjust our own GOT
128 */
129 leal _got(%ebx), %edx
130 leal _egot(%ebx), %ecx
1311:
132 cmpl %ecx, %edx
133 jae 2f
134 addl %ebx, (%edx)
135 addl $4, %edx
136 jmp 1b
1372:
138
139/*
127 * Do the decompression, and jump to the new kernel.. 140 * Do the decompression, and jump to the new kernel..
128 */ 141 */
129 leal z_extract_offset_negative(%ebx), %ebp 142 leal z_extract_offset_negative(%ebx), %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index faff0dc9c06a..52f85a196fa0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -280,6 +280,19 @@ relocated:
280 rep stosq 280 rep stosq
281 281
282/* 282/*
283 * Adjust our own GOT
284 */
285 leaq _got(%rip), %rdx
286 leaq _egot(%rip), %rcx
2871:
288 cmpq %rcx, %rdx
289 jae 2f
290 addq %rbx, (%rdx)
291 addq $8, %rdx
292 jmp 1b
2932:
294
295/*
283 * Do the decompression, and jump to the new kernel.. 296 * Do the decompression, and jump to the new kernel..
284 */ 297 */
285 pushq %rsi /* Save the real mode argument */ 298 pushq %rsi /* Save the real mode argument */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 51e240779a44..8f7bef8e9fff 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -9,23 +9,7 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12/* 12#include "misc.h"
13 * we have to be careful, because no indirections are allowed here, and
14 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
15 * we just keep it from happening
16 */
17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32
19#define _ASM_X86_DESC_H 1
20#endif
21
22#include <linux/linkage.h>
23#include <linux/screen_info.h>
24#include <linux/elf.h>
25#include <linux/io.h>
26#include <asm/page.h>
27#include <asm/boot.h>
28#include <asm/bootparam.h>
29 13
30/* WARNING!! 14/* WARNING!!
31 * This code is compiled with -fPIC and it is relocated dynamically 15 * This code is compiled with -fPIC and it is relocated dynamically
@@ -123,15 +107,13 @@ static void error(char *m);
123/* 107/*
124 * This is set up by the setup-routine at boot-time 108 * This is set up by the setup-routine at boot-time
125 */ 109 */
126static struct boot_params *real_mode; /* Pointer to real-mode data */ 110struct boot_params *real_mode; /* Pointer to real-mode data */
127static int quiet; 111static int quiet;
112static int debug;
128 113
129void *memset(void *s, int c, size_t n); 114void *memset(void *s, int c, size_t n);
130void *memcpy(void *dest, const void *src, size_t n); 115void *memcpy(void *dest, const void *src, size_t n);
131 116
132static void __putstr(int, const char *);
133#define putstr(__x) __putstr(0, __x)
134
135#ifdef CONFIG_X86_64 117#ifdef CONFIG_X86_64
136#define memptr long 118#define memptr long
137#else 119#else
@@ -170,7 +152,21 @@ static void scroll(void)
170 vidmem[i] = ' '; 152 vidmem[i] = ' ';
171} 153}
172 154
173static void __putstr(int error, const char *s) 155#define XMTRDY 0x20
156
157#define TXR 0 /* Transmit register (WRITE) */
158#define LSR 5 /* Line Status */
159static void serial_putchar(int ch)
160{
161 unsigned timeout = 0xffff;
162
163 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
164 cpu_relax();
165
166 outb(ch, early_serial_base + TXR);
167}
168
169void __putstr(int error, const char *s)
174{ 170{
175 int x, y, pos; 171 int x, y, pos;
176 char c; 172 char c;
@@ -179,6 +175,14 @@ static void __putstr(int error, const char *s)
179 if (!error) 175 if (!error)
180 return; 176 return;
181#endif 177#endif
178 if (early_serial_base) {
179 const char *str = s;
180 while (*str) {
181 if (*str == '\n')
182 serial_putchar('\r');
183 serial_putchar(*str++);
184 }
185 }
182 186
183 if (real_mode->screen_info.orig_video_mode == 0 && 187 if (real_mode->screen_info.orig_video_mode == 0 &&
184 lines == 0 && cols == 0) 188 lines == 0 && cols == 0)
@@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
305{ 309{
306 real_mode = rmode; 310 real_mode = rmode;
307 311
308 if (real_mode->hdr.loadflags & QUIET_FLAG) 312 if (cmdline_find_option_bool("quiet"))
309 quiet = 1; 313 quiet = 1;
314 if (cmdline_find_option_bool("debug"))
315 debug = 1;
310 316
311 if (real_mode->screen_info.orig_video_mode == 7) { 317 if (real_mode->screen_info.orig_video_mode == 7) {
312 vidmem = (char *) 0xb0000; 318 vidmem = (char *) 0xb0000;
@@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
319 lines = real_mode->screen_info.orig_video_lines; 325 lines = real_mode->screen_info.orig_video_lines;
320 cols = real_mode->screen_info.orig_video_cols; 326 cols = real_mode->screen_info.orig_video_cols;
321 327
328 console_init();
329 if (debug)
330 putstr("early console in decompress_kernel\n");
331
322 free_mem_ptr = heap; /* Heap */ 332 free_mem_ptr = heap; /* Heap */
323 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 333 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
324 334
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 000000000000..3f19c81a6203
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,39 @@
1#ifndef BOOT_COMPRESSED_MISC_H
2#define BOOT_COMPRESSED_MISC_H
3
4/*
5 * we have to be careful, because no indirections are allowed here, and
6 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
7 * we just keep it from happening
8 */
9#undef CONFIG_PARAVIRT
10#ifdef CONFIG_X86_32
11#define _ASM_X86_DESC_H 1
12#endif
13
14#include <linux/linkage.h>
15#include <linux/screen_info.h>
16#include <linux/elf.h>
17#include <linux/io.h>
18#include <asm/page.h>
19#include <asm/boot.h>
20#include <asm/bootparam.h>
21
22#define BOOT_BOOT_H
23#include "../ctype.h"
24
25/* misc.c */
26extern struct boot_params *real_mode; /* Pointer to real-mode data */
27void __putstr(int error, const char *s);
28#define putstr(__x) __putstr(0, __x)
29#define puts(__x) __putstr(0, __x)
30
31/* cmdline.c */
32int cmdline_find_option(const char *option, char *buffer, int bufsize);
33int cmdline_find_option_bool(const char *option);
34
35/* early_serial_console.c */
36extern int early_serial_base;
37void console_init(void);
38
39#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 000000000000..19b3e693cd72
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,2 @@
1#include "misc.h"
2#include "../string.c"
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 5ddabceee124..34d047c98284 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -41,6 +41,12 @@ SECTIONS
41 *(.rodata.*) 41 *(.rodata.*)
42 _erodata = . ; 42 _erodata = . ;
43 } 43 }
44 .got : {
45 _got = .;
46 KEEP(*(.got.plt))
47 KEEP(*(.got))
48 _egot = .;
49 }
44 .data : { 50 .data : {
45 _data = . ; 51 _data = . ;
46 *(.data) 52 *(.data)
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 000000000000..25e13403193c
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
#ifndef BOOT_ISDIGIT_H

#define BOOT_ISDIGIT_H

/*
 * Minimal ASCII character-class helpers for the boot code.
 *
 * Both helpers return int and use plain 0/1 so that this header does not
 * depend on some other header having defined "true" before it is included.
 */

/* Return non-zero if @ch is a decimal digit '0'..'9', else 0. */
static inline int isdigit(int ch)
{
	return (ch >= '0') && (ch <= '9');
}

/* Return 1 if @ch is a hexadecimal digit (0-9, a-f, A-F), else 0. */
static inline int isxdigit(int ch)
{
	if (isdigit(ch))
		return 1;

	if ((ch >= 'a') && (ch <= 'f'))
		return 1;

	return (ch >= 'A') && (ch <= 'F');
}

#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 000000000000..030f4b93e255
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,139 @@
1#include "boot.h"
2
3#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
4
5#define XMTRDY 0x20
6
7#define DLAB 0x80
8
9#define TXR 0 /* Transmit register (WRITE) */
10#define RXR 0 /* Receive register (READ) */
11#define IER 1 /* Interrupt Enable */
12#define IIR 2 /* Interrupt ID */
13#define FCR 2 /* FIFO control */
14#define LCR 3 /* Line control */
15#define MCR 4 /* Modem control */
16#define LSR 5 /* Line Status */
17#define MSR 6 /* Modem Status */
18#define DLL 0 /* Divisor Latch Low */
19#define DLH 1 /* Divisor latch High */
20
21#define DEFAULT_BAUD 9600
22
23static void early_serial_init(int port, int baud)
24{
25 unsigned char c;
26 unsigned divisor;
27
28 outb(0x3, port + LCR); /* 8n1 */
29 outb(0, port + IER); /* no interrupt */
30 outb(0, port + FCR); /* no fifo */
31 outb(0x3, port + MCR); /* DTR + RTS */
32
33 divisor = 115200 / baud;
34 c = inb(port + LCR);
35 outb(c | DLAB, port + LCR);
36 outb(divisor & 0xff, port + DLL);
37 outb((divisor >> 8) & 0xff, port + DLH);
38 outb(c & ~DLAB, port + LCR);
39
40 early_serial_base = port;
41}
42
43static void parse_earlyprintk(void)
44{
45 int baud = DEFAULT_BAUD;
46 char arg[32];
47 int pos = 0;
48 int port = 0;
49
50 if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) {
51 char *e;
52
53 if (!strncmp(arg, "serial", 6)) {
54 port = DEFAULT_SERIAL_PORT;
55 pos += 6;
56 }
57
58 if (arg[pos] == ',')
59 pos++;
60
61 if (!strncmp(arg, "ttyS", 4)) {
62 static const int bases[] = { 0x3f8, 0x2f8 };
63 int idx = 0;
64
65 if (!strncmp(arg + pos, "ttyS", 4))
66 pos += 4;
67
68 if (arg[pos++] == '1')
69 idx = 1;
70
71 port = bases[idx];
72 }
73
74 if (arg[pos] == ',')
75 pos++;
76
77 baud = simple_strtoull(arg + pos, &e, 0);
78 if (baud == 0 || arg + pos == e)
79 baud = DEFAULT_BAUD;
80 }
81
82 if (port)
83 early_serial_init(port, baud);
84}
85
86#define BASE_BAUD (1843200/16)
87static unsigned int probe_baud(int port)
88{
89 unsigned char lcr, dll, dlh;
90 unsigned int quot;
91
92 lcr = inb(port + LCR);
93 outb(lcr | DLAB, port + LCR);
94 dll = inb(port + DLL);
95 dlh = inb(port + DLH);
96 outb(lcr, port + LCR);
97 quot = (dlh << 8) | dll;
98
99 return BASE_BAUD / quot;
100}
101
102static void parse_console_uart8250(void)
103{
104 char optstr[64], *options;
105 int baud = DEFAULT_BAUD;
106 int port = 0;
107
108 /*
109 * console=uart8250,io,0x3f8,115200n8
110 * need to make sure it is last one console !
111 */
112 if (cmdline_find_option("console", optstr, sizeof optstr) <= 0)
113 return;
114
115 options = optstr;
116
117 if (!strncmp(options, "uart8250,io,", 12))
118 port = simple_strtoull(options + 12, &options, 0);
119 else if (!strncmp(options, "uart,io,", 8))
120 port = simple_strtoull(options + 8, &options, 0);
121 else
122 return;
123
124 if (options && (options[0] == ','))
125 baud = simple_strtoull(options + 1, &options, 0);
126 else
127 baud = probe_baud(port);
128
129 if (port)
130 early_serial_init(port, baud);
131}
132
133void console_init(void)
134{
135 parse_earlyprintk();
136
137 if (!early_serial_base)
138 parse_console_uart8250();
139}
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 140172b895bd..40358c8905be 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -130,6 +130,11 @@ void main(void)
130 /* First, copy the boot header into the "zeropage" */ 130 /* First, copy the boot header into the "zeropage" */
131 copy_boot_params(); 131 copy_boot_params();
132 132
133 /* Initialize the early-boot console */
134 console_init();
135 if (cmdline_find_option_bool("debug"))
136 puts("early console in setup code\n");
137
133 /* End of heap check */ 138 /* End of heap check */
134 init_heap(); 139 init_heap();
135 140
@@ -168,10 +173,6 @@ void main(void)
168 /* Set the video mode */ 173 /* Set the video mode */
169 set_video(); 174 set_video();
170 175
171 /* Parse command line for 'quiet' and pass it to decompressor. */
172 if (cmdline_find_option_bool("quiet"))
173 boot_params.hdr.loadflags |= QUIET_FLAG;
174
175 /* Do the last things and invoke protected mode */ 176 /* Do the last things and invoke protected mode */
176 go_to_protected_mode(); 177 go_to_protected_mode();
177} 178}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 50e47cdbdddd..cdac91ca55d3 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -34,7 +34,7 @@ static int skip_atoi(const char **s)
34#define SMALL 32 /* Must be 32 == 0x20 */ 34#define SMALL 32 /* Must be 32 == 0x20 */
35#define SPECIAL 64 /* 0x */ 35#define SPECIAL 64 /* 0x */
36 36
37#define do_div(n,base) ({ \ 37#define __do_div(n, base) ({ \
38int __res; \ 38int __res; \
39__res = ((unsigned long) n) % (unsigned) base; \ 39__res = ((unsigned long) n) % (unsigned) base; \
40n = ((unsigned long) n) / (unsigned) base; \ 40n = ((unsigned long) n) / (unsigned) base; \
@@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision,
83 tmp[i++] = '0'; 83 tmp[i++] = '0';
84 else 84 else
85 while (num != 0) 85 while (num != 0)
86 tmp[i++] = (digits[do_div(num, base)] | locase); 86 tmp[i++] = (digits[__do_div(num, base)] | locase);
87 if (i > precision) 87 if (i > precision)
88 precision = i; 88 precision = i;
89 size -= precision; 89 size -= precision;
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index f94b7a0c2abf..3cbc4058dd26 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2)
30 return 0; 30 return 0;
31} 31}
32 32
/*
 * Compare at most @count bytes of @cs and @ct as unsigned chars.
 *
 * Returns -1 or 1 at the first differing byte (by unsigned comparison),
 * or 0 if the strings match up to @count bytes or up to a shared NUL.
 */
int strncmp(const char *cs, const char *ct, size_t count)
{
	size_t i;

	for (i = 0; i < count; i++) {
		unsigned char a = cs[i];
		unsigned char b = ct[i];

		if (a != b)
			return a < b ? -1 : 1;
		if (a == '\0')
			return 0;
	}

	return 0;
}
48
33size_t strnlen(const char *s, size_t maxlen) 49size_t strnlen(const char *s, size_t maxlen)
34{ 50{
35 const char *es = s; 51 const char *es = s;
@@ -48,3 +64,50 @@ unsigned int atou(const char *s)
48 i = i * 10 + (*s++ - '0'); 64 i = i * 10 + (*s++ - '0');
49 return i; 65 return i;
50} 66}
67
/* Cheap lowercase: only meaningful for digits and letters */
#define TOLOWER(x) ((x) | 0x20)

/*
 * Guess the number base from the prefix of @cp:
 * "0x"/"0X" followed by a hex digit -> 16, any other leading '0' -> 8,
 * everything else -> 10.
 */
static unsigned int simple_guess_base(const char *cp)
{
	if (cp[0] != '0')
		return 10;

	if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
		return 16;

	return 8;
}
82
/**
 * simple_strtoull - convert a string to an unsigned long long
 * @cp: The start of the string
 * @endp: If non-NULL, receives a pointer to the first character that was
 *        not consumed
 * @base: The number base to use; 0 means guess it from the prefix
 *
 * A leading "0x"/"0X" is skipped when the base is 16. Parsing stops at the
 * first character that is not a hex digit or whose value is not below
 * @base. No overflow checking is done.
 */

unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
	unsigned long long acc = 0;
	const char *p = cp;

	if (base == 0)
		base = simple_guess_base(p);

	if (base == 16 && p[0] == '0' && TOLOWER(p[1]) == 'x')
		p += 2;

	for (; isxdigit(*p); p++) {
		unsigned int digit;

		if (isdigit(*p))
			digit = *p - '0';
		else
			digit = TOLOWER(*p) - 'a' + 10;

		if (digit >= base)
			break;

		acc = acc * base + digit;
	}

	if (endp)
		*endp = (char *)p;

	return acc;
}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 01ec69c901c7..def2451f46ae 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -10,23 +10,36 @@
10 * ----------------------------------------------------------------------- */ 10 * ----------------------------------------------------------------------- */
11 11
12/* 12/*
13 * Very simple screen I/O 13 * Very simple screen and serial I/O
14 * XXX: Probably should add very simple serial I/O?
15 */ 14 */
16 15
17#include "boot.h" 16#include "boot.h"
18 17
18int early_serial_base;
19
20#define XMTRDY 0x20
21
22#define TXR 0 /* Transmit register (WRITE) */
23#define LSR 5 /* Line Status */
24
19/* 25/*
20 * These functions are in .inittext so they can be used to signal 26 * These functions are in .inittext so they can be used to signal
21 * error during initialization. 27 * error during initialization.
22 */ 28 */
23 29
24void __attribute__((section(".inittext"))) putchar(int ch) 30static void __attribute__((section(".inittext"))) serial_putchar(int ch)
25{ 31{
26 struct biosregs ireg; 32 unsigned timeout = 0xffff;
27 33
28 if (ch == '\n') 34 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
29 putchar('\r'); /* \n -> \r\n */ 35 cpu_relax();
36
37 outb(ch, early_serial_base + TXR);
38}
39
40static void __attribute__((section(".inittext"))) bios_putchar(int ch)
41{
42 struct biosregs ireg;
30 43
31 initregs(&ireg); 44 initregs(&ireg);
32 ireg.bx = 0x0007; 45 ireg.bx = 0x0007;
@@ -36,6 +49,17 @@ void __attribute__((section(".inittext"))) putchar(int ch)
36 intcall(0x10, &ireg, NULL); 49 intcall(0x10, &ireg, NULL);
37} 50}
38 51
/*
 * putchar - emit one character on the boot console(s).
 *
 * A '\n' is preceded by a '\r' through a recursive call.  The character
 * is always passed to bios_putchar(), and additionally to
 * serial_putchar() when early_serial_base is non-zero.
 */
52void __attribute__((section(".inittext"))) putchar(int ch)
53{
54 if (ch == '\n')
55 putchar('\r'); /* \n -> \r\n */
56
57 bios_putchar(ch);
58
 /* A zero base is treated as "no early serial port configured" */
59 if (early_serial_base != 0)
60 serial_putchar(ch);
61}
62
39void __attribute__((section(".inittext"))) puts(const char *str) 63void __attribute__((section(".inittext"))) puts(const char *str)
40{ 64{
41 while (*str) 65 while (*str)
@@ -112,3 +136,4 @@ int getchar_timeout(void)
112 136
113 return 0; /* Timeout! */ 137 return 0; /* Timeout! */
114} 138}
139
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index d28fad19654a..e3a32431ca1e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1471,6 +1471,7 @@ CONFIG_HWMON=y
1471# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1472# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1473# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
1474# CONFIG_SENSORS_PKGTEMP is not set
1474# CONFIG_SENSORS_IT87 is not set 1475# CONFIG_SENSORS_IT87 is not set
1475# CONFIG_SENSORS_LM63 is not set 1476# CONFIG_SENSORS_LM63 is not set
1476# CONFIG_SENSORS_LM75 is not set 1477# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6c86acd847a4..4251f8372050 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1456,6 +1456,7 @@ CONFIG_HWMON=y
1456# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1457# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1458# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
1459# CONFIG_SENSORS_PKGTEMP is not set
1459# CONFIG_SENSORS_IT87 is not set 1460# CONFIG_SENSORS_IT87 is not set
1460# CONFIG_SENSORS_LM63 is not set 1461# CONFIG_SENSORS_LM63 is not set
1461# CONFIG_SENSORS_LM75 is not set 1462# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aa2c39d968fc..92091de11113 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
134 boot_cpu_data.x86_model <= 0x05 && 134 boot_cpu_data.x86_model <= 0x05 &&
135 boot_cpu_data.x86_mask < 0x0A) 135 boot_cpu_data.x86_mask < 0x0A)
136 return 1; 136 return 1;
137 else if (boot_cpu_has(X86_FEATURE_AMDC1E)) 137 else if (c1e_detected)
138 return 1; 138 return 1;
139 else 139 else
140 return max_cstate; 140 return max_cstate;
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index c74a2eebe570..a69b1ac9eaf8 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); 55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void); 56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id; 57extern unsigned int boot_cpu_id;
58extern int disable_apbt_percpu;
59 58
60extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); 59extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
61extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); 60extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8859e12dd3cf..284a6e8f7ce1 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -11,38 +11,42 @@
11extern void __xchg_wrong_size(void); 11extern void __xchg_wrong_size(void);
12 12
13/* 13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
15 * Note 2: xchg has side effect, so that attribute volatile is necessary, 15 * Since this is generally used to protect other memory information, we
16 * but generally the primitive is invalid, *ptr is output argument. --ANK 16 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
17 * information around.
17 */ 18 */
18
19struct __xchg_dummy {
20 unsigned long a[100];
21};
22#define __xg(x) ((struct __xchg_dummy *)(x))
23
24#define __xchg(x, ptr, size) \ 19#define __xchg(x, ptr, size) \
25({ \ 20({ \
26 __typeof(*(ptr)) __x = (x); \ 21 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \ 22 switch (size) { \
28 case 1: \ 23 case 1: \
29 asm volatile("xchgb %b0,%1" \ 24 { \
30 : "=q" (__x) \ 25 volatile u8 *__ptr = (volatile u8 *)(ptr); \
31 : "m" (*__xg(ptr)), "0" (__x) \ 26 asm volatile("xchgb %0,%1" \
27 : "=q" (__x), "+m" (*__ptr) \
28 : "0" (__x) \
32 : "memory"); \ 29 : "memory"); \
33 break; \ 30 break; \
31 } \
34 case 2: \ 32 case 2: \
35 asm volatile("xchgw %w0,%1" \ 33 { \
36 : "=r" (__x) \ 34 volatile u16 *__ptr = (volatile u16 *)(ptr); \
37 : "m" (*__xg(ptr)), "0" (__x) \ 35 asm volatile("xchgw %0,%1" \
36 : "=r" (__x), "+m" (*__ptr) \
37 : "0" (__x) \
38 : "memory"); \ 38 : "memory"); \
39 break; \ 39 break; \
40 } \
40 case 4: \ 41 case 4: \
42 { \
43 volatile u32 *__ptr = (volatile u32 *)(ptr); \
41 asm volatile("xchgl %0,%1" \ 44 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \ 45 : "=r" (__x), "+m" (*__ptr) \
43 : "m" (*__xg(ptr)), "0" (__x) \ 46 : "0" (__x) \
44 : "memory"); \ 47 : "memory"); \
45 break; \ 48 break; \
49 } \
46 default: \ 50 default: \
47 __xchg_wrong_size(); \ 51 __xchg_wrong_size(); \
48 } \ 52 } \
@@ -53,60 +57,33 @@ struct __xchg_dummy {
53 __xchg((v), (ptr), sizeof(*ptr)) 57 __xchg((v), (ptr), sizeof(*ptr))
54 58
55/* 59/*
56 * The semantics of XCHGCMP8B are a bit strange, this is why 60 * CMPXCHG8B only writes to the target if we had the previous
57 * there is a loop and the loading of %%eax and %%edx has to 61 * value in registers, otherwise it acts as a read and gives us the
58 * be inside. This inlines well in most cases, the cached 62 * "new previous" value. That is why there is a loop. Preloading
59 * cost is around ~38 cycles. (in the future we might want 63 * EDX:EAX is a performance optimization: in the common case it means
60 * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that 64 * we need only one locked operation.
61 * might have an implicit FPU-save as a cost, so it's not
62 * clear which path to go.)
63 * 65 *
64 * cmpxchg8b must be used with the lock prefix here to allow 66 * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
65 * the instruction to be executed atomically, see page 3-102 67 * least an FPU save and/or %cr0.ts manipulation.
66 * of the instruction set reference 24319102.pdf. We need 68 *
67 * the reader side to see the coherent 64bit value. 69 * cmpxchg8b must be used with the lock prefix here to allow the
70 * instruction to be executed atomically. We need to have the reader
71 * side to see the coherent 64bit value.
68 */ 72 */
69static inline void __set_64bit(unsigned long long *ptr, 73static inline void set_64bit(volatile u64 *ptr, u64 value)
70 unsigned int low, unsigned int high)
71{ 74{
75 u32 low = value;
76 u32 high = value >> 32;
77 u64 prev = *ptr;
78
72 asm volatile("\n1:\t" 79 asm volatile("\n1:\t"
73 "movl (%0), %%eax\n\t" 80 LOCK_PREFIX "cmpxchg8b %0\n\t"
74 "movl 4(%0), %%edx\n\t"
75 LOCK_PREFIX "cmpxchg8b (%0)\n\t"
76 "jnz 1b" 81 "jnz 1b"
77 : /* no outputs */ 82 : "=m" (*ptr), "+A" (prev)
78 : "D"(ptr), 83 : "b" (low), "c" (high)
79 "b"(low), 84 : "memory");
80 "c"(high)
81 : "ax", "dx", "memory");
82}
83
84static inline void __set_64bit_constant(unsigned long long *ptr,
85 unsigned long long value)
86{
87 __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
88}
89
90#define ll_low(x) *(((unsigned int *)&(x)) + 0)
91#define ll_high(x) *(((unsigned int *)&(x)) + 1)
92
93static inline void __set_64bit_var(unsigned long long *ptr,
94 unsigned long long value)
95{
96 __set_64bit(ptr, ll_low(value), ll_high(value));
97} 85}
98 86
99#define set_64bit(ptr, value) \
100 (__builtin_constant_p((value)) \
101 ? __set_64bit_constant((ptr), (value)) \
102 : __set_64bit_var((ptr), (value)))
103
104#define _set_64bit(ptr, value) \
105 (__builtin_constant_p(value) \
106 ? __set_64bit(ptr, (unsigned int)(value), \
107 (unsigned int)((value) >> 32)) \
108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
109
110extern void __cmpxchg_wrong_size(void); 87extern void __cmpxchg_wrong_size(void);
111 88
112/* 89/*
@@ -121,23 +98,32 @@ extern void __cmpxchg_wrong_size(void);
121 __typeof__(*(ptr)) __new = (new); \ 98 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \ 99 switch (size) { \
123 case 1: \ 100 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \ 101 { \
125 : "=a"(__ret) \ 102 volatile u8 *__ptr = (volatile u8 *)(ptr); \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ 103 asm volatile(lock "cmpxchgb %2,%1" \
104 : "=a" (__ret), "+m" (*__ptr) \
105 : "q" (__new), "0" (__old) \
127 : "memory"); \ 106 : "memory"); \
128 break; \ 107 break; \
108 } \
129 case 2: \ 109 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \ 110 { \
131 : "=a"(__ret) \ 111 volatile u16 *__ptr = (volatile u16 *)(ptr); \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 112 asm volatile(lock "cmpxchgw %2,%1" \
113 : "=a" (__ret), "+m" (*__ptr) \
114 : "r" (__new), "0" (__old) \
133 : "memory"); \ 115 : "memory"); \
134 break; \ 116 break; \
117 } \
135 case 4: \ 118 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \ 119 { \
137 : "=a"(__ret) \ 120 volatile u32 *__ptr = (volatile u32 *)(ptr); \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 121 asm volatile(lock "cmpxchgl %2,%1" \
122 : "=a" (__ret), "+m" (*__ptr) \
123 : "r" (__new), "0" (__old) \
139 : "memory"); \ 124 : "memory"); \
140 break; \ 125 break; \
126 } \
141 default: \ 127 default: \
142 __cmpxchg_wrong_size(); \ 128 __cmpxchg_wrong_size(); \
143 } \ 129 } \
@@ -175,32 +161,28 @@ extern void __cmpxchg_wrong_size(void);
175 (unsigned long long)(n))) 161 (unsigned long long)(n)))
176#endif 162#endif
177 163
178static inline unsigned long long __cmpxchg64(volatile void *ptr, 164static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
179 unsigned long long old,
180 unsigned long long new)
181{ 165{
182 unsigned long long prev; 166 u64 prev;
183 asm volatile(LOCK_PREFIX "cmpxchg8b %3" 167 asm volatile(LOCK_PREFIX "cmpxchg8b %1"
184 : "=A"(prev) 168 : "=A" (prev),
185 : "b"((unsigned long)new), 169 "+m" (*ptr)
186 "c"((unsigned long)(new >> 32)), 170 : "b" ((u32)new),
187 "m"(*__xg(ptr)), 171 "c" ((u32)(new >> 32)),
188 "0"(old) 172 "0" (old)
189 : "memory"); 173 : "memory");
190 return prev; 174 return prev;
191} 175}
192 176
193static inline unsigned long long __cmpxchg64_local(volatile void *ptr, 177static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
194 unsigned long long old,
195 unsigned long long new)
196{ 178{
197 unsigned long long prev; 179 u64 prev;
198 asm volatile("cmpxchg8b %3" 180 asm volatile("cmpxchg8b %1"
199 : "=A"(prev) 181 : "=A" (prev),
200 : "b"((unsigned long)new), 182 "+m" (*ptr)
201 "c"((unsigned long)(new >> 32)), 183 : "b" ((u32)new),
202 "m"(*__xg(ptr)), 184 "c" ((u32)(new >> 32)),
203 "0"(old) 185 "0" (old)
204 : "memory"); 186 : "memory");
205 return prev; 187 return prev;
206} 188}
@@ -264,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
264 * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 246 * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
265 */ 247 */
266 248
267extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
268
269#define cmpxchg64(ptr, o, n) \ 249#define cmpxchg64(ptr, o, n) \
270({ \ 250({ \
271 __typeof__(*(ptr)) __ret; \ 251 __typeof__(*(ptr)) __ret; \
@@ -283,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
283 __ret; }) 263 __ret; })
284 264
285 265
286 266#define cmpxchg64_local(ptr, o, n) \
287#define cmpxchg64_local(ptr, o, n) \ 267({ \
288({ \ 268 __typeof__(*(ptr)) __ret; \
289 __typeof__(*(ptr)) __ret; \ 269 __typeof__(*(ptr)) __old = (o); \
290 if (likely(boot_cpu_data.x86 > 4)) \ 270 __typeof__(*(ptr)) __new = (n); \
291 __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ 271 alternative_io("call cmpxchg8b_emu", \
292 (unsigned long long)(o), \ 272 "cmpxchg8b (%%esi)" , \
293 (unsigned long long)(n)); \ 273 X86_FEATURE_CX8, \
294 else \ 274 "=A" (__ret), \
295 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ 275 "S" ((ptr)), "0" (__old), \
296 (unsigned long long)(o), \ 276 "b" ((unsigned int)__new), \
297 (unsigned long long)(n)); \ 277 "c" ((unsigned int)(__new>>32)) \
298 __ret; \ 278 : "memory"); \
299}) 279 __ret; })
300 280
301#endif 281#endif
302 282
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 485ae415faec..423ae58aa020 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,51 +3,60 @@
3 3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5 5
6#define __xg(x) ((volatile long *)(x)) 6static inline void set_64bit(volatile u64 *ptr, u64 val)
7
8static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
9{ 7{
10 *ptr = val; 8 *ptr = val;
11} 9}
12 10
13#define _set_64bit set_64bit
14
15extern void __xchg_wrong_size(void); 11extern void __xchg_wrong_size(void);
16extern void __cmpxchg_wrong_size(void); 12extern void __cmpxchg_wrong_size(void);
17 13
18/* 14/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 15 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
20 * Note 2: xchg has side effect, so that attribute volatile is necessary, 16 * Since this is generally used to protect other memory information, we
21 * but generally the primitive is invalid, *ptr is output argument. --ANK 17 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
18 * information around.
22 */ 19 */
23#define __xchg(x, ptr, size) \ 20#define __xchg(x, ptr, size) \
24({ \ 21({ \
25 __typeof(*(ptr)) __x = (x); \ 22 __typeof(*(ptr)) __x = (x); \
26 switch (size) { \ 23 switch (size) { \
27 case 1: \ 24 case 1: \
28 asm volatile("xchgb %b0,%1" \ 25 { \
29 : "=q" (__x) \ 26 volatile u8 *__ptr = (volatile u8 *)(ptr); \
30 : "m" (*__xg(ptr)), "0" (__x) \ 27 asm volatile("xchgb %0,%1" \
28 : "=q" (__x), "+m" (*__ptr) \
29 : "0" (__x) \
31 : "memory"); \ 30 : "memory"); \
32 break; \ 31 break; \
32 } \
33 case 2: \ 33 case 2: \
34 asm volatile("xchgw %w0,%1" \ 34 { \
35 : "=r" (__x) \ 35 volatile u16 *__ptr = (volatile u16 *)(ptr); \
36 : "m" (*__xg(ptr)), "0" (__x) \ 36 asm volatile("xchgw %0,%1" \
37 : "=r" (__x), "+m" (*__ptr) \
38 : "0" (__x) \
37 : "memory"); \ 39 : "memory"); \
38 break; \ 40 break; \
41 } \
39 case 4: \ 42 case 4: \
40 asm volatile("xchgl %k0,%1" \ 43 { \
41 : "=r" (__x) \ 44 volatile u32 *__ptr = (volatile u32 *)(ptr); \
42 : "m" (*__xg(ptr)), "0" (__x) \ 45 asm volatile("xchgl %0,%1" \
46 : "=r" (__x), "+m" (*__ptr) \
47 : "0" (__x) \
43 : "memory"); \ 48 : "memory"); \
44 break; \ 49 break; \
50 } \
45 case 8: \ 51 case 8: \
52 { \
53 volatile u64 *__ptr = (volatile u64 *)(ptr); \
46 asm volatile("xchgq %0,%1" \ 54 asm volatile("xchgq %0,%1" \
47 : "=r" (__x) \ 55 : "=r" (__x), "+m" (*__ptr) \
48 : "m" (*__xg(ptr)), "0" (__x) \ 56 : "0" (__x) \
49 : "memory"); \ 57 : "memory"); \
50 break; \ 58 break; \
59 } \
51 default: \ 60 default: \
52 __xchg_wrong_size(); \ 61 __xchg_wrong_size(); \
53 } \ 62 } \
@@ -71,29 +80,41 @@ extern void __cmpxchg_wrong_size(void);
71 __typeof__(*(ptr)) __new = (new); \ 80 __typeof__(*(ptr)) __new = (new); \
72 switch (size) { \ 81 switch (size) { \
73 case 1: \ 82 case 1: \
74 asm volatile(lock "cmpxchgb %b1,%2" \ 83 { \
75 : "=a"(__ret) \ 84 volatile u8 *__ptr = (volatile u8 *)(ptr); \
76 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ 85 asm volatile(lock "cmpxchgb %2,%1" \
86 : "=a" (__ret), "+m" (*__ptr) \
87 : "q" (__new), "0" (__old) \
77 : "memory"); \ 88 : "memory"); \
78 break; \ 89 break; \
90 } \
79 case 2: \ 91 case 2: \
80 asm volatile(lock "cmpxchgw %w1,%2" \ 92 { \
81 : "=a"(__ret) \ 93 volatile u16 *__ptr = (volatile u16 *)(ptr); \
82 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 94 asm volatile(lock "cmpxchgw %2,%1" \
95 : "=a" (__ret), "+m" (*__ptr) \
96 : "r" (__new), "0" (__old) \
83 : "memory"); \ 97 : "memory"); \
84 break; \ 98 break; \
99 } \
85 case 4: \ 100 case 4: \
86 asm volatile(lock "cmpxchgl %k1,%2" \ 101 { \
87 : "=a"(__ret) \ 102 volatile u32 *__ptr = (volatile u32 *)(ptr); \
88 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 103 asm volatile(lock "cmpxchgl %2,%1" \
104 : "=a" (__ret), "+m" (*__ptr) \
105 : "r" (__new), "0" (__old) \
89 : "memory"); \ 106 : "memory"); \
90 break; \ 107 break; \
108 } \
91 case 8: \ 109 case 8: \
92 asm volatile(lock "cmpxchgq %1,%2" \ 110 { \
93 : "=a"(__ret) \ 111 volatile u64 *__ptr = (volatile u64 *)(ptr); \
94 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 112 asm volatile(lock "cmpxchgq %2,%1" \
113 : "=a" (__ret), "+m" (*__ptr) \
114 : "r" (__new), "0" (__old) \
95 : "memory"); \ 115 : "memory"); \
96 break; \ 116 break; \
117 } \
97 default: \ 118 default: \
98 __cmpxchg_wrong_size(); \ 119 __cmpxchg_wrong_size(); \
99 } \ 120 } \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 468145914389..0b205b8a4308 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,7 +6,7 @@
6 6
7#include <asm/required-features.h> 7#include <asm/required-features.h>
8 8
9#define NCAPINTS 9 /* N 32-bit words worth of info */ 9#define NCAPINTS 10 /* N 32-bit words worth of info */
10 10
11/* 11/*
12 * Note: If the comment begins with a quoted string, that string is used 12 * Note: If the comment begins with a quoted string, that string is used
@@ -89,7 +89,7 @@
89#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ 89#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
90#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ 90#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ 91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
92#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ 92 /* 21 available, was AMD_C1E */
93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ 93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
@@ -124,6 +124,8 @@
124#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 124#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
128#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
127#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ 129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
128 130
129/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -157,22 +159,29 @@
157 159
158/* 160/*
159 * Auxiliary flags: Linux defined - For features scattered in various 161 * Auxiliary flags: Linux defined - For features scattered in various
160 * CPUID levels like 0x6, 0xA etc 162 * CPUID levels like 0x6, 0xA etc, word 7
161 */ 163 */
162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ 164#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ 165#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
164#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ 166#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
167#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
168#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
169#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
170#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
165 171
166/* Virtualization flags: Linux defined */ 172/* Virtualization flags: Linux defined, word 8 */
167#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ 173#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
168#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ 174#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
169#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 175#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
170#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 176#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
171#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 177#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
172#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ 178#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
173#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ 179#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
174#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ 180#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
175#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ 181#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
182
183/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
184#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
176 185
177#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 186#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
178 187
@@ -194,7 +203,9 @@ extern const char * const x86_power_flags[32];
194 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ 203 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
195 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ 204 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
196 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 205 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
197 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ 206 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
207 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
208 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
198 ? 1 : \ 209 ? 1 : \
199 test_cpu_cap(c, bit)) 210 test_cpu_cap(c, bit))
200 211
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 942255310e6a..528a11e8d3e3 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -20,10 +20,10 @@ struct arch_hw_breakpoint {
20#include <linux/list.h> 20#include <linux/list.h>
21 21
22/* Available HW breakpoint length encodings */ 22/* Available HW breakpoint length encodings */
23#define X86_BREAKPOINT_LEN_X 0x00
23#define X86_BREAKPOINT_LEN_1 0x40 24#define X86_BREAKPOINT_LEN_1 0x40
24#define X86_BREAKPOINT_LEN_2 0x44 25#define X86_BREAKPOINT_LEN_2 0x44
25#define X86_BREAKPOINT_LEN_4 0x4c 26#define X86_BREAKPOINT_LEN_4 0x4c
26#define X86_BREAKPOINT_LEN_EXECUTE 0x40
27 27
28#ifdef CONFIG_X86_64 28#ifdef CONFIG_X86_64
29#define X86_BREAKPOINT_LEN_8 0x48 29#define X86_BREAKPOINT_LEN_8 0x48
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 70abda7058c8..ff2546ce7178 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper;
45/* Recognized hypervisors */ 45/* Recognized hypervisors */
46extern const struct hypervisor_x86 x86_hyper_vmware; 46extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 47extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48extern const struct hypervisor_x86 x86_hyper_xen_hvm;
48 49
49#endif 50#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0f1cf5d53dd8..f1accc625beb 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -491,6 +491,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
491 memcpy(dst->state, src->state, xstate_size); 491 memcpy(dst->state, src->state, xstate_size);
492} 492}
493 493
494extern void fpu_finit(struct fpu *fpu);
495
494#endif /* __ASSEMBLY__ */ 496#endif /* __ASSEMBLY__ */
495 497
496#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 498#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4470c9ad4a3e..29f66793cc55 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -1,6 +1,12 @@
1#ifndef _ASM_X86_INTEL_SCU_IPC_H_ 1#ifndef _ASM_X86_INTEL_SCU_IPC_H_
2#define _ASM_X86_INTEL_SCU_IPC_H_ 2#define _ASM_X86_INTEL_SCU_IPC_H_
3 3
4#define IPCMSG_VRTC 0xFA /* Set vRTC device */
5
6/* Command id associated with message IPCMSG_VRTC */
7#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
8#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
9
4/* Read single register */ 10/* Read single register */
5int intel_scu_ipc_ioread8(u16 addr, u8 *data); 11int intel_scu_ipc_ioread8(u16 addr, u8 *data);
6 12
@@ -28,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
28/* Update single register based on the mask */ 34/* Update single register based on the mask */
29int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); 35int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
30 36
31/*
32 * Indirect register read
33 * Can be used when SCCB(System Controller Configuration Block) register
34 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
35 */
36int intel_scu_ipc_register_read(u32 addr, u32 *data);
37
38/*
39 * Indirect register write
40 * Can be used when SCCB(System Controller Configuration Block) register
41 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
42 */
43int intel_scu_ipc_register_write(u32 addr, u32 data);
44
45/* Issue commands to the SCU with or without data */ 37/* Issue commands to the SCU with or without data */
46int intel_scu_ipc_simple_command(int cmd, int sub); 38int intel_scu_ipc_simple_command(int cmd, int sub);
47int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, 39int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8767d99c4f64..e2ca30092557 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,9 @@
125 */ 125 */
126#define MCE_SELF_VECTOR 0xeb 126#define MCE_SELF_VECTOR 0xeb
127 127
128/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9
130
128#define NR_VECTORS 256 131#define NR_VECTORS 256
129 132
130#define FPU_IRQ 13 133#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 006da3687cdc..396f5b5fc4d7 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -39,9 +39,11 @@ enum regnames {
39 GDB_FS, /* 14 */ 39 GDB_FS, /* 14 */
40 GDB_GS, /* 15 */ 40 GDB_GS, /* 15 */
41}; 41};
42#define GDB_ORIG_AX 41
43#define DBG_MAX_REG_NUM 16
42#define NUMREGBYTES ((GDB_GS+1)*4) 44#define NUMREGBYTES ((GDB_GS+1)*4)
43#else /* ! CONFIG_X86_32 */ 45#else /* ! CONFIG_X86_32 */
44enum regnames64 { 46enum regnames {
45 GDB_AX, /* 0 */ 47 GDB_AX, /* 0 */
46 GDB_BX, /* 1 */ 48 GDB_BX, /* 1 */
47 GDB_CX, /* 2 */ 49 GDB_CX, /* 2 */
@@ -59,15 +61,15 @@ enum regnames64 {
59 GDB_R14, /* 14 */ 61 GDB_R14, /* 14 */
60 GDB_R15, /* 15 */ 62 GDB_R15, /* 15 */
61 GDB_PC, /* 16 */ 63 GDB_PC, /* 16 */
64 GDB_PS, /* 17 */
65 GDB_CS, /* 18 */
66 GDB_SS, /* 19 */
62}; 67};
63 68#define GDB_ORIG_AX 57
64enum regnames32 { 69#define DBG_MAX_REG_NUM 20
65 GDB_PS = 34, 70/* 17 64 bit regs and 3 32 bit regs */
66 GDB_CS, 71#define NUMREGBYTES ((17 * 8) + (3 * 4))
67 GDB_SS, 72#endif /* ! CONFIG_X86_32 */
68};
69#define NUMREGBYTES ((GDB_SS+1)*4)
70#endif /* CONFIG_X86_32 */
71 73
72static inline void arch_kgdb_breakpoint(void) 74static inline void arch_kgdb_breakpoint(void)
73{ 75{
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS 24#define __KVM_HAVE_DEBUGREGS
25#define __KVM_HAVE_XSAVE
26#define __KVM_HAVE_XCRS
25 27
26/* Architectural interrupt line count. */ 28/* Architectural interrupt line count. */
27#define KVM_NR_INTERRUPTS 256 29#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
299 __u64 reserved[9]; 301 __u64 reserved[9];
300}; 302};
301 303
304/* for KVM_CAP_XSAVE */
/*
 * Buffer of 1024 __u32 words.  The layout of @region is not described
 * here; presumably it carries the vcpu's XSAVE area image -- TODO confirm
 * against the KVM API documentation.
 */
305struct kvm_xsave {
306 __u32 region[1024];
307};
308
/* Capacity of the xcrs[] array in struct kvm_xcrs */
309#define KVM_MAX_XCRS 16
310
/*
 * One register/value pair.  Presumably @xcr is the index of an extended
 * control register and @value its contents -- TODO confirm; @reserved
 * pads the 64-bit @value to an 8-byte offset.
 */
311struct kvm_xcr {
312 __u32 xcr;
313 __u32 reserved;
314 __u64 value;
315};
316
/*
 * Array of up to KVM_MAX_XCRS kvm_xcr entries.  Presumably @nr_xcrs is
 * the number of entries of @xcrs in use -- TODO confirm; the meaning of
 * @flags is not visible here.  @padding leaves 16 spare __u64 words at
 * the end of the structure.
 */
317struct kvm_xcrs {
318 __u32 nr_xcrs;
319 __u32 flags;
320 struct kvm_xcr xcrs[KVM_MAX_XCRS];
321 __u64 padding[16];
322};
323
302#endif /* _ASM_X86_KVM_H */ 324#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..51cfd730ac5d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
51#define X86EMUL_UNHANDLEABLE 1 51#define X86EMUL_UNHANDLEABLE 1
52/* Terminate emulation but return success to the caller. */ 52/* Terminate emulation but return success to the caller. */
53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ 53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
54#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ 54#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
55#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ 55#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
56#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
57
56struct x86_emulate_ops { 58struct x86_emulate_ops {
57 /* 59 /*
58 * read_std: Read bytes of standard (non-emulated/special) memory. 60 * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
92 int (*read_emulated)(unsigned long addr, 94 int (*read_emulated)(unsigned long addr,
93 void *val, 95 void *val,
94 unsigned int bytes, 96 unsigned int bytes,
97 unsigned int *error,
95 struct kvm_vcpu *vcpu); 98 struct kvm_vcpu *vcpu);
96 99
97 /* 100 /*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
104 int (*write_emulated)(unsigned long addr, 107 int (*write_emulated)(unsigned long addr,
105 const void *val, 108 const void *val,
106 unsigned int bytes, 109 unsigned int bytes,
110 unsigned int *error,
107 struct kvm_vcpu *vcpu); 111 struct kvm_vcpu *vcpu);
108 112
109 /* 113 /*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
118 const void *old, 122 const void *old,
119 const void *new, 123 const void *new,
120 unsigned int bytes, 124 unsigned int bytes,
125 unsigned int *error,
121 struct kvm_vcpu *vcpu); 126 struct kvm_vcpu *vcpu);
122 127
123 int (*pio_in_emulated)(int size, unsigned short port, void *val, 128 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
132 int seg, struct kvm_vcpu *vcpu); 137 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 138 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu); 144 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 145 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
146 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
147 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
148 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
140}; 149};
141 150
142/* Type, address-of, and value of an instruction's operand. */ 151/* Type, address-of, and value of an instruction's operand. */
143struct operand { 152struct operand {
144 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 153 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
145 unsigned int bytes; 154 unsigned int bytes;
146 unsigned long val, orig_val, *ptr; 155 unsigned long orig_val, *ptr;
156 union {
157 unsigned long val;
158 char valptr[sizeof(unsigned long) + 2];
159 };
147}; 160};
148 161
149struct fetch_cache { 162struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
186 unsigned long modrm_val; 199 unsigned long modrm_val;
187 struct fetch_cache fetch; 200 struct fetch_cache fetch;
188 struct read_cache io_read; 201 struct read_cache io_read;
202 struct read_cache mem_read;
189}; 203};
190 204
191struct x86_emulate_ctxt { 205struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
202 int interruptibility; 216 int interruptibility;
203 217
204 bool restart; /* restart string instruction after writeback */ 218 bool restart; /* restart string instruction after writeback */
219
220 int exception; /* exception that happens during emulation or -1 */
221 u32 error_code; /* error code for exception */
222 bool error_code_valid;
223 unsigned long cr2; /* faulted address in case of #PF */
224
205 /* decode cache */ 225 /* decode cache */
206 struct decode_cache decode; 226 struct decode_cache decode;
207}; 227};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..502e53f999cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h>
18 19
19#include <linux/kvm.h> 20#include <linux/kvm.h>
20#include <linux/kvm_para.h> 21#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
39 0xFFFFFF0000000000ULL) 40 0xFFFFFF0000000000ULL)
40 41
41#define INVALID_PAGE (~(hpa_t)0) 42#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE)
44
42#define UNMAPPED_GVA (~(gpa_t)0) 45#define UNMAPPED_GVA (~(gpa_t)0)
43 46
44/* KVM Hugepage definitions for x86 */ 47/* KVM Hugepage definitions for x86 */
45#define KVM_NR_PAGE_SIZES 3 48#define KVM_NR_PAGE_SIZES 3
46#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) 49#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
50#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
47#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 51#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
48#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 52#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
49#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 53#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
69 73
70#define IOPL_SHIFT 12 74#define IOPL_SHIFT 12
71 75
72#define KVM_ALIAS_SLOTS 4
73
74#define KVM_PERMILLE_MMU_PAGES 20 76#define KVM_PERMILLE_MMU_PAGES 20
75#define KVM_MIN_ALLOC_MMU_PAGES 64 77#define KVM_MIN_ALLOC_MMU_PAGES 64
76#define KVM_MMU_HASH_SHIFT 10 78#define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
241 void (*prefetch_page)(struct kvm_vcpu *vcpu, 243 void (*prefetch_page)(struct kvm_vcpu *vcpu,
242 struct kvm_mmu_page *page); 244 struct kvm_mmu_page *page);
243 int (*sync_page)(struct kvm_vcpu *vcpu, 245 int (*sync_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *sp); 246 struct kvm_mmu_page *sp, bool clear_unsync);
245 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 247 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
246 hpa_t root_hpa; 248 hpa_t root_hpa;
247 int root_level; 249 int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
301 unsigned long mmu_seq; 303 unsigned long mmu_seq;
302 } update_pte; 304 } update_pte;
303 305
304 struct i387_fxsave_struct host_fx_image; 306 struct fpu guest_fpu;
305 struct i387_fxsave_struct guest_fx_image; 307 u64 xcr0;
306 308
307 gva_t mmio_fault_cr2; 309 gva_t mmio_fault_cr2;
308 struct kvm_pio_request pio; 310 struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
360 362
361 /* fields used by HYPER-V emulation */ 363 /* fields used by HYPER-V emulation */
362 u64 hv_vapic; 364 u64 hv_vapic;
363};
364
365struct kvm_mem_alias {
366 gfn_t base_gfn;
367 unsigned long npages;
368 gfn_t target_gfn;
369#define KVM_ALIAS_INVALID 1UL
370 unsigned long flags;
371};
372 365
373#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION 366 cpumask_var_t wbinvd_dirty_mask;
374
375struct kvm_mem_aliases {
376 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
377 int naliases;
378}; 367};
379 368
380struct kvm_arch { 369struct kvm_arch {
381 struct kvm_mem_aliases *aliases;
382
383 unsigned int n_free_mmu_pages; 370 unsigned int n_free_mmu_pages;
384 unsigned int n_requested_mmu_pages; 371 unsigned int n_requested_mmu_pages;
385 unsigned int n_alloc_mmu_pages; 372 unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
533 520
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535 522
523 bool (*has_wbinvd_exit)(void);
524
536 const struct trace_print_flags *exit_reasons_str; 525 const struct trace_print_flags *exit_reasons_str;
537}; 526};
538 527
@@ -576,7 +565,6 @@ enum emulation_result {
576#define EMULTYPE_SKIP (1 << 2) 565#define EMULTYPE_SKIP (1 << 2)
577int emulate_instruction(struct kvm_vcpu *vcpu, 566int emulate_instruction(struct kvm_vcpu *vcpu,
578 unsigned long cr2, u16 error_code, int emulation_type); 567 unsigned long cr2, u16 error_code, int emulation_type);
579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 568void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 569void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
582 570
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
591int kvm_emulate_halt(struct kvm_vcpu *vcpu); 579int kvm_emulate_halt(struct kvm_vcpu *vcpu);
592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
593int emulate_clts(struct kvm_vcpu *vcpu); 581int emulate_clts(struct kvm_vcpu *vcpu);
594int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 582int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
595 unsigned long *dest);
596int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
597 unsigned long value);
598 583
599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 584void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 585int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 587int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code); 588 bool has_error_code, u32 error_code);
604 589
605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 590int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 591int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 592int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 593void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 594int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 595int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 596unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 597void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 598void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
599int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
614 600
615int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 601int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
616int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 602int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
630 616
631void kvm_inject_nmi(struct kvm_vcpu *vcpu); 617void kvm_inject_nmi(struct kvm_vcpu *vcpu);
632 618
633void fx_init(struct kvm_vcpu *vcpu); 619int fx_init(struct kvm_vcpu *vcpu);
634
635int emulator_write_emulated(unsigned long addr,
636 const void *val,
637 unsigned int bytes,
638 struct kvm_vcpu *vcpu);
639 620
640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 621void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 622void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
664int complete_pio(struct kvm_vcpu *vcpu); 645int complete_pio(struct kvm_vcpu *vcpu);
665bool kvm_check_iopl(struct kvm_vcpu *vcpu); 646bool kvm_check_iopl(struct kvm_vcpu *vcpu);
666 647
667struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
668
669static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 648static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
670{ 649{
671 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 650 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
719} 698}
720#endif 699#endif
721 700
722static inline void kvm_fx_save(struct i387_fxsave_struct *image)
723{
724 asm("fxsave (%0)":: "r" (image));
725}
726
727static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
728{
729 asm("fxrstor (%0)":: "r" (image));
730}
731
732static inline void kvm_fx_finit(void)
733{
734 asm("finit");
735}
736
737static inline u32 get_rdx_init_val(void) 701static inline u32 get_rdx_init_val(void)
738{ 702{
739 return 0x600; /* P6 family */ 703 return 0x600; /* P6 family */
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h
new file mode 100644
index 000000000000..36c93b5cc239
--- /dev/null
+++ b/arch/x86/include/asm/local64.h
@@ -0,0 +1 @@
#include <asm-generic/local64.h>
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 451d30e7f62d..16350740edf6 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -13,6 +13,32 @@
13extern int pci_mrst_init(void); 13extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table); 14int __init sfi_parse_mrtc(struct sfi_table_header *table);
15 15
16/*
17 * Medfield is the follow-up of Moorestown, it combines two chip solution into
18 * one. Other than that it also added always-on and constant tsc and lapic
19 * timers. Medfield is the platform name, and the chip name is called Penwell
20 * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
21 * identified via MSRs.
22 */
23enum mrst_cpu_type {
24 MRST_CPU_CHIP_LINCROFT = 1,
25 MRST_CPU_CHIP_PENWELL,
26};
27
28extern enum mrst_cpu_type __mrst_cpu_chip;
29static enum mrst_cpu_type mrst_identify_cpu(void)
30{
31 return __mrst_cpu_chip;
32}
33
34enum mrst_timer_options {
35 MRST_TIMER_DEFAULT,
36 MRST_TIMER_APBT_ONLY,
37 MRST_TIMER_LAPIC_APBT,
38};
39
40extern enum mrst_timer_options mrst_timer_options;
41
16#define SFI_MTMR_MAX_NUM 8 42#define SFI_MTMR_MAX_NUM 8
17#define SFI_MRTC_MAX 8 43#define SFI_MRTC_MAX 8
18 44
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..65bbec2093aa 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
20#define _EFER_LMA 10 /* Long mode active (read-only) */ 20#define _EFER_LMA 10 /* Long mode active (read-only) */
21#define _EFER_NX 11 /* No execute enable */ 21#define _EFER_NX 11 /* No execute enable */
22#define _EFER_SVME 12 /* Enable virtualization */ 22#define _EFER_SVME 12 /* Enable virtualization */
23#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
23#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ 24#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
24 25
25#define EFER_SCE (1<<_EFER_SCE) 26#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
27#define EFER_LMA (1<<_EFER_LMA) 28#define EFER_LMA (1<<_EFER_LMA)
28#define EFER_NX (1<<_EFER_NX) 29#define EFER_NX (1<<_EFER_NX)
29#define EFER_SVME (1<<_EFER_SVME) 30#define EFER_SVME (1<<_EFER_SVME)
31#define EFER_LMSLE (1<<_EFER_LMSLE)
30#define EFER_FFXSR (1<<_EFER_FFXSR) 32#define EFER_FFXSR (1<<_EFER_FFXSR)
31 33
32/* Intel MSRs. Some also available on other CPUs */ 34/* Intel MSRs. Some also available on other CPUs */
@@ -159,8 +161,6 @@
159#define MSR_K7_FID_VID_STATUS 0xc0010042 161#define MSR_K7_FID_VID_STATUS 0xc0010042
160 162
161/* K6 MSRs */ 163/* K6 MSRs */
162#define MSR_K6_EFER 0xc0000080
163#define MSR_K6_STAR 0xc0000081
164#define MSR_K6_WHCR 0xc0000082 164#define MSR_K6_WHCR 0xc0000082
165#define MSR_K6_UWCCR 0xc0000085 165#define MSR_K6_UWCCR 0xc0000085
166#define MSR_K6_EPMR 0xc0000086 166#define MSR_K6_EPMR 0xc0000086
@@ -224,12 +224,14 @@
224#define MSR_IA32_THERM_CONTROL 0x0000019a 224#define MSR_IA32_THERM_CONTROL 0x0000019a
225#define MSR_IA32_THERM_INTERRUPT 0x0000019b 225#define MSR_IA32_THERM_INTERRUPT 0x0000019b
226 226
227#define THERM_INT_LOW_ENABLE (1 << 0) 227#define THERM_INT_HIGH_ENABLE (1 << 0)
228#define THERM_INT_HIGH_ENABLE (1 << 1) 228#define THERM_INT_LOW_ENABLE (1 << 1)
229#define THERM_INT_PLN_ENABLE (1 << 24)
229 230
230#define MSR_IA32_THERM_STATUS 0x0000019c 231#define MSR_IA32_THERM_STATUS 0x0000019c
231 232
232#define THERM_STATUS_PROCHOT (1 << 0) 233#define THERM_STATUS_PROCHOT (1 << 0)
234#define THERM_STATUS_POWER_LIMIT (1 << 10)
233 235
234#define MSR_THERM2_CTL 0x0000019d 236#define MSR_THERM2_CTL 0x0000019d
235 237
@@ -239,6 +241,19 @@
239 241
240#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 242#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
241 243
244#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
245
246#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
247
248#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
249#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
250
251#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
252
253#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
254#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
255#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
256
242/* MISC_ENABLE bits: architectural */ 257/* MISC_ENABLE bits: architectural */
243#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 258#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
244#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 259#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c5bc4c2d33f5..084ef95274cd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter)
148#define rdmsr(msr, val1, val2) \ 148#define rdmsr(msr, val1, val2) \
149do { \ 149do { \
150 u64 __val = native_read_msr((msr)); \ 150 u64 __val = native_read_msr((msr)); \
151 (val1) = (u32)__val; \ 151 (void)((val1) = (u32)__val); \
152 (val2) = (u32)(__val >> 32); \ 152 (void)((val2) = (u32)(__val >> 32)); \
153} while (0) 153} while (0)
154 154
155static inline void wrmsr(unsigned msr, unsigned low, unsigned high) 155static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3f3341..932f0f86b4b7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
17 17
18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); 18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void); 19extern int check_nmi_watchdog(void);
20#if !defined(CONFIG_LOCKUP_DETECTOR)
20extern int nmi_watchdog_enabled; 21extern int nmi_watchdog_enabled;
22#endif
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 23extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int reserve_perfctr_nmi(unsigned int); 24extern int reserve_perfctr_nmi(unsigned int);
23extern void release_perfctr_nmi(unsigned int); 25extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index cd2a31dc5fb8..49c7219826f9 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -30,6 +30,7 @@
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000 32#define PCI_ROOT_NO_CRS 0x100000
33#define PCI_NOASSIGN_BARS 0x200000
33 34
34extern unsigned int pci_probe; 35extern unsigned int pci_probe;
35extern unsigned long pirq_table_addr; 36extern unsigned long pirq_table_addr;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 254883d0c7e0..6e742cc4251b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -68,8 +68,9 @@ union cpuid10_eax {
68 68
69union cpuid10_edx { 69union cpuid10_edx {
70 struct { 70 struct {
71 unsigned int num_counters_fixed:4; 71 unsigned int num_counters_fixed:5;
72 unsigned int reserved:28; 72 unsigned int bit_width_fixed:8;
73 unsigned int reserved:19;
73 } split; 74 } split;
74 unsigned int full; 75 unsigned int full;
75}; 76};
@@ -140,6 +141,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
140extern unsigned long perf_misc_flags(struct pt_regs *regs); 141extern unsigned long perf_misc_flags(struct pt_regs *regs);
141#define perf_misc_flags(regs) perf_misc_flags(regs) 142#define perf_misc_flags(regs) perf_misc_flags(regs)
142 143
144#include <asm/stacktrace.h>
145
146/*
147 * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
148 * and the comment with PERF_EFLAGS_EXACT.
149 */
150#define perf_arch_fetch_caller_regs(regs, __ip) { \
151 (regs)->ip = (__ip); \
152 (regs)->bp = caller_frame_pointer(); \
153 (regs)->cs = __KERNEL_CS; \
154 regs->flags = 0; \
155}
156
143#else 157#else
144static inline void init_hw_perf_events(void) { } 158static inline void init_hw_perf_events(void) { }
145static inline void perf_events_lapic_init(void) { } 159static inline void perf_events_lapic_init(void) { }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 64a8ebff06fc..def500776b16 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -19,7 +19,6 @@
19#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ 19#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) 20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18) 21#define ARCH_P4_MAX_CCCR (18)
22#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
23 22
24#define P4_ESCR_EVENT_MASK 0x7e000000U 23#define P4_ESCR_EVENT_MASK 0x7e000000U
25#define P4_ESCR_EVENT_SHIFT 25 24#define P4_ESCR_EVENT_SHIFT 25
@@ -71,10 +70,6 @@
71#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) 70#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
72#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) 71#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
73 72
74/* Custom bits in reerved CCCR area */
75#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
76
77
78/* Non HT mask */ 73/* Non HT mask */
79#define P4_CCCR_MASK \ 74#define P4_CCCR_MASK \
80 (P4_CCCR_OVF | \ 75 (P4_CCCR_OVF | \
@@ -106,8 +101,7 @@
106 * ESCR and CCCR but rather an only packed value should 101 * ESCR and CCCR but rather an only packed value should
107 * be unpacked and written to a proper addresses 102 * be unpacked and written to a proper addresses
108 * 103 *
109 * the base idea is to pack as much info as 104 * the base idea is to pack as much info as possible
110 * possible
111 */ 105 */
112#define p4_config_pack_escr(v) (((u64)(v)) << 32) 106#define p4_config_pack_escr(v) (((u64)(v)) << 32)
113#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) 107#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
@@ -130,8 +124,6 @@
130 t; \ 124 t; \
131 }) 125 })
132 126
133#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
134
135#define P4_CONFIG_HT_SHIFT 63 127#define P4_CONFIG_HT_SHIFT 63
136#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) 128#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
137 129
@@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
214 return escr; 206 return escr;
215} 207}
216 208
209/*
210 * This are the events which should be used in "Event Select"
211 * field of ESCR register, they are like unique keys which allow
212 * the kernel to determinate which CCCR and COUNTER should be
213 * used to track an event
214 */
217enum P4_EVENTS { 215enum P4_EVENTS {
218 P4_EVENT_TC_DELIVER_MODE, 216 P4_EVENT_TC_DELIVER_MODE,
219 P4_EVENT_BPU_FETCH_REQUEST, 217 P4_EVENT_BPU_FETCH_REQUEST,
@@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES {
561 * a caller should use P4_ESCR_EMASK_NAME helper to 559 * a caller should use P4_ESCR_EMASK_NAME helper to
562 * pick the EventMask needed, for example 560 * pick the EventMask needed, for example
563 * 561 *
564 * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD) 562 * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
565 */ 563 */
566enum P4_ESCR_EMASKS { 564enum P4_ESCR_EMASKS {
567 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), 565 P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
@@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS {
753 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), 751 P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
754}; 752};
755 753
756/* P4 PEBS: stale for a while */ 754/*
757#define P4_PEBS_METRIC_MASK 0x00001fffU 755 * P4 PEBS specifics (Replay Event only)
758#define P4_PEBS_UOB_TAG 0x01000000U 756 *
759#define P4_PEBS_ENABLE 0x02000000U 757 * Format (bits):
760 758 * 0-6: metric from P4_PEBS_METRIC enum
761/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */ 759 * 7 : reserved
762#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001 760 * 8 : reserved
763#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002 761 * 9-11 : reserved
764#define P4_PEBS__dtlb_load_miss_retired 0x3000004 762 *
765#define P4_PEBS__dtlb_store_miss_retired 0x3000004 763 * Note we have UOP and PEBS bits reserved for now
766#define P4_PEBS__dtlb_all_miss_retired 0x3000004 764 * just in case if we will need them once
767#define P4_PEBS__tagged_mispred_branch 0x3018000 765 */
768#define P4_PEBS__mob_load_replay_retired 0x3000200 766#define P4_PEBS_CONFIG_ENABLE (1 << 7)
769#define P4_PEBS__split_load_retired 0x3000400 767#define P4_PEBS_CONFIG_UOP_TAG (1 << 8)
770#define P4_PEBS__split_store_retired 0x3000400 768#define P4_PEBS_CONFIG_METRIC_MASK 0x3f
771 769#define P4_PEBS_CONFIG_MASK 0xff
772#define P4_VERT__1stl_cache_load_miss_retired 0x0000001 770
773#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001 771/*
774#define P4_VERT__dtlb_load_miss_retired 0x0000001 772 * mem: Only counters MSR_IQ_COUNTER4 (16) and
775#define P4_VERT__dtlb_store_miss_retired 0x0000002 773 * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
776#define P4_VERT__dtlb_all_miss_retired 0x0000003 774 */
777#define P4_VERT__tagged_mispred_branch 0x0000010 775#define P4_PEBS_ENABLE 0x02000000U
778#define P4_VERT__mob_load_replay_retired 0x0000001 776#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U
779#define P4_VERT__split_load_retired 0x0000001 777
780#define P4_VERT__split_store_retired 0x0000002 778#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
781 779#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK)
782enum P4_CACHE_EVENTS { 780
783 P4_CACHE__NONE, 781#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask))
784 782
785 P4_CACHE__1stl_cache_load_miss_retired, 783enum P4_PEBS_METRIC {
786 P4_CACHE__2ndl_cache_load_miss_retired, 784 P4_PEBS_METRIC__none,
787 P4_CACHE__dtlb_load_miss_retired, 785
788 P4_CACHE__dtlb_store_miss_retired, 786 P4_PEBS_METRIC__1stl_cache_load_miss_retired,
789 P4_CACHE__itlb_reference_hit, 787 P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
790 P4_CACHE__itlb_reference_miss, 788 P4_PEBS_METRIC__dtlb_load_miss_retired,
791 789 P4_PEBS_METRIC__dtlb_store_miss_retired,
792 P4_CACHE__MAX 790 P4_PEBS_METRIC__dtlb_all_miss_retired,
791 P4_PEBS_METRIC__tagged_mispred_branch,
792 P4_PEBS_METRIC__mob_load_replay_retired,
793 P4_PEBS_METRIC__split_load_retired,
794 P4_PEBS_METRIC__split_store_retired,
795
796 P4_PEBS_METRIC__max
793}; 797};
794 798
795#endif /* PERF_EVENT_P4_H */ 799#endif /* PERF_EVENT_P4_H */
800
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8ee..325b7bdbebaa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -762,6 +762,7 @@ extern void init_c1e_mask(void);
762extern unsigned long boot_option_idle_override; 762extern unsigned long boot_option_idle_override;
763extern unsigned long idle_halt; 763extern unsigned long idle_halt;
764extern unsigned long idle_nomwait; 764extern unsigned long idle_nomwait;
765extern bool c1e_detected;
765 766
766/* 767/*
767 * on systems with caches, caches must be flashed as the absolute 768 * on systems with caches, caches must be flashed as the absolute
@@ -1025,4 +1026,24 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
1025 return ratio; 1026 return ratio;
1026} 1027}
1027 1028
1029/*
1030 * AMD errata checking
1031 */
1032#ifdef CONFIG_CPU_SUP_AMD
1033extern const int amd_erratum_383[];
1034extern const int amd_erratum_400[];
1035extern bool cpu_has_amd_erratum(const int *);
1036
1037#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
1038#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
1039#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
1040 ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
1041#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
1042#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
1043#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
1044
1045#else
1046#define cpu_has_amd_erratum(x) (false)
1047#endif /* CONFIG_CPU_SUP_AMD */
1048
1028#endif /* _ASM_X86_PROCESSOR_H */ 1049#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 64cf2d24fad1..6c7fc25f2c34 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -84,5 +84,7 @@
84#define REQUIRED_MASK5 0 84#define REQUIRED_MASK5 0
85#define REQUIRED_MASK6 0 85#define REQUIRED_MASK6 0
86#define REQUIRED_MASK7 0 86#define REQUIRED_MASK7 0
87#define REQUIRED_MASK8 0
88#define REQUIRED_MASK9 0
87 89
88#endif /* _ASM_X86_REQUIRED_FEATURES_H */ 90#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 606ede126972..d1e41b0f9b60 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem)
118{ 118{
119 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
120 LOCK_PREFIX _ASM_INC "(%1)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
121 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001 */
122 " jns 1f\n" 122 " jns 1f\n"
123 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
124 "1:\n\t" 124 "1:\n\t"
@@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 157{
158 rwsem_count_t tmp; 158 rwsem_count_t tmp;
159
160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
161 asm volatile("# beginning down_write\n\t" 159 asm volatile("# beginning down_write\n\t"
162 LOCK_PREFIX " xadd %1,(%2)\n\t" 160 LOCK_PREFIX " xadd %1,(%2)\n\t"
163 /* subtract 0x0000ffff, returns the old value */ 161 /* adds 0xffff0001, returns the old value */
164 " test %1,%1\n\t" 162 " test %1,%1\n\t"
165 /* was the count 0 before? */ 163 /* was the count 0 before? */
166 " jz 1f\n" 164 " jz 1f\n"
@@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
168 "1:\n" 166 "1:\n"
169 "# ending down_write" 167 "# ending down_write"
170 : "+m" (sem->count), "=d" (tmp) 168 : "+m" (sem->count), "=d" (tmp)
171 : "a" (sem), "1" (tmp) 169 : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS)
172 : "memory", "cc"); 170 : "memory", "cc");
173} 171}
174 172
@@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
195 */ 193 */
196static inline void __up_read(struct rw_semaphore *sem) 194static inline void __up_read(struct rw_semaphore *sem)
197{ 195{
198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; 196 rwsem_count_t tmp;
199 asm volatile("# beginning __up_read\n\t" 197 asm volatile("# beginning __up_read\n\t"
200 LOCK_PREFIX " xadd %1,(%2)\n\t" 198 LOCK_PREFIX " xadd %1,(%2)\n\t"
201 /* subtracts 1, returns the old value */ 199 /* subtracts 1, returns the old value */
202 " jns 1f\n\t" 200 " jns 1f\n\t"
203 " call call_rwsem_wake\n" 201 " call call_rwsem_wake\n" /* expects old value in %edx */
204 "1:\n" 202 "1:\n"
205 "# ending __up_read\n" 203 "# ending __up_read\n"
206 : "+m" (sem->count), "=d" (tmp) 204 : "+m" (sem->count), "=d" (tmp)
207 : "a" (sem), "1" (tmp) 205 : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS)
208 : "memory", "cc"); 206 : "memory", "cc");
209} 207}
210 208
@@ -216,10 +214,9 @@ static inline void __up_write(struct rw_semaphore *sem)
216 rwsem_count_t tmp; 214 rwsem_count_t tmp;
217 asm volatile("# beginning __up_write\n\t" 215 asm volatile("# beginning __up_write\n\t"
218 LOCK_PREFIX " xadd %1,(%2)\n\t" 216 LOCK_PREFIX " xadd %1,(%2)\n\t"
219 /* tries to transition 217 /* subtracts 0xffff0001, returns the old value */
220 0xffff0001 -> 0x00000000 */ 218 " jns 1f\n\t"
221 " jz 1f\n" 219 " call call_rwsem_wake\n" /* expects old value in %edx */
222 " call call_rwsem_wake\n"
223 "1:\n\t" 220 "1:\n\t"
224 "# ending __up_write\n" 221 "# ending __up_write\n"
225 : "+m" (sem->count), "=d" (tmp) 222 : "+m" (sem->count), "=d" (tmp)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 86b1506f4179..ef292c792d74 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align);
82 * executable.) 82 * executable.)
83 */ 83 */
84#define RESERVE_BRK(name,sz) \ 84#define RESERVE_BRK(name,sz) \
85 static void __section(.discard) __used \ 85 static void __section(.discard.text) __used \
86 __brk_reservation_fn_##name##__(void) { \ 86 __brk_reservation_fn_##name##__(void) { \
87 asm volatile ( \ 87 asm volatile ( \
88 ".pushsection .brk_reservation,\"aw\",@nobits;" \ 88 ".pushsection .brk_reservation,\"aw\",@nobits;" \
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 4dab78edbad9..2b16a2ad23dc 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -1,6 +1,13 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
1#ifndef _ASM_X86_STACKTRACE_H 6#ifndef _ASM_X86_STACKTRACE_H
2#define _ASM_X86_STACKTRACE_H 7#define _ASM_X86_STACKTRACE_H
3 8
9#include <linux/uaccess.h>
10
4extern int kstack_depth_to_print; 11extern int kstack_depth_to_print;
5 12
6struct thread_info; 13struct thread_info;
@@ -42,4 +49,46 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
42 unsigned long *stack, unsigned long bp, 49 unsigned long *stack, unsigned long bp,
43 const struct stacktrace_ops *ops, void *data); 50 const struct stacktrace_ops *ops, void *data);
44 51
52#ifdef CONFIG_X86_32
53#define STACKSLOTS_PER_LINE 8
54#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
55#else
56#define STACKSLOTS_PER_LINE 4
57#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
58#endif
59
60extern void
61show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
62 unsigned long *stack, unsigned long bp, char *log_lvl);
63
64extern void
65show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
66 unsigned long *sp, unsigned long bp, char *log_lvl);
67
68extern unsigned int code_bytes;
69
70/* The form of the top of the frame on the stack */
71struct stack_frame {
72 struct stack_frame *next_frame;
73 unsigned long return_address;
74};
75
76struct stack_frame_ia32 {
77 u32 next_frame;
78 u32 return_address;
79};
80
/*
 * Read the current frame-pointer register through get_bp() and return
 * it as an integer.  When CONFIG_FRAME_POINTER is set, one next_frame
 * link is followed first, so the value returned is the saved frame
 * pointer stored in the current frame rather than the register itself.
 */
static inline unsigned long caller_frame_pointer(void)
{
	struct stack_frame *fp;

	get_bp(fp);
#ifdef CONFIG_FRAME_POINTER
	fp = fp->next_frame;
#endif
	return (unsigned long)fp;
}
93
45#endif /* _ASM_X86_STACKTRACE_H */ 94#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e7f4d33c55ed..33ecc3ea8782 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -457,4 +457,11 @@ static __always_inline void rdtsc_barrier(void)
457 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 457 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
458} 458}
459 459
460/*
461 * We handle most unaligned accesses in hardware. On the other hand
462 * unaligned DMA can be quite expensive on some Nehalem processors.
463 *
464 * Based on this we disable the IP header alignment in network drivers.
465 */
466#define NET_IP_ALIGN 0
460#endif /* _ASM_X86_SYSTEM_H */ 467#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
257#define EXIT_REASON_IO_INSTRUCTION 30 257#define EXIT_REASON_IO_INSTRUCTION 30
258#define EXIT_REASON_MSR_READ 31 258#define EXIT_REASON_MSR_READ 31
259#define EXIT_REASON_MSR_WRITE 32 259#define EXIT_REASON_MSR_WRITE 32
260#define EXIT_REASON_INVALID_STATE 33
260#define EXIT_REASON_MWAIT_INSTRUCTION 36 261#define EXIT_REASON_MWAIT_INSTRUCTION 36
261#define EXIT_REASON_MONITOR_INSTRUCTION 39 262#define EXIT_REASON_MONITOR_INSTRUCTION 39
262#define EXIT_REASON_PAUSE_INSTRUCTION 40 263#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
266#define EXIT_REASON_EPT_VIOLATION 48 267#define EXIT_REASON_EPT_VIOLATION 48
267#define EXIT_REASON_EPT_MISCONFIG 49 268#define EXIT_REASON_EPT_MISCONFIG 49
268#define EXIT_REASON_WBINVD 54 269#define EXIT_REASON_WBINVD 54
270#define EXIT_REASON_XSETBV 55
269 271
270/* 272/*
271 * Interruption-information format 273 * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
375#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 377#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
376#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 378#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
377 379
380#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
381#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
382
378#define VMX_EPT_DEFAULT_GAW 3 383#define VMX_EPT_DEFAULT_GAW 3
379#define VMX_EPT_MAX_GAW 0x4 384#define VMX_EPT_MAX_GAW 0x4
380#define VMX_EPT_MT_EPTE_SHIFT 3 385#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 519b54327d75..baa579c8e038 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -142,6 +142,7 @@ struct x86_cpuinit_ops {
142 * @set_wallclock: set time back to HW clock 142 * @set_wallclock: set time back to HW clock
143 * @is_untracked_pat_range exclude from PAT logic 143 * @is_untracked_pat_range exclude from PAT logic
144 * @nmi_init enable NMI on cpus 144 * @nmi_init enable NMI on cpus
145 * @i8042_detect pre-detect if i8042 controller exists
145 */ 146 */
146struct x86_platform_ops { 147struct x86_platform_ops {
147 unsigned long (*calibrate_tsc)(void); 148 unsigned long (*calibrate_tsc)(void);
@@ -150,6 +151,7 @@ struct x86_platform_ops {
150 void (*iommu_shutdown)(void); 151 void (*iommu_shutdown)(void);
151 bool (*is_untracked_pat_range)(u64 start, u64 end); 152 bool (*is_untracked_pat_range)(u64 start, u64 end);
152 void (*nmi_init)(void); 153 void (*nmi_init)(void);
154 int (*i8042_detect)(void);
153}; 155};
154 156
155extern struct x86_init_ops x86_init; 157extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4a9fa6..7fda040a76cd 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
417 return _hypercall2(int, nmi_op, op, arg); 417 return _hypercall2(int, nmi_op, op, arg);
418} 418}
419 419
420static inline unsigned long __must_check
421HYPERVISOR_hvm_op(int op, void *arg)
422{
423 return _hypercall2(unsigned long, hvm_op, op, arg);
424}
425
420static inline void 426static inline void
421MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 427MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
422{ 428{
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 30dfc81804d5..06acdbd7570a 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,6 +13,12 @@
13 13
14#define FXSAVE_SIZE 512 14#define FXSAVE_SIZE 512
15 15
16#define XSAVE_HDR_SIZE 64
17#define XSAVE_HDR_OFFSET FXSAVE_SIZE
18
19#define XSAVE_YMM_SIZE 256
20#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
21
16/* 22/*
17 * These are the features that the OS can handle currently. 23 * These are the features that the OS can handle currently.
18 */ 24 */
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 2e837f5080fe..fb7a5f052e2b 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -145,6 +145,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
145 percpu_entry->states[cx->index].eax = cx->address; 145 percpu_entry->states[cx->index].eax = cx->address;
146 percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; 146 percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
147 } 147 }
148
149 /*
150 * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
151 * then we should skip checking BM_STS for this C-state.
152 * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
153 */
154 if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
155 cx->bm_sts_skip = 1;
156
148 return retval; 157 return retval;
149} 158}
150EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 159EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
104 movl %eax, %ecx 104 movl %eax, %ecx
105 orl %edx, %ecx 105 orl %edx, %ecx
106 jz 1f 106 jz 1f
107 movl $0xc0000080, %ecx 107 movl $MSR_EFER, %ecx
108 wrmsr 108 wrmsr
1091: 1091:
110 110
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b91..33cec152070d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
2 * sleep.c - x86-specific ACPI sleep support. 2 * sleep.c - x86-specific ACPI sleep support.
3 * 3 *
4 * Copyright (C) 2001-2003 Patrick Mochel 4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
6 */ 6 */
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
@@ -157,9 +157,14 @@ static int __init acpi_sleep_setup(char *str)
157#ifdef CONFIG_HIBERNATION 157#ifdef CONFIG_HIBERNATION
158 if (strncmp(str, "s4_nohwsig", 10) == 0) 158 if (strncmp(str, "s4_nohwsig", 10) == 0)
159 acpi_no_s4_hw_signature(); 159 acpi_no_s4_hw_signature();
160 if (strncmp(str, "s4_nonvs", 8) == 0) 160 if (strncmp(str, "s4_nonvs", 8) == 0) {
161 acpi_s4_no_nvs(); 161 pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
162 "please use acpi_sleep=nonvs instead");
163 acpi_nvs_nosave();
164 }
162#endif 165#endif
166 if (strncmp(str, "nonvs", 5) == 0)
167 acpi_nvs_nosave();
163 if (strncmp(str, "old_ordering", 12) == 0) 168 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 169 acpi_old_suspend_ordering();
165 str = strchr(str, ','); 170 str = strchr(str, ',');
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..fa044e1e30a2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2572static int amd_iommu_domain_has_cap(struct iommu_domain *domain, 2572static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2573 unsigned long cap) 2573 unsigned long cap)
2574{ 2574{
2575 switch (cap) {
2576 case IOMMU_CAP_CACHE_COHERENCY:
2577 return 1;
2578 }
2579
2575 return 0; 2580 return 0;
2576} 2581}
2577 2582
@@ -2609,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
2609 2614
2610 pt_domain->mode |= PAGE_MODE_NONE; 2615 pt_domain->mode |= PAGE_MODE_NONE;
2611 2616
2612 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2617 for_each_pci_dev(dev) {
2613
2614 if (!check_device(&dev->dev)) 2618 if (!check_device(&dev->dev))
2615 continue; 2619 continue;
2616 2620
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
43 43
44#include <asm/fixmap.h> 44#include <asm/fixmap.h>
45#include <asm/apb_timer.h> 45#include <asm/apb_timer.h>
46#include <asm/mrst.h>
46 47
47#define APBT_MASK CLOCKSOURCE_MASK(32) 48#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22 49#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150 50#define APBT_CLOCKEVENT_RATING 110
50#define APBT_CLOCKSOURCE_RATING 250 51#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200 52#define APBT_MIN_DELTA_USEC 200
52 53
@@ -83,8 +84,6 @@ struct apbt_dev {
83 char name[10]; 84 char name[10];
84}; 85};
85 86
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); 87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89 88
90#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
195}; 194};
196 195
197/* 196/*
198 * if user does not want to use per CPU apb timer, just give it a lower rating
199 * than local apic timer and skip the late per cpu timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
211 pr_warning("X86 MRST timer option %s not recognised"
212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
219
220/*
221 * start count down from 0xffff_ffff. this is done by toggling the enable bit 197 * start count down from 0xffff_ffff. this is done by toggling the enable bit
222 * then load initial load count to ~0. 198 * then load initial load count to ~0.
223 */ 199 */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
335 adev->num = smp_processor_id(); 311 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 312 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337 313
338 if (disable_apbt_percpu) { 314 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 315 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt; 316 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n", 317 printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
429 405
430static __init int apbt_late_init(void) 406static __init int apbt_late_init(void)
431{ 407{
432 if (disable_apbt_percpu || !apb_timer_block_enabled) 408 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
409 !apb_timer_block_enabled)
433 return 0; 410 return 0;
434 /* This notifier should be called after workqueue is ready */ 411 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20); 412 hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
450 int timer_num; 427 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); 428 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452 429
430 BUG_ON(!apbt_virt_address);
431
453 timer_num = adev->num; 432 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n", 433 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode); 434 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
676 } 655 }
677#ifdef CONFIG_SMP 656#ifdef CONFIG_SMP
678 /* kernel cmdline disable apb timer, so we will use lapic timers */ 657 /* kernel cmdline disable apb timer, so we will use lapic timers */
679 if (disable_apbt_percpu) { 658 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n"); 659 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return; 660 return;
682 } 661 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
280 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
281 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
282 */ 282 */
283 u32 agp_aper_base = 0, agp_aper_order = 0; 283 u32 agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0; 284 int i, fix, slot, valid_agp = 0;
285 u32 ctl; 285 u32 ctl;
286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
291 return; 291 return;
292 292
293 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); 294 search_agp_bridge(&agp_aper_order, &valid_agp);
295 295
296 fix = 0; 296 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
6ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
7obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
8endif
9obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
10
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 11obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 12obj-$(CONFIG_SMP) += ipi.o
8 13
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c02cc692985c..980508c79082 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -460,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
460} 460}
461 461
462/* 462/*
463 * Setup the local APIC timer for this CPU. Copy the initilized values 463 * Setup the local APIC timer for this CPU. Copy the initialized values
464 * of the boot CPU and register the clock event in the framework. 464 * of the boot CPU and register the clock event in the framework.
465 */ 465 */
466static void __cpuinit setup_APIC_timer(void) 466static void __cpuinit setup_APIC_timer(void)
@@ -921,7 +921,7 @@ void disable_local_APIC(void)
921 unsigned int value; 921 unsigned int value;
922 922
923 /* APIC hasn't been mapped yet */ 923 /* APIC hasn't been mapped yet */
924 if (!apic_phys) 924 if (!x2apic_mode && !apic_phys)
925 return; 925 return;
926 926
927 clear_local_APIC(); 927 clear_local_APIC();
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
129 * GSI override for ES7000 platforms. 129 * GSI override for ES7000 platforms.
130 */ 130 */
131 131
132static unsigned int base;
133 132
134static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 133static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
135{ 134{
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
1/*
2 * HW NMI watchdog support
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * Arch specific calls to support NMI watchdog
7 *
8 * Bits copied from original nmi.c file
9 *
10 */
11#include <asm/apic.h>
12
13#include <linux/cpumask.h>
14#include <linux/kdebug.h>
15#include <linux/notifier.h>
16#include <linux/kprobes.h>
17#include <linux/nmi.h>
18#include <linux/module.h>
19
20/* For reliability, we're prepared to waste bits here. */
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22
23u64 hw_nmi_get_sample_period(void)
24{
25 return (u64)(cpu_khz) * 1000 * 60;
26}
27
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void)
30{
31 int i;
32
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34
35 printk(KERN_INFO "sending NMI to all CPUs:\n");
36 apic->send_IPI_all(NMI_VECTOR);
37
38 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
39 for (i = 0; i < 10 * 1000; i++) {
40 if (cpumask_empty(to_cpumask(backtrace_mask)))
41 break;
42 mdelay(1);
43 }
44}
45
46static int __kprobes
47arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
48 unsigned long cmd, void *__args)
49{
50 struct die_args *args = __args;
51 struct pt_regs *regs;
52 int cpu = smp_processor_id();
53
54 switch (cmd) {
55 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break;
58
59 default:
60 return NOTIFY_DONE;
61 }
62
63 regs = args->regs;
64
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
67
68 arch_spin_lock(&lock);
69 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
70 show_regs(regs);
71 dump_stack();
72 arch_spin_unlock(&lock);
73 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
74 return NOTIFY_STOP;
75 }
76
77 return NOTIFY_DONE;
78}
79
80static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL,
83 .priority = 1
84};
85
86static int __init register_trigger_all_cpu_backtrace(void)
87{
88 register_die_notifier(&backtrace_notifier);
89 return 0;
90}
91early_initcall(register_trigger_all_cpu_backtrace);
92#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..4dc0084ec1b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3397 3397
3398 cfg = desc->chip_data; 3398 cfg = desc->chip_data;
3399 3399
3400 read_msi_msg_desc(desc, &msg); 3400 get_cached_msi_msg_desc(desc, &msg);
3401 3401
3402 msg.data &= ~MSI_DATA_VECTOR_MASK; 3402 msg.data &= ~MSI_DATA_VECTOR_MASK;
3403 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3403 msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
401 int cpu = smp_processor_id(); 401 int cpu = smp_processor_id();
402 int rc = 0; 402 int rc = 0;
403 403
404 /* check for other users first */
405 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
406 == NOTIFY_STOP) {
407 rc = 1;
408 touched = 1;
409 }
410
411 sum = get_timer_irqs(cpu); 404 sum = get_timer_irqs(cpu);
412 405
413 if (__get_cpu_var(nmi_touch)) { 406 if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3ac..4c9c67bf09b7 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
140 * is now the way life works). 140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return). 141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend. 142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> 143 * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
144 * modified by sfr). 144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid 145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr). 146 * <andy_henroid@yahoo.com> fixed by sfr).
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
12nostackp := $(call cc-option, -fno-stack-protector) 12nostackp := $(call cc-option, -fno-stack-protector)
13CFLAGS_common.o := $(nostackp) 13CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o mshyperv.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..60a57b13082d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
466 } 466 }
467 467
468 } 468 }
469 if (c->x86 == 0x10 || c->x86 == 0x11) 469 if (c->x86 >= 0x10)
470 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 470 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
471 471
472 /* get apicid instead of initial apic id from cpuid */ 472 /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
529 num_cache_leaves = 3; 529 num_cache_leaves = 3;
530 } 530 }
531 531
532 if (c->x86 >= 0xf && c->x86 <= 0x11) 532 if (c->x86 >= 0xf)
533 set_cpu_cap(c, X86_FEATURE_K8); 533 set_cpu_cap(c, X86_FEATURE_K8);
534 534
535 if (cpu_has_xmm2) { 535 if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
546 fam10h_check_enable_mmcfg(); 546 fam10h_check_enable_mmcfg();
547 } 547 }
548 548
549 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { 549 if (c == &boot_cpu_data && c->x86 >= 0xf) {
550 unsigned long long tseg; 550 unsigned long long tseg;
551 551
552 /* 552 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
609}; 609};
610 610
611cpu_dev_register(amd_cpu_dev); 611cpu_dev_register(amd_cpu_dev);
612
613/*
614 * AMD errata checking
615 *
616 * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
617 * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
618 * have an OSVW id assigned, which it takes as first argument. Both take a
619 * variable number of family-specific model-stepping ranges created by
620 * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
621 * int[] in arch/x86/include/asm/processor.h.
622 *
623 * Example:
624 *
625 * const int amd_erratum_319[] =
626 * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
627 * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
628 * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
629 */
630
631const int amd_erratum_400[] =
632 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
633 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
634EXPORT_SYMBOL_GPL(amd_erratum_400);
635
636const int amd_erratum_383[] =
637 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
638EXPORT_SYMBOL_GPL(amd_erratum_383);
639
640bool cpu_has_amd_erratum(const int *erratum)
641{
642 struct cpuinfo_x86 *cpu = &current_cpu_data;
643 int osvw_id = *erratum++;
644 u32 range;
645 u32 ms;
646
647 /*
648 * If called early enough that current_cpu_data hasn't been initialized
649 * yet, fall back to boot_cpu_data.
650 */
651 if (cpu->x86 == 0)
652 cpu = &boot_cpu_data;
653
654 if (cpu->x86_vendor != X86_VENDOR_AMD)
655 return false;
656
657 if (osvw_id >= 0 && osvw_id < 65536 &&
658 cpu_has(cpu, X86_FEATURE_OSVW)) {
659 u64 osvw_len;
660
661 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
662 if (osvw_id < osvw_len) {
663 u64 osvw_bits;
664
665 rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
666 osvw_bits);
667 return osvw_bits & (1ULL << (osvw_id & 0x3f));
668 }
669 }
670
671 /* OSVW unavailable or ID unknown, match family-model-stepping range */
672 ms = (cpu->x86_model << 8) | cpu->x86_mask;
673 while ((range = *erratum++))
674 if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
675 (ms >= AMD_MODEL_RANGE_START(range)) &&
676 (ms <= AMD_MODEL_RANGE_END(range)))
677 return true;
678
679 return false;
680}
681
682EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..f10273138382 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -551,6 +551,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
551 c->x86_capability[4] = excap; 551 c->x86_capability[4] = excap;
552 } 552 }
553 553
554 /* Additional Intel-defined flags: level 0x00000007 */
555 if (c->cpuid_level >= 0x00000007) {
556 u32 eax, ebx, ecx, edx;
557
558 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
559
560 if (eax > 0)
561 c->x86_capability[9] = ebx;
562 }
563
554 /* AMD-defined flags: level 0x80000001 */ 564 /* AMD-defined flags: level 0x80000001 */
555 xlvl = cpuid_eax(0x80000000); 565 xlvl = cpuid_eax(0x80000000);
556 c->extended_cpuid_level = xlvl; 566 c->extended_cpuid_level = xlvl;
@@ -576,6 +586,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
576 if (c->extended_cpuid_level >= 0x80000007) 586 if (c->extended_cpuid_level >= 0x80000007)
577 c->x86_power = cpuid_edx(0x80000007); 587 c->x86_power = cpuid_edx(0x80000007);
578 588
589 init_scattered_cpuid_features(c);
579} 590}
580 591
581static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) 592static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +742,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
731 742
732 get_model_name(c); /* Default name */ 743 get_model_name(c); /* Default name */
733 744
734 init_scattered_cpuid_features(c);
735 detect_nopl(c); 745 detect_nopl(c);
736} 746}
737 747
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1d3cddaa40ee..246cd3afbb5f 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <trace/events/power.h>
38 37
39#include <linux/acpi.h> 38#include <linux/acpi.h>
40#include <linux/io.h> 39#include <linux/io.h>
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
324 } 323 }
325 } 324 }
326 325
327 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
328
329 switch (data->cpu_feature) { 326 switch (data->cpu_feature) {
330 case SYSTEM_INTEL_MSR_CAPABLE: 327 case SYSTEM_INTEL_MSR_CAPABLE:
331 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -351,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
351 348
352 freqs.old = perf->states[perf->state].core_frequency * 1000; 349 freqs.old = perf->states[perf->state].core_frequency * 1000;
353 freqs.new = data->freq_table[next_state].frequency; 350 freqs.new = data->freq_table[next_state].frequency;
354 for_each_cpu(i, cmd.mask) { 351 for_each_cpu(i, policy->cpus) {
355 freqs.cpu = i; 352 freqs.cpu = i;
356 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
357 } 354 }
@@ -367,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
367 } 364 }
368 } 365 }
369 366
370 for_each_cpu(i, cmd.mask) { 367 for_each_cpu(i, policy->cpus) {
371 freqs.cpu = i; 368 freqs.cpu = i;
372 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
373 } 370 }
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
169 * Low Level chipset interface * 169 * Low Level chipset interface *
170 ****************************************************************/ 170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = { 171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, 172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, 174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 PCI_ANY_ID, PCI_ANY_ID },
176 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
177 PCI_ANY_ID, PCI_ANY_ID },
178 { 0, }, 175 { 0, },
179}; 176};
180 177
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
199 } 196 }
200 197
201 /* detect which companion chip is used */ 198 /* detect which companion chip is used */
202 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { 199 for_each_pci_dev(gx_pci) {
203 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) 200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
204 return gx_pci; 201 return gx_pci;
205 } 202 }
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
426} 426}
427 427
428 428
429static int __init longhaul_get_ranges(void) 429static int __cpuinit longhaul_get_ranges(void)
430{ 430{
431 unsigned int i, j, k = 0; 431 unsigned int i, j, k = 0;
432 unsigned int ratio; 432 unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
530} 530}
531 531
532 532
533static void __init longhaul_setup_voltagescaling(void) 533static void __cpuinit longhaul_setup_voltagescaling(void)
534{ 534{
535 union msr_longhaul longhaul; 535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid; 536 struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
784 return 0; 784 return 0;
785} 785}
786 786
787static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{ 788{
789 struct cpuinfo_x86 *c = &cpu_data(0); 789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL; 790 char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_mults[16] = { 59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
75 -1, /* 1111 -> RESERVED */ 75 -1, /* 1111 -> RESERVED */
76}; 76};
77 77
78static const int __initdata samuel1_eblcr[16] = { 78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */ 79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */ 80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */ 81 40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
97/* 97/*
98 * VIA C3 Samuel2 Stepping 1->15 98 * VIA C3 Samuel2 Stepping 1->15
99 */ 99 */
100static const int __initdata samuel2_eblcr[16] = { 100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */ 101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */ 102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */ 103 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_mults[16] = { 122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
138 120, /* 1111 -> 12.0x */ 138 120, /* 1111 -> 12.0x */
139}; 139};
140 140
141static const int __initdata ezra_eblcr[16] = { 141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */ 142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */ 143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */ 144 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_mults[32] = { 163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
196 -1, /* 1111 -> RESERVED (12.0x) */ 196 -1, /* 1111 -> RESERVED (12.0x) */
197}; 197};
198 198
199static const int __initdata ezrat_eblcr[32] = { 199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */ 200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */ 201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */ 202 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_mults[32] = { 238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
270 -1, /* 1111 -> 12.0x */ 270 -1, /* 1111 -> 12.0x */
271}; 271};
272 272
273static const int __initdata nehemiah_eblcr[32] = { 273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */ 276 40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
315 unsigned short pos; 315 unsigned short pos;
316}; 316};
317 317
318static const struct mV_pos __initdata vrm85_mV[32] = { 318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, 319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, 320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, 321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} 326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327}; 327};
328 328
329static const unsigned char __initdata mV_vrm85[32] = { 329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334}; 334};
335 335
336static const struct mV_pos __initdata mobilevrm_mV[32] = { 336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, 337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, 338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, 339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
344 {675, 3}, {650, 2}, {625, 1}, {600, 0} 344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345}; 345};
346 346
347static const unsigned char __initdata mV_mobilevrm[32] = { 347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
165 * TMTA rules: 165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */ 167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, 168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq) 169 unsigned int *high_freq)
170{ 170{
171 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi; 172 u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
258} 258}
259 259
260 260
261static int __init longrun_cpu_init(struct cpufreq_policy *policy) 261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{ 262{
263 int result = 0; 263 int result = 0;
264 264
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
178 } 178 }
179 } 179 }
180 180
181 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF)
182 if (!cpu_has(c, X86_FEATURE_EST))
183 printk(KERN_WARNING PFX "Unknown CPU. "
184 "Please send an e-mail to "
185 "<cpufreq@vger.kernel.org>\n");
186 return 0; 182 return 0;
187 }
188 183
189 /* on P-4s, the TSC runs with constant frequency independent whether 184 /* on P-4s, the TSC runs with constant frequency independent whether
190 * throttling is active or not. */ 185 * throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index ce7cde713e71..a36de5bbb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
368 return -ENODEV; 368 return -ENODEV;
369 369
370 out_obj = output.pointer; 370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) { 371 if (out_obj->type != ACPI_TYPE_BUFFER)
372 ret = -ENODEV; 372 return -ENODEV;
373 goto out_free;
374 }
375 373
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); 374 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) { 375 if (errors)
378 ret = -ENODEV; 376 return -ENODEV;
379 goto out_free;
380 }
381 377
382 supported = *((u32 *)(out_obj->buffer.pointer + 4)); 378 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) { 379 if (!(supported & 0x1))
384 ret = -ENODEV; 380 return -ENODEV;
385 goto out_free;
386 }
387 381
388out_free: 382out_free:
389 kfree(output.pointer); 383 kfree(output.pointer);
@@ -397,13 +391,17 @@ static int __init pcc_cpufreq_probe(void)
397 struct pcc_memory_resource *mem_resource; 391 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource; 392 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member; 393 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle; 394 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0; 395 int ret = 0;
402 396
403 status = acpi_get_handle(NULL, "\\_SB", &handle); 397 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status)) 398 if (ACPI_FAILURE(status))
405 return -ENODEV; 399 return -ENODEV;
406 400
401 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
402 if (ACPI_FAILURE(status))
403 return -ENODEV;
404
407 status = acpi_get_handle(handle, "_OSC", &osc_handle); 405 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) { 406 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle); 407 ret = pcc_cpufreq_do_osc(&osc_handle);
@@ -543,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
543 541
544 if (!pcch_virt_addr) { 542 if (!pcch_virt_addr) {
545 result = -1; 543 result = -1;
546 goto pcch_null; 544 goto out;
547 } 545 }
548 546
549 result = pcc_get_offset(cpu); 547 result = pcc_get_offset(cpu);
550 if (result) { 548 if (result) {
551 dprintk("init: PCCP evaluation failed\n"); 549 dprintk("init: PCCP evaluation failed\n");
552 goto free; 550 goto out;
553 } 551 }
554 552
555 policy->max = policy->cpuinfo.max_freq = 553 policy->max = policy->cpuinfo.max_freq =
@@ -558,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
558 ioread32(&pcch_hdr->minimum_frequency) * 1000; 556 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu); 557 policy->cur = pcc_get_freq(cpu);
560 558
559 if (!policy->cur) {
560 dprintk("init: Unable to get current CPU frequency\n");
561 result = -EINVAL;
562 goto out;
563 }
564
561 dprintk("init: policy->max is %d, policy->min is %d\n", 565 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min); 566 policy->max, policy->min);
563 567out:
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result; 568 return result;
570} 569}
571 570
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572static int __init fixup_sgtc(void) 572static int __cpuinit fixup_sgtc(void)
573{ 573{
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
603} 603}
604 604
605 605
606static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{ 607{
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624static struct dmi_system_id __initdata powernow_dmi_table[] = { 624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
633 { } 633 { }
634}; 634};
635 635
636static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{ 637{
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 7ec2123838e6..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -9,7 +9,7 @@
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 * 15 *
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
806 * www.amd.com 806 * www.amd.com
807 */ 807 */
808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
809 return -ENODEV; 811 return -ENODEV;
810} 812}
811 813
@@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
910{ 912{
911 int i; 913 int i;
912 u32 hi = 0, lo = 0; 914 u32 hi = 0, lo = 0;
913 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
914 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
915 917
916 for (i = 0; i < data->acpi_data.state_count; i++) { 918 for (i = 0; i < data->acpi_data.state_count; i++) {
917 u32 index; 919 u32 index;
@@ -1023,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data)
1023 } 1025 }
1024 if (max_latency == 0) { 1026 if (max_latency == 0) {
1025 /* 1027 /*
1026 * Fam 11h always returns 0 as transition latency. 1028 * Fam 11h and later may return 0 as transition latency. This
1027 * This is intended and means "very fast". While cpufreq core 1029 * is intended and means "very fast". While cpufreq core and
1028 * and governors currently can handle that gracefully, better 1030 * governors currently can handle that gracefully, better set it
1029 * set it to 1 to avoid problems in the future. 1031 * to 1 to avoid problems in the future.
1030 * For all others it's a BIOS bug.
1031 */ 1032 */
1032 if (boot_cpu_data.x86 != 0x11) 1033 if (boot_cpu_data.x86 < 0x11)
1033 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1034 "latency\n"); 1035 "latency\n");
1035 max_latency = 1; 1036 max_latency = 1;
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware, 35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv, 36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm,
39#endif
37}; 40};
38 41
39const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
347 return l3; 347 return l3;
348} 348}
349 349
350static void __cpuinit 350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
351amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 351 int index)
352{ 352{
353 int node; 353 int node;
354 354
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
396 this_leaf->l3 = l3_caches[node]; 396 this_leaf->l3 = l3_caches[node];
397} 397}
398 398
399/*
400 * check whether a slot used for disabling an L3 index is occupied.
401 * @l3: L3 cache descriptor
402 * @slot: slot number (0..1)
403 *
404 * @returns: the disabled index if used or negative value if slot free.
405 */
406int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{
408 unsigned int reg = 0;
409
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
411
412 /* check whether this slot is activated already */
413 if (reg & (3UL << 30))
414 return reg & 0xfff;
415
416 return -1;
417}
418
399static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 419static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
400 unsigned int slot) 420 unsigned int slot)
401{ 421{
402 struct pci_dev *dev = this_leaf->l3->dev; 422 int index;
403 unsigned int reg = 0;
404 423
405 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 424 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
406 return -EINVAL; 425 return -EINVAL;
407 426
408 if (!dev) 427 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
409 return -EINVAL; 428 if (index >= 0)
429 return sprintf(buf, "%d\n", index);
410 430
411 pci_read_config_dword(dev, 0x1BC + slot * 4, &reg); 431 return sprintf(buf, "FREE\n");
412 return sprintf(buf, "0x%08x\n", reg);
413} 432}
414 433
415#define SHOW_CACHE_DISABLE(slot) \ 434#define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
451 } 470 }
452} 471}
453 472
454 473/*
455static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 474 * disable a L3 cache index by using a disable-slot
456 const char *buf, size_t count, 475 *
457 unsigned int slot) 476 * @l3: L3 cache descriptor
477 * @cpu: A CPU on the node containing the L3 cache
478 * @slot: slot number (0..1)
479 * @index: index to disable
480 *
481 * @return: 0 on success, error status on failure
482 */
483int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
484 unsigned long index)
458{ 485{
459 struct pci_dev *dev = this_leaf->l3->dev; 486 int ret = 0;
460 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
461 unsigned long val = 0;
462 487
463#define SUBCACHE_MASK (3UL << 20) 488#define SUBCACHE_MASK (3UL << 20)
464#define SUBCACHE_INDEX 0xfff 489#define SUBCACHE_INDEX 0xfff
465 490
466 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0)
467 return -EINVAL; 497 return -EINVAL;
468 498
499 /*
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL;
505
506 /* do not allow writes outside of allowed bits */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL;
510
511 amd_l3_disable_index(l3, cpu, slot, index);
512
513 return 0;
514}
515
516static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
517 const char *buf, size_t count,
518 unsigned int slot)
519{
520 unsigned long val = 0;
521 int cpu, err = 0;
522
469 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
470 return -EPERM; 524 return -EPERM;
471 525
472 if (!dev) 526 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
473 return -EINVAL; 527 return -EINVAL;
474 528
475 if (strict_strtoul(buf, 10, &val) < 0) 529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
476 return -EINVAL;
477 530
478 /* do not allow writes outside of allowed bits */ 531 if (strict_strtoul(buf, 10, &val) < 0)
479 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
480 ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
481 return -EINVAL; 532 return -EINVAL;
482 533
483 amd_l3_disable_index(this_leaf->l3, cpu, slot, val); 534 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
484 535 if (err) {
536 if (err == -EEXIST)
537 printk(KERN_WARNING "L3 disable slot %d in use!\n",
538 slot);
539 return err;
540 }
485 return count; 541 return count;
486} 542}
487 543
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
502 558
503#else /* CONFIG_CPU_SUP_AMD */ 559#else /* CONFIG_CPU_SUP_AMD */
504static void __cpuinit 560static void __cpuinit
505amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
506{ 562{
507}; 563};
508#endif /* CONFIG_CPU_SUP_AMD */ 564#endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
518 574
519 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
520 amd_cpuid4(index, &eax, &ebx, &ecx); 576 amd_cpuid4(index, &eax, &ebx, &ecx);
521 amd_check_l3_disable(index, this_leaf); 577 amd_check_l3_disable(this_leaf, index);
522 } else { 578 } else {
523 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
524 } 580 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc42562250..e1269d62c569 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
51static DEFINE_MUTEX(mce_read_mutex); 51static DEFINE_MUTEX(mce_read_mutex);
52 52
53#define rcu_dereference_check_mce(p) \ 53#define rcu_dereference_check_mce(p) \
54 rcu_dereference_check((p), \ 54 rcu_dereference_index_check((p), \
55 rcu_read_lock_sched_held() || \ 55 rcu_read_lock_sched_held() || \
56 lockdep_is_held(&mce_read_mutex)) 56 lockdep_is_held(&mce_read_mutex))
57 57
@@ -600,6 +600,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
600 */ 600 */
601 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 601 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
602 mce_log(&m); 602 mce_log(&m);
603 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
603 add_taint(TAINT_MACHINE_CHECK); 604 add_taint(TAINT_MACHINE_CHECK);
604 } 605 }
605 606
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf9716..c2a8b26d4fea 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
151{ 206{
152 return sysfs_create_group(&sys_dev->kobj, 207 int err;
153 &thermal_throttle_attr_group); 208 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
209
210 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
211 if (err)
212 return err;
213
214 if (cpu_has(c, X86_FEATURE_PLN))
215 err = sysfs_add_file_to_group(&sys_dev->kobj,
216 &attr_core_power_limit_count.attr,
217 thermal_attr_group.name);
218 if (cpu_has(c, X86_FEATURE_PTS))
219 err = sysfs_add_file_to_group(&sys_dev->kobj,
220 &attr_package_throttle_count.attr,
221 thermal_attr_group.name);
222 if (cpu_has(c, X86_FEATURE_PLN))
223 err = sysfs_add_file_to_group(&sys_dev->kobj,
224 &attr_package_power_limit_count.attr,
225 thermal_attr_group.name);
226
227 return err;
154} 228}
155 229
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 230static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 231{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 232 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 233}
160 234
161/* Mutex protecting device creation against CPU hotplug: */ 235/* Mutex protecting device creation against CPU hotplug: */
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
226 300
227#endif /* CONFIG_SYSFS */ 301#endif /* CONFIG_SYSFS */
228 302
303/*
304 * Set up the two most significant bits to notify mce log of this thermal
305 * event type.
306 * This is a temporary solution. It may be changed in the future with the
307 * mce log infrastructure.
308 */
309#define CORE_THROTTLED (0)
310#define CORE_POWER_LIMIT ((__u64)1 << 62)
311#define PACKAGE_THROTTLED ((__u64)2 << 62)
312#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
313
229/* Thermal transition interrupt handler */ 314/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 315static void intel_thermal_interrupt(void)
231{ 316{
232 __u64 msr_val; 317 __u64 msr_val;
318 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 319
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 320 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 321
236 mce_log_therm_throt_event(msr_val); 322 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
323 THERMAL_THROTTLING_EVENT,
324 CORE_LEVEL) != 0)
325 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
326
327 if (cpu_has(c, X86_FEATURE_PLN))
328 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
329 POWER_LIMIT_EVENT,
330 CORE_LEVEL) != 0)
331 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
332
333 if (cpu_has(c, X86_FEATURE_PTS)) {
334 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
335 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
336 THERMAL_THROTTLING_EVENT,
337 PACKAGE_LEVEL) != 0)
338 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
339 if (cpu_has(c, X86_FEATURE_PLN))
340 if (therm_throt_process(msr_val &
341 PACKAGE_THERM_STATUS_POWER_LIMIT,
342 POWER_LIMIT_EVENT,
343 PACKAGE_LEVEL) != 0)
344 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
345 | msr_val);
346 }
237} 347}
238 348
239static void unexpected_thermal_interrupt(void) 349static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 445 apic_write(APIC_LVTTHMR, h);
336 446
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 447 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 448 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 449 wrmsr(MSR_IA32_THERM_INTERRUPT,
450 l | (THERM_INT_LOW_ENABLE
451 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
452 else
453 wrmsr(MSR_IA32_THERM_INTERRUPT,
454 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
455
456 if (cpu_has(c, X86_FEATURE_PTS)) {
457 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
458 if (cpu_has(c, X86_FEATURE_PLN))
459 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
460 l | (PACKAGE_THERM_INT_LOW_ENABLE
461 | PACKAGE_THERM_INT_HIGH_ENABLE
462 | PACKAGE_THERM_INT_PLN_ENABLE), h);
463 else
464 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
465 l | (PACKAGE_THERM_INT_LOW_ENABLE
466 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
467 }
340 468
341 smp_thermal_vector = intel_thermal_interrupt; 469 smp_thermal_vector = intel_thermal_interrupt;
342 470
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..d944bf6c50e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
18#include <asm/mshyperv.h> 18#include <asm/mshyperv.h>
19 19
20struct ms_hyperv_info ms_hyperv; 20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
21 22
22static bool __init ms_hyperv_platform(void) 23static bool __init ms_hyperv_platform(void)
23{ 24{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. Once each CPU has announced that it started the rendezvous handler
219 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * by decrementing the count, we reset data.count and set the data.gate flag,
220 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
293 323
294/** 324/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..f2da20fda02d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -220,6 +220,7 @@ struct x86_pmu {
220 struct perf_event *event); 220 struct perf_event *event);
221 struct event_constraint *event_constraints; 221 struct event_constraint *event_constraints;
222 void (*quirks)(void); 222 void (*quirks)(void);
223 int perfctr_second_write;
223 224
224 int (*cpu_prepare)(int cpu); 225 int (*cpu_prepare)(int cpu);
225 void (*cpu_starting)(int cpu); 226 void (*cpu_starting)(int cpu);
@@ -295,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
295 * count to the generic event atomically: 296 * count to the generic event atomically:
296 */ 297 */
297again: 298again:
298 prev_raw_count = atomic64_read(&hwc->prev_count); 299 prev_raw_count = local64_read(&hwc->prev_count);
299 rdmsrl(hwc->event_base + idx, new_raw_count); 300 rdmsrl(hwc->event_base + idx, new_raw_count);
300 301
301 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 302 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
302 new_raw_count) != prev_raw_count) 303 new_raw_count) != prev_raw_count)
303 goto again; 304 goto again;
304 305
@@ -313,8 +314,8 @@ again:
313 delta = (new_raw_count << shift) - (prev_raw_count << shift); 314 delta = (new_raw_count << shift) - (prev_raw_count << shift);
314 delta >>= shift; 315 delta >>= shift;
315 316
316 atomic64_add(delta, &event->count); 317 local64_add(delta, &event->count);
317 atomic64_sub(delta, &hwc->period_left); 318 local64_sub(delta, &hwc->period_left);
318 319
319 return new_raw_count; 320 return new_raw_count;
320} 321}
@@ -438,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event)
438 if (!hwc->sample_period) { 439 if (!hwc->sample_period) {
439 hwc->sample_period = x86_pmu.max_period; 440 hwc->sample_period = x86_pmu.max_period;
440 hwc->last_period = hwc->sample_period; 441 hwc->last_period = hwc->sample_period;
441 atomic64_set(&hwc->period_left, hwc->sample_period); 442 local64_set(&hwc->period_left, hwc->sample_period);
442 } else { 443 } else {
443 /* 444 /*
444 * If we have a PMU initialized but no APIC 445 * If we have a PMU initialized but no APIC
@@ -885,7 +886,7 @@ static int
885x86_perf_event_set_period(struct perf_event *event) 886x86_perf_event_set_period(struct perf_event *event)
886{ 887{
887 struct hw_perf_event *hwc = &event->hw; 888 struct hw_perf_event *hwc = &event->hw;
888 s64 left = atomic64_read(&hwc->period_left); 889 s64 left = local64_read(&hwc->period_left);
889 s64 period = hwc->sample_period; 890 s64 period = hwc->sample_period;
890 int ret = 0, idx = hwc->idx; 891 int ret = 0, idx = hwc->idx;
891 892
@@ -897,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
897 */ 898 */
898 if (unlikely(left <= -period)) { 899 if (unlikely(left <= -period)) {
899 left = period; 900 left = period;
900 atomic64_set(&hwc->period_left, left); 901 local64_set(&hwc->period_left, left);
901 hwc->last_period = period; 902 hwc->last_period = period;
902 ret = 1; 903 ret = 1;
903 } 904 }
904 905
905 if (unlikely(left <= 0)) { 906 if (unlikely(left <= 0)) {
906 left += period; 907 left += period;
907 atomic64_set(&hwc->period_left, left); 908 local64_set(&hwc->period_left, left);
908 hwc->last_period = period; 909 hwc->last_period = period;
909 ret = 1; 910 ret = 1;
910 } 911 }
@@ -923,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event)
923 * The hw event starts counting from this event offset, 924 * The hw event starts counting from this event offset,
924 * mark it to be able to extra future deltas: 925 * mark it to be able to extra future deltas:
925 */ 926 */
926 atomic64_set(&hwc->prev_count, (u64)-left); 927 local64_set(&hwc->prev_count, (u64)-left);
927 928
928 wrmsrl(hwc->event_base + idx, 929 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
930
931 /*
932 * Due to an erratum on certain CPUs we need
933 * a second write to be sure the register
934 * is updated properly
935 */
936 if (x86_pmu.perfctr_second_write) {
937 wrmsrl(hwc->event_base + idx,
929 (u64)(-left) & x86_pmu.cntval_mask); 938 (u64)(-left) & x86_pmu.cntval_mask);
939 }
930 940
931 perf_event_update_userpage(event); 941 perf_event_update_userpage(event);
932 942
@@ -969,7 +979,7 @@ static int x86_pmu_enable(struct perf_event *event)
970 * skip the schedulability test here, it will be performed 980 * skip the schedulability test here, it will be performed
970 * at commit time(->commit_txn) as a whole 980 * at commit time(->commit_txn) as a whole
971 */ 981 */
972 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 982 if (cpuc->group_flag & PERF_EVENT_TXN)
973 goto out; 983 goto out;
974 984
975 ret = x86_pmu.schedule_events(cpuc, n, assign); 985 ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1106,7 @@ static void x86_pmu_disable(struct perf_event *event)
1096 * The events never got scheduled and ->cancel_txn will truncate 1106 * The events never got scheduled and ->cancel_txn will truncate
1097 * the event_list. 1107 * the event_list.
1098 */ 1108 */
1099 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 1109 if (cpuc->group_flag & PERF_EVENT_TXN)
1100 return; 1110 return;
1101 1111
1102 x86_pmu_stop(event); 1112 x86_pmu_stop(event);
@@ -1388,7 +1398,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1388{ 1398{
1389 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1399 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1390 1400
1391 cpuc->group_flag |= PERF_EVENT_TXN_STARTED; 1401 cpuc->group_flag |= PERF_EVENT_TXN;
1392 cpuc->n_txn = 0; 1402 cpuc->n_txn = 0;
1393} 1403}
1394 1404
@@ -1401,7 +1411,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1401{ 1411{
1402 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1412 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1403 1413
1404 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; 1414 cpuc->group_flag &= ~PERF_EVENT_TXN;
1405 /* 1415 /*
1406 * Truncate the collected events. 1416 * Truncate the collected events.
1407 */ 1417 */
@@ -1435,11 +1445,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1435 */ 1445 */
1436 memcpy(cpuc->assign, assign, n*sizeof(int)); 1446 memcpy(cpuc->assign, assign, n*sizeof(int));
1437 1447
1438 /* 1448 cpuc->group_flag &= ~PERF_EVENT_TXN;
1439 * Clear out the txn count so that ->cancel_txn() which gets
1440 * run after ->commit_txn() doesn't undo things.
1441 */
1442 cpuc->n_txn = 0;
1443 1449
1444 return 0; 1450 return 0;
1445} 1451}
@@ -1607,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = {
1607 .walk_stack = print_context_stack_bp, 1613 .walk_stack = print_context_stack_bp,
1608}; 1614};
1609 1615
1610#include "../dumpstack.h"
1611
1612static void 1616static void
1613perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1617perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1614{ 1618{
@@ -1730,22 +1734,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1730 return entry; 1734 return entry;
1731} 1735}
1732 1736
1733void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
1734{
1735 regs->ip = ip;
1736 /*
1737 * perf_arch_fetch_caller_regs adds another call, we need to increment
1738 * the skip level
1739 */
1740 regs->bp = rewind_frame_pointer(skip + 1);
1741 regs->cs = __KERNEL_CS;
1742 /*
1743 * We abuse bit 3 to pass exact information, see perf_misc_flags
1744 * and the comment with PERF_EFLAGS_EXACT.
1745 */
1746 regs->flags = 0;
1747}
1748
1749unsigned long perf_instruction_pointer(struct pt_regs *regs) 1737unsigned long perf_instruction_pointer(struct pt_regs *regs)
1750{ 1738{
1751 unsigned long ip; 1739 unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d1..107711bf0ee8 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -21,22 +21,36 @@ struct p4_event_bind {
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ 21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
22}; 22};
23 23
24struct p4_cache_event_bind { 24struct p4_pebs_bind {
25 unsigned int metric_pebs; 25 unsigned int metric_pebs;
26 unsigned int metric_vert; 26 unsigned int metric_vert;
27}; 27};
28 28
29#define P4_GEN_CACHE_EVENT_BIND(name) \ 29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30 [P4_CACHE__##name] = { \ 30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 .metric_pebs = P4_PEBS__##name, \ 31 [P4_PEBS_METRIC__##name] = { \
32 .metric_vert = P4_VERT__##name, \ 32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
33 } 34 }
34 35
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = { 36/*
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), 37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), 38 *
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), 39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), 40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
42 * resgisters
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
40}; 54};
41 55
42/* 56/*
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = {
281 }, 295 },
282}; 296};
283 297
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ 298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \ 299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \ 300 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \ 301 p4_config_pack_cccr(metric | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) 302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289 303
290static __initconst const u64 p4_hw_cache_event_ids 304static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids
296 [ C(OP_READ) ] = { 310 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0, 311 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired), 313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
300 }, 314 },
301 }, 315 },
302 [ C(LL ) ] = { 316 [ C(LL ) ] = {
303 [ C(OP_READ) ] = { 317 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0, 318 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired), 320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
307 }, 321 },
308}, 322},
309 [ C(DTLB) ] = { 323 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = { 324 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0, 325 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired), 327 P4_PEBS_METRIC__dtlb_load_miss_retired),
314 }, 328 },
315 [ C(OP_WRITE) ] = { 329 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0, 330 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired), 332 P4_PEBS_METRIC__dtlb_store_miss_retired),
319 }, 333 },
320 }, 334 },
321 [ C(ITLB) ] = { 335 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = { 336 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, 337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit), 338 P4_PEBS_METRIC__none),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, 339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss), 340 P4_PEBS_METRIC__none),
327 }, 341 },
328 [ C(OP_WRITE) ] = { 342 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1, 343 [ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event)
414 return config; 428 return config;
415} 429}
416 430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
435 /* user data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
443 * it may have some screwed PEBS bits
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
417static int p4_hw_config(struct perf_event *event) 458static int p4_hw_config(struct perf_event *event)
418{ 459{
419 int cpu = get_cpu(); 460 int cpu = get_cpu();
420 int rc = 0; 461 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr; 462 u32 escr, cccr;
423 463
424 /* 464 /*
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event)
438 478
439 if (event->attr.type == PERF_TYPE_RAW) { 479 if (event->attr.type == PERF_TYPE_RAW) {
440 480
441 /* user data may have out-of-bound event index */ 481 rc = p4_validate_raw_event(event);
442 evnt = p4_config_unpack_event(event->attr.config); 482 if (rc)
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out; 483 goto out;
446 }
447 484
448 /* 485 /*
449 * We don't control raw events so it's up to the caller 486 * We don't control raw events so it's up to the caller
@@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event)
451 * on HT machine but allow HT-compatible specifics to be 488 * on HT machine but allow HT-compatible specifics to be
452 * passed on) 489 * passed on)
453 * 490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc)
493 *
454 * XXX: HT wide things should check perf_paranoid_cpu() && 494 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN 495 * CAP_SYS_ADMIN
456 */ 496 */
457 event->hw.config |= event->attr.config & 497 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT)); 499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
460 } 500 }
461 501
462 rc = x86_setup_perfctr(event); 502 rc = x86_setup_perfctr(event);
@@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
482 return overflow; 522 return overflow;
483} 523}
484 524
525static void p4_pmu_disable_pebs(void)
526{
527 /*
528 * FIXME
529 *
530 * It's still allowed that two threads set up the same cache
531 * events, so we can't simply clear metrics until we know
532 * no one is depending on us; we need some kind of counter
533 * for "ReplayEvent" users.
534 *
535 * What is more complex -- RAW events: if the user (for some
536 * reason) passes some cache event metric with an improper
537 * event opcode -- it's fine from the hardware point of view
538 * but complete nonsense from the "meaning" of such an action.
539 *
540 * So at the moment let's leave metrics turned on forever -- it's
541 * ok for now but needs to be revisited!
542 *
543 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
544 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
545 */
546}
547
485static inline void p4_pmu_disable_event(struct perf_event *event) 548static inline void p4_pmu_disable_event(struct perf_event *event)
486{ 549{
487 struct hw_perf_event *hwc = &event->hw; 550 struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void)
507 continue; 570 continue;
508 p4_pmu_disable_event(event); 571 p4_pmu_disable_event(event);
509 } 572 }
573
574 p4_pmu_disable_pebs();
575}
576
577/* configuration must be valid */
578static void p4_pmu_enable_pebs(u64 config)
579{
580 struct p4_pebs_bind *bind;
581 unsigned int idx;
582
583 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
584
585 idx = p4_config_unpack_metric(config);
586 if (idx == P4_PEBS_METRIC__none)
587 return;
588
589 bind = &p4_pebs_bind_map[idx];
590
591 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
592 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
510} 593}
511 594
512static void p4_pmu_enable_event(struct perf_event *event) 595static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
515 int thread = p4_ht_config_thread(hwc->config); 598 int thread = p4_ht_config_thread(hwc->config);
516 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); 599 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
517 unsigned int idx = p4_config_unpack_event(hwc->config); 600 unsigned int idx = p4_config_unpack_event(hwc->config);
518 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
519 struct p4_event_bind *bind; 601 struct p4_event_bind *bind;
520 struct p4_cache_event_bind *bind_cache;
521 u64 escr_addr, cccr; 602 u64 escr_addr, cccr;
522 603
523 bind = &p4_event_bind_map[idx]; 604 bind = &p4_event_bind_map[idx];
@@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
537 cccr = p4_config_unpack_cccr(hwc->config); 618 cccr = p4_config_unpack_cccr(hwc->config);
538 619
539 /* 620 /*
540 * it could be Cache event so that we need to 621 * it could be Cache event so we need to write metrics
541 * set metrics into additional MSRs 622 * into additional MSRs
542 */ 623 */
543 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); 624 p4_pmu_enable_pebs(hwc->config);
544 if (idx_cache > P4_CACHE__NONE &&
545 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
546 bind_cache = &p4_cache_event_bind_map[idx_cache];
547 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
548 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
549 }
550 625
551 (void)checking_wrmsrl(escr_addr, escr_conf); 626 (void)checking_wrmsrl(escr_addr, escr_conf);
552 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 627 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
@@ -829,6 +904,15 @@ static __initconst const struct x86_pmu p4_pmu = {
829 .max_period = (1ULL << 39) - 1, 904 .max_period = (1ULL << 39) - 1,
830 .hw_config = p4_hw_config, 905 .hw_config = p4_hw_config,
831 .schedule_events = p4_pmu_schedule_events, 906 .schedule_events = p4_pmu_schedule_events,
907 /*
908 * This handles erratum N15 in intel doc 249199-029,
909 * the counter may not be updated correctly on write
910 * so we need a second write operation to do the trick
911 * (the official workaround didn't work)
912 *
913 * the former idea is taken from OProfile code
914 */
915 .perfctr_second_write = 1,
832}; 916};
833 917
834static __init int p4_pmu_init(void) 918static __init int p4_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..34b4dad6f0b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,63 @@
1/*
2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
35 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
36 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
37 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
38 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
39 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
40 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
41 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
42 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
43 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
44 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
45 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
46 { 0, 0, 0, 0, 0 }
47 };
48
49 for (cb = cpuid_bits; cb->feature; cb++) {
50
51 /* Verify that the level is valid */
52 max_level = cpuid_eax(cb->level & 0xffff0000);
53 if (max_level < cb->level ||
54 max_level > (cb->level | 0xffff))
55 continue;
56
57 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
58 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
59
60 if (regs[cb->reg] & (1 << cb->bit))
61 set_cpu_cap(c, cb->feature);
62 }
63}
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a662..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
1/* 1/*
2 * Routines to indentify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
41 { 0, 0, 0, 0 }
42 };
43
44 for (cb = cpuid_bits; cb->feature; cb++) {
45
46 /* Verify that the level is valid */
47 max_level = cpuid_eax(cb->level & 0xffff0000);
48 if (max_level < cb->level ||
49 max_level > (cb->level | 0xffff))
50 continue;
51
52 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
53 &regs[CR_ECX], &regs[CR_EDX]);
54
55 if (regs[cb->reg] & (1 << cb->bit))
56 set_cpu_cap(c, cb->feature);
57 }
58}
59
60/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
61#define SMT_LEVEL 0 13#define SMT_LEVEL 0
62 14
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
51 51
52static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
53{ 53{
54 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
55 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
56 56
57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
63 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
64 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
65 return tsc_hz; 72 return tsc_hz;
66} 73}
67 74
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
18 18
19#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
20 20
21#include "dumpstack.h"
22 21
23int panic_on_unrecovered_nmi; 22int panic_on_unrecovered_nmi;
24int panic_on_io_nmi; 23int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17#include <linux/uaccess.h>
18
19extern void
20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
21 unsigned long *stack, unsigned long bp, char *log_lvl);
22
23extern void
24show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *sp, unsigned long bp, char *log_lvl);
26
27extern unsigned int code_bytes;
28
29/* The form of the top of the frame on the stack */
30struct stack_frame {
31 struct stack_frame *next_frame;
32 unsigned long return_address;
33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20
21 19
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 21 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20 19
21#define N_EXCEPTION_STACKS_END \ 20#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) 21 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..e5cc7e82e60d 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,6 +18,7 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h> 20#include <asm/gart.h>
21#include <asm/hpet.h>
21 22
22static void __init fix_hypertransport_config(int num, int slot, int func) 23static void __init fix_hypertransport_config(int num, int slot, int func)
23{ 24{
@@ -191,6 +192,21 @@ static void __init ati_bugs_contd(int num, int slot, int func)
191} 192}
192#endif 193#endif
193 194
195/*
196 * Force the read back of the CMP register in hpet_next_event()
197 * to work around the problem that the CMP register write seems to be
198 * delayed. See hpet_next_event() for details.
199 *
200 * We do this on all SMBUS incarnations for now until we have more
201 * information about the affected chipsets.
202 */
203static void __init ati_hpet_bugs(int num, int slot, int func)
204{
205#ifdef CONFIG_HPET_TIMER
206 hpet_readback_cmp = 1;
207#endif
208}
209
194#define QFLAG_APPLY_ONCE 0x1 210#define QFLAG_APPLY_ONCE 0x1
195#define QFLAG_APPLIED 0x2 211#define QFLAG_APPLIED 0x2
196#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 212#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -220,6 +236,8 @@ static struct chipset early_qrk[] __initdata = {
220 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, 236 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
221 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 237 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
222 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, 238 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
239 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
240 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
223 {} 241 {}
224}; 242};
225 243
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf153..258e93fa2630 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
611 * compensating for the offset by changing to the ESPFIX segment with 611 * compensating for the offset by changing to the ESPFIX segment with
612 * a base address that matches for the difference. 612 * a base address that matches for the difference.
613 */ 613 */
614#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
614 mov %esp, %edx /* load kernel esp */ 615 mov %esp, %edx /* load kernel esp */
615 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 616 mov PT_OLDESP(%esp), %eax /* load userspace esp */
616 mov %dx, %ax /* eax: new kernel esp */ 617 mov %dx, %ax /* eax: new kernel esp */
617 sub %eax, %edx /* offset (low word is 0) */ 618 sub %eax, %edx /* offset (low word is 0) */
618 PER_CPU(gdt_page, %ebx)
619 shr $16, %edx 619 shr $16, %edx
620 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ 620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ 621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 622 pushl $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 623 CFI_ADJUST_CFA_OFFSET 4
624 push %eax /* new kernel esp */ 624 push %eax /* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
791 * normal stack and adjusts ESP with the matching offset. 791 * normal stack and adjusts ESP with the matching offset.
792 */ 792 */
793 /* fixup the stack */ 793 /* fixup the stack */
794 PER_CPU(gdt_page, %ebx) 794 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
795 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ 795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
797 shl $16, %eax 796 shl $16, %eax
798 addl %esp, %eax /* the adjusted stack pointer */ 797 addl %esp, %eax /* the adjusted stack pointer */
799 pushl $__KERNEL_DS 798 pushl $__KERNEL_DS
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
1166.previous 1165.previous
1167ENDPROC(xen_failsafe_callback) 1166ENDPROC(xen_failsafe_callback)
1168 1167
1168BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
1169 xen_evtchn_do_upcall)
1170
1169#endif /* CONFIG_XEN */ 1171#endif /* CONFIG_XEN */
1170 1172
1171#ifdef CONFIG_FUNCTION_TRACER 1173#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0697ff139837..c5ea5cdbe7b3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -571,8 +571,8 @@ auditsys:
571 * masked off. 571 * masked off.
572 */ 572 */
573sysret_audit: 573sysret_audit:
574 movq %rax,%rsi /* second arg, syscall return value */ 574 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
575 cmpq $0,%rax /* is it < 0? */ 575 cmpq $0,%rsi /* is it < 0? */
576 setl %al /* 1 if so, 0 if not */ 576 setl %al /* 1 if so, 0 if not */
577 movzbl %al,%edi /* zero-extend that into %edi */ 577 movzbl %al,%edi /* zero-extend that into %edi */
578 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 578 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
1065END(\sym) 1065END(\sym)
1066.endm 1066.endm
1067 1067
1068#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1068.macro paranoidzeroentry_ist sym do_sym ist 1069.macro paranoidzeroentry_ist sym do_sym ist
1069ENTRY(\sym) 1070ENTRY(\sym)
1070 INTR_FRAME 1071 INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %r12) 1080 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) 1082 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
1329 CFI_ENDPROC 1329 CFI_ENDPROC
1330END(xen_failsafe_callback) 1330END(xen_failsafe_callback)
1331 1331
1332apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1333 xen_hvm_callback_vector xen_evtchn_do_upcall
1334
1332#endif /* CONFIG_XEN */ 1335#endif /* CONFIG_XEN */
1333 1336
1334/* 1337/*
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..784360c0625c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
20 20
21static void __init i386_default_early_setup(void) 21static void __init i386_default_early_setup(void)
22{ 22{
23 /* Initilize 32bit specific setup functions */ 23 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms; 24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 25 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
234 * init data section till per cpu areas are set up. 234 * init data section till per cpu areas are set up.
235 */ 235 */
236 movl $MSR_GS_BASE,%ecx 236 movl $MSR_GS_BASE,%ecx
237 movq initial_gs(%rip),%rax 237 movl initial_gs(%rip),%eax
238 movq %rax,%rdx 238 movl initial_gs+4(%rip),%edx
239 shrq $32,%rdx
240 wrmsr 239 wrmsr
241 240
242 /* esi is pointer to real mode structure with interesting info. 241 /* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index a198b7c87a12..33dbcc4ec5ff 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17 17
18#define HPET_MASK CLOCKSOURCE_MASK(32) 18#define HPET_MASK CLOCKSOURCE_MASK(32)
19#define HPET_SHIFT 22
20 19
21/* FSEC = 10^-15 20/* FSEC = 10^-15
22 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
@@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = {
787 .rating = 250, 786 .rating = 250,
788 .read = read_hpet, 787 .read = read_hpet,
789 .mask = HPET_MASK, 788 .mask = HPET_MASK,
790 .shift = HPET_SHIFT,
791 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 789 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
792 .resume = hpet_resume_counter, 790 .resume = hpet_resume_counter,
793#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
@@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = {
798static int hpet_clocksource_register(void) 796static int hpet_clocksource_register(void)
799{ 797{
800 u64 start, now; 798 u64 start, now;
799 u64 hpet_freq;
801 cycle_t t1; 800 cycle_t t1;
802 801
803 /* Start the counter */ 802 /* Start the counter */
@@ -832,9 +831,15 @@ static int hpet_clocksource_register(void)
832 * mult = (hpet_period * 2^shift)/10^6 831 * mult = (hpet_period * 2^shift)/10^6
833 * mult = (hpet_period << shift)/FSEC_PER_NSEC 832 * mult = (hpet_period << shift)/FSEC_PER_NSEC
834 */ 833 */
835 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
836 834
837 clocksource_register(&clocksource_hpet); 835 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
836 *
837 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
838 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
839 */
840 hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC;
841 do_div(hpet_freq, hpet_period);
842 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
838 843
839 return 0; 844 return 0;
840} 845}
@@ -964,7 +969,7 @@ fs_initcall(hpet_late_init);
964 969
965void hpet_disable(void) 970void hpet_disable(void)
966{ 971{
967 if (is_hpet_capable()) { 972 if (is_hpet_capable() && hpet_virt_address) {
968 unsigned int cfg = hpet_readl(HPET_CFG); 973 unsigned int cfg = hpet_readl(HPET_CFG);
969 974
970 if (hpet_legacy_int_enabled) { 975 if (hpet_legacy_int_enabled) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..a474ec37c32f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
208{ 208{
209 /* Len */ 209 /* Len */
210 switch (x86_len) { 210 switch (x86_len) {
211 case X86_BREAKPOINT_LEN_X:
212 *gen_len = sizeof(long);
213 break;
211 case X86_BREAKPOINT_LEN_1: 214 case X86_BREAKPOINT_LEN_1:
212 *gen_len = HW_BREAKPOINT_LEN_1; 215 *gen_len = HW_BREAKPOINT_LEN_1;
213 break; 216 break;
@@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp)
251 254
252 info->address = bp->attr.bp_addr; 255 info->address = bp->attr.bp_addr;
253 256
257 /* Type */
258 switch (bp->attr.bp_type) {
259 case HW_BREAKPOINT_W:
260 info->type = X86_BREAKPOINT_WRITE;
261 break;
262 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
263 info->type = X86_BREAKPOINT_RW;
264 break;
265 case HW_BREAKPOINT_X:
266 info->type = X86_BREAKPOINT_EXECUTE;
267 /*
268 * x86 inst breakpoints need to have a specific undefined len.
269 * But we still need to check userspace is not trying to setup
270 * an unsupported length, to get a range breakpoint for example.
271 */
272 if (bp->attr.bp_len == sizeof(long)) {
273 info->len = X86_BREAKPOINT_LEN_X;
274 return 0;
275 }
276 default:
277 return -EINVAL;
278 }
279
254 /* Len */ 280 /* Len */
255 switch (bp->attr.bp_len) { 281 switch (bp->attr.bp_len) {
256 case HW_BREAKPOINT_LEN_1: 282 case HW_BREAKPOINT_LEN_1:
@@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp)
271 return -EINVAL; 297 return -EINVAL;
272 } 298 }
273 299
274 /* Type */
275 switch (bp->attr.bp_type) {
276 case HW_BREAKPOINT_W:
277 info->type = X86_BREAKPOINT_WRITE;
278 break;
279 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
280 info->type = X86_BREAKPOINT_RW;
281 break;
282 case HW_BREAKPOINT_X:
283 info->type = X86_BREAKPOINT_EXECUTE;
284 break;
285 default:
286 return -EINVAL;
287 }
288
289 return 0; 300 return 0;
290} 301}
291/* 302/*
@@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
305 ret = -EINVAL; 316 ret = -EINVAL;
306 317
307 switch (info->len) { 318 switch (info->len) {
319 case X86_BREAKPOINT_LEN_X:
320 align = sizeof(long) -1;
321 break;
308 case X86_BREAKPOINT_LEN_1: 322 case X86_BREAKPOINT_LEN_1:
309 align = 0; 323 align = 0;
310 break; 324 break;
@@ -466,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
466 480
467 perf_bp_event(bp, args->regs); 481 perf_bp_event(bp, args->regs);
468 482
483 /*
484 * Set up resume flag to avoid breakpoint recursion when
485 * returning back to origin.
486 */
487 if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
488 args->regs->flags |= X86_EFLAGS_RF;
489
469 rcu_read_unlock(); 490 rcu_read_unlock();
470 } 491 }
471 /* 492 /*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..c4444bce8469 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
107} 107}
108#endif /* CONFIG_X86_64 */ 108#endif /* CONFIG_X86_64 */
109 109
110static void fpu_finit(struct fpu *fpu) 110void fpu_finit(struct fpu *fpu)
111{ 111{
112#ifdef CONFIG_X86_32 112#ifdef CONFIG_X86_32
113 if (!HAVE_HWFP) { 113 if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
132 fp->fos = 0xffff0000u; 132 fp->fos = 0xffff0000u;
133 } 133 }
134} 134}
135EXPORT_SYMBOL_GPL(fpu_finit);
135 136
136/* 137/*
137 * The _current_ task is using the FPU for the first time 138 * The _current_ task is using the FPU for the first time
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 7c9f02c130f3..cafa7c80ac95 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -276,16 +276,6 @@ static struct sys_device device_i8259A = {
276 .cls = &i8259_sysdev_class, 276 .cls = &i8259_sysdev_class,
277}; 277};
278 278
279static int __init i8259A_init_sysfs(void)
280{
281 int error = sysdev_class_register(&i8259_sysdev_class);
282 if (!error)
283 error = sysdev_register(&device_i8259A);
284 return error;
285}
286
287device_initcall(i8259A_init_sysfs);
288
289static void mask_8259A(void) 279static void mask_8259A(void)
290{ 280{
291 unsigned long flags; 281 unsigned long flags;
@@ -407,3 +397,18 @@ struct legacy_pic default_legacy_pic = {
407}; 397};
408 398
409struct legacy_pic *legacy_pic = &default_legacy_pic; 399struct legacy_pic *legacy_pic = &default_legacy_pic;
400
401static int __init i8259A_init_sysfs(void)
402{
403 int error;
404
405 if (legacy_pic != &default_legacy_pic)
406 return 0;
407
408 error = sysdev_class_register(&i8259_sysdev_class);
409 if (!error)
410 error = sysdev_register(&device_i8259A);
411 return error;
412}
413
414device_initcall(i8259A_init_sysfs);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4f4af75b9482..ef10940e1af0 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51 51
52/** 52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
54 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
55 * @regs: The &struct pt_regs of the current process.
56 *
57 * Convert the pt_regs in @regs into the format for registers that
58 * GDB expects, stored in @gdb_regs.
59 */
60void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
61{ 53{
62#ifndef CONFIG_X86_32 54#ifdef CONFIG_X86_32
63 u32 *gdb_regs32 = (u32 *)gdb_regs; 55 { "ax", 4, offsetof(struct pt_regs, ax) },
56 { "cx", 4, offsetof(struct pt_regs, cx) },
57 { "dx", 4, offsetof(struct pt_regs, dx) },
58 { "bx", 4, offsetof(struct pt_regs, bx) },
59 { "sp", 4, offsetof(struct pt_regs, sp) },
60 { "bp", 4, offsetof(struct pt_regs, bp) },
61 { "si", 4, offsetof(struct pt_regs, si) },
62 { "di", 4, offsetof(struct pt_regs, di) },
63 { "ip", 4, offsetof(struct pt_regs, ip) },
64 { "flags", 4, offsetof(struct pt_regs, flags) },
65 { "cs", 4, offsetof(struct pt_regs, cs) },
66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else
72 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) },
74 { "cx", 8, offsetof(struct pt_regs, cx) },
75 { "dx", 8, offsetof(struct pt_regs, dx) },
76 { "si", 8, offsetof(struct pt_regs, dx) },
77 { "di", 8, offsetof(struct pt_regs, di) },
78 { "bp", 8, offsetof(struct pt_regs, bp) },
79 { "sp", 8, offsetof(struct pt_regs, sp) },
80 { "r8", 8, offsetof(struct pt_regs, r8) },
81 { "r9", 8, offsetof(struct pt_regs, r9) },
82 { "r10", 8, offsetof(struct pt_regs, r10) },
83 { "r11", 8, offsetof(struct pt_regs, r11) },
84 { "r12", 8, offsetof(struct pt_regs, r12) },
85 { "r13", 8, offsetof(struct pt_regs, r13) },
86 { "r14", 8, offsetof(struct pt_regs, r14) },
87 { "r15", 8, offsetof(struct pt_regs, r15) },
88 { "ip", 8, offsetof(struct pt_regs, ip) },
89 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) },
64#endif 92#endif
65 gdb_regs[GDB_AX] = regs->ax; 93};
66 gdb_regs[GDB_BX] = regs->bx; 94
67 gdb_regs[GDB_CX] = regs->cx; 95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
68 gdb_regs[GDB_DX] = regs->dx; 96{
69 gdb_regs[GDB_SI] = regs->si; 97 if (
70 gdb_regs[GDB_DI] = regs->di;
71 gdb_regs[GDB_BP] = regs->bp;
72 gdb_regs[GDB_PC] = regs->ip;
73#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
74 gdb_regs[GDB_PS] = regs->flags; 99 regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
75 gdb_regs[GDB_DS] = regs->ds; 100#endif
76 gdb_regs[GDB_ES] = regs->es; 101 regno == GDB_SP || regno == GDB_ORIG_AX)
77 gdb_regs[GDB_CS] = regs->cs; 102 return 0;
78 gdb_regs[GDB_FS] = 0xFFFF; 103
79 gdb_regs[GDB_GS] = 0xFFFF; 104 if (dbg_reg_def[regno].offset != -1)
80 if (user_mode_vm(regs)) { 105 memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
81 gdb_regs[GDB_SS] = regs->ss; 106 dbg_reg_def[regno].size);
82 gdb_regs[GDB_SP] = regs->sp; 107 return 0;
83 } else { 108}
84 gdb_regs[GDB_SS] = __KERNEL_DS; 109
85 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
111{
112 if (regno == GDB_ORIG_AX) {
113 memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
114 return "orig_ax";
86 } 115 }
87#else 116 if (regno >= DBG_MAX_REG_NUM || regno < 0)
88 gdb_regs[GDB_R8] = regs->r8; 117 return NULL;
89 gdb_regs[GDB_R9] = regs->r9; 118
90 gdb_regs[GDB_R10] = regs->r10; 119 if (dbg_reg_def[regno].offset != -1)
91 gdb_regs[GDB_R11] = regs->r11; 120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
92 gdb_regs[GDB_R12] = regs->r12; 121 dbg_reg_def[regno].size);
93 gdb_regs[GDB_R13] = regs->r13; 122
94 gdb_regs[GDB_R14] = regs->r14; 123 switch (regno) {
95 gdb_regs[GDB_R15] = regs->r15; 124#ifdef CONFIG_X86_32
96 gdb_regs32[GDB_PS] = regs->flags; 125 case GDB_SS:
97 gdb_regs32[GDB_CS] = regs->cs; 126 if (!user_mode_vm(regs))
98 gdb_regs32[GDB_SS] = regs->ss; 127 *(unsigned long *)mem = __KERNEL_DS;
99 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 128 break;
129 case GDB_SP:
130 if (!user_mode_vm(regs))
131 *(unsigned long *)mem = kernel_stack_pointer(regs);
132 break;
133 case GDB_GS:
134 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF;
136 break;
100#endif 137#endif
138 }
139 return dbg_reg_def[regno].name;
101} 140}
102 141
103/** 142/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
150 gdb_regs[GDB_SP] = p->thread.sp; 189 gdb_regs[GDB_SP] = p->thread.sp;
151} 190}
152 191
153/**
154 * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
155 * @gdb_regs: A pointer to hold the registers we've received from GDB.
156 * @regs: A pointer to a &struct pt_regs to hold these values in.
157 *
158 * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
159 * in @regs.
160 */
161void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
162{
163#ifndef CONFIG_X86_32
164 u32 *gdb_regs32 = (u32 *)gdb_regs;
165#endif
166 regs->ax = gdb_regs[GDB_AX];
167 regs->bx = gdb_regs[GDB_BX];
168 regs->cx = gdb_regs[GDB_CX];
169 regs->dx = gdb_regs[GDB_DX];
170 regs->si = gdb_regs[GDB_SI];
171 regs->di = gdb_regs[GDB_DI];
172 regs->bp = gdb_regs[GDB_BP];
173 regs->ip = gdb_regs[GDB_PC];
174#ifdef CONFIG_X86_32
175 regs->flags = gdb_regs[GDB_PS];
176 regs->ds = gdb_regs[GDB_DS];
177 regs->es = gdb_regs[GDB_ES];
178 regs->cs = gdb_regs[GDB_CS];
179#else
180 regs->r8 = gdb_regs[GDB_R8];
181 regs->r9 = gdb_regs[GDB_R9];
182 regs->r10 = gdb_regs[GDB_R10];
183 regs->r11 = gdb_regs[GDB_R11];
184 regs->r12 = gdb_regs[GDB_R12];
185 regs->r13 = gdb_regs[GDB_R13];
186 regs->r14 = gdb_regs[GDB_R14];
187 regs->r15 = gdb_regs[GDB_R15];
188 regs->flags = gdb_regs32[GDB_PS];
189 regs->cs = gdb_regs32[GDB_CS];
190 regs->ss = gdb_regs32[GDB_SS];
191#endif
192}
193
194static struct hw_breakpoint { 192static struct hw_breakpoint {
195 unsigned enabled; 193 unsigned enabled;
196 unsigned long addr; 194 unsigned long addr;
197 int len; 195 int len;
198 int type; 196 int type;
199 struct perf_event **pev; 197 struct perf_event **pev;
200} breakinfo[4]; 198} breakinfo[HBP_NUM];
201 199
202static unsigned long early_dr7; 200static unsigned long early_dr7;
203 201
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
205{ 203{
206 int breakno; 204 int breakno;
207 205
208 for (breakno = 0; breakno < 4; breakno++) { 206 for (breakno = 0; breakno < HBP_NUM; breakno++) {
209 struct perf_event *bp; 207 struct perf_event *bp;
210 struct arch_hw_breakpoint *info; 208 struct arch_hw_breakpoint *info;
211 int val; 209 int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
292{ 290{
293 int i; 291 int i;
294 292
295 for (i = 0; i < 4; i++) 293 for (i = 0; i < HBP_NUM; i++)
296 if (breakinfo[i].addr == addr && breakinfo[i].enabled) 294 if (breakinfo[i].addr == addr && breakinfo[i].enabled)
297 break; 295 break;
298 if (i == 4) 296 if (i == HBP_NUM)
299 return -1; 297 return -1;
300 298
301 if (hw_break_release_slot(i)) { 299 if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
313 int cpu = raw_smp_processor_id(); 311 int cpu = raw_smp_processor_id();
314 struct perf_event *bp; 312 struct perf_event *bp;
315 313
316 for (i = 0; i < 4; i++) { 314 for (i = 0; i < HBP_NUM; i++) {
317 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
318 continue; 316 continue;
319 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
333{ 331{
334 int i; 332 int i;
335 333
336 for (i = 0; i < 4; i++) 334 for (i = 0; i < HBP_NUM; i++)
337 if (!breakinfo[i].enabled) 335 if (!breakinfo[i].enabled)
338 break; 336 break;
339 if (i == 4) 337 if (i == HBP_NUM)
340 return -1; 338 return -1;
341 339
342 switch (bptype) { 340 switch (bptype) {
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
397 395
398 /* Disable hardware debugging while we are in kgdb: */ 396 /* Disable hardware debugging while we are in kgdb: */
399 set_debugreg(0UL, 7); 397 set_debugreg(0UL, 7);
400 for (i = 0; i < 4; i++) { 398 for (i = 0; i < HBP_NUM; i++) {
401 if (!breakinfo[i].enabled) 399 if (!breakinfo[i].enabled)
402 continue; 400 continue;
403 if (dbg_is_early) { 401 if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
458{ 456{
459 unsigned long addr; 457 unsigned long addr;
460 char *ptr; 458 char *ptr;
461 int newPC;
462 459
463 switch (remcomInBuffer[0]) { 460 switch (remcomInBuffer[0]) {
464 case 'c': 461 case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
469 linux_regs->ip = addr; 466 linux_regs->ip = addr;
470 case 'D': 467 case 'D':
471 case 'k': 468 case 'k':
472 newPC = linux_regs->ip;
473
474 /* clear the trace bit */ 469 /* clear the trace bit */
475 linux_regs->flags &= ~X86_EFLAGS_TF; 470 linux_regs->flags &= ~X86_EFLAGS_TF;
476 atomic_set(&kgdb_cpu_doing_single_step, -1); 471 atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -572,7 +567,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
572 return NOTIFY_STOP; 567 return NOTIFY_STOP;
573} 568}
574 569
575#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
576int kgdb_ll_trap(int cmd, const char *str, 570int kgdb_ll_trap(int cmd, const char *str,
577 struct pt_regs *regs, long err, int trap, int sig) 571 struct pt_regs *regs, long err, int trap, int sig)
578{ 572{
@@ -590,7 +584,6 @@ int kgdb_ll_trap(int cmd, const char *str,
590 584
591 return __kgdb_notify(&args, cmd); 585 return __kgdb_notify(&args, cmd);
592} 586}
593#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
594 587
595static int 588static int
596kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) 589kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
@@ -625,6 +618,12 @@ int kgdb_arch_init(void)
625 return register_die_notifier(&kgdb_notifier); 618 return register_die_notifier(&kgdb_notifier);
626} 619}
627 620
621static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
622 struct perf_sample_data *data, struct pt_regs *regs)
623{
624 kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP);
625}
626
628void kgdb_arch_late(void) 627void kgdb_arch_late(void)
629{ 628{
630 int i, cpu; 629 int i, cpu;
@@ -641,7 +640,7 @@ void kgdb_arch_late(void)
641 attr.bp_len = HW_BREAKPOINT_LEN_1; 640 attr.bp_len = HW_BREAKPOINT_LEN_1;
642 attr.bp_type = HW_BREAKPOINT_W; 641 attr.bp_type = HW_BREAKPOINT_W;
643 attr.disabled = 1; 642 attr.disabled = 1;
644 for (i = 0; i < 4; i++) { 643 for (i = 0; i < HBP_NUM; i++) {
645 if (breakinfo[i].pev) 644 if (breakinfo[i].pev)
646 continue; 645 continue;
647 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
@@ -655,6 +654,7 @@ void kgdb_arch_late(void)
655 for_each_online_cpu(cpu) { 654 for_each_online_cpu(cpu) {
656 pevent = per_cpu_ptr(breakinfo[i].pev, cpu); 655 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
657 pevent[0]->hw.sample_period = 1; 656 pevent[0]->hw.sample_period = 1;
657 pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
658 if (pevent[0]->destroy != NULL) { 658 if (pevent[0]->destroy != NULL) {
659 pevent[0]->destroy = NULL; 659 pevent[0]->destroy = NULL;
660 release_bp_slot(*pevent); 660 release_bp_slot(*pevent);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 345a4b1fe144..1bfb6cf4dd55 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
126} 126}
127 127
128/* 128/*
129 * Check for the REX prefix which can only exist on X86_64 129 * Skip the prefixes of the instruction.
130 * X86_32 always returns 0
131 */ 130 */
132static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) 131static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
133{ 132{
133 insn_attr_t attr;
134
135 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
136 while (inat_is_legacy_prefix(attr)) {
137 insn++;
138 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
139 }
134#ifdef CONFIG_X86_64 140#ifdef CONFIG_X86_64
135 if ((*insn & 0xf0) == 0x40) 141 if (inat_is_rex_prefix(attr))
136 return 1; 142 insn++;
137#endif 143#endif
138 return 0; 144 return insn;
139} 145}
140 146
141/* 147/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
272 */ 278 */
273static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) 279static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
274{ 280{
281 /* Skip prefixes */
282 insn = skip_prefixes(insn);
283
275 switch (*insn) { 284 switch (*insn) {
276 case 0xfa: /* cli */ 285 case 0xfa: /* cli */
277 case 0xfb: /* sti */ 286 case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
280 return 1; 289 return 1;
281 } 290 }
282 291
283 /*
284 * on X86_64, 0x40-0x4f are REX prefixes so we need to look
285 * at the next byte instead.. but of course not recurse infinitely
286 */
287 if (is_REX_prefix(insn))
288 return is_IF_modifier(++insn);
289
290 return 0; 292 return 0;
291} 293}
292 294
@@ -640,8 +642,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
640 /* Skip cs, ip, orig_ax and gs. */ \ 642 /* Skip cs, ip, orig_ax and gs. */ \
641 " subl $16, %esp\n" \ 643 " subl $16, %esp\n" \
642 " pushl %fs\n" \ 644 " pushl %fs\n" \
643 " pushl %ds\n" \
644 " pushl %es\n" \ 645 " pushl %es\n" \
646 " pushl %ds\n" \
645 " pushl %eax\n" \ 647 " pushl %eax\n" \
646 " pushl %ebp\n" \ 648 " pushl %ebp\n" \
647 " pushl %edi\n" \ 649 " pushl %edi\n" \
@@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p,
803 unsigned long orig_ip = (unsigned long)p->addr; 805 unsigned long orig_ip = (unsigned long)p->addr;
804 kprobe_opcode_t *insn = p->ainsn.insn; 806 kprobe_opcode_t *insn = p->ainsn.insn;
805 807
806 /*skip the REX prefix*/ 808 /* Skip prefixes */
807 if (is_REX_prefix(insn)) 809 insn = skip_prefixes(insn);
808 insn++;
809 810
810 regs->flags &= ~X86_EFLAGS_TF; 811 regs->flags &= ~X86_EFLAGS_TF;
811 switch (*insn) { 812 switch (*insn) {
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index e796448f0eb5..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
25#include <asm/i8259.h> 25#include <asm/i8259.h>
26#include <asm/apb_timer.h> 26#include <asm/apb_timer.h>
27 27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clocks;
30 * the cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then sets up either lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; 51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; 52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
30int sfi_mtimer_num; 56int sfi_mtimer_num;
31 57
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; 58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 return 0; 193 return 0;
168} 194}
169 195
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void) 196static unsigned long __init mrst_calibrate_tsc(void)
183{ 197{
184 unsigned long flags, fast_calibrate; 198 unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
195 209
196void __init mrst_time_init(void) 210void __init mrst_time_init(void)
197{ 211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0(); 228 pre_init_apic_IRQ0();
200 apbt_time_init(); 229 apbt_time_init();
@@ -205,16 +234,27 @@ void __init mrst_rtc_init(void)
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206} 235}
207 236
208/* 237void __cpuinit mrst_arch_setup(void)
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{ 238{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); 239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
215 if (disable_apbt_percpu) 240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
216 setup_boot_APIC_clock(); 241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
217}; 242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
252
253/* MID systems don't have i8042 controller */
254static int mrst_i8042_detect(void)
255{
256 return 0;
257}
218 258
219/* 259/*
220 * Moorestown specific x86_init function overrides and early setup 260 * Moorestown specific x86_init function overrides and early setup
@@ -226,13 +266,16 @@ void __init x86_mrst_early_setup(void)
226 x86_init.resources.reserve_resources = x86_init_noop; 266 x86_init.resources.reserve_resources = x86_init_noop;
227 267
228 x86_init.timers.timer_init = mrst_time_init; 268 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; 269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
230 270
231 x86_init.irqs.pre_vector_init = x86_init_noop; 271 x86_init.irqs.pre_vector_init = x86_init_noop;
232 272
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; 273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
234 276
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
278 x86_platform.i8042_detect = mrst_i8042_detect;
236 x86_init.pci.init = pci_mrst_init; 279 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop; 280 x86_init.pci.fixup_irqs = x86_init_noop;
238 281
@@ -243,3 +286,26 @@ void __init x86_mrst_early_setup(void)
243 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
244 287
245} 288}
289
290/*
291 * if user does not want to use per CPU apb timer, just give it a lower rating
292 * than local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 0b96b5589f08..078d4ec1a9d9 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -110,7 +110,7 @@ int use_calgary __read_mostly = 0;
110 * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 110 * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
111 * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 111 * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
112 */ 112 */
113#define MAX_PHB_BUS_NUM 384 113#define MAX_PHB_BUS_NUM 256
114 114
115#define PHBS_PER_CALGARY 4 115#define PHBS_PER_CALGARY 4
116 116
@@ -1056,8 +1056,6 @@ static int __init calgary_init_one(struct pci_dev *dev)
1056 struct iommu_table *tbl; 1056 struct iommu_table *tbl;
1057 int ret; 1057 int ret;
1058 1058
1059 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
1060
1061 bbar = busno_to_bbar(dev->bus->number); 1059 bbar = busno_to_bbar(dev->bus->number);
1062 ret = calgary_setup_tar(dev, bbar); 1060 ret = calgary_setup_tar(dev, bbar);
1063 if (ret) 1061 if (ret)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..d401f1d2d06e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait); 28EXPORT_SYMBOL(idle_nomwait);
29 29
30struct kmem_cache *task_xstate_cachep; 30struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep);
31 32
32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
33{ 34{
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void)
371void default_idle(void) 372void default_idle(void)
372{ 373{
373 if (hlt_use_halt()) { 374 if (hlt_use_halt()) {
374 trace_power_start(POWER_CSTATE, 1); 375 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
375 current_thread_info()->status &= ~TS_POLLING; 376 current_thread_info()->status &= ~TS_POLLING;
376 /* 377 /*
377 * TS_POLLING-cleared state must be visible before we 378 * TS_POLLING-cleared state must be visible before we
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
441 */ 442 */
442void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 443void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
443{ 444{
444 trace_power_start(POWER_CSTATE, (ax>>4)+1); 445 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
445 if (!need_resched()) { 446 if (!need_resched()) {
446 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 447 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
447 clflush((void *)&current_thread_info()->flags); 448 clflush((void *)&current_thread_info()->flags);
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
457static void mwait_idle(void) 458static void mwait_idle(void)
458{ 459{
459 if (!need_resched()) { 460 if (!need_resched()) {
460 trace_power_start(POWER_CSTATE, 1); 461 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
461 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 462 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
462 clflush((void *)&current_thread_info()->flags); 463 clflush((void *)&current_thread_info()->flags);
463 464
@@ -478,7 +479,7 @@ static void mwait_idle(void)
478 */ 479 */
479static void poll_idle(void) 480static void poll_idle(void)
480{ 481{
481 trace_power_start(POWER_CSTATE, 0); 482 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
482 local_irq_enable(); 483 local_irq_enable();
483 while (!need_resched()) 484 while (!need_resched())
484 cpu_relax(); 485 cpu_relax();
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
525 return (edx & MWAIT_EDX_C1); 526 return (edx & MWAIT_EDX_C1);
526} 527}
527 528
528/* 529bool c1e_detected;
529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 530EXPORT_SYMBOL(c1e_detected);
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
533 */
534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
535{
536 u64 val;
537 if (c->x86_vendor != X86_VENDOR_AMD)
538 goto no_c1e_idle;
539
540 /* Family 0x0f models < rev F do not have C1E */
541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
542 return 1;
543
544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 if (val >= 2) {
552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
556 }
557 return 1;
558 }
559
560no_c1e_idle:
561 return 0;
562}
563 531
564static cpumask_var_t c1e_mask; 532static cpumask_var_t c1e_mask;
565static int c1e_detected;
566 533
567void c1e_remove_cpu(int cpu) 534void c1e_remove_cpu(int cpu)
568{ 535{
@@ -584,12 +551,12 @@ static void c1e_idle(void)
584 u32 lo, hi; 551 u32 lo, hi;
585 552
586 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 553 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
554
587 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 555 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
588 c1e_detected = 1; 556 c1e_detected = true;
589 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 557 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
590 mark_tsc_unstable("TSC halt in AMD C1E"); 558 mark_tsc_unstable("TSC halt in AMD C1E");
591 printk(KERN_INFO "System has AMD C1E enabled\n"); 559 printk(KERN_INFO "System has AMD C1E enabled\n");
592 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
593 } 560 }
594 } 561 }
595 562
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
638 */ 605 */
639 printk(KERN_INFO "using mwait in idle threads.\n"); 606 printk(KERN_INFO "using mwait in idle threads.\n");
640 pm_idle = mwait_idle; 607 pm_idle = mwait_idle;
641 } else if (check_c1e_idle(c)) { 608 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
609 /* E400: APIC timer interrupt does not wake up CPU from C1e */
642 printk(KERN_INFO "using C1E aware idle routine\n"); 610 printk(KERN_INFO "using C1E aware idle routine\n");
643 pm_idle = c1e_idle; 611 pm_idle = c1e_idle;
644 } else 612 } else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 63
62/* 64/*
@@ -111,6 +113,8 @@ void cpu_idle(void)
111 stop_critical_timings(); 113 stop_critical_timings();
112 pm_idle(); 114 pm_idle();
113 start_critical_timings(); 115 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
114 } 118 }
115 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
116 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
54asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
55 57
56DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
138 stop_critical_timings(); 140 stop_critical_timings();
139 pm_idle(); 141 pm_idle();
140 start_critical_timings(); 142 start_critical_timings();
143
144 trace_power_end(smp_processor_id());
145
141 /* In many cases the interrupt that ended idle 146 /* In many cases the interrupt that ended idle
142 has already called exit_idle. But some idle 147 has already called exit_idle. But some idle
143 loops can be woken up without interrupt. */ 148 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e72d3fc6547d..939b9e98245f 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -498,15 +498,10 @@ void force_hpet_resume(void)
498 * See erratum #27 (Misinterpreted MSI Requests May Result in 498 * See erratum #27 (Misinterpreted MSI Requests May Result in
499 * Corrupted LPC DMA Data) in AMD Publication #46837, 499 * Corrupted LPC DMA Data) in AMD Publication #46837,
500 * "SB700 Family Product Errata", Rev. 1.0, March 2010. 500 * "SB700 Family Product Errata", Rev. 1.0, March 2010.
501 *
502 * Also force the read back of the CMP register in hpet_next_event()
503 * to work around the problem that the CMP register write seems to be
504 * delayed. See hpet_next_event() for details.
505 */ 501 */
506static void force_disable_hpet_msi(struct pci_dev *unused) 502static void force_disable_hpet_msi(struct pci_dev *unused)
507{ 503{
508 hpet_msi_disable = 1; 504 hpet_msi_disable = 1;
509 hpet_readback_cmp = 1;
510} 505}
511 506
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 507DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index de3b63ae3da2..a60df9ae6454 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -238,6 +238,15 @@ void __init setup_per_cpu_areas(void)
238#ifdef CONFIG_NUMA 238#ifdef CONFIG_NUMA
239 per_cpu(x86_cpu_to_node_map, cpu) = 239 per_cpu(x86_cpu_to_node_map, cpu) =
240 early_per_cpu_map(x86_cpu_to_node_map, cpu); 240 early_per_cpu_map(x86_cpu_to_node_map, cpu);
241 /*
242 * Ensure that the boot cpu numa_node is correct when the boot
243 * cpu is on a node that doesn't have memory installed.
244 * Also cpu_up() will call cpu_to_node() for APs when
245 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
246 * up later with c_init aka intel_init/amd_init.
247 * So set them all (boot cpu and all APs).
248 */
249 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
241#endif 250#endif
242#endif 251#endif
243 /* 252 /*
@@ -257,14 +266,6 @@ void __init setup_per_cpu_areas(void)
257 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 266 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
258#endif 267#endif
259 268
260#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
261 /*
262 * make sure boot cpu numa_node is right, when boot cpu is on the
263 * node that doesn't have mem installed
264 */
265 set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
266#endif
267
268 /* Setup node to cpumask map */ 269 /* Setup node to cpumask map */
269 setup_node_to_cpumask_map(); 270 setup_node_to_cpumask_map();
270 271
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..11015fd1abbc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -816,6 +816,13 @@ do_rest:
816 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 816 if (cpumask_test_cpu(cpu, cpu_callin_mask))
817 break; /* It has booted */ 817 break; /* It has booted */
818 udelay(100); 818 udelay(100);
819 /*
820 * Allow other tasks to run while we wait for the
821 * AP to come online. This also gives a chance
822 * for the MTRR work(triggered by the AP coming online)
823 * to be completed in the stop machine context.
824 */
825 schedule();
819 } 826 }
820 827
821 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 828 if (cpumask_test_cpu(cpu, cpu_callin_mask))
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
23 return 0; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void
27__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
27{ 28{
28 struct stack_trace *trace = data; 29 struct stack_trace *trace = data;
30#ifdef CONFIG_FRAME_POINTER
29 if (!reliable) 31 if (!reliable)
30 return; 32 return;
33#endif
34 if (nosched && in_sched_functions(addr))
35 return;
31 if (trace->skip > 0) { 36 if (trace->skip > 0) {
32 trace->skip--; 37 trace->skip--;
33 return; 38 return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
36 trace->entries[trace->nr_entries++] = addr; 41 trace->entries[trace->nr_entries++] = addr;
37} 42}
38 43
44static void save_stack_address(void *data, unsigned long addr, int reliable)
45{
46 return __save_stack_address(data, addr, reliable, false);
47}
48
39static void 49static void
40save_stack_address_nosched(void *data, unsigned long addr, int reliable) 50save_stack_address_nosched(void *data, unsigned long addr, int reliable)
41{ 51{
42 struct stack_trace *trace = (struct stack_trace *)data; 52 return __save_stack_address(data, addr, reliable, true);
43 if (!reliable)
44 return;
45 if (in_sched_functions(addr))
46 return;
47 if (trace->skip > 0) {
48 trace->skip--;
49 return;
50 }
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = addr;
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
96 96
97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ 97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
98 98
99struct stack_frame { 99struct stack_frame_user {
100 const void __user *next_fp; 100 const void __user *next_fp;
101 unsigned long ret_addr; 101 unsigned long ret_addr;
102}; 102};
103 103
104static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 104static int
105copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
105{ 106{
106 int ret; 107 int ret;
107 108
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
126 trace->entries[trace->nr_entries++] = regs->ip; 127 trace->entries[trace->nr_entries++] = regs->ip;
127 128
128 while (trace->nr_entries < trace->max_entries) { 129 while (trace->nr_entries < trace->max_entries) {
129 struct stack_frame frame; 130 struct stack_frame_user frame;
130 131
131 frame.next_fp = NULL; 132 frame.next_fp = NULL;
132 frame.ret_addr = 0; 133 frame.ret_addr = 0;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP) 393 == NOTIFY_STOP)
394 return; 394 return;
395
395#ifdef CONFIG_X86_LOCAL_APIC 396#ifdef CONFIG_X86_LOCAL_APIC
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
398 == NOTIFY_STOP)
399 return;
400
401#ifndef CONFIG_LOCKUP_DETECTOR
396 /* 402 /*
397 * Ok, so this is none of the documented NMI sources, 403 * Ok, so this is none of the documented NMI sources,
398 * so it must be the NMI watchdog. 404 * so it must be the NMI watchdog.
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
400 if (nmi_watchdog_tick(regs, reason)) 406 if (nmi_watchdog_tick(regs, reason))
401 return; 407 return;
402 if (!do_nmi_callback(regs, cpu)) 408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
403 unknown_nmi_error(reason, regs); 410 unknown_nmi_error(reason, regs);
404#else 411#else
405 unknown_nmi_error(reason, regs); 412 unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..ce8e50239332 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
751 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc, 752 .resume = resume_tsc,
753 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
754 .shift = 22,
755 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 754 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
756 CLOCK_SOURCE_MUST_VERIFY, 755 CLOCK_SOURCE_MUST_VERIFY,
757#ifdef CONFIG_X86_64 756#ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
845 844
846static void __init init_tsc_clocksource(void) 845static void __init init_tsc_clocksource(void)
847{ 846{
848 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
849 clocksource_tsc.shift);
850 if (tsc_clocksource_reliable) 847 if (tsc_clocksource_reliable)
851 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 848 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
852 /* lower the rating if we already know its unstable: */ 849 /* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
854 clocksource_tsc.rating = 0; 851 clocksource_tsc.rating = 0;
855 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 852 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
856 } 853 }
857 clocksource_register(&clocksource_tsc); 854 clocksource_register_khz(&clocksource_tsc, tsc_khz);
858} 855}
859 856
860#ifdef CONFIG_X86_64 857#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/msr-index.h>
34 35
35verify_cpu: 36verify_cpu:
36 pushfl # Save caller passed flags 37 pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
88 je verify_cpu_sse_ok 89 je verify_cpu_sse_ok
89 test %di,%di 90 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD 91 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR 92 movl $MSR_K7_HWCR,%ecx
92 rdmsr 93 rdmsr
93 btr $15,%eax # enable SSE 94 btr $15,%eax # enable SSE
94 wrmsr 95 wrmsr
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, 76void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
77 u32 mult) 77 struct clocksource *clock, u32 mult)
78{ 78{
79 unsigned long flags; 79 unsigned long flags;
80 80
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
87 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
90 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 90 vsyscall_gtod_data.wall_to_monotonic = *wtm;
91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
93} 93}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
169 * unlikely */ 169 * unlikely */
170time_t __vsyscall(1) vtime(time_t *t) 170time_t __vsyscall(1) vtime(time_t *t)
171{ 171{
172 struct timeval tv; 172 unsigned seq;
173 time_t result; 173 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
175 return time_syscall(t); 175 return time_syscall(t);
176 176
177 vgettimeofday(&tv, NULL); 177 do {
178 result = tv.tv_sec; 178 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
179
180 result = __vsyscall_gtod_data.wall_time_sec;
181
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
183
179 if (t) 184 if (t)
180 *t = result; 185 *t = result;
181 return result; 186 return result;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 61a1e8c7e19f..cd6da6bf3eca 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h> 7#include <linux/ioport.h>
8#include <linux/module.h>
8 9
9#include <asm/bios_ebda.h> 10#include <asm/bios_ebda.h>
10#include <asm/paravirt.h> 11#include <asm/paravirt.h>
@@ -85,6 +86,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
85}; 86};
86 87
87static void default_nmi_init(void) { }; 88static void default_nmi_init(void) { };
89static int default_i8042_detect(void) { return 1; };
88 90
89struct x86_platform_ops x86_platform = { 91struct x86_platform_ops x86_platform = {
90 .calibrate_tsc = native_calibrate_tsc, 92 .calibrate_tsc = native_calibrate_tsc,
@@ -92,5 +94,8 @@ struct x86_platform_ops x86_platform = {
92 .set_wallclock = mach_set_rtc_mmss, 94 .set_wallclock = mach_set_rtc_mmss,
93 .iommu_shutdown = iommu_shutdown_noop, 95 .iommu_shutdown = iommu_shutdown_noop,
94 .is_untracked_pat_range = is_ISA_range, 96 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init 97 .nmi_init = default_nmi_init,
98 .i8042_detect = default_i8042_detect
96}; 99};
100
101EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 6e73db1b7b4e..a4ae302f03aa 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -36,15 +36,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
36 36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes)); 38 sizeof(struct _fpx_sw_bytes));
39
40 if (err) 39 if (err)
41 return err; 40 return -EFAULT;
42 41
43 /* 42 /*
44 * First Magic check failed. 43 * First Magic check failed.
45 */ 44 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 45 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1; 46 return -EINVAL;
48 47
49 /* 48 /*
50 * Check for error scenarios. 49 * Check for error scenarios.
@@ -52,19 +51,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
52 if (fx_sw_user->xstate_size < min_xstate_size || 51 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size || 52 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size) 53 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1; 54 return -EINVAL;
56 55
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) + 56 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size - 57 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE)); 58 FP_XSTATE_MAGIC2_SIZE));
59 if (err)
60 return err;
60 /* 61 /*
61 * Check for the presence of second magic word at the end of memory 62 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy 63 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout with out copying the extended state information 64 * fpstate layout with out copying the extended state information
64 * in the memory layout. 65 * in the memory layout.
65 */ 66 */
66 if (err || magic2 != FP_XSTATE_MAGIC2) 67 if (magic2 != FP_XSTATE_MAGIC2)
67 return -1; 68 return -EFAULT;
68 69
69 return 0; 70 return 0;
70} 71}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * Copyright 2010 Red Hat, Inc. and/or its affilates.
12 * 13 *
13 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 68#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
68#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 69#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 70#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */
70#define SrcMask (0xf<<4) 74#define SrcMask (0xf<<4)
71/* Generic ModRM decode. */ 75/* Generic ModRM decode. */
72#define ModRM (1<<8) 76#define ModRM (1<<8)
@@ -88,10 +92,6 @@
88#define Src2CL (1<<29) 92#define Src2CL (1<<29)
89#define Src2ImmByte (2<<29) 93#define Src2ImmByte (2<<29)
90#define Src2One (3<<29) 94#define Src2One (3<<29)
91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
95#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
96 96
97enum { 97enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
124 /* 0x20 - 0x27 */ 124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */ 128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 0, 0, 0, 0, 131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */ 132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 0, 0, 0, 0, 135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */ 136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
170 /* 0x88 - 0x8F */ 170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, 173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 DstReg | SrcMem | ModRM | Mov, Group | Group1A, 174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */ 175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */ 177 /* 0x98 - 0x9F */
178 0, 0, SrcImm | Src2Imm16 | No64, 0, 178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String, 188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
337 [Group1A*8] = 337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] = 339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, 0, 340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0, 342 0, 0, 0, 0,
343 [Group3*8] = 343 [Group3*8] =
344 DstMem | SrcImm | ModRM, 0, 344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0, 346 0, 0, 0, 0,
347 [Group4*8] = 347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0,
350 [Group5*8] = 350 [Group5*8] =
351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, 353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0, 354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] = 355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
576 (_type)_x; \ 576 (_type)_x; \
577}) 577})
578 578
579#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \
582 goto done; \
583 (_eip) += (_size); \
584})
585
579static inline unsigned long ad_mask(struct decode_cache *c) 586static inline unsigned long ad_mask(struct decode_cache *c)
580{ 587{
581 return (1UL << (c->ad_bytes << 3)) - 1; 588 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
617 c->seg_override = seg; 624 c->seg_override = seg;
618} 625}
619 626
620static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) 627static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
628 struct x86_emulate_ops *ops, int seg)
621{ 629{
622 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
623 return 0; 631 return 0;
624 632
625 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); 633 return ops->get_cached_segment_base(seg, ctxt->vcpu);
626} 634}
627 635
628static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops,
629 struct decode_cache *c) 638 struct decode_cache *c)
630{ 639{
631 if (!c->has_seg_override) 640 if (!c->has_seg_override)
632 return 0; 641 return 0;
633 642
634 return seg_base(ctxt, c->seg_override); 643 return seg_base(ctxt, ops, c->seg_override);
644}
645
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
647 struct x86_emulate_ops *ops)
648{
649 return seg_base(ctxt, ops, VCPU_SREG_ES);
650}
651
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
653 struct x86_emulate_ops *ops)
654{
655 return seg_base(ctxt, ops, VCPU_SREG_SS);
656}
657
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
659 u32 error, bool valid)
660{
661 ctxt->exception = vec;
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665}
666
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
668{
669 emulate_exception(ctxt, GP_VECTOR, err, true);
635} 670}
636 671
637static unsigned long es_base(struct x86_emulate_ctxt *ctxt) 672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
673 int err)
638{ 674{
639 return seg_base(ctxt, VCPU_SREG_ES); 675 ctxt->cr2 = addr;
676 emulate_exception(ctxt, PF_VECTOR, err, true);
640} 677}
641 678
642static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) 679static void emulate_ud(struct x86_emulate_ctxt *ctxt)
643{ 680{
644 return seg_base(ctxt, VCPU_SREG_SS); 681 emulate_exception(ctxt, UD_VECTOR, 0, false);
682}
683
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
685{
686 emulate_exception(ctxt, TS_VECTOR, err, true);
645} 687}
646 688
647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
932 /* we cannot decode insn before we complete previous rep insn */ 974 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart); 975 WARN_ON(ctxt->restart);
934 976
935 /* Shadow copy of register state. Committed on successful emulation. */
936 memset(c, 0, sizeof(struct decode_cache));
937 c->eip = ctxt->eip; 977 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip; 978 c->fetch.start = c->fetch.end = c->eip;
939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
941 980
942 switch (mode) { 981 switch (mode) {
943 case X86EMUL_MODE_REAL: 982 case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
1060 set_seg_override(c, VCPU_SREG_DS); 1099 set_seg_override(c, VCPU_SREG_DS);
1061 1100
1062 if (!(!c->twobyte && c->b == 0x8d)) 1101 if (!(!c->twobyte && c->b == 0x8d))
1063 c->modrm_ea += seg_override_base(ctxt, c); 1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1064 1103
1065 if (c->ad_bytes != 8) 1104 if (c->ad_bytes != 8)
1066 c->modrm_ea = (u32)c->modrm_ea; 1105 c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
1148 else 1187 else
1149 c->src.val = insn_fetch(u8, 1, c->eip); 1188 c->src.val = insn_fetch(u8, 1, c->eip);
1150 break; 1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1151 case SrcOne: 1209 case SrcOne:
1152 c->src.bytes = 1; 1210 c->src.bytes = 1;
1153 c->src.val = 1; 1211 c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
1156 c->src.type = OP_MEM; 1214 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *) 1216 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c), 1217 register_address(c, seg_override_base(ctxt, ops, c),
1160 c->regs[VCPU_REGS_RSI]); 1218 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0; 1219 c->src.val = 0;
1162 break; 1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1163 } 1232 }
1164 1233
1165 /* 1234 /*
@@ -1179,22 +1248,10 @@ done_prefixes:
1179 c->src2.bytes = 1; 1248 c->src2.bytes = 1;
1180 c->src2.val = insn_fetch(u8, 1, c->eip); 1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1181 break; 1250 break;
1182 case Src2Imm16:
1183 c->src2.type = OP_IMM;
1184 c->src2.ptr = (unsigned long *)c->eip;
1185 c->src2.bytes = 2;
1186 c->src2.val = insn_fetch(u16, 2, c->eip);
1187 break;
1188 case Src2One: 1251 case Src2One:
1189 c->src2.bytes = 1; 1252 c->src2.bytes = 1;
1190 c->src2.val = 1; 1253 c->src2.val = 1;
1191 break; 1254 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1198 } 1255 }
1199 1256
1200 /* Decode and fetch the destination operand: register or memory. */ 1257 /* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
1253 c->dst.type = OP_MEM; 1310 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *) 1312 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt), 1313 register_address(c, es_base(ctxt, ops),
1257 c->regs[VCPU_REGS_RDI]); 1314 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0; 1315 c->dst.val = 0;
1259 break; 1316 break;
@@ -1263,6 +1320,37 @@ done:
1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1264} 1321}
1265 1322
1323static int read_emulated(struct x86_emulate_ctxt *ctxt,
1324 struct x86_emulate_ops *ops,
1325 unsigned long addr, void *dest, unsigned size)
1326{
1327 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330
1331 while (size) {
1332 int n = min(size, 8u);
1333 size -= n;
1334 if (mc->pos < mc->end)
1335 goto read_cached;
1336
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE)
1342 return rc;
1343 mc->end += n;
1344
1345 read_cached:
1346 memcpy(dest, mc->data + mc->pos, n);
1347 mc->pos += n;
1348 dest += n;
1349 addr += n;
1350 }
1351 return X86EMUL_CONTINUE;
1352}
1353
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops, 1355 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port, 1356 unsigned int size, unsigned short port,
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331 1419
1332 if (dt.size < index * 8 + 7) { 1420 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1421 emulate_gp(ctxt, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT; 1422 return X86EMUL_PROPAGATE_FAULT;
1335 } 1423 }
1336 addr = dt.address + index * 8; 1424 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT) 1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1427 emulate_pf(ctxt, addr, err);
1340 1428
1341 return ret; 1429 return ret;
1342} 1430}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356 1444
1357 if (dt.size < index * 8 + 7) { 1445 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1446 emulate_gp(ctxt, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT; 1447 return X86EMUL_PROPAGATE_FAULT;
1360 } 1448 }
1361 1449
1362 addr = dt.address + index * 8; 1450 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT) 1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1453 emulate_pf(ctxt, addr, err);
1366 1454
1367 return ret; 1455 return ret;
1368} 1456}
@@ -1481,11 +1569,70 @@ load:
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE; 1570 return X86EMUL_CONTINUE;
1483exception: 1571exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); 1572 emulate_exception(ctxt, err_vec, err_code, true);
1485 return X86EMUL_PROPAGATE_FAULT; 1573 return X86EMUL_PROPAGATE_FAULT;
1486} 1574}
1487 1575
1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1576static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops)
1578{
1579 int rc;
1580 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582
1583 switch (c->dst.type) {
1584 case OP_REG:
1585 /* The 4-byte case *is* correct:
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break;
1603 case OP_MEM:
1604 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr,
1607 &c->dst.orig_val,
1608 &c->dst.val,
1609 c->dst.bytes,
1610 &err,
1611 ctxt->vcpu);
1612 else
1613 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr,
1615 &c->dst.val,
1616 c->dst.bytes,
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE)
1623 return rc;
1624 break;
1625 case OP_NONE:
1626 /* no writeback */
1627 break;
1628 default:
1629 break;
1630 }
1631 return X86EMUL_CONTINUE;
1632}
1633
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1635 struct x86_emulate_ops *ops)
1489{ 1636{
1490 struct decode_cache *c = &ctxt->decode; 1637 struct decode_cache *c = &ctxt->decode;
1491 1638
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1493 c->dst.bytes = c->op_bytes; 1640 c->dst.bytes = c->op_bytes;
1494 c->dst.val = c->src.val; 1641 c->dst.val = c->src.val;
1495 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1496 c->dst.ptr = (void *) register_address(c, ss_base(ctxt), 1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
1497 c->regs[VCPU_REGS_RSP]); 1644 c->regs[VCPU_REGS_RSP]);
1498} 1645}
1499 1646
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1504 struct decode_cache *c = &ctxt->decode; 1651 struct decode_cache *c = &ctxt->decode;
1505 int rc; 1652 int rc;
1506 1653
1507 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
1508 c->regs[VCPU_REGS_RSP]), 1655 c->regs[VCPU_REGS_RSP]),
1509 dest, len, ctxt->vcpu); 1656 dest, len);
1510 if (rc != X86EMUL_CONTINUE) 1657 if (rc != X86EMUL_CONTINUE)
1511 return rc; 1658 return rc;
1512 1659
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1541 break; 1688 break;
1542 case X86EMUL_MODE_VM86: 1689 case X86EMUL_MODE_VM86:
1543 if (iopl < 3) { 1690 if (iopl < 3) {
1544 kvm_inject_gp(ctxt->vcpu, 0); 1691 emulate_gp(ctxt, 0);
1545 return X86EMUL_PROPAGATE_FAULT; 1692 return X86EMUL_PROPAGATE_FAULT;
1546 } 1693 }
1547 change_mask |= EFLG_IF; 1694 change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1557 return rc; 1704 return rc;
1558} 1705}
1559 1706
1560static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1708 struct x86_emulate_ops *ops, int seg)
1561{ 1709{
1562 struct decode_cache *c = &ctxt->decode; 1710 struct decode_cache *c = &ctxt->decode;
1563 struct kvm_segment segment;
1564 1711
1565 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); 1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
1566 1713
1567 c->src.val = segment.selector; 1714 emulate_push(ctxt, ops);
1568 emulate_push(ctxt);
1569} 1715}
1570 1716
1571static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1583 return rc; 1729 return rc;
1584} 1730}
1585 1731
1586static void emulate_pusha(struct x86_emulate_ctxt *ctxt) 1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1733 struct x86_emulate_ops *ops)
1587{ 1734{
1588 struct decode_cache *c = &ctxt->decode; 1735 struct decode_cache *c = &ctxt->decode;
1589 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1737 int rc = X86EMUL_CONTINUE;
1590 int reg = VCPU_REGS_RAX; 1738 int reg = VCPU_REGS_RAX;
1591 1739
1592 while (reg <= VCPU_REGS_RDI) { 1740 while (reg <= VCPU_REGS_RDI) {
1593 (reg == VCPU_REGS_RSP) ? 1741 (reg == VCPU_REGS_RSP) ?
1594 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1595 1743
1596 emulate_push(ctxt); 1744 emulate_push(ctxt, ops);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE)
1748 return rc;
1749
1597 ++reg; 1750 ++reg;
1598 } 1751 }
1752
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc;
1599} 1757}
1600 1758
1601static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1759static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1695 old_eip = c->eip; 1853 old_eip = c->eip;
1696 c->eip = c->src.val; 1854 c->eip = c->src.val;
1697 c->src.val = old_eip; 1855 c->src.val = old_eip;
1698 emulate_push(ctxt); 1856 emulate_push(ctxt, ops);
1699 break; 1857 break;
1700 } 1858 }
1701 case 4: /* jmp abs */ 1859 case 4: /* jmp abs */
1702 c->eip = c->src.val; 1860 c->eip = c->src.val;
1703 break; 1861 break;
1704 case 6: /* push */ 1862 case 6: /* push */
1705 emulate_push(ctxt); 1863 emulate_push(ctxt, ops);
1706 break; 1864 break;
1707 } 1865 }
1708 return X86EMUL_CONTINUE; 1866 return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1748 return rc; 1906 return rc;
1749} 1907}
1750 1908
1751static inline int writeback(struct x86_emulate_ctxt *ctxt,
1752 struct x86_emulate_ops *ops)
1753{
1754 int rc;
1755 struct decode_cache *c = &ctxt->decode;
1756
1757 switch (c->dst.type) {
1758 case OP_REG:
1759 /* The 4-byte case *is* correct:
1760 * in 64-bit mode we zero-extend.
1761 */
1762 switch (c->dst.bytes) {
1763 case 1:
1764 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1765 break;
1766 case 2:
1767 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1768 break;
1769 case 4:
1770 *c->dst.ptr = (u32)c->dst.val;
1771 break; /* 64b: zero-ext */
1772 case 8:
1773 *c->dst.ptr = c->dst.val;
1774 break;
1775 }
1776 break;
1777 case OP_MEM:
1778 if (c->lock_prefix)
1779 rc = ops->cmpxchg_emulated(
1780 (unsigned long)c->dst.ptr,
1781 &c->dst.orig_val,
1782 &c->dst.val,
1783 c->dst.bytes,
1784 ctxt->vcpu);
1785 else
1786 rc = ops->write_emulated(
1787 (unsigned long)c->dst.ptr,
1788 &c->dst.val,
1789 c->dst.bytes,
1790 ctxt->vcpu);
1791 if (rc != X86EMUL_CONTINUE)
1792 return rc;
1793 break;
1794 case OP_NONE:
1795 /* no writeback */
1796 break;
1797 default:
1798 break;
1799 }
1800 return X86EMUL_CONTINUE;
1801}
1802
1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1804{
1805 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1806 /*
1807 * an sti; sti; sequence only disable interrupts for the first
1808 * instruction. So, if the last instruction, be it emulated or
1809 * not, left the system with the INT_STI flag enabled, it
1810 * means that the last instruction is an sti. We should not
1811 * leave the flag on in this case. The same goes for mov ss
1812 */
1813 if (!(int_shadow & mask))
1814 ctxt->interruptibility = mask;
1815}
1816
1817static inline void 1909static inline void
1818setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1910setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1819 struct kvm_segment *cs, struct kvm_segment *ss) 1911 struct x86_emulate_ops *ops, struct desc_struct *cs,
1912 struct desc_struct *ss)
1820{ 1913{
1821 memset(cs, 0, sizeof(struct kvm_segment)); 1914 memset(cs, 0, sizeof(struct desc_struct));
1822 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); 1915 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
1823 memset(ss, 0, sizeof(struct kvm_segment)); 1916 memset(ss, 0, sizeof(struct desc_struct));
1824 1917
1825 cs->l = 0; /* will be adjusted later */ 1918 cs->l = 0; /* will be adjusted later */
1826 cs->base = 0; /* flat segment */ 1919 set_desc_base(cs, 0); /* flat segment */
1827 cs->g = 1; /* 4kb granularity */ 1920 cs->g = 1; /* 4kb granularity */
1828 cs->limit = 0xffffffff; /* 4GB limit */ 1921 set_desc_limit(cs, 0xfffff); /* 4GB limit */
1829 cs->type = 0x0b; /* Read, Execute, Accessed */ 1922 cs->type = 0x0b; /* Read, Execute, Accessed */
1830 cs->s = 1; 1923 cs->s = 1;
1831 cs->dpl = 0; /* will be adjusted later */ 1924 cs->dpl = 0; /* will be adjusted later */
1832 cs->present = 1; 1925 cs->p = 1;
1833 cs->db = 1; 1926 cs->d = 1;
1834 1927
1835 ss->unusable = 0; 1928 set_desc_base(ss, 0); /* flat segment */
1836 ss->base = 0; /* flat segment */ 1929 set_desc_limit(ss, 0xfffff); /* 4GB limit */
1837 ss->limit = 0xffffffff; /* 4GB limit */
1838 ss->g = 1; /* 4kb granularity */ 1930 ss->g = 1; /* 4kb granularity */
1839 ss->s = 1; 1931 ss->s = 1;
1840 ss->type = 0x03; /* Read/Write, Accessed */ 1932 ss->type = 0x03; /* Read/Write, Accessed */
1841 ss->db = 1; /* 32bit stack segment */ 1933 ss->d = 1; /* 32bit stack segment */
1842 ss->dpl = 0; 1934 ss->dpl = 0;
1843 ss->present = 1; 1935 ss->p = 1;
1844} 1936}
1845 1937
1846static int 1938static int
1847emulate_syscall(struct x86_emulate_ctxt *ctxt) 1939emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1848{ 1940{
1849 struct decode_cache *c = &ctxt->decode; 1941 struct decode_cache *c = &ctxt->decode;
1850 struct kvm_segment cs, ss; 1942 struct desc_struct cs, ss;
1851 u64 msr_data; 1943 u64 msr_data;
1944 u16 cs_sel, ss_sel;
1852 1945
1853 /* syscall is not available in real mode */ 1946 /* syscall is not available in real mode */
1854 if (ctxt->mode == X86EMUL_MODE_REAL || 1947 if (ctxt->mode == X86EMUL_MODE_REAL ||
1855 ctxt->mode == X86EMUL_MODE_VM86) { 1948 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1949 emulate_ud(ctxt);
1857 return X86EMUL_PROPAGATE_FAULT; 1950 return X86EMUL_PROPAGATE_FAULT;
1858 } 1951 }
1859 1952
1860 setup_syscalls_segments(ctxt, &cs, &ss); 1953 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1861 1954
1862 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1955 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1863 msr_data >>= 32; 1956 msr_data >>= 32;
1864 cs.selector = (u16)(msr_data & 0xfffc); 1957 cs_sel = (u16)(msr_data & 0xfffc);
1865 ss.selector = (u16)(msr_data + 8); 1958 ss_sel = (u16)(msr_data + 8);
1866 1959
1867 if (is_long_mode(ctxt->vcpu)) { 1960 if (is_long_mode(ctxt->vcpu)) {
1868 cs.db = 0; 1961 cs.d = 0;
1869 cs.l = 1; 1962 cs.l = 1;
1870 } 1963 }
1871 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 1964 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1872 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1965 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1966 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1967 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1873 1968
1874 c->regs[VCPU_REGS_RCX] = c->eip; 1969 c->regs[VCPU_REGS_RCX] = c->eip;
1875 if (is_long_mode(ctxt->vcpu)) { 1970 if (is_long_mode(ctxt->vcpu)) {
1876#ifdef CONFIG_X86_64 1971#ifdef CONFIG_X86_64
1877 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1972 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1878 1973
1879 kvm_x86_ops->get_msr(ctxt->vcpu, 1974 ops->get_msr(ctxt->vcpu,
1880 ctxt->mode == X86EMUL_MODE_PROT64 ? 1975 ctxt->mode == X86EMUL_MODE_PROT64 ?
1881 MSR_LSTAR : MSR_CSTAR, &msr_data); 1976 MSR_LSTAR : MSR_CSTAR, &msr_data);
1882 c->eip = msr_data; 1977 c->eip = msr_data;
1883 1978
1884 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1979 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1885 ctxt->eflags &= ~(msr_data | EFLG_RF); 1980 ctxt->eflags &= ~(msr_data | EFLG_RF);
1886#endif 1981#endif
1887 } else { 1982 } else {
1888 /* legacy mode */ 1983 /* legacy mode */
1889 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1984 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1890 c->eip = (u32)msr_data; 1985 c->eip = (u32)msr_data;
1891 1986
1892 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1987 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1896} 1991}
1897 1992
1898static int 1993static int
1899emulate_sysenter(struct x86_emulate_ctxt *ctxt) 1994emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1900{ 1995{
1901 struct decode_cache *c = &ctxt->decode; 1996 struct decode_cache *c = &ctxt->decode;
1902 struct kvm_segment cs, ss; 1997 struct desc_struct cs, ss;
1903 u64 msr_data; 1998 u64 msr_data;
1999 u16 cs_sel, ss_sel;
1904 2000
1905 /* inject #GP if in real mode */ 2001 /* inject #GP if in real mode */
1906 if (ctxt->mode == X86EMUL_MODE_REAL) { 2002 if (ctxt->mode == X86EMUL_MODE_REAL) {
1907 kvm_inject_gp(ctxt->vcpu, 0); 2003 emulate_gp(ctxt, 0);
1908 return X86EMUL_PROPAGATE_FAULT; 2004 return X86EMUL_PROPAGATE_FAULT;
1909 } 2005 }
1910 2006
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1912 * Therefore, we inject an #UD. 2008 * Therefore, we inject an #UD.
1913 */ 2009 */
1914 if (ctxt->mode == X86EMUL_MODE_PROT64) { 2010 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2011 emulate_ud(ctxt);
1916 return X86EMUL_PROPAGATE_FAULT; 2012 return X86EMUL_PROPAGATE_FAULT;
1917 } 2013 }
1918 2014
1919 setup_syscalls_segments(ctxt, &cs, &ss); 2015 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1920 2016
1921 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2017 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1922 switch (ctxt->mode) { 2018 switch (ctxt->mode) {
1923 case X86EMUL_MODE_PROT32: 2019 case X86EMUL_MODE_PROT32:
1924 if ((msr_data & 0xfffc) == 0x0) { 2020 if ((msr_data & 0xfffc) == 0x0) {
1925 kvm_inject_gp(ctxt->vcpu, 0); 2021 emulate_gp(ctxt, 0);
1926 return X86EMUL_PROPAGATE_FAULT; 2022 return X86EMUL_PROPAGATE_FAULT;
1927 } 2023 }
1928 break; 2024 break;
1929 case X86EMUL_MODE_PROT64: 2025 case X86EMUL_MODE_PROT64:
1930 if (msr_data == 0x0) { 2026 if (msr_data == 0x0) {
1931 kvm_inject_gp(ctxt->vcpu, 0); 2027 emulate_gp(ctxt, 0);
1932 return X86EMUL_PROPAGATE_FAULT; 2028 return X86EMUL_PROPAGATE_FAULT;
1933 } 2029 }
1934 break; 2030 break;
1935 } 2031 }
1936 2032
1937 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2033 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1938 cs.selector = (u16)msr_data; 2034 cs_sel = (u16)msr_data;
1939 cs.selector &= ~SELECTOR_RPL_MASK; 2035 cs_sel &= ~SELECTOR_RPL_MASK;
1940 ss.selector = cs.selector + 8; 2036 ss_sel = cs_sel + 8;
1941 ss.selector &= ~SELECTOR_RPL_MASK; 2037 ss_sel &= ~SELECTOR_RPL_MASK;
1942 if (ctxt->mode == X86EMUL_MODE_PROT64 2038 if (ctxt->mode == X86EMUL_MODE_PROT64
1943 || is_long_mode(ctxt->vcpu)) { 2039 || is_long_mode(ctxt->vcpu)) {
1944 cs.db = 0; 2040 cs.d = 0;
1945 cs.l = 1; 2041 cs.l = 1;
1946 } 2042 }
1947 2043
1948 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2044 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1949 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2045 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2046 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2047 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1950 2048
1951 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2049 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1952 c->eip = msr_data; 2050 c->eip = msr_data;
1953 2051
1954 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2052 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1955 c->regs[VCPU_REGS_RSP] = msr_data; 2053 c->regs[VCPU_REGS_RSP] = msr_data;
1956 2054
1957 return X86EMUL_CONTINUE; 2055 return X86EMUL_CONTINUE;
1958} 2056}
1959 2057
1960static int 2058static int
1961emulate_sysexit(struct x86_emulate_ctxt *ctxt) 2059emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1962{ 2060{
1963 struct decode_cache *c = &ctxt->decode; 2061 struct decode_cache *c = &ctxt->decode;
1964 struct kvm_segment cs, ss; 2062 struct desc_struct cs, ss;
1965 u64 msr_data; 2063 u64 msr_data;
1966 int usermode; 2064 int usermode;
2065 u16 cs_sel, ss_sel;
1967 2066
1968 /* inject #GP if in real mode or Virtual 8086 mode */ 2067 /* inject #GP if in real mode or Virtual 8086 mode */
1969 if (ctxt->mode == X86EMUL_MODE_REAL || 2068 if (ctxt->mode == X86EMUL_MODE_REAL ||
1970 ctxt->mode == X86EMUL_MODE_VM86) { 2069 ctxt->mode == X86EMUL_MODE_VM86) {
1971 kvm_inject_gp(ctxt->vcpu, 0); 2070 emulate_gp(ctxt, 0);
1972 return X86EMUL_PROPAGATE_FAULT; 2071 return X86EMUL_PROPAGATE_FAULT;
1973 } 2072 }
1974 2073
1975 setup_syscalls_segments(ctxt, &cs, &ss); 2074 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1976 2075
1977 if ((c->rex_prefix & 0x8) != 0x0) 2076 if ((c->rex_prefix & 0x8) != 0x0)
1978 usermode = X86EMUL_MODE_PROT64; 2077 usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1981 2080
1982 cs.dpl = 3; 2081 cs.dpl = 3;
1983 ss.dpl = 3; 2082 ss.dpl = 3;
1984 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2083 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1985 switch (usermode) { 2084 switch (usermode) {
1986 case X86EMUL_MODE_PROT32: 2085 case X86EMUL_MODE_PROT32:
1987 cs.selector = (u16)(msr_data + 16); 2086 cs_sel = (u16)(msr_data + 16);
1988 if ((msr_data & 0xfffc) == 0x0) { 2087 if ((msr_data & 0xfffc) == 0x0) {
1989 kvm_inject_gp(ctxt->vcpu, 0); 2088 emulate_gp(ctxt, 0);
1990 return X86EMUL_PROPAGATE_FAULT; 2089 return X86EMUL_PROPAGATE_FAULT;
1991 } 2090 }
1992 ss.selector = (u16)(msr_data + 24); 2091 ss_sel = (u16)(msr_data + 24);
1993 break; 2092 break;
1994 case X86EMUL_MODE_PROT64: 2093 case X86EMUL_MODE_PROT64:
1995 cs.selector = (u16)(msr_data + 32); 2094 cs_sel = (u16)(msr_data + 32);
1996 if (msr_data == 0x0) { 2095 if (msr_data == 0x0) {
1997 kvm_inject_gp(ctxt->vcpu, 0); 2096 emulate_gp(ctxt, 0);
1998 return X86EMUL_PROPAGATE_FAULT; 2097 return X86EMUL_PROPAGATE_FAULT;
1999 } 2098 }
2000 ss.selector = cs.selector + 8; 2099 ss_sel = cs_sel + 8;
2001 cs.db = 0; 2100 cs.d = 0;
2002 cs.l = 1; 2101 cs.l = 1;
2003 break; 2102 break;
2004 } 2103 }
2005 cs.selector |= SELECTOR_RPL_MASK; 2104 cs_sel |= SELECTOR_RPL_MASK;
2006 ss.selector |= SELECTOR_RPL_MASK; 2105 ss_sel |= SELECTOR_RPL_MASK;
2007 2106
2008 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2107 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
2009 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2108 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2109 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2110 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2010 2111
2011 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 2112 c->eip = c->regs[VCPU_REGS_RDX];
2012 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 2113 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
2013 2114
2014 return X86EMUL_CONTINUE; 2115 return X86EMUL_CONTINUE;
2015} 2116}
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2030 struct x86_emulate_ops *ops, 2131 struct x86_emulate_ops *ops,
2031 u16 port, u16 len) 2132 u16 port, u16 len)
2032{ 2133{
2033 struct kvm_segment tr_seg; 2134 struct desc_struct tr_seg;
2034 int r; 2135 int r;
2035 u16 io_bitmap_ptr; 2136 u16 io_bitmap_ptr;
2036 u8 perm, bit_idx = port & 0x7; 2137 u8 perm, bit_idx = port & 0x7;
2037 unsigned mask = (1 << len) - 1; 2138 unsigned mask = (1 << len) - 1;
2038 2139
2039 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); 2140 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
2040 if (tr_seg.unusable) 2141 if (!tr_seg.p)
2041 return false; 2142 return false;
2042 if (tr_seg.limit < 103) 2143 if (desc_limit_scaled(&tr_seg) < 103)
2043 return false; 2144 return false;
2044 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, 2145 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
2045 NULL); 2146 ctxt->vcpu, NULL);
2046 if (r != X86EMUL_CONTINUE) 2147 if (r != X86EMUL_CONTINUE)
2047 return false; 2148 return false;
2048 if (io_bitmap_ptr + port/8 > tr_seg.limit) 2149 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2049 return false; 2150 return false;
2050 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, 2151 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
2051 ctxt->vcpu, NULL); 2152 &perm, 1, ctxt->vcpu, NULL);
2052 if (r != X86EMUL_CONTINUE) 2153 if (r != X86EMUL_CONTINUE)
2053 return false; 2154 return false;
2054 if ((perm >> bit_idx) & mask) 2155 if ((perm >> bit_idx) & mask)
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2066 return true; 2167 return true;
2067} 2168}
2068 2169
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2170static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops, 2171 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss) 2172 struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2165 &err); 2255 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) { 2256 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */ 2257 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2258 emulate_pf(ctxt, old_tss_base, err);
2169 return ret; 2259 return ret;
2170 } 2260 }
2171 2261
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2175 &err); 2265 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) { 2266 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */ 2267 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2268 emulate_pf(ctxt, old_tss_base, err);
2179 return ret; 2269 return ret;
2180 } 2270 }
2181 2271
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2183 &err); 2273 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) { 2274 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */ 2275 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2276 emulate_pf(ctxt, new_tss_base, err);
2187 return ret; 2277 return ret;
2188 } 2278 }
2189 2279
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2196 ctxt->vcpu, &err); 2286 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) { 2287 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */ 2288 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2289 emulate_pf(ctxt, new_tss_base, err);
2200 return ret; 2290 return ret;
2201 } 2291 }
2202 } 2292 }
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2238 struct decode_cache *c = &ctxt->decode; 2328 struct decode_cache *c = &ctxt->decode;
2239 int ret; 2329 int ret;
2240 2330
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu); 2331 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
2332 emulate_gp(ctxt, 0);
2333 return X86EMUL_PROPAGATE_FAULT;
2334 }
2242 c->eip = tss->eip; 2335 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2; 2336 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax; 2337 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2304 &err); 2397 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) { 2398 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */ 2399 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2400 emulate_pf(ctxt, old_tss_base, err);
2308 return ret; 2401 return ret;
2309 } 2402 }
2310 2403
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2314 &err); 2407 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) { 2408 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */ 2409 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2410 emulate_pf(ctxt, old_tss_base, err);
2318 return ret; 2411 return ret;
2319 } 2412 }
2320 2413
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2322 &err); 2415 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) { 2416 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */ 2417 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2418 emulate_pf(ctxt, new_tss_base, err);
2326 return ret; 2419 return ret;
2327 } 2420 }
2328 2421
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2335 ctxt->vcpu, &err); 2428 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) { 2429 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */ 2430 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2431 emulate_pf(ctxt, new_tss_base, err);
2339 return ret; 2432 return ret;
2340 } 2433 }
2341 } 2434 }
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2352 int ret; 2445 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2446 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base = 2447 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); 2448 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
2356 u32 desc_limit; 2449 u32 desc_limit;
2357 2450
2358 /* FIXME: old_tss_base == ~0 ? */ 2451 /* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2369 if (reason != TASK_SWITCH_IRET) { 2462 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl || 2463 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2464 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0); 2465 emulate_gp(ctxt, 0);
2373 return X86EMUL_PROPAGATE_FAULT; 2466 return X86EMUL_PROPAGATE_FAULT;
2374 } 2467 }
2375 } 2468 }
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 if (!next_tss_desc.p || 2471 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2472 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) { 2473 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, 2474 emulate_ts(ctxt, tss_selector & 0xfffc);
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT; 2475 return X86EMUL_PROPAGATE_FAULT;
2384 } 2476 }
2385 2477
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2517 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0; 2518 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code; 2519 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt); 2520 emulate_push(ctxt, ops);
2429 } 2521 }
2430 2522
2431 return ret; 2523 return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2439 struct decode_cache *c = &ctxt->decode; 2531 struct decode_cache *c = &ctxt->decode;
2440 int rc; 2532 int rc;
2441 2533
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip; 2534 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE; 2535 c->dst.type = OP_NONE;
2446 2536
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2537 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code); 2538 has_error_code, error_code);
2449 2539
2450 if (rc == X86EMUL_CONTINUE) { 2540 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops); 2541 rc = writeback(ctxt, ops);
2542 if (rc == X86EMUL_CONTINUE)
2543 ctxt->eip = c->eip;
2454 } 2544 }
2455 2545
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2546 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2474 int rc = X86EMUL_CONTINUE; 2564 int rc = X86EMUL_CONTINUE;
2475 int saved_dst_type = c->dst.type; 2565 int saved_dst_type = c->dst.type;
2476 2566
2477 ctxt->interruptibility = 0; 2567 ctxt->decode.mem_read.pos = 0;
2478
2479 /* Shadow copy of register state. Committed on successful emulation.
2480 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
2481 * modify them.
2482 */
2483
2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2485 2568
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2569 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2570 emulate_ud(ctxt);
2488 goto done; 2571 goto done;
2489 } 2572 }
2490 2573
2491 /* LOCK prefix is allowed only with some instructions */ 2574 /* LOCK prefix is allowed only with some instructions */
2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2575 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2576 emulate_ud(ctxt);
2494 goto done; 2577 goto done;
2495 } 2578 }
2496 2579
2497 /* Privileged instruction can be executed only in CPL=0 */ 2580 /* Privileged instruction can be executed only in CPL=0 */
2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2581 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2499 kvm_inject_gp(ctxt->vcpu, 0); 2582 emulate_gp(ctxt, 0);
2500 goto done; 2583 goto done;
2501 } 2584 }
2502 2585
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 2589 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done: 2590 string_done:
2508 ctxt->restart = false; 2591 ctxt->restart = false;
2509 kvm_rip_write(ctxt->vcpu, c->eip); 2592 ctxt->eip = c->eip;
2510 goto done; 2593 goto done;
2511 } 2594 }
2512 /* The second termination condition only applies for REPE 2595 /* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2529 } 2612 }
2530 2613
2531 if (c->src.type == OP_MEM) { 2614 if (c->src.type == OP_MEM) {
2532 rc = ops->read_emulated((unsigned long)c->src.ptr, 2615 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
2533 &c->src.val, 2616 c->src.valptr, c->src.bytes);
2534 c->src.bytes,
2535 ctxt->vcpu);
2536 if (rc != X86EMUL_CONTINUE) 2617 if (rc != X86EMUL_CONTINUE)
2537 goto done; 2618 goto done;
2538 c->src.orig_val = c->src.val; 2619 c->src.orig_val = c->src.val;
2539 } 2620 }
2540 2621
2541 if (c->src2.type == OP_MEM) { 2622 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr, 2623 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
2543 &c->src2.val, 2624 &c->src2.val, c->src2.bytes);
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE) 2625 if (rc != X86EMUL_CONTINUE)
2547 goto done; 2626 goto done;
2548 } 2627 }
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2553 2632
2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 2633 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2555 /* optimisation - avoid slow emulated read if Mov */ 2634 /* optimisation - avoid slow emulated read if Mov */
2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, 2635 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
2557 c->dst.bytes, ctxt->vcpu); 2636 &c->dst.val, c->dst.bytes);
2558 if (rc != X86EMUL_CONTINUE) 2637 if (rc != X86EMUL_CONTINUE)
2559 goto done; 2638 goto done;
2560 } 2639 }
@@ -2571,7 +2650,7 @@ special_insn:
2571 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 2650 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2572 break; 2651 break;
2573 case 0x06: /* push es */ 2652 case 0x06: /* push es */
2574 emulate_push_sreg(ctxt, VCPU_SREG_ES); 2653 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2575 break; 2654 break;
2576 case 0x07: /* pop es */ 2655 case 0x07: /* pop es */
2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2656 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
2583 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2662 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2584 break; 2663 break;
2585 case 0x0e: /* push cs */ 2664 case 0x0e: /* push cs */
2586 emulate_push_sreg(ctxt, VCPU_SREG_CS); 2665 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2587 break; 2666 break;
2588 case 0x10 ... 0x15: 2667 case 0x10 ... 0x15:
2589 adc: /* adc */ 2668 adc: /* adc */
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 2669 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 break; 2670 break;
2592 case 0x16: /* push ss */ 2671 case 0x16: /* push ss */
2593 emulate_push_sreg(ctxt, VCPU_SREG_SS); 2672 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2594 break; 2673 break;
2595 case 0x17: /* pop ss */ 2674 case 0x17: /* pop ss */
2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2675 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
2602 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 2681 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2603 break; 2682 break;
2604 case 0x1e: /* push ds */ 2683 case 0x1e: /* push ds */
2605 emulate_push_sreg(ctxt, VCPU_SREG_DS); 2684 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2606 break; 2685 break;
2607 case 0x1f: /* pop ds */ 2686 case 0x1f: /* pop ds */
2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2687 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
2632 emulate_1op("dec", c->dst, ctxt->eflags); 2711 emulate_1op("dec", c->dst, ctxt->eflags);
2633 break; 2712 break;
2634 case 0x50 ... 0x57: /* push reg */ 2713 case 0x50 ... 0x57: /* push reg */
2635 emulate_push(ctxt); 2714 emulate_push(ctxt, ops);
2636 break; 2715 break;
2637 case 0x58 ... 0x5f: /* pop reg */ 2716 case 0x58 ... 0x5f: /* pop reg */
2638 pop_instruction: 2717 pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
2641 goto done; 2720 goto done;
2642 break; 2721 break;
2643 case 0x60: /* pusha */ 2722 case 0x60: /* pusha */
2644 emulate_pusha(ctxt); 2723 rc = emulate_pusha(ctxt, ops);
2724 if (rc != X86EMUL_CONTINUE)
2725 goto done;
2645 break; 2726 break;
2646 case 0x61: /* popa */ 2727 case 0x61: /* popa */
2647 rc = emulate_popa(ctxt, ops); 2728 rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
2655 break; 2736 break;
2656 case 0x68: /* push imm */ 2737 case 0x68: /* push imm */
2657 case 0x6a: /* push imm8 */ 2738 case 0x6a: /* push imm8 */
2658 emulate_push(ctxt); 2739 emulate_push(ctxt, ops);
2659 break; 2740 break;
2660 case 0x6c: /* insb */ 2741 case 0x6c: /* insb */
2661 case 0x6d: /* insw/insd */ 2742 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u); 2743 c->dst.bytes = min(c->dst.bytes, 4u);
2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2744 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2664 c->dst.bytes)) { 2745 c->dst.bytes)) {
2665 kvm_inject_gp(ctxt->vcpu, 0); 2746 emulate_gp(ctxt, 0);
2666 goto done; 2747 goto done;
2667 } 2748 }
2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, 2749 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
2674 c->src.bytes = min(c->src.bytes, 4u); 2755 c->src.bytes = min(c->src.bytes, 4u);
2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2756 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2676 c->src.bytes)) { 2757 c->src.bytes)) {
2677 kvm_inject_gp(ctxt->vcpu, 0); 2758 emulate_gp(ctxt, 0);
2678 goto done; 2759 goto done;
2679 } 2760 }
2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], 2761 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
2707 } 2788 }
2708 break; 2789 break;
2709 case 0x84 ... 0x85: 2790 case 0x84 ... 0x85:
2791 test:
2710 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 2792 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
2711 break; 2793 break;
2712 case 0x86 ... 0x87: /* xchg */ 2794 case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
2735 break; 2817 break;
2736 case 0x88 ... 0x8b: /* mov */ 2818 case 0x88 ... 0x8b: /* mov */
2737 goto mov; 2819 goto mov;
2738 case 0x8c: { /* mov r/m, sreg */ 2820 case 0x8c: /* mov r/m, sreg */
2739 struct kvm_segment segreg; 2821 if (c->modrm_reg > VCPU_SREG_GS) {
2740 2822 emulate_ud(ctxt);
2741 if (c->modrm_reg <= VCPU_SREG_GS)
2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2743 else {
2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2745 goto done; 2823 goto done;
2746 } 2824 }
2747 c->dst.val = segreg.selector; 2825 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2748 break; 2826 break;
2749 }
2750 case 0x8d: /* lea r16/r32, m */ 2827 case 0x8d: /* lea r16/r32, m */
2751 c->dst.val = c->modrm_ea; 2828 c->dst.val = c->modrm_ea;
2752 break; 2829 break;
@@ -2757,12 +2834,12 @@ special_insn:
2757 2834
2758 if (c->modrm_reg == VCPU_SREG_CS || 2835 if (c->modrm_reg == VCPU_SREG_CS ||
2759 c->modrm_reg > VCPU_SREG_GS) { 2836 c->modrm_reg > VCPU_SREG_GS) {
2760 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2837 emulate_ud(ctxt);
2761 goto done; 2838 goto done;
2762 } 2839 }
2763 2840
2764 if (c->modrm_reg == VCPU_SREG_SS) 2841 if (c->modrm_reg == VCPU_SREG_SS)
2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); 2842 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2766 2843
2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); 2844 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2768 2845
@@ -2775,19 +2852,19 @@ special_insn:
2775 goto done; 2852 goto done;
2776 break; 2853 break;
2777 case 0x90: /* nop / xchg r8,rax */ 2854 case 0x90: /* nop / xchg r8,rax */
2778 if (!(c->rex_prefix & 1)) { /* nop */ 2855 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
2779 c->dst.type = OP_NONE; 2856 c->dst.type = OP_NONE; /* nop */
2780 break; 2857 break;
2781 } 2858 }
2782 case 0x91 ... 0x97: /* xchg reg,rax */ 2859 case 0x91 ... 0x97: /* xchg reg,rax */
2783 c->src.type = c->dst.type = OP_REG; 2860 c->src.type = OP_REG;
2784 c->src.bytes = c->dst.bytes = c->op_bytes; 2861 c->src.bytes = c->op_bytes;
2785 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; 2862 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2786 c->src.val = *(c->src.ptr); 2863 c->src.val = *(c->src.ptr);
2787 goto xchg; 2864 goto xchg;
2788 case 0x9c: /* pushf */ 2865 case 0x9c: /* pushf */
2789 c->src.val = (unsigned long) ctxt->eflags; 2866 c->src.val = (unsigned long) ctxt->eflags;
2790 emulate_push(ctxt); 2867 emulate_push(ctxt, ops);
2791 break; 2868 break;
2792 case 0x9d: /* popf */ 2869 case 0x9d: /* popf */
2793 c->dst.type = OP_REG; 2870 c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
2797 if (rc != X86EMUL_CONTINUE) 2874 if (rc != X86EMUL_CONTINUE)
2798 goto done; 2875 goto done;
2799 break; 2876 break;
2800 case 0xa0 ... 0xa1: /* mov */ 2877 case 0xa0 ... 0xa3: /* mov */
2801 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2802 c->dst.val = c->src.val;
2803 break;
2804 case 0xa2 ... 0xa3: /* mov */
2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2806 break;
2807 case 0xa4 ... 0xa5: /* movs */ 2878 case 0xa4 ... 0xa5: /* movs */
2808 goto mov; 2879 goto mov;
2809 case 0xa6 ... 0xa7: /* cmps */ 2880 case 0xa6 ... 0xa7: /* cmps */
2810 c->dst.type = OP_NONE; /* Disable writeback. */ 2881 c->dst.type = OP_NONE; /* Disable writeback. */
2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2882 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2812 goto cmp; 2883 goto cmp;
2884 case 0xa8 ... 0xa9: /* test ax, imm */
2885 goto test;
2813 case 0xaa ... 0xab: /* stos */ 2886 case 0xaa ... 0xab: /* stos */
2814 c->dst.val = c->regs[VCPU_REGS_RAX]; 2887 c->dst.val = c->regs[VCPU_REGS_RAX];
2815 break; 2888 break;
@@ -2855,19 +2928,23 @@ special_insn:
2855 long int rel = c->src.val; 2928 long int rel = c->src.val;
2856 c->src.val = (unsigned long) c->eip; 2929 c->src.val = (unsigned long) c->eip;
2857 jmp_rel(c, rel); 2930 jmp_rel(c, rel);
2858 emulate_push(ctxt); 2931 emulate_push(ctxt, ops);
2859 break; 2932 break;
2860 } 2933 }
2861 case 0xe9: /* jmp rel */ 2934 case 0xe9: /* jmp rel */
2862 goto jmp; 2935 goto jmp;
2863 case 0xea: /* jmp far */ 2936 case 0xea: { /* jmp far */
2937 unsigned short sel;
2864 jump_far: 2938 jump_far:
2865 if (load_segment_descriptor(ctxt, ops, c->src2.val, 2939 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2866 VCPU_SREG_CS)) 2940
2941 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2867 goto done; 2942 goto done;
2868 2943
2869 c->eip = c->src.val; 2944 c->eip = 0;
2945 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2870 break; 2946 break;
2947 }
2871 case 0xeb: 2948 case 0xeb:
2872 jmp: /* jmp rel short */ 2949 jmp: /* jmp rel short */
2873 jmp_rel(c, c->src.val); 2950 jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
2879 do_io_in: 2956 do_io_in:
2880 c->dst.bytes = min(c->dst.bytes, 4u); 2957 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2958 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0); 2959 emulate_gp(ctxt, 0);
2883 goto done; 2960 goto done;
2884 } 2961 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 2962 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val)) 2963 &c->dst.val))
2887 goto done; /* IO is needed */ 2964 goto done; /* IO is needed */
2888 break; 2965 break;
2889 case 0xee: /* out al,dx */ 2966 case 0xee: /* out dx,al */
2890 case 0xef: /* out (e/r)ax,dx */ 2967 case 0xef: /* out dx,(e/r)ax */
2891 c->src.val = c->regs[VCPU_REGS_RDX]; 2968 c->src.val = c->regs[VCPU_REGS_RDX];
2892 do_io_out: 2969 do_io_out:
2893 c->dst.bytes = min(c->dst.bytes, 4u); 2970 c->dst.bytes = min(c->dst.bytes, 4u);
2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2971 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2895 kvm_inject_gp(ctxt->vcpu, 0); 2972 emulate_gp(ctxt, 0);
2896 goto done; 2973 goto done;
2897 } 2974 }
2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 2975 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
2916 c->dst.type = OP_NONE; /* Disable writeback. */ 2993 c->dst.type = OP_NONE; /* Disable writeback. */
2917 break; 2994 break;
2918 case 0xfa: /* cli */ 2995 case 0xfa: /* cli */
2919 if (emulator_bad_iopl(ctxt, ops)) 2996 if (emulator_bad_iopl(ctxt, ops)) {
2920 kvm_inject_gp(ctxt->vcpu, 0); 2997 emulate_gp(ctxt, 0);
2921 else { 2998 goto done;
2999 } else {
2922 ctxt->eflags &= ~X86_EFLAGS_IF; 3000 ctxt->eflags &= ~X86_EFLAGS_IF;
2923 c->dst.type = OP_NONE; /* Disable writeback. */ 3001 c->dst.type = OP_NONE; /* Disable writeback. */
2924 } 3002 }
2925 break; 3003 break;
2926 case 0xfb: /* sti */ 3004 case 0xfb: /* sti */
2927 if (emulator_bad_iopl(ctxt, ops)) 3005 if (emulator_bad_iopl(ctxt, ops)) {
2928 kvm_inject_gp(ctxt->vcpu, 0); 3006 emulate_gp(ctxt, 0);
2929 else { 3007 goto done;
2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); 3008 } else {
3009 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2931 ctxt->eflags |= X86_EFLAGS_IF; 3010 ctxt->eflags |= X86_EFLAGS_IF;
2932 c->dst.type = OP_NONE; /* Disable writeback. */ 3011 c->dst.type = OP_NONE; /* Disable writeback. */
2933 } 3012 }
@@ -2964,11 +3043,12 @@ writeback:
2964 c->dst.type = saved_dst_type; 3043 c->dst.type = saved_dst_type;
2965 3044
2966 if ((c->d & SrcMask) == SrcSI) 3045 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, 3046 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
2968 &c->src); 3047 VCPU_REGS_RSI, &c->src);
2969 3048
2970 if ((c->d & DstMask) == DstDI) 3049 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); 3050 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
3051 &c->dst);
2972 3052
2973 if (c->rep_prefix && (c->d & String)) { 3053 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read; 3054 struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
2981 (rc->end != 0 && rc->end == rc->pos)) 3061 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false; 3062 ctxt->restart = false;
2983 } 3063 }
2984 3064 /*
2985 /* Commit shadow register state. */ 3065 * reset read cache here in case string instruction is restarted
2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 3066 * without decoding
2987 kvm_rip_write(ctxt->vcpu, c->eip); 3067 */
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags); 3068 ctxt->decode.mem_read.end = 0;
3069 ctxt->eip = c->eip;
2989 3070
2990done: 3071done:
2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3072 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
3051 c->dst.type = OP_NONE; 3132 c->dst.type = OP_NONE;
3052 break; 3133 break;
3053 case 5: /* not defined */ 3134 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3135 emulate_ud(ctxt);
3055 goto done; 3136 goto done;
3056 case 7: /* invlpg*/ 3137 case 7: /* invlpg*/
3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3138 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
3063 } 3144 }
3064 break; 3145 break;
3065 case 0x05: /* syscall */ 3146 case 0x05: /* syscall */
3066 rc = emulate_syscall(ctxt); 3147 rc = emulate_syscall(ctxt, ops);
3067 if (rc != X86EMUL_CONTINUE) 3148 if (rc != X86EMUL_CONTINUE)
3068 goto done; 3149 goto done;
3069 else 3150 else
@@ -3073,8 +3154,11 @@ twobyte_insn:
3073 emulate_clts(ctxt->vcpu); 3154 emulate_clts(ctxt->vcpu);
3074 c->dst.type = OP_NONE; 3155 c->dst.type = OP_NONE;
3075 break; 3156 break;
3076 case 0x08: /* invd */
3077 case 0x09: /* wbinvd */ 3157 case 0x09: /* wbinvd */
3158 kvm_emulate_wbinvd(ctxt->vcpu);
3159 c->dst.type = OP_NONE;
3160 break;
3161 case 0x08: /* invd */
3078 case 0x0d: /* GrpP (prefetch) */ 3162 case 0x0d: /* GrpP (prefetch) */
3079 case 0x18: /* Grp16 (prefetch/nop) */ 3163 case 0x18: /* Grp16 (prefetch/nop) */
3080 c->dst.type = OP_NONE; 3164 c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
3084 case 1: 3168 case 1:
3085 case 5 ... 7: 3169 case 5 ... 7:
3086 case 9 ... 15: 3170 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3171 emulate_ud(ctxt);
3088 goto done; 3172 goto done;
3089 } 3173 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3174 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
3093 case 0x21: /* mov from dr to reg */ 3177 case 0x21: /* mov from dr to reg */
3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3178 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3179 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3180 emulate_ud(ctxt);
3097 goto done; 3181 goto done;
3098 } 3182 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3183 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3100 c->dst.type = OP_NONE; /* no writeback */ 3184 c->dst.type = OP_NONE; /* no writeback */
3101 break; 3185 break;
3102 case 0x22: /* mov reg, cr */ 3186 case 0x22: /* mov reg, cr */
3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); 3187 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
3188 emulate_gp(ctxt, 0);
3189 goto done;
3190 }
3104 c->dst.type = OP_NONE; 3191 c->dst.type = OP_NONE;
3105 break; 3192 break;
3106 case 0x23: /* mov from reg to dr */ 3193 case 0x23: /* mov from reg to dr */
3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3194 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3195 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3196 emulate_ud(ctxt);
3197 goto done;
3198 }
3199
3200 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3201 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3202 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3203 /* #UD condition is already handled by the code above */
3204 emulate_gp(ctxt, 0);
3110 goto done; 3205 goto done;
3111 } 3206 }
3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); 3207
3113 c->dst.type = OP_NONE; /* no writeback */ 3208 c->dst.type = OP_NONE; /* no writeback */
3114 break; 3209 break;
3115 case 0x30: 3210 case 0x30:
3116 /* wrmsr */ 3211 /* wrmsr */
3117 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3212 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3213 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3214 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3120 kvm_inject_gp(ctxt->vcpu, 0); 3215 emulate_gp(ctxt, 0);
3121 goto done; 3216 goto done;
3122 } 3217 }
3123 rc = X86EMUL_CONTINUE; 3218 rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
3125 break; 3220 break;
3126 case 0x32: 3221 case 0x32:
3127 /* rdmsr */ 3222 /* rdmsr */
3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3223 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3129 kvm_inject_gp(ctxt->vcpu, 0); 3224 emulate_gp(ctxt, 0);
3130 goto done; 3225 goto done;
3131 } else { 3226 } else {
3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3227 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
3136 c->dst.type = OP_NONE; 3231 c->dst.type = OP_NONE;
3137 break; 3232 break;
3138 case 0x34: /* sysenter */ 3233 case 0x34: /* sysenter */
3139 rc = emulate_sysenter(ctxt); 3234 rc = emulate_sysenter(ctxt, ops);
3140 if (rc != X86EMUL_CONTINUE) 3235 if (rc != X86EMUL_CONTINUE)
3141 goto done; 3236 goto done;
3142 else 3237 else
3143 goto writeback; 3238 goto writeback;
3144 break; 3239 break;
3145 case 0x35: /* sysexit */ 3240 case 0x35: /* sysexit */
3146 rc = emulate_sysexit(ctxt); 3241 rc = emulate_sysexit(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE) 3242 if (rc != X86EMUL_CONTINUE)
3148 goto done; 3243 goto done;
3149 else 3244 else
@@ -3160,7 +3255,7 @@ twobyte_insn:
3160 c->dst.type = OP_NONE; 3255 c->dst.type = OP_NONE;
3161 break; 3256 break;
3162 case 0xa0: /* push fs */ 3257 case 0xa0: /* push fs */
3163 emulate_push_sreg(ctxt, VCPU_SREG_FS); 3258 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3164 break; 3259 break;
3165 case 0xa1: /* pop fs */ 3260 case 0xa1: /* pop fs */
3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3261 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
3179 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 3274 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3180 break; 3275 break;
3181 case 0xa8: /* push gs */ 3276 case 0xa8: /* push gs */
3182 emulate_push_sreg(ctxt, VCPU_SREG_GS); 3277 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3183 break; 3278 break;
3184 case 0xa9: /* pop gs */ 3279 case 0xa9: /* pop gs */
3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3280 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
33 34
34#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
36 38
37#include "irq.h" 39#include "irq.h"
38#include "i8254.h" 40#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
243{ 245{
244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
245 irq_ack_notifier); 247 irq_ack_notifier);
246 raw_spin_lock(&ps->inject_lock); 248 int value;
247 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 249
250 spin_lock(&ps->inject_lock);
251 value = atomic_dec_return(&ps->pit_timer.pending);
252 if (value < 0)
253 /* spurious acks can be generated if, for example, the
254 * PIC is being reset. Handle it gracefully here
255 */
248 atomic_inc(&ps->pit_timer.pending); 256 atomic_inc(&ps->pit_timer.pending);
257 else if (value > 0)
258 /* in this case, we had multiple outstanding pit interrupts
259 * that we needed to inject. Reinject
260 */
261 queue_work(ps->pit->wq, &ps->pit->expired);
249 ps->irq_ack = 1; 262 ps->irq_ack = 1;
250 raw_spin_unlock(&ps->inject_lock); 263 spin_unlock(&ps->inject_lock);
251} 264}
252 265
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 266void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 276 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 277}
265 278
266static void destroy_pit_timer(struct kvm_timer *pt) 279static void destroy_pit_timer(struct kvm_pit *pit)
267{ 280{
268 pr_debug("execute del timer!\n"); 281 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
269 hrtimer_cancel(&pt->timer); 282 cancel_work_sync(&pit->expired);
270} 283}
271 284
272static bool kpit_is_periodic(struct kvm_timer *ktimer) 285static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
280 .is_periodic = kpit_is_periodic, 293 .is_periodic = kpit_is_periodic,
281}; 294};
282 295
296static void pit_do_work(struct work_struct *work)
297{
298 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
299 struct kvm *kvm = pit->kvm;
300 struct kvm_vcpu *vcpu;
301 int i;
302 struct kvm_kpit_state *ps = &pit->pit_state;
303 int inject = 0;
304
305 /* Try to inject pending interrupts when
306 * last one has been acked.
307 */
308 spin_lock(&ps->inject_lock);
309 if (ps->irq_ack) {
310 ps->irq_ack = 0;
311 inject = 1;
312 }
313 spin_unlock(&ps->inject_lock);
314 if (inject) {
315 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
316 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
317
318 /*
319 * Provides NMI watchdog support via Virtual Wire mode.
320 * The route is: PIT -> PIC -> LVT0 in NMI mode.
321 *
322 * Note: Our Virtual Wire implementation is simplified, only
323 * propagating PIT interrupts to all VCPUs when they have set
324 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
325 * VCPU0, and only if its LVT0 is in EXTINT mode.
326 */
327 if (kvm->arch.vapics_in_nmi_mode > 0)
328 kvm_for_each_vcpu(i, vcpu, kvm)
329 kvm_apic_nmi_wd_deliver(vcpu);
330 }
331}
332
333static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
334{
335 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
336 struct kvm_pit *pt = ktimer->kvm->arch.vpit;
337
338 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
339 atomic_inc(&ktimer->pending);
340 queue_work(pt->wq, &pt->expired);
341 }
342
343 if (ktimer->t_ops->is_periodic(ktimer)) {
344 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
345 return HRTIMER_RESTART;
346 } else
347 return HRTIMER_NORESTART;
348}
349
283static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 350static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284{ 351{
285 struct kvm_timer *pt = &ps->pit_timer; 352 struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
291 358
292 /* TODO The new value only affected after the retriggered */ 359 /* TODO The new value only affected after the retriggered */
293 hrtimer_cancel(&pt->timer); 360 hrtimer_cancel(&pt->timer);
361 cancel_work_sync(&ps->pit->expired);
294 pt->period = interval; 362 pt->period = interval;
295 ps->is_periodic = is_period; 363 ps->is_periodic = is_period;
296 364
297 pt->timer.function = kvm_timer_fn; 365 pt->timer.function = pit_timer_fn;
298 pt->t_ops = &kpit_ops; 366 pt->t_ops = &kpit_ops;
299 pt->kvm = ps->pit->kvm; 367 pt->kvm = ps->pit->kvm;
300 pt->vcpu = pt->kvm->bsp_vcpu;
301 368
302 atomic_set(&pt->pending, 0); 369 atomic_set(&pt->pending, 0);
303 ps->irq_ack = 1; 370 ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
346 } 413 }
347 break; 414 break;
348 default: 415 default:
349 destroy_pit_timer(&ps->pit_timer); 416 destroy_pit_timer(kvm->arch.vpit);
350 } 417 }
351} 418}
352 419
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
625 692
626 mutex_init(&pit->pit_state.lock); 693 mutex_init(&pit->pit_state.lock);
627 mutex_lock(&pit->pit_state.lock); 694 mutex_lock(&pit->pit_state.lock);
628 raw_spin_lock_init(&pit->pit_state.inject_lock); 695 spin_lock_init(&pit->pit_state.inject_lock);
696
697 pit->wq = create_singlethread_workqueue("kvm-pit-wq");
698 if (!pit->wq) {
699 mutex_unlock(&pit->pit_state.lock);
700 kfree(pit);
701 return NULL;
702 }
703 INIT_WORK(&pit->expired, pit_do_work);
629 704
630 kvm->arch.vpit = pit; 705 kvm->arch.vpit = pit;
631 pit->kvm = kvm; 706 pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
677 struct hrtimer *timer; 752 struct hrtimer *timer;
678 753
679 if (kvm->arch.vpit) { 754 if (kvm->arch.vpit) {
755 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
756 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
757 &kvm->arch.vpit->speaker_dev);
680 kvm_unregister_irq_mask_notifier(kvm, 0, 758 kvm_unregister_irq_mask_notifier(kvm, 0,
681 &kvm->arch.vpit->mask_notifier); 759 &kvm->arch.vpit->mask_notifier);
682 kvm_unregister_irq_ack_notifier(kvm, 760 kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
684 mutex_lock(&kvm->arch.vpit->pit_state.lock); 762 mutex_lock(&kvm->arch.vpit->pit_state.lock);
685 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 763 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
686 hrtimer_cancel(timer); 764 hrtimer_cancel(timer);
765 cancel_work_sync(&kvm->arch.vpit->expired);
687 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 766 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
688 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 767 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
768 destroy_workqueue(kvm->arch.vpit->wq);
689 kfree(kvm->arch.vpit); 769 kfree(kvm->arch.vpit);
690 } 770 }
691} 771}
692
693static void __inject_pit_timer_intr(struct kvm *kvm)
694{
695 struct kvm_vcpu *vcpu;
696 int i;
697
698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
700
701 /*
702 * Provides NMI watchdog support via Virtual Wire mode.
703 * The route is: PIT -> PIC -> LVT0 in NMI mode.
704 *
705 * Note: Our Virtual Wire implementation is simplified, only
706 * propagating PIT interrupts to all VCPUs when they have set
707 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
708 * VCPU0, and only if its LVT0 is in EXTINT mode.
709 */
710 if (kvm->arch.vapics_in_nmi_mode > 0)
711 kvm_for_each_vcpu(i, vcpu, kvm)
712 kvm_apic_nmi_wd_deliver(vcpu);
713}
714
715void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
716{
717 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
718 struct kvm *kvm = vcpu->kvm;
719 struct kvm_kpit_state *ps;
720
721 if (pit) {
722 int inject = 0;
723 ps = &pit->pit_state;
724
725 /* Try to inject pending interrupts when
726 * last one has been acked.
727 */
728 raw_spin_lock(&ps->inject_lock);
729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
730 ps->irq_ack = 0;
731 inject = 1;
732 }
733 raw_spin_unlock(&ps->inject_lock);
734 if (inject)
735 __inject_pit_timer_intr(kvm);
736 }
737}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 raw_spinlock_t inject_lock; 30 spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
@@ -40,6 +40,8 @@ struct kvm_pit {
40 struct kvm_kpit_state pit_state; 40 struct kvm_kpit_state pit_state;
41 int irq_source_id; 41 int irq_source_id;
42 struct kvm_irq_mask_notifier mask_notifier; 42 struct kvm_irq_mask_notifier mask_notifier;
43 struct workqueue_struct *wq;
44 struct work_struct expired;
43}; 45};
44 46
45#define KVM_PIT_BASE_ADDRESS 0x40 47#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
6 * 7 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
33#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
34#include "trace.h" 35#include "trace.h"
35 36
37static void pic_irq_request(struct kvm *kvm, int level);
38
36static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock) 40 __acquires(&s->lock)
38{ 41{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock) 46 __releases(&s->lock)
44{ 47{
45 bool wakeup = s->wakeup_needed; 48 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu; 49 struct kvm_vcpu *vcpu, *found = NULL;
50 int i;
47 51
48 s->wakeup_needed = false; 52 s->wakeup_needed = false;
49 53
50 raw_spin_unlock(&s->lock); 54 raw_spin_unlock(&s->lock);
51 55
52 if (wakeup) { 56 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu; 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
54 if (vcpu) 58 if (kvm_apic_accept_pic_intr(vcpu)) {
55 kvm_vcpu_kick(vcpu); 59 found = vcpu;
60 break;
61 }
62 }
63
64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 kvm_vcpu_kick(found);
56 } 68 }
57} 69}
58 70
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
173 pic_set_irq1(&s->pics[0], 2, 0); 185 pic_set_irq1(&s->pics[0], 2, 0);
174 } 186 }
175 irq = pic_get_irq(&s->pics[0]); 187 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) 188 pic_irq_request(s->kvm, irq >= 0);
177 s->irq_request(s->irq_request_opaque, 1);
178 else
179 s->irq_request(s->irq_request_opaque, 0);
180} 189}
181 190
182void kvm_pic_update_irq(struct kvm_pic *s) 191void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
261void kvm_pic_reset(struct kvm_kpic_state *s) 270void kvm_pic_reset(struct kvm_kpic_state *s)
262{ 271{
263 int irq; 272 int irq;
264 struct kvm *kvm = s->pics_state->irq_request_opaque; 273 struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
265 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
266 u8 irr = s->irr, isr = s->imr; 274 u8 irr = s->irr, isr = s->imr;
267 275
268 s->last_irr = 0; 276 s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
301 /* 309 /*
302 * deassert a pending interrupt 310 * deassert a pending interrupt
303 */ 311 */
304 s->pics_state->irq_request(s->pics_state-> 312 pic_irq_request(s->pics_state->kvm, 0);
305 irq_request_opaque, 0);
306 s->init_state = 1; 313 s->init_state = 1;
307 s->init4 = val & 1; 314 s->init4 = val & 1;
308 if (val & 0x02) 315 if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
356 } 363 }
357 } else 364 } else
358 switch (s->init_state) { 365 switch (s->init_state) {
359 case 0: /* normal mode */ 366 case 0: { /* normal mode */
367 u8 imr_diff = s->imr ^ val,
368 off = (s == &s->pics_state->pics[0]) ? 0 : 8;
360 s->imr = val; 369 s->imr = val;
370 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
371 if (imr_diff & (1 << irq))
372 kvm_fire_mask_notifiers(
373 s->pics_state->kvm,
374 SELECT_PIC(irq + off),
375 irq + off,
376 !!(s->imr & (1 << irq)));
361 pic_update_irq(s->pics_state); 377 pic_update_irq(s->pics_state);
362 break; 378 break;
379 }
363 case 1: 380 case 1:
364 s->irq_base = val & 0xf8; 381 s->irq_base = val & 0xf8;
365 s->init_state = 2; 382 s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
518/* 535/*
519 * callback when PIC0 irq status changed 536 * callback when PIC0 irq status changed
520 */ 537 */
521static void pic_irq_request(void *opaque, int level) 538static void pic_irq_request(struct kvm *kvm, int level)
522{ 539{
523 struct kvm *kvm = opaque;
524 struct kvm_vcpu *vcpu = kvm->bsp_vcpu; 540 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
525 struct kvm_pic *s = pic_irqchip(kvm); 541 struct kvm_pic *s = pic_irqchip(kvm);
526 int irq = pic_get_irq(&s->pics[0]); 542 int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
549 s->kvm = kvm; 565 s->kvm = kvm;
550 s->pics[0].elcr_mask = 0xf8; 566 s->pics[0].elcr_mask = 0xf8;
551 s->pics[1].elcr_mask = 0xde; 567 s->pics[1].elcr_mask = 0xde;
552 s->irq_request = pic_irq_request;
553 s->irq_request_opaque = kvm;
554 s->pics[0].pics_state = s; 568 s->pics[0].pics_state = s;
555 s->pics[1].pics_state = s; 569 s->pics[1].pics_state = s;
556 570
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates.
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
89void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 90void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
90{ 91{
91 kvm_inject_apic_timer_irqs(vcpu); 92 kvm_inject_apic_timer_irqs(vcpu);
92 kvm_inject_pit_timer_irqs(vcpu);
93 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
94} 94}
95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); 95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
38struct kvm; 38struct kvm;
39struct kvm_vcpu; 39struct kvm_vcpu;
40 40
41typedef void irq_request_func(void *opaque, int level);
42
43struct kvm_kpic_state { 41struct kvm_kpic_state {
44 u8 last_irr; /* edge detection */ 42 u8 last_irr; /* edge detection */
45 u8 irr; /* interrupt request register */ 43 u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
67 unsigned pending_acks; 65 unsigned pending_acks;
68 struct kvm *kvm; 66 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
70 irq_request_func *irq_request;
71 void *irq_request_opaque;
72 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
73 struct kvm_io_device dev; 69 struct kvm_io_device dev;
74 void (*ack_notifier)(void *opaque, int irq); 70 void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
36 36
37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38{ 38{
39 might_sleep(); /* on svm */
40
39 if (!test_bit(VCPU_EXREG_PDPTR, 41 if (!test_bit(VCPU_EXREG_PDPTR,
40 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
41 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
69 return kvm_read_cr4_bits(vcpu, ~0UL); 71 return kvm_read_cr4_bits(vcpu, ~0UL);
70} 72}
71 73
74static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
75{
76 return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78}
79
72#endif 80#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
328 "dest_mode 0x%x, short_hand 0x%x\n", 329 "dest_mode 0x%x, short_hand 0x%x\n",
329 target, source, dest, dest_mode, short_hand); 330 target, source, dest, dest_mode, short_hand);
330 331
331 ASSERT(!target); 332 ASSERT(target);
332 switch (short_hand) { 333 switch (short_hand) {
333 case APIC_DEST_NOSHORT: 334 case APIC_DEST_NOSHORT:
334 if (dest_mode == 0) 335 if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
533 struct kvm_vcpu *vcpu = apic->vcpu; 534 struct kvm_vcpu *vcpu = apic->vcpu;
534 struct kvm_run *run = vcpu->run; 535 struct kvm_run *run = vcpu->run;
535 536
536 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 537 kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
537 run->tpr_access.rip = kvm_rip_read(vcpu); 538 run->tpr_access.rip = kvm_rip_read(vcpu);
538 run->tpr_access.is_write = write; 539 run->tpr_access.is_write = write;
539} 540}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1106 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1107 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1107 int r = 0; 1108 int r = 0;
1108 1109
1109 if (kvm_vcpu_is_bsp(vcpu)) { 1110 if (!apic_hw_enabled(vcpu->arch.apic))
1110 if (!apic_hw_enabled(vcpu->arch.apic)) 1111 r = 1;
1111 r = 1; 1112 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1112 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1114 r = 1;
1114 r = 1;
1115 }
1116 return r; 1115 return r;
1117} 1116}
1118 1117
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a6f695d76928..311f6dad8951 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
32#include <linux/compiler.h> 33#include <linux/compiler.h>
33#include <linux/srcu.h> 34#include <linux/srcu.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/uaccess.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/cmpxchg.h> 39#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
90#define PT_FIRST_AVAIL_BITS_SHIFT 9 92#define PT_FIRST_AVAIL_BITS_SHIFT 9
91#define PT64_SECOND_AVAIL_BITS_SHIFT 52 93#define PT64_SECOND_AVAIL_BITS_SHIFT 52
92 94
93#define VALID_PAGE(x) ((x) != INVALID_PAGE)
94
95#define PT64_LEVEL_BITS 9 95#define PT64_LEVEL_BITS 9
96 96
97#define PT64_LEVEL_SHIFT(level) \ 97#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
173 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
174 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
175 175
176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); 176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
177 177
178static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte)
281 281
282static void __set_spte(u64 *sptep, u64 spte) 282static void __set_spte(u64 *sptep, u64 spte)
283{ 283{
284 set_64bit(sptep, spte);
285}
286
287static u64 __xchg_spte(u64 *sptep, u64 new_spte)
288{
284#ifdef CONFIG_X86_64 289#ifdef CONFIG_X86_64
285 set_64bit((unsigned long *)sptep, spte); 290 return xchg(sptep, new_spte);
286#else 291#else
287 set_64bit((unsigned long long *)sptep, spte); 292 u64 old_spte;
293
294 do {
295 old_spte = *sptep;
296 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
297
298 return old_spte;
288#endif 299#endif
289} 300}
290 301
302static void update_spte(u64 *sptep, u64 new_spte)
303{
304 u64 old_spte;
305
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
307 !is_rmap_spte(*sptep))
308 __set_spte(sptep, new_spte);
309 else {
310 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask)
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
313 }
314}
315
291static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292 struct kmem_cache *base_cache, int min) 317 struct kmem_cache *base_cache, int min)
293{ 318{
@@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
304 return 0; 329 return 0;
305} 330}
306 331
307static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 332static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
333 struct kmem_cache *cache)
308{ 334{
309 while (mc->nobjs) 335 while (mc->nobjs)
310 kfree(mc->objects[--mc->nobjs]); 336 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
311} 337}
312 338
313static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +381,11 @@ out:
355 381
356static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 382static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
357{ 383{
358 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); 384 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
359 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); 385 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
360 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 386 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
361 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 387 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
388 mmu_page_header_cache);
362} 389}
363 390
364static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 391static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
379 406
380static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 407static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
381{ 408{
382 kfree(pc); 409 kmem_cache_free(pte_chain_cache, pc);
383} 410}
384 411
385static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 412static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
390 417
391static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 418static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
392{ 419{
393 kfree(rd); 420 kmem_cache_free(rmap_desc_cache, rd);
421}
422
423static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
424{
425 if (!sp->role.direct)
426 return sp->gfns[index];
427
428 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
429}
430
431static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
432{
433 if (sp->role.direct)
434 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
435 else
436 sp->gfns[index] = gfn;
394} 437}
395 438
396/* 439/*
@@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn,
403{ 446{
404 unsigned long idx; 447 unsigned long idx;
405 448
406 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
407 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
408 return &slot->lpage_info[level - 2][idx].write_count; 451 return &slot->lpage_info[level - 2][idx].write_count;
409} 452}
410 453
@@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
414 int *write_count; 457 int *write_count;
415 int i; 458 int i;
416 459
417 gfn = unalias_gfn(kvm, gfn); 460 slot = gfn_to_memslot(kvm, gfn);
418
419 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 for (i = PT_DIRECTORY_LEVEL; 461 for (i = PT_DIRECTORY_LEVEL;
421 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
422 write_count = slot_largepage_idx(gfn, slot, i); 463 write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
430 int *write_count; 471 int *write_count;
431 int i; 472 int i;
432 473
433 gfn = unalias_gfn(kvm, gfn); 474 slot = gfn_to_memslot(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
435 for (i = PT_DIRECTORY_LEVEL; 475 for (i = PT_DIRECTORY_LEVEL;
436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
437 write_count = slot_largepage_idx(gfn, slot, i); 477 write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm,
447 struct kvm_memory_slot *slot; 487 struct kvm_memory_slot *slot;
448 int *largepage_idx; 488 int *largepage_idx;
449 489
450 gfn = unalias_gfn(kvm, gfn); 490 slot = gfn_to_memslot(kvm, gfn);
451 slot = gfn_to_memslot_unaliased(kvm, gfn);
452 if (slot) { 491 if (slot) {
453 largepage_idx = slot_largepage_idx(gfn, slot, level); 492 largepage_idx = slot_largepage_idx(gfn, slot, level);
454 return *largepage_idx; 493 return *largepage_idx;
@@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
501 540
502/* 541/*
503 * Take gfn and return the reverse mapping to it. 542 * Take gfn and return the reverse mapping to it.
504 * Note: gfn must be unaliased before this function get called
505 */ 543 */
506 544
507static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
513 if (likely(level == PT_PAGE_TABLE_LEVEL)) 551 if (likely(level == PT_PAGE_TABLE_LEVEL))
514 return &slot->rmap[gfn - slot->base_gfn]; 552 return &slot->rmap[gfn - slot->base_gfn];
515 553
516 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
517 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
518 556
519 return &slot->lpage_info[level - 2][idx].rmap_pde; 557 return &slot->lpage_info[level - 2][idx].rmap_pde;
520} 558}
@@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
541 579
542 if (!is_rmap_spte(*spte)) 580 if (!is_rmap_spte(*spte))
543 return count; 581 return count;
544 gfn = unalias_gfn(vcpu->kvm, gfn);
545 sp = page_header(__pa(spte)); 582 sp = page_header(__pa(spte));
546 sp->gfns[spte - sp->spt] = gfn; 583 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
547 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 584 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
548 if (!*rmapp) { 585 if (!*rmapp) {
549 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 586 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
600 struct kvm_rmap_desc *desc; 637 struct kvm_rmap_desc *desc;
601 struct kvm_rmap_desc *prev_desc; 638 struct kvm_rmap_desc *prev_desc;
602 struct kvm_mmu_page *sp; 639 struct kvm_mmu_page *sp;
603 pfn_t pfn; 640 gfn_t gfn;
604 unsigned long *rmapp; 641 unsigned long *rmapp;
605 int i; 642 int i;
606 643
607 if (!is_rmap_spte(*spte))
608 return;
609 sp = page_header(__pa(spte)); 644 sp = page_header(__pa(spte));
610 pfn = spte_to_pfn(*spte); 645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
611 if (*spte & shadow_accessed_mask) 646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
612 kvm_set_pfn_accessed(pfn);
613 if (is_writable_pte(*spte))
614 kvm_set_pfn_dirty(pfn);
615 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
616 if (!*rmapp) { 647 if (!*rmapp) {
617 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
618 BUG(); 649 BUG();
@@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
644 } 675 }
645} 676}
646 677
678static void set_spte_track_bits(u64 *sptep, u64 new_spte)
679{
680 pfn_t pfn;
681 u64 old_spte = *sptep;
682
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte);
686 } else
687 old_spte = __xchg_spte(sptep, new_spte);
688
689 if (!is_rmap_spte(old_spte))
690 return;
691 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte))
695 kvm_set_pfn_dirty(pfn);
696}
697
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{
700 set_spte_track_bits(sptep, new_spte);
701 rmap_remove(kvm, sptep);
702}
703
647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
648{ 705{
649 struct kvm_rmap_desc *desc; 706 struct kvm_rmap_desc *desc;
@@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
676 u64 *spte; 733 u64 *spte;
677 int i, write_protected = 0; 734 int i, write_protected = 0;
678 735
679 gfn = unalias_gfn(kvm, gfn);
680 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 736 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
681 737
682 spte = rmap_next(kvm, rmapp, NULL); 738 spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
685 BUG_ON(!(*spte & PT_PRESENT_MASK)); 741 BUG_ON(!(*spte & PT_PRESENT_MASK));
686 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 742 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
687 if (is_writable_pte(*spte)) { 743 if (is_writable_pte(*spte)) {
688 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 744 update_spte(spte, *spte & ~PT_WRITABLE_MASK);
689 write_protected = 1; 745 write_protected = 1;
690 } 746 }
691 spte = rmap_next(kvm, rmapp, spte); 747 spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
709 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 765 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
710 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 766 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
711 if (is_writable_pte(*spte)) { 767 if (is_writable_pte(*spte)) {
712 rmap_remove(kvm, spte); 768 drop_spte(kvm, spte,
769 shadow_trap_nonpresent_pte);
713 --kvm->stat.lpages; 770 --kvm->stat.lpages;
714 __set_spte(spte, shadow_trap_nonpresent_pte);
715 spte = NULL; 771 spte = NULL;
716 write_protected = 1; 772 write_protected = 1;
717 } 773 }
@@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
731 while ((spte = rmap_next(kvm, rmapp, NULL))) { 787 while ((spte = rmap_next(kvm, rmapp, NULL))) {
732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 788 BUG_ON(!(*spte & PT_PRESENT_MASK));
733 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 789 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
734 rmap_remove(kvm, spte); 790 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
735 __set_spte(spte, shadow_trap_nonpresent_pte);
736 need_tlb_flush = 1; 791 need_tlb_flush = 1;
737 } 792 }
738 return need_tlb_flush; 793 return need_tlb_flush;
@@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
754 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 809 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
755 need_flush = 1; 810 need_flush = 1;
756 if (pte_write(*ptep)) { 811 if (pte_write(*ptep)) {
757 rmap_remove(kvm, spte); 812 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
758 __set_spte(spte, shadow_trap_nonpresent_pte);
759 spte = rmap_next(kvm, rmapp, NULL); 813 spte = rmap_next(kvm, rmapp, NULL);
760 } else { 814 } else {
761 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 815 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
763 817
764 new_spte &= ~PT_WRITABLE_MASK; 818 new_spte &= ~PT_WRITABLE_MASK;
765 new_spte &= ~SPTE_HOST_WRITEABLE; 819 new_spte &= ~SPTE_HOST_WRITEABLE;
766 if (is_writable_pte(*spte)) 820 new_spte &= ~shadow_accessed_mask;
767 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 821 set_spte_track_bits(spte, new_spte);
768 __set_spte(spte, new_spte);
769 spte = rmap_next(kvm, rmapp, spte); 822 spte = rmap_next(kvm, rmapp, spte);
770 } 823 }
771 } 824 }
@@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
799 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 852 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
800 853
801 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
802 int idx = gfn_offset; 855 unsigned long idx;
803 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 856 int sh;
857
858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
859 idx = ((memslot->base_gfn+gfn_offset) >> sh) -
860 (memslot->base_gfn >> sh);
804 ret |= handler(kvm, 861 ret |= handler(kvm,
805 &memslot->lpage_info[j][idx].rmap_pde, 862 &memslot->lpage_info[j][idx].rmap_pde,
806 data); 863 data);
@@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
863 920
864 sp = page_header(__pa(spte)); 921 sp = page_header(__pa(spte));
865 922
866 gfn = unalias_gfn(vcpu->kvm, gfn);
867 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 923 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
868 924
869 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 925 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt)
894static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
895{ 951{
896 ASSERT(is_empty_shadow_page(sp->spt)); 952 ASSERT(is_empty_shadow_page(sp->spt));
953 hlist_del(&sp->hash_link);
897 list_del(&sp->link); 954 list_del(&sp->link);
898 __free_page(virt_to_page(sp->spt)); 955 __free_page(virt_to_page(sp->spt));
899 __free_page(virt_to_page(sp->gfns)); 956 if (!sp->role.direct)
900 kfree(sp); 957 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp);
901 ++kvm->arch.n_free_mmu_pages; 959 ++kvm->arch.n_free_mmu_pages;
902} 960}
903 961
@@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
907} 965}
908 966
909static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 967static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
910 u64 *parent_pte) 968 u64 *parent_pte, int direct)
911{ 969{
912 struct kvm_mmu_page *sp; 970 struct kvm_mmu_page *sp;
913 971
914 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 972 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
915 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 973 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 974 if (!direct)
975 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
976 PAGE_SIZE);
917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 977 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 978 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
998 BUG(); 1058 BUG();
999} 1059}
1000 1060
1001
1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1061static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1003{ 1062{
1004 struct kvm_pte_chain *pte_chain; 1063 struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1008 1067
1009 if (!sp->multimapped && sp->parent_pte) { 1068 if (!sp->multimapped && sp->parent_pte) {
1010 parent_sp = page_header(__pa(sp->parent_pte)); 1069 parent_sp = page_header(__pa(sp->parent_pte));
1011 fn(parent_sp); 1070 fn(parent_sp, sp->parent_pte);
1012 mmu_parent_walk(parent_sp, fn);
1013 return; 1071 return;
1014 } 1072 }
1073
1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1074 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1016 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1075 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1017 if (!pte_chain->parent_ptes[i]) 1076 u64 *spte = pte_chain->parent_ptes[i];
1077
1078 if (!spte)
1018 break; 1079 break;
1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1080 parent_sp = page_header(__pa(spte));
1020 fn(parent_sp); 1081 fn(parent_sp, spte);
1021 mmu_parent_walk(parent_sp, fn);
1022 } 1082 }
1023} 1083}
1024 1084
1025static void kvm_mmu_update_unsync_bitmap(u64 *spte) 1085static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1086static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1026{ 1087{
1027 unsigned int index; 1088 mmu_parent_walk(sp, mark_unsync);
1028 struct kvm_mmu_page *sp = page_header(__pa(spte));
1029
1030 index = spte - sp->spt;
1031 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
1032 sp->unsync_children++;
1033 WARN_ON(!sp->unsync_children);
1034} 1089}
1035 1090
1036static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 1091static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1037{ 1092{
1038 struct kvm_pte_chain *pte_chain; 1093 unsigned int index;
1039 struct hlist_node *node;
1040 int i;
1041 1094
1042 if (!sp->parent_pte) 1095 index = spte - sp->spt;
1096 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1043 return; 1097 return;
1044 1098 if (sp->unsync_children++)
1045 if (!sp->multimapped) {
1046 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
1047 return; 1099 return;
1048 } 1100 kvm_mmu_mark_parents_unsync(sp);
1049
1050 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1051 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1052 if (!pte_chain->parent_ptes[i])
1053 break;
1054 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
1055 }
1056}
1057
1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1059{
1060 kvm_mmu_update_parents_unsync(sp);
1061 return 1;
1062}
1063
1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1065{
1066 mmu_parent_walk(sp, unsync_walk_fn);
1067 kvm_mmu_update_parents_unsync(sp);
1068} 1101}
1069 1102
1070static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1103static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1077} 1110}
1078 1111
1079static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1080 struct kvm_mmu_page *sp) 1113 struct kvm_mmu_page *sp, bool clear_unsync)
1081{ 1114{
1082 return 1; 1115 return 1;
1083} 1116}
@@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1123 int i, ret, nr_unsync_leaf = 0; 1156 int i, ret, nr_unsync_leaf = 0;
1124 1157
1125 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1158 for_each_unsync_children(sp->unsync_child_bitmap, i) {
1159 struct kvm_mmu_page *child;
1126 u64 ent = sp->spt[i]; 1160 u64 ent = sp->spt[i];
1127 1161
1128 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { 1162 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1129 struct kvm_mmu_page *child; 1163 goto clear_child_bitmap;
1130 child = page_header(ent & PT64_BASE_ADDR_MASK); 1164
1131 1165 child = page_header(ent & PT64_BASE_ADDR_MASK);
1132 if (child->unsync_children) { 1166
1133 if (mmu_pages_add(pvec, child, i)) 1167 if (child->unsync_children) {
1134 return -ENOSPC; 1168 if (mmu_pages_add(pvec, child, i))
1135 1169 return -ENOSPC;
1136 ret = __mmu_unsync_walk(child, pvec); 1170
1137 if (!ret) 1171 ret = __mmu_unsync_walk(child, pvec);
1138 __clear_bit(i, sp->unsync_child_bitmap); 1172 if (!ret)
1139 else if (ret > 0) 1173 goto clear_child_bitmap;
1140 nr_unsync_leaf += ret; 1174 else if (ret > 0)
1141 else 1175 nr_unsync_leaf += ret;
1142 return ret; 1176 else
1143 } 1177 return ret;
1178 } else if (child->unsync) {
1179 nr_unsync_leaf++;
1180 if (mmu_pages_add(pvec, child, i))
1181 return -ENOSPC;
1182 } else
1183 goto clear_child_bitmap;
1144 1184
1145 if (child->unsync) { 1185 continue;
1146 nr_unsync_leaf++; 1186
1147 if (mmu_pages_add(pvec, child, i)) 1187clear_child_bitmap:
1148 return -ENOSPC; 1188 __clear_bit(i, sp->unsync_child_bitmap);
1149 } 1189 sp->unsync_children--;
1150 } 1190 WARN_ON((int)sp->unsync_children < 0);
1151 } 1191 }
1152 1192
1153 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
1154 sp->unsync_children = 0;
1155 1193
1156 return nr_unsync_leaf; 1194 return nr_unsync_leaf;
1157} 1195}
@@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1166 return __mmu_unsync_walk(sp, pvec); 1204 return __mmu_unsync_walk(sp, pvec);
1167} 1205}
1168 1206
1169static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1170{
1171 unsigned index;
1172 struct hlist_head *bucket;
1173 struct kvm_mmu_page *sp;
1174 struct hlist_node *node;
1175
1176 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1177 index = kvm_page_table_hashfn(gfn);
1178 bucket = &kvm->arch.mmu_page_hash[index];
1179 hlist_for_each_entry(sp, node, bucket, hash_link)
1180 if (sp->gfn == gfn && !sp->role.direct
1181 && !sp->role.invalid) {
1182 pgprintk("%s: found role %x\n",
1183 __func__, sp->role.word);
1184 return sp;
1185 }
1186 return NULL;
1187}
1188
1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1207static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1190{ 1208{
1191 WARN_ON(!sp->unsync); 1209 WARN_ON(!sp->unsync);
@@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1194 --kvm->stat.mmu_unsync; 1212 --kvm->stat.mmu_unsync;
1195} 1213}
1196 1214
1197static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); 1215static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1216 struct list_head *invalid_list);
1217static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1218 struct list_head *invalid_list);
1198 1219
1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1220#define for_each_gfn_sp(kvm, sp, gfn, pos) \
1221 hlist_for_each_entry(sp, pos, \
1222 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1223 if ((sp)->gfn != (gfn)) {} else
1224
1225#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1226 hlist_for_each_entry(sp, pos, \
1227 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1228 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1229 (sp)->role.invalid) {} else
1230
1231/* @sp->gfn should be write-protected at the call site */
1232static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1233 struct list_head *invalid_list, bool clear_unsync)
1200{ 1234{
1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1235 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1202 kvm_mmu_zap_page(vcpu->kvm, sp); 1236 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1203 return 1; 1237 return 1;
1204 } 1238 }
1205 1239
1206 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1240 if (clear_unsync)
1207 kvm_flush_remote_tlbs(vcpu->kvm); 1241 kvm_unlink_unsync_page(vcpu->kvm, sp);
1208 kvm_unlink_unsync_page(vcpu->kvm, sp); 1242
1209 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1210 kvm_mmu_zap_page(vcpu->kvm, sp); 1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1211 return 1; 1245 return 1;
1212 } 1246 }
1213 1247
@@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1215 return 0; 1249 return 0;
1216} 1250}
1217 1251
1252static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1253 struct kvm_mmu_page *sp)
1254{
1255 LIST_HEAD(invalid_list);
1256 int ret;
1257
1258 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1259 if (ret)
1260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1261
1262 return ret;
1263}
1264
1265static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1266 struct list_head *invalid_list)
1267{
1268 return __kvm_sync_page(vcpu, sp, invalid_list, true);
1269}
1270
1271/* @gfn should be write-protected at the call site */
1272static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1273{
1274 struct kvm_mmu_page *s;
1275 struct hlist_node *node;
1276 LIST_HEAD(invalid_list);
1277 bool flush = false;
1278
1279 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1280 if (!s->unsync)
1281 continue;
1282
1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 continue;
1288 }
1289 kvm_unlink_unsync_page(vcpu->kvm, s);
1290 flush = true;
1291 }
1292
1293 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1294 if (flush)
1295 kvm_mmu_flush_tlb(vcpu);
1296}
1297
1218struct mmu_page_path { 1298struct mmu_page_path {
1219 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1299 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1220 unsigned int idx[PT64_ROOT_LEVEL-1]; 1300 unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1281 struct kvm_mmu_page *sp; 1361 struct kvm_mmu_page *sp;
1282 struct mmu_page_path parents; 1362 struct mmu_page_path parents;
1283 struct kvm_mmu_pages pages; 1363 struct kvm_mmu_pages pages;
1364 LIST_HEAD(invalid_list);
1284 1365
1285 kvm_mmu_pages_init(parent, &parents, &pages); 1366 kvm_mmu_pages_init(parent, &parents, &pages);
1286 while (mmu_unsync_walk(parent, &pages)) { 1367 while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1293 kvm_flush_remote_tlbs(vcpu->kvm); 1374 kvm_flush_remote_tlbs(vcpu->kvm);
1294 1375
1295 for_each_sp(pages, sp, parents, i) { 1376 for_each_sp(pages, sp, parents, i) {
1296 kvm_sync_page(vcpu, sp); 1377 kvm_sync_page(vcpu, sp, &invalid_list);
1297 mmu_pages_clear_parents(&parents); 1378 mmu_pages_clear_parents(&parents);
1298 } 1379 }
1380 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1299 cond_resched_lock(&vcpu->kvm->mmu_lock); 1381 cond_resched_lock(&vcpu->kvm->mmu_lock);
1300 kvm_mmu_pages_init(parent, &parents, &pages); 1382 kvm_mmu_pages_init(parent, &parents, &pages);
1301 } 1383 }
@@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1310 u64 *parent_pte) 1392 u64 *parent_pte)
1311{ 1393{
1312 union kvm_mmu_page_role role; 1394 union kvm_mmu_page_role role;
1313 unsigned index;
1314 unsigned quadrant; 1395 unsigned quadrant;
1315 struct hlist_head *bucket;
1316 struct kvm_mmu_page *sp; 1396 struct kvm_mmu_page *sp;
1317 struct hlist_node *node, *tmp; 1397 struct hlist_node *node;
1398 bool need_sync = false;
1318 1399
1319 role = vcpu->arch.mmu.base_role; 1400 role = vcpu->arch.mmu.base_role;
1320 role.level = level; 1401 role.level = level;
@@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1322 if (role.direct) 1403 if (role.direct)
1323 role.cr4_pae = 0; 1404 role.cr4_pae = 0;
1324 role.access = access; 1405 role.access = access;
1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1327 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1328 role.quadrant = quadrant; 1409 role.quadrant = quadrant;
1329 } 1410 }
1330 index = kvm_page_table_hashfn(gfn); 1411 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1331 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1412 if (!need_sync && sp->unsync)
1332 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1413 need_sync = true;
1333 if (sp->gfn == gfn) {
1334 if (sp->unsync)
1335 if (kvm_sync_page(vcpu, sp))
1336 continue;
1337 1414
1338 if (sp->role.word != role.word) 1415 if (sp->role.word != role.word)
1339 continue; 1416 continue;
1340 1417
1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1418 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1342 if (sp->unsync_children) { 1419 break;
1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1420
1344 kvm_mmu_mark_parents_unsync(sp); 1421 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1345 } 1422 if (sp->unsync_children) {
1346 trace_kvm_mmu_get_page(sp, false); 1423 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1347 return sp; 1424 kvm_mmu_mark_parents_unsync(sp);
1348 } 1425 } else if (sp->unsync)
1426 kvm_mmu_mark_parents_unsync(sp);
1427
1428 trace_kvm_mmu_get_page(sp, false);
1429 return sp;
1430 }
1349 ++vcpu->kvm->stat.mmu_cache_miss; 1431 ++vcpu->kvm->stat.mmu_cache_miss;
1350 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1432 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1351 if (!sp) 1433 if (!sp)
1352 return sp; 1434 return sp;
1353 sp->gfn = gfn; 1435 sp->gfn = gfn;
1354 sp->role = role; 1436 sp->role = role;
1355 hlist_add_head(&sp->hash_link, bucket); 1437 hlist_add_head(&sp->hash_link,
1438 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1356 if (!direct) { 1439 if (!direct) {
1357 if (rmap_write_protect(vcpu->kvm, gfn)) 1440 if (rmap_write_protect(vcpu->kvm, gfn))
1358 kvm_flush_remote_tlbs(vcpu->kvm); 1441 kvm_flush_remote_tlbs(vcpu->kvm);
1442 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1443 kvm_sync_pages(vcpu, gfn);
1444
1359 account_shadowed(vcpu->kvm, gfn); 1445 account_shadowed(vcpu->kvm, gfn);
1360 } 1446 }
1361 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1447 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1402 --iterator->level; 1488 --iterator->level;
1403} 1489}
1404 1490
1491static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1492{
1493 u64 spte;
1494
1495 spte = __pa(sp->spt)
1496 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1497 | PT_WRITABLE_MASK | PT_USER_MASK;
1498 __set_spte(sptep, spte);
1499}
1500
1501static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1502{
1503 if (is_large_pte(*sptep)) {
1504 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1505 kvm_flush_remote_tlbs(vcpu->kvm);
1506 }
1507}
1508
1509static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1510 unsigned direct_access)
1511{
1512 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1513 struct kvm_mmu_page *child;
1514
1515 /*
1516 * For the direct sp, if the guest pte's dirty bit
1517 * changed form clean to dirty, it will corrupt the
1518 * sp's access: allow writable in the read-only sp,
1519 * so we should update the spte at this point to get
1520 * a new sp with the correct access.
1521 */
1522 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1523 if (child->role.access == direct_access)
1524 return;
1525
1526 mmu_page_remove_parent_pte(child, sptep);
1527 __set_spte(sptep, shadow_trap_nonpresent_pte);
1528 kvm_flush_remote_tlbs(vcpu->kvm);
1529 }
1530}
1531
1405static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1532static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1406 struct kvm_mmu_page *sp) 1533 struct kvm_mmu_page *sp)
1407{ 1534{
@@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1422 } else { 1549 } else {
1423 if (is_large_pte(ent)) 1550 if (is_large_pte(ent))
1424 --kvm->stat.lpages; 1551 --kvm->stat.lpages;
1425 rmap_remove(kvm, &pt[i]); 1552 drop_spte(kvm, &pt[i],
1553 shadow_trap_nonpresent_pte);
1426 } 1554 }
1427 } 1555 }
1428 pt[i] = shadow_trap_nonpresent_pte; 1556 pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1464} 1592}
1465 1593
1466static int mmu_zap_unsync_children(struct kvm *kvm, 1594static int mmu_zap_unsync_children(struct kvm *kvm,
1467 struct kvm_mmu_page *parent) 1595 struct kvm_mmu_page *parent,
1596 struct list_head *invalid_list)
1468{ 1597{
1469 int i, zapped = 0; 1598 int i, zapped = 0;
1470 struct mmu_page_path parents; 1599 struct mmu_page_path parents;
@@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1478 struct kvm_mmu_page *sp; 1607 struct kvm_mmu_page *sp;
1479 1608
1480 for_each_sp(pages, sp, parents, i) { 1609 for_each_sp(pages, sp, parents, i) {
1481 kvm_mmu_zap_page(kvm, sp); 1610 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1482 mmu_pages_clear_parents(&parents); 1611 mmu_pages_clear_parents(&parents);
1483 zapped++; 1612 zapped++;
1484 } 1613 }
@@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1488 return zapped; 1617 return zapped;
1489} 1618}
1490 1619
1491static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1620static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1621 struct list_head *invalid_list)
1492{ 1622{
1493 int ret; 1623 int ret;
1494 1624
1495 trace_kvm_mmu_zap_page(sp); 1625 trace_kvm_mmu_prepare_zap_page(sp);
1496 ++kvm->stat.mmu_shadow_zapped; 1626 ++kvm->stat.mmu_shadow_zapped;
1497 ret = mmu_zap_unsync_children(kvm, sp); 1627 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1498 kvm_mmu_page_unlink_children(kvm, sp); 1628 kvm_mmu_page_unlink_children(kvm, sp);
1499 kvm_mmu_unlink_parents(kvm, sp); 1629 kvm_mmu_unlink_parents(kvm, sp);
1500 kvm_flush_remote_tlbs(kvm);
1501 if (!sp->role.invalid && !sp->role.direct) 1630 if (!sp->role.invalid && !sp->role.direct)
1502 unaccount_shadowed(kvm, sp->gfn); 1631 unaccount_shadowed(kvm, sp->gfn);
1503 if (sp->unsync) 1632 if (sp->unsync)
1504 kvm_unlink_unsync_page(kvm, sp); 1633 kvm_unlink_unsync_page(kvm, sp);
1505 if (!sp->root_count) { 1634 if (!sp->root_count) {
1506 hlist_del(&sp->hash_link); 1635 /* Count self */
1507 kvm_mmu_free_page(kvm, sp); 1636 ret++;
1637 list_move(&sp->link, invalid_list);
1508 } else { 1638 } else {
1509 sp->role.invalid = 1;
1510 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1639 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1511 kvm_reload_remote_mmus(kvm); 1640 kvm_reload_remote_mmus(kvm);
1512 } 1641 }
1642
1643 sp->role.invalid = 1;
1513 kvm_mmu_reset_last_pte_updated(kvm); 1644 kvm_mmu_reset_last_pte_updated(kvm);
1514 return ret; 1645 return ret;
1515} 1646}
1516 1647
1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1649 struct list_head *invalid_list)
1650{
1651 struct kvm_mmu_page *sp;
1652
1653 if (list_empty(invalid_list))
1654 return;
1655
1656 kvm_flush_remote_tlbs(kvm);
1657
1658 do {
1659 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1660 WARN_ON(!sp->role.invalid || sp->root_count);
1661 kvm_mmu_free_page(kvm, sp);
1662 } while (!list_empty(invalid_list));
1663
1664}
1665
1517/* 1666/*
1518 * Changing the number of mmu pages allocated to the vm 1667 * Changing the number of mmu pages allocated to the vm
1519 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1521void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1522{ 1671{
1523 int used_pages; 1672 int used_pages;
1673 LIST_HEAD(invalid_list);
1524 1674
1525 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1526 used_pages = max(0, used_pages); 1676 used_pages = max(0, used_pages);
@@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1538 1688
1539 page = container_of(kvm->arch.active_mmu_pages.prev, 1689 page = container_of(kvm->arch.active_mmu_pages.prev,
1540 struct kvm_mmu_page, link); 1690 struct kvm_mmu_page, link);
1541 used_pages -= kvm_mmu_zap_page(kvm, page); 1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1542 used_pages--; 1692 &invalid_list);
1543 } 1693 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1544 kvm_nr_mmu_pages = used_pages; 1695 kvm_nr_mmu_pages = used_pages;
1545 kvm->arch.n_free_mmu_pages = 0; 1696 kvm->arch.n_free_mmu_pages = 0;
1546 } 1697 }
@@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1553 1704
1554static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1555{ 1706{
1556 unsigned index;
1557 struct hlist_head *bucket;
1558 struct kvm_mmu_page *sp; 1707 struct kvm_mmu_page *sp;
1559 struct hlist_node *node, *n; 1708 struct hlist_node *node;
1709 LIST_HEAD(invalid_list);
1560 int r; 1710 int r;
1561 1711
1562 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1563 r = 0; 1713 r = 0;
1564 index = kvm_page_table_hashfn(gfn); 1714
1565 bucket = &kvm->arch.mmu_page_hash[index]; 1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1566restart: 1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1717 sp->role.word);
1568 if (sp->gfn == gfn && !sp->role.direct) { 1718 r = 1;
1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1570 sp->role.word); 1720 }
1571 r = 1; 1721 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1572 if (kvm_mmu_zap_page(kvm, sp))
1573 goto restart;
1574 }
1575 return r; 1722 return r;
1576} 1723}
1577 1724
1578static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1725static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1579{ 1726{
1580 unsigned index;
1581 struct hlist_head *bucket;
1582 struct kvm_mmu_page *sp; 1727 struct kvm_mmu_page *sp;
1583 struct hlist_node *node, *nn; 1728 struct hlist_node *node;
1729 LIST_HEAD(invalid_list);
1584 1730
1585 index = kvm_page_table_hashfn(gfn); 1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1586 bucket = &kvm->arch.mmu_page_hash[index]; 1732 pgprintk("%s: zap %lx %x\n",
1587restart: 1733 __func__, gfn, sp->role.word);
1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1589 if (sp->gfn == gfn && !sp->role.direct
1590 && !sp->role.invalid) {
1591 pgprintk("%s: zap %lx %x\n",
1592 __func__, gfn, sp->role.word);
1593 if (kvm_mmu_zap_page(kvm, sp))
1594 goto restart;
1595 }
1596 } 1735 }
1736 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1597} 1737}
1598 1738
1599static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1739static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1723} 1863}
1724EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1864EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1725 1865
1726static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1866static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1727{ 1867{
1728 unsigned index;
1729 struct hlist_head *bucket;
1730 struct kvm_mmu_page *s;
1731 struct hlist_node *node, *n;
1732
1733 index = kvm_page_table_hashfn(sp->gfn);
1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1735 /* don't unsync if pagetable is shadowed with multiple roles */
1736 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1737 if (s->gfn != sp->gfn || s->role.direct)
1738 continue;
1739 if (s->role.word != sp->role.word)
1740 return 1;
1741 }
1742 trace_kvm_mmu_unsync_page(sp); 1868 trace_kvm_mmu_unsync_page(sp);
1743 ++vcpu->kvm->stat.mmu_unsync; 1869 ++vcpu->kvm->stat.mmu_unsync;
1744 sp->unsync = 1; 1870 sp->unsync = 1;
1745 1871
1746 kvm_mmu_mark_parents_unsync(sp); 1872 kvm_mmu_mark_parents_unsync(sp);
1747
1748 mmu_convert_notrap(sp); 1873 mmu_convert_notrap(sp);
1749 return 0; 1874}
1875
1876static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1877{
1878 struct kvm_mmu_page *s;
1879 struct hlist_node *node;
1880
1881 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1882 if (s->unsync)
1883 continue;
1884 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1885 __kvm_unsync_page(vcpu, s);
1886 }
1750} 1887}
1751 1888
1752static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1889static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1753 bool can_unsync) 1890 bool can_unsync)
1754{ 1891{
1755 struct kvm_mmu_page *shadow; 1892 struct kvm_mmu_page *s;
1893 struct hlist_node *node;
1894 bool need_unsync = false;
1756 1895
1757 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1896 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1758 if (shadow) { 1897 if (!can_unsync)
1759 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1760 return 1; 1898 return 1;
1761 if (shadow->unsync) 1899
1762 return 0; 1900 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1763 if (can_unsync && oos_shadow) 1901 return 1;
1764 return kvm_unsync_page(vcpu, shadow); 1902
1765 return 1; 1903 if (!need_unsync && !s->unsync) {
1904 if (!oos_shadow)
1905 return 1;
1906 need_unsync = true;
1907 }
1766 } 1908 }
1909 if (need_unsync)
1910 kvm_unsync_pages(vcpu, gfn);
1767 return 0; 1911 return 0;
1768} 1912}
1769 1913
@@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1804 spte |= (u64)pfn << PAGE_SHIFT; 1948 spte |= (u64)pfn << PAGE_SHIFT;
1805 1949
1806 if ((pte_access & ACC_WRITE_MASK) 1950 if ((pte_access & ACC_WRITE_MASK)
1807 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1952 && !user_fault)) {
1808 1953
1809 if (level > PT_PAGE_TABLE_LEVEL && 1954 if (level > PT_PAGE_TABLE_LEVEL &&
1810 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1955 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1811 ret = 1; 1956 ret = 1;
1812 spte = shadow_trap_nonpresent_pte; 1957 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1813 goto set_pte; 1958 goto done;
1814 } 1959 }
1815 1960
1816 spte |= PT_WRITABLE_MASK; 1961 spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1841 mark_page_dirty(vcpu->kvm, gfn); 1986 mark_page_dirty(vcpu->kvm, gfn);
1842 1987
1843set_pte: 1988set_pte:
1844 __set_spte(sptep, spte); 1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte);
1992done:
1845 return ret; 1993 return ret;
1846} 1994}
1847 1995
@@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1853 bool reset_host_protection) 2001 bool reset_host_protection)
1854{ 2002{
1855 int was_rmapped = 0; 2003 int was_rmapped = 0;
1856 int was_writable = is_writable_pte(*sptep);
1857 int rmap_count; 2004 int rmap_count;
1858 2005
1859 pgprintk("%s: spte %llx access %x write_fault %d" 2006 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,7 +2025,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1878 } else if (pfn != spte_to_pfn(*sptep)) { 2025 } else if (pfn != spte_to_pfn(*sptep)) {
1879 pgprintk("hfn old %lx new %lx\n", 2026 pgprintk("hfn old %lx new %lx\n",
1880 spte_to_pfn(*sptep), pfn); 2027 spte_to_pfn(*sptep), pfn);
1881 rmap_remove(vcpu->kvm, sptep); 2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm);
1882 } else 2030 } else
1883 was_rmapped = 1; 2031 was_rmapped = 1;
1884 } 2032 }
@@ -1888,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1888 reset_host_protection)) { 2036 reset_host_protection)) {
1889 if (write_fault) 2037 if (write_fault)
1890 *ptwrite = 1; 2038 *ptwrite = 1;
1891 kvm_x86_ops->tlb_flush(vcpu); 2039 kvm_mmu_flush_tlb(vcpu);
1892 } 2040 }
1893 2041
1894 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1902,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1902 page_header_update_slot(vcpu->kvm, sptep, gfn); 2050 page_header_update_slot(vcpu->kvm, sptep, gfn);
1903 if (!was_rmapped) { 2051 if (!was_rmapped) {
1904 rmap_count = rmap_add(vcpu, sptep, gfn); 2052 rmap_count = rmap_add(vcpu, sptep, gfn);
1905 kvm_release_pfn_clean(pfn);
1906 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2053 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1907 rmap_recycle(vcpu, sptep, gfn); 2054 rmap_recycle(vcpu, sptep, gfn);
1908 } else {
1909 if (was_writable)
1910 kvm_release_pfn_dirty(pfn);
1911 else
1912 kvm_release_pfn_clean(pfn);
1913 } 2055 }
2056 kvm_release_pfn_clean(pfn);
1914 if (speculative) { 2057 if (speculative) {
1915 vcpu->arch.last_pte_updated = sptep; 2058 vcpu->arch.last_pte_updated = sptep;
1916 vcpu->arch.last_pte_gfn = gfn; 2059 vcpu->arch.last_pte_gfn = gfn;
@@ -1939,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1939 } 2082 }
1940 2083
1941 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2084 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
1942 pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; 2085 u64 base_addr = iterator.addr;
2086
2087 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2088 pseudo_gfn = base_addr >> PAGE_SHIFT;
1943 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2089 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1944 iterator.level - 1, 2090 iterator.level - 1,
1945 1, ACC_ALL, iterator.sptep); 2091 1, ACC_ALL, iterator.sptep);
@@ -1958,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1958 return pt_write; 2104 return pt_write;
1959} 2105}
1960 2106
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2108{
2109 char buf[1];
2110 void __user *hva;
2111 int r;
2112
2113 /* Touch the page, so send SIGBUS */
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116}
2117
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{
2120 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn);
2123 return 0;
2124 } else if (is_fault_pfn(pfn))
2125 return -EFAULT;
2126
2127 return 1;
2128}
2129
1961static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1962{ 2131{
1963 int r; 2132 int r;
@@ -1981,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1981 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2150 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1982 2151
1983 /* mmio */ 2152 /* mmio */
1984 if (is_error_pfn(pfn)) { 2153 if (is_error_pfn(pfn))
1985 kvm_release_pfn_clean(pfn); 2154 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
1986 return 1;
1987 }
1988 2155
1989 spin_lock(&vcpu->kvm->mmu_lock); 2156 spin_lock(&vcpu->kvm->mmu_lock);
1990 if (mmu_notifier_retry(vcpu, mmu_seq)) 2157 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2007,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2007{ 2174{
2008 int i; 2175 int i;
2009 struct kvm_mmu_page *sp; 2176 struct kvm_mmu_page *sp;
2177 LIST_HEAD(invalid_list);
2010 2178
2011 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2012 return; 2180 return;
@@ -2016,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2016 2184
2017 sp = page_header(root); 2185 sp = page_header(root);
2018 --sp->root_count; 2186 --sp->root_count;
2019 if (!sp->root_count && sp->role.invalid) 2187 if (!sp->root_count && sp->role.invalid) {
2020 kvm_mmu_zap_page(vcpu->kvm, sp); 2188 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2189 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2190 }
2021 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2191 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2022 spin_unlock(&vcpu->kvm->mmu_lock); 2192 spin_unlock(&vcpu->kvm->mmu_lock);
2023 return; 2193 return;
@@ -2030,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2030 sp = page_header(root); 2200 sp = page_header(root);
2031 --sp->root_count; 2201 --sp->root_count;
2032 if (!sp->root_count && sp->role.invalid) 2202 if (!sp->root_count && sp->role.invalid)
2033 kvm_mmu_zap_page(vcpu->kvm, sp); 2203 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2204 &invalid_list);
2034 } 2205 }
2035 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2206 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2036 } 2207 }
2208 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2037 spin_unlock(&vcpu->kvm->mmu_lock); 2209 spin_unlock(&vcpu->kvm->mmu_lock);
2038 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2210 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2039} 2211}
@@ -2043,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2043 int ret = 0; 2215 int ret = 0;
2044 2216
2045 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2217 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2046 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2218 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2047 ret = 1; 2219 ret = 1;
2048 } 2220 }
2049 2221
@@ -2071,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2071 root_gfn = 0; 2243 root_gfn = 0;
2072 } 2244 }
2073 spin_lock(&vcpu->kvm->mmu_lock); 2245 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu);
2074 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2075 PT64_ROOT_LEVEL, direct, 2248 PT64_ROOT_LEVEL, direct,
2076 ACC_ALL, NULL); 2249 ACC_ALL, NULL);
@@ -2101,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2101 root_gfn = i << 30; 2274 root_gfn = i << 30;
2102 } 2275 }
2103 spin_lock(&vcpu->kvm->mmu_lock); 2276 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu);
2104 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2105 PT32_ROOT_LEVEL, direct, 2279 PT32_ROOT_LEVEL, direct,
2106 ACC_ALL, NULL); 2280 ACC_ALL, NULL);
@@ -2196,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2196 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2370 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2197 smp_rmb(); 2371 smp_rmb();
2198 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2372 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2199 if (is_error_pfn(pfn)) { 2373 if (is_error_pfn(pfn))
2200 kvm_release_pfn_clean(pfn); 2374 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2201 return 1;
2202 }
2203 spin_lock(&vcpu->kvm->mmu_lock); 2375 spin_lock(&vcpu->kvm->mmu_lock);
2204 if (mmu_notifier_retry(vcpu, mmu_seq)) 2376 if (mmu_notifier_retry(vcpu, mmu_seq))
2205 goto out_unlock; 2377 goto out_unlock;
@@ -2241,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2241void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2413void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2242{ 2414{
2243 ++vcpu->stat.tlb_flush; 2415 ++vcpu->stat.tlb_flush;
2244 kvm_x86_ops->tlb_flush(vcpu); 2416 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2245} 2417}
2246 2418
2247static void paging_new_cr3(struct kvm_vcpu *vcpu) 2419static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2455,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2455static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2627static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2456{ 2628{
2457 ASSERT(vcpu); 2629 ASSERT(vcpu);
2458 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { 2630 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2631 /* mmu.free() should set root_hpa = INVALID_PAGE */
2459 vcpu->arch.mmu.free(vcpu); 2632 vcpu->arch.mmu.free(vcpu);
2460 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2461 }
2462} 2633}
2463 2634
2464int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 2635int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2475,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2475 r = mmu_topup_memory_caches(vcpu); 2646 r = mmu_topup_memory_caches(vcpu);
2476 if (r) 2647 if (r)
2477 goto out; 2648 goto out;
2478 spin_lock(&vcpu->kvm->mmu_lock);
2479 kvm_mmu_free_some_pages(vcpu);
2480 spin_unlock(&vcpu->kvm->mmu_lock);
2481 r = mmu_alloc_roots(vcpu); 2649 r = mmu_alloc_roots(vcpu);
2482 spin_lock(&vcpu->kvm->mmu_lock); 2650 spin_lock(&vcpu->kvm->mmu_lock);
2483 mmu_sync_roots(vcpu); 2651 mmu_sync_roots(vcpu);
@@ -2506,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2506 pte = *spte; 2674 pte = *spte;
2507 if (is_shadow_present_pte(pte)) { 2675 if (is_shadow_present_pte(pte)) {
2508 if (is_last_spte(pte, sp->role.level)) 2676 if (is_last_spte(pte, sp->role.level))
2509 rmap_remove(vcpu->kvm, spte); 2677 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2510 else { 2678 else {
2511 child = page_header(pte & PT64_BASE_ADDR_MASK); 2679 child = page_header(pte & PT64_BASE_ADDR_MASK);
2512 mmu_page_remove_parent_pte(child, spte); 2680 mmu_page_remove_parent_pte(child, spte);
@@ -2527,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2527 return; 2695 return;
2528 } 2696 }
2529 2697
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return;
2700
2530 ++vcpu->kvm->stat.mmu_pte_updated; 2701 ++vcpu->kvm->stat.mmu_pte_updated;
2531 if (!sp->role.cr4_pae) 2702 if (!sp->role.cr4_pae)
2532 paging32_update_pte(vcpu, sp, spte, new); 2703 paging32_update_pte(vcpu, sp, spte, new);
@@ -2547,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new)
2547 return (old & ~new & PT64_PERM_MASK) != 0; 2718 return (old & ~new & PT64_PERM_MASK) != 0;
2548} 2719}
2549 2720
2550static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) 2721static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2722 bool remote_flush, bool local_flush)
2551{ 2723{
2552 if (need_remote_flush(old, new)) 2724 if (zap_page)
2725 return;
2726
2727 if (remote_flush)
2553 kvm_flush_remote_tlbs(vcpu->kvm); 2728 kvm_flush_remote_tlbs(vcpu->kvm);
2554 else 2729 else if (local_flush)
2555 kvm_mmu_flush_tlb(vcpu); 2730 kvm_mmu_flush_tlb(vcpu);
2556} 2731}
2557 2732
@@ -2601,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2601 bool guest_initiated) 2776 bool guest_initiated)
2602{ 2777{
2603 gfn_t gfn = gpa >> PAGE_SHIFT; 2778 gfn_t gfn = gpa >> PAGE_SHIFT;
2779 union kvm_mmu_page_role mask = { .word = 0 };
2604 struct kvm_mmu_page *sp; 2780 struct kvm_mmu_page *sp;
2605 struct hlist_node *node, *n; 2781 struct hlist_node *node;
2606 struct hlist_head *bucket; 2782 LIST_HEAD(invalid_list);
2607 unsigned index;
2608 u64 entry, gentry; 2783 u64 entry, gentry;
2609 u64 *spte; 2784 u64 *spte;
2610 unsigned offset = offset_in_page(gpa); 2785 unsigned offset = offset_in_page(gpa);
@@ -2617,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2617 int npte; 2792 int npte;
2618 int r; 2793 int r;
2619 int invlpg_counter; 2794 int invlpg_counter;
2795 bool remote_flush, local_flush, zap_page;
2796
2797 zap_page = remote_flush = local_flush = false;
2620 2798
2621 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2799 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2622 2800
@@ -2672,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2672 vcpu->arch.last_pte_updated = NULL; 2850 vcpu->arch.last_pte_updated = NULL;
2673 } 2851 }
2674 } 2852 }
2675 index = kvm_page_table_hashfn(gfn);
2676 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2677 2853
2678restart: 2854 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
2679 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2855 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2680 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2681 continue;
2682 pte_size = sp->role.cr4_pae ? 8 : 4; 2856 pte_size = sp->role.cr4_pae ? 8 : 4;
2683 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2857 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2684 misaligned |= bytes < 4; 2858 misaligned |= bytes < 4;
@@ -2695,8 +2869,8 @@ restart:
2695 */ 2869 */
2696 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2870 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2697 gpa, bytes, sp->role.word); 2871 gpa, bytes, sp->role.word);
2698 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2872 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2699 goto restart; 2873 &invalid_list);
2700 ++vcpu->kvm->stat.mmu_flooded; 2874 ++vcpu->kvm->stat.mmu_flooded;
2701 continue; 2875 continue;
2702 } 2876 }
@@ -2720,16 +2894,22 @@ restart:
2720 if (quadrant != sp->role.quadrant) 2894 if (quadrant != sp->role.quadrant)
2721 continue; 2895 continue;
2722 } 2896 }
2897 local_flush = true;
2723 spte = &sp->spt[page_offset / sizeof(*spte)]; 2898 spte = &sp->spt[page_offset / sizeof(*spte)];
2724 while (npte--) { 2899 while (npte--) {
2725 entry = *spte; 2900 entry = *spte;
2726 mmu_pte_write_zap_pte(vcpu, sp, spte); 2901 mmu_pte_write_zap_pte(vcpu, sp, spte);
2727 if (gentry) 2902 if (gentry &&
2903 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
2904 & mask.word))
2728 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2905 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2729 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2906 if (!remote_flush && need_remote_flush(entry, *spte))
2907 remote_flush = true;
2730 ++spte; 2908 ++spte;
2731 } 2909 }
2732 } 2910 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2733 kvm_mmu_audit(vcpu, "post pte write"); 2913 kvm_mmu_audit(vcpu, "post pte write");
2734 spin_unlock(&vcpu->kvm->mmu_lock); 2914 spin_unlock(&vcpu->kvm->mmu_lock);
2735 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2757,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2757 2937
2758void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2759{ 2939{
2760 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && 2940 int free_pages;
2941 LIST_HEAD(invalid_list);
2942
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2944 while (free_pages < KVM_REFILL_PAGES &&
2761 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2762 struct kvm_mmu_page *sp; 2946 struct kvm_mmu_page *sp;
2763 2947
2764 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2765 struct kvm_mmu_page, link); 2949 struct kvm_mmu_page, link);
2766 kvm_mmu_zap_page(vcpu->kvm, sp); 2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2951 &invalid_list);
2767 ++vcpu->kvm->stat.mmu_recycled; 2952 ++vcpu->kvm->stat.mmu_recycled;
2768 } 2953 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2769} 2955}
2770 2956
2771int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2793,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2793 return 1; 2979 return 1;
2794 case EMULATE_DO_MMIO: 2980 case EMULATE_DO_MMIO:
2795 ++vcpu->stat.mmio_exits; 2981 ++vcpu->stat.mmio_exits;
2796 return 0; 2982 /* fall through */
2797 case EMULATE_FAIL: 2983 case EMULATE_FAIL:
2798 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2799 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2800 vcpu->run->internal.ndata = 0;
2801 return 0; 2984 return 0;
2802 default: 2985 default:
2803 BUG(); 2986 BUG();
@@ -2894,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2894 pt = sp->spt; 3077 pt = sp->spt;
2895 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3078 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2896 /* avoid RMW */ 3079 /* avoid RMW */
2897 if (pt[i] & PT_WRITABLE_MASK) 3080 if (is_writable_pte(pt[i]))
2898 pt[i] &= ~PT_WRITABLE_MASK; 3081 pt[i] &= ~PT_WRITABLE_MASK;
2899 } 3082 }
2900 kvm_flush_remote_tlbs(kvm); 3083 kvm_flush_remote_tlbs(kvm);
@@ -2903,28 +3086,29 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2903void kvm_mmu_zap_all(struct kvm *kvm) 3086void kvm_mmu_zap_all(struct kvm *kvm)
2904{ 3087{
2905 struct kvm_mmu_page *sp, *node; 3088 struct kvm_mmu_page *sp, *node;
3089 LIST_HEAD(invalid_list);
2906 3090
2907 spin_lock(&kvm->mmu_lock); 3091 spin_lock(&kvm->mmu_lock);
2908restart: 3092restart:
2909 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3093 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2910 if (kvm_mmu_zap_page(kvm, sp)) 3094 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
2911 goto restart; 3095 goto restart;
2912 3096
3097 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2913 spin_unlock(&kvm->mmu_lock); 3098 spin_unlock(&kvm->mmu_lock);
2914
2915 kvm_flush_remote_tlbs(kvm);
2916} 3099}
2917 3100
2918static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) 3101static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3102 struct list_head *invalid_list)
2919{ 3103{
2920 struct kvm_mmu_page *page; 3104 struct kvm_mmu_page *page;
2921 3105
2922 page = container_of(kvm->arch.active_mmu_pages.prev, 3106 page = container_of(kvm->arch.active_mmu_pages.prev,
2923 struct kvm_mmu_page, link); 3107 struct kvm_mmu_page, link);
2924 return kvm_mmu_zap_page(kvm, page) + 1; 3108 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
2925} 3109}
2926 3110
2927static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) 3111static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2928{ 3112{
2929 struct kvm *kvm; 3113 struct kvm *kvm;
2930 struct kvm *kvm_freed = NULL; 3114 struct kvm *kvm_freed = NULL;
@@ -2934,6 +3118,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2934 3118
2935 list_for_each_entry(kvm, &vm_list, vm_list) { 3119 list_for_each_entry(kvm, &vm_list, vm_list) {
2936 int npages, idx, freed_pages; 3120 int npages, idx, freed_pages;
3121 LIST_HEAD(invalid_list);
2937 3122
2938 idx = srcu_read_lock(&kvm->srcu); 3123 idx = srcu_read_lock(&kvm->srcu);
2939 spin_lock(&kvm->mmu_lock); 3124 spin_lock(&kvm->mmu_lock);
@@ -2941,12 +3126,14 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2941 kvm->arch.n_free_mmu_pages; 3126 kvm->arch.n_free_mmu_pages;
2942 cache_count += npages; 3127 cache_count += npages;
2943 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2944 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); 3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list);
2945 cache_count -= freed_pages; 3131 cache_count -= freed_pages;
2946 kvm_freed = kvm; 3132 kvm_freed = kvm;
2947 } 3133 }
2948 nr_to_scan--; 3134 nr_to_scan--;
2949 3135
3136 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2950 spin_unlock(&kvm->mmu_lock); 3137 spin_unlock(&kvm->mmu_lock);
2951 srcu_read_unlock(&kvm->srcu, idx); 3138 srcu_read_unlock(&kvm->srcu, idx);
2952 } 3139 }
@@ -3072,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3072 3259
3073static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3260static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3074{ 3261{
3075 kvm_set_cr3(vcpu, vcpu->arch.cr3); 3262 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3076 return 1; 3263 return 1;
3077} 3264}
3078 3265
@@ -3329,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3329 struct kvm_mmu_page *rev_sp; 3516 struct kvm_mmu_page *rev_sp;
3330 gfn_t gfn; 3517 gfn_t gfn;
3331 3518
3332 if (*sptep & PT_WRITABLE_MASK) { 3519 if (is_writable_pte(*sptep)) {
3333 rev_sp = page_header(__pa(sptep)); 3520 rev_sp = page_header(__pa(sptep));
3334 gfn = rev_sp->gfns[sptep - rev_sp->spt]; 3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3335 3522
3336 if (!gfn_to_memslot(kvm, gfn)) { 3523 if (!gfn_to_memslot(kvm, gfn)) {
3337 if (!printk_ratelimit()) 3524 if (!printk_ratelimit())
@@ -3345,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3345 return; 3532 return;
3346 } 3533 }
3347 3534
3348 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3349 rev_sp->role.level);
3350 if (!*rmapp) { 3536 if (!*rmapp) {
3351 if (!printk_ratelimit()) 3537 if (!printk_ratelimit())
3352 return; 3538 return;
@@ -3379,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3379 3565
3380 if (!(ent & PT_PRESENT_MASK)) 3566 if (!(ent & PT_PRESENT_MASK))
3381 continue; 3567 continue;
3382 if (!(ent & PT_WRITABLE_MASK)) 3568 if (!is_writable_pte(ent))
3383 continue; 3569 continue;
3384 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3385 } 3571 }
@@ -3407,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3407 if (sp->unsync) 3593 if (sp->unsync)
3408 continue; 3594 continue;
3409 3595
3410 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3411 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3412 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3413 3598
3414 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3415 while (spte) { 3600 while (spte) {
3416 if (*spte & PT_WRITABLE_MASK) 3601 if (is_writable_pte(*spte))
3417 printk(KERN_ERR "%s: (%s) shadow page has " 3602 printk(KERN_ERR "%s: (%s) shadow page has "
3418 "writable mappings: gfn %lx role %x\n", 3603 "writable mappings: gfn %lx role %x\n",
3419 __func__, audit_msg, sp->gfn, 3604 __func__, audit_msg, sp->gfn,
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
190 TP_ARGS(sp) 190 TP_ARGS(sp)
191); 191);
192 192
193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
194 TP_PROTO(struct kvm_mmu_page *sp), 194 TP_PROTO(struct kvm_mmu_page *sp),
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 89d66ca4d87c..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
118{ 119{
119 pt_element_t pte; 120 pt_element_t pte;
120 gfn_t table_gfn; 121 gfn_t table_gfn;
121 unsigned index, pt_access, pte_access; 122 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 123 gpa_t pte_gpa;
123 int rsvd_fault = 0; 124 bool eperm, present, rsvd_fault;
124 125
125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault); 127 fetch_fault);
127walk: 128walk:
129 present = true;
130 eperm = rsvd_fault = false;
128 walker->level = vcpu->arch.mmu.root_level; 131 walker->level = vcpu->arch.mmu.root_level;
129 pte = vcpu->arch.cr3; 132 pte = vcpu->arch.cr3;
130#if PTTYPE == 64 133#if PTTYPE == 64
131 if (!is_long_mode(vcpu)) { 134 if (!is_long_mode(vcpu)) {
132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
133 trace_kvm_mmu_paging_element(pte, walker->level); 136 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte)) 137 if (!is_present_gpte(pte)) {
135 goto not_present; 138 present = false;
139 goto error;
140 }
136 --walker->level; 141 --walker->level;
137 } 142 }
138#endif 143#endif
@@ -150,37 +155,42 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 155 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 156 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 157
153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) 158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
154 goto not_present; 159 present = false;
160 break;
161 }
155 162
156 trace_kvm_mmu_paging_element(pte, walker->level); 163 trace_kvm_mmu_paging_element(pte, walker->level);
157 164
158 if (!is_present_gpte(pte)) 165 if (!is_present_gpte(pte)) {
159 goto not_present; 166 present = false;
167 break;
168 }
160 169
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
162 if (rsvd_fault) 171 rsvd_fault = true;
163 goto access_error; 172 break;
173 }
164 174
165 if (write_fault && !is_writable_pte(pte)) 175 if (write_fault && !is_writable_pte(pte))
166 if (user_fault || is_write_protection(vcpu)) 176 if (user_fault || is_write_protection(vcpu))
167 goto access_error; 177 eperm = true;
168 178
169 if (user_fault && !(pte & PT_USER_MASK)) 179 if (user_fault && !(pte & PT_USER_MASK))
170 goto access_error; 180 eperm = true;
171 181
172#if PTTYPE == 64 182#if PTTYPE == 64
173 if (fetch_fault && (pte & PT64_NX_MASK)) 183 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 184 eperm = true;
175#endif 185#endif
176 186
177 if (!(pte & PT_ACCESSED_MASK)) { 187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
178 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 188 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
179 sizeof(pte)); 189 sizeof(pte));
180 mark_page_dirty(vcpu->kvm, table_gfn);
181 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
182 index, pte, pte|PT_ACCESSED_MASK)) 191 index, pte, pte|PT_ACCESSED_MASK))
183 goto walk; 192 goto walk;
193 mark_page_dirty(vcpu->kvm, table_gfn);
184 pte |= PT_ACCESSED_MASK; 194 pte |= PT_ACCESSED_MASK;
185 } 195 }
186 196
@@ -213,15 +223,18 @@ walk:
213 --walker->level; 223 --walker->level;
214 } 224 }
215 225
226 if (!present || eperm || rsvd_fault)
227 goto error;
228
216 if (write_fault && !is_dirty_gpte(pte)) { 229 if (write_fault && !is_dirty_gpte(pte)) {
217 bool ret; 230 bool ret;
218 231
219 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
220 mark_page_dirty(vcpu->kvm, table_gfn);
221 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
222 pte|PT_DIRTY_MASK); 234 pte|PT_DIRTY_MASK);
223 if (ret) 235 if (ret)
224 goto walk; 236 goto walk;
237 mark_page_dirty(vcpu->kvm, table_gfn);
225 pte |= PT_DIRTY_MASK; 238 pte |= PT_DIRTY_MASK;
226 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
227 } 240 }
@@ -229,22 +242,18 @@ walk:
229 walker->pt_access = pt_access; 242 walker->pt_access = pt_access;
230 walker->pte_access = pte_access; 243 walker->pte_access = pte_access;
231 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 244 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
232 __func__, (u64)pte, pt_access, pte_access); 245 __func__, (u64)pte, pte_access, pt_access);
233 return 1; 246 return 1;
234 247
235not_present: 248error:
236 walker->error_code = 0; 249 walker->error_code = 0;
237 goto err; 250 if (present)
238 251 walker->error_code |= PFERR_PRESENT_MASK;
239access_error:
240 walker->error_code = PFERR_PRESENT_MASK;
241
242err:
243 if (write_fault) 252 if (write_fault)
244 walker->error_code |= PFERR_WRITE_MASK; 253 walker->error_code |= PFERR_WRITE_MASK;
245 if (user_fault) 254 if (user_fault)
246 walker->error_code |= PFERR_USER_MASK; 255 walker->error_code |= PFERR_USER_MASK;
247 if (fetch_fault) 256 if (fetch_fault && is_nx(vcpu))
248 walker->error_code |= PFERR_FETCH_MASK; 257 walker->error_code |= PFERR_FETCH_MASK;
249 if (rsvd_fault) 258 if (rsvd_fault)
250 walker->error_code |= PFERR_RSVD_MASK; 259 walker->error_code |= PFERR_RSVD_MASK;
@@ -252,7 +261,7 @@ err:
252 return 0; 261 return 0;
253} 262}
254 263
255static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
256 u64 *spte, const void *pte) 265 u64 *spte, const void *pte)
257{ 266{
258 pt_element_t gpte; 267 pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 gpte = *(const pt_element_t *)pte; 272 gpte = *(const pt_element_t *)pte;
264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
265 if (!is_present_gpte(gpte)) { 274 if (!is_present_gpte(gpte)) {
266 if (page->unsync) 275 if (sp->unsync)
267 new_spte = shadow_trap_nonpresent_pte; 276 new_spte = shadow_trap_nonpresent_pte;
268 else 277 else
269 new_spte = shadow_notrap_nonpresent_pte; 278 new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
272 return; 281 return;
273 } 282 }
274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
275 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
276 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
277 return; 286 return;
278 pfn = vcpu->arch.update_pte.pfn; 287 pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
285 * we call mmu_set_spte() with reset_host_protection = true beacuse that 294 * we call mmu_set_spte() with reset_host_protection = true beacuse that
286 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
287 */ 296 */
288 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
289 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 298 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
290 gpte_to_gfn(gpte), pfn, true, true); 299 gpte_to_gfn(gpte), pfn, true, true);
291} 300}
292 301
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level)
304{
305 int r;
306 pt_element_t curr_pte;
307
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
309 &curr_pte, sizeof(curr_pte));
310 return r || curr_pte != gw->ptes[level - 1];
311}
312
293/* 313/*
294 * Fetch a shadow pte for a specific level in the paging hierarchy. 314 * Fetch a shadow pte for a specific level in the paging hierarchy.
295 */ 315 */
@@ -299,74 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
299 int *ptwrite, pfn_t pfn) 319 int *ptwrite, pfn_t pfn)
300{ 320{
301 unsigned access = gw->pt_access; 321 unsigned access = gw->pt_access;
302 struct kvm_mmu_page *shadow_page; 322 struct kvm_mmu_page *sp = NULL;
303 u64 spte, *sptep = NULL; 323 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
304 int direct; 324 int top_level;
305 gfn_t table_gfn; 325 unsigned direct_access;
306 int r; 326 struct kvm_shadow_walk_iterator it;
307 int level;
308 pt_element_t curr_pte;
309 struct kvm_shadow_walk_iterator iterator;
310 327
311 if (!is_present_gpte(gw->ptes[gw->level - 1])) 328 if (!is_present_gpte(gw->ptes[gw->level - 1]))
312 return NULL; 329 return NULL;
313 330
314 for_each_shadow_entry(vcpu, addr, iterator) { 331 direct_access = gw->pt_access & gw->pte_access;
315 level = iterator.level; 332 if (!dirty)
316 sptep = iterator.sptep; 333 direct_access &= ~ACC_WRITE_MASK;
317 if (iterator.level == hlevel) {
318 mmu_set_spte(vcpu, sptep, access,
319 gw->pte_access & access,
320 user_fault, write_fault,
321 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
322 ptwrite, level,
323 gw->gfn, pfn, false, true);
324 break;
325 }
326 334
327 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 335 top_level = vcpu->arch.mmu.root_level;
328 continue; 336 if (top_level == PT32E_ROOT_LEVEL)
337 top_level = PT32_ROOT_LEVEL;
338 /*
339 * Verify that the top-level gpte is still there. Since the page
340 * is a root page, it is either write protected (and cannot be
341 * changed from now on) or it is invalid (in which case, we don't
342 * really care if it changes underneath us after this point).
343 */
344 if (FNAME(gpte_changed)(vcpu, gw, top_level))
345 goto out_gpte_changed;
329 346
330 if (is_large_pte(*sptep)) { 347 for (shadow_walk_init(&it, vcpu, addr);
331 rmap_remove(vcpu->kvm, sptep); 348 shadow_walk_okay(&it) && it.level > gw->level;
332 __set_spte(sptep, shadow_trap_nonpresent_pte); 349 shadow_walk_next(&it)) {
333 kvm_flush_remote_tlbs(vcpu->kvm); 350 gfn_t table_gfn;
334 }
335 351
336 if (level <= gw->level) { 352 drop_large_spte(vcpu, it.sptep);
337 int delta = level - gw->level + 1; 353
338 direct = 1; 354 sp = NULL;
339 if (!is_dirty_gpte(gw->ptes[level - delta])) 355 if (!is_shadow_present_pte(*it.sptep)) {
340 access &= ~ACC_WRITE_MASK; 356 table_gfn = gw->table_gfn[it.level - 2];
341 table_gfn = gpte_to_gfn(gw->ptes[level - delta]); 357 sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
342 /* advance table_gfn when emulating 1gb pages with 4k */ 358 false, access, it.sptep);
343 if (delta == 0)
344 table_gfn += PT_INDEX(addr, level);
345 } else {
346 direct = 0;
347 table_gfn = gw->table_gfn[level - 2];
348 }
349 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
350 direct, access, sptep);
351 if (!direct) {
352 r = kvm_read_guest_atomic(vcpu->kvm,
353 gw->pte_gpa[level - 2],
354 &curr_pte, sizeof(curr_pte));
355 if (r || curr_pte != gw->ptes[level - 2]) {
356 kvm_mmu_put_page(shadow_page, sptep);
357 kvm_release_pfn_clean(pfn);
358 sptep = NULL;
359 break;
360 }
361 } 359 }
362 360
363 spte = __pa(shadow_page->spt) 361 /*
364 | PT_PRESENT_MASK | PT_ACCESSED_MASK 362 * Verify that the gpte in the page we've just write
365 | PT_WRITABLE_MASK | PT_USER_MASK; 363 * protected is still there.
366 *sptep = spte; 364 */
365 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
366 goto out_gpte_changed;
367
368 if (sp)
369 link_shadow_page(it.sptep, sp);
367 } 370 }
368 371
369 return sptep; 372 for (;
373 shadow_walk_okay(&it) && it.level > hlevel;
374 shadow_walk_next(&it)) {
375 gfn_t direct_gfn;
376
377 validate_direct_spte(vcpu, it.sptep, direct_access);
378
379 drop_large_spte(vcpu, it.sptep);
380
381 if (is_shadow_present_pte(*it.sptep))
382 continue;
383
384 direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
385
386 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
387 true, direct_access, it.sptep);
388 link_shadow_page(it.sptep, sp);
389 }
390
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true);
394
395 return it.sptep;
396
397out_gpte_changed:
398 if (sp)
399 kvm_mmu_put_page(sp, it.sptep);
400 kvm_release_pfn_clean(pfn);
401 return NULL;
370} 402}
371 403
372/* 404/*
@@ -430,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
430 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
431 463
432 /* mmio */ 464 /* mmio */
433 if (is_error_pfn(pfn)) { 465 if (is_error_pfn(pfn))
434 pgprintk("gfn %lx is mmio\n", walker.gfn); 466 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
435 kvm_release_pfn_clean(pfn);
436 return 1;
437 }
438 467
439 spin_lock(&vcpu->kvm->mmu_lock); 468 spin_lock(&vcpu->kvm->mmu_lock);
440 if (mmu_notifier_retry(vcpu, mmu_seq)) 469 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -442,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
442 kvm_mmu_free_some_pages(vcpu); 471 kvm_mmu_free_some_pages(vcpu);
443 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
444 level, &write_pt, pfn); 473 level, &write_pt, pfn);
474 (void)sptep;
445 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
446 sptep, *sptep, write_pt); 476 sptep, *sptep, write_pt);
447 477
@@ -463,6 +493,7 @@ out_unlock:
463static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 493static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
464{ 494{
465 struct kvm_shadow_walk_iterator iterator; 495 struct kvm_shadow_walk_iterator iterator;
496 struct kvm_mmu_page *sp;
466 gpa_t pte_gpa = -1; 497 gpa_t pte_gpa = -1;
467 int level; 498 int level;
468 u64 *sptep; 499 u64 *sptep;
@@ -474,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
474 level = iterator.level; 505 level = iterator.level;
475 sptep = iterator.sptep; 506 sptep = iterator.sptep;
476 507
508 sp = page_header(__pa(sptep));
477 if (is_last_spte(*sptep, level)) { 509 if (is_last_spte(*sptep, level)) {
478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
479 int offset, shift; 510 int offset, shift;
480 511
512 if (!sp->unsync)
513 break;
514
481 shift = PAGE_SHIFT - 515 shift = PAGE_SHIFT -
482 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; 516 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
483 offset = sp->role.quadrant << shift; 517 offset = sp->role.quadrant << shift;
@@ -486,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
486 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 520 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
487 521
488 if (is_shadow_present_pte(*sptep)) { 522 if (is_shadow_present_pte(*sptep)) {
489 rmap_remove(vcpu->kvm, sptep);
490 if (is_large_pte(*sptep)) 523 if (is_large_pte(*sptep))
491 --vcpu->kvm->stat.lpages; 524 --vcpu->kvm->stat.lpages;
525 drop_spte(vcpu->kvm, sptep,
526 shadow_trap_nonpresent_pte);
492 need_flush = 1; 527 need_flush = 1;
493 } 528 } else
494 __set_spte(sptep, shadow_trap_nonpresent_pte); 529 __set_spte(sptep, shadow_trap_nonpresent_pte);
495 break; 530 break;
496 } 531 }
497 532
498 if (!is_shadow_present_pte(*sptep)) 533 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
499 break; 534 break;
500 } 535 }
501 536
@@ -569,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
569 * Using the cached information from sp->gfns is safe because: 604 * Using the cached information from sp->gfns is safe because:
570 * - The spte has a reference to the struct page, so the pfn for a given gfn 605 * - The spte has a reference to the struct page, so the pfn for a given gfn
571 * can't change unless all sptes pointing to it are nuked first. 606 * can't change unless all sptes pointing to it are nuked first.
572 * - Alias changes zap the entire shadow cache.
573 */ 607 */
574static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
609 bool clear_unsync)
575{ 610{
576 int i, offset, nr_present; 611 int i, offset, nr_present;
577 bool reset_host_protection; 612 bool reset_host_protection;
@@ -579,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
579 614
580 offset = nr_present = 0; 615 offset = nr_present = 0;
581 616
617 /* direct kvm_mmu_page can not be unsync. */
618 BUG_ON(sp->role.direct);
619
582 if (PTTYPE == 32) 620 if (PTTYPE == 32)
583 offset = sp->role.quadrant << PT64_LEVEL_BITS; 621 offset = sp->role.quadrant << PT64_LEVEL_BITS;
584 622
@@ -588,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
588 unsigned pte_access; 626 unsigned pte_access;
589 pt_element_t gpte; 627 pt_element_t gpte;
590 gpa_t pte_gpa; 628 gpa_t pte_gpa;
591 gfn_t gfn = sp->gfns[i]; 629 gfn_t gfn;
592 630
593 if (!is_shadow_present_pte(sp->spt[i])) 631 if (!is_shadow_present_pte(sp->spt[i]))
594 continue; 632 continue;
@@ -599,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
599 sizeof(pt_element_t))) 637 sizeof(pt_element_t)))
600 return -EINVAL; 638 return -EINVAL;
601 639
602 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || 640 gfn = gpte_to_gfn(gpte);
603 !(gpte & PT_ACCESSED_MASK)) { 641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
604 u64 nonpresent; 644 u64 nonpresent;
605 645
606 rmap_remove(vcpu->kvm, &sp->spt[i]); 646 if (is_present_gpte(gpte) || !clear_unsync)
607 if (is_present_gpte(gpte))
608 nonpresent = shadow_trap_nonpresent_pte; 647 nonpresent = shadow_trap_nonpresent_pte;
609 else 648 else
610 nonpresent = shadow_notrap_nonpresent_pte; 649 nonpresent = shadow_notrap_nonpresent_pte;
611 __set_spte(&sp->spt[i], nonpresent); 650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
612 continue; 651 continue;
613 } 652 }
614 653
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..bc5b9b8d4a33 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affilates.
7 * 8 *
8 * Authors: 9 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -130,7 +131,7 @@ static struct svm_direct_access_msrs {
130 u32 index; /* Index of the MSR */ 131 u32 index; /* Index of the MSR */
131 bool always; /* True if intercept is always on */ 132 bool always; /* True if intercept is always on */
132} direct_access_msrs[] = { 133} direct_access_msrs[] = {
133 { .index = MSR_K6_STAR, .always = true }, 134 { .index = MSR_STAR, .always = true },
134 { .index = MSR_IA32_SYSENTER_CS, .always = true }, 135 { .index = MSR_IA32_SYSENTER_CS, .always = true },
135#ifdef CONFIG_X86_64 136#ifdef CONFIG_X86_64
136 { .index = MSR_GS_BASE, .always = true }, 137 { .index = MSR_GS_BASE, .always = true },
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
285 286
286static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
287{ 288{
289 vcpu->arch.efer = efer;
288 if (!npt_enabled && !(efer & EFER_LMA)) 290 if (!npt_enabled && !(efer & EFER_LMA))
289 efer &= ~EFER_LME; 291 efer &= ~EFER_LME;
290 292
291 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
292 vcpu->arch.efer = efer;
293} 294}
294 295
295static int is_external_interrupt(u32 info) 296static int is_external_interrupt(u32 info)
@@ -383,8 +384,7 @@ static void svm_init_erratum_383(void)
383 int err; 384 int err;
384 u64 val; 385 u64 val;
385 386
386 /* Only Fam10h is affected */ 387 if (!cpu_has_amd_erratum(amd_erratum_383))
387 if (boot_cpu_data.x86 != 0x10)
388 return; 388 return;
389 389
390 /* Use _safe variants to not break nested virtualization */ 390 /* Use _safe variants to not break nested virtualization */
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)
640 640
641 if (nested) { 641 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME); 643 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
644 } 644 }
645 645
646 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
@@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm)
806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
807 */ 807 */
808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
809 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 809 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
810 810
811 save->cr4 = X86_CR4_PAE; 811 save->cr4 = X86_CR4_PAE;
812 /* rdx = ?? */ 812 /* rdx = ?? */
@@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
903 svm->asid_generation = 0; 903 svm->asid_generation = 0;
904 init_vmcb(svm); 904 init_vmcb(svm);
905 905
906 fx_init(&svm->vcpu); 906 err = fx_init(&svm->vcpu);
907 if (err)
908 goto free_page4;
909
907 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 910 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
908 if (kvm_vcpu_is_bsp(&svm->vcpu)) 911 if (kvm_vcpu_is_bsp(&svm->vcpu))
909 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 912 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
910 913
911 return &svm->vcpu; 914 return &svm->vcpu;
912 915
916free_page4:
917 __free_page(hsave_page);
913free_page3: 918free_page3:
914 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 919 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
915free_page2: 920free_page2:
@@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
1488 */ 1493 */
1489 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 1494 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1490 1495
1491 set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); 1496 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1492 1497
1493 return; 1498 return;
1494 } 1499 }
@@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm)
1535 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1540 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1541 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1537 if (string || in) 1542 if (string || in)
1538 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 1543 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
1539 1544
1540 port = io_info >> 16; 1545 port = io_info >> 16;
1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1546 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1957 svm->vmcb->save.cr3 = hsave->save.cr3; 1962 svm->vmcb->save.cr3 = hsave->save.cr3;
1958 svm->vcpu.arch.cr3 = hsave->save.cr3; 1963 svm->vcpu.arch.cr3 = hsave->save.cr3;
1959 } else { 1964 } else {
1960 kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1965 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1961 } 1966 }
1962 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 1967 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1963 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 1968 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2085 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2086 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2082 } else 2087 } else
2083 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2088 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2084 2089
2085 /* Guest paging mode is active - reset mmu */ 2090 /* Guest paging mode is active - reset mmu */
2086 kvm_mmu_reset_context(&svm->vcpu); 2091 kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm)
2386 2391
2387static int invlpg_interception(struct vcpu_svm *svm) 2392static int invlpg_interception(struct vcpu_svm *svm)
2388{ 2393{
2389 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2394 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2390 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2391 return 1;
2392} 2395}
2393 2396
2394static int emulate_on_interception(struct vcpu_svm *svm) 2397static int emulate_on_interception(struct vcpu_svm *svm)
2395{ 2398{
2396 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2397 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2398 return 1;
2399} 2400}
2400 2401
2401static int cr8_write_interception(struct vcpu_svm *svm) 2402static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2431 *data = tsc_offset + native_read_tsc(); 2432 *data = tsc_offset + native_read_tsc();
2432 break; 2433 break;
2433 } 2434 }
2434 case MSR_K6_STAR: 2435 case MSR_STAR:
2435 *data = svm->vmcb->save.star; 2436 *data = svm->vmcb->save.star;
2436 break; 2437 break;
2437#ifdef CONFIG_X86_64 2438#ifdef CONFIG_X86_64
@@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2555 2556
2556 break; 2557 break;
2557 } 2558 }
2558 case MSR_K6_STAR: 2559 case MSR_STAR:
2559 svm->vmcb->save.star = data; 2560 svm->vmcb->save.star = data;
2560 break; 2561 break;
2561#ifdef CONFIG_X86_64 2562#ifdef CONFIG_X86_64
@@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2726 [SVM_EXIT_NPF] = pf_interception, 2727 [SVM_EXIT_NPF] = pf_interception,
2727}; 2728};
2728 2729
2730void dump_vmcb(struct kvm_vcpu *vcpu)
2731{
2732 struct vcpu_svm *svm = to_svm(vcpu);
2733 struct vmcb_control_area *control = &svm->vmcb->control;
2734 struct vmcb_save_area *save = &svm->vmcb->save;
2735
2736 pr_err("VMCB Control Area:\n");
2737 pr_err("cr_read: %04x\n", control->intercept_cr_read);
2738 pr_err("cr_write: %04x\n", control->intercept_cr_write);
2739 pr_err("dr_read: %04x\n", control->intercept_dr_read);
2740 pr_err("dr_write: %04x\n", control->intercept_dr_write);
2741 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2742 pr_err("intercepts: %016llx\n", control->intercept);
2743 pr_err("pause filter count: %d\n", control->pause_filter_count);
2744 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
2745 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
2746 pr_err("tsc_offset: %016llx\n", control->tsc_offset);
2747 pr_err("asid: %d\n", control->asid);
2748 pr_err("tlb_ctl: %d\n", control->tlb_ctl);
2749 pr_err("int_ctl: %08x\n", control->int_ctl);
2750 pr_err("int_vector: %08x\n", control->int_vector);
2751 pr_err("int_state: %08x\n", control->int_state);
2752 pr_err("exit_code: %08x\n", control->exit_code);
2753 pr_err("exit_info1: %016llx\n", control->exit_info_1);
2754 pr_err("exit_info2: %016llx\n", control->exit_info_2);
2755 pr_err("exit_int_info: %08x\n", control->exit_int_info);
2756 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
2757 pr_err("nested_ctl: %lld\n", control->nested_ctl);
2758 pr_err("nested_cr3: %016llx\n", control->nested_cr3);
2759 pr_err("event_inj: %08x\n", control->event_inj);
2760 pr_err("event_inj_err: %08x\n", control->event_inj_err);
2761 pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
2762 pr_err("next_rip: %016llx\n", control->next_rip);
2763 pr_err("VMCB State Save Area:\n");
2764 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
2765 save->es.selector, save->es.attrib,
2766 save->es.limit, save->es.base);
2767 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
2768 save->cs.selector, save->cs.attrib,
2769 save->cs.limit, save->cs.base);
2770 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
2771 save->ss.selector, save->ss.attrib,
2772 save->ss.limit, save->ss.base);
2773 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
2774 save->ds.selector, save->ds.attrib,
2775 save->ds.limit, save->ds.base);
2776 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
2777 save->fs.selector, save->fs.attrib,
2778 save->fs.limit, save->fs.base);
2779 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
2780 save->gs.selector, save->gs.attrib,
2781 save->gs.limit, save->gs.base);
2782 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
2783 save->gdtr.selector, save->gdtr.attrib,
2784 save->gdtr.limit, save->gdtr.base);
2785 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
2786 save->ldtr.selector, save->ldtr.attrib,
2787 save->ldtr.limit, save->ldtr.base);
2788 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
2789 save->idtr.selector, save->idtr.attrib,
2790 save->idtr.limit, save->idtr.base);
2791 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
2792 save->tr.selector, save->tr.attrib,
2793 save->tr.limit, save->tr.base);
2794 pr_err("cpl: %d efer: %016llx\n",
2795 save->cpl, save->efer);
2796 pr_err("cr0: %016llx cr2: %016llx\n",
2797 save->cr0, save->cr2);
2798 pr_err("cr3: %016llx cr4: %016llx\n",
2799 save->cr3, save->cr4);
2800 pr_err("dr6: %016llx dr7: %016llx\n",
2801 save->dr6, save->dr7);
2802 pr_err("rip: %016llx rflags: %016llx\n",
2803 save->rip, save->rflags);
2804 pr_err("rsp: %016llx rax: %016llx\n",
2805 save->rsp, save->rax);
2806 pr_err("star: %016llx lstar: %016llx\n",
2807 save->star, save->lstar);
2808 pr_err("cstar: %016llx sfmask: %016llx\n",
2809 save->cstar, save->sfmask);
2810 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
2811 save->kernel_gs_base, save->sysenter_cs);
2812 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
2813 save->sysenter_esp, save->sysenter_eip);
2814 pr_err("gpat: %016llx dbgctl: %016llx\n",
2815 save->g_pat, save->dbgctl);
2816 pr_err("br_from: %016llx br_to: %016llx\n",
2817 save->br_from, save->br_to);
2818 pr_err("excp_from: %016llx excp_to: %016llx\n",
2819 save->last_excp_from, save->last_excp_to);
2820
2821}
2822
2729static int handle_exit(struct kvm_vcpu *vcpu) 2823static int handle_exit(struct kvm_vcpu *vcpu)
2730{ 2824{
2731 struct vcpu_svm *svm = to_svm(vcpu); 2825 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2864 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2771 kvm_run->fail_entry.hardware_entry_failure_reason 2865 kvm_run->fail_entry.hardware_entry_failure_reason
2772 = svm->vmcb->control.exit_code; 2866 = svm->vmcb->control.exit_code;
2867 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
2868 dump_vmcb(vcpu);
2773 return 0; 2869 return 0;
2774 } 2870 }
2775 2871
@@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2826{ 2922{
2827 struct vmcb_control_area *control; 2923 struct vmcb_control_area *control;
2828 2924
2829 trace_kvm_inj_virq(irq);
2830
2831 ++svm->vcpu.stat.irq_injections;
2832 control = &svm->vmcb->control; 2925 control = &svm->vmcb->control;
2833 control->int_vector = irq; 2926 control->int_vector = irq;
2834 control->int_ctl &= ~V_INTR_PRIO_MASK; 2927 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
2842 2935
2843 BUG_ON(!(gif_set(svm))); 2936 BUG_ON(!(gif_set(svm)));
2844 2937
2938 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
2939 ++vcpu->stat.irq_injections;
2940
2845 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 2941 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2846 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 2942 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2847} 2943}
@@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void)
3327 return false; 3423 return false;
3328} 3424}
3329 3425
3426static bool svm_has_wbinvd_exit(void)
3427{
3428 return true;
3429}
3430
3330static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) 3431static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3331{ 3432{
3332 struct vcpu_svm *svm = to_svm(vcpu); 3433 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {
3411 .rdtscp_supported = svm_rdtscp_supported, 3512 .rdtscp_supported = svm_rdtscp_supported,
3412 3513
3413 .set_supported_cpuid = svm_set_supported_cpuid, 3514 .set_supported_cpuid = svm_set_supported_cpuid,
3515
3516 .has_wbinvd_exit = svm_has_wbinvd_exit,
3414}; 3517};
3415 3518
3416static int __init svm_init(void) 3519static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affilates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
1#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
2#include <linux/kvm.h> 16#include <linux/kvm.h>
3#include <linux/hrtimer.h> 17#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 32 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
19 atomic_inc(&ktimer->pending); 33 atomic_inc(&ktimer->pending);
20 /* FIXME: this code should not know anything about vcpus */ 34 /* FIXME: this code should not know anything about vcpus */
21 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 35 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
22 } 36 }
23 37
24 if (waitqueue_active(q)) 38 if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 859a01a07dbf..49b25eee25ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
36#include <asm/vmx.h> 37#include <asm/vmx.h>
37#include <asm/virtext.h> 38#include <asm/virtext.h>
38#include <asm/mce.h> 39#include <asm/mce.h>
40#include <asm/i387.h>
41#include <asm/xcr.h>
39 42
40#include "trace.h" 43#include "trace.h"
41 44
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
63static int __read_mostly emulate_invalid_guest_state = 0; 66static int __read_mostly emulate_invalid_guest_state = 0;
64module_param(emulate_invalid_guest_state, bool, S_IRUGO); 67module_param(emulate_invalid_guest_state, bool, S_IRUGO);
65 68
69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO);
71
66#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
67 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
68#define KVM_GUEST_CR0_MASK \ 74#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
173 179
174static int init_rmode(struct kvm *kvm); 180static int init_rmode(struct kvm *kvm);
175static u64 construct_eptp(unsigned long root_hpa); 181static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void);
176 184
177static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
178static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
179static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 187static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
188static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
180 189
181static unsigned long *vmx_io_bitmap_a; 190static unsigned long *vmx_io_bitmap_a;
182static unsigned long *vmx_io_bitmap_b; 191static unsigned long *vmx_io_bitmap_b;
@@ -231,14 +240,14 @@ static u64 host_efer;
231static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 240static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
232 241
233/* 242/*
234 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it 243 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
235 * away by decrementing the array size. 244 * away by decrementing the array size.
236 */ 245 */
237static const u32 vmx_msr_index[] = { 246static const u32 vmx_msr_index[] = {
238#ifdef CONFIG_X86_64 247#ifdef CONFIG_X86_64
239 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 248 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
240#endif 249#endif
241 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, 250 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
242}; 251};
243#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 252#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
244 253
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 343 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
335} 344}
336 345
346static inline bool cpu_has_vmx_ept_4levels(void)
347{
348 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
349}
350
337static inline bool cpu_has_vmx_invept_individual_addr(void) 351static inline bool cpu_has_vmx_invept_individual_addr(void)
338{ 352{
339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 353 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 363 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
350} 364}
351 365
366static inline bool cpu_has_vmx_invvpid_single(void)
367{
368 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
369}
370
371static inline bool cpu_has_vmx_invvpid_global(void)
372{
373 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
374}
375
352static inline bool cpu_has_vmx_ept(void) 376static inline bool cpu_has_vmx_ept(void)
353{ 377{
354 return vmcs_config.cpu_based_2nd_exec_ctrl & 378 return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 413 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
390} 414}
391 415
416static inline bool cpu_has_vmx_wbinvd_exit(void)
417{
418 return vmcs_config.cpu_based_2nd_exec_ctrl &
419 SECONDARY_EXEC_WBINVD_EXITING;
420}
421
392static inline bool report_flexpriority(void) 422static inline bool report_flexpriority(void)
393{ 423{
394 return flexpriority_enabled; 424 return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
453 vmcs, phys_addr); 483 vmcs, phys_addr);
454} 484}
455 485
486static void vmcs_load(struct vmcs *vmcs)
487{
488 u64 phys_addr = __pa(vmcs);
489 u8 error;
490
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory");
494 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
496 vmcs, phys_addr);
497}
498
456static void __vcpu_clear(void *arg) 499static void __vcpu_clear(void *arg)
457{ 500{
458 struct vcpu_vmx *vmx = arg; 501 struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
475 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 518 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
476} 519}
477 520
478static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 521static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
479{ 522{
480 if (vmx->vpid == 0) 523 if (vmx->vpid == 0)
481 return; 524 return;
482 525
483 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 526 if (cpu_has_vmx_invvpid_single())
527 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
528}
529
530static inline void vpid_sync_vcpu_global(void)
531{
532 if (cpu_has_vmx_invvpid_global())
533 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
534}
535
536static inline void vpid_sync_context(struct vcpu_vmx *vmx)
537{
538 if (cpu_has_vmx_invvpid_single())
539 vpid_sync_vcpu_single(vmx);
540 else
541 vpid_sync_vcpu_global();
484} 542}
485 543
486static inline void ept_sync_global(void) 544static inline void ept_sync_global(void)
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
812 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 870 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
813 } 871 }
814#endif 872#endif
873 if (current_thread_info()->status & TS_USEDFPU)
874 clts();
875 load_gdt(&__get_cpu_var(host_gdt));
815} 876}
816 877
817static void vmx_load_host_state(struct vcpu_vmx *vmx) 878static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
828static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 889static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
829{ 890{
830 struct vcpu_vmx *vmx = to_vmx(vcpu); 891 struct vcpu_vmx *vmx = to_vmx(vcpu);
831 u64 phys_addr = __pa(vmx->vmcs);
832 u64 tsc_this, delta, new_offset; 892 u64 tsc_this, delta, new_offset;
893 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
833 894
834 if (vcpu->cpu != cpu) { 895 if (!vmm_exclusive)
896 kvm_cpu_vmxon(phys_addr);
897 else if (vcpu->cpu != cpu)
835 vcpu_clear(vmx); 898 vcpu_clear(vmx);
836 kvm_migrate_timers(vcpu);
837 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
838 local_irq_disable();
839 list_add(&vmx->local_vcpus_link,
840 &per_cpu(vcpus_on_cpu, cpu));
841 local_irq_enable();
842 }
843 899
844 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 900 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
845 u8 error;
846
847 per_cpu(current_vmcs, cpu) = vmx->vmcs; 901 per_cpu(current_vmcs, cpu) = vmx->vmcs;
848 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 902 vmcs_load(vmx->vmcs);
849 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
850 : "cc");
851 if (error)
852 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
853 vmx->vmcs, phys_addr);
854 } 903 }
855 904
856 if (vcpu->cpu != cpu) { 905 if (vcpu->cpu != cpu) {
857 struct desc_ptr dt; 906 struct desc_ptr dt;
858 unsigned long sysenter_esp; 907 unsigned long sysenter_esp;
859 908
909 kvm_migrate_timers(vcpu);
910 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
911 local_irq_disable();
912 list_add(&vmx->local_vcpus_link,
913 &per_cpu(vcpus_on_cpu, cpu));
914 local_irq_enable();
915
860 vcpu->cpu = cpu; 916 vcpu->cpu = cpu;
861 /* 917 /*
862 * Linux uses per-cpu TSS and GDT, so set these when switching 918 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 940static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
885{ 941{
886 __vmx_load_host_state(to_vmx(vcpu)); 942 __vmx_load_host_state(to_vmx(vcpu));
943 if (!vmm_exclusive) {
944 __vcpu_clear(to_vmx(vcpu));
945 kvm_cpu_vmxoff();
946 }
887} 947}
888 948
889static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 949static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
1057 if (index >= 0 && vmx->rdtscp_enabled) 1117 if (index >= 0 && vmx->rdtscp_enabled)
1058 move_msr_up(vmx, index, save_nmsrs++); 1118 move_msr_up(vmx, index, save_nmsrs++);
1059 /* 1119 /*
1060 * MSR_K6_STAR is only needed on long mode guests, and only 1120 * MSR_STAR is only needed on long mode guests, and only
1061 * if efer.sce is enabled. 1121 * if efer.sce is enabled.
1062 */ 1122 */
1063 index = __find_msr_index(vmx, MSR_K6_STAR); 1123 index = __find_msr_index(vmx, MSR_STAR);
1064 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 1124 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
1065 move_msr_up(vmx, index, save_nmsrs++); 1125 move_msr_up(vmx, index, save_nmsrs++);
1066 } 1126 }
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
1286 /* locked but not enabled */ 1346 /* locked but not enabled */
1287} 1347}
1288 1348
1349static void kvm_cpu_vmxon(u64 addr)
1350{
1351 asm volatile (ASM_VMX_VMXON_RAX
1352 : : "a"(&addr), "m"(addr)
1353 : "memory", "cc");
1354}
1355
1289static int hardware_enable(void *garbage) 1356static int hardware_enable(void *garbage)
1290{ 1357{
1291 int cpu = raw_smp_processor_id(); 1358 int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 1375 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1309 } 1376 }
1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1377 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1311 asm volatile (ASM_VMX_VMXON_RAX
1312 : : "a"(&phys_addr), "m"(phys_addr)
1313 : "memory", "cc");
1314 1378
1315 ept_sync_global(); 1379 if (vmm_exclusive) {
1380 kvm_cpu_vmxon(phys_addr);
1381 ept_sync_global();
1382 }
1383
1384 store_gdt(&__get_cpu_var(host_gdt));
1316 1385
1317 return 0; 1386 return 0;
1318} 1387}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
1334static void kvm_cpu_vmxoff(void) 1403static void kvm_cpu_vmxoff(void)
1335{ 1404{
1336 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1405 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1337 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1338} 1406}
1339 1407
1340static void hardware_disable(void *garbage) 1408static void hardware_disable(void *garbage)
1341{ 1409{
1342 vmclear_local_vcpus(); 1410 if (vmm_exclusive) {
1343 kvm_cpu_vmxoff(); 1411 vmclear_local_vcpus();
1412 kvm_cpu_vmxoff();
1413 }
1414 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1344} 1415}
1345 1416
1346static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1417static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
1539 if (!cpu_has_vmx_vpid()) 1610 if (!cpu_has_vmx_vpid())
1540 enable_vpid = 0; 1611 enable_vpid = 0;
1541 1612
1542 if (!cpu_has_vmx_ept()) { 1613 if (!cpu_has_vmx_ept() ||
1614 !cpu_has_vmx_ept_4levels()) {
1543 enable_ept = 0; 1615 enable_ept = 0;
1544 enable_unrestricted_guest = 0; 1616 enable_unrestricted_guest = 0;
1545 } 1617 }
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1628 gfn_t base_gfn; 1700 gfn_t base_gfn;
1629 1701
1630 slots = kvm_memslots(kvm); 1702 slots = kvm_memslots(kvm);
1631 base_gfn = kvm->memslots->memslots[0].base_gfn + 1703 base_gfn = slots->memslots[0].base_gfn +
1632 kvm->memslots->memslots[0].npages - 3; 1704 kvm->memslots->memslots[0].npages - 3;
1633 return base_gfn << PAGE_SHIFT; 1705 return base_gfn << PAGE_SHIFT;
1634 } 1706 }
@@ -1744,27 +1816,27 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1744 (guest_tr_ar & ~AR_TYPE_MASK) 1816 (guest_tr_ar & ~AR_TYPE_MASK)
1745 | AR_TYPE_BUSY_64_TSS); 1817 | AR_TYPE_BUSY_64_TSS);
1746 } 1818 }
1747 vcpu->arch.efer |= EFER_LMA; 1819 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
1748 vmx_set_efer(vcpu, vcpu->arch.efer);
1749} 1820}
1750 1821
1751static void exit_lmode(struct kvm_vcpu *vcpu) 1822static void exit_lmode(struct kvm_vcpu *vcpu)
1752{ 1823{
1753 vcpu->arch.efer &= ~EFER_LMA;
1754
1755 vmcs_write32(VM_ENTRY_CONTROLS, 1824 vmcs_write32(VM_ENTRY_CONTROLS,
1756 vmcs_read32(VM_ENTRY_CONTROLS) 1825 vmcs_read32(VM_ENTRY_CONTROLS)
1757 & ~VM_ENTRY_IA32E_MODE); 1826 & ~VM_ENTRY_IA32E_MODE);
1758 vmx_set_efer(vcpu, vcpu->arch.efer); 1827 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
1759} 1828}
1760 1829
1761#endif 1830#endif
1762 1831
1763static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1832static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1764{ 1833{
1765 vpid_sync_vcpu_all(to_vmx(vcpu)); 1834 vpid_sync_context(to_vmx(vcpu));
1766 if (enable_ept) 1835 if (enable_ept) {
1836 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1837 return;
1767 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1838 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1839 }
1768} 1840}
1769 1841
1770static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1842static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2510,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2510 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2582 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2511 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2583 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2512 2584
2513 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 2585 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2514 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 2586 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2515 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 2587 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2516 2588
@@ -2602,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2602 2674
2603static int init_rmode(struct kvm *kvm) 2675static int init_rmode(struct kvm *kvm)
2604{ 2676{
2677 int idx, ret = 0;
2678
2679 idx = srcu_read_lock(&kvm->srcu);
2605 if (!init_rmode_tss(kvm)) 2680 if (!init_rmode_tss(kvm))
2606 return 0; 2681 goto exit;
2607 if (!init_rmode_identity_map(kvm)) 2682 if (!init_rmode_identity_map(kvm))
2608 return 0; 2683 goto exit;
2609 return 1; 2684
2685 ret = 1;
2686exit:
2687 srcu_read_unlock(&kvm->srcu, idx);
2688 return ret;
2610} 2689}
2611 2690
2612static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2691static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2613{ 2692{
2614 struct vcpu_vmx *vmx = to_vmx(vcpu); 2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
2615 u64 msr; 2694 u64 msr;
2616 int ret, idx; 2695 int ret;
2617 2696
2618 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2697 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2619 idx = srcu_read_lock(&vcpu->kvm->srcu);
2620 if (!init_rmode(vmx->vcpu.kvm)) { 2698 if (!init_rmode(vmx->vcpu.kvm)) {
2621 ret = -ENOMEM; 2699 ret = -ENOMEM;
2622 goto out; 2700 goto out;
@@ -2633,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2633 msr |= MSR_IA32_APICBASE_BSP; 2711 msr |= MSR_IA32_APICBASE_BSP;
2634 kvm_set_apic_base(&vmx->vcpu, msr); 2712 kvm_set_apic_base(&vmx->vcpu, msr);
2635 2713
2636 fx_init(&vmx->vcpu); 2714 ret = fx_init(&vmx->vcpu);
2715 if (ret != 0)
2716 goto out;
2637 2717
2638 seg_setup(VCPU_SREG_CS); 2718 seg_setup(VCPU_SREG_CS);
2639 /* 2719 /*
@@ -2716,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2716 vmx_fpu_activate(&vmx->vcpu); 2796 vmx_fpu_activate(&vmx->vcpu);
2717 update_exception_bitmap(&vmx->vcpu); 2797 update_exception_bitmap(&vmx->vcpu);
2718 2798
2719 vpid_sync_vcpu_all(vmx); 2799 vpid_sync_context(vmx);
2720 2800
2721 ret = 0; 2801 ret = 0;
2722 2802
@@ -2724,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2724 vmx->emulation_required = 0; 2804 vmx->emulation_required = 0;
2725 2805
2726out: 2806out:
2727 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2728 return ret; 2807 return ret;
2729} 2808}
2730 2809
@@ -2829,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2829{ 2908{
2830 if (!cpu_has_virtual_nmis()) 2909 if (!cpu_has_virtual_nmis())
2831 return to_vmx(vcpu)->soft_vnmi_blocked; 2910 return to_vmx(vcpu)->soft_vnmi_blocked;
2832 else 2911 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2833 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2834 GUEST_INTR_STATE_NMI);
2835} 2912}
2836 2913
2837static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 2914static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3073,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3073 ++vcpu->stat.io_exits; 3150 ++vcpu->stat.io_exits;
3074 3151
3075 if (string || in) 3152 if (string || in)
3076 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 3153 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3077 3154
3078 port = exit_qualification >> 16; 3155 port = exit_qualification >> 16;
3079 size = (exit_qualification & 7) + 1; 3156 size = (exit_qualification & 7) + 1;
@@ -3093,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3093 hypercall[2] = 0xc1; 3170 hypercall[2] = 0xc1;
3094} 3171}
3095 3172
3173static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3174{
3175 if (err)
3176 kvm_inject_gp(vcpu, 0);
3177 else
3178 skip_emulated_instruction(vcpu);
3179}
3180
3096static int handle_cr(struct kvm_vcpu *vcpu) 3181static int handle_cr(struct kvm_vcpu *vcpu)
3097{ 3182{
3098 unsigned long exit_qualification, val; 3183 unsigned long exit_qualification, val;
3099 int cr; 3184 int cr;
3100 int reg; 3185 int reg;
3186 int err;
3101 3187
3102 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3188 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3103 cr = exit_qualification & 15; 3189 cr = exit_qualification & 15;
@@ -3108,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3108 trace_kvm_cr_write(cr, val); 3194 trace_kvm_cr_write(cr, val);
3109 switch (cr) { 3195 switch (cr) {
3110 case 0: 3196 case 0:
3111 kvm_set_cr0(vcpu, val); 3197 err = kvm_set_cr0(vcpu, val);
3112 skip_emulated_instruction(vcpu); 3198 complete_insn_gp(vcpu, err);
3113 return 1; 3199 return 1;
3114 case 3: 3200 case 3:
3115 kvm_set_cr3(vcpu, val); 3201 err = kvm_set_cr3(vcpu, val);
3116 skip_emulated_instruction(vcpu); 3202 complete_insn_gp(vcpu, err);
3117 return 1; 3203 return 1;
3118 case 4: 3204 case 4:
3119 kvm_set_cr4(vcpu, val); 3205 err = kvm_set_cr4(vcpu, val);
3120 skip_emulated_instruction(vcpu); 3206 complete_insn_gp(vcpu, err);
3121 return 1; 3207 return 1;
3122 case 8: { 3208 case 8: {
3123 u8 cr8_prev = kvm_get_cr8(vcpu); 3209 u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3324,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
3324static int handle_wbinvd(struct kvm_vcpu *vcpu) 3410static int handle_wbinvd(struct kvm_vcpu *vcpu)
3325{ 3411{
3326 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3327 /* TODO: Add support for VT-d/pass-through device */ 3413 kvm_emulate_wbinvd(vcpu);
3328 return 1; 3414 return 1;
3329} 3415}
3330 3416
3331static int handle_apic_access(struct kvm_vcpu *vcpu) 3417static int handle_xsetbv(struct kvm_vcpu *vcpu)
3332{ 3418{
3333 unsigned long exit_qualification; 3419 u64 new_bv = kvm_read_edx_eax(vcpu);
3334 enum emulation_result er; 3420 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3335 unsigned long offset;
3336 3421
3337 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3422 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
3338 offset = exit_qualification & 0xffful; 3423 skip_emulated_instruction(vcpu);
3339
3340 er = emulate_instruction(vcpu, 0, 0, 0);
3341
3342 if (er != EMULATE_DONE) {
3343 printk(KERN_ERR
3344 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3345 offset);
3346 return -ENOEXEC;
3347 }
3348 return 1; 3424 return 1;
3349} 3425}
3350 3426
3427static int handle_apic_access(struct kvm_vcpu *vcpu)
3428{
3429 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3430}
3431
3351static int handle_task_switch(struct kvm_vcpu *vcpu) 3432static int handle_task_switch(struct kvm_vcpu *vcpu)
3352{ 3433{
3353 struct vcpu_vmx *vmx = to_vmx(vcpu); 3434 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3557,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3557 goto out; 3638 goto out;
3558 } 3639 }
3559 3640
3560 if (err != EMULATE_DONE) { 3641 if (err != EMULATE_DONE)
3561 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3642 return 0;
3562 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3563 vcpu->run->internal.ndata = 0;
3564 ret = 0;
3565 goto out;
3566 }
3567 3643
3568 if (signal_pending(current)) 3644 if (signal_pending(current))
3569 goto out; 3645 goto out;
@@ -3626,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3626 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3702 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3627 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3703 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3628 [EXIT_REASON_WBINVD] = handle_wbinvd, 3704 [EXIT_REASON_WBINVD] = handle_wbinvd,
3705 [EXIT_REASON_XSETBV] = handle_xsetbv,
3629 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3706 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3630 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3707 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3631 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3708 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3659,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3659 if (enable_ept && is_paging(vcpu)) 3736 if (enable_ept && is_paging(vcpu))
3660 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3737 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3661 3738
3739 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3740 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3741 vcpu->run->fail_entry.hardware_entry_failure_reason
3742 = exit_reason;
3743 return 0;
3744 }
3745
3662 if (unlikely(vmx->fail)) { 3746 if (unlikely(vmx->fail)) {
3663 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3747 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3664 vcpu->run->fail_entry.hardware_entry_failure_reason 3748 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3864,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3864 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3948 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3865 vmx_set_interrupt_shadow(vcpu, 0); 3949 vmx_set_interrupt_shadow(vcpu, 0);
3866 3950
3867 /*
3868 * Loading guest fpu may have cleared host cr0.ts
3869 */
3870 vmcs_writel(HOST_CR0, read_cr0());
3871
3872 asm( 3951 asm(
3873 /* Store host registers */ 3952 /* Store host registers */
3874 "push %%"R"dx; push %%"R"bp;" 3953 "push %%"R"dx; push %%"R"bp;"
@@ -4004,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4004 kmem_cache_free(kvm_vcpu_cache, vmx); 4083 kmem_cache_free(kvm_vcpu_cache, vmx);
4005} 4084}
4006 4085
4086static inline void vmcs_init(struct vmcs *vmcs)
4087{
4088 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4089
4090 if (!vmm_exclusive)
4091 kvm_cpu_vmxon(phys_addr);
4092
4093 vmcs_clear(vmcs);
4094
4095 if (!vmm_exclusive)
4096 kvm_cpu_vmxoff();
4097}
4098
4007static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 4099static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4008{ 4100{
4009 int err; 4101 int err;
@@ -4029,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4029 if (!vmx->vmcs) 4121 if (!vmx->vmcs)
4030 goto free_msrs; 4122 goto free_msrs;
4031 4123
4032 vmcs_clear(vmx->vmcs); 4124 vmcs_init(vmx->vmcs);
4033 4125
4034 cpu = get_cpu(); 4126 cpu = get_cpu();
4035 vmx_vcpu_load(&vmx->vcpu, cpu); 4127 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4268,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4268 .rdtscp_supported = vmx_rdtscp_supported, 4360 .rdtscp_supported = vmx_rdtscp_supported,
4269 4361
4270 .set_supported_cpuid = vmx_set_supported_cpuid, 4362 .set_supported_cpuid = vmx_set_supported_cpuid,
4363
4364 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4271}; 4365};
4272 4366
4273static int __init vmx_init(void) 4367static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05d571f6f196..25f19078b321 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affilates.
9 * 10 *
10 * Authors: 11 * Authors:
11 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,19 @@
41#include <linux/srcu.h> 42#include <linux/srcu.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h>
44#include <trace/events/kvm.h> 46#include <trace/events/kvm.h>
45 47
46#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
47#include "trace.h" 49#include "trace.h"
48 50
49#include <asm/debugreg.h> 51#include <asm/debugreg.h>
50#include <asm/uaccess.h>
51#include <asm/msr.h> 52#include <asm/msr.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
53#include <asm/mtrr.h> 54#include <asm/mtrr.h>
54#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h>
57#include <asm/xcr.h>
55 58
56#define MAX_IO_MSRS 256 59#define MAX_IO_MSRS 256
57#define CR0_RESERVED_BITS \ 60#define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
62 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
63 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
64 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
68 | X86_CR4_OSXSAVE \
65 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
66 70
67#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147 { NULL } 151 { NULL }
148}; 152};
149 153
154u64 __read_mostly host_xcr0;
155
156static inline u32 bit(int bitno)
157{
158 return 1 << (bitno & 31);
159}
160
150static void kvm_on_user_return(struct user_return_notifier *urn) 161static void kvm_on_user_return(struct user_return_notifier *urn)
151{ 162{
152 unsigned slot; 163 unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
285 prev_nr = vcpu->arch.exception.nr; 296 prev_nr = vcpu->arch.exception.nr;
286 if (prev_nr == DF_VECTOR) { 297 if (prev_nr == DF_VECTOR) {
287 /* triple fault -> shutdown */ 298 /* triple fault -> shutdown */
288 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 299 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
289 return; 300 return;
290 } 301 }
291 class1 = exception_class(prev_nr); 302 class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
414 return changed; 425 return changed;
415} 426}
416 427
417void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 428int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
418{ 429{
430 unsigned long old_cr0 = kvm_read_cr0(vcpu);
431 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
432 X86_CR0_CD | X86_CR0_NW;
433
419 cr0 |= X86_CR0_ET; 434 cr0 |= X86_CR0_ET;
420 435
421#ifdef CONFIG_X86_64 436#ifdef CONFIG_X86_64
422 if (cr0 & 0xffffffff00000000UL) { 437 if (cr0 & 0xffffffff00000000UL)
423 kvm_inject_gp(vcpu, 0); 438 return 1;
424 return;
425 }
426#endif 439#endif
427 440
428 cr0 &= ~CR0_RESERVED_BITS; 441 cr0 &= ~CR0_RESERVED_BITS;
429 442
430 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
431 kvm_inject_gp(vcpu, 0); 444 return 1;
432 return;
433 }
434 445
435 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
436 kvm_inject_gp(vcpu, 0); 447 return 1;
437 return;
438 }
439 448
440 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 449 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
441#ifdef CONFIG_X86_64 450#ifdef CONFIG_X86_64
442 if ((vcpu->arch.efer & EFER_LME)) { 451 if ((vcpu->arch.efer & EFER_LME)) {
443 int cs_db, cs_l; 452 int cs_db, cs_l;
444 453
445 if (!is_pae(vcpu)) { 454 if (!is_pae(vcpu))
446 kvm_inject_gp(vcpu, 0); 455 return 1;
447 return;
448 }
449 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 456 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
450 if (cs_l) { 457 if (cs_l)
451 kvm_inject_gp(vcpu, 0); 458 return 1;
452 return;
453
454 }
455 } else 459 } else
456#endif 460#endif
457 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
458 kvm_inject_gp(vcpu, 0); 462 return 1;
459 return;
460 }
461
462 } 463 }
463 464
464 kvm_x86_ops->set_cr0(vcpu, cr0); 465 kvm_x86_ops->set_cr0(vcpu, cr0);
465 466
466 kvm_mmu_reset_context(vcpu); 467 if ((cr0 ^ old_cr0) & update_bits)
467 return; 468 kvm_mmu_reset_context(vcpu);
469 return 0;
468} 470}
469EXPORT_SYMBOL_GPL(kvm_set_cr0); 471EXPORT_SYMBOL_GPL(kvm_set_cr0);
470 472
471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 473void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
472{ 474{
473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 475 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
474} 476}
475EXPORT_SYMBOL_GPL(kvm_lmsw); 477EXPORT_SYMBOL_GPL(kvm_lmsw);
476 478
477void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 479int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
478{ 480{
479 unsigned long old_cr4 = kvm_read_cr4(vcpu); 481 u64 xcr0;
480 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
481 482
482 if (cr4 & CR4_RESERVED_BITS) { 483 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
484 if (index != XCR_XFEATURE_ENABLED_MASK)
485 return 1;
486 xcr0 = xcr;
487 if (kvm_x86_ops->get_cpl(vcpu) != 0)
488 return 1;
489 if (!(xcr0 & XSTATE_FP))
490 return 1;
491 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
492 return 1;
493 if (xcr0 & ~host_xcr0)
494 return 1;
495 vcpu->arch.xcr0 = xcr0;
496 vcpu->guest_xcr0_loaded = 0;
497 return 0;
498}
499
500int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
501{
502 if (__kvm_set_xcr(vcpu, index, xcr)) {
483 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
504 return 1;
505 }
506 return 0;
507}
508EXPORT_SYMBOL_GPL(kvm_set_xcr);
509
510static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
511{
512 struct kvm_cpuid_entry2 *best;
513
514 best = kvm_find_cpuid_entry(vcpu, 1, 0);
515 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
516}
517
518static void update_cpuid(struct kvm_vcpu *vcpu)
519{
520 struct kvm_cpuid_entry2 *best;
521
522 best = kvm_find_cpuid_entry(vcpu, 1, 0);
523 if (!best)
484 return; 524 return;
525
526 /* Update OSXSAVE bit */
527 if (cpu_has_xsave && best->function == 0x1) {
528 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
529 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
530 best->ecx |= bit(X86_FEATURE_OSXSAVE);
485 } 531 }
532}
533
534int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
535{
536 unsigned long old_cr4 = kvm_read_cr4(vcpu);
537 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
538
539 if (cr4 & CR4_RESERVED_BITS)
540 return 1;
541
542 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
543 return 1;
486 544
487 if (is_long_mode(vcpu)) { 545 if (is_long_mode(vcpu)) {
488 if (!(cr4 & X86_CR4_PAE)) { 546 if (!(cr4 & X86_CR4_PAE))
489 kvm_inject_gp(vcpu, 0); 547 return 1;
490 return;
491 }
492 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
493 && ((cr4 ^ old_cr4) & pdptr_bits) 549 && ((cr4 ^ old_cr4) & pdptr_bits)
494 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 550 && !load_pdptrs(vcpu, vcpu->arch.cr3))
495 kvm_inject_gp(vcpu, 0); 551 return 1;
496 return; 552
497 } 553 if (cr4 & X86_CR4_VMXE)
554 return 1;
498 555
499 if (cr4 & X86_CR4_VMXE) {
500 kvm_inject_gp(vcpu, 0);
501 return;
502 }
503 kvm_x86_ops->set_cr4(vcpu, cr4); 556 kvm_x86_ops->set_cr4(vcpu, cr4);
504 vcpu->arch.cr4 = cr4; 557
505 kvm_mmu_reset_context(vcpu); 558 if ((cr4 ^ old_cr4) & pdptr_bits)
559 kvm_mmu_reset_context(vcpu);
560
561 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
562 update_cpuid(vcpu);
563
564 return 0;
506} 565}
507EXPORT_SYMBOL_GPL(kvm_set_cr4); 566EXPORT_SYMBOL_GPL(kvm_set_cr4);
508 567
509void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 568int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
510{ 569{
511 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
512 kvm_mmu_sync_roots(vcpu); 571 kvm_mmu_sync_roots(vcpu);
513 kvm_mmu_flush_tlb(vcpu); 572 kvm_mmu_flush_tlb(vcpu);
514 return; 573 return 0;
515 } 574 }
516 575
517 if (is_long_mode(vcpu)) { 576 if (is_long_mode(vcpu)) {
518 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 577 if (cr3 & CR3_L_MODE_RESERVED_BITS)
519 kvm_inject_gp(vcpu, 0); 578 return 1;
520 return;
521 }
522 } else { 579 } else {
523 if (is_pae(vcpu)) { 580 if (is_pae(vcpu)) {
524 if (cr3 & CR3_PAE_RESERVED_BITS) { 581 if (cr3 & CR3_PAE_RESERVED_BITS)
525 kvm_inject_gp(vcpu, 0); 582 return 1;
526 return; 583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
527 } 584 return 1;
528 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
529 kvm_inject_gp(vcpu, 0);
530 return;
531 }
532 } 585 }
533 /* 586 /*
534 * We don't check reserved bits in nonpae mode, because 587 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
546 * to debug) behavior on the guest side. 599 * to debug) behavior on the guest side.
547 */ 600 */
548 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
549 kvm_inject_gp(vcpu, 0); 602 return 1;
550 else { 603 vcpu->arch.cr3 = cr3;
551 vcpu->arch.cr3 = cr3; 604 vcpu->arch.mmu.new_cr3(vcpu);
552 vcpu->arch.mmu.new_cr3(vcpu); 605 return 0;
553 }
554} 606}
555EXPORT_SYMBOL_GPL(kvm_set_cr3); 607EXPORT_SYMBOL_GPL(kvm_set_cr3);
556 608
557void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 609int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
558{ 610{
559 if (cr8 & CR8_RESERVED_BITS) { 611 if (cr8 & CR8_RESERVED_BITS)
560 kvm_inject_gp(vcpu, 0); 612 return 1;
561 return;
562 }
563 if (irqchip_in_kernel(vcpu->kvm)) 613 if (irqchip_in_kernel(vcpu->kvm))
564 kvm_lapic_set_tpr(vcpu, cr8); 614 kvm_lapic_set_tpr(vcpu, cr8);
565 else 615 else
566 vcpu->arch.cr8 = cr8; 616 vcpu->arch.cr8 = cr8;
617 return 0;
618}
619
620void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
621{
622 if (__kvm_set_cr8(vcpu, cr8))
623 kvm_inject_gp(vcpu, 0);
567} 624}
568EXPORT_SYMBOL_GPL(kvm_set_cr8); 625EXPORT_SYMBOL_GPL(kvm_set_cr8);
569 626
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
576} 633}
577EXPORT_SYMBOL_GPL(kvm_get_cr8); 634EXPORT_SYMBOL_GPL(kvm_get_cr8);
578 635
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 636static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{ 637{
581 switch (dr) { 638 switch (dr) {
582 case 0 ... 3: 639 case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
585 vcpu->arch.eff_db[dr] = val; 642 vcpu->arch.eff_db[dr] = val;
586 break; 643 break;
587 case 4: 644 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 645 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
589 kvm_queue_exception(vcpu, UD_VECTOR); 646 return 1; /* #UD */
590 return 1;
591 }
592 /* fall through */ 647 /* fall through */
593 case 6: 648 case 6:
594 if (val & 0xffffffff00000000ULL) { 649 if (val & 0xffffffff00000000ULL)
595 kvm_inject_gp(vcpu, 0); 650 return -1; /* #GP */
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 651 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break; 652 break;
600 case 5: 653 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 654 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
602 kvm_queue_exception(vcpu, UD_VECTOR); 655 return 1; /* #UD */
603 return 1;
604 }
605 /* fall through */ 656 /* fall through */
606 default: /* 7 */ 657 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) { 658 if (val & 0xffffffff00000000ULL)
608 kvm_inject_gp(vcpu, 0); 659 return -1; /* #GP */
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 660 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 661 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); 662 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
618 667
619 return 0; 668 return 0;
620} 669}
670
671int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
672{
673 int res;
674
675 res = __kvm_set_dr(vcpu, dr, val);
676 if (res > 0)
677 kvm_queue_exception(vcpu, UD_VECTOR);
678 else if (res < 0)
679 kvm_inject_gp(vcpu, 0);
680
681 return res;
682}
621EXPORT_SYMBOL_GPL(kvm_set_dr); 683EXPORT_SYMBOL_GPL(kvm_set_dr);
622 684
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 685static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{ 686{
625 switch (dr) { 687 switch (dr) {
626 case 0 ... 3: 688 case 0 ... 3:
627 *val = vcpu->arch.db[dr]; 689 *val = vcpu->arch.db[dr];
628 break; 690 break;
629 case 4: 691 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 692 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1; 693 return 1;
633 }
634 /* fall through */ 694 /* fall through */
635 case 6: 695 case 6:
636 *val = vcpu->arch.dr6; 696 *val = vcpu->arch.dr6;
637 break; 697 break;
638 case 5: 698 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 699 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1; 700 return 1;
642 }
643 /* fall through */ 701 /* fall through */
644 default: /* 7 */ 702 default: /* 7 */
645 *val = vcpu->arch.dr7; 703 *val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
648 706
649 return 0; 707 return 0;
650} 708}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652 709
653static inline u32 bit(int bitno) 710int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
654{ 711{
655 return 1 << (bitno & 31); 712 if (_kvm_get_dr(vcpu, dr, val)) {
713 kvm_queue_exception(vcpu, UD_VECTOR);
714 return 1;
715 }
716 return 0;
656} 717}
718EXPORT_SYMBOL_GPL(kvm_get_dr);
657 719
658/* 720/*
659 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 721 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,7 +733,7 @@ static u32 msrs_to_save[] = {
671 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 733 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
672 HV_X64_MSR_APIC_ASSIST_PAGE, 734 HV_X64_MSR_APIC_ASSIST_PAGE,
673 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 735 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
674 MSR_K6_STAR, 736 MSR_STAR,
675#ifdef CONFIG_X86_64 737#ifdef CONFIG_X86_64
676 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
677#endif 739#endif
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
682 744
683static u32 emulated_msrs[] = { 745static u32 emulated_msrs[] = {
684 MSR_IA32_MISC_ENABLE, 746 MSR_IA32_MISC_ENABLE,
747 MSR_IA32_MCG_STATUS,
748 MSR_IA32_MCG_CTL,
685}; 749};
686 750
687static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 751static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
688{ 752{
753 u64 old_efer = vcpu->arch.efer;
754
689 if (efer & efer_reserved_bits) 755 if (efer & efer_reserved_bits)
690 return 1; 756 return 1;
691 757
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
714 780
715 kvm_x86_ops->set_efer(vcpu, efer); 781 kvm_x86_ops->set_efer(vcpu, efer);
716 782
717 vcpu->arch.efer = efer;
718
719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
720 kvm_mmu_reset_context(vcpu); 784 kvm_mmu_reset_context(vcpu);
721 785
786 /* Update reserved bits */
787 if ((efer ^ old_efer) & EFER_NX)
788 kvm_mmu_reset_context(vcpu);
789
722 return 0; 790 return 0;
723} 791}
724 792
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
882 950
883 if (!vcpu->time_page) 951 if (!vcpu->time_page)
884 return 0; 952 return 0;
885 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
886 return 1; 954 return 1;
887} 955}
888 956
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1524{ 1592{
1525 int i, idx; 1593 int i, idx;
1526 1594
1527 vcpu_load(vcpu);
1528
1529 idx = srcu_read_lock(&vcpu->kvm->srcu); 1595 idx = srcu_read_lock(&vcpu->kvm->srcu);
1530 for (i = 0; i < msrs->nmsrs; ++i) 1596 for (i = 0; i < msrs->nmsrs; ++i)
1531 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1597 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1532 break; 1598 break;
1533 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1599 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1534 1600
1535 vcpu_put(vcpu);
1536
1537 return i; 1601 return i;
1538} 1602}
1539 1603
@@ -1562,7 +1626,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1562 1626
1563 r = -ENOMEM; 1627 r = -ENOMEM;
1564 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1628 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1565 entries = vmalloc(size); 1629 entries = kmalloc(size, GFP_KERNEL);
1566 if (!entries) 1630 if (!entries)
1567 goto out; 1631 goto out;
1568 1632
@@ -1581,7 +1645,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1581 r = n; 1645 r = n;
1582 1646
1583out_free: 1647out_free:
1584 vfree(entries); 1648 kfree(entries);
1585out: 1649out:
1586 return r; 1650 return r;
1587} 1651}
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1618 case KVM_CAP_PCI_SEGMENT: 1682 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS: 1683 case KVM_CAP_DEBUGREGS:
1620 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1684 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1685 case KVM_CAP_XSAVE:
1621 r = 1; 1686 r = 1;
1622 break; 1687 break;
1623 case KVM_CAP_COALESCED_MMIO: 1688 case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1641 case KVM_CAP_MCE: 1706 case KVM_CAP_MCE:
1642 r = KVM_MAX_MCE_BANKS; 1707 r = KVM_MAX_MCE_BANKS;
1643 break; 1708 break;
1709 case KVM_CAP_XCRS:
1710 r = cpu_has_xsave;
1711 break;
1644 default: 1712 default:
1645 r = 0; 1713 r = 0;
1646 break; 1714 break;
@@ -1717,8 +1785,28 @@ out:
1717 return r; 1785 return r;
1718} 1786}
1719 1787
1788static void wbinvd_ipi(void *garbage)
1789{
1790 wbinvd();
1791}
1792
1793static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
1794{
1795 return vcpu->kvm->arch.iommu_domain &&
1796 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
1797}
1798
1720void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1799void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1721{ 1800{
1801 /* Address WBINVD may be executed by guest */
1802 if (need_emulate_wbinvd(vcpu)) {
1803 if (kvm_x86_ops->has_wbinvd_exit())
1804 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
1805 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
1806 smp_call_function_single(vcpu->cpu,
1807 wbinvd_ipi, NULL, 1);
1808 }
1809
1722 kvm_x86_ops->vcpu_load(vcpu, cpu); 1810 kvm_x86_ops->vcpu_load(vcpu, cpu);
1723 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1724 unsigned long khz = cpufreq_quick_get(cpu); 1812 unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1731 1819
1732void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1733{ 1821{
1734 kvm_put_guest_fpu(vcpu);
1735 kvm_x86_ops->vcpu_put(vcpu); 1822 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu);
1736} 1824}
1737 1825
1738static int is_efer_nx(void) 1826static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1781 if (copy_from_user(cpuid_entries, entries, 1869 if (copy_from_user(cpuid_entries, entries,
1782 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1870 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1783 goto out_free; 1871 goto out_free;
1784 vcpu_load(vcpu);
1785 for (i = 0; i < cpuid->nent; i++) { 1872 for (i = 0; i < cpuid->nent; i++) {
1786 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1873 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1787 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1874 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1799 r = 0; 1886 r = 0;
1800 kvm_apic_set_version(vcpu); 1887 kvm_apic_set_version(vcpu);
1801 kvm_x86_ops->cpuid_update(vcpu); 1888 kvm_x86_ops->cpuid_update(vcpu);
1802 vcpu_put(vcpu); 1889 update_cpuid(vcpu);
1803 1890
1804out_free: 1891out_free:
1805 vfree(cpuid_entries); 1892 vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1820 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1907 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1821 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1908 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1822 goto out; 1909 goto out;
1823 vcpu_load(vcpu);
1824 vcpu->arch.cpuid_nent = cpuid->nent; 1910 vcpu->arch.cpuid_nent = cpuid->nent;
1825 kvm_apic_set_version(vcpu); 1911 kvm_apic_set_version(vcpu);
1826 kvm_x86_ops->cpuid_update(vcpu); 1912 kvm_x86_ops->cpuid_update(vcpu);
1827 vcpu_put(vcpu); 1913 update_cpuid(vcpu);
1828 return 0; 1914 return 0;
1829 1915
1830out: 1916out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1837{ 1923{
1838 int r; 1924 int r;
1839 1925
1840 vcpu_load(vcpu);
1841 r = -E2BIG; 1926 r = -E2BIG;
1842 if (cpuid->nent < vcpu->arch.cpuid_nent) 1927 if (cpuid->nent < vcpu->arch.cpuid_nent)
1843 goto out; 1928 goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1849 1934
1850out: 1935out:
1851 cpuid->nent = vcpu->arch.cpuid_nent; 1936 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1853 return r; 1937 return r;
1854} 1938}
1855 1939
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1901 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1985 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1902 /* cpuid 1.ecx */ 1986 /* cpuid 1.ecx */
1903 const u32 kvm_supported_word4_x86_features = 1987 const u32 kvm_supported_word4_x86_features =
1904 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1988 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
1905 0 /* DS-CPL, VMX, SMX, EST */ | 1989 0 /* DS-CPL, VMX, SMX, EST */ |
1906 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1990 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1907 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1908 0 /* Reserved, DCA */ | F(XMM4_1) | 1992 0 /* Reserved, DCA */ | F(XMM4_1) |
1909 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1910 0 /* Reserved, XSAVE, OSXSAVE */; 1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
1911 /* cpuid 0x80000001.ecx */ 1995 /* cpuid 0x80000001.ecx */
1912 const u32 kvm_supported_word6_x86_features = 1996 const u32 kvm_supported_word6_x86_features =
1913 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1922 2006
1923 switch (function) { 2007 switch (function) {
1924 case 0: 2008 case 0:
1925 entry->eax = min(entry->eax, (u32)0xb); 2009 entry->eax = min(entry->eax, (u32)0xd);
1926 break; 2010 break;
1927 case 1: 2011 case 1:
1928 entry->edx &= kvm_supported_word0_x86_features; 2012 entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1980 } 2064 }
1981 break; 2065 break;
1982 } 2066 }
2067 case 0xd: {
2068 int i;
2069
2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2071 for (i = 1; *nent < maxnent; ++i) {
2072 if (entry[i - 1].eax == 0 && i != 2)
2073 break;
2074 do_cpuid_1_ent(&entry[i], function, i);
2075 entry[i].flags |=
2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2077 ++*nent;
2078 }
2079 break;
2080 }
1983 case KVM_CPUID_SIGNATURE: { 2081 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0"; 2082 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature; 2083 u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2179,7 @@ out:
2081static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2179static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2082 struct kvm_lapic_state *s) 2180 struct kvm_lapic_state *s)
2083{ 2181{
2084 vcpu_load(vcpu);
2085 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2182 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2086 vcpu_put(vcpu);
2087 2183
2088 return 0; 2184 return 0;
2089} 2185}
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2091static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2187static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2092 struct kvm_lapic_state *s) 2188 struct kvm_lapic_state *s)
2093{ 2189{
2094 vcpu_load(vcpu);
2095 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2190 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2096 kvm_apic_post_state_restore(vcpu); 2191 kvm_apic_post_state_restore(vcpu);
2097 update_cr8_intercept(vcpu); 2192 update_cr8_intercept(vcpu);
2098 vcpu_put(vcpu);
2099 2193
2100 return 0; 2194 return 0;
2101} 2195}
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2107 return -EINVAL; 2201 return -EINVAL;
2108 if (irqchip_in_kernel(vcpu->kvm)) 2202 if (irqchip_in_kernel(vcpu->kvm))
2109 return -ENXIO; 2203 return -ENXIO;
2110 vcpu_load(vcpu);
2111 2204
2112 kvm_queue_interrupt(vcpu, irq->irq, false); 2205 kvm_queue_interrupt(vcpu, irq->irq, false);
2113 2206
2114 vcpu_put(vcpu);
2115
2116 return 0; 2207 return 0;
2117} 2208}
2118 2209
2119static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2210static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2120{ 2211{
2121 vcpu_load(vcpu);
2122 kvm_inject_nmi(vcpu); 2212 kvm_inject_nmi(vcpu);
2123 vcpu_put(vcpu);
2124 2213
2125 return 0; 2214 return 0;
2126} 2215}
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2140 int r; 2229 int r;
2141 unsigned bank_num = mcg_cap & 0xff, bank; 2230 unsigned bank_num = mcg_cap & 0xff, bank;
2142 2231
2143 vcpu_load(vcpu);
2144 r = -EINVAL; 2232 r = -EINVAL;
2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2233 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2146 goto out; 2234 goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2155 for (bank = 0; bank < bank_num; bank++) 2243 for (bank = 0; bank < bank_num; bank++)
2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2244 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2157out: 2245out:
2158 vcpu_put(vcpu);
2159 return r; 2246 return r;
2160} 2247}
2161 2248
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2188 printk(KERN_DEBUG "kvm: set_mce: " 2275 printk(KERN_DEBUG "kvm: set_mce: "
2189 "injects mce exception while " 2276 "injects mce exception while "
2190 "previous one is in progress!\n"); 2277 "previous one is in progress!\n");
2191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2192 return 0; 2279 return 0;
2193 } 2280 }
2194 if (banks[1] & MCI_STATUS_VAL) 2281 if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2213static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2300static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2214 struct kvm_vcpu_events *events) 2301 struct kvm_vcpu_events *events)
2215{ 2302{
2216 vcpu_load(vcpu);
2217
2218 events->exception.injected = 2303 events->exception.injected =
2219 vcpu->arch.exception.pending && 2304 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2305 !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW); 2326 | KVM_VCPUEVENT_VALID_SHADOW);
2242
2243 vcpu_put(vcpu);
2244} 2327}
2245 2328
2246static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2329static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2251 | KVM_VCPUEVENT_VALID_SHADOW)) 2334 | KVM_VCPUEVENT_VALID_SHADOW))
2252 return -EINVAL; 2335 return -EINVAL;
2253 2336
2254 vcpu_load(vcpu);
2255
2256 vcpu->arch.exception.pending = events->exception.injected; 2337 vcpu->arch.exception.pending = events->exception.injected;
2257 vcpu->arch.exception.nr = events->exception.nr; 2338 vcpu->arch.exception.nr = events->exception.nr;
2258 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2339 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2275 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2276 vcpu->arch.sipi_vector = events->sipi_vector; 2357 vcpu->arch.sipi_vector = events->sipi_vector;
2277 2358
2278 vcpu_put(vcpu);
2279
2280 return 0; 2359 return 0;
2281} 2360}
2282 2361
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2362static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs) 2363 struct kvm_debugregs *dbgregs)
2285{ 2364{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2365 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6; 2366 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7; 2367 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0; 2368 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294} 2369}
2295 2370
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2371static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2299 if (dbgregs->flags) 2374 if (dbgregs->flags)
2300 return -EINVAL; 2375 return -EINVAL;
2301 2376
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2377 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6; 2378 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7; 2379 vcpu->arch.dr7 = dbgregs->dr7;
2307 2380
2308 vcpu_put(vcpu); 2381 return 0;
2382}
2383
2384static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2385 struct kvm_xsave *guest_xsave)
2386{
2387 if (cpu_has_xsave)
2388 memcpy(guest_xsave->region,
2389 &vcpu->arch.guest_fpu.state->xsave,
2390 sizeof(struct xsave_struct));
2391 else {
2392 memcpy(guest_xsave->region,
2393 &vcpu->arch.guest_fpu.state->fxsave,
2394 sizeof(struct i387_fxsave_struct));
2395 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2396 XSTATE_FPSSE;
2397 }
2398}
2399
2400static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2401 struct kvm_xsave *guest_xsave)
2402{
2403 u64 xstate_bv =
2404 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2309 2405
2406 if (cpu_has_xsave)
2407 memcpy(&vcpu->arch.guest_fpu.state->xsave,
2408 guest_xsave->region, sizeof(struct xsave_struct));
2409 else {
2410 if (xstate_bv & ~XSTATE_FPSSE)
2411 return -EINVAL;
2412 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2413 guest_xsave->region, sizeof(struct i387_fxsave_struct));
2414 }
2310 return 0; 2415 return 0;
2311} 2416}
2312 2417
2418static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2419 struct kvm_xcrs *guest_xcrs)
2420{
2421 if (!cpu_has_xsave) {
2422 guest_xcrs->nr_xcrs = 0;
2423 return;
2424 }
2425
2426 guest_xcrs->nr_xcrs = 1;
2427 guest_xcrs->flags = 0;
2428 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2429 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2430}
2431
2432static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2433 struct kvm_xcrs *guest_xcrs)
2434{
2435 int i, r = 0;
2436
2437 if (!cpu_has_xsave)
2438 return -EINVAL;
2439
2440 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2441 return -EINVAL;
2442
2443 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2444 /* Only support XCR0 currently */
2445 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2446 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2447 guest_xcrs->xcrs[0].value);
2448 break;
2449 }
2450 if (r)
2451 r = -EINVAL;
2452 return r;
2453}
2454
2313long kvm_arch_vcpu_ioctl(struct file *filp, 2455long kvm_arch_vcpu_ioctl(struct file *filp,
2314 unsigned int ioctl, unsigned long arg) 2456 unsigned int ioctl, unsigned long arg)
2315{ 2457{
2316 struct kvm_vcpu *vcpu = filp->private_data; 2458 struct kvm_vcpu *vcpu = filp->private_data;
2317 void __user *argp = (void __user *)arg; 2459 void __user *argp = (void __user *)arg;
2318 int r; 2460 int r;
2319 struct kvm_lapic_state *lapic = NULL; 2461 union {
2462 struct kvm_lapic_state *lapic;
2463 struct kvm_xsave *xsave;
2464 struct kvm_xcrs *xcrs;
2465 void *buffer;
2466 } u;
2320 2467
2468 u.buffer = NULL;
2321 switch (ioctl) { 2469 switch (ioctl) {
2322 case KVM_GET_LAPIC: { 2470 case KVM_GET_LAPIC: {
2323 r = -EINVAL; 2471 r = -EINVAL;
2324 if (!vcpu->arch.apic) 2472 if (!vcpu->arch.apic)
2325 goto out; 2473 goto out;
2326 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2474 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2327 2475
2328 r = -ENOMEM; 2476 r = -ENOMEM;
2329 if (!lapic) 2477 if (!u.lapic)
2330 goto out; 2478 goto out;
2331 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 2479 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
2332 if (r) 2480 if (r)
2333 goto out; 2481 goto out;
2334 r = -EFAULT; 2482 r = -EFAULT;
2335 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 2483 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
2336 goto out; 2484 goto out;
2337 r = 0; 2485 r = 0;
2338 break; 2486 break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2341 r = -EINVAL; 2489 r = -EINVAL;
2342 if (!vcpu->arch.apic) 2490 if (!vcpu->arch.apic)
2343 goto out; 2491 goto out;
2344 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2492 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2345 r = -ENOMEM; 2493 r = -ENOMEM;
2346 if (!lapic) 2494 if (!u.lapic)
2347 goto out; 2495 goto out;
2348 r = -EFAULT; 2496 r = -EFAULT;
2349 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 2497 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
2350 goto out; 2498 goto out;
2351 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 2499 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2352 if (r) 2500 if (r)
2353 goto out; 2501 goto out;
2354 r = 0; 2502 r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2464 r = -EFAULT; 2612 r = -EFAULT;
2465 if (copy_from_user(&mce, argp, sizeof mce)) 2613 if (copy_from_user(&mce, argp, sizeof mce))
2466 goto out; 2614 goto out;
2467 vcpu_load(vcpu);
2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2615 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2470 break; 2616 break;
2471 } 2617 }
2472 case KVM_GET_VCPU_EVENTS: { 2618 case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2659 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break; 2660 break;
2515 } 2661 }
2662 case KVM_GET_XSAVE: {
2663 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2664 r = -ENOMEM;
2665 if (!u.xsave)
2666 break;
2667
2668 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
2669
2670 r = -EFAULT;
2671 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
2672 break;
2673 r = 0;
2674 break;
2675 }
2676 case KVM_SET_XSAVE: {
2677 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2678 r = -ENOMEM;
2679 if (!u.xsave)
2680 break;
2681
2682 r = -EFAULT;
2683 if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
2684 break;
2685
2686 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2687 break;
2688 }
2689 case KVM_GET_XCRS: {
2690 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2691 r = -ENOMEM;
2692 if (!u.xcrs)
2693 break;
2694
2695 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
2696
2697 r = -EFAULT;
2698 if (copy_to_user(argp, u.xcrs,
2699 sizeof(struct kvm_xcrs)))
2700 break;
2701 r = 0;
2702 break;
2703 }
2704 case KVM_SET_XCRS: {
2705 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2706 r = -ENOMEM;
2707 if (!u.xcrs)
2708 break;
2709
2710 r = -EFAULT;
2711 if (copy_from_user(u.xcrs, argp,
2712 sizeof(struct kvm_xcrs)))
2713 break;
2714
2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2716 break;
2717 }
2516 default: 2718 default:
2517 r = -EINVAL; 2719 r = -EINVAL;
2518 } 2720 }
2519out: 2721out:
2520 kfree(lapic); 2722 kfree(u.buffer);
2521 return r; 2723 return r;
2522} 2724}
2523 2725
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2560 return kvm->arch.n_alloc_mmu_pages; 2762 return kvm->arch.n_alloc_mmu_pages;
2561} 2763}
2562 2764
2563gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2564{
2565 int i;
2566 struct kvm_mem_alias *alias;
2567 struct kvm_mem_aliases *aliases;
2568
2569 aliases = kvm_aliases(kvm);
2570
2571 for (i = 0; i < aliases->naliases; ++i) {
2572 alias = &aliases->aliases[i];
2573 if (alias->flags & KVM_ALIAS_INVALID)
2574 continue;
2575 if (gfn >= alias->base_gfn
2576 && gfn < alias->base_gfn + alias->npages)
2577 return alias->target_gfn + gfn - alias->base_gfn;
2578 }
2579 return gfn;
2580}
2581
2582gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2583{
2584 int i;
2585 struct kvm_mem_alias *alias;
2586 struct kvm_mem_aliases *aliases;
2587
2588 aliases = kvm_aliases(kvm);
2589
2590 for (i = 0; i < aliases->naliases; ++i) {
2591 alias = &aliases->aliases[i];
2592 if (gfn >= alias->base_gfn
2593 && gfn < alias->base_gfn + alias->npages)
2594 return alias->target_gfn + gfn - alias->base_gfn;
2595 }
2596 return gfn;
2597}
2598
2599/*
2600 * Set a new alias region. Aliases map a portion of physical memory into
2601 * another portion. This is useful for memory windows, for example the PC
2602 * VGA region.
2603 */
2604static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2605 struct kvm_memory_alias *alias)
2606{
2607 int r, n;
2608 struct kvm_mem_alias *p;
2609 struct kvm_mem_aliases *aliases, *old_aliases;
2610
2611 r = -EINVAL;
2612 /* General sanity checks */
2613 if (alias->memory_size & (PAGE_SIZE - 1))
2614 goto out;
2615 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2616 goto out;
2617 if (alias->slot >= KVM_ALIAS_SLOTS)
2618 goto out;
2619 if (alias->guest_phys_addr + alias->memory_size
2620 < alias->guest_phys_addr)
2621 goto out;
2622 if (alias->target_phys_addr + alias->memory_size
2623 < alias->target_phys_addr)
2624 goto out;
2625
2626 r = -ENOMEM;
2627 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2628 if (!aliases)
2629 goto out;
2630
2631 mutex_lock(&kvm->slots_lock);
2632
2633 /* invalidate any gfn reference in case of deletion/shrinking */
2634 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2635 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2636 old_aliases = kvm->arch.aliases;
2637 rcu_assign_pointer(kvm->arch.aliases, aliases);
2638 synchronize_srcu_expedited(&kvm->srcu);
2639 kvm_mmu_zap_all(kvm);
2640 kfree(old_aliases);
2641
2642 r = -ENOMEM;
2643 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2644 if (!aliases)
2645 goto out_unlock;
2646
2647 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2648
2649 p = &aliases->aliases[alias->slot];
2650 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2651 p->npages = alias->memory_size >> PAGE_SHIFT;
2652 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2653 p->flags &= ~(KVM_ALIAS_INVALID);
2654
2655 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2656 if (aliases->aliases[n - 1].npages)
2657 break;
2658 aliases->naliases = n;
2659
2660 old_aliases = kvm->arch.aliases;
2661 rcu_assign_pointer(kvm->arch.aliases, aliases);
2662 synchronize_srcu_expedited(&kvm->srcu);
2663 kfree(old_aliases);
2664 r = 0;
2665
2666out_unlock:
2667 mutex_unlock(&kvm->slots_lock);
2668out:
2669 return r;
2670}
2671
2672static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2673{ 2766{
2674 int r; 2767 int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2797 struct kvm_memory_slot *memslot; 2890 struct kvm_memory_slot *memslot;
2798 unsigned long n; 2891 unsigned long n;
2799 unsigned long is_dirty = 0; 2892 unsigned long is_dirty = 0;
2800 unsigned long *dirty_bitmap = NULL;
2801 2893
2802 mutex_lock(&kvm->slots_lock); 2894 mutex_lock(&kvm->slots_lock);
2803 2895
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2812 2904
2813 n = kvm_dirty_bitmap_bytes(memslot); 2905 n = kvm_dirty_bitmap_bytes(memslot);
2814 2906
2815 r = -ENOMEM;
2816 dirty_bitmap = vmalloc(n);
2817 if (!dirty_bitmap)
2818 goto out;
2819 memset(dirty_bitmap, 0, n);
2820
2821 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 2907 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2822 is_dirty = memslot->dirty_bitmap[i]; 2908 is_dirty = memslot->dirty_bitmap[i];
2823 2909
2824 /* If nothing is dirty, don't bother messing with page tables. */ 2910 /* If nothing is dirty, don't bother messing with page tables. */
2825 if (is_dirty) { 2911 if (is_dirty) {
2826 struct kvm_memslots *slots, *old_slots; 2912 struct kvm_memslots *slots, *old_slots;
2913 unsigned long *dirty_bitmap;
2827 2914
2828 spin_lock(&kvm->mmu_lock); 2915 spin_lock(&kvm->mmu_lock);
2829 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2916 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2830 spin_unlock(&kvm->mmu_lock); 2917 spin_unlock(&kvm->mmu_lock);
2831 2918
2832 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2919 r = -ENOMEM;
2833 if (!slots) 2920 dirty_bitmap = vmalloc(n);
2834 goto out_free; 2921 if (!dirty_bitmap)
2922 goto out;
2923 memset(dirty_bitmap, 0, n);
2835 2924
2925 r = -ENOMEM;
2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2927 if (!slots) {
2928 vfree(dirty_bitmap);
2929 goto out;
2930 }
2836 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2837 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2838 2933
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2841 synchronize_srcu_expedited(&kvm->srcu); 2936 synchronize_srcu_expedited(&kvm->srcu);
2842 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2843 kfree(old_slots); 2938 kfree(old_slots);
2939
2940 r = -EFAULT;
2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
2942 vfree(dirty_bitmap);
2943 goto out;
2944 }
2945 vfree(dirty_bitmap);
2946 } else {
2947 r = -EFAULT;
2948 if (clear_user(log->dirty_bitmap, n))
2949 goto out;
2844 } 2950 }
2845 2951
2846 r = 0; 2952 r = 0;
2847 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2848 r = -EFAULT;
2849out_free:
2850 vfree(dirty_bitmap);
2851out: 2953out:
2852 mutex_unlock(&kvm->slots_lock); 2954 mutex_unlock(&kvm->slots_lock);
2853 return r; 2955 return r;
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2867 union { 2969 union {
2868 struct kvm_pit_state ps; 2970 struct kvm_pit_state ps;
2869 struct kvm_pit_state2 ps2; 2971 struct kvm_pit_state2 ps2;
2870 struct kvm_memory_alias alias;
2871 struct kvm_pit_config pit_config; 2972 struct kvm_pit_config pit_config;
2872 } u; 2973 } u;
2873 2974
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2888 goto out; 2989 goto out;
2889 break; 2990 break;
2890 } 2991 }
2891 case KVM_SET_MEMORY_REGION: {
2892 struct kvm_memory_region kvm_mem;
2893 struct kvm_userspace_memory_region kvm_userspace_mem;
2894
2895 r = -EFAULT;
2896 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2897 goto out;
2898 kvm_userspace_mem.slot = kvm_mem.slot;
2899 kvm_userspace_mem.flags = kvm_mem.flags;
2900 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2901 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2902 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2903 if (r)
2904 goto out;
2905 break;
2906 }
2907 case KVM_SET_NR_MMU_PAGES: 2992 case KVM_SET_NR_MMU_PAGES:
2908 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2993 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2909 if (r) 2994 if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2912 case KVM_GET_NR_MMU_PAGES: 2997 case KVM_GET_NR_MMU_PAGES:
2913 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2998 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2914 break; 2999 break;
2915 case KVM_SET_MEMORY_ALIAS:
2916 r = -EFAULT;
2917 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2918 goto out;
2919 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2920 if (r)
2921 goto out;
2922 break;
2923 case KVM_CREATE_IRQCHIP: { 3000 case KVM_CREATE_IRQCHIP: {
2924 struct kvm_pic *vpic; 3001 struct kvm_pic *vpic;
2925 3002
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3259 } 3336 }
3260 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3261 if (ret < 0) { 3338 if (ret < 0) {
3262 r = X86EMUL_UNHANDLEABLE; 3339 r = X86EMUL_IO_NEEDED;
3263 goto out; 3340 goto out;
3264 } 3341 }
3265 3342
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3315 } 3392 }
3316 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3317 if (ret < 0) { 3394 if (ret < 0) {
3318 r = X86EMUL_UNHANDLEABLE; 3395 r = X86EMUL_IO_NEEDED;
3319 goto out; 3396 goto out;
3320 } 3397 }
3321 3398
@@ -3330,10 +3407,10 @@ out:
3330static int emulator_read_emulated(unsigned long addr, 3407static int emulator_read_emulated(unsigned long addr,
3331 void *val, 3408 void *val,
3332 unsigned int bytes, 3409 unsigned int bytes,
3410 unsigned int *error_code,
3333 struct kvm_vcpu *vcpu) 3411 struct kvm_vcpu *vcpu)
3334{ 3412{
3335 gpa_t gpa; 3413 gpa_t gpa;
3336 u32 error_code;
3337 3414
3338 if (vcpu->mmio_read_completed) { 3415 if (vcpu->mmio_read_completed) {
3339 memcpy(val, vcpu->mmio_data, bytes); 3416 memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
3343 return X86EMUL_CONTINUE; 3420 return X86EMUL_CONTINUE;
3344 } 3421 }
3345 3422
3346 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); 3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
3347 3424
3348 if (gpa == UNMAPPED_GVA) { 3425 if (gpa == UNMAPPED_GVA)
3349 kvm_inject_page_fault(vcpu, addr, error_code);
3350 return X86EMUL_PROPAGATE_FAULT; 3426 return X86EMUL_PROPAGATE_FAULT;
3351 }
3352 3427
3353 /* For APIC access vmexit */ 3428 /* For APIC access vmexit */
3354 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
3370 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3371 3446
3372 vcpu->mmio_needed = 1; 3447 vcpu->mmio_needed = 1;
3373 vcpu->mmio_phys_addr = gpa; 3448 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3374 vcpu->mmio_size = bytes; 3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3375 vcpu->mmio_is_write = 0; 3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3376 3452
3377 return X86EMUL_UNHANDLEABLE; 3453 return X86EMUL_IO_NEEDED;
3378} 3454}
3379 3455
3380int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3456int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3392static int emulator_write_emulated_onepage(unsigned long addr, 3468static int emulator_write_emulated_onepage(unsigned long addr,
3393 const void *val, 3469 const void *val,
3394 unsigned int bytes, 3470 unsigned int bytes,
3471 unsigned int *error_code,
3395 struct kvm_vcpu *vcpu) 3472 struct kvm_vcpu *vcpu)
3396{ 3473{
3397 gpa_t gpa; 3474 gpa_t gpa;
3398 u32 error_code;
3399 3475
3400 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); 3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
3401 3477
3402 if (gpa == UNMAPPED_GVA) { 3478 if (gpa == UNMAPPED_GVA)
3403 kvm_inject_page_fault(vcpu, addr, error_code);
3404 return X86EMUL_PROPAGATE_FAULT; 3479 return X86EMUL_PROPAGATE_FAULT;
3405 }
3406 3480
3407 /* For APIC access vmexit */ 3481 /* For APIC access vmexit */
3408 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3482 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
3420 return X86EMUL_CONTINUE; 3494 return X86EMUL_CONTINUE;
3421 3495
3422 vcpu->mmio_needed = 1; 3496 vcpu->mmio_needed = 1;
3423 vcpu->mmio_phys_addr = gpa; 3497 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3424 vcpu->mmio_size = bytes; 3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3425 vcpu->mmio_is_write = 1; 3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3426 memcpy(vcpu->mmio_data, val, bytes); 3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3501 memcpy(vcpu->run->mmio.data, val, bytes);
3427 3502
3428 return X86EMUL_CONTINUE; 3503 return X86EMUL_CONTINUE;
3429} 3504}
@@ -3431,6 +3506,7 @@ mmio:
3431int emulator_write_emulated(unsigned long addr, 3506int emulator_write_emulated(unsigned long addr,
3432 const void *val, 3507 const void *val,
3433 unsigned int bytes, 3508 unsigned int bytes,
3509 unsigned int *error_code,
3434 struct kvm_vcpu *vcpu) 3510 struct kvm_vcpu *vcpu)
3435{ 3511{
3436 /* Crossing a page boundary? */ 3512 /* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
3438 int rc, now; 3514 int rc, now;
3439 3515
3440 now = -addr & ~PAGE_MASK; 3516 now = -addr & ~PAGE_MASK;
3441 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code,
3518 vcpu);
3442 if (rc != X86EMUL_CONTINUE) 3519 if (rc != X86EMUL_CONTINUE)
3443 return rc; 3520 return rc;
3444 addr += now; 3521 addr += now;
3445 val += now; 3522 val += now;
3446 bytes -= now; 3523 bytes -= now;
3447 } 3524 }
3448 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code,
3526 vcpu);
3449} 3527}
3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3451 3528
3452#define CMPXCHG_TYPE(t, ptr, old, new) \ 3529#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 3530 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3463 const void *old, 3540 const void *old,
3464 const void *new, 3541 const void *new,
3465 unsigned int bytes, 3542 unsigned int bytes,
3543 unsigned int *error_code,
3466 struct kvm_vcpu *vcpu) 3544 struct kvm_vcpu *vcpu)
3467{ 3545{
3468 gpa_t gpa; 3546 gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3484 goto emul_write; 3562 goto emul_write;
3485 3563
3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3564 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3565 if (is_error_page(page)) {
3566 kvm_release_page_clean(page);
3567 goto emul_write;
3568 }
3487 3569
3488 kaddr = kmap_atomic(page, KM_USER0); 3570 kaddr = kmap_atomic(page, KM_USER0);
3489 kaddr += offset_in_page(gpa); 3571 kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3516emul_write: 3598emul_write:
3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3518 3600
3519 return emulator_write_emulated(addr, new, bytes, vcpu); 3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
3520} 3602}
3521 3603
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3604static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
3604 return X86EMUL_CONTINUE; 3686 return X86EMUL_CONTINUE;
3605} 3687}
3606 3688
3607int emulate_clts(struct kvm_vcpu *vcpu) 3689int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3608{ 3690{
3609 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 3691 if (!need_emulate_wbinvd(vcpu))
3610 kvm_x86_ops->fpu_activate(vcpu); 3692 return X86EMUL_CONTINUE;
3693
3694 if (kvm_x86_ops->has_wbinvd_exit()) {
3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3696 wbinvd_ipi, NULL, 1);
3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3698 }
3699 wbinvd();
3611 return X86EMUL_CONTINUE; 3700 return X86EMUL_CONTINUE;
3612} 3701}
3702EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
3613 3703
3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3704int emulate_clts(struct kvm_vcpu *vcpu)
3615{ 3705{
3616 return kvm_get_dr(ctxt->vcpu, dr, dest); 3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3707 kvm_x86_ops->fpu_activate(vcpu);
3708 return X86EMUL_CONTINUE;
3617} 3709}
3618 3710
3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3711int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
3620{ 3712{
3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3713 return _kvm_get_dr(vcpu, dr, dest);
3622
3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3624} 3714}
3625 3715
3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3716int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
3627{ 3717{
3628 u8 opcodes[4];
3629 unsigned long rip = kvm_rip_read(vcpu);
3630 unsigned long rip_linear;
3631
3632 if (!printk_ratelimit())
3633 return;
3634 3718
3635 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3719 return __kvm_set_dr(vcpu, dr, value);
3636
3637 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3638
3639 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3640 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
3641} 3720}
3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3643 3721
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3722static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{ 3723{
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3674 return value; 3752 return value;
3675} 3753}
3676 3754
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3755static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{ 3756{
3757 int res = 0;
3758
3679 switch (cr) { 3759 switch (cr) {
3680 case 0: 3760 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3761 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break; 3762 break;
3683 case 2: 3763 case 2:
3684 vcpu->arch.cr2 = val; 3764 vcpu->arch.cr2 = val;
3685 break; 3765 break;
3686 case 3: 3766 case 3:
3687 kvm_set_cr3(vcpu, val); 3767 res = kvm_set_cr3(vcpu, val);
3688 break; 3768 break;
3689 case 4: 3769 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break; 3771 break;
3692 case 8: 3772 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL); 3773 res = __kvm_set_cr8(vcpu, val & 0xfUL);
3694 break; 3774 break;
3695 default: 3775 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3777 res = -1;
3697 } 3778 }
3779
3780 return res;
3698} 3781}
3699 3782
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu) 3783static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3707 kvm_x86_ops->get_gdt(vcpu, dt); 3790 kvm_x86_ops->get_gdt(vcpu, dt);
3708} 3791}
3709 3792
3793static unsigned long emulator_get_cached_segment_base(int seg,
3794 struct kvm_vcpu *vcpu)
3795{
3796 return get_segment_base(vcpu, seg);
3797}
3798
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu) 3800 struct kvm_vcpu *vcpu)
3712{ 3801{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
3779 kvm_set_segment(vcpu, &kvm_seg, seg); 3868 kvm_set_segment(vcpu, &kvm_seg, seg);
3780} 3869}
3781 3870
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3787static struct x86_emulate_ops emulate_ops = { 3871static struct x86_emulate_ops emulate_ops = {
3788 .read_std = kvm_read_guest_virt_system, 3872 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system, 3873 .write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
3797 .set_cached_descriptor = emulator_set_cached_descriptor, 3881 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector, 3882 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector, 3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base,
3800 .get_gdt = emulator_get_gdt, 3885 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr, 3886 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr, 3887 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl, 3888 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags, 3889 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr,
3892 .get_msr = kvm_get_msr,
3805}; 3893};
3806 3894
3807static void cache_all_regs(struct kvm_vcpu *vcpu) 3895static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
3812 vcpu->arch.regs_dirty = ~0; 3900 vcpu->arch.regs_dirty = ~0;
3813} 3901}
3814 3902
3903static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3904{
3905 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
3906 /*
3907 * an sti; sti; sequence only disables interrupts for the first
3908 * instruction. So, if the last instruction, be it emulated or
3909 * not, left the system with the INT_STI flag enabled, it
3910 * means that the last instruction is an sti. We should not
3911 * leave the flag on in this case. The same goes for mov ss
3912 */
3913 if (!(int_shadow & mask))
3914 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
3915}
3916
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
3922 else if (ctxt->error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3924 else
3925 kvm_queue_exception(vcpu, ctxt->exception);
3926}
3927
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{
3930 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3934 vcpu->run->internal.ndata = 0;
3935 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL;
3937}
3938
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3940{
3941 gpa_t gpa;
3942
3943 if (tdp_enabled)
3944 return false;
3945
3946 /*
3947 * if emulation was due to access to shadowed page table
3948 * and it failed try to unshadow page and re-enter the
3949 * guest to let CPU execute the instruction.
3950 */
3951 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
3952 return true;
3953
3954 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
3955
3956 if (gpa == UNMAPPED_GVA)
3957 return true; /* let cpu generate fault */
3958
3959 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
3960 return true;
3961
3962 return false;
3963}
3964
3815int emulate_instruction(struct kvm_vcpu *vcpu, 3965int emulate_instruction(struct kvm_vcpu *vcpu,
3816 unsigned long cr2, 3966 unsigned long cr2,
3817 u16 error_code, 3967 u16 error_code,
3818 int emulation_type) 3968 int emulation_type)
3819{ 3969{
3820 int r, shadow_mask; 3970 int r;
3821 struct decode_cache *c; 3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
3822 struct kvm_run *run = vcpu->run;
3823 3972
3824 kvm_clear_exception_queue(vcpu); 3973 kvm_clear_exception_queue(vcpu);
3825 vcpu->arch.mmio_fault_cr2 = cr2; 3974 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3831 */ 3980 */
3832 cache_all_regs(vcpu); 3981 cache_all_regs(vcpu);
3833 3982
3834 vcpu->mmio_is_write = 0;
3835
3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3837 int cs_db, cs_l; 3984 int cs_db, cs_l;
3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3846 ? X86EMUL_MODE_VM86 : cs_l 3993 ? X86EMUL_MODE_VM86 : cs_l
3847 ? X86EMUL_MODE_PROT64 : cs_db 3994 ? X86EMUL_MODE_PROT64 : cs_db
3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1;
3849 4000
3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu); 4002 trace_kvm_emulate_insn_start(vcpu);
3852 4003
3853 /* Only allow emulation of specific instructions on #UD 4004 /* Only allow emulation of specific instructions on #UD
3854 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/
3855 c = &vcpu->arch.emulate_ctxt.decode;
3856 if (emulation_type & EMULTYPE_TRAP_UD) { 4006 if (emulation_type & EMULTYPE_TRAP_UD) {
3857 if (!c->twobyte) 4007 if (!c->twobyte)
3858 return EMULATE_FAIL; 4008 return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3880 4030
3881 ++vcpu->stat.insn_emulation; 4031 ++vcpu->stat.insn_emulation;
3882 if (r) { 4032 if (r) {
3883 ++vcpu->stat.insn_emulation_fail; 4033 if (reexecute_instruction(vcpu, cr2))
3884 trace_kvm_emulate_insn_failed(vcpu);
3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3886 return EMULATE_DONE; 4034 return EMULATE_DONE;
3887 return EMULATE_FAIL; 4035 if (emulation_type & EMULTYPE_SKIP)
4036 return EMULATE_FAIL;
4037 return handle_emulation_failure(vcpu);
3888 } 4038 }
3889 } 4039 }
3890 4040
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3893 return EMULATE_DONE; 4043 return EMULATE_DONE;
3894 } 4044 }
3895 4045
4046 /* this is needed for vmware backdoor interface to work since it
4047 changes register values during IO operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4049
3896restart: 4050restart:
3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3899 4052
3900 if (r == 0) 4053 if (r) { /* emulation failed */
3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 4054 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE;
3902 4056
3903 if (vcpu->arch.pio.count) { 4057 return handle_emulation_failure(vcpu);
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3906 return EMULATE_DO_MMIO;
3907 } 4058 }
3908 4059
3909 if (r || vcpu->mmio_is_write) { 4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
3910 run->exit_reason = KVM_EXIT_MMIO; 4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3911 run->mmio.phys_addr = vcpu->mmio_phys_addr; 4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
3912 memcpy(run->mmio.data, vcpu->mmio_data, 8); 4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
3913 run->mmio.len = vcpu->mmio_size; 4064
3914 run->mmio.is_write = vcpu->mmio_is_write; 4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE;
3915 } 4068 }
3916 4069
3917 if (r) { 4070 if (vcpu->arch.pio.count) {
3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 4071 if (!vcpu->arch.pio.in)
3919 goto done; 4072 vcpu->arch.pio.count = 0;
3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3923 kvm_report_emulation_failure(vcpu, "mmio");
3924 return EMULATE_FAIL;
3925 }
3926 return EMULATE_DO_MMIO; 4073 return EMULATE_DO_MMIO;
3927 } 4074 }
3928 4075
3929 if (vcpu->mmio_is_write) { 4076 if (vcpu->mmio_needed) {
3930 vcpu->mmio_needed = 0; 4077 if (vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0;
3931 return EMULATE_DO_MMIO; 4079 return EMULATE_DO_MMIO;
3932 } 4080 }
3933 4081
3934done:
3935 if (vcpu->arch.exception.pending)
3936 vcpu->arch.emulate_ctxt.restart = false;
3937
3938 if (vcpu->arch.emulate_ctxt.restart) 4082 if (vcpu->arch.emulate_ctxt.restart)
3939 goto restart; 4083 goto restart;
3940 4084
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
4108 4252
4109 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4253 perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110 4254
4255 if (cpu_has_xsave)
4256 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4257
4111 return 0; 4258 return 0;
4112 4259
4113out: 4260out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4270 4417
4271 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4418 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4272 4419
4273 return emulator_write_emulated(rip, instruction, 3, vcpu); 4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
4274} 4421}
4275 4422
4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4506 } 4653 }
4507} 4654}
4508 4655
4656static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
4657{
4658 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
4659 !vcpu->guest_xcr0_loaded) {
4660 /* kvm_set_xcr() also depends on this */
4661 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
4662 vcpu->guest_xcr0_loaded = 1;
4663 }
4664}
4665
4666static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4667{
4668 if (vcpu->guest_xcr0_loaded) {
4669 if (vcpu->arch.xcr0 != host_xcr0)
4670 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
4671 vcpu->guest_xcr0_loaded = 0;
4672 }
4673}
4674
4509static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4510{ 4676{
4511 int r; 4677 int r;
4512 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4513 vcpu->run->request_interrupt_window; 4679 vcpu->run->request_interrupt_window;
4514 4680
4515 if (vcpu->requests)
4516 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
4517 kvm_mmu_unload(vcpu);
4518
4519 r = kvm_mmu_reload(vcpu);
4520 if (unlikely(r))
4521 goto out;
4522
4523 if (vcpu->requests) { 4681 if (vcpu->requests) {
4524 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 4682 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
4683 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4525 __kvm_migrate_timers(vcpu); 4685 __kvm_migrate_timers(vcpu);
4526 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
4527 kvm_write_guest_time(vcpu); 4687 kvm_write_guest_time(vcpu);
4528 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4529 kvm_mmu_sync_roots(vcpu); 4689 kvm_mmu_sync_roots(vcpu);
4530 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
4531 kvm_x86_ops->tlb_flush(vcpu); 4691 kvm_x86_ops->tlb_flush(vcpu);
4532 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4692 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
4533 &vcpu->requests)) {
4534 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4693 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
4535 r = 0; 4694 r = 0;
4536 goto out; 4695 goto out;
4537 } 4696 }
4538 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4697 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
4539 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4698 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4540 r = 0; 4699 r = 0;
4541 goto out; 4700 goto out;
4542 } 4701 }
4543 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { 4702 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
4544 vcpu->fpu_active = 0; 4703 vcpu->fpu_active = 0;
4545 kvm_x86_ops->fpu_deactivate(vcpu); 4704 kvm_x86_ops->fpu_deactivate(vcpu);
4546 } 4705 }
4547 } 4706 }
4548 4707
4708 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r))
4710 goto out;
4711
4549 preempt_disable(); 4712 preempt_disable();
4550 4713
4551 kvm_x86_ops->prepare_guest_switch(vcpu); 4714 kvm_x86_ops->prepare_guest_switch(vcpu);
4552 if (vcpu->fpu_active) 4715 if (vcpu->fpu_active)
4553 kvm_load_guest_fpu(vcpu); 4716 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu);
4554 4718
4555 local_irq_disable(); 4719 atomic_set(&vcpu->guest_mode, 1);
4720 smp_wmb();
4556 4721
4557 clear_bit(KVM_REQ_KICK, &vcpu->requests); 4722 local_irq_disable();
4558 smp_mb__after_clear_bit();
4559 4723
4560 if (vcpu->requests || need_resched() || signal_pending(current)) { 4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
4561 set_bit(KVM_REQ_KICK, &vcpu->requests); 4725 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0);
4727 smp_wmb();
4562 local_irq_enable(); 4728 local_irq_enable();
4563 preempt_enable(); 4729 preempt_enable();
4564 r = 1; 4730 r = 1;
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4603 if (hw_breakpoint_active()) 4769 if (hw_breakpoint_active())
4604 hw_breakpoint_restore(); 4770 hw_breakpoint_restore();
4605 4771
4606 set_bit(KVM_REQ_KICK, &vcpu->requests); 4772 atomic_set(&vcpu->guest_mode, 0);
4773 smp_wmb();
4607 local_irq_enable(); 4774 local_irq_enable();
4608 4775
4609 ++vcpu->stat.exits; 4776 ++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4665 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4666 kvm_vcpu_block(vcpu); 4833 kvm_vcpu_block(vcpu);
4667 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4834 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4668 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4835 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
4669 { 4836 {
4670 switch(vcpu->arch.mp_state) { 4837 switch(vcpu->arch.mp_state) {
4671 case KVM_MP_STATE_HALTED: 4838 case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4717 int r; 4884 int r;
4718 sigset_t sigsaved; 4885 sigset_t sigsaved;
4719 4886
4720 vcpu_load(vcpu);
4721
4722 if (vcpu->sigset_active) 4887 if (vcpu->sigset_active)
4723 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4724 4889
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4746 if (r == EMULATE_DO_MMIO) { 4911 if (r != EMULATE_DONE) {
4747 r = 0; 4912 r = 0;
4748 goto out; 4913 goto out;
4749 } 4914 }
@@ -4759,14 +4924,11 @@ out:
4759 if (vcpu->sigset_active) 4924 if (vcpu->sigset_active)
4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4925 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4761 4926
4762 vcpu_put(vcpu);
4763 return r; 4927 return r;
4764} 4928}
4765 4929
4766int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4767{ 4931{
4768 vcpu_load(vcpu);
4769
4770 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4771 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4772 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4789 regs->rip = kvm_rip_read(vcpu); 4951 regs->rip = kvm_rip_read(vcpu);
4790 regs->rflags = kvm_get_rflags(vcpu); 4952 regs->rflags = kvm_get_rflags(vcpu);
4791 4953
4792 vcpu_put(vcpu);
4793
4794 return 0; 4954 return 0;
4795} 4955}
4796 4956
4797int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4798{ 4958{
4799 vcpu_load(vcpu);
4800
4801 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4802 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4803 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4822 4980
4823 vcpu->arch.exception.pending = false; 4981 vcpu->arch.exception.pending = false;
4824 4982
4825 vcpu_put(vcpu);
4826
4827 return 0; 4983 return 0;
4828} 4984}
4829 4985
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4842{ 4998{
4843 struct desc_ptr dt; 4999 struct desc_ptr dt;
4844 5000
4845 vcpu_load(vcpu);
4846
4847 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5001 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4848 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5002 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4849 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5003 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4875 set_bit(vcpu->arch.interrupt.nr, 5029 set_bit(vcpu->arch.interrupt.nr,
4876 (unsigned long *)sregs->interrupt_bitmap); 5030 (unsigned long *)sregs->interrupt_bitmap);
4877 5031
4878 vcpu_put(vcpu);
4879
4880 return 0; 5032 return 0;
4881} 5033}
4882 5034
4883int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5035int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4884 struct kvm_mp_state *mp_state) 5036 struct kvm_mp_state *mp_state)
4885{ 5037{
4886 vcpu_load(vcpu);
4887 mp_state->mp_state = vcpu->arch.mp_state; 5038 mp_state->mp_state = vcpu->arch.mp_state;
4888 vcpu_put(vcpu);
4889 return 0; 5039 return 0;
4890} 5040}
4891 5041
4892int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5042int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4893 struct kvm_mp_state *mp_state) 5043 struct kvm_mp_state *mp_state)
4894{ 5044{
4895 vcpu_load(vcpu);
4896 vcpu->arch.mp_state = mp_state->mp_state; 5045 vcpu->arch.mp_state = mp_state->mp_state;
4897 vcpu_put(vcpu);
4898 return 0; 5046 return 0;
4899} 5047}
4900 5048
4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5049int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4902 bool has_error_code, u32 error_code) 5050 bool has_error_code, u32 error_code)
4903{ 5051{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4904 int cs_db, cs_l, ret; 5053 int cs_db, cs_l, ret;
4905 cache_all_regs(vcpu); 5054 cache_all_regs(vcpu);
4906 5055
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4915 ? X86EMUL_MODE_VM86 : cs_l 5064 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db 5065 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4918 5069
4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4920 tss_selector, reason, has_error_code, 5071 tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4923 if (ret) 5074 if (ret)
4924 return EMULATE_FAIL; 5075 return EMULATE_FAIL;
4925 5076
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4927 return EMULATE_DONE; 5080 return EMULATE_DONE;
4928} 5081}
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4935 int pending_vec, max_bits; 5088 int pending_vec, max_bits;
4936 struct desc_ptr dt; 5089 struct desc_ptr dt;
4937 5090
4938 vcpu_load(vcpu);
4939
4940 dt.size = sregs->idt.limit; 5091 dt.size = sregs->idt.limit;
4941 dt.address = sregs->idt.base; 5092 dt.address = sregs->idt.base;
4942 kvm_x86_ops->set_idt(vcpu, &dt); 5093 kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4996 !is_protmode(vcpu)) 5147 !is_protmode(vcpu))
4997 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4998 5149
4999 vcpu_put(vcpu);
5000
5001 return 0; 5150 return 0;
5002} 5151}
5003 5152
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5007 unsigned long rflags; 5156 unsigned long rflags;
5008 int i, r; 5157 int i, r;
5009 5158
5010 vcpu_load(vcpu);
5011
5012 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5159 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
5013 r = -EBUSY; 5160 r = -EBUSY;
5014 if (vcpu->arch.exception.pending) 5161 if (vcpu->arch.exception.pending)
5015 goto unlock_out; 5162 goto out;
5016 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5163 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5017 kvm_queue_exception(vcpu, DB_VECTOR); 5164 kvm_queue_exception(vcpu, DB_VECTOR);
5018 else 5165 else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5054 5201
5055 r = 0; 5202 r = 0;
5056 5203
5057unlock_out: 5204out:
5058 vcpu_put(vcpu);
5059 5205
5060 return r; 5206 return r;
5061} 5207}
5062 5208
5063/* 5209/*
5064 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
5065 * we have asm/x86/processor.h
5066 */
5067struct fxsave {
5068 u16 cwd;
5069 u16 swd;
5070 u16 twd;
5071 u16 fop;
5072 u64 rip;
5073 u64 rdp;
5074 u32 mxcsr;
5075 u32 mxcsr_mask;
5076 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
5077#ifdef CONFIG_X86_64
5078 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
5079#else
5080 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
5081#endif
5082};
5083
5084/*
5085 * Translate a guest virtual address to a guest physical address. 5210 * Translate a guest virtual address to a guest physical address.
5086 */ 5211 */
5087int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 5212int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5091 gpa_t gpa; 5216 gpa_t gpa;
5092 int idx; 5217 int idx;
5093 5218
5094 vcpu_load(vcpu);
5095 idx = srcu_read_lock(&vcpu->kvm->srcu); 5219 idx = srcu_read_lock(&vcpu->kvm->srcu);
5096 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5220 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
5097 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5221 srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5099 tr->valid = gpa != UNMAPPED_GVA; 5223 tr->valid = gpa != UNMAPPED_GVA;
5100 tr->writeable = 1; 5224 tr->writeable = 1;
5101 tr->usermode = 0; 5225 tr->usermode = 0;
5102 vcpu_put(vcpu);
5103 5226
5104 return 0; 5227 return 0;
5105} 5228}
5106 5229
5107int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5230int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5108{ 5231{
5109 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5232 struct i387_fxsave_struct *fxsave =
5110 5233 &vcpu->arch.guest_fpu.state->fxsave;
5111 vcpu_load(vcpu);
5112 5234
5113 memcpy(fpu->fpr, fxsave->st_space, 128); 5235 memcpy(fpu->fpr, fxsave->st_space, 128);
5114 fpu->fcw = fxsave->cwd; 5236 fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5119 fpu->last_dp = fxsave->rdp; 5241 fpu->last_dp = fxsave->rdp;
5120 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5242 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
5121 5243
5122 vcpu_put(vcpu);
5123
5124 return 0; 5244 return 0;
5125} 5245}
5126 5246
5127int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5247int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5128{ 5248{
5129 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5249 struct i387_fxsave_struct *fxsave =
5130 5250 &vcpu->arch.guest_fpu.state->fxsave;
5131 vcpu_load(vcpu);
5132 5251
5133 memcpy(fxsave->st_space, fpu->fpr, 128); 5252 memcpy(fxsave->st_space, fpu->fpr, 128);
5134 fxsave->cwd = fpu->fcw; 5253 fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5139 fxsave->rdp = fpu->last_dp; 5258 fxsave->rdp = fpu->last_dp;
5140 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5259 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
5141 5260
5142 vcpu_put(vcpu);
5143
5144 return 0; 5261 return 0;
5145} 5262}
5146 5263
5147void fx_init(struct kvm_vcpu *vcpu) 5264int fx_init(struct kvm_vcpu *vcpu)
5148{ 5265{
5149 unsigned after_mxcsr_mask; 5266 int err;
5267
5268 err = fpu_alloc(&vcpu->arch.guest_fpu);
5269 if (err)
5270 return err;
5271
5272 fpu_finit(&vcpu->arch.guest_fpu);
5150 5273
5151 /* 5274 /*
5152 * Touch the fpu the first time in non atomic context as if 5275 * Ensure guest xcr0 is valid for loading
5153 * this is the first fpu instruction the exception handler
5154 * will fire before the instruction returns and it'll have to
5155 * allocate ram with GFP_KERNEL.
5156 */ 5276 */
5157 if (!used_math()) 5277 vcpu->arch.xcr0 = XSTATE_FP;
5158 kvm_fx_save(&vcpu->arch.host_fx_image);
5159
5160 /* Initialize guest FPU by resetting ours and saving into guest's */
5161 preempt_disable();
5162 kvm_fx_save(&vcpu->arch.host_fx_image);
5163 kvm_fx_finit();
5164 kvm_fx_save(&vcpu->arch.guest_fx_image);
5165 kvm_fx_restore(&vcpu->arch.host_fx_image);
5166 preempt_enable();
5167 5278
5168 vcpu->arch.cr0 |= X86_CR0_ET; 5279 vcpu->arch.cr0 |= X86_CR0_ET;
5169 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 5280
5170 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 5281 return 0;
5171 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
5172 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
5173} 5282}
5174EXPORT_SYMBOL_GPL(fx_init); 5283EXPORT_SYMBOL_GPL(fx_init);
5175 5284
5285static void fx_free(struct kvm_vcpu *vcpu)
5286{
5287 fpu_free(&vcpu->arch.guest_fpu);
5288}
5289
5176void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5290void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
5177{ 5291{
5178 if (vcpu->guest_fpu_loaded) 5292 if (vcpu->guest_fpu_loaded)
5179 return; 5293 return;
5180 5294
5295 /*
5296 * Restore all possible states in the guest,
5297 * and assume host would use all available bits.
5298 * Guest xcr0 would be loaded later.
5299 */
5300 kvm_put_guest_xcr0(vcpu);
5181 vcpu->guest_fpu_loaded = 1; 5301 vcpu->guest_fpu_loaded = 1;
5182 kvm_fx_save(&vcpu->arch.host_fx_image); 5302 unlazy_fpu(current);
5183 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5303 fpu_restore_checking(&vcpu->arch.guest_fpu);
5184 trace_kvm_fpu(1); 5304 trace_kvm_fpu(1);
5185} 5305}
5186 5306
5187void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5307void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5188{ 5308{
5309 kvm_put_guest_xcr0(vcpu);
5310
5189 if (!vcpu->guest_fpu_loaded) 5311 if (!vcpu->guest_fpu_loaded)
5190 return; 5312 return;
5191 5313
5192 vcpu->guest_fpu_loaded = 0; 5314 vcpu->guest_fpu_loaded = 0;
5193 kvm_fx_save(&vcpu->arch.guest_fx_image); 5315 fpu_save_init(&vcpu->arch.guest_fpu);
5194 kvm_fx_restore(&vcpu->arch.host_fx_image);
5195 ++vcpu->stat.fpu_reload; 5316 ++vcpu->stat.fpu_reload;
5196 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); 5317 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
5197 trace_kvm_fpu(0); 5318 trace_kvm_fpu(0);
5198} 5319}
5199 5320
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5204 vcpu->arch.time_page = NULL; 5325 vcpu->arch.time_page = NULL;
5205 } 5326 }
5206 5327
5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5329 fx_free(vcpu);
5207 kvm_x86_ops->vcpu_free(vcpu); 5330 kvm_x86_ops->vcpu_free(vcpu);
5208} 5331}
5209 5332
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
5217{ 5340{
5218 int r; 5341 int r;
5219 5342
5220 /* We do fxsave: this must be aligned. */
5221 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
5222
5223 vcpu->arch.mtrr_state.have_fixed = 1; 5343 vcpu->arch.mtrr_state.have_fixed = 1;
5224 vcpu_load(vcpu); 5344 vcpu_load(vcpu);
5225 r = kvm_arch_vcpu_reset(vcpu); 5345 r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5241 kvm_mmu_unload(vcpu); 5361 kvm_mmu_unload(vcpu);
5242 vcpu_put(vcpu); 5362 vcpu_put(vcpu);
5243 5363
5364 fx_free(vcpu);
5244 kvm_x86_ops->vcpu_free(vcpu); 5365 kvm_x86_ops->vcpu_free(vcpu);
5245} 5366}
5246 5367
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5334 } 5455 }
5335 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5456 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
5336 5457
5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5459 goto fail_free_mce_banks;
5460
5337 return 0; 5461 return 0;
5462fail_free_mce_banks:
5463 kfree(vcpu->arch.mce_banks);
5338fail_free_lapic: 5464fail_free_lapic:
5339 kvm_free_lapic(vcpu); 5465 kvm_free_lapic(vcpu);
5340fail_mmu_destroy: 5466fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
5364 if (!kvm) 5490 if (!kvm)
5365 return ERR_PTR(-ENOMEM); 5491 return ERR_PTR(-ENOMEM);
5366 5492
5367 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5368 if (!kvm->arch.aliases) {
5369 kfree(kvm);
5370 return ERR_PTR(-ENOMEM);
5371 }
5372
5373 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5374 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5375 5495
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
5412void kvm_arch_sync_events(struct kvm *kvm) 5532void kvm_arch_sync_events(struct kvm *kvm)
5413{ 5533{
5414 kvm_free_all_assigned_devices(kvm); 5534 kvm_free_all_assigned_devices(kvm);
5535 kvm_free_pit(kvm);
5415} 5536}
5416 5537
5417void kvm_arch_destroy_vm(struct kvm *kvm) 5538void kvm_arch_destroy_vm(struct kvm *kvm)
5418{ 5539{
5419 kvm_iommu_unmap_guest(kvm); 5540 kvm_iommu_unmap_guest(kvm);
5420 kvm_free_pit(kvm);
5421 kfree(kvm->arch.vpic); 5541 kfree(kvm->arch.vpic);
5422 kfree(kvm->arch.vioapic); 5542 kfree(kvm->arch.vioapic);
5423 kvm_free_vcpus(kvm); 5543 kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5427 if (kvm->arch.ept_identity_pagetable) 5547 if (kvm->arch.ept_identity_pagetable)
5428 put_page(kvm->arch.ept_identity_pagetable); 5548 put_page(kvm->arch.ept_identity_pagetable);
5429 cleanup_srcu_struct(&kvm->srcu); 5549 cleanup_srcu_struct(&kvm->srcu);
5430 kfree(kvm->arch.aliases);
5431 kfree(kvm); 5550 kfree(kvm);
5432} 5551}
5433 5552
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5438 int user_alloc) 5557 int user_alloc)
5439{ 5558{
5440 int npages = memslot->npages; 5559 int npages = memslot->npages;
5560 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
5561
5562 /* Prevent internal slot pages from being moved by fork()/COW. */
5563 if (memslot->id >= KVM_MEMORY_SLOTS)
5564 map_flags = MAP_SHARED | MAP_ANONYMOUS;
5441 5565
5442 /*To keep backward compatibility with older userspace, 5566 /*To keep backward compatibility with older userspace,
5443 *x86 needs to hanlde !user_alloc case. 5567 *x86 needs to hanlde !user_alloc case.
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5450 userspace_addr = do_mmap(NULL, 0, 5574 userspace_addr = do_mmap(NULL, 0,
5451 npages * PAGE_SIZE, 5575 npages * PAGE_SIZE,
5452 PROT_READ | PROT_WRITE, 5576 PROT_READ | PROT_WRITE,
5453 MAP_PRIVATE | MAP_ANONYMOUS, 5577 map_flags,
5454 0); 5578 0);
5455 up_write(&current->mm->mmap_sem); 5579 up_write(&current->mm->mmap_sem);
5456 5580
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5523 5647
5524 me = get_cpu(); 5648 me = get_cpu();
5525 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5649 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5526 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5650 if (atomic_xchg(&vcpu->guest_mode, 0))
5527 smp_send_reschedule(cpu); 5651 smp_send_reschedule(cpu);
5528 put_cpu(); 5652 put_cpu();
5529} 5653}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
77 70
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f871e04b6965..e10cf070ede0 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y)
30 lib-y += checksum_32.o 30 lib-y += checksum_32.o
31 lib-y += strstr_32.o 31 lib-y += strstr_32.o
32 lib-y += semaphore_32.o string_32.o 32 lib-y += semaphore_32.o string_32.o
33 lib-y += cmpxchg.o
33ifneq ($(CONFIG_X86_CMPXCHG64),y) 34ifneq ($(CONFIG_X86_CMPXCHG64),y)
34 lib-y += cmpxchg8b_emu.o atomic64_386_32.o 35 lib-y += cmpxchg8b_emu.o atomic64_386_32.o
35endif 36endif
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/lib/cmpxchg.c
index 2056ccf572cc..5d619f6df3ee 100644
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ b/arch/x86/lib/cmpxchg.c
@@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
52} 52}
53EXPORT_SYMBOL(cmpxchg_386_u32); 53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif 54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a725b7f760ae..0002a3a33081 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -37,6 +37,28 @@ struct addr_marker {
37 const char *name; 37 const char *name;
38}; 38};
39 39
40/* indices for address_markers; keep sync'd w/ address_markers below */
41enum address_markers_idx {
42 USER_SPACE_NR = 0,
43#ifdef CONFIG_X86_64
44 KERNEL_SPACE_NR,
45 LOW_KERNEL_NR,
46 VMALLOC_START_NR,
47 VMEMMAP_START_NR,
48 HIGH_KERNEL_NR,
49 MODULES_VADDR_NR,
50 MODULES_END_NR,
51#else
52 KERNEL_SPACE_NR,
53 VMALLOC_START_NR,
54 VMALLOC_END_NR,
55# ifdef CONFIG_HIGHMEM
56 PKMAP_BASE_NR,
57# endif
58 FIXADDR_START_NR,
59#endif
60};
61
40/* Address space markers hints */ 62/* Address space markers hints */
41static struct addr_marker address_markers[] = { 63static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 64 { 0, "User Space" },
@@ -331,14 +353,12 @@ static int pt_dump_init(void)
331 353
332#ifdef CONFIG_X86_32 354#ifdef CONFIG_X86_32
333 /* Not a compile-time constant on x86-32 */ 355 /* Not a compile-time constant on x86-32 */
334 address_markers[2].start_address = VMALLOC_START; 356 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
335 address_markers[3].start_address = VMALLOC_END; 357 address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
336# ifdef CONFIG_HIGHMEM 358# ifdef CONFIG_HIGHMEM
337 address_markers[4].start_address = PKMAP_BASE; 359 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
338 address_markers[5].start_address = FIXADDR_START;
339# else
340 address_markers[4].start_address = FIXADDR_START;
341# endif 360# endif
361 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
342#endif 362#endif
343 363
344 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, 364 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba315d1..9a6674689a20 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -2,7 +2,7 @@
2 * linux/arch/x86_64/mm/init.c 2 * linux/arch/x86_64/mm/init.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> 6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */ 7 */
8 8
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 12e4d2d3c110..3ba6e0608c55 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
62static void __iomem *__ioremap_caller(resource_size_t phys_addr, 62static void __iomem *__ioremap_caller(resource_size_t phys_addr,
63 unsigned long size, unsigned long prot_val, void *caller) 63 unsigned long size, unsigned long prot_val, void *caller)
64{ 64{
65 unsigned long pfn, offset, vaddr; 65 unsigned long offset, vaddr;
66 resource_size_t last_addr; 66 resource_size_t pfn, last_pfn, last_addr;
67 const resource_size_t unaligned_phys_addr = phys_addr; 67 const resource_size_t unaligned_phys_addr = phys_addr;
68 const unsigned long unaligned_size = size; 68 const unsigned long unaligned_size = size;
69 struct vm_struct *area; 69 struct vm_struct *area;
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
100 /* 100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 101 * Don't allow anybody to remap normal RAM that we're using..
102 */ 102 */
103 for (pfn = phys_addr >> PAGE_SHIFT; 103 last_pfn = last_addr >> PAGE_SHIFT;
104 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); 104 for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
105 pfn++) {
106
107 int is_ram = page_is_ram(pfn); 105 int is_ram = page_is_ram(pfn);
108 106
109 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) 107 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
115 * Mappings have to be page-aligned 113 * Mappings have to be page-aligned
116 */ 114 */
117 offset = phys_addr & ~PAGE_MASK; 115 offset = phys_addr & ~PAGE_MASK;
118 phys_addr &= PAGE_MASK; 116 phys_addr &= PHYSICAL_PAGE_MASK;
119 size = PAGE_ALIGN(last_addr+1) - phys_addr; 117 size = PAGE_ALIGN(last_addr+1) - phys_addr;
120 118
121 retval = reserve_memtype(phys_addr, (u64)phys_addr + size, 119 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
@@ -613,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
613 return; 611 return;
614 } 612 }
615 offset = virt_addr & ~PAGE_MASK; 613 offset = virt_addr & ~PAGE_MASK;
616 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 614 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
617 615
618 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; 616 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
619 while (nrpages > 0) { 617 while (nrpages > 0) {
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 5d0e67fff1a6..e5d5e2ce9f77 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -45,6 +45,8 @@ struct kmmio_fault_page {
45 * Protected by kmmio_lock, when linked into kmmio_page_table. 45 * Protected by kmmio_lock, when linked into kmmio_page_table.
46 */ 46 */
47 int count; 47 int count;
48
49 bool scheduled_for_release;
48}; 50};
49 51
50struct kmmio_delayed_release { 52struct kmmio_delayed_release {
@@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page,
398 BUG_ON(f->count < 0); 400 BUG_ON(f->count < 0);
399 if (!f->count) { 401 if (!f->count) {
400 disarm_kmmio_fault_page(f); 402 disarm_kmmio_fault_page(f);
401 f->release_next = *release_list; 403 if (!f->scheduled_for_release) {
402 *release_list = f; 404 f->release_next = *release_list;
405 *release_list = f;
406 f->scheduled_for_release = true;
407 }
403 } 408 }
404} 409}
405 410
@@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
471 prevp = &f->release_next; 476 prevp = &f->release_next;
472 } else { 477 } else {
473 *prevp = f->release_next; 478 *prevp = f->release_next;
479 f->release_next = NULL;
480 f->scheduled_for_release = false;
474 } 481 }
475 f = f->release_next; 482 f = *prevp;
476 } 483 }
477 spin_unlock_irqrestore(&kmmio_lock, flags); 484 spin_unlock_irqrestore(&kmmio_lock, flags);
478 485
@@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
510 kmmio_count--; 517 kmmio_count--;
511 spin_unlock_irqrestore(&kmmio_lock, flags); 518 spin_unlock_irqrestore(&kmmio_lock, flags);
512 519
520 if (!release_list)
521 return;
522
513 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 523 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
514 if (!drelease) { 524 if (!drelease) {
515 pr_crit("leaking kmmio_fault_page objects.\n"); 525 pr_crit("leaking kmmio_fault_page objects.\n");
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 64121a18b8cb..f6ff57b7efa5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
158 return req_type; 158 return req_type;
159} 159}
160 160
161static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 161static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
162{ 162{
163 int ram_page = 0, not_rampage = 0; 163 int ram_page = 0, not_rampage = 0;
164 unsigned long page_nr; 164 unsigned long page_nr;
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index f20eeec85a86..8acaddd0fb21 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -34,8 +34,7 @@
34 * memtype_lock protects the rbtree. 34 * memtype_lock protects the rbtree.
35 */ 35 */
36 36
37static void memtype_rb_augment_cb(struct rb_node *node); 37static struct rb_root memtype_rbroot = RB_ROOT;
38static struct rb_root memtype_rbroot = RB_AUGMENT_ROOT(&memtype_rb_augment_cb);
39 38
40static int is_node_overlap(struct memtype *node, u64 start, u64 end) 39static int is_node_overlap(struct memtype *node, u64 start, u64 end)
41{ 40{
@@ -56,7 +55,7 @@ static u64 get_subtree_max_end(struct rb_node *node)
56} 55}
57 56
58/* Update 'subtree_max_end' for a node, based on node and its children */ 57/* Update 'subtree_max_end' for a node, based on node and its children */
59static void update_node_max_end(struct rb_node *node) 58static void memtype_rb_augment_cb(struct rb_node *node, void *__unused)
60{ 59{
61 struct memtype *data; 60 struct memtype *data;
62 u64 max_end, child_max_end; 61 u64 max_end, child_max_end;
@@ -78,25 +77,6 @@ static void update_node_max_end(struct rb_node *node)
78 data->subtree_max_end = max_end; 77 data->subtree_max_end = max_end;
79} 78}
80 79
81/* Update 'subtree_max_end' for a node and all its ancestors */
82static void update_path_max_end(struct rb_node *node)
83{
84 u64 old_max_end, new_max_end;
85
86 while (node) {
87 struct memtype *data = container_of(node, struct memtype, rb);
88
89 old_max_end = data->subtree_max_end;
90 update_node_max_end(node);
91 new_max_end = data->subtree_max_end;
92
93 if (new_max_end == old_max_end)
94 break;
95
96 node = rb_parent(node);
97 }
98}
99
100/* Find the first (lowest start addr) overlapping range from rb tree */ 80/* Find the first (lowest start addr) overlapping range from rb tree */
101static struct memtype *memtype_rb_lowest_match(struct rb_root *root, 81static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
102 u64 start, u64 end) 82 u64 start, u64 end)
@@ -190,12 +170,6 @@ failure:
190 return -EBUSY; 170 return -EBUSY;
191} 171}
192 172
193static void memtype_rb_augment_cb(struct rb_node *node)
194{
195 if (node)
196 update_path_max_end(node);
197}
198
199static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) 173static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
200{ 174{
201 struct rb_node **node = &(root->rb_node); 175 struct rb_node **node = &(root->rb_node);
@@ -213,6 +187,7 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
213 187
214 rb_link_node(&newdata->rb, parent, node); 188 rb_link_node(&newdata->rb, parent, node);
215 rb_insert_color(&newdata->rb, root); 189 rb_insert_color(&newdata->rb, root);
190 rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL);
216} 191}
217 192
218int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) 193int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
@@ -234,13 +209,16 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
234 209
235struct memtype *rbt_memtype_erase(u64 start, u64 end) 210struct memtype *rbt_memtype_erase(u64 start, u64 end)
236{ 211{
212 struct rb_node *deepest;
237 struct memtype *data; 213 struct memtype *data;
238 214
239 data = memtype_rb_exact_match(&memtype_rbroot, start, end); 215 data = memtype_rb_exact_match(&memtype_rbroot, start, end);
240 if (!data) 216 if (!data)
241 goto out; 217 goto out;
242 218
219 deepest = rb_augment_erase_begin(&data->rb);
243 rb_erase(&data->rb, &memtype_rbroot); 220 rb_erase(&data->rb, &memtype_rbroot);
221 rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL);
244out: 222out:
245 return data; 223 return data;
246} 224}
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 308e32570d84..38e6d174c497 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = {
40static unsigned int reg_rop[] = { 40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42}; 42};
43static unsigned int reg_wop[] = { 0x88, 0x89 }; 43static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 }; 44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/ 45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; 46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
47static unsigned int rw32[] = { 47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
49}; 49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; 50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F }; 51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; 52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
53static unsigned int mw64[] = {}; 53static unsigned int mw64[] = {};
54#else /* not __i386__ */ 54#else /* not __i386__ */
55static unsigned char prefix_codes[] = { 55static unsigned char prefix_codes[] = {
@@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = {
63static unsigned int reg_rop[] = { 63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65}; 65};
66static unsigned int reg_wop[] = { 0x88, 0x89 }; 66static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 }; 67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; 68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
69static unsigned int rw32[] = { 69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F 70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
71}; 71};
72/* 8 bit only */ 72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; 73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
74/* 16 bit only */ 74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F }; 75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */ 76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 }; 77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */ 78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B }; 79static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
80#endif /* not __i386__ */ 80#endif /* not __i386__ */
81 81
82struct prefix_bits { 82struct prefix_bits {
@@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
410unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) 410unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
411{ 411{
412 unsigned int opcode; 412 unsigned int opcode;
413 unsigned char mod_rm;
414 int reg; 413 int reg;
415 unsigned char *p; 414 unsigned char *p;
416 struct prefix_bits prf; 415 struct prefix_bits prf;
@@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
437 goto err; 436 goto err;
438 437
439do_work: 438do_work:
440 mod_rm = *p; 439 /* for STOS, source register is fixed */
441 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); 440 if (opcode == 0xAA || opcode == 0xAB) {
441 reg = arg_AX;
442 } else {
443 unsigned char mod_rm = *p;
444 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
445 }
442 switch (get_ins_reg_width(ins_addr)) { 446 switch (get_ins_reg_width(ins_addr)) {
443 case 1: 447 case 1:
444 return *get_reg_w8(reg, prf.rex, regs); 448 return *get_reg_w8(reg, prf.rex, regs);
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 8565d944f7cf..38868adf07ea 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -90,6 +90,27 @@ static void do_test(unsigned long size)
90 iounmap(p); 90 iounmap(p);
91} 91}
92 92
93/*
94 * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
95 * a short time. We had a bug in deferred freeing procedure which tried
96 * to free this region multiple times (ioremap can reuse the same address
97 * for many mappings).
98 */
99static void do_test_bulk_ioremapping(void)
100{
101 void __iomem *p;
102 int i;
103
104 for (i = 0; i < 10; ++i) {
105 p = ioremap_nocache(mmio_address, PAGE_SIZE);
106 if (p)
107 iounmap(p);
108 }
109
110 /* Force freeing. If it will crash we will know why. */
111 synchronize_rcu();
112}
113
93static int __init init(void) 114static int __init init(void)
94{ 115{
95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 116 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
@@ -104,6 +125,7 @@ static int __init init(void)
104 "and writing 16 kB of rubbish in there.\n", 125 "and writing 16 kB of rubbish in there.\n",
105 size >> 10, mmio_address); 126 size >> 10, mmio_address);
106 do_test(size); 127 do_test(size);
128 do_test_bulk_ioremapping();
107 pr_info("All done.\n"); 129 pr_info("All done.\n");
108 return 0; 130 return 0;
109} 131}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 426f3a1a64d3..c03f14ab6667 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
278 278
279static void do_flush_tlb_all(void *info) 279static void do_flush_tlb_all(void *info)
280{ 280{
281 unsigned long cpu = smp_processor_id();
282
283 __flush_tlb_all(); 281 __flush_tlb_all();
284 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 282 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
285 leave_mm(cpu); 283 leave_mm(smp_processor_id());
286} 284}
287 285
288void flush_tlb_all(void) 286void flush_tlb_all(void)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b28d2f1253bb..1ba67dc8006a 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type)
634 if (force_arch_perfmon && cpu_has_arch_perfmon) 634 if (force_arch_perfmon && cpu_has_arch_perfmon)
635 return 0; 635 return 0;
636 636
637 /*
638 * Documentation on identifying Intel processors by CPU family
639 * and model can be found in the Intel Software Developer's
640 * Manuals (SDM):
641 *
642 * http://www.intel.com/products/processor/manuals/
643 *
644 * As of May 2010 the documentation for this was in the:
645 * "Intel 64 and IA-32 Architectures Software Developer's
646 * Manual Volume 3B: System Programming Guide", "Table B-1
647 * CPUID Signature Values of DisplayFamily_DisplayModel".
648 */
637 switch (cpu_model) { 649 switch (cpu_model) {
638 case 0 ... 2: 650 case 0 ... 2:
639 *cpu_type = "i386/ppro"; 651 *cpu_type = "i386/ppro";
@@ -655,12 +667,12 @@ static int __init ppro_init(char **cpu_type)
655 case 15: case 23: 667 case 15: case 23:
656 *cpu_type = "i386/core_2"; 668 *cpu_type = "i386/core_2";
657 break; 669 break;
670 case 0x1a:
658 case 0x2e: 671 case 0x2e:
659 case 26:
660 spec = &op_arch_perfmon_spec; 672 spec = &op_arch_perfmon_spec;
661 *cpu_type = "i386/core_i7"; 673 *cpu_type = "i386/core_i7";
662 break; 674 break;
663 case 28: 675 case 0x1c:
664 *cpu_type = "i386/atom"; 676 *cpu_type = "i386/atom";
665 break; 677 break;
666 default: 678 default:
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2ec04c424a62..15466c096ba5 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), 34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
35 }, 35 },
36 }, 36 },
37 /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */
38 /* 2006 AMD HT/VIA system with two host bridges */
39 {
40 .callback = set_use_crs,
41 .ident = "ASRock ALiveSATA2-GLAN",
42 .matches = {
43 DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
44 },
45 },
37 {} 46 {}
38}; 47};
39 48
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 215a27ae050d..a0772af64efb 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void)
125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) 125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
126{ 126{
127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; 127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
128 struct resource *bar_r;
129 int bar;
130
131 if (pci_probe & PCI_NOASSIGN_BARS) {
132 /*
133 * If the BIOS did not assign the BAR, zero out the
134 * resource so the kernel doesn't attmept to assign
135 * it later on in pci_assign_unassigned_resources
136 */
137 for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
138 bar_r = &dev->resource[bar];
139 if (bar_r->start == 0 && bar_r->end != 0) {
140 bar_r->flags = 0;
141 bar_r->end = 0;
142 }
143 }
144 }
128 145
129 if (pci_probe & PCI_NOASSIGN_ROMS) { 146 if (pci_probe & PCI_NOASSIGN_ROMS) {
130 if (rom_r->parent) 147 if (rom_r->parent)
@@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str)
509 } else if (!strcmp(str, "norom")) { 526 } else if (!strcmp(str, "norom")) {
510 pci_probe |= PCI_NOASSIGN_ROMS; 527 pci_probe |= PCI_NOASSIGN_ROMS;
511 return NULL; 528 return NULL;
529 } else if (!strcmp(str, "nobar")) {
530 pci_probe |= PCI_NOASSIGN_BARS;
531 return NULL;
512 } else if (!strcmp(str, "assign-busses")) { 532 } else if (!strcmp(str, "assign-busses")) {
513 pci_probe |= PCI_ASSIGN_ALL_BUSSES; 533 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
514 return NULL; 534 return NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 6fdb3ec30c31..55253095be84 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -184,6 +184,7 @@ static void __init pcibios_allocate_resources(int pass)
184 idx, r, disabled, pass); 184 idx, r, disabled, pass);
185 if (pci_claim_resource(dev, idx) < 0) { 185 if (pci_claim_resource(dev, idx) < 0) {
186 /* We'll assign a new address later */ 186 /* We'll assign a new address later */
187 dev->fw_addr[idx] = r->start;
187 r->end -= r->start; 188 r->end -= r->start;
188 r->start = 0; 189 r->start = 0;
189 } 190 }
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9810a0f76c91..f547ee05f715 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
989 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); 989 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
990 990
991 /* Update IRQ for all devices with the same pirq value */ 991 /* Update IRQ for all devices with the same pirq value */
992 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 992 for_each_pci_dev(dev2) {
993 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); 993 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
994 if (!pin) 994 if (!pin)
995 continue; 995 continue;
@@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void)
1028 u8 pin; 1028 u8 pin;
1029 1029
1030 DBG(KERN_DEBUG "PCI: IRQ fixup\n"); 1030 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
1031 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1031 for_each_pci_dev(dev) {
1032 /* 1032 /*
1033 * If the BIOS has set an out of range IRQ number, just 1033 * If the BIOS has set an out of range IRQ number, just
1034 * ignore it. Also keep track of which IRQ's are 1034 * ignore it. Also keep track of which IRQ's are
@@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void)
1052 return; 1052 return;
1053 1053
1054 dev = NULL; 1054 dev = NULL;
1055 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1055 for_each_pci_dev(dev) {
1056 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1056 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1057 if (!pin) 1057 if (!pin)
1058 continue; 1058 continue;
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 8d460eaf524f..c89266be6048 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -36,7 +36,7 @@ int __init pci_legacy_init(void)
36 return 0; 36 return 0;
37} 37}
38 38
39void pcibios_scan_specific_bus(int busn) 39void __devinit pcibios_scan_specific_bus(int busn)
40{ 40{
41 int devfn; 41 int devfn;
42 long node; 42 long node;
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 7ef3a2735df3..cb29191cee58 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -66,8 +66,9 @@ static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
66 devfn, pos, 4, &pcie_cap)) 66 devfn, pos, 4, &pcie_cap))
67 return 0; 67 return 0;
68 68
69 if (pcie_cap == 0xffffffff) 69 if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 ||
70 return 0; 70 PCI_EXT_CAP_ID(pcie_cap) == 0xffff)
71 break;
71 72
72 if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { 73 if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
73 raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, 74 raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
@@ -76,7 +77,7 @@ static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
76 return pos; 77 return pos;
77 } 78 }
78 79
79 pos = pcie_cap >> 20; 80 pos = PCI_EXT_CAP_NEXT(pcie_cap);
80 } 81 }
81 82
82 return 0; 83 return 0;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1290ba54b350..e7e8c5f54956 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -4,7 +4,7 @@
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
7 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> 7 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index d24f983ba1e5..460f314d13e5 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -4,7 +4,7 @@
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
7 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> 7 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 6b4ffedb93c9..4a2afa1bac51 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
120quiet_cmd_vdso = VDSO $@ 120quiet_cmd_vdso = VDSO $@
121 cmd_vdso = $(CC) -nostdlib -o $@ \ 121 cmd_vdso = $(CC) -nostdlib -o $@ \
122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ 122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
124 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
124 125
125VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) 126VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n 127GCOV_PROFILE := n
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh
new file mode 100755
index 000000000000..7ee90a9b549d
--- /dev/null
+++ b/arch/x86/vdso/checkundef.sh
@@ -0,0 +1,10 @@
#!/bin/sh
# Fail if the given object contains undefined symbols.
#
# usage: checkundef.sh <nm command> <object file>
#
# Exits 0 when nm lists no 'U' (undefined) entries, 1 otherwise.
nm="$1"
file="$2"

# Run nm on its own first: if nm itself fails (missing file, bad command)
# its empty output must not be mistaken for "no undefined symbols".
# $nm is left unquoted, as in the original, so a multi-word value still
# word-splits into command + arguments.
if ! syms=$($nm "$file"); then
	echo "$file: failed to list symbols" >&2
	exit 1
fi

if printf '%s\n' "$syms" | grep -q '^ *U'; then
	echo "$file: undefined symbols found" >&2
	exit 1
fi

exit 0
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 02b442e92007..36df991985b2 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
374 374
375#ifdef CONFIG_X86_64 375#ifdef CONFIG_X86_64
376 376
377__initcall(sysenter_setup); 377subsys_initcall(sysenter_setup);
378 378
379#ifdef CONFIG_SYSCTL 379#ifdef CONFIG_SYSCTL
380/* Register vsyscall32 into the ABI table */ 380/* Register vsyscall32 into the ABI table */
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ac74869b8140..4b5d26f108bb 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -67,6 +67,7 @@ static int __init init_vdso_vars(void)
67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; 67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h" 68#include "vextern.h"
69#undef VEXTERN 69#undef VEXTERN
70 vunmap(vbase);
70 return 0; 71 return 0;
71 72
72 oom: 73 oom:
@@ -74,7 +75,7 @@ static int __init init_vdso_vars(void)
74 vdso_enabled = 0; 75 vdso_enabled = 0;
75 return -ENOMEM; 76 return -ENOMEM;
76} 77}
77__initcall(init_vdso_vars); 78subsys_initcall(init_vdso_vars);
78 79
79struct linux_binprm; 80struct linux_binprm;
80 81
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119fbeb0..68128a1b401a 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,6 +13,11 @@ config XEN
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15 15
16config XEN_PVHVM
17 def_bool y
18 depends on XEN
19 depends on X86_LOCAL_APIC
20
16config XEN_MAX_DOMAIN_MEMORY 21config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 22 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32 23 default 8 if X86_32
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc21f4f2..930954685980 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,7 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o 15 grant-table.o suspend.o platform-pci-unplug.o
16 16
17obj-$(CONFIG_SMP) += smp.o 17obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a8..d4ff5e83621d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */ 12 */
13 13
14#include <linux/cpu.h>
14#include <linux/kernel.h> 15#include <linux/kernel.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/smp.h> 17#include <linux/smp.h>
@@ -35,8 +36,10 @@
35#include <xen/interface/version.h> 36#include <xen/interface/version.h>
36#include <xen/interface/physdev.h> 37#include <xen/interface/physdev.h>
37#include <xen/interface/vcpu.h> 38#include <xen/interface/vcpu.h>
39#include <xen/interface/memory.h>
38#include <xen/features.h> 40#include <xen/features.h>
39#include <xen/page.h> 41#include <xen/page.h>
42#include <xen/hvm.h>
40#include <xen/hvc-console.h> 43#include <xen/hvc-console.h>
41 44
42#include <asm/paravirt.h> 45#include <asm/paravirt.h>
@@ -55,7 +58,9 @@
55#include <asm/pgtable.h> 58#include <asm/pgtable.h>
56#include <asm/tlbflush.h> 59#include <asm/tlbflush.h>
57#include <asm/reboot.h> 60#include <asm/reboot.h>
61#include <asm/setup.h>
58#include <asm/stackprotector.h> 62#include <asm/stackprotector.h>
63#include <asm/hypervisor.h>
59 64
60#include "xen-ops.h" 65#include "xen-ops.h"
61#include "mmu.h" 66#include "mmu.h"
@@ -76,6 +81,10 @@ struct shared_info xen_dummy_shared_info;
76 81
77void *xen_initial_gdt; 82void *xen_initial_gdt;
78 83
84RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
85__read_mostly int xen_have_vector_callback;
86EXPORT_SYMBOL_GPL(xen_have_vector_callback);
87
79/* 88/*
80 * Point at some empty memory to start with. We map the real shared_info 89 * Point at some empty memory to start with. We map the real shared_info
81 * page as soon as fixmap is up and running. 90 * page as soon as fixmap is up and running.
@@ -97,6 +106,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
97 */ 106 */
98static int have_vcpu_info_placement = 1; 107static int have_vcpu_info_placement = 1;
99 108
/*
 * Cap setup_max_cpus at MAX_VIRT_CPUS.
 * Compiles to an empty function when CONFIG_SMP is not set.
 */
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
	if (setup_max_cpus <= MAX_VIRT_CPUS)
		return;

	setup_max_cpus = MAX_VIRT_CPUS;
#endif
}
116
100static void xen_vcpu_setup(int cpu) 117static void xen_vcpu_setup(int cpu)
101{ 118{
102 struct vcpu_register_vcpu_info info; 119 struct vcpu_register_vcpu_info info;
@@ -104,13 +121,17 @@ static void xen_vcpu_setup(int cpu)
104 struct vcpu_info *vcpup; 121 struct vcpu_info *vcpup;
105 122
106 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 123 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
107 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
108 124
109 if (!have_vcpu_info_placement) 125 if (cpu < MAX_VIRT_CPUS)
110 return; /* already tested, not available */ 126 per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
111 127
112 vcpup = &per_cpu(xen_vcpu_info, cpu); 128 if (!have_vcpu_info_placement) {
129 if (cpu >= MAX_VIRT_CPUS)
130 clamp_max_cpus();
131 return;
132 }
113 133
134 vcpup = &per_cpu(xen_vcpu_info, cpu);
114 info.mfn = arbitrary_virt_to_mfn(vcpup); 135 info.mfn = arbitrary_virt_to_mfn(vcpup);
115 info.offset = offset_in_page(vcpup); 136 info.offset = offset_in_page(vcpup);
116 137
@@ -125,6 +146,7 @@ static void xen_vcpu_setup(int cpu)
125 if (err) { 146 if (err) {
126 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); 147 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
127 have_vcpu_info_placement = 0; 148 have_vcpu_info_placement = 0;
149 clamp_max_cpus();
128 } else { 150 } else {
129 /* This cpu is using the registered vcpu info, even if 151 /* This cpu is using the registered vcpu info, even if
130 later ones fail to. */ 152 later ones fail to. */
@@ -731,7 +753,6 @@ static void set_xen_basic_apic_ops(void)
731 753
732#endif 754#endif
733 755
734
735static void xen_clts(void) 756static void xen_clts(void)
736{ 757{
737 struct multicall_space mcs; 758 struct multicall_space mcs;
@@ -926,10 +947,6 @@ static const struct pv_init_ops xen_init_ops __initdata = {
926 .patch = xen_patch, 947 .patch = xen_patch,
927}; 948};
928 949
929static const struct pv_time_ops xen_time_ops __initdata = {
930 .sched_clock = xen_sched_clock,
931};
932
933static const struct pv_cpu_ops xen_cpu_ops __initdata = { 950static const struct pv_cpu_ops xen_cpu_ops __initdata = {
934 .cpuid = xen_cpuid, 951 .cpuid = xen_cpuid,
935 952
@@ -1028,6 +1045,23 @@ static void xen_crash_shutdown(struct pt_regs *regs)
1028 xen_reboot(SHUTDOWN_crash); 1045 xen_reboot(SHUTDOWN_crash);
1029} 1046}
1030 1047
1048static int
1049xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
1050{
1051 xen_reboot(SHUTDOWN_crash);
1052 return NOTIFY_DONE;
1053}
1054
1055static struct notifier_block xen_panic_block = {
1056 .notifier_call= xen_panic_event,
1057};
1058
1059int xen_panic_handler_init(void)
1060{
1061 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
1062 return 0;
1063}
1064
1031static const struct machine_ops __initdata xen_machine_ops = { 1065static const struct machine_ops __initdata xen_machine_ops = {
1032 .restart = xen_restart, 1066 .restart = xen_restart,
1033 .halt = xen_machine_halt, 1067 .halt = xen_machine_halt,
@@ -1067,7 +1101,6 @@ asmlinkage void __init xen_start_kernel(void)
1067 /* Install Xen paravirt ops */ 1101 /* Install Xen paravirt ops */
1068 pv_info = xen_info; 1102 pv_info = xen_info;
1069 pv_init_ops = xen_init_ops; 1103 pv_init_ops = xen_init_ops;
1070 pv_time_ops = xen_time_ops;
1071 pv_cpu_ops = xen_cpu_ops; 1104 pv_cpu_ops = xen_cpu_ops;
1072 pv_apic_ops = xen_apic_ops; 1105 pv_apic_ops = xen_apic_ops;
1073 1106
@@ -1075,13 +1108,7 @@ asmlinkage void __init xen_start_kernel(void)
1075 x86_init.oem.arch_setup = xen_arch_setup; 1108 x86_init.oem.arch_setup = xen_arch_setup;
1076 x86_init.oem.banner = xen_banner; 1109 x86_init.oem.banner = xen_banner;
1077 1110
1078 x86_init.timers.timer_init = xen_time_init; 1111 xen_init_time_ops();
1079 x86_init.timers.setup_percpu_clockev = x86_init_noop;
1080 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
1081
1082 x86_platform.calibrate_tsc = xen_tsc_khz;
1083 x86_platform.get_wallclock = xen_get_wallclock;
1084 x86_platform.set_wallclock = xen_set_wallclock;
1085 1112
1086 /* 1113 /*
1087 * Set up some pagetable state before starting to set any ptes. 1114 * Set up some pagetable state before starting to set any ptes.
@@ -1206,3 +1233,139 @@ asmlinkage void __init xen_start_kernel(void)
1206 x86_64_start_reservations((char *)__pa_symbol(&boot_params)); 1233 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1207#endif 1234#endif
1208} 1235}
1236
1237static uint32_t xen_cpuid_base(void)
1238{
1239 uint32_t base, eax, ebx, ecx, edx;
1240 char signature[13];
1241
1242 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1243 cpuid(base, &eax, &ebx, &ecx, &edx);
1244 *(uint32_t *)(signature + 0) = ebx;
1245 *(uint32_t *)(signature + 4) = ecx;
1246 *(uint32_t *)(signature + 8) = edx;
1247 signature[12] = 0;
1248
1249 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
1250 return base;
1251 }
1252
1253 return 0;
1254}
1255
1256static int init_hvm_pv_info(int *major, int *minor)
1257{
1258 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1259 u64 pfn;
1260
1261 base = xen_cpuid_base();
1262 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1263
1264 *major = eax >> 16;
1265 *minor = eax & 0xffff;
1266 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
1267
1268 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1269
1270 pfn = __pa(hypercall_page);
1271 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1272
1273 xen_setup_features();
1274
1275 pv_info = xen_info;
1276 pv_info.kernel_rpl = 0;
1277
1278 xen_domain_type = XEN_HVM_DOMAIN;
1279
1280 return 0;
1281}
1282
1283void xen_hvm_init_shared_info(void)
1284{
1285 int cpu;
1286 struct xen_add_to_physmap xatp;
1287 static struct shared_info *shared_info_page = 0;
1288
1289 if (!shared_info_page)
1290 shared_info_page = (struct shared_info *)
1291 extend_brk(PAGE_SIZE, PAGE_SIZE);
1292 xatp.domid = DOMID_SELF;
1293 xatp.idx = 0;
1294 xatp.space = XENMAPSPACE_shared_info;
1295 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1296 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1297 BUG();
1298
1299 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1300
1301 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1302 * page, we use it in the event channel upcall and in some pvclock
1303 * related functions. We don't need the vcpu_info placement
1304 * optimizations because we don't use any pv_mmu or pv_irq op on
1305 * HVM.
1306 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1307 * online but xen_hvm_init_shared_info is run at resume time too and
1308 * in that case multiple vcpus might be online. */
1309 for_each_online_cpu(cpu) {
1310 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1311 }
1312}
1313
1314#ifdef CONFIG_XEN_PVHVM
1315static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1316 unsigned long action, void *hcpu)
1317{
1318 int cpu = (long)hcpu;
1319 switch (action) {
1320 case CPU_UP_PREPARE:
1321 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1322 break;
1323 default:
1324 break;
1325 }
1326 return NOTIFY_OK;
1327}
1328
1329static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
1330 .notifier_call = xen_hvm_cpu_notify,
1331};
1332
1333static void __init xen_hvm_guest_init(void)
1334{
1335 int r;
1336 int major, minor;
1337
1338 r = init_hvm_pv_info(&major, &minor);
1339 if (r < 0)
1340 return;
1341
1342 xen_hvm_init_shared_info();
1343
1344 if (xen_feature(XENFEAT_hvm_callback_vector))
1345 xen_have_vector_callback = 1;
1346 register_cpu_notifier(&xen_hvm_cpu_notifier);
1347 xen_unplug_emulated_devices();
1348 have_vcpu_info_placement = 0;
1349 x86_init.irqs.intr_init = xen_init_IRQ;
1350 xen_hvm_init_time_ops();
1351 xen_hvm_init_mmu_ops();
1352}
1353
1354static bool __init xen_hvm_platform(void)
1355{
1356 if (xen_pv_domain())
1357 return false;
1358
1359 if (!xen_cpuid_base())
1360 return false;
1361
1362 return true;
1363}
1364
1365const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
1366 .name = "Xen HVM",
1367 .detect = xen_hvm_platform,
1368 .init_platform = xen_hvm_guest_init,
1369};
1370EXPORT_SYMBOL(x86_hyper_xen_hvm);
1371#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 914f04695ce5..413b19b3d0fe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -58,6 +58,7 @@
58 58
59#include <xen/page.h> 59#include <xen/page.h>
60#include <xen/interface/xen.h> 60#include <xen/interface/xen.h>
61#include <xen/interface/hvm/hvm_op.h>
61#include <xen/interface/version.h> 62#include <xen/interface/version.h>
62#include <xen/hvc-console.h> 63#include <xen/hvc-console.h>
63 64
@@ -1941,6 +1942,40 @@ void __init xen_init_mmu_ops(void)
1941 pv_mmu_ops = xen_mmu_ops; 1942 pv_mmu_ops = xen_mmu_ops;
1942} 1943}
1943 1944
1945#ifdef CONFIG_XEN_PVHVM
1946static void xen_hvm_exit_mmap(struct mm_struct *mm)
1947{
1948 struct xen_hvm_pagetable_dying a;
1949 int rc;
1950
1951 a.domid = DOMID_SELF;
1952 a.gpa = __pa(mm->pgd);
1953 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
1954 WARN_ON_ONCE(rc < 0);
1955}
1956
1957static int is_pagetable_dying_supported(void)
1958{
1959 struct xen_hvm_pagetable_dying a;
1960 int rc = 0;
1961
1962 a.domid = DOMID_SELF;
1963 a.gpa = 0x00;
1964 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
1965 if (rc < 0) {
1966 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
1967 return 0;
1968 }
1969 return 1;
1970}
1971
1972void __init xen_hvm_init_mmu_ops(void)
1973{
1974 if (is_pagetable_dying_supported())
1975 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
1976}
1977#endif
1978
1944#ifdef CONFIG_XEN_DEBUG_FS 1979#ifdef CONFIG_XEN_DEBUG_FS
1945 1980
1946static struct dentry *d_mmu_debug; 1981static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7f5ecf..fa938c4aa2f7 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
60unsigned long xen_read_cr2_direct(void); 60unsigned long xen_read_cr2_direct(void);
61 61
62extern void xen_init_mmu_ops(void); 62extern void xen_init_mmu_ops(void);
63extern void xen_hvm_init_mmu_ops(void);
63#endif /* _XEN_MMU_H */ 64#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 000000000000..554c002a1e1a
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,137 @@
1/******************************************************************************
2 * platform-pci-unplug.c
3 *
4 * Xen platform PCI device driver
5 * Copyright (c) 2010, Citrix
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 *
20 */
21
22#include <linux/init.h>
23#include <linux/io.h>
24#include <linux/module.h>
25
26#include <xen/platform_pci.h>
27
28#define XEN_PLATFORM_ERR_MAGIC -1
29#define XEN_PLATFORM_ERR_PROTOCOL -2
30#define XEN_PLATFORM_ERR_BLACKLIST -3
31
32/* store the value of xen_emul_unplug after the unplug is done */
33int xen_platform_pci_unplug;
34EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
35#ifdef CONFIG_XEN_PVHVM
36static int xen_emul_unplug;
37
38static int __init check_platform_magic(void)
39{
40 short magic;
41 char protocol;
42
43 magic = inw(XEN_IOPORT_MAGIC);
44 if (magic != XEN_IOPORT_MAGIC_VAL) {
45 printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
46 return XEN_PLATFORM_ERR_MAGIC;
47 }
48
49 protocol = inb(XEN_IOPORT_PROTOVER);
50
51 printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
52 protocol);
53
54 switch (protocol) {
55 case 1:
56 outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
57 outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
58 if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
59 printk(KERN_ERR "Xen Platform: blacklisted by host\n");
60 return XEN_PLATFORM_ERR_BLACKLIST;
61 }
62 break;
63 default:
64 printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
65 return XEN_PLATFORM_ERR_PROTOCOL;
66 }
67
68 return 0;
69}
70
71void __init xen_unplug_emulated_devices(void)
72{
73 int r;
74
75 /* check the version of the xen platform PCI device */
76 r = check_platform_magic();
77 /* If the version matches enable the Xen platform PCI driver.
78 * Also enable the Xen platform PCI driver if the version is really old
79 * and the user told us to ignore it. */
80 if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
81 (xen_emul_unplug & XEN_UNPLUG_IGNORE)))
82 return;
83 /* Set the default value of xen_emul_unplug depending on whether or
84 * not the Xen PV frontends and the Xen platform PCI driver have
85 * been compiled for this kernel (modules or built-in are both OK). */
86 if (!xen_emul_unplug) {
87 if (xen_must_unplug_nics()) {
88 printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
89 "been compiled for this kernel: unplug emulated NICs.\n");
90 xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
91 }
92 if (xen_must_unplug_disks()) {
93 printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
94 "been compiled for this kernel: unplug emulated disks.\n"
95 "You might have to change the root device\n"
96 "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
97 "in your root= kernel command line option\n");
98 xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
99 }
100 }
101 /* Now unplug the emulated devices */
102 if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE))
103 outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
104 xen_platform_pci_unplug = xen_emul_unplug;
105}
106
107static int __init parse_xen_emul_unplug(char *arg)
108{
109 char *p, *q;
110 int l;
111
112 for (p = arg; p; p = q) {
113 q = strchr(p, ',');
114 if (q) {
115 l = q - p;
116 q++;
117 } else {
118 l = strlen(p);
119 }
120 if (!strncmp(p, "all", l))
121 xen_emul_unplug |= XEN_UNPLUG_ALL;
122 else if (!strncmp(p, "ide-disks", l))
123 xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
124 else if (!strncmp(p, "aux-ide-disks", l))
125 xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
126 else if (!strncmp(p, "nics", l))
127 xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
128 else if (!strncmp(p, "ignore", l))
129 xen_emul_unplug |= XEN_UNPLUG_IGNORE;
130 else
131 printk(KERN_WARNING "unrecognised option '%s' "
132 "in parameter 'xen_emul_unplug'\n", p);
133 }
134 return 0;
135}
136early_param("xen_emul_unplug", parse_xen_emul_unplug);
137#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f47cd4..328b00305426 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
20#include <xen/page.h> 20#include <xen/page.h>
21#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
22#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h>
23#include <xen/features.h> 24#include <xen/features.h>
24 25
25#include "xen-ops.h" 26#include "xen-ops.h"
@@ -32,6 +33,73 @@ extern void xen_sysenter_target(void);
32extern void xen_syscall_target(void); 33extern void xen_syscall_target(void);
33extern void xen_syscall32_target(void); 34extern void xen_syscall32_target(void);
34 35
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr)
38{
39 struct xen_memory_reservation reservation = {
40 .address_bits = 0,
41 .extent_order = 0,
42 .domid = DOMID_SELF
43 };
44 unsigned long start, end;
45 unsigned long len = 0;
46 unsigned long pfn;
47 int ret;
48
49 start = PFN_UP(start_addr);
50 end = PFN_DOWN(end_addr);
51
52 if (end <= start)
53 return 0;
54
55 printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
56 start, end);
57 for(pfn = start; pfn < end; pfn++) {
58 unsigned long mfn = pfn_to_mfn(pfn);
59
60 /* Make sure pfn exists to start with */
61 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
62 continue;
63
64 set_xen_guest_handle(reservation.extent_start, &mfn);
65 reservation.nr_extents = 1;
66
67 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
68 &reservation);
69 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
70 start, end, ret);
71 if (ret == 1) {
72 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
73 len++;
74 }
75 }
76 printk(KERN_CONT "%ld pages freed\n", len);
77
78 return len;
79}
80
81static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
82 const struct e820map *e820)
83{
84 phys_addr_t max_addr = PFN_PHYS(max_pfn);
85 phys_addr_t last_end = 0;
86 unsigned long released = 0;
87 int i;
88
89 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
90 phys_addr_t end = e820->map[i].addr;
91 end = min(max_addr, end);
92
93 released += xen_release_chunk(last_end, end);
94 last_end = e820->map[i].addr + e820->map[i].size;
95 }
96
97 if (last_end < max_addr)
98 released += xen_release_chunk(last_end, max_addr);
99
100 printk(KERN_INFO "released %ld pages of unused memory\n", released);
101 return released;
102}
35 103
36/** 104/**
37 * machine_specific_memory_setup - Hook for machine specific memory setup. 105 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -67,6 +135,8 @@ char * __init xen_memory_setup(void)
67 135
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 136 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 137
138 xen_return_unused_memory(xen_start_info->nr_pages, &e820);
139
70 return "Xen"; 140 return "Xen";
71} 141}
72 142
@@ -156,6 +226,8 @@ void __init xen_arch_setup(void)
156 struct physdev_set_iopl set_iopl; 226 struct physdev_set_iopl set_iopl;
157 int rc; 227 int rc;
158 228
229 xen_panic_handler_init();
230
159 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 231 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 232 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
161 233
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a29693fd3138..25f232b18a82 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -394,6 +394,8 @@ static void stop_self(void *v)
394 load_cr3(swapper_pg_dir); 394 load_cr3(swapper_pg_dir);
395 /* should set up a minimal gdt */ 395 /* should set up a minimal gdt */
396 396
397 set_cpu_online(cpu, false);
398
397 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); 399 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
398 BUG(); 400 BUG();
399} 401}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c661108034..1d789d56877c 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -26,6 +26,18 @@ void xen_pre_suspend(void)
26 BUG(); 26 BUG();
27} 27}
28 28
29void xen_hvm_post_suspend(int suspend_cancelled)
30{
31 int cpu;
32 xen_hvm_init_shared_info();
33 xen_callback_vector();
34 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
35 for_each_online_cpu(cpu) {
36 xen_setup_runstate_info(cpu);
37 }
38 }
39}
40
29void xen_post_suspend(int suspend_cancelled) 41void xen_post_suspend(int suspend_cancelled)
30{ 42{
31 xen_build_mfn_list_list(); 43 xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b3c6c59ed302..1a5353a753fc 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
20#include <asm/xen/hypercall.h> 20#include <asm/xen/hypercall.h>
21 21
22#include <xen/events.h> 22#include <xen/events.h>
23#include <xen/features.h>
23#include <xen/interface/xen.h> 24#include <xen/interface/xen.h>
24#include <xen/interface/vcpu.h> 25#include <xen/interface/vcpu.h>
25 26
@@ -155,47 +156,8 @@ static void do_stolen_accounting(void)
155 account_idle_ticks(ticks); 156 account_idle_ticks(ticks);
156} 157}
157 158
158/*
159 * Xen sched_clock implementation. Returns the number of unstolen
160 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
161 * states.
162 */
163unsigned long long xen_sched_clock(void)
164{
165 struct vcpu_runstate_info state;
166 cycle_t now;
167 u64 ret;
168 s64 offset;
169
170 /*
171 * Ideally sched_clock should be called on a per-cpu basis
172 * anyway, so preempt should already be disabled, but that's
173 * not current practice at the moment.
174 */
175 preempt_disable();
176
177 now = xen_clocksource_read();
178
179 get_runstate_snapshot(&state);
180
181 WARN_ON(state.state != RUNSTATE_running);
182
183 offset = now - state.state_entry_time;
184 if (offset < 0)
185 offset = 0;
186
187 ret = state.time[RUNSTATE_blocked] +
188 state.time[RUNSTATE_running] +
189 offset;
190
191 preempt_enable();
192
193 return ret;
194}
195
196
197/* Get the TSC speed from Xen */ 159/* Get the TSC speed from Xen */
198unsigned long xen_tsc_khz(void) 160static unsigned long xen_tsc_khz(void)
199{ 161{
200 struct pvclock_vcpu_time_info *info = 162 struct pvclock_vcpu_time_info *info =
201 &HYPERVISOR_shared_info->vcpu_info[0].time; 163 &HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts)
230 put_cpu_var(xen_vcpu); 192 put_cpu_var(xen_vcpu);
231} 193}
232 194
233unsigned long xen_get_wallclock(void) 195static unsigned long xen_get_wallclock(void)
234{ 196{
235 struct timespec ts; 197 struct timespec ts;
236 198
@@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void)
238 return ts.tv_sec; 200 return ts.tv_sec;
239} 201}
240 202
241int xen_set_wallclock(unsigned long now) 203static int xen_set_wallclock(unsigned long now)
242{ 204{
243 /* do nothing for domU */ 205 /* do nothing for domU */
244 return -1; 206 return -1;
@@ -473,7 +435,11 @@ void xen_timer_resume(void)
473 } 435 }
474} 436}
475 437
476__init void xen_time_init(void) 438static const struct pv_time_ops xen_time_ops __initdata = {
439 .sched_clock = xen_clocksource_read,
440};
441
442static __init void xen_time_init(void)
477{ 443{
478 int cpu = smp_processor_id(); 444 int cpu = smp_processor_id();
479 struct timespec tp; 445 struct timespec tp;
@@ -497,3 +463,47 @@ __init void xen_time_init(void)
497 xen_setup_timer(cpu); 463 xen_setup_timer(cpu);
498 xen_setup_cpu_clockevents(); 464 xen_setup_cpu_clockevents();
499} 465}
466
467__init void xen_init_time_ops(void)
468{
469 pv_time_ops = xen_time_ops;
470
471 x86_init.timers.timer_init = xen_time_init;
472 x86_init.timers.setup_percpu_clockev = x86_init_noop;
473 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
474
475 x86_platform.calibrate_tsc = xen_tsc_khz;
476 x86_platform.get_wallclock = xen_get_wallclock;
477 x86_platform.set_wallclock = xen_set_wallclock;
478}
479
480#ifdef CONFIG_XEN_PVHVM
481static void xen_hvm_setup_cpu_clockevents(void)
482{
483 int cpu = smp_processor_id();
484 xen_setup_runstate_info(cpu);
485 xen_setup_timer(cpu);
486 xen_setup_cpu_clockevents();
487}
488
489__init void xen_hvm_init_time_ops(void)
490{
491 /* vector callback is needed otherwise we cannot receive interrupts
492 * on cpu > 0 */
493 if (!xen_have_vector_callback && num_present_cpus() > 1)
494 return;
495 if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
496 printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
497 "disable pv timer\n");
498 return;
499 }
500
501 pv_time_ops = xen_time_ops;
502 x86_init.timers.setup_percpu_clockev = xen_time_init;
503 x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
504
505 x86_platform.calibrate_tsc = xen_tsc_khz;
506 x86_platform.get_wallclock = xen_get_wallclock;
507 x86_platform.set_wallclock = xen_set_wallclock;
508}
509#endif
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a300bce..7c8ab86163e9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -38,6 +38,10 @@ void xen_enable_sysenter(void);
38void xen_enable_syscall(void); 38void xen_enable_syscall(void);
39void xen_vcpu_restore(void); 39void xen_vcpu_restore(void);
40 40
41void xen_callback_vector(void);
42void xen_hvm_init_shared_info(void);
43void __init xen_unplug_emulated_devices(void);
44
41void __init xen_build_dynamic_phys_to_machine(void); 45void __init xen_build_dynamic_phys_to_machine(void);
42 46
43void xen_init_irq_ops(void); 47void xen_init_irq_ops(void);
@@ -46,11 +50,8 @@ void xen_setup_runstate_info(int cpu);
46void xen_teardown_timer(int cpu); 50void xen_teardown_timer(int cpu);
47cycle_t xen_clocksource_read(void); 51cycle_t xen_clocksource_read(void);
48void xen_setup_cpu_clockevents(void); 52void xen_setup_cpu_clockevents(void);
49unsigned long xen_tsc_khz(void); 53void __init xen_init_time_ops(void);
50void __init xen_time_init(void); 54void __init xen_hvm_init_time_ops(void);
51unsigned long xen_get_wallclock(void);
52int xen_set_wallclock(unsigned long time);
53unsigned long long xen_sched_clock(void);
54 55
55irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 56irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
56 57
@@ -101,4 +102,6 @@ void xen_sysret32(void);
101void xen_sysret64(void); 102void xen_sysret64(void);
102void xen_adjust_exception_frame(void); 103void xen_adjust_exception_frame(void);
103 104
105extern int xen_panic_handler_init(void);
106
104#endif /* XEN_OPS_H */ 107#endif /* XEN_OPS_H */