Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 14
-rw-r--r--  arch/x86/boot/Makefile | 8
-rw-r--r--  arch/x86/boot/boot.h | 28
-rw-r--r--  arch/x86/boot/cmdline.c | 6
-rw-r--r--  arch/x86/boot/compressed/Makefile | 4
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 21
-rw-r--r--  arch/x86/boot/compressed/early_serial_console.c | 5
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 13
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 13
-rw-r--r--  arch/x86/boot/compressed/misc.c | 56
-rw-r--r--  arch/x86/boot/compressed/misc.h | 39
-rw-r--r--  arch/x86/boot/compressed/string.c | 2
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 6
-rw-r--r--  arch/x86/boot/ctype.h | 21
-rw-r--r--  arch/x86/boot/early_serial_console.c | 139
-rw-r--r--  arch/x86/boot/main.c | 9
-rw-r--r--  arch/x86/boot/printf.c | 4
-rw-r--r--  arch/x86/boot/string.c | 63
-rw-r--r--  arch/x86/boot/tty.c | 37
-rw-r--r--  arch/x86/configs/i386_defconfig | 1
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 3
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 9
-rw-r--r--  arch/x86/include/asm/acpi.h | 2
-rw-r--r--  arch/x86/include/asm/alternative.h | 7
-rw-r--r--  arch/x86/include/asm/apb_timer.h | 1
-rw-r--r--  arch/x86/include/asm/bootparam.h | 11
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 198
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 83
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 43
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 8
-rw-r--r--  arch/x86/include/asm/highmem.h | 2
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 1
-rw-r--r--  arch/x86/include/asm/i387.h | 26
-rw-r--r--  arch/x86/include/asm/intel_scu_ipc.h | 20
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 3
-rw-r--r--  arch/x86/include/asm/kgdb.h | 20
-rw-r--r--  arch/x86/include/asm/kvm.h | 22
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 30
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 70
-rw-r--r--  arch/x86/include/asm/mce.h | 4
-rw-r--r--  arch/x86/include/asm/mrst.h | 26
-rw-r--r--  arch/x86/include/asm/msr-index.h | 26
-rw-r--r--  arch/x86/include/asm/msr.h | 4
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 31
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 4
-rw-r--r--  arch/x86/include/asm/processor.h | 21
-rw-r--r--  arch/x86/include/asm/required-features.h | 2
-rw-r--r--  arch/x86/include/asm/rwsem.h | 21
-rw-r--r--  arch/x86/include/asm/scatterlist.h | 1
-rw-r--r--  arch/x86/include/asm/setup.h | 2
-rw-r--r--  arch/x86/include/asm/sys_ia32.h | 3
-rw-r--r--  arch/x86/include/asm/system.h | 7
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 5
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 6
-rw-r--r--  arch/x86/include/asm/vmx.h | 5
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 6
-rw-r--r--  arch/x86/include/asm/xen/page.h | 8
-rw-r--r--  arch/x86/include/asm/xen/swiotlb-xen.h | 14
-rw-r--r--  arch/x86/include/asm/xsave.h | 40
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 2
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 8
-rw-r--r--  arch/x86/kernel/apb_timer.c | 37
-rw-r--r--  arch/x86/kernel/aperture_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 2
-rw-r--r--  arch/x86/kernel/apm_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/common.c | 28
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 11
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 26
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 8
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 108
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 35
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 206
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 56
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 63
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 58
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 9
-rw-r--r--  arch/x86/kernel/entry_32.S | 16
-rw-r--r--  arch/x86/kernel/entry_64.S | 9
-rw-r--r--  arch/x86/kernel/head32.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/head_64.S | 5
-rw-r--r--  arch/x86/kernel/hpet.c | 15
-rw-r--r--  arch/x86/kernel/i387.c | 42
-rw-r--r--  arch/x86/kernel/kgdb.c | 189
-rw-r--r--  arch/x86/kernel/mrst.c | 105
-rw-r--r--  arch/x86/kernel/olpc.c | 20
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/pci-dma.c | 7
-rw-r--r--  arch/x86/kernel/process.c | 46
-rw-r--r--  arch/x86/kernel/setup.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 15
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/tsc.c | 5
-rw-r--r--  arch/x86/kernel/verify_cpu_64.S | 3
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 17
-rw-r--r--  arch/x86/kernel/xsave.c | 195
-rw-r--r--  arch/x86/kvm/emulate.c | 749
-rw-r--r--  arch/x86/kvm/i8254.c | 146
-rw-r--r--  arch/x86/kvm/i8254.h | 4
-rw-r--r--  arch/x86/kvm/i8259.c | 48
-rw-r--r--  arch/x86/kvm/irq.c | 2
-rw-r--r--  arch/x86/kvm/irq.h | 4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 8
-rw-r--r--  arch/x86/kvm/lapic.c | 17
-rw-r--r--  arch/x86/kvm/mmu.c | 807
-rw-r--r--  arch/x86/kvm/mmutrace.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 252
-rw-r--r--  arch/x86/kvm/svm.c | 147
-rw-r--r--  arch/x86/kvm/timer.c | 16
-rw-r--r--  arch/x86/kvm/vmx.c | 261
-rw-r--r--  arch/x86/kvm/x86.c | 1176
-rw-r--r--  arch/x86/kvm/x86.h | 7
-rw-r--r--  arch/x86/lib/Makefile | 1
-rw-r--r--  arch/x86/lib/clear_page_64.S | 2
-rw-r--r--  arch/x86/lib/cmpxchg.c (renamed from arch/x86/kernel/cpu/cmpxchg.c) | 18
-rw-r--r--  arch/x86/lib/copy_page_64.S | 2
-rw-r--r--  arch/x86/lib/copy_user_64.S | 2
-rw-r--r--  arch/x86/lib/memcpy_64.S | 2
-rw-r--r--  arch/x86/lib/memset_64.S | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 32
-rw-r--r--  arch/x86/mm/highmem_32.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 2
-rw-r--r--  arch/x86/mm/ioremap.c | 14
-rw-r--r--  arch/x86/mm/kmmio.c | 16
-rw-r--r--  arch/x86/mm/pat.c | 2
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 22
-rw-r--r--  arch/x86/mm/tlb.c | 4
-rw-r--r--  arch/x86/pci/acpi.c | 9
-rw-r--r--  arch/x86/pci/common.c | 20
-rw-r--r--  arch/x86/pci/irq.c | 6
-rw-r--r--  arch/x86/pci/legacy.c | 2
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/power/hibernate_64.c | 2
-rw-r--r--  arch/x86/vdso/Makefile | 3
-rwxr-xr-x  arch/x86/vdso/checkundef.sh | 10
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 2
-rw-r--r--  arch/x86/vdso/vma.c | 3
-rw-r--r--  arch/x86/xen/Kconfig | 5
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 201
-rw-r--r--  arch/x86/xen/mmu.c | 328
-rw-r--r--  arch/x86/xen/mmu.h | 1
-rw-r--r--  arch/x86/xen/pci-swiotlb-xen.c | 58
-rw-r--r--  arch/x86/xen/platform-pci-unplug.c | 137
-rw-r--r--  arch/x86/xen/setup.c | 72
-rw-r--r--  arch/x86/xen/smp.c | 2
-rw-r--r--  arch/x86/xen/suspend.c | 12
-rw-r--r--  arch/x86/xen/time.c | 96
-rw-r--r--  arch/x86/xen/xen-ops.h | 13
169 files changed, 5204 insertions(+), 2458 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6f77afa6bca..a84fc34c8f7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -73,9 +73,6 @@ config ARCH_DEFCONFIG
73 default "arch/x86/configs/i386_defconfig" if X86_32 73 default "arch/x86/configs/i386_defconfig" if X86_32
74 default "arch/x86/configs/x86_64_defconfig" if X86_64 74 default "arch/x86/configs/x86_64_defconfig" if X86_64
75 75
76config GENERIC_TIME
77 def_bool y
78
79config GENERIC_CMOS_UPDATE 76config GENERIC_CMOS_UPDATE
80 def_bool y 77 def_bool y
81 78
@@ -2047,7 +2044,7 @@ config SCx200
2047 2044
2048config SCx200HR_TIMER 2045config SCx200HR_TIMER
2049 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" 2046 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
2050 depends on SCx200 && GENERIC_TIME 2047 depends on SCx200
2051 default y 2048 default y
2052 ---help--- 2049 ---help---
2053 This driver provides a clocksource built upon the on-chip 2050 This driver provides a clocksource built upon the on-chip
@@ -2063,6 +2060,15 @@ config OLPC
2063 Add support for detecting the unique features of the OLPC 2060 Add support for detecting the unique features of the OLPC
2064 XO hardware. 2061 XO hardware.
2065 2062
2063config OLPC_OPENFIRMWARE
2064 bool "Support for OLPC's Open Firmware"
2065 depends on !X86_64 && !X86_PAE
2066 default y if OLPC
2067 help
2068 This option adds support for the implementation of Open Firmware
2069 that is used on the OLPC XO-1 Children's Machine.
2070 If unsure, say N here.
2071
2066endif # X86_32 2072endif # X86_32
2067 2073
2068config K8_NB 2074config K8_NB
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index ec749c2bfdd..f7cb086b4ad 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o 31setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
32setup-y += version.o 32setup-y += video-mode.o version.o
33setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
34 34
35# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 98239d2658f..c7093bd9f2d 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -28,6 +28,7 @@
28#include "bitops.h" 28#include "bitops.h"
29#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h> 30#include <asm/processor-flags.h>
31#include "ctype.h"
31 32
32/* Useful macros */ 33/* Useful macros */
33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 34#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -37,6 +38,8 @@
37extern struct setup_header hdr; 38extern struct setup_header hdr;
38extern struct boot_params boot_params; 39extern struct boot_params boot_params;
39 40
41#define cpu_relax() asm volatile("rep; nop")
42
40/* Basic port I/O */ 43/* Basic port I/O */
41static inline void outb(u8 v, u16 port) 44static inline void outb(u8 v, u16 port)
42{ 45{
@@ -198,11 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
198 return diff; 201 return diff;
199} 202}
200 203
201static inline int isdigit(int ch)
202{
203 return (ch >= '0') && (ch <= '9');
204}
205
206/* Heap -- available for dynamic lists. */ 204/* Heap -- available for dynamic lists. */
207extern char _end[]; 205extern char _end[];
208extern char *HEAP; 206extern char *HEAP;
@@ -287,8 +285,18 @@ struct biosregs {
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); 285void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288 286
289/* cmdline.c */ 287/* cmdline.c */
290int cmdline_find_option(const char *option, char *buffer, int bufsize); 288int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
291int cmdline_find_option_bool(const char *option); 289int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
290static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
291{
292 return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
293}
294
295static inline int cmdline_find_option_bool(const char *option)
296{
297 return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
298}
299
292 300
293/* cpu.c, cpucheck.c */ 301/* cpu.c, cpucheck.c */
294struct cpu_features { 302struct cpu_features {
@@ -300,6 +308,10 @@ extern struct cpu_features cpu;
300int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 308int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
301int validate_cpu(void); 309int validate_cpu(void);
302 310
311/* early_serial_console.c */
312extern int early_serial_base;
313void console_init(void);
314
303/* edd.c */ 315/* edd.c */
304void query_edd(void); 316void query_edd(void);
305 317
@@ -329,8 +341,10 @@ void initregs(struct biosregs *regs);
329 341
330/* string.c */ 342/* string.c */
331int strcmp(const char *str1, const char *str2); 343int strcmp(const char *str1, const char *str2);
344int strncmp(const char *cs, const char *ct, size_t count);
332size_t strnlen(const char *s, size_t maxlen); 345size_t strnlen(const char *s, size_t maxlen);
333unsigned int atou(const char *s); 346unsigned int atou(const char *s);
347unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
334 348
335/* tty.c */ 349/* tty.c */
336void puts(const char *); 350void puts(const char *);
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index a1d35634bce..6b3b6f708c0 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,9 +27,8 @@ static inline int myisspace(u8 c)
27 * Returns the length of the argument (regardless of if it was 27 * Returns the length of the argument (regardless of if it was
28 * truncated to fit in the buffer), or -1 on not found. 28 * truncated to fit in the buffer), or -1 on not found.
29 */ 29 */
30int cmdline_find_option(const char *option, char *buffer, int bufsize) 30int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
31{ 31{
32 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
33 addr_t cptr; 32 addr_t cptr;
34 char c; 33 char c;
35 int len = -1; 34 int len = -1;
@@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
100 * Returns the position of that option (starts counting with 1) 99 * Returns the position of that option (starts counting with 1)
101 * or 0 on not found 100 * or 0 on not found
102 */ 101 */
103int cmdline_find_option_bool(const char *option) 102int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
104{ 103{
105 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
106 addr_t cptr; 104 addr_t cptr;
107 char c; 105 char c;
108 int pos = 0, wstart = 0; 106 int pos = 0, wstart = 0;
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fbb47daf245..0c229551eea 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T
23 23
24hostprogs-y := mkpiggy 24hostprogs-y := mkpiggy
25 25
26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 26$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
27 $(call if_changed,ld) 27 $(call if_changed,ld)
28 @: 28 @:
29 29
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 00000000000..cb62f786990
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,21 @@
1#include "misc.h"
2
3static unsigned long fs;
4static inline void set_fs(unsigned long seg)
5{
6 fs = seg << 4; /* shift it back */
7}
8typedef unsigned long addr_t;
9static inline char rdfs8(addr_t addr)
10{
11 return *((char *)(fs + addr));
12}
13#include "../cmdline.c"
14int cmdline_find_option(const char *option, char *buffer, int bufsize)
15{
16 return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
17}
18int cmdline_find_option_bool(const char *option)
19{
20 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
21}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 00000000000..261e81fb958
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,5 @@
1#include "misc.h"
2
3int early_serial_base;
4
5#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f543b70ffae..67a655a39ce 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -124,6 +124,19 @@ relocated:
124 rep stosl 124 rep stosl
125 125
126/* 126/*
127 * Adjust our own GOT
128 */
129 leal _got(%ebx), %edx
130 leal _egot(%ebx), %ecx
1311:
132 cmpl %ecx, %edx
133 jae 2f
134 addl %ebx, (%edx)
135 addl $4, %edx
136 jmp 1b
1372:
138
139/*
127 * Do the decompression, and jump to the new kernel.. 140 * Do the decompression, and jump to the new kernel..
128 */ 141 */
129 leal z_extract_offset_negative(%ebx), %ebp 142 leal z_extract_offset_negative(%ebx), %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index faff0dc9c06..52f85a196fa 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -280,6 +280,19 @@ relocated:
280 rep stosq 280 rep stosq
281 281
282/* 282/*
283 * Adjust our own GOT
284 */
285 leaq _got(%rip), %rdx
286 leaq _egot(%rip), %rcx
2871:
288 cmpq %rcx, %rdx
289 jae 2f
290 addq %rbx, (%rdx)
291 addq $8, %rdx
292 jmp 1b
2932:
294
295/*
283 * Do the decompression, and jump to the new kernel.. 296 * Do the decompression, and jump to the new kernel..
284 */ 297 */
285 pushq %rsi /* Save the real mode argument */ 298 pushq %rsi /* Save the real mode argument */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 51e240779a4..8f7bef8e9ff 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -9,23 +9,7 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12/* 12#include "misc.h"
13 * we have to be careful, because no indirections are allowed here, and
14 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
15 * we just keep it from happening
16 */
17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32
19#define _ASM_X86_DESC_H 1
20#endif
21
22#include <linux/linkage.h>
23#include <linux/screen_info.h>
24#include <linux/elf.h>
25#include <linux/io.h>
26#include <asm/page.h>
27#include <asm/boot.h>
28#include <asm/bootparam.h>
29 13
30/* WARNING!! 14/* WARNING!!
31 * This code is compiled with -fPIC and it is relocated dynamically 15 * This code is compiled with -fPIC and it is relocated dynamically
@@ -123,15 +107,13 @@ static void error(char *m);
123/* 107/*
124 * This is set up by the setup-routine at boot-time 108 * This is set up by the setup-routine at boot-time
125 */ 109 */
126static struct boot_params *real_mode; /* Pointer to real-mode data */ 110struct boot_params *real_mode; /* Pointer to real-mode data */
127static int quiet; 111static int quiet;
112static int debug;
128 113
129void *memset(void *s, int c, size_t n); 114void *memset(void *s, int c, size_t n);
130void *memcpy(void *dest, const void *src, size_t n); 115void *memcpy(void *dest, const void *src, size_t n);
131 116
132static void __putstr(int, const char *);
133#define putstr(__x) __putstr(0, __x)
134
135#ifdef CONFIG_X86_64 117#ifdef CONFIG_X86_64
136#define memptr long 118#define memptr long
137#else 119#else
@@ -170,7 +152,21 @@ static void scroll(void)
170 vidmem[i] = ' '; 152 vidmem[i] = ' ';
171} 153}
172 154
173static void __putstr(int error, const char *s) 155#define XMTRDY 0x20
156
157#define TXR 0 /* Transmit register (WRITE) */
158#define LSR 5 /* Line Status */
159static void serial_putchar(int ch)
160{
161 unsigned timeout = 0xffff;
162
163 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
164 cpu_relax();
165
166 outb(ch, early_serial_base + TXR);
167}
168
169void __putstr(int error, const char *s)
174{ 170{
175 int x, y, pos; 171 int x, y, pos;
176 char c; 172 char c;
@@ -179,6 +175,14 @@ static void __putstr(int error, const char *s)
179 if (!error) 175 if (!error)
180 return; 176 return;
181#endif 177#endif
178 if (early_serial_base) {
179 const char *str = s;
180 while (*str) {
181 if (*str == '\n')
182 serial_putchar('\r');
183 serial_putchar(*str++);
184 }
185 }
182 186
183 if (real_mode->screen_info.orig_video_mode == 0 && 187 if (real_mode->screen_info.orig_video_mode == 0 &&
184 lines == 0 && cols == 0) 188 lines == 0 && cols == 0)
@@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
305{ 309{
306 real_mode = rmode; 310 real_mode = rmode;
307 311
308 if (real_mode->hdr.loadflags & QUIET_FLAG) 312 if (cmdline_find_option_bool("quiet"))
309 quiet = 1; 313 quiet = 1;
314 if (cmdline_find_option_bool("debug"))
315 debug = 1;
310 316
311 if (real_mode->screen_info.orig_video_mode == 7) { 317 if (real_mode->screen_info.orig_video_mode == 7) {
312 vidmem = (char *) 0xb0000; 318 vidmem = (char *) 0xb0000;
@@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
319 lines = real_mode->screen_info.orig_video_lines; 325 lines = real_mode->screen_info.orig_video_lines;
320 cols = real_mode->screen_info.orig_video_cols; 326 cols = real_mode->screen_info.orig_video_cols;
321 327
328 console_init();
329 if (debug)
330 putstr("early console in decompress_kernel\n");
331
322 free_mem_ptr = heap; /* Heap */ 332 free_mem_ptr = heap; /* Heap */
323 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 333 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
324 334
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 00000000000..3f19c81a620
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,39 @@
1#ifndef BOOT_COMPRESSED_MISC_H
2#define BOOT_COMPRESSED_MISC_H
3
4/*
5 * we have to be careful, because no indirections are allowed here, and
6 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
7 * we just keep it from happening
8 */
9#undef CONFIG_PARAVIRT
10#ifdef CONFIG_X86_32
11#define _ASM_X86_DESC_H 1
12#endif
13
14#include <linux/linkage.h>
15#include <linux/screen_info.h>
16#include <linux/elf.h>
17#include <linux/io.h>
18#include <asm/page.h>
19#include <asm/boot.h>
20#include <asm/bootparam.h>
21
22#define BOOT_BOOT_H
23#include "../ctype.h"
24
25/* misc.c */
26extern struct boot_params *real_mode; /* Pointer to real-mode data */
27void __putstr(int error, const char *s);
28#define putstr(__x) __putstr(0, __x)
29#define puts(__x) __putstr(0, __x)
30
31/* cmdline.c */
32int cmdline_find_option(const char *option, char *buffer, int bufsize);
33int cmdline_find_option_bool(const char *option);
34
35/* early_serial_console.c */
36extern int early_serial_base;
37void console_init(void);
38
39#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 00000000000..19b3e693cd7
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,2 @@
1#include "misc.h"
2#include "../string.c"
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 5ddabceee12..34d047c9828 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -41,6 +41,12 @@ SECTIONS
41 *(.rodata.*) 41 *(.rodata.*)
42 _erodata = . ; 42 _erodata = . ;
43 } 43 }
44 .got : {
45 _got = .;
46 KEEP(*(.got.plt))
47 KEEP(*(.got))
48 _egot = .;
49 }
44 .data : { 50 .data : {
45 _data = . ; 51 _data = . ;
46 *(.data) 52 *(.data)
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 00000000000..25e13403193
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
1#ifndef BOOT_ISDIGIT_H
2
3#define BOOT_ISDIGIT_H
4
5static inline int isdigit(int ch)
6{
7 return (ch >= '0') && (ch <= '9');
8}
9
10static inline int isxdigit(int ch)
11{
12 if (isdigit(ch))
13 return true;
14
15 if ((ch >= 'a') && (ch <= 'f'))
16 return true;
17
18 return (ch >= 'A') && (ch <= 'F');
19}
20
21#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 00000000000..030f4b93e25
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,139 @@
1#include "boot.h"
2
3#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
4
5#define XMTRDY 0x20
6
7#define DLAB 0x80
8
9#define TXR 0 /* Transmit register (WRITE) */
10#define RXR 0 /* Receive register (READ) */
11#define IER 1 /* Interrupt Enable */
12#define IIR 2 /* Interrupt ID */
13#define FCR 2 /* FIFO control */
14#define LCR 3 /* Line control */
15#define MCR 4 /* Modem control */
16#define LSR 5 /* Line Status */
17#define MSR 6 /* Modem Status */
18#define DLL 0 /* Divisor Latch Low */
19#define DLH 1 /* Divisor latch High */
20
21#define DEFAULT_BAUD 9600
22
23static void early_serial_init(int port, int baud)
24{
25 unsigned char c;
26 unsigned divisor;
27
28 outb(0x3, port + LCR); /* 8n1 */
29 outb(0, port + IER); /* no interrupt */
30 outb(0, port + FCR); /* no fifo */
31 outb(0x3, port + MCR); /* DTR + RTS */
32
33 divisor = 115200 / baud;
34 c = inb(port + LCR);
35 outb(c | DLAB, port + LCR);
36 outb(divisor & 0xff, port + DLL);
37 outb((divisor >> 8) & 0xff, port + DLH);
38 outb(c & ~DLAB, port + LCR);
39
40 early_serial_base = port;
41}
42
43static void parse_earlyprintk(void)
44{
45 int baud = DEFAULT_BAUD;
46 char arg[32];
47 int pos = 0;
48 int port = 0;
49
50 if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) {
51 char *e;
52
53 if (!strncmp(arg, "serial", 6)) {
54 port = DEFAULT_SERIAL_PORT;
55 pos += 6;
56 }
57
58 if (arg[pos] == ',')
59 pos++;
60
61 if (!strncmp(arg, "ttyS", 4)) {
62 static const int bases[] = { 0x3f8, 0x2f8 };
63 int idx = 0;
64
65 if (!strncmp(arg + pos, "ttyS", 4))
66 pos += 4;
67
68 if (arg[pos++] == '1')
69 idx = 1;
70
71 port = bases[idx];
72 }
73
74 if (arg[pos] == ',')
75 pos++;
76
77 baud = simple_strtoull(arg + pos, &e, 0);
78 if (baud == 0 || arg + pos == e)
79 baud = DEFAULT_BAUD;
80 }
81
82 if (port)
83 early_serial_init(port, baud);
84}
85
86#define BASE_BAUD (1843200/16)
87static unsigned int probe_baud(int port)
88{
89 unsigned char lcr, dll, dlh;
90 unsigned int quot;
91
92 lcr = inb(port + LCR);
93 outb(lcr | DLAB, port + LCR);
94 dll = inb(port + DLL);
95 dlh = inb(port + DLH);
96 outb(lcr, port + LCR);
97 quot = (dlh << 8) | dll;
98
99 return BASE_BAUD / quot;
100}
101
102static void parse_console_uart8250(void)
103{
104 char optstr[64], *options;
105 int baud = DEFAULT_BAUD;
106 int port = 0;
107
108 /*
109 * console=uart8250,io,0x3f8,115200n8
110 * need to make sure it is last one console !
111 */
112 if (cmdline_find_option("console", optstr, sizeof optstr) <= 0)
113 return;
114
115 options = optstr;
116
117 if (!strncmp(options, "uart8250,io,", 12))
118 port = simple_strtoull(options + 12, &options, 0);
119 else if (!strncmp(options, "uart,io,", 8))
120 port = simple_strtoull(options + 8, &options, 0);
121 else
122 return;
123
124 if (options && (options[0] == ','))
125 baud = simple_strtoull(options + 1, &options, 0);
126 else
127 baud = probe_baud(port);
128
129 if (port)
130 early_serial_init(port, baud);
131}
132
133void console_init(void)
134{
135 parse_earlyprintk();
136
137 if (!early_serial_base)
138 parse_console_uart8250();
139}
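
For reference, the two command-line forms the parser above accepts (the specific port and baud values here are only illustrative):

	earlyprintk=serial,ttyS0,115200
	console=uart8250,io,0x3f8,115200n8

parse_earlyprintk() handles the first form and parse_console_uart8250() the second; if neither selects a port, early_serial_base stays 0 and the early console remains disabled.
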
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 140172b895b..40358c8905b 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -130,6 +130,11 @@ void main(void)
130 /* First, copy the boot header into the "zeropage" */ 130 /* First, copy the boot header into the "zeropage" */
131 copy_boot_params(); 131 copy_boot_params();
132 132
133 /* Initialize the early-boot console */
134 console_init();
135 if (cmdline_find_option_bool("debug"))
136 puts("early console in setup code\n");
137
133 /* End of heap check */ 138 /* End of heap check */
134 init_heap(); 139 init_heap();
135 140
@@ -168,10 +173,6 @@ void main(void)
168 /* Set the video mode */ 173 /* Set the video mode */
169 set_video(); 174 set_video();
170 175
171 /* Parse command line for 'quiet' and pass it to decompressor. */
172 if (cmdline_find_option_bool("quiet"))
173 boot_params.hdr.loadflags |= QUIET_FLAG;
174
175 /* Do the last things and invoke protected mode */ 176 /* Do the last things and invoke protected mode */
176 go_to_protected_mode(); 177 go_to_protected_mode();
177} 178}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 50e47cdbddd..cdac91ca55d 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -34,7 +34,7 @@ static int skip_atoi(const char **s)
34#define SMALL 32 /* Must be 32 == 0x20 */ 34#define SMALL 32 /* Must be 32 == 0x20 */
35#define SPECIAL 64 /* 0x */ 35#define SPECIAL 64 /* 0x */
36 36
37#define do_div(n,base) ({ \ 37#define __do_div(n, base) ({ \
38int __res; \ 38int __res; \
39__res = ((unsigned long) n) % (unsigned) base; \ 39__res = ((unsigned long) n) % (unsigned) base; \
40n = ((unsigned long) n) / (unsigned) base; \ 40n = ((unsigned long) n) / (unsigned) base; \
@@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision,
83 tmp[i++] = '0'; 83 tmp[i++] = '0';
84 else 84 else
85 while (num != 0) 85 while (num != 0)
86 tmp[i++] = (digits[do_div(num, base)] | locase); 86 tmp[i++] = (digits[__do_div(num, base)] | locase);
87 if (i > precision) 87 if (i > precision)
88 precision = i; 88 precision = i;
89 size -= precision; 89 size -= precision;
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index f94b7a0c2ab..3cbc4058dd2 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2)
30 return 0; 30 return 0;
31} 31}
32 32
33int strncmp(const char *cs, const char *ct, size_t count)
34{
35 unsigned char c1, c2;
36
37 while (count) {
38 c1 = *cs++;
39 c2 = *ct++;
40 if (c1 != c2)
41 return c1 < c2 ? -1 : 1;
42 if (!c1)
43 break;
44 count--;
45 }
46 return 0;
47}
48
33size_t strnlen(const char *s, size_t maxlen) 49size_t strnlen(const char *s, size_t maxlen)
34{ 50{
35 const char *es = s; 51 const char *es = s;
@@ -48,3 +64,50 @@ unsigned int atou(const char *s)
48 i = i * 10 + (*s++ - '0'); 64 i = i * 10 + (*s++ - '0');
49 return i; 65 return i;
50} 66}
67
68/* Works only for digits and letters, but small and fast */
69#define TOLOWER(x) ((x) | 0x20)
70
71static unsigned int simple_guess_base(const char *cp)
72{
73 if (cp[0] == '0') {
74 if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
75 return 16;
76 else
77 return 8;
78 } else {
79 return 10;
80 }
81}
82
83/**
84 * simple_strtoull - convert a string to an unsigned long long
85 * @cp: The start of the string
86 * @endp: A pointer to the end of the parsed string will be placed here
87 * @base: The number base to use
88 */
89
90unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
91{
92 unsigned long long result = 0;
93
94 if (!base)
95 base = simple_guess_base(cp);
96
97 if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x')
98 cp += 2;
99
100 while (isxdigit(*cp)) {
101 unsigned int value;
102
103 value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10;
104 if (value >= base)
105 break;
106 result = result * base + value;
107 cp++;
108 }
109 if (endp)
110 *endp = (char *)cp;
111
112 return result;
113}
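
A minimal sketch of how the auto-base detection in the simple_strtoull() added above behaves on the kind of strings the early-serial code passes it; the wrapper function below is hypothetical and not part of the patch:

	/* Hypothetical wrapper, for illustration only. */
	static unsigned long long parse_num(const char *s)
	{
		char *end;
		unsigned long long v = simple_strtoull(s, &end, 0);

		/*
		 * "0x3f8"  -> 0x3f8  (base 16, from the "0x" prefix)
		 * "115200" -> 115200 (base 10)
		 * "0644"   -> 0644   (leading zero selects base 8)
		 */
		return (end == s) ? 0 : v;	/* 0 if nothing was parsed */
	}
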
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 01ec69c901c..def2451f46a 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -10,23 +10,36 @@
10 * ----------------------------------------------------------------------- */ 10 * ----------------------------------------------------------------------- */
11 11
12/* 12/*
13 * Very simple screen I/O 13 * Very simple screen and serial I/O
14 * XXX: Probably should add very simple serial I/O?
15 */ 14 */
16 15
17#include "boot.h" 16#include "boot.h"
18 17
18int early_serial_base;
19
20#define XMTRDY 0x20
21
22#define TXR 0 /* Transmit register (WRITE) */
23#define LSR 5 /* Line Status */
24
19/* 25/*
20 * These functions are in .inittext so they can be used to signal 26 * These functions are in .inittext so they can be used to signal
21 * error during initialization. 27 * error during initialization.
22 */ 28 */
23 29
24void __attribute__((section(".inittext"))) putchar(int ch) 30static void __attribute__((section(".inittext"))) serial_putchar(int ch)
25{ 31{
26 struct biosregs ireg; 32 unsigned timeout = 0xffff;
27 33
28 if (ch == '\n') 34 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
29 putchar('\r'); /* \n -> \r\n */ 35 cpu_relax();
36
37 outb(ch, early_serial_base + TXR);
38}
39
40static void __attribute__((section(".inittext"))) bios_putchar(int ch)
41{
42 struct biosregs ireg;
30 43
31 initregs(&ireg); 44 initregs(&ireg);
32 ireg.bx = 0x0007; 45 ireg.bx = 0x0007;
@@ -36,6 +49,17 @@ void __attribute__((section(".inittext"))) putchar(int ch)
36 intcall(0x10, &ireg, NULL); 49 intcall(0x10, &ireg, NULL);
37} 50}
38 51
52void __attribute__((section(".inittext"))) putchar(int ch)
53{
54 if (ch == '\n')
55 putchar('\r'); /* \n -> \r\n */
56
57 bios_putchar(ch);
58
59 if (early_serial_base != 0)
60 serial_putchar(ch);
61}
62
39void __attribute__((section(".inittext"))) puts(const char *str) 63void __attribute__((section(".inittext"))) puts(const char *str)
40{ 64{
41 while (*str) 65 while (*str)
@@ -112,3 +136,4 @@ int getchar_timeout(void)
112 136
113 return 0; /* Timeout! */ 137 return 0; /* Timeout! */
114} 138}
139
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index d28fad19654..e3a32431ca1 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1471,6 +1471,7 @@ CONFIG_HWMON=y
1471# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1472# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1473# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
1474# CONFIG_SENSORS_PKGTEMP is not set
1474# CONFIG_SENSORS_IT87 is not set 1475# CONFIG_SENSORS_IT87 is not set
1475# CONFIG_SENSORS_LM63 is not set 1476# CONFIG_SENSORS_LM63 is not set
1476# CONFIG_SENSORS_LM75 is not set 1477# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6c86acd847a..4251f837205 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1456,6 +1456,7 @@ CONFIG_HWMON=y
1456# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1457# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1458# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
1459# CONFIG_SENSORS_PKGTEMP is not set
1459# CONFIG_SENSORS_IT87 is not set 1460# CONFIG_SENSORS_IT87 is not set
1460# CONFIG_SENSORS_LM63 is not set 1461# CONFIG_SENSORS_LM63 is not set
1461# CONFIG_SENSORS_LM75 is not set 1462# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa..b86feabed69 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,7 @@ ia32_sys_call_table:
842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
843 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg 844 .quad compat_sys_recvmmsg
845 .quad sys_fanotify_init
846 .quad sys32_fanotify_mark
847 .quad sys_prlimit64 /* 340 */
845ia32_syscall_end: 848ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88..3d093311d5e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
546 return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, 546 return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
547 ((u64)len_hi << 32) | len_lo); 547 ((u64)len_hi << 32) | len_lo);
548} 548}
549
550asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
551 u32 mask_lo, u32 mask_hi,
552 int fd, const char __user *pathname)
553{
554 return sys_fanotify_mark(fanotify_fd, flags,
555 ((u64)mask_hi << 32) | mask_lo,
556 fd, pathname);
557}
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aa2c39d968f..92091de1111 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
134 boot_cpu_data.x86_model <= 0x05 && 134 boot_cpu_data.x86_model <= 0x05 &&
135 boot_cpu_data.x86_mask < 0x0A) 135 boot_cpu_data.x86_mask < 0x0A)
136 return 1; 136 return 1;
137 else if (boot_cpu_has(X86_FEATURE_AMDC1E)) 137 else if (c1e_detected)
138 return 1; 138 return 1;
139 else 139 else
140 return max_cstate; 140 return max_cstate;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 03b6bb5394a..bc6abb7bc7e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -45,10 +45,9 @@
45struct alt_instr { 45struct alt_instr {
46 u8 *instr; /* original instruction */ 46 u8 *instr; /* original instruction */
47 u8 *replacement; 47 u8 *replacement;
48 u8 cpuid; /* cpuid bit set for replacement */ 48 u16 cpuid; /* cpuid bit set for replacement */
49 u8 instrlen; /* length of original instruction */ 49 u8 instrlen; /* length of original instruction */
50 u8 replacementlen; /* length of new instruction, <= instrlen */ 50 u8 replacementlen; /* length of new instruction, <= instrlen */
51 u8 pad1;
52#ifdef CONFIG_X86_64 51#ifdef CONFIG_X86_64
53 u32 pad2; 52 u32 pad2;
54#endif 53#endif
@@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end)
86 _ASM_ALIGN "\n" \ 85 _ASM_ALIGN "\n" \
87 _ASM_PTR "661b\n" /* label */ \ 86 _ASM_PTR "661b\n" /* label */ \
88 _ASM_PTR "663f\n" /* new instruction */ \ 87 _ASM_PTR "663f\n" /* new instruction */ \
89 " .byte " __stringify(feature) "\n" /* feature bit */ \ 88 " .word " __stringify(feature) "\n" /* feature bit */ \
90 " .byte 662b-661b\n" /* sourcelen */ \ 89 " .byte 662b-661b\n" /* sourcelen */ \
91 " .byte 664f-663f\n" /* replacementlen */ \ 90 " .byte 664f-663f\n" /* replacementlen */ \
91 ".previous\n" \
92 ".section .discard,\"aw\",@progbits\n" \
92 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ 93 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
93 ".previous\n" \ 94 ".previous\n" \
94 ".section .altinstr_replacement, \"ax\"\n" \ 95 ".section .altinstr_replacement, \"ax\"\n" \
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index c74a2eebe57..a69b1ac9eaf 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); 55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void); 56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id; 57extern unsigned int boot_cpu_id;
58extern int disable_apbt_percpu;
59 58
60extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); 59extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
61extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); 60extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 6be33d83c71..8e6218550e7 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -70,6 +70,14 @@ struct sys_desc_table {
70 __u8 table[14]; 70 __u8 table[14];
71}; 71};
72 72
73/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
74struct olpc_ofw_header {
75 __u32 ofw_magic; /* OFW signature */
76 __u32 ofw_version;
77 __u32 cif_handler; /* callback into OFW */
78 __u32 irq_desc_table;
79} __attribute__((packed));
80
73struct efi_info { 81struct efi_info {
74 __u32 efi_loader_signature; 82 __u32 efi_loader_signature;
75 __u32 efi_systab; 83 __u32 efi_systab;
@@ -92,7 +100,8 @@ struct boot_params {
92 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 100 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
93 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 101 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
94 struct sys_desc_table sys_desc_table; /* 0x0a0 */ 102 struct sys_desc_table sys_desc_table; /* 0x0a0 */
95 __u8 _pad4[144]; /* 0x0b0 */ 103 struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
104 __u8 _pad4[128]; /* 0x0c0 */
96 struct edid_info edid_info; /* 0x140 */ 105 struct edid_info edid_info; /* 0x140 */
97 struct efi_info efi_info; /* 0x1c0 */ 106 struct efi_info efi_info; /* 0x1c0 */
98 __u32 alt_mem_k; /* 0x1e0 */ 107 __u32 alt_mem_k; /* 0x1e0 */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8859e12dd3c..284a6e8f7ce 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -11,38 +11,42 @@
11extern void __xchg_wrong_size(void); 11extern void __xchg_wrong_size(void);
12 12
13/* 13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
15 * Note 2: xchg has side effect, so that attribute volatile is necessary, 15 * Since this is generally used to protect other memory information, we
16 * but generally the primitive is invalid, *ptr is output argument. --ANK 16 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
17 * information around.
17 */ 18 */
18
19struct __xchg_dummy {
20 unsigned long a[100];
21};
22#define __xg(x) ((struct __xchg_dummy *)(x))
23
24#define __xchg(x, ptr, size) \ 19#define __xchg(x, ptr, size) \
25({ \ 20({ \
26 __typeof(*(ptr)) __x = (x); \ 21 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \ 22 switch (size) { \
28 case 1: \ 23 case 1: \
29 asm volatile("xchgb %b0,%1" \ 24 { \
30 : "=q" (__x) \ 25 volatile u8 *__ptr = (volatile u8 *)(ptr); \
31 : "m" (*__xg(ptr)), "0" (__x) \ 26 asm volatile("xchgb %0,%1" \
27 : "=q" (__x), "+m" (*__ptr) \
28 : "0" (__x) \
32 : "memory"); \ 29 : "memory"); \
33 break; \ 30 break; \
31 } \
34 case 2: \ 32 case 2: \
35 asm volatile("xchgw %w0,%1" \ 33 { \
36 : "=r" (__x) \ 34 volatile u16 *__ptr = (volatile u16 *)(ptr); \
37 : "m" (*__xg(ptr)), "0" (__x) \ 35 asm volatile("xchgw %0,%1" \
36 : "=r" (__x), "+m" (*__ptr) \
37 : "0" (__x) \
38 : "memory"); \ 38 : "memory"); \
39 break; \ 39 break; \
40 } \
40 case 4: \ 41 case 4: \
42 { \
43 volatile u32 *__ptr = (volatile u32 *)(ptr); \
41 asm volatile("xchgl %0,%1" \ 44 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \ 45 : "=r" (__x), "+m" (*__ptr) \
43 : "m" (*__xg(ptr)), "0" (__x) \ 46 : "0" (__x) \
44 : "memory"); \ 47 : "memory"); \
45 break; \ 48 break; \
49 } \
46 default: \ 50 default: \
47 __xchg_wrong_size(); \ 51 __xchg_wrong_size(); \
48 } \ 52 } \
@@ -53,60 +57,33 @@ struct __xchg_dummy {
53 __xchg((v), (ptr), sizeof(*ptr)) 57 __xchg((v), (ptr), sizeof(*ptr))
54 58
55/* 59/*
56 * The semantics of XCHGCMP8B are a bit strange, this is why 60 * CMPXCHG8B only writes to the target if we had the previous
57 * there is a loop and the loading of %%eax and %%edx has to 61 * value in registers, otherwise it acts as a read and gives us the
58 * be inside. This inlines well in most cases, the cached 62 * "new previous" value. That is why there is a loop. Preloading
59 * cost is around ~38 cycles. (in the future we might want 63 * EDX:EAX is a performance optimization: in the common case it means
60 * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that 64 * we need only one locked operation.
61 * might have an implicit FPU-save as a cost, so it's not
62 * clear which path to go.)
63 * 65 *
64 * cmpxchg8b must be used with the lock prefix here to allow 66 * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
65 * the instruction to be executed atomically, see page 3-102 67 * least an FPU save and/or %cr0.ts manipulation.
66 * of the instruction set reference 24319102.pdf. We need 68 *
67 * the reader side to see the coherent 64bit value. 69 * cmpxchg8b must be used with the lock prefix here to allow the
70 * instruction to be executed atomically. We need to have the reader
71 * side to see the coherent 64bit value.
68 */ 72 */
69static inline void __set_64bit(unsigned long long *ptr, 73static inline void set_64bit(volatile u64 *ptr, u64 value)
70 unsigned int low, unsigned int high)
71{ 74{
75 u32 low = value;
76 u32 high = value >> 32;
77 u64 prev = *ptr;
78
72 asm volatile("\n1:\t" 79 asm volatile("\n1:\t"
73 "movl (%0), %%eax\n\t" 80 LOCK_PREFIX "cmpxchg8b %0\n\t"
74 "movl 4(%0), %%edx\n\t"
75 LOCK_PREFIX "cmpxchg8b (%0)\n\t"
76 "jnz 1b" 81 "jnz 1b"
77 : /* no outputs */ 82 : "=m" (*ptr), "+A" (prev)
78 : "D"(ptr), 83 : "b" (low), "c" (high)
79 "b"(low), 84 : "memory");
80 "c"(high)
81 : "ax", "dx", "memory");
82}
83
84static inline void __set_64bit_constant(unsigned long long *ptr,
85 unsigned long long value)
86{
87 __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
88}
89
90#define ll_low(x) *(((unsigned int *)&(x)) + 0)
91#define ll_high(x) *(((unsigned int *)&(x)) + 1)
92
93static inline void __set_64bit_var(unsigned long long *ptr,
94 unsigned long long value)
95{
96 __set_64bit(ptr, ll_low(value), ll_high(value));
97} 85}
98 86
99#define set_64bit(ptr, value) \
100 (__builtin_constant_p((value)) \
101 ? __set_64bit_constant((ptr), (value)) \
102 : __set_64bit_var((ptr), (value)))
103
104#define _set_64bit(ptr, value) \
105 (__builtin_constant_p(value) \
106 ? __set_64bit(ptr, (unsigned int)(value), \
107 (unsigned int)((value) >> 32)) \
108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
109
110extern void __cmpxchg_wrong_size(void); 87extern void __cmpxchg_wrong_size(void);
111 88
112/* 89/*
@@ -121,23 +98,32 @@ extern void __cmpxchg_wrong_size(void);
121 __typeof__(*(ptr)) __new = (new); \ 98 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \ 99 switch (size) { \
123 case 1: \ 100 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \ 101 { \
125 : "=a"(__ret) \ 102 volatile u8 *__ptr = (volatile u8 *)(ptr); \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ 103 asm volatile(lock "cmpxchgb %2,%1" \
104 : "=a" (__ret), "+m" (*__ptr) \
105 : "q" (__new), "0" (__old) \
127 : "memory"); \ 106 : "memory"); \
128 break; \ 107 break; \
108 } \
129 case 2: \ 109 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \ 110 { \
131 : "=a"(__ret) \ 111 volatile u16 *__ptr = (volatile u16 *)(ptr); \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 112 asm volatile(lock "cmpxchgw %2,%1" \
113 : "=a" (__ret), "+m" (*__ptr) \
114 : "r" (__new), "0" (__old) \
133 : "memory"); \ 115 : "memory"); \
134 break; \ 116 break; \
117 } \
135 case 4: \ 118 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \ 119 { \
137 : "=a"(__ret) \ 120 volatile u32 *__ptr = (volatile u32 *)(ptr); \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 121 asm volatile(lock "cmpxchgl %2,%1" \
122 : "=a" (__ret), "+m" (*__ptr) \
123 : "r" (__new), "0" (__old) \
139 : "memory"); \ 124 : "memory"); \
140 break; \ 125 break; \
126 } \
141 default: \ 127 default: \
142 __cmpxchg_wrong_size(); \ 128 __cmpxchg_wrong_size(); \
143 } \ 129 } \
@@ -175,32 +161,28 @@ extern void __cmpxchg_wrong_size(void);
175 (unsigned long long)(n))) 161 (unsigned long long)(n)))
176#endif 162#endif
177 163
178static inline unsigned long long __cmpxchg64(volatile void *ptr, 164static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
179 unsigned long long old,
180 unsigned long long new)
181{ 165{
182 unsigned long long prev; 166 u64 prev;
183 asm volatile(LOCK_PREFIX "cmpxchg8b %3" 167 asm volatile(LOCK_PREFIX "cmpxchg8b %1"
184 : "=A"(prev) 168 : "=A" (prev),
185 : "b"((unsigned long)new), 169 "+m" (*ptr)
186 "c"((unsigned long)(new >> 32)), 170 : "b" ((u32)new),
187 "m"(*__xg(ptr)), 171 "c" ((u32)(new >> 32)),
188 "0"(old) 172 "0" (old)
189 : "memory"); 173 : "memory");
190 return prev; 174 return prev;
191} 175}
192 176
193static inline unsigned long long __cmpxchg64_local(volatile void *ptr, 177static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
194 unsigned long long old,
195 unsigned long long new)
196{ 178{
197 unsigned long long prev; 179 u64 prev;
198 asm volatile("cmpxchg8b %3" 180 asm volatile("cmpxchg8b %1"
199 : "=A"(prev) 181 : "=A" (prev),
200 : "b"((unsigned long)new), 182 "+m" (*ptr)
201 "c"((unsigned long)(new >> 32)), 183 : "b" ((u32)new),
202 "m"(*__xg(ptr)), 184 "c" ((u32)(new >> 32)),
203 "0"(old) 185 "0" (old)
204 : "memory"); 186 : "memory");
205 return prev; 187 return prev;
206} 188}
@@ -264,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
264 * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 246 * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
265 */ 247 */
266 248
267extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
268
269#define cmpxchg64(ptr, o, n) \ 249#define cmpxchg64(ptr, o, n) \
270({ \ 250({ \
271 __typeof__(*(ptr)) __ret; \ 251 __typeof__(*(ptr)) __ret; \
@@ -283,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
283 __ret; }) 263 __ret; })
284 264
285 265
286 266#define cmpxchg64_local(ptr, o, n) \
287#define cmpxchg64_local(ptr, o, n) \ 267({ \
288({ \ 268 __typeof__(*(ptr)) __ret; \
289 __typeof__(*(ptr)) __ret; \ 269 __typeof__(*(ptr)) __old = (o); \
290 if (likely(boot_cpu_data.x86 > 4)) \ 270 __typeof__(*(ptr)) __new = (n); \
291 __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ 271 alternative_io("call cmpxchg8b_emu", \
292 (unsigned long long)(o), \ 272 "cmpxchg8b (%%esi)" , \
293 (unsigned long long)(n)); \ 273 X86_FEATURE_CX8, \
294 else \ 274 "=A" (__ret), \
295 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ 275 "S" ((ptr)), "0" (__old), \
296 (unsigned long long)(o), \ 276 "b" ((unsigned int)__new), \
297 (unsigned long long)(n)); \ 277 "c" ((unsigned int)(__new>>32)) \
298 __ret; \ 278 : "memory"); \
299}) 279 __ret; })
300 280
301#endif 281#endif
302 282
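
A sketch of the kind of caller the rewritten 32-bit primitives serve; this is not from the patch, only an illustration built on the cmpxchg64() macro defined above:

	/* Illustration only: lock-free 64-bit accumulate on 32-bit x86. */
	static void counter_add(u64 *ctr, u64 delta)
	{
		u64 old, new;

		do {
			old = *ctr;		/* may race; cmpxchg64 revalidates */
			new = old + delta;
		} while (cmpxchg64(ctr, old, new) != old);
	}

The cmpxchg64_local() macro above shows how the same operation is patched at runtime: alternative_io() emits a call to cmpxchg8b_emu by default and replaces it with a real cmpxchg8b on CPUs that advertise X86_FEATURE_CX8.
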
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 485ae415fae..423ae58aa02 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,51 +3,60 @@
3 3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5 5
6#define __xg(x) ((volatile long *)(x)) 6static inline void set_64bit(volatile u64 *ptr, u64 val)
7
8static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
9{ 7{
10 *ptr = val; 8 *ptr = val;
11} 9}
12 10
13#define _set_64bit set_64bit
14
15extern void __xchg_wrong_size(void); 11extern void __xchg_wrong_size(void);
16extern void __cmpxchg_wrong_size(void); 12extern void __cmpxchg_wrong_size(void);
17 13
18/* 14/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 15 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
20 * Note 2: xchg has side effect, so that attribute volatile is necessary, 16 * Since this is generally used to protect other memory information, we
21 * but generally the primitive is invalid, *ptr is output argument. --ANK 17 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
18 * information around.
22 */ 19 */
23#define __xchg(x, ptr, size) \ 20#define __xchg(x, ptr, size) \
24({ \ 21({ \
25 __typeof(*(ptr)) __x = (x); \ 22 __typeof(*(ptr)) __x = (x); \
26 switch (size) { \ 23 switch (size) { \
27 case 1: \ 24 case 1: \
28 asm volatile("xchgb %b0,%1" \ 25 { \
29 : "=q" (__x) \ 26 volatile u8 *__ptr = (volatile u8 *)(ptr); \
30 : "m" (*__xg(ptr)), "0" (__x) \ 27 asm volatile("xchgb %0,%1" \
28 : "=q" (__x), "+m" (*__ptr) \
29 : "0" (__x) \
31 : "memory"); \ 30 : "memory"); \
32 break; \ 31 break; \
32 } \
33 case 2: \ 33 case 2: \
34 asm volatile("xchgw %w0,%1" \ 34 { \
35 : "=r" (__x) \ 35 volatile u16 *__ptr = (volatile u16 *)(ptr); \
36 : "m" (*__xg(ptr)), "0" (__x) \ 36 asm volatile("xchgw %0,%1" \
37 : "=r" (__x), "+m" (*__ptr) \
38 : "0" (__x) \
37 : "memory"); \ 39 : "memory"); \
38 break; \ 40 break; \
41 } \
39 case 4: \ 42 case 4: \
40 asm volatile("xchgl %k0,%1" \ 43 { \
41 : "=r" (__x) \ 44 volatile u32 *__ptr = (volatile u32 *)(ptr); \
42 : "m" (*__xg(ptr)), "0" (__x) \ 45 asm volatile("xchgl %0,%1" \
46 : "=r" (__x), "+m" (*__ptr) \
47 : "0" (__x) \
43 : "memory"); \ 48 : "memory"); \
44 break; \ 49 break; \
50 } \
45 case 8: \ 51 case 8: \
52 { \
53 volatile u64 *__ptr = (volatile u64 *)(ptr); \
46 asm volatile("xchgq %0,%1" \ 54 asm volatile("xchgq %0,%1" \
47 : "=r" (__x) \ 55 : "=r" (__x), "+m" (*__ptr) \
48 : "m" (*__xg(ptr)), "0" (__x) \ 56 : "0" (__x) \
49 : "memory"); \ 57 : "memory"); \
50 break; \ 58 break; \
59 } \
51 default: \ 60 default: \
52 __xchg_wrong_size(); \ 61 __xchg_wrong_size(); \
53 } \ 62 } \
@@ -71,29 +80,41 @@ extern void __cmpxchg_wrong_size(void);
71 __typeof__(*(ptr)) __new = (new); \ 80 __typeof__(*(ptr)) __new = (new); \
72 switch (size) { \ 81 switch (size) { \
73 case 1: \ 82 case 1: \
74 asm volatile(lock "cmpxchgb %b1,%2" \ 83 { \
75 : "=a"(__ret) \ 84 volatile u8 *__ptr = (volatile u8 *)(ptr); \
76 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ 85 asm volatile(lock "cmpxchgb %2,%1" \
86 : "=a" (__ret), "+m" (*__ptr) \
87 : "q" (__new), "0" (__old) \
77 : "memory"); \ 88 : "memory"); \
78 break; \ 89 break; \
90 } \
79 case 2: \ 91 case 2: \
80 asm volatile(lock "cmpxchgw %w1,%2" \ 92 { \
81 : "=a"(__ret) \ 93 volatile u16 *__ptr = (volatile u16 *)(ptr); \
82 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 94 asm volatile(lock "cmpxchgw %2,%1" \
95 : "=a" (__ret), "+m" (*__ptr) \
96 : "r" (__new), "0" (__old) \
83 : "memory"); \ 97 : "memory"); \
84 break; \ 98 break; \
99 } \
85 case 4: \ 100 case 4: \
86 asm volatile(lock "cmpxchgl %k1,%2" \ 101 { \
87 : "=a"(__ret) \ 102 volatile u32 *__ptr = (volatile u32 *)(ptr); \
88 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 103 asm volatile(lock "cmpxchgl %2,%1" \
104 : "=a" (__ret), "+m" (*__ptr) \
105 : "r" (__new), "0" (__old) \
89 : "memory"); \ 106 : "memory"); \
90 break; \ 107 break; \
108 } \
91 case 8: \ 109 case 8: \
92 asm volatile(lock "cmpxchgq %1,%2" \ 110 { \
93 : "=a"(__ret) \ 111 volatile u64 *__ptr = (volatile u64 *)(ptr); \
94 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ 112 asm volatile(lock "cmpxchgq %2,%1" \
113 : "=a" (__ret), "+m" (*__ptr) \
114 : "r" (__new), "0" (__old) \
95 : "memory"); \ 115 : "memory"); \
96 break; \ 116 break; \
117 } \
97 default: \ 118 default: \
98 __cmpxchg_wrong_size(); \ 119 __cmpxchg_wrong_size(); \
99 } \ 120 } \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 46814591438..781a50b29a4 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,7 +6,7 @@
6 6
7#include <asm/required-features.h> 7#include <asm/required-features.h>
8 8
9#define NCAPINTS 9 /* N 32-bit words worth of info */ 9#define NCAPINTS 10 /* N 32-bit words worth of info */
10 10
11/* 11/*
12 * Note: If the comment begins with a quoted string, that string is used 12 * Note: If the comment begins with a quoted string, that string is used
@@ -89,7 +89,7 @@
89#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ 89#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
90#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ 90#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ 91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
92#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ 92 /* 21 available, was AMD_C1E */
93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ 93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
@@ -124,6 +124,8 @@
124#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 124#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
128#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
127#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ 129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
128 130
129/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -157,22 +159,29 @@
157 159
158/* 160/*
159 * Auxiliary flags: Linux defined - For features scattered in various 161 * Auxiliary flags: Linux defined - For features scattered in various
160 * CPUID levels like 0x6, 0xA etc 162 * CPUID levels like 0x6, 0xA etc, word 7
161 */ 163 */
162#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ 164#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
163#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ 165#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
164#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ 166#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
167#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
168#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
169#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
170#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
165 171
166/* Virtualization flags: Linux defined */ 172/* Virtualization flags: Linux defined, word 8 */
167#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ 173#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
168#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ 174#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
169#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 175#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
170#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 176#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
171#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 177#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
172#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ 178#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
173#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ 179#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
174#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ 180#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
175#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ 181#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
182
183/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
184#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
176 185
177#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 186#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
178 187
@@ -194,7 +203,9 @@ extern const char * const x86_power_flags[32];
194 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ 203 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
195 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ 204 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
196 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 205 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
197 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ 206 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
207 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
208 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
198 ? 1 : \ 209 ? 1 : \
199 test_cpu_cap(c, bit)) 210 test_cpu_cap(c, bit))
200 211
@@ -291,7 +302,7 @@ extern const char * const x86_power_flags[32];
291 * patch the target code for additional performance. 302 * patch the target code for additional performance.
292 * 303 *
293 */ 304 */
294static __always_inline __pure bool __static_cpu_has(u8 bit) 305static __always_inline __pure bool __static_cpu_has(u16 bit)
295{ 306{
296#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) 307#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
297 asm goto("1: jmp %l[t_no]\n" 308 asm goto("1: jmp %l[t_no]\n"
@@ -300,11 +311,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
300 _ASM_ALIGN "\n" 311 _ASM_ALIGN "\n"
301 _ASM_PTR "1b\n" 312 _ASM_PTR "1b\n"
302 _ASM_PTR "0\n" /* no replacement */ 313 _ASM_PTR "0\n" /* no replacement */
303 " .byte %P0\n" /* feature bit */ 314 " .word %P0\n" /* feature bit */
304 " .byte 2b - 1b\n" /* source len */ 315 " .byte 2b - 1b\n" /* source len */
305 " .byte 0\n" /* replacement len */ 316 " .byte 0\n" /* replacement len */
306 " .byte 0xff + 0 - (2b-1b)\n" /* padding */
307 ".previous\n" 317 ".previous\n"
318 /* skipping size check since replacement size = 0 */
308 : : "i" (bit) : : t_no); 319 : : "i" (bit) : : t_no);
309 return true; 320 return true;
310 t_no: 321 t_no:
@@ -318,10 +329,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
318 _ASM_ALIGN "\n" 329 _ASM_ALIGN "\n"
319 _ASM_PTR "1b\n" 330 _ASM_PTR "1b\n"
320 _ASM_PTR "3f\n" 331 _ASM_PTR "3f\n"
321 " .byte %P1\n" /* feature bit */ 332 " .word %P1\n" /* feature bit */
322 " .byte 2b - 1b\n" /* source len */ 333 " .byte 2b - 1b\n" /* source len */
323 " .byte 4f - 3f\n" /* replacement len */ 334 " .byte 4f - 3f\n" /* replacement len */
324 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */ 335 ".previous\n"
336 ".section .discard,\"aw\",@progbits\n"
337 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
325 ".previous\n" 338 ".previous\n"
326 ".section .altinstr_replacement,\"ax\"\n" 339 ".section .altinstr_replacement,\"ax\"\n"
327 "3: movb $1,%0\n" 340 "3: movb $1,%0\n"
@@ -337,7 +350,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
337( \ 350( \
338 __builtin_constant_p(boot_cpu_has(bit)) ? \ 351 __builtin_constant_p(boot_cpu_has(bit)) ? \
339 boot_cpu_has(bit) : \ 352 boot_cpu_has(bit) : \
340 (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \ 353 __builtin_constant_p(bit) ? \
341 __static_cpu_has(bit) : \ 354 __static_cpu_has(bit) : \
342 boot_cpu_has(bit) \ 355 boot_cpu_has(bit) \
343) 356)
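With NCAPINTS bumped to 10 and word 9 added for the CPUID.7 leaf, a flat feature number such as X86_FEATURE_FSGSBASE (9*32+0 = 288) no longer fits in a byte, which is why the alternatives record switches from `.byte` to `.word` and __static_cpu_has() now takes a u16. A small user-space sketch of the word*32+bit encoding; the capability array below is a stand-in, not the kernel's boot_cpu_data.

#include <stdint.h>
#include <stdio.h>

#define NCAPINTS		10
#define FEATURE(word, bit)	((word) * 32 + (bit))
#define X86_FEATURE_FSGSBASE	FEATURE(9, 0)	/* 288, no longer fits a u8 */

static uint32_t caps[NCAPINTS];	/* stand-in for the x86_capability array */

static int cpu_has(uint16_t bit)
{
	return (caps[bit >> 5] >> (bit & 31)) & 1;
}

int main(void)
{
	caps[9] |= 1u << 0;	/* pretend CPUID.7:EBX reported FSGSBASE */
	printf("FSGSBASE bit=%d set=%d\n",
	       X86_FEATURE_FSGSBASE, cpu_has(X86_FEATURE_FSGSBASE));
	return 0;
}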
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ac91eed2106..d4c419f883a 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
54 54
55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
56#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) 56#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
57#define dma_is_consistent(d, h) (1)
58 57
59extern int dma_supported(struct device *hwdev, u64 mask); 58extern int dma_supported(struct device *hwdev, u64 mask);
60extern int dma_set_mask(struct device *dev, u64 mask); 59extern int dma_set_mask(struct device *dev, u64 mask);
@@ -87,13 +86,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size,
87 flush_write_buffers(); 86 flush_write_buffers();
88} 87}
89 88
90static inline int dma_get_cache_alignment(void)
91{
92 /* no easy way to get cache size on all x86, so return the
93 * maximum possible, to be safe */
94 return boot_cpu_data.x86_clflush_size;
95}
96
97static inline unsigned long dma_alloc_coherent_mask(struct device *dev, 89static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
98 gfp_t gfp) 90 gfp_t gfp)
99{ 91{
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index a726650fc80..8caac76ac32 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -61,7 +61,7 @@ void *kmap(struct page *page);
61void kunmap(struct page *page); 61void kunmap(struct page *page);
62void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); 62void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic(struct page *page, enum km_type type);
64void kunmap_atomic(void *kvaddr, enum km_type type); 64void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 70abda7058c..ff2546ce717 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper;
45/* Recognized hypervisors */ 45/* Recognized hypervisors */
46extern const struct hypervisor_x86 x86_hyper_vmware; 46extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 47extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48extern const struct hypervisor_x86 x86_hyper_xen_hvm;
48 49
49#endif 50#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b90..a73a8d5a5e6 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void);
31extern int init_fpu(struct task_struct *child); 31extern int init_fpu(struct task_struct *child);
32extern asmlinkage void math_state_restore(void); 32extern asmlinkage void math_state_restore(void);
33extern void __math_state_restore(void); 33extern void __math_state_restore(void);
34extern void init_thread_xstate(void);
35extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 34extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
36 35
37extern user_regset_active_fn fpregs_active, xfpregs_active; 36extern user_regset_active_fn fpregs_active, xfpregs_active;
@@ -58,11 +57,25 @@ extern int restore_i387_xstate_ia32(void __user *buf);
58 57
59#define X87_FSW_ES (1 << 7) /* Exception Summary */ 58#define X87_FSW_ES (1 << 7) /* Exception Summary */
60 59
60static __always_inline __pure bool use_xsaveopt(void)
61{
62 return static_cpu_has(X86_FEATURE_XSAVEOPT);
63}
64
61static __always_inline __pure bool use_xsave(void) 65static __always_inline __pure bool use_xsave(void)
62{ 66{
63 return static_cpu_has(X86_FEATURE_XSAVE); 67 return static_cpu_has(X86_FEATURE_XSAVE);
64} 68}
65 69
70extern void __sanitize_i387_state(struct task_struct *);
71
72static inline void sanitize_i387_state(struct task_struct *tsk)
73{
74 if (!use_xsaveopt())
75 return;
76 __sanitize_i387_state(tsk);
77}
78
66#ifdef CONFIG_X86_64 79#ifdef CONFIG_X86_64
67 80
68/* Ignore delayed exceptions from user space */ 81/* Ignore delayed exceptions from user space */
@@ -127,6 +140,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
127{ 140{
128 int err; 141 int err;
129 142
143 /*
144 * Clear the bytes not touched by the fxsave and reserved
145 * for the SW usage.
146 */
147 err = __clear_user(&fx->sw_reserved,
148 sizeof(struct _fpx_sw_bytes));
149 if (unlikely(err))
150 return -EFAULT;
151
130 asm volatile("1: rex64/fxsave (%[fx])\n\t" 152 asm volatile("1: rex64/fxsave (%[fx])\n\t"
131 "2:\n" 153 "2:\n"
132 ".section .fixup,\"ax\"\n" 154 ".section .fixup,\"ax\"\n"
@@ -482,6 +504,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
482 memcpy(dst->state, src->state, xstate_size); 504 memcpy(dst->state, src->state, xstate_size);
483} 505}
484 506
507extern void fpu_finit(struct fpu *fpu);
508
485#endif /* __ASSEMBLY__ */ 509#endif /* __ASSEMBLY__ */
486 510
487#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 511#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
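The fxsave_user() change clears the software-reserved bytes up front because FXSAVE itself never writes them, so whatever happened to be on the user stack would otherwise leak into the saved image. The same idea in plain user-space C; the struct layout is a simplified stand-in for i387_fxsave_struct and memset stands in for __clear_user.

#include <string.h>
#include <stdint.h>
#include <stdio.h>

struct fxsave_area {
	uint8_t hw_state[464];		/* written by the hardware save */
	uint8_t sw_reserved[48];	/* NOT written by FXSAVE */
};

static void save_fpu(struct fxsave_area *fx)
{
	/* Zero the software area first so the saved image is deterministic. */
	memset(fx->sw_reserved, 0, sizeof(fx->sw_reserved));
	/* ... the hardware save of fx->hw_state would go here ... */
}

int main(void)
{
	struct fxsave_area fx = { .sw_reserved = { 0xde, 0xad } };

	save_fpu(&fx);
	printf("sw_reserved[0] = %d\n", (int)fx.sw_reserved[0]);	/* 0 */
	return 0;
}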
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4470c9ad4a3..29f66793cc5 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -1,6 +1,12 @@
1#ifndef _ASM_X86_INTEL_SCU_IPC_H_ 1#ifndef _ASM_X86_INTEL_SCU_IPC_H_
2#define _ASM_X86_INTEL_SCU_IPC_H_ 2#define _ASM_X86_INTEL_SCU_IPC_H_
3 3
4#define IPCMSG_VRTC 0xFA /* Set vRTC device */
5
6/* Command id associated with message IPCMSG_VRTC */
7#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
8#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
9
4/* Read single register */ 10/* Read single register */
5int intel_scu_ipc_ioread8(u16 addr, u8 *data); 11int intel_scu_ipc_ioread8(u16 addr, u8 *data);
6 12
@@ -28,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
28/* Update single register based on the mask */ 34/* Update single register based on the mask */
29int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); 35int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
30 36
31/*
32 * Indirect register read
33 * Can be used when SCCB(System Controller Configuration Block) register
34 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
35 */
36int intel_scu_ipc_register_read(u32 addr, u32 *data);
37
38/*
39 * Indirect register write
40 * Can be used when SCCB(System Controller Configuration Block) register
41 * HRIM(Honor Restricted IPC Messages) is set (bit 23)
42 */
43int intel_scu_ipc_register_write(u32 addr, u32 data);
44
45/* Issue commands to the SCU with or without data */ 37/* Issue commands to the SCU with or without data */
46int intel_scu_ipc_simple_command(int cmd, int sub); 38int intel_scu_ipc_simple_command(int cmd, int sub);
47int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, 39int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8767d99c4f6..e2ca3009255 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,9 @@
125 */ 125 */
126#define MCE_SELF_VECTOR 0xeb 126#define MCE_SELF_VECTOR 0xeb
127 127
128/* Xen vector callback to receive events in a HVM domain */
129#define XEN_HVM_EVTCHN_CALLBACK 0xe9
130
128#define NR_VECTORS 256 131#define NR_VECTORS 256
129 132
130#define FPU_IRQ 13 133#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 006da3687cd..396f5b5fc4d 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -39,9 +39,11 @@ enum regnames {
39 GDB_FS, /* 14 */ 39 GDB_FS, /* 14 */
40 GDB_GS, /* 15 */ 40 GDB_GS, /* 15 */
41}; 41};
42#define GDB_ORIG_AX 41
43#define DBG_MAX_REG_NUM 16
42#define NUMREGBYTES ((GDB_GS+1)*4) 44#define NUMREGBYTES ((GDB_GS+1)*4)
43#else /* ! CONFIG_X86_32 */ 45#else /* ! CONFIG_X86_32 */
44enum regnames64 { 46enum regnames {
45 GDB_AX, /* 0 */ 47 GDB_AX, /* 0 */
46 GDB_BX, /* 1 */ 48 GDB_BX, /* 1 */
47 GDB_CX, /* 2 */ 49 GDB_CX, /* 2 */
@@ -59,15 +61,15 @@ enum regnames64 {
59 GDB_R14, /* 14 */ 61 GDB_R14, /* 14 */
60 GDB_R15, /* 15 */ 62 GDB_R15, /* 15 */
61 GDB_PC, /* 16 */ 63 GDB_PC, /* 16 */
64 GDB_PS, /* 17 */
65 GDB_CS, /* 18 */
66 GDB_SS, /* 19 */
62}; 67};
63 68#define GDB_ORIG_AX 57
64enum regnames32 { 69#define DBG_MAX_REG_NUM 20
65 GDB_PS = 34, 70/* 17 64 bit regs and 3 32 bit regs */
66 GDB_CS, 71#define NUMREGBYTES ((17 * 8) + (3 * 4))
67 GDB_SS, 72#endif /* ! CONFIG_X86_32 */
68};
69#define NUMREGBYTES ((GDB_SS+1)*4)
70#endif /* CONFIG_X86_32 */
71 73
72static inline void arch_kgdb_breakpoint(void) 74static inline void arch_kgdb_breakpoint(void)
73{ 75{
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0..4d8dcbdfc12 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS 24#define __KVM_HAVE_DEBUGREGS
25#define __KVM_HAVE_XSAVE
26#define __KVM_HAVE_XCRS
25 27
26/* Architectural interrupt line count. */ 28/* Architectural interrupt line count. */
27#define KVM_NR_INTERRUPTS 256 29#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
299 __u64 reserved[9]; 301 __u64 reserved[9];
300}; 302};
301 303
304/* for KVM_CAP_XSAVE */
305struct kvm_xsave {
306 __u32 region[1024];
307};
308
309#define KVM_MAX_XCRS 16
310
311struct kvm_xcr {
312 __u32 xcr;
313 __u32 reserved;
314 __u64 value;
315};
316
317struct kvm_xcrs {
318 __u32 nr_xcrs;
319 __u32 flags;
320 struct kvm_xcr xcrs[KVM_MAX_XCRS];
321 __u64 padding[16];
322};
323
302#endif /* _ASM_X86_KVM_H */ 324#endif /* _ASM_X86_KVM_H */
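The new kvm_xsave and kvm_xcrs structures define the user-space ABI for transferring extended state and extended control registers. A sketch of how a VMM front end might fill the XCR description before handing it to the kernel; the struct shapes are copied from the hunk, while the XCR0 value is only illustrative and the ioctl plumbing is left out.

#include <stdint.h>
#include <stdio.h>

struct kvm_xcr {
	uint32_t xcr;
	uint32_t reserved;
	uint64_t value;
};

#define KVM_MAX_XCRS 16

struct kvm_xcrs {
	uint32_t nr_xcrs;
	uint32_t flags;
	struct kvm_xcr xcrs[KVM_MAX_XCRS];
	uint64_t padding[16];
};

int main(void)
{
	struct kvm_xcrs x = { .nr_xcrs = 1 };

	x.xcrs[0].xcr = 0;	/* XCR0 */
	x.xcrs[0].value = 0x7;	/* x87 | SSE | AVX state, for illustration */
	printf("xcr%u = %#llx\n", x.xcrs[0].xcr,
	       (unsigned long long)x.xcrs[0].value);
	return 0;
}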
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf207..51cfd730ac5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
51#define X86EMUL_UNHANDLEABLE 1 51#define X86EMUL_UNHANDLEABLE 1
52/* Terminate emulation but return success to the caller. */ 52/* Terminate emulation but return success to the caller. */
53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ 53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
54#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ 54#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
55#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ 55#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
56#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
57
56struct x86_emulate_ops { 58struct x86_emulate_ops {
57 /* 59 /*
58 * read_std: Read bytes of standard (non-emulated/special) memory. 60 * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
92 int (*read_emulated)(unsigned long addr, 94 int (*read_emulated)(unsigned long addr,
93 void *val, 95 void *val,
94 unsigned int bytes, 96 unsigned int bytes,
97 unsigned int *error,
95 struct kvm_vcpu *vcpu); 98 struct kvm_vcpu *vcpu);
96 99
97 /* 100 /*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
104 int (*write_emulated)(unsigned long addr, 107 int (*write_emulated)(unsigned long addr,
105 const void *val, 108 const void *val,
106 unsigned int bytes, 109 unsigned int bytes,
110 unsigned int *error,
107 struct kvm_vcpu *vcpu); 111 struct kvm_vcpu *vcpu);
108 112
109 /* 113 /*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
118 const void *old, 122 const void *old,
119 const void *new, 123 const void *new,
120 unsigned int bytes, 124 unsigned int bytes,
125 unsigned int *error,
121 struct kvm_vcpu *vcpu); 126 struct kvm_vcpu *vcpu);
122 127
123 int (*pio_in_emulated)(int size, unsigned short port, void *val, 128 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
132 int seg, struct kvm_vcpu *vcpu); 137 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 138 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu); 144 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 145 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
146 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
147 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
148 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
140}; 149};
141 150
142/* Type, address-of, and value of an instruction's operand. */ 151/* Type, address-of, and value of an instruction's operand. */
143struct operand { 152struct operand {
144 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 153 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
145 unsigned int bytes; 154 unsigned int bytes;
146 unsigned long val, orig_val, *ptr; 155 unsigned long orig_val, *ptr;
156 union {
157 unsigned long val;
158 char valptr[sizeof(unsigned long) + 2];
159 };
147}; 160};
148 161
149struct fetch_cache { 162struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
186 unsigned long modrm_val; 199 unsigned long modrm_val;
187 struct fetch_cache fetch; 200 struct fetch_cache fetch;
188 struct read_cache io_read; 201 struct read_cache io_read;
202 struct read_cache mem_read;
189}; 203};
190 204
191struct x86_emulate_ctxt { 205struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
202 int interruptibility; 216 int interruptibility;
203 217
204 bool restart; /* restart string instruction after writeback */ 218 bool restart; /* restart string instruction after writeback */
219
220 int exception; /* exception that happens during emulation or -1 */
221 u32 error_code; /* error code for exception */
222 bool error_code_valid;
223 unsigned long cr2; /* faulted address in case of #PF */
224
205 /* decode cache */ 225 /* decode cache */
206 struct decode_cache decode; 226 struct decode_cache decode;
207}; 227};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffe..502e53f999c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h>
18 19
19#include <linux/kvm.h> 20#include <linux/kvm.h>
20#include <linux/kvm_para.h> 21#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
39 0xFFFFFF0000000000ULL) 40 0xFFFFFF0000000000ULL)
40 41
41#define INVALID_PAGE (~(hpa_t)0) 42#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE)
44
42#define UNMAPPED_GVA (~(gpa_t)0) 45#define UNMAPPED_GVA (~(gpa_t)0)
43 46
44/* KVM Hugepage definitions for x86 */ 47/* KVM Hugepage definitions for x86 */
45#define KVM_NR_PAGE_SIZES 3 48#define KVM_NR_PAGE_SIZES 3
46#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) 49#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
50#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
47#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 51#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
48#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 52#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
49#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 53#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
69 73
70#define IOPL_SHIFT 12 74#define IOPL_SHIFT 12
71 75
72#define KVM_ALIAS_SLOTS 4
73
74#define KVM_PERMILLE_MMU_PAGES 20 76#define KVM_PERMILLE_MMU_PAGES 20
75#define KVM_MIN_ALLOC_MMU_PAGES 64 77#define KVM_MIN_ALLOC_MMU_PAGES 64
76#define KVM_MMU_HASH_SHIFT 10 78#define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
241 void (*prefetch_page)(struct kvm_vcpu *vcpu, 243 void (*prefetch_page)(struct kvm_vcpu *vcpu,
242 struct kvm_mmu_page *page); 244 struct kvm_mmu_page *page);
243 int (*sync_page)(struct kvm_vcpu *vcpu, 245 int (*sync_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *sp); 246 struct kvm_mmu_page *sp, bool clear_unsync);
245 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 247 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
246 hpa_t root_hpa; 248 hpa_t root_hpa;
247 int root_level; 249 int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
301 unsigned long mmu_seq; 303 unsigned long mmu_seq;
302 } update_pte; 304 } update_pte;
303 305
304 struct i387_fxsave_struct host_fx_image; 306 struct fpu guest_fpu;
305 struct i387_fxsave_struct guest_fx_image; 307 u64 xcr0;
306 308
307 gva_t mmio_fault_cr2; 309 gva_t mmio_fault_cr2;
308 struct kvm_pio_request pio; 310 struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
360 362
361 /* fields used by HYPER-V emulation */ 363 /* fields used by HYPER-V emulation */
362 u64 hv_vapic; 364 u64 hv_vapic;
363};
364
365struct kvm_mem_alias {
366 gfn_t base_gfn;
367 unsigned long npages;
368 gfn_t target_gfn;
369#define KVM_ALIAS_INVALID 1UL
370 unsigned long flags;
371};
372 365
373#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION 366 cpumask_var_t wbinvd_dirty_mask;
374
375struct kvm_mem_aliases {
376 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
377 int naliases;
378}; 367};
379 368
380struct kvm_arch { 369struct kvm_arch {
381 struct kvm_mem_aliases *aliases;
382
383 unsigned int n_free_mmu_pages; 370 unsigned int n_free_mmu_pages;
384 unsigned int n_requested_mmu_pages; 371 unsigned int n_requested_mmu_pages;
385 unsigned int n_alloc_mmu_pages; 372 unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
533 520
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535 522
523 bool (*has_wbinvd_exit)(void);
524
536 const struct trace_print_flags *exit_reasons_str; 525 const struct trace_print_flags *exit_reasons_str;
537}; 526};
538 527
@@ -576,7 +565,6 @@ enum emulation_result {
576#define EMULTYPE_SKIP (1 << 2) 565#define EMULTYPE_SKIP (1 << 2)
577int emulate_instruction(struct kvm_vcpu *vcpu, 566int emulate_instruction(struct kvm_vcpu *vcpu,
578 unsigned long cr2, u16 error_code, int emulation_type); 567 unsigned long cr2, u16 error_code, int emulation_type);
579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 568void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 569void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
582 570
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
591int kvm_emulate_halt(struct kvm_vcpu *vcpu); 579int kvm_emulate_halt(struct kvm_vcpu *vcpu);
592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
593int emulate_clts(struct kvm_vcpu *vcpu); 581int emulate_clts(struct kvm_vcpu *vcpu);
594int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 582int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
595 unsigned long *dest);
596int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
597 unsigned long value);
598 583
599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 584void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 585int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 587int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code); 588 bool has_error_code, u32 error_code);
604 589
605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 590int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 591int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 592int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 593void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 594int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 595int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 596unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 597void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 598void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
599int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
614 600
615int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 601int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
616int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 602int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
630 616
631void kvm_inject_nmi(struct kvm_vcpu *vcpu); 617void kvm_inject_nmi(struct kvm_vcpu *vcpu);
632 618
633void fx_init(struct kvm_vcpu *vcpu); 619int fx_init(struct kvm_vcpu *vcpu);
634
635int emulator_write_emulated(unsigned long addr,
636 const void *val,
637 unsigned int bytes,
638 struct kvm_vcpu *vcpu);
639 620
640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 621void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 622void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
664int complete_pio(struct kvm_vcpu *vcpu); 645int complete_pio(struct kvm_vcpu *vcpu);
665bool kvm_check_iopl(struct kvm_vcpu *vcpu); 646bool kvm_check_iopl(struct kvm_vcpu *vcpu);
666 647
667struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
668
669static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 648static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
670{ 649{
671 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 650 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
719} 698}
720#endif 699#endif
721 700
722static inline void kvm_fx_save(struct i387_fxsave_struct *image)
723{
724 asm("fxsave (%0)":: "r" (image));
725}
726
727static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
728{
729 asm("fxrstor (%0)":: "r" (image));
730}
731
732static inline void kvm_fx_finit(void)
733{
734 asm("finit");
735}
736
737static inline u32 get_rdx_init_val(void) 701static inline u32 get_rdx_init_val(void)
738{ 702{
739 return 0x600; /* P6 family */ 703 return 0x600; /* P6 family */
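The hugepage macros above are reshuffled so KVM_HPAGE_GFN_SHIFT() exposes just the guest-frame-number part of the shift. The arithmetic is easy to sanity-check in isolation; PAGE_SHIFT of 12 is assumed, as on x86.

#include <stdio.h>

#define PAGE_SHIFT		12
#define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))

int main(void)
{
	/* level 1: 4 KiB, level 2: 2 MiB, level 3: 1 GiB */
	for (int level = 1; level <= 3; level++)
		printf("level %d: gfn_shift=%d size=%lu KiB\n", level,
		       KVM_HPAGE_GFN_SHIFT(level), KVM_HPAGE_SIZE(level) >> 10);
	return 0;
}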
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f32a4301c4d..c62c13cb978 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -38,6 +38,10 @@
38#define MCM_ADDR_MEM 3 /* memory address */ 38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */ 39#define MCM_ADDR_GENERIC 7 /* generic */
40 40
41/* CTL2 register defines */
42#define MCI_CTL2_CMCI_EN (1ULL << 30)
43#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
44
41#define MCJ_CTX_MASK 3 45#define MCJ_CTX_MASK 3
42#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) 46#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
43#define MCJ_CTX_RANDOM 0 /* inject context: random */ 47#define MCJ_CTX_RANDOM 0 /* inject context: random */
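The CMCI bits move from msr-index.h into mce.h under the MCI_CTL2_* names (and the threshold mask shrinks to 15 bits). A minimal sketch of composing an MCi_CTL2 value from these defines; the threshold is arbitrary and the actual MSR write is out of scope here.

#include <stdint.h>
#include <stdio.h>

#define MCI_CTL2_CMCI_EN		(1ULL << 30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffULL

int main(void)
{
	uint64_t threshold = 1;	/* signal on the first corrected error */
	uint64_t ctl2 = MCI_CTL2_CMCI_EN |
			(threshold & MCI_CTL2_CMCI_THRESHOLD_MASK);

	printf("MCi_CTL2 = %#llx\n", (unsigned long long)ctl2);
	return 0;
}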
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 451d30e7f62..16350740edf 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -13,6 +13,32 @@
13extern int pci_mrst_init(void); 13extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table); 14int __init sfi_parse_mrtc(struct sfi_table_header *table);
15 15
16/*
17 * Medfield is the follow-up to Moorestown: it combines the two-chip solution
18 * into one and also adds always-on, constant TSC and LAPIC timers. Medfield
19 * is the platform name, and the chip is called Penwell; we treat
20 * Medfield/Penwell as a variant of Moorestown. Penwell can be
21 * identified via MSRs.
22 */
23enum mrst_cpu_type {
24 MRST_CPU_CHIP_LINCROFT = 1,
25 MRST_CPU_CHIP_PENWELL,
26};
27
28extern enum mrst_cpu_type __mrst_cpu_chip;
 29static inline enum mrst_cpu_type mrst_identify_cpu(void)
30{
31 return __mrst_cpu_chip;
32}
33
34enum mrst_timer_options {
35 MRST_TIMER_DEFAULT,
36 MRST_TIMER_APBT_ONLY,
37 MRST_TIMER_LAPIC_APBT,
38};
39
40extern enum mrst_timer_options mrst_timer_options;
41
16#define SFI_MTMR_MAX_NUM 8 42#define SFI_MTMR_MAX_NUM 8
17#define SFI_MRTC_MAX 8 43#define SFI_MRTC_MAX 8
18 44
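A short sketch of how platform code might branch on the new chip enum. The enum and accessor mirror the header above, but the chip variable is a local stand-in rather than the exported __mrst_cpu_chip.

#include <stdio.h>

enum mrst_cpu_type {
	MRST_CPU_CHIP_LINCROFT = 1,
	MRST_CPU_CHIP_PENWELL,
};

static enum mrst_cpu_type mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; /* stand-in */

static inline enum mrst_cpu_type mrst_identify_cpu(void)
{
	return mrst_cpu_chip;
}

int main(void)
{
	switch (mrst_identify_cpu()) {
	case MRST_CPU_CHIP_LINCROFT:
		printf("Moorestown/Lincroft\n");
		break;
	case MRST_CPU_CHIP_PENWELL:
		printf("Medfield/Penwell\n");
		break;
	default:
		printf("unknown chip\n");
	}
	return 0;
}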
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae431862..986f7790fdb 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
20#define _EFER_LMA 10 /* Long mode active (read-only) */ 20#define _EFER_LMA 10 /* Long mode active (read-only) */
21#define _EFER_NX 11 /* No execute enable */ 21#define _EFER_NX 11 /* No execute enable */
22#define _EFER_SVME 12 /* Enable virtualization */ 22#define _EFER_SVME 12 /* Enable virtualization */
23#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
23#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ 24#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
24 25
25#define EFER_SCE (1<<_EFER_SCE) 26#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
27#define EFER_LMA (1<<_EFER_LMA) 28#define EFER_LMA (1<<_EFER_LMA)
28#define EFER_NX (1<<_EFER_NX) 29#define EFER_NX (1<<_EFER_NX)
29#define EFER_SVME (1<<_EFER_SVME) 30#define EFER_SVME (1<<_EFER_SVME)
31#define EFER_LMSLE (1<<_EFER_LMSLE)
30#define EFER_FFXSR (1<<_EFER_FFXSR) 32#define EFER_FFXSR (1<<_EFER_FFXSR)
31 33
32/* Intel MSRs. Some also available on other CPUs */ 34/* Intel MSRs. Some also available on other CPUs */
@@ -94,9 +96,6 @@
94#define MSR_IA32_MC0_CTL2 0x00000280 96#define MSR_IA32_MC0_CTL2 0x00000280
95#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) 97#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
96 98
97#define CMCI_EN (1ULL << 30)
98#define CMCI_THRESHOLD_MASK 0xffffULL
99
100#define MSR_P6_PERFCTR0 0x000000c1 99#define MSR_P6_PERFCTR0 0x000000c1
101#define MSR_P6_PERFCTR1 0x000000c2 100#define MSR_P6_PERFCTR1 0x000000c2
102#define MSR_P6_EVNTSEL0 0x00000186 101#define MSR_P6_EVNTSEL0 0x00000186
@@ -159,8 +158,6 @@
159#define MSR_K7_FID_VID_STATUS 0xc0010042 158#define MSR_K7_FID_VID_STATUS 0xc0010042
160 159
161/* K6 MSRs */ 160/* K6 MSRs */
162#define MSR_K6_EFER 0xc0000080
163#define MSR_K6_STAR 0xc0000081
164#define MSR_K6_WHCR 0xc0000082 161#define MSR_K6_WHCR 0xc0000082
165#define MSR_K6_UWCCR 0xc0000085 162#define MSR_K6_UWCCR 0xc0000085
166#define MSR_K6_EPMR 0xc0000086 163#define MSR_K6_EPMR 0xc0000086
@@ -224,12 +221,14 @@
224#define MSR_IA32_THERM_CONTROL 0x0000019a 221#define MSR_IA32_THERM_CONTROL 0x0000019a
225#define MSR_IA32_THERM_INTERRUPT 0x0000019b 222#define MSR_IA32_THERM_INTERRUPT 0x0000019b
226 223
227#define THERM_INT_LOW_ENABLE (1 << 0) 224#define THERM_INT_HIGH_ENABLE (1 << 0)
228#define THERM_INT_HIGH_ENABLE (1 << 1) 225#define THERM_INT_LOW_ENABLE (1 << 1)
226#define THERM_INT_PLN_ENABLE (1 << 24)
229 227
230#define MSR_IA32_THERM_STATUS 0x0000019c 228#define MSR_IA32_THERM_STATUS 0x0000019c
231 229
232#define THERM_STATUS_PROCHOT (1 << 0) 230#define THERM_STATUS_PROCHOT (1 << 0)
231#define THERM_STATUS_POWER_LIMIT (1 << 10)
233 232
234#define MSR_THERM2_CTL 0x0000019d 233#define MSR_THERM2_CTL 0x0000019d
235 234
@@ -239,6 +238,19 @@
239 238
240#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 239#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
241 240
241#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
242
243#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
244
245#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
246#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
247
248#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
249
250#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
251#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
252#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
253
242/* MISC_ENABLE bits: architectural */ 254/* MISC_ENABLE bits: architectural */
243#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 255#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
244#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 256#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
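Besides swapping the HIGH/LOW enable bits into their corrected positions, the hunk adds the package-level thermal MSRs and the power-limit-notification bits used by the new PLN/PTS support. Composing an enable mask from the corrected defines, for illustration only; no MSR is touched here.

#include <stdint.h>
#include <stdio.h>

#define THERM_INT_HIGH_ENABLE	(1 << 0)
#define THERM_INT_LOW_ENABLE	(1 << 1)
#define THERM_INT_PLN_ENABLE	(1 << 24)

int main(void)
{
	uint32_t val = THERM_INT_HIGH_ENABLE | THERM_INT_LOW_ENABLE |
		       THERM_INT_PLN_ENABLE;

	printf("IA32_THERM_INTERRUPT enables = %#x\n", val);	/* 0x1000003 */
	return 0;
}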
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c5bc4c2d33f..084ef95274c 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter)
148#define rdmsr(msr, val1, val2) \ 148#define rdmsr(msr, val1, val2) \
149do { \ 149do { \
150 u64 __val = native_read_msr((msr)); \ 150 u64 __val = native_read_msr((msr)); \
151 (val1) = (u32)__val; \ 151 (void)((val1) = (u32)__val); \
152 (val2) = (u32)(__val >> 32); \ 152 (void)((val2) = (u32)(__val >> 32)); \
153} while (0) 153} while (0)
154 154
155static inline void wrmsr(unsigned msr, unsigned low, unsigned high) 155static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 00000000000..08fde475cb3
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_OLPC_OFW_H
2#define _ASM_X86_OLPC_OFW_H
3
4/* index into the page table containing the entry OFW occupies */
5#define OLPC_OFW_PDE_NR 1022
6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8
9#ifdef CONFIG_OLPC_OPENFIRMWARE
10
11/* run an OFW command by calling into the firmware */
12#define olpc_ofw(name, args, res) \
13 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
14
15extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
16 void **res);
17
18/* determine whether OFW is available and lives in the proper memory */
19extern void olpc_ofw_detect(void);
20
21/* install OFW's pde permanently into the kernel's pgtable */
22extern void setup_olpc_ofw_pgd(void);
23
24#else /* !CONFIG_OLPC_OPENFIRMWARE */
25
26static inline void olpc_ofw_detect(void) { }
27static inline void setup_olpc_ofw_pgd(void) { }
28
29#endif /* !CONFIG_OLPC_OPENFIRMWARE */
30
31#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index cd2a31dc5fb..49c7219826f 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -30,6 +30,7 @@
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000 32#define PCI_ROOT_NO_CRS 0x100000
33#define PCI_NOASSIGN_BARS 0x200000
33 34
34extern unsigned int pci_probe; 35extern unsigned int pci_probe;
35extern unsigned long pirq_table_addr; 36extern unsigned long pirq_table_addr;
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 181be528c61..076052cd62b 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
126/* x86-64 always has all page tables mapped. */ 126/* x86-64 always has all page tables mapped. */
127#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) 127#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
128#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) 128#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
129#define pte_unmap(pte) /* NOP */ 129#define pte_unmap(pte) ((void)(pte))/* NOP */
130#define pte_unmap_nested(pte) /* NOP */ 130#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
131 131
132#define update_mmu_cache(vma, address, ptep) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
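pte_unmap() and pte_unmap_nested() become ((void)(pte)) rather than an empty body so that a page-table pointer whose only remaining use is the unmap call still counts as used and does not trip unused-variable warnings. The difference in miniature; the macro names and the int stand-in for pte_t * are local to this sketch.

#include <stdio.h>

#define pte_unmap_empty(pte)	/* old style: expands to nothing */
#define pte_unmap_void(pte)	((void)(pte))

static void old_style(void)
{
	int pte = 1;		/* stands in for a pte_t * */
	pte_unmap_empty(pte);	/* pte never really used: -Wunused-variable */
	(void)pte;		/* silenced here only to keep this demo clean */
}

static void new_style(void)
{
	int pte = 2;
	pte_unmap_void(pte);	/* counts as a use, emits no code, no warning */
}

int main(void)
{
	old_style();
	new_style();
	printf("ok\n");
	return 0;
}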
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8e..325b7bdbeba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -762,6 +762,7 @@ extern void init_c1e_mask(void);
762extern unsigned long boot_option_idle_override; 762extern unsigned long boot_option_idle_override;
763extern unsigned long idle_halt; 763extern unsigned long idle_halt;
764extern unsigned long idle_nomwait; 764extern unsigned long idle_nomwait;
765extern bool c1e_detected;
765 766
766/* 767/*
767 * on systems with caches, caches must be flashed as the absolute 768 * on systems with caches, caches must be flashed as the absolute
@@ -1025,4 +1026,24 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
1025 return ratio; 1026 return ratio;
1026} 1027}
1027 1028
1029/*
1030 * AMD errata checking
1031 */
1032#ifdef CONFIG_CPU_SUP_AMD
1033extern const int amd_erratum_383[];
1034extern const int amd_erratum_400[];
1035extern bool cpu_has_amd_erratum(const int *);
1036
1037#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
1038#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
1039#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
1040 ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
1041#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
1042#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
1043#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
1044
1045#else
1046#define cpu_has_amd_erratum(x) (false)
1047#endif /* CONFIG_CPU_SUP_AMD */
1048
1028#endif /* _ASM_X86_PROCESSOR_H */ 1049#endif /* _ASM_X86_PROCESSOR_H */
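The erratum table entries pack a family plus a (model, stepping) start/end window into one int, and the extractor macros undo the same shifts; note that START and END each combine model and stepping as (model << 4) | stepping. A worked round trip with made-up numbers:

#include <stdio.h>

#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
	((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
#define AMD_MODEL_RANGE_FAMILY(range)	(((range) >> 24) & 0xff)
#define AMD_MODEL_RANGE_START(range)	(((range) >> 12) & 0xfff)
#define AMD_MODEL_RANGE_END(range)	((range) & 0xfff)

int main(void)
{
	int range = AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf);

	/* prints family=0x10 start=0x21 end=0xfff */
	printf("family=%#x start=%#x end=%#x\n",
	       AMD_MODEL_RANGE_FAMILY(range),
	       AMD_MODEL_RANGE_START(range),
	       AMD_MODEL_RANGE_END(range));
	return 0;
}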
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 64cf2d24fad..6c7fc25f2c3 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -84,5 +84,7 @@
84#define REQUIRED_MASK5 0 84#define REQUIRED_MASK5 0
85#define REQUIRED_MASK6 0 85#define REQUIRED_MASK6 0
86#define REQUIRED_MASK7 0 86#define REQUIRED_MASK7 0
87#define REQUIRED_MASK8 0
88#define REQUIRED_MASK9 0
87 89
88#endif /* _ASM_X86_REQUIRED_FEATURES_H */ 90#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 606ede12697..d1e41b0f9b6 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem)
118{ 118{
119 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
120 LOCK_PREFIX _ASM_INC "(%1)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
121 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001 */
122 " jns 1f\n" 122 " jns 1f\n"
123 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
124 "1:\n\t" 124 "1:\n\t"
@@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 157{
158 rwsem_count_t tmp; 158 rwsem_count_t tmp;
159
160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
161 asm volatile("# beginning down_write\n\t" 159 asm volatile("# beginning down_write\n\t"
162 LOCK_PREFIX " xadd %1,(%2)\n\t" 160 LOCK_PREFIX " xadd %1,(%2)\n\t"
163 /* subtract 0x0000ffff, returns the old value */ 161 /* adds 0xffff0001, returns the old value */
164 " test %1,%1\n\t" 162 " test %1,%1\n\t"
165 /* was the count 0 before? */ 163 /* was the count 0 before? */
166 " jz 1f\n" 164 " jz 1f\n"
@@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
168 "1:\n" 166 "1:\n"
169 "# ending down_write" 167 "# ending down_write"
170 : "+m" (sem->count), "=d" (tmp) 168 : "+m" (sem->count), "=d" (tmp)
171 : "a" (sem), "1" (tmp) 169 : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS)
172 : "memory", "cc"); 170 : "memory", "cc");
173} 171}
174 172
@@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
195 */ 193 */
196static inline void __up_read(struct rw_semaphore *sem) 194static inline void __up_read(struct rw_semaphore *sem)
197{ 195{
198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; 196 rwsem_count_t tmp;
199 asm volatile("# beginning __up_read\n\t" 197 asm volatile("# beginning __up_read\n\t"
200 LOCK_PREFIX " xadd %1,(%2)\n\t" 198 LOCK_PREFIX " xadd %1,(%2)\n\t"
201 /* subtracts 1, returns the old value */ 199 /* subtracts 1, returns the old value */
202 " jns 1f\n\t" 200 " jns 1f\n\t"
203 " call call_rwsem_wake\n" 201 " call call_rwsem_wake\n" /* expects old value in %edx */
204 "1:\n" 202 "1:\n"
205 "# ending __up_read\n" 203 "# ending __up_read\n"
206 : "+m" (sem->count), "=d" (tmp) 204 : "+m" (sem->count), "=d" (tmp)
207 : "a" (sem), "1" (tmp) 205 : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS)
208 : "memory", "cc"); 206 : "memory", "cc");
209} 207}
210 208
@@ -216,10 +214,9 @@ static inline void __up_write(struct rw_semaphore *sem)
216 rwsem_count_t tmp; 214 rwsem_count_t tmp;
217 asm volatile("# beginning __up_write\n\t" 215 asm volatile("# beginning __up_write\n\t"
218 LOCK_PREFIX " xadd %1,(%2)\n\t" 216 LOCK_PREFIX " xadd %1,(%2)\n\t"
219 /* tries to transition 217 /* subtracts 0xffff0001, returns the old value */
220 0xffff0001 -> 0x00000000 */ 218 " jns 1f\n\t"
221 " jz 1f\n" 219 " call call_rwsem_wake\n" /* expects old value in %edx */
222 " call call_rwsem_wake\n"
223 "1:\n\t" 220 "1:\n\t"
224 "# ending __up_write\n" 221 "# ending __up_write\n"
225 : "+m" (sem->count), "=d" (tmp) 222 : "+m" (sem->count), "=d" (tmp)
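The rwsem fast paths now feed the bias constants straight into the tied register operand instead of staging them in tmp, and the comments are corrected to describe what xadd actually adds. The underlying pattern as a user-space sketch; this is a bare atomic add-and-return-old on x86-64, not a semaphore, and the function name is invented here.

#include <stdio.h>

static inline long atomic_xadd(long *count, long delta)
{
	long old = delta;

	/* XADD adds the register to memory and returns the old memory
	 * value in that same register, like the tied "1" operand above. */
	asm volatile("lock xaddq %0,%1"
		     : "+r" (old), "+m" (*count)
		     : : "memory", "cc");
	return old;
}

int main(void)
{
	long count = 0;
	long old = atomic_xadd(&count, 1);

	printf("old=%ld new=%ld\n", old, count);	/* old=0 new=1 */
	return 0;
}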
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index fb0b1874396..4240878b9d7 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -3,7 +3,6 @@
3 3
4#include <asm-generic/scatterlist.h> 4#include <asm-generic/scatterlist.h>
5 5
6#define ISA_DMA_THRESHOLD (0x00ffffff)
7#define ARCH_HAS_SG_CHAIN 6#define ARCH_HAS_SG_CHAIN
8 7
9#endif /* _ASM_X86_SCATTERLIST_H */ 8#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 86b1506f417..ef292c792d7 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align);
82 * executable.) 82 * executable.)
83 */ 83 */
84#define RESERVE_BRK(name,sz) \ 84#define RESERVE_BRK(name,sz) \
85 static void __section(.discard) __used \ 85 static void __section(.discard.text) __used \
86 __brk_reservation_fn_##name##__(void) { \ 86 __brk_reservation_fn_##name##__(void) { \
87 asm volatile ( \ 87 asm volatile ( \
88 ".pushsection .brk_reservation,\"aw\",@nobits;" \ 88 ".pushsection .brk_reservation,\"aw\",@nobits;" \
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae..cf4e2e381cb 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
80 80
81/* ia32/ipc32.c */ 81/* ia32/ipc32.c */
82asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); 82asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
83
84asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
85 const char __user *);
83#endif /* _ASM_X86_SYS_IA32_H */ 86#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e7f4d33c55e..33ecc3ea878 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -457,4 +457,11 @@ static __always_inline void rdtsc_barrier(void)
457 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); 457 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
458} 458}
459 459
460/*
461 * We handle most unaligned accesses in hardware. On the other hand
462 * unaligned DMA can be quite expensive on some Nehalem processors.
463 *
464 * Based on this we disable the IP header alignment in network drivers.
465 */
466#define NET_IP_ALIGN 0
460#endif /* _ASM_X86_SYSTEM_H */ 467#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a..b766a5e8ba0 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,13 @@
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337 345#define __NR_recvmmsg 337
346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340
346 349
347#ifdef __KERNEL__ 350#ifdef __KERNEL__
348 351
349#define NR_syscalls 338 352#define NR_syscalls 341
350 353
351#define __ARCH_WANT_IPC_PARSE_VERSION 354#define __ARCH_WANT_IPC_PARSE_VERSION
352#define __ARCH_WANT_OLD_READDIR 355#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81..363e9b8a715 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,12 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299 664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg) 665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
666#define __NR_fanotify_init 300
667__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
668#define __NR_fanotify_mark 301
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64)
666 672
667#ifndef __NO_STUBS 673#ifndef __NO_STUBS
668#define __ARCH_WANT_OLD_READDIR 674#define __ARCH_WANT_OLD_READDIR
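fanotify_init, fanotify_mark, and prlimit64 get syscall numbers on both 32-bit and 64-bit. As a quick smoke test of the 64-bit table, prlimit64 can be invoked by its raw number; the struct below mirrors the rlimit64 layout and 7 is assumed to be RLIMIT_NOFILE, neither of which comes from the hunk itself. x86-64 only, minimal error handling on purpose.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

struct rlimit64_compat {
	uint64_t rlim_cur;
	uint64_t rlim_max;
};

int main(void)
{
	struct rlimit64_compat old;

	/* prlimit64(pid=0 means current task, resource=7, new=NULL, old) */
	if (syscall(302, 0, 7, NULL, &old) == 0)
		printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
		       (unsigned long long)old.rlim_cur,
		       (unsigned long long)old.rlim_max);
	return 0;
}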
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2..9f0cbd987d5 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
257#define EXIT_REASON_IO_INSTRUCTION 30 257#define EXIT_REASON_IO_INSTRUCTION 30
258#define EXIT_REASON_MSR_READ 31 258#define EXIT_REASON_MSR_READ 31
259#define EXIT_REASON_MSR_WRITE 32 259#define EXIT_REASON_MSR_WRITE 32
260#define EXIT_REASON_INVALID_STATE 33
260#define EXIT_REASON_MWAIT_INSTRUCTION 36 261#define EXIT_REASON_MWAIT_INSTRUCTION 36
261#define EXIT_REASON_MONITOR_INSTRUCTION 39 262#define EXIT_REASON_MONITOR_INSTRUCTION 39
262#define EXIT_REASON_PAUSE_INSTRUCTION 40 263#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
266#define EXIT_REASON_EPT_VIOLATION 48 267#define EXIT_REASON_EPT_VIOLATION 48
267#define EXIT_REASON_EPT_MISCONFIG 49 268#define EXIT_REASON_EPT_MISCONFIG 49
268#define EXIT_REASON_WBINVD 54 269#define EXIT_REASON_WBINVD 54
270#define EXIT_REASON_XSETBV 55
269 271
270/* 272/*
271 * Interruption-information format 273 * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
375#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 377#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
376#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 378#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
377 379
380#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
381#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
382
378#define VMX_EPT_DEFAULT_GAW 3 383#define VMX_EPT_DEFAULT_GAW 3
379#define VMX_EPT_MAX_GAW 0x4 384#define VMX_EPT_MAX_GAW 0x4
380#define VMX_EPT_MT_EPTE_SHIFT 3 385#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4a9fa..7fda040a76c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
417 return _hypercall2(int, nmi_op, op, arg); 417 return _hypercall2(int, nmi_op, op, arg);
418} 418}
419 419
420static inline unsigned long __must_check
421HYPERVISOR_hvm_op(int op, void *arg)
422{
423 return _hypercall2(unsigned long, hvm_op, op, arg);
424}
425
420static inline void 426static inline void
421MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 427MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
422{ 428{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a40079..bf5f7d32bd0 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
112 */ 112 */
113static inline unsigned long mfn_to_local_pfn(unsigned long mfn) 113static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
114{ 114{
115 extern unsigned long max_mapnr;
116 unsigned long pfn = mfn_to_pfn(mfn); 115 unsigned long pfn = mfn_to_pfn(mfn);
117 if ((pfn < max_mapnr) 116 if (get_phys_to_machine(pfn) != mfn)
118 && !xen_feature(XENFEAT_auto_translated_physmap) 117 return -1; /* force !pfn_valid() */
119 && (get_phys_to_machine(pfn) != mfn))
120 return max_mapnr; /* force !pfn_valid() */
121 /* XXX fixme; not true with sparsemem */
122 return pfn; 118 return pfn;
123} 119}
124 120
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 00000000000..1be1ab7d6a4
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_SWIOTLB_XEN_H
2#define _ASM_X86_SWIOTLB_XEN_H
3
4#ifdef CONFIG_SWIOTLB_XEN
5extern int xen_swiotlb;
6extern int __init pci_xen_swiotlb_detect(void);
7extern void __init pci_xen_swiotlb_init(void);
8#else
9#define xen_swiotlb (0)
10static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
11static inline void __init pci_xen_swiotlb_init(void) { }
12#endif
13
14#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae22..c6ce2452f10 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -3,7 +3,8 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/i387.h> 6
7#define XSTATE_CPUID 0x0000000d
7 8
8#define XSTATE_FP 0x1 9#define XSTATE_FP 0x1
9#define XSTATE_SSE 0x2 10#define XSTATE_SSE 0x2
@@ -13,6 +14,12 @@
13 14
14#define FXSAVE_SIZE 512 15#define FXSAVE_SIZE 512
15 16
17#define XSAVE_HDR_SIZE 64
18#define XSAVE_HDR_OFFSET FXSAVE_SIZE
19
20#define XSAVE_YMM_SIZE 256
21#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
22
16/* 23/*
17 * These are the features that the OS can handle currently. 24 * These are the features that the OS can handle currently.
18 */ 25 */
@@ -26,10 +33,8 @@
26 33
27extern unsigned int xstate_size; 34extern unsigned int xstate_size;
28extern u64 pcntxt_mask; 35extern u64 pcntxt_mask;
29extern struct xsave_struct *init_xstate_buf;
30extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; 36extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
31 37
32extern void xsave_cntxt_init(void);
33extern void xsave_init(void); 38extern void xsave_init(void);
34extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); 39extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
35extern int init_fpu(struct task_struct *child); 40extern int init_fpu(struct task_struct *child);
@@ -59,6 +64,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu)
59static inline int xsave_user(struct xsave_struct __user *buf) 64static inline int xsave_user(struct xsave_struct __user *buf)
60{ 65{
61 int err; 66 int err;
67
68 /*
69 * Clear the xsave header first, so that reserved fields are
70 * initialized to zero.
71 */
72 err = __clear_user(&buf->xsave_hdr,
73 sizeof(struct xsave_hdr_struct));
74 if (unlikely(err))
75 return -EFAULT;
76
62 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" 77 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
63 "2:\n" 78 "2:\n"
64 ".section .fixup,\"ax\"\n" 79 ".section .fixup,\"ax\"\n"
@@ -111,12 +126,25 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
111 : "memory"); 126 : "memory");
112} 127}
113 128
129static inline void xsave_state(struct xsave_struct *fx, u64 mask)
130{
131 u32 lmask = mask;
132 u32 hmask = mask >> 32;
133
134 asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
135 : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
136 : "memory");
137}
138
114static inline void fpu_xsave(struct fpu *fpu) 139static inline void fpu_xsave(struct fpu *fpu)
115{ 140{
116 /* This, however, we can work around by forcing the compiler to select 141 /* This, however, we can work around by forcing the compiler to select
117 an addressing mode that doesn't require extended registers. */ 142 an addressing mode that doesn't require extended registers. */
118 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" 143 alternative_input(
119 : : "D" (&(fpu->state->xsave)), 144 ".byte " REX_PREFIX "0x0f,0xae,0x27",
120 "a" (-1), "d"(-1) : "memory"); 145 ".byte " REX_PREFIX "0x0f,0xae,0x37",
146 X86_FEATURE_XSAVEOPT,
147 [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
148 "memory");
121} 149}
122#endif 150#endif
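
The constants added to xsave.h spell out the legacy XSAVE area layout: a 512-byte FXSAVE region, a 64-byte XSAVE header at offset 512, then the YMM state at offset 576. The new xsave_state() and the reworked xsave_user() also show how the 64-bit feature mask is split into EDX:EAX before the 0x0f,0xae /4 (XSAVE) opcode runs, and fpu_xsave() now uses alternative_input() to patch that opcode into XSAVEOPT (0x0f,0xae,0x37) when X86_FEATURE_XSAVEOPT is present. The compile-checkable sketch below only reproduces the layout arithmetic and the mask split; it does not execute XSAVE itself.

/* Reproduces the layout arithmetic and the EDX:EAX mask split used by
 * xsave_state(); constants mirror the ones added to xsave.h. */
#include <stdio.h>
#include <stdint.h>

#define FXSAVE_SIZE      512
#define XSAVE_HDR_SIZE   64
#define XSAVE_HDR_OFFSET FXSAVE_SIZE
#define XSAVE_YMM_SIZE   256
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)

int main(void)
{
        uint64_t mask  = (1ULL << 1) | (1ULL << 0);  /* XSTATE_SSE | XSTATE_FP */
        uint32_t lmask = (uint32_t)mask;             /* goes into EAX */
        uint32_t hmask = (uint32_t)(mask >> 32);     /* goes into EDX */

        printf("xsave header at %d, ymm state at %d..%d\n",
               XSAVE_HDR_OFFSET, XSAVE_YMM_OFFSET,
               XSAVE_YMM_OFFSET + XSAVE_YMM_SIZE - 1);
        printf("mask %#llx -> eax=%#x edx=%#x\n",
               (unsigned long long)mask, lmask, hmask);
        return 0;
}
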
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b2208372..0925676266b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o
104scx200-y += scx200_32.o 104scx200-y += scx200_32.o
105 105
106obj-$(CONFIG_OLPC) += olpc.o 106obj-$(CONFIG_OLPC) += olpc.o
107obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
107obj-$(CONFIG_X86_MRST) += mrst.o 108obj-$(CONFIG_X86_MRST) += mrst.o
108 109
109microcode-y := microcode_core.o 110microcode-y := microcode_core.o
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e29601..28595d6df47 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
104 movl %eax, %ecx 104 movl %eax, %ecx
105 orl %edx, %ecx 105 orl %edx, %ecx
106 jz 1f 106 jz 1f
107 movl $0xc0000080, %ecx 107 movl $MSR_EFER, %ecx
108 wrmsr 108 wrmsr
1091: 1091:
110 110
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fcc3c61fdec..33cec152070 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
2 * sleep.c - x86-specific ACPI sleep support. 2 * sleep.c - x86-specific ACPI sleep support.
3 * 3 *
4 * Copyright (C) 2001-2003 Patrick Mochel 4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
6 */ 6 */
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c..f65ab8b014c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
214 u8 *instr = a->instr; 214 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 215 BUG_ON(a->replacementlen > a->instrlen);
216 BUG_ON(a->instrlen > sizeof(insnbuf)); 216 BUG_ON(a->instrlen > sizeof(insnbuf));
217 BUG_ON(a->cpuid >= NCAPINTS*32);
217 if (!boot_cpu_has(a->cpuid)) 218 if (!boot_cpu_has(a->cpuid))
218 continue; 219 continue;
219#ifdef CONFIG_X86_64 220#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c..fa044e1e30a 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2572static int amd_iommu_domain_has_cap(struct iommu_domain *domain, 2572static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2573 unsigned long cap) 2573 unsigned long cap)
2574{ 2574{
2575 switch (cap) {
2576 case IOMMU_CAP_CACHE_COHERENCY:
2577 return 1;
2578 }
2579
2575 return 0; 2580 return 0;
2576} 2581}
2577 2582
@@ -2609,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
2609 2614
2610 pt_domain->mode |= PAGE_MODE_NONE; 2615 pt_domain->mode |= PAGE_MODE_NONE;
2611 2616
2612 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2617 for_each_pci_dev(dev) {
2613
2614 if (!check_device(&dev->dev)) 2618 if (!check_device(&dev->dev))
2615 continue; 2619 continue;
2616 2620
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d3..8dd77800ff5 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
43 43
44#include <asm/fixmap.h> 44#include <asm/fixmap.h>
45#include <asm/apb_timer.h> 45#include <asm/apb_timer.h>
46#include <asm/mrst.h>
46 47
47#define APBT_MASK CLOCKSOURCE_MASK(32) 48#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22 49#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150 50#define APBT_CLOCKEVENT_RATING 110
50#define APBT_CLOCKSOURCE_RATING 250 51#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200 52#define APBT_MIN_DELTA_USEC 200
52 53
@@ -83,8 +84,6 @@ struct apbt_dev {
83 char name[10]; 84 char name[10];
84}; 85};
85 86
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); 87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89 88
90#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
195}; 194};
196 195
197/* 196/*
198 * if user does not want to use per CPU apb timer, just give it a lower rating
199 * than local apic timer and skip the late per cpu timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
211 pr_warning("X86 MRST timer option %s not recognised"
212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
219
220/*
221 * start count down from 0xffff_ffff. this is done by toggling the enable bit 197 * start count down from 0xffff_ffff. this is done by toggling the enable bit
222 * then load initial load count to ~0. 198 * then load initial load count to ~0.
223 */ 199 */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
335 adev->num = smp_processor_id(); 311 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 312 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337 313
338 if (disable_apbt_percpu) { 314 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 315 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt; 316 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n", 317 printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
429 405
430static __init int apbt_late_init(void) 406static __init int apbt_late_init(void)
431{ 407{
432 if (disable_apbt_percpu || !apb_timer_block_enabled) 408 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
409 !apb_timer_block_enabled)
433 return 0; 410 return 0;
434 /* This notifier should be called after workqueue is ready */ 411 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20); 412 hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
450 int timer_num; 427 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); 428 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452 429
430 BUG_ON(!apbt_virt_address);
431
453 timer_num = adev->num; 432 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n", 433 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode); 434 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
676 } 655 }
677#ifdef CONFIG_SMP 656#ifdef CONFIG_SMP
678 /* kernel cmdline disable apb timer, so we will use lapic timers */ 657 /* kernel cmdline disable apb timer, so we will use lapic timers */
679 if (disable_apbt_percpu) { 658 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n"); 659 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return; 660 return;
682 } 661 }
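
The boot-time "x86_mrst_timer=" parsing and the disable_apbt_percpu flag move out of apb_timer.c; the timer code now tests a shared mrst_timer_options value (declared via <asm/mrst.h>) against MRST_TIMER_LAPIC_APBT. The relocated parser is not part of this hunk, so the sketch below only approximates it: the enum layout and default value are assumptions, while the two option strings come from the removed code above.

/* Approximate, userspace-compilable sketch of the relocated
 * "x86_mrst_timer=" option parsing; apart from MRST_TIMER_LAPIC_APBT's
 * existence, the enum names and values are assumptions. */
#include <stdio.h>
#include <string.h>

enum mrst_timer_options {
        MRST_TIMER_DEFAULT,     /* assumed default: let the platform decide */
        MRST_TIMER_APBT_ONLY,
        MRST_TIMER_LAPIC_APBT,
};

static enum mrst_timer_options mrst_timer_options;

static int setup_x86_mrst_timer(const char *arg)
{
        if (!arg)
                return -1;

        if (strcmp("apbt_only", arg) == 0)
                mrst_timer_options = MRST_TIMER_APBT_ONLY;
        else if (strcmp("lapic_and_apbt", arg) == 0)
                mrst_timer_options = MRST_TIMER_LAPIC_APBT;
        else
                return -1;              /* unrecognised option string */
        return 0;
}

int main(void)
{
        setup_x86_mrst_timer("lapic_and_apbt");
        printf("use per-cpu apb timer: %s\n",
               mrst_timer_options == MRST_TIMER_LAPIC_APBT ? "no" : "yes");
        return 0;
}
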
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf23..a2e0caf26e1 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
280 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
281 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
282 */ 282 */
283 u32 agp_aper_base = 0, agp_aper_order = 0; 283 u32 agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0; 284 int i, fix, slot, valid_agp = 0;
285 u32 ctl; 285 u32 ctl;
286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
291 return; 291 return;
292 292
293 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); 294 search_agp_bridge(&agp_aper_order, &valid_agp);
295 295
296 fix = 0; 296 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a96489ee6ca..980508c7908 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -460,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
460} 460}
461 461
462/* 462/*
463 * Setup the local APIC timer for this CPU. Copy the initilized values 463 * Setup the local APIC timer for this CPU. Copy the initialized values
464 * of the boot CPU and register the clock event in the framework. 464 * of the boot CPU and register the clock event in the framework.
465 */ 465 */
466static void __cpuinit setup_APIC_timer(void) 466static void __cpuinit setup_APIC_timer(void)
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87fe..8593582d802 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
129 * GSI override for ES7000 platforms. 129 * GSI override for ES7000 platforms.
130 */ 130 */
131 131
132static unsigned int base;
133 132
134static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 133static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
135{ 134{
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26..4dc0084ec1b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3397 3397
3398 cfg = desc->chip_data; 3398 cfg = desc->chip_data;
3399 3399
3400 read_msi_msg_desc(desc, &msg); 3400 get_cached_msi_msg_desc(desc, &msg);
3401 3401
3402 msg.data &= ~MSI_DATA_VECTOR_MASK; 3402 msg.data &= ~MSI_DATA_VECTOR_MASK;
3403 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3403 msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3a..4c9c67bf09b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
140 * is now the way life works). 140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return). 141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend. 142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> 143 * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
144 * modified by sfr). 144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid 145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr). 146 * <andy_henroid@yahoo.com> fixed by sfr).
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6..3f0ebe429a0 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
12nostackp := $(call cc-option, -fno-stack-protector) 12nostackp := $(call cc-option, -fno-stack-protector)
13CFLAGS_common.o := $(nostackp) 13CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o mshyperv.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d..60a57b13082 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
466 } 466 }
467 467
468 } 468 }
469 if (c->x86 == 0x10 || c->x86 == 0x11) 469 if (c->x86 >= 0x10)
470 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 470 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
471 471
472 /* get apicid instead of initial apic id from cpuid */ 472 /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
529 num_cache_leaves = 3; 529 num_cache_leaves = 3;
530 } 530 }
531 531
532 if (c->x86 >= 0xf && c->x86 <= 0x11) 532 if (c->x86 >= 0xf)
533 set_cpu_cap(c, X86_FEATURE_K8); 533 set_cpu_cap(c, X86_FEATURE_K8);
534 534
535 if (cpu_has_xmm2) { 535 if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
546 fam10h_check_enable_mmcfg(); 546 fam10h_check_enable_mmcfg();
547 } 547 }
548 548
549 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { 549 if (c == &boot_cpu_data && c->x86 >= 0xf) {
550 unsigned long long tseg; 550 unsigned long long tseg;
551 551
552 /* 552 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
609}; 609};
610 610
611cpu_dev_register(amd_cpu_dev); 611cpu_dev_register(amd_cpu_dev);
612
613/*
614 * AMD errata checking
615 *
616 * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
617 * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
618 * have an OSVW id assigned, which it takes as first argument. Both take a
619 * variable number of family-specific model-stepping ranges created by
620 * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
621 * int[] in arch/x86/include/asm/processor.h.
622 *
623 * Example:
624 *
625 * const int amd_erratum_319[] =
626 * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
627 * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
628 * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
629 */
630
631const int amd_erratum_400[] =
632 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
633 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
634EXPORT_SYMBOL_GPL(amd_erratum_400);
635
636const int amd_erratum_383[] =
637 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
638EXPORT_SYMBOL_GPL(amd_erratum_383);
639
640bool cpu_has_amd_erratum(const int *erratum)
641{
642 struct cpuinfo_x86 *cpu = &current_cpu_data;
643 int osvw_id = *erratum++;
644 u32 range;
645 u32 ms;
646
647 /*
648 * If called early enough that current_cpu_data hasn't been initialized
649 * yet, fall back to boot_cpu_data.
650 */
651 if (cpu->x86 == 0)
652 cpu = &boot_cpu_data;
653
654 if (cpu->x86_vendor != X86_VENDOR_AMD)
655 return false;
656
657 if (osvw_id >= 0 && osvw_id < 65536 &&
658 cpu_has(cpu, X86_FEATURE_OSVW)) {
659 u64 osvw_len;
660
661 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
662 if (osvw_id < osvw_len) {
663 u64 osvw_bits;
664
665 rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
666 osvw_bits);
667 return osvw_bits & (1ULL << (osvw_id & 0x3f));
668 }
669 }
670
671 /* OSVW unavailable or ID unknown, match family-model-stepping range */
672 ms = (cpu->x86_model << 8) | cpu->x86_mask;
673 while ((range = *erratum++))
674 if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
675 (ms >= AMD_MODEL_RANGE_START(range)) &&
676 (ms <= AMD_MODEL_RANGE_END(range)))
677 return true;
678
679 return false;
680}
681
682EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
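
cpu_has_amd_erratum() first consults the OSVW MSRs when the erratum carries an OSVW id (the status bit osvw_id & 0x3f in the 64-bit word at MSR_AMD64_OSVW_STATUS + (osvw_id >> 6)), and otherwise falls back to matching the CPU's family and packed model/stepping value against a zero-terminated range list. The real AMD_MODEL_RANGE() packs each range into a single int in processor.h, which this diff does not show; the standalone sketch below therefore uses a plain struct instead, but keeps the same (model << 8) | stepping comparison as the fallback path above.

/* Stand-alone model of the family/model/stepping fallback in
 * cpu_has_amd_erratum(); a struct replaces the packed AMD_MODEL_RANGE()
 * encoding, which is defined in processor.h and not shown in the diff. */
#include <stdio.h>
#include <stdbool.h>

struct amd_model_range {
        unsigned int family;
        unsigned int start;     /* (model << 8) | stepping */
        unsigned int end;       /* (model << 8) | stepping */
};

/* ranges of AMD erratum 400 as listed in the patch */
static const struct amd_model_range erratum_400[] = {
        { 0x0f, (0x41 << 8) | 0x2, (0xff << 8) | 0xf },
        { 0x10, (0x02 << 8) | 0x1, (0xff << 8) | 0xf },
        { 0 }                   /* terminator */
};

static bool match_erratum(unsigned int family, unsigned int model,
                          unsigned int stepping,
                          const struct amd_model_range *r)
{
        unsigned int ms = (model << 8) | stepping;

        for (; r->family; r++)
                if (family == r->family && ms >= r->start && ms <= r->end)
                        return true;
        return false;
}

int main(void)
{
        printf("fam 0x10, model 0x04, stepping 2 -> %d\n",
               match_erratum(0x10, 0x04, 0x2, erratum_400));
        printf("fam 0x0f, model 0x20, stepping 0 -> %d\n",
               match_erratum(0x0f, 0x20, 0x0, erratum_400));
        return 0;
}
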
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211..490dac63c2d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
140static int __init x86_xsave_setup(char *s) 140static int __init x86_xsave_setup(char *s)
141{ 141{
142 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 142 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
143 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
143 return 1; 144 return 1;
144} 145}
145__setup("noxsave", x86_xsave_setup); 146__setup("noxsave", x86_xsave_setup);
146 147
148static int __init x86_xsaveopt_setup(char *s)
149{
150 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
151 return 1;
152}
153__setup("noxsaveopt", x86_xsaveopt_setup);
154
147#ifdef CONFIG_X86_32 155#ifdef CONFIG_X86_32
148static int cachesize_override __cpuinitdata = -1; 156static int cachesize_override __cpuinitdata = -1;
149static int disable_x86_serial_nr __cpuinitdata = 1; 157static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
551 c->x86_capability[4] = excap; 559 c->x86_capability[4] = excap;
552 } 560 }
553 561
562 /* Additional Intel-defined flags: level 0x00000007 */
563 if (c->cpuid_level >= 0x00000007) {
564 u32 eax, ebx, ecx, edx;
565
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567
568 if (eax > 0)
569 c->x86_capability[9] = ebx;
570 }
571
554 /* AMD-defined flags: level 0x80000001 */ 572 /* AMD-defined flags: level 0x80000001 */
555 xlvl = cpuid_eax(0x80000000); 573 xlvl = cpuid_eax(0x80000000);
556 c->extended_cpuid_level = xlvl; 574 c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
576 if (c->extended_cpuid_level >= 0x80000007) 594 if (c->extended_cpuid_level >= 0x80000007)
577 c->x86_power = cpuid_edx(0x80000007); 595 c->x86_power = cpuid_edx(0x80000007);
578 596
597 init_scattered_cpuid_features(c);
579} 598}
580 599
581static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) 600static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
731 750
732 get_model_name(c); /* Default name */ 751 get_model_name(c); /* Default name */
733 752
734 init_scattered_cpuid_features(c);
735 detect_nopl(c); 753 detect_nopl(c);
736} 754}
737 755
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void)
1192 dbg_restore_debug_regs(); 1210 dbg_restore_debug_regs();
1193 1211
1194 fpu_init(); 1212 fpu_init();
1213 xsave_init();
1195 1214
1196 raw_local_save_flags(kernel_eflags); 1215 raw_local_save_flags(kernel_eflags);
1197 1216
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void)
1252 clear_used_math(); 1271 clear_used_math();
1253 mxcsr_feature_mask_init(); 1272 mxcsr_feature_mask_init();
1254 1273
1255 /* 1274 fpu_init();
1256 * Boot processor to setup the FP and extended state context info.
1257 */
1258 if (smp_processor_id() == boot_cpu_id)
1259 init_thread_xstate();
1260
1261 xsave_init(); 1275 xsave_init();
1262} 1276}
1263#endif 1277#endif
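
get_cpu_cap() now queries CPUID leaf 7 (subleaf 0) and stores EBX as a tenth capability word, x86_capability[9]; scattered feature detection also moves from generic_identify() into get_cpu_cap() itself. The userspace sketch below just issues the same CPUID query directly and prints the raw EBX word; interpreting individual bits is left to the CPU documentation and is outside the scope of this sketch.

/* Reads CPUID leaf 7, subleaf 0, the source of the new tenth
 * capability word (x86_capability[9] = EBX).  x86-only, userspace. */
#include <stdio.h>

static void cpuid_count(unsigned int leaf, unsigned int subleaf,
                        unsigned int *eax, unsigned int *ebx,
                        unsigned int *ecx, unsigned int *edx)
{
        __asm__ volatile("cpuid"
                         : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                         : "0" (leaf), "2" (subleaf));
}

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        cpuid_count(0, 0, &eax, &ebx, &ecx, &edx);
        if (eax < 7) {
                printf("CPUID leaf 7 not supported (max basic leaf %u)\n", eax);
                return 0;
        }

        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
        printf("leaf 7 subleaf 0: eax=%#x ebx=%#x (new capability word)\n",
               eax, ebx);
        return 0;
}
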
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index cee5263927c..246cd3afbb5 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -348,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
348 348
349 freqs.old = perf->states[perf->state].core_frequency * 1000; 349 freqs.old = perf->states[perf->state].core_frequency * 1000;
350 freqs.new = data->freq_table[next_state].frequency; 350 freqs.new = data->freq_table[next_state].frequency;
351 for_each_cpu(i, cmd.mask) { 351 for_each_cpu(i, policy->cpus) {
352 freqs.cpu = i; 352 freqs.cpu = i;
353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
354 } 354 }
@@ -364,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
364 } 364 }
365 } 365 }
366 366
367 for_each_cpu(i, cmd.mask) { 367 for_each_cpu(i, policy->cpus) {
368 freqs.cpu = i; 368 freqs.cpu = i;
369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
370 } 370 }
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e..32974cf8423 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
169 * Low Level chipset interface * 169 * Low Level chipset interface *
170 ****************************************************************/ 170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = { 171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, 172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, 174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 PCI_ANY_ID, PCI_ANY_ID },
176 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
177 PCI_ANY_ID, PCI_ANY_ID },
178 { 0, }, 175 { 0, },
179}; 176};
180 177
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
199 } 196 }
200 197
201 /* detect which companion chip is used */ 198 /* detect which companion chip is used */
202 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { 199 for_each_pci_dev(gx_pci) {
203 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) 200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
204 return gx_pci; 201 return gx_pci;
205 } 202 }
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f826..03162dac627 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
426} 426}
427 427
428 428
429static int __init longhaul_get_ranges(void) 429static int __cpuinit longhaul_get_ranges(void)
430{ 430{
431 unsigned int i, j, k = 0; 431 unsigned int i, j, k = 0;
432 unsigned int ratio; 432 unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
530} 530}
531 531
532 532
533static void __init longhaul_setup_voltagescaling(void) 533static void __cpuinit longhaul_setup_voltagescaling(void)
534{ 534{
535 union msr_longhaul longhaul; 535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid; 536 struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
784 return 0; 784 return 0;
785} 785}
786 786
787static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{ 788{
789 struct cpuinfo_x86 *c = &cpu_data(0); 789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL; 790 char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f7..cbf48fbca88 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_mults[16] = { 59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
75 -1, /* 1111 -> RESERVED */ 75 -1, /* 1111 -> RESERVED */
76}; 76};
77 77
78static const int __initdata samuel1_eblcr[16] = { 78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */ 79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */ 80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */ 81 40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
97/* 97/*
98 * VIA C3 Samuel2 Stepping 1->15 98 * VIA C3 Samuel2 Stepping 1->15
99 */ 99 */
100static const int __initdata samuel2_eblcr[16] = { 100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */ 101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */ 102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */ 103 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_mults[16] = { 122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
138 120, /* 1111 -> 12.0x */ 138 120, /* 1111 -> 12.0x */
139}; 139};
140 140
141static const int __initdata ezra_eblcr[16] = { 141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */ 142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */ 143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */ 144 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_mults[32] = { 163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
196 -1, /* 1111 -> RESERVED (12.0x) */ 196 -1, /* 1111 -> RESERVED (12.0x) */
197}; 197};
198 198
199static const int __initdata ezrat_eblcr[32] = { 199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */ 200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */ 201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */ 202 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_mults[32] = { 238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
270 -1, /* 1111 -> 12.0x */ 270 -1, /* 1111 -> 12.0x */
271}; 271};
272 272
273static const int __initdata nehemiah_eblcr[32] = { 273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */ 276 40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
315 unsigned short pos; 315 unsigned short pos;
316}; 316};
317 317
318static const struct mV_pos __initdata vrm85_mV[32] = { 318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, 319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, 320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, 321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} 326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327}; 327};
328 328
329static const unsigned char __initdata mV_vrm85[32] = { 329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334}; 334};
335 335
336static const struct mV_pos __initdata mobilevrm_mV[32] = { 336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, 337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, 338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, 339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
344 {675, 3}, {650, 2}, {625, 1}, {600, 0} 344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345}; 345};
346 346
347static const unsigned char __initdata mV_mobilevrm[32] = { 347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c5..fc09f142d94 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
165 * TMTA rules: 165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */ 167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, 168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq) 169 unsigned int *high_freq)
170{ 170{
171 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi; 172 u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
258} 258}
259 259
260 260
261static int __init longrun_cpu_init(struct cpufreq_policy *policy) 261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{ 262{
263 int result = 0; 263 int result = 0;
264 264
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b0..bd1cac747f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
178 } 178 }
179 } 179 }
180 180
181 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF)
182 if (!cpu_has(c, X86_FEATURE_EST))
183 printk(KERN_WARNING PFX "Unknown CPU. "
184 "Please send an e-mail to "
185 "<cpufreq@vger.kernel.org>\n");
186 return 0; 182 return 0;
187 }
188 183
189 /* on P-4s, the TSC runs with constant frequency independent whether 184 /* on P-4s, the TSC runs with constant frequency independent whether
190 * throttling is active or not. */ 185 * throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb62..994230d4dc4 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
110 u32 output_offset; 110 u32 output_offset;
111}; 111};
112 112
113static struct pcc_cpu *pcc_cpu_info; 113static struct pcc_cpu __percpu *pcc_cpu_info;
114 114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy) 115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{ 116{
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e..4a45fd6e41b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572static int __init fixup_sgtc(void) 572static int __cpuinit fixup_sgtc(void)
573{ 573{
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
603} 603}
604 604
605 605
606static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{ 607{
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624static struct dmi_system_id __initdata powernow_dmi_table[] = { 624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
633 { } 633 { }
634}; 634};
635 635
636static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{ 637{
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3e90cce3dc8..491977baf6c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -9,7 +9,7 @@
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 * 15 *
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
806 * www.amd.com 806 * www.amd.com
807 */ 807 */
808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
809 return -ENODEV; 811 return -ENODEV;
810} 812}
811 813
@@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
910{ 912{
911 int i; 913 int i;
912 u32 hi = 0, lo = 0; 914 u32 hi = 0, lo = 0;
913 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
914 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
915 917
916 for (i = 0; i < data->acpi_data.state_count; i++) { 918 for (i = 0; i < data->acpi_data.state_count; i++) {
917 u32 index; 919 u32 index;
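
The powernow-k8 hunk fixes a swapped rdmsr() argument pair: the macro fills its second argument with the low 32 bits (EAX) and its third with the high 32 bits (EDX), and HW_PSTATE_MAX_MASK lives in the low half, so the current-limit field must be extracted from lo, not hi. A userspace way to look at the same two halves is to read the MSR through the msr driver, as in the hedged sketch below; the MSR number used there for MSR_PSTATE_CUR_LIMIT is an assumption, and /dev/cpu/0/msr needs the msr module plus root.

/* Reads an MSR via the Linux msr driver and splits it into the lo/hi
 * halves that rdmsr(msr, lo, hi) would return.  The MSR number
 * (assumed to be MSR_PSTATE_CUR_LIMIT) is illustrative only. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define PSTATE_CUR_LIMIT 0xc0010061u    /* assumed MSR number */

int main(void)
{
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0 || pread(fd, &val, sizeof(val), PSTATE_CUR_LIMIT) != sizeof(val)) {
                perror("msr read");
                return 1;
        }
        close(fd);

        /* rdmsr(msr, lo, hi): lo = EAX = low 32 bits, hi = EDX = high 32 bits */
        printf("lo=%#x hi=%#x\n",
               (unsigned int)(val & 0xffffffffu), (unsigned int)(val >> 32));
        return 0;
}
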
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8..8095f8611f8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware, 35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv, 36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm,
39#endif
37}; 40};
38 41
39const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf..898c2f4eab8 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
347 return l3; 347 return l3;
348} 348}
349 349
350static void __cpuinit 350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
351amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 351 int index)
352{ 352{
353 int node; 353 int node;
354 354
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
396 this_leaf->l3 = l3_caches[node]; 396 this_leaf->l3 = l3_caches[node];
397} 397}
398 398
399/*
400 * check whether a slot used for disabling an L3 index is occupied.
401 * @l3: L3 cache descriptor
402 * @slot: slot number (0..1)
403 *
404 * @returns: the disabled index if used or negative value if slot free.
405 */
406int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{
408 unsigned int reg = 0;
409
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
411
412 /* check whether this slot is activated already */
413 if (reg & (3UL << 30))
414 return reg & 0xfff;
415
416 return -1;
417}
418
399static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 419static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
400 unsigned int slot) 420 unsigned int slot)
401{ 421{
402 struct pci_dev *dev = this_leaf->l3->dev; 422 int index;
403 unsigned int reg = 0;
404 423
405 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 424 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
406 return -EINVAL; 425 return -EINVAL;
407 426
408 if (!dev) 427 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
409 return -EINVAL; 428 if (index >= 0)
429 return sprintf(buf, "%d\n", index);
410 430
411 pci_read_config_dword(dev, 0x1BC + slot * 4, &reg); 431 return sprintf(buf, "FREE\n");
412 return sprintf(buf, "0x%08x\n", reg);
413} 432}
414 433
415#define SHOW_CACHE_DISABLE(slot) \ 434#define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
451 } 470 }
452} 471}
453 472
454 473/*
455static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 474 * disable a L3 cache index by using a disable-slot
456 const char *buf, size_t count, 475 *
457 unsigned int slot) 476 * @l3: L3 cache descriptor
477 * @cpu: A CPU on the node containing the L3 cache
478 * @slot: slot number (0..1)
479 * @index: index to disable
480 *
481 * @return: 0 on success, error status on failure
482 */
483int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
484 unsigned long index)
458{ 485{
459 struct pci_dev *dev = this_leaf->l3->dev; 486 int ret = 0;
460 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
461 unsigned long val = 0;
462 487
463#define SUBCACHE_MASK (3UL << 20) 488#define SUBCACHE_MASK (3UL << 20)
464#define SUBCACHE_INDEX 0xfff 489#define SUBCACHE_INDEX 0xfff
465 490
466 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0)
467 return -EINVAL; 497 return -EINVAL;
468 498
499 /*
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL;
505
506 /* do not allow writes outside of allowed bits */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL;
510
511 amd_l3_disable_index(l3, cpu, slot, index);
512
513 return 0;
514}
515
516static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
517 const char *buf, size_t count,
518 unsigned int slot)
519{
520 unsigned long val = 0;
521 int cpu, err = 0;
522
469 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
470 return -EPERM; 524 return -EPERM;
471 525
472 if (!dev) 526 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
473 return -EINVAL; 527 return -EINVAL;
474 528
475 if (strict_strtoul(buf, 10, &val) < 0) 529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
476 return -EINVAL;
477 530
478 /* do not allow writes outside of allowed bits */ 531 if (strict_strtoul(buf, 10, &val) < 0)
479 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
480 ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
481 return -EINVAL; 532 return -EINVAL;
482 533
483 amd_l3_disable_index(this_leaf->l3, cpu, slot, val); 534 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
484 535 if (err) {
536 if (err == -EEXIST)
537 printk(KERN_WARNING "L3 disable slot %d in use!\n",
538 slot);
539 return err;
540 }
485 return count; 541 return count;
486} 542}
487 543
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
502 558
503#else /* CONFIG_CPU_SUP_AMD */ 559#else /* CONFIG_CPU_SUP_AMD */
504static void __cpuinit 560static void __cpuinit
505amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
506{ 562{
507}; 563};
508#endif /* CONFIG_CPU_SUP_AMD */ 564#endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
518 574
519 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
520 amd_cpuid4(index, &eax, &ebx, &ecx); 576 amd_cpuid4(index, &eax, &ebx, &ecx);
521 amd_check_l3_disable(index, this_leaf); 577 amd_check_l3_disable(this_leaf, index);
522 } else { 578 } else {
523 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
524 } 580 }
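
The intel_cacheinfo refactor splits slot handling into amd_get_l3_disable_slot()/amd_set_l3_disable_slot(): a slot's state lives in the PCI config dword at 0x1BC + slot * 4, where bits 31-30 mark the slot as in use, bits 21-20 (SUBCACHE_MASK) select subcaches, and the low 12 bits (SUBCACHE_INDEX) hold the disabled index. The standalone decoder below only restates that bit layout from the hunk; it operates on sample values and does not touch PCI config space.

/* Decodes an L3 index-disable slot register the way
 * amd_get_l3_disable_slot() interprets it (bit layout taken from the
 * hunk above); uses sample values instead of PCI config space. */
#include <stdio.h>
#include <stdint.h>

#define SLOT_IN_USE_MASK (3u << 30)
#define SUBCACHE_MASK    (3u << 20)
#define SUBCACHE_INDEX   0xfffu

static int get_l3_disable_slot(uint32_t reg)
{
        if (reg & SLOT_IN_USE_MASK)
                return reg & SUBCACHE_INDEX;    /* disabled index */
        return -1;                              /* slot is free */
}

int main(void)
{
        uint32_t sample[] = { 0x00000000, 0x80100123, 0xc0000042 };

        for (unsigned int i = 0; i < sizeof(sample) / sizeof(sample[0]); i++)
                printf("reg %#010x -> %d\n", sample[i],
                       get_l3_disable_slot(sample[i]));
        return 0;
}
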
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc4256225..ed41562909f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
51static DEFINE_MUTEX(mce_read_mutex); 51static DEFINE_MUTEX(mce_read_mutex);
52 52
53#define rcu_dereference_check_mce(p) \ 53#define rcu_dereference_check_mce(p) \
54 rcu_dereference_check((p), \ 54 rcu_dereference_index_check((p), \
55 rcu_read_lock_sched_held() || \ 55 rcu_read_lock_sched_held() || \
56 lockdep_is_held(&mce_read_mutex)) 56 lockdep_is_held(&mce_read_mutex))
57 57
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
107static int default_decode_mce(struct notifier_block *nb, unsigned long val, 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
108 void *data) 108 void *data)
109{ 109{
110 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
111 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
112 112
113 return NOTIFY_STOP; 113 return NOTIFY_STOP;
114} 114}
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
211 211
212static void print_mce(struct mce *m) 212static void print_mce(struct mce *m)
213{ 213{
214 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
215 m->extcpu, m->mcgstatus, m->bank, m->status); 215 m->extcpu, m->mcgstatus, m->bank, m->status);
216 216
217 if (m->ip) { 217 if (m->ip) {
218 pr_emerg("RIP%s %02x:<%016Lx> ", 218 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
220 m->cs, m->ip); 220 m->cs, m->ip);
221 221
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
224 pr_cont("\n"); 224 pr_cont("\n");
225 } 225 }
226 226
227 pr_emerg("TSC %llx ", m->tsc); 227 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
228 if (m->addr) 228 if (m->addr)
229 pr_cont("ADDR %llx ", m->addr); 229 pr_cont("ADDR %llx ", m->addr);
230 if (m->misc) 230 if (m->misc)
231 pr_cont("MISC %llx ", m->misc); 231 pr_cont("MISC %llx ", m->misc);
232 232
233 pr_cont("\n"); 233 pr_cont("\n");
234 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 234 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
236 236
237 /* 237 /*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
242} 242}
243 243
244static void print_mce_head(void)
245{
246 pr_emerg("\nHARDWARE ERROR\n");
247}
248
249static void print_mce_tail(void)
250{
251 pr_emerg("This is not a software problem!\n");
252}
253
254#define PANIC_TIMEOUT 5 /* 5 seconds */ 244#define PANIC_TIMEOUT 5 /* 5 seconds */
255 245
256static atomic_t mce_paniced; 246static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
291 if (atomic_inc_return(&mce_fake_paniced) > 1) 281 if (atomic_inc_return(&mce_fake_paniced) > 1)
292 return; 282 return;
293 } 283 }
294 print_mce_head();
295 /* First print corrected ones that are still unlogged */ 284 /* First print corrected ones that are still unlogged */
296 for (i = 0; i < MCE_LOG_LEN; i++) { 285 for (i = 0; i < MCE_LOG_LEN; i++) {
297 struct mce *m = &mcelog.entry[i]; 286 struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
322 apei_err = apei_write_mce(final); 311 apei_err = apei_write_mce(final);
323 } 312 }
324 if (cpu_missing) 313 if (cpu_missing)
325 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 314 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
326 print_mce_tail();
327 if (exp) 315 if (exp)
328 printk(KERN_EMERG "Machine check: %s\n", exp); 316 pr_emerg(HW_ERR "Machine check: %s\n", exp);
329 if (!fake_panic) { 317 if (!fake_panic) {
330 if (panic_timeout == 0) 318 if (panic_timeout == 0)
331 panic_timeout = mce_panic_timeout; 319 panic_timeout = mce_panic_timeout;
332 panic(msg); 320 panic(msg);
333 } else 321 } else
334 printk(KERN_EMERG "Fake kernel panic: %s\n", msg); 322 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
335} 323}
336 324
337/* Support code for software error injection */ 325/* Support code for software error injection */
@@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
600 */ 588 */
601 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
602 mce_log(&m); 590 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
603 add_taint(TAINT_MACHINE_CHECK); 592 add_taint(TAINT_MACHINE_CHECK);
604 } 593 }
605 594
@@ -1220,7 +1209,7 @@ int mce_notify_irq(void)
1220 schedule_work(&mce_trigger_work); 1209 schedule_work(&mce_trigger_work);
1221 1210
1222 if (__ratelimit(&ratelimit)) 1211 if (__ratelimit(&ratelimit))
1223 printk(KERN_INFO "Machine check events logged\n"); 1212 pr_info(HW_ERR "Machine check events logged\n");
1224 1213
1225 return 1; 1214 return 1;
1226 } 1215 }
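
The mce.c hunks switch raw printk()/pr_emerg() calls to a common HW_ERR prefix so every machine-check line is tagged consistently; the prefix is simply a string macro that C's adjacent-literal concatenation glues onto the format string at compile time. The tiny sketch below shows that mechanism with a stand-in prefix; the real HW_ERR text is defined elsewhere and is not quoted here.

/* Shows how a prefix macro such as HW_ERR composes with a format
 * string via adjacent string-literal concatenation; the prefix text
 * below is a stand-in, not the kernel's definition. */
#include <stdio.h>

#define FAKE_HW_ERR "[Hardware Error]: "   /* stand-in prefix */

int main(void)
{
        int cpu = 0;
        unsigned long long status = 0xb200000000000185ull;

        /* FAKE_HW_ERR "CPU %d..." becomes one string literal at compile time. */
        printf(FAKE_HW_ERR "CPU %d: Machine Check Exception: status %#llx\n",
               cpu, status);
        return 0;
}
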
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920..6fcd0936194 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
95 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 95 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
96 96
97 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
98 if (val & CMCI_EN) { 98 if (val & MCI_CTL2_CMCI_EN) {
99 if (test_and_clear_bit(i, owned) && !boot) 99 if (test_and_clear_bit(i, owned) && !boot)
100 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
101 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
102 continue; 102 continue;
103 } 103 }
104 104
105 val |= CMCI_EN | CMCI_THRESHOLD; 105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
106 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 107 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
107 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 108 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
108 109
109 /* Did the enable bit stick? -- the bank supports CMCI */ 110 /* Did the enable bit stick? -- the bank supports CMCI */
110 if (val & CMCI_EN) { 111 if (val & MCI_CTL2_CMCI_EN) {
111 if (!test_and_set_bit(i, owned) && !boot) 112 if (!test_and_set_bit(i, owned) && !boot)
112 print_update("CMCI", &hdr, i); 113 print_update("CMCI", &hdr, i);
113 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 114 __clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
155 continue; 156 continue;
156 /* Disable CMCI */ 157 /* Disable CMCI */
157 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 158 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
158 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
159 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 160 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
160 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 161 __clear_bit(i, __get_cpu_var(mce_banks_owned));
161 } 162 }
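
cmci_discover() now does a proper read-modify-write of MSR_IA32_MCx_CTL2: it clears the whole CMCI threshold field before OR-ing in the enable bit and the new threshold, then reads the register back and checks whether the enable bit stuck to learn whether the bank supports CMCI. The exact bit positions behind MCI_CTL2_CMCI_EN and MCI_CTL2_CMCI_THRESHOLD_MASK are defined elsewhere and not shown in this diff, so the sketch below uses placeholder values purely to demonstrate the clear-then-set sequence.

/* Models the clear-then-set update done in cmci_discover(); the bit
 * positions here are placeholders, not the real MCI_CTL2 layout. */
#include <stdio.h>
#include <stdint.h>

#define FAKE_CMCI_EN             (1ull << 30)   /* placeholder enable bit */
#define FAKE_CMCI_THRESHOLD_MASK 0x7fffull      /* placeholder threshold field */
#define CMCI_THRESHOLD           1              /* assumed threshold of one error */

static uint64_t cmci_try_enable(uint64_t ctl2)
{
        ctl2 &= ~FAKE_CMCI_THRESHOLD_MASK;      /* wipe any stale threshold */
        ctl2 |= FAKE_CMCI_EN | CMCI_THRESHOLD;  /* then enable with ours */
        return ctl2;
}

int main(void)
{
        uint64_t before = 0x00000000000012ffull;  /* stale threshold, CMCI off */
        uint64_t after  = cmci_try_enable(before);

        printf("ctl2 %#llx -> %#llx\n",
               (unsigned long long)before, (unsigned long long)after);
        /* On real hardware the bank supports CMCI only if the enable bit
         * is still set when the MSR is read back. */
        printf("enable bit stuck (simulated): %s\n",
               (after & FAKE_CMCI_EN) ? "yes" : "no");
        return 0;
}
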
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf971..c2a8b26d4fe 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
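therm_throt_process() above is edge triggered with a rate limit: it logs when an event appears, logs when it goes away, and otherwise stays quiet for CHECK_INTERVAL while the counter keeps moving. A distilled, user-space restatement of that policy (names and the time source are stand-ins, not kernel API):

	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	#define CHECK_INTERVAL 300	/* seconds; stand-in for the jiffies-based value */

	struct sample_state {
		bool new_event;
		unsigned long count, last_count;
		time_t next_check;
	};

	/* Returns 1 when the transition is logged, 0 when it is suppressed. */
	static int process_event(struct sample_state *s, bool asserted)
	{
		bool old_event = s->new_event;
		time_t now = time(NULL);

		s->new_event = asserted;
		if (asserted)
			s->count++;

		/* same shape as the kernel check: suppress while events keep
		 * arriving and the per-state deadline has not been reached */
		if (now < s->next_check && s->count != s->last_count)
			return 0;

		s->next_check = now + CHECK_INTERVAL;
		s->last_count = s->count;

		if (asserted) {
			printf("event asserted (total = %lu)\n", s->count);
			return 1;
		}
		if (old_event) {
			printf("back to normal\n");
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		struct sample_state s = { 0 };

		process_event(&s, true);	/* rising edge: logged */
		process_event(&s, true);	/* repeat inside the window: suppressed */
		process_event(&s, false);	/* falling edge inside the window: also suppressed here */
		return 0;
	}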
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
151{ 206{
152 return sysfs_create_group(&sys_dev->kobj, 207 int err;
153 &thermal_throttle_attr_group); 208 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
209
210 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
211 if (err)
212 return err;
213
214 if (cpu_has(c, X86_FEATURE_PLN))
215 err = sysfs_add_file_to_group(&sys_dev->kobj,
216 &attr_core_power_limit_count.attr,
217 thermal_attr_group.name);
218 if (cpu_has(c, X86_FEATURE_PTS))
219 err = sysfs_add_file_to_group(&sys_dev->kobj,
220 &attr_package_throttle_count.attr,
221 thermal_attr_group.name);
222 if (cpu_has(c, X86_FEATURE_PLN))
223 err = sysfs_add_file_to_group(&sys_dev->kobj,
224 &attr_package_power_limit_count.attr,
225 thermal_attr_group.name);
226
227 return err;
154} 228}
155 229
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 230static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 231{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 232 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 233}
160 234
161/* Mutex protecting device creation against CPU hotplug: */ 235/* Mutex protecting device creation against CPU hotplug: */
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
226 300
227#endif /* CONFIG_SYSFS */ 301#endif /* CONFIG_SYSFS */
228 302
303/*
304 * Set up the two most significant bits to tell the mce log which thermal
305 * event type this is.
306 * This is a temporary solution; it may be changed in the future along with
307 * the mce log infrastructure.
308 */
309#define CORE_THROTTLED (0)
310#define CORE_POWER_LIMIT ((__u64)1 << 62)
311#define PACKAGE_THROTTLED ((__u64)2 << 62)
312#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
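Given that encoding, whoever reads the logged value back can recover the event class from bits 63:62 and the raw status bits from the rest; a minimal sketch (the mask name is made up for illustration):

	#define THERM_EVENT_TYPE_MASK	((__u64)3 << 62)	/* illustrative name */

	static void decode_logged_therm_event(__u64 logged)
	{
		__u64 type   = logged & THERM_EVENT_TYPE_MASK;
		__u64 status = logged & ~THERM_EVENT_TYPE_MASK;

		switch (type) {
		case CORE_THROTTLED:		/* status holds IA32_THERM_STATUS bits */
		case CORE_POWER_LIMIT:
		case PACKAGE_THROTTLED:		/* package variants carry */
		case PACKAGE_POWER_LIMIT:	/* IA32_PACKAGE_THERM_STATUS bits */
			break;
		}
		(void)status;	/* sketch only; a real consumer would act on both */
	}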
313
229/* Thermal transition interrupt handler */ 314/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 315static void intel_thermal_interrupt(void)
231{ 316{
232 __u64 msr_val; 317 __u64 msr_val;
318 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 319
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 320 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 321
236 mce_log_therm_throt_event(msr_val); 322 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
323 THERMAL_THROTTLING_EVENT,
324 CORE_LEVEL) != 0)
325 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
326
327 if (cpu_has(c, X86_FEATURE_PLN))
328 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
329 POWER_LIMIT_EVENT,
330 CORE_LEVEL) != 0)
331 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
332
333 if (cpu_has(c, X86_FEATURE_PTS)) {
334 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
335 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
336 THERMAL_THROTTLING_EVENT,
337 PACKAGE_LEVEL) != 0)
338 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
339 if (cpu_has(c, X86_FEATURE_PLN))
340 if (therm_throt_process(msr_val &
341 PACKAGE_THERM_STATUS_POWER_LIMIT,
342 POWER_LIMIT_EVENT,
343 PACKAGE_LEVEL) != 0)
344 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
345 | msr_val);
346 }
237} 347}
238 348
239static void unexpected_thermal_interrupt(void) 349static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 445 apic_write(APIC_LVTTHMR, h);
336 446
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 447 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 448 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 449 wrmsr(MSR_IA32_THERM_INTERRUPT,
450 l | (THERM_INT_LOW_ENABLE
451 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
452 else
453 wrmsr(MSR_IA32_THERM_INTERRUPT,
454 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
455
456 if (cpu_has(c, X86_FEATURE_PTS)) {
457 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
458 if (cpu_has(c, X86_FEATURE_PLN))
459 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
460 l | (PACKAGE_THERM_INT_LOW_ENABLE
461 | PACKAGE_THERM_INT_HIGH_ENABLE
462 | PACKAGE_THERM_INT_PLN_ENABLE), h);
463 else
464 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
465 l | (PACKAGE_THERM_INT_LOW_ENABLE
466 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
467 }
340 468
341 smp_thermal_vector = intel_thermal_interrupt; 469 smp_thermal_vector = intel_thermal_interrupt;
342 470
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b..d944bf6c50e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
18#include <asm/mshyperv.h> 18#include <asm/mshyperv.h>
19 19
20struct ms_hyperv_info ms_hyperv; 20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
21 22
22static bool __init ms_hyperv_platform(void) 23static bool __init ms_hyperv_platform(void)
23{ 24{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f01..c5f59d07142 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61..7d28d7d0388 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b60..01c0f3ee6cc 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
219 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * decrementing the count, we reset data.count and set the data.gate flag,
220 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
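The count/gate handshake in set_mtrr() and mtrr_work_handler() is a repeated two-phase rendezvous: the master drains data.count to zero, re-arms it, then flips data.gate so the stopped CPUs fall through to the next step. A stripped-down sketch of one such hand-off (the helper names are illustrative; the atomics and fields are the ones used above):

	/* master side: wait for everyone, re-arm the count, then open the gate
	 * to the value the slaves are waiting for in this phase */
	static void master_advance_phase(struct set_mtrr_data *data, int gate_value)
	{
		while (atomic_read(&data->count))
			cpu_relax();

		atomic_set(&data->count, num_booting_cpus() - 1);
		smp_wmb();
		atomic_set(&data->gate, gate_value);
	}

	/* slave side: check in, then spin until the master flips the gate;
	 * the real handler alternates between waiting for gate==1 and gate==0 */
	static void slave_wait_for_phase(struct set_mtrr_data *data, int gate_value)
	{
		atomic_dec(&data->count);
		while (atomic_read(&data->gate) != gate_value)
			cpu_relax();
	}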
293 323
294/** 324/**
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 00000000000..34b4dad6f0b
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,63 @@
1/*
2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
35 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
36 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
37 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
38 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
39 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
40 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
41 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
42 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
43 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
44 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
45 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
46 { 0, 0, 0, 0, 0 }
47 };
48
49 for (cb = cpuid_bits; cb->feature; cb++) {
50
51 /* Verify that the level is valid */
52 max_level = cpuid_eax(cb->level & 0xffff0000);
53 if (max_level < cb->level ||
54 max_level > (cb->level | 0xffff))
55 continue;
56
57 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
58 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
59
60 if (regs[cb->reg] & (1 << cb->bit))
61 set_cpu_cap(c, cb->feature);
62 }
63}
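The new sub_leaf column is what allows entries such as X86_FEATURE_XSAVEOPT, which lives in leaf 0xd, sub-leaf 1, EAX bit 0; that is also why the loop now uses cpuid_count() instead of cpuid(). A stand-alone sketch of the same probe from user space (GCC/Clang <cpuid.h>; unlike the kernel loop it skips the max-leaf validation):

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* leaf 0xd, sub-leaf 1: EAX bit 0 advertises XSAVEOPT */
		__cpuid_count(0x0000000d, 1, eax, ebx, ecx, edx);
		printf("xsaveopt: %s\n", (eax & 1) ? "yes" : "no");
		return 0;
	}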
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a66..4397e987a1c 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
1/* 1/*
2 * Routines to indentify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
41 { 0, 0, 0, 0 }
42 };
43
44 for (cb = cpuid_bits; cb->feature; cb++) {
45
46 /* Verify that the level is valid */
47 max_level = cpuid_eax(cb->level & 0xffff0000);
48 if (max_level < cb->level ||
49 max_level > (cb->level | 0xffff))
50 continue;
51
52 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
53 &regs[CR_ECX], &regs[CR_EDX]);
54
55 if (regs[cb->reg] & (1 << cb->bit))
56 set_cpu_cap(c, cb->feature);
57 }
58}
59
60/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
61#define SMT_LEVEL 0 13#define SMT_LEVEL 0
62 14
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff58844..227b0448960 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
51 51
52static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
53{ 53{
54 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
55 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
56 56
57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
63 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
64 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
65 return tsc_hz; 72 return tsc_hz;
66} 73}
67 74
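The preset_lpj value computed above is simply the TSC rate converted from kHz to cycles per jiffy, sparing the guest the usual delay-loop calibration. A quick stand-alone restatement with example figures only (a 2 GHz TSC and HZ=1000):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t tsc_khz = 2000000;	/* example: 2 GHz TSC reported in kHz */
		unsigned int hz = 1000;		/* example CONFIG_HZ */
		uint64_t lpj = tsc_khz * 1000 / hz;

		printf("preset_lpj = %llu\n", (unsigned long long)lpj);	/* 2000000 */
		return 0;
	}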
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf15..227d00920d2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
611 * compensating for the offset by changing to the ESPFIX segment with 611 * compensating for the offset by changing to the ESPFIX segment with
612 * a base address that matches for the difference. 612 * a base address that matches for the difference.
613 */ 613 */
614#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
614 mov %esp, %edx /* load kernel esp */ 615 mov %esp, %edx /* load kernel esp */
615 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 616 mov PT_OLDESP(%esp), %eax /* load userspace esp */
616 mov %dx, %ax /* eax: new kernel esp */ 617 mov %dx, %ax /* eax: new kernel esp */
617 sub %eax, %edx /* offset (low word is 0) */ 618 sub %eax, %edx /* offset (low word is 0) */
618 PER_CPU(gdt_page, %ebx)
619 shr $16, %edx 619 shr $16, %edx
620 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ 620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ 621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 622 pushl $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 623 CFI_ADJUST_CFA_OFFSET 4
624 push %eax /* new kernel esp */ 624 push %eax /* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
791 * normal stack and adjusts ESP with the matching offset. 791 * normal stack and adjusts ESP with the matching offset.
792 */ 792 */
793 /* fixup the stack */ 793 /* fixup the stack */
794 PER_CPU(gdt_page, %ebx) 794 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
795 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ 795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
797 shl $16, %eax 796 shl $16, %eax
798 addl %esp, %eax /* the adjusted stack pointer */ 797 addl %esp, %eax /* the adjusted stack pointer */
799 pushl $__KERNEL_DS 798 pushl $__KERNEL_DS
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error)
914 .balign 4 913 .balign 4
915 .long 661b 914 .long 661b
916 .long 663f 915 .long 663f
917 .byte X86_FEATURE_XMM 916 .word X86_FEATURE_XMM
918 .byte 662b-661b 917 .byte 662b-661b
919 .byte 664f-663f 918 .byte 664f-663f
920.previous 919.previous
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
1166.previous 1165.previous
1167ENDPROC(xen_failsafe_callback) 1166ENDPROC(xen_failsafe_callback)
1168 1167
1168BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
1169 xen_evtchn_do_upcall)
1170
1169#endif /* CONFIG_XEN */ 1171#endif /* CONFIG_XEN */
1170 1172
1171#ifdef CONFIG_FUNCTION_TRACER 1173#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ff..c5ea5cdbe7b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
1065END(\sym) 1065END(\sym)
1066.endm 1066.endm
1067 1067
1068#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1068.macro paranoidzeroentry_ist sym do_sym ist 1069.macro paranoidzeroentry_ist sym do_sym ist
1069ENTRY(\sym) 1070ENTRY(\sym)
1070 INTR_FRAME 1071 INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %r12) 1080 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) 1082 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
1329 CFI_ENDPROC 1329 CFI_ENDPROC
1330END(xen_failsafe_callback) 1330END(xen_failsafe_callback)
1331 1331
1332apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1333 xen_hvm_callback_vector xen_evtchn_do_upcall
1334
1332#endif /* CONFIG_XEN */ 1335#endif /* CONFIG_XEN */
1333 1336
1334/* 1337/*
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e24603739..784360c0625 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
20 20
21static void __init i386_default_early_setup(void) 21static void __init i386_default_early_setup(void)
22{ 22{
23 /* Initilize 32bit specific setup functions */ 23 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms; 24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 25 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d8..ff4c453e13f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
131 movsl 131 movsl
1321: 1321:
133 133
134#ifdef CONFIG_OLPC_OPENFIRMWARE
135 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd)
138#endif
139
134#ifdef CONFIG_PARAVIRT 140#ifdef CONFIG_PARAVIRT
135 /* This can only trip for a broken bootloader... */ 141 /* This can only trip for a broken bootloader... */
136 cmpw $0x207, pa(boot_params + BP_version) 142 cmpw $0x207, pa(boot_params + BP_version)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a..239046bd447 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
234 * init data section till per cpu areas are set up. 234 * init data section till per cpu areas are set up.
235 */ 235 */
236 movl $MSR_GS_BASE,%ecx 236 movl $MSR_GS_BASE,%ecx
237 movq initial_gs(%rip),%rax 237 movl initial_gs(%rip),%eax
238 movq %rax,%rdx 238 movl initial_gs+4(%rip),%edx
239 shrq $32,%rdx
240 wrmsr 239 wrmsr
241 240
242 /* esi is pointer to real mode structure with interesting info. 241 /* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d73117..351f9c0fea1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17 17
18#define HPET_MASK CLOCKSOURCE_MASK(32) 18#define HPET_MASK CLOCKSOURCE_MASK(32)
19#define HPET_SHIFT 22
20 19
21/* FSEC = 10^-15 20/* FSEC = 10^-15
22 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
@@ -583,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
583 * scaled math multiplication factor for nanosecond to hpet tick 582 * scaled math multiplication factor for nanosecond to hpet tick
584 * conversion. 583 * conversion.
585 */ 584 */
586 hpet_freq = 1000000000000000ULL; 585 hpet_freq = FSEC_PER_SEC;
587 do_div(hpet_freq, hpet_period); 586 do_div(hpet_freq, hpet_period);
588 evt->mult = div_sc((unsigned long) hpet_freq, 587 evt->mult = div_sc((unsigned long) hpet_freq,
589 NSEC_PER_SEC, evt->shift); 588 NSEC_PER_SEC, evt->shift);
@@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = {
787 .rating = 250, 786 .rating = 250,
788 .read = read_hpet, 787 .read = read_hpet,
789 .mask = HPET_MASK, 788 .mask = HPET_MASK,
790 .shift = HPET_SHIFT,
791 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 789 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
792 .resume = hpet_resume_counter, 790 .resume = hpet_resume_counter,
793#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
@@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = {
798static int hpet_clocksource_register(void) 796static int hpet_clocksource_register(void)
799{ 797{
800 u64 start, now; 798 u64 start, now;
799 u64 hpet_freq;
801 cycle_t t1; 800 cycle_t t1;
802 801
803 /* Start the counter */ 802 /* Start the counter */
@@ -832,9 +831,15 @@ static int hpet_clocksource_register(void)
832 * mult = (hpet_period * 2^shift)/10^6 831 * mult = (hpet_period * 2^shift)/10^6
833 * mult = (hpet_period << shift)/FSEC_PER_NSEC 832 * mult = (hpet_period << shift)/FSEC_PER_NSEC
834 */ 833 */
835 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
836 834
837 clocksource_register(&clocksource_hpet); 835 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
836 *
837 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
838 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
839 */
840 hpet_freq = FSEC_PER_SEC;
841 do_div(hpet_freq, hpet_period);
842 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
838 843
839 return 0; 844 return 0;
840} 845}
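To make the fsec-to-Hz conversion concrete: the common 14.318 MHz HPET reports hpet_period of roughly 69,841,279 fsec per tick, and FSEC_PER_SEC divided by that gives about 14,318,180 Hz, the value handed to clocksource_register_hz(). The same arithmetic in isolation (example period only):

	#include <inttypes.h>
	#include <stdio.h>

	#define FSEC_PER_SEC 1000000000000000ULL	/* 10^15 femtoseconds */

	int main(void)
	{
		uint64_t hpet_period = 69841279;	/* fsec per HPET tick (example) */
		uint64_t hpet_freq = FSEC_PER_SEC / hpet_period;

		printf("HPET frequency: %" PRIu64 " Hz\n", hpet_freq);	/* ~14318180 */
		return 0;
	}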
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b3225..1f11f5ce668 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
59 stts(); 59 stts();
60} 60}
61 61
62void __cpuinit init_thread_xstate(void) 62static void __cpuinit init_thread_xstate(void)
63{ 63{
64 /*
65 * Note that xstate_size might be overwriten later during
66 * xsave_init().
67 */
68
64 if (!HAVE_HWFP) { 69 if (!HAVE_HWFP) {
65 xstate_size = sizeof(struct i387_soft_struct); 70 xstate_size = sizeof(struct i387_soft_struct);
66 return; 71 return;
67 } 72 }
68 73
69 if (cpu_has_xsave) {
70 xsave_cntxt_init();
71 return;
72 }
73
74 if (cpu_has_fxsr) 74 if (cpu_has_fxsr)
75 xstate_size = sizeof(struct i387_fxsave_struct); 75 xstate_size = sizeof(struct i387_fxsave_struct);
76#ifdef CONFIG_X86_32 76#ifdef CONFIG_X86_32
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void)
84 * Called at bootup to set up the initial FPU state that is later cloned 84 * Called at bootup to set up the initial FPU state that is later cloned
85 * into all processes. 85 * into all processes.
86 */ 86 */
87
87void __cpuinit fpu_init(void) 88void __cpuinit fpu_init(void)
88{ 89{
89 unsigned long oldcr0 = read_cr0(); 90 unsigned long oldcr0 = read_cr0();
@@ -93,21 +94,26 @@ void __cpuinit fpu_init(void)
93 94
94 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 95 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
95 96
96 /*
97 * Boot processor to setup the FP and extended state context info.
98 */
99 if (!smp_processor_id()) 97 if (!smp_processor_id())
100 init_thread_xstate(); 98 init_thread_xstate();
101 xsave_init();
102 99
103 mxcsr_feature_mask_init(); 100 mxcsr_feature_mask_init();
104 /* clean state in init */ 101 /* clean state in init */
105 current_thread_info()->status = 0; 102 current_thread_info()->status = 0;
106 clear_used_math(); 103 clear_used_math();
107} 104}
108#endif /* CONFIG_X86_64 */
109 105
110static void fpu_finit(struct fpu *fpu) 106#else /* CONFIG_X86_64 */
107
108void __cpuinit fpu_init(void)
109{
110 if (!smp_processor_id())
111 init_thread_xstate();
112}
113
114#endif /* CONFIG_X86_32 */
115
116void fpu_finit(struct fpu *fpu)
111{ 117{
112#ifdef CONFIG_X86_32 118#ifdef CONFIG_X86_32
113 if (!HAVE_HWFP) { 119 if (!HAVE_HWFP) {
@@ -132,6 +138,7 @@ static void fpu_finit(struct fpu *fpu)
132 fp->fos = 0xffff0000u; 138 fp->fos = 0xffff0000u;
133 } 139 }
134} 140}
141EXPORT_SYMBOL_GPL(fpu_finit);
135 142
136/* 143/*
137 * The _current_ task is using the FPU for the first time 144 * The _current_ task is using the FPU for the first time
@@ -190,6 +197,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
190 if (ret) 197 if (ret)
191 return ret; 198 return ret;
192 199
200 sanitize_i387_state(target);
201
193 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 202 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
194 &target->thread.fpu.state->fxsave, 0, -1); 203 &target->thread.fpu.state->fxsave, 0, -1);
195} 204}
@@ -207,6 +216,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
207 if (ret) 216 if (ret)
208 return ret; 217 return ret;
209 218
219 sanitize_i387_state(target);
220
210 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 221 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
211 &target->thread.fpu.state->fxsave, 0, -1); 222 &target->thread.fpu.state->fxsave, 0, -1);
212 223
@@ -446,6 +457,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
446 -1); 457 -1);
447 } 458 }
448 459
460 sanitize_i387_state(target);
461
449 if (kbuf && pos == 0 && count == sizeof(env)) { 462 if (kbuf && pos == 0 && count == sizeof(env)) {
450 convert_from_fxsr(kbuf, target); 463 convert_from_fxsr(kbuf, target);
451 return 0; 464 return 0;
@@ -467,6 +480,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
467 if (ret) 480 if (ret)
468 return ret; 481 return ret;
469 482
483 sanitize_i387_state(target);
484
470 if (!HAVE_HWFP) 485 if (!HAVE_HWFP)
471 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 486 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
472 487
@@ -533,6 +548,9 @@ static int save_i387_xsave(void __user *buf)
533 struct _fpstate_ia32 __user *fx = buf; 548 struct _fpstate_ia32 __user *fx = buf;
534 int err = 0; 549 int err = 0;
535 550
551
552 sanitize_i387_state(tsk);
553
536 /* 554 /*
537 * For legacy compatible, we always set FP/SSE bits in the bit 555 * For legacy compatible, we always set FP/SSE bits in the bit
538 * vector while saving the state to the user context. 556 * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae..ef10940e1af 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51 51
52/** 52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
54 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
55 * @regs: The &struct pt_regs of the current process.
56 *
57 * Convert the pt_regs in @regs into the format for registers that
58 * GDB expects, stored in @gdb_regs.
59 */
60void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
61{ 53{
62#ifndef CONFIG_X86_32 54#ifdef CONFIG_X86_32
63 u32 *gdb_regs32 = (u32 *)gdb_regs; 55 { "ax", 4, offsetof(struct pt_regs, ax) },
56 { "cx", 4, offsetof(struct pt_regs, cx) },
57 { "dx", 4, offsetof(struct pt_regs, dx) },
58 { "bx", 4, offsetof(struct pt_regs, bx) },
59 { "sp", 4, offsetof(struct pt_regs, sp) },
60 { "bp", 4, offsetof(struct pt_regs, bp) },
61 { "si", 4, offsetof(struct pt_regs, si) },
62 { "di", 4, offsetof(struct pt_regs, di) },
63 { "ip", 4, offsetof(struct pt_regs, ip) },
64 { "flags", 4, offsetof(struct pt_regs, flags) },
65 { "cs", 4, offsetof(struct pt_regs, cs) },
66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else
72 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) },
74 { "cx", 8, offsetof(struct pt_regs, cx) },
75 { "dx", 8, offsetof(struct pt_regs, dx) },
76 { "si", 8, offsetof(struct pt_regs, dx) },
77 { "di", 8, offsetof(struct pt_regs, di) },
78 { "bp", 8, offsetof(struct pt_regs, bp) },
79 { "sp", 8, offsetof(struct pt_regs, sp) },
80 { "r8", 8, offsetof(struct pt_regs, r8) },
81 { "r9", 8, offsetof(struct pt_regs, r9) },
82 { "r10", 8, offsetof(struct pt_regs, r10) },
83 { "r11", 8, offsetof(struct pt_regs, r11) },
84 { "r12", 8, offsetof(struct pt_regs, r12) },
85 { "r13", 8, offsetof(struct pt_regs, r13) },
86 { "r14", 8, offsetof(struct pt_regs, r14) },
87 { "r15", 8, offsetof(struct pt_regs, r15) },
88 { "ip", 8, offsetof(struct pt_regs, ip) },
89 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) },
64#endif 92#endif
65 gdb_regs[GDB_AX] = regs->ax; 93};
66 gdb_regs[GDB_BX] = regs->bx; 94
67 gdb_regs[GDB_CX] = regs->cx; 95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
68 gdb_regs[GDB_DX] = regs->dx; 96{
69 gdb_regs[GDB_SI] = regs->si; 97 if (
70 gdb_regs[GDB_DI] = regs->di;
71 gdb_regs[GDB_BP] = regs->bp;
72 gdb_regs[GDB_PC] = regs->ip;
73#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
74 gdb_regs[GDB_PS] = regs->flags; 99 regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
75 gdb_regs[GDB_DS] = regs->ds; 100#endif
76 gdb_regs[GDB_ES] = regs->es; 101 regno == GDB_SP || regno == GDB_ORIG_AX)
77 gdb_regs[GDB_CS] = regs->cs; 102 return 0;
78 gdb_regs[GDB_FS] = 0xFFFF; 103
79 gdb_regs[GDB_GS] = 0xFFFF; 104 if (dbg_reg_def[regno].offset != -1)
80 if (user_mode_vm(regs)) { 105 memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
81 gdb_regs[GDB_SS] = regs->ss; 106 dbg_reg_def[regno].size);
82 gdb_regs[GDB_SP] = regs->sp; 107 return 0;
83 } else { 108}
84 gdb_regs[GDB_SS] = __KERNEL_DS; 109
85 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
111{
112 if (regno == GDB_ORIG_AX) {
113 memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
114 return "orig_ax";
86 } 115 }
87#else 116 if (regno >= DBG_MAX_REG_NUM || regno < 0)
88 gdb_regs[GDB_R8] = regs->r8; 117 return NULL;
89 gdb_regs[GDB_R9] = regs->r9; 118
90 gdb_regs[GDB_R10] = regs->r10; 119 if (dbg_reg_def[regno].offset != -1)
91 gdb_regs[GDB_R11] = regs->r11; 120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
92 gdb_regs[GDB_R12] = regs->r12; 121 dbg_reg_def[regno].size);
93 gdb_regs[GDB_R13] = regs->r13; 122
94 gdb_regs[GDB_R14] = regs->r14; 123 switch (regno) {
95 gdb_regs[GDB_R15] = regs->r15; 124#ifdef CONFIG_X86_32
96 gdb_regs32[GDB_PS] = regs->flags; 125 case GDB_SS:
97 gdb_regs32[GDB_CS] = regs->cs; 126 if (!user_mode_vm(regs))
98 gdb_regs32[GDB_SS] = regs->ss; 127 *(unsigned long *)mem = __KERNEL_DS;
99 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 128 break;
129 case GDB_SP:
130 if (!user_mode_vm(regs))
131 *(unsigned long *)mem = kernel_stack_pointer(regs);
132 break;
133 case GDB_GS:
134 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF;
136 break;
100#endif 137#endif
138 }
139 return dbg_reg_def[regno].name;
101} 140}
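With the register layout described by dbg_reg_def[], the arch code no longer builds a monolithic gdb_regs[] image; the generic debugger core can walk the table and call dbg_get_reg()/dbg_set_reg() per register. A rough sketch of what such a consumer loop looks like (illustrative only, not the actual kgdb core code):

	/*
	 * Serialize all registers by walking dbg_reg_def[]. 'regs' is the
	 * trapped context; 'buf' must hold the sum of dbg_reg_def[i].size.
	 */
	static void collect_all_regs(struct pt_regs *regs, void *buf)
	{
		char *p = buf;
		int i;

		for (i = 0; i < DBG_MAX_REG_NUM; i++) {
			dbg_get_reg(i, p, regs);
			p += dbg_reg_def[i].size;
		}
	}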
102 141
103/** 142/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
150 gdb_regs[GDB_SP] = p->thread.sp; 189 gdb_regs[GDB_SP] = p->thread.sp;
151} 190}
152 191
153/**
154 * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
155 * @gdb_regs: A pointer to hold the registers we've received from GDB.
156 * @regs: A pointer to a &struct pt_regs to hold these values in.
157 *
158 * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
159 * in @regs.
160 */
161void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
162{
163#ifndef CONFIG_X86_32
164 u32 *gdb_regs32 = (u32 *)gdb_regs;
165#endif
166 regs->ax = gdb_regs[GDB_AX];
167 regs->bx = gdb_regs[GDB_BX];
168 regs->cx = gdb_regs[GDB_CX];
169 regs->dx = gdb_regs[GDB_DX];
170 regs->si = gdb_regs[GDB_SI];
171 regs->di = gdb_regs[GDB_DI];
172 regs->bp = gdb_regs[GDB_BP];
173 regs->ip = gdb_regs[GDB_PC];
174#ifdef CONFIG_X86_32
175 regs->flags = gdb_regs[GDB_PS];
176 regs->ds = gdb_regs[GDB_DS];
177 regs->es = gdb_regs[GDB_ES];
178 regs->cs = gdb_regs[GDB_CS];
179#else
180 regs->r8 = gdb_regs[GDB_R8];
181 regs->r9 = gdb_regs[GDB_R9];
182 regs->r10 = gdb_regs[GDB_R10];
183 regs->r11 = gdb_regs[GDB_R11];
184 regs->r12 = gdb_regs[GDB_R12];
185 regs->r13 = gdb_regs[GDB_R13];
186 regs->r14 = gdb_regs[GDB_R14];
187 regs->r15 = gdb_regs[GDB_R15];
188 regs->flags = gdb_regs32[GDB_PS];
189 regs->cs = gdb_regs32[GDB_CS];
190 regs->ss = gdb_regs32[GDB_SS];
191#endif
192}
193
194static struct hw_breakpoint { 192static struct hw_breakpoint {
195 unsigned enabled; 193 unsigned enabled;
196 unsigned long addr; 194 unsigned long addr;
197 int len; 195 int len;
198 int type; 196 int type;
199 struct perf_event **pev; 197 struct perf_event **pev;
200} breakinfo[4]; 198} breakinfo[HBP_NUM];
201 199
202static unsigned long early_dr7; 200static unsigned long early_dr7;
203 201
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
205{ 203{
206 int breakno; 204 int breakno;
207 205
208 for (breakno = 0; breakno < 4; breakno++) { 206 for (breakno = 0; breakno < HBP_NUM; breakno++) {
209 struct perf_event *bp; 207 struct perf_event *bp;
210 struct arch_hw_breakpoint *info; 208 struct arch_hw_breakpoint *info;
211 int val; 209 int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
292{ 290{
293 int i; 291 int i;
294 292
295 for (i = 0; i < 4; i++) 293 for (i = 0; i < HBP_NUM; i++)
296 if (breakinfo[i].addr == addr && breakinfo[i].enabled) 294 if (breakinfo[i].addr == addr && breakinfo[i].enabled)
297 break; 295 break;
298 if (i == 4) 296 if (i == HBP_NUM)
299 return -1; 297 return -1;
300 298
301 if (hw_break_release_slot(i)) { 299 if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
313 int cpu = raw_smp_processor_id(); 311 int cpu = raw_smp_processor_id();
314 struct perf_event *bp; 312 struct perf_event *bp;
315 313
316 for (i = 0; i < 4; i++) { 314 for (i = 0; i < HBP_NUM; i++) {
317 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
318 continue; 316 continue;
319 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
333{ 331{
334 int i; 332 int i;
335 333
336 for (i = 0; i < 4; i++) 334 for (i = 0; i < HBP_NUM; i++)
337 if (!breakinfo[i].enabled) 335 if (!breakinfo[i].enabled)
338 break; 336 break;
339 if (i == 4) 337 if (i == HBP_NUM)
340 return -1; 338 return -1;
341 339
342 switch (bptype) { 340 switch (bptype) {
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
397 395
398 /* Disable hardware debugging while we are in kgdb: */ 396 /* Disable hardware debugging while we are in kgdb: */
399 set_debugreg(0UL, 7); 397 set_debugreg(0UL, 7);
400 for (i = 0; i < 4; i++) { 398 for (i = 0; i < HBP_NUM; i++) {
401 if (!breakinfo[i].enabled) 399 if (!breakinfo[i].enabled)
402 continue; 400 continue;
403 if (dbg_is_early) { 401 if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
458{ 456{
459 unsigned long addr; 457 unsigned long addr;
460 char *ptr; 458 char *ptr;
461 int newPC;
462 459
463 switch (remcomInBuffer[0]) { 460 switch (remcomInBuffer[0]) {
464 case 'c': 461 case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
469 linux_regs->ip = addr; 466 linux_regs->ip = addr;
470 case 'D': 467 case 'D':
471 case 'k': 468 case 'k':
472 newPC = linux_regs->ip;
473
474 /* clear the trace bit */ 469 /* clear the trace bit */
475 linux_regs->flags &= ~X86_EFLAGS_TF; 470 linux_regs->flags &= ~X86_EFLAGS_TF;
476 atomic_set(&kgdb_cpu_doing_single_step, -1); 471 atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -645,7 +640,7 @@ void kgdb_arch_late(void)
645 attr.bp_len = HW_BREAKPOINT_LEN_1; 640 attr.bp_len = HW_BREAKPOINT_LEN_1;
646 attr.bp_type = HW_BREAKPOINT_W; 641 attr.bp_type = HW_BREAKPOINT_W;
647 attr.disabled = 1; 642 attr.disabled = 1;
648 for (i = 0; i < 4; i++) { 643 for (i = 0; i < HBP_NUM; i++) {
649 if (breakinfo[i].pev) 644 if (breakinfo[i].pev)
650 continue; 645 continue;
651 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 5915e0b3330..79ae68154e8 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
25#include <asm/i8259.h> 25#include <asm/i8259.h>
26#include <asm/apb_timer.h> 26#include <asm/apb_timer.h>
27 27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; 51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; 52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
30int sfi_mtimer_num; 56int sfi_mtimer_num;
31 57
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; 58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 return 0; 193 return 0;
168} 194}
169 195
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void) 196static unsigned long __init mrst_calibrate_tsc(void)
183{ 197{
184 unsigned long flags, fast_calibrate; 198 unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
195 209
196void __init mrst_time_init(void) 210void __init mrst_time_init(void)
197{ 211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0(); 228 pre_init_apic_IRQ0();
200 apbt_time_init(); 229 apbt_time_init();
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void)
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206} 235}
207 236
208/* 237void __cpuinit mrst_arch_setup(void)
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{ 238{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); 239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
215 if (disable_apbt_percpu) 240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
216 setup_boot_APIC_clock(); 241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
217}; 242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
218 252
219/* MID systems don't have i8042 controller */ 253/* MID systems don't have i8042 controller */
220static int mrst_i8042_detect(void) 254static int mrst_i8042_detect(void)
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void)
232 x86_init.resources.reserve_resources = x86_init_noop; 266 x86_init.resources.reserve_resources = x86_init_noop;
233 267
234 x86_init.timers.timer_init = mrst_time_init; 268 x86_init.timers.timer_init = mrst_time_init;
235 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; 269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
236 270
237 x86_init.irqs.pre_vector_init = x86_init_noop; 271 x86_init.irqs.pre_vector_init = x86_init_noop;
238 272
239 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; 273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
240 276
241 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
242 x86_platform.i8042_detect = mrst_i8042_detect; 278 x86_platform.i8042_detect = mrst_i8042_detect;
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void)
250 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
251 287
252} 288}
289
290/*
291 * if user does not want to use per CPU apb timer, just give it a lower rating
292 * than local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
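The override is given on the kernel command line at boot; for example (illustrative), forcing the mixed configuration described in the comment at the top of this file:

	x86_mrst_timer=lapic_and_apbt

Without the option, mrst_time_init() keeps the per-cpu LAPIC clockevents only when ARAT is available, and otherwise falls back to setting up the APB timers.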
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b..0e0cdde519b 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
21#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h> 22#include <asm/setup.h>
23#include <asm/olpc.h> 23#include <asm/olpc.h>
24 24#include <asm/olpc_ofw.h>
25#ifdef CONFIG_OPEN_FIRMWARE
26#include <asm/ofw.h>
27#endif
28 25
29struct olpc_platform_t olpc_platform_info; 26struct olpc_platform_t olpc_platform_info;
30EXPORT_SYMBOL_GPL(olpc_platform_info); 27EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
145 * The OBF flag will sometimes misbehave due to what we believe 142 * The OBF flag will sometimes misbehave due to what we believe
146 * is a hardware quirk.. 143 * is a hardware quirk..
147 */ 144 */
148 printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); 145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
149 outb(cmd, 0x6c); 146 outb(cmd, 0x6c);
150 147
151 if (wait_on_ibf(0x6c, 0)) { 148 if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
162 " EC accept data!\n"); 159 " EC accept data!\n");
163 goto err; 160 goto err;
164 } 161 }
165 printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", 162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
166 inbuf[i]);
167 outb(inbuf[i], 0x68); 163 outb(inbuf[i], 0x68);
168 } 164 }
169 } 165 }
@@ -176,8 +172,7 @@ restart:
176 goto restart; 172 goto restart;
177 } 173 }
178 outbuf[i] = inb(0x68); 174 outbuf[i] = inb(0x68);
179 printk(KERN_DEBUG "olpc-ec: received 0x%x\n", 175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
180 outbuf[i]);
181 } 176 }
182 } 177 }
183 178
@@ -188,14 +183,15 @@ err:
188} 183}
189EXPORT_SYMBOL_GPL(olpc_ec_cmd); 184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190 185
191#ifdef CONFIG_OPEN_FIRMWARE 186#ifdef CONFIG_OLPC_OPENFIRMWARE
192static void __init platform_detect(void) 187static void __init platform_detect(void)
193{ 188{
194 size_t propsize; 189 size_t propsize;
195 __be32 rev; 190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
196 193
197 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
198 &propsize) || propsize != 4) {
199 printk(KERN_ERR "ofw: getprop call failed!\n"); 195 printk(KERN_ERR "ofw: getprop call failed!\n");
200 rev = cpu_to_be32(0); 196 rev = cpu_to_be32(0);
201 } 197 }
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 00000000000..3218aa71ab5
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
38 /* implicit optimization barrier here due to uninline function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
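The calling convention seen from a client: argument cells and result slots are passed as arrays, and __olpc_ofw() copies the returned cells back through the res[] pointers; the caller in olpc.c above goes through an olpc_ofw() wrapper of the same shape. A hedged sketch of fetching a hypothetical property straight through the raw entry point (the property name and buffer handling here are made up for illustration):

	/*
	 * Sketch only: query a hypothetical "model" property. The argument
	 * cells mirror the getprop call in olpc.c: phandle (NULL there and
	 * here), property name, output buffer, buffer length.
	 */
	static int example_get_model(char *buf, int len)
	{
		size_t propsize;
		const void *args[] = { NULL, "model", buf, (void *)(long)len };
		void *res[] = { &propsize };

		if (__olpc_ofw("getprop", 4, args, 1, res))
			return -EIO;

		return (int)propsize;
	}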
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01d..9f07cfcbd3a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -13,6 +13,7 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h> 14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 15#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h>
16 17
17static int forbid_dac __read_mostly; 18static int forbid_dac __read_mostly;
18 19
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void)
132 /* free the range so iommu could get some range less than 4G */ 133 /* free the range so iommu could get some range less than 4G */
133 dma32_free_bootmem(); 134 dma32_free_bootmem();
134 135
135 if (pci_swiotlb_detect()) 136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
136 goto out; 137 goto out;
137 138
138 gart_iommu_hole_init(); 139 gart_iommu_hole_init();
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void)
144 /* needs to be called after gart_iommu_hole_init */ 145 /* needs to be called after gart_iommu_hole_init */
145 amd_iommu_detect(); 146 amd_iommu_detect();
146out: 147out:
148 pci_xen_swiotlb_init();
149
147 pci_swiotlb_init(); 150 pci_swiotlb_init();
148} 151}
149 152
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void)
296#endif 299#endif
297 x86_init.iommu.iommu_init(); 300 x86_init.iommu.iommu_init();
298 301
299 if (swiotlb) { 302 if (swiotlb || xen_swiotlb) {
300 printk(KERN_INFO "PCI-DMA: " 303 printk(KERN_INFO "PCI-DMA: "
301 "Using software bounce buffering for IO (SWIOTLB)\n"); 304 "Using software bounce buffering for IO (SWIOTLB)\n");
302 swiotlb_print_info(); 305 swiotlb_print_info();
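pci_iommu_alloc() now tries the Xen detection first and falls back to the native SWIOTLB check, and xen_swiotlb joins swiotlb in the bounce-buffering message above. The new <asm/xen/swiotlb-xen.h> header (14 lines per the diffstat) is not reproduced in this section; the sketch below is a plausible reconstruction of what it needs to provide for both configurations, and its exact contents are an assumption:

/* asm/xen/swiotlb-xen.h -- hypothetical reconstruction */
#ifdef CONFIG_SWIOTLB_XEN
extern int xen_swiotlb;
extern int __init pci_xen_swiotlb_detect(void);
extern void __init pci_xen_swiotlb_init(void);
#else
#define xen_swiotlb (0)
static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
static inline void __init pci_xen_swiotlb_init(void) { }
#endif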
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 787572d43d9..d401f1d2d06 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait); 28EXPORT_SYMBOL(idle_nomwait);
29 29
30struct kmem_cache *task_xstate_cachep; 30struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep);
31 32
32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
33{ 34{
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
525 return (edx & MWAIT_EDX_C1); 526 return (edx & MWAIT_EDX_C1);
526} 527}
527 528
528/* 529bool c1e_detected;
529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 530EXPORT_SYMBOL(c1e_detected);
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
533 */
534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
535{
536 u64 val;
537 if (c->x86_vendor != X86_VENDOR_AMD)
538 goto no_c1e_idle;
539
540 /* Family 0x0f models < rev F do not have C1E */
541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
542 return 1;
543
544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 if (val >= 2) {
552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
556 }
557 return 1;
558 }
559
560no_c1e_idle:
561 return 0;
562}
563 531
564static cpumask_var_t c1e_mask; 532static cpumask_var_t c1e_mask;
565static int c1e_detected;
566 533
567void c1e_remove_cpu(int cpu) 534void c1e_remove_cpu(int cpu)
568{ 535{
@@ -584,12 +551,12 @@ static void c1e_idle(void)
584 u32 lo, hi; 551 u32 lo, hi;
585 552
586 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 553 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
554
587 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 555 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
588 c1e_detected = 1; 556 c1e_detected = true;
589 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 557 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
590 mark_tsc_unstable("TSC halt in AMD C1E"); 558 mark_tsc_unstable("TSC halt in AMD C1E");
591 printk(KERN_INFO "System has AMD C1E enabled\n"); 559 printk(KERN_INFO "System has AMD C1E enabled\n");
592 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
593 } 560 }
594 } 561 }
595 562
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
638 */ 605 */
639 printk(KERN_INFO "using mwait in idle threads.\n"); 606 printk(KERN_INFO "using mwait in idle threads.\n");
640 pm_idle = mwait_idle; 607 pm_idle = mwait_idle;
641 } else if (check_c1e_idle(c)) { 608 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
609 /* E400: APIC timer interrupt does not wake up CPU from C1e */
642 printk(KERN_INFO "using C1E aware idle routine\n"); 610 printk(KERN_INFO "using C1E aware idle routine\n");
643 pm_idle = c1e_idle; 611 pm_idle = c1e_idle;
644 } else 612 } else
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd03..b008e788320 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
102 102
103#include <asm/paravirt.h> 103#include <asm/paravirt.h>
104#include <asm/hypervisor.h> 104#include <asm/hypervisor.h>
105#include <asm/olpc_ofw.h>
105 106
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p)
736 /* VMI may relocate the fixmap; do this before touching ioremap area */ 737 /* VMI may relocate the fixmap; do this before touching ioremap area */
737 vmi_init(); 738 vmi_init();
738 739
740 /* OFW also may relocate the fixmap */
741 olpc_ofw_detect();
742
739 early_trap_init(); 743 early_trap_init();
740 early_cpu_init(); 744 early_cpu_init();
741 early_ioremap_init(); 745 early_ioremap_init();
742 746
747 setup_olpc_ofw_pgd();
748
743 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 749 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
744 screen_info = boot_params.screen_info; 750 screen_info = boot_params.screen_info;
745 edid_info = boot_params.edid_info; 751 edid_info = boot_params.edid_info;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d..a5e928b0cb5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
735 goto do_rest; 735 goto do_rest;
736 } 736 }
737 737
738 if (!keventd_up() || current_is_keventd()) 738 schedule_work(&c_idle.work);
739 c_idle.work.func(&c_idle.work); 739 wait_for_completion(&c_idle.done);
740 else {
741 schedule_work(&c_idle.work);
742 wait_for_completion(&c_idle.done);
743 }
744 740
745 if (IS_ERR(c_idle.idle)) { 741 if (IS_ERR(c_idle.idle)) {
746 printk("failed fork for CPU %d\n", cpu); 742 printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +812,13 @@ do_rest:
816 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 812 if (cpumask_test_cpu(cpu, cpu_callin_mask))
817 break; /* It has booted */ 813 break; /* It has booted */
818 udelay(100); 814 udelay(100);
815 /*
816 * Allow other tasks to run while we wait for the
817 * AP to come online. This also gives a chance
 818 * for the MTRR work (triggered by the AP coming online)
819 * to be completed in the stop machine context.
820 */
821 schedule();
819 } 822 }
820 823
821 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 824 if (cpumask_test_cpu(cpu, cpu_callin_mask))
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b372934121..b35786dc9b8 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg 339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
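These three entries extend the 32-bit syscall table; sys_rt_tgsigqueueinfo is 335 above, so the matching definitions in <asm/unistd_32.h> (touched in the diffstat but not shown here) would presumably be:

#define __NR_fanotify_init      338
#define __NR_fanotify_mark      339
#define __NR_prlimit64          340

with NR_syscalls presumably bumped to 341 to match.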
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae184..ce8e5023933 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
751 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc, 752 .resume = resume_tsc,
753 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
754 .shift = 22,
755 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 754 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
756 CLOCK_SOURCE_MUST_VERIFY, 755 CLOCK_SOURCE_MUST_VERIFY,
757#ifdef CONFIG_X86_64 756#ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
845 844
846static void __init init_tsc_clocksource(void) 845static void __init init_tsc_clocksource(void)
847{ 846{
848 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
849 clocksource_tsc.shift);
850 if (tsc_clocksource_reliable) 847 if (tsc_clocksource_reliable)
851 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 848 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
852 /* lower the rating if we already know its unstable: */ 849 /* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
854 clocksource_tsc.rating = 0; 851 clocksource_tsc.rating = 0;
855 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 852 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
856 } 853 }
857 clocksource_register(&clocksource_tsc); 854 clocksource_register_khz(&clocksource_tsc, tsc_khz);
858} 855}
859 856
860#ifdef CONFIG_X86_64 857#ifdef CONFIG_X86_64
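With the fixed .shift and the manual clocksource_khz2mult() call gone, mult/shift selection for the TSC is left to the clocksource core. A driver following the same pattern would register roughly as in the sketch below; the names and the hardware read are illustrative only:

static cycle_t read_example(struct clocksource *cs)
{
        return (cycle_t)example_read_counter();  /* hypothetical hardware read */
}

static struct clocksource example_cs = {
        .name   = "example",
        .rating = 300,
        .read   = read_example,
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_cs_init(void)
{
        /* the core derives mult/shift from the rate; no .mult/.shift here */
        return clocksource_register_khz(&example_cs, example_rate_khz);
}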
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a..56a8c2a867d 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/msr-index.h>
34 35
35verify_cpu: 36verify_cpu:
36 pushfl # Save caller passed flags 37 pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
88 je verify_cpu_sse_ok 89 je verify_cpu_sse_ok
89 test %di,%di 90 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD 91 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR 92 movl $MSR_K7_HWCR,%ecx
92 rdmsr 93 rdmsr
93 btr $15,%eax # enable SSE 94 btr $15,%eax # enable SSE
94 wrmsr 95 wrmsr
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60..dcbb28c4b69 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, 76void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
77 u32 mult) 77 struct clocksource *clock, u32 mult)
78{ 78{
79 unsigned long flags; 79 unsigned long flags;
80 80
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
87 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
90 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 90 vsyscall_gtod_data.wall_to_monotonic = *wtm;
91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
93} 93}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
169 * unlikely */ 169 * unlikely */
170time_t __vsyscall(1) vtime(time_t *t) 170time_t __vsyscall(1) vtime(time_t *t)
171{ 171{
172 struct timeval tv; 172 unsigned seq;
173 time_t result; 173 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
175 return time_syscall(t); 175 return time_syscall(t);
176 176
177 vgettimeofday(&tv, NULL); 177 do {
178 result = tv.tv_sec; 178 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
179
180 result = __vsyscall_gtod_data.wall_time_sec;
181
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
183
179 if (t) 184 if (t)
180 *t = result; 185 *t = result;
181 return result; 186 return result;
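vtime() now reads wall_time_sec directly under the vsyscall seqlock instead of going through vgettimeofday(), retrying if a writer updates the data mid-read. The reader side of that pattern, as a standalone sketch:

/* Sketch: lockless read of a value published under a seqlock_t. */
static u64 seqlock_read_value(seqlock_t *lock, const u64 *value)
{
        unsigned seq;
        u64 v;

        do {
                seq = read_seqbegin(lock);
                v = *value;                     /* speculative read */
        } while (read_seqretry(lock, seq));     /* retry if a writer raced us */

        return v;
}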
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24..9c253bd65e2 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
16 */ 16 */
17u64 pcntxt_mask; 17u64 pcntxt_mask;
18 18
19/*
20 * Represents init state for the supported extended state.
21 */
22static struct xsave_struct *init_xstate_buf;
23
19struct _fpx_sw_bytes fx_sw_reserved; 24struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION 25#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32; 26struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif 27#endif
23 28
29static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
30
31/*
 32 * If a processor implementation discerns that a processor state component is
 33 * in its initialized state, it may clear the corresponding bit in
 34 * xsave_hdr.xstate_bv to '0' without modifying the corresponding memory
35 * layout in the case of xsaveopt. While presenting the xstate information to
36 * the user, we always ensure that the memory layout of a feature will be in
37 * the init state if the corresponding header bit is zero. This is to ensure
38 * that the user doesn't see some stale state in the memory layout during
39 * signal handling, debugging etc.
40 */
41void __sanitize_i387_state(struct task_struct *tsk)
42{
43 u64 xstate_bv;
44 int feature_bit = 0x2;
45 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
46
47 if (!fx)
48 return;
49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53
54 /*
55 * None of the feature bits are in init state. So nothing else
 56 * to do for us, as the memory layout is up to date.
57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return;
60
61 /*
62 * FP is in init state
63 */
64 if (!(xstate_bv & XSTATE_FP)) {
65 fx->cwd = 0x37f;
66 fx->swd = 0;
67 fx->twd = 0;
68 fx->fop = 0;
69 fx->rip = 0;
70 fx->rdp = 0;
71 memset(&fx->st_space[0], 0, 128);
72 }
73
74 /*
75 * SSE is in init state
76 */
77 if (!(xstate_bv & XSTATE_SSE))
78 memset(&fx->xmm_space[0], 0, 256);
79
80 xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
81
82 /*
83 * Update all the other memory layouts for which the corresponding
84 * header bit is in the init state.
85 */
86 while (xstate_bv) {
87 if (xstate_bv & 0x1) {
88 int offset = xstate_offsets[feature_bit];
89 int size = xstate_sizes[feature_bit];
90
91 memcpy(((void *) fx) + offset,
92 ((void *) init_xstate_buf) + offset,
93 size);
94 }
95
96 xstate_bv >>= 1;
97 feature_bit++;
98 }
99}
100
24/* 101/*
25 * Check for the presence of extended state information in the 102 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext. 103 * user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
36 113
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 114 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes)); 115 sizeof(struct _fpx_sw_bytes));
39
40 if (err) 116 if (err)
41 return err; 117 return -EFAULT;
42 118
43 /* 119 /*
44 * First Magic check failed. 120 * First Magic check failed.
45 */ 121 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 122 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1; 123 return -EINVAL;
48 124
49 /* 125 /*
50 * Check for error scenarios. 126 * Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
52 if (fx_sw_user->xstate_size < min_xstate_size || 128 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size || 129 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size) 130 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1; 131 return -EINVAL;
56 132
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) + 133 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size - 134 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE)); 135 FP_XSTATE_MAGIC2_SIZE));
136 if (err)
137 return err;
60 /* 138 /*
61 * Check for the presence of second magic word at the end of memory 139 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy 140 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout with out copying the extended state information 141 * fpstate layout with out copying the extended state information
64 * in the memory layout. 142 * in the memory layout.
65 */ 143 */
66 if (err || magic2 != FP_XSTATE_MAGIC2) 144 if (magic2 != FP_XSTATE_MAGIC2)
67 return -1; 145 return -EFAULT;
68 146
69 return 0; 147 return 0;
70} 148}
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf)
91 return 0; 169 return 0;
92 170
93 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (use_xsave()) 172 if (use_xsave())
103 err = xsave_user(buf); 173 err = xsave_user(buf);
104 else 174 else
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 180 stts();
111 } else { 181 } else {
182 sanitize_i387_state(tsk);
112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 184 xstate_size))
114 return -1; 185 return -1;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
184 * init the state skipped by the user. 255 * init the state skipped by the user.
185 */ 256 */
186 mask = pcntxt_mask & ~mask; 257 mask = pcntxt_mask & ~mask;
187 258 if (unlikely(mask))
188 xrstor_state(init_xstate_buf, mask); 259 xrstor_state(init_xstate_buf, mask);
189 260
190 return 0; 261 return 0;
191 262
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
274#endif 345#endif
275} 346}
276 347
277/*
278 * Represents init state for the supported extended state.
279 */
280struct xsave_struct *init_xstate_buf;
281
282#ifdef CONFIG_X86_64 348#ifdef CONFIG_X86_64
283unsigned int sig_xstate_size = sizeof(struct _fpstate); 349unsigned int sig_xstate_size = sizeof(struct _fpstate);
284#endif 350#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
286/* 352/*
287 * Enable the extended processor state save/restore feature 353 * Enable the extended processor state save/restore feature
288 */ 354 */
289void __cpuinit xsave_init(void) 355static inline void xstate_enable(void)
290{ 356{
291 if (!cpu_has_xsave)
292 return;
293
294 set_in_cr4(X86_CR4_OSXSAVE); 357 set_in_cr4(X86_CR4_OSXSAVE);
295
296 /*
297 * Enable all the features that the HW is capable of
298 * and the Linux kernel is aware of.
299 */
300 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); 358 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
301} 359}
302 360
303/* 361/*
362 * Record the offsets and sizes of different state managed by the xsave
363 * memory layout.
364 */
365static void __init setup_xstate_features(void)
366{
367 int eax, ebx, ecx, edx, leaf = 0x2;
368
369 xstate_features = fls64(pcntxt_mask);
370 xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
371 xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
372
373 do {
374 cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
375
376 if (eax == 0)
377 break;
378
379 xstate_offsets[leaf] = ebx;
380 xstate_sizes[leaf] = eax;
381
382 leaf++;
383 } while (1);
384}
385
386/*
304 * setup the xstate image representing the init state 387 * setup the xstate image representing the init state
305 */ 388 */
306static void __init setup_xstate_init(void) 389static void __init setup_xstate_init(void)
307{ 390{
391 setup_xstate_features();
392
393 /*
394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave
396 */
308 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem(xstate_size);
309 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399
400 clts();
401 /*
402 * Init all the features state with header_bv being 0x0
403 */
404 xrstor_state(init_xstate_buf, -1);
405 /*
406 * Dump the init state again. This is to identify the init state
 407 * of any feature which is not represented by all zeros.
408 */
409 xsave_state(init_xstate_buf, -1);
410 stts();
310} 411}
311 412
312/* 413/*
313 * Enable and initialize the xsave feature. 414 * Enable and initialize the xsave feature.
314 */ 415 */
315void __ref xsave_cntxt_init(void) 416static void __init xstate_enable_boot_cpu(void)
316{ 417{
317 unsigned int eax, ebx, ecx, edx; 418 unsigned int eax, ebx, ecx, edx;
318 419
319 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 420 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
421 WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
422 return;
423 }
424
425 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
320 pcntxt_mask = eax + ((u64)edx << 32); 426 pcntxt_mask = eax + ((u64)edx << 32);
321 427
322 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 428 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
329 * Support only the state known to OS. 435 * Support only the state known to OS.
330 */ 436 */
331 pcntxt_mask = pcntxt_mask & XCNTXT_MASK; 437 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
332 xsave_init(); 438
439 xstate_enable();
333 440
334 /* 441 /*
335 * Recompute the context size for enabled features 442 * Recompute the context size for enabled features
336 */ 443 */
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 444 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 445 xstate_size = ebx;
339 446
340 update_regset_xstate_info(xstate_size, pcntxt_mask); 447 update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
346 "cntxt size 0x%x\n", 453 "cntxt size 0x%x\n",
347 pcntxt_mask, xstate_size); 454 pcntxt_mask, xstate_size);
348} 455}
456
457/*
458 * For the very first instance, this calls xstate_enable_boot_cpu();
459 * for all subsequent instances, this calls xstate_enable().
460 *
461 * This is somewhat obfuscated due to the lack of powerful enough
462 * overrides for the section checks.
463 */
464void __cpuinit xsave_init(void)
465{
466 static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
467 void (*this_func)(void);
468
469 if (!cpu_has_xsave)
470 return;
471
472 this_func = next_func;
473 next_func = xstate_enable;
474 this_func();
475}
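The rewritten xsave_init() replaces itself after the first call: the boot CPU takes the heavyweight detection path once, and every later caller (secondary CPUs, resume) only re-enables the feature via xstate_enable(). Reduced to its bare shape, with illustrative names rather than kernel API:

static void light_path(void)
{
        /* cheap per-CPU enable only */
}

static void heavy_boot_path(void)
{
        /* one-time detection and buffer setup, then enable */
        light_path();
}

void feature_init(void)
{
        static void (*next_func)(void) = heavy_boot_path;
        void (*this_func)(void) = next_func;

        next_func = light_path;         /* all later calls take the cheap path */
        this_func();
}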
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed..b38bd8b92aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * 13 *
13 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 68#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
68#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 69#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 70#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */
70#define SrcMask (0xf<<4) 74#define SrcMask (0xf<<4)
71/* Generic ModRM decode. */ 75/* Generic ModRM decode. */
72#define ModRM (1<<8) 76#define ModRM (1<<8)
@@ -88,10 +92,6 @@
88#define Src2CL (1<<29) 92#define Src2CL (1<<29)
89#define Src2ImmByte (2<<29) 93#define Src2ImmByte (2<<29)
90#define Src2One (3<<29) 94#define Src2One (3<<29)
91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
95#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
96 96
97enum { 97enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
124 /* 0x20 - 0x27 */ 124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */ 128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 0, 0, 0, 0, 131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */ 132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 0, 0, 0, 0, 135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */ 136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
170 /* 0x88 - 0x8F */ 170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, 173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 DstReg | SrcMem | ModRM | Mov, Group | Group1A, 174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */ 175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */ 177 /* 0x98 - 0x9F */
178 0, 0, SrcImm | Src2Imm16 | No64, 0, 178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String, 188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
337 [Group1A*8] = 337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] = 339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, 0, 340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0, 342 0, 0, 0, 0,
343 [Group3*8] = 343 [Group3*8] =
344 DstMem | SrcImm | ModRM, 0, 344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0, 346 0, 0, 0, 0,
347 [Group4*8] = 347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0,
350 [Group5*8] = 350 [Group5*8] =
351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, 353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0, 354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] = 355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
576 (_type)_x; \ 576 (_type)_x; \
577}) 577})
578 578
579#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \
582 goto done; \
583 (_eip) += (_size); \
584})
585
579static inline unsigned long ad_mask(struct decode_cache *c) 586static inline unsigned long ad_mask(struct decode_cache *c)
580{ 587{
581 return (1UL << (c->ad_bytes << 3)) - 1; 588 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
617 c->seg_override = seg; 624 c->seg_override = seg;
618} 625}
619 626
620static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) 627static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
628 struct x86_emulate_ops *ops, int seg)
621{ 629{
622 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
623 return 0; 631 return 0;
624 632
625 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); 633 return ops->get_cached_segment_base(seg, ctxt->vcpu);
626} 634}
627 635
628static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops,
629 struct decode_cache *c) 638 struct decode_cache *c)
630{ 639{
631 if (!c->has_seg_override) 640 if (!c->has_seg_override)
632 return 0; 641 return 0;
633 642
634 return seg_base(ctxt, c->seg_override); 643 return seg_base(ctxt, ops, c->seg_override);
644}
645
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
647 struct x86_emulate_ops *ops)
648{
649 return seg_base(ctxt, ops, VCPU_SREG_ES);
650}
651
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
653 struct x86_emulate_ops *ops)
654{
655 return seg_base(ctxt, ops, VCPU_SREG_SS);
656}
657
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
659 u32 error, bool valid)
660{
661 ctxt->exception = vec;
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665}
666
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
668{
669 emulate_exception(ctxt, GP_VECTOR, err, true);
635} 670}
636 671
637static unsigned long es_base(struct x86_emulate_ctxt *ctxt) 672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
673 int err)
638{ 674{
639 return seg_base(ctxt, VCPU_SREG_ES); 675 ctxt->cr2 = addr;
676 emulate_exception(ctxt, PF_VECTOR, err, true);
640} 677}
641 678
642static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) 679static void emulate_ud(struct x86_emulate_ctxt *ctxt)
643{ 680{
644 return seg_base(ctxt, VCPU_SREG_SS); 681 emulate_exception(ctxt, UD_VECTOR, 0, false);
682}
683
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
685{
686 emulate_exception(ctxt, TS_VECTOR, err, true);
645} 687}
646 688
647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
932 /* we cannot decode insn before we complete previous rep insn */ 974 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart); 975 WARN_ON(ctxt->restart);
934 976
935 /* Shadow copy of register state. Committed on successful emulation. */
936 memset(c, 0, sizeof(struct decode_cache));
937 c->eip = ctxt->eip; 977 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip; 978 c->fetch.start = c->fetch.end = c->eip;
939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
941 980
942 switch (mode) { 981 switch (mode) {
943 case X86EMUL_MODE_REAL: 982 case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
1060 set_seg_override(c, VCPU_SREG_DS); 1099 set_seg_override(c, VCPU_SREG_DS);
1061 1100
1062 if (!(!c->twobyte && c->b == 0x8d)) 1101 if (!(!c->twobyte && c->b == 0x8d))
1063 c->modrm_ea += seg_override_base(ctxt, c); 1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1064 1103
1065 if (c->ad_bytes != 8) 1104 if (c->ad_bytes != 8)
1066 c->modrm_ea = (u32)c->modrm_ea; 1105 c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
1148 else 1187 else
1149 c->src.val = insn_fetch(u8, 1, c->eip); 1188 c->src.val = insn_fetch(u8, 1, c->eip);
1150 break; 1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1151 case SrcOne: 1209 case SrcOne:
1152 c->src.bytes = 1; 1210 c->src.bytes = 1;
1153 c->src.val = 1; 1211 c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
1156 c->src.type = OP_MEM; 1214 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *) 1216 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c), 1217 register_address(c, seg_override_base(ctxt, ops, c),
1160 c->regs[VCPU_REGS_RSI]); 1218 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0; 1219 c->src.val = 0;
1162 break; 1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1163 } 1232 }
1164 1233
1165 /* 1234 /*
@@ -1179,22 +1248,10 @@ done_prefixes:
1179 c->src2.bytes = 1; 1248 c->src2.bytes = 1;
1180 c->src2.val = insn_fetch(u8, 1, c->eip); 1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1181 break; 1250 break;
1182 case Src2Imm16:
1183 c->src2.type = OP_IMM;
1184 c->src2.ptr = (unsigned long *)c->eip;
1185 c->src2.bytes = 2;
1186 c->src2.val = insn_fetch(u16, 2, c->eip);
1187 break;
1188 case Src2One: 1251 case Src2One:
1189 c->src2.bytes = 1; 1252 c->src2.bytes = 1;
1190 c->src2.val = 1; 1253 c->src2.val = 1;
1191 break; 1254 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1198 } 1255 }
1199 1256
1200 /* Decode and fetch the destination operand: register or memory. */ 1257 /* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
1253 c->dst.type = OP_MEM; 1310 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *) 1312 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt), 1313 register_address(c, es_base(ctxt, ops),
1257 c->regs[VCPU_REGS_RDI]); 1314 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0; 1315 c->dst.val = 0;
1259 break; 1316 break;
@@ -1263,6 +1320,37 @@ done:
1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1264} 1321}
1265 1322
1323static int read_emulated(struct x86_emulate_ctxt *ctxt,
1324 struct x86_emulate_ops *ops,
1325 unsigned long addr, void *dest, unsigned size)
1326{
1327 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330
1331 while (size) {
1332 int n = min(size, 8u);
1333 size -= n;
1334 if (mc->pos < mc->end)
1335 goto read_cached;
1336
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE)
1342 return rc;
1343 mc->end += n;
1344
1345 read_cached:
1346 memcpy(dest, mc->data + mc->pos, n);
1347 mc->pos += n;
1348 dest += n;
1349 addr += n;
1350 }
1351 return X86EMUL_CONTINUE;
1352}
1353
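read_emulated() above pulls guest memory in chunks of up to eight bytes into ctxt->decode.mem_read, so that when an instruction is restarted (for example a string op that faulted part way through) the bytes already fetched are replayed from the cache instead of being read from the guest again. The same pattern in isolation, with hypothetical types and a pluggable backend (assumes <linux/string.h> for memcpy):

struct byte_cache { unsigned char data[1024]; unsigned pos, end; };

static int cached_read(struct byte_cache *mc,
                       int (*backend)(unsigned long addr, void *buf, unsigned len),
                       unsigned long addr, void *dest, unsigned size)
{
        while (size) {
                unsigned n = size < 8 ? size : 8;

                if (mc->pos >= mc->end) {               /* not cached yet: fetch */
                        int rc = backend(addr, mc->data + mc->end, n);
                        if (rc)
                                return rc;
                        mc->end += n;
                }
                memcpy(dest, mc->data + mc->pos, n);    /* replay from the cache */
                mc->pos += n;
                dest = (unsigned char *)dest + n;
                addr += n;
                size -= n;
        }
        return 0;
}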
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops, 1355 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port, 1356 unsigned int size, unsigned short port,
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331 1419
1332 if (dt.size < index * 8 + 7) { 1420 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1421 emulate_gp(ctxt, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT; 1422 return X86EMUL_PROPAGATE_FAULT;
1335 } 1423 }
1336 addr = dt.address + index * 8; 1424 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT) 1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1427 emulate_pf(ctxt, addr, err);
1340 1428
1341 return ret; 1429 return ret;
1342} 1430}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356 1444
1357 if (dt.size < index * 8 + 7) { 1445 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1446 emulate_gp(ctxt, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT; 1447 return X86EMUL_PROPAGATE_FAULT;
1360 } 1448 }
1361 1449
1362 addr = dt.address + index * 8; 1450 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT) 1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1453 emulate_pf(ctxt, addr, err);
1366 1454
1367 return ret; 1455 return ret;
1368} 1456}
@@ -1481,11 +1569,70 @@ load:
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE; 1570 return X86EMUL_CONTINUE;
1483exception: 1571exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); 1572 emulate_exception(ctxt, err_vec, err_code, true);
1485 return X86EMUL_PROPAGATE_FAULT; 1573 return X86EMUL_PROPAGATE_FAULT;
1486} 1574}
1487 1575
1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1576static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops)
1578{
1579 int rc;
1580 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582
1583 switch (c->dst.type) {
1584 case OP_REG:
1585 /* The 4-byte case *is* correct:
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break;
1603 case OP_MEM:
1604 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr,
1607 &c->dst.orig_val,
1608 &c->dst.val,
1609 c->dst.bytes,
1610 &err,
1611 ctxt->vcpu);
1612 else
1613 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr,
1615 &c->dst.val,
1616 c->dst.bytes,
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE)
1623 return rc;
1624 break;
1625 case OP_NONE:
1626 /* no writeback */
1627 break;
1628 default:
1629 break;
1630 }
1631 return X86EMUL_CONTINUE;
1632}
1633
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1635 struct x86_emulate_ops *ops)
1489{ 1636{
1490 struct decode_cache *c = &ctxt->decode; 1637 struct decode_cache *c = &ctxt->decode;
1491 1638
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1493 c->dst.bytes = c->op_bytes; 1640 c->dst.bytes = c->op_bytes;
1494 c->dst.val = c->src.val; 1641 c->dst.val = c->src.val;
1495 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1496 c->dst.ptr = (void *) register_address(c, ss_base(ctxt), 1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
1497 c->regs[VCPU_REGS_RSP]); 1644 c->regs[VCPU_REGS_RSP]);
1498} 1645}
1499 1646
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1504 struct decode_cache *c = &ctxt->decode; 1651 struct decode_cache *c = &ctxt->decode;
1505 int rc; 1652 int rc;
1506 1653
1507 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
1508 c->regs[VCPU_REGS_RSP]), 1655 c->regs[VCPU_REGS_RSP]),
1509 dest, len, ctxt->vcpu); 1656 dest, len);
1510 if (rc != X86EMUL_CONTINUE) 1657 if (rc != X86EMUL_CONTINUE)
1511 return rc; 1658 return rc;
1512 1659
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1541 break; 1688 break;
1542 case X86EMUL_MODE_VM86: 1689 case X86EMUL_MODE_VM86:
1543 if (iopl < 3) { 1690 if (iopl < 3) {
1544 kvm_inject_gp(ctxt->vcpu, 0); 1691 emulate_gp(ctxt, 0);
1545 return X86EMUL_PROPAGATE_FAULT; 1692 return X86EMUL_PROPAGATE_FAULT;
1546 } 1693 }
1547 change_mask |= EFLG_IF; 1694 change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1557 return rc; 1704 return rc;
1558} 1705}
1559 1706
1560static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1708 struct x86_emulate_ops *ops, int seg)
1561{ 1709{
1562 struct decode_cache *c = &ctxt->decode; 1710 struct decode_cache *c = &ctxt->decode;
1563 struct kvm_segment segment;
1564 1711
1565 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); 1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
1566 1713
1567 c->src.val = segment.selector; 1714 emulate_push(ctxt, ops);
1568 emulate_push(ctxt);
1569} 1715}
1570 1716
1571static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1583 return rc; 1729 return rc;
1584} 1730}
1585 1731
1586static void emulate_pusha(struct x86_emulate_ctxt *ctxt) 1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1733 struct x86_emulate_ops *ops)
1587{ 1734{
1588 struct decode_cache *c = &ctxt->decode; 1735 struct decode_cache *c = &ctxt->decode;
1589 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1737 int rc = X86EMUL_CONTINUE;
1590 int reg = VCPU_REGS_RAX; 1738 int reg = VCPU_REGS_RAX;
1591 1739
1592 while (reg <= VCPU_REGS_RDI) { 1740 while (reg <= VCPU_REGS_RDI) {
1593 (reg == VCPU_REGS_RSP) ? 1741 (reg == VCPU_REGS_RSP) ?
1594 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1595 1743
1596 emulate_push(ctxt); 1744 emulate_push(ctxt, ops);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE)
1748 return rc;
1749
1597 ++reg; 1750 ++reg;
1598 } 1751 }
1752
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc;
1599} 1757}
1600 1758
1601static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1759static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1695 old_eip = c->eip; 1853 old_eip = c->eip;
1696 c->eip = c->src.val; 1854 c->eip = c->src.val;
1697 c->src.val = old_eip; 1855 c->src.val = old_eip;
1698 emulate_push(ctxt); 1856 emulate_push(ctxt, ops);
1699 break; 1857 break;
1700 } 1858 }
1701 case 4: /* jmp abs */ 1859 case 4: /* jmp abs */
1702 c->eip = c->src.val; 1860 c->eip = c->src.val;
1703 break; 1861 break;
1704 case 6: /* push */ 1862 case 6: /* push */
1705 emulate_push(ctxt); 1863 emulate_push(ctxt, ops);
1706 break; 1864 break;
1707 } 1865 }
1708 return X86EMUL_CONTINUE; 1866 return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1748 return rc; 1906 return rc;
1749} 1907}
1750 1908
1751static inline int writeback(struct x86_emulate_ctxt *ctxt,
1752 struct x86_emulate_ops *ops)
1753{
1754 int rc;
1755 struct decode_cache *c = &ctxt->decode;
1756
1757 switch (c->dst.type) {
1758 case OP_REG:
1759 /* The 4-byte case *is* correct:
1760 * in 64-bit mode we zero-extend.
1761 */
1762 switch (c->dst.bytes) {
1763 case 1:
1764 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1765 break;
1766 case 2:
1767 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1768 break;
1769 case 4:
1770 *c->dst.ptr = (u32)c->dst.val;
1771 break; /* 64b: zero-ext */
1772 case 8:
1773 *c->dst.ptr = c->dst.val;
1774 break;
1775 }
1776 break;
1777 case OP_MEM:
1778 if (c->lock_prefix)
1779 rc = ops->cmpxchg_emulated(
1780 (unsigned long)c->dst.ptr,
1781 &c->dst.orig_val,
1782 &c->dst.val,
1783 c->dst.bytes,
1784 ctxt->vcpu);
1785 else
1786 rc = ops->write_emulated(
1787 (unsigned long)c->dst.ptr,
1788 &c->dst.val,
1789 c->dst.bytes,
1790 ctxt->vcpu);
1791 if (rc != X86EMUL_CONTINUE)
1792 return rc;
1793 break;
1794 case OP_NONE:
1795 /* no writeback */
1796 break;
1797 default:
1798 break;
1799 }
1800 return X86EMUL_CONTINUE;
1801}
1802
1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1804{
1805 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1806 /*
1807 * an sti; sti; sequence only disable interrupts for the first
1808 * instruction. So, if the last instruction, be it emulated or
1809 * not, left the system with the INT_STI flag enabled, it
1810 * means that the last instruction is an sti. We should not
1811 * leave the flag on in this case. The same goes for mov ss
1812 */
1813 if (!(int_shadow & mask))
1814 ctxt->interruptibility = mask;
1815}
1816
1817static inline void 1909static inline void
1818setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1910setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1819 struct kvm_segment *cs, struct kvm_segment *ss) 1911 struct x86_emulate_ops *ops, struct desc_struct *cs,
1912 struct desc_struct *ss)
1820{ 1913{
1821 memset(cs, 0, sizeof(struct kvm_segment)); 1914 memset(cs, 0, sizeof(struct desc_struct));
1822 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); 1915 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
1823 memset(ss, 0, sizeof(struct kvm_segment)); 1916 memset(ss, 0, sizeof(struct desc_struct));
1824 1917
1825 cs->l = 0; /* will be adjusted later */ 1918 cs->l = 0; /* will be adjusted later */
1826 cs->base = 0; /* flat segment */ 1919 set_desc_base(cs, 0); /* flat segment */
1827 cs->g = 1; /* 4kb granularity */ 1920 cs->g = 1; /* 4kb granularity */
1828 cs->limit = 0xffffffff; /* 4GB limit */ 1921 set_desc_limit(cs, 0xfffff); /* 4GB limit */
1829 cs->type = 0x0b; /* Read, Execute, Accessed */ 1922 cs->type = 0x0b; /* Read, Execute, Accessed */
1830 cs->s = 1; 1923 cs->s = 1;
1831 cs->dpl = 0; /* will be adjusted later */ 1924 cs->dpl = 0; /* will be adjusted later */
1832 cs->present = 1; 1925 cs->p = 1;
1833 cs->db = 1; 1926 cs->d = 1;
1834 1927
1835 ss->unusable = 0; 1928 set_desc_base(ss, 0); /* flat segment */
1836 ss->base = 0; /* flat segment */ 1929 set_desc_limit(ss, 0xfffff); /* 4GB limit */
1837 ss->limit = 0xffffffff; /* 4GB limit */
1838 ss->g = 1; /* 4kb granularity */ 1930 ss->g = 1; /* 4kb granularity */
1839 ss->s = 1; 1931 ss->s = 1;
1840 ss->type = 0x03; /* Read/Write, Accessed */ 1932 ss->type = 0x03; /* Read/Write, Accessed */
1841 ss->db = 1; /* 32bit stack segment */ 1933 ss->d = 1; /* 32bit stack segment */
1842 ss->dpl = 0; 1934 ss->dpl = 0;
1843 ss->present = 1; 1935 ss->p = 1;
1844} 1936}
1845 1937
1846static int 1938static int
1847emulate_syscall(struct x86_emulate_ctxt *ctxt) 1939emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1848{ 1940{
1849 struct decode_cache *c = &ctxt->decode; 1941 struct decode_cache *c = &ctxt->decode;
1850 struct kvm_segment cs, ss; 1942 struct desc_struct cs, ss;
1851 u64 msr_data; 1943 u64 msr_data;
1944 u16 cs_sel, ss_sel;
1852 1945
1853 /* syscall is not available in real mode */ 1946 /* syscall is not available in real mode */
1854 if (ctxt->mode == X86EMUL_MODE_REAL || 1947 if (ctxt->mode == X86EMUL_MODE_REAL ||
1855 ctxt->mode == X86EMUL_MODE_VM86) { 1948 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1949 emulate_ud(ctxt);
1857 return X86EMUL_PROPAGATE_FAULT; 1950 return X86EMUL_PROPAGATE_FAULT;
1858 } 1951 }
1859 1952
1860 setup_syscalls_segments(ctxt, &cs, &ss); 1953 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1861 1954
1862 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1955 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1863 msr_data >>= 32; 1956 msr_data >>= 32;
1864 cs.selector = (u16)(msr_data & 0xfffc); 1957 cs_sel = (u16)(msr_data & 0xfffc);
1865 ss.selector = (u16)(msr_data + 8); 1958 ss_sel = (u16)(msr_data + 8);
1866 1959
1867 if (is_long_mode(ctxt->vcpu)) { 1960 if (is_long_mode(ctxt->vcpu)) {
1868 cs.db = 0; 1961 cs.d = 0;
1869 cs.l = 1; 1962 cs.l = 1;
1870 } 1963 }
1871 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 1964 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1872 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1965 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1966 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1967 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1873 1968
1874 c->regs[VCPU_REGS_RCX] = c->eip; 1969 c->regs[VCPU_REGS_RCX] = c->eip;
1875 if (is_long_mode(ctxt->vcpu)) { 1970 if (is_long_mode(ctxt->vcpu)) {
1876#ifdef CONFIG_X86_64 1971#ifdef CONFIG_X86_64
1877 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1972 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1878 1973
1879 kvm_x86_ops->get_msr(ctxt->vcpu, 1974 ops->get_msr(ctxt->vcpu,
1880 ctxt->mode == X86EMUL_MODE_PROT64 ? 1975 ctxt->mode == X86EMUL_MODE_PROT64 ?
1881 MSR_LSTAR : MSR_CSTAR, &msr_data); 1976 MSR_LSTAR : MSR_CSTAR, &msr_data);
1882 c->eip = msr_data; 1977 c->eip = msr_data;
1883 1978
1884 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1979 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1885 ctxt->eflags &= ~(msr_data | EFLG_RF); 1980 ctxt->eflags &= ~(msr_data | EFLG_RF);
1886#endif 1981#endif
1887 } else { 1982 } else {
1888 /* legacy mode */ 1983 /* legacy mode */
1889 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1984 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1890 c->eip = (u32)msr_data; 1985 c->eip = (u32)msr_data;
1891 1986
1892 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1987 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1896} 1991}
1897 1992
1898static int 1993static int
1899emulate_sysenter(struct x86_emulate_ctxt *ctxt) 1994emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1900{ 1995{
1901 struct decode_cache *c = &ctxt->decode; 1996 struct decode_cache *c = &ctxt->decode;
1902 struct kvm_segment cs, ss; 1997 struct desc_struct cs, ss;
1903 u64 msr_data; 1998 u64 msr_data;
1999 u16 cs_sel, ss_sel;
1904 2000
1905 /* inject #GP if in real mode */ 2001 /* inject #GP if in real mode */
1906 if (ctxt->mode == X86EMUL_MODE_REAL) { 2002 if (ctxt->mode == X86EMUL_MODE_REAL) {
1907 kvm_inject_gp(ctxt->vcpu, 0); 2003 emulate_gp(ctxt, 0);
1908 return X86EMUL_PROPAGATE_FAULT; 2004 return X86EMUL_PROPAGATE_FAULT;
1909 } 2005 }
1910 2006
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1912 * Therefore, we inject an #UD. 2008 * Therefore, we inject an #UD.
1913 */ 2009 */
1914 if (ctxt->mode == X86EMUL_MODE_PROT64) { 2010 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2011 emulate_ud(ctxt);
1916 return X86EMUL_PROPAGATE_FAULT; 2012 return X86EMUL_PROPAGATE_FAULT;
1917 } 2013 }
1918 2014
1919 setup_syscalls_segments(ctxt, &cs, &ss); 2015 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1920 2016
1921 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2017 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1922 switch (ctxt->mode) { 2018 switch (ctxt->mode) {
1923 case X86EMUL_MODE_PROT32: 2019 case X86EMUL_MODE_PROT32:
1924 if ((msr_data & 0xfffc) == 0x0) { 2020 if ((msr_data & 0xfffc) == 0x0) {
1925 kvm_inject_gp(ctxt->vcpu, 0); 2021 emulate_gp(ctxt, 0);
1926 return X86EMUL_PROPAGATE_FAULT; 2022 return X86EMUL_PROPAGATE_FAULT;
1927 } 2023 }
1928 break; 2024 break;
1929 case X86EMUL_MODE_PROT64: 2025 case X86EMUL_MODE_PROT64:
1930 if (msr_data == 0x0) { 2026 if (msr_data == 0x0) {
1931 kvm_inject_gp(ctxt->vcpu, 0); 2027 emulate_gp(ctxt, 0);
1932 return X86EMUL_PROPAGATE_FAULT; 2028 return X86EMUL_PROPAGATE_FAULT;
1933 } 2029 }
1934 break; 2030 break;
1935 } 2031 }
1936 2032
1937 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2033 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1938 cs.selector = (u16)msr_data; 2034 cs_sel = (u16)msr_data;
1939 cs.selector &= ~SELECTOR_RPL_MASK; 2035 cs_sel &= ~SELECTOR_RPL_MASK;
1940 ss.selector = cs.selector + 8; 2036 ss_sel = cs_sel + 8;
1941 ss.selector &= ~SELECTOR_RPL_MASK; 2037 ss_sel &= ~SELECTOR_RPL_MASK;
1942 if (ctxt->mode == X86EMUL_MODE_PROT64 2038 if (ctxt->mode == X86EMUL_MODE_PROT64
1943 || is_long_mode(ctxt->vcpu)) { 2039 || is_long_mode(ctxt->vcpu)) {
1944 cs.db = 0; 2040 cs.d = 0;
1945 cs.l = 1; 2041 cs.l = 1;
1946 } 2042 }
1947 2043
1948 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2044 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1949 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2045 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2046 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2047 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1950 2048
1951 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2049 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1952 c->eip = msr_data; 2050 c->eip = msr_data;
1953 2051
1954 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2052 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1955 c->regs[VCPU_REGS_RSP] = msr_data; 2053 c->regs[VCPU_REGS_RSP] = msr_data;
1956 2054
1957 return X86EMUL_CONTINUE; 2055 return X86EMUL_CONTINUE;
1958} 2056}
1959 2057
1960static int 2058static int
1961emulate_sysexit(struct x86_emulate_ctxt *ctxt) 2059emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1962{ 2060{
1963 struct decode_cache *c = &ctxt->decode; 2061 struct decode_cache *c = &ctxt->decode;
1964 struct kvm_segment cs, ss; 2062 struct desc_struct cs, ss;
1965 u64 msr_data; 2063 u64 msr_data;
1966 int usermode; 2064 int usermode;
2065 u16 cs_sel, ss_sel;
1967 2066
1968 /* inject #GP if in real mode or Virtual 8086 mode */ 2067 /* inject #GP if in real mode or Virtual 8086 mode */
1969 if (ctxt->mode == X86EMUL_MODE_REAL || 2068 if (ctxt->mode == X86EMUL_MODE_REAL ||
1970 ctxt->mode == X86EMUL_MODE_VM86) { 2069 ctxt->mode == X86EMUL_MODE_VM86) {
1971 kvm_inject_gp(ctxt->vcpu, 0); 2070 emulate_gp(ctxt, 0);
1972 return X86EMUL_PROPAGATE_FAULT; 2071 return X86EMUL_PROPAGATE_FAULT;
1973 } 2072 }
1974 2073
1975 setup_syscalls_segments(ctxt, &cs, &ss); 2074 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1976 2075
1977 if ((c->rex_prefix & 0x8) != 0x0) 2076 if ((c->rex_prefix & 0x8) != 0x0)
1978 usermode = X86EMUL_MODE_PROT64; 2077 usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1981 2080
1982 cs.dpl = 3; 2081 cs.dpl = 3;
1983 ss.dpl = 3; 2082 ss.dpl = 3;
1984 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2083 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1985 switch (usermode) { 2084 switch (usermode) {
1986 case X86EMUL_MODE_PROT32: 2085 case X86EMUL_MODE_PROT32:
1987 cs.selector = (u16)(msr_data + 16); 2086 cs_sel = (u16)(msr_data + 16);
1988 if ((msr_data & 0xfffc) == 0x0) { 2087 if ((msr_data & 0xfffc) == 0x0) {
1989 kvm_inject_gp(ctxt->vcpu, 0); 2088 emulate_gp(ctxt, 0);
1990 return X86EMUL_PROPAGATE_FAULT; 2089 return X86EMUL_PROPAGATE_FAULT;
1991 } 2090 }
1992 ss.selector = (u16)(msr_data + 24); 2091 ss_sel = (u16)(msr_data + 24);
1993 break; 2092 break;
1994 case X86EMUL_MODE_PROT64: 2093 case X86EMUL_MODE_PROT64:
1995 cs.selector = (u16)(msr_data + 32); 2094 cs_sel = (u16)(msr_data + 32);
1996 if (msr_data == 0x0) { 2095 if (msr_data == 0x0) {
1997 kvm_inject_gp(ctxt->vcpu, 0); 2096 emulate_gp(ctxt, 0);
1998 return X86EMUL_PROPAGATE_FAULT; 2097 return X86EMUL_PROPAGATE_FAULT;
1999 } 2098 }
2000 ss.selector = cs.selector + 8; 2099 ss_sel = cs_sel + 8;
2001 cs.db = 0; 2100 cs.d = 0;
2002 cs.l = 1; 2101 cs.l = 1;
2003 break; 2102 break;
2004 } 2103 }
2005 cs.selector |= SELECTOR_RPL_MASK; 2104 cs_sel |= SELECTOR_RPL_MASK;
2006 ss.selector |= SELECTOR_RPL_MASK; 2105 ss_sel |= SELECTOR_RPL_MASK;
2007 2106
2008 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2107 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
2009 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2108 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2109 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2110 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2010 2111
2011 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 2112 c->eip = c->regs[VCPU_REGS_RDX];
2012 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 2113 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
2013 2114
2014 return X86EMUL_CONTINUE; 2115 return X86EMUL_CONTINUE;
2015} 2116}
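
The sysenter/sysexit hunks above replace each kvm_x86_ops->set_segment() call with a pair of emulator callbacks: the cached descriptor and the visible selector are now written separately. A minimal sketch of that pattern as a hypothetical helper (the helper itself is not part of the patch; the two callbacks and their argument order are taken from the call sites above):

/* Hypothetical helper condensing the repeated pattern above: install a
 * segment's descriptor and selector through the emulator ops table. */
static void emul_set_seg(struct x86_emulate_ctxt *ctxt,
			 struct x86_emulate_ops *ops,
			 struct desc_struct *desc, u16 sel, int seg)
{
	ops->set_cached_descriptor(desc, seg, ctxt->vcpu);	/* hidden part */
	ops->set_segment_selector(sel, seg, ctxt->vcpu);	/* visible selector */
}

/* e.g. emulate_sysexit() above would then reduce to:
 *	emul_set_seg(ctxt, ops, &cs, cs_sel, VCPU_SREG_CS);
 *	emul_set_seg(ctxt, ops, &ss, ss_sel, VCPU_SREG_SS);
 */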
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2030 struct x86_emulate_ops *ops, 2131 struct x86_emulate_ops *ops,
2031 u16 port, u16 len) 2132 u16 port, u16 len)
2032{ 2133{
2033 struct kvm_segment tr_seg; 2134 struct desc_struct tr_seg;
2034 int r; 2135 int r;
2035 u16 io_bitmap_ptr; 2136 u16 io_bitmap_ptr;
2036 u8 perm, bit_idx = port & 0x7; 2137 u8 perm, bit_idx = port & 0x7;
2037 unsigned mask = (1 << len) - 1; 2138 unsigned mask = (1 << len) - 1;
2038 2139
2039 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); 2140 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
2040 if (tr_seg.unusable) 2141 if (!tr_seg.p)
2041 return false; 2142 return false;
2042 if (tr_seg.limit < 103) 2143 if (desc_limit_scaled(&tr_seg) < 103)
2043 return false; 2144 return false;
2044 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, 2145 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
2045 NULL); 2146 ctxt->vcpu, NULL);
2046 if (r != X86EMUL_CONTINUE) 2147 if (r != X86EMUL_CONTINUE)
2047 return false; 2148 return false;
2048 if (io_bitmap_ptr + port/8 > tr_seg.limit) 2149 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2049 return false; 2150 return false;
2050 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, 2151 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
2051 ctxt->vcpu, NULL); 2152 &perm, 1, ctxt->vcpu, NULL);
2052 if (r != X86EMUL_CONTINUE) 2153 if (r != X86EMUL_CONTINUE)
2053 return false; 2154 return false;
2054 if ((perm >> bit_idx) & mask) 2155 if ((perm >> bit_idx) & mask)
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2066 return true; 2167 return true;
2067} 2168}
2068 2169
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2170static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops, 2171 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss) 2172 struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2165 &err); 2255 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) { 2256 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */ 2257 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2258 emulate_pf(ctxt, old_tss_base, err);
2169 return ret; 2259 return ret;
2170 } 2260 }
2171 2261
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2175 &err); 2265 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) { 2266 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */ 2267 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2268 emulate_pf(ctxt, old_tss_base, err);
2179 return ret; 2269 return ret;
2180 } 2270 }
2181 2271
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2183 &err); 2273 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) { 2274 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */ 2275 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2276 emulate_pf(ctxt, new_tss_base, err);
2187 return ret; 2277 return ret;
2188 } 2278 }
2189 2279
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2196 ctxt->vcpu, &err); 2286 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) { 2287 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */ 2288 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2289 emulate_pf(ctxt, new_tss_base, err);
2200 return ret; 2290 return ret;
2201 } 2291 }
2202 } 2292 }
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2238 struct decode_cache *c = &ctxt->decode; 2328 struct decode_cache *c = &ctxt->decode;
2239 int ret; 2329 int ret;
2240 2330
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu); 2331 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
2332 emulate_gp(ctxt, 0);
2333 return X86EMUL_PROPAGATE_FAULT;
2334 }
2242 c->eip = tss->eip; 2335 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2; 2336 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax; 2337 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2304 &err); 2397 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) { 2398 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */ 2399 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2400 emulate_pf(ctxt, old_tss_base, err);
2308 return ret; 2401 return ret;
2309 } 2402 }
2310 2403
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2314 &err); 2407 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) { 2408 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */ 2409 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2410 emulate_pf(ctxt, old_tss_base, err);
2318 return ret; 2411 return ret;
2319 } 2412 }
2320 2413
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2322 &err); 2415 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) { 2416 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */ 2417 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2418 emulate_pf(ctxt, new_tss_base, err);
2326 return ret; 2419 return ret;
2327 } 2420 }
2328 2421
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2335 ctxt->vcpu, &err); 2428 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) { 2429 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */ 2430 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2431 emulate_pf(ctxt, new_tss_base, err);
2339 return ret; 2432 return ret;
2340 } 2433 }
2341 } 2434 }
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2352 int ret; 2445 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2446 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base = 2447 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); 2448 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
2356 u32 desc_limit; 2449 u32 desc_limit;
2357 2450
2358 /* FIXME: old_tss_base == ~0 ? */ 2451 /* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2369 if (reason != TASK_SWITCH_IRET) { 2462 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl || 2463 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2464 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0); 2465 emulate_gp(ctxt, 0);
2373 return X86EMUL_PROPAGATE_FAULT; 2466 return X86EMUL_PROPAGATE_FAULT;
2374 } 2467 }
2375 } 2468 }
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 if (!next_tss_desc.p || 2471 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2472 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) { 2473 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, 2474 emulate_ts(ctxt, tss_selector & 0xfffc);
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT; 2475 return X86EMUL_PROPAGATE_FAULT;
2384 } 2476 }
2385 2477
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2517 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0; 2518 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code; 2519 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt); 2520 emulate_push(ctxt, ops);
2429 } 2521 }
2430 2522
2431 return ret; 2523 return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2439 struct decode_cache *c = &ctxt->decode; 2531 struct decode_cache *c = &ctxt->decode;
2440 int rc; 2532 int rc;
2441 2533
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip; 2534 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE; 2535 c->dst.type = OP_NONE;
2446 2536
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2537 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code); 2538 has_error_code, error_code);
2449 2539
2450 if (rc == X86EMUL_CONTINUE) { 2540 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops); 2541 rc = writeback(ctxt, ops);
2542 if (rc == X86EMUL_CONTINUE)
2543 ctxt->eip = c->eip;
2454 } 2544 }
2455 2545
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2546 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2474 int rc = X86EMUL_CONTINUE; 2564 int rc = X86EMUL_CONTINUE;
2475 int saved_dst_type = c->dst.type; 2565 int saved_dst_type = c->dst.type;
2476 2566
2477 ctxt->interruptibility = 0; 2567 ctxt->decode.mem_read.pos = 0;
2478
2479 /* Shadow copy of register state. Committed on successful emulation.
2480 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
2481 * modify them.
2482 */
2483
2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2485 2568
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2569 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2570 emulate_ud(ctxt);
2488 goto done; 2571 goto done;
2489 } 2572 }
2490 2573
2491 /* LOCK prefix is allowed only with some instructions */ 2574 /* LOCK prefix is allowed only with some instructions */
2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2575 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2576 emulate_ud(ctxt);
2494 goto done; 2577 goto done;
2495 } 2578 }
2496 2579
2497 /* Privileged instruction can be executed only in CPL=0 */ 2580 /* Privileged instruction can be executed only in CPL=0 */
2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2581 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2499 kvm_inject_gp(ctxt->vcpu, 0); 2582 emulate_gp(ctxt, 0);
2500 goto done; 2583 goto done;
2501 } 2584 }
2502 2585
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 2589 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done: 2590 string_done:
2508 ctxt->restart = false; 2591 ctxt->restart = false;
2509 kvm_rip_write(ctxt->vcpu, c->eip); 2592 ctxt->eip = c->eip;
2510 goto done; 2593 goto done;
2511 } 2594 }
2512 /* The second termination condition only applies for REPE 2595 /* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2529 } 2612 }
2530 2613
2531 if (c->src.type == OP_MEM) { 2614 if (c->src.type == OP_MEM) {
2532 rc = ops->read_emulated((unsigned long)c->src.ptr, 2615 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
2533 &c->src.val, 2616 c->src.valptr, c->src.bytes);
2534 c->src.bytes,
2535 ctxt->vcpu);
2536 if (rc != X86EMUL_CONTINUE) 2617 if (rc != X86EMUL_CONTINUE)
2537 goto done; 2618 goto done;
2538 c->src.orig_val = c->src.val; 2619 c->src.orig_val = c->src.val;
2539 } 2620 }
2540 2621
2541 if (c->src2.type == OP_MEM) { 2622 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr, 2623 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
2543 &c->src2.val, 2624 &c->src2.val, c->src2.bytes);
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE) 2625 if (rc != X86EMUL_CONTINUE)
2547 goto done; 2626 goto done;
2548 } 2627 }
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2553 2632
2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 2633 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2555 /* optimisation - avoid slow emulated read if Mov */ 2634 /* optimisation - avoid slow emulated read if Mov */
2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, 2635 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
2557 c->dst.bytes, ctxt->vcpu); 2636 &c->dst.val, c->dst.bytes);
2558 if (rc != X86EMUL_CONTINUE) 2637 if (rc != X86EMUL_CONTINUE)
2559 goto done; 2638 goto done;
2560 } 2639 }
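
The operand-fetch sites above now call read_emulated() rather than ops->read_emulated() directly. Judging only from these call sites, its shape is roughly the following; the real definition appears earlier in emulate.c and backs the reads with the per-instruction decode.mem_read cache, so a restarted string instruction does not repeat the same guest memory access:

/* Signature inferred from the call sites above (an assumption, not a
 * copy of the definition); dest receives up to size bytes, read through
 * the decode.mem_read cache. */
static int read_emulated(struct x86_emulate_ctxt *ctxt,
			 struct x86_emulate_ops *ops,
			 unsigned long addr, void *dest, unsigned size);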
@@ -2571,7 +2650,7 @@ special_insn:
2571 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 2650 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2572 break; 2651 break;
2573 case 0x06: /* push es */ 2652 case 0x06: /* push es */
2574 emulate_push_sreg(ctxt, VCPU_SREG_ES); 2653 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2575 break; 2654 break;
2576 case 0x07: /* pop es */ 2655 case 0x07: /* pop es */
2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2656 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
2583 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2662 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2584 break; 2663 break;
2585 case 0x0e: /* push cs */ 2664 case 0x0e: /* push cs */
2586 emulate_push_sreg(ctxt, VCPU_SREG_CS); 2665 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2587 break; 2666 break;
2588 case 0x10 ... 0x15: 2667 case 0x10 ... 0x15:
2589 adc: /* adc */ 2668 adc: /* adc */
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 2669 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 break; 2670 break;
2592 case 0x16: /* push ss */ 2671 case 0x16: /* push ss */
2593 emulate_push_sreg(ctxt, VCPU_SREG_SS); 2672 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2594 break; 2673 break;
2595 case 0x17: /* pop ss */ 2674 case 0x17: /* pop ss */
2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2675 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
2602 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 2681 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2603 break; 2682 break;
2604 case 0x1e: /* push ds */ 2683 case 0x1e: /* push ds */
2605 emulate_push_sreg(ctxt, VCPU_SREG_DS); 2684 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2606 break; 2685 break;
2607 case 0x1f: /* pop ds */ 2686 case 0x1f: /* pop ds */
2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2687 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
2632 emulate_1op("dec", c->dst, ctxt->eflags); 2711 emulate_1op("dec", c->dst, ctxt->eflags);
2633 break; 2712 break;
2634 case 0x50 ... 0x57: /* push reg */ 2713 case 0x50 ... 0x57: /* push reg */
2635 emulate_push(ctxt); 2714 emulate_push(ctxt, ops);
2636 break; 2715 break;
2637 case 0x58 ... 0x5f: /* pop reg */ 2716 case 0x58 ... 0x5f: /* pop reg */
2638 pop_instruction: 2717 pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
2641 goto done; 2720 goto done;
2642 break; 2721 break;
2643 case 0x60: /* pusha */ 2722 case 0x60: /* pusha */
2644 emulate_pusha(ctxt); 2723 rc = emulate_pusha(ctxt, ops);
2724 if (rc != X86EMUL_CONTINUE)
2725 goto done;
2645 break; 2726 break;
2646 case 0x61: /* popa */ 2727 case 0x61: /* popa */
2647 rc = emulate_popa(ctxt, ops); 2728 rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
2655 break; 2736 break;
2656 case 0x68: /* push imm */ 2737 case 0x68: /* push imm */
2657 case 0x6a: /* push imm8 */ 2738 case 0x6a: /* push imm8 */
2658 emulate_push(ctxt); 2739 emulate_push(ctxt, ops);
2659 break; 2740 break;
2660 case 0x6c: /* insb */ 2741 case 0x6c: /* insb */
2661 case 0x6d: /* insw/insd */ 2742 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u); 2743 c->dst.bytes = min(c->dst.bytes, 4u);
2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2744 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2664 c->dst.bytes)) { 2745 c->dst.bytes)) {
2665 kvm_inject_gp(ctxt->vcpu, 0); 2746 emulate_gp(ctxt, 0);
2666 goto done; 2747 goto done;
2667 } 2748 }
2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, 2749 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
2674 c->src.bytes = min(c->src.bytes, 4u); 2755 c->src.bytes = min(c->src.bytes, 4u);
2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2756 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2676 c->src.bytes)) { 2757 c->src.bytes)) {
2677 kvm_inject_gp(ctxt->vcpu, 0); 2758 emulate_gp(ctxt, 0);
2678 goto done; 2759 goto done;
2679 } 2760 }
2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], 2761 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
2707 } 2788 }
2708 break; 2789 break;
2709 case 0x84 ... 0x85: 2790 case 0x84 ... 0x85:
2791 test:
2710 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 2792 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
2711 break; 2793 break;
2712 case 0x86 ... 0x87: /* xchg */ 2794 case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
2735 break; 2817 break;
2736 case 0x88 ... 0x8b: /* mov */ 2818 case 0x88 ... 0x8b: /* mov */
2737 goto mov; 2819 goto mov;
2738 case 0x8c: { /* mov r/m, sreg */ 2820 case 0x8c: /* mov r/m, sreg */
2739 struct kvm_segment segreg; 2821 if (c->modrm_reg > VCPU_SREG_GS) {
2740 2822 emulate_ud(ctxt);
2741 if (c->modrm_reg <= VCPU_SREG_GS)
2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2743 else {
2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2745 goto done; 2823 goto done;
2746 } 2824 }
2747 c->dst.val = segreg.selector; 2825 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2748 break; 2826 break;
2749 }
2750 case 0x8d: /* lea r16/r32, m */ 2827 case 0x8d: /* lea r16/r32, m */
2751 c->dst.val = c->modrm_ea; 2828 c->dst.val = c->modrm_ea;
2752 break; 2829 break;
@@ -2757,12 +2834,12 @@ special_insn:
2757 2834
2758 if (c->modrm_reg == VCPU_SREG_CS || 2835 if (c->modrm_reg == VCPU_SREG_CS ||
2759 c->modrm_reg > VCPU_SREG_GS) { 2836 c->modrm_reg > VCPU_SREG_GS) {
2760 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2837 emulate_ud(ctxt);
2761 goto done; 2838 goto done;
2762 } 2839 }
2763 2840
2764 if (c->modrm_reg == VCPU_SREG_SS) 2841 if (c->modrm_reg == VCPU_SREG_SS)
2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); 2842 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2766 2843
2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); 2844 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2768 2845
@@ -2775,19 +2852,19 @@ special_insn:
2775 goto done; 2852 goto done;
2776 break; 2853 break;
2777 case 0x90: /* nop / xchg r8,rax */ 2854 case 0x90: /* nop / xchg r8,rax */
2778 if (!(c->rex_prefix & 1)) { /* nop */ 2855 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
2779 c->dst.type = OP_NONE; 2856 c->dst.type = OP_NONE; /* nop */
2780 break; 2857 break;
2781 } 2858 }
2782 case 0x91 ... 0x97: /* xchg reg,rax */ 2859 case 0x91 ... 0x97: /* xchg reg,rax */
2783 c->src.type = c->dst.type = OP_REG; 2860 c->src.type = OP_REG;
2784 c->src.bytes = c->dst.bytes = c->op_bytes; 2861 c->src.bytes = c->op_bytes;
2785 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; 2862 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2786 c->src.val = *(c->src.ptr); 2863 c->src.val = *(c->src.ptr);
2787 goto xchg; 2864 goto xchg;
2788 case 0x9c: /* pushf */ 2865 case 0x9c: /* pushf */
2789 c->src.val = (unsigned long) ctxt->eflags; 2866 c->src.val = (unsigned long) ctxt->eflags;
2790 emulate_push(ctxt); 2867 emulate_push(ctxt, ops);
2791 break; 2868 break;
2792 case 0x9d: /* popf */ 2869 case 0x9d: /* popf */
2793 c->dst.type = OP_REG; 2870 c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
2797 if (rc != X86EMUL_CONTINUE) 2874 if (rc != X86EMUL_CONTINUE)
2798 goto done; 2875 goto done;
2799 break; 2876 break;
2800 case 0xa0 ... 0xa1: /* mov */ 2877 case 0xa0 ... 0xa3: /* mov */
2801 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2802 c->dst.val = c->src.val;
2803 break;
2804 case 0xa2 ... 0xa3: /* mov */
2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2806 break;
2807 case 0xa4 ... 0xa5: /* movs */ 2878 case 0xa4 ... 0xa5: /* movs */
2808 goto mov; 2879 goto mov;
2809 case 0xa6 ... 0xa7: /* cmps */ 2880 case 0xa6 ... 0xa7: /* cmps */
2810 c->dst.type = OP_NONE; /* Disable writeback. */ 2881 c->dst.type = OP_NONE; /* Disable writeback. */
2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2882 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2812 goto cmp; 2883 goto cmp;
2884 case 0xa8 ... 0xa9: /* test ax, imm */
2885 goto test;
2813 case 0xaa ... 0xab: /* stos */ 2886 case 0xaa ... 0xab: /* stos */
2814 c->dst.val = c->regs[VCPU_REGS_RAX]; 2887 c->dst.val = c->regs[VCPU_REGS_RAX];
2815 break; 2888 break;
@@ -2855,19 +2928,23 @@ special_insn:
2855 long int rel = c->src.val; 2928 long int rel = c->src.val;
2856 c->src.val = (unsigned long) c->eip; 2929 c->src.val = (unsigned long) c->eip;
2857 jmp_rel(c, rel); 2930 jmp_rel(c, rel);
2858 emulate_push(ctxt); 2931 emulate_push(ctxt, ops);
2859 break; 2932 break;
2860 } 2933 }
2861 case 0xe9: /* jmp rel */ 2934 case 0xe9: /* jmp rel */
2862 goto jmp; 2935 goto jmp;
2863 case 0xea: /* jmp far */ 2936 case 0xea: { /* jmp far */
2937 unsigned short sel;
2864 jump_far: 2938 jump_far:
2865 if (load_segment_descriptor(ctxt, ops, c->src2.val, 2939 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2866 VCPU_SREG_CS)) 2940
2941 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2867 goto done; 2942 goto done;
2868 2943
2869 c->eip = c->src.val; 2944 c->eip = 0;
2945 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2870 break; 2946 break;
2947 }
2871 case 0xeb: 2948 case 0xeb:
2872 jmp: /* jmp rel short */ 2949 jmp: /* jmp rel short */
2873 jmp_rel(c, c->src.val); 2950 jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
2879 do_io_in: 2956 do_io_in:
2880 c->dst.bytes = min(c->dst.bytes, 4u); 2957 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2958 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0); 2959 emulate_gp(ctxt, 0);
2883 goto done; 2960 goto done;
2884 } 2961 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 2962 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val)) 2963 &c->dst.val))
2887 goto done; /* IO is needed */ 2964 goto done; /* IO is needed */
2888 break; 2965 break;
2889 case 0xee: /* out al,dx */ 2966 case 0xee: /* out dx,al */
2890 case 0xef: /* out (e/r)ax,dx */ 2967 case 0xef: /* out dx,(e/r)ax */
2891 c->src.val = c->regs[VCPU_REGS_RDX]; 2968 c->src.val = c->regs[VCPU_REGS_RDX];
2892 do_io_out: 2969 do_io_out:
2893 c->dst.bytes = min(c->dst.bytes, 4u); 2970 c->dst.bytes = min(c->dst.bytes, 4u);
2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2971 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2895 kvm_inject_gp(ctxt->vcpu, 0); 2972 emulate_gp(ctxt, 0);
2896 goto done; 2973 goto done;
2897 } 2974 }
2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 2975 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
2916 c->dst.type = OP_NONE; /* Disable writeback. */ 2993 c->dst.type = OP_NONE; /* Disable writeback. */
2917 break; 2994 break;
2918 case 0xfa: /* cli */ 2995 case 0xfa: /* cli */
2919 if (emulator_bad_iopl(ctxt, ops)) 2996 if (emulator_bad_iopl(ctxt, ops)) {
2920 kvm_inject_gp(ctxt->vcpu, 0); 2997 emulate_gp(ctxt, 0);
2921 else { 2998 goto done;
2999 } else {
2922 ctxt->eflags &= ~X86_EFLAGS_IF; 3000 ctxt->eflags &= ~X86_EFLAGS_IF;
2923 c->dst.type = OP_NONE; /* Disable writeback. */ 3001 c->dst.type = OP_NONE; /* Disable writeback. */
2924 } 3002 }
2925 break; 3003 break;
2926 case 0xfb: /* sti */ 3004 case 0xfb: /* sti */
2927 if (emulator_bad_iopl(ctxt, ops)) 3005 if (emulator_bad_iopl(ctxt, ops)) {
2928 kvm_inject_gp(ctxt->vcpu, 0); 3006 emulate_gp(ctxt, 0);
2929 else { 3007 goto done;
2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); 3008 } else {
3009 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2931 ctxt->eflags |= X86_EFLAGS_IF; 3010 ctxt->eflags |= X86_EFLAGS_IF;
2932 c->dst.type = OP_NONE; /* Disable writeback. */ 3011 c->dst.type = OP_NONE; /* Disable writeback. */
2933 } 3012 }
@@ -2964,11 +3043,12 @@ writeback:
2964 c->dst.type = saved_dst_type; 3043 c->dst.type = saved_dst_type;
2965 3044
2966 if ((c->d & SrcMask) == SrcSI) 3045 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, 3046 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
2968 &c->src); 3047 VCPU_REGS_RSI, &c->src);
2969 3048
2970 if ((c->d & DstMask) == DstDI) 3049 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); 3050 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
3051 &c->dst);
2972 3052
2973 if (c->rep_prefix && (c->d & String)) { 3053 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read; 3054 struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
2981 (rc->end != 0 && rc->end == rc->pos)) 3061 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false; 3062 ctxt->restart = false;
2983 } 3063 }
2984 3064 /*
2985 /* Commit shadow register state. */ 3065 * reset read cache here in case string instruction is restarted
2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 3066 * without decoding
2987 kvm_rip_write(ctxt->vcpu, c->eip); 3067 */
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags); 3068 ctxt->decode.mem_read.end = 0;
3069 ctxt->eip = c->eip;
2989 3070
2990done: 3071done:
2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3072 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
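
With the register memcpy, kvm_rip_write() and set_rflags() tail removed, x86_emulate_insn() no longer commits guest state itself: it only advances ctxt->eip and leaves c->regs and ctxt->eflags in the context. Based on the calls deleted above, the caller in x86.c is now expected to perform the commit roughly as follows (a sketch under that assumption; the placement of emulate_ctxt inside vcpu->arch is not shown in this diff):

/* Sketch of the caller-side commit after a successful emulation,
 * mirroring the calls that were removed from the emulator core above.
 * Field placement (vcpu->arch.emulate_ctxt) is assumed. */
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
struct decode_cache *c = &ctxt->decode;

memcpy(vcpu->arch.regs, c->regs, sizeof(c->regs));
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);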
@@ -3051,7 +3132,7 @@ twobyte_insn:
3051 c->dst.type = OP_NONE; 3132 c->dst.type = OP_NONE;
3052 break; 3133 break;
3053 case 5: /* not defined */ 3134 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3135 emulate_ud(ctxt);
3055 goto done; 3136 goto done;
3056 case 7: /* invlpg*/ 3137 case 7: /* invlpg*/
3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3138 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
3063 } 3144 }
3064 break; 3145 break;
3065 case 0x05: /* syscall */ 3146 case 0x05: /* syscall */
3066 rc = emulate_syscall(ctxt); 3147 rc = emulate_syscall(ctxt, ops);
3067 if (rc != X86EMUL_CONTINUE) 3148 if (rc != X86EMUL_CONTINUE)
3068 goto done; 3149 goto done;
3069 else 3150 else
@@ -3073,8 +3154,11 @@ twobyte_insn:
3073 emulate_clts(ctxt->vcpu); 3154 emulate_clts(ctxt->vcpu);
3074 c->dst.type = OP_NONE; 3155 c->dst.type = OP_NONE;
3075 break; 3156 break;
3076 case 0x08: /* invd */
3077 case 0x09: /* wbinvd */ 3157 case 0x09: /* wbinvd */
3158 kvm_emulate_wbinvd(ctxt->vcpu);
3159 c->dst.type = OP_NONE;
3160 break;
3161 case 0x08: /* invd */
3078 case 0x0d: /* GrpP (prefetch) */ 3162 case 0x0d: /* GrpP (prefetch) */
3079 case 0x18: /* Grp16 (prefetch/nop) */ 3163 case 0x18: /* Grp16 (prefetch/nop) */
3080 c->dst.type = OP_NONE; 3164 c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
3084 case 1: 3168 case 1:
3085 case 5 ... 7: 3169 case 5 ... 7:
3086 case 9 ... 15: 3170 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3171 emulate_ud(ctxt);
3088 goto done; 3172 goto done;
3089 } 3173 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3174 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
3093 case 0x21: /* mov from dr to reg */ 3177 case 0x21: /* mov from dr to reg */
3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3178 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3179 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3180 emulate_ud(ctxt);
3097 goto done; 3181 goto done;
3098 } 3182 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3183 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3100 c->dst.type = OP_NONE; /* no writeback */ 3184 c->dst.type = OP_NONE; /* no writeback */
3101 break; 3185 break;
3102 case 0x22: /* mov reg, cr */ 3186 case 0x22: /* mov reg, cr */
3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); 3187 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
3188 emulate_gp(ctxt, 0);
3189 goto done;
3190 }
3104 c->dst.type = OP_NONE; 3191 c->dst.type = OP_NONE;
3105 break; 3192 break;
3106 case 0x23: /* mov from reg to dr */ 3193 case 0x23: /* mov from reg to dr */
3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3194 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3195 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3196 emulate_ud(ctxt);
3197 goto done;
3198 }
3199
3200 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3201 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3202 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3203 /* #UD condition is already handled by the code above */
3204 emulate_gp(ctxt, 0);
3110 goto done; 3205 goto done;
3111 } 3206 }
3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); 3207
3113 c->dst.type = OP_NONE; /* no writeback */ 3208 c->dst.type = OP_NONE; /* no writeback */
3114 break; 3209 break;
3115 case 0x30: 3210 case 0x30:
3116 /* wrmsr */ 3211 /* wrmsr */
3117 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3212 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3213 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3214 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3120 kvm_inject_gp(ctxt->vcpu, 0); 3215 emulate_gp(ctxt, 0);
3121 goto done; 3216 goto done;
3122 } 3217 }
3123 rc = X86EMUL_CONTINUE; 3218 rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
3125 break; 3220 break;
3126 case 0x32: 3221 case 0x32:
3127 /* rdmsr */ 3222 /* rdmsr */
3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3223 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3129 kvm_inject_gp(ctxt->vcpu, 0); 3224 emulate_gp(ctxt, 0);
3130 goto done; 3225 goto done;
3131 } else { 3226 } else {
3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3227 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
3136 c->dst.type = OP_NONE; 3231 c->dst.type = OP_NONE;
3137 break; 3232 break;
3138 case 0x34: /* sysenter */ 3233 case 0x34: /* sysenter */
3139 rc = emulate_sysenter(ctxt); 3234 rc = emulate_sysenter(ctxt, ops);
3140 if (rc != X86EMUL_CONTINUE) 3235 if (rc != X86EMUL_CONTINUE)
3141 goto done; 3236 goto done;
3142 else 3237 else
3143 goto writeback; 3238 goto writeback;
3144 break; 3239 break;
3145 case 0x35: /* sysexit */ 3240 case 0x35: /* sysexit */
3146 rc = emulate_sysexit(ctxt); 3241 rc = emulate_sysexit(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE) 3242 if (rc != X86EMUL_CONTINUE)
3148 goto done; 3243 goto done;
3149 else 3244 else
@@ -3160,7 +3255,7 @@ twobyte_insn:
3160 c->dst.type = OP_NONE; 3255 c->dst.type = OP_NONE;
3161 break; 3256 break;
3162 case 0xa0: /* push fs */ 3257 case 0xa0: /* push fs */
3163 emulate_push_sreg(ctxt, VCPU_SREG_FS); 3258 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3164 break; 3259 break;
3165 case 0xa1: /* pop fs */ 3260 case 0xa1: /* pop fs */
3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3261 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
3179 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 3274 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3180 break; 3275 break;
3181 case 0xa8: /* push gs */ 3276 case 0xa8: /* push gs */
3182 emulate_push_sreg(ctxt, VCPU_SREG_GS); 3277 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3183 break; 3278 break;
3184 case 0xa9: /* pop gs */ 3279 case 0xa9: /* pop gs */
3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3280 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
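
Taken together, the emulate.c hunks route most of the remaining direct kvm_x86_ops and kvm_* calls through struct x86_emulate_ops. The callback shapes implied by the call sites above are roughly these (a partial sketch inferred from usage, not copied from arch/x86/include/asm/kvm_emulate.h, which remains the authoritative definition):

/* Partial, inferred sketch of the ops used above. */
struct x86_emulate_ops {
	/* ... */
	int	(*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
	int	(*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
	int	(*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
	int	(*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
	ulong	(*get_cr)(int cr, struct kvm_vcpu *vcpu);
	int	(*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
	void	(*set_cached_descriptor)(struct desc_struct *desc, int seg,
					 struct kvm_vcpu *vcpu);
	u16	(*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
	void	(*set_segment_selector)(u16 sel, int seg,
					struct kvm_vcpu *vcpu);
	ulong	(*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
	int	(*cpl)(struct kvm_vcpu *vcpu);
	/* ... */
};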
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25..0fd6378981f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.

8 * 9 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
33 34
34#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
36 38
37#include "irq.h" 39#include "irq.h"
38#include "i8254.h" 40#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
243{ 245{
244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
245 irq_ack_notifier); 247 irq_ack_notifier);
246 raw_spin_lock(&ps->inject_lock); 248 int value;
247 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 249
250 spin_lock(&ps->inject_lock);
251 value = atomic_dec_return(&ps->pit_timer.pending);
252 if (value < 0)
253 /* spurious acks can be generated if, for example, the
254 * PIC is being reset. Handle it gracefully here
255 */
248 atomic_inc(&ps->pit_timer.pending); 256 atomic_inc(&ps->pit_timer.pending);
257 else if (value > 0)
258 /* in this case, we had multiple outstanding pit interrupts
259 * that we needed to inject. Reinject
260 */
261 queue_work(ps->pit->wq, &ps->pit->expired);
249 ps->irq_ack = 1; 262 ps->irq_ack = 1;
250 raw_spin_unlock(&ps->inject_lock); 263 spin_unlock(&ps->inject_lock);
251} 264}
252 265
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 266void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 276 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 277}
265 278
266static void destroy_pit_timer(struct kvm_timer *pt) 279static void destroy_pit_timer(struct kvm_pit *pit)
267{ 280{
268 pr_debug("execute del timer!\n"); 281 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
269 hrtimer_cancel(&pt->timer); 282 cancel_work_sync(&pit->expired);
270} 283}
271 284
272static bool kpit_is_periodic(struct kvm_timer *ktimer) 285static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
280 .is_periodic = kpit_is_periodic, 293 .is_periodic = kpit_is_periodic,
281}; 294};
282 295
296static void pit_do_work(struct work_struct *work)
297{
298 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
299 struct kvm *kvm = pit->kvm;
300 struct kvm_vcpu *vcpu;
301 int i;
302 struct kvm_kpit_state *ps = &pit->pit_state;
303 int inject = 0;
304
305 /* Try to inject pending interrupts when
306 * last one has been acked.
307 */
308 spin_lock(&ps->inject_lock);
309 if (ps->irq_ack) {
310 ps->irq_ack = 0;
311 inject = 1;
312 }
313 spin_unlock(&ps->inject_lock);
314 if (inject) {
315 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
316 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
317
318 /*
319 * Provides NMI watchdog support via Virtual Wire mode.
320 * The route is: PIT -> PIC -> LVT0 in NMI mode.
321 *
322 * Note: Our Virtual Wire implementation is simplified, only
323 * propagating PIT interrupts to all VCPUs when they have set
324 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
325 * VCPU0, and only if its LVT0 is in EXTINT mode.
326 */
327 if (kvm->arch.vapics_in_nmi_mode > 0)
328 kvm_for_each_vcpu(i, vcpu, kvm)
329 kvm_apic_nmi_wd_deliver(vcpu);
330 }
331}
332
333static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
334{
335 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
336 struct kvm_pit *pt = ktimer->kvm->arch.vpit;
337
338 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
339 atomic_inc(&ktimer->pending);
340 queue_work(pt->wq, &pt->expired);
341 }
342
343 if (ktimer->t_ops->is_periodic(ktimer)) {
344 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
345 return HRTIMER_RESTART;
346 } else
347 return HRTIMER_NORESTART;
348}
349
283static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 350static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284{ 351{
285 struct kvm_timer *pt = &ps->pit_timer; 352 struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
291 358
292 /* TODO The new value only affected after the retriggered */ 359 /* TODO The new value only affected after the retriggered */
293 hrtimer_cancel(&pt->timer); 360 hrtimer_cancel(&pt->timer);
361 cancel_work_sync(&ps->pit->expired);
294 pt->period = interval; 362 pt->period = interval;
295 ps->is_periodic = is_period; 363 ps->is_periodic = is_period;
296 364
297 pt->timer.function = kvm_timer_fn; 365 pt->timer.function = pit_timer_fn;
298 pt->t_ops = &kpit_ops; 366 pt->t_ops = &kpit_ops;
299 pt->kvm = ps->pit->kvm; 367 pt->kvm = ps->pit->kvm;
300 pt->vcpu = pt->kvm->bsp_vcpu;
301 368
302 atomic_set(&pt->pending, 0); 369 atomic_set(&pt->pending, 0);
303 ps->irq_ack = 1; 370 ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
346 } 413 }
347 break; 414 break;
348 default: 415 default:
349 destroy_pit_timer(&ps->pit_timer); 416 destroy_pit_timer(kvm->arch.vpit);
350 } 417 }
351} 418}
352 419
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
625 692
626 mutex_init(&pit->pit_state.lock); 693 mutex_init(&pit->pit_state.lock);
627 mutex_lock(&pit->pit_state.lock); 694 mutex_lock(&pit->pit_state.lock);
628 raw_spin_lock_init(&pit->pit_state.inject_lock); 695 spin_lock_init(&pit->pit_state.inject_lock);
696
697 pit->wq = create_singlethread_workqueue("kvm-pit-wq");
698 if (!pit->wq) {
699 mutex_unlock(&pit->pit_state.lock);
700 kfree(pit);
701 return NULL;
702 }
703 INIT_WORK(&pit->expired, pit_do_work);
629 704
630 kvm->arch.vpit = pit; 705 kvm->arch.vpit = pit;
631 pit->kvm = kvm; 706 pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
677 struct hrtimer *timer; 752 struct hrtimer *timer;
678 753
679 if (kvm->arch.vpit) { 754 if (kvm->arch.vpit) {
755 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
756 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
757 &kvm->arch.vpit->speaker_dev);
680 kvm_unregister_irq_mask_notifier(kvm, 0, 758 kvm_unregister_irq_mask_notifier(kvm, 0,
681 &kvm->arch.vpit->mask_notifier); 759 &kvm->arch.vpit->mask_notifier);
682 kvm_unregister_irq_ack_notifier(kvm, 760 kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
684 mutex_lock(&kvm->arch.vpit->pit_state.lock); 762 mutex_lock(&kvm->arch.vpit->pit_state.lock);
685 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 763 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
686 hrtimer_cancel(timer); 764 hrtimer_cancel(timer);
765 cancel_work_sync(&kvm->arch.vpit->expired);
687 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 766 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
688 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 767 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
768 destroy_workqueue(kvm->arch.vpit->wq);
689 kfree(kvm->arch.vpit); 769 kfree(kvm->arch.vpit);
690 } 770 }
691} 771}
692
693static void __inject_pit_timer_intr(struct kvm *kvm)
694{
695 struct kvm_vcpu *vcpu;
696 int i;
697
698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
700
701 /*
702 * Provides NMI watchdog support via Virtual Wire mode.
703 * The route is: PIT -> PIC -> LVT0 in NMI mode.
704 *
705 * Note: Our Virtual Wire implementation is simplified, only
706 * propagating PIT interrupts to all VCPUs when they have set
707 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
708 * VCPU0, and only if its LVT0 is in EXTINT mode.
709 */
710 if (kvm->arch.vapics_in_nmi_mode > 0)
711 kvm_for_each_vcpu(i, vcpu, kvm)
712 kvm_apic_nmi_wd_deliver(vcpu);
713}
714
715void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
716{
717 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
718 struct kvm *kvm = vcpu->kvm;
719 struct kvm_kpit_state *ps;
720
721 if (pit) {
722 int inject = 0;
723 ps = &pit->pit_state;
724
725 /* Try to inject pending interrupts when
726 * last one has been acked.
727 */
728 raw_spin_lock(&ps->inject_lock);
729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
730 ps->irq_ack = 0;
731 inject = 1;
732 }
733 raw_spin_unlock(&ps->inject_lock);
734 if (inject)
735 __inject_pit_timer_intr(kvm);
736 }
737}
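
The net effect of the i8254.c changes: PIT interrupt injection moves off the vcpu-entry path (kvm_inject_pit_timer_irqs() is deleted above) onto a dedicated workqueue, so the hrtimer callback only bumps the pending count and queues work, and pit_do_work() injects in process context. Condensed to just the workqueue calls, all taken from the hunks above:

/* Lifecycle of the new injection path, condensed from this diff: */
pit->wq = create_singlethread_workqueue("kvm-pit-wq");	/* kvm_create_pit() */
INIT_WORK(&pit->expired, pit_do_work);

queue_work(pit->wq, &pit->expired);	/* pit_timer_fn() and the ack path */

cancel_work_sync(&pit->expired);	/* create_pit_timer(), kvm_free_pit() */
destroy_workqueue(pit->wq);		/* kvm_free_pit() */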
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c..46d08ca0b48 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 raw_spinlock_t inject_lock; 30 spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
@@ -40,6 +40,8 @@ struct kvm_pit {
40 struct kvm_kpit_state pit_state; 40 struct kvm_kpit_state pit_state;
41 int irq_source_id; 41 int irq_source_id;
42 struct kvm_irq_mask_notifier mask_notifier; 42 struct kvm_irq_mask_notifier mask_notifier;
43 struct workqueue_struct *wq;
44 struct work_struct expired;
43}; 45};
44 46
45#define KVM_PIT_BASE_ADDRESS 0x40 47#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338..8d10c063d7f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
 6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
6 * 7 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
33#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
34#include "trace.h" 35#include "trace.h"
35 36
37static void pic_irq_request(struct kvm *kvm, int level);
38
36static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock) 40 __acquires(&s->lock)
38{ 41{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock) 46 __releases(&s->lock)
44{ 47{
45 bool wakeup = s->wakeup_needed; 48 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu; 49 struct kvm_vcpu *vcpu, *found = NULL;
50 int i;
47 51
48 s->wakeup_needed = false; 52 s->wakeup_needed = false;
49 53
50 raw_spin_unlock(&s->lock); 54 raw_spin_unlock(&s->lock);
51 55
52 if (wakeup) { 56 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu; 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
54 if (vcpu) 58 if (kvm_apic_accept_pic_intr(vcpu)) {
55 kvm_vcpu_kick(vcpu); 59 found = vcpu;
60 break;
61 }
62 }
63
64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 kvm_vcpu_kick(found);
56 } 68 }
57} 69}
58 70
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
173 pic_set_irq1(&s->pics[0], 2, 0); 185 pic_set_irq1(&s->pics[0], 2, 0);
174 } 186 }
175 irq = pic_get_irq(&s->pics[0]); 187 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) 188 pic_irq_request(s->kvm, irq >= 0);
177 s->irq_request(s->irq_request_opaque, 1);
178 else
179 s->irq_request(s->irq_request_opaque, 0);
180} 189}
181 190
182void kvm_pic_update_irq(struct kvm_pic *s) 191void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
261void kvm_pic_reset(struct kvm_kpic_state *s) 270void kvm_pic_reset(struct kvm_kpic_state *s)
262{ 271{
263 int irq; 272 int irq;
264 struct kvm *kvm = s->pics_state->irq_request_opaque; 273 struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
265 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
266 u8 irr = s->irr, isr = s->imr; 274 u8 irr = s->irr, isr = s->imr;
267 275
268 s->last_irr = 0; 276 s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
301 /* 309 /*
302 * deassert a pending interrupt 310 * deassert a pending interrupt
303 */ 311 */
304 s->pics_state->irq_request(s->pics_state-> 312 pic_irq_request(s->pics_state->kvm, 0);
305 irq_request_opaque, 0);
306 s->init_state = 1; 313 s->init_state = 1;
307 s->init4 = val & 1; 314 s->init4 = val & 1;
308 if (val & 0x02) 315 if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
356 } 363 }
357 } else 364 } else
358 switch (s->init_state) { 365 switch (s->init_state) {
359 case 0: /* normal mode */ 366 case 0: { /* normal mode */
367 u8 imr_diff = s->imr ^ val,
368 off = (s == &s->pics_state->pics[0]) ? 0 : 8;
360 s->imr = val; 369 s->imr = val;
370 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
371 if (imr_diff & (1 << irq))
372 kvm_fire_mask_notifiers(
373 s->pics_state->kvm,
374 SELECT_PIC(irq + off),
375 irq + off,
376 !!(s->imr & (1 << irq)));
361 pic_update_irq(s->pics_state); 377 pic_update_irq(s->pics_state);
362 break; 378 break;
379 }
363 case 1: 380 case 1:
364 s->irq_base = val & 0xf8; 381 s->irq_base = val & 0xf8;
365 s->init_state = 2; 382 s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
518/* 535/*
519 * callback when PIC0 irq status changed 536 * callback when PIC0 irq status changed
520 */ 537 */
521static void pic_irq_request(void *opaque, int level) 538static void pic_irq_request(struct kvm *kvm, int level)
522{ 539{
523 struct kvm *kvm = opaque;
524 struct kvm_vcpu *vcpu = kvm->bsp_vcpu; 540 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
525 struct kvm_pic *s = pic_irqchip(kvm); 541 struct kvm_pic *s = pic_irqchip(kvm);
526 int irq = pic_get_irq(&s->pics[0]); 542 int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
549 s->kvm = kvm; 565 s->kvm = kvm;
550 s->pics[0].elcr_mask = 0xf8; 566 s->pics[0].elcr_mask = 0xf8;
551 s->pics[1].elcr_mask = 0xde; 567 s->pics[1].elcr_mask = 0xde;
552 s->irq_request = pic_irq_request;
553 s->irq_request_opaque = kvm;
554 s->pics[0].pics_state = s; 568 s->pics[0].pics_state = s;
555 s->pics[1].pics_state = s; 569 s->pics[1].pics_state = s;
556 570
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a..2095a049835 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
 4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
89void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 90void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
90{ 91{
91 kvm_inject_apic_timer_irqs(vcpu); 92 kvm_inject_apic_timer_irqs(vcpu);
92 kvm_inject_pit_timer_irqs(vcpu);
93 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
94} 94}
95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); 95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413..ffed06871c5 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
38struct kvm; 38struct kvm;
39struct kvm_vcpu; 39struct kvm_vcpu;
40 40
41typedef void irq_request_func(void *opaque, int level);
42
43struct kvm_kpic_state { 41struct kvm_kpic_state {
44 u8 last_irr; /* edge detection */ 42 u8 last_irr; /* edge detection */
45 u8 irr; /* interrupt request register */ 43 u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
67 unsigned pending_acks; 65 unsigned pending_acks;
68 struct kvm *kvm; 66 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
70 irq_request_func *irq_request;
71 void *irq_request_opaque;
72 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
73 struct kvm_io_device dev; 69 struct kvm_io_device dev;
74 void (*ack_notifier)(void *opaque, int irq); 70 void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf532..6491ac8e755 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
36 36
37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38{ 38{
39 might_sleep(); /* on svm */
40
39 if (!test_bit(VCPU_EXREG_PDPTR, 41 if (!test_bit(VCPU_EXREG_PDPTR,
40 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
41 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
69 return kvm_read_cr4_bits(vcpu, ~0UL); 71 return kvm_read_cr4_bits(vcpu, ~0UL);
70} 72}
71 73
74static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
75{
76 return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78}
79
72#endif 80#endif
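
The new kvm_read_edx_eax() helper packs the guest's EDX:EAX pair into a single 64-bit value; the & -1u masks truncate each register read to its low 32 bits before the shift. A hypothetical call site (the actual users are elsewhere and not shown in this diff):

/* Hypothetical usage, e.g. when emulating an instruction whose 64-bit
 * operand arrives in EDX:EAX: */
u64 data = kvm_read_edx_eax(vcpu);	/* (RDX[31:0] << 32) | RAX[31:0] */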
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9..77d8c0f4817 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
328 "dest_mode 0x%x, short_hand 0x%x\n", 329 "dest_mode 0x%x, short_hand 0x%x\n",
329 target, source, dest, dest_mode, short_hand); 330 target, source, dest, dest_mode, short_hand);
330 331
331 ASSERT(!target); 332 ASSERT(target);
332 switch (short_hand) { 333 switch (short_hand) {
333 case APIC_DEST_NOSHORT: 334 case APIC_DEST_NOSHORT:
334 if (dest_mode == 0) 335 if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
533 struct kvm_vcpu *vcpu = apic->vcpu; 534 struct kvm_vcpu *vcpu = apic->vcpu;
534 struct kvm_run *run = vcpu->run; 535 struct kvm_run *run = vcpu->run;
535 536
536 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 537 kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
537 run->tpr_access.rip = kvm_rip_read(vcpu); 538 run->tpr_access.rip = kvm_rip_read(vcpu);
538 run->tpr_access.is_write = write; 539 run->tpr_access.is_write = write;
539} 540}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1106 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1107 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1107 int r = 0; 1108 int r = 0;
1108 1109
1109 if (kvm_vcpu_is_bsp(vcpu)) { 1110 if (!apic_hw_enabled(vcpu->arch.apic))
1110 if (!apic_hw_enabled(vcpu->arch.apic)) 1111 r = 1;
1111 r = 1; 1112 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1112 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1114 r = 1;
1114 r = 1;
1115 }
1116 return r; 1115 return r;
1117} 1116}
1118 1117
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a591..311f6dad895 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
32#include <linux/compiler.h> 33#include <linux/compiler.h>
33#include <linux/srcu.h> 34#include <linux/srcu.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/uaccess.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/cmpxchg.h> 39#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
90#define PT_FIRST_AVAIL_BITS_SHIFT 9 92#define PT_FIRST_AVAIL_BITS_SHIFT 9
91#define PT64_SECOND_AVAIL_BITS_SHIFT 52 93#define PT64_SECOND_AVAIL_BITS_SHIFT 52
92 94
93#define VALID_PAGE(x) ((x) != INVALID_PAGE)
94
95#define PT64_LEVEL_BITS 9 95#define PT64_LEVEL_BITS 9
96 96
97#define PT64_LEVEL_SHIFT(level) \ 97#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
173 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
174 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
175 175
176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); 176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
177 177
178static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte)
281 281
282static void __set_spte(u64 *sptep, u64 spte) 282static void __set_spte(u64 *sptep, u64 spte)
283{ 283{
284 set_64bit(sptep, spte);
285}
286
287static u64 __xchg_spte(u64 *sptep, u64 new_spte)
288{
284#ifdef CONFIG_X86_64 289#ifdef CONFIG_X86_64
285 set_64bit((unsigned long *)sptep, spte); 290 return xchg(sptep, new_spte);
286#else 291#else
287 set_64bit((unsigned long long *)sptep, spte); 292 u64 old_spte;
293
294 do {
295 old_spte = *sptep;
296 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
297
298 return old_spte;
288#endif 299#endif
289} 300}
290 301
302static void update_spte(u64 *sptep, u64 new_spte)
303{
304 u64 old_spte;
305
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
307 !is_rmap_spte(*sptep))
308 __set_spte(sptep, new_spte);
309 else {
310 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask)
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
313 }
314}
315
291static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292 struct kmem_cache *base_cache, int min) 317 struct kmem_cache *base_cache, int min)
293{ 318{
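
The 32-bit branch of __xchg_spte() above emulates an atomic 64-bit exchange with a cmpxchg64 retry loop, since i386 has no native 64-bit xchg. A user-space sketch of the same retry pattern, using the GCC __sync builtin as a stand-in for the kernel's cmpxchg64():

#include <stdint.h>

/* Sketch: atomically exchange *p for new_val and return the value replaced. */
static uint64_t xchg64(volatile uint64_t *p, uint64_t new_val)
{
	uint64_t old;

	do {
		old = *p;	/* snapshot; may be stale by the time we swap */
	} while (__sync_val_compare_and_swap(p, old, new_val) != old);

	return old;
}
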
@@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
304 return 0; 329 return 0;
305} 330}
306 331
307static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 332static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
333 struct kmem_cache *cache)
308{ 334{
309 while (mc->nobjs) 335 while (mc->nobjs)
310 kfree(mc->objects[--mc->nobjs]); 336 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
311} 337}
312 338
313static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +381,11 @@ out:
355 381
356static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 382static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
357{ 383{
358 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); 384 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
359 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); 385 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
360 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 386 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
361 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 387 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
388 mmu_page_header_cache);
362} 389}
363 390
364static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 391static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
379 406
380static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 407static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
381{ 408{
382 kfree(pc); 409 kmem_cache_free(pte_chain_cache, pc);
383} 410}
384 411
385static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 412static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
390 417
391static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 418static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
392{ 419{
393 kfree(rd); 420 kmem_cache_free(rmap_desc_cache, rd);
421}
422
423static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
424{
425 if (!sp->role.direct)
426 return sp->gfns[index];
427
428 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
429}
430
431static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
432{
433 if (sp->role.direct)
434 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
435 else
436 sp->gfns[index] = gfn;
394} 437}
395 438
396/* 439/*
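
kvm_mmu_page_get_gfn() above exploits the fact that a direct-mapped shadow page covers a contiguous gfn range, so the gfn behind entry N can be computed from the page's base gfn instead of being stored in a per-entry gfns[] array (which kvm_mmu_alloc_page later stops allocating for direct pages). A sketch of that arithmetic:

#include <stdint.h>

#define PT64_LEVEL_BITS 9	/* 512 entries per page table level */

typedef uint64_t gfn_t;

/* Entry N of a level-L direct page maps base_gfn + N * 512^(L-1). */
static gfn_t direct_gfn(gfn_t base_gfn, int level, int index)
{
	return base_gfn + ((gfn_t)index << ((level - 1) * PT64_LEVEL_BITS));
}
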
@@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn,
403{ 446{
404 unsigned long idx; 447 unsigned long idx;
405 448
406 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
407 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
408 return &slot->lpage_info[level - 2][idx].write_count; 451 return &slot->lpage_info[level - 2][idx].write_count;
409} 452}
410 453
@@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
414 int *write_count; 457 int *write_count;
415 int i; 458 int i;
416 459
417 gfn = unalias_gfn(kvm, gfn); 460 slot = gfn_to_memslot(kvm, gfn);
418
419 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 for (i = PT_DIRECTORY_LEVEL; 461 for (i = PT_DIRECTORY_LEVEL;
421 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
422 write_count = slot_largepage_idx(gfn, slot, i); 463 write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
430 int *write_count; 471 int *write_count;
431 int i; 472 int i;
432 473
433 gfn = unalias_gfn(kvm, gfn); 474 slot = gfn_to_memslot(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
435 for (i = PT_DIRECTORY_LEVEL; 475 for (i = PT_DIRECTORY_LEVEL;
436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
437 write_count = slot_largepage_idx(gfn, slot, i); 477 write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm,
447 struct kvm_memory_slot *slot; 487 struct kvm_memory_slot *slot;
448 int *largepage_idx; 488 int *largepage_idx;
449 489
450 gfn = unalias_gfn(kvm, gfn); 490 slot = gfn_to_memslot(kvm, gfn);
451 slot = gfn_to_memslot_unaliased(kvm, gfn);
452 if (slot) { 491 if (slot) {
453 largepage_idx = slot_largepage_idx(gfn, slot, level); 492 largepage_idx = slot_largepage_idx(gfn, slot, level);
454 return *largepage_idx; 493 return *largepage_idx;
@@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
501 540
502/* 541/*
503 * Take gfn and return the reverse mapping to it. 542 * Take gfn and return the reverse mapping to it.
504 * Note: gfn must be unaliased before this function get called
505 */ 543 */
506 544
507static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
513 if (likely(level == PT_PAGE_TABLE_LEVEL)) 551 if (likely(level == PT_PAGE_TABLE_LEVEL))
514 return &slot->rmap[gfn - slot->base_gfn]; 552 return &slot->rmap[gfn - slot->base_gfn];
515 553
516 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
517 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
518 556
519 return &slot->lpage_info[level - 2][idx].rmap_pde; 557 return &slot->lpage_info[level - 2][idx].rmap_pde;
520} 558}
@@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
541 579
542 if (!is_rmap_spte(*spte)) 580 if (!is_rmap_spte(*spte))
543 return count; 581 return count;
544 gfn = unalias_gfn(vcpu->kvm, gfn);
545 sp = page_header(__pa(spte)); 582 sp = page_header(__pa(spte));
546 sp->gfns[spte - sp->spt] = gfn; 583 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
547 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 584 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
548 if (!*rmapp) { 585 if (!*rmapp) {
549 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 586 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
600 struct kvm_rmap_desc *desc; 637 struct kvm_rmap_desc *desc;
601 struct kvm_rmap_desc *prev_desc; 638 struct kvm_rmap_desc *prev_desc;
602 struct kvm_mmu_page *sp; 639 struct kvm_mmu_page *sp;
603 pfn_t pfn; 640 gfn_t gfn;
604 unsigned long *rmapp; 641 unsigned long *rmapp;
605 int i; 642 int i;
606 643
607 if (!is_rmap_spte(*spte))
608 return;
609 sp = page_header(__pa(spte)); 644 sp = page_header(__pa(spte));
610 pfn = spte_to_pfn(*spte); 645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
611 if (*spte & shadow_accessed_mask) 646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
612 kvm_set_pfn_accessed(pfn);
613 if (is_writable_pte(*spte))
614 kvm_set_pfn_dirty(pfn);
615 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
616 if (!*rmapp) { 647 if (!*rmapp) {
617 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
618 BUG(); 649 BUG();
@@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
644 } 675 }
645} 676}
646 677
678static void set_spte_track_bits(u64 *sptep, u64 new_spte)
679{
680 pfn_t pfn;
681 u64 old_spte = *sptep;
682
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte);
686 } else
687 old_spte = __xchg_spte(sptep, new_spte);
688
689 if (!is_rmap_spte(old_spte))
690 return;
691 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte))
695 kvm_set_pfn_dirty(pfn);
696}
697
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{
700 set_spte_track_bits(sptep, new_spte);
701 rmap_remove(kvm, sptep);
702}
703
647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
648{ 705{
649 struct kvm_rmap_desc *desc; 706 struct kvm_rmap_desc *desc;
@@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
676 u64 *spte; 733 u64 *spte;
677 int i, write_protected = 0; 734 int i, write_protected = 0;
678 735
679 gfn = unalias_gfn(kvm, gfn);
680 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 736 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
681 737
682 spte = rmap_next(kvm, rmapp, NULL); 738 spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
685 BUG_ON(!(*spte & PT_PRESENT_MASK)); 741 BUG_ON(!(*spte & PT_PRESENT_MASK));
686 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 742 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
687 if (is_writable_pte(*spte)) { 743 if (is_writable_pte(*spte)) {
688 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 744 update_spte(spte, *spte & ~PT_WRITABLE_MASK);
689 write_protected = 1; 745 write_protected = 1;
690 } 746 }
691 spte = rmap_next(kvm, rmapp, spte); 747 spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
709 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 765 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
710 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 766 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
711 if (is_writable_pte(*spte)) { 767 if (is_writable_pte(*spte)) {
712 rmap_remove(kvm, spte); 768 drop_spte(kvm, spte,
769 shadow_trap_nonpresent_pte);
713 --kvm->stat.lpages; 770 --kvm->stat.lpages;
714 __set_spte(spte, shadow_trap_nonpresent_pte);
715 spte = NULL; 771 spte = NULL;
716 write_protected = 1; 772 write_protected = 1;
717 } 773 }
@@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
731 while ((spte = rmap_next(kvm, rmapp, NULL))) { 787 while ((spte = rmap_next(kvm, rmapp, NULL))) {
732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 788 BUG_ON(!(*spte & PT_PRESENT_MASK));
733 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 789 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
734 rmap_remove(kvm, spte); 790 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
735 __set_spte(spte, shadow_trap_nonpresent_pte);
736 need_tlb_flush = 1; 791 need_tlb_flush = 1;
737 } 792 }
738 return need_tlb_flush; 793 return need_tlb_flush;
@@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
754 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 809 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
755 need_flush = 1; 810 need_flush = 1;
756 if (pte_write(*ptep)) { 811 if (pte_write(*ptep)) {
757 rmap_remove(kvm, spte); 812 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
758 __set_spte(spte, shadow_trap_nonpresent_pte);
759 spte = rmap_next(kvm, rmapp, NULL); 813 spte = rmap_next(kvm, rmapp, NULL);
760 } else { 814 } else {
761 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 815 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
763 817
764 new_spte &= ~PT_WRITABLE_MASK; 818 new_spte &= ~PT_WRITABLE_MASK;
765 new_spte &= ~SPTE_HOST_WRITEABLE; 819 new_spte &= ~SPTE_HOST_WRITEABLE;
766 if (is_writable_pte(*spte)) 820 new_spte &= ~shadow_accessed_mask;
767 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 821 set_spte_track_bits(spte, new_spte);
768 __set_spte(spte, new_spte);
769 spte = rmap_next(kvm, rmapp, spte); 822 spte = rmap_next(kvm, rmapp, spte);
770 } 823 }
771 } 824 }
@@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
799 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 852 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
800 853
801 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
802 int idx = gfn_offset; 855 unsigned long idx;
803 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 856 int sh;
857
858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
859 idx = ((memslot->base_gfn+gfn_offset) >> sh) -
860 (memslot->base_gfn >> sh);
804 ret |= handler(kvm, 861 ret |= handler(kvm,
805 &memslot->lpage_info[j][idx].rmap_pde, 862 &memslot->lpage_info[j][idx].rmap_pde,
806 data); 863 data);
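
The rewritten index calculation above matches what slot_largepage_idx() does: both gfns are shifted down by the huge-page order before subtracting, which stays consistent with lpage_info[] indexing even when the memslot's base_gfn is not huge-page aligned, whereas dividing the raw offset was not. A sketch of the arithmetic, with hpage_shift standing in for KVM_HPAGE_GFN_SHIFT(level):

#include <stdint.h>

typedef uint64_t gfn_t;

/* Index of the huge page covering base_gfn + gfn_offset, relative to the slot. */
static unsigned long lpage_idx(gfn_t base_gfn, gfn_t gfn_offset, int hpage_shift)
{
	return ((base_gfn + gfn_offset) >> hpage_shift)
	     - (base_gfn >> hpage_shift);
}
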
@@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
863 920
864 sp = page_header(__pa(spte)); 921 sp = page_header(__pa(spte));
865 922
866 gfn = unalias_gfn(vcpu->kvm, gfn);
867 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 923 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
868 924
869 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 925 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt)
894static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
895{ 951{
896 ASSERT(is_empty_shadow_page(sp->spt)); 952 ASSERT(is_empty_shadow_page(sp->spt));
953 hlist_del(&sp->hash_link);
897 list_del(&sp->link); 954 list_del(&sp->link);
898 __free_page(virt_to_page(sp->spt)); 955 __free_page(virt_to_page(sp->spt));
899 __free_page(virt_to_page(sp->gfns)); 956 if (!sp->role.direct)
900 kfree(sp); 957 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp);
901 ++kvm->arch.n_free_mmu_pages; 959 ++kvm->arch.n_free_mmu_pages;
902} 960}
903 961
@@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
907} 965}
908 966
909static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 967static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
910 u64 *parent_pte) 968 u64 *parent_pte, int direct)
911{ 969{
912 struct kvm_mmu_page *sp; 970 struct kvm_mmu_page *sp;
913 971
914 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 972 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
915 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 973 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 974 if (!direct)
975 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
976 PAGE_SIZE);
917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 977 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 978 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
998 BUG(); 1058 BUG();
999} 1059}
1000 1060
1001
1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1061static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1003{ 1062{
1004 struct kvm_pte_chain *pte_chain; 1063 struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1008 1067
1009 if (!sp->multimapped && sp->parent_pte) { 1068 if (!sp->multimapped && sp->parent_pte) {
1010 parent_sp = page_header(__pa(sp->parent_pte)); 1069 parent_sp = page_header(__pa(sp->parent_pte));
1011 fn(parent_sp); 1070 fn(parent_sp, sp->parent_pte);
1012 mmu_parent_walk(parent_sp, fn);
1013 return; 1071 return;
1014 } 1072 }
1073
1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1074 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1016 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1075 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1017 if (!pte_chain->parent_ptes[i]) 1076 u64 *spte = pte_chain->parent_ptes[i];
1077
1078 if (!spte)
1018 break; 1079 break;
1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1080 parent_sp = page_header(__pa(spte));
1020 fn(parent_sp); 1081 fn(parent_sp, spte);
1021 mmu_parent_walk(parent_sp, fn);
1022 } 1082 }
1023} 1083}
1024 1084
1025static void kvm_mmu_update_unsync_bitmap(u64 *spte) 1085static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1086static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1026{ 1087{
1027 unsigned int index; 1088 mmu_parent_walk(sp, mark_unsync);
1028 struct kvm_mmu_page *sp = page_header(__pa(spte));
1029
1030 index = spte - sp->spt;
1031 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
1032 sp->unsync_children++;
1033 WARN_ON(!sp->unsync_children);
1034} 1089}
1035 1090
1036static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 1091static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1037{ 1092{
1038 struct kvm_pte_chain *pte_chain; 1093 unsigned int index;
1039 struct hlist_node *node;
1040 int i;
1041 1094
1042 if (!sp->parent_pte) 1095 index = spte - sp->spt;
1096 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1043 return; 1097 return;
1044 1098 if (sp->unsync_children++)
1045 if (!sp->multimapped) {
1046 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
1047 return; 1099 return;
1048 } 1100 kvm_mmu_mark_parents_unsync(sp);
1049
1050 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1051 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1052 if (!pte_chain->parent_ptes[i])
1053 break;
1054 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
1055 }
1056}
1057
1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1059{
1060 kvm_mmu_update_parents_unsync(sp);
1061 return 1;
1062}
1063
1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1065{
1066 mmu_parent_walk(sp, unsync_walk_fn);
1067 kvm_mmu_update_parents_unsync(sp);
1068} 1101}
1069 1102
1070static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1103static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
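
mark_unsync() above folds the old two-pass walk into a single upward propagation: each parent's unsync_children counter is bumped, and the climb stops as soon as a page that was already non-zero is reached, because its own ancestors were marked by an earlier call. A simplified stand-alone sketch of that early-stopping propagation (a single-parent chain stands in for the pte_chain walk, and the per-spte bitmap check is omitted):

#include <stdio.h>

struct page {
	struct page *parent;
	int unsync_children;
};

static void mark_unsync(struct page *sp)
{
	if (sp->unsync_children++)	/* ancestors were already marked */
		return;
	if (sp->parent)
		mark_unsync(sp->parent);
}

int main(void)
{
	struct page root  = { NULL,  0 };
	struct page mid   = { &root, 0 };
	struct page leaf1 = { &mid,  0 }, leaf2 = { &mid, 0 };

	mark_unsync(&leaf1);
	mark_unsync(&leaf2);
	/* prints root=1 mid=2: the second call stopped at mid */
	printf("root=%d mid=%d\n", root.unsync_children, mid.unsync_children);
	return 0;
}
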
@@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1077} 1110}
1078 1111
1079static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1080 struct kvm_mmu_page *sp) 1113 struct kvm_mmu_page *sp, bool clear_unsync)
1081{ 1114{
1082 return 1; 1115 return 1;
1083} 1116}
@@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1123 int i, ret, nr_unsync_leaf = 0; 1156 int i, ret, nr_unsync_leaf = 0;
1124 1157
1125 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1158 for_each_unsync_children(sp->unsync_child_bitmap, i) {
1159 struct kvm_mmu_page *child;
1126 u64 ent = sp->spt[i]; 1160 u64 ent = sp->spt[i];
1127 1161
1128 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { 1162 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1129 struct kvm_mmu_page *child; 1163 goto clear_child_bitmap;
1130 child = page_header(ent & PT64_BASE_ADDR_MASK); 1164
1131 1165 child = page_header(ent & PT64_BASE_ADDR_MASK);
1132 if (child->unsync_children) { 1166
1133 if (mmu_pages_add(pvec, child, i)) 1167 if (child->unsync_children) {
1134 return -ENOSPC; 1168 if (mmu_pages_add(pvec, child, i))
1135 1169 return -ENOSPC;
1136 ret = __mmu_unsync_walk(child, pvec); 1170
1137 if (!ret) 1171 ret = __mmu_unsync_walk(child, pvec);
1138 __clear_bit(i, sp->unsync_child_bitmap); 1172 if (!ret)
1139 else if (ret > 0) 1173 goto clear_child_bitmap;
1140 nr_unsync_leaf += ret; 1174 else if (ret > 0)
1141 else 1175 nr_unsync_leaf += ret;
1142 return ret; 1176 else
1143 } 1177 return ret;
1178 } else if (child->unsync) {
1179 nr_unsync_leaf++;
1180 if (mmu_pages_add(pvec, child, i))
1181 return -ENOSPC;
1182 } else
1183 goto clear_child_bitmap;
1144 1184
1145 if (child->unsync) { 1185 continue;
1146 nr_unsync_leaf++; 1186
1147 if (mmu_pages_add(pvec, child, i)) 1187clear_child_bitmap:
1148 return -ENOSPC; 1188 __clear_bit(i, sp->unsync_child_bitmap);
1149 } 1189 sp->unsync_children--;
1150 } 1190 WARN_ON((int)sp->unsync_children < 0);
1151 } 1191 }
1152 1192
1153 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
1154 sp->unsync_children = 0;
1155 1193
1156 return nr_unsync_leaf; 1194 return nr_unsync_leaf;
1157} 1195}
@@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1166 return __mmu_unsync_walk(sp, pvec); 1204 return __mmu_unsync_walk(sp, pvec);
1167} 1205}
1168 1206
1169static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1170{
1171 unsigned index;
1172 struct hlist_head *bucket;
1173 struct kvm_mmu_page *sp;
1174 struct hlist_node *node;
1175
1176 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1177 index = kvm_page_table_hashfn(gfn);
1178 bucket = &kvm->arch.mmu_page_hash[index];
1179 hlist_for_each_entry(sp, node, bucket, hash_link)
1180 if (sp->gfn == gfn && !sp->role.direct
1181 && !sp->role.invalid) {
1182 pgprintk("%s: found role %x\n",
1183 __func__, sp->role.word);
1184 return sp;
1185 }
1186 return NULL;
1187}
1188
1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1207static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1190{ 1208{
1191 WARN_ON(!sp->unsync); 1209 WARN_ON(!sp->unsync);
@@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1194 --kvm->stat.mmu_unsync; 1212 --kvm->stat.mmu_unsync;
1195} 1213}
1196 1214
1197static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); 1215static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1216 struct list_head *invalid_list);
1217static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1218 struct list_head *invalid_list);
1198 1219
1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1220#define for_each_gfn_sp(kvm, sp, gfn, pos) \
1221 hlist_for_each_entry(sp, pos, \
1222 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1223 if ((sp)->gfn != (gfn)) {} else
1224
1225#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1226 hlist_for_each_entry(sp, pos, \
1227 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1228 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1229 (sp)->role.invalid) {} else
1230
1231/* @sp->gfn should be write-protected at the call site */
1232static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1233 struct list_head *invalid_list, bool clear_unsync)
1200{ 1234{
1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1235 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1202 kvm_mmu_zap_page(vcpu->kvm, sp); 1236 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1203 return 1; 1237 return 1;
1204 } 1238 }
1205 1239
1206 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1240 if (clear_unsync)
1207 kvm_flush_remote_tlbs(vcpu->kvm); 1241 kvm_unlink_unsync_page(vcpu->kvm, sp);
1208 kvm_unlink_unsync_page(vcpu->kvm, sp); 1242
1209 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1210 kvm_mmu_zap_page(vcpu->kvm, sp); 1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1211 return 1; 1245 return 1;
1212 } 1246 }
1213 1247
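
The for_each_gfn_sp()/for_each_gfn_indirect_valid_sp() macros above replace the open-coded hash-bucket walks. They rely on a small C idiom: tacking "if (cond) {} else" onto the loop, so the statement the caller writes after the macro becomes the else branch and only runs for matching entries, while the macro still behaves syntactically like a normal for loop. A stand-alone illustration of the idiom (the list and field names here are made up):

#include <stdio.h>

struct node { int gfn; struct node *next; };

#define for_each_gfn_node(pos, head, want)			\
	for ((pos) = (head); (pos); (pos) = (pos)->next)	\
		if ((pos)->gfn != (want)) {} else

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 3, &b };
	struct node *pos;

	for_each_gfn_node(pos, &a, 3)
		printf("matched gfn %d\n", pos->gfn);	/* runs for a and c only */
	return 0;
}
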
@@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1215 return 0; 1249 return 0;
1216} 1250}
1217 1251
1252static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1253 struct kvm_mmu_page *sp)
1254{
1255 LIST_HEAD(invalid_list);
1256 int ret;
1257
1258 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1259 if (ret)
1260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1261
1262 return ret;
1263}
1264
1265static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1266 struct list_head *invalid_list)
1267{
1268 return __kvm_sync_page(vcpu, sp, invalid_list, true);
1269}
1270
1271/* @gfn should be write-protected at the call site */
1272static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1273{
1274 struct kvm_mmu_page *s;
1275 struct hlist_node *node;
1276 LIST_HEAD(invalid_list);
1277 bool flush = false;
1278
1279 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1280 if (!s->unsync)
1281 continue;
1282
1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 continue;
1288 }
1289 kvm_unlink_unsync_page(vcpu->kvm, s);
1290 flush = true;
1291 }
1292
1293 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1294 if (flush)
1295 kvm_mmu_flush_tlb(vcpu);
1296}
1297
1218struct mmu_page_path { 1298struct mmu_page_path {
1219 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1299 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1220 unsigned int idx[PT64_ROOT_LEVEL-1]; 1300 unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1281 struct kvm_mmu_page *sp; 1361 struct kvm_mmu_page *sp;
1282 struct mmu_page_path parents; 1362 struct mmu_page_path parents;
1283 struct kvm_mmu_pages pages; 1363 struct kvm_mmu_pages pages;
1364 LIST_HEAD(invalid_list);
1284 1365
1285 kvm_mmu_pages_init(parent, &parents, &pages); 1366 kvm_mmu_pages_init(parent, &parents, &pages);
1286 while (mmu_unsync_walk(parent, &pages)) { 1367 while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1293 kvm_flush_remote_tlbs(vcpu->kvm); 1374 kvm_flush_remote_tlbs(vcpu->kvm);
1294 1375
1295 for_each_sp(pages, sp, parents, i) { 1376 for_each_sp(pages, sp, parents, i) {
1296 kvm_sync_page(vcpu, sp); 1377 kvm_sync_page(vcpu, sp, &invalid_list);
1297 mmu_pages_clear_parents(&parents); 1378 mmu_pages_clear_parents(&parents);
1298 } 1379 }
1380 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1299 cond_resched_lock(&vcpu->kvm->mmu_lock); 1381 cond_resched_lock(&vcpu->kvm->mmu_lock);
1300 kvm_mmu_pages_init(parent, &parents, &pages); 1382 kvm_mmu_pages_init(parent, &parents, &pages);
1301 } 1383 }
@@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1310 u64 *parent_pte) 1392 u64 *parent_pte)
1311{ 1393{
1312 union kvm_mmu_page_role role; 1394 union kvm_mmu_page_role role;
1313 unsigned index;
1314 unsigned quadrant; 1395 unsigned quadrant;
1315 struct hlist_head *bucket;
1316 struct kvm_mmu_page *sp; 1396 struct kvm_mmu_page *sp;
1317 struct hlist_node *node, *tmp; 1397 struct hlist_node *node;
1398 bool need_sync = false;
1318 1399
1319 role = vcpu->arch.mmu.base_role; 1400 role = vcpu->arch.mmu.base_role;
1320 role.level = level; 1401 role.level = level;
@@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1322 if (role.direct) 1403 if (role.direct)
1323 role.cr4_pae = 0; 1404 role.cr4_pae = 0;
1324 role.access = access; 1405 role.access = access;
1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1327 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1328 role.quadrant = quadrant; 1409 role.quadrant = quadrant;
1329 } 1410 }
1330 index = kvm_page_table_hashfn(gfn); 1411 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1331 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1412 if (!need_sync && sp->unsync)
1332 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1413 need_sync = true;
1333 if (sp->gfn == gfn) {
1334 if (sp->unsync)
1335 if (kvm_sync_page(vcpu, sp))
1336 continue;
1337 1414
1338 if (sp->role.word != role.word) 1415 if (sp->role.word != role.word)
1339 continue; 1416 continue;
1340 1417
1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1418 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1342 if (sp->unsync_children) { 1419 break;
1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1420
1344 kvm_mmu_mark_parents_unsync(sp); 1421 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1345 } 1422 if (sp->unsync_children) {
1346 trace_kvm_mmu_get_page(sp, false); 1423 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1347 return sp; 1424 kvm_mmu_mark_parents_unsync(sp);
1348 } 1425 } else if (sp->unsync)
1426 kvm_mmu_mark_parents_unsync(sp);
1427
1428 trace_kvm_mmu_get_page(sp, false);
1429 return sp;
1430 }
1349 ++vcpu->kvm->stat.mmu_cache_miss; 1431 ++vcpu->kvm->stat.mmu_cache_miss;
1350 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1432 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1351 if (!sp) 1433 if (!sp)
1352 return sp; 1434 return sp;
1353 sp->gfn = gfn; 1435 sp->gfn = gfn;
1354 sp->role = role; 1436 sp->role = role;
1355 hlist_add_head(&sp->hash_link, bucket); 1437 hlist_add_head(&sp->hash_link,
1438 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1356 if (!direct) { 1439 if (!direct) {
1357 if (rmap_write_protect(vcpu->kvm, gfn)) 1440 if (rmap_write_protect(vcpu->kvm, gfn))
1358 kvm_flush_remote_tlbs(vcpu->kvm); 1441 kvm_flush_remote_tlbs(vcpu->kvm);
1442 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1443 kvm_sync_pages(vcpu, gfn);
1444
1359 account_shadowed(vcpu->kvm, gfn); 1445 account_shadowed(vcpu->kvm, gfn);
1360 } 1446 }
1361 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1447 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1402 --iterator->level; 1488 --iterator->level;
1403} 1489}
1404 1490
1491static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1492{
1493 u64 spte;
1494
1495 spte = __pa(sp->spt)
1496 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1497 | PT_WRITABLE_MASK | PT_USER_MASK;
1498 __set_spte(sptep, spte);
1499}
1500
1501static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1502{
1503 if (is_large_pte(*sptep)) {
1504 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1505 kvm_flush_remote_tlbs(vcpu->kvm);
1506 }
1507}
1508
1509static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1510 unsigned direct_access)
1511{
1512 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1513 struct kvm_mmu_page *child;
1514
1515 /*
1516 * For the direct sp, if the guest pte's dirty bit
1517 * changed from clean to dirty, it will corrupt the
1518 * sp's access: allow writable in the read-only sp,
1519 * so we should update the spte at this point to get
1520 * a new sp with the correct access.
1521 */
1522 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1523 if (child->role.access == direct_access)
1524 return;
1525
1526 mmu_page_remove_parent_pte(child, sptep);
1527 __set_spte(sptep, shadow_trap_nonpresent_pte);
1528 kvm_flush_remote_tlbs(vcpu->kvm);
1529 }
1530}
1531
1405static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1532static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1406 struct kvm_mmu_page *sp) 1533 struct kvm_mmu_page *sp)
1407{ 1534{
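
link_shadow_page() above shows how a non-leaf shadow PTE is built: the child shadow page table's physical address ORed with fully permissive bits, since permission restrictions are applied at the leaf level. A sketch with the standard x86 PTE bit positions written out:

#include <stdint.h>

#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << 1)
#define PT_USER_MASK		(1ULL << 2)
#define PT_ACCESSED_MASK	(1ULL << 5)

/* Non-leaf SPTE: child table address plus present/accessed/writable/user. */
static uint64_t make_nonleaf_spte(uint64_t child_table_pa)
{
	return child_table_pa | PT_PRESENT_MASK | PT_ACCESSED_MASK
			      | PT_WRITABLE_MASK | PT_USER_MASK;
}
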
@@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1422 } else { 1549 } else {
1423 if (is_large_pte(ent)) 1550 if (is_large_pte(ent))
1424 --kvm->stat.lpages; 1551 --kvm->stat.lpages;
1425 rmap_remove(kvm, &pt[i]); 1552 drop_spte(kvm, &pt[i],
1553 shadow_trap_nonpresent_pte);
1426 } 1554 }
1427 } 1555 }
1428 pt[i] = shadow_trap_nonpresent_pte; 1556 pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1464} 1592}
1465 1593
1466static int mmu_zap_unsync_children(struct kvm *kvm, 1594static int mmu_zap_unsync_children(struct kvm *kvm,
1467 struct kvm_mmu_page *parent) 1595 struct kvm_mmu_page *parent,
1596 struct list_head *invalid_list)
1468{ 1597{
1469 int i, zapped = 0; 1598 int i, zapped = 0;
1470 struct mmu_page_path parents; 1599 struct mmu_page_path parents;
@@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1478 struct kvm_mmu_page *sp; 1607 struct kvm_mmu_page *sp;
1479 1608
1480 for_each_sp(pages, sp, parents, i) { 1609 for_each_sp(pages, sp, parents, i) {
1481 kvm_mmu_zap_page(kvm, sp); 1610 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1482 mmu_pages_clear_parents(&parents); 1611 mmu_pages_clear_parents(&parents);
1483 zapped++; 1612 zapped++;
1484 } 1613 }
@@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1488 return zapped; 1617 return zapped;
1489} 1618}
1490 1619
1491static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1620static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1621 struct list_head *invalid_list)
1492{ 1622{
1493 int ret; 1623 int ret;
1494 1624
1495 trace_kvm_mmu_zap_page(sp); 1625 trace_kvm_mmu_prepare_zap_page(sp);
1496 ++kvm->stat.mmu_shadow_zapped; 1626 ++kvm->stat.mmu_shadow_zapped;
1497 ret = mmu_zap_unsync_children(kvm, sp); 1627 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1498 kvm_mmu_page_unlink_children(kvm, sp); 1628 kvm_mmu_page_unlink_children(kvm, sp);
1499 kvm_mmu_unlink_parents(kvm, sp); 1629 kvm_mmu_unlink_parents(kvm, sp);
1500 kvm_flush_remote_tlbs(kvm);
1501 if (!sp->role.invalid && !sp->role.direct) 1630 if (!sp->role.invalid && !sp->role.direct)
1502 unaccount_shadowed(kvm, sp->gfn); 1631 unaccount_shadowed(kvm, sp->gfn);
1503 if (sp->unsync) 1632 if (sp->unsync)
1504 kvm_unlink_unsync_page(kvm, sp); 1633 kvm_unlink_unsync_page(kvm, sp);
1505 if (!sp->root_count) { 1634 if (!sp->root_count) {
1506 hlist_del(&sp->hash_link); 1635 /* Count self */
1507 kvm_mmu_free_page(kvm, sp); 1636 ret++;
1637 list_move(&sp->link, invalid_list);
1508 } else { 1638 } else {
1509 sp->role.invalid = 1;
1510 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1639 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1511 kvm_reload_remote_mmus(kvm); 1640 kvm_reload_remote_mmus(kvm);
1512 } 1641 }
1642
1643 sp->role.invalid = 1;
1513 kvm_mmu_reset_last_pte_updated(kvm); 1644 kvm_mmu_reset_last_pte_updated(kvm);
1514 return ret; 1645 return ret;
1515} 1646}
1516 1647
1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1649 struct list_head *invalid_list)
1650{
1651 struct kvm_mmu_page *sp;
1652
1653 if (list_empty(invalid_list))
1654 return;
1655
1656 kvm_flush_remote_tlbs(kvm);
1657
1658 do {
1659 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1660 WARN_ON(!sp->role.invalid || sp->root_count);
1661 kvm_mmu_free_page(kvm, sp);
1662 } while (!list_empty(invalid_list));
1663
1664}
1665
1517/* 1666/*
1518 * Changing the number of mmu pages allocated to the vm 1667 * Changing the number of mmu pages allocated to the vm
1519 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
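
The kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page() split introduced above batches page teardown: prepare only unlinks pages onto a local invalid_list, and commit performs one remote TLB flush before freeing the whole batch, instead of flushing per page as kvm_mmu_zap_page() did. A toy sketch of the same prepare/commit batching pattern (structures and names are illustrative only):

#include <stdio.h>
#include <stdlib.h>

struct page { struct page *next; };

struct batch { struct page *head; };

static void prepare_zap(struct batch *invalid, struct page *p)
{
	p->next = invalid->head;	/* unlink into the local batch, no flush yet */
	invalid->head = p;
}

static void commit_zap(struct batch *invalid)
{
	if (!invalid->head)
		return;
	puts("flush remote TLBs once");	/* the single expensive operation */
	while (invalid->head) {
		struct page *p = invalid->head;
		invalid->head = p->next;
		free(p);		/* safe to free only after the flush */
	}
}

int main(void)
{
	struct batch invalid = { NULL };

	prepare_zap(&invalid, calloc(1, sizeof(struct page)));
	prepare_zap(&invalid, calloc(1, sizeof(struct page)));
	commit_zap(&invalid);		/* one flush covers both pages */
	return 0;
}
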
@@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1521void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1522{ 1671{
1523 int used_pages; 1672 int used_pages;
1673 LIST_HEAD(invalid_list);
1524 1674
1525 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1526 used_pages = max(0, used_pages); 1676 used_pages = max(0, used_pages);
@@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1538 1688
1539 page = container_of(kvm->arch.active_mmu_pages.prev, 1689 page = container_of(kvm->arch.active_mmu_pages.prev,
1540 struct kvm_mmu_page, link); 1690 struct kvm_mmu_page, link);
1541 used_pages -= kvm_mmu_zap_page(kvm, page); 1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1542 used_pages--; 1692 &invalid_list);
1543 } 1693 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1544 kvm_nr_mmu_pages = used_pages; 1695 kvm_nr_mmu_pages = used_pages;
1545 kvm->arch.n_free_mmu_pages = 0; 1696 kvm->arch.n_free_mmu_pages = 0;
1546 } 1697 }
@@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1553 1704
1554static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1555{ 1706{
1556 unsigned index;
1557 struct hlist_head *bucket;
1558 struct kvm_mmu_page *sp; 1707 struct kvm_mmu_page *sp;
1559 struct hlist_node *node, *n; 1708 struct hlist_node *node;
1709 LIST_HEAD(invalid_list);
1560 int r; 1710 int r;
1561 1711
1562 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1563 r = 0; 1713 r = 0;
1564 index = kvm_page_table_hashfn(gfn); 1714
1565 bucket = &kvm->arch.mmu_page_hash[index]; 1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1566restart: 1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1717 sp->role.word);
1568 if (sp->gfn == gfn && !sp->role.direct) { 1718 r = 1;
1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1570 sp->role.word); 1720 }
1571 r = 1; 1721 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1572 if (kvm_mmu_zap_page(kvm, sp))
1573 goto restart;
1574 }
1575 return r; 1722 return r;
1576} 1723}
1577 1724
1578static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1725static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1579{ 1726{
1580 unsigned index;
1581 struct hlist_head *bucket;
1582 struct kvm_mmu_page *sp; 1727 struct kvm_mmu_page *sp;
1583 struct hlist_node *node, *nn; 1728 struct hlist_node *node;
1729 LIST_HEAD(invalid_list);
1584 1730
1585 index = kvm_page_table_hashfn(gfn); 1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1586 bucket = &kvm->arch.mmu_page_hash[index]; 1732 pgprintk("%s: zap %lx %x\n",
1587restart: 1733 __func__, gfn, sp->role.word);
1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1589 if (sp->gfn == gfn && !sp->role.direct
1590 && !sp->role.invalid) {
1591 pgprintk("%s: zap %lx %x\n",
1592 __func__, gfn, sp->role.word);
1593 if (kvm_mmu_zap_page(kvm, sp))
1594 goto restart;
1595 }
1596 } 1735 }
1736 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1597} 1737}
1598 1738
1599static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1739static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1723} 1863}
1724EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1864EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1725 1865
1726static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1866static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1727{ 1867{
1728 unsigned index;
1729 struct hlist_head *bucket;
1730 struct kvm_mmu_page *s;
1731 struct hlist_node *node, *n;
1732
1733 index = kvm_page_table_hashfn(sp->gfn);
1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1735 /* don't unsync if pagetable is shadowed with multiple roles */
1736 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1737 if (s->gfn != sp->gfn || s->role.direct)
1738 continue;
1739 if (s->role.word != sp->role.word)
1740 return 1;
1741 }
1742 trace_kvm_mmu_unsync_page(sp); 1868 trace_kvm_mmu_unsync_page(sp);
1743 ++vcpu->kvm->stat.mmu_unsync; 1869 ++vcpu->kvm->stat.mmu_unsync;
1744 sp->unsync = 1; 1870 sp->unsync = 1;
1745 1871
1746 kvm_mmu_mark_parents_unsync(sp); 1872 kvm_mmu_mark_parents_unsync(sp);
1747
1748 mmu_convert_notrap(sp); 1873 mmu_convert_notrap(sp);
1749 return 0; 1874}
1875
1876static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1877{
1878 struct kvm_mmu_page *s;
1879 struct hlist_node *node;
1880
1881 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1882 if (s->unsync)
1883 continue;
1884 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1885 __kvm_unsync_page(vcpu, s);
1886 }
1750} 1887}
1751 1888
1752static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1889static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1753 bool can_unsync) 1890 bool can_unsync)
1754{ 1891{
1755 struct kvm_mmu_page *shadow; 1892 struct kvm_mmu_page *s;
1893 struct hlist_node *node;
1894 bool need_unsync = false;
1756 1895
1757 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1896 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1758 if (shadow) { 1897 if (!can_unsync)
1759 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1760 return 1; 1898 return 1;
1761 if (shadow->unsync) 1899
1762 return 0; 1900 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1763 if (can_unsync && oos_shadow) 1901 return 1;
1764 return kvm_unsync_page(vcpu, shadow); 1902
1765 return 1; 1903 if (!need_unsync && !s->unsync) {
1904 if (!oos_shadow)
1905 return 1;
1906 need_unsync = true;
1907 }
1766 } 1908 }
1909 if (need_unsync)
1910 kvm_unsync_pages(vcpu, gfn);
1767 return 0; 1911 return 0;
1768} 1912}
1769 1913
@@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1804 spte |= (u64)pfn << PAGE_SHIFT; 1948 spte |= (u64)pfn << PAGE_SHIFT;
1805 1949
1806 if ((pte_access & ACC_WRITE_MASK) 1950 if ((pte_access & ACC_WRITE_MASK)
1807 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1952 && !user_fault)) {
1808 1953
1809 if (level > PT_PAGE_TABLE_LEVEL && 1954 if (level > PT_PAGE_TABLE_LEVEL &&
1810 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1955 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1811 ret = 1; 1956 ret = 1;
1812 spte = shadow_trap_nonpresent_pte; 1957 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1813 goto set_pte; 1958 goto done;
1814 } 1959 }
1815 1960
1816 spte |= PT_WRITABLE_MASK; 1961 spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1841 mark_page_dirty(vcpu->kvm, gfn); 1986 mark_page_dirty(vcpu->kvm, gfn);
1842 1987
1843set_pte: 1988set_pte:
1844 __set_spte(sptep, spte); 1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte);
1992done:
1845 return ret; 1993 return ret;
1846} 1994}
1847 1995
@@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1853 bool reset_host_protection) 2001 bool reset_host_protection)
1854{ 2002{
1855 int was_rmapped = 0; 2003 int was_rmapped = 0;
1856 int was_writable = is_writable_pte(*sptep);
1857 int rmap_count; 2004 int rmap_count;
1858 2005
1859 pgprintk("%s: spte %llx access %x write_fault %d" 2006 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1878 } else if (pfn != spte_to_pfn(*sptep)) { 2025 } else if (pfn != spte_to_pfn(*sptep)) {
1879 pgprintk("hfn old %lx new %lx\n", 2026 pgprintk("hfn old %lx new %lx\n",
1880 spte_to_pfn(*sptep), pfn); 2027 spte_to_pfn(*sptep), pfn);
1881 rmap_remove(vcpu->kvm, sptep); 2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1882 __set_spte(sptep, shadow_trap_nonpresent_pte);
1883 kvm_flush_remote_tlbs(vcpu->kvm); 2029 kvm_flush_remote_tlbs(vcpu->kvm);
1884 } else 2030 } else
1885 was_rmapped = 1; 2031 was_rmapped = 1;
@@ -1890,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1890 reset_host_protection)) { 2036 reset_host_protection)) {
1891 if (write_fault) 2037 if (write_fault)
1892 *ptwrite = 1; 2038 *ptwrite = 1;
1893 kvm_x86_ops->tlb_flush(vcpu); 2039 kvm_mmu_flush_tlb(vcpu);
1894 } 2040 }
1895 2041
1896 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1904 page_header_update_slot(vcpu->kvm, sptep, gfn); 2050 page_header_update_slot(vcpu->kvm, sptep, gfn);
1905 if (!was_rmapped) { 2051 if (!was_rmapped) {
1906 rmap_count = rmap_add(vcpu, sptep, gfn); 2052 rmap_count = rmap_add(vcpu, sptep, gfn);
1907 kvm_release_pfn_clean(pfn);
1908 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2053 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1909 rmap_recycle(vcpu, sptep, gfn); 2054 rmap_recycle(vcpu, sptep, gfn);
1910 } else {
1911 if (was_writable)
1912 kvm_release_pfn_dirty(pfn);
1913 else
1914 kvm_release_pfn_clean(pfn);
1915 } 2055 }
2056 kvm_release_pfn_clean(pfn);
1916 if (speculative) { 2057 if (speculative) {
1917 vcpu->arch.last_pte_updated = sptep; 2058 vcpu->arch.last_pte_updated = sptep;
1918 vcpu->arch.last_pte_gfn = gfn; 2059 vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1941 } 2082 }
1942 2083
1943 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2084 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
1944 pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; 2085 u64 base_addr = iterator.addr;
2086
2087 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2088 pseudo_gfn = base_addr >> PAGE_SHIFT;
1945 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2089 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1946 iterator.level - 1, 2090 iterator.level - 1,
1947 1, ACC_ALL, iterator.sptep); 2091 1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1960 return pt_write; 2104 return pt_write;
1961} 2105}
1962 2106
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2108{
2109 char buf[1];
2110 void __user *hva;
2111 int r;
2112
2113 /* Touch the page, so send SIGBUS */
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116}
2117
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{
2120 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn);
2123 return 0;
2124 } else if (is_fault_pfn(pfn))
2125 return -EFAULT;
2126
2127 return 1;
2128}
2129
1963static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1964{ 2131{
1965 int r; 2132 int r;
@@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1983 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2150 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1984 2151
1985 /* mmio */ 2152 /* mmio */
1986 if (is_error_pfn(pfn)) { 2153 if (is_error_pfn(pfn))
1987 kvm_release_pfn_clean(pfn); 2154 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
1988 return 1;
1989 }
1990 2155
1991 spin_lock(&vcpu->kvm->mmu_lock); 2156 spin_lock(&vcpu->kvm->mmu_lock);
1992 if (mmu_notifier_retry(vcpu, mmu_seq)) 2157 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2009{ 2174{
2010 int i; 2175 int i;
2011 struct kvm_mmu_page *sp; 2176 struct kvm_mmu_page *sp;
2177 LIST_HEAD(invalid_list);
2012 2178
2013 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2014 return; 2180 return;
@@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2018 2184
2019 sp = page_header(root); 2185 sp = page_header(root);
2020 --sp->root_count; 2186 --sp->root_count;
2021 if (!sp->root_count && sp->role.invalid) 2187 if (!sp->root_count && sp->role.invalid) {
2022 kvm_mmu_zap_page(vcpu->kvm, sp); 2188 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2189 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2190 }
2023 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2191 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2024 spin_unlock(&vcpu->kvm->mmu_lock); 2192 spin_unlock(&vcpu->kvm->mmu_lock);
2025 return; 2193 return;
@@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2032 sp = page_header(root); 2200 sp = page_header(root);
2033 --sp->root_count; 2201 --sp->root_count;
2034 if (!sp->root_count && sp->role.invalid) 2202 if (!sp->root_count && sp->role.invalid)
2035 kvm_mmu_zap_page(vcpu->kvm, sp); 2203 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2204 &invalid_list);
2036 } 2205 }
2037 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2206 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2038 } 2207 }
2208 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2039 spin_unlock(&vcpu->kvm->mmu_lock); 2209 spin_unlock(&vcpu->kvm->mmu_lock);
2040 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2210 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2041} 2211}
@@ -2045,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2045 int ret = 0; 2215 int ret = 0;
2046 2216
2047 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2217 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2048 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2218 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2049 ret = 1; 2219 ret = 1;
2050 } 2220 }
2051 2221
@@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2073 root_gfn = 0; 2243 root_gfn = 0;
2074 } 2244 }
2075 spin_lock(&vcpu->kvm->mmu_lock); 2245 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu);
2076 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2077 PT64_ROOT_LEVEL, direct, 2248 PT64_ROOT_LEVEL, direct,
2078 ACC_ALL, NULL); 2249 ACC_ALL, NULL);
@@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2103 root_gfn = i << 30; 2274 root_gfn = i << 30;
2104 } 2275 }
2105 spin_lock(&vcpu->kvm->mmu_lock); 2276 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu);
2106 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2107 PT32_ROOT_LEVEL, direct, 2279 PT32_ROOT_LEVEL, direct,
2108 ACC_ALL, NULL); 2280 ACC_ALL, NULL);
@@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2198 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2370 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2199 smp_rmb(); 2371 smp_rmb();
2200 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2372 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2201 if (is_error_pfn(pfn)) { 2373 if (is_error_pfn(pfn))
2202 kvm_release_pfn_clean(pfn); 2374 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2203 return 1;
2204 }
2205 spin_lock(&vcpu->kvm->mmu_lock); 2375 spin_lock(&vcpu->kvm->mmu_lock);
2206 if (mmu_notifier_retry(vcpu, mmu_seq)) 2376 if (mmu_notifier_retry(vcpu, mmu_seq))
2207 goto out_unlock; 2377 goto out_unlock;
@@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2243void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2413void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2244{ 2414{
2245 ++vcpu->stat.tlb_flush; 2415 ++vcpu->stat.tlb_flush;
2246 kvm_x86_ops->tlb_flush(vcpu); 2416 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2247} 2417}
2248 2418
2249static void paging_new_cr3(struct kvm_vcpu *vcpu) 2419static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2457static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2627static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2458{ 2628{
2459 ASSERT(vcpu); 2629 ASSERT(vcpu);
2460 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { 2630 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2631 /* mmu.free() should set root_hpa = INVALID_PAGE */
2461 vcpu->arch.mmu.free(vcpu); 2632 vcpu->arch.mmu.free(vcpu);
2462 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2463 }
2464} 2633}
2465 2634
2466int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 2635int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2477 r = mmu_topup_memory_caches(vcpu); 2646 r = mmu_topup_memory_caches(vcpu);
2478 if (r) 2647 if (r)
2479 goto out; 2648 goto out;
2480 spin_lock(&vcpu->kvm->mmu_lock);
2481 kvm_mmu_free_some_pages(vcpu);
2482 spin_unlock(&vcpu->kvm->mmu_lock);
2483 r = mmu_alloc_roots(vcpu); 2649 r = mmu_alloc_roots(vcpu);
2484 spin_lock(&vcpu->kvm->mmu_lock); 2650 spin_lock(&vcpu->kvm->mmu_lock);
2485 mmu_sync_roots(vcpu); 2651 mmu_sync_roots(vcpu);
@@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2508 pte = *spte; 2674 pte = *spte;
2509 if (is_shadow_present_pte(pte)) { 2675 if (is_shadow_present_pte(pte)) {
2510 if (is_last_spte(pte, sp->role.level)) 2676 if (is_last_spte(pte, sp->role.level))
2511 rmap_remove(vcpu->kvm, spte); 2677 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2512 else { 2678 else {
2513 child = page_header(pte & PT64_BASE_ADDR_MASK); 2679 child = page_header(pte & PT64_BASE_ADDR_MASK);
2514 mmu_page_remove_parent_pte(child, spte); 2680 mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2529 return; 2695 return;
2530 } 2696 }
2531 2697
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return;
2700
2532 ++vcpu->kvm->stat.mmu_pte_updated; 2701 ++vcpu->kvm->stat.mmu_pte_updated;
2533 if (!sp->role.cr4_pae) 2702 if (!sp->role.cr4_pae)
2534 paging32_update_pte(vcpu, sp, spte, new); 2703 paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new)
2549 return (old & ~new & PT64_PERM_MASK) != 0; 2718 return (old & ~new & PT64_PERM_MASK) != 0;
2550} 2719}
2551 2720
2552static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) 2721static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2722 bool remote_flush, bool local_flush)
2553{ 2723{
2554 if (need_remote_flush(old, new)) 2724 if (zap_page)
2725 return;
2726
2727 if (remote_flush)
2555 kvm_flush_remote_tlbs(vcpu->kvm); 2728 kvm_flush_remote_tlbs(vcpu->kvm);
2556 else 2729 else if (local_flush)
2557 kvm_mmu_flush_tlb(vcpu); 2730 kvm_mmu_flush_tlb(vcpu);
2558} 2731}
2559 2732
@@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2603 bool guest_initiated) 2776 bool guest_initiated)
2604{ 2777{
2605 gfn_t gfn = gpa >> PAGE_SHIFT; 2778 gfn_t gfn = gpa >> PAGE_SHIFT;
2779 union kvm_mmu_page_role mask = { .word = 0 };
2606 struct kvm_mmu_page *sp; 2780 struct kvm_mmu_page *sp;
2607 struct hlist_node *node, *n; 2781 struct hlist_node *node;
2608 struct hlist_head *bucket; 2782 LIST_HEAD(invalid_list);
2609 unsigned index;
2610 u64 entry, gentry; 2783 u64 entry, gentry;
2611 u64 *spte; 2784 u64 *spte;
2612 unsigned offset = offset_in_page(gpa); 2785 unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2619 int npte; 2792 int npte;
2620 int r; 2793 int r;
2621 int invlpg_counter; 2794 int invlpg_counter;
2795 bool remote_flush, local_flush, zap_page;
2796
2797 zap_page = remote_flush = local_flush = false;
2622 2798
2623 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2799 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2624 2800
@@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2674 vcpu->arch.last_pte_updated = NULL; 2850 vcpu->arch.last_pte_updated = NULL;
2675 } 2851 }
2676 } 2852 }
2677 index = kvm_page_table_hashfn(gfn);
2678 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2679 2853
2680restart: 2854 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
2681 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2855 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2682 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2683 continue;
2684 pte_size = sp->role.cr4_pae ? 8 : 4; 2856 pte_size = sp->role.cr4_pae ? 8 : 4;
2685 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2857 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2686 misaligned |= bytes < 4; 2858 misaligned |= bytes < 4;
@@ -2697,8 +2869,8 @@ restart:
2697 */ 2869 */
2698 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2870 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2699 gpa, bytes, sp->role.word); 2871 gpa, bytes, sp->role.word);
2700 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2872 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2701 goto restart; 2873 &invalid_list);
2702 ++vcpu->kvm->stat.mmu_flooded; 2874 ++vcpu->kvm->stat.mmu_flooded;
2703 continue; 2875 continue;
2704 } 2876 }
@@ -2722,16 +2894,22 @@ restart:
2722 if (quadrant != sp->role.quadrant) 2894 if (quadrant != sp->role.quadrant)
2723 continue; 2895 continue;
2724 } 2896 }
2897 local_flush = true;
2725 spte = &sp->spt[page_offset / sizeof(*spte)]; 2898 spte = &sp->spt[page_offset / sizeof(*spte)];
2726 while (npte--) { 2899 while (npte--) {
2727 entry = *spte; 2900 entry = *spte;
2728 mmu_pte_write_zap_pte(vcpu, sp, spte); 2901 mmu_pte_write_zap_pte(vcpu, sp, spte);
2729 if (gentry) 2902 if (gentry &&
2903 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
2904 & mask.word))
2730 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2905 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2731 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2906 if (!remote_flush && need_remote_flush(entry, *spte))
2907 remote_flush = true;
2732 ++spte; 2908 ++spte;
2733 } 2909 }
2734 } 2910 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2735 kvm_mmu_audit(vcpu, "post pte write"); 2913 kvm_mmu_audit(vcpu, "post pte write");
2736 spin_unlock(&vcpu->kvm->mmu_lock); 2914 spin_unlock(&vcpu->kvm->mmu_lock);
2737 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2759 2937
2760void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2761{ 2939{
2762 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && 2940 int free_pages;
2941 LIST_HEAD(invalid_list);
2942
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2944 while (free_pages < KVM_REFILL_PAGES &&
2763 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2764 struct kvm_mmu_page *sp; 2946 struct kvm_mmu_page *sp;
2765 2947
2766 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2767 struct kvm_mmu_page, link); 2949 struct kvm_mmu_page, link);
2768 kvm_mmu_zap_page(vcpu->kvm, sp); 2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2951 &invalid_list);
2769 ++vcpu->kvm->stat.mmu_recycled; 2952 ++vcpu->kvm->stat.mmu_recycled;
2770 } 2953 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2771} 2955}
2772 2956
2773int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2795 return 1; 2979 return 1;
2796 case EMULATE_DO_MMIO: 2980 case EMULATE_DO_MMIO:
2797 ++vcpu->stat.mmio_exits; 2981 ++vcpu->stat.mmio_exits;
2798 return 0; 2982 /* fall through */
2799 case EMULATE_FAIL: 2983 case EMULATE_FAIL:
2800 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2801 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2802 vcpu->run->internal.ndata = 0;
2803 return 0; 2984 return 0;
2804 default: 2985 default:
2805 BUG(); 2986 BUG();
@@ -2896,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2896 pt = sp->spt; 3077 pt = sp->spt;
2897 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3078 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2898 /* avoid RMW */ 3079 /* avoid RMW */
2899 if (pt[i] & PT_WRITABLE_MASK) 3080 if (is_writable_pte(pt[i]))
2900 pt[i] &= ~PT_WRITABLE_MASK; 3081 pt[i] &= ~PT_WRITABLE_MASK;
2901 } 3082 }
2902 kvm_flush_remote_tlbs(kvm); 3083 kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2905void kvm_mmu_zap_all(struct kvm *kvm) 3086void kvm_mmu_zap_all(struct kvm *kvm)
2906{ 3087{
2907 struct kvm_mmu_page *sp, *node; 3088 struct kvm_mmu_page *sp, *node;
3089 LIST_HEAD(invalid_list);
2908 3090
2909 spin_lock(&kvm->mmu_lock); 3091 spin_lock(&kvm->mmu_lock);
2910restart: 3092restart:
2911 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3093 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2912 if (kvm_mmu_zap_page(kvm, sp)) 3094 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
2913 goto restart; 3095 goto restart;
2914 3096
3097 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2915 spin_unlock(&kvm->mmu_lock); 3098 spin_unlock(&kvm->mmu_lock);
2916
2917 kvm_flush_remote_tlbs(kvm);
2918} 3099}
2919 3100
2920static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) 3101static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3102 struct list_head *invalid_list)
2921{ 3103{
2922 struct kvm_mmu_page *page; 3104 struct kvm_mmu_page *page;
2923 3105
2924 page = container_of(kvm->arch.active_mmu_pages.prev, 3106 page = container_of(kvm->arch.active_mmu_pages.prev,
2925 struct kvm_mmu_page, link); 3107 struct kvm_mmu_page, link);
2926 return kvm_mmu_zap_page(kvm, page) + 1; 3108 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
2927} 3109}
2928 3110
2929static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3111static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2936 3118
2937 list_for_each_entry(kvm, &vm_list, vm_list) { 3119 list_for_each_entry(kvm, &vm_list, vm_list) {
2938 int npages, idx, freed_pages; 3120 int npages, idx, freed_pages;
3121 LIST_HEAD(invalid_list);
2939 3122
2940 idx = srcu_read_lock(&kvm->srcu); 3123 idx = srcu_read_lock(&kvm->srcu);
2941 spin_lock(&kvm->mmu_lock); 3124 spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2943 kvm->arch.n_free_mmu_pages; 3126 kvm->arch.n_free_mmu_pages;
2944 cache_count += npages; 3127 cache_count += npages;
2945 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2946 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); 3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list);
2947 cache_count -= freed_pages; 3131 cache_count -= freed_pages;
2948 kvm_freed = kvm; 3132 kvm_freed = kvm;
2949 } 3133 }
2950 nr_to_scan--; 3134 nr_to_scan--;
2951 3135
3136 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2952 spin_unlock(&kvm->mmu_lock); 3137 spin_unlock(&kvm->mmu_lock);
2953 srcu_read_unlock(&kvm->srcu, idx); 3138 srcu_read_unlock(&kvm->srcu, idx);
2954 } 3139 }
@@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3074 3259
3075static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3260static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3076{ 3261{
3077 kvm_set_cr3(vcpu, vcpu->arch.cr3); 3262 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3078 return 1; 3263 return 1;
3079} 3264}
3080 3265
@@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3331 struct kvm_mmu_page *rev_sp; 3516 struct kvm_mmu_page *rev_sp;
3332 gfn_t gfn; 3517 gfn_t gfn;
3333 3518
3334 if (*sptep & PT_WRITABLE_MASK) { 3519 if (is_writable_pte(*sptep)) {
3335 rev_sp = page_header(__pa(sptep)); 3520 rev_sp = page_header(__pa(sptep));
3336 gfn = rev_sp->gfns[sptep - rev_sp->spt]; 3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3337 3522
3338 if (!gfn_to_memslot(kvm, gfn)) { 3523 if (!gfn_to_memslot(kvm, gfn)) {
3339 if (!printk_ratelimit()) 3524 if (!printk_ratelimit())
@@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3347 return; 3532 return;
3348 } 3533 }
3349 3534
3350 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3351 rev_sp->role.level);
3352 if (!*rmapp) { 3536 if (!*rmapp) {
3353 if (!printk_ratelimit()) 3537 if (!printk_ratelimit())
3354 return; 3538 return;
@@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3381 3565
3382 if (!(ent & PT_PRESENT_MASK)) 3566 if (!(ent & PT_PRESENT_MASK))
3383 continue; 3567 continue;
3384 if (!(ent & PT_WRITABLE_MASK)) 3568 if (!is_writable_pte(ent))
3385 continue; 3569 continue;
3386 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3387 } 3571 }
@@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3409 if (sp->unsync) 3593 if (sp->unsync)
3410 continue; 3594 continue;
3411 3595
3412 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3413 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3414 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3415 3598
3416 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3417 while (spte) { 3600 while (spte) {
3418 if (*spte & PT_WRITABLE_MASK) 3601 if (is_writable_pte(*spte))
3419 printk(KERN_ERR "%s: (%s) shadow page has " 3602 printk(KERN_ERR "%s: (%s) shadow page has "
3420 "writable mappings: gfn %lx role %x\n", 3603 "writable mappings: gfn %lx role %x\n",
3421 __func__, audit_msg, sp->gfn, 3604 __func__, audit_msg, sp->gfn,
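
The mmu.c hunks above repeatedly swap kvm_mmu_zap_page() for the two-phase kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page() pair: shadow pages are detached onto a caller-local invalid_list while mmu_lock is held, then freed (with a single remote TLB flush) in one commit. A hedged sketch of the calling pattern, mirroring the hunks above; the prepare/commit helpers themselves are added elsewhere in the series:

	/* Sketch only: the batched zap pattern used throughout this diff. */
	static void zap_one_page_example(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		LIST_HEAD(invalid_list);

		spin_lock(&kvm->mmu_lock);
		/* Detach sp (and any dependent pages) onto invalid_list. */
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
		/* Flush remote TLBs once and free everything collected above. */
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
		spin_unlock(&kvm->mmu_lock);
	}
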
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc..3aab0f0930e 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
190 TP_ARGS(sp) 190 TP_ARGS(sp)
191); 191);
192 192
193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
194 TP_PROTO(struct kvm_mmu_page *sp), 194 TP_PROTO(struct kvm_mmu_page *sp),
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b54..51ef9097960 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
118{ 119{
119 pt_element_t pte; 120 pt_element_t pte;
120 gfn_t table_gfn; 121 gfn_t table_gfn;
121 unsigned index, pt_access, pte_access; 122 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 123 gpa_t pte_gpa;
123 int rsvd_fault = 0; 124 bool eperm, present, rsvd_fault;
124 125
125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault); 127 fetch_fault);
127walk: 128walk:
129 present = true;
130 eperm = rsvd_fault = false;
128 walker->level = vcpu->arch.mmu.root_level; 131 walker->level = vcpu->arch.mmu.root_level;
129 pte = vcpu->arch.cr3; 132 pte = vcpu->arch.cr3;
130#if PTTYPE == 64 133#if PTTYPE == 64
131 if (!is_long_mode(vcpu)) { 134 if (!is_long_mode(vcpu)) {
132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
133 trace_kvm_mmu_paging_element(pte, walker->level); 136 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte)) 137 if (!is_present_gpte(pte)) {
135 goto not_present; 138 present = false;
139 goto error;
140 }
136 --walker->level; 141 --walker->level;
137 } 142 }
138#endif 143#endif
@@ -150,37 +155,42 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 155 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 156 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 157
153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) 158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
154 goto not_present; 159 present = false;
160 break;
161 }
155 162
156 trace_kvm_mmu_paging_element(pte, walker->level); 163 trace_kvm_mmu_paging_element(pte, walker->level);
157 164
158 if (!is_present_gpte(pte)) 165 if (!is_present_gpte(pte)) {
159 goto not_present; 166 present = false;
167 break;
168 }
160 169
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
162 if (rsvd_fault) 171 rsvd_fault = true;
163 goto access_error; 172 break;
173 }
164 174
165 if (write_fault && !is_writable_pte(pte)) 175 if (write_fault && !is_writable_pte(pte))
166 if (user_fault || is_write_protection(vcpu)) 176 if (user_fault || is_write_protection(vcpu))
167 goto access_error; 177 eperm = true;
168 178
169 if (user_fault && !(pte & PT_USER_MASK)) 179 if (user_fault && !(pte & PT_USER_MASK))
170 goto access_error; 180 eperm = true;
171 181
172#if PTTYPE == 64 182#if PTTYPE == 64
173 if (fetch_fault && (pte & PT64_NX_MASK)) 183 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 184 eperm = true;
175#endif 185#endif
176 186
177 if (!(pte & PT_ACCESSED_MASK)) { 187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
178 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 188 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
179 sizeof(pte)); 189 sizeof(pte));
180 mark_page_dirty(vcpu->kvm, table_gfn);
181 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
182 index, pte, pte|PT_ACCESSED_MASK)) 191 index, pte, pte|PT_ACCESSED_MASK))
183 goto walk; 192 goto walk;
193 mark_page_dirty(vcpu->kvm, table_gfn);
184 pte |= PT_ACCESSED_MASK; 194 pte |= PT_ACCESSED_MASK;
185 } 195 }
186 196
@@ -213,15 +223,18 @@ walk:
213 --walker->level; 223 --walker->level;
214 } 224 }
215 225
226 if (!present || eperm || rsvd_fault)
227 goto error;
228
216 if (write_fault && !is_dirty_gpte(pte)) { 229 if (write_fault && !is_dirty_gpte(pte)) {
217 bool ret; 230 bool ret;
218 231
219 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
220 mark_page_dirty(vcpu->kvm, table_gfn);
221 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
222 pte|PT_DIRTY_MASK); 234 pte|PT_DIRTY_MASK);
223 if (ret) 235 if (ret)
224 goto walk; 236 goto walk;
237 mark_page_dirty(vcpu->kvm, table_gfn);
225 pte |= PT_DIRTY_MASK; 238 pte |= PT_DIRTY_MASK;
226 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
227 } 240 }
@@ -229,22 +242,18 @@ walk:
229 walker->pt_access = pt_access; 242 walker->pt_access = pt_access;
230 walker->pte_access = pte_access; 243 walker->pte_access = pte_access;
231 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 244 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
232 __func__, (u64)pte, pt_access, pte_access); 245 __func__, (u64)pte, pte_access, pt_access);
233 return 1; 246 return 1;
234 247
235not_present: 248error:
236 walker->error_code = 0; 249 walker->error_code = 0;
237 goto err; 250 if (present)
238 251 walker->error_code |= PFERR_PRESENT_MASK;
239access_error:
240 walker->error_code = PFERR_PRESENT_MASK;
241
242err:
243 if (write_fault) 252 if (write_fault)
244 walker->error_code |= PFERR_WRITE_MASK; 253 walker->error_code |= PFERR_WRITE_MASK;
245 if (user_fault) 254 if (user_fault)
246 walker->error_code |= PFERR_USER_MASK; 255 walker->error_code |= PFERR_USER_MASK;
247 if (fetch_fault) 256 if (fetch_fault && is_nx(vcpu))
248 walker->error_code |= PFERR_FETCH_MASK; 257 walker->error_code |= PFERR_FETCH_MASK;
249 if (rsvd_fault) 258 if (rsvd_fault)
250 walker->error_code |= PFERR_RSVD_MASK; 259 walker->error_code |= PFERR_RSVD_MASK;
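
For reference, the walker->error_code assembled in the unified error path above follows the architectural x86 #PF error-code layout; the PFERR_* masks used here correspond to these bits:

	/* x86 page-fault error code bits, as used by the PFERR_* masks above. */
	#define PFERR_PRESENT_MASK	(1U << 0)	/* fault on a present entry */
	#define PFERR_WRITE_MASK	(1U << 1)	/* write access */
	#define PFERR_USER_MASK		(1U << 2)	/* user-mode access */
	#define PFERR_RSVD_MASK		(1U << 3)	/* reserved bit set in a paging entry */
	#define PFERR_FETCH_MASK	(1U << 4)	/* instruction fetch */
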
@@ -252,7 +261,7 @@ err:
252 return 0; 261 return 0;
253} 262}
254 263
255static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
256 u64 *spte, const void *pte) 265 u64 *spte, const void *pte)
257{ 266{
258 pt_element_t gpte; 267 pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 gpte = *(const pt_element_t *)pte; 272 gpte = *(const pt_element_t *)pte;
264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
265 if (!is_present_gpte(gpte)) { 274 if (!is_present_gpte(gpte)) {
266 if (page->unsync) 275 if (sp->unsync)
267 new_spte = shadow_trap_nonpresent_pte; 276 new_spte = shadow_trap_nonpresent_pte;
268 else 277 else
269 new_spte = shadow_notrap_nonpresent_pte; 278 new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
272 return; 281 return;
273 } 282 }
274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
275 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
276 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
277 return; 286 return;
278 pfn = vcpu->arch.update_pte.pfn; 287 pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
285 * we call mmu_set_spte() with reset_host_protection = true because that 294 * we call mmu_set_spte() with reset_host_protection = true because that
286 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
287 */ 296 */
288 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
289 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 298 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
290 gpte_to_gfn(gpte), pfn, true, true); 299 gpte_to_gfn(gpte), pfn, true, true);
291} 300}
292 301
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level)
304{
305 int r;
306 pt_element_t curr_pte;
307
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
309 &curr_pte, sizeof(curr_pte));
310 return r || curr_pte != gw->ptes[level - 1];
311}
312
293/* 313/*
294 * Fetch a shadow pte for a specific level in the paging hierarchy. 314 * Fetch a shadow pte for a specific level in the paging hierarchy.
295 */ 315 */
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
299 int *ptwrite, pfn_t pfn) 319 int *ptwrite, pfn_t pfn)
300{ 320{
301 unsigned access = gw->pt_access; 321 unsigned access = gw->pt_access;
302 struct kvm_mmu_page *shadow_page; 322 struct kvm_mmu_page *sp = NULL;
303 u64 spte, *sptep = NULL; 323 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
304 int direct; 324 int top_level;
305 gfn_t table_gfn; 325 unsigned direct_access;
306 int r; 326 struct kvm_shadow_walk_iterator it;
307 int level;
308 pt_element_t curr_pte;
309 struct kvm_shadow_walk_iterator iterator;
310 327
311 if (!is_present_gpte(gw->ptes[gw->level - 1])) 328 if (!is_present_gpte(gw->ptes[gw->level - 1]))
312 return NULL; 329 return NULL;
313 330
314 for_each_shadow_entry(vcpu, addr, iterator) { 331 direct_access = gw->pt_access & gw->pte_access;
315 level = iterator.level; 332 if (!dirty)
316 sptep = iterator.sptep; 333 direct_access &= ~ACC_WRITE_MASK;
317 if (iterator.level == hlevel) {
318 mmu_set_spte(vcpu, sptep, access,
319 gw->pte_access & access,
320 user_fault, write_fault,
321 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
322 ptwrite, level,
323 gw->gfn, pfn, false, true);
324 break;
325 }
326 334
327 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 335 top_level = vcpu->arch.mmu.root_level;
328 continue; 336 if (top_level == PT32E_ROOT_LEVEL)
337 top_level = PT32_ROOT_LEVEL;
338 /*
339 * Verify that the top-level gpte is still there. Since the page
340 * is a root page, it is either write protected (and cannot be
341 * changed from now on) or it is invalid (in which case, we don't
342 * really care if it changes underneath us after this point).
343 */
344 if (FNAME(gpte_changed)(vcpu, gw, top_level))
345 goto out_gpte_changed;
329 346
330 if (is_large_pte(*sptep)) { 347 for (shadow_walk_init(&it, vcpu, addr);
331 rmap_remove(vcpu->kvm, sptep); 348 shadow_walk_okay(&it) && it.level > gw->level;
332 __set_spte(sptep, shadow_trap_nonpresent_pte); 349 shadow_walk_next(&it)) {
333 kvm_flush_remote_tlbs(vcpu->kvm); 350 gfn_t table_gfn;
334 }
335 351
336 if (level <= gw->level) { 352 drop_large_spte(vcpu, it.sptep);
337 int delta = level - gw->level + 1; 353
338 direct = 1; 354 sp = NULL;
339 if (!is_dirty_gpte(gw->ptes[level - delta])) 355 if (!is_shadow_present_pte(*it.sptep)) {
340 access &= ~ACC_WRITE_MASK; 356 table_gfn = gw->table_gfn[it.level - 2];
341 table_gfn = gpte_to_gfn(gw->ptes[level - delta]); 357 sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
342 /* advance table_gfn when emulating 1gb pages with 4k */ 358 false, access, it.sptep);
343 if (delta == 0)
344 table_gfn += PT_INDEX(addr, level);
345 access &= gw->pte_access;
346 } else {
347 direct = 0;
348 table_gfn = gw->table_gfn[level - 2];
349 }
350 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
351 direct, access, sptep);
352 if (!direct) {
353 r = kvm_read_guest_atomic(vcpu->kvm,
354 gw->pte_gpa[level - 2],
355 &curr_pte, sizeof(curr_pte));
356 if (r || curr_pte != gw->ptes[level - 2]) {
357 kvm_mmu_put_page(shadow_page, sptep);
358 kvm_release_pfn_clean(pfn);
359 sptep = NULL;
360 break;
361 }
362 } 359 }
363 360
364 spte = __pa(shadow_page->spt) 361 /*
365 | PT_PRESENT_MASK | PT_ACCESSED_MASK 362 * Verify that the gpte in the page we've just write
366 | PT_WRITABLE_MASK | PT_USER_MASK; 363 * protected is still there.
367 *sptep = spte; 364 */
365 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
366 goto out_gpte_changed;
367
368 if (sp)
369 link_shadow_page(it.sptep, sp);
368 } 370 }
369 371
370 return sptep; 372 for (;
373 shadow_walk_okay(&it) && it.level > hlevel;
374 shadow_walk_next(&it)) {
375 gfn_t direct_gfn;
376
377 validate_direct_spte(vcpu, it.sptep, direct_access);
378
379 drop_large_spte(vcpu, it.sptep);
380
381 if (is_shadow_present_pte(*it.sptep))
382 continue;
383
384 direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
385
386 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
387 true, direct_access, it.sptep);
388 link_shadow_page(it.sptep, sp);
389 }
390
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true);
394
395 return it.sptep;
396
397out_gpte_changed:
398 if (sp)
399 kvm_mmu_put_page(sp, it.sptep);
400 kvm_release_pfn_clean(pfn);
401 return NULL;
371} 402}
372 403
373/* 404/*
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
431 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
432 463
433 /* mmio */ 464 /* mmio */
434 if (is_error_pfn(pfn)) { 465 if (is_error_pfn(pfn))
435 pgprintk("gfn %lx is mmio\n", walker.gfn); 466 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
436 kvm_release_pfn_clean(pfn);
437 return 1;
438 }
439 467
440 spin_lock(&vcpu->kvm->mmu_lock); 468 spin_lock(&vcpu->kvm->mmu_lock);
441 if (mmu_notifier_retry(vcpu, mmu_seq)) 469 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
443 kvm_mmu_free_some_pages(vcpu); 471 kvm_mmu_free_some_pages(vcpu);
444 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
445 level, &write_pt, pfn); 473 level, &write_pt, pfn);
474 (void)sptep;
446 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
447 sptep, *sptep, write_pt); 476 sptep, *sptep, write_pt);
448 477
@@ -464,6 +493,7 @@ out_unlock:
464static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 493static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
465{ 494{
466 struct kvm_shadow_walk_iterator iterator; 495 struct kvm_shadow_walk_iterator iterator;
496 struct kvm_mmu_page *sp;
467 gpa_t pte_gpa = -1; 497 gpa_t pte_gpa = -1;
468 int level; 498 int level;
469 u64 *sptep; 499 u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
475 level = iterator.level; 505 level = iterator.level;
476 sptep = iterator.sptep; 506 sptep = iterator.sptep;
477 507
508 sp = page_header(__pa(sptep));
478 if (is_last_spte(*sptep, level)) { 509 if (is_last_spte(*sptep, level)) {
479 struct kvm_mmu_page *sp = page_header(__pa(sptep));
480 int offset, shift; 510 int offset, shift;
481 511
512 if (!sp->unsync)
513 break;
514
482 shift = PAGE_SHIFT - 515 shift = PAGE_SHIFT -
483 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; 516 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
484 offset = sp->role.quadrant << shift; 517 offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 520 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
488 521
489 if (is_shadow_present_pte(*sptep)) { 522 if (is_shadow_present_pte(*sptep)) {
490 rmap_remove(vcpu->kvm, sptep);
491 if (is_large_pte(*sptep)) 523 if (is_large_pte(*sptep))
492 --vcpu->kvm->stat.lpages; 524 --vcpu->kvm->stat.lpages;
525 drop_spte(vcpu->kvm, sptep,
526 shadow_trap_nonpresent_pte);
493 need_flush = 1; 527 need_flush = 1;
494 } 528 } else
495 __set_spte(sptep, shadow_trap_nonpresent_pte); 529 __set_spte(sptep, shadow_trap_nonpresent_pte);
496 break; 530 break;
497 } 531 }
498 532
499 if (!is_shadow_present_pte(*sptep)) 533 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
500 break; 534 break;
501 } 535 }
502 536
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
570 * Using the cached information from sp->gfns is safe because: 604 * Using the cached information from sp->gfns is safe because:
571 * - The spte has a reference to the struct page, so the pfn for a given gfn 605 * - The spte has a reference to the struct page, so the pfn for a given gfn
572 * can't change unless all sptes pointing to it are nuked first. 606 * can't change unless all sptes pointing to it are nuked first.
573 * - Alias changes zap the entire shadow cache.
574 */ 607 */
575static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
609 bool clear_unsync)
576{ 610{
577 int i, offset, nr_present; 611 int i, offset, nr_present;
578 bool reset_host_protection; 612 bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 614
581 offset = nr_present = 0; 615 offset = nr_present = 0;
582 616
617 /* direct kvm_mmu_page can not be unsync. */
618 BUG_ON(sp->role.direct);
619
583 if (PTTYPE == 32) 620 if (PTTYPE == 32)
584 offset = sp->role.quadrant << PT64_LEVEL_BITS; 621 offset = sp->role.quadrant << PT64_LEVEL_BITS;
585 622
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
589 unsigned pte_access; 626 unsigned pte_access;
590 pt_element_t gpte; 627 pt_element_t gpte;
591 gpa_t pte_gpa; 628 gpa_t pte_gpa;
592 gfn_t gfn = sp->gfns[i]; 629 gfn_t gfn;
593 630
594 if (!is_shadow_present_pte(sp->spt[i])) 631 if (!is_shadow_present_pte(sp->spt[i]))
595 continue; 632 continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
600 sizeof(pt_element_t))) 637 sizeof(pt_element_t)))
601 return -EINVAL; 638 return -EINVAL;
602 639
603 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || 640 gfn = gpte_to_gfn(gpte);
604 !(gpte & PT_ACCESSED_MASK)) { 641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
605 u64 nonpresent; 644 u64 nonpresent;
606 645
607 rmap_remove(vcpu->kvm, &sp->spt[i]); 646 if (is_present_gpte(gpte) || !clear_unsync)
608 if (is_present_gpte(gpte))
609 nonpresent = shadow_trap_nonpresent_pte; 647 nonpresent = shadow_trap_nonpresent_pte;
610 else 648 else
611 nonpresent = shadow_notrap_nonpresent_pte; 649 nonpresent = shadow_notrap_nonpresent_pte;
612 __set_spte(&sp->spt[i], nonpresent); 650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
613 continue; 651 continue;
614 } 652 }
615 653
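
The rmap_remove()/__set_spte() pairs removed in the invlpg and sync_page hunks above are folded into drop_spte(). A rough sketch of what that helper presumably does, inferred from the code it replaces (assumption; the real mmu.c helper may also handle large-page and dirty-bit accounting):

	/* Sketch only: assumed core of drop_spte(), based on the pair it replaces. */
	static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
	{
		if (is_shadow_present_pte(*sptep))	/* assumed guard */
			rmap_remove(kvm, sptep);
		__set_spte(sptep, new_spte);
	}
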
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd2..bc5b9b8d4a3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7 * 8 *
8 * Authors: 9 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -130,7 +131,7 @@ static struct svm_direct_access_msrs {
130 u32 index; /* Index of the MSR */ 131 u32 index; /* Index of the MSR */
131 bool always; /* True if intercept is always on */ 132 bool always; /* True if intercept is always on */
132} direct_access_msrs[] = { 133} direct_access_msrs[] = {
133 { .index = MSR_K6_STAR, .always = true }, 134 { .index = MSR_STAR, .always = true },
134 { .index = MSR_IA32_SYSENTER_CS, .always = true }, 135 { .index = MSR_IA32_SYSENTER_CS, .always = true },
135#ifdef CONFIG_X86_64 136#ifdef CONFIG_X86_64
136 { .index = MSR_GS_BASE, .always = true }, 137 { .index = MSR_GS_BASE, .always = true },
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
285 286
286static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
287{ 288{
289 vcpu->arch.efer = efer;
288 if (!npt_enabled && !(efer & EFER_LMA)) 290 if (!npt_enabled && !(efer & EFER_LMA))
289 efer &= ~EFER_LME; 291 efer &= ~EFER_LME;
290 292
291 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
292 vcpu->arch.efer = efer;
293} 294}
294 295
295static int is_external_interrupt(u32 info) 296static int is_external_interrupt(u32 info)
@@ -383,8 +384,7 @@ static void svm_init_erratum_383(void)
383 int err; 384 int err;
384 u64 val; 385 u64 val;
385 386
386 /* Only Fam10h is affected */ 387 if (!cpu_has_amd_erratum(amd_erratum_383))
387 if (boot_cpu_data.x86 != 0x10)
388 return; 388 return;
389 389
390 /* Use _safe variants to not break nested virtualization */ 390 /* Use _safe variants to not break nested virtualization */
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)
640 640
641 if (nested) { 641 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME); 643 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
644 } 644 }
645 645
646 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
@@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm)
806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
807 */ 807 */
808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
809 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 809 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
810 810
811 save->cr4 = X86_CR4_PAE; 811 save->cr4 = X86_CR4_PAE;
812 /* rdx = ?? */ 812 /* rdx = ?? */
@@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
903 svm->asid_generation = 0; 903 svm->asid_generation = 0;
904 init_vmcb(svm); 904 init_vmcb(svm);
905 905
906 fx_init(&svm->vcpu); 906 err = fx_init(&svm->vcpu);
907 if (err)
908 goto free_page4;
909
907 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 910 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
908 if (kvm_vcpu_is_bsp(&svm->vcpu)) 911 if (kvm_vcpu_is_bsp(&svm->vcpu))
909 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 912 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
910 913
911 return &svm->vcpu; 914 return &svm->vcpu;
912 915
916free_page4:
917 __free_page(hsave_page);
913free_page3: 918free_page3:
914 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 919 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
915free_page2: 920free_page2:
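
The new free_page4 label above follows the usual kernel error-unwind idiom: each setup step gets a label that undoes it, and a failure jumps to the label of the most recently completed step so teardown runs in reverse order. A generic, purely illustrative sketch of the pattern (hypothetical helpers, not the svm.c code):

	/* Hypothetical helpers, declared only so the sketch is self-contained. */
	static int step_a(void); static void undo_a(void);
	static int step_b(void); static void undo_b(void);
	static int step_c(void);

	static int setup_example(void)
	{
		int err;

		err = step_a();
		if (err)
			goto out;
		err = step_b();
		if (err)
			goto err_a;
		err = step_c();
		if (err)
			goto err_b;
		return 0;

	err_b:
		undo_b();	/* undo step_b() */
	err_a:
		undo_a();	/* undo step_a() */
	out:
		return err;
	}
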
@@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
1488 */ 1493 */
1489 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 1494 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1490 1495
1491 set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); 1496 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1492 1497
1493 return; 1498 return;
1494 } 1499 }
@@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm)
1535 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1540 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1541 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1537 if (string || in) 1542 if (string || in)
1538 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 1543 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
1539 1544
1540 port = io_info >> 16; 1545 port = io_info >> 16;
1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1546 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1957 svm->vmcb->save.cr3 = hsave->save.cr3; 1962 svm->vmcb->save.cr3 = hsave->save.cr3;
1958 svm->vcpu.arch.cr3 = hsave->save.cr3; 1963 svm->vcpu.arch.cr3 = hsave->save.cr3;
1959 } else { 1964 } else {
1960 kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1965 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1961 } 1966 }
1962 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 1967 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1963 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 1968 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2085 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2086 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2082 } else 2087 } else
2083 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2088 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2084 2089
2085 /* Guest paging mode is active - reset mmu */ 2090 /* Guest paging mode is active - reset mmu */
2086 kvm_mmu_reset_context(&svm->vcpu); 2091 kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm)
2386 2391
2387static int invlpg_interception(struct vcpu_svm *svm) 2392static int invlpg_interception(struct vcpu_svm *svm)
2388{ 2393{
2389 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2394 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2390 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2391 return 1;
2392} 2395}
2393 2396
2394static int emulate_on_interception(struct vcpu_svm *svm) 2397static int emulate_on_interception(struct vcpu_svm *svm)
2395{ 2398{
2396 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2397 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2398 return 1;
2399} 2400}
2400 2401
2401static int cr8_write_interception(struct vcpu_svm *svm) 2402static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2431 *data = tsc_offset + native_read_tsc(); 2432 *data = tsc_offset + native_read_tsc();
2432 break; 2433 break;
2433 } 2434 }
2434 case MSR_K6_STAR: 2435 case MSR_STAR:
2435 *data = svm->vmcb->save.star; 2436 *data = svm->vmcb->save.star;
2436 break; 2437 break;
2437#ifdef CONFIG_X86_64 2438#ifdef CONFIG_X86_64
@@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2555 2556
2556 break; 2557 break;
2557 } 2558 }
2558 case MSR_K6_STAR: 2559 case MSR_STAR:
2559 svm->vmcb->save.star = data; 2560 svm->vmcb->save.star = data;
2560 break; 2561 break;
2561#ifdef CONFIG_X86_64 2562#ifdef CONFIG_X86_64
@@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2726 [SVM_EXIT_NPF] = pf_interception, 2727 [SVM_EXIT_NPF] = pf_interception,
2727}; 2728};
2728 2729
2730void dump_vmcb(struct kvm_vcpu *vcpu)
2731{
2732 struct vcpu_svm *svm = to_svm(vcpu);
2733 struct vmcb_control_area *control = &svm->vmcb->control;
2734 struct vmcb_save_area *save = &svm->vmcb->save;
2735
2736 pr_err("VMCB Control Area:\n");
2737 pr_err("cr_read: %04x\n", control->intercept_cr_read);
2738 pr_err("cr_write: %04x\n", control->intercept_cr_write);
2739 pr_err("dr_read: %04x\n", control->intercept_dr_read);
2740 pr_err("dr_write: %04x\n", control->intercept_dr_write);
2741 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2742 pr_err("intercepts: %016llx\n", control->intercept);
2743 pr_err("pause filter count: %d\n", control->pause_filter_count);
2744 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
2745 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
2746 pr_err("tsc_offset: %016llx\n", control->tsc_offset);
2747 pr_err("asid: %d\n", control->asid);
2748 pr_err("tlb_ctl: %d\n", control->tlb_ctl);
2749 pr_err("int_ctl: %08x\n", control->int_ctl);
2750 pr_err("int_vector: %08x\n", control->int_vector);
2751 pr_err("int_state: %08x\n", control->int_state);
2752 pr_err("exit_code: %08x\n", control->exit_code);
2753 pr_err("exit_info1: %016llx\n", control->exit_info_1);
2754 pr_err("exit_info2: %016llx\n", control->exit_info_2);
2755 pr_err("exit_int_info: %08x\n", control->exit_int_info);
2756 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
2757 pr_err("nested_ctl: %lld\n", control->nested_ctl);
2758 pr_err("nested_cr3: %016llx\n", control->nested_cr3);
2759 pr_err("event_inj: %08x\n", control->event_inj);
2760 pr_err("event_inj_err: %08x\n", control->event_inj_err);
2761 pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
2762 pr_err("next_rip: %016llx\n", control->next_rip);
2763 pr_err("VMCB State Save Area:\n");
2764 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
2765 save->es.selector, save->es.attrib,
2766 save->es.limit, save->es.base);
2767 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
2768 save->cs.selector, save->cs.attrib,
2769 save->cs.limit, save->cs.base);
2770 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
2771 save->ss.selector, save->ss.attrib,
2772 save->ss.limit, save->ss.base);
2773 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
2774 save->ds.selector, save->ds.attrib,
2775 save->ds.limit, save->ds.base);
2776 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
2777 save->fs.selector, save->fs.attrib,
2778 save->fs.limit, save->fs.base);
2779 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
2780 save->gs.selector, save->gs.attrib,
2781 save->gs.limit, save->gs.base);
2782 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
2783 save->gdtr.selector, save->gdtr.attrib,
2784 save->gdtr.limit, save->gdtr.base);
2785 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
2786 save->ldtr.selector, save->ldtr.attrib,
2787 save->ldtr.limit, save->ldtr.base);
2788 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
2789 save->idtr.selector, save->idtr.attrib,
2790 save->idtr.limit, save->idtr.base);
2791 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
2792 save->tr.selector, save->tr.attrib,
2793 save->tr.limit, save->tr.base);
2794 pr_err("cpl: %d efer: %016llx\n",
2795 save->cpl, save->efer);
2796 pr_err("cr0: %016llx cr2: %016llx\n",
2797 save->cr0, save->cr2);
2798 pr_err("cr3: %016llx cr4: %016llx\n",
2799 save->cr3, save->cr4);
2800 pr_err("dr6: %016llx dr7: %016llx\n",
2801 save->dr6, save->dr7);
2802 pr_err("rip: %016llx rflags: %016llx\n",
2803 save->rip, save->rflags);
2804 pr_err("rsp: %016llx rax: %016llx\n",
2805 save->rsp, save->rax);
2806 pr_err("star: %016llx lstar: %016llx\n",
2807 save->star, save->lstar);
2808 pr_err("cstar: %016llx sfmask: %016llx\n",
2809 save->cstar, save->sfmask);
2810 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
2811 save->kernel_gs_base, save->sysenter_cs);
2812 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
2813 save->sysenter_esp, save->sysenter_eip);
2814 pr_err("gpat: %016llx dbgctl: %016llx\n",
2815 save->g_pat, save->dbgctl);
2816 pr_err("br_from: %016llx br_to: %016llx\n",
2817 save->br_from, save->br_to);
2818 pr_err("excp_from: %016llx excp_to: %016llx\n",
2819 save->last_excp_from, save->last_excp_to);
2820
2821}
2822
2729static int handle_exit(struct kvm_vcpu *vcpu) 2823static int handle_exit(struct kvm_vcpu *vcpu)
2730{ 2824{
2731 struct vcpu_svm *svm = to_svm(vcpu); 2825 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2864 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2771 kvm_run->fail_entry.hardware_entry_failure_reason 2865 kvm_run->fail_entry.hardware_entry_failure_reason
2772 = svm->vmcb->control.exit_code; 2866 = svm->vmcb->control.exit_code;
2867 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
2868 dump_vmcb(vcpu);
2773 return 0; 2869 return 0;
2774 } 2870 }
2775 2871
@@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2826{ 2922{
2827 struct vmcb_control_area *control; 2923 struct vmcb_control_area *control;
2828 2924
2829 trace_kvm_inj_virq(irq);
2830
2831 ++svm->vcpu.stat.irq_injections;
2832 control = &svm->vmcb->control; 2925 control = &svm->vmcb->control;
2833 control->int_vector = irq; 2926 control->int_vector = irq;
2834 control->int_ctl &= ~V_INTR_PRIO_MASK; 2927 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
2842 2935
2843 BUG_ON(!(gif_set(svm))); 2936 BUG_ON(!(gif_set(svm)));
2844 2937
2938 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
2939 ++vcpu->stat.irq_injections;
2940
2845 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 2941 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2846 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 2942 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2847} 2943}
@@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void)
3327 return false; 3423 return false;
3328} 3424}
3329 3425
3426static bool svm_has_wbinvd_exit(void)
3427{
3428 return true;
3429}
3430
3330static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) 3431static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3331{ 3432{
3332 struct vcpu_svm *svm = to_svm(vcpu); 3433 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {
3411 .rdtscp_supported = svm_rdtscp_supported, 3512 .rdtscp_supported = svm_rdtscp_supported,
3412 3513
3413 .set_supported_cpuid = svm_set_supported_cpuid, 3514 .set_supported_cpuid = svm_set_supported_cpuid,
3515
3516 .has_wbinvd_exit = svm_has_wbinvd_exit,
3414}; 3517};
3415 3518
3416static int __init svm_init(void) 3519static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ff..e16a0dbe74d 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
1#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
2#include <linux/kvm.h> 16#include <linux/kvm.h>
3#include <linux/hrtimer.h> 17#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 32 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
19 atomic_inc(&ktimer->pending); 33 atomic_inc(&ktimer->pending);
20 /* FIXME: this code should not know anything about vcpus */ 34 /* FIXME: this code should not know anything about vcpus */
21 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 35 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
22 } 36 }
23 37
24 if (waitqueue_active(q)) 38 if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe7..49b25eee25a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
36#include <asm/vmx.h> 37#include <asm/vmx.h>
37#include <asm/virtext.h> 38#include <asm/virtext.h>
38#include <asm/mce.h> 39#include <asm/mce.h>
40#include <asm/i387.h>
41#include <asm/xcr.h>
39 42
40#include "trace.h" 43#include "trace.h"
41 44
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
63static int __read_mostly emulate_invalid_guest_state = 0; 66static int __read_mostly emulate_invalid_guest_state = 0;
64module_param(emulate_invalid_guest_state, bool, S_IRUGO); 67module_param(emulate_invalid_guest_state, bool, S_IRUGO);
65 68
69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO);
71
66#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
67 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
68#define KVM_GUEST_CR0_MASK \ 74#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
173 179
174static int init_rmode(struct kvm *kvm); 180static int init_rmode(struct kvm *kvm);
175static u64 construct_eptp(unsigned long root_hpa); 181static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void);
176 184
177static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
178static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
179static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 187static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
188static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
180 189
181static unsigned long *vmx_io_bitmap_a; 190static unsigned long *vmx_io_bitmap_a;
182static unsigned long *vmx_io_bitmap_b; 191static unsigned long *vmx_io_bitmap_b;
@@ -231,14 +240,14 @@ static u64 host_efer;
231static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 240static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
232 241
233/* 242/*
234 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it 243 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
235 * away by decrementing the array size. 244 * away by decrementing the array size.
236 */ 245 */
237static const u32 vmx_msr_index[] = { 246static const u32 vmx_msr_index[] = {
238#ifdef CONFIG_X86_64 247#ifdef CONFIG_X86_64
239 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 248 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
240#endif 249#endif
241 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, 250 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
242}; 251};
243#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 252#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
244 253
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 343 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
335} 344}
336 345
346static inline bool cpu_has_vmx_ept_4levels(void)
347{
348 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
349}
350
337static inline bool cpu_has_vmx_invept_individual_addr(void) 351static inline bool cpu_has_vmx_invept_individual_addr(void)
338{ 352{
339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 353 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 363 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
350} 364}
351 365
366static inline bool cpu_has_vmx_invvpid_single(void)
367{
368 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
369}
370
371static inline bool cpu_has_vmx_invvpid_global(void)
372{
373 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
374}
375
352static inline bool cpu_has_vmx_ept(void) 376static inline bool cpu_has_vmx_ept(void)
353{ 377{
354 return vmcs_config.cpu_based_2nd_exec_ctrl & 378 return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 413 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
390} 414}
391 415
416static inline bool cpu_has_vmx_wbinvd_exit(void)
417{
418 return vmcs_config.cpu_based_2nd_exec_ctrl &
419 SECONDARY_EXEC_WBINVD_EXITING;
420}
421
392static inline bool report_flexpriority(void) 422static inline bool report_flexpriority(void)
393{ 423{
394 return flexpriority_enabled; 424 return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
453 vmcs, phys_addr); 483 vmcs, phys_addr);
454} 484}
455 485
486static void vmcs_load(struct vmcs *vmcs)
487{
488 u64 phys_addr = __pa(vmcs);
489 u8 error;
490
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory");
494 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
496 vmcs, phys_addr);
497}
498
456static void __vcpu_clear(void *arg) 499static void __vcpu_clear(void *arg)
457{ 500{
458 struct vcpu_vmx *vmx = arg; 501 struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
475 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 518 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
476} 519}
477 520
478static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 521static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
479{ 522{
480 if (vmx->vpid == 0) 523 if (vmx->vpid == 0)
481 return; 524 return;
482 525
483 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 526 if (cpu_has_vmx_invvpid_single())
527 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
528}
529
530static inline void vpid_sync_vcpu_global(void)
531{
532 if (cpu_has_vmx_invvpid_global())
533 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
534}
535
536static inline void vpid_sync_context(struct vcpu_vmx *vmx)
537{
538 if (cpu_has_vmx_invvpid_single())
539 vpid_sync_vcpu_single(vmx);
540 else
541 vpid_sync_vcpu_global();
484} 542}
485 543
486static inline void ept_sync_global(void) 544static inline void ept_sync_global(void)
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
812 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 870 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
813 } 871 }
814#endif 872#endif
873 if (current_thread_info()->status & TS_USEDFPU)
874 clts();
875 load_gdt(&__get_cpu_var(host_gdt));
815} 876}
816 877
817static void vmx_load_host_state(struct vcpu_vmx *vmx) 878static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
828static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 889static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
829{ 890{
830 struct vcpu_vmx *vmx = to_vmx(vcpu); 891 struct vcpu_vmx *vmx = to_vmx(vcpu);
831 u64 phys_addr = __pa(vmx->vmcs);
832 u64 tsc_this, delta, new_offset; 892 u64 tsc_this, delta, new_offset;
893 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
833 894
834 if (vcpu->cpu != cpu) { 895 if (!vmm_exclusive)
896 kvm_cpu_vmxon(phys_addr);
897 else if (vcpu->cpu != cpu)
835 vcpu_clear(vmx); 898 vcpu_clear(vmx);
836 kvm_migrate_timers(vcpu);
837 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
838 local_irq_disable();
839 list_add(&vmx->local_vcpus_link,
840 &per_cpu(vcpus_on_cpu, cpu));
841 local_irq_enable();
842 }
843 899
844 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 900 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
845 u8 error;
846
847 per_cpu(current_vmcs, cpu) = vmx->vmcs; 901 per_cpu(current_vmcs, cpu) = vmx->vmcs;
848 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 902 vmcs_load(vmx->vmcs);
849 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
850 : "cc");
851 if (error)
852 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
853 vmx->vmcs, phys_addr);
854 } 903 }
855 904
856 if (vcpu->cpu != cpu) { 905 if (vcpu->cpu != cpu) {
857 struct desc_ptr dt; 906 struct desc_ptr dt;
858 unsigned long sysenter_esp; 907 unsigned long sysenter_esp;
859 908
909 kvm_migrate_timers(vcpu);
910 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
911 local_irq_disable();
912 list_add(&vmx->local_vcpus_link,
913 &per_cpu(vcpus_on_cpu, cpu));
914 local_irq_enable();
915
860 vcpu->cpu = cpu; 916 vcpu->cpu = cpu;
861 /* 917 /*
862 * Linux uses per-cpu TSS and GDT, so set these when switching 918 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 940static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
885{ 941{
886 __vmx_load_host_state(to_vmx(vcpu)); 942 __vmx_load_host_state(to_vmx(vcpu));
943 if (!vmm_exclusive) {
944 __vcpu_clear(to_vmx(vcpu));
945 kvm_cpu_vmxoff();
946 }
887} 947}
888 948
889static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 949static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
1057 if (index >= 0 && vmx->rdtscp_enabled) 1117 if (index >= 0 && vmx->rdtscp_enabled)
1058 move_msr_up(vmx, index, save_nmsrs++); 1118 move_msr_up(vmx, index, save_nmsrs++);
1059 /* 1119 /*
1060 * MSR_K6_STAR is only needed on long mode guests, and only 1120 * MSR_STAR is only needed on long mode guests, and only
1061 * if efer.sce is enabled. 1121 * if efer.sce is enabled.
1062 */ 1122 */
1063 index = __find_msr_index(vmx, MSR_K6_STAR); 1123 index = __find_msr_index(vmx, MSR_STAR);
1064 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 1124 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
1065 move_msr_up(vmx, index, save_nmsrs++); 1125 move_msr_up(vmx, index, save_nmsrs++);
1066 } 1126 }
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
1286 /* locked but not enabled */ 1346 /* locked but not enabled */
1287} 1347}
1288 1348
1349static void kvm_cpu_vmxon(u64 addr)
1350{
1351 asm volatile (ASM_VMX_VMXON_RAX
1352 : : "a"(&addr), "m"(addr)
1353 : "memory", "cc");
1354}
1355
1289static int hardware_enable(void *garbage) 1356static int hardware_enable(void *garbage)
1290{ 1357{
1291 int cpu = raw_smp_processor_id(); 1358 int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 1375 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1309 } 1376 }
1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1377 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1311 asm volatile (ASM_VMX_VMXON_RAX
1312 : : "a"(&phys_addr), "m"(phys_addr)
1313 : "memory", "cc");
1314 1378
1315 ept_sync_global(); 1379 if (vmm_exclusive) {
1380 kvm_cpu_vmxon(phys_addr);
1381 ept_sync_global();
1382 }
1383
1384 store_gdt(&__get_cpu_var(host_gdt));
1316 1385
1317 return 0; 1386 return 0;
1318} 1387}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
1334static void kvm_cpu_vmxoff(void) 1403static void kvm_cpu_vmxoff(void)
1335{ 1404{
1336 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1405 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1337 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1338} 1406}
1339 1407
1340static void hardware_disable(void *garbage) 1408static void hardware_disable(void *garbage)
1341{ 1409{
1342 vmclear_local_vcpus(); 1410 if (vmm_exclusive) {
1343 kvm_cpu_vmxoff(); 1411 vmclear_local_vcpus();
1412 kvm_cpu_vmxoff();
1413 }
1414 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1344} 1415}
1345 1416
1346static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1417static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
1539 if (!cpu_has_vmx_vpid()) 1610 if (!cpu_has_vmx_vpid())
1540 enable_vpid = 0; 1611 enable_vpid = 0;
1541 1612
1542 if (!cpu_has_vmx_ept()) { 1613 if (!cpu_has_vmx_ept() ||
1614 !cpu_has_vmx_ept_4levels()) {
1543 enable_ept = 0; 1615 enable_ept = 0;
1544 enable_unrestricted_guest = 0; 1616 enable_unrestricted_guest = 0;
1545 } 1617 }
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1628 gfn_t base_gfn; 1700 gfn_t base_gfn;
1629 1701
1630 slots = kvm_memslots(kvm); 1702 slots = kvm_memslots(kvm);
1631 base_gfn = kvm->memslots->memslots[0].base_gfn + 1703 base_gfn = slots->memslots[0].base_gfn +
1632 kvm->memslots->memslots[0].npages - 3; 1704 kvm->memslots->memslots[0].npages - 3;
1633 return base_gfn << PAGE_SHIFT; 1705 return base_gfn << PAGE_SHIFT;
1634 } 1706 }
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1759 1831
1760static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1832static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1761{ 1833{
1762 vpid_sync_vcpu_all(to_vmx(vcpu)); 1834 vpid_sync_context(to_vmx(vcpu));
1763 if (enable_ept) 1835 if (enable_ept) {
1836 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1837 return;
1764 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1838 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1839 }
1765} 1840}
1766 1841
1767static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1842static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2507 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2582 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2508 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2583 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2509 2584
2510 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 2585 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2511 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 2586 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2512 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 2587 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2513 2588
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2599 2674
2600static int init_rmode(struct kvm *kvm) 2675static int init_rmode(struct kvm *kvm)
2601{ 2676{
2677 int idx, ret = 0;
2678
2679 idx = srcu_read_lock(&kvm->srcu);
2602 if (!init_rmode_tss(kvm)) 2680 if (!init_rmode_tss(kvm))
2603 return 0; 2681 goto exit;
2604 if (!init_rmode_identity_map(kvm)) 2682 if (!init_rmode_identity_map(kvm))
2605 return 0; 2683 goto exit;
2606 return 1; 2684
2685 ret = 1;
2686exit:
2687 srcu_read_unlock(&kvm->srcu, idx);
2688 return ret;
2607} 2689}
2608 2690
2609static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2691static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2610{ 2692{
2611 struct vcpu_vmx *vmx = to_vmx(vcpu); 2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
2612 u64 msr; 2694 u64 msr;
2613 int ret, idx; 2695 int ret;
2614 2696
2615 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2697 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2616 idx = srcu_read_lock(&vcpu->kvm->srcu);
2617 if (!init_rmode(vmx->vcpu.kvm)) { 2698 if (!init_rmode(vmx->vcpu.kvm)) {
2618 ret = -ENOMEM; 2699 ret = -ENOMEM;
2619 goto out; 2700 goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2630 msr |= MSR_IA32_APICBASE_BSP; 2711 msr |= MSR_IA32_APICBASE_BSP;
2631 kvm_set_apic_base(&vmx->vcpu, msr); 2712 kvm_set_apic_base(&vmx->vcpu, msr);
2632 2713
2633 fx_init(&vmx->vcpu); 2714 ret = fx_init(&vmx->vcpu);
2715 if (ret != 0)
2716 goto out;
2634 2717
2635 seg_setup(VCPU_SREG_CS); 2718 seg_setup(VCPU_SREG_CS);
2636 /* 2719 /*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2713 vmx_fpu_activate(&vmx->vcpu); 2796 vmx_fpu_activate(&vmx->vcpu);
2714 update_exception_bitmap(&vmx->vcpu); 2797 update_exception_bitmap(&vmx->vcpu);
2715 2798
2716 vpid_sync_vcpu_all(vmx); 2799 vpid_sync_context(vmx);
2717 2800
2718 ret = 0; 2801 ret = 0;
2719 2802
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2721 vmx->emulation_required = 0; 2804 vmx->emulation_required = 0;
2722 2805
2723out: 2806out:
2724 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2725 return ret; 2807 return ret;
2726} 2808}
2727 2809
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2826{ 2908{
2827 if (!cpu_has_virtual_nmis()) 2909 if (!cpu_has_virtual_nmis())
2828 return to_vmx(vcpu)->soft_vnmi_blocked; 2910 return to_vmx(vcpu)->soft_vnmi_blocked;
2829 else 2911 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2830 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2831 GUEST_INTR_STATE_NMI);
2832} 2912}
2833 2913
2834static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 2914static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3070 ++vcpu->stat.io_exits; 3150 ++vcpu->stat.io_exits;
3071 3151
3072 if (string || in) 3152 if (string || in)
3073 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 3153 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3074 3154
3075 port = exit_qualification >> 16; 3155 port = exit_qualification >> 16;
3076 size = (exit_qualification & 7) + 1; 3156 size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3090 hypercall[2] = 0xc1; 3170 hypercall[2] = 0xc1;
3091} 3171}
3092 3172
3173static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3174{
3175 if (err)
3176 kvm_inject_gp(vcpu, 0);
3177 else
3178 skip_emulated_instruction(vcpu);
3179}
3180
3093static int handle_cr(struct kvm_vcpu *vcpu) 3181static int handle_cr(struct kvm_vcpu *vcpu)
3094{ 3182{
3095 unsigned long exit_qualification, val; 3183 unsigned long exit_qualification, val;
3096 int cr; 3184 int cr;
3097 int reg; 3185 int reg;
3186 int err;
3098 3187
3099 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3188 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3100 cr = exit_qualification & 15; 3189 cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3105 trace_kvm_cr_write(cr, val); 3194 trace_kvm_cr_write(cr, val);
3106 switch (cr) { 3195 switch (cr) {
3107 case 0: 3196 case 0:
3108 kvm_set_cr0(vcpu, val); 3197 err = kvm_set_cr0(vcpu, val);
3109 skip_emulated_instruction(vcpu); 3198 complete_insn_gp(vcpu, err);
3110 return 1; 3199 return 1;
3111 case 3: 3200 case 3:
3112 kvm_set_cr3(vcpu, val); 3201 err = kvm_set_cr3(vcpu, val);
3113 skip_emulated_instruction(vcpu); 3202 complete_insn_gp(vcpu, err);
3114 return 1; 3203 return 1;
3115 case 4: 3204 case 4:
3116 kvm_set_cr4(vcpu, val); 3205 err = kvm_set_cr4(vcpu, val);
3117 skip_emulated_instruction(vcpu); 3206 complete_insn_gp(vcpu, err);
3118 return 1; 3207 return 1;
3119 case 8: { 3208 case 8: {
3120 u8 cr8_prev = kvm_get_cr8(vcpu); 3209 u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
3321static int handle_wbinvd(struct kvm_vcpu *vcpu) 3410static int handle_wbinvd(struct kvm_vcpu *vcpu)
3322{ 3411{
3323 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3324 /* TODO: Add support for VT-d/pass-through device */ 3413 kvm_emulate_wbinvd(vcpu);
3325 return 1; 3414 return 1;
3326} 3415}
3327 3416
3328static int handle_apic_access(struct kvm_vcpu *vcpu) 3417static int handle_xsetbv(struct kvm_vcpu *vcpu)
3329{ 3418{
3330 unsigned long exit_qualification; 3419 u64 new_bv = kvm_read_edx_eax(vcpu);
3331 enum emulation_result er; 3420 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3332 unsigned long offset;
3333 3421
3334 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3422 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
3335 offset = exit_qualification & 0xffful; 3423 skip_emulated_instruction(vcpu);
3336
3337 er = emulate_instruction(vcpu, 0, 0, 0);
3338
3339 if (er != EMULATE_DONE) {
3340 printk(KERN_ERR
3341 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3342 offset);
3343 return -ENOEXEC;
3344 }
3345 return 1; 3424 return 1;
3346} 3425}
3347 3426
3427static int handle_apic_access(struct kvm_vcpu *vcpu)
3428{
3429 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3430}
3431
3348static int handle_task_switch(struct kvm_vcpu *vcpu) 3432static int handle_task_switch(struct kvm_vcpu *vcpu)
3349{ 3433{
3350 struct vcpu_vmx *vmx = to_vmx(vcpu); 3434 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3554 goto out; 3638 goto out;
3555 } 3639 }
3556 3640
3557 if (err != EMULATE_DONE) { 3641 if (err != EMULATE_DONE)
3558 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3642 return 0;
3559 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3560 vcpu->run->internal.ndata = 0;
3561 ret = 0;
3562 goto out;
3563 }
3564 3643
3565 if (signal_pending(current)) 3644 if (signal_pending(current))
3566 goto out; 3645 goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3623 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3702 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3624 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3703 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3625 [EXIT_REASON_WBINVD] = handle_wbinvd, 3704 [EXIT_REASON_WBINVD] = handle_wbinvd,
3705 [EXIT_REASON_XSETBV] = handle_xsetbv,
3626 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3706 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3627 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3707 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3628 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3708 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3656 if (enable_ept && is_paging(vcpu)) 3736 if (enable_ept && is_paging(vcpu))
3657 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3737 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3658 3738
3739 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3740 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3741 vcpu->run->fail_entry.hardware_entry_failure_reason
3742 = exit_reason;
3743 return 0;
3744 }
3745
3659 if (unlikely(vmx->fail)) { 3746 if (unlikely(vmx->fail)) {
3660 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3747 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3661 vcpu->run->fail_entry.hardware_entry_failure_reason 3748 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3861 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3948 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3862 vmx_set_interrupt_shadow(vcpu, 0); 3949 vmx_set_interrupt_shadow(vcpu, 0);
3863 3950
3864 /*
3865 * Loading guest fpu may have cleared host cr0.ts
3866 */
3867 vmcs_writel(HOST_CR0, read_cr0());
3868
3869 asm( 3951 asm(
3870 /* Store host registers */ 3952 /* Store host registers */
3871 "push %%"R"dx; push %%"R"bp;" 3953 "push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4001 kmem_cache_free(kvm_vcpu_cache, vmx); 4083 kmem_cache_free(kvm_vcpu_cache, vmx);
4002} 4084}
4003 4085
4086static inline void vmcs_init(struct vmcs *vmcs)
4087{
4088 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4089
4090 if (!vmm_exclusive)
4091 kvm_cpu_vmxon(phys_addr);
4092
4093 vmcs_clear(vmcs);
4094
4095 if (!vmm_exclusive)
4096 kvm_cpu_vmxoff();
4097}
4098
4004static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 4099static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4005{ 4100{
4006 int err; 4101 int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4026 if (!vmx->vmcs) 4121 if (!vmx->vmcs)
4027 goto free_msrs; 4122 goto free_msrs;
4028 4123
4029 vmcs_clear(vmx->vmcs); 4124 vmcs_init(vmx->vmcs);
4030 4125
4031 cpu = get_cpu(); 4126 cpu = get_cpu();
4032 vmx_vcpu_load(&vmx->vcpu, cpu); 4127 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4265 .rdtscp_supported = vmx_rdtscp_supported, 4360 .rdtscp_supported = vmx_rdtscp_supported,
4266 4361
4267 .set_supported_cpuid = vmx_set_supported_cpuid, 4362 .set_supported_cpuid = vmx_set_supported_cpuid,
4363
4364 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4268}; 4365};
4269 4366
4270static int __init vmx_init(void) 4367static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64..25f19078b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 10 *
10 * Authors: 11 * Authors:
11 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,19 @@
41#include <linux/srcu.h> 42#include <linux/srcu.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h>
44#include <trace/events/kvm.h> 46#include <trace/events/kvm.h>
45 47
46#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
47#include "trace.h" 49#include "trace.h"
48 50
49#include <asm/debugreg.h> 51#include <asm/debugreg.h>
50#include <asm/uaccess.h>
51#include <asm/msr.h> 52#include <asm/msr.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
53#include <asm/mtrr.h> 54#include <asm/mtrr.h>
54#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h>
57#include <asm/xcr.h>
55 58
56#define MAX_IO_MSRS 256 59#define MAX_IO_MSRS 256
57#define CR0_RESERVED_BITS \ 60#define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
62 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
63 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
64 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
68 | X86_CR4_OSXSAVE \
65 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
66 70
67#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147 { NULL } 151 { NULL }
148}; 152};
149 153
154u64 __read_mostly host_xcr0;
155
156static inline u32 bit(int bitno)
157{
158 return 1 << (bitno & 31);
159}
160
150static void kvm_on_user_return(struct user_return_notifier *urn) 161static void kvm_on_user_return(struct user_return_notifier *urn)
151{ 162{
152 unsigned slot; 163 unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
285 prev_nr = vcpu->arch.exception.nr; 296 prev_nr = vcpu->arch.exception.nr;
286 if (prev_nr == DF_VECTOR) { 297 if (prev_nr == DF_VECTOR) {
287 /* triple fault -> shutdown */ 298 /* triple fault -> shutdown */
288 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 299 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
289 return; 300 return;
290 } 301 }
291 class1 = exception_class(prev_nr); 302 class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
414 return changed; 425 return changed;
415} 426}
416 427
417void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 428int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
418{ 429{
430 unsigned long old_cr0 = kvm_read_cr0(vcpu);
431 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
432 X86_CR0_CD | X86_CR0_NW;
433
419 cr0 |= X86_CR0_ET; 434 cr0 |= X86_CR0_ET;
420 435
421#ifdef CONFIG_X86_64 436#ifdef CONFIG_X86_64
422 if (cr0 & 0xffffffff00000000UL) { 437 if (cr0 & 0xffffffff00000000UL)
423 kvm_inject_gp(vcpu, 0); 438 return 1;
424 return;
425 }
426#endif 439#endif
427 440
428 cr0 &= ~CR0_RESERVED_BITS; 441 cr0 &= ~CR0_RESERVED_BITS;
429 442
430 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
431 kvm_inject_gp(vcpu, 0); 444 return 1;
432 return;
433 }
434 445
435 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
436 kvm_inject_gp(vcpu, 0); 447 return 1;
437 return;
438 }
439 448
440 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 449 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
441#ifdef CONFIG_X86_64 450#ifdef CONFIG_X86_64
442 if ((vcpu->arch.efer & EFER_LME)) { 451 if ((vcpu->arch.efer & EFER_LME)) {
443 int cs_db, cs_l; 452 int cs_db, cs_l;
444 453
445 if (!is_pae(vcpu)) { 454 if (!is_pae(vcpu))
446 kvm_inject_gp(vcpu, 0); 455 return 1;
447 return;
448 }
449 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 456 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
450 if (cs_l) { 457 if (cs_l)
451 kvm_inject_gp(vcpu, 0); 458 return 1;
452 return;
453
454 }
455 } else 459 } else
456#endif 460#endif
457 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
458 kvm_inject_gp(vcpu, 0); 462 return 1;
459 return;
460 }
461
462 } 463 }
463 464
464 kvm_x86_ops->set_cr0(vcpu, cr0); 465 kvm_x86_ops->set_cr0(vcpu, cr0);
465 466
466 kvm_mmu_reset_context(vcpu); 467 if ((cr0 ^ old_cr0) & update_bits)
467 return; 468 kvm_mmu_reset_context(vcpu);
469 return 0;
468} 470}
469EXPORT_SYMBOL_GPL(kvm_set_cr0); 471EXPORT_SYMBOL_GPL(kvm_set_cr0);
470 472
471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 473void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
472{ 474{
473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 475 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
474} 476}
475EXPORT_SYMBOL_GPL(kvm_lmsw); 477EXPORT_SYMBOL_GPL(kvm_lmsw);
476 478
477void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 479int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
478{ 480{
479 unsigned long old_cr4 = kvm_read_cr4(vcpu); 481 u64 xcr0;
480 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
481 482
482 if (cr4 & CR4_RESERVED_BITS) { 483 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
484 if (index != XCR_XFEATURE_ENABLED_MASK)
485 return 1;
486 xcr0 = xcr;
487 if (kvm_x86_ops->get_cpl(vcpu) != 0)
488 return 1;
489 if (!(xcr0 & XSTATE_FP))
490 return 1;
491 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
492 return 1;
493 if (xcr0 & ~host_xcr0)
494 return 1;
495 vcpu->arch.xcr0 = xcr0;
496 vcpu->guest_xcr0_loaded = 0;
497 return 0;
498}
499
500int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
501{
502 if (__kvm_set_xcr(vcpu, index, xcr)) {
483 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
504 return 1;
505 }
506 return 0;
507}
508EXPORT_SYMBOL_GPL(kvm_set_xcr);
509
510static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
511{
512 struct kvm_cpuid_entry2 *best;
513
514 best = kvm_find_cpuid_entry(vcpu, 1, 0);
515 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
516}
517
518static void update_cpuid(struct kvm_vcpu *vcpu)
519{
520 struct kvm_cpuid_entry2 *best;
521
522 best = kvm_find_cpuid_entry(vcpu, 1, 0);
523 if (!best)
484 return; 524 return;
525
526 /* Update OSXSAVE bit */
527 if (cpu_has_xsave && best->function == 0x1) {
528 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
529 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
530 best->ecx |= bit(X86_FEATURE_OSXSAVE);
485 } 531 }
532}
533
534int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
535{
536 unsigned long old_cr4 = kvm_read_cr4(vcpu);
537 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
538
539 if (cr4 & CR4_RESERVED_BITS)
540 return 1;
541
542 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
543 return 1;
486 544
487 if (is_long_mode(vcpu)) { 545 if (is_long_mode(vcpu)) {
488 if (!(cr4 & X86_CR4_PAE)) { 546 if (!(cr4 & X86_CR4_PAE))
489 kvm_inject_gp(vcpu, 0); 547 return 1;
490 return;
491 }
492 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
493 && ((cr4 ^ old_cr4) & pdptr_bits) 549 && ((cr4 ^ old_cr4) & pdptr_bits)
494 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 550 && !load_pdptrs(vcpu, vcpu->arch.cr3))
495 kvm_inject_gp(vcpu, 0); 551 return 1;
496 return; 552
497 } 553 if (cr4 & X86_CR4_VMXE)
554 return 1;
498 555
499 if (cr4 & X86_CR4_VMXE) {
500 kvm_inject_gp(vcpu, 0);
501 return;
502 }
503 kvm_x86_ops->set_cr4(vcpu, cr4); 556 kvm_x86_ops->set_cr4(vcpu, cr4);
504 vcpu->arch.cr4 = cr4; 557
505 kvm_mmu_reset_context(vcpu); 558 if ((cr4 ^ old_cr4) & pdptr_bits)
559 kvm_mmu_reset_context(vcpu);
560
561 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
562 update_cpuid(vcpu);
563
564 return 0;
506} 565}
507EXPORT_SYMBOL_GPL(kvm_set_cr4); 566EXPORT_SYMBOL_GPL(kvm_set_cr4);
508 567
509void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 568int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
510{ 569{
511 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
512 kvm_mmu_sync_roots(vcpu); 571 kvm_mmu_sync_roots(vcpu);
513 kvm_mmu_flush_tlb(vcpu); 572 kvm_mmu_flush_tlb(vcpu);
514 return; 573 return 0;
515 } 574 }
516 575
517 if (is_long_mode(vcpu)) { 576 if (is_long_mode(vcpu)) {
518 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 577 if (cr3 & CR3_L_MODE_RESERVED_BITS)
519 kvm_inject_gp(vcpu, 0); 578 return 1;
520 return;
521 }
522 } else { 579 } else {
523 if (is_pae(vcpu)) { 580 if (is_pae(vcpu)) {
524 if (cr3 & CR3_PAE_RESERVED_BITS) { 581 if (cr3 & CR3_PAE_RESERVED_BITS)
525 kvm_inject_gp(vcpu, 0); 582 return 1;
526 return; 583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
527 } 584 return 1;
528 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
529 kvm_inject_gp(vcpu, 0);
530 return;
531 }
532 } 585 }
533 /* 586 /*
534 * We don't check reserved bits in nonpae mode, because 587 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
546 * to debug) behavior on the guest side. 599 * to debug) behavior on the guest side.
547 */ 600 */
548 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
549 kvm_inject_gp(vcpu, 0); 602 return 1;
550 else { 603 vcpu->arch.cr3 = cr3;
551 vcpu->arch.cr3 = cr3; 604 vcpu->arch.mmu.new_cr3(vcpu);
552 vcpu->arch.mmu.new_cr3(vcpu); 605 return 0;
553 }
554} 606}
555EXPORT_SYMBOL_GPL(kvm_set_cr3); 607EXPORT_SYMBOL_GPL(kvm_set_cr3);
556 608
557void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 609int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
558{ 610{
559 if (cr8 & CR8_RESERVED_BITS) { 611 if (cr8 & CR8_RESERVED_BITS)
560 kvm_inject_gp(vcpu, 0); 612 return 1;
561 return;
562 }
563 if (irqchip_in_kernel(vcpu->kvm)) 613 if (irqchip_in_kernel(vcpu->kvm))
564 kvm_lapic_set_tpr(vcpu, cr8); 614 kvm_lapic_set_tpr(vcpu, cr8);
565 else 615 else
566 vcpu->arch.cr8 = cr8; 616 vcpu->arch.cr8 = cr8;
617 return 0;
618}
619
620void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
621{
622 if (__kvm_set_cr8(vcpu, cr8))
623 kvm_inject_gp(vcpu, 0);
567} 624}
568EXPORT_SYMBOL_GPL(kvm_set_cr8); 625EXPORT_SYMBOL_GPL(kvm_set_cr8);
569 626
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
576} 633}
577EXPORT_SYMBOL_GPL(kvm_get_cr8); 634EXPORT_SYMBOL_GPL(kvm_get_cr8);
578 635
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 636static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{ 637{
581 switch (dr) { 638 switch (dr) {
582 case 0 ... 3: 639 case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
585 vcpu->arch.eff_db[dr] = val; 642 vcpu->arch.eff_db[dr] = val;
586 break; 643 break;
587 case 4: 644 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 645 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
589 kvm_queue_exception(vcpu, UD_VECTOR); 646 return 1; /* #UD */
590 return 1;
591 }
592 /* fall through */ 647 /* fall through */
593 case 6: 648 case 6:
594 if (val & 0xffffffff00000000ULL) { 649 if (val & 0xffffffff00000000ULL)
595 kvm_inject_gp(vcpu, 0); 650 return -1; /* #GP */
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 651 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break; 652 break;
600 case 5: 653 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 654 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
602 kvm_queue_exception(vcpu, UD_VECTOR); 655 return 1; /* #UD */
603 return 1;
604 }
605 /* fall through */ 656 /* fall through */
606 default: /* 7 */ 657 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) { 658 if (val & 0xffffffff00000000ULL)
608 kvm_inject_gp(vcpu, 0); 659 return -1; /* #GP */
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 660 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 661 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); 662 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
618 667
619 return 0; 668 return 0;
620} 669}
670
671int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
672{
673 int res;
674
675 res = __kvm_set_dr(vcpu, dr, val);
676 if (res > 0)
677 kvm_queue_exception(vcpu, UD_VECTOR);
678 else if (res < 0)
679 kvm_inject_gp(vcpu, 0);
680
681 return res;
682}
621EXPORT_SYMBOL_GPL(kvm_set_dr); 683EXPORT_SYMBOL_GPL(kvm_set_dr);
622 684
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 685static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{ 686{
625 switch (dr) { 687 switch (dr) {
626 case 0 ... 3: 688 case 0 ... 3:
627 *val = vcpu->arch.db[dr]; 689 *val = vcpu->arch.db[dr];
628 break; 690 break;
629 case 4: 691 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 692 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1; 693 return 1;
633 }
634 /* fall through */ 694 /* fall through */
635 case 6: 695 case 6:
636 *val = vcpu->arch.dr6; 696 *val = vcpu->arch.dr6;
637 break; 697 break;
638 case 5: 698 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 699 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1; 700 return 1;
642 }
643 /* fall through */ 701 /* fall through */
644 default: /* 7 */ 702 default: /* 7 */
645 *val = vcpu->arch.dr7; 703 *val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
648 706
649 return 0; 707 return 0;
650} 708}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652 709
653static inline u32 bit(int bitno) 710int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
654{ 711{
655 return 1 << (bitno & 31); 712 if (_kvm_get_dr(vcpu, dr, val)) {
713 kvm_queue_exception(vcpu, UD_VECTOR);
714 return 1;
715 }
716 return 0;
656} 717}
718EXPORT_SYMBOL_GPL(kvm_get_dr);
657 719
658/* 720/*
659 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 721 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,7 +733,7 @@ static u32 msrs_to_save[] = {
671 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 733 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
672 HV_X64_MSR_APIC_ASSIST_PAGE, 734 HV_X64_MSR_APIC_ASSIST_PAGE,
673 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 735 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
674 MSR_K6_STAR, 736 MSR_STAR,
675#ifdef CONFIG_X86_64 737#ifdef CONFIG_X86_64
676 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
677#endif 739#endif
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
682 744
683static u32 emulated_msrs[] = { 745static u32 emulated_msrs[] = {
684 MSR_IA32_MISC_ENABLE, 746 MSR_IA32_MISC_ENABLE,
747 MSR_IA32_MCG_STATUS,
748 MSR_IA32_MCG_CTL,
685}; 749};
686 750
687static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 751static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
688{ 752{
753 u64 old_efer = vcpu->arch.efer;
754
689 if (efer & efer_reserved_bits) 755 if (efer & efer_reserved_bits)
690 return 1; 756 return 1;
691 757
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
714 780
715 kvm_x86_ops->set_efer(vcpu, efer); 781 kvm_x86_ops->set_efer(vcpu, efer);
716 782
717 vcpu->arch.efer = efer;
718
719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
720 kvm_mmu_reset_context(vcpu); 784 kvm_mmu_reset_context(vcpu);
721 785
786 /* Update reserved bits */
787 if ((efer ^ old_efer) & EFER_NX)
788 kvm_mmu_reset_context(vcpu);
789
722 return 0; 790 return 0;
723} 791}
724 792
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
882 950
883 if (!vcpu->time_page) 951 if (!vcpu->time_page)
884 return 0; 952 return 0;
885 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
886 return 1; 954 return 1;
887} 955}
888 956
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1524{ 1592{
1525 int i, idx; 1593 int i, idx;
1526 1594
1527 vcpu_load(vcpu);
1528
1529 idx = srcu_read_lock(&vcpu->kvm->srcu); 1595 idx = srcu_read_lock(&vcpu->kvm->srcu);
1530 for (i = 0; i < msrs->nmsrs; ++i) 1596 for (i = 0; i < msrs->nmsrs; ++i)
1531 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1597 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1532 break; 1598 break;
1533 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1599 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1534 1600
1535 vcpu_put(vcpu);
1536
1537 return i; 1601 return i;
1538} 1602}
1539 1603
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1618 case KVM_CAP_PCI_SEGMENT: 1682 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS: 1683 case KVM_CAP_DEBUGREGS:
1620 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1684 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1685 case KVM_CAP_XSAVE:
1621 r = 1; 1686 r = 1;
1622 break; 1687 break;
1623 case KVM_CAP_COALESCED_MMIO: 1688 case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1641 case KVM_CAP_MCE: 1706 case KVM_CAP_MCE:
1642 r = KVM_MAX_MCE_BANKS; 1707 r = KVM_MAX_MCE_BANKS;
1643 break; 1708 break;
1709 case KVM_CAP_XCRS:
1710 r = cpu_has_xsave;
1711 break;
1644 default: 1712 default:
1645 r = 0; 1713 r = 0;
1646 break; 1714 break;
@@ -1717,8 +1785,28 @@ out:
1717 return r; 1785 return r;
1718} 1786}
1719 1787
1788static void wbinvd_ipi(void *garbage)
1789{
1790 wbinvd();
1791}
1792
1793static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
1794{
1795 return vcpu->kvm->arch.iommu_domain &&
1796 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
1797}
1798
1720void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1799void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1721{ 1800{
1801 /* Address WBINVD may be executed by guest */
1802 if (need_emulate_wbinvd(vcpu)) {
1803 if (kvm_x86_ops->has_wbinvd_exit())
1804 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
1805 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
1806 smp_call_function_single(vcpu->cpu,
1807 wbinvd_ipi, NULL, 1);
1808 }
1809
1722 kvm_x86_ops->vcpu_load(vcpu, cpu); 1810 kvm_x86_ops->vcpu_load(vcpu, cpu);
1723 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1724 unsigned long khz = cpufreq_quick_get(cpu); 1812 unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1731 1819
1732void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1733{ 1821{
1734 kvm_put_guest_fpu(vcpu);
1735 kvm_x86_ops->vcpu_put(vcpu); 1822 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu);
1736} 1824}
1737 1825
1738static int is_efer_nx(void) 1826static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1781 if (copy_from_user(cpuid_entries, entries, 1869 if (copy_from_user(cpuid_entries, entries,
1782 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1870 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1783 goto out_free; 1871 goto out_free;
1784 vcpu_load(vcpu);
1785 for (i = 0; i < cpuid->nent; i++) { 1872 for (i = 0; i < cpuid->nent; i++) {
1786 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1873 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1787 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1874 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1799 r = 0; 1886 r = 0;
1800 kvm_apic_set_version(vcpu); 1887 kvm_apic_set_version(vcpu);
1801 kvm_x86_ops->cpuid_update(vcpu); 1888 kvm_x86_ops->cpuid_update(vcpu);
1802 vcpu_put(vcpu); 1889 update_cpuid(vcpu);
1803 1890
1804out_free: 1891out_free:
1805 vfree(cpuid_entries); 1892 vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1820 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1907 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1821 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1908 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1822 goto out; 1909 goto out;
1823 vcpu_load(vcpu);
1824 vcpu->arch.cpuid_nent = cpuid->nent; 1910 vcpu->arch.cpuid_nent = cpuid->nent;
1825 kvm_apic_set_version(vcpu); 1911 kvm_apic_set_version(vcpu);
1826 kvm_x86_ops->cpuid_update(vcpu); 1912 kvm_x86_ops->cpuid_update(vcpu);
1827 vcpu_put(vcpu); 1913 update_cpuid(vcpu);
1828 return 0; 1914 return 0;
1829 1915
1830out: 1916out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1837{ 1923{
1838 int r; 1924 int r;
1839 1925
1840 vcpu_load(vcpu);
1841 r = -E2BIG; 1926 r = -E2BIG;
1842 if (cpuid->nent < vcpu->arch.cpuid_nent) 1927 if (cpuid->nent < vcpu->arch.cpuid_nent)
1843 goto out; 1928 goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1849 1934
1850out: 1935out:
1851 cpuid->nent = vcpu->arch.cpuid_nent; 1936 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1853 return r; 1937 return r;
1854} 1938}
1855 1939
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1901 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1985 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1902 /* cpuid 1.ecx */ 1986 /* cpuid 1.ecx */
1903 const u32 kvm_supported_word4_x86_features = 1987 const u32 kvm_supported_word4_x86_features =
1904 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1988 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
1905 0 /* DS-CPL, VMX, SMX, EST */ | 1989 0 /* DS-CPL, VMX, SMX, EST */ |
1906 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1990 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1907 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1908 0 /* Reserved, DCA */ | F(XMM4_1) | 1992 0 /* Reserved, DCA */ | F(XMM4_1) |
1909 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1910 0 /* Reserved, XSAVE, OSXSAVE */; 1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
1911 /* cpuid 0x80000001.ecx */ 1995 /* cpuid 0x80000001.ecx */
1912 const u32 kvm_supported_word6_x86_features = 1996 const u32 kvm_supported_word6_x86_features =
1913 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1922 2006
1923 switch (function) { 2007 switch (function) {
1924 case 0: 2008 case 0:
1925 entry->eax = min(entry->eax, (u32)0xb); 2009 entry->eax = min(entry->eax, (u32)0xd);
1926 break; 2010 break;
1927 case 1: 2011 case 1:
1928 entry->edx &= kvm_supported_word0_x86_features; 2012 entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1980 } 2064 }
1981 break; 2065 break;
1982 } 2066 }
2067 case 0xd: {
2068 int i;
2069
2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2071 for (i = 1; *nent < maxnent; ++i) {
2072 if (entry[i - 1].eax == 0 && i != 2)
2073 break;
2074 do_cpuid_1_ent(&entry[i], function, i);
2075 entry[i].flags |=
2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2077 ++*nent;
2078 }
2079 break;
2080 }
1983 case KVM_CPUID_SIGNATURE: { 2081 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0"; 2082 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature; 2083 u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2179,7 @@ out:
2081static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2179static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2082 struct kvm_lapic_state *s) 2180 struct kvm_lapic_state *s)
2083{ 2181{
2084 vcpu_load(vcpu);
2085 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2182 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2086 vcpu_put(vcpu);
2087 2183
2088 return 0; 2184 return 0;
2089} 2185}
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2091static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2187static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2092 struct kvm_lapic_state *s) 2188 struct kvm_lapic_state *s)
2093{ 2189{
2094 vcpu_load(vcpu);
2095 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2190 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2096 kvm_apic_post_state_restore(vcpu); 2191 kvm_apic_post_state_restore(vcpu);
2097 update_cr8_intercept(vcpu); 2192 update_cr8_intercept(vcpu);
2098 vcpu_put(vcpu);
2099 2193
2100 return 0; 2194 return 0;
2101} 2195}
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2107 return -EINVAL; 2201 return -EINVAL;
2108 if (irqchip_in_kernel(vcpu->kvm)) 2202 if (irqchip_in_kernel(vcpu->kvm))
2109 return -ENXIO; 2203 return -ENXIO;
2110 vcpu_load(vcpu);
2111 2204
2112 kvm_queue_interrupt(vcpu, irq->irq, false); 2205 kvm_queue_interrupt(vcpu, irq->irq, false);
2113 2206
2114 vcpu_put(vcpu);
2115
2116 return 0; 2207 return 0;
2117} 2208}
2118 2209
2119static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2210static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2120{ 2211{
2121 vcpu_load(vcpu);
2122 kvm_inject_nmi(vcpu); 2212 kvm_inject_nmi(vcpu);
2123 vcpu_put(vcpu);
2124 2213
2125 return 0; 2214 return 0;
2126} 2215}
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2140 int r; 2229 int r;
2141 unsigned bank_num = mcg_cap & 0xff, bank; 2230 unsigned bank_num = mcg_cap & 0xff, bank;
2142 2231
2143 vcpu_load(vcpu);
2144 r = -EINVAL; 2232 r = -EINVAL;
2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2233 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2146 goto out; 2234 goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2155 for (bank = 0; bank < bank_num; bank++) 2243 for (bank = 0; bank < bank_num; bank++)
2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2244 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2157out: 2245out:
2158 vcpu_put(vcpu);
2159 return r; 2246 return r;
2160} 2247}
2161 2248
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2188 printk(KERN_DEBUG "kvm: set_mce: " 2275 printk(KERN_DEBUG "kvm: set_mce: "
2189 "injects mce exception while " 2276 "injects mce exception while "
2190 "previous one is in progress!\n"); 2277 "previous one is in progress!\n");
2191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2192 return 0; 2279 return 0;
2193 } 2280 }
2194 if (banks[1] & MCI_STATUS_VAL) 2281 if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2213static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2300static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2214 struct kvm_vcpu_events *events) 2301 struct kvm_vcpu_events *events)
2215{ 2302{
2216 vcpu_load(vcpu);
2217
2218 events->exception.injected = 2303 events->exception.injected =
2219 vcpu->arch.exception.pending && 2304 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2305 !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW); 2326 | KVM_VCPUEVENT_VALID_SHADOW);
2242
2243 vcpu_put(vcpu);
2244} 2327}
2245 2328
2246static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2329static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2251 | KVM_VCPUEVENT_VALID_SHADOW)) 2334 | KVM_VCPUEVENT_VALID_SHADOW))
2252 return -EINVAL; 2335 return -EINVAL;
2253 2336
2254 vcpu_load(vcpu);
2255
2256 vcpu->arch.exception.pending = events->exception.injected; 2337 vcpu->arch.exception.pending = events->exception.injected;
2257 vcpu->arch.exception.nr = events->exception.nr; 2338 vcpu->arch.exception.nr = events->exception.nr;
2258 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2339 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2275 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2276 vcpu->arch.sipi_vector = events->sipi_vector; 2357 vcpu->arch.sipi_vector = events->sipi_vector;
2277 2358
2278 vcpu_put(vcpu);
2279
2280 return 0; 2359 return 0;
2281} 2360}
2282 2361
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2362static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs) 2363 struct kvm_debugregs *dbgregs)
2285{ 2364{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2365 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6; 2366 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7; 2367 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0; 2368 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294} 2369}
2295 2370
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2371static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2299 if (dbgregs->flags) 2374 if (dbgregs->flags)
2300 return -EINVAL; 2375 return -EINVAL;
2301 2376
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2377 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6; 2378 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7; 2379 vcpu->arch.dr7 = dbgregs->dr7;
2307 2380
2308 vcpu_put(vcpu); 2381 return 0;
2382}
2383
2384static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2385 struct kvm_xsave *guest_xsave)
2386{
2387 if (cpu_has_xsave)
2388 memcpy(guest_xsave->region,
2389 &vcpu->arch.guest_fpu.state->xsave,
2390 sizeof(struct xsave_struct));
2391 else {
2392 memcpy(guest_xsave->region,
2393 &vcpu->arch.guest_fpu.state->fxsave,
2394 sizeof(struct i387_fxsave_struct));
2395 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2396 XSTATE_FPSSE;
2397 }
2398}
2399
2400static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2401 struct kvm_xsave *guest_xsave)
2402{
2403 u64 xstate_bv =
2404 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2309 2405
2406 if (cpu_has_xsave)
2407 memcpy(&vcpu->arch.guest_fpu.state->xsave,
2408 guest_xsave->region, sizeof(struct xsave_struct));
2409 else {
2410 if (xstate_bv & ~XSTATE_FPSSE)
2411 return -EINVAL;
2412 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2413 guest_xsave->region, sizeof(struct i387_fxsave_struct));
2414 }
2310 return 0; 2415 return 0;
2311} 2416}
2312 2417
2418static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2419 struct kvm_xcrs *guest_xcrs)
2420{
2421 if (!cpu_has_xsave) {
2422 guest_xcrs->nr_xcrs = 0;
2423 return;
2424 }
2425
2426 guest_xcrs->nr_xcrs = 1;
2427 guest_xcrs->flags = 0;
2428 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2429 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2430}
2431
2432static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2433 struct kvm_xcrs *guest_xcrs)
2434{
2435 int i, r = 0;
2436
2437 if (!cpu_has_xsave)
2438 return -EINVAL;
2439
2440 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2441 return -EINVAL;
2442
2443 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2444 /* Only support XCR0 currently */
2445 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2446 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2447 guest_xcrs->xcrs[0].value);
2448 break;
2449 }
2450 if (r)
2451 r = -EINVAL;
2452 return r;
2453}
2454
2313long kvm_arch_vcpu_ioctl(struct file *filp, 2455long kvm_arch_vcpu_ioctl(struct file *filp,
2314 unsigned int ioctl, unsigned long arg) 2456 unsigned int ioctl, unsigned long arg)
2315{ 2457{
2316 struct kvm_vcpu *vcpu = filp->private_data; 2458 struct kvm_vcpu *vcpu = filp->private_data;
2317 void __user *argp = (void __user *)arg; 2459 void __user *argp = (void __user *)arg;
2318 int r; 2460 int r;
2319 struct kvm_lapic_state *lapic = NULL; 2461 union {
2462 struct kvm_lapic_state *lapic;
2463 struct kvm_xsave *xsave;
2464 struct kvm_xcrs *xcrs;
2465 void *buffer;
2466 } u;
2320 2467
2468 u.buffer = NULL;
2321 switch (ioctl) { 2469 switch (ioctl) {
2322 case KVM_GET_LAPIC: { 2470 case KVM_GET_LAPIC: {
2323 r = -EINVAL; 2471 r = -EINVAL;
2324 if (!vcpu->arch.apic) 2472 if (!vcpu->arch.apic)
2325 goto out; 2473 goto out;
2326 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2474 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2327 2475
2328 r = -ENOMEM; 2476 r = -ENOMEM;
2329 if (!lapic) 2477 if (!u.lapic)
2330 goto out; 2478 goto out;
2331 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 2479 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
2332 if (r) 2480 if (r)
2333 goto out; 2481 goto out;
2334 r = -EFAULT; 2482 r = -EFAULT;
2335 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 2483 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
2336 goto out; 2484 goto out;
2337 r = 0; 2485 r = 0;
2338 break; 2486 break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2341 r = -EINVAL; 2489 r = -EINVAL;
2342 if (!vcpu->arch.apic) 2490 if (!vcpu->arch.apic)
2343 goto out; 2491 goto out;
2344 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2492 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2345 r = -ENOMEM; 2493 r = -ENOMEM;
2346 if (!lapic) 2494 if (!u.lapic)
2347 goto out; 2495 goto out;
2348 r = -EFAULT; 2496 r = -EFAULT;
2349 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 2497 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
2350 goto out; 2498 goto out;
2351 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 2499 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2352 if (r) 2500 if (r)
2353 goto out; 2501 goto out;
2354 r = 0; 2502 r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2464 r = -EFAULT; 2612 r = -EFAULT;
2465 if (copy_from_user(&mce, argp, sizeof mce)) 2613 if (copy_from_user(&mce, argp, sizeof mce))
2466 goto out; 2614 goto out;
2467 vcpu_load(vcpu);
2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2615 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2470 break; 2616 break;
2471 } 2617 }
2472 case KVM_GET_VCPU_EVENTS: { 2618 case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2659 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break; 2660 break;
2515 } 2661 }
2662 case KVM_GET_XSAVE: {
2663 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2664 r = -ENOMEM;
2665 if (!u.xsave)
2666 break;
2667
2668 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
2669
2670 r = -EFAULT;
2671 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
2672 break;
2673 r = 0;
2674 break;
2675 }
2676 case KVM_SET_XSAVE: {
2677 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2678 r = -ENOMEM;
2679 if (!u.xsave)
2680 break;
2681
2682 r = -EFAULT;
2683 if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
2684 break;
2685
2686 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2687 break;
2688 }
2689 case KVM_GET_XCRS: {
2690 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2691 r = -ENOMEM;
2692 if (!u.xcrs)
2693 break;
2694
2695 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
2696
2697 r = -EFAULT;
2698 if (copy_to_user(argp, u.xcrs,
2699 sizeof(struct kvm_xcrs)))
2700 break;
2701 r = 0;
2702 break;
2703 }
2704 case KVM_SET_XCRS: {
2705 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2706 r = -ENOMEM;
2707 if (!u.xcrs)
2708 break;
2709
2710 r = -EFAULT;
2711 if (copy_from_user(u.xcrs, argp,
2712 sizeof(struct kvm_xcrs)))
2713 break;
2714
2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2716 break;
2717 }
2516 default: 2718 default:
2517 r = -EINVAL; 2719 r = -EINVAL;
2518 } 2720 }
2519out: 2721out:
2520 kfree(lapic); 2722 kfree(u.buffer);
2521 return r; 2723 return r;
2522} 2724}
2523 2725
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2560 return kvm->arch.n_alloc_mmu_pages; 2762 return kvm->arch.n_alloc_mmu_pages;
2561} 2763}
2562 2764
2563gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2564{
2565 int i;
2566 struct kvm_mem_alias *alias;
2567 struct kvm_mem_aliases *aliases;
2568
2569 aliases = kvm_aliases(kvm);
2570
2571 for (i = 0; i < aliases->naliases; ++i) {
2572 alias = &aliases->aliases[i];
2573 if (alias->flags & KVM_ALIAS_INVALID)
2574 continue;
2575 if (gfn >= alias->base_gfn
2576 && gfn < alias->base_gfn + alias->npages)
2577 return alias->target_gfn + gfn - alias->base_gfn;
2578 }
2579 return gfn;
2580}
2581
2582gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2583{
2584 int i;
2585 struct kvm_mem_alias *alias;
2586 struct kvm_mem_aliases *aliases;
2587
2588 aliases = kvm_aliases(kvm);
2589
2590 for (i = 0; i < aliases->naliases; ++i) {
2591 alias = &aliases->aliases[i];
2592 if (gfn >= alias->base_gfn
2593 && gfn < alias->base_gfn + alias->npages)
2594 return alias->target_gfn + gfn - alias->base_gfn;
2595 }
2596 return gfn;
2597}
2598
2599/*
2600 * Set a new alias region. Aliases map a portion of physical memory into
2601 * another portion. This is useful for memory windows, for example the PC
2602 * VGA region.
2603 */
2604static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2605 struct kvm_memory_alias *alias)
2606{
2607 int r, n;
2608 struct kvm_mem_alias *p;
2609 struct kvm_mem_aliases *aliases, *old_aliases;
2610
2611 r = -EINVAL;
2612 /* General sanity checks */
2613 if (alias->memory_size & (PAGE_SIZE - 1))
2614 goto out;
2615 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2616 goto out;
2617 if (alias->slot >= KVM_ALIAS_SLOTS)
2618 goto out;
2619 if (alias->guest_phys_addr + alias->memory_size
2620 < alias->guest_phys_addr)
2621 goto out;
2622 if (alias->target_phys_addr + alias->memory_size
2623 < alias->target_phys_addr)
2624 goto out;
2625
2626 r = -ENOMEM;
2627 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2628 if (!aliases)
2629 goto out;
2630
2631 mutex_lock(&kvm->slots_lock);
2632
2633 /* invalidate any gfn reference in case of deletion/shrinking */
2634 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2635 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2636 old_aliases = kvm->arch.aliases;
2637 rcu_assign_pointer(kvm->arch.aliases, aliases);
2638 synchronize_srcu_expedited(&kvm->srcu);
2639 kvm_mmu_zap_all(kvm);
2640 kfree(old_aliases);
2641
2642 r = -ENOMEM;
2643 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2644 if (!aliases)
2645 goto out_unlock;
2646
2647 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2648
2649 p = &aliases->aliases[alias->slot];
2650 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2651 p->npages = alias->memory_size >> PAGE_SHIFT;
2652 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2653 p->flags &= ~(KVM_ALIAS_INVALID);
2654
2655 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2656 if (aliases->aliases[n - 1].npages)
2657 break;
2658 aliases->naliases = n;
2659
2660 old_aliases = kvm->arch.aliases;
2661 rcu_assign_pointer(kvm->arch.aliases, aliases);
2662 synchronize_srcu_expedited(&kvm->srcu);
2663 kfree(old_aliases);
2664 r = 0;
2665
2666out_unlock:
2667 mutex_unlock(&kvm->slots_lock);
2668out:
2669 return r;
2670}
2671
2672static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2673{ 2766{
2674 int r; 2767 int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2797 struct kvm_memory_slot *memslot; 2890 struct kvm_memory_slot *memslot;
2798 unsigned long n; 2891 unsigned long n;
2799 unsigned long is_dirty = 0; 2892 unsigned long is_dirty = 0;
2800 unsigned long *dirty_bitmap = NULL;
2801 2893
2802 mutex_lock(&kvm->slots_lock); 2894 mutex_lock(&kvm->slots_lock);
2803 2895
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2812 2904
2813 n = kvm_dirty_bitmap_bytes(memslot); 2905 n = kvm_dirty_bitmap_bytes(memslot);
2814 2906
2815 r = -ENOMEM;
2816 dirty_bitmap = vmalloc(n);
2817 if (!dirty_bitmap)
2818 goto out;
2819 memset(dirty_bitmap, 0, n);
2820
2821 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 2907 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2822 is_dirty = memslot->dirty_bitmap[i]; 2908 is_dirty = memslot->dirty_bitmap[i];
2823 2909
2824 /* If nothing is dirty, don't bother messing with page tables. */ 2910 /* If nothing is dirty, don't bother messing with page tables. */
2825 if (is_dirty) { 2911 if (is_dirty) {
2826 struct kvm_memslots *slots, *old_slots; 2912 struct kvm_memslots *slots, *old_slots;
2913 unsigned long *dirty_bitmap;
2827 2914
2828 spin_lock(&kvm->mmu_lock); 2915 spin_lock(&kvm->mmu_lock);
2829 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2916 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2830 spin_unlock(&kvm->mmu_lock); 2917 spin_unlock(&kvm->mmu_lock);
2831 2918
2832 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2919 r = -ENOMEM;
2833 if (!slots) 2920 dirty_bitmap = vmalloc(n);
2834 goto out_free; 2921 if (!dirty_bitmap)
2922 goto out;
2923 memset(dirty_bitmap, 0, n);
2835 2924
2925 r = -ENOMEM;
2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2927 if (!slots) {
2928 vfree(dirty_bitmap);
2929 goto out;
2930 }
2836 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2837 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2838 2933
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2841 synchronize_srcu_expedited(&kvm->srcu); 2936 synchronize_srcu_expedited(&kvm->srcu);
2842 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2843 kfree(old_slots); 2938 kfree(old_slots);
2939
2940 r = -EFAULT;
2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
2942 vfree(dirty_bitmap);
2943 goto out;
2944 }
2945 vfree(dirty_bitmap);
2946 } else {
2947 r = -EFAULT;
2948 if (clear_user(log->dirty_bitmap, n))
2949 goto out;
2844 } 2950 }
2845 2951
2846 r = 0; 2952 r = 0;
2847 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2848 r = -EFAULT;
2849out_free:
2850 vfree(dirty_bitmap);
2851out: 2953out:
2852 mutex_unlock(&kvm->slots_lock); 2954 mutex_unlock(&kvm->slots_lock);
2853 return r; 2955 return r;
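The rework above removes the unconditional vmalloc() of a scratch bitmap. In the resulting flow, the slot's live bitmap is replaced only when at least one bit is set: a fresh zeroed bitmap is installed through a copied kvm_memslots that is published with rcu_assign_pointer()/synchronize_srcu_expedited(), and the retired bitmap, which holds the collected dirty bits, is copied to userspace and then vfree()d. If nothing is dirty, clear_user() reports an all-zero log without allocating anything. A condensed restatement of that flow (kernel-style fragment, not compilable on its own; locking and most error handling elided, and is_any_bit_set() is an invented stand-in for the scan loop above):

	n = kvm_dirty_bitmap_bytes(memslot);

	if (is_any_bit_set(memslot->dirty_bitmap, n)) {
		unsigned long *fresh = vmalloc(n);	/* becomes the slot's new, empty bitmap */
		struct kvm_memslots *slots = kzalloc(sizeof(*slots), GFP_KERNEL);

		memset(fresh, 0, n);
		memcpy(slots, kvm->memslots, sizeof(*slots));
		slots->memslots[log->slot].dirty_bitmap = fresh;

		old_slots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);

		/* the retired bitmap holds the dirty bits collected so far */
		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
		kfree(old_slots);

		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
			r = -EFAULT;
		vfree(dirty_bitmap);
	} else {
		/* nothing dirty: report an all-zero log, no allocation needed */
		if (clear_user(log->dirty_bitmap, n))
			r = -EFAULT;
	}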
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2867 union { 2969 union {
2868 struct kvm_pit_state ps; 2970 struct kvm_pit_state ps;
2869 struct kvm_pit_state2 ps2; 2971 struct kvm_pit_state2 ps2;
2870 struct kvm_memory_alias alias;
2871 struct kvm_pit_config pit_config; 2972 struct kvm_pit_config pit_config;
2872 } u; 2973 } u;
2873 2974
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2888 goto out; 2989 goto out;
2889 break; 2990 break;
2890 } 2991 }
2891 case KVM_SET_MEMORY_REGION: {
2892 struct kvm_memory_region kvm_mem;
2893 struct kvm_userspace_memory_region kvm_userspace_mem;
2894
2895 r = -EFAULT;
2896 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2897 goto out;
2898 kvm_userspace_mem.slot = kvm_mem.slot;
2899 kvm_userspace_mem.flags = kvm_mem.flags;
2900 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2901 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2902 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2903 if (r)
2904 goto out;
2905 break;
2906 }
2907 case KVM_SET_NR_MMU_PAGES: 2992 case KVM_SET_NR_MMU_PAGES:
2908 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2993 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2909 if (r) 2994 if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2912 case KVM_GET_NR_MMU_PAGES: 2997 case KVM_GET_NR_MMU_PAGES:
2913 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2998 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2914 break; 2999 break;
2915 case KVM_SET_MEMORY_ALIAS:
2916 r = -EFAULT;
2917 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2918 goto out;
2919 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2920 if (r)
2921 goto out;
2922 break;
2923 case KVM_CREATE_IRQCHIP: { 3000 case KVM_CREATE_IRQCHIP: {
2924 struct kvm_pic *vpic; 3001 struct kvm_pic *vpic;
2925 3002
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3259 } 3336 }
3260 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3261 if (ret < 0) { 3338 if (ret < 0) {
3262 r = X86EMUL_UNHANDLEABLE; 3339 r = X86EMUL_IO_NEEDED;
3263 goto out; 3340 goto out;
3264 } 3341 }
3265 3342
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3315 } 3392 }
3316 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3317 if (ret < 0) { 3394 if (ret < 0) {
3318 r = X86EMUL_UNHANDLEABLE; 3395 r = X86EMUL_IO_NEEDED;
3319 goto out; 3396 goto out;
3320 } 3397 }
3321 3398
@@ -3330,10 +3407,10 @@ out:
3330static int emulator_read_emulated(unsigned long addr, 3407static int emulator_read_emulated(unsigned long addr,
3331 void *val, 3408 void *val,
3332 unsigned int bytes, 3409 unsigned int bytes,
3410 unsigned int *error_code,
3333 struct kvm_vcpu *vcpu) 3411 struct kvm_vcpu *vcpu)
3334{ 3412{
3335 gpa_t gpa; 3413 gpa_t gpa;
3336 u32 error_code;
3337 3414
3338 if (vcpu->mmio_read_completed) { 3415 if (vcpu->mmio_read_completed) {
3339 memcpy(val, vcpu->mmio_data, bytes); 3416 memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
3343 return X86EMUL_CONTINUE; 3420 return X86EMUL_CONTINUE;
3344 } 3421 }
3345 3422
3346 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); 3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
3347 3424
3348 if (gpa == UNMAPPED_GVA) { 3425 if (gpa == UNMAPPED_GVA)
3349 kvm_inject_page_fault(vcpu, addr, error_code);
3350 return X86EMUL_PROPAGATE_FAULT; 3426 return X86EMUL_PROPAGATE_FAULT;
3351 }
3352 3427
3353 /* For APIC access vmexit */ 3428 /* For APIC access vmexit */
3354 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
3370 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3371 3446
3372 vcpu->mmio_needed = 1; 3447 vcpu->mmio_needed = 1;
3373 vcpu->mmio_phys_addr = gpa; 3448 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3374 vcpu->mmio_size = bytes; 3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3375 vcpu->mmio_is_write = 0; 3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3376 3452
3377 return X86EMUL_UNHANDLEABLE; 3453 return X86EMUL_IO_NEEDED;
3378} 3454}
3379 3455
3380int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3456int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3392static int emulator_write_emulated_onepage(unsigned long addr, 3468static int emulator_write_emulated_onepage(unsigned long addr,
3393 const void *val, 3469 const void *val,
3394 unsigned int bytes, 3470 unsigned int bytes,
3471 unsigned int *error_code,
3395 struct kvm_vcpu *vcpu) 3472 struct kvm_vcpu *vcpu)
3396{ 3473{
3397 gpa_t gpa; 3474 gpa_t gpa;
3398 u32 error_code;
3399 3475
3400 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); 3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
3401 3477
3402 if (gpa == UNMAPPED_GVA) { 3478 if (gpa == UNMAPPED_GVA)
3403 kvm_inject_page_fault(vcpu, addr, error_code);
3404 return X86EMUL_PROPAGATE_FAULT; 3479 return X86EMUL_PROPAGATE_FAULT;
3405 }
3406 3480
3407 /* For APIC access vmexit */ 3481 /* For APIC access vmexit */
3408 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3482 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
3420 return X86EMUL_CONTINUE; 3494 return X86EMUL_CONTINUE;
3421 3495
3422 vcpu->mmio_needed = 1; 3496 vcpu->mmio_needed = 1;
3423 vcpu->mmio_phys_addr = gpa; 3497 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3424 vcpu->mmio_size = bytes; 3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3425 vcpu->mmio_is_write = 1; 3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3426 memcpy(vcpu->mmio_data, val, bytes); 3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3501 memcpy(vcpu->run->mmio.data, val, bytes);
3427 3502
3428 return X86EMUL_CONTINUE; 3503 return X86EMUL_CONTINUE;
3429} 3504}
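With these two hunks the emulator fills the MMIO exit description in vcpu->run at the point the access is discovered, instead of having emulate_instruction() copy it there afterwards. On the other side of the interface, a userspace VMM consumes such an exit roughly as follows (simplified sketch; device_read()/device_write() are hypothetical hooks into the VMM's device model):

#include <linux/kvm.h>
#include <stdint.h>

void device_read(uint64_t addr, void *data, unsigned int len);		/* hypothetical */
void device_write(uint64_t addr, const void *data, unsigned int len);	/* hypothetical */

void handle_kvm_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_MMIO)
		return;

	if (run->mmio.is_write)
		device_write(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
	else
		/* data filled in here is picked up by the kernel on the next KVM_RUN */
		device_read(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
}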
@@ -3431,6 +3506,7 @@ mmio:
3431int emulator_write_emulated(unsigned long addr, 3506int emulator_write_emulated(unsigned long addr,
3432 const void *val, 3507 const void *val,
3433 unsigned int bytes, 3508 unsigned int bytes,
3509 unsigned int *error_code,
3434 struct kvm_vcpu *vcpu) 3510 struct kvm_vcpu *vcpu)
3435{ 3511{
3436 /* Crossing a page boundary? */ 3512 /* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
3438 int rc, now; 3514 int rc, now;
3439 3515
3440 now = -addr & ~PAGE_MASK; 3516 now = -addr & ~PAGE_MASK;
3441 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code,
3518 vcpu);
3442 if (rc != X86EMUL_CONTINUE) 3519 if (rc != X86EMUL_CONTINUE)
3443 return rc; 3520 return rc;
3444 addr += now; 3521 addr += now;
3445 val += now; 3522 val += now;
3446 bytes -= now; 3523 bytes -= now;
3447 } 3524 }
3448 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code,
3526 vcpu);
3449} 3527}
3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3451 3528
3452#define CMPXCHG_TYPE(t, ptr, old, new) \ 3529#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 3530 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3463 const void *old, 3540 const void *old,
3464 const void *new, 3541 const void *new,
3465 unsigned int bytes, 3542 unsigned int bytes,
3543 unsigned int *error_code,
3466 struct kvm_vcpu *vcpu) 3544 struct kvm_vcpu *vcpu)
3467{ 3545{
3468 gpa_t gpa; 3546 gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3484 goto emul_write; 3562 goto emul_write;
3485 3563
3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3564 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3565 if (is_error_page(page)) {
3566 kvm_release_page_clean(page);
3567 goto emul_write;
3568 }
3487 3569
3488 kaddr = kmap_atomic(page, KM_USER0); 3570 kaddr = kmap_atomic(page, KM_USER0);
3489 kaddr += offset_in_page(gpa); 3571 kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3516emul_write: 3598emul_write:
3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3518 3600
3519 return emulator_write_emulated(addr, new, bytes, vcpu); 3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
3520} 3602}
3521 3603
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3604static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
3604 return X86EMUL_CONTINUE; 3686 return X86EMUL_CONTINUE;
3605} 3687}
3606 3688
3607int emulate_clts(struct kvm_vcpu *vcpu) 3689int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3608{ 3690{
3609 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 3691 if (!need_emulate_wbinvd(vcpu))
3610 kvm_x86_ops->fpu_activate(vcpu); 3692 return X86EMUL_CONTINUE;
3693
3694 if (kvm_x86_ops->has_wbinvd_exit()) {
3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3696 wbinvd_ipi, NULL, 1);
3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3698 }
3699 wbinvd();
3611 return X86EMUL_CONTINUE; 3700 return X86EMUL_CONTINUE;
3612} 3701}
3702EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
3613 3703
3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3704int emulate_clts(struct kvm_vcpu *vcpu)
3615{ 3705{
3616 return kvm_get_dr(ctxt->vcpu, dr, dest); 3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3707 kvm_x86_ops->fpu_activate(vcpu);
3708 return X86EMUL_CONTINUE;
3617} 3709}
3618 3710
3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3711int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
3620{ 3712{
3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3713 return _kvm_get_dr(vcpu, dr, dest);
3622
3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3624} 3714}
3625 3715
3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3716int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
3627{ 3717{
3628 u8 opcodes[4];
3629 unsigned long rip = kvm_rip_read(vcpu);
3630 unsigned long rip_linear;
3631
3632 if (!printk_ratelimit())
3633 return;
3634 3718
3635 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3719 return __kvm_set_dr(vcpu, dr, value);
3636
3637 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3638
3639 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3640 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
3641} 3720}
3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3643 3721
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3722static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{ 3723{
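kvm_emulate_wbinvd() above only broadcasts the flush when the hardware can intercept WBINVD: every physical CPU recorded in vcpu->arch.wbinvd_dirty_mask gets an IPI running wbinvd_ipi(), the mask is cleared, and the local CPU flushes last. Only the broadcast, plus the mask's allocation and free later in this diff, are visible here; the bookkeeping that populates the mask is an assumption in the sketch below:

/* The IPI callback referenced above is just a cache flush on the target CPU. */
static void wbinvd_ipi(void *garbage)
{
	wbinvd();
}

/*
 * Assumed bookkeeping (not shown in this diff): remember each physical CPU a
 * vCPU runs on while WBINVD exits are available, so the emulation above knows
 * which caches may still hold guest data.
 */
static void note_physical_cpu(struct kvm_vcpu *vcpu, int cpu)
{
	if (kvm_x86_ops->has_wbinvd_exit())
		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
}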
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3674 return value; 3752 return value;
3675} 3753}
3676 3754
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3755static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{ 3756{
3757 int res = 0;
3758
3679 switch (cr) { 3759 switch (cr) {
3680 case 0: 3760 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3761 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break; 3762 break;
3683 case 2: 3763 case 2:
3684 vcpu->arch.cr2 = val; 3764 vcpu->arch.cr2 = val;
3685 break; 3765 break;
3686 case 3: 3766 case 3:
3687 kvm_set_cr3(vcpu, val); 3767 res = kvm_set_cr3(vcpu, val);
3688 break; 3768 break;
3689 case 4: 3769 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break; 3771 break;
3692 case 8: 3772 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL); 3773 res = __kvm_set_cr8(vcpu, val & 0xfUL);
3694 break; 3774 break;
3695 default: 3775 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3777 res = -1;
3697 } 3778 }
3779
3780 return res;
3698} 3781}
3699 3782
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu) 3783static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3707 kvm_x86_ops->get_gdt(vcpu, dt); 3790 kvm_x86_ops->get_gdt(vcpu, dt);
3708} 3791}
3709 3792
3793static unsigned long emulator_get_cached_segment_base(int seg,
3794 struct kvm_vcpu *vcpu)
3795{
3796 return get_segment_base(vcpu, seg);
3797}
3798
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu) 3800 struct kvm_vcpu *vcpu)
3712{ 3801{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
3779 kvm_set_segment(vcpu, &kvm_seg, seg); 3868 kvm_set_segment(vcpu, &kvm_seg, seg);
3780} 3869}
3781 3870
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3787static struct x86_emulate_ops emulate_ops = { 3871static struct x86_emulate_ops emulate_ops = {
3788 .read_std = kvm_read_guest_virt_system, 3872 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system, 3873 .write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
3797 .set_cached_descriptor = emulator_set_cached_descriptor, 3881 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector, 3882 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector, 3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base,
3800 .get_gdt = emulator_get_gdt, 3885 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr, 3886 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr, 3887 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl, 3888 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags, 3889 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr,
3892 .get_msr = kvm_get_msr,
3805}; 3893};
3806 3894
3807static void cache_all_regs(struct kvm_vcpu *vcpu) 3895static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
3812 vcpu->arch.regs_dirty = ~0; 3900 vcpu->arch.regs_dirty = ~0;
3813} 3901}
3814 3902
3903static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3904{
3905 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
3906 /*
3907 * an sti; sti; sequence only disable interrupts for the first
3908 * instruction. So, if the last instruction, be it emulated or
3909 * not, left the system with the INT_STI flag enabled, it
3910 * means that the last instruction is an sti. We should not
3911 * leave the flag on in this case. The same goes for mov ss
3912 */
3913 if (!(int_shadow & mask))
3914 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
3915}
3916
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
3922 else if (ctxt->error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3924 else
3925 kvm_queue_exception(vcpu, ctxt->exception);
3926}
3927
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{
3930 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3934 vcpu->run->internal.ndata = 0;
3935 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL;
3937}
3938
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3940{
3941 gpa_t gpa;
3942
3943 if (tdp_enabled)
3944 return false;
3945
3946 /*
 3947 * if emulation was due to access to a shadowed page table
 3948 * and it failed, try to unshadow the page and re-enter the
 3949 * guest to let the CPU execute the instruction.
3950 */
3951 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
3952 return true;
3953
3954 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
3955
3956 if (gpa == UNMAPPED_GVA)
3957 return true; /* let cpu generate fault */
3958
3959 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
3960 return true;
3961
3962 return false;
3963}
3964
3815int emulate_instruction(struct kvm_vcpu *vcpu, 3965int emulate_instruction(struct kvm_vcpu *vcpu,
3816 unsigned long cr2, 3966 unsigned long cr2,
3817 u16 error_code, 3967 u16 error_code,
3818 int emulation_type) 3968 int emulation_type)
3819{ 3969{
3820 int r, shadow_mask; 3970 int r;
3821 struct decode_cache *c; 3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
3822 struct kvm_run *run = vcpu->run;
3823 3972
3824 kvm_clear_exception_queue(vcpu); 3973 kvm_clear_exception_queue(vcpu);
3825 vcpu->arch.mmio_fault_cr2 = cr2; 3974 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3831 */ 3980 */
3832 cache_all_regs(vcpu); 3981 cache_all_regs(vcpu);
3833 3982
3834 vcpu->mmio_is_write = 0;
3835
3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3837 int cs_db, cs_l; 3984 int cs_db, cs_l;
3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3846 ? X86EMUL_MODE_VM86 : cs_l 3993 ? X86EMUL_MODE_VM86 : cs_l
3847 ? X86EMUL_MODE_PROT64 : cs_db 3994 ? X86EMUL_MODE_PROT64 : cs_db
3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1;
3849 4000
3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu); 4002 trace_kvm_emulate_insn_start(vcpu);
3852 4003
3853 /* Only allow emulation of specific instructions on #UD 4004 /* Only allow emulation of specific instructions on #UD
3854 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/
3855 c = &vcpu->arch.emulate_ctxt.decode;
3856 if (emulation_type & EMULTYPE_TRAP_UD) { 4006 if (emulation_type & EMULTYPE_TRAP_UD) {
3857 if (!c->twobyte) 4007 if (!c->twobyte)
3858 return EMULATE_FAIL; 4008 return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3880 4030
3881 ++vcpu->stat.insn_emulation; 4031 ++vcpu->stat.insn_emulation;
3882 if (r) { 4032 if (r) {
3883 ++vcpu->stat.insn_emulation_fail; 4033 if (reexecute_instruction(vcpu, cr2))
3884 trace_kvm_emulate_insn_failed(vcpu);
3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3886 return EMULATE_DONE; 4034 return EMULATE_DONE;
3887 return EMULATE_FAIL; 4035 if (emulation_type & EMULTYPE_SKIP)
4036 return EMULATE_FAIL;
4037 return handle_emulation_failure(vcpu);
3888 } 4038 }
3889 } 4039 }
3890 4040
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3893 return EMULATE_DONE; 4043 return EMULATE_DONE;
3894 } 4044 }
3895 4045
 4046 /* this is needed for the vmware backdoor interface to work since it
 4047 changes register values during IO operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4049
3896restart: 4050restart:
3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3899 4052
3900 if (r == 0) 4053 if (r) { /* emulation failed */
3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 4054 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE;
3902 4056
3903 if (vcpu->arch.pio.count) { 4057 return handle_emulation_failure(vcpu);
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3906 return EMULATE_DO_MMIO;
3907 } 4058 }
3908 4059
3909 if (r || vcpu->mmio_is_write) { 4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
3910 run->exit_reason = KVM_EXIT_MMIO; 4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3911 run->mmio.phys_addr = vcpu->mmio_phys_addr; 4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
3912 memcpy(run->mmio.data, vcpu->mmio_data, 8); 4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
3913 run->mmio.len = vcpu->mmio_size; 4064
3914 run->mmio.is_write = vcpu->mmio_is_write; 4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE;
3915 } 4068 }
3916 4069
3917 if (r) { 4070 if (vcpu->arch.pio.count) {
3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 4071 if (!vcpu->arch.pio.in)
3919 goto done; 4072 vcpu->arch.pio.count = 0;
3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3923 kvm_report_emulation_failure(vcpu, "mmio");
3924 return EMULATE_FAIL;
3925 }
3926 return EMULATE_DO_MMIO; 4073 return EMULATE_DO_MMIO;
3927 } 4074 }
3928 4075
3929 if (vcpu->mmio_is_write) { 4076 if (vcpu->mmio_needed) {
3930 vcpu->mmio_needed = 0; 4077 if (vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0;
3931 return EMULATE_DO_MMIO; 4079 return EMULATE_DO_MMIO;
3932 } 4080 }
3933 4081
3934done:
3935 if (vcpu->arch.exception.pending)
3936 vcpu->arch.emulate_ctxt.restart = false;
3937
3938 if (vcpu->arch.emulate_ctxt.restart) 4082 if (vcpu->arch.emulate_ctxt.restart)
3939 goto restart; 4083 goto restart;
3940 4084
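Because this hunk interleaves removed and added lines heavily, the resulting emulation loop is easier to read in one piece: the decode cache is zeroed and seeded from vcpu->arch.regs, a decode or execute failure first tries reexecute_instruction() (unshadow the page and let the CPU retry) and otherwise reports KVM_EXIT_INTERNAL_ERROR through handle_emulation_failure(), and on success the interrupt shadow, rflags, registers and rip are written back before pending exceptions, PIO and MMIO are dispatched. A condensed restatement of the new control flow (fragment only; the pio.in/mmio_is_write resets are dropped for brevity):

restart:
	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
	if (r) {					/* emulation failed */
		if (reexecute_instruction(vcpu, cr2))
			return EMULATE_DONE;		/* page unshadowed: let the guest retry */
		return handle_emulation_failure(vcpu);	/* KVM_EXIT_INTERNAL_ERROR + #UD */
	}

	/* commit emulator state back to the vcpu */
	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);

	if (vcpu->arch.emulate_ctxt.exception >= 0) {
		inject_emulated_exception(vcpu);	/* queued #PF or other exception */
		return EMULATE_DONE;
	}

	if (vcpu->arch.pio.count)
		return EMULATE_DO_MMIO;			/* PIO still outstanding */
	if (vcpu->mmio_needed)
		return EMULATE_DO_MMIO;			/* MMIO exit already described in vcpu->run */

	if (vcpu->arch.emulate_ctxt.restart)
		goto restart;
	return EMULATE_DONE;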
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
4108 4252
4109 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4253 perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110 4254
4255 if (cpu_has_xsave)
4256 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4257
4111 return 0; 4258 return 0;
4112 4259
4113out: 4260out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4270 4417
4271 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4418 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4272 4419
4273 return emulator_write_emulated(rip, instruction, 3, vcpu); 4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
4274} 4421}
4275 4422
4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4506 } 4653 }
4507} 4654}
4508 4655
4656static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
4657{
4658 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
4659 !vcpu->guest_xcr0_loaded) {
4660 /* kvm_set_xcr() also depends on this */
4661 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
4662 vcpu->guest_xcr0_loaded = 1;
4663 }
4664}
4665
4666static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4667{
4668 if (vcpu->guest_xcr0_loaded) {
4669 if (vcpu->arch.xcr0 != host_xcr0)
4670 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
4671 vcpu->guest_xcr0_loaded = 0;
4672 }
4673}
4674
4509static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4510{ 4676{
4511 int r; 4677 int r;
4512 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4513 vcpu->run->request_interrupt_window; 4679 vcpu->run->request_interrupt_window;
4514 4680
4515 if (vcpu->requests)
4516 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
4517 kvm_mmu_unload(vcpu);
4518
4519 r = kvm_mmu_reload(vcpu);
4520 if (unlikely(r))
4521 goto out;
4522
4523 if (vcpu->requests) { 4681 if (vcpu->requests) {
4524 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 4682 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
4683 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4525 __kvm_migrate_timers(vcpu); 4685 __kvm_migrate_timers(vcpu);
4526 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
4527 kvm_write_guest_time(vcpu); 4687 kvm_write_guest_time(vcpu);
4528 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4529 kvm_mmu_sync_roots(vcpu); 4689 kvm_mmu_sync_roots(vcpu);
4530 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
4531 kvm_x86_ops->tlb_flush(vcpu); 4691 kvm_x86_ops->tlb_flush(vcpu);
4532 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4692 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
4533 &vcpu->requests)) {
4534 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4693 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
4535 r = 0; 4694 r = 0;
4536 goto out; 4695 goto out;
4537 } 4696 }
4538 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4697 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
4539 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4698 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4540 r = 0; 4699 r = 0;
4541 goto out; 4700 goto out;
4542 } 4701 }
4543 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { 4702 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
4544 vcpu->fpu_active = 0; 4703 vcpu->fpu_active = 0;
4545 kvm_x86_ops->fpu_deactivate(vcpu); 4704 kvm_x86_ops->fpu_deactivate(vcpu);
4546 } 4705 }
4547 } 4706 }
4548 4707
4708 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r))
4710 goto out;
4711
4549 preempt_disable(); 4712 preempt_disable();
4550 4713
4551 kvm_x86_ops->prepare_guest_switch(vcpu); 4714 kvm_x86_ops->prepare_guest_switch(vcpu);
4552 if (vcpu->fpu_active) 4715 if (vcpu->fpu_active)
4553 kvm_load_guest_fpu(vcpu); 4716 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu);
4554 4718
4555 local_irq_disable(); 4719 atomic_set(&vcpu->guest_mode, 1);
4720 smp_wmb();
4556 4721
4557 clear_bit(KVM_REQ_KICK, &vcpu->requests); 4722 local_irq_disable();
4558 smp_mb__after_clear_bit();
4559 4723
4560 if (vcpu->requests || need_resched() || signal_pending(current)) { 4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
4561 set_bit(KVM_REQ_KICK, &vcpu->requests); 4725 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0);
4727 smp_wmb();
4562 local_irq_enable(); 4728 local_irq_enable();
4563 preempt_enable(); 4729 preempt_enable();
4564 r = 1; 4730 r = 1;
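Two independent conversions meet in this hunk: guest XCR0 is now loaded just before entry by kvm_load_guest_xcr0() and restored by kvm_put_guest_xcr0(), and the open-coded test_and_clear_bit() calls on vcpu->requests are replaced with the kvm_check_request()/kvm_make_request() mini-API. Those helpers live in the generic KVM headers; their shape is roughly the following (shown only to make the before/after equivalence obvious, not copied from this diff):

static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
	set_bit(req, &vcpu->requests);
}

static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{
	/* same semantics as the old open-coded form: test and clear atomically */
	return test_and_clear_bit(req, &vcpu->requests);
}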
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4603 if (hw_breakpoint_active()) 4769 if (hw_breakpoint_active())
4604 hw_breakpoint_restore(); 4770 hw_breakpoint_restore();
4605 4771
4606 set_bit(KVM_REQ_KICK, &vcpu->requests); 4772 atomic_set(&vcpu->guest_mode, 0);
4773 smp_wmb();
4607 local_irq_enable(); 4774 local_irq_enable();
4608 4775
4609 ++vcpu->stat.exits; 4776 ++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4665 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4666 kvm_vcpu_block(vcpu); 4833 kvm_vcpu_block(vcpu);
4667 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4834 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4668 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4835 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
4669 { 4836 {
4670 switch(vcpu->arch.mp_state) { 4837 switch(vcpu->arch.mp_state) {
4671 case KVM_MP_STATE_HALTED: 4838 case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4717 int r; 4884 int r;
4718 sigset_t sigsaved; 4885 sigset_t sigsaved;
4719 4886
4720 vcpu_load(vcpu);
4721
4722 if (vcpu->sigset_active) 4887 if (vcpu->sigset_active)
4723 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4724 4889
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4746 if (r == EMULATE_DO_MMIO) { 4911 if (r != EMULATE_DONE) {
4747 r = 0; 4912 r = 0;
4748 goto out; 4913 goto out;
4749 } 4914 }
@@ -4759,14 +4924,11 @@ out:
4759 if (vcpu->sigset_active) 4924 if (vcpu->sigset_active)
4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4925 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4761 4926
4762 vcpu_put(vcpu);
4763 return r; 4927 return r;
4764} 4928}
4765 4929
4766int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4767{ 4931{
4768 vcpu_load(vcpu);
4769
4770 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4771 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4772 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4789 regs->rip = kvm_rip_read(vcpu); 4951 regs->rip = kvm_rip_read(vcpu);
4790 regs->rflags = kvm_get_rflags(vcpu); 4952 regs->rflags = kvm_get_rflags(vcpu);
4791 4953
4792 vcpu_put(vcpu);
4793
4794 return 0; 4954 return 0;
4795} 4955}
4796 4956
4797int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4798{ 4958{
4799 vcpu_load(vcpu);
4800
4801 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4802 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4803 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4822 4980
4823 vcpu->arch.exception.pending = false; 4981 vcpu->arch.exception.pending = false;
4824 4982
4825 vcpu_put(vcpu);
4826
4827 return 0; 4983 return 0;
4828} 4984}
4829 4985
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4842{ 4998{
4843 struct desc_ptr dt; 4999 struct desc_ptr dt;
4844 5000
4845 vcpu_load(vcpu);
4846
4847 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5001 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4848 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5002 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4849 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5003 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4875 set_bit(vcpu->arch.interrupt.nr, 5029 set_bit(vcpu->arch.interrupt.nr,
4876 (unsigned long *)sregs->interrupt_bitmap); 5030 (unsigned long *)sregs->interrupt_bitmap);
4877 5031
4878 vcpu_put(vcpu);
4879
4880 return 0; 5032 return 0;
4881} 5033}
4882 5034
4883int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5035int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4884 struct kvm_mp_state *mp_state) 5036 struct kvm_mp_state *mp_state)
4885{ 5037{
4886 vcpu_load(vcpu);
4887 mp_state->mp_state = vcpu->arch.mp_state; 5038 mp_state->mp_state = vcpu->arch.mp_state;
4888 vcpu_put(vcpu);
4889 return 0; 5039 return 0;
4890} 5040}
4891 5041
4892int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5042int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4893 struct kvm_mp_state *mp_state) 5043 struct kvm_mp_state *mp_state)
4894{ 5044{
4895 vcpu_load(vcpu);
4896 vcpu->arch.mp_state = mp_state->mp_state; 5045 vcpu->arch.mp_state = mp_state->mp_state;
4897 vcpu_put(vcpu);
4898 return 0; 5046 return 0;
4899} 5047}
4900 5048
4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5049int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4902 bool has_error_code, u32 error_code) 5050 bool has_error_code, u32 error_code)
4903{ 5051{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4904 int cs_db, cs_l, ret; 5053 int cs_db, cs_l, ret;
4905 cache_all_regs(vcpu); 5054 cache_all_regs(vcpu);
4906 5055
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4915 ? X86EMUL_MODE_VM86 : cs_l 5064 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db 5065 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4918 5069
4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4920 tss_selector, reason, has_error_code, 5071 tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4923 if (ret) 5074 if (ret)
4924 return EMULATE_FAIL; 5075 return EMULATE_FAIL;
4925 5076
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4927 return EMULATE_DONE; 5080 return EMULATE_DONE;
4928} 5081}
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4935 int pending_vec, max_bits; 5088 int pending_vec, max_bits;
4936 struct desc_ptr dt; 5089 struct desc_ptr dt;
4937 5090
4938 vcpu_load(vcpu);
4939
4940 dt.size = sregs->idt.limit; 5091 dt.size = sregs->idt.limit;
4941 dt.address = sregs->idt.base; 5092 dt.address = sregs->idt.base;
4942 kvm_x86_ops->set_idt(vcpu, &dt); 5093 kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4996 !is_protmode(vcpu)) 5147 !is_protmode(vcpu))
4997 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4998 5149
4999 vcpu_put(vcpu);
5000
5001 return 0; 5150 return 0;
5002} 5151}
5003 5152
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5007 unsigned long rflags; 5156 unsigned long rflags;
5008 int i, r; 5157 int i, r;
5009 5158
5010 vcpu_load(vcpu);
5011
5012 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5159 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
5013 r = -EBUSY; 5160 r = -EBUSY;
5014 if (vcpu->arch.exception.pending) 5161 if (vcpu->arch.exception.pending)
5015 goto unlock_out; 5162 goto out;
5016 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5163 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5017 kvm_queue_exception(vcpu, DB_VECTOR); 5164 kvm_queue_exception(vcpu, DB_VECTOR);
5018 else 5165 else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5054 5201
5055 r = 0; 5202 r = 0;
5056 5203
5057unlock_out: 5204out:
5058 vcpu_put(vcpu);
5059 5205
5060 return r; 5206 return r;
5061} 5207}
5062 5208
5063/* 5209/*
5064 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
5065 * we have asm/x86/processor.h
5066 */
5067struct fxsave {
5068 u16 cwd;
5069 u16 swd;
5070 u16 twd;
5071 u16 fop;
5072 u64 rip;
5073 u64 rdp;
5074 u32 mxcsr;
5075 u32 mxcsr_mask;
5076 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
5077#ifdef CONFIG_X86_64
5078 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
5079#else
5080 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
5081#endif
5082};
5083
5084/*
5085 * Translate a guest virtual address to a guest physical address. 5210 * Translate a guest virtual address to a guest physical address.
5086 */ 5211 */
5087int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 5212int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5091 gpa_t gpa; 5216 gpa_t gpa;
5092 int idx; 5217 int idx;
5093 5218
5094 vcpu_load(vcpu);
5095 idx = srcu_read_lock(&vcpu->kvm->srcu); 5219 idx = srcu_read_lock(&vcpu->kvm->srcu);
5096 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5220 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
5097 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5221 srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5099 tr->valid = gpa != UNMAPPED_GVA; 5223 tr->valid = gpa != UNMAPPED_GVA;
5100 tr->writeable = 1; 5224 tr->writeable = 1;
5101 tr->usermode = 0; 5225 tr->usermode = 0;
5102 vcpu_put(vcpu);
5103 5226
5104 return 0; 5227 return 0;
5105} 5228}
5106 5229
5107int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5230int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5108{ 5231{
5109 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5232 struct i387_fxsave_struct *fxsave =
5110 5233 &vcpu->arch.guest_fpu.state->fxsave;
5111 vcpu_load(vcpu);
5112 5234
5113 memcpy(fpu->fpr, fxsave->st_space, 128); 5235 memcpy(fpu->fpr, fxsave->st_space, 128);
5114 fpu->fcw = fxsave->cwd; 5236 fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5119 fpu->last_dp = fxsave->rdp; 5241 fpu->last_dp = fxsave->rdp;
5120 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5242 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
5121 5243
5122 vcpu_put(vcpu);
5123
5124 return 0; 5244 return 0;
5125} 5245}
5126 5246
5127int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5247int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5128{ 5248{
5129 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5249 struct i387_fxsave_struct *fxsave =
5130 5250 &vcpu->arch.guest_fpu.state->fxsave;
5131 vcpu_load(vcpu);
5132 5251
5133 memcpy(fxsave->st_space, fpu->fpr, 128); 5252 memcpy(fxsave->st_space, fpu->fpr, 128);
5134 fxsave->cwd = fpu->fcw; 5253 fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5139 fxsave->rdp = fpu->last_dp; 5258 fxsave->rdp = fpu->last_dp;
5140 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5259 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
5141 5260
5142 vcpu_put(vcpu);
5143
5144 return 0; 5261 return 0;
5145} 5262}
5146 5263
5147void fx_init(struct kvm_vcpu *vcpu) 5264int fx_init(struct kvm_vcpu *vcpu)
5148{ 5265{
5149 unsigned after_mxcsr_mask; 5266 int err;
5267
5268 err = fpu_alloc(&vcpu->arch.guest_fpu);
5269 if (err)
5270 return err;
5271
5272 fpu_finit(&vcpu->arch.guest_fpu);
5150 5273
5151 /* 5274 /*
5152 * Touch the fpu the first time in non atomic context as if 5275 * Ensure guest xcr0 is valid for loading
5153 * this is the first fpu instruction the exception handler
5154 * will fire before the instruction returns and it'll have to
5155 * allocate ram with GFP_KERNEL.
5156 */ 5276 */
5157 if (!used_math()) 5277 vcpu->arch.xcr0 = XSTATE_FP;
5158 kvm_fx_save(&vcpu->arch.host_fx_image);
5159
5160 /* Initialize guest FPU by resetting ours and saving into guest's */
5161 preempt_disable();
5162 kvm_fx_save(&vcpu->arch.host_fx_image);
5163 kvm_fx_finit();
5164 kvm_fx_save(&vcpu->arch.guest_fx_image);
5165 kvm_fx_restore(&vcpu->arch.host_fx_image);
5166 preempt_enable();
5167 5278
5168 vcpu->arch.cr0 |= X86_CR0_ET; 5279 vcpu->arch.cr0 |= X86_CR0_ET;
5169 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 5280
5170 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 5281 return 0;
5171 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
5172 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
5173} 5282}
5174EXPORT_SYMBOL_GPL(fx_init); 5283EXPORT_SYMBOL_GPL(fx_init);
5175 5284
5285static void fx_free(struct kvm_vcpu *vcpu)
5286{
5287 fpu_free(&vcpu->arch.guest_fpu);
5288}
5289
5176void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5290void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
5177{ 5291{
5178 if (vcpu->guest_fpu_loaded) 5292 if (vcpu->guest_fpu_loaded)
5179 return; 5293 return;
5180 5294
5295 /*
5296 * Restore all possible states in the guest,
 5297 * and assume the host will use all available bits.
 5298 * Guest xcr0 will be loaded later.
5299 */
5300 kvm_put_guest_xcr0(vcpu);
5181 vcpu->guest_fpu_loaded = 1; 5301 vcpu->guest_fpu_loaded = 1;
5182 kvm_fx_save(&vcpu->arch.host_fx_image); 5302 unlazy_fpu(current);
5183 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5303 fpu_restore_checking(&vcpu->arch.guest_fpu);
5184 trace_kvm_fpu(1); 5304 trace_kvm_fpu(1);
5185} 5305}
5186 5306
5187void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5307void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5188{ 5308{
5309 kvm_put_guest_xcr0(vcpu);
5310
5189 if (!vcpu->guest_fpu_loaded) 5311 if (!vcpu->guest_fpu_loaded)
5190 return; 5312 return;
5191 5313
5192 vcpu->guest_fpu_loaded = 0; 5314 vcpu->guest_fpu_loaded = 0;
5193 kvm_fx_save(&vcpu->arch.guest_fx_image); 5315 fpu_save_init(&vcpu->arch.guest_fpu);
5194 kvm_fx_restore(&vcpu->arch.host_fx_image);
5195 ++vcpu->stat.fpu_reload; 5316 ++vcpu->stat.fpu_reload;
5196 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); 5317 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
5197 trace_kvm_fpu(0); 5318 trace_kvm_fpu(0);
5198} 5319}
5199 5320
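The FPU hunks replace the private kvm_fx_save()/kvm_fx_restore() wrappers on guest_fx_image/host_fx_image with the kernel's generic i387 API on vcpu->arch.guest_fpu, and bracket the switch with the new XCR0 helpers. Read together, the new lifecycle looks like this (condensed fragment of the code above, error handling and the guest_fpu_loaded bookkeeping trimmed):

	/* one-time setup: allocate and initialise the guest FPU area */
	fpu_alloc(&vcpu->arch.guest_fpu);	/* fx_init() now propagates this error */
	fpu_finit(&vcpu->arch.guest_fpu);
	vcpu->arch.xcr0 = XSTATE_FP;		/* a guest xcr0 that is always valid to load */

	/* loading the guest FPU: park the task's state, restore the guest's */
	kvm_put_guest_xcr0(vcpu);		/* host xcr0 live, so every component can be restored */
	unlazy_fpu(current);
	fpu_restore_checking(&vcpu->arch.guest_fpu);

	/* putting the guest FPU: save it and ask for lazy deactivation */
	kvm_put_guest_xcr0(vcpu);
	fpu_save_init(&vcpu->arch.guest_fpu);
	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);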
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5204 vcpu->arch.time_page = NULL; 5325 vcpu->arch.time_page = NULL;
5205 } 5326 }
5206 5327
5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5329 fx_free(vcpu);
5207 kvm_x86_ops->vcpu_free(vcpu); 5330 kvm_x86_ops->vcpu_free(vcpu);
5208} 5331}
5209 5332
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
5217{ 5340{
5218 int r; 5341 int r;
5219 5342
5220 /* We do fxsave: this must be aligned. */
5221 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
5222
5223 vcpu->arch.mtrr_state.have_fixed = 1; 5343 vcpu->arch.mtrr_state.have_fixed = 1;
5224 vcpu_load(vcpu); 5344 vcpu_load(vcpu);
5225 r = kvm_arch_vcpu_reset(vcpu); 5345 r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5241 kvm_mmu_unload(vcpu); 5361 kvm_mmu_unload(vcpu);
5242 vcpu_put(vcpu); 5362 vcpu_put(vcpu);
5243 5363
5364 fx_free(vcpu);
5244 kvm_x86_ops->vcpu_free(vcpu); 5365 kvm_x86_ops->vcpu_free(vcpu);
5245} 5366}
5246 5367
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5334 } 5455 }
5335 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5456 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
5336 5457
5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5459 goto fail_free_mce_banks;
5460
5337 return 0; 5461 return 0;
5462fail_free_mce_banks:
5463 kfree(vcpu->arch.mce_banks);
5338fail_free_lapic: 5464fail_free_lapic:
5339 kvm_free_lapic(vcpu); 5465 kvm_free_lapic(vcpu);
5340fail_mmu_destroy: 5466fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
5364 if (!kvm) 5490 if (!kvm)
5365 return ERR_PTR(-ENOMEM); 5491 return ERR_PTR(-ENOMEM);
5366 5492
5367 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5368 if (!kvm->arch.aliases) {
5369 kfree(kvm);
5370 return ERR_PTR(-ENOMEM);
5371 }
5372
5373 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5374 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5375 5495
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
5412void kvm_arch_sync_events(struct kvm *kvm) 5532void kvm_arch_sync_events(struct kvm *kvm)
5413{ 5533{
5414 kvm_free_all_assigned_devices(kvm); 5534 kvm_free_all_assigned_devices(kvm);
5535 kvm_free_pit(kvm);
5415} 5536}
5416 5537
5417void kvm_arch_destroy_vm(struct kvm *kvm) 5538void kvm_arch_destroy_vm(struct kvm *kvm)
5418{ 5539{
5419 kvm_iommu_unmap_guest(kvm); 5540 kvm_iommu_unmap_guest(kvm);
5420 kvm_free_pit(kvm);
5421 kfree(kvm->arch.vpic); 5541 kfree(kvm->arch.vpic);
5422 kfree(kvm->arch.vioapic); 5542 kfree(kvm->arch.vioapic);
5423 kvm_free_vcpus(kvm); 5543 kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5427 if (kvm->arch.ept_identity_pagetable) 5547 if (kvm->arch.ept_identity_pagetable)
5428 put_page(kvm->arch.ept_identity_pagetable); 5548 put_page(kvm->arch.ept_identity_pagetable);
5429 cleanup_srcu_struct(&kvm->srcu); 5549 cleanup_srcu_struct(&kvm->srcu);
5430 kfree(kvm->arch.aliases);
5431 kfree(kvm); 5550 kfree(kvm);
5432} 5551}
5433 5552
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5438 int user_alloc) 5557 int user_alloc)
5439{ 5558{
5440 int npages = memslot->npages; 5559 int npages = memslot->npages;
5560 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
5561
5562 /* Prevent internal slot pages from being moved by fork()/COW. */
5563 if (memslot->id >= KVM_MEMORY_SLOTS)
5564 map_flags = MAP_SHARED | MAP_ANONYMOUS;
5441 5565
5442 /*To keep backward compatibility with older userspace, 5566 /*To keep backward compatibility with older userspace,
 5443 *x86 needs to handle !user_alloc case. 5567
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
5450 userspace_addr = do_mmap(NULL, 0, 5574 userspace_addr = do_mmap(NULL, 0,
5451 npages * PAGE_SIZE, 5575 npages * PAGE_SIZE,
5452 PROT_READ | PROT_WRITE, 5576 PROT_READ | PROT_WRITE,
5453 MAP_PRIVATE | MAP_ANONYMOUS, 5577 map_flags,
5454 0); 5578 0);
5455 up_write(&current->mm->mmap_sem); 5579 up_write(&current->mm->mmap_sem);
5456 5580
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5523 5647
5524 me = get_cpu(); 5648 me = get_cpu();
5525 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5649 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5526 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5650 if (atomic_xchg(&vcpu->guest_mode, 0))
5527 smp_send_reschedule(cpu); 5651 smp_send_reschedule(cpu);
5528 put_cpu(); 5652 put_cpu();
5529} 5653}
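The KVM_REQ_KICK bit is gone: a vCPU now advertises that it is about to run guest code through the atomic guest_mode flag, and kvm_vcpu_kick() sends the reschedule IPI only when its atomic_xchg() observes the 1 -> 0 transition. What makes this safe is the pairing shown earlier in vcpu_enter_guest(): set guest_mode, smp_wmb(), then re-check vcpu->requests before entering, so a request raised by the kicker is either seen by that re-check or triggers the IPI. The two sides, condensed from the hunks above (fragment only):

	/* vCPU entry path (vcpu_enter_guest) */
	atomic_set(&vcpu->guest_mode, 1);
	smp_wmb();
	local_irq_disable();
	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests ||
	    need_resched() || signal_pending(current)) {
		atomic_set(&vcpu->guest_mode, 0);	/* bail out and retry the loop */
		smp_wmb();
		local_irq_enable();
	}
	/* ... run the guest, then on exit: */
	atomic_set(&vcpu->guest_mode, 0);
	smp_wmb();

	/* kicker (kvm_vcpu_kick): IPI only if the target really was in guest mode */
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (atomic_xchg(&vcpu->guest_mode, 0))
			smp_send_reschedule(cpu);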
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285..b7a404722d2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
77 70
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f871e04b696..e10cf070ede 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y)
30 lib-y += checksum_32.o 30 lib-y += checksum_32.o
31 lib-y += strstr_32.o 31 lib-y += strstr_32.o
32 lib-y += semaphore_32.o string_32.o 32 lib-y += semaphore_32.o string_32.o
33 lib-y += cmpxchg.o
33ifneq ($(CONFIG_X86_CMPXCHG64),y) 34ifneq ($(CONFIG_X86_CMPXCHG64),y)
34 lib-y += cmpxchg8b_emu.o atomic64_386_32.o 35 lib-y += cmpxchg8b_emu.o atomic64_386_32.o
35endif 36endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a..aa4326bfb24 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -52,7 +52,7 @@ ENDPROC(clear_page)
52 .align 8 52 .align 8
53 .quad clear_page 53 .quad clear_page
54 .quad 1b 54 .quad 1b
55 .byte X86_FEATURE_REP_GOOD 55 .word X86_FEATURE_REP_GOOD
56 .byte .Lclear_page_end - clear_page 56 .byte .Lclear_page_end - clear_page
57 .byte 2b - 1b 57 .byte 2b - 1b
58 .previous 58 .previous
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/lib/cmpxchg.c
index 2056ccf572c..5d619f6df3e 100644
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ b/arch/x86/lib/cmpxchg.c
@@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
52} 52}
53EXPORT_SYMBOL(cmpxchg_386_u32); 53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif 54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2f..6fec2d1cebe 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -113,7 +113,7 @@ ENDPROC(copy_page)
113 .align 8 113 .align 8
114 .quad copy_page 114 .quad copy_page
115 .quad 1b 115 .quad 1b
116 .byte X86_FEATURE_REP_GOOD 116 .word X86_FEATURE_REP_GOOD
117 .byte .Lcopy_page_end - copy_page 117 .byte .Lcopy_page_end - copy_page
118 .byte 2b - 1b 118 .byte 2b - 1b
119 .previous 119 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 71100c98e33..a460158b5ac 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -29,7 +29,7 @@
29 .align 8 29 .align 8
30 .quad 0b 30 .quad 0b
31 .quad 2b 31 .quad 2b
32 .byte \feature /* when feature is set */ 32 .word \feature /* when feature is set */
33 .byte 5 33 .byte 5
34 .byte 5 34 .byte 5
35 .previous 35 .previous
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928a..bcbcd1e0f7d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -131,7 +131,7 @@ ENDPROC(__memcpy)
131 .align 8 131 .align 8
132 .quad memcpy 132 .quad memcpy
133 .quad .Lmemcpy_c 133 .quad .Lmemcpy_c
134 .byte X86_FEATURE_REP_GOOD 134 .word X86_FEATURE_REP_GOOD
135 135
136 /* 136 /*
137 * Replace only beginning, memcpy is used to apply alternatives, 137 * Replace only beginning, memcpy is used to apply alternatives,
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index e88d3b81644..09d34426965 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -121,7 +121,7 @@ ENDPROC(__memset)
121 .align 8 121 .align 8
122 .quad memset 122 .quad memset
123 .quad .Lmemset_c 123 .quad .Lmemset_c
124 .byte X86_FEATURE_REP_GOOD 124 .word X86_FEATURE_REP_GOOD
125 .byte .Lfinal - memset 125 .byte .Lfinal - memset
126 .byte .Lmemset_e - .Lmemset_c 126 .byte .Lmemset_e - .Lmemset_c
127 .previous 127 .previous
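All of the .altinstructions records in these assembly files widen the feature field from .byte to .word, and the copy_user fixup macro follows suit. The feature index stored there is 32 * (capability word) + bit; once the cpufeature table grows beyond eight 32-bit words such indices exceed 255, which is presumably what forces the wider field (the matching header-side change is in the cpufeature/alternative headers listed in the diffstat, not in this section). A trivial standalone check of the arithmetic:

#include <stdio.h>

/* Illustration only: mirrors the 32*word + bit encoding of X86_FEATURE_*
 * values; this is not the kernel's macro.
 */
#define FEATURE_INDEX(word, bit) ((word) * 32 + (bit))

int main(void)
{
	printf("word 3, bit 16 -> %d (fits in a byte)\n", FEATURE_INDEX(3, 16));
	printf("word 8, bit  4 -> %d (too large for a byte)\n", FEATURE_INDEX(8, 4));
	return 0;
}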
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a725b7f760a..0002a3a3308 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -37,6 +37,28 @@ struct addr_marker {
37 const char *name; 37 const char *name;
38}; 38};
39 39
40/* indices for address_markers; keep sync'd w/ address_markers below */
41enum address_markers_idx {
42 USER_SPACE_NR = 0,
43#ifdef CONFIG_X86_64
44 KERNEL_SPACE_NR,
45 LOW_KERNEL_NR,
46 VMALLOC_START_NR,
47 VMEMMAP_START_NR,
48 HIGH_KERNEL_NR,
49 MODULES_VADDR_NR,
50 MODULES_END_NR,
51#else
52 KERNEL_SPACE_NR,
53 VMALLOC_START_NR,
54 VMALLOC_END_NR,
55# ifdef CONFIG_HIGHMEM
56 PKMAP_BASE_NR,
57# endif
58 FIXADDR_START_NR,
59#endif
60};
61
40/* Address space markers hints */ 62/* Address space markers hints */
41static struct addr_marker address_markers[] = { 63static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 64 { 0, "User Space" },
@@ -331,14 +353,12 @@ static int pt_dump_init(void)
331 353
332#ifdef CONFIG_X86_32 354#ifdef CONFIG_X86_32
333 /* Not a compile-time constant on x86-32 */ 355 /* Not a compile-time constant on x86-32 */
334 address_markers[2].start_address = VMALLOC_START; 356 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
335 address_markers[3].start_address = VMALLOC_END; 357 address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
336# ifdef CONFIG_HIGHMEM 358# ifdef CONFIG_HIGHMEM
337 address_markers[4].start_address = PKMAP_BASE; 359 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
338 address_markers[5].start_address = FIXADDR_START;
339# else
340 address_markers[4].start_address = FIXADDR_START;
341# endif 360# endif
361 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
342#endif 362#endif
343 363
344 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, 364 pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
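The dump_pagetables.c change introduces enum address_markers_idx so the runtime fixups in pt_dump_init() index address_markers[] by name instead of by magic number, which is what lets the CONFIG_HIGHMEM special case collapse to a single FIXADDR_START_NR assignment. A minimal sketch of the pattern with hypothetical names (not the kernel table):

#include <stdio.h>

enum marker_idx { FIRST_NR, SECOND_NR, THIRD_NR, NR_MARKERS };

struct marker {
	unsigned long start;
	const char *name;
};

static struct marker markers[NR_MARKERS] = {
	[FIRST_NR]  = { 0x0000, "first"  },
	[SECOND_NR] = { 0x1000, "second" },
	[THIRD_NR]  = { 0,      "third"  },	/* not a compile-time constant */
};

int main(void)
{
	markers[THIRD_NR].start = 0x2000;	/* index by name, not position */
	printf("%s starts at %#lx\n", markers[THIRD_NR].name, markers[THIRD_NR].start);
	return 0;
}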
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 63a6ba66cbe..5e8fa12ef86 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
53 return kmap_atomic_prot(page, type, kmap_prot); 53 return kmap_atomic_prot(page, type, kmap_prot);
54} 54}
55 55
56void kunmap_atomic(void *kvaddr, enum km_type type) 56void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
57{ 57{
58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr)
102EXPORT_SYMBOL(kmap); 102EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap); 103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic_notypecheck);
106EXPORT_SYMBOL(kmap_atomic_prot); 106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page); 107EXPORT_SYMBOL(kmap_atomic_to_page);
108 108
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba315d..9a6674689a2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -2,7 +2,7 @@
2 * linux/arch/x86_64/mm/init.c 2 * linux/arch/x86_64/mm/init.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> 6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */ 7 */
8 8
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 12e4d2d3c11..3ba6e0608c5 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
62static void __iomem *__ioremap_caller(resource_size_t phys_addr, 62static void __iomem *__ioremap_caller(resource_size_t phys_addr,
63 unsigned long size, unsigned long prot_val, void *caller) 63 unsigned long size, unsigned long prot_val, void *caller)
64{ 64{
65 unsigned long pfn, offset, vaddr; 65 unsigned long offset, vaddr;
66 resource_size_t last_addr; 66 resource_size_t pfn, last_pfn, last_addr;
67 const resource_size_t unaligned_phys_addr = phys_addr; 67 const resource_size_t unaligned_phys_addr = phys_addr;
68 const unsigned long unaligned_size = size; 68 const unsigned long unaligned_size = size;
69 struct vm_struct *area; 69 struct vm_struct *area;
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
100 /* 100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 101 * Don't allow anybody to remap normal RAM that we're using..
102 */ 102 */
103 for (pfn = phys_addr >> PAGE_SHIFT; 103 last_pfn = last_addr >> PAGE_SHIFT;
104 (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); 104 for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
105 pfn++) {
106
107 int is_ram = page_is_ram(pfn); 105 int is_ram = page_is_ram(pfn);
108 106
109 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) 107 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
115 * Mappings have to be page-aligned 113 * Mappings have to be page-aligned
116 */ 114 */
117 offset = phys_addr & ~PAGE_MASK; 115 offset = phys_addr & ~PAGE_MASK;
118 phys_addr &= PAGE_MASK; 116 phys_addr &= PHYSICAL_PAGE_MASK;
119 size = PAGE_ALIGN(last_addr+1) - phys_addr; 117 size = PAGE_ALIGN(last_addr+1) - phys_addr;
120 118
121 retval = reserve_memtype(phys_addr, (u64)phys_addr + size, 119 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
@@ -613,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
613 return; 611 return;
614 } 612 }
615 offset = virt_addr & ~PAGE_MASK; 613 offset = virt_addr & ~PAGE_MASK;
616 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 614 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
617 615
618 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; 616 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
619 while (nrpages > 0) { 617 while (nrpages > 0) {
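The first ioremap.c hunk rewrites the RAM check: pfn and last_pfn widen to resource_size_t so frame numbers above 4GB are not truncated on PAE, last_pfn is computed once, and the loop walks every frame from phys_addr >> PAGE_SHIFT through last_pfn inclusive, refusing to remap the range if any of them is usable RAM. A simplified, self-contained sketch of that check; page_is_system_ram() is a hypothetical stand-in for the kernel's page_is_ram()/PageReserved() tests:

#include <stdbool.h>

#define PAGE_SHIFT_SKETCH 12

static bool page_is_system_ram(unsigned long long pfn)
{
	(void)pfn;		/* stub: pretend nothing is RAM */
	return false;
}

/* True if any frame in [phys_addr, last_addr] is normal RAM. */
static bool range_touches_ram(unsigned long long phys_addr,
			      unsigned long long last_addr)
{
	unsigned long long pfn, last_pfn = last_addr >> PAGE_SHIFT_SKETCH;

	for (pfn = phys_addr >> PAGE_SHIFT_SKETCH; pfn <= last_pfn; pfn++)
		if (page_is_system_ram(pfn))
			return true;
	return false;
}

int main(void)
{
	return range_touches_ram(0xfee00000ULL, 0xfee00fffULL) ? 1 : 0;
}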
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 5d0e67fff1a..e5d5e2ce9f7 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -45,6 +45,8 @@ struct kmmio_fault_page {
45 * Protected by kmmio_lock, when linked into kmmio_page_table. 45 * Protected by kmmio_lock, when linked into kmmio_page_table.
46 */ 46 */
47 int count; 47 int count;
48
49 bool scheduled_for_release;
48}; 50};
49 51
50struct kmmio_delayed_release { 52struct kmmio_delayed_release {
@@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page,
398 BUG_ON(f->count < 0); 400 BUG_ON(f->count < 0);
399 if (!f->count) { 401 if (!f->count) {
400 disarm_kmmio_fault_page(f); 402 disarm_kmmio_fault_page(f);
401 f->release_next = *release_list; 403 if (!f->scheduled_for_release) {
402 *release_list = f; 404 f->release_next = *release_list;
405 *release_list = f;
406 f->scheduled_for_release = true;
407 }
403 } 408 }
404} 409}
405 410
@@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
471 prevp = &f->release_next; 476 prevp = &f->release_next;
472 } else { 477 } else {
473 *prevp = f->release_next; 478 *prevp = f->release_next;
479 f->release_next = NULL;
480 f->scheduled_for_release = false;
474 } 481 }
475 f = f->release_next; 482 f = *prevp;
476 } 483 }
477 spin_unlock_irqrestore(&kmmio_lock, flags); 484 spin_unlock_irqrestore(&kmmio_lock, flags);
478 485
@@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
510 kmmio_count--; 517 kmmio_count--;
511 spin_unlock_irqrestore(&kmmio_lock, flags); 518 spin_unlock_irqrestore(&kmmio_lock, flags);
512 519
520 if (!release_list)
521 return;
522
513 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 523 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
514 if (!drelease) { 524 if (!drelease) {
515 pr_crit("leaking kmmio_fault_page objects.\n"); 525 pr_crit("leaking kmmio_fault_page objects.\n");
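The kmmio.c hunks guard the delayed-release list against queuing the same kmmio_fault_page twice: release_kmmio_fault_page() now only links a page whose scheduled_for_release flag is clear, and remove_kmmio_fault_pages() clears the flag when it unlinks the entry (note the loop also re-reads *prevp instead of the stale f->release_next). A minimal sketch of that queue-once pattern with hypothetical types:

#include <stdbool.h>
#include <stddef.h>

struct fault_page_sketch {
	struct fault_page_sketch *release_next;
	bool scheduled_for_release;
};

/* Link f at the head of *release_list, but at most once. */
static void schedule_release(struct fault_page_sketch **release_list,
			     struct fault_page_sketch *f)
{
	if (f->scheduled_for_release)
		return;			/* already queued by another path */
	f->release_next = *release_list;
	*release_list = f;
	f->scheduled_for_release = true;
}

/* Unlink the head and clear the flag so it may be queued again later. */
static struct fault_page_sketch *dequeue_release(struct fault_page_sketch **release_list)
{
	struct fault_page_sketch *f = *release_list;

	if (f) {
		*release_list = f->release_next;
		f->release_next = NULL;
		f->scheduled_for_release = false;
	}
	return f;
}

int main(void)
{
	struct fault_page_sketch a = { NULL, false }, *list = NULL;

	schedule_release(&list, &a);
	schedule_release(&list, &a);	/* second call is a no-op */
	return (dequeue_release(&list) == &a && list == NULL) ? 0 : 1;
}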
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 64121a18b8c..f6ff57b7efa 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
158 return req_type; 158 return req_type;
159} 159}
160 160
161static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 161static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
162{ 162{
163 int ram_page = 0, not_rampage = 0; 163 int ram_page = 0, not_rampage = 0;
164 unsigned long page_nr; 164 unsigned long page_nr;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 8565d944f7c..38868adf07e 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -90,6 +90,27 @@ static void do_test(unsigned long size)
90 iounmap(p); 90 iounmap(p);
91} 91}
92 92
93/*
94 * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
95 * a short time. We had a bug in deferred freeing procedure which tried
96 * to free this region multiple times (ioremap can reuse the same address
97 * for many mappings).
98 */
99static void do_test_bulk_ioremapping(void)
100{
101 void __iomem *p;
102 int i;
103
104 for (i = 0; i < 10; ++i) {
105 p = ioremap_nocache(mmio_address, PAGE_SIZE);
106 if (p)
107 iounmap(p);
108 }
109
110 /* Force freeing. If it will crash we will know why. */
111 synchronize_rcu();
112}
113
93static int __init init(void) 114static int __init init(void)
94{ 115{
95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 116 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
@@ -104,6 +125,7 @@ static int __init init(void)
104 "and writing 16 kB of rubbish in there.\n", 125 "and writing 16 kB of rubbish in there.\n",
105 size >> 10, mmio_address); 126 size >> 10, mmio_address);
106 do_test(size); 127 do_test(size);
128 do_test_bulk_ioremapping();
107 pr_info("All done.\n"); 129 pr_info("All done.\n");
108 return 0; 130 return 0;
109} 131}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 426f3a1a64d..c03f14ab666 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
278 278
279static void do_flush_tlb_all(void *info) 279static void do_flush_tlb_all(void *info)
280{ 280{
281 unsigned long cpu = smp_processor_id();
282
283 __flush_tlb_all(); 281 __flush_tlb_all();
284 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 282 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
285 leave_mm(cpu); 283 leave_mm(smp_processor_id());
286} 284}
287 285
288void flush_tlb_all(void) 286void flush_tlb_all(void)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2ec04c424a6..15466c096ba 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), 34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
35 }, 35 },
36 }, 36 },
37 /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */
38 /* 2006 AMD HT/VIA system with two host bridges */
39 {
40 .callback = set_use_crs,
41 .ident = "ASRock ALiveSATA2-GLAN",
42 .matches = {
43 DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
44 },
45 },
37 {} 46 {}
38}; 47};
39 48
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 215a27ae050..a0772af64ef 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void)
125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) 125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
126{ 126{
127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; 127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
128 struct resource *bar_r;
129 int bar;
130
131 if (pci_probe & PCI_NOASSIGN_BARS) {
132 /*
133 * If the BIOS did not assign the BAR, zero out the
 134 * resource so the kernel doesn't attempt to assign
135 * it later on in pci_assign_unassigned_resources
136 */
137 for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
138 bar_r = &dev->resource[bar];
139 if (bar_r->start == 0 && bar_r->end != 0) {
140 bar_r->flags = 0;
141 bar_r->end = 0;
142 }
143 }
144 }
128 145
129 if (pci_probe & PCI_NOASSIGN_ROMS) { 146 if (pci_probe & PCI_NOASSIGN_ROMS) {
130 if (rom_r->parent) 147 if (rom_r->parent)
@@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str)
509 } else if (!strcmp(str, "norom")) { 526 } else if (!strcmp(str, "norom")) {
510 pci_probe |= PCI_NOASSIGN_ROMS; 527 pci_probe |= PCI_NOASSIGN_ROMS;
511 return NULL; 528 return NULL;
529 } else if (!strcmp(str, "nobar")) {
530 pci_probe |= PCI_NOASSIGN_BARS;
531 return NULL;
512 } else if (!strcmp(str, "assign-busses")) { 532 } else if (!strcmp(str, "assign-busses")) {
513 pci_probe |= PCI_ASSIGN_ALL_BUSSES; 533 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
514 return NULL; 534 return NULL;
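The two common.c hunks introduce the "nobar" suboption: when PCI_NOASSIGN_BARS is set, pcibios_fixup_device_resources() zeroes out any BAR the BIOS left unassigned (start == 0 but end != 0) so the kernel will not assign it later. As with "norom" in the surrounding context, it is given as a suboption of the pci= boot parameter, for example:

	pci=nobar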
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9810a0f76c9..f547ee05f71 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
989 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); 989 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
990 990
991 /* Update IRQ for all devices with the same pirq value */ 991 /* Update IRQ for all devices with the same pirq value */
992 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 992 for_each_pci_dev(dev2) {
993 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); 993 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
994 if (!pin) 994 if (!pin)
995 continue; 995 continue;
@@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void)
1028 u8 pin; 1028 u8 pin;
1029 1029
1030 DBG(KERN_DEBUG "PCI: IRQ fixup\n"); 1030 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
1031 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1031 for_each_pci_dev(dev) {
1032 /* 1032 /*
1033 * If the BIOS has set an out of range IRQ number, just 1033 * If the BIOS has set an out of range IRQ number, just
1034 * ignore it. Also keep track of which IRQ's are 1034 * ignore it. Also keep track of which IRQ's are
@@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void)
1052 return; 1052 return;
1053 1053
1054 dev = NULL; 1054 dev = NULL;
1055 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1055 for_each_pci_dev(dev) {
1056 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1056 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1057 if (!pin) 1057 if (!pin)
1058 continue; 1058 continue;
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 8d460eaf524..c89266be604 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -36,7 +36,7 @@ int __init pci_legacy_init(void)
36 return 0; 36 return 0;
37} 37}
38 38
39void pcibios_scan_specific_bus(int busn) 39void __devinit pcibios_scan_specific_bus(int busn)
40{ 40{
41 int devfn; 41 int devfn;
42 long node; 42 long node;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1290ba54b35..e7e8c5f5495 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -4,7 +4,7 @@
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
7 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> 7 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index d24f983ba1e..460f314d13e 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -4,7 +4,7 @@
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
7 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> 7 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 6b4ffedb93c..4a2afa1bac5 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
120quiet_cmd_vdso = VDSO $@ 120quiet_cmd_vdso = VDSO $@
121 cmd_vdso = $(CC) -nostdlib -o $@ \ 121 cmd_vdso = $(CC) -nostdlib -o $@ \
122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ 122 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
124 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
124 125
125VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) 126VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n 127GCOV_PROFILE := n
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh
new file mode 100755
index 00000000000..7ee90a9b549
--- /dev/null
+++ b/arch/x86/vdso/checkundef.sh
@@ -0,0 +1,10 @@
1#!/bin/sh
2nm="$1"
3file="$2"
4$nm "$file" | grep '^ *U' > /dev/null 2>&1
5if [ $? -eq 1 ]; then
6 exit 0
7else
8 echo "$file: undefined symbols found" >&2
9 exit 1
10fi
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 02b442e9200..36df991985b 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
374 374
375#ifdef CONFIG_X86_64 375#ifdef CONFIG_X86_64
376 376
377__initcall(sysenter_setup); 377subsys_initcall(sysenter_setup);
378 378
379#ifdef CONFIG_SYSCTL 379#ifdef CONFIG_SYSCTL
380/* Register vsyscall32 into the ABI table */ 380/* Register vsyscall32 into the ABI table */
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ac74869b814..4b5d26f108b 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -67,6 +67,7 @@ static int __init init_vdso_vars(void)
67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; 67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h" 68#include "vextern.h"
69#undef VEXTERN 69#undef VEXTERN
70 vunmap(vbase);
70 return 0; 71 return 0;
71 72
72 oom: 73 oom:
@@ -74,7 +75,7 @@ static int __init init_vdso_vars(void)
74 vdso_enabled = 0; 75 vdso_enabled = 0;
75 return -ENOMEM; 76 return -ENOMEM;
76} 77}
77__initcall(init_vdso_vars); 78subsys_initcall(init_vdso_vars);
78 79
79struct linux_binprm; 80struct linux_binprm;
80 81
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119fbeb..68128a1b401 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,6 +13,11 @@ config XEN
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15 15
16config XEN_PVHVM
17 def_bool y
18 depends on XEN
19 depends on X86_LOCAL_APIC
20
16config XEN_MAX_DOMAIN_MEMORY 21config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 22 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32 23 default 8 if X86_32
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc21f4f..77938515891 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,9 +12,10 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o 15 grant-table.o suspend.o platform-pci-unplug.o
16 16
17obj-$(CONFIG_SMP) += smp.o 17obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
19obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 19obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
20 20
21obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a..7d46c844141 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */ 12 */
13 13
14#include <linux/cpu.h>
14#include <linux/kernel.h> 15#include <linux/kernel.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/smp.h> 17#include <linux/smp.h>
@@ -35,8 +36,10 @@
35#include <xen/interface/version.h> 36#include <xen/interface/version.h>
36#include <xen/interface/physdev.h> 37#include <xen/interface/physdev.h>
37#include <xen/interface/vcpu.h> 38#include <xen/interface/vcpu.h>
39#include <xen/interface/memory.h>
38#include <xen/features.h> 40#include <xen/features.h>
39#include <xen/page.h> 41#include <xen/page.h>
42#include <xen/hvm.h>
40#include <xen/hvc-console.h> 43#include <xen/hvc-console.h>
41 44
42#include <asm/paravirt.h> 45#include <asm/paravirt.h>
@@ -55,7 +58,9 @@
55#include <asm/pgtable.h> 58#include <asm/pgtable.h>
56#include <asm/tlbflush.h> 59#include <asm/tlbflush.h>
57#include <asm/reboot.h> 60#include <asm/reboot.h>
61#include <asm/setup.h>
58#include <asm/stackprotector.h> 62#include <asm/stackprotector.h>
63#include <asm/hypervisor.h>
59 64
60#include "xen-ops.h" 65#include "xen-ops.h"
61#include "mmu.h" 66#include "mmu.h"
@@ -76,6 +81,10 @@ struct shared_info xen_dummy_shared_info;
76 81
77void *xen_initial_gdt; 82void *xen_initial_gdt;
78 83
84RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
85__read_mostly int xen_have_vector_callback;
86EXPORT_SYMBOL_GPL(xen_have_vector_callback);
87
79/* 88/*
80 * Point at some empty memory to start with. We map the real shared_info 89 * Point at some empty memory to start with. We map the real shared_info
81 * page as soon as fixmap is up and running. 90 * page as soon as fixmap is up and running.
@@ -97,6 +106,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
97 */ 106 */
98static int have_vcpu_info_placement = 1; 107static int have_vcpu_info_placement = 1;
99 108
109static void clamp_max_cpus(void)
110{
111#ifdef CONFIG_SMP
112 if (setup_max_cpus > MAX_VIRT_CPUS)
113 setup_max_cpus = MAX_VIRT_CPUS;
114#endif
115}
116
100static void xen_vcpu_setup(int cpu) 117static void xen_vcpu_setup(int cpu)
101{ 118{
102 struct vcpu_register_vcpu_info info; 119 struct vcpu_register_vcpu_info info;
@@ -104,13 +121,17 @@ static void xen_vcpu_setup(int cpu)
104 struct vcpu_info *vcpup; 121 struct vcpu_info *vcpup;
105 122
106 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 123 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
107 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
108 124
109 if (!have_vcpu_info_placement) 125 if (cpu < MAX_VIRT_CPUS)
110 return; /* already tested, not available */ 126 per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
111 127
112 vcpup = &per_cpu(xen_vcpu_info, cpu); 128 if (!have_vcpu_info_placement) {
129 if (cpu >= MAX_VIRT_CPUS)
130 clamp_max_cpus();
131 return;
132 }
113 133
134 vcpup = &per_cpu(xen_vcpu_info, cpu);
114 info.mfn = arbitrary_virt_to_mfn(vcpup); 135 info.mfn = arbitrary_virt_to_mfn(vcpup);
115 info.offset = offset_in_page(vcpup); 136 info.offset = offset_in_page(vcpup);
116 137
@@ -125,6 +146,7 @@ static void xen_vcpu_setup(int cpu)
125 if (err) { 146 if (err) {
126 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); 147 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
127 have_vcpu_info_placement = 0; 148 have_vcpu_info_placement = 0;
149 clamp_max_cpus();
128 } else { 150 } else {
129 /* This cpu is using the registered vcpu info, even if 151 /* This cpu is using the registered vcpu info, even if
130 later ones fail to. */ 152 later ones fail to. */
@@ -731,7 +753,6 @@ static void set_xen_basic_apic_ops(void)
731 753
732#endif 754#endif
733 755
734
735static void xen_clts(void) 756static void xen_clts(void)
736{ 757{
737 struct multicall_space mcs; 758 struct multicall_space mcs;
@@ -926,10 +947,6 @@ static const struct pv_init_ops xen_init_ops __initdata = {
926 .patch = xen_patch, 947 .patch = xen_patch,
927}; 948};
928 949
929static const struct pv_time_ops xen_time_ops __initdata = {
930 .sched_clock = xen_sched_clock,
931};
932
933static const struct pv_cpu_ops xen_cpu_ops __initdata = { 950static const struct pv_cpu_ops xen_cpu_ops __initdata = {
934 .cpuid = xen_cpuid, 951 .cpuid = xen_cpuid,
935 952
@@ -1028,6 +1045,23 @@ static void xen_crash_shutdown(struct pt_regs *regs)
1028 xen_reboot(SHUTDOWN_crash); 1045 xen_reboot(SHUTDOWN_crash);
1029} 1046}
1030 1047
1048static int
1049xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
1050{
1051 xen_reboot(SHUTDOWN_crash);
1052 return NOTIFY_DONE;
1053}
1054
1055static struct notifier_block xen_panic_block = {
1056 .notifier_call= xen_panic_event,
1057};
1058
1059int xen_panic_handler_init(void)
1060{
1061 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
1062 return 0;
1063}
1064
1031static const struct machine_ops __initdata xen_machine_ops = { 1065static const struct machine_ops __initdata xen_machine_ops = {
1032 .restart = xen_restart, 1066 .restart = xen_restart,
1033 .halt = xen_machine_halt, 1067 .halt = xen_machine_halt,
@@ -1067,7 +1101,6 @@ asmlinkage void __init xen_start_kernel(void)
1067 /* Install Xen paravirt ops */ 1101 /* Install Xen paravirt ops */
1068 pv_info = xen_info; 1102 pv_info = xen_info;
1069 pv_init_ops = xen_init_ops; 1103 pv_init_ops = xen_init_ops;
1070 pv_time_ops = xen_time_ops;
1071 pv_cpu_ops = xen_cpu_ops; 1104 pv_cpu_ops = xen_cpu_ops;
1072 pv_apic_ops = xen_apic_ops; 1105 pv_apic_ops = xen_apic_ops;
1073 1106
@@ -1075,13 +1108,7 @@ asmlinkage void __init xen_start_kernel(void)
1075 x86_init.oem.arch_setup = xen_arch_setup; 1108 x86_init.oem.arch_setup = xen_arch_setup;
1076 x86_init.oem.banner = xen_banner; 1109 x86_init.oem.banner = xen_banner;
1077 1110
1078 x86_init.timers.timer_init = xen_time_init; 1111 xen_init_time_ops();
1079 x86_init.timers.setup_percpu_clockev = x86_init_noop;
1080 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
1081
1082 x86_platform.calibrate_tsc = xen_tsc_khz;
1083 x86_platform.get_wallclock = xen_get_wallclock;
1084 x86_platform.set_wallclock = xen_set_wallclock;
1085 1112
1086 /* 1113 /*
1087 * Set up some pagetable state before starting to set any ptes. 1114 * Set up some pagetable state before starting to set any ptes.
@@ -1145,6 +1172,10 @@ asmlinkage void __init xen_start_kernel(void)
1145 1172
1146 pgd = (pgd_t *)xen_start_info->pt_base; 1173 pgd = (pgd_t *)xen_start_info->pt_base;
1147 1174
1175 if (!xen_initial_domain())
1176 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1177
1178 __supported_pte_mask |= _PAGE_IOMAP;
1148 /* Don't do the full vcpu_info placement stuff until we have a 1179 /* Don't do the full vcpu_info placement stuff until we have a
1149 possible map and a non-dummy shared_info. */ 1180 possible map and a non-dummy shared_info. */
1150 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1181 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1206,3 +1237,139 @@ asmlinkage void __init xen_start_kernel(void)
1206 x86_64_start_reservations((char *)__pa_symbol(&boot_params)); 1237 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1207#endif 1238#endif
1208} 1239}
1240
1241static uint32_t xen_cpuid_base(void)
1242{
1243 uint32_t base, eax, ebx, ecx, edx;
1244 char signature[13];
1245
1246 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1247 cpuid(base, &eax, &ebx, &ecx, &edx);
1248 *(uint32_t *)(signature + 0) = ebx;
1249 *(uint32_t *)(signature + 4) = ecx;
1250 *(uint32_t *)(signature + 8) = edx;
1251 signature[12] = 0;
1252
1253 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
1254 return base;
1255 }
1256
1257 return 0;
1258}
1259
1260static int init_hvm_pv_info(int *major, int *minor)
1261{
1262 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1263 u64 pfn;
1264
1265 base = xen_cpuid_base();
1266 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1267
1268 *major = eax >> 16;
1269 *minor = eax & 0xffff;
1270 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
1271
1272 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1273
1274 pfn = __pa(hypercall_page);
1275 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1276
1277 xen_setup_features();
1278
1279 pv_info = xen_info;
1280 pv_info.kernel_rpl = 0;
1281
1282 xen_domain_type = XEN_HVM_DOMAIN;
1283
1284 return 0;
1285}
1286
1287void xen_hvm_init_shared_info(void)
1288{
1289 int cpu;
1290 struct xen_add_to_physmap xatp;
1291 static struct shared_info *shared_info_page = 0;
1292
1293 if (!shared_info_page)
1294 shared_info_page = (struct shared_info *)
1295 extend_brk(PAGE_SIZE, PAGE_SIZE);
1296 xatp.domid = DOMID_SELF;
1297 xatp.idx = 0;
1298 xatp.space = XENMAPSPACE_shared_info;
1299 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1300 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1301 BUG();
1302
1303 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1304
1305 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1306 * page, we use it in the event channel upcall and in some pvclock
1307 * related functions. We don't need the vcpu_info placement
1308 * optimizations because we don't use any pv_mmu or pv_irq op on
1309 * HVM.
1310 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1311 * online but xen_hvm_init_shared_info is run at resume time too and
1312 * in that case multiple vcpus might be online. */
1313 for_each_online_cpu(cpu) {
1314 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1315 }
1316}
1317
1318#ifdef CONFIG_XEN_PVHVM
1319static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1320 unsigned long action, void *hcpu)
1321{
1322 int cpu = (long)hcpu;
1323 switch (action) {
1324 case CPU_UP_PREPARE:
1325 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1326 break;
1327 default:
1328 break;
1329 }
1330 return NOTIFY_OK;
1331}
1332
1333static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
1334 .notifier_call = xen_hvm_cpu_notify,
1335};
1336
1337static void __init xen_hvm_guest_init(void)
1338{
1339 int r;
1340 int major, minor;
1341
1342 r = init_hvm_pv_info(&major, &minor);
1343 if (r < 0)
1344 return;
1345
1346 xen_hvm_init_shared_info();
1347
1348 if (xen_feature(XENFEAT_hvm_callback_vector))
1349 xen_have_vector_callback = 1;
1350 register_cpu_notifier(&xen_hvm_cpu_notifier);
1351 xen_unplug_emulated_devices();
1352 have_vcpu_info_placement = 0;
1353 x86_init.irqs.intr_init = xen_init_IRQ;
1354 xen_hvm_init_time_ops();
1355 xen_hvm_init_mmu_ops();
1356}
1357
1358static bool __init xen_hvm_platform(void)
1359{
1360 if (xen_pv_domain())
1361 return false;
1362
1363 if (!xen_cpuid_base())
1364 return false;
1365
1366 return true;
1367}
1368
1369const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
1370 .name = "Xen HVM",
1371 .detect = xen_hvm_platform,
1372 .init_platform = xen_hvm_guest_init,
1373};
1374EXPORT_SYMBOL(x86_hyper_xen_hvm);
1375#endif
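The new HVM support at the end of enlighten.c leans on the hypervisor CPUID leaves: xen_cpuid_base() scans leaves 0x40000000-0x4000ff00 in steps of 0x100 for the "XenVMMXenVMM" signature spread across EBX/ECX/EDX, then init_hvm_pv_info() reads the version from leaf base+1 and the hypercall-page MSR from leaf base+2. A self-contained sketch of the signature scan; cpuid_sketch() is a hypothetical stub standing in for the real cpuid instruction:

#include <stdint.h>
#include <string.h>

static void cpuid_sketch(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
			 uint32_t *ecx, uint32_t *edx)
{
	(void)leaf;			/* stub: a real version executes cpuid */
	*eax = *ebx = *ecx = *edx = 0;
}

static uint32_t find_xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char signature[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		cpuid_sketch(base, &eax, &ebx, &ecx, &edx);
		memcpy(signature + 0, &ebx, 4);
		memcpy(signature + 4, &ecx, 4);
		memcpy(signature + 8, &edx, 4);
		signature[12] = 0;

		/* EAX reports the highest leaf; Xen provides at least base + 2. */
		if (!strcmp(signature, "XenVMMXenVMM") && (eax - base) >= 2)
			return base;
	}
	return 0;
}

int main(void)
{
	return find_xen_cpuid_base() ? 1 : 0;	/* 0 with the stub above */
}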
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 914f04695ce..42086ac406a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,6 +42,7 @@
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/bug.h> 44#include <linux/bug.h>
45#include <linux/vmalloc.h>
45#include <linux/module.h> 46#include <linux/module.h>
46#include <linux/gfp.h> 47#include <linux/gfp.h>
47 48
@@ -51,14 +52,19 @@
51#include <asm/mmu_context.h> 52#include <asm/mmu_context.h>
52#include <asm/setup.h> 53#include <asm/setup.h>
53#include <asm/paravirt.h> 54#include <asm/paravirt.h>
55#include <asm/e820.h>
54#include <asm/linkage.h> 56#include <asm/linkage.h>
57#include <asm/page.h>
55 58
56#include <asm/xen/hypercall.h> 59#include <asm/xen/hypercall.h>
57#include <asm/xen/hypervisor.h> 60#include <asm/xen/hypervisor.h>
58 61
62#include <xen/xen.h>
59#include <xen/page.h> 63#include <xen/page.h>
60#include <xen/interface/xen.h> 64#include <xen/interface/xen.h>
65#include <xen/interface/hvm/hvm_op.h>
61#include <xen/interface/version.h> 66#include <xen/interface/version.h>
67#include <xen/interface/memory.h>
62#include <xen/hvc-console.h> 68#include <xen/hvc-console.h>
63 69
64#include "multicalls.h" 70#include "multicalls.h"
@@ -67,6 +73,13 @@
67 73
68#define MMU_UPDATE_HISTO 30 74#define MMU_UPDATE_HISTO 30
69 75
76/*
77 * Protects atomic reservation decrease/increase against concurrent increases.
78 * Also protects non-atomic updates of current_pages and driver_pages, and
79 * balloon lists.
80 */
81DEFINE_SPINLOCK(xen_reservation_lock);
82
70#ifdef CONFIG_XEN_DEBUG_FS 83#ifdef CONFIG_XEN_DEBUG_FS
71 84
72static struct { 85static struct {
@@ -377,6 +390,28 @@ static bool xen_page_pinned(void *ptr)
377 return PagePinned(page); 390 return PagePinned(page);
378} 391}
379 392
393static bool xen_iomap_pte(pte_t pte)
394{
395 return pte_flags(pte) & _PAGE_IOMAP;
396}
397
398static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
399{
400 struct multicall_space mcs;
401 struct mmu_update *u;
402
403 mcs = xen_mc_entry(sizeof(*u));
404 u = mcs.args;
405
406 /* ptep might be kmapped when using 32-bit HIGHPTE */
407 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
408 u->val = pte_val_ma(pteval);
409
410 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
411
412 xen_mc_issue(PARAVIRT_LAZY_MMU);
413}
414
380static void xen_extend_mmu_update(const struct mmu_update *update) 415static void xen_extend_mmu_update(const struct mmu_update *update)
381{ 416{
382 struct multicall_space mcs; 417 struct multicall_space mcs;
@@ -453,6 +488,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
453void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 488void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
454 pte_t *ptep, pte_t pteval) 489 pte_t *ptep, pte_t pteval)
455{ 490{
491 if (xen_iomap_pte(pteval)) {
492 xen_set_iomap_pte(ptep, pteval);
493 goto out;
494 }
495
456 ADD_STATS(set_pte_at, 1); 496 ADD_STATS(set_pte_at, 1);
457// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 497// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
458 ADD_STATS(set_pte_at_current, mm == current->mm); 498 ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -523,8 +563,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
523 return val; 563 return val;
524} 564}
525 565
566static pteval_t iomap_pte(pteval_t val)
567{
568 if (val & _PAGE_PRESENT) {
569 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
570 pteval_t flags = val & PTE_FLAGS_MASK;
571
572 /* We assume the pte frame number is a MFN, so
573 just use it as-is. */
574 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
575 }
576
577 return val;
578}
579
526pteval_t xen_pte_val(pte_t pte) 580pteval_t xen_pte_val(pte_t pte)
527{ 581{
582 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
583 return pte.pte;
584
528 return pte_mfn_to_pfn(pte.pte); 585 return pte_mfn_to_pfn(pte.pte);
529} 586}
530PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 587PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -537,7 +594,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
537 594
538pte_t xen_make_pte(pteval_t pte) 595pte_t xen_make_pte(pteval_t pte)
539{ 596{
540 pte = pte_pfn_to_mfn(pte); 597 phys_addr_t addr = (pte & PTE_PFN_MASK);
598
599 /*
600 * Unprivileged domains are allowed to do IOMAPpings for
601 * PCI passthrough, but not map ISA space. The ISA
602 * mappings are just dummy local mappings to keep other
603 * parts of the kernel happy.
604 */
605 if (unlikely(pte & _PAGE_IOMAP) &&
606 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
607 pte = iomap_pte(pte);
608 } else {
609 pte &= ~_PAGE_IOMAP;
610 pte = pte_pfn_to_mfn(pte);
611 }
612
541 return native_make_pte(pte); 613 return native_make_pte(pte);
542} 614}
543PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 615PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
@@ -593,6 +665,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
593 665
594void xen_set_pte(pte_t *ptep, pte_t pte) 666void xen_set_pte(pte_t *ptep, pte_t pte)
595{ 667{
668 if (xen_iomap_pte(pte)) {
669 xen_set_iomap_pte(ptep, pte);
670 return;
671 }
672
596 ADD_STATS(pte_update, 1); 673 ADD_STATS(pte_update, 1);
597// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); 674// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
598 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); 675 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
@@ -609,6 +686,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
609#ifdef CONFIG_X86_PAE 686#ifdef CONFIG_X86_PAE
610void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 687void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
611{ 688{
689 if (xen_iomap_pte(pte)) {
690 xen_set_iomap_pte(ptep, pte);
691 return;
692 }
693
612 set_64bit((u64 *)ptep, native_pte_val(pte)); 694 set_64bit((u64 *)ptep, native_pte_val(pte));
613} 695}
614 696
@@ -935,8 +1017,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
935 read-only, and can be pinned. */ 1017 read-only, and can be pinned. */
936static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 1018static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
937{ 1019{
938 vm_unmap_aliases();
939
940 xen_mc_batch(); 1020 xen_mc_batch();
941 1021
942 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 1022 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1500,7 +1580,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
1500 if (PagePinned(virt_to_page(mm->pgd))) { 1580 if (PagePinned(virt_to_page(mm->pgd))) {
1501 SetPagePinned(page); 1581 SetPagePinned(page);
1502 1582
1503 vm_unmap_aliases();
1504 if (!PageHighMem(page)) { 1583 if (!PageHighMem(page)) {
1505 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); 1584 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1506 if (level == PT_PTE && USE_SPLIT_PTLOCKS) 1585 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
@@ -1811,9 +1890,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1811 pte = pfn_pte(phys, prot); 1890 pte = pfn_pte(phys, prot);
1812 break; 1891 break;
1813 1892
1814 default: 1893 case FIX_PARAVIRT_BOOTMAP:
1894 /* This is an MFN, but it isn't an IO mapping from the
1895 IO domain */
1815 pte = mfn_pte(phys, prot); 1896 pte = mfn_pte(phys, prot);
1816 break; 1897 break;
1898
1899 default:
1900 /* By default, set_fixmap is used for hardware mappings */
1901 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1902 break;
1817 } 1903 }
1818 1904
1819 __native_set_fixmap(idx, pte); 1905 __native_set_fixmap(idx, pte);
@@ -1939,8 +2025,240 @@ void __init xen_init_mmu_ops(void)
1939 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2025 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
1940 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2026 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
1941 pv_mmu_ops = xen_mmu_ops; 2027 pv_mmu_ops = xen_mmu_ops;
2028
2029 vmap_lazy_unmap = false;
2030}
2031
2032/* Protected by xen_reservation_lock. */
2033#define MAX_CONTIG_ORDER 9 /* 2MB */
2034static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2035
2036#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2037static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2038 unsigned long *in_frames,
2039 unsigned long *out_frames)
2040{
2041 int i;
2042 struct multicall_space mcs;
2043
2044 xen_mc_batch();
2045 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2046 mcs = __xen_mc_entry(0);
2047
2048 if (in_frames)
2049 in_frames[i] = virt_to_mfn(vaddr);
2050
2051 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2052 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2053
2054 if (out_frames)
2055 out_frames[i] = virt_to_pfn(vaddr);
2056 }
2057 xen_mc_issue(0);
2058}
2059
2060/*
2061 * Update the pfn-to-mfn mappings for a virtual address range, either to
2062 * point to an array of mfns, or contiguously from a single starting
2063 * mfn.
2064 */
2065static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2066 unsigned long *mfns,
2067 unsigned long first_mfn)
2068{
2069 unsigned i, limit;
2070 unsigned long mfn;
2071
2072 xen_mc_batch();
2073
2074 limit = 1u << order;
2075 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2076 struct multicall_space mcs;
2077 unsigned flags;
2078
2079 mcs = __xen_mc_entry(0);
2080 if (mfns)
2081 mfn = mfns[i];
2082 else
2083 mfn = first_mfn + i;
2084
2085 if (i < (limit - 1))
2086 flags = 0;
2087 else {
2088 if (order == 0)
2089 flags = UVMF_INVLPG | UVMF_ALL;
2090 else
2091 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2092 }
2093
2094 MULTI_update_va_mapping(mcs.mc, vaddr,
2095 mfn_pte(mfn, PAGE_KERNEL), flags);
2096
2097 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2098 }
2099
2100 xen_mc_issue(0);
2101}
2102
2103/*
2104 * Perform the hypercall to exchange a region of our pfns to point to
2105 * memory with the required contiguous alignment. Takes the pfns as
2106 * input, and populates mfns as output.
2107 *
2108 * Returns a success code indicating whether the hypervisor was able to
2109 * satisfy the request or not.
2110 */
2111static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2112 unsigned long *pfns_in,
2113 unsigned long extents_out,
2114 unsigned int order_out,
2115 unsigned long *mfns_out,
2116 unsigned int address_bits)
2117{
2118 long rc;
2119 int success;
2120
2121 struct xen_memory_exchange exchange = {
2122 .in = {
2123 .nr_extents = extents_in,
2124 .extent_order = order_in,
2125 .extent_start = pfns_in,
2126 .domid = DOMID_SELF
2127 },
2128 .out = {
2129 .nr_extents = extents_out,
2130 .extent_order = order_out,
2131 .extent_start = mfns_out,
2132 .address_bits = address_bits,
2133 .domid = DOMID_SELF
2134 }
2135 };
2136
2137 BUG_ON(extents_in << order_in != extents_out << order_out);
2138
2139 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2140 success = (exchange.nr_exchanged == extents_in);
2141
2142 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2143 BUG_ON(success && (rc != 0));
2144
2145 return success;
1942} 2146}
1943 2147
2148int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2149 unsigned int address_bits)
2150{
2151 unsigned long *in_frames = discontig_frames, out_frame;
2152 unsigned long flags;
2153 int success;
2154
2155 /*
2156 * Currently an auto-translated guest will not perform I/O, nor will
2157 * it require PAE page directories below 4GB. Therefore any calls to
2158 * this function are redundant and can be ignored.
2159 */
2160
2161 if (xen_feature(XENFEAT_auto_translated_physmap))
2162 return 0;
2163
2164 if (unlikely(order > MAX_CONTIG_ORDER))
2165 return -ENOMEM;
2166
2167 memset((void *) vstart, 0, PAGE_SIZE << order);
2168
2169 spin_lock_irqsave(&xen_reservation_lock, flags);
2170
2171 /* 1. Zap current PTEs, remembering MFNs. */
2172 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2173
2174 /* 2. Get a new contiguous memory extent. */
2175 out_frame = virt_to_pfn(vstart);
2176 success = xen_exchange_memory(1UL << order, 0, in_frames,
2177 1, order, &out_frame,
2178 address_bits);
2179
2180 /* 3. Map the new extent in place of old pages. */
2181 if (success)
2182 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2183 else
2184 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2185
2186 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2187
2188 return success ? 0 : -ENOMEM;
2189}
2190EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2191
2192void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2193{
2194 unsigned long *out_frames = discontig_frames, in_frame;
2195 unsigned long flags;
2196 int success;
2197
2198 if (xen_feature(XENFEAT_auto_translated_physmap))
2199 return;
2200
2201 if (unlikely(order > MAX_CONTIG_ORDER))
2202 return;
2203
2204 memset((void *) vstart, 0, PAGE_SIZE << order);
2205
2206 spin_lock_irqsave(&xen_reservation_lock, flags);
2207
2208 /* 1. Find start MFN of contiguous extent. */
2209 in_frame = virt_to_mfn(vstart);
2210
2211 /* 2. Zap current PTEs. */
2212 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2213
2214 /* 3. Do the exchange for non-contiguous MFNs. */
2215 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2216 0, out_frames, 0);
2217
2218 /* 4. Map new pages in place of old pages. */
2219 if (success)
2220 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2221 else
2222 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2223
2224 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2225}
2226EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2227
2228#ifdef CONFIG_XEN_PVHVM
2229static void xen_hvm_exit_mmap(struct mm_struct *mm)
2230{
2231 struct xen_hvm_pagetable_dying a;
2232 int rc;
2233
2234 a.domid = DOMID_SELF;
2235 a.gpa = __pa(mm->pgd);
2236 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2237 WARN_ON_ONCE(rc < 0);
2238}
2239
2240static int is_pagetable_dying_supported(void)
2241{
2242 struct xen_hvm_pagetable_dying a;
2243 int rc = 0;
2244
2245 a.domid = DOMID_SELF;
2246 a.gpa = 0x00;
2247 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2248 if (rc < 0) {
2249 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2250 return 0;
2251 }
2252 return 1;
2253}
2254
2255void __init xen_hvm_init_mmu_ops(void)
2256{
2257 if (is_pagetable_dying_supported())
2258 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2259}
2260#endif
2261
1944#ifdef CONFIG_XEN_DEBUG_FS 2262#ifdef CONFIG_XEN_DEBUG_FS
1945 2263
1946static struct dentry *d_mmu_debug; 2264static struct dentry *d_mmu_debug;
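xen_exchange_memory() trades one set of extents for another and requires both sides to cover the same number of pages: extents_in << order_in must equal extents_out << order_out, which is exactly what the BUG_ON asserts. xen_create_contiguous_region() passes 1UL << order single-page extents in and asks for one extent of the given order back; xen_destroy_contiguous_region() does the reverse. A tiny arithmetic sketch of the invariant for order 9 (the 2MB MAX_CONTIG_ORDER case):

#include <assert.h>

int main(void)
{
	unsigned long extents_in = 512, order_in = 0;	/* 512 single pages */
	unsigned long extents_out = 1, order_out = 9;	/* one 2MB extent   */

	/* Both sides of the exchange describe the same page count. */
	assert((extents_in << order_in) == (extents_out << order_out));
	return 0;
}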
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7f5ec..fa938c4aa2f 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
60unsigned long xen_read_cr2_direct(void); 60unsigned long xen_read_cr2_direct(void);
61 61
62extern void xen_init_mmu_ops(void); 62extern void xen_init_mmu_ops(void);
63extern void xen_hvm_init_mmu_ops(void);
63#endif /* _XEN_MMU_H */ 64#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 00000000000..a013ec9d0c5
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,58 @@
1/* Glue code to lib/swiotlb-xen.c */
2
3#include <linux/dma-mapping.h>
4#include <xen/swiotlb-xen.h>
5
6#include <asm/xen/hypervisor.h>
7#include <xen/xen.h>
8
9int xen_swiotlb __read_mostly;
10
11static struct dma_map_ops xen_swiotlb_dma_ops = {
12 .mapping_error = xen_swiotlb_dma_mapping_error,
13 .alloc_coherent = xen_swiotlb_alloc_coherent,
14 .free_coherent = xen_swiotlb_free_coherent,
15 .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
16 .sync_single_for_device = xen_swiotlb_sync_single_for_device,
17 .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
18 .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
19 .map_sg = xen_swiotlb_map_sg_attrs,
20 .unmap_sg = xen_swiotlb_unmap_sg_attrs,
21 .map_page = xen_swiotlb_map_page,
22 .unmap_page = xen_swiotlb_unmap_page,
23 .dma_supported = xen_swiotlb_dma_supported,
24};
25
26/*
27 * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
28 *
29 * This returns non-zero if we are forced to use xen_swiotlb (by the boot
30 * option).
31 */
32int __init pci_xen_swiotlb_detect(void)
33{
34
35 /* If running as PV guest, either iommu=soft, or swiotlb=force will
36 * activate this IOMMU. If running as PV privileged, activate it
 37 * regardless.
38 */
39 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
40 (xen_pv_domain()))
41 xen_swiotlb = 1;
42
43 /* If we are running under Xen, we MUST disable the native SWIOTLB.
44 * Don't worry about swiotlb_force flag activating the native, as
45 * the 'swiotlb' flag is the only one turning it on. */
46 if (xen_pv_domain())
47 swiotlb = 0;
48
49 return xen_swiotlb;
50}
51
52void __init pci_xen_swiotlb_init(void)
53{
54 if (xen_swiotlb) {
55 xen_swiotlb_init(1);
56 dma_ops = &xen_swiotlb_dma_ops;
57 }
58}
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 00000000000..554c002a1e1
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,137 @@
1/******************************************************************************
2 * platform-pci-unplug.c
3 *
4 * Xen platform PCI device driver
5 * Copyright (c) 2010, Citrix
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 *
20 */
21
22#include <linux/init.h>
23#include <linux/io.h>
24#include <linux/module.h>
25
26#include <xen/platform_pci.h>
27
28#define XEN_PLATFORM_ERR_MAGIC -1
29#define XEN_PLATFORM_ERR_PROTOCOL -2
30#define XEN_PLATFORM_ERR_BLACKLIST -3
31
32/* store the value of xen_emul_unplug after the unplug is done */
33int xen_platform_pci_unplug;
34EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
35#ifdef CONFIG_XEN_PVHVM
36static int xen_emul_unplug;
37
38static int __init check_platform_magic(void)
39{
40 short magic;
41 char protocol;
42
43 magic = inw(XEN_IOPORT_MAGIC);
44 if (magic != XEN_IOPORT_MAGIC_VAL) {
45 printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
46 return XEN_PLATFORM_ERR_MAGIC;
47 }
48
49 protocol = inb(XEN_IOPORT_PROTOVER);
50
51 printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
52 protocol);
53
54 switch (protocol) {
55 case 1:
56 outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
57 outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
58 if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
59 printk(KERN_ERR "Xen Platform: blacklisted by host\n");
60 return XEN_PLATFORM_ERR_BLACKLIST;
61 }
62 break;
63 default:
64 printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
65 return XEN_PLATFORM_ERR_PROTOCOL;
66 }
67
68 return 0;
69}
70
71void __init xen_unplug_emulated_devices(void)
72{
73 int r;
74
75 /* check the version of the xen platform PCI device */
76 r = check_platform_magic();
77 /* If the version matches enable the Xen platform PCI driver.
78 * Also enable the Xen platform PCI driver if the version is really old
79 * and the user told us to ignore it. */
80 if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
81 (xen_emul_unplug & XEN_UNPLUG_IGNORE)))
82 return;
83 /* Set the default value of xen_emul_unplug depending on whether or
84 * not the Xen PV frontends and the Xen platform PCI driver have
85 * been compiled for this kernel (modules or built-in are both OK). */
86 if (!xen_emul_unplug) {
87 if (xen_must_unplug_nics()) {
88 printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
89 "been compiled for this kernel: unplug emulated NICs.\n");
90 xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
91 }
92 if (xen_must_unplug_disks()) {
93 printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
94 "been compiled for this kernel: unplug emulated disks.\n"
95 "You might have to change the root device\n"
96 "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
97 "in your root= kernel command line option\n");
98 xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
99 }
100 }
101 /* Now unplug the emulated devices */
102 if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE))
103 outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
104 xen_platform_pci_unplug = xen_emul_unplug;
105}
106
107static int __init parse_xen_emul_unplug(char *arg)
108{
109 char *p, *q;
110 int l;
111
112 for (p = arg; p; p = q) {
113 q = strchr(p, ',');
114 if (q) {
115 l = q - p;
116 q++;
117 } else {
118 l = strlen(p);
119 }
120 if (!strncmp(p, "all", l))
121 xen_emul_unplug |= XEN_UNPLUG_ALL;
122 else if (!strncmp(p, "ide-disks", l))
123 xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
124 else if (!strncmp(p, "aux-ide-disks", l))
125 xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
126 else if (!strncmp(p, "nics", l))
127 xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
128 else if (!strncmp(p, "ignore", l))
129 xen_emul_unplug |= XEN_UNPLUG_IGNORE;
130 else
131 printk(KERN_WARNING "unrecognised option '%s' "
132 "in parameter 'xen_emul_unplug'\n", p);
133 }
134 return 0;
135}
136early_param("xen_emul_unplug", parse_xen_emul_unplug);
137#endif
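Because parse_xen_emul_unplug() is registered with early_param(), the option is given directly on the kernel command line as a comma-separated list. For example, an HVM guest booted with

	xen_emul_unplug=ide-disks,nics

asks the platform device to unplug the emulated IDE disks and NICs, while xen_emul_unplug=ignore skips the outw() to XEN_IOPORT_UNPLUG and leaves the emulated devices in place.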
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f47cd..328b0030542 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
20#include <xen/page.h> 20#include <xen/page.h>
21#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
22#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h>
23#include <xen/features.h> 24#include <xen/features.h>
24 25
25#include "xen-ops.h" 26#include "xen-ops.h"
@@ -32,6 +33,73 @@ extern void xen_sysenter_target(void);
32extern void xen_syscall_target(void); 33extern void xen_syscall_target(void);
33extern void xen_syscall32_target(void); 34extern void xen_syscall32_target(void);
34 35
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr)
38{
39 struct xen_memory_reservation reservation = {
40 .address_bits = 0,
41 .extent_order = 0,
42 .domid = DOMID_SELF
43 };
44 unsigned long start, end;
45 unsigned long len = 0;
46 unsigned long pfn;
47 int ret;
48
49 start = PFN_UP(start_addr);
50 end = PFN_DOWN(end_addr);
51
52 if (end <= start)
53 return 0;
54
55 printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
56 start, end);
57 for(pfn = start; pfn < end; pfn++) {
58 unsigned long mfn = pfn_to_mfn(pfn);
59
60 /* Make sure pfn exists to start with */
61 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
62 continue;
63
64 set_xen_guest_handle(reservation.extent_start, &mfn);
65 reservation.nr_extents = 1;
66
67 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
68 &reservation);
69 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
70 start, end, ret);
71 if (ret == 1) {
72 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
73 len++;
74 }
75 }
76 printk(KERN_CONT "%ld pages freed\n", len);
77
78 return len;
79}
80
81static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
82 const struct e820map *e820)
83{
84 phys_addr_t max_addr = PFN_PHYS(max_pfn);
85 phys_addr_t last_end = 0;
86 unsigned long released = 0;
87 int i;
88
89 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
90 phys_addr_t end = e820->map[i].addr;
91 end = min(max_addr, end);
92
93 released += xen_release_chunk(last_end, end);
94 last_end = e820->map[i].addr + e820->map[i].size;
95 }
96
97 if (last_end < max_addr)
98 released += xen_release_chunk(last_end, max_addr);
99
100 printk(KERN_INFO "released %ld pages of unused memory\n", released);
101 return released;
102}
35 103
36/** 104/**
37 * machine_specific_memory_setup - Hook for machine specific memory setup. 105 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -67,6 +135,8 @@ char * __init xen_memory_setup(void)
67 135
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 136 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 137
138 xen_return_unused_memory(xen_start_info->nr_pages, &e820);
139
70 return "Xen"; 140 return "Xen";
71} 141}
72 142
@@ -156,6 +226,8 @@ void __init xen_arch_setup(void)
156 struct physdev_set_iopl set_iopl; 226 struct physdev_set_iopl set_iopl;
157 int rc; 227 int rc;
158 228
229 xen_panic_handler_init();
230
159 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 231 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 232 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
161 233
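xen_return_unused_memory() walks the e820 map in order and releases every frame that falls in a gap between mapped regions, clamped to the domain's nr_pages; xen_release_chunk() then hands each frame back with XENMEM_decrease_reservation. A simplified, self-contained sketch of the gap walk; region_sketch and count_pages() are hypothetical, and the kernel version additionally checks each frame's p2m entry before releasing it:

#include <stdio.h>

struct region_sketch {
	unsigned long long addr, size;
};

/* Visit every [last_end, start) gap below max_addr in a sorted region map. */
static unsigned long walk_gaps(const struct region_sketch *map, int nr,
			       unsigned long long max_addr,
			       unsigned long (*release)(unsigned long long,
							unsigned long long))
{
	unsigned long long last_end = 0;
	unsigned long released = 0;
	int i;

	for (i = 0; i < nr && last_end < max_addr; i++) {
		unsigned long long end = map[i].addr < max_addr ? map[i].addr
								: max_addr;

		released += release(last_end, end);
		last_end = map[i].addr + map[i].size;
	}
	if (last_end < max_addr)
		released += release(last_end, max_addr);
	return released;
}

static unsigned long count_pages(unsigned long long start, unsigned long long end)
{
	return end > start ? (unsigned long)((end - start) >> 12) : 0;
}

int main(void)
{
	struct region_sketch map[] = {
		{ 0x00000000, 0x0009f000 },	/* low RAM  */
		{ 0x00100000, 0x3ff00000 },	/* main RAM */
	};

	/* Gap between 0x9f000 and 0x100000: 97 pages. */
	printf("released %lu pages\n", walk_gaps(map, 2, 0x40000000ULL, count_pages));
	return 0;
}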
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a29693fd313..25f232b18a8 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -394,6 +394,8 @@ static void stop_self(void *v)
394 load_cr3(swapper_pg_dir); 394 load_cr3(swapper_pg_dir);
395 /* should set up a minimal gdt */ 395 /* should set up a minimal gdt */
396 396
397 set_cpu_online(cpu, false);
398
397 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); 399 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
398 BUG(); 400 BUG();
399} 401}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c66110803..1d789d56877 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -26,6 +26,18 @@ void xen_pre_suspend(void)
 	BUG();
 }
 
+void xen_hvm_post_suspend(int suspend_cancelled)
+{
+	int cpu;
+	xen_hvm_init_shared_info();
+	xen_callback_vector();
+	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+		for_each_online_cpu(cpu) {
+			xen_setup_runstate_info(cpu);
+		}
+	}
+}
+
 void xen_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b3c6c59ed30..1a5353a753f 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
 #include <asm/xen/hypercall.h>
 
 #include <xen/events.h>
+#include <xen/features.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
 
@@ -155,47 +156,8 @@ static void do_stolen_accounting(void)
 	account_idle_ticks(ticks);
 }
 
-/*
- * Xen sched_clock implementation. Returns the number of unstolen
- * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
- * states.
- */
-unsigned long long xen_sched_clock(void)
-{
-	struct vcpu_runstate_info state;
-	cycle_t now;
-	u64 ret;
-	s64 offset;
-
-	/*
-	 * Ideally sched_clock should be called on a per-cpu basis
-	 * anyway, so preempt should already be disabled, but that's
-	 * not current practice at the moment.
-	 */
-	preempt_disable();
-
-	now = xen_clocksource_read();
-
-	get_runstate_snapshot(&state);
-
-	WARN_ON(state.state != RUNSTATE_running);
-
-	offset = now - state.state_entry_time;
-	if (offset < 0)
-		offset = 0;
-
-	ret = state.time[RUNSTATE_blocked] +
-		state.time[RUNSTATE_running] +
-		offset;
-
-	preempt_enable();
-
-	return ret;
-}
-
-
 /* Get the TSC speed from Xen */
-unsigned long xen_tsc_khz(void)
+static unsigned long xen_tsc_khz(void)
 {
 	struct pvclock_vcpu_time_info *info =
 		&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts)
 	put_cpu_var(xen_vcpu);
 }
 
-unsigned long xen_get_wallclock(void)
+static unsigned long xen_get_wallclock(void)
 {
 	struct timespec ts;
 
@@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void)
 	return ts.tv_sec;
 }
 
-int xen_set_wallclock(unsigned long now)
+static int xen_set_wallclock(unsigned long now)
 {
 	/* do nothing for domU */
 	return -1;
@@ -473,7 +435,11 @@ void xen_timer_resume(void)
 	}
 }
 
-__init void xen_time_init(void)
+static const struct pv_time_ops xen_time_ops __initdata = {
+	.sched_clock = xen_clocksource_read,
+};
+
+static __init void xen_time_init(void)
 {
 	int cpu = smp_processor_id();
 	struct timespec tp;
@@ -497,3 +463,47 @@ __init void xen_time_init(void)
 	xen_setup_timer(cpu);
 	xen_setup_cpu_clockevents();
 }
+
+__init void xen_init_time_ops(void)
+{
+	pv_time_ops = xen_time_ops;
+
+	x86_init.timers.timer_init = xen_time_init;
+	x86_init.timers.setup_percpu_clockev = x86_init_noop;
+	x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+	x86_platform.calibrate_tsc = xen_tsc_khz;
+	x86_platform.get_wallclock = xen_get_wallclock;
+	x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+	int cpu = smp_processor_id();
+	xen_setup_runstate_info(cpu);
+	xen_setup_timer(cpu);
+	xen_setup_cpu_clockevents();
+}
+
+__init void xen_hvm_init_time_ops(void)
+{
+	/* vector callback is needed otherwise we cannot receive interrupts
+	 * on cpu > 0 */
+	if (!xen_have_vector_callback && num_present_cpus() > 1)
+		return;
+	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+		printk(KERN_INFO "Xen doesn't support pvclock on HVM, "
+		       "disable pv timer\n");
+		return;
+	}
+
+	pv_time_ops = xen_time_ops;
+	x86_init.timers.setup_percpu_clockev = xen_time_init;
+	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+	x86_platform.calibrate_tsc = xen_tsc_khz;
+	x86_platform.get_wallclock = xen_get_wallclock;
+	x86_platform.set_wallclock = xen_set_wallclock;
+}
+#endif
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a300bc..7c8ab86163e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -38,6 +38,10 @@ void xen_enable_sysenter(void);
 void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void __init xen_unplug_emulated_devices(void);
+
 void __init xen_build_dynamic_phys_to_machine(void);
 
 void xen_init_irq_ops(void);
@@ -46,11 +50,8 @@ void xen_setup_runstate_info(int cpu);
 void xen_teardown_timer(int cpu);
 cycle_t xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
-unsigned long xen_tsc_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -101,4 +102,6 @@ void xen_sysret32(void);
 void xen_sysret64(void);
 void xen_adjust_exception_frame(void);
 
+extern int xen_panic_handler_init(void);
+
 #endif /* XEN_OPS_H */