author     Ingo Molnar <mingo@elte.hu>   2009-12-09 02:24:25 -0500
committer  Ingo Molnar <mingo@elte.hu>   2009-12-09 02:24:57 -0500
commit     4c68db38c85188824b21698842b42a62b4f78657 (patch)
tree       3ee1c3b22af6713adf669a3bb452ce82bc7fe495 /arch/x86
parent     5c0e9f28da84c68ce0ae68b7a75faaf862e156e2 (diff)
parent     2b876f95d03e226394b5d360c86127cbefaf614b (diff)
Merge branch 'linus' into x86/urgent
Merge reason: We want to queue up a dependent patch.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 8
-rw-r--r--  arch/x86/Kconfig.cpu | 14
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 3
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 3
-rw-r--r--  arch/x86/boot/video.c | 6
-rw-r--r--  arch/x86/crypto/Makefile | 3
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 517
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_asm.S | 157
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c | 333
-rw-r--r--  arch/x86/ia32/ia32entry.S | 3
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 56
-rw-r--r--  arch/x86/include/asm/acpi.h | 3
-rw-r--r--  arch/x86/include/asm/cache.h | 7
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 1
-rw-r--r--  arch/x86/include/asm/e820.h | 23
-rw-r--r--  arch/x86/include/asm/elf.h | 20
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 2
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/hpet.h | 7
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 4
-rw-r--r--  arch/x86/include/asm/i387.h | 7
-rw-r--r--  arch/x86/include/asm/inst.h | 150
-rw-r--r--  arch/x86/include/asm/irq.h | 2
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 2
-rw-r--r--  arch/x86/include/asm/k8.h | 5
-rw-r--r--  arch/x86/include/asm/kvm.h | 30
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 34
-rw-r--r--  arch/x86/include/asm/microcode.h | 2
-rw-r--r--  arch/x86/include/asm/mpspec.h | 11
-rw-r--r--  arch/x86/include/asm/page_types.h | 3
-rw-r--r--  arch/x86/include/asm/pgtable.h | 6
-rw-r--r--  arch/x86/include/asm/proto.h | 17
-rw-r--r--  arch/x86/include/asm/sections.h | 6
-rw-r--r--  arch/x86/include/asm/svm.h | 3
-rw-r--r--  arch/x86/include/asm/sys_ia32.h | 5
-rw-r--r--  arch/x86/include/asm/thread_info.h | 7
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 3
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 4
-rw-r--r--  arch/x86/include/asm/x86_init.h | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 24
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 49
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 5
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 19
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 51
-rw-r--r--  arch/x86/kernel/entry_64.S | 4
-rw-r--r--  arch/x86/kernel/ftrace.c | 17
-rw-r--r--  arch/x86/kernel/head_32.S | 18
-rw-r--r--  arch/x86/kernel/head_64.S | 3
-rw-r--r--  arch/x86/kernel/hpet.c | 77
-rw-r--r--  arch/x86/kernel/irq.c | 20
-rw-r--r--  arch/x86/kernel/irqinit.c | 4
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 6
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 87
-rw-r--r--  arch/x86/kernel/microcode_core.c | 8
-rw-r--r--  arch/x86/kernel/mpparse.c | 44
-rw-r--r--  arch/x86/kernel/process.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 32
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 1
-rw-r--r--  arch/x86/kernel/setup.c | 106
-rw-r--r--  arch/x86/kernel/signal.c | 3
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 4
-rw-r--r--  arch/x86/kernel/uv_time.c | 80
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 2
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 38
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 7
-rw-r--r--  arch/x86/kernel/x86_init.c | 2
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 159
-rw-r--r--  arch/x86/kvm/i8254.c | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 44
-rw-r--r--  arch/x86/kvm/irq.h | 7
-rw-r--r--  arch/x86/kvm/lapic.c | 8
-rw-r--r--  arch/x86/kvm/mmu.c | 3
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 1
-rw-r--r--  arch/x86/kvm/svm.c | 331
-rw-r--r--  arch/x86/kvm/trace.h | 165
-rw-r--r--  arch/x86/kvm/vmx.c | 448
-rw-r--r--  arch/x86/kvm/x86.c | 550
-rw-r--r--  arch/x86/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_32.c | 10
-rw-r--r--  arch/x86/mm/init_64.c | 35
-rw-r--r--  arch/x86/mm/ioremap.c | 26
-rw-r--r--  arch/x86/mm/k8topology_64.c | 101
-rw-r--r--  arch/x86/mm/numa_32.c | 4
-rw-r--r--  arch/x86/mm/numa_64.c | 252
-rw-r--r--  arch/x86/mm/pageattr.c | 22
-rw-r--r--  arch/x86/mm/pat.c | 20
-rw-r--r--  arch/x86/mm/setup_nx.c | 59
-rw-r--r--  arch/x86/mm/srat_64.c | 29
-rw-r--r--  arch/x86/mm/tlb.c | 3
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 1
-rw-r--r--  arch/x86/xen/enlighten.c | 4
-rw-r--r--  arch/x86/xen/smp.c | 2
103 files changed, 3058 insertions, 1442 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b3cfd2433c34..32a1918e1b88 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -51,6 +51,7 @@ config X86
51 select HAVE_KERNEL_LZMA 51 select HAVE_KERNEL_LZMA
52 select HAVE_HW_BREAKPOINT 52 select HAVE_HW_BREAKPOINT
53 select HAVE_ARCH_KMEMCHECK 53 select HAVE_ARCH_KMEMCHECK
54 select HAVE_USER_RETURN_NOTIFIER
54 55
55config OUTPUT_FORMAT 56config OUTPUT_FORMAT
56 string 57 string
@@ -1331,7 +1332,9 @@ config MATH_EMULATION
1331 kernel, it won't hurt. 1332 kernel, it won't hurt.
1332 1333
1333config MTRR 1334config MTRR
1334 bool "MTRR (Memory Type Range Register) support" 1335 bool
1336 default y
1337 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1335 ---help--- 1338 ---help---
1336 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1339 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1337 the Memory Type Range Registers (MTRRs) may be used to control 1340 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1397,7 +1400,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1397 1400
1398config X86_PAT 1401config X86_PAT
1399 bool 1402 bool
1400 prompt "x86 PAT support" 1403 default y
1404 prompt "x86 PAT support" if EMBEDDED
1401 depends on MTRR 1405 depends on MTRR
1402 ---help--- 1406 ---help---
1403 Use PAT attributes to setup page level cache control. 1407 Use PAT attributes to setup page level cache control.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 5e99762eb5c2..08e442bc3ab9 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
301 301
302# 302#
303# Define implied options from the CPU selection here 303# Define implied options from the CPU selection here
304config X86_L1_CACHE_BYTES 304config X86_INTERNODE_CACHE_SHIFT
305 int 305 int
306 default "128" if MPSC 306 default "12" if X86_VSMP
307 default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 307 default "7" if NUMA
308 308 default X86_L1_CACHE_SHIFT
309config X86_INTERNODE_CACHE_BYTES
310 int
311 default "4096" if X86_VSMP
312 default X86_L1_CACHE_BYTES if !X86_VSMP
313 309
314config X86_CMPXCHG 310config X86_CMPXCHG
315 def_bool X86_64 || (X86_32 && !M386) 311 def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
317config X86_L1_CACHE_SHIFT 313config X86_L1_CACHE_SHIFT
318 int 314 int
319 default "7" if MPENTIUM4 || MPSC 315 default "7" if MPENTIUM4 || MPSC
316 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 317 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 318 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
323 319
324config X86_XADD 320config X86_XADD
325 def_bool y 321 def_bool y
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
107 lgdt gdt(%ebp) 107 lgdt gdt(%ebp)
108 108
109 /* Enable PAE mode */ 109 /* Enable PAE mode */
110 xorl %eax, %eax 110 movl $(X86_CR4_PAE), %eax
111 orl $(X86_CR4_PAE), %eax
112 movl %eax, %cr4 111 movl %eax, %cr4
113 112
114 /* 113 /*
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
4 4
5#undef i386 5#undef i386
6 6
7#include <asm/cache.h>
7#include <asm/page_types.h> 8#include <asm/page_types.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
46 *(.data.*) 47 *(.data.*)
47 _edata = . ; 48 _edata = . ;
48 } 49 }
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 50 . = ALIGN(L1_CACHE_BYTES);
50 .bss : { 51 .bss : {
51 _bss = . ; 52 _bss = . ;
52 *(.bss) 53 *(.bss)
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..f767164cd5df 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
27 27
28 boot_params.screen_info.orig_x = oreg.dl; 28 boot_params.screen_info.orig_x = oreg.dl;
29 boot_params.screen_info.orig_y = oreg.dh; 29 boot_params.screen_info.orig_y = oreg.dh;
30
31 if (oreg.ch & 0x20)
32 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
33
34 if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
35 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
30} 36}
31 37
32static void store_video_mode(void) 38static void store_video_mode(void)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..1a58ad89fdf7 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
15 16
16obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o 17obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
17 18
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
24salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
25 26
26aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
28
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..20bb0e1ac681 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#include <linux/linkage.h> 18#include <linux/linkage.h>
19#include <asm/inst.h>
19 20
20.text 21.text
21 22
@@ -122,103 +123,72 @@ ENTRY(aesni_set_key)
122 movups 0x10(%rsi), %xmm2 # other user key 123 movups 0x10(%rsi), %xmm2 # other user key
123 movaps %xmm2, (%rcx) 124 movaps %xmm2, (%rcx)
124 add $0x10, %rcx 125 add $0x10, %rcx
125 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 126 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
126 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
127 call _key_expansion_256a 127 call _key_expansion_256a
128 # aeskeygenassist $0x1, %xmm0, %xmm1 128 AESKEYGENASSIST 0x1 %xmm0 %xmm1
129 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
130 call _key_expansion_256b 129 call _key_expansion_256b
131 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 130 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
132 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
133 call _key_expansion_256a 131 call _key_expansion_256a
134 # aeskeygenassist $0x2, %xmm0, %xmm1 132 AESKEYGENASSIST 0x2 %xmm0 %xmm1
135 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
136 call _key_expansion_256b 133 call _key_expansion_256b
137 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 134 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
138 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
139 call _key_expansion_256a 135 call _key_expansion_256a
140 # aeskeygenassist $0x4, %xmm0, %xmm1 136 AESKEYGENASSIST 0x4 %xmm0 %xmm1
141 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
142 call _key_expansion_256b 137 call _key_expansion_256b
143 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 138 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
144 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
145 call _key_expansion_256a 139 call _key_expansion_256a
146 # aeskeygenassist $0x8, %xmm0, %xmm1 140 AESKEYGENASSIST 0x8 %xmm0 %xmm1
147 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
148 call _key_expansion_256b 141 call _key_expansion_256b
149 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 142 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
150 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
151 call _key_expansion_256a 143 call _key_expansion_256a
152 # aeskeygenassist $0x10, %xmm0, %xmm1 144 AESKEYGENASSIST 0x10 %xmm0 %xmm1
153 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
154 call _key_expansion_256b 145 call _key_expansion_256b
155 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 146 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
156 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
157 call _key_expansion_256a 147 call _key_expansion_256a
158 # aeskeygenassist $0x20, %xmm0, %xmm1 148 AESKEYGENASSIST 0x20 %xmm0 %xmm1
159 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
160 call _key_expansion_256b 149 call _key_expansion_256b
161 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 150 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
162 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
163 call _key_expansion_256a 151 call _key_expansion_256a
164 jmp .Ldec_key 152 jmp .Ldec_key
165.Lenc_key192: 153.Lenc_key192:
166 movq 0x10(%rsi), %xmm2 # other user key 154 movq 0x10(%rsi), %xmm2 # other user key
167 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 155 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
168 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
169 call _key_expansion_192a 156 call _key_expansion_192a
170 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 157 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
171 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
172 call _key_expansion_192b 158 call _key_expansion_192b
173 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 159 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
174 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
175 call _key_expansion_192a 160 call _key_expansion_192a
176 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 161 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
177 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
178 call _key_expansion_192b 162 call _key_expansion_192b
179 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 163 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
180 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
181 call _key_expansion_192a 164 call _key_expansion_192a
182 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 165 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
183 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
184 call _key_expansion_192b 166 call _key_expansion_192b
185 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 167 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
186 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
187 call _key_expansion_192a 168 call _key_expansion_192a
188 # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 169 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
189 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
190 call _key_expansion_192b 170 call _key_expansion_192b
191 jmp .Ldec_key 171 jmp .Ldec_key
192.Lenc_key128: 172.Lenc_key128:
193 # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 173 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
194 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
195 call _key_expansion_128 174 call _key_expansion_128
196 # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 175 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
197 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
198 call _key_expansion_128 176 call _key_expansion_128
199 # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 177 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
200 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
201 call _key_expansion_128 178 call _key_expansion_128
202 # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 179 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
203 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
204 call _key_expansion_128 180 call _key_expansion_128
205 # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 181 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
206 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
207 call _key_expansion_128 182 call _key_expansion_128
208 # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 183 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
209 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
210 call _key_expansion_128 184 call _key_expansion_128
211 # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 185 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
212 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
213 call _key_expansion_128 186 call _key_expansion_128
214 # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 187 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
215 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
216 call _key_expansion_128 188 call _key_expansion_128
217 # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 189 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
218 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
219 call _key_expansion_128 190 call _key_expansion_128
220 # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 191 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
221 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
222 call _key_expansion_128 192 call _key_expansion_128
223.Ldec_key: 193.Ldec_key:
224 sub $0x10, %rcx 194 sub $0x10, %rcx
@@ -231,8 +201,7 @@ ENTRY(aesni_set_key)
231.align 4 201.align 4
232.Ldec_key_loop: 202.Ldec_key_loop:
233 movaps (%rdi), %xmm0 203 movaps (%rdi), %xmm0
234 # aesimc %xmm0, %xmm1 204 AESIMC %xmm0 %xmm1
235 .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
236 movaps %xmm1, (%rsi) 205 movaps %xmm1, (%rsi)
237 add $0x10, %rdi 206 add $0x10, %rdi
238 sub $0x10, %rsi 207 sub $0x10, %rsi
@@ -274,51 +243,37 @@ _aesni_enc1:
274 je .Lenc192 243 je .Lenc192
275 add $0x20, TKEYP 244 add $0x20, TKEYP
276 movaps -0x60(TKEYP), KEY 245 movaps -0x60(TKEYP), KEY
277 # aesenc KEY, STATE 246 AESENC KEY STATE
278 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
279 movaps -0x50(TKEYP), KEY 247 movaps -0x50(TKEYP), KEY
280 # aesenc KEY, STATE 248 AESENC KEY STATE
281 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
282.align 4 249.align 4
283.Lenc192: 250.Lenc192:
284 movaps -0x40(TKEYP), KEY 251 movaps -0x40(TKEYP), KEY
285 # aesenc KEY, STATE 252 AESENC KEY STATE
286 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
287 movaps -0x30(TKEYP), KEY 253 movaps -0x30(TKEYP), KEY
288 # aesenc KEY, STATE 254 AESENC KEY STATE
289 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
290.align 4 255.align 4
291.Lenc128: 256.Lenc128:
292 movaps -0x20(TKEYP), KEY 257 movaps -0x20(TKEYP), KEY
293 # aesenc KEY, STATE 258 AESENC KEY STATE
294 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
295 movaps -0x10(TKEYP), KEY 259 movaps -0x10(TKEYP), KEY
296 # aesenc KEY, STATE 260 AESENC KEY STATE
297 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
298 movaps (TKEYP), KEY 261 movaps (TKEYP), KEY
299 # aesenc KEY, STATE 262 AESENC KEY STATE
300 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
301 movaps 0x10(TKEYP), KEY 263 movaps 0x10(TKEYP), KEY
302 # aesenc KEY, STATE 264 AESENC KEY STATE
303 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
304 movaps 0x20(TKEYP), KEY 265 movaps 0x20(TKEYP), KEY
305 # aesenc KEY, STATE 266 AESENC KEY STATE
306 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
307 movaps 0x30(TKEYP), KEY 267 movaps 0x30(TKEYP), KEY
308 # aesenc KEY, STATE 268 AESENC KEY STATE
309 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
310 movaps 0x40(TKEYP), KEY 269 movaps 0x40(TKEYP), KEY
311 # aesenc KEY, STATE 270 AESENC KEY STATE
312 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
313 movaps 0x50(TKEYP), KEY 271 movaps 0x50(TKEYP), KEY
314 # aesenc KEY, STATE 272 AESENC KEY STATE
315 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
316 movaps 0x60(TKEYP), KEY 273 movaps 0x60(TKEYP), KEY
317 # aesenc KEY, STATE 274 AESENC KEY STATE
318 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
319 movaps 0x70(TKEYP), KEY 275 movaps 0x70(TKEYP), KEY
320 # aesenclast KEY, STATE # last round 276 AESENCLAST KEY STATE
321 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
322 ret 277 ret
323 278
324/* 279/*
@@ -353,135 +308,79 @@ _aesni_enc4:
353 je .L4enc192 308 je .L4enc192
354 add $0x20, TKEYP 309 add $0x20, TKEYP
355 movaps -0x60(TKEYP), KEY 310 movaps -0x60(TKEYP), KEY
356 # aesenc KEY, STATE1 311 AESENC KEY STATE1
357 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 312 AESENC KEY STATE2
358 # aesenc KEY, STATE2 313 AESENC KEY STATE3
359 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 314 AESENC KEY STATE4
360 # aesenc KEY, STATE3
361 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
362 # aesenc KEY, STATE4
363 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
364 movaps -0x50(TKEYP), KEY 315 movaps -0x50(TKEYP), KEY
365 # aesenc KEY, STATE1 316 AESENC KEY STATE1
366 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 317 AESENC KEY STATE2
367 # aesenc KEY, STATE2 318 AESENC KEY STATE3
368 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 319 AESENC KEY STATE4
369 # aesenc KEY, STATE3
370 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
371 # aesenc KEY, STATE4
372 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
373#.align 4 320#.align 4
374.L4enc192: 321.L4enc192:
375 movaps -0x40(TKEYP), KEY 322 movaps -0x40(TKEYP), KEY
376 # aesenc KEY, STATE1 323 AESENC KEY STATE1
377 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 324 AESENC KEY STATE2
378 # aesenc KEY, STATE2 325 AESENC KEY STATE3
379 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 326 AESENC KEY STATE4
380 # aesenc KEY, STATE3
381 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
382 # aesenc KEY, STATE4
383 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
384 movaps -0x30(TKEYP), KEY 327 movaps -0x30(TKEYP), KEY
385 # aesenc KEY, STATE1 328 AESENC KEY STATE1
386 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 329 AESENC KEY STATE2
387 # aesenc KEY, STATE2 330 AESENC KEY STATE3
388 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 331 AESENC KEY STATE4
389 # aesenc KEY, STATE3
390 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
391 # aesenc KEY, STATE4
392 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
393#.align 4 332#.align 4
394.L4enc128: 333.L4enc128:
395 movaps -0x20(TKEYP), KEY 334 movaps -0x20(TKEYP), KEY
396 # aesenc KEY, STATE1 335 AESENC KEY STATE1
397 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 336 AESENC KEY STATE2
398 # aesenc KEY, STATE2 337 AESENC KEY STATE3
399 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 338 AESENC KEY STATE4
400 # aesenc KEY, STATE3
401 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
402 # aesenc KEY, STATE4
403 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
404 movaps -0x10(TKEYP), KEY 339 movaps -0x10(TKEYP), KEY
405 # aesenc KEY, STATE1 340 AESENC KEY STATE1
406 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 341 AESENC KEY STATE2
407 # aesenc KEY, STATE2 342 AESENC KEY STATE3
408 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 343 AESENC KEY STATE4
409 # aesenc KEY, STATE3
410 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
411 # aesenc KEY, STATE4
412 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
413 movaps (TKEYP), KEY 344 movaps (TKEYP), KEY
414 # aesenc KEY, STATE1 345 AESENC KEY STATE1
415 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 346 AESENC KEY STATE2
416 # aesenc KEY, STATE2 347 AESENC KEY STATE3
417 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 348 AESENC KEY STATE4
418 # aesenc KEY, STATE3
419 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
420 # aesenc KEY, STATE4
421 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
422 movaps 0x10(TKEYP), KEY 349 movaps 0x10(TKEYP), KEY
423 # aesenc KEY, STATE1 350 AESENC KEY STATE1
424 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 351 AESENC KEY STATE2
425 # aesenc KEY, STATE2 352 AESENC KEY STATE3
426 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 353 AESENC KEY STATE4
427 # aesenc KEY, STATE3
428 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
429 # aesenc KEY, STATE4
430 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
431 movaps 0x20(TKEYP), KEY 354 movaps 0x20(TKEYP), KEY
432 # aesenc KEY, STATE1 355 AESENC KEY STATE1
433 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 356 AESENC KEY STATE2
434 # aesenc KEY, STATE2 357 AESENC KEY STATE3
435 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 358 AESENC KEY STATE4
436 # aesenc KEY, STATE3
437 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
438 # aesenc KEY, STATE4
439 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
440 movaps 0x30(TKEYP), KEY 359 movaps 0x30(TKEYP), KEY
441 # aesenc KEY, STATE1 360 AESENC KEY STATE1
442 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 361 AESENC KEY STATE2
443 # aesenc KEY, STATE2 362 AESENC KEY STATE3
444 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 363 AESENC KEY STATE4
445 # aesenc KEY, STATE3
446 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
447 # aesenc KEY, STATE4
448 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
449 movaps 0x40(TKEYP), KEY 364 movaps 0x40(TKEYP), KEY
450 # aesenc KEY, STATE1 365 AESENC KEY STATE1
451 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 366 AESENC KEY STATE2
452 # aesenc KEY, STATE2 367 AESENC KEY STATE3
453 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 368 AESENC KEY STATE4
454 # aesenc KEY, STATE3
455 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
456 # aesenc KEY, STATE4
457 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
458 movaps 0x50(TKEYP), KEY 369 movaps 0x50(TKEYP), KEY
459 # aesenc KEY, STATE1 370 AESENC KEY STATE1
460 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 371 AESENC KEY STATE2
461 # aesenc KEY, STATE2 372 AESENC KEY STATE3
462 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 373 AESENC KEY STATE4
463 # aesenc KEY, STATE3
464 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
465 # aesenc KEY, STATE4
466 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
467 movaps 0x60(TKEYP), KEY 374 movaps 0x60(TKEYP), KEY
468 # aesenc KEY, STATE1 375 AESENC KEY STATE1
469 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 376 AESENC KEY STATE2
470 # aesenc KEY, STATE2 377 AESENC KEY STATE3
471 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 378 AESENC KEY STATE4
472 # aesenc KEY, STATE3
473 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
474 # aesenc KEY, STATE4
475 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
476 movaps 0x70(TKEYP), KEY 379 movaps 0x70(TKEYP), KEY
477 # aesenclast KEY, STATE1 # last round 380 AESENCLAST KEY STATE1 # last round
478 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 381 AESENCLAST KEY STATE2
479 # aesenclast KEY, STATE2 382 AESENCLAST KEY STATE3
480 .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 383 AESENCLAST KEY STATE4
481 # aesenclast KEY, STATE3
482 .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
483 # aesenclast KEY, STATE4
484 .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
485 ret 384 ret
486 385
487/* 386/*
@@ -518,51 +417,37 @@ _aesni_dec1:
518 je .Ldec192 417 je .Ldec192
519 add $0x20, TKEYP 418 add $0x20, TKEYP
520 movaps -0x60(TKEYP), KEY 419 movaps -0x60(TKEYP), KEY
521 # aesdec KEY, STATE 420 AESDEC KEY STATE
522 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
523 movaps -0x50(TKEYP), KEY 421 movaps -0x50(TKEYP), KEY
524 # aesdec KEY, STATE 422 AESDEC KEY STATE
525 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
526.align 4 423.align 4
527.Ldec192: 424.Ldec192:
528 movaps -0x40(TKEYP), KEY 425 movaps -0x40(TKEYP), KEY
529 # aesdec KEY, STATE 426 AESDEC KEY STATE
530 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
531 movaps -0x30(TKEYP), KEY 427 movaps -0x30(TKEYP), KEY
532 # aesdec KEY, STATE 428 AESDEC KEY STATE
533 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
534.align 4 429.align 4
535.Ldec128: 430.Ldec128:
536 movaps -0x20(TKEYP), KEY 431 movaps -0x20(TKEYP), KEY
537 # aesdec KEY, STATE 432 AESDEC KEY STATE
538 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
539 movaps -0x10(TKEYP), KEY 433 movaps -0x10(TKEYP), KEY
540 # aesdec KEY, STATE 434 AESDEC KEY STATE
541 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
542 movaps (TKEYP), KEY 435 movaps (TKEYP), KEY
543 # aesdec KEY, STATE 436 AESDEC KEY STATE
544 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
545 movaps 0x10(TKEYP), KEY 437 movaps 0x10(TKEYP), KEY
546 # aesdec KEY, STATE 438 AESDEC KEY STATE
547 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
548 movaps 0x20(TKEYP), KEY 439 movaps 0x20(TKEYP), KEY
549 # aesdec KEY, STATE 440 AESDEC KEY STATE
550 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
551 movaps 0x30(TKEYP), KEY 441 movaps 0x30(TKEYP), KEY
552 # aesdec KEY, STATE 442 AESDEC KEY STATE
553 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
554 movaps 0x40(TKEYP), KEY 443 movaps 0x40(TKEYP), KEY
555 # aesdec KEY, STATE 444 AESDEC KEY STATE
556 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
557 movaps 0x50(TKEYP), KEY 445 movaps 0x50(TKEYP), KEY
558 # aesdec KEY, STATE 446 AESDEC KEY STATE
559 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
560 movaps 0x60(TKEYP), KEY 447 movaps 0x60(TKEYP), KEY
561 # aesdec KEY, STATE 448 AESDEC KEY STATE
562 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
563 movaps 0x70(TKEYP), KEY 449 movaps 0x70(TKEYP), KEY
564 # aesdeclast KEY, STATE # last round 450 AESDECLAST KEY STATE
565 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
566 ret 451 ret
567 452
568/* 453/*
@@ -597,135 +482,79 @@ _aesni_dec4:
597 je .L4dec192 482 je .L4dec192
598 add $0x20, TKEYP 483 add $0x20, TKEYP
599 movaps -0x60(TKEYP), KEY 484 movaps -0x60(TKEYP), KEY
600 # aesdec KEY, STATE1 485 AESDEC KEY STATE1
601 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 486 AESDEC KEY STATE2
602 # aesdec KEY, STATE2 487 AESDEC KEY STATE3
603 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 488 AESDEC KEY STATE4
604 # aesdec KEY, STATE3
605 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
606 # aesdec KEY, STATE4
607 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
608 movaps -0x50(TKEYP), KEY 489 movaps -0x50(TKEYP), KEY
609 # aesdec KEY, STATE1 490 AESDEC KEY STATE1
610 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 491 AESDEC KEY STATE2
611 # aesdec KEY, STATE2 492 AESDEC KEY STATE3
612 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 493 AESDEC KEY STATE4
613 # aesdec KEY, STATE3
614 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
615 # aesdec KEY, STATE4
616 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
617.align 4 494.align 4
618.L4dec192: 495.L4dec192:
619 movaps -0x40(TKEYP), KEY 496 movaps -0x40(TKEYP), KEY
620 # aesdec KEY, STATE1 497 AESDEC KEY STATE1
621 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 498 AESDEC KEY STATE2
622 # aesdec KEY, STATE2 499 AESDEC KEY STATE3
623 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 500 AESDEC KEY STATE4
624 # aesdec KEY, STATE3
625 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
626 # aesdec KEY, STATE4
627 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
628 movaps -0x30(TKEYP), KEY 501 movaps -0x30(TKEYP), KEY
629 # aesdec KEY, STATE1 502 AESDEC KEY STATE1
630 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 503 AESDEC KEY STATE2
631 # aesdec KEY, STATE2 504 AESDEC KEY STATE3
632 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 505 AESDEC KEY STATE4
633 # aesdec KEY, STATE3
634 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
635 # aesdec KEY, STATE4
636 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
637.align 4 506.align 4
638.L4dec128: 507.L4dec128:
639 movaps -0x20(TKEYP), KEY 508 movaps -0x20(TKEYP), KEY
640 # aesdec KEY, STATE1 509 AESDEC KEY STATE1
641 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 510 AESDEC KEY STATE2
642 # aesdec KEY, STATE2 511 AESDEC KEY STATE3
643 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 512 AESDEC KEY STATE4
644 # aesdec KEY, STATE3
645 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
646 # aesdec KEY, STATE4
647 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
648 movaps -0x10(TKEYP), KEY 513 movaps -0x10(TKEYP), KEY
649 # aesdec KEY, STATE1 514 AESDEC KEY STATE1
650 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 515 AESDEC KEY STATE2
651 # aesdec KEY, STATE2 516 AESDEC KEY STATE3
652 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 517 AESDEC KEY STATE4
653 # aesdec KEY, STATE3
654 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
655 # aesdec KEY, STATE4
656 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
657 movaps (TKEYP), KEY 518 movaps (TKEYP), KEY
658 # aesdec KEY, STATE1 519 AESDEC KEY STATE1
659 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 520 AESDEC KEY STATE2
660 # aesdec KEY, STATE2 521 AESDEC KEY STATE3
661 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 522 AESDEC KEY STATE4
662 # aesdec KEY, STATE3
663 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
664 # aesdec KEY, STATE4
665 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
666 movaps 0x10(TKEYP), KEY 523 movaps 0x10(TKEYP), KEY
667 # aesdec KEY, STATE1 524 AESDEC KEY STATE1
668 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 525 AESDEC KEY STATE2
669 # aesdec KEY, STATE2 526 AESDEC KEY STATE3
670 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 527 AESDEC KEY STATE4
671 # aesdec KEY, STATE3
672 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
673 # aesdec KEY, STATE4
674 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
675 movaps 0x20(TKEYP), KEY 528 movaps 0x20(TKEYP), KEY
676 # aesdec KEY, STATE1 529 AESDEC KEY STATE1
677 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 530 AESDEC KEY STATE2
678 # aesdec KEY, STATE2 531 AESDEC KEY STATE3
679 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 532 AESDEC KEY STATE4
680 # aesdec KEY, STATE3
681 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
682 # aesdec KEY, STATE4
683 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
684 movaps 0x30(TKEYP), KEY 533 movaps 0x30(TKEYP), KEY
685 # aesdec KEY, STATE1 534 AESDEC KEY STATE1
686 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 535 AESDEC KEY STATE2
687 # aesdec KEY, STATE2 536 AESDEC KEY STATE3
688 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 537 AESDEC KEY STATE4
689 # aesdec KEY, STATE3
690 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
691 # aesdec KEY, STATE4
692 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
693 movaps 0x40(TKEYP), KEY 538 movaps 0x40(TKEYP), KEY
694 # aesdec KEY, STATE1 539 AESDEC KEY STATE1
695 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 540 AESDEC KEY STATE2
696 # aesdec KEY, STATE2 541 AESDEC KEY STATE3
697 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 542 AESDEC KEY STATE4
698 # aesdec KEY, STATE3
699 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
700 # aesdec KEY, STATE4
701 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
702 movaps 0x50(TKEYP), KEY 543 movaps 0x50(TKEYP), KEY
703 # aesdec KEY, STATE1 544 AESDEC KEY STATE1
704 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 545 AESDEC KEY STATE2
705 # aesdec KEY, STATE2 546 AESDEC KEY STATE3
706 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 547 AESDEC KEY STATE4
707 # aesdec KEY, STATE3
708 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
709 # aesdec KEY, STATE4
710 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
711 movaps 0x60(TKEYP), KEY 548 movaps 0x60(TKEYP), KEY
712 # aesdec KEY, STATE1 549 AESDEC KEY STATE1
713 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 550 AESDEC KEY STATE2
714 # aesdec KEY, STATE2 551 AESDEC KEY STATE3
715 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 552 AESDEC KEY STATE4
716 # aesdec KEY, STATE3
717 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
718 # aesdec KEY, STATE4
719 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
720 movaps 0x70(TKEYP), KEY 553 movaps 0x70(TKEYP), KEY
721 # aesdeclast KEY, STATE1 # last round 554 AESDECLAST KEY STATE1 # last round
722 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 555 AESDECLAST KEY STATE2
723 # aesdeclast KEY, STATE2 556 AESDECLAST KEY STATE3
724 .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 557 AESDECLAST KEY STATE4
725 # aesdeclast KEY, STATE3
726 .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
727 # aesdeclast KEY, STATE4
728 .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
729 ret 558 ret
730 559
731/* 560/*
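
The aesni-intel_asm.S changes above replace every hand-emitted ".byte" opcode sequence with the AESKEYGENASSIST/AESENC/AESDEC/AESIMC macros from the new <asm/inst.h> (150 lines, listed in the diffstat but not shown in this view). Hand-encoding keeps the file buildable with assemblers that do not yet know the AES-NI mnemonics, and the macros keep that encoding readable. Below is a minimal sketch of the idea using a hypothetical simplified macro that takes raw register numbers; the real inst.h macros accept %xmmN operands and derive the ModRM byte for any register pair.

	# Hypothetical, simplified stand-in for the inst.h approach: emit the
	# AESENC opcode (66 0f 38 dc /r) by hand, with src in the ModRM r/m
	# field and dst in the ModRM reg field.
	.macro AESENC_NUM src:req, dst:req
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc0 + (\dst << 3) + \src
	.endm

	AESENC_NUM 2, 0		# same bytes as "aesenc %xmm2, %xmm0": 66 0f 38 dc c2
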
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..1eb7f90cb7b9
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains accelerated part of ghash
4 * implementation. More information about PCLMULQDQ can be found at:
5 *
6 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/inst.h>
21
22.data
23
24.align 16
25.Lbswap_mask:
26 .octa 0x000102030405060708090a0b0c0d0e0f
27.Lpoly:
28 .octa 0xc2000000000000000000000000000001
29.Ltwo_one:
30 .octa 0x00000001000000000000000000000001
31
32#define DATA %xmm0
33#define SHASH %xmm1
34#define T1 %xmm2
35#define T2 %xmm3
36#define T3 %xmm4
37#define BSWAP %xmm5
38#define IN1 %xmm6
39
40.text
41
42/*
43 * __clmul_gf128mul_ble: internal ABI
44 * input:
45 * DATA: operand1
46 * SHASH: operand2, hash_key << 1 mod poly
47 * output:
48 * DATA: operand1 * operand2 mod poly
49 * changed:
50 * T1
51 * T2
52 * T3
53 */
54__clmul_gf128mul_ble:
55 movaps DATA, T1
56 pshufd $0b01001110, DATA, T2
57 pshufd $0b01001110, SHASH, T3
58 pxor DATA, T2
59 pxor SHASH, T3
60
61 PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
62 PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
63 PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
64 pxor DATA, T2
65 pxor T1, T2 # T2 = a0 * b1 + a1 * b0
66
67 movaps T2, T3
68 pslldq $8, T3
69 psrldq $8, T2
70 pxor T3, DATA
71 pxor T2, T1 # <T1:DATA> is result of
72 # carry-less multiplication
73
74 # first phase of the reduction
75 movaps DATA, T3
76 psllq $1, T3
77 pxor DATA, T3
78 psllq $5, T3
79 pxor DATA, T3
80 psllq $57, T3
81 movaps T3, T2
82 pslldq $8, T2
83 psrldq $8, T3
84 pxor T2, DATA
85 pxor T3, T1
86
87 # second phase of the reduction
88 movaps DATA, T2
89 psrlq $5, T2
90 pxor DATA, T2
91 psrlq $1, T2
92 pxor DATA, T2
93 psrlq $1, T2
94 pxor T2, T1
95 pxor T1, DATA
96 ret
97
98/* void clmul_ghash_mul(char *dst, const be128 *shash) */
99ENTRY(clmul_ghash_mul)
100 movups (%rdi), DATA
101 movups (%rsi), SHASH
102 movaps .Lbswap_mask, BSWAP
103 PSHUFB_XMM BSWAP DATA
104 call __clmul_gf128mul_ble
105 PSHUFB_XMM BSWAP DATA
106 movups DATA, (%rdi)
107 ret
108
109/*
110 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
111 * const be128 *shash);
112 */
113ENTRY(clmul_ghash_update)
114 cmp $16, %rdx
115 jb .Lupdate_just_ret # check length
116 movaps .Lbswap_mask, BSWAP
117 movups (%rdi), DATA
118 movups (%rcx), SHASH
119 PSHUFB_XMM BSWAP DATA
120.align 4
121.Lupdate_loop:
122 movups (%rsi), IN1
123 PSHUFB_XMM BSWAP IN1
124 pxor IN1, DATA
125 call __clmul_gf128mul_ble
126 sub $16, %rdx
127 add $16, %rsi
128 cmp $16, %rdx
129 jge .Lupdate_loop
130 PSHUFB_XMM BSWAP DATA
131 movups DATA, (%rdi)
132.Lupdate_just_ret:
133 ret
134
135/*
136 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
137 *
138 * Calculate hash_key << 1 mod poly
139 */
140ENTRY(clmul_ghash_setkey)
141 movaps .Lbswap_mask, BSWAP
142 movups (%rsi), %xmm0
143 PSHUFB_XMM BSWAP %xmm0
144 movaps %xmm0, %xmm1
145 psllq $1, %xmm0
146 psrlq $63, %xmm1
147 movaps %xmm1, %xmm2
148 pslldq $8, %xmm1
149 psrldq $8, %xmm2
150 por %xmm1, %xmm0
151 # reduction
152 pshufd $0b00100100, %xmm2, %xmm1
153 pcmpeqd .Ltwo_one, %xmm1
154 pand .Lpoly, %xmm1
155 pxor %xmm1, %xmm0
156 movups %xmm0, (%rdi)
157 ret
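
The new ghash-clmulni-intel_asm.S is built around PCLMULQDQ, which computes a carry-less (XOR instead of ADD) 64x64 -> 128-bit product of two selected qword lanes. For reference only, the same primitive in portable C looks like the sketch below; this helper is not part of the patch. The assembly above combines three such products (the Karatsuba split visible in the a0*b0 / a1*b1 / (a1+a0)*(b1+b0) comments) into the 256-bit carry-less product of two 128-bit operands, then folds it back to 128 bits modulo the GHASH polynomial in the two "phase of the reduction" blocks.

#include <stdint.h>

/* 128-bit result container for the sketch (not a kernel type). */
struct clmul128 {
	uint64_t lo;
	uint64_t hi;
};

/*
 * Reference carry-less multiply: for every set bit i of b, XOR (a << i)
 * into the 128-bit accumulator.  No carries propagate, which is exactly
 * multiplication of polynomials over GF(2).
 */
static struct clmul128 clmul_64x64(uint64_t a, uint64_t b)
{
	struct clmul128 r = { 0, 0 };
	int i;

	for (i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			if (i)			/* avoid an undefined 64-bit shift */
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}
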
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..cbcc8d8ea93a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains glue code.
4 *
5 * Copyright (c) 2009 Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License version 2 as published
10 * by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/crypto.h>
17#include <crypto/algapi.h>
18#include <crypto/cryptd.h>
19#include <crypto/gf128mul.h>
20#include <crypto/internal/hash.h>
21#include <asm/i387.h>
22
23#define GHASH_BLOCK_SIZE 16
24#define GHASH_DIGEST_SIZE 16
25
26void clmul_ghash_mul(char *dst, const be128 *shash);
27
28void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
29 const be128 *shash);
30
31void clmul_ghash_setkey(be128 *shash, const u8 *key);
32
33struct ghash_async_ctx {
34 struct cryptd_ahash *cryptd_tfm;
35};
36
37struct ghash_ctx {
38 be128 shash;
39};
40
41struct ghash_desc_ctx {
42 u8 buffer[GHASH_BLOCK_SIZE];
43 u32 bytes;
44};
45
46static int ghash_init(struct shash_desc *desc)
47{
48 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
49
50 memset(dctx, 0, sizeof(*dctx));
51
52 return 0;
53}
54
55static int ghash_setkey(struct crypto_shash *tfm,
56 const u8 *key, unsigned int keylen)
57{
58 struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
59
60 if (keylen != GHASH_BLOCK_SIZE) {
61 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
62 return -EINVAL;
63 }
64
65 clmul_ghash_setkey(&ctx->shash, key);
66
67 return 0;
68}
69
70static int ghash_update(struct shash_desc *desc,
71 const u8 *src, unsigned int srclen)
72{
73 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
74 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
75 u8 *dst = dctx->buffer;
76
77 kernel_fpu_begin();
78 if (dctx->bytes) {
79 int n = min(srclen, dctx->bytes);
80 u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
81
82 dctx->bytes -= n;
83 srclen -= n;
84
85 while (n--)
86 *pos++ ^= *src++;
87
88 if (!dctx->bytes)
89 clmul_ghash_mul(dst, &ctx->shash);
90 }
91
92 clmul_ghash_update(dst, src, srclen, &ctx->shash);
93 kernel_fpu_end();
94
95 if (srclen & 0xf) {
96 src += srclen - (srclen & 0xf);
97 srclen &= 0xf;
98 dctx->bytes = GHASH_BLOCK_SIZE - srclen;
99 while (srclen--)
100 *dst++ ^= *src++;
101 }
102
103 return 0;
104}
105
106static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
107{
108 u8 *dst = dctx->buffer;
109
110 if (dctx->bytes) {
111 u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
112
113 while (dctx->bytes--)
114 *tmp++ ^= 0;
115
116 kernel_fpu_begin();
117 clmul_ghash_mul(dst, &ctx->shash);
118 kernel_fpu_end();
119 }
120
121 dctx->bytes = 0;
122}
123
124static int ghash_final(struct shash_desc *desc, u8 *dst)
125{
126 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
127 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
128 u8 *buf = dctx->buffer;
129
130 ghash_flush(ctx, dctx);
131 memcpy(dst, buf, GHASH_BLOCK_SIZE);
132
133 return 0;
134}
135
136static struct shash_alg ghash_alg = {
137 .digestsize = GHASH_DIGEST_SIZE,
138 .init = ghash_init,
139 .update = ghash_update,
140 .final = ghash_final,
141 .setkey = ghash_setkey,
142 .descsize = sizeof(struct ghash_desc_ctx),
143 .base = {
144 .cra_name = "__ghash",
145 .cra_driver_name = "__ghash-pclmulqdqni",
146 .cra_priority = 0,
147 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
148 .cra_blocksize = GHASH_BLOCK_SIZE,
149 .cra_ctxsize = sizeof(struct ghash_ctx),
150 .cra_module = THIS_MODULE,
151 .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
152 },
153};
154
155static int ghash_async_init(struct ahash_request *req)
156{
157 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
158 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
159 struct ahash_request *cryptd_req = ahash_request_ctx(req);
160 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
161
162 if (!irq_fpu_usable()) {
163 memcpy(cryptd_req, req, sizeof(*req));
164 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
165 return crypto_ahash_init(cryptd_req);
166 } else {
167 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
168 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
169
170 desc->tfm = child;
171 desc->flags = req->base.flags;
172 return crypto_shash_init(desc);
173 }
174}
175
176static int ghash_async_update(struct ahash_request *req)
177{
178 struct ahash_request *cryptd_req = ahash_request_ctx(req);
179
180 if (!irq_fpu_usable()) {
181 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
182 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
183 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
184
185 memcpy(cryptd_req, req, sizeof(*req));
186 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
187 return crypto_ahash_update(cryptd_req);
188 } else {
189 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
190 return shash_ahash_update(req, desc);
191 }
192}
193
194static int ghash_async_final(struct ahash_request *req)
195{
196 struct ahash_request *cryptd_req = ahash_request_ctx(req);
197
198 if (!irq_fpu_usable()) {
199 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
200 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
201 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
202
203 memcpy(cryptd_req, req, sizeof(*req));
204 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
205 return crypto_ahash_final(cryptd_req);
206 } else {
207 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
208 return crypto_shash_final(desc, req->result);
209 }
210}
211
212static int ghash_async_digest(struct ahash_request *req)
213{
214 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
215 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
216 struct ahash_request *cryptd_req = ahash_request_ctx(req);
217 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
218
219 if (!irq_fpu_usable()) {
220 memcpy(cryptd_req, req, sizeof(*req));
221 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
222 return crypto_ahash_digest(cryptd_req);
223 } else {
224 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
225 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
226
227 desc->tfm = child;
228 desc->flags = req->base.flags;
229 return shash_ahash_digest(req, desc);
230 }
231}
232
233static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
234 unsigned int keylen)
235{
236 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
237 struct crypto_ahash *child = &ctx->cryptd_tfm->base;
238 int err;
239
240 crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
241 crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
242 & CRYPTO_TFM_REQ_MASK);
243 err = crypto_ahash_setkey(child, key, keylen);
244 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
245 & CRYPTO_TFM_RES_MASK);
246
247 return 0;
248}
249
250static int ghash_async_init_tfm(struct crypto_tfm *tfm)
251{
252 struct cryptd_ahash *cryptd_tfm;
253 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
254
255 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
256 if (IS_ERR(cryptd_tfm))
257 return PTR_ERR(cryptd_tfm);
258 ctx->cryptd_tfm = cryptd_tfm;
259 crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
260 sizeof(struct ahash_request) +
261 crypto_ahash_reqsize(&cryptd_tfm->base));
262
263 return 0;
264}
265
266static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
267{
268 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
269
270 cryptd_free_ahash(ctx->cryptd_tfm);
271}
272
273static struct ahash_alg ghash_async_alg = {
274 .init = ghash_async_init,
275 .update = ghash_async_update,
276 .final = ghash_async_final,
277 .setkey = ghash_async_setkey,
278 .digest = ghash_async_digest,
279 .halg = {
280 .digestsize = GHASH_DIGEST_SIZE,
281 .base = {
282 .cra_name = "ghash",
283 .cra_driver_name = "ghash-clmulni",
284 .cra_priority = 400,
285 .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
286 .cra_blocksize = GHASH_BLOCK_SIZE,
287 .cra_type = &crypto_ahash_type,
288 .cra_module = THIS_MODULE,
289 .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
290 .cra_init = ghash_async_init_tfm,
291 .cra_exit = ghash_async_exit_tfm,
292 },
293 },
294};
295
296static int __init ghash_pclmulqdqni_mod_init(void)
297{
298 int err;
299
300 if (!cpu_has_pclmulqdq) {
301 printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
302 " detected.\n");
303 return -ENODEV;
304 }
305
306 err = crypto_register_shash(&ghash_alg);
307 if (err)
308 goto err_out;
309 err = crypto_register_ahash(&ghash_async_alg);
310 if (err)
311 goto err_shash;
312
313 return 0;
314
315err_shash:
316 crypto_unregister_shash(&ghash_alg);
317err_out:
318 return err;
319}
320
321static void __exit ghash_pclmulqdqni_mod_exit(void)
322{
323 crypto_unregister_ahash(&ghash_async_alg);
324 crypto_unregister_shash(&ghash_alg);
325}
326
327module_init(ghash_pclmulqdqni_mod_init);
328module_exit(ghash_pclmulqdqni_mod_exit);
329
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
332 "acclerated by PCLMULQDQ-NI");
333MODULE_ALIAS("ghash");
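
With the glue code registered, other kernel code reaches the accelerated GHASH through the ordinary ahash API; the async wrapper above transparently defers to cryptd whenever the FPU cannot be used in the current context. Below is a rough usage sketch assuming the 2.6.32-era crypto API; ghash_digest_example is a hypothetical helper, and async completion handling is trimmed to the essentials.

#include <crypto/hash.h>
#include <linux/scatterlist.h>
#include <linux/types.h>
#include <linux/err.h>
#include <linux/gfp.h>

static int ghash_digest_example(const u8 *key, const u8 *data,
				unsigned int len, u8 *out)
{
	struct crypto_ahash *tfm;
	struct ahash_request *req;
	struct scatterlist sg;
	int err;

	/* Picks "ghash-clmulni" (priority 400) when PCLMULQDQ is present. */
	tfm = crypto_alloc_ahash("ghash", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ahash_setkey(tfm, key, 16);	/* GHASH_BLOCK_SIZE key */
	if (err)
		goto out_free_tfm;

	req = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, data, len);
	ahash_request_set_callback(req, 0, NULL, NULL);
	ahash_request_set_crypt(req, &sg, out, len);

	/* A real caller must wait for completion on -EINPROGRESS/-EBUSY. */
	err = crypto_ahash_digest(req);

	ahash_request_free(req);
out_free_tfm:
	crypto_free_ahash(tfm);
	return err;
}
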
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 581b0568fe19..4eefdca9832b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -653,7 +653,7 @@ ia32_sys_call_table:
653 .quad compat_sys_writev 653 .quad compat_sys_writev
654 .quad sys_getsid 654 .quad sys_getsid
655 .quad sys_fdatasync 655 .quad sys_fdatasync
656 .quad sys32_sysctl /* sysctl */ 656 .quad compat_sys_sysctl /* sysctl */
657 .quad sys_mlock /* 150 */ 657 .quad sys_mlock /* 150 */
658 .quad sys_munlock 658 .quad sys_munlock
659 .quad sys_mlockall 659 .quad sys_mlockall
@@ -841,4 +841,5 @@ ia32_sys_call_table:
841 .quad compat_sys_pwritev 841 .quad compat_sys_pwritev
842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
843 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg
844ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..df82c0e48ded 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -434,62 +434,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
434 return ret; 434 return ret;
435} 435}
436 436
437#ifdef CONFIG_SYSCTL_SYSCALL
438struct sysctl_ia32 {
439 unsigned int name;
440 int nlen;
441 unsigned int oldval;
442 unsigned int oldlenp;
443 unsigned int newval;
444 unsigned int newlen;
445 unsigned int __unused[4];
446};
447
448
449asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
450{
451 struct sysctl_ia32 a32;
452 mm_segment_t old_fs = get_fs();
453 void __user *oldvalp, *newvalp;
454 size_t oldlen;
455 int __user *namep;
456 long ret;
457
458 if (copy_from_user(&a32, args32, sizeof(a32)))
459 return -EFAULT;
460
461 /*
462 * We need to pre-validate these because we have to disable
463 * address checking before calling do_sysctl() because of
464 * OLDLEN but we can't run the risk of the user specifying bad
465 * addresses here. Well, since we're dealing with 32 bit
466 * addresses, we KNOW that access_ok() will always succeed, so
467 * this is an expensive NOP, but so what...
468 */
469 namep = compat_ptr(a32.name);
470 oldvalp = compat_ptr(a32.oldval);
471 newvalp = compat_ptr(a32.newval);
472
473 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
474 || !access_ok(VERIFY_WRITE, namep, 0)
475 || !access_ok(VERIFY_WRITE, oldvalp, 0)
476 || !access_ok(VERIFY_WRITE, newvalp, 0))
477 return -EFAULT;
478
479 set_fs(KERNEL_DS);
480 lock_kernel();
481 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
482 newvalp, (size_t) a32.newlen);
483 unlock_kernel();
484 set_fs(old_fs);
485
486 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
487 return -EFAULT;
488
489 return ret;
490}
491#endif
492
493/* warning: next two assume little endian */ 437/* warning: next two assume little endian */
494asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, 438asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
495 u32 poslo, u32 poshi) 439 u32 poslo, u32 poshi)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..60d2b2db0bc5 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
118extern unsigned long acpi_wakeup_address; 118extern unsigned long acpi_wakeup_address;
119 119
120/* early initialization routine */ 120/* early initialization routine */
121extern void acpi_reserve_bootmem(void); 121extern void acpi_reserve_wakeup_memory(void);
122 122
123/* 123/*
124 * Check if the CPU can handle C2 and deeper 124 * Check if the CPU can handle C2 and deeper
@@ -158,6 +158,7 @@ struct bootnode;
158 158
159#ifdef CONFIG_ACPI_NUMA 159#ifdef CONFIG_ACPI_NUMA
160extern int acpi_numa; 160extern int acpi_numa;
161extern int acpi_get_nodes(struct bootnode *physnodes);
161extern int acpi_scan_nodes(unsigned long start, unsigned long end); 162extern int acpi_scan_nodes(unsigned long start, unsigned long end);
162#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 163#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
163extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 164extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
9 9
10#define __read_mostly __attribute__((__section__(".data.read_mostly"))) 10#define __read_mostly __attribute__((__section__(".data.read_mostly")))
11 11
12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
14
12#ifdef CONFIG_X86_VSMP 15#ifdef CONFIG_X86_VSMP
13/* vSMP Internode cacheline shift */
14#define INTERNODE_CACHE_SHIFT (12)
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16#define __cacheline_aligned_in_smp \ 17#define __cacheline_aligned_in_smp \
17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 18 __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
18 __page_aligned_data 19 __page_aligned_data
19#endif 20#endif
20#endif 21#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
@@ -176,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
176#ifdef CONFIG_DEBUG_RODATA 177#ifdef CONFIG_DEBUG_RODATA
177void mark_rodata_ro(void); 178void mark_rodata_ro(void);
178extern const int rodata_test_data; 179extern const int rodata_test_data;
180extern int kernel_set_to_readonly;
179void set_kernel_text_rw(void); 181void set_kernel_text_rw(void);
180void set_kernel_text_ro(void); 182void set_kernel_text_ro(void);
181#else 183#else
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9cfc88b97742..613700f27a4a 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -248,6 +248,7 @@ extern const char * const x86_power_flags[32];
248#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 248#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
249#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 249#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
250#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 250#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
251#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
251 252
252#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 253#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
253# define cpu_has_invlpg 1 254# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..761249e396fe 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
61 struct e820entry map[E820_X_MAX]; 61 struct e820entry map[E820_X_MAX];
62}; 62};
63 63
64#define ISA_START_ADDRESS 0xa0000
65#define ISA_END_ADDRESS 0x100000
66
67#define BIOS_BEGIN 0x000a0000
68#define BIOS_END 0x00100000
69
64#ifdef __KERNEL__ 70#ifdef __KERNEL__
65/* see comment in arch/x86/kernel/e820.c */ 71/* see comment in arch/x86/kernel/e820.c */
66extern struct e820map e820; 72extern struct e820map e820;
@@ -126,15 +132,18 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 132extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 133extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 134extern char *default_machine_specific_memory_setup(void);
129#endif /* __KERNEL__ */
130#endif /* __ASSEMBLY__ */
131 135
132#define ISA_START_ADDRESS 0xa0000 136/*
133#define ISA_END_ADDRESS 0x100000 137 * Returns true iff the specified range [s,e) is completely contained inside
134#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) 138 * the ISA region.
139 */
140static inline bool is_ISA_range(u64 s, u64 e)
141{
142 return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
143}
135 144
136#define BIOS_BEGIN 0x000a0000 145#endif /* __KERNEL__ */
137#define BIOS_END 0x00100000 146#endif /* __ASSEMBLY__ */
138 147
139#ifdef __KERNEL__ 148#ifdef __KERNEL__
140#include <linux/ioport.h> 149#include <linux/ioport.h>
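For reference (illustrative only, not from the patch): with is_ISA_range() now a typed inline taking u64 bounds, a hypothetical caller checking whether a physical range lies entirely in the legacy ISA window reads as:

/* Skip extra memtype tracking for the legacy ISA/VGA window; the range
 * is treated as [paddr, paddr + size). */
static bool demo_range_is_legacy_isa(u64 paddr, u64 size)
{
	return is_ISA_range(paddr, paddr + size);
}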
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..8a024babe5e6 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
157 157
158#define compat_elf_check_arch(x) elf_check_arch_ia32(x) 158#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
159 159
160static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
161{
162 loadsegment(fs, 0);
163 loadsegment(ds, __USER32_DS);
164 loadsegment(es, __USER32_DS);
165 load_gs_index(0);
166 regs->ip = ip;
167 regs->sp = sp;
168 regs->flags = X86_EFLAGS_IF;
169 regs->cs = __USER32_CS;
170 regs->ss = __USER32_DS;
171}
172
173static inline void elf_common_init(struct thread_struct *t, 160static inline void elf_common_init(struct thread_struct *t,
174 struct pt_regs *regs, const u16 ds) 161 struct pt_regs *regs, const u16 ds)
175{ 162{
@@ -191,11 +178,8 @@ do { \
191#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
192 elf_common_init(&current->thread, regs, __USER_DS) 179 elf_common_init(&current->thread, regs, __USER_DS)
193 180
194#define compat_start_thread(regs, ip, sp) \ 181void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
195do { \ 182#define compat_start_thread start_thread_ia32
196 start_ia32_thread(regs, ip, sp); \
197 set_fs(USER_DS); \
198} while (0)
199 183
200#define COMPAT_SET_PERSONALITY(ex) \ 184#define COMPAT_SET_PERSONALITY(ex) \
201do { \ 185do { \
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index f5693c81a1db..8e8ec663a98f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
34 smp_invalidate_interrupt) 34 smp_invalidate_interrupt)
35#endif 35#endif
36 36
37BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) 37BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
38 38
39/* 39/*
40 * every pentium local APIC has two 'local interrupts', with a 40 * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 108eb6fd1ae7..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_pending_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..5d89fd2a3690 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -65,11 +65,12 @@
65/* hpet memory map physical address */ 65/* hpet memory map physical address */
66extern unsigned long hpet_address; 66extern unsigned long hpet_address;
67extern unsigned long force_hpet_address; 67extern unsigned long force_hpet_address;
68extern u8 hpet_blockid;
68extern int hpet_force_user; 69extern int hpet_force_user;
69extern int is_hpet_enabled(void); 70extern int is_hpet_enabled(void);
70extern int hpet_enable(void); 71extern int hpet_enable(void);
71extern void hpet_disable(void); 72extern void hpet_disable(void);
72extern unsigned long hpet_readl(unsigned long a); 73extern unsigned int hpet_readl(unsigned int a);
73extern void force_hpet_resume(void); 74extern void force_hpet_resume(void);
74 75
75extern void hpet_msi_unmask(unsigned int irq); 76extern void hpet_msi_unmask(unsigned int irq);
@@ -78,9 +79,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
78extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); 79extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
79 80
80#ifdef CONFIG_PCI_MSI 81#ifdef CONFIG_PCI_MSI
81extern int arch_setup_hpet_msi(unsigned int irq); 82extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
82#else 83#else
83static inline int arch_setup_hpet_msi(unsigned int irq) 84static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
84{ 85{
85 return -EINVAL; 86 return -EINVAL;
86} 87}
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6e124269fd4b..08c48a81841f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void perf_pending_interrupt(void);
33 33
@@ -119,7 +119,7 @@ extern void eisa_set_level_irq(unsigned int irq);
119/* SMP */ 119/* SMP */
120extern void smp_apic_timer_interrupt(struct pt_regs *); 120extern void smp_apic_timer_interrupt(struct pt_regs *);
121extern void smp_spurious_interrupt(struct pt_regs *); 121extern void smp_spurious_interrupt(struct pt_regs *);
122extern void smp_generic_interrupt(struct pt_regs *); 122extern void smp_x86_platform_ipi(struct pt_regs *);
123extern void smp_error_interrupt(struct pt_regs *); 123extern void smp_error_interrupt(struct pt_regs *);
124#ifdef CONFIG_X86_IO_APIC 124#ifdef CONFIG_X86_IO_APIC
125extern asmlinkage void smp_irq_move_cleanup_interrupt(void); 125extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb758f2..ebfb8a9e11f7 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
10#ifndef _ASM_X86_I387_H 10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H 11#define _ASM_X86_I387_H
12 12
13#ifndef __ASSEMBLY__
14
13#include <linux/sched.h> 15#include <linux/sched.h>
14#include <linux/kernel_stat.h> 16#include <linux/kernel_stat.h>
15#include <linux/regset.h> 17#include <linux/regset.h>
@@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
411 } 413 }
412} 414}
413 415
416#endif /* __ASSEMBLY__ */
417
418#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
419#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
420
414#endif /* _ASM_X86_I387_H */ 421#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..14cf526091f9
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,150 @@
1/*
2 * Generate .byte code for some instructions not supported by old
3 * binutils.
4 */
5#ifndef X86_ASM_INST_H
6#define X86_ASM_INST_H
7
8#ifdef __ASSEMBLY__
9
10 .macro XMM_NUM opd xmm
11 .ifc \xmm,%xmm0
12 \opd = 0
13 .endif
14 .ifc \xmm,%xmm1
15 \opd = 1
16 .endif
17 .ifc \xmm,%xmm2
18 \opd = 2
19 .endif
20 .ifc \xmm,%xmm3
21 \opd = 3
22 .endif
23 .ifc \xmm,%xmm4
24 \opd = 4
25 .endif
26 .ifc \xmm,%xmm5
27 \opd = 5
28 .endif
29 .ifc \xmm,%xmm6
30 \opd = 6
31 .endif
32 .ifc \xmm,%xmm7
33 \opd = 7
34 .endif
35 .ifc \xmm,%xmm8
36 \opd = 8
37 .endif
38 .ifc \xmm,%xmm9
39 \opd = 9
40 .endif
41 .ifc \xmm,%xmm10
42 \opd = 10
43 .endif
44 .ifc \xmm,%xmm11
45 \opd = 11
46 .endif
47 .ifc \xmm,%xmm12
48 \opd = 12
49 .endif
50 .ifc \xmm,%xmm13
51 \opd = 13
52 .endif
53 .ifc \xmm,%xmm14
54 \opd = 14
55 .endif
56 .ifc \xmm,%xmm15
57 \opd = 15
58 .endif
59 .endm
60
61 .macro PFX_OPD_SIZE
62 .byte 0x66
63 .endm
64
65 .macro PFX_REX opd1 opd2
66 .if (\opd1 | \opd2) & 8
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
68 .endif
69 .endm
70
71 .macro MODRM mod opd1 opd2
72 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
73 .endm
74
75 .macro PSHUFB_XMM xmm1 xmm2
76 XMM_NUM pshufb_opd1 \xmm1
77 XMM_NUM pshufb_opd2 \xmm2
78 PFX_OPD_SIZE
79 PFX_REX pshufb_opd1 pshufb_opd2
80 .byte 0x0f, 0x38, 0x00
81 MODRM 0xc0 pshufb_opd1 pshufb_opd2
82 .endm
83
84 .macro PCLMULQDQ imm8 xmm1 xmm2
85 XMM_NUM clmul_opd1 \xmm1
86 XMM_NUM clmul_opd2 \xmm2
87 PFX_OPD_SIZE
88 PFX_REX clmul_opd1 clmul_opd2
89 .byte 0x0f, 0x3a, 0x44
90 MODRM 0xc0 clmul_opd1 clmul_opd2
91 .byte \imm8
92 .endm
93
94 .macro AESKEYGENASSIST rcon xmm1 xmm2
95 XMM_NUM aeskeygen_opd1 \xmm1
96 XMM_NUM aeskeygen_opd2 \xmm2
97 PFX_OPD_SIZE
98 PFX_REX aeskeygen_opd1 aeskeygen_opd2
99 .byte 0x0f, 0x3a, 0xdf
100 MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
101 .byte \rcon
102 .endm
103
104 .macro AESIMC xmm1 xmm2
105 XMM_NUM aesimc_opd1 \xmm1
106 XMM_NUM aesimc_opd2 \xmm2
107 PFX_OPD_SIZE
108 PFX_REX aesimc_opd1 aesimc_opd2
109 .byte 0x0f, 0x38, 0xdb
110 MODRM 0xc0 aesimc_opd1 aesimc_opd2
111 .endm
112
113 .macro AESENC xmm1 xmm2
114 XMM_NUM aesenc_opd1 \xmm1
115 XMM_NUM aesenc_opd2 \xmm2
116 PFX_OPD_SIZE
117 PFX_REX aesenc_opd1 aesenc_opd2
118 .byte 0x0f, 0x38, 0xdc
119 MODRM 0xc0 aesenc_opd1 aesenc_opd2
120 .endm
121
122 .macro AESENCLAST xmm1 xmm2
123 XMM_NUM aesenclast_opd1 \xmm1
124 XMM_NUM aesenclast_opd2 \xmm2
125 PFX_OPD_SIZE
126 PFX_REX aesenclast_opd1 aesenclast_opd2
127 .byte 0x0f, 0x38, 0xdd
128 MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
129 .endm
130
131 .macro AESDEC xmm1 xmm2
132 XMM_NUM aesdec_opd1 \xmm1
133 XMM_NUM aesdec_opd2 \xmm2
134 PFX_OPD_SIZE
135 PFX_REX aesdec_opd1 aesdec_opd2
136 .byte 0x0f, 0x38, 0xde
137 MODRM 0xc0 aesdec_opd1 aesdec_opd2
138 .endm
139
140 .macro AESDECLAST xmm1 xmm2
141 XMM_NUM aesdeclast_opd1 \xmm1
142 XMM_NUM aesdeclast_opd2 \xmm2
143 PFX_OPD_SIZE
144 PFX_REX aesdeclast_opd1 aesdeclast_opd2
145 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm
148#endif
149
150#endif
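A small user-space sketch (not kernel code) of the byte layout these macros emit: PFX_REX and MODRM build the REX prefix and ModRM byte straight from the XMM register numbers, which is how the i387.h constants above end up with trailing bytes such as 0xc5 for PSHUFB_XMM5_XMM0 and 0xf5 for PSHUFB_XMM5_XMM6.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the MODRM macro: mod | r/m(opd1) | reg(opd2) << 3. */
static uint8_t modrm(uint8_t mod, unsigned int opd1, unsigned int opd2)
{
	return mod | (opd1 & 7) | ((opd2 & 7) << 3);
}

/* Mirror of PFX_REX: a REX byte is emitted only when %xmm8-%xmm15 is used. */
static int rex(unsigned int opd1, unsigned int opd2, uint8_t *out)
{
	if (!((opd1 | opd2) & 8))
		return 0;
	*out = 0x40 | ((opd1 & 8) >> 3) | ((opd2 & 8) >> 1);
	return 1;
}

int main(void)
{
	uint8_t r;

	/* pshufb %xmm5, %xmm0 -> 66 0f 38 00 c5 (PSHUFB_XMM5_XMM0) */
	printf("modrm(xmm5, xmm0) = %#x\n", modrm(0xc0, 5, 0));
	/* pshufb %xmm5, %xmm6 -> 66 0f 38 00 f5 (PSHUFB_XMM5_XMM6) */
	printf("modrm(xmm5, xmm6) = %#x\n", modrm(0xc0, 5, 6));
	/* an operand in %xmm12 needs REX.B: expect 0x41 */
	if (rex(12, 3, &r))
		printf("rex = %#x\n", r);
	return 0;
}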
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ffd700ff5dcb..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -37,7 +37,7 @@ extern void fixup_irqs(void);
37extern void irq_force_complete_move(int); 37extern void irq_force_complete_move(int);
38#endif 38#endif
39 39
40extern void (*generic_interrupt_extension)(void); 40extern void (*x86_platform_ipi_callback)(void);
41extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
42extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
43 43
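Illustrative only: the renamed hook is still a bare function pointer, so platform code registers a handler for the vector by assigning it; the handler below is hypothetical.

#include <asm/irq.h>

/* Hypothetical platform driver: invoked from smp_x86_platform_ipi()
 * whenever X86_PLATFORM_IPI_VECTOR fires on this CPU. */
static void demo_platform_ipi_handler(void)
{
	/* process the platform event here */
}

static void demo_register_platform_ipi(void)
{
	x86_platform_ipi_callback = demo_platform_ipi_handler;
}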
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..6a635bd39867 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,7 +106,7 @@
106/* 106/*
107 * Generic system vector for platform specific use 107 * Generic system vector for platform specific use
108 */ 108 */
109#define GENERIC_INTERRUPT_VECTOR 0xed 109#define X86_PLATFORM_IPI_VECTOR 0xed
110 110
111/* 111/*
112 * Performance monitoring pending work vector: 112 * Performance monitoring pending work vector:
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..f70e60071fe8 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,13 +4,16 @@
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6extern struct pci_device_id k8_nb_ids[]; 6extern struct pci_device_id k8_nb_ids[];
7struct bootnode;
7 8
8extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges; 10extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges; 11extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void); 12extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 13extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 14extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void);
14 17
15#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node) 19static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..950df434763f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE 20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS
22 24
23/* Architectural interrupt line count. */ 25/* Architectural interrupt line count. */
24#define KVM_NR_INTERRUPTS 256 26#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
79#define KVM_IRQCHIP_PIC_MASTER 0 81#define KVM_IRQCHIP_PIC_MASTER 0
80#define KVM_IRQCHIP_PIC_SLAVE 1 82#define KVM_IRQCHIP_PIC_SLAVE 1
81#define KVM_IRQCHIP_IOAPIC 2 83#define KVM_IRQCHIP_IOAPIC 2
84#define KVM_NR_IRQCHIPS 3
82 85
83/* for KVM_GET_REGS and KVM_SET_REGS */ 86/* for KVM_GET_REGS and KVM_SET_REGS */
84struct kvm_regs { 87struct kvm_regs {
@@ -250,4 +253,31 @@ struct kvm_reinject_control {
250 __u8 pit_reinject; 253 __u8 pit_reinject;
251 __u8 reserved[31]; 254 __u8 reserved[31];
252}; 255};
256
257/* for KVM_GET/SET_VCPU_EVENTS */
258struct kvm_vcpu_events {
259 struct {
260 __u8 injected;
261 __u8 nr;
262 __u8 has_error_code;
263 __u8 pad;
264 __u32 error_code;
265 } exception;
266 struct {
267 __u8 injected;
268 __u8 nr;
269 __u8 soft;
270 __u8 pad;
271 } interrupt;
272 struct {
273 __u8 injected;
274 __u8 pending;
275 __u8 masked;
276 __u8 pad;
277 } nmi;
278 __u32 sipi_vector;
279 __u32 flags;
280 __u32 reserved[10];
281};
282
253#endif /* _ASM_X86_KVM_H */ 283#endif /* _ASM_X86_KVM_H */
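The new __KVM_HAVE_VCPU_EVENTS define advertises the matching ioctl pair introduced with this structure; assuming KVM_GET_VCPU_EVENTS is available, a user-space VMM could read the pending exception/NMI state roughly as follows (error handling trimmed, vcpu_fd is a previously created vCPU descriptor):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static void dump_vcpu_events(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0) {
		perror("KVM_GET_VCPU_EVENTS");
		return;
	}
	printf("exception: injected=%u nr=%u, nmi: pending=%u masked=%u\n",
	       events.exception.injected, events.exception.nr,
	       events.nmi.pending, events.nmi.masked);
}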
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7c18e1230f54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -129,7 +129,7 @@ struct decode_cache {
129 u8 seg_override; 129 u8 seg_override;
130 unsigned int d; 130 unsigned int d;
131 unsigned long regs[NR_VCPU_REGS]; 131 unsigned long regs[NR_VCPU_REGS];
132 unsigned long eip; 132 unsigned long eip, eip_orig;
133 /* modrm */ 133 /* modrm */
134 u8 modrm; 134 u8 modrm;
135 u8 modrm_mod; 135 u8 modrm_mod;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d83892226f73..4f865e8b8540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
354 unsigned int time_offset; 354 unsigned int time_offset;
355 struct page *time_page; 355 struct page *time_page;
356 356
357 bool singlestep; /* guest is single stepped by KVM */
358 bool nmi_pending; 357 bool nmi_pending;
359 bool nmi_injected; 358 bool nmi_injected;
360 359
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
371 u64 mcg_status; 370 u64 mcg_status;
372 u64 mcg_ctl; 371 u64 mcg_ctl;
373 u64 *mce_banks; 372 u64 *mce_banks;
373
374 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs;
376 unsigned long singlestep_rip;
374}; 377};
375 378
376struct kvm_mem_alias { 379struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
397 struct kvm_pic *vpic; 400 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 401 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 402 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 unsigned int tss_addr; 405 unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
410 gpa_t ept_identity_map_addr; 412 gpa_t ept_identity_map_addr;
411 413
412 unsigned long irq_sources_bitmap; 414 unsigned long irq_sources_bitmap;
413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc; 415 u64 vm_init_tsc;
416 s64 kvmclock_offset;
417
418 struct kvm_xen_hvm_config xen_hvm_config;
415}; 419};
416 420
417struct kvm_vm_stat { 421struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
461struct kvm_x86_ops { 465struct kvm_x86_ops {
462 int (*cpu_has_kvm_support)(void); /* __init */ 466 int (*cpu_has_kvm_support)(void); /* __init */
463 int (*disabled_by_bios)(void); /* __init */ 467 int (*disabled_by_bios)(void); /* __init */
464 void (*hardware_enable)(void *dummy); /* __init */ 468 int (*hardware_enable)(void *dummy);
465 void (*hardware_disable)(void *dummy); 469 void (*hardware_disable)(void *dummy);
466 void (*check_processor_compatibility)(void *rtn); 470 void (*check_processor_compatibility)(void *rtn);
467 int (*hardware_setup)(void); /* __init */ 471 int (*hardware_setup)(void); /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
477 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 481 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 482 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 483
480 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 484 void (*set_guest_debug)(struct kvm_vcpu *vcpu,
481 struct kvm_guest_debug *dbg); 485 struct kvm_guest_debug *dbg);
482 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 486 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
483 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 487 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
484 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 488 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
506 510
507 void (*tlb_flush)(struct kvm_vcpu *vcpu); 511 void (*tlb_flush)(struct kvm_vcpu *vcpu);
508 512
509 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 513 void (*run)(struct kvm_vcpu *vcpu);
510 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 514 int (*handle_exit)(struct kvm_vcpu *vcpu);
511 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 515 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
512 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 516 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
513 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 517 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
519 bool has_error_code, u32 error_code); 523 bool has_error_code, u32 error_code);
520 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 524 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 525 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
526 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
527 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
522 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 528 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 529 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 530 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
568#define EMULTYPE_NO_DECODE (1 << 0) 574#define EMULTYPE_NO_DECODE (1 << 0)
569#define EMULTYPE_TRAP_UD (1 << 1) 575#define EMULTYPE_TRAP_UD (1 << 1)
570#define EMULTYPE_SKIP (1 << 2) 576#define EMULTYPE_SKIP (1 << 2)
571int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 577int emulate_instruction(struct kvm_vcpu *vcpu,
572 unsigned long cr2, u16 error_code, int emulation_type); 578 unsigned long cr2, u16 error_code, int emulation_type);
573void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
574void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
585 591
586struct x86_emulate_ctxt; 592struct x86_emulate_ctxt;
587 593
588int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 594int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
589 int size, unsigned port); 595 int size, unsigned port);
590int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 596int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
591 int size, unsigned long count, int down, 597 int size, unsigned long count, int down,
592 gva_t address, int rep, unsigned port); 598 gva_t address, int rep, unsigned port);
593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 599void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
616int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 622int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
617int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 623int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
618 624
625unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
626void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
627
619void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 628void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 629void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 630void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -802,4 +811,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
802int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 811int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
803int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 812int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
804 813
814void kvm_define_shared_msr(unsigned index, u32 msr);
815void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
816
805#endif /* _ASM_X86_KVM_HOST_H */ 817#endif /* _ASM_X86_KVM_HOST_H */
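As a sketch of how the new kvm_get_rflags()/kvm_set_rflags() accessors pair with the guest single-step fields added above (illustrative, not taken from the patch), arming single-stepping becomes a read-modify-write of the trap flag through the accessors:

#include <asm/processor-flags.h>

/* Hypothetical helper: set EFLAGS.TF in the guest via the accessor pair
 * rather than poking vendor-specific state directly. */
static void demo_arm_guest_singlestep(struct kvm_vcpu *vcpu)
{
	unsigned long rflags = kvm_get_rflags(vcpu);

	kvm_set_rflags(vcpu, rflags | X86_EFLAGS_TF);
}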
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..c24ca9a56458 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -12,6 +12,8 @@ struct device;
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; 12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13 13
14struct microcode_ops { 14struct microcode_ops {
15 void (*init)(struct device *device);
16 void (*fini)(void);
15 enum ucode_state (*request_microcode_user) (int cpu, 17 enum ucode_state (*request_microcode_user) (int cpu,
16 const void __user *buf, size_t size); 18 const void __user *buf, size_t size);
17 19
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 61d90b1331c3..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
71 71
72static inline void find_smp_config(void) 72static inline void find_smp_config(void)
73{ 73{
74 x86_init.mpparse.find_smp_config(1); 74 x86_init.mpparse.find_smp_config();
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80} 75}
81 76
82#ifdef CONFIG_X86_MPPARSE 77#ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else 84# else
90# define default_mpc_oem_bus_info NULL 85# define default_mpc_oem_bus_info NULL
91# endif 86# endif
92extern void default_find_smp_config(unsigned int reserve); 87extern void default_find_smp_config(void);
93extern void default_get_smp_config(unsigned int early); 88extern void default_get_smp_config(unsigned int early);
94#else 89#else
95static inline void early_reserve_e820_mpc_new(void) { } 90static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
97#define default_mpc_apic_id NULL 92#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL 93#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL 94#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop 95#define default_find_smp_config x86_init_noop
101#define default_get_smp_config x86_init_uint_noop 96#define default_get_smp_config x86_init_uint_noop
102#endif 97#endif
103 98
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..642fe34b36a2 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
49extern unsigned long init_memory_mapping(unsigned long start, 49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end); 50 unsigned long end);
51 51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); 52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
53 int acpi, int k8);
53extern void free_initmem(void); 54extern void free_initmem(void);
54 55
55#endif /* !__ASSEMBLY__ */ 56#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
16 16
17#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
18 18
19#include <asm/x86_init.h>
20
19/* 21/*
20 * ZERO_PAGE is a global shared page that is always zero: used 22 * ZERO_PAGE is a global shared page that is always zero: used
21 * for zero-mapped memory areas etc.. 23 * for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
270 unsigned long new_flags) 272 unsigned long new_flags)
271{ 273{
272 /* 274 /*
273 * PAT type is always WB for ISA. So no need to check. 275 * PAT type is always WB for untracked ranges, so no need to check.
274 */ 276 */
275 if (is_ISA_range(paddr, paddr + size - 1)) 277 if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
276 return 1; 278 return 1;
277 279
278 /* 280 /*
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..4009f6534f52 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,18 +5,19 @@
5 5
6/* misc architecture specific prototypes */ 6/* misc architecture specific prototypes */
7 7
8extern void early_idt_handler(void); 8void early_idt_handler(void);
9 9
10extern void system_call(void); 10void system_call(void);
11extern void syscall_init(void); 11void syscall_init(void);
12 12
13extern void ia32_syscall(void); 13void ia32_syscall(void);
14extern void ia32_cstar_target(void); 14void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void); 15void ia32_sysenter_target(void);
16 16
17extern void syscall32_cpu_init(void); 17void syscall32_cpu_init(void);
18 18
19extern void check_efer(void); 19void x86_configure_nx(void);
20void x86_report_nx(void);
20 21
21extern int reboot_force; 22extern int reboot_force;
22 23
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
2#define _ASM_X86_SECTIONS_H 2#define _ASM_X86_SECTIONS_H
3 3
4#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5#include <asm/uaccess.h>
5 6
6extern char __brk_base[], __brk_limit[]; 7extern char __brk_base[], __brk_limit[];
8extern struct exception_table_entry __stop___ex_table[];
9
10#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
11extern char __end_rodata_hpage_align[];
12#endif
7 13
8#endif /* _ASM_X86_SECTIONS_H */ 14#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..1fecb7e61130 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
57 u16 intercept_dr_write; 57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 58 u32 intercept_exceptions;
59 u64 intercept; 59 u64 intercept;
60 u8 reserved_1[44]; 60 u8 reserved_1[42];
61 u16 pause_filter_count;
61 u64 iopm_base_pa; 62 u64 iopm_base_pa;
62 u64 msrpm_base_pa; 63 u64 msrpm_base_pa;
63 u64 tsc_offset; 64 u64 tsc_offset;
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..9af9decb38c3 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -51,11 +51,6 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); 51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); 52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
53 53
54#ifdef CONFIG_SYSCTL_SYSCALL
55struct sysctl_ia32;
56asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
57#endif
58
59asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); 54asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
60asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); 55asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
61 56
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
84#define TIF_SECCOMP 8 /* secure computing */ 84#define TIF_SECCOMP 8 /* secure computing */
85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
86#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
86#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
87#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
88#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 108#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
108#define _TIF_SECCOMP (1 << TIF_SECCOMP) 109#define _TIF_SECCOMP (1 << TIF_SECCOMP)
109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
110#define _TIF_NOTSC (1 << TIF_NOTSC) 112#define _TIF_NOTSC (1 << TIF_NOTSC)
111#define _TIF_IA32 (1 << TIF_IA32) 113#define _TIF_IA32 (1 << TIF_IA32)
112#define _TIF_FORK (1 << TIF_FORK) 114#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
142 144
143/* Only used for 64 bit */ 145/* Only used for 64 bit */
144#define _TIF_DO_NOTIFY_MASK \ 146#define _TIF_DO_NOTIFY_MASK \
145 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 147 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
148 _TIF_USER_RETURN_NOTIFY)
146 149
147/* flags to check in __switch_to() */ 150/* flags to check in __switch_to() */
148#define _TIF_WORK_CTXSW \ 151#define _TIF_WORK_CTXSW \
149 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 152 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
150 153
151#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 154#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
152#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 155#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
153 156
154#define PREEMPT_ACTIVE 0x10000000 157#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
345 346
346#ifdef __KERNEL__ 347#ifdef __KERNEL__
347 348
348#define NR_syscalls 337 349#define NR_syscalls 338
349 350
350#define __ARCH_WANT_IPC_PARSE_VERSION 351#define __ARCH_WANT_IPC_PARSE_VERSION
351#define __ARCH_WANT_OLD_READDIR 352#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_event_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
664 666
665#ifndef __NO_STUBS 667#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR 668#define __ARCH_WANT_OLD_READDIR
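For completeness, a hedged user-space sketch of exercising the newly wired recvmmsg system call via syscall(2) on x86-64; glibc of this era does not wrap it yet, and struct mmsghdr may need a local definition on older headers:

#define _GNU_SOURCE
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_recvmmsg
#define __NR_recvmmsg 299	/* x86-64 number assigned in this patch */
#endif

/* Receive up to vlen datagrams in one call; returns the count or -1. */
static int demo_recvmmsg(int sock, struct mmsghdr *vec, unsigned int vlen)
{
	return syscall(__NR_recvmmsg, sock, vec, vlen, 0, NULL);
}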
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..2b4945419a84 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -56,6 +56,7 @@
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
59#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
59 60
60 61
61#define PIN_BASED_EXT_INTR_MASK 0x00000001 62#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
144 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, 145 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
145 TPR_THRESHOLD = 0x0000401c, 146 TPR_THRESHOLD = 0x0000401c,
146 SECONDARY_VM_EXEC_CONTROL = 0x0000401e, 147 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
148 PLE_GAP = 0x00004020,
149 PLE_WINDOW = 0x00004022,
147 VM_INSTRUCTION_ERROR = 0x00004400, 150 VM_INSTRUCTION_ERROR = 0x00004400,
148 VM_EXIT_REASON = 0x00004402, 151 VM_EXIT_REASON = 0x00004402,
149 VM_EXIT_INTR_INFO = 0x00004404, 152 VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
248#define EXIT_REASON_MSR_READ 31 251#define EXIT_REASON_MSR_READ 31
249#define EXIT_REASON_MSR_WRITE 32 252#define EXIT_REASON_MSR_WRITE 32
250#define EXIT_REASON_MWAIT_INSTRUCTION 36 253#define EXIT_REASON_MWAIT_INSTRUCTION 36
254#define EXIT_REASON_PAUSE_INSTRUCTION 40
251#define EXIT_REASON_MCE_DURING_VMENTRY 41 255#define EXIT_REASON_MCE_DURING_VMENTRY 41
252#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
253#define EXIT_REASON_APIC_ACCESS 44 257#define EXIT_REASON_APIC_ACCESS 44
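Illustrative of how the new pause-loop-exiting fields fit together (a sketch, not the actual KVM change): once SECONDARY_EXEC_PAUSE_LOOP_EXITING is set, PLE_GAP and PLE_WINDOW decide when a spinning guest triggers EXIT_REASON_PAUSE_INSTRUCTION. The vmcs_write32() helper and the two demo values below are assumptions.

/* Hypothetical fragment of vmx.c-style setup code. */
#define DEMO_PLE_GAP		41	/* demo value, not the upstream default */
#define DEMO_PLE_WINDOW		4096	/* demo value, not the upstream default */

static void demo_enable_ple(u32 *secondary_exec_ctl)
{
	*secondary_exec_ctl |= SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	vmcs_write32(PLE_GAP, DEMO_PLE_GAP);
	vmcs_write32(PLE_WINDOW, DEMO_PLE_WINDOW);
}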
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index d8e71459f025..ea0e8ea15e15 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc); 26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m); 27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); 28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve); 29 void (*find_smp_config)(void);
30 void (*get_smp_config)(unsigned int early); 30 void (*get_smp_config)(unsigned int early);
31}; 31};
32 32
@@ -125,12 +125,14 @@ struct x86_cpuinit_ops {
125 * @calibrate_tsc: calibrate TSC 125 * @calibrate_tsc: calibrate TSC
126 * @get_wallclock: get time from HW clock like RTC etc. 126 * @get_wallclock: get time from HW clock like RTC etc.
127 * @set_wallclock: set time back to HW clock 127 * @set_wallclock: set time back to HW clock
 128 * @is_untracked_pat_range: exclude from PAT logic
128 */ 129 */
129struct x86_platform_ops { 130struct x86_platform_ops {
130 unsigned long (*calibrate_tsc)(void); 131 unsigned long (*calibrate_tsc)(void);
131 unsigned long (*get_wallclock)(void); 132 unsigned long (*get_wallclock)(void);
132 int (*set_wallclock)(unsigned long nowtime); 133 int (*set_wallclock)(unsigned long nowtime);
133 void (*iommu_shutdown)(void); 134 void (*iommu_shutdown)(void);
135 bool (*is_untracked_pat_range)(u64 start, u64 end);
134}; 136};
135 137
136extern struct x86_init_ops x86_init; 138extern struct x86_init_ops x86_init;
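A minimal sketch of wiring the new hook (the default shown here is an assumption, not quoted from the patch): platforms that need nothing special can point it at is_ISA_range(), while SGI UV overrides it further down in this merge.

#include <asm/e820.h>
#include <asm/x86_init.h>

/* Presumed default: only the legacy ISA window is exempt from PAT
 * tracking unless a platform installs its own hook. */
static bool demo_default_untracked_pat_range(u64 start, u64 end)
{
	return is_ISA_range(start, end);
}

static void demo_wire_default_pat_hook(void)
{
	x86_platform.is_untracked_pat_range = demo_default_untracked_pat_range;
}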
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..87eee07da21f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
624 } 624 }
625 625
626 hpet_address = hpet_tbl->address.address; 626 hpet_address = hpet_tbl->address.address;
627 hpet_blockid = hpet_tbl->sequence;
627 628
628 /* 629 /*
629 * Some broken BIOSes advertise HPET at 0x0. We really do not 630 * Some broken BIOSes advertise HPET at 0x0. We really do not
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..82e508677b91 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad8c75b9e453..efb2b9cd132c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -647,7 +647,7 @@ static int __init calibrate_APIC_clock(void)
647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
648 648
649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
650 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 650 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
652 calibration_result); 652 calibration_result);
653 653
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c0b4468683f9..d5d498fbee4b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3267,7 +3267,8 @@ void destroy_irq(unsigned int irq)
3267 * MSI message composition 3267 * MSI message composition
3268 */ 3268 */
3269#ifdef CONFIG_PCI_MSI 3269#ifdef CONFIG_PCI_MSI
3270static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3270static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3271 struct msi_msg *msg, u8 hpet_id)
3271{ 3272{
3272 struct irq_cfg *cfg; 3273 struct irq_cfg *cfg;
3273 int err; 3274 int err;
@@ -3301,7 +3302,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3301 irte.dest_id = IRTE_DEST(dest); 3302 irte.dest_id = IRTE_DEST(dest);
3302 3303
3303 /* Set source-id of interrupt request */ 3304 /* Set source-id of interrupt request */
3304 set_msi_sid(&irte, pdev); 3305 if (pdev)
3306 set_msi_sid(&irte, pdev);
3307 else
3308 set_hpet_sid(&irte, hpet_id);
3305 3309
3306 modify_irte(irq, &irte); 3310 modify_irte(irq, &irte);
3307 3311
@@ -3466,7 +3470,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3466 int ret; 3470 int ret;
3467 struct msi_msg msg; 3471 struct msi_msg msg;
3468 3472
3469 ret = msi_compose_msg(dev, irq, &msg); 3473 ret = msi_compose_msg(dev, irq, &msg, -1);
3470 if (ret < 0) 3474 if (ret < 0)
3471 return ret; 3475 return ret;
3472 3476
@@ -3599,7 +3603,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3599 int ret; 3603 int ret;
3600 struct msi_msg msg; 3604 struct msi_msg msg;
3601 3605
3602 ret = msi_compose_msg(NULL, irq, &msg); 3606 ret = msi_compose_msg(NULL, irq, &msg, -1);
3603 if (ret < 0) 3607 if (ret < 0)
3604 return ret; 3608 return ret;
3605 dmar_msi_write(irq, &msg); 3609 dmar_msi_write(irq, &msg);
@@ -3639,6 +3643,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3639 3643
3640#endif /* CONFIG_SMP */ 3644#endif /* CONFIG_SMP */
3641 3645
3646static struct irq_chip ir_hpet_msi_type = {
3647 .name = "IR-HPET_MSI",
3648 .unmask = hpet_msi_unmask,
3649 .mask = hpet_msi_mask,
3650#ifdef CONFIG_INTR_REMAP
3651 .ack = ir_ack_apic_edge,
3652#ifdef CONFIG_SMP
3653 .set_affinity = ir_set_msi_irq_affinity,
3654#endif
3655#endif
3656 .retrigger = ioapic_retrigger_irq,
3657};
3658
3642static struct irq_chip hpet_msi_type = { 3659static struct irq_chip hpet_msi_type = {
3643 .name = "HPET_MSI", 3660 .name = "HPET_MSI",
3644 .unmask = hpet_msi_unmask, 3661 .unmask = hpet_msi_unmask,
@@ -3650,20 +3667,36 @@ static struct irq_chip hpet_msi_type = {
3650 .retrigger = ioapic_retrigger_irq, 3667 .retrigger = ioapic_retrigger_irq,
3651}; 3668};
3652 3669
3653int arch_setup_hpet_msi(unsigned int irq) 3670int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3654{ 3671{
3655 int ret; 3672 int ret;
3656 struct msi_msg msg; 3673 struct msi_msg msg;
3657 struct irq_desc *desc = irq_to_desc(irq); 3674 struct irq_desc *desc = irq_to_desc(irq);
3658 3675
3659 ret = msi_compose_msg(NULL, irq, &msg); 3676 if (intr_remapping_enabled) {
3677 struct intel_iommu *iommu = map_hpet_to_ir(id);
3678 int index;
3679
3680 if (!iommu)
3681 return -1;
3682
3683 index = alloc_irte(iommu, irq, 1);
3684 if (index < 0)
3685 return -1;
3686 }
3687
3688 ret = msi_compose_msg(NULL, irq, &msg, id);
3660 if (ret < 0) 3689 if (ret < 0)
3661 return ret; 3690 return ret;
3662 3691
3663 hpet_msi_write(irq, &msg); 3692 hpet_msi_write(irq, &msg);
3664 desc->status |= IRQ_MOVE_PCNTXT; 3693 desc->status |= IRQ_MOVE_PCNTXT;
3665 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3694 if (irq_remapped(irq))
3666 "edge"); 3695 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3696 handle_edge_irq, "edge");
3697 else
3698 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3699 handle_edge_irq, "edge");
3667 3700
3668 return 0; 3701 return 0;
3669} 3702}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 07cdbdcd7a92..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 130c4b934877..b684bb303cbf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/ipi.h> 31#include <asm/ipi.h>
32#include <asm/smp.h> 32#include <asm/smp.h>
33#include <asm/x86_init.h>
33 34
34DEFINE_PER_CPU(int, x2apic_extra_bits); 35DEFINE_PER_CPU(int, x2apic_extra_bits);
35 36
36static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr;
39
40static inline bool is_GRU_range(u64 start, u64 end)
41{
42 return start >= gru_start_paddr && end <= gru_end_paddr;
43}
44
45static bool uv_is_untracked_pat_range(u64 start, u64 end)
46{
47 return is_ISA_range(start, end) || is_GRU_range(start, end);
48}
37 49
38static int early_get_nodeid(void) 50static int early_get_nodeid(void)
39{ 51{
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 62{
51 if (!strcmp(oem_id, "SGI")) { 63 if (!strcmp(oem_id, "SGI")) {
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
52 if (!strcmp(oem_table_id, "UVL")) 65 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 66 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 67 else if (!strcmp(oem_table_id, "UVX"))
@@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 398 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 399
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 400 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 401 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 402 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
403 gru_start_paddr = ((u64)gru.s.base << shift);
404 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
405
406 }
390} 407}
391 408
392static __init void map_mmr_high(int max_pnode) 409static __init void map_mmr_high(int max_pnode)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a4ec8b647544..c1afa990a6c8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1136 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1137 barrier();
1138 1138
1139 check_efer(); 1139 x86_configure_nx();
1140 if (cpu != 0) 1140 if (cpu != 0)
1141 enable_x2apic(); 1141 enable_x2apic();
1142 1142
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..c900b73f9224 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 263 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 264 for now. */
265 node = apicid_to_node[apicid]; 265 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 266 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 267 node = first_node(node_online_map);
268 else if (!node_online(node)) {
269 /* reuse the value from init_cpu_to_node() */
270 node = cpu_to_node(cpu);
271 }
268 numa_set_node(cpu, node); 272 numa_set_node(cpu, node);
269 273
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); 274 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 73c86db5acbe..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
170 return start1 - start2; 170 return start1 - start2;
171} 171}
172 172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
173#define BIOS_BUG_MSG KERN_WARNING \ 208#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 210
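A standalone user-space analog of clean_sort_range() may make the compaction step clearer (an illustration built on qsort(), not the kernel code): entries whose end is zero are swapped to the tail first, so only live ranges are counted and sorted.

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long start, end; };

static int cmp_start(const void *a, const void *b)
{
	const struct range *r1 = a, *r2 = b;

	return (r1->start > r2->start) - (r1->start < r2->start);
}

/* Move entries with end == 0 to the tail, then sort the live ones. */
static int demo_clean_sort(struct range *r, int n)
{
	int i, j, k = n - 1, live = 0;

	for (i = 0; i < k; i++) {
		if (r[i].end)
			continue;
		for (j = k; j > i; j--)
			if (r[j].end)
				break;
		if (j == i)
			break;
		r[i] = r[j];
		r[j].start = r[j].end = 0;
		k = j - 1;
	}
	for (i = 0; i < n; i++)
		if (r[i].end)
			live = i + 1;
	qsort(r, live, sizeof(*r), cmp_start);
	return live;
}

int main(void)
{
	struct range r[] = { {0, 0}, {0x200, 0x2ff}, {0, 0}, {0x100, 0x1ff} };
	int i, n = demo_clean_sort(r, 4);

	for (i = 0; i < n; i++)	/* prints 0x100-0x1ff then 0x200-0x2ff */
		printf("%#lx-%#lx\n", r[i].start, r[i].end);
	return 0;
}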
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 subtract_range(range, extra_remove_base, 258 subtract_range(range, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 259 extra_remove_base + extra_remove_size - 1);
225 260
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 261 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 262 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end)
265 continue;
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
237 range[i].start, range[i].end + 1); 267 range[i].start, range[i].end + 1);
268 }
238 } 269 }
239 270
240 /* sort the ranges */ 271 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 272 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 273 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 274 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 275 for (i = 0; i < nr_range; i++)
@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 720 continue;
690 if (!size) 721 if (!size)
691 type = MTRR_NUM_TYPES; 722 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 723 num[type]++;
695 } 724 }
696 725
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4deb8fc849dd..63bca794c8f9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -977,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
977#endif 977#endif
978apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
979 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
980apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
981 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
982 982
983#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5a1b9758fd62..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -189,9 +189,26 @@ static void wait_for_nmi(void)
189 nmi_wait_count++; 189 nmi_wait_count++;
190} 190}
191 191
192static inline int
193within(unsigned long addr, unsigned long start, unsigned long end)
194{
195 return addr >= start && addr < end;
196}
197
192static int 198static int
193do_ftrace_mod_code(unsigned long ip, void *new_code) 199do_ftrace_mod_code(unsigned long ip, void *new_code)
194{ 200{
201 /*
202 * On x86_64, kernel text mappings are mapped read-only with
203 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
204 * of the kernel text mapping to modify the kernel text.
205 *
 206 * For 32bit kernels, these mappings are the same and we can use
207 * kernel identity mapping to modify code.
208 */
209 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
210 ip = (unsigned long)__va(__pa(ip));
211
195 mod_code_ip = (void *)ip; 212 mod_code_ip = (void *)ip;
196 mod_code_newcode = new_code; 213 mod_code_newcode = new_code;
197 214
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..7fd318bac59c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
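The same EFER.NX toggle expressed in C with the kernel's MSR helpers (a sketch of what the assembly above does, not a quotation of x86_configure_nx()):

#include <asm/cpufeature.h>
#include <asm/msr.h>
#include <asm/msr-index.h>

/* Set the No-Execute enable bit in EFER when the CPU advertises NX. */
static void demo_enable_nx(void)
{
	u64 efer;

	if (!cpu_has_nx)
		return;
	rdmsrl(MSR_EFER, efer);
	efer |= EFER_NX;
	wrmsrl(MSR_EFER, efer);
}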
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 22db86a37643..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ba6e65884603 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,7 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */
36#ifdef CONFIG_PCI_MSI 37#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 38static unsigned long hpet_num_timers;
38#endif 39#endif
@@ -47,12 +48,12 @@ struct hpet_dev {
47 char name[10]; 48 char name[10];
48}; 49};
49 50
50unsigned long hpet_readl(unsigned long a) 51inline unsigned int hpet_readl(unsigned int a)
51{ 52{
52 return readl(hpet_virt_address + a); 53 return readl(hpet_virt_address + a);
53} 54}
54 55
55static inline void hpet_writel(unsigned long d, unsigned long a) 56static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 57{
57 writel(d, hpet_virt_address + a); 58 writel(d, hpet_virt_address + a);
58} 59}
@@ -167,7 +168,7 @@ do { \
167 168
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 169static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 170
170static void hpet_reserve_platform_timers(unsigned long id) 171static void hpet_reserve_platform_timers(unsigned int id)
171{ 172{
172 struct hpet __iomem *hpet = hpet_virt_address; 173 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 174 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +206,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 206
206} 207}
207#else 208#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 209static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 210#endif
210 211
211/* 212/*
@@ -246,7 +247,7 @@ static void hpet_reset_counter(void)
246 247
247static void hpet_start_counter(void) 248static void hpet_start_counter(void)
248{ 249{
249 unsigned long cfg = hpet_readl(HPET_CFG); 250 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 251 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 252 hpet_writel(cfg, HPET_CFG);
252} 253}
@@ -271,7 +272,7 @@ static void hpet_resume_counter(void)
271 272
272static void hpet_enable_legacy_int(void) 273static void hpet_enable_legacy_int(void)
273{ 274{
274 unsigned long cfg = hpet_readl(HPET_CFG); 275 unsigned int cfg = hpet_readl(HPET_CFG);
275 276
276 cfg |= HPET_CFG_LEGACY; 277 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 278 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +315,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 315static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 316 struct clock_event_device *evt, int timer)
316{ 317{
317 unsigned long cfg, cmp, now; 318 unsigned int cfg, cmp, now;
318 uint64_t delta; 319 uint64_t delta;
319 320
320 switch (mode) { 321 switch (mode) {
@@ -323,7 +324,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 324 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 325 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 326 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 327 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 328 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 329 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 330 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +340,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 340 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 341 * Publication # 24674)
341 */ 342 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 343 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 344 hpet_start_counter();
344 hpet_print_config(); 345 hpet_print_config();
345 break; 346 break;
@@ -383,13 +384,24 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 384 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 385
385 /* 386 /*
386 * We need to read back the CMP register to make sure that 387 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 388 * implementations (ATI chipsets) which seem to delay the
388 * counter. 389 * transfer of the compare register into the internal compare
390 * logic. With small deltas this might actually be too late as
391 * the counter could already be higher than the compare value
392 * at that point and we would wait for the next hpet interrupt
393 * forever. We found out that reading the CMP register back
394 * forces the transfer so we can rely on the comparison with
395 * the counter register below. If the read back from the
396 * compare register does not match the value we programmed
397 * then we might have a real hardware problem. We can not do
398 * much about it here, but at least alert the user/admin with
399 * a prominent warning.
389 */ 400 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 401 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
402 KERN_WARNING "hpet: compare register read back failed.\n");
391 403
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 404 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 405}
394 406
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 407static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +427,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 427void hpet_msi_unmask(unsigned int irq)
416{ 428{
417 struct hpet_dev *hdev = get_irq_data(irq); 429 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 430 unsigned int cfg;
419 431
420 /* unmask it */ 432 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 433 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +437,7 @@ void hpet_msi_unmask(unsigned int irq)
425 437
426void hpet_msi_mask(unsigned int irq) 438void hpet_msi_mask(unsigned int irq)
427{ 439{
428 unsigned long cfg; 440 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 441 struct hpet_dev *hdev = get_irq_data(irq);
430 442
431 /* mask it */ 443 /* mask it */
@@ -467,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 479
468static int hpet_setup_msi_irq(unsigned int irq) 480static int hpet_setup_msi_irq(unsigned int irq)
469{ 481{
470 if (arch_setup_hpet_msi(irq)) { 482 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 483 destroy_irq(irq);
472 return -EINVAL; 484 return -EINVAL;
473 } 485 }
@@ -584,6 +596,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 596 unsigned int num_timers_used = 0;
585 int i; 597 int i;
586 598
599 if (boot_cpu_has(X86_FEATURE_ARAT))
600 return;
587 id = hpet_readl(HPET_ID); 601 id = hpet_readl(HPET_ID);
588 602
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 603 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +612,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 612
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 613 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 614 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 615 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 616
603 /* Only consider HPET timer with MSI support */ 617 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 618 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +827,7 @@ static int hpet_clocksource_register(void)
813 */ 827 */
814int __init hpet_enable(void) 828int __init hpet_enable(void)
815{ 829{
816 unsigned long id; 830 unsigned int id;
817 int i; 831 int i;
818 832
819 if (!is_hpet_capable()) 833 if (!is_hpet_capable())
@@ -872,10 +886,8 @@ int __init hpet_enable(void)
872 886
873 if (id & HPET_ID_LEGSUP) { 887 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 888 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 889 return 1;
877 } 890 }
878 hpet_msi_capability_lookup(0);
879 return 0; 891 return 0;
880 892
881out_nohpet: 893out_nohpet:
@@ -908,9 +920,17 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 920 if (!hpet_virt_address)
909 return -ENODEV; 921 return -ENODEV;
910 922
923 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
924 hpet_msi_capability_lookup(2);
925 else
926 hpet_msi_capability_lookup(0);
927
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 928 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 929 hpet_print_config();
913 930
931 if (boot_cpu_has(X86_FEATURE_ARAT))
932 return 0;
933
914 for_each_online_cpu(cpu) { 934 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 935 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 936 }
@@ -925,7 +945,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 945void hpet_disable(void)
926{ 946{
927 if (is_hpet_capable()) { 947 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 948 unsigned int cfg = hpet_readl(HPET_CFG);
929 949
930 if (hpet_legacy_int_enabled) { 950 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 951 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +985,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 985static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 986static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 987static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 988static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 989static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 990static unsigned long hpet_pie_limit;
971 991
972static rtc_irq_handler irq_handler; 992static rtc_irq_handler irq_handler;
@@ -1017,7 +1037,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1037 */
1018int hpet_rtc_timer_init(void) 1038int hpet_rtc_timer_init(void)
1019{ 1039{
1020 unsigned long cfg, cnt, delta, flags; 1040 unsigned int cfg, cnt, delta;
1041 unsigned long flags;
1021 1042
1022 if (!is_hpet_enabled()) 1043 if (!is_hpet_enabled())
1023 return 0; 1044 return 0;
@@ -1027,7 +1048,7 @@ int hpet_rtc_timer_init(void)
1027 1048
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1049 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1050 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1051 hpet_default_delta = clc;
1031 } 1052 }
1032 1053
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1054 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1134,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1134 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1135 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1136 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1137 hpet_pie_delta = clc;
1117 } 1138 }
1118 return 1; 1139 return 1;
1119} 1140}
@@ -1127,7 +1148,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1148
1128static void hpet_rtc_timer_reinit(void) 1149static void hpet_rtc_timer_reinit(void)
1129{ 1150{
1130 unsigned long cfg, delta; 1151 unsigned int cfg, delta;
1131 int lost_ints = -1; 1152 int lost_ints = -1;
1132 1153
1133 if (unlikely(!hpet_rtc_flags)) { 1154 if (unlikely(!hpet_rtc_flags)) {
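
Two idioms in the hpet_next_event() hunk are worth spelling out. The compare register is read back and checked with WARN_ONCE() because, as the new comment explains, some implementations (ATI chipsets are cited) only transfer the written value into the compare logic once it is read back. And the final return uses a wrap-safe comparison so the free-running 32-bit counter is handled correctly across rollover. A self-contained demonstration of the wrap-safe test (names and sample values are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    /*
     * "Has the free-running counter already reached or passed the compare
     * value?"  Doing the subtraction in 32 bits and testing the sign gives
     * the right answer even across a counter wrap, provided the two values
     * are less than 2^31 ticks apart.
     */
    static int deadline_passed(uint32_t counter, uint32_t cmp)
    {
            return (int32_t)(counter - cmp) >= 0;
    }

    int main(void)
    {
            printf("%d\n", deadline_passed(0xfffffff0u, 0x00000010u)); /* 0: cmp is ahead, across the wrap */
            printf("%d\n", deadline_passed(0x00000011u, 0x00000010u)); /* 1: already passed */
            printf("%d\n", deadline_passed(0x00000005u, 0xfffffff0u)); /* 1: passed, counter wrapped */
            return 0;
    }

The same (s32)(a - b) >= 0 pattern is what the hunk's return statement uses to decide between -ETIME and success.
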
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index fee6cc2b2079..664bcb7384ac 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -187,8 +187,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
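
The rename makes the intent explicit: smp_x86_platform_ipi() simply calls one global hook, so whichever platform driver owns X86_PLATFORM_IPI_VECTOR installs its handler in x86_platform_ipi_callback (the UV RTC code further down in this merge does exactly that). A kernel-style sketch of the registration pattern; the handler and init function names are made up, and the extern declaration is assumed to live in <asm/irq.h>:

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <asm/irq.h>   /* extern void (*x86_platform_ipi_callback)(void); */

    static void my_platform_ipi_handler(void)
    {
            /* runs in hard interrupt context when X86_PLATFORM_IPI_VECTOR fires */
    }

    static int __init my_platform_driver_init(void)
    {
            /* the vector has a single owner; refuse to steal it */
            if (x86_platform_ipi_callback)
                    return -EBUSY;

            x86_platform_ipi_callback = my_platform_ipi_handler;
            return 0;
    }
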
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c843f8406da2..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -158,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
158{ 158{
159 int error; 159 int error;
160 160
161 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
162 set_pages_x(image->control_code_page, 1);
163 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
164 if (error) 163 if (error)
165 return error; 164 return error;
@@ -173,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
173 */ 172 */
174void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
175{ 174{
176 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
177 set_pages_nx(image->control_code_page, 1);
178 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
179} 177}
180 178
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index f4c538b681ca..63123d902103 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -33,6 +33,9 @@ MODULE_LICENSE("GPL v2");
33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
34#define UCODE_UCODE_TYPE 0x00000001 34#define UCODE_UCODE_TYPE 0x00000001
35 35
36const struct firmware *firmware;
37static int supported_cpu;
38
36struct equiv_cpu_entry { 39struct equiv_cpu_entry {
37 u32 installed_cpu; 40 u32 installed_cpu;
38 u32 fixed_errata_mask; 41 u32 fixed_errata_mask;
@@ -71,17 +74,14 @@ static struct equiv_cpu_entry *equiv_cpu_table;
71 74
72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 75static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
73{ 76{
74 struct cpuinfo_x86 *c = &cpu_data(cpu);
75 u32 dummy; 77 u32 dummy;
76 78
77 memset(csig, 0, sizeof(*csig)); 79 if (!supported_cpu)
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86);
81 return -1; 80 return -1;
82 } 81
82 memset(csig, 0, sizeof(*csig));
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 84 pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 85 return 0;
86} 86}
87 87
@@ -103,22 +103,15 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 103 i++;
104 } 104 }
105 105
106 if (!equiv_cpu_id) { 106 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 107 return 0;
110 }
111 108
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 109 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 110 return 0;
117 }
118 111
119 /* ucode might be chipset specific -- currently we don't support this */ 112 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 113 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 114 pr_err(KERN_ERR "microcode: CPU%d: loading of chipset "
122 "specific code not yet supported\n", cpu); 115 "specific code not yet supported\n", cpu);
123 return 0; 116 return 0;
124 } 117 }
@@ -148,14 +141,12 @@ static int apply_microcode_amd(int cpu)
148 141
149 /* check current patch id and patch's id for match */ 142 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 143 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 144 pr_err("microcode: CPU%d: update failed "
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 145 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
153 return -1; 146 return -1;
154 } 147 }
155 148
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 149 pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 150 uci->cpu_sig.rev = rev;
160 151
161 return 0; 152 return 0;
@@ -178,18 +169,15 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 169 return NULL;
179 170
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 171 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 172 pr_err("microcode: error: invalid type field in "
182 "container file section header\n"); 173 "container file section header\n");
183 return NULL; 174 return NULL;
184 } 175 }
185 176
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 177 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 178
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 179 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 180 pr_err("microcode: error: size mismatch\n");
193 return NULL; 181 return NULL;
194 } 182 }
195 183
@@ -218,15 +206,14 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 206 size = buf_pos[2];
219 207
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 208 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 209 pr_err("microcode: error: invalid type field in "
222 "container file section header\n"); 210 "container file section header\n");
223 return 0; 211 return 0;
224 } 212 }
225 213
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 214 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 215 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 216 pr_err("microcode: failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 217 return 0;
231 } 218 }
232 219
@@ -259,8 +246,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 246
260 offset = install_equiv_cpu_table(ucode_ptr); 247 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 248 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 249 pr_err("microcode: failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 250 return UCODE_ERROR;
265 } 251 }
266 252
@@ -308,33 +294,27 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
308 294
309static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
310{ 296{
311 const char *fw_name = "amd-ucode/microcode_amd.bin";
312 const struct firmware *firmware;
313 enum ucode_state ret; 297 enum ucode_state ret;
314 298
315 if (request_firmware(&firmware, fw_name, device)) { 299 if (firmware == NULL)
316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
317 return UCODE_NFOUND; 300 return UCODE_NFOUND;
318 }
319 301
320 if (*(u32 *)firmware->data != UCODE_MAGIC) { 302 if (*(u32 *)firmware->data != UCODE_MAGIC) {
321 printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", 303 pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n",
322 *(u32 *)firmware->data); 304 *(u32 *)firmware->data);
323 return UCODE_ERROR; 305 return UCODE_ERROR;
324 } 306 }
325 307
326 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 308 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
327 309
328 release_firmware(firmware);
329
330 return ret; 310 return ret;
331} 311}
332 312
333static enum ucode_state 313static enum ucode_state
334request_microcode_user(int cpu, const void __user *buf, size_t size) 314request_microcode_user(int cpu, const void __user *buf, size_t size)
335{ 315{
336 printk(KERN_INFO "microcode: AMD microcode update via " 316 pr_info("microcode: AMD microcode update via "
337 "/dev/cpu/microcode not supported\n"); 317 "/dev/cpu/microcode not supported\n");
338 return UCODE_ERROR; 318 return UCODE_ERROR;
339} 319}
340 320
@@ -346,7 +326,32 @@ static void microcode_fini_cpu_amd(int cpu)
346 uci->mc = NULL; 326 uci->mc = NULL;
347} 327}
348 328
329void init_microcode_amd(struct device *device)
330{
331 const char *fw_name = "amd-ucode/microcode_amd.bin";
332 struct cpuinfo_x86 *c = &boot_cpu_data;
333
334 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
335
336 if (c->x86 < 0x10) {
337 pr_warning("microcode: AMD CPU family 0x%x not supported\n",
338 c->x86);
339 return;
340 }
341 supported_cpu = 1;
342
343 if (request_firmware(&firmware, fw_name, device))
344 pr_err("microcode: failed to load file %s\n", fw_name);
345}
346
347void fini_microcode_amd(void)
348{
349 release_firmware(firmware);
350}
351
349static struct microcode_ops microcode_amd_ops = { 352static struct microcode_ops microcode_amd_ops = {
353 .init = init_microcode_amd,
354 .fini = fini_microcode_amd,
350 .request_microcode_user = request_microcode_user, 355 .request_microcode_user = request_microcode_user,
351 .request_microcode_fw = request_microcode_fw, 356 .request_microcode_fw = request_microcode_fw,
352 .collect_cpu_info = collect_cpu_info_amd, 357 .collect_cpu_info = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 2bcad3926edb..e68aae397869 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -391,7 +391,7 @@ static enum ucode_state microcode_update_cpu(int cpu)
391 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 391 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
392 enum ucode_state ustate; 392 enum ucode_state ustate;
393 393
394 if (uci->valid) 394 if (uci->valid && uci->mc)
395 ustate = microcode_resume_cpu(cpu); 395 ustate = microcode_resume_cpu(cpu);
396 else 396 else
397 ustate = microcode_init_cpu(cpu); 397 ustate = microcode_init_cpu(cpu);
@@ -518,6 +518,9 @@ static int __init microcode_init(void)
518 return PTR_ERR(microcode_pdev); 518 return PTR_ERR(microcode_pdev);
519 } 519 }
520 520
521 if (microcode_ops->init)
522 microcode_ops->init(&microcode_pdev->dev);
523
521 get_online_cpus(); 524 get_online_cpus();
522 mutex_lock(&microcode_mutex); 525 mutex_lock(&microcode_mutex);
523 526
@@ -561,6 +564,9 @@ static void __exit microcode_exit(void)
561 564
562 platform_device_unregister(microcode_pdev); 565 platform_device_unregister(microcode_pdev);
563 566
567 if (microcode_ops->fini)
568 microcode_ops->fini();
569
564 microcode_ops = NULL; 570 microcode_ops = NULL;
565 571
566 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 572 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
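
Together with the AMD changes above, this adds an optional pair of driver hooks: the core calls ->init() once when the driver is registered and ->fini() once on teardown, skipping either if it is NULL. That is what lets the AMD driver request the firmware image a single time instead of once per CPU. A trimmed kernel-style sketch that puts the two sides next to each other (the struct is reduced to the fields involved and the other callbacks are elided):

    struct microcode_ops {
            void (*init)(struct device *device);  /* optional, called once at driver init */
            void (*fini)(void);                   /* optional, called once at driver exit */
            /* ... mandatory request/collect/apply callbacks ... */
    };

    /* vendor driver (microcode_amd.c): load the firmware image once */
    static struct microcode_ops microcode_amd_ops = {
            .init = init_microcode_amd,
            .fini = fini_microcode_amd,
            /* ... */
    };

    /* core (microcode_core.c): honour the hooks only if they are present */
    static int __init microcode_init(void)
    {
            /* ... platform device creation as before ... */
            if (microcode_ops->init)
                    microcode_ops->init(&microcode_pdev->dev);
            /* ... per-CPU initialisation ... */
            return 0;
    }

    static void __exit microcode_exit(void)
    {
            /* ... per-CPU teardown ... */
            if (microcode_ops->fini)
                    microcode_ops->fini();
            microcode_ops = NULL;
    }
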
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..35a57c963df9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 667 */
668} 668}
669 669
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 671{
672 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 673
685 if (mpf->physptr < end) { 674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 675}
694 676
695static int __init smp_scan_config(unsigned long base, unsigned long length, 677static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 678{
698 unsigned int *bp = phys_to_virt(base); 679 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 680 struct mpf_intel *mpf;
681 unsigned long mem;
700 682
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 683 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 684 bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 699 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 700 mpf, (u64)virt_to_phys(mpf));
719 701
720 if (!reserve) 702 mem = virt_to_phys(mpf);
721 return 1; 703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 704 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 705 smp_reserve_memory(mpf);
726 706
727 return 1; 707 return 1;
728 } 708 }
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 712 return 0;
733} 713}
734 714
735void __init default_find_smp_config(unsigned int reserve) 715void __init default_find_smp_config(void)
736{ 716{
737 unsigned int address; 717 unsigned int address;
738 718
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 724 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 725 * 3) Scan the 64K of bios
746 */ 726 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 727 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 728 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 729 smp_scan_config(0xF0000, 0x10000))
750 return; 730 return;
751 /* 731 /*
752 * If it is an SMP machine we should know now, unless the 732 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 747
768 address = get_bios_ebda(); 748 address = get_bios_ebda();
769 if (address) 749 if (address)
770 smp_scan_config(address, 0x400, reserve); 750 smp_scan_config(address, 0x400);
771} 751}
772 752
773#ifdef CONFIG_X86_IO_APIC 753#ifdef CONFIG_X86_IO_APIC
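
Both hunks replace reserve_bootmem_generic() with reserve_early(), which records the range in the kernel's early reservation table long before the bootmem allocator exists; that is why the CONFIG_X86_32 workaround about memory above max_low_pfn could simply be deleted. A kernel-style sketch of the call shape used here, lifted from the two hunks (the wrapper function and the header locations are assumptions):

    #include <linux/init.h>
    #include <asm/e820.h>          /* reserve_early() */
    #include <asm/mpspec_def.h>    /* struct mpf_intel */

    static void __init reserve_mp_table(unsigned long mpf_phys,
                                        unsigned long mpc_phys,
                                        unsigned long mpc_size)
    {
            /* the MP floating pointer structure found by the scan */
            reserve_early(mpf_phys, mpf_phys + sizeof(struct mpf_intel),
                          "MP-table mpf");

            /* the MP configuration table it points to, if there is one */
            if (mpc_phys)
                    reserve_early(mpc_phys, mpc_phys + mpc_size, "MP-table mpc");
    }
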
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 744508e7cfdd..5e2ba634ea15 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
12#include <trace/events/power.h> 13#include <trace/events/power.h>
13#include <linux/hw_breakpoint.h> 14#include <linux/hw_breakpoint.h>
14#include <asm/system.h> 15#include <asm/system.h>
@@ -209,6 +210,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
209 */ 210 */
210 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 211 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
211 } 212 }
213 propagate_user_return_notify(prev_p, next_p);
212} 214}
213 215
214int sys_fork(struct pt_regs *regs) 216int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a98fe88fab64..c95c8f4e790a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -349,26 +349,42 @@ out:
349 return err; 349 return err;
350} 350}
351 351
352void 352static void
353start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 353start_thread_common(struct pt_regs *regs, unsigned long new_ip,
354 unsigned long new_sp,
355 unsigned int _cs, unsigned int _ss, unsigned int _ds)
354{ 356{
355 loadsegment(fs, 0); 357 loadsegment(fs, 0);
356 loadsegment(es, 0); 358 loadsegment(es, _ds);
357 loadsegment(ds, 0); 359 loadsegment(ds, _ds);
358 load_gs_index(0); 360 load_gs_index(0);
359 regs->ip = new_ip; 361 regs->ip = new_ip;
360 regs->sp = new_sp; 362 regs->sp = new_sp;
361 percpu_write(old_rsp, new_sp); 363 percpu_write(old_rsp, new_sp);
362 regs->cs = __USER_CS; 364 regs->cs = _cs;
363 regs->ss = __USER_DS; 365 regs->ss = _ss;
364 regs->flags = 0x200; 366 regs->flags = X86_EFLAGS_IF;
365 set_fs(USER_DS); 367 set_fs(USER_DS);
366 /* 368 /*
367 * Free the old FP and other extended state 369 * Free the old FP and other extended state
368 */ 370 */
369 free_thread_xstate(current); 371 free_thread_xstate(current);
370} 372}
371EXPORT_SYMBOL_GPL(start_thread); 373
374void
375start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
376{
377 start_thread_common(regs, new_ip, new_sp,
378 __USER_CS, __USER_DS, 0);
379}
380
381#ifdef CONFIG_IA32_EMULATION
382void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
383{
384 start_thread_common(regs, new_ip, new_sp,
385 __USER32_CS, __USER32_DS, __USER32_DS);
386}
387#endif
372 388
373/* 389/*
374 * switch_to(x,y) should switch tasks from x to y. 390 * switch_to(x,y) should switch tasks from x to y.
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..201eab63b05f 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 82e88cdda9bc..946a311a25c9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,6 +106,7 @@
106#include <asm/percpu.h> 106#include <asm/percpu.h>
107#include <asm/topology.h> 107#include <asm/topology.h>
108#include <asm/apicdef.h> 108#include <asm/apicdef.h>
109#include <asm/k8.h>
109#ifdef CONFIG_X86_64 110#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 111#include <asm/numa_64.h>
111#endif 112#endif
@@ -487,42 +488,11 @@ static void __init reserve_early_setup_data(void)
487 488
488#ifdef CONFIG_KEXEC 489#ifdef CONFIG_KEXEC
489 490
490/**
491 * Reserve @size bytes of crashkernel memory at any suitable offset.
492 *
493 * @size: Size of the crashkernel memory to reserve.
494 * Returns the base address on success, and -1ULL on failure.
495 */
496static
497unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
498{
499 const unsigned long long alignment = 16<<20; /* 16M */
500 unsigned long long start = 0LL;
501
502 while (1) {
503 int ret;
504
505 start = find_e820_area(start, ULONG_MAX, size, alignment);
506 if (start == -1ULL)
507 return start;
508
509 /* try to reserve it */
510 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
511 if (ret >= 0)
512 return start;
513
514 start += alignment;
515 }
516}
517
518static inline unsigned long long get_total_mem(void) 491static inline unsigned long long get_total_mem(void)
519{ 492{
520 unsigned long long total; 493 unsigned long long total;
521 494
522 total = max_low_pfn - min_low_pfn; 495 total = max_pfn - min_low_pfn;
523#ifdef CONFIG_HIGHMEM
524 total += highend_pfn - highstart_pfn;
525#endif
526 496
527 return total << PAGE_SHIFT; 497 return total << PAGE_SHIFT;
528} 498}
@@ -542,21 +512,25 @@ static void __init reserve_crashkernel(void)
542 512
543 /* 0 means: find the address automatically */ 513 /* 0 means: find the address automatically */
544 if (crash_base <= 0) { 514 if (crash_base <= 0) {
545 crash_base = find_and_reserve_crashkernel(crash_size); 515 const unsigned long long alignment = 16<<20; /* 16M */
516
517 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
518 alignment);
546 if (crash_base == -1ULL) { 519 if (crash_base == -1ULL) {
547 pr_info("crashkernel reservation failed. " 520 pr_info("crashkernel reservation failed - No suitable area found.\n");
548 "No suitable area found.\n");
549 return; 521 return;
550 } 522 }
551 } else { 523 } else {
552 ret = reserve_bootmem_generic(crash_base, crash_size, 524 unsigned long long start;
553 BOOTMEM_EXCLUSIVE); 525
554 if (ret < 0) { 526 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
555 pr_info("crashkernel reservation failed - " 527 1<<20);
556 "memory is in use\n"); 528 if (start != crash_base) {
529 pr_info("crashkernel reservation failed - memory is in use.\n");
557 return; 530 return;
558 } 531 }
559 } 532 }
533 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
560 534
561 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 535 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
562 "for crashkernel (System RAM: %ldMB)\n", 536 "for crashkernel (System RAM: %ldMB)\n",
@@ -699,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
699 673
700void __init setup_arch(char **cmdline_p) 674void __init setup_arch(char **cmdline_p)
701{ 675{
676 int acpi = 0;
677 int k8 = 0;
678
702#ifdef CONFIG_X86_32 679#ifdef CONFIG_X86_32
703 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 680 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
704 visws_early_detect(); 681 visws_early_detect();
@@ -791,21 +768,18 @@ void __init setup_arch(char **cmdline_p)
791 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 768 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
792 *cmdline_p = command_line; 769 *cmdline_p = command_line;
793 770
794#ifdef CONFIG_X86_64
795 /* 771 /*
796 * Must call this twice: Once just to detect whether hardware doesn't 772 * x86_configure_nx() is called before parse_early_param() to detect
797 * support NX (so that the early EHCI debug console setup can safely 773 * whether hardware doesn't support NX (so that the early EHCI debug
798 * call set_fixmap(), and then again after parsing early parameters to 774 * console setup can safely call set_fixmap()). It may then be called
799 * honor the respective command line option. 775 * again from within noexec_setup() during parsing early parameters
776 * to honor the respective command line option.
800 */ 777 */
801 check_efer(); 778 x86_configure_nx();
802#endif
803 779
804 parse_early_param(); 780 parse_early_param();
805 781
806#ifdef CONFIG_X86_64 782 x86_report_nx();
807 check_efer();
808#endif
809 783
810 /* Must be before kernel pagetables are setup */ 784 /* Must be before kernel pagetables are setup */
811 vmi_activate(); 785 vmi_activate();
@@ -901,6 +875,13 @@ void __init setup_arch(char **cmdline_p)
901 875
902 reserve_brk(); 876 reserve_brk();
903 877
878#ifdef CONFIG_ACPI_SLEEP
879 /*
880 * Reserve low memory region for sleep support.
881 * even before init_memory_mapping
882 */
883 acpi_reserve_wakeup_memory();
884#endif
904 init_gbpages(); 885 init_gbpages();
905 886
906 /* max_pfn_mapped is updated here */ 887 /* max_pfn_mapped is updated here */
@@ -927,6 +908,8 @@ void __init setup_arch(char **cmdline_p)
927 908
928 reserve_initrd(); 909 reserve_initrd();
929 910
911 reserve_crashkernel();
912
930 vsmp_init(); 913 vsmp_init();
931 914
932 io_delay_init(); 915 io_delay_init();
@@ -938,27 +921,24 @@ void __init setup_arch(char **cmdline_p)
938 921
939 early_acpi_boot_init(); 922 early_acpi_boot_init();
940 923
924 /*
925 * Find and reserve possible boot-time SMP configuration:
926 */
927 find_smp_config();
928
941#ifdef CONFIG_ACPI_NUMA 929#ifdef CONFIG_ACPI_NUMA
942 /* 930 /*
943 * Parse SRAT to discover nodes. 931 * Parse SRAT to discover nodes.
944 */ 932 */
945 acpi_numa_init(); 933 acpi = acpi_numa_init();
946#endif 934#endif
947 935
948 initmem_init(0, max_pfn); 936#ifdef CONFIG_K8_NUMA
949 937 if (!acpi)
950#ifdef CONFIG_ACPI_SLEEP 938 k8 = !k8_numa_init(0, max_pfn);
951 /*
952 * Reserve low memory region for sleep support.
953 */
954 acpi_reserve_bootmem();
955#endif 939#endif
956 /*
957 * Find and reserve possible boot-time SMP configuration:
958 */
959 find_smp_config();
960 940
961 reserve_crashkernel(); 941 initmem_init(0, max_pfn, acpi, k8);
962 942
963#ifdef CONFIG_X86_64 943#ifdef CONFIG_X86_64
964 /* 944 /*
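
With find_and_reserve_crashkernel() gone, the crashkernel reservation is now two explicit steps: find_e820_area() only locates a free range (16 MB aligned when the base is chosen automatically, 1 MB aligned when checking a user-supplied base), and a single reserve_early() call afterwards claims it. The call also moves earlier in setup_arch(), before initmem_init(), which is natural once the reservation no longer depends on the bootmem allocator being up. A kernel-style sketch of the automatic case, using the signatures visible in the hunk (the wrapper name is illustrative):

    #include <linux/kernel.h>      /* ULONG_MAX */
    #include <linux/init.h>
    #include <asm/e820.h>          /* find_e820_area(), reserve_early() */

    static void __init reserve_crash_region(unsigned long long crash_size)
    {
            const unsigned long long alignment = 16 << 20;  /* 16M, as in the hunk */
            unsigned long long crash_base;

            /* step 1: find a free, suitably aligned region in the e820 map */
            crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, alignment);
            if (crash_base == -1ULL)
                    return;         /* no suitable area found */

            /* step 2: record the reservation so nothing else allocates it */
            reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
    }
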
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index fbf3b07c8567..74fe6d86dc5d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -863,6 +864,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
863 if (current->replacement_session_keyring) 864 if (current->replacement_session_keyring)
864 key_replace_session_keyring(); 865 key_replace_session_keyring();
865 } 866 }
867 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
868 fire_user_return_notifiers();
866 869
867#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
868 clear_thread_flag(TIF_IRET); 871 clear_thread_flag(TIF_IRET);
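
This is the consumer side of the user-return notifier machinery added in this merge: __switch_to_xtra() (in the process.c hunk above) propagates TIF_USER_RETURN_NOTIFY across context switches, and do_notify_resume() fires the registered callbacks just before the task re-enters user space; KVM selects USER_RETURN_NOTIFIER in its Kconfig below to defer MSR restores this way. A kernel-style sketch of how a client registers a notifier; the callback and its body are illustrative, and the struct and function names are assumed from <linux/user-return-notifier.h>:

    #include <linux/user-return-notifier.h>

    static void my_on_user_return(struct user_return_notifier *urn)
    {
            /*
             * Runs on this CPU right before returning to user space, e.g. to
             * restore user-visible MSRs the kernel has been borrowing.
             */
    }

    static struct user_return_notifier my_urn = {
            .on_user_return = my_on_user_return,
    };

    static void my_arm_user_return_work(void)
    {
            /* sets TIF_USER_RETURN_NOTIFY so do_notify_resume() will call us */
            user_return_notifier_register(&my_urn);
    }
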
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..70c2125d55b9 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 1740c85e24bb..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade)
817 */ 817 */
818 apicid = blade_to_first_apicid(blade); 818 apicid = blade_to_first_apicid(blade);
819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
820 if ((pa & 0xff) != UV_BAU_MESSAGE) { 820 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 821 ((apicid << 32) | UV_BAU_MESSAGE));
823 }
824 return 0; 822 return 0;
825} 823}
826 824
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..3c84aa001c11 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 315 break;
311 case CLOCK_EVT_MODE_UNUSED: 316 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 317 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 318 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 319 break;
315 } 320 }
316} 321}
317 322
318static void uv_rtc_interrupt(void) 323static void uv_rtc_interrupt(void)
319{ 324{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 325 int cpu = smp_processor_id();
326 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 327
323 if (!ced || !ced->event_handler) 328 if (!ced || !ced->event_handler)
324 return; 329 return;
325 330
326 if (uv_rtc_unset_timer(cpu) != 1) 331 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 332 return;
328 333
329 ced->event_handler(ced); 334 ced->event_handler(ced);
330} 335}
331 336
332static int __init uv_enable_rtc(char *str) 337static int __init uv_enable_evt_rtc(char *str)
333{ 338{
334 uv_rtc_enable = 1; 339 uv_rtc_evt_enable = 1;
335 340
336 return 1; 341 return 1;
337} 342}
338__setup("uvrtc", uv_enable_rtc); 343__setup("uvrtcevt", uv_enable_evt_rtc);
339 344
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 345static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 346{
@@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 355{
351 int rc; 356 int rc;
352 357
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 358 if (!is_uv_system())
354 return -ENODEV; 359 return -ENODEV;
355 360
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 361 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 362 clocksource_uv.shift);
360 363
364 /* If single blade, prefer tsc */
365 if (uv_num_possible_blades() == 1)
366 clocksource_uv.rating = 250;
367
361 rc = clocksource_register(&clocksource_uv); 368 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 369 if (rc)
363 generic_interrupt_extension = NULL; 370 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
371 else
372 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
373 sn_rtc_cycles_per_second/(unsigned long)1E6);
374
375 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 376 return rc;
365 }
366 377
367 /* Setup and register clockevents */ 378 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 379 rc = uv_rtc_allocate_timers();
369 if (rc) { 380 if (rc)
370 clocksource_unregister(&clocksource_uv); 381 goto error;
371 generic_interrupt_extension = NULL; 382
372 return rc; 383 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 384
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 385 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 386 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void)
383 393
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 394 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 395 if (rc) {
386 clocksource_unregister(&clocksource_uv); 396 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 397 uv_rtc_deallocate_timers();
398 goto error;
389 } 399 }
390 400
401 printk(KERN_INFO "UV RTC clockevents registered\n");
402
403 return 0;
404
405error:
406 clocksource_unregister(&clocksource_uv);
407 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
408
391 return rc; 409 return rc;
392} 410}
393arch_initcall(uv_rtc_setup_clock); 411arch_initcall(uv_rtc_setup_clock);
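
Besides the switch to the new x86_platform_ipi_callback hook, two behavioural fixes are folded in here. uv_rtc_unset_timer() gains a force argument so shutdown can clear a timer unconditionally while the interrupt path only clears timers that have really expired. And uv_rtc_set_timer() now returns -ETIME instead of 1 when the requested expiry has already passed, which is the value a clockevents ->set_next_event() handler is conventionally expected to return for a missed deadline so the core can retry with a later one. A minimal sketch of that decision, mirroring the uv_setup_intr() hunk above (the helper name and the exact clockevents contract wording are assumptions):

    #include <linux/errno.h>
    #include <linux/types.h>

    /*
     * After programming the comparator: success if the RTC has not yet run
     * past the expiry, success if it has but the interrupt already fired,
     * otherwise the event was missed and the caller must pick a later one.
     */
    static int report_oneshot_result(u64 rtc_now, u64 expires, bool intr_pending)
    {
            if (rtc_now <= expires)
                    return 0;

            return intr_pending ? 0 : -ETIME;
    }
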
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index abda6f53e71e..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
198} 198}
199 199
200static void __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(void)
201{ 201{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..74c92bb194df 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 226 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 227 evt->cpumask = cpumask_of(cpu);
228 228
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 230 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 231 clockevents_register_device(evt);
232} 232}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3c68fe2d46cf..f3f2104408d9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..9055e5872ff0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
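
update_vsyscall() now takes the multiplier as an explicit argument instead of reading clock->mult, which lets the timekeeping core hand down an NTP-adjusted value without touching the clocksource itself. That value feeds the usual clocksource conversion, roughly ns = (cycles * mult) >> shift; a minimal sketch of the arithmetic with made-up mult/shift values:

#include <stdio.h>
#include <stdint.h>

/* Convert a cycle delta to nanoseconds the way the gtod/vsyscall path does. */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* Hypothetical clocksource scaled so one cycle is roughly one nanosecond. */
        uint32_t mult = 1u << 24, shift = 24;

        printf("%llu cycles -> %llu ns\n", 1000000ULL,
               (unsigned long long)cyc2ns(1000000, mult, shift));
        return 0;
}
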
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d11c5ff7c65e..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,6 +13,7 @@
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h>
16#include <asm/tsc.h> 17#include <asm/tsc.h>
17#include <asm/iommu.h> 18#include <asm/iommu.h>
18 19
@@ -80,4 +81,5 @@ struct x86_platform_ops x86_platform = {
80 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
81 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
82 .iommu_shutdown = iommu_shutdown_noop, 83 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range,
83}; 85};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..4cd498332466 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
31 ---help--- 32 ---help---
32 Support hosting fully virtualized guest machines using hardware 33 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 34 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..7e8faea4651e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,6 +75,8 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define No64 (1<<28)
78/* Source 2 operand type */ 80/* Source 2 operand type */
79#define Src2None (0<<29) 81#define Src2None (0<<29)
80#define Src2CL (1<<29) 82#define Src2CL (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 94 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 99 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 104 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 109 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 114 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 139 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 140 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 141 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 142 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
143 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 144 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 145 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 146 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 165 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 166 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 167 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 168 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 169 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 170 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 171 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 192 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 193 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 194 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 195 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 196 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 197 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 198 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 205 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 206 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 207 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 208 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 209 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 211 /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 251 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 253 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 254 ImplicitOps | Stack, ImplicitOps | Stack,
255 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 256 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 257 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 258 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 259 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 261 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 262 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 263 ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 622{
614 int rc = 0; 623 int rc = 0;
615 624
625 /* x86 instructions are limited to 15 bytes. */
626 if (eip + size - ctxt->decode.eip_orig > 15)
627 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 628 eip += ctxt->cs_base;
617 while (size--) { 629 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 630 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
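
The added check caps instruction fetch at the architectural 15-byte limit, measured from the RIP recorded at decode start (eip_orig), so a garbage instruction stream cannot keep the fetch loop running indefinitely. A small stand-alone sketch of the boundary arithmetic, assuming eip_orig is that saved starting RIP:

#include <stdio.h>

#define X86_MAX_INSN_LEN 15

/* Nonzero if fetching 'size' more bytes at 'eip' would exceed the 15-byte limit. */
static int fetch_would_overflow(unsigned long eip, int size, unsigned long eip_orig)
{
        return eip + size - eip_orig > X86_MAX_INSN_LEN;
}

int main(void)
{
        unsigned long eip_orig = 0x1000;

        printf("%d\n", fetch_would_overflow(0x100e, 1, eip_orig));	/* 15th byte: allowed */
        printf("%d\n", fetch_would_overflow(0x100f, 1, eip_orig));	/* 16th byte: rejected */
        return 0;
}
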
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 883 /* Shadow copy of register state. Committed on successful emulation. */
872 884
873 memset(c, 0, sizeof(struct decode_cache)); 885 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 886 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 887 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 888 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 889
@@ -962,6 +974,11 @@ done_prefixes:
962 } 974 }
963 } 975 }
964 976
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 978		kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
979 return -1;
980 }
981
965 if (c->d & Group) { 982 if (c->d & Group) {
966 group = c->d & GroupMask; 983 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 984 c->modrm = insn_fetch(u8, 1, c->eip);
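
The new No64 flag marks opcodes that are undefined in 64-bit mode (the one-byte segment pushes/pops, PUSHA/POPA and similar); the decoder now rejects them up front rather than mis-emulating them. A reduced sketch of how such a per-opcode flag table is consulted (the flag values here are illustrative, not the emulator's real encoding):

#include <stdio.h>
#include <stdint.h>

#define NO64  (1u << 28)	/* opcode raises #UD in 64-bit mode */
#define STACK (1u << 13)	/* opcode touches the stack (value is illustrative) */

enum mode { MODE_PROT32, MODE_PROT64 };

/* Tiny stand-in for the per-opcode flag table: 0x06 is "push es". */
static const uint32_t opcode_flags[256] = {
        [0x06] = STACK | NO64,
};

static int decode(uint8_t opcode, enum mode mode)
{
        uint32_t d = opcode_flags[opcode];

        if (mode == MODE_PROT64 && (d & NO64))
                return -1;	/* reject before dispatch, as the emulator now does */
        return 0;
}

int main(void)
{
        printf("push es, 32-bit mode: %d\n", decode(0x06, MODE_PROT32));
        printf("push es, 64-bit mode: %d\n", decode(0x06, MODE_PROT64));
        return 0;
}
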
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1186 return rc; 1203 return rc;
1187} 1204}
1188 1205
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{
1208 struct decode_cache *c = &ctxt->decode;
1209 struct kvm_segment segment;
1210
1211 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1212
1213 c->src.val = segment.selector;
1214 emulate_push(ctxt);
1215}
1216
1217static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1218 struct x86_emulate_ops *ops, int seg)
1219{
1220 struct decode_cache *c = &ctxt->decode;
1221 unsigned long selector;
1222 int rc;
1223
1224 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1225 if (rc != 0)
1226 return rc;
1227
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
1229 return rc;
1230}
1231
1232static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1233{
1234 struct decode_cache *c = &ctxt->decode;
1235 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1236 int reg = VCPU_REGS_RAX;
1237
1238 while (reg <= VCPU_REGS_RDI) {
1239 (reg == VCPU_REGS_RSP) ?
1240 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1241
1242 emulate_push(ctxt);
1243 ++reg;
1244 }
1245}
1246
1247static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops)
1249{
1250 struct decode_cache *c = &ctxt->decode;
1251 int rc = 0;
1252 int reg = VCPU_REGS_RDI;
1253
1254 while (reg >= VCPU_REGS_RAX) {
1255 if (reg == VCPU_REGS_RSP) {
1256 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1257 c->op_bytes);
1258 --reg;
1259 }
1260
1261 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1262 if (rc != 0)
1263 break;
1264 --reg;
1265 }
1266 return rc;
1267}
1268
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1269static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1270 struct x86_emulate_ops *ops)
1191{ 1271{
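
The helpers added here fill in the missing stack opcodes: emulate_push_sreg()/emulate_pop_sreg() push or pop a segment selector, while emulate_pusha()/emulate_popa() walk all eight general-purpose registers, with PUSHA pushing the RSP value captured before the first push and POPA discarding the saved RSP slot on the way back. A self-contained sketch of that ordering on a toy stack (not the emulator's decode_cache):

#include <stdio.h>

enum { RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, NR_REGS };

static unsigned long regs[NR_REGS] = { 1, 2, 3, 4, 0x8000, 6, 7, 8 };
static unsigned long stack[NR_REGS];
static int sp;

static void pusha(void)
{
        unsigned long old_rsp = regs[RSP];	/* PUSHA pushes the pre-push RSP value */
        int reg;

        for (reg = RAX; reg <= RDI; reg++)
                stack[sp++] = (reg == RSP) ? old_rsp : regs[reg];
}

static void popa(void)
{
        int reg;

        for (reg = RDI; reg >= RAX; reg--) {
                unsigned long val = stack[--sp];

                if (reg != RSP)			/* POPA skips the saved RSP slot */
                        regs[reg] = val;
        }
}

int main(void)
{
        pusha();
        regs[RDI] = 0;				/* clobber something, then restore */
        popa();
        printf("rdi restored to %lu, rsp left at %#lx\n", regs[RDI], regs[RSP]);
        return 0;
}
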
@@ -1707,18 +1787,45 @@ special_insn:
1707 add: /* add */ 1787 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1788 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1789 break;
1790 case 0x06: /* push es */
1791 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1792 break;
1793 case 0x07: /* pop es */
1794 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1795 if (rc != 0)
1796 goto done;
1797 break;
1710 case 0x08 ... 0x0d: 1798 case 0x08 ... 0x0d:
1711 or: /* or */ 1799 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1800 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1801 break;
1802 case 0x0e: /* push cs */
1803 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1804 break;
1714 case 0x10 ... 0x15: 1805 case 0x10 ... 0x15:
1715 adc: /* adc */ 1806 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1807 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1808 break;
1809 case 0x16: /* push ss */
1810 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1811 break;
1812 case 0x17: /* pop ss */
1813 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1814 if (rc != 0)
1815 goto done;
1816 break;
1718 case 0x18 ... 0x1d: 1817 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1818 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1819 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1820 break;
1821 case 0x1e: /* push ds */
1822 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1823 break;
1824 case 0x1f: /* pop ds */
1825 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1826 if (rc != 0)
1827 goto done;
1828 break;
1722 case 0x20 ... 0x25: 1829 case 0x20 ... 0x25:
1723 and: /* and */ 1830 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1831 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
1750 if (rc != 0) 1857 if (rc != 0)
1751 goto done; 1858 goto done;
1752 break; 1859 break;
1860 case 0x60: /* pusha */
1861 emulate_pusha(ctxt);
1862 break;
1863 case 0x61: /* popa */
1864 rc = emulate_popa(ctxt, ops);
1865 if (rc != 0)
1866 goto done;
1867 break;
1753 case 0x63: /* movsxd */ 1868 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 1869 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 1870 goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
1761 break; 1876 break;
1762 case 0x6c: /* insb */ 1877 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 1878 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1879 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 1880 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 1881 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 1882 c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
1777 return 0; 1892 return 0;
1778 case 0x6e: /* outsb */ 1893 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 1894 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1895 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 1896 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 1897 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 1898 c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2185 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2186 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2187 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2189 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2190 port) != 0) {
2076 c->eip = saved_eip; 2191 c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2412 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2413 c->dst.type = OP_NONE;
2299 break; 2414 break;
2415 case 0xa0: /* push fs */
2416 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2417 break;
2418 case 0xa1: /* pop fs */
2419 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2420 if (rc != 0)
2421 goto done;
2422 break;
2300 case 0xa3: 2423 case 0xa3:
2301 bt: /* bt */ 2424 bt: /* bt */
2302 c->dst.type = OP_NONE; 2425 c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2431 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2432 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2433 break;
2434 case 0xa8: /* push gs */
2435 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2436 break;
2437 case 0xa9: /* pop gs */
2438 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2439 if (rc != 0)
2440 goto done;
2441 break;
2311 case 0xab: 2442 case 0xab:
2312 bts: /* bts */ 2443 bts: /* bts */
2313 /* only subword offset */ 2444 /* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 144e7f60b5e2..fab7440c9bb2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -688,10 +688,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 688 struct kvm_vcpu *vcpu;
689 int i; 689 int i;
690 690
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 691 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 693
696 /* 694 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 695 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..d057c0cbd245 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 39 if (s != &s->pics_state->pics[0])
40 irq += 8; 40 irq += 8;
41 /*
 42	 * We are dropping the lock while calling ack notifiers, since ack
 43	 * notifier callbacks for assigned devices call into the PIC recursively.
 44	 * Other interrupts may be delivered to the PIC while the lock is dropped, but
 45	 * it should be safe since the PIC state is already updated at this stage.
46 */
47 spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock);
42} 50}
43 51
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
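
The comment explains the locking change: pic_clear_isr() now drops pics_state->lock around kvm_notify_acked_irq(), because ack notifiers for assigned devices re-enter the PIC and would deadlock on the non-recursive spinlock; the PIC state is already consistent before the lock is released. A generic sketch of that unlock/call/relock pattern using a pthread mutex (illustrative only, not the kernel primitives):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int isr;

/* The notifier may take state_lock itself, so it must be called unlocked. */
static void ack_notifier(int irq)
{
        pthread_mutex_lock(&state_lock);
        printf("notifier re-entered state for irq %d\n", irq);
        pthread_mutex_unlock(&state_lock);
}

static void clear_isr(int irq)		/* called with state_lock held */
{
        isr &= ~(1 << irq);		/* state is consistent before we let go */

        pthread_mutex_unlock(&state_lock);
        ack_notifier(irq);		/* would self-deadlock if called under the lock */
        pthread_mutex_lock(&state_lock);
}

int main(void)
{
        pthread_mutex_lock(&state_lock);
        isr = 1 << 3;
        clear_isr(3);
        pthread_mutex_unlock(&state_lock);
        return 0;
}
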
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 184static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 185{
178 s->isr |= 1 << irq; 186 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 187 /*
185 * We don't clear a level sensitive interrupt here 188 * We don't clear a level sensitive interrupt here
186 */ 189 */
187 if (!(s->elcr & (1 << irq))) 190 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 191 s->irr &= ~(1 << irq);
192
193 if (s->auto_eoi) {
194 if (s->rotate_on_auto_eoi)
195 s->priority_add = (irq + 1) & 7;
196 pic_clear_isr(s, irq);
197 }
198
189} 199}
190 200
191int kvm_pic_read_irq(struct kvm *kvm) 201int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
225 235
226void kvm_pic_reset(struct kvm_kpic_state *s) 236void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 237{
228 int irq, irqbase, n; 238 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 239 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 240 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
241 u8 irr = s->irr, isr = s->imr;
231 242
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 243 s->last_irr = 0;
245 s->irr = 0; 244 s->irr = 0;
246 s->imr = 0; 245 s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 255 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 256 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 257 s->init4 = 0;
258
259 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
260 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
261 if (irr & (1 << irq) || isr & (1 << irq)) {
262 pic_clear_isr(s, irq);
263 }
264 }
259} 265}
260 266
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 267static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 304 priority = get_priority(s, s->isr);
299 if (priority != 8) { 305 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 306 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 307 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 308 s->priority_add = (irq + 1) & 7;
309 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 310 pic_update_irq(s->pics_state);
305 } 311 }
306 break; 312 break;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..be399e207d57 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 86
86static inline int irqchip_in_kernel(struct kvm *kvm) 87static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 88{
88 return pic_irqchip(kvm) != NULL; 89 int ret;
90
91 ret = (pic_irqchip(kvm) != NULL);
92 smp_rmb();
93 return ret;
89} 94}
90 95
91void kvm_pic_reset(struct kvm_kpic_state *s); 96void kvm_pic_reset(struct kvm_kpic_state *s);
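
irqchip_in_kernel() gains a read barrier after loading the PIC pointer, so a vCPU that sees the irqchip as present also sees the PIC state that was initialised before the pointer was published (the creation path is expected to pair this with a write barrier). A tiny sketch of that publish/observe pairing, using a GCC builtin as a stand-in for the kernel barriers:

#include <stdio.h>

struct pic { int initialized; };

static struct pic pic_storage;
static struct pic *published_pic;

/* Writer: initialise the object, then publish the pointer, with a barrier between. */
static void create_pic(void)
{
        pic_storage.initialized = 1;
        __sync_synchronize();		/* stands in for the kernel's smp_wmb() */
        published_pic = &pic_storage;
}

/* Reader: test the pointer, then a read barrier before trusting what it guards. */
static int irqchip_in_kernel(void)
{
        int ret = (published_pic != NULL);

        __sync_synchronize();		/* stands in for the kernel's smp_rmb() */
        return ret;
}

int main(void)
{
        create_pic();
        printf("irqchip_in_kernel() = %d\n", irqchip_in_kernel());
        return 0;
}
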
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 23c217692ea9..cd60c0bd1b32 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
37#include "irq.h" 36#include "irq.h"
38#include "trace.h" 37#include "trace.h"
@@ -471,11 +470,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 470 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 471 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 472 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 473 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 475}
480 476
481static void apic_send_ipi(struct kvm_lapic *apic) 477static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +500,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 500 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 501 irq.vector);
506 502
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 503 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 504}
511 505
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 506static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 818b92ad82cf..4c3e5b2314cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2789,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2789 if (r) 2789 if (r)
2790 goto out; 2790 goto out;
2791 2791
2792 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2792 er = emulate_instruction(vcpu, cr2, error_code, 0);
2793 2793
2794 switch (er) { 2794 switch (er) {
2795 case EMULATE_DONE: 2795 case EMULATE_DONE:
@@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2800 case EMULATE_FAIL: 2800 case EMULATE_FAIL:
2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2803 vcpu->run->internal.ndata = 0;
2803 return 0; 2804 return 0;
2804 default: 2805 default:
2805 BUG(); 2806 BUG();
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..a6017132fba8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -467,7 +467,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..3de0b37ec038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
49#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 50
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 51#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 52#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
53 54
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 55#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 56
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 57static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 59 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 77 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 78 u64 vmcb_msrpm;
87 79
80 /* A VMEXIT is required but not yet emulated */
81 bool exit_required;
82
88 /* cache for intercepts of the guest */ 83 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 84 u16 intercept_cr_read;
90 u16 intercept_cr_write; 85 u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
112 u32 *msrpm; 107 u32 *msrpm;
113 108
114 struct nested_state nested; 109 struct nested_state nested;
110
111 bool nmi_singlestep;
115}; 112};
116 113
117/* enable NPT for AMD64 and X86 with PAE */ 114/* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 283 struct vcpu_svm *svm = to_svm(vcpu);
287 284
288 if (!svm->next_rip) { 285 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 286 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 287 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 288 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 289 return;
@@ -316,7 +313,7 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 313 cpu_svm_disable();
317} 314}
318 315
319static void svm_hardware_enable(void *garbage) 316static int svm_hardware_enable(void *garbage)
320{ 317{
321 318
322 struct svm_cpu_data *svm_data; 319 struct svm_cpu_data *svm_data;
@@ -325,16 +322,21 @@ static void svm_hardware_enable(void *garbage)
325 struct desc_struct *gdt; 322 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 323 int me = raw_smp_processor_id();
327 324
325 rdmsrl(MSR_EFER, efer);
326 if (efer & EFER_SVME)
327 return -EBUSY;
328
328 if (!has_svm()) { 329 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 330 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 331 me);
332 return -EINVAL;
331 } 333 }
332 svm_data = per_cpu(svm_data, me); 334 svm_data = per_cpu(svm_data, me);
333 335
334 if (!svm_data) { 336 if (!svm_data) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 337 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 338 me);
337 return; 339 return -EINVAL;
338 } 340 }
339 341
340 svm_data->asid_generation = 1; 342 svm_data->asid_generation = 1;
@@ -345,11 +347,12 @@ static void svm_hardware_enable(void *garbage)
345 gdt = (struct desc_struct *)gdt_descr.base; 347 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 348 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 349
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 350 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 351
351 wrmsrl(MSR_VM_HSAVE_PA, 352 wrmsrl(MSR_VM_HSAVE_PA,
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 353 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
354
355 return 0;
353} 356}
354 357
355static void svm_cpu_uninit(int cpu) 358static void svm_cpu_uninit(int cpu)
@@ -476,7 +479,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 479 kvm_enable_efer_bits(EFER_SVME);
477 } 480 }
478 481
479 for_each_online_cpu(cpu) { 482 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 483 r = svm_cpu_init(cpu);
481 if (r) 484 if (r)
482 goto err; 485 goto err;
@@ -510,7 +513,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 513{
511 int cpu; 514 int cpu;
512 515
513 for_each_online_cpu(cpu) 516 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 517 svm_cpu_uninit(cpu);
515 518
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 519 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -625,11 +628,12 @@ static void init_vmcb(struct vcpu_svm *svm)
625 save->rip = 0x0000fff0; 628 save->rip = 0x0000fff0;
626 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 629 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
627 630
628 /* 631 /* This is the guest-visible cr0 value.
629 * cr0 val on cpu init should be 0x60000010, we enable cpu 632 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
630 * cache by default. the orderly way is to enable cache in bios.
631 */ 633 */
632 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 634 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
635 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
636
633 save->cr4 = X86_CR4_PAE; 637 save->cr4 = X86_CR4_PAE;
634 /* rdx = ?? */ 638 /* rdx = ?? */
635 639
@@ -644,8 +648,6 @@ static void init_vmcb(struct vcpu_svm *svm)
644 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| 648 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
645 INTERCEPT_CR3_MASK); 649 INTERCEPT_CR3_MASK);
646 save->g_pat = 0x0007040600070406ULL; 650 save->g_pat = 0x0007040600070406ULL;
647 /* enable caching because the QEMU Bios doesn't enable it */
648 save->cr0 = X86_CR0_ET;
649 save->cr3 = 0; 651 save->cr3 = 0;
650 save->cr4 = 0; 652 save->cr4 = 0;
651 } 653 }
@@ -654,6 +656,11 @@ static void init_vmcb(struct vcpu_svm *svm)
654 svm->nested.vmcb = 0; 656 svm->nested.vmcb = 0;
655 svm->vcpu.arch.hflags = 0; 657 svm->vcpu.arch.hflags = 0;
656 658
659 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
660 control->pause_filter_count = 3000;
661 control->intercept |= (1ULL << INTERCEPT_PAUSE);
662 }
663
657 enable_gif(svm); 664 enable_gif(svm);
658} 665}
659 666
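
When the processor advertises the SVM Pause Filter feature (bit 10 of the SVM feature flags), init_vmcb() programs a filter count of 3000 and intercepts PAUSE, so a guest spinning on a contended lock exits to the host, where pause_interception() (added at the end of this diff) calls kvm_vcpu_on_spin(). A short sketch of the feature test, with a hypothetical feature word:

#include <stdio.h>
#include <stdint.h>

#define SVM_FEATURE_NPT          (1 << 0)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)

/* Hypothetical copy of the CPU's SVM feature flags (CPUID 0x8000000A, EDX). */
static uint32_t svm_features = SVM_FEATURE_NPT | SVM_FEATURE_PAUSE_FILTER;

static int svm_has(uint32_t feat)
{
        return (svm_features & feat) != 0;
}

int main(void)
{
        if (svm_has(SVM_FEATURE_PAUSE_FILTER))
                printf("program pause_filter_count = 3000 and intercept PAUSE\n");
        return 0;
}
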
@@ -758,14 +765,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 int i; 765 int i;
759 766
760 if (unlikely(cpu != vcpu->cpu)) { 767 if (unlikely(cpu != vcpu->cpu)) {
761 u64 tsc_this, delta; 768 u64 delta;
762 769
763 /* 770 /*
764 * Make sure that the guest sees a monotonically 771 * Make sure that the guest sees a monotonically
765 * increasing TSC. 772 * increasing TSC.
766 */ 773 */
767 rdtscll(tsc_this); 774 delta = vcpu->arch.host_tsc - native_read_tsc();
768 delta = vcpu->arch.host_tsc - tsc_this;
769 svm->vmcb->control.tsc_offset += delta; 775 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm)) 776 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta; 777 svm->nested.hsave->control.tsc_offset += delta;
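
When a vCPU is loaded on a different physical CPU, the guest-visible TSC must not jump backwards, so the delta between the TSC read at the last vcpu_put and the TSC on the new CPU is folded into the VMCB tsc_offset (and into the nested hsave offset when running nested). A worked example of the offset arithmetic with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical raw TSC readings on the old and new physical CPUs. */
        uint64_t tsc_at_put  = 1000000;		/* read in svm_vcpu_put() on the old CPU */
        uint64_t tsc_at_load =  400000;		/* read in svm_vcpu_load() on the new CPU */
        int64_t  tsc_offset  =  -50000;		/* current VMCB offset */

        /* guest_tsc = host_tsc + tsc_offset; keep it from going backwards. */
        int64_t delta = (int64_t)(tsc_at_put - tsc_at_load);

        tsc_offset += delta;
        printf("new offset %lld, guest TSC resumes at %lld\n",
               (long long)tsc_offset, (long long)(tsc_at_load + tsc_offset));
        return 0;
}
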
@@ -787,7 +793,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
787 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 793 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
788 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 794 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
789 795
790 rdtscll(vcpu->arch.host_tsc); 796 vcpu->arch.host_tsc = native_read_tsc();
791} 797}
792 798
793static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 799static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1045,7 +1051,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1045 svm->vmcb->control.intercept_exceptions &= 1051 svm->vmcb->control.intercept_exceptions &=
1046 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1052 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1047 1053
1048 if (vcpu->arch.singlestep) 1054 if (svm->nmi_singlestep)
1049 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1055 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1050 1056
1051 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1057 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1066,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1060 vcpu->guest_debug = 0; 1066 vcpu->guest_debug = 0;
1061} 1067}
1062 1068
1063static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1069static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1064{ 1070{
1065 int old_debug = vcpu->guest_debug;
1066 struct vcpu_svm *svm = to_svm(vcpu); 1071 struct vcpu_svm *svm = to_svm(vcpu);
1067 1072
1068 vcpu->guest_debug = dbg->control;
1069
1070 update_db_intercept(vcpu);
1071
1072 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1073 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1073 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1074 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1074 else 1075 else
1075 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1076 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1076 1077
1077 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1078 update_db_intercept(vcpu);
1078 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1079 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1080 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1081
1082 return 0;
1083} 1079}
1084 1080
1085static void load_host_msrs(struct kvm_vcpu *vcpu) 1081static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1180,7 +1176,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1180 } 1176 }
1181} 1177}
1182 1178
1183static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1179static int pf_interception(struct vcpu_svm *svm)
1184{ 1180{
1185 u64 fault_address; 1181 u64 fault_address;
1186 u32 error_code; 1182 u32 error_code;
@@ -1194,17 +1190,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1194 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1190 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1195} 1191}
1196 1192
1197static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1193static int db_interception(struct vcpu_svm *svm)
1198{ 1194{
1195 struct kvm_run *kvm_run = svm->vcpu.run;
1196
1199 if (!(svm->vcpu.guest_debug & 1197 if (!(svm->vcpu.guest_debug &
1200 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1198 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1201 !svm->vcpu.arch.singlestep) { 1199 !svm->nmi_singlestep) {
1202 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1200 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1203 return 1; 1201 return 1;
1204 } 1202 }
1205 1203
1206 if (svm->vcpu.arch.singlestep) { 1204 if (svm->nmi_singlestep) {
1207 svm->vcpu.arch.singlestep = false; 1205 svm->nmi_singlestep = false;
1208 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1206 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1209 svm->vmcb->save.rflags &= 1207 svm->vmcb->save.rflags &=
1210 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1208 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,25 +1221,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1223 return 1; 1221 return 1;
1224} 1222}
1225 1223
1226static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1224static int bp_interception(struct vcpu_svm *svm)
1227{ 1225{
1226 struct kvm_run *kvm_run = svm->vcpu.run;
1227
1228 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1228 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1230 kvm_run->debug.arch.exception = BP_VECTOR; 1230 kvm_run->debug.arch.exception = BP_VECTOR;
1231 return 0; 1231 return 0;
1232} 1232}
1233 1233
1234static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1234static int ud_interception(struct vcpu_svm *svm)
1235{ 1235{
1236 int er; 1236 int er;
1237 1237
1238 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1238 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1239 if (er != EMULATE_DONE) 1239 if (er != EMULATE_DONE)
1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1241 return 1; 1241 return 1;
1242} 1242}
1243 1243
1244static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1244static int nm_interception(struct vcpu_svm *svm)
1245{ 1245{
1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) 1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1251,7 +1251,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1251 return 1; 1251 return 1;
1252} 1252}
1253 1253
1254static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1254static int mc_interception(struct vcpu_svm *svm)
1255{ 1255{
1256 /* 1256 /*
1257 * On an #MC intercept the MCE handler is not called automatically in 1257 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1264,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1264 return 1; 1264 return 1;
1265} 1265}
1266 1266
1267static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1267static int shutdown_interception(struct vcpu_svm *svm)
1268{ 1268{
1269 struct kvm_run *kvm_run = svm->vcpu.run;
1270
1269 /* 1271 /*
1270 * VMCB is undefined after a SHUTDOWN intercept 1272 * VMCB is undefined after a SHUTDOWN intercept
1271 * so reinitialize it. 1273 * so reinitialize it.
@@ -1277,7 +1279,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1277 return 0; 1279 return 0;
1278} 1280}
1279 1281
1280static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1282static int io_interception(struct vcpu_svm *svm)
1281{ 1283{
1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1284 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1283 int size, in, string; 1285 int size, in, string;
@@ -1291,7 +1293,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1291 1293
1292 if (string) { 1294 if (string) {
1293 if (emulate_instruction(&svm->vcpu, 1295 if (emulate_instruction(&svm->vcpu,
1294 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1296 0, 0, 0) == EMULATE_DO_MMIO)
1295 return 0; 1297 return 0;
1296 return 1; 1298 return 1;
1297 } 1299 }
@@ -1301,33 +1303,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1303 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1302 1304
1303 skip_emulated_instruction(&svm->vcpu); 1305 skip_emulated_instruction(&svm->vcpu);
1304 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1306 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1305} 1307}
1306 1308
1307static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1309static int nmi_interception(struct vcpu_svm *svm)
1308{ 1310{
1309 return 1; 1311 return 1;
1310} 1312}
1311 1313
1312static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1314static int intr_interception(struct vcpu_svm *svm)
1313{ 1315{
1314 ++svm->vcpu.stat.irq_exits; 1316 ++svm->vcpu.stat.irq_exits;
1315 return 1; 1317 return 1;
1316} 1318}
1317 1319
1318static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1320static int nop_on_interception(struct vcpu_svm *svm)
1319{ 1321{
1320 return 1; 1322 return 1;
1321} 1323}
1322 1324
1323static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1325static int halt_interception(struct vcpu_svm *svm)
1324{ 1326{
1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1327 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1326 skip_emulated_instruction(&svm->vcpu); 1328 skip_emulated_instruction(&svm->vcpu);
1327 return kvm_emulate_halt(&svm->vcpu); 1329 return kvm_emulate_halt(&svm->vcpu);
1328} 1330}
1329 1331
1330static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1332static int vmmcall_interception(struct vcpu_svm *svm)
1331{ 1333{
1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1334 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1333 skip_emulated_instruction(&svm->vcpu); 1335 skip_emulated_instruction(&svm->vcpu);
@@ -1378,8 +1380,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1378 1380
1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1381 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1380 1382
1381 if (nested_svm_exit_handled(svm)) { 1383 if (svm->nested.intercept & 1ULL) {
1382 nsvm_printk("VMexit -> INTR\n"); 1384 /*
1385 * The #vmexit can't be emulated here directly because this
1386 * code path runs with irqs and preemtion disabled. A
1387 * #vmexit emulation might sleep. Only signal request for
1388 * the #vmexit here.
1389 */
1390 svm->nested.exit_required = true;
1391 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1383 return 1; 1392 return 1;
1384 } 1393 }
1385 1394
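
Because nested_svm_intr() now runs with interrupts and preemption disabled, it can no longer emulate the #VMEXIT inline (that emulation may sleep, for instance while mapping the nested VMCB); it only records nested.exit_required and the exit is emulated later from a sleepable context. A minimal sketch of that defer-then-handle pattern (function names are illustrative, not the KVM entry points):

#include <stdio.h>
#include <stdbool.h>

struct nested_state {
        bool exit_required;	/* a #VMEXIT still has to be emulated */
};

/* Atomic context (irqs/preemption off): only record that work is pending. */
static void intr_inject_atomic(struct nested_state *n)
{
        n->exit_required = true;
}

/* Sleepable context: do the deferred work before entering the guest again. */
static void prepare_guest_entry(struct nested_state *n)
{
        if (n->exit_required) {
                printf("emulating the deferred #VMEXIT (may sleep here)\n");
                n->exit_required = false;
        }
}

int main(void)
{
        struct nested_state n = { .exit_required = false };

        intr_inject_atomic(&n);
        prepare_guest_entry(&n);
        return 0;
}
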
@@ -1390,10 +1399,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1390{ 1399{
1391 struct page *page; 1400 struct page *page;
1392 1401
1393 down_read(&current->mm->mmap_sem);
1394 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1402 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1395 up_read(&current->mm->mmap_sem);
1396
1397 if (is_error_page(page)) 1403 if (is_error_page(page))
1398 goto error; 1404 goto error;
1399 1405
@@ -1532,14 +1538,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1532 } 1538 }
1533 default: { 1539 default: {
1534 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1540 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1535 nsvm_printk("exit code: 0x%x\n", exit_code);
1536 if (svm->nested.intercept & exit_bits) 1541 if (svm->nested.intercept & exit_bits)
1537 vmexit = NESTED_EXIT_DONE; 1542 vmexit = NESTED_EXIT_DONE;
1538 } 1543 }
1539 } 1544 }
1540 1545
1541 if (vmexit == NESTED_EXIT_DONE) { 1546 if (vmexit == NESTED_EXIT_DONE) {
1542 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1543 nested_svm_vmexit(svm); 1547 nested_svm_vmexit(svm);
1544 } 1548 }
1545 1549
@@ -1584,6 +1588,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1584 struct vmcb *hsave = svm->nested.hsave; 1588 struct vmcb *hsave = svm->nested.hsave;
1585 struct vmcb *vmcb = svm->vmcb; 1589 struct vmcb *vmcb = svm->vmcb;
1586 1590
1591 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1592 vmcb->control.exit_info_1,
1593 vmcb->control.exit_info_2,
1594 vmcb->control.exit_int_info,
1595 vmcb->control.exit_int_info_err);
1596
1587 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1597 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1588 if (!nested_vmcb) 1598 if (!nested_vmcb)
1589 return 1; 1599 return 1;
@@ -1617,6 +1627,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1627 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1618 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1628 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1619 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1629 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1630
1631 /*
1632 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1633 * to make sure that we do not lose injected events. So check event_inj
1634 * here and copy it to exit_int_info if it is valid.
1635	 * exit_int_info and event_inj can't both be valid because the case
1636 * below only happens on a VMRUN instruction intercept which has
1637 * no valid exit_int_info set.
1638 */
1639 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1640 struct vmcb_control_area *nc = &nested_vmcb->control;
1641
1642 nc->exit_int_info = vmcb->control.event_inj;
1643 nc->exit_int_info_err = vmcb->control.event_inj_err;
1644 }
1645
1620 nested_vmcb->control.tlb_ctl = 0; 1646 nested_vmcb->control.tlb_ctl = 0;
1621 nested_vmcb->control.event_inj = 0; 1647 nested_vmcb->control.event_inj = 0;
1622 nested_vmcb->control.event_inj_err = 0; 1648 nested_vmcb->control.event_inj_err = 0;
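
The new block covers an immediate #VMEXIT taken on the VMRUN intercept itself while an event was still queued in event_inj: since a VMRUN intercept carries no valid exit_int_info, the pending event is copied into the nested VMCB's exit_int_info so the L1 hypervisor can re-inject it. A reduced sketch of the copy using a cut-down control area (not the real vmcb_control_area layout):

#include <stdio.h>
#include <stdint.h>

#define EVTINJ_VALID (1u << 31)		/* same bit position as SVM_EVTINJ_VALID */

struct ctrl {
        uint32_t event_inj, event_inj_err;
        uint32_t exit_int_info, exit_int_info_err;
};

static void sync_pending_event(const struct ctrl *vmcb, struct ctrl *nested)
{
        /* On a VMRUN intercept exit_int_info is never valid, so no clash here. */
        if (vmcb->event_inj & EVTINJ_VALID) {
                nested->exit_int_info     = vmcb->event_inj;
                nested->exit_int_info_err = vmcb->event_inj_err;
        }
}

int main(void)
{
        struct ctrl vmcb   = { .event_inj = EVTINJ_VALID | 0x20 };	/* pending vector 0x20 */
        struct ctrl nested = { 0 };

        sync_pending_event(&vmcb, &nested);
        printf("exit_int_info = %#x\n", nested.exit_int_info);
        return 0;
}
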
@@ -1628,10 +1654,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1628 /* Restore the original control entries */ 1654 /* Restore the original control entries */
1629 copy_vmcb_control_area(vmcb, hsave); 1655 copy_vmcb_control_area(vmcb, hsave);
1630 1656
1631 /* Kill any pending exceptions */
1632 if (svm->vcpu.arch.exception.pending == true)
1633 nsvm_printk("WARNING: Pending Exception\n");
1634
1635 kvm_clear_exception_queue(&svm->vcpu); 1657 kvm_clear_exception_queue(&svm->vcpu);
1636 kvm_clear_interrupt_queue(&svm->vcpu); 1658 kvm_clear_interrupt_queue(&svm->vcpu);
1637 1659
@@ -1702,6 +1724,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1702 /* nested_vmcb is our indicator if nested SVM is activated */ 1724 /* nested_vmcb is our indicator if nested SVM is activated */
1703 svm->nested.vmcb = svm->vmcb->save.rax; 1725 svm->nested.vmcb = svm->vmcb->save.rax;
1704 1726
1727 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1728 nested_vmcb->save.rip,
1729 nested_vmcb->control.int_ctl,
1730 nested_vmcb->control.event_inj,
1731 nested_vmcb->control.nested_ctl);
1732
1705 /* Clear internal status */ 1733 /* Clear internal status */
1706 kvm_clear_exception_queue(&svm->vcpu); 1734 kvm_clear_exception_queue(&svm->vcpu);
1707 kvm_clear_interrupt_queue(&svm->vcpu); 1735 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1789,28 +1817,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1789 svm->nested.intercept = nested_vmcb->control.intercept; 1817 svm->nested.intercept = nested_vmcb->control.intercept;
1790 1818
1791 force_new_asid(&svm->vcpu); 1819 force_new_asid(&svm->vcpu);
1792 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1793 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1794 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1820 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1795 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1796 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1797 nested_vmcb->control.int_ctl);
1798 }
1799 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1821 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1800 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1822 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1801 else 1823 else
1802 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1824 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1803 1825
1804 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1805 nested_vmcb->control.exit_int_info,
1806 nested_vmcb->control.int_state);
1807
1808 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1826 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1809 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1827 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1810 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1828 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1811 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1812 nsvm_printk("Injecting Event: 0x%x\n",
1813 nested_vmcb->control.event_inj);
1814 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1829 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1815 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1830 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1816 1831
@@ -1837,7 +1852,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1837 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1852 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1838} 1853}
1839 1854
1840static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1855static int vmload_interception(struct vcpu_svm *svm)
1841{ 1856{
1842 struct vmcb *nested_vmcb; 1857 struct vmcb *nested_vmcb;
1843 1858
@@ -1857,7 +1872,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1857 return 1; 1872 return 1;
1858} 1873}
1859 1874
1860static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1875static int vmsave_interception(struct vcpu_svm *svm)
1861{ 1876{
1862 struct vmcb *nested_vmcb; 1877 struct vmcb *nested_vmcb;
1863 1878
@@ -1877,10 +1892,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1877 return 1; 1892 return 1;
1878} 1893}
1879 1894
1880static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1895static int vmrun_interception(struct vcpu_svm *svm)
1881{ 1896{
1882 nsvm_printk("VMrun\n");
1883
1884 if (nested_svm_check_permissions(svm)) 1897 if (nested_svm_check_permissions(svm))
1885 return 1; 1898 return 1;
1886 1899
@@ -1907,7 +1920,7 @@ failed:
1907 return 1; 1920 return 1;
1908} 1921}
1909 1922
1910static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1923static int stgi_interception(struct vcpu_svm *svm)
1911{ 1924{
1912 if (nested_svm_check_permissions(svm)) 1925 if (nested_svm_check_permissions(svm))
1913 return 1; 1926 return 1;
@@ -1920,7 +1933,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1920 return 1; 1933 return 1;
1921} 1934}
1922 1935
1923static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int clgi_interception(struct vcpu_svm *svm)
1924{ 1937{
1925 if (nested_svm_check_permissions(svm)) 1938 if (nested_svm_check_permissions(svm))
1926 return 1; 1939 return 1;
@@ -1937,10 +1950,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1937 return 1; 1950 return 1;
1938} 1951}
1939 1952
1940static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1953static int invlpga_interception(struct vcpu_svm *svm)
1941{ 1954{
1942 struct kvm_vcpu *vcpu = &svm->vcpu; 1955 struct kvm_vcpu *vcpu = &svm->vcpu;
1943 nsvm_printk("INVLPGA\n"); 1956
1957 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1958 vcpu->arch.regs[VCPU_REGS_RAX]);
1944 1959
1945 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1960 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1946 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1961 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1965,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1950 return 1; 1965 return 1;
1951} 1966}
1952 1967
1953static int invalid_op_interception(struct vcpu_svm *svm, 1968static int skinit_interception(struct vcpu_svm *svm)
1954 struct kvm_run *kvm_run)
1955{ 1969{
1970 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1971
1956 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1972 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1957 return 1; 1973 return 1;
1958} 1974}
1959 1975
1960static int task_switch_interception(struct vcpu_svm *svm, 1976static int invalid_op_interception(struct vcpu_svm *svm)
1961 struct kvm_run *kvm_run) 1977{
1978 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1979 return 1;
1980}
1981
1982static int task_switch_interception(struct vcpu_svm *svm)
1962{ 1983{
1963 u16 tss_selector; 1984 u16 tss_selector;
1964 int reason; 1985 int reason;
@@ -2008,14 +2029,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
2008 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2029 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2009} 2030}
2010 2031
2011static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2032static int cpuid_interception(struct vcpu_svm *svm)
2012{ 2033{
2013 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2034 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2014 kvm_emulate_cpuid(&svm->vcpu); 2035 kvm_emulate_cpuid(&svm->vcpu);
2015 return 1; 2036 return 1;
2016} 2037}
2017 2038
2018static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2039static int iret_interception(struct vcpu_svm *svm)
2019{ 2040{
2020 ++svm->vcpu.stat.nmi_window_exits; 2041 ++svm->vcpu.stat.nmi_window_exits;
2021 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2042 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2023,26 +2044,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2023 return 1; 2044 return 1;
2024} 2045}
2025 2046
2026static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2047static int invlpg_interception(struct vcpu_svm *svm)
2027{ 2048{
2028 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2049 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2029 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2050 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2030 return 1; 2051 return 1;
2031} 2052}
2032 2053
2033static int emulate_on_interception(struct vcpu_svm *svm, 2054static int emulate_on_interception(struct vcpu_svm *svm)
2034 struct kvm_run *kvm_run)
2035{ 2055{
2036 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2056 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2037 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2057 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2038 return 1; 2058 return 1;
2039} 2059}
2040 2060
2041static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2061static int cr8_write_interception(struct vcpu_svm *svm)
2042{ 2062{
2063 struct kvm_run *kvm_run = svm->vcpu.run;
2064
2043 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2065 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2044 /* instruction emulation calls kvm_set_cr8() */ 2066 /* instruction emulation calls kvm_set_cr8() */
2045 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2067 emulate_instruction(&svm->vcpu, 0, 0, 0);
2046 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2068 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2047 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2069 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2048 return 1; 2070 return 1;
@@ -2128,7 +2150,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2128 return 0; 2150 return 0;
2129} 2151}
2130 2152
2131static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2153static int rdmsr_interception(struct vcpu_svm *svm)
2132{ 2154{
2133 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2155 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2134 u64 data; 2156 u64 data;
@@ -2221,7 +2243,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2221 return 0; 2243 return 0;
2222} 2244}
2223 2245
2224static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2246static int wrmsr_interception(struct vcpu_svm *svm)
2225{ 2247{
2226 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2248 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2227 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2249 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2237,17 +2259,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2237 return 1; 2259 return 1;
2238} 2260}
2239 2261
2240static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2262static int msr_interception(struct vcpu_svm *svm)
2241{ 2263{
2242 if (svm->vmcb->control.exit_info_1) 2264 if (svm->vmcb->control.exit_info_1)
2243 return wrmsr_interception(svm, kvm_run); 2265 return wrmsr_interception(svm);
2244 else 2266 else
2245 return rdmsr_interception(svm, kvm_run); 2267 return rdmsr_interception(svm);
2246} 2268}
2247 2269
2248static int interrupt_window_interception(struct vcpu_svm *svm, 2270static int interrupt_window_interception(struct vcpu_svm *svm)
2249 struct kvm_run *kvm_run)
2250{ 2271{
2272 struct kvm_run *kvm_run = svm->vcpu.run;
2273
2251 svm_clear_vintr(svm); 2274 svm_clear_vintr(svm);
2252 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2275 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2253 /* 2276 /*
@@ -2265,8 +2288,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2265 return 1; 2288 return 1;
2266} 2289}
2267 2290
2268static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2291static int pause_interception(struct vcpu_svm *svm)
2269 struct kvm_run *kvm_run) = { 2292{
2293 kvm_vcpu_on_spin(&(svm->vcpu));
2294 return 1;
2295}
2296
2297static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2270 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2298 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2271 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2299 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2272 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2300 [SVM_EXIT_READ_CR4] = emulate_on_interception,
@@ -2301,6 +2329,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2301 [SVM_EXIT_CPUID] = cpuid_interception, 2329 [SVM_EXIT_CPUID] = cpuid_interception,
2302 [SVM_EXIT_IRET] = iret_interception, 2330 [SVM_EXIT_IRET] = iret_interception,
2303 [SVM_EXIT_INVD] = emulate_on_interception, 2331 [SVM_EXIT_INVD] = emulate_on_interception,
2332 [SVM_EXIT_PAUSE] = pause_interception,
2304 [SVM_EXIT_HLT] = halt_interception, 2333 [SVM_EXIT_HLT] = halt_interception,
2305 [SVM_EXIT_INVLPG] = invlpg_interception, 2334 [SVM_EXIT_INVLPG] = invlpg_interception,
2306 [SVM_EXIT_INVLPGA] = invlpga_interception, 2335 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2314,26 +2343,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2314 [SVM_EXIT_VMSAVE] = vmsave_interception, 2343 [SVM_EXIT_VMSAVE] = vmsave_interception,
2315 [SVM_EXIT_STGI] = stgi_interception, 2344 [SVM_EXIT_STGI] = stgi_interception,
2316 [SVM_EXIT_CLGI] = clgi_interception, 2345 [SVM_EXIT_CLGI] = clgi_interception,
2317 [SVM_EXIT_SKINIT] = invalid_op_interception, 2346 [SVM_EXIT_SKINIT] = skinit_interception,
2318 [SVM_EXIT_WBINVD] = emulate_on_interception, 2347 [SVM_EXIT_WBINVD] = emulate_on_interception,
2319 [SVM_EXIT_MONITOR] = invalid_op_interception, 2348 [SVM_EXIT_MONITOR] = invalid_op_interception,
2320 [SVM_EXIT_MWAIT] = invalid_op_interception, 2349 [SVM_EXIT_MWAIT] = invalid_op_interception,
2321 [SVM_EXIT_NPF] = pf_interception, 2350 [SVM_EXIT_NPF] = pf_interception,
2322}; 2351};
2323 2352
2324static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2353static int handle_exit(struct kvm_vcpu *vcpu)
2325{ 2354{
2326 struct vcpu_svm *svm = to_svm(vcpu); 2355 struct vcpu_svm *svm = to_svm(vcpu);
2356 struct kvm_run *kvm_run = vcpu->run;
2327 u32 exit_code = svm->vmcb->control.exit_code; 2357 u32 exit_code = svm->vmcb->control.exit_code;
2328 2358
2329 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2359 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2330 2360
2361 if (unlikely(svm->nested.exit_required)) {
2362 nested_svm_vmexit(svm);
2363 svm->nested.exit_required = false;
2364
2365 return 1;
2366 }
2367
2331 if (is_nested(svm)) { 2368 if (is_nested(svm)) {
2332 int vmexit; 2369 int vmexit;
2333 2370
2334 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2371 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2335 exit_code, svm->vmcb->control.exit_info_1, 2372 svm->vmcb->control.exit_info_1,
2336 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2373 svm->vmcb->control.exit_info_2,
2374 svm->vmcb->control.exit_int_info,
2375 svm->vmcb->control.exit_int_info_err);
2337 2376
2338 vmexit = nested_svm_exit_special(svm); 2377 vmexit = nested_svm_exit_special(svm);
2339 2378
@@ -2383,7 +2422,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2383 return 0; 2422 return 0;
2384 } 2423 }
2385 2424
2386 return svm_exit_handlers[exit_code](svm, kvm_run); 2425 return svm_exit_handlers[exit_code](svm);
2387} 2426}
2388 2427
2389static void reload_tss(struct kvm_vcpu *vcpu) 2428static void reload_tss(struct kvm_vcpu *vcpu)
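
The hunks above drop the struct kvm_run argument from every SVM exit handler: handlers that still need the run structure now reach it through svm->vcpu.run, and handle_exit() dispatches through the single-argument svm_exit_handlers[] table. A rough stand-alone sketch of that dispatch pattern, using hypothetical stand-in types rather than the real KVM structures, could look like this:

#include <stdio.h>

/* Hypothetical stand-ins for struct kvm_run / kvm_vcpu / vcpu_svm. */
struct run_state  { int exit_reason; };
struct vcpu       { struct run_state *run; };
struct vcpu_state { struct vcpu vcpu; unsigned int exit_code; };

/* Handlers now take only the vcpu-private state... */
static int handle_hlt(struct vcpu_state *s)
{
        /* ...and reach the shared run structure through the vcpu. */
        s->vcpu.run->exit_reason = 5;   /* e.g. KVM_EXIT_HLT */
        return 0;                       /* 0: go back to userspace */
}

static int handle_nop(struct vcpu_state *s)
{
        (void)s;
        return 1;                       /* 1: resume the guest */
}

/* Exit-code-indexed table, mirroring svm_exit_handlers[]. */
static int (*handlers[])(struct vcpu_state *) = {
        [0] = handle_nop,
        [1] = handle_hlt,
};

int main(void)
{
        struct run_state run = { 0 };
        struct vcpu_state s = { .vcpu = { .run = &run }, .exit_code = 1 };
        int resume = handlers[s.exit_code](&s);

        printf("resume=%d exit_reason=%d\n", resume, run.exit_reason);
        return 0;
}
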
@@ -2460,20 +2499,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2460 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2499 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2461} 2500}
2462 2501
2502static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2503{
2504 struct vcpu_svm *svm = to_svm(vcpu);
2505
2506 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2507}
2508
2509static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2510{
2511 struct vcpu_svm *svm = to_svm(vcpu);
2512
2513 if (masked) {
2514 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2515 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2516 } else {
2517 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2518 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
2519 }
2520}
2521
2463static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2522static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2464{ 2523{
2465 struct vcpu_svm *svm = to_svm(vcpu); 2524 struct vcpu_svm *svm = to_svm(vcpu);
2466 struct vmcb *vmcb = svm->vmcb; 2525 struct vmcb *vmcb = svm->vmcb;
2467 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2526 int ret;
2468 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2527
2469 gif_set(svm) && 2528 if (!gif_set(svm) ||
2470 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2529 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2530 return 0;
2531
2532 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2533
2534 if (is_nested(svm))
2535 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2536
2537 return ret;
2471} 2538}
2472 2539
2473static void enable_irq_window(struct kvm_vcpu *vcpu) 2540static void enable_irq_window(struct kvm_vcpu *vcpu)
2474{ 2541{
2475 struct vcpu_svm *svm = to_svm(vcpu); 2542 struct vcpu_svm *svm = to_svm(vcpu);
2476 nsvm_printk("Trying to open IRQ window\n");
2477 2543
2478 nested_svm_intr(svm); 2544 nested_svm_intr(svm);
2479 2545
@@ -2498,7 +2564,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2498 /* Something prevents NMI from being injected. Single step over 2564 /* Something prevents NMI from being injected. Single step over
2499 possible problem (IRET or exception injection or interrupt 2565 possible problem (IRET or exception injection or interrupt
2500 shadow) */ 2566 shadow) */
2501 vcpu->arch.singlestep = true; 2567 svm->nmi_singlestep = true;
2502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2568 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2503 update_db_intercept(vcpu); 2569 update_db_intercept(vcpu);
2504} 2570}
@@ -2588,13 +2654,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2588#define R "e" 2654#define R "e"
2589#endif 2655#endif
2590 2656
2591static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2657static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2592{ 2658{
2593 struct vcpu_svm *svm = to_svm(vcpu); 2659 struct vcpu_svm *svm = to_svm(vcpu);
2594 u16 fs_selector; 2660 u16 fs_selector;
2595 u16 gs_selector; 2661 u16 gs_selector;
2596 u16 ldt_selector; 2662 u16 ldt_selector;
2597 2663
2664 /*
2665 * A vmexit emulation is required before the vcpu can be executed
2666 * again.
2667 */
2668 if (unlikely(svm->nested.exit_required))
2669 return;
2670
2598 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2671 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2599 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2672 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2600 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2673 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2893,6 +2966,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2893 .queue_exception = svm_queue_exception, 2966 .queue_exception = svm_queue_exception,
2894 .interrupt_allowed = svm_interrupt_allowed, 2967 .interrupt_allowed = svm_interrupt_allowed,
2895 .nmi_allowed = svm_nmi_allowed, 2968 .nmi_allowed = svm_nmi_allowed,
2969 .get_nmi_mask = svm_get_nmi_mask,
2970 .set_nmi_mask = svm_set_nmi_mask,
2896 .enable_nmi_window = enable_nmi_window, 2971 .enable_nmi_window = enable_nmi_window,
2897 .enable_irq_window = enable_irq_window, 2972 .enable_irq_window = enable_irq_window,
2898 .update_cr8_intercept = update_cr8_intercept, 2973 .update_cr8_intercept = update_cr8_intercept,
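
svm_get_nmi_mask() and svm_set_nmi_mask() above are wired into the two new kvm_x86_ops callbacks, so generic x86 code can query and restore NMI blocking without touching HF_NMI_MASK directly. The sketch below only illustrates that ops-table idea with hypothetical types; it is not the real kvm_x86_ops:

#include <stdbool.h>
#include <stdio.h>

#define HF_NMI_MASK (1U << 3)           /* hypothetical flag value */

struct vcpu { unsigned int hflags; };

/* Backend callbacks, analogous to .get_nmi_mask / .set_nmi_mask. */
struct nmi_ops {
        bool (*get_nmi_mask)(struct vcpu *v);
        void (*set_nmi_mask)(struct vcpu *v, bool masked);
};

static bool svm_like_get(struct vcpu *v)
{
        return !!(v->hflags & HF_NMI_MASK);
}

static void svm_like_set(struct vcpu *v, bool masked)
{
        if (masked)
                v->hflags |= HF_NMI_MASK;
        else
                v->hflags &= ~HF_NMI_MASK;
}

static const struct nmi_ops ops = { svm_like_get, svm_like_set };

int main(void)
{
        struct vcpu v = { 0 };
        bool saved;

        /* Generic code can now save and later restore NMI masking. */
        ops.set_nmi_mask(&v, true);
        saved = ops.get_nmi_mask(&v);
        ops.set_nmi_mask(&v, false);
        printf("saved=%d now=%d\n", saved, ops.get_nmi_mask(&v));
        return 0;
}
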
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..816e0449db0b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 349 __entry->coalesced ? " (coalesced)" : "")
350); 350);
351 351
352/*
353 * Tracepoint for nested VMRUN
354 */
355TRACE_EVENT(kvm_nested_vmrun,
356 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
357 __u32 event_inj, bool npt),
358 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
359
360 TP_STRUCT__entry(
361 __field( __u64, rip )
362 __field( __u64, vmcb )
363 __field( __u64, nested_rip )
364 __field( __u32, int_ctl )
365 __field( __u32, event_inj )
366 __field( bool, npt )
367 ),
368
369 TP_fast_assign(
370 __entry->rip = rip;
371 __entry->vmcb = vmcb;
372 __entry->nested_rip = nested_rip;
373 __entry->int_ctl = int_ctl;
374 __entry->event_inj = event_inj;
375 __entry->npt = npt;
376 ),
377
378 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
379 "event_inj: 0x%08x npt: %s\n",
380 __entry->rip, __entry->vmcb, __entry->nested_rip,
381 __entry->int_ctl, __entry->event_inj,
382 __entry->npt ? "on" : "off")
383);
384
385/*
386 * Tracepoint for #VMEXIT while nested
387 */
388TRACE_EVENT(kvm_nested_vmexit,
389 TP_PROTO(__u64 rip, __u32 exit_code,
390 __u64 exit_info1, __u64 exit_info2,
391 __u32 exit_int_info, __u32 exit_int_info_err),
392 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
393 exit_int_info, exit_int_info_err),
394
395 TP_STRUCT__entry(
396 __field( __u64, rip )
397 __field( __u32, exit_code )
398 __field( __u64, exit_info1 )
399 __field( __u64, exit_info2 )
400 __field( __u32, exit_int_info )
401 __field( __u32, exit_int_info_err )
402 ),
403
404 TP_fast_assign(
405 __entry->rip = rip;
406 __entry->exit_code = exit_code;
407 __entry->exit_info1 = exit_info1;
408 __entry->exit_info2 = exit_info2;
409 __entry->exit_int_info = exit_int_info;
410 __entry->exit_int_info_err = exit_int_info_err;
411 ),
412 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
413 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
414 __entry->rip,
415 ftrace_print_symbols_seq(p, __entry->exit_code,
416 kvm_x86_ops->exit_reasons_str),
417 __entry->exit_info1, __entry->exit_info2,
418 __entry->exit_int_info, __entry->exit_int_info_err)
419);
420
421/*
422 * Tracepoint for #VMEXIT reinjected to the guest
423 */
424TRACE_EVENT(kvm_nested_vmexit_inject,
425 TP_PROTO(__u32 exit_code,
426 __u64 exit_info1, __u64 exit_info2,
427 __u32 exit_int_info, __u32 exit_int_info_err),
428 TP_ARGS(exit_code, exit_info1, exit_info2,
429 exit_int_info, exit_int_info_err),
430
431 TP_STRUCT__entry(
432 __field( __u32, exit_code )
433 __field( __u64, exit_info1 )
434 __field( __u64, exit_info2 )
435 __field( __u32, exit_int_info )
436 __field( __u32, exit_int_info_err )
437 ),
438
439 TP_fast_assign(
440 __entry->exit_code = exit_code;
441 __entry->exit_info1 = exit_info1;
442 __entry->exit_info2 = exit_info2;
443 __entry->exit_int_info = exit_int_info;
444 __entry->exit_int_info_err = exit_int_info_err;
445 ),
446
447 TP_printk("reason: %s ext_inf1: 0x%016llx "
448 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
449 ftrace_print_symbols_seq(p, __entry->exit_code,
450 kvm_x86_ops->exit_reasons_str),
451 __entry->exit_info1, __entry->exit_info2,
452 __entry->exit_int_info, __entry->exit_int_info_err)
453);
454
455/*
456 * Tracepoint for nested #vmexit because of interrupt pending
457 */
458TRACE_EVENT(kvm_nested_intr_vmexit,
459 TP_PROTO(__u64 rip),
460 TP_ARGS(rip),
461
462 TP_STRUCT__entry(
463 __field( __u64, rip )
464 ),
465
466 TP_fast_assign(
467 __entry->rip = rip
468 ),
469
470 TP_printk("rip: 0x%016llx\n", __entry->rip)
471);
472
473/*
474 * Tracepoint for INVLPGA instruction interception
475 */
476TRACE_EVENT(kvm_invlpga,
477 TP_PROTO(__u64 rip, int asid, u64 address),
478 TP_ARGS(rip, asid, address),
479
480 TP_STRUCT__entry(
481 __field( __u64, rip )
482 __field( int, asid )
483 __field( __u64, address )
484 ),
485
486 TP_fast_assign(
487 __entry->rip = rip;
488 __entry->asid = asid;
489 __entry->address = address;
490 ),
491
492 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
493 __entry->rip, __entry->asid, __entry->address)
494);
495
496/*
497 * Tracepoint for SKINIT interception
498 */
499TRACE_EVENT(kvm_skinit,
500 TP_PROTO(__u64 rip, __u32 slb),
501 TP_ARGS(rip, slb),
502
503 TP_STRUCT__entry(
504 __field( __u64, rip )
505 __field( __u32, slb )
506 ),
507
508 TP_fast_assign(
509 __entry->rip = rip;
510 __entry->slb = slb;
511 ),
512
513 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
514 __entry->rip, __entry->slb)
515);
516
352#endif /* _TRACE_KVM_H */ 517#endif /* _TRACE_KVM_H */
353 518
354/* This part must be outside protection */ 519/* This part must be outside protection */
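
Each TRACE_EVENT() above generates a trace_kvm_*() hook that the svm.c hunks now call, and the TP_printk() template controls how the record is rendered in the trace buffer. The plain-C snippet below only mimics the kvm_nested_vmexit layout with made-up values; it is not the actual ftrace machinery:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the kvm_nested_vmexit TP_printk() format for illustration. */
static void print_nested_vmexit(uint64_t rip, const char *reason,
                                uint64_t info1, uint64_t info2,
                                uint32_t int_info, uint32_t int_info_err)
{
        printf("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
               "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
               (unsigned long long)rip, reason,
               (unsigned long long)info1, (unsigned long long)info2,
               int_info, int_info_err);
}

int main(void)
{
        /* Example values only; real records come from the trace buffer. */
        print_nested_vmexit(0xffffffff81000000ULL, "npf",
                            0x4ULL, 0xdeadbeefULL, 0, 0);
        return 0;
}
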
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..d4918d6fc924 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64/*
65 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive
67 * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
68 * According to tests, this time is usually smaller than 41 cycles.
69 * ple_window: upper bound on the amount of time a guest is allowed to execute
70 * in a PAUSE loop. Tests indicate that most spinlocks are held for
71 * less than 2^12 cycles
72 * Time is measured based on a counter that runs at the same rate as the TSC,
73 * refer to SDM volume 3B, sections 21.6.13 & 22.1.3.
74 */
75#define KVM_VMX_DEFAULT_PLE_GAP 41
76#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
77static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
78module_param(ple_gap, int, S_IRUGO);
79
80static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
81module_param(ple_window, int, S_IRUGO);
82
64struct vmcs { 83struct vmcs {
65 u32 revision_id; 84 u32 revision_id;
66 u32 abort; 85 u32 abort;
67 char data[0]; 86 char data[0];
68}; 87};
69 88
89struct shared_msr_entry {
90 unsigned index;
91 u64 data;
92 u64 mask;
93};
94
70struct vcpu_vmx { 95struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 96 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 97 struct list_head local_vcpus_link;
@@ -74,13 +99,12 @@ struct vcpu_vmx {
74 int launched; 99 int launched;
75 u8 fail; 100 u8 fail;
76 u32 idt_vectoring_info; 101 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 102 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 103 int nmsrs;
80 int save_nmsrs; 104 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 106 u64 msr_host_kernel_gs_base;
107 u64 msr_guest_kernel_gs_base;
84#endif 108#endif
85 struct vmcs *vmcs; 109 struct vmcs *vmcs;
86 struct { 110 struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 112 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 113 int gs_ldt_reload_needed;
90 int fs_reload_needed; 114 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 115 } host_state;
93 struct { 116 struct {
94 int vm86_active; 117 int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
107 } rmode; 130 } rmode;
108 int vpid; 131 int vpid;
109 bool emulation_required; 132 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 133
112 /* Support for vnmi-less CPUs */ 134 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 135 int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 198 VMX_SEGMENT_FIELD(LDTR),
177}; 199};
178 200
201static u64 host_efer;
202
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 203static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 204
181/* 205/*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 208 */
185static const u32 vmx_msr_index[] = { 209static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 210#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 212#endif
189 MSR_EFER, MSR_K6_STAR, 213 MSR_EFER, MSR_K6_STAR,
190}; 214};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 216
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 217static inline int is_page_fault(u32 intr_info)
210{ 218{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 219 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 328 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 329}
322 330
331static inline int cpu_has_vmx_ple(void)
332{
333 return vmcs_config.cpu_based_2nd_exec_ctrl &
334 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
335}
336
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 338{
325 return flexpriority_enabled && 339 return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 362 int i;
349 363
350 for (i = 0; i < vmx->nmsrs; ++i) 364 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 365 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 366 return i;
353 return -1; 367 return -1;
354} 368}
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 393 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 394}
381 395
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 396static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 397{
384 int i; 398 int i;
385 399
@@ -570,17 +584,12 @@ static void reload_tss(void)
570 load_TR_desc(); 584 load_TR_desc();
571} 585}
572 586
573static void load_transition_efer(struct vcpu_vmx *vmx) 587static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 588{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 589 u64 guest_efer;
578 u64 ignore_bits; 590 u64 ignore_bits;
579 591
580 if (efer_offset < 0) 592 guest_efer = vmx->vcpu.arch.shadow_efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 593
585 /* 594 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 595 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 602 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 603 ignore_bits &= ~(u64)EFER_SCE;
595#endif 604#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 605 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 606 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 607 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 608 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 609 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 610}
613 611
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 612static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 613{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 614 struct vcpu_vmx *vmx = to_vmx(vcpu);
615 int i;
617 616
618 if (vmx->host_state.loaded) 617 if (vmx->host_state.loaded)
619 return; 618 return;
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 649#endif
651 650
652#ifdef CONFIG_X86_64 651#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 652 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 653 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 654 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 655 }
657#endif 656#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 657 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 658 kvm_set_shared_msr(vmx->guest_msrs[i].index,
659 vmx->guest_msrs[i].data,
660 vmx->guest_msrs[i].mask);
660} 661}
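
With the shared-MSR rework, vmx_save_host_state() hands each guest MSR to kvm_set_shared_msr() together with a per-entry mask instead of the old bulk load_msrs()/save_msrs() calls; the presumed point is to skip the wrmsr entirely when the bits covered by the mask already hold the wanted value, and to restore the host values lazily. A hedged sketch of that "write only if the masked bits changed" idea, with stand-in code rather than the real kvm_set_shared_msr():

#include <stdint.h>
#include <stdio.h>

struct shared_msr {
        uint32_t index;         /* MSR number */
        uint64_t curr;          /* value the hardware currently holds */
};

/* Stand-in for wrmsrl(); here the write is only logged. */
static void wrmsrl_stub(uint32_t index, uint64_t val)
{
        printf("wrmsr %#x <- %#llx\n", index, (unsigned long long)val);
}

/*
 * Assumed core idea: only touch the MSR when a bit covered by 'mask'
 * differs from what is already loaded.
 */
static void set_shared_msr(struct shared_msr *m, uint64_t value, uint64_t mask)
{
        if (((value ^ m->curr) & mask) == 0)
                return;                 /* nothing relevant changed */
        m->curr = value;
        wrmsrl_stub(m->index, value);
}

int main(void)
{
        struct shared_msr efer = { .index = 0xc0000080, .curr = 0x500 };

        set_shared_msr(&efer, 0x500, ~0ULL);    /* no-op: same value */
        set_shared_msr(&efer, 0xd01, ~0ULL);    /* differs: one wrmsr */
        return 0;
}
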
661 662
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 663static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 685 local_irq_restore(flags);
685 } 686 }
686 reload_tss(); 687 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 688#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 689 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 690 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
691 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
692 }
693#endif
690} 694}
691 695
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 696static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
877/* 881/*
878 * Swap MSR entry in host/guest MSR entry array. 882 * Swap MSR entry in host/guest MSR entry array.
879 */ 883 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 884static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 885{
883 struct kvm_msr_entry tmp; 886 struct shared_msr_entry tmp;
884 887
885 tmp = vmx->guest_msrs[to]; 888 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 889 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 890 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 891}
892#endif
893 892
894/* 893/*
895 * Set up the vmcs to automatically save and restore system 894 * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 897 */
899static void setup_msrs(struct vcpu_vmx *vmx) 898static void setup_msrs(struct vcpu_vmx *vmx)
900{ 899{
901 int save_nmsrs; 900 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 901 unsigned long *msr_bitmap;
903 902
904 vmx_load_host_state(vmx); 903 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 904 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 905#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 906 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 907 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 908 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 909 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 913 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 914 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 915 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
920 if (index >= 0)
921 move_msr_up(vmx, index, save_nmsrs++);
922 /* 916 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 917 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 918 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
928 move_msr_up(vmx, index, save_nmsrs++); 922 move_msr_up(vmx, index, save_nmsrs++);
929 } 923 }
930#endif 924#endif
931 vmx->save_nmsrs = save_nmsrs; 925 index = __find_msr_index(vmx, MSR_EFER);
926 if (index >= 0 && update_transition_efer(vmx, index))
927 move_msr_up(vmx, index, save_nmsrs++);
932 928
933#ifdef CONFIG_X86_64 929 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 930
939 if (cpu_has_vmx_msr_bitmap()) { 931 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 932 if (is_long_mode(&vmx->vcpu))
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 968static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 969{
978 u64 data; 970 u64 data;
979 struct kvm_msr_entry *msr; 971 struct shared_msr_entry *msr;
980 972
981 if (!pdata) { 973 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 974 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 983 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 984 data = vmcs_readl(GUEST_GS_BASE);
993 break; 985 break;
986 case MSR_KERNEL_GS_BASE:
987 vmx_load_host_state(to_vmx(vcpu));
988 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
989 break;
990#endif
994 case MSR_EFER: 991 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 992 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 993 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 994 data = guest_read_tsc();
999 break; 995 break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1003 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1004 break;
1009 default: 1005 default:
1006 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1007 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1008 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1009 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1025static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1026{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1027 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1028 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1029 u64 host_tsc;
1033 int ret = 0; 1030 int ret = 0;
1034 1031
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1041 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1042 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1043 break;
1044 case MSR_KERNEL_GS_BASE:
1045 vmx_load_host_state(vmx);
1046 vmx->msr_guest_kernel_gs_base = data;
1047 break;
1047#endif 1048#endif
1048 case MSR_IA32_SYSENTER_CS: 1049 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1050 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1098 }
1098} 1099}
1099 1100
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1101static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1102{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1104 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1105 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1106 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1107
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1108 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1109}
1125 1110
1126static __init int cpu_has_kvm_support(void) 1111static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1124 /* locked but not enabled */
1140} 1125}
1141 1126
1142static void hardware_enable(void *garbage) 1127static int hardware_enable(void *garbage)
1143{ 1128{
1144 int cpu = raw_smp_processor_id(); 1129 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1130 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1131 u64 old;
1147 1132
1133 if (read_cr4() & X86_CR4_VMXE)
1134 return -EBUSY;
1135
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1136 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1137 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1138 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1147 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1148 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1149 : "memory", "cc");
1150
1151 ept_sync_global();
1152
1153 return 0;
1162} 1154}
1163 1155
1164static void vmclear_local_vcpus(void) 1156static void vmclear_local_vcpus(void)
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1242 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1243 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1244 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1245 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1254 if (adjust_vmx_controls(min2, opt2, 1247 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1248 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1249 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
1344{ 1337{
1345 int cpu; 1338 int cpu;
1346 1339
1347 for_each_online_cpu(cpu) 1340 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1341 free_vmcs(per_cpu(vmxarea, cpu));
1342 per_cpu(vmxarea, cpu) = NULL;
1343 }
1349} 1344}
1350 1345
1351static __init int alloc_kvm_area(void) 1346static __init int alloc_kvm_area(void)
1352{ 1347{
1353 int cpu; 1348 int cpu;
1354 1349
1355 for_each_online_cpu(cpu) { 1350 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1351 struct vmcs *vmcs;
1357 1352
1358 vmcs = alloc_vmcs_cpu(cpu); 1353 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1389 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1390 kvm_disable_largepages();
1396 1391
1392 if (!cpu_has_vmx_ple())
1393 ple_gap = 0;
1394
1397 return alloc_kvm_area(); 1395 return alloc_kvm_area();
1398} 1396}
1399 1397
@@ -1536,8 +1534,16 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1534static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1535{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1536 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1537 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1538
1539 if (!msr)
1540 return;
1540 1541
1542 /*
1543 * Force kernel_gs_base reloading before EFER changes, as control
1544 * of this msr depends on is_long_mode().
1545 */
1546 vmx_load_host_state(to_vmx(vcpu));
1541 vcpu->arch.shadow_efer = efer; 1547 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1548 if (!msr)
1543 return; 1549 return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1733 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1734 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1735 vcpu->kvm->arch.ept_identity_map_addr;
1736 ept_load_pdptrs(vcpu);
1730 } 1737 }
1731 1738
1732 vmx_flush_tlb(vcpu); 1739 vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2309 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2310 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2311 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2312 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2313 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2314 enable_unrestricted_guest = 0;
2315 }
2307 if (!enable_unrestricted_guest) 2316 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2317 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2318 if (!ple_gap)
2319 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2320 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2321 }
2311 2322
2323 if (ple_gap) {
2324 vmcs_write32(PLE_GAP, ple_gap);
2325 vmcs_write32(PLE_WINDOW, ple_window);
2326 }
2327
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2328 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2329 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2330 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
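
The two vmcs_write32() calls above program the new Pause-Loop Exiting controls from the ple_gap and ple_window module parameters (and the hardware_setup() hunk clears ple_gap when cpu_has_vmx_ple() fails, which in turn drops SECONDARY_EXEC_PAUSE_LOOP_EXITING here). The snippet below is only a toy model of the heuristic described in the parameter comment, where PAUSEs closer together than ple_gap belong to one spin loop and a loop lasting longer than ple_window should trigger an exit; it is neither the hardware algorithm nor kernel code:

#include <stdint.h>
#include <stdio.h>

#define PLE_GAP    41           /* max cycles between two PAUSEs in one loop */
#define PLE_WINDOW 4096         /* max cycles a single PAUSE loop may spin   */

static int pause_loop_should_exit(const uint64_t *pause_tsc, int n)
{
        uint64_t loop_start = pause_tsc[0];
        int i;

        for (i = 1; i < n; i++) {
                if (pause_tsc[i] - pause_tsc[i - 1] > PLE_GAP)
                        loop_start = pause_tsc[i];      /* a new loop begins */
                if (pause_tsc[i] - loop_start > PLE_WINDOW)
                        return 1;                       /* spinning too long */
        }
        return 0;
}

int main(void)
{
        uint64_t spin[200];
        int i;

        for (i = 0; i < 200; i++)
                spin[i] = 1000 + 30ull * (uint64_t)i;   /* tight PAUSE loop */

        printf("exit=%d\n", pause_loop_should_exit(spin, 200));
        return 0;
}
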
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2392 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2393 continue;
2378 data = data_low | ((u64)data_high << 32); 2394 data = data_low | ((u64)data_high << 32);
2379 vmx->host_msrs[j].index = index; 2395 vmx->guest_msrs[j].index = i;
2380 vmx->host_msrs[j].reserved = 0; 2396 vmx->guest_msrs[j].data = 0;
2381 vmx->host_msrs[j].data = data; 2397 vmx->guest_msrs[j].mask = -1ull;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2398 ++vmx->nmsrs;
2384 } 2399 }
2385 2400
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2525 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2527
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2530 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2531 vmx_set_efer(&vmx->vcpu, 0);
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2627 GUEST_INTR_STATE_NMI)); 2642 GUEST_INTR_STATE_NMI));
2628} 2643}
2629 2644
2645static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2646{
2647 if (!cpu_has_virtual_nmis())
2648 return to_vmx(vcpu)->soft_vnmi_blocked;
2649 else
2650 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2651 GUEST_INTR_STATE_NMI);
2652}
2653
2654static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2655{
2656 struct vcpu_vmx *vmx = to_vmx(vcpu);
2657
2658 if (!cpu_has_virtual_nmis()) {
2659 if (vmx->soft_vnmi_blocked != masked) {
2660 vmx->soft_vnmi_blocked = masked;
2661 vmx->vnmi_blocked_time = 0;
2662 }
2663 } else {
2664 if (masked)
2665 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2666 GUEST_INTR_STATE_NMI);
2667 else
2668 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2669 GUEST_INTR_STATE_NMI);
2670 }
2671}
2672
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2673static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2631{ 2674{
2632 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 2675 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2702 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2703 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2704 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2705 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2706 return 1;
2664 /* 2707 /*
2665 * Forward all other exceptions that are valid in real mode. 2708 * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
2710#endif 2753#endif
2711} 2754}
2712 2755
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2756static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2757{
2715 /* already handled by vcpu_run */ 2758 /* already handled by vcpu_run */
2716 return 1; 2759 return 1;
2717} 2760}
2718 2761
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2762static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2763{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2764 struct vcpu_vmx *vmx = to_vmx(vcpu);
2765 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2766 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2767 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2768 u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2772 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2773
2730 if (is_machine_check(intr_info)) 2774 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2775 return handle_machine_check(vcpu);
2732 2776
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2777 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2778 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2779 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2780 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2781 vcpu->run->internal.ndata = 2;
2782 vcpu->run->internal.data[0] = vect_info;
2783 vcpu->run->internal.data[1] = intr_info;
2784 return 0;
2785 }
2737 2786
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2787 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2788 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2793 }
2745 2794
2746 if (is_invalid_opcode(intr_info)) { 2795 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2796 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2797 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2798 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2799 return 1;
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2852 return 0;
2804} 2853}
2805 2854
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2855static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2856{
2809 ++vcpu->stat.irq_exits; 2857 ++vcpu->stat.irq_exits;
2810 return 1; 2858 return 1;
2811} 2859}
2812 2860
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2861static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2862{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2863 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2864 return 0;
2817} 2865}
2818 2866
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2867static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2868{
2821 unsigned long exit_qualification; 2869 unsigned long exit_qualification;
2822 int size, in, string; 2870 int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2875 string = (exit_qualification & 16) != 0;
2828 2876
2829 if (string) { 2877 if (string) {
2830 if (emulate_instruction(vcpu, 2878 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2879 return 0;
2833 return 1; 2880 return 1;
2834 } 2881 }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2885 port = exit_qualification >> 16;
2839 2886
2840 skip_emulated_instruction(vcpu); 2887 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2888 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2889}
2843 2890
2844static void 2891static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2899 hypercall[2] = 0xc1;
2853} 2900}
2854 2901
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2902static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2903{
2857 unsigned long exit_qualification, val; 2904 unsigned long exit_qualification, val;
2858 int cr; 2905 int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 2934 return 1;
2888 if (cr8_prev <= cr8) 2935 if (cr8_prev <= cr8)
2889 return 1; 2936 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2937 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 2938 return 0;
2892 } 2939 }
2893 }; 2940 };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2922 default: 2969 default:
2923 break; 2970 break;
2924 } 2971 }
2925 kvm_run->exit_reason = 0; 2972 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 2973 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 2974 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 2975 return 0;
2929} 2976}
2930 2977
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2978static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 2979{
2933 unsigned long exit_qualification; 2980 unsigned long exit_qualification;
2934 unsigned long val; 2981 unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 2991 * guest debugging itself.
2945 */ 2992 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 2993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 2994 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 2995 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 2996 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 2997 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 2998 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 2999 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3000 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3001 return 0;
2955 } else { 3002 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3003 vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3063 return 1;
3017} 3064}
3018 3065
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3066static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3067{
3021 kvm_emulate_cpuid(vcpu); 3068 kvm_emulate_cpuid(vcpu);
3022 return 1; 3069 return 1;
3023} 3070}
3024 3071
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3072static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3073{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3074 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3075 u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3088 return 1;
3042} 3089}
3043 3090
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3091static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3092{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3093 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3058 return 1; 3105 return 1;
3059} 3106}
3060 3107
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3108static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3109{
3064 return 1; 3110 return 1;
3065} 3111}
3066 3112
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3113static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3114{
3070 u32 cpu_based_vm_exec_control; 3115 u32 cpu_based_vm_exec_control;
3071 3116
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3126 * possible
3082 */ 3127 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3128 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3129 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3130 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3131 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3132 return 0;
3088 } 3133 }
3089 return 1; 3134 return 1;
3090} 3135}
3091 3136
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3137static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3138{
3094 skip_emulated_instruction(vcpu); 3139 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3140 return kvm_emulate_halt(vcpu);
3096} 3141}
3097 3142
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3143static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3144{
3100 skip_emulated_instruction(vcpu); 3145 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3146 kvm_emulate_hypercall(vcpu);
3102 return 1; 3147 return 1;
3103} 3148}
3104 3149
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3150static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3151{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3152 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3153 return 1;
3109} 3154}
3110 3155
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3156static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3157{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3158 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3159
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3162 return 1;
3118} 3163}
3119 3164
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3165static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3166{
3122 skip_emulated_instruction(vcpu); 3167 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3168 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3169 return 1;
3125} 3170}
3126 3171
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3172static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3173{
3129 unsigned long exit_qualification; 3174 unsigned long exit_qualification;
3130 enum emulation_result er; 3175 enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3178 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3179 offset = exit_qualification & 0xffful;
3135 3180
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3181 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3182
3138 if (er != EMULATE_DONE) { 3183 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3184 printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3189 return 1;
3145} 3190}
3146 3191
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3192static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3193{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3194 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3195 unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3243 return 1;
3199} 3244}
3200 3245
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3246static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3247{
3203 unsigned long exit_qualification; 3248 unsigned long exit_qualification;
3204 gpa_t gpa; 3249 gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3264 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3265 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3266 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3267 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3268 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3269 return 0;
3225 } 3270 }
3226 3271
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3335 }
3291} 3336}
3292 3337
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3338static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3339{
3295 u64 sptes[4]; 3340 u64 sptes[4];
3296 int nr_sptes, i; 3341 int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3351 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3352 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3353
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3354 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3355 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3356
3312 return 0; 3357 return 0;
3313} 3358}
3314 3359
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3360static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3361{
3317 u32 cpu_based_vm_exec_control; 3362 u32 cpu_based_vm_exec_control;
3318 3363
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3370 return 1;
3326} 3371}
3327 3372
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3373static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3374{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3375 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3376 enum emulation_result err = EMULATE_DONE;
3333 3377 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3378
3337 while (!guest_state_valid(vcpu)) { 3379 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3380 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3381
3340 if (err == EMULATE_DO_MMIO) 3382 if (err == EMULATE_DO_MMIO) {
3341 break; 3383 ret = 0;
3384 goto out;
3385 }
3342 3386
3343 if (err != EMULATE_DONE) { 3387 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3345 break; 3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0;
3392 ret = 0;
3393 goto out;
3346 } 3394 }
3347 3395
3348 if (signal_pending(current)) 3396 if (signal_pending(current))
3349 break; 3397 goto out;
3350 if (need_resched()) 3398 if (need_resched())
3351 schedule(); 3399 schedule();
3352 } 3400 }
3353 3401
3354 preempt_disable(); 3402 vmx->emulation_required = 0;
3355 local_irq_disable(); 3403out:
3404 return ret;
3405}
3356 3406
3357 vmx->invalid_state_emulation_result = err; 3407/*
3408 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
3409 * exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
3410 */
3411static int handle_pause(struct kvm_vcpu *vcpu)
3412{
3413 skip_emulated_instruction(vcpu);
3414 kvm_vcpu_on_spin(vcpu);
3415
3416 return 1;
3358} 3417}
3359 3418
3360/* 3419/*
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3422 * to be done to userspace and return 0.
3364 */ 3423 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3424static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3425 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3426 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3427 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3452 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3397}; 3456};
3398 3457
3399static const int kvm_vmx_max_exit_handlers = 3458static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3462 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3463 * assistance.
3405 */ 3464 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3465static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3466{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3467 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3468 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3470
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3471 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3472
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3473 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3474 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3475 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3476
3422 /* Access CR3 don't cause VMExit in paging mode, so we need 3477 /* Access CR3 don't cause VMExit in paging mode, so we need
3423 * to sync with guest real CR3. */ 3478 * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3480 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3481
3427 if (unlikely(vmx->fail)) { 3482 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3483 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3484 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3485 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3486 return 0;
3432 } 3487 }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3514
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3515 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3516 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3517 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3518 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3519 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3520 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3521 }
3467 return 0; 3522 return 0;
3468} 3523}
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3655#define Q "l"
3601#endif 3656#endif
3602 3657
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3658static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3659{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3660 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3661
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3662 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3663 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3664 vmx->entry_time = ktime_get();
3614 3665
3615 /* Handle invalid guest state instead of entering VMX */ 3666 /* Don't enter VMX if guest state is invalid; let the exit handler
3616 if (vmx->emulation_required && emulate_invalid_guest_state) { 3667 start emulation until we arrive back at a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3668 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3669 return;
3619 }
3620 3670
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3671 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3672 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3825 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3826 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3827 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3828 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3829 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3830 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3851 goto uninit_vcpu;
3803 } 3852 }
3804 3853
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3854 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3855 if (!vmx->vmcs)
3811 goto free_msrs; 3856 goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3881free_vmcs:
3837 free_vmcs(vmx->vmcs); 3882 free_vmcs(vmx->vmcs);
3838free_msrs: 3883free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3884 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3885uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3886 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4016 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4017 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4018 .nmi_allowed = vmx_nmi_allowed,
4019 .get_nmi_mask = vmx_get_nmi_mask,
4020 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4021 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4022 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4023 .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
3987 4032
3988static int __init vmx_init(void) 4033static int __init vmx_init(void)
3989{ 4034{
3990 int r; 4035 int r, i;
4036
4037 rdmsrl_safe(MSR_EFER, &host_efer);
4038
4039 for (i = 0; i < NR_VMX_MSR; ++i)
4040 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4041
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4042 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4043 if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4099 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4100 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4101
4052 ept_sync_global();
4053
4054 return 0; 4102 return 0;
4055 4103
4056out3: 4104out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4fc80174191c..9d068966fb2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,6 +37,7 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
40#include <trace/events/kvm.h> 41#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 42#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
@@ -88,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
88int ignore_msrs = 0; 89int ignore_msrs = 0;
89module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 90module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
90 91
92#define KVM_NR_SHARED_MSRS 16
93
94struct kvm_shared_msrs_global {
95 int nr;
96 struct kvm_shared_msr {
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100};
101
102struct kvm_shared_msrs {
103 struct user_return_notifier urn;
104 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS];
106};
107
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
109static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
110
91struct kvm_stats_debugfs_item debugfs_entries[] = { 111struct kvm_stats_debugfs_item debugfs_entries[] = {
92 { "pf_fixed", VCPU_STAT(pf_fixed) }, 112 { "pf_fixed", VCPU_STAT(pf_fixed) },
93 { "pf_guest", VCPU_STAT(pf_guest) }, 113 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -124,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
124 { NULL } 144 { NULL }
125}; 145};
126 146
147static void kvm_on_user_return(struct user_return_notifier *urn)
148{
149 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn);
153
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot];
156 if (global->value != locals->current_value[slot]) {
157 wrmsrl(global->msr, global->value);
158 locals->current_value[slot] = global->value;
159 }
160 }
161 locals->registered = false;
162 user_return_notifier_unregister(urn);
163}
164
165void kvm_define_shared_msr(unsigned slot, u32 msr)
166{
167 int cpu;
168 u64 value;
169
170 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr;
173 rdmsrl_safe(msr, &value);
174 shared_msrs_global.msrs[slot].value = value;
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179
180static void kvm_shared_msr_cpu_online(void)
181{
182 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184
185 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value;
187}
188
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192
193 if (((value ^ smsr->current_value[slot]) & mask) == 0)
194 return;
195 smsr->current_value[slot] = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value);
197 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn);
200 smsr->registered = true;
201 }
202}
203EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
204
205static void drop_user_return_notifiers(void *ignore)
206{
207 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
208
209 if (smsr->registered)
210 kvm_on_user_return(&smsr->urn);
211}
212
127unsigned long segment_base(u16 selector) 213unsigned long segment_base(u16 selector)
128{ 214{
129 struct descriptor_table gdt; 215 struct descriptor_table gdt;
@@ -485,16 +571,19 @@ static inline u32 bit(int bitno)
485 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 571 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
486 * 572 *
487 * This list is modified at module load time to reflect the 573 * This list is modified at module load time to reflect the
488 * capabilities of the host cpu. 574 * capabilities of the host cpu. This capabilities test skips MSRs that are
575 * kvm-specific. Those are put in the beginning of the list.
489 */ 576 */
577
578#define KVM_SAVE_MSRS_BEGIN 2
490static u32 msrs_to_save[] = { 579static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
491 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
492 MSR_K6_STAR, 582 MSR_K6_STAR,
493#ifdef CONFIG_X86_64 583#ifdef CONFIG_X86_64
494 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 584 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
495#endif 585#endif
496 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 586 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
498}; 587};
499 588
500static unsigned num_msrs_to_save; 589static unsigned num_msrs_to_save;
@@ -678,7 +767,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
678 /* With all the info we got, fill in the values */ 767 /* With all the info we got, fill in the values */
679 768
680 vcpu->hv_clock.system_time = ts.tv_nsec + 769 vcpu->hv_clock.system_time = ts.tv_nsec +
681 (NSEC_PER_SEC * (u64)ts.tv_sec); 770 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
771
682 /* 772 /*
683 * The interface expects us to write an even number signaling that the 773 * The interface expects us to write an even number signaling that the
684 * update is finished. Since the guest won't see the intermediate 774 * update is finished. Since the guest won't see the intermediate
@@ -836,6 +926,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
836 return 0; 926 return 0;
837} 927}
838 928
929static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
930{
931 struct kvm *kvm = vcpu->kvm;
932 int lm = is_long_mode(vcpu);
933 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
934 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
935 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
936 : kvm->arch.xen_hvm_config.blob_size_32;
937 u32 page_num = data & ~PAGE_MASK;
938 u64 page_addr = data & PAGE_MASK;
939 u8 *page;
940 int r;
941
942 r = -E2BIG;
943 if (page_num >= blob_size)
944 goto out;
945 r = -ENOMEM;
946 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
947 if (!page)
948 goto out;
949 r = -EFAULT;
950 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
951 goto out_free;
952 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
953 goto out_free;
954 r = 0;
955out_free:
956 kfree(page);
957out:
958 return r;
959}
960
839int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
840{ 962{
841 switch (msr) { 963 switch (msr) {
@@ -951,6 +1073,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
951 "0x%x data 0x%llx\n", msr, data); 1073 "0x%x data 0x%llx\n", msr, data);
952 break; 1074 break;
953 default: 1075 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data);
954 if (!ignore_msrs) { 1078 if (!ignore_msrs) {
955 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1079 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
956 msr, data); 1080 msr, data);
@@ -1225,6 +1349,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1225 case KVM_CAP_PIT2: 1349 case KVM_CAP_PIT2:
1226 case KVM_CAP_PIT_STATE2: 1350 case KVM_CAP_PIT_STATE2:
1227 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1351 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1352 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS:
1228 r = 1; 1355 r = 1;
1229 break; 1356 break;
1230 case KVM_CAP_COALESCED_MMIO: 1357 case KVM_CAP_COALESCED_MMIO:
@@ -1239,8 +1366,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1239 case KVM_CAP_NR_MEMSLOTS: 1366 case KVM_CAP_NR_MEMSLOTS:
1240 r = KVM_MEMORY_SLOTS; 1367 r = KVM_MEMORY_SLOTS;
1241 break; 1368 break;
1242 case KVM_CAP_PV_MMU: 1369 case KVM_CAP_PV_MMU: /* obsolete */
1243 r = !tdp_enabled; 1370 r = 0;
1244 break; 1371 break;
1245 case KVM_CAP_IOMMU: 1372 case KVM_CAP_IOMMU:
1246 r = iommu_found(); 1373 r = iommu_found();
@@ -1327,6 +1454,12 @@ out:
1327void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1454void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1328{ 1455{
1329 kvm_x86_ops->vcpu_load(vcpu, cpu); 1456 kvm_x86_ops->vcpu_load(vcpu, cpu);
1457 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1458 unsigned long khz = cpufreq_quick_get(cpu);
1459 if (!khz)
1460 khz = tsc_khz;
1461 per_cpu(cpu_tsc_khz, cpu) = khz;
1462 }
1330 kvm_request_guest_time_update(vcpu); 1463 kvm_request_guest_time_update(vcpu);
1331} 1464}
1332 1465
@@ -1760,6 +1893,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1760 return 0; 1893 return 0;
1761} 1894}
1762 1895
1896static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1897 struct kvm_vcpu_events *events)
1898{
1899 vcpu_load(vcpu);
1900
1901 events->exception.injected = vcpu->arch.exception.pending;
1902 events->exception.nr = vcpu->arch.exception.nr;
1903 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1904 events->exception.error_code = vcpu->arch.exception.error_code;
1905
1906 events->interrupt.injected = vcpu->arch.interrupt.pending;
1907 events->interrupt.nr = vcpu->arch.interrupt.nr;
1908 events->interrupt.soft = vcpu->arch.interrupt.soft;
1909
1910 events->nmi.injected = vcpu->arch.nmi_injected;
1911 events->nmi.pending = vcpu->arch.nmi_pending;
1912 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1913
1914 events->sipi_vector = vcpu->arch.sipi_vector;
1915
1916 events->flags = 0;
1917
1918 vcpu_put(vcpu);
1919}
1920
1921static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1922 struct kvm_vcpu_events *events)
1923{
1924 if (events->flags)
1925 return -EINVAL;
1926
1927 vcpu_load(vcpu);
1928
1929 vcpu->arch.exception.pending = events->exception.injected;
1930 vcpu->arch.exception.nr = events->exception.nr;
1931 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
1932 vcpu->arch.exception.error_code = events->exception.error_code;
1933
1934 vcpu->arch.interrupt.pending = events->interrupt.injected;
1935 vcpu->arch.interrupt.nr = events->interrupt.nr;
1936 vcpu->arch.interrupt.soft = events->interrupt.soft;
1937 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
1938 kvm_pic_clear_isr_ack(vcpu->kvm);
1939
1940 vcpu->arch.nmi_injected = events->nmi.injected;
1941 vcpu->arch.nmi_pending = events->nmi.pending;
1942 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1943
1944 vcpu->arch.sipi_vector = events->sipi_vector;
1945
1946 vcpu_put(vcpu);
1947
1948 return 0;
1949}
1950
1763long kvm_arch_vcpu_ioctl(struct file *filp, 1951long kvm_arch_vcpu_ioctl(struct file *filp,
1764 unsigned int ioctl, unsigned long arg) 1952 unsigned int ioctl, unsigned long arg)
1765{ 1953{
@@ -1770,6 +1958,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1770 1958
1771 switch (ioctl) { 1959 switch (ioctl) {
1772 case KVM_GET_LAPIC: { 1960 case KVM_GET_LAPIC: {
1961 r = -EINVAL;
1962 if (!vcpu->arch.apic)
1963 goto out;
1773 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1964 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1774 1965
1775 r = -ENOMEM; 1966 r = -ENOMEM;
@@ -1785,6 +1976,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1785 break; 1976 break;
1786 } 1977 }
1787 case KVM_SET_LAPIC: { 1978 case KVM_SET_LAPIC: {
1979 r = -EINVAL;
1980 if (!vcpu->arch.apic)
1981 goto out;
1788 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1982 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1789 r = -ENOMEM; 1983 r = -ENOMEM;
1790 if (!lapic) 1984 if (!lapic)
@@ -1911,6 +2105,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1911 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2105 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1912 break; 2106 break;
1913 } 2107 }
2108 case KVM_GET_VCPU_EVENTS: {
2109 struct kvm_vcpu_events events;
2110
2111 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2112
2113 r = -EFAULT;
2114 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2115 break;
2116 r = 0;
2117 break;
2118 }
2119 case KVM_SET_VCPU_EVENTS: {
2120 struct kvm_vcpu_events events;
2121
2122 r = -EFAULT;
2123 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2124 break;
2125
2126 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2127 break;
2128 }
1914 default: 2129 default:
1915 r = -EINVAL; 2130 r = -EINVAL;
1916 } 2131 }
@@ -2039,9 +2254,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2039 sizeof(struct kvm_pic_state)); 2254 sizeof(struct kvm_pic_state));
2040 break; 2255 break;
2041 case KVM_IRQCHIP_IOAPIC: 2256 case KVM_IRQCHIP_IOAPIC:
2042 memcpy(&chip->chip.ioapic, 2257 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2043 ioapic_irqchip(kvm),
2044 sizeof(struct kvm_ioapic_state));
2045 break; 2258 break;
2046 default: 2259 default:
2047 r = -EINVAL; 2260 r = -EINVAL;
@@ -2071,11 +2284,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2071 spin_unlock(&pic_irqchip(kvm)->lock); 2284 spin_unlock(&pic_irqchip(kvm)->lock);
2072 break; 2285 break;
2073 case KVM_IRQCHIP_IOAPIC: 2286 case KVM_IRQCHIP_IOAPIC:
2074 mutex_lock(&kvm->irq_lock); 2287 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2075 memcpy(ioapic_irqchip(kvm),
2076 &chip->chip.ioapic,
2077 sizeof(struct kvm_ioapic_state));
2078 mutex_unlock(&kvm->irq_lock);
2079 break; 2288 break;
2080 default: 2289 default:
2081 r = -EINVAL; 2290 r = -EINVAL;
@@ -2183,7 +2392,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2183{ 2392{
2184 struct kvm *kvm = filp->private_data; 2393 struct kvm *kvm = filp->private_data;
2185 void __user *argp = (void __user *)arg; 2394 void __user *argp = (void __user *)arg;
2186 int r = -EINVAL; 2395 int r = -ENOTTY;
2187 /* 2396 /*
2188 * This union makes it completely explicit to gcc-3.x 2397 * This union makes it completely explicit to gcc-3.x
2189 * that these two variables' stack usage should be 2398 * that these two variables' stack usage should be
@@ -2245,25 +2454,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2245 if (r) 2454 if (r)
2246 goto out; 2455 goto out;
2247 break; 2456 break;
2248 case KVM_CREATE_IRQCHIP: 2457 case KVM_CREATE_IRQCHIP: {
2458 struct kvm_pic *vpic;
2459
2460 mutex_lock(&kvm->lock);
2461 r = -EEXIST;
2462 if (kvm->arch.vpic)
2463 goto create_irqchip_unlock;
2249 r = -ENOMEM; 2464 r = -ENOMEM;
2250 kvm->arch.vpic = kvm_create_pic(kvm); 2465 vpic = kvm_create_pic(kvm);
2251 if (kvm->arch.vpic) { 2466 if (vpic) {
2252 r = kvm_ioapic_init(kvm); 2467 r = kvm_ioapic_init(kvm);
2253 if (r) { 2468 if (r) {
2254 kfree(kvm->arch.vpic); 2469 kfree(vpic);
2255 kvm->arch.vpic = NULL; 2470 goto create_irqchip_unlock;
2256 goto out;
2257 } 2471 }
2258 } else 2472 } else
2259 goto out; 2473 goto create_irqchip_unlock;
2474 smp_wmb();
2475 kvm->arch.vpic = vpic;
2476 smp_wmb();
2260 r = kvm_setup_default_irq_routing(kvm); 2477 r = kvm_setup_default_irq_routing(kvm);
2261 if (r) { 2478 if (r) {
2479 mutex_lock(&kvm->irq_lock);
2262 kfree(kvm->arch.vpic); 2480 kfree(kvm->arch.vpic);
2263 kfree(kvm->arch.vioapic); 2481 kfree(kvm->arch.vioapic);
2264 goto out; 2482 kvm->arch.vpic = NULL;
2483 kvm->arch.vioapic = NULL;
2484 mutex_unlock(&kvm->irq_lock);
2265 } 2485 }
2486 create_irqchip_unlock:
2487 mutex_unlock(&kvm->lock);
2266 break; 2488 break;
2489 }
2267 case KVM_CREATE_PIT: 2490 case KVM_CREATE_PIT:
2268 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2491 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2269 goto create_pit; 2492 goto create_pit;
@@ -2293,10 +2516,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2293 goto out; 2516 goto out;
2294 if (irqchip_in_kernel(kvm)) { 2517 if (irqchip_in_kernel(kvm)) {
2295 __s32 status; 2518 __s32 status;
2296 mutex_lock(&kvm->irq_lock);
2297 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2519 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2298 irq_event.irq, irq_event.level); 2520 irq_event.irq, irq_event.level);
2299 mutex_unlock(&kvm->irq_lock);
2300 if (ioctl == KVM_IRQ_LINE_STATUS) { 2521 if (ioctl == KVM_IRQ_LINE_STATUS) {
2301 irq_event.status = status; 2522 irq_event.status = status;
2302 if (copy_to_user(argp, &irq_event, 2523 if (copy_to_user(argp, &irq_event,
@@ -2422,6 +2643,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2422 r = 0; 2643 r = 0;
2423 break; 2644 break;
2424 } 2645 }
2646 case KVM_XEN_HVM_CONFIG: {
2647 r = -EFAULT;
2648 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2649 sizeof(struct kvm_xen_hvm_config)))
2650 goto out;
2651 r = -EINVAL;
2652 if (kvm->arch.xen_hvm_config.flags)
2653 goto out;
2654 r = 0;
2655 break;
2656 }
2657 case KVM_SET_CLOCK: {
2658 struct timespec now;
2659 struct kvm_clock_data user_ns;
2660 u64 now_ns;
2661 s64 delta;
2662
2663 r = -EFAULT;
2664 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2665 goto out;
2666
2667 r = -EINVAL;
2668 if (user_ns.flags)
2669 goto out;
2670
2671 r = 0;
2672 ktime_get_ts(&now);
2673 now_ns = timespec_to_ns(&now);
2674 delta = user_ns.clock - now_ns;
2675 kvm->arch.kvmclock_offset = delta;
2676 break;
2677 }
2678 case KVM_GET_CLOCK: {
2679 struct timespec now;
2680 struct kvm_clock_data user_ns;
2681 u64 now_ns;
2682
2683 ktime_get_ts(&now);
2684 now_ns = timespec_to_ns(&now);
2685 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2686 user_ns.flags = 0;
2687
2688 r = -EFAULT;
2689 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2690 goto out;
2691 r = 0;
2692 break;
2693 }
2694
2425 default: 2695 default:
2426 ; 2696 ;
2427 } 2697 }
@@ -2434,7 +2704,8 @@ static void kvm_init_msr_list(void)
2434 u32 dummy[2]; 2704 u32 dummy[2];
2435 unsigned i, j; 2705 unsigned i, j;
2436 2706
2437 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2707 /* skip the first msrs in the list. KVM-specific */
2708 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2438 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2709 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2439 continue; 2710 continue;
2440 if (j < i) 2711 if (j < i)
@@ -2758,13 +3029,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2758} 3029}
2759 3030
2760int emulate_instruction(struct kvm_vcpu *vcpu, 3031int emulate_instruction(struct kvm_vcpu *vcpu,
2761 struct kvm_run *run,
2762 unsigned long cr2, 3032 unsigned long cr2,
2763 u16 error_code, 3033 u16 error_code,
2764 int emulation_type) 3034 int emulation_type)
2765{ 3035{
2766 int r, shadow_mask; 3036 int r, shadow_mask;
2767 struct decode_cache *c; 3037 struct decode_cache *c;
3038 struct kvm_run *run = vcpu->run;
2768 3039
2769 kvm_clear_exception_queue(vcpu); 3040 kvm_clear_exception_queue(vcpu);
2770 vcpu->arch.mmio_fault_cr2 = cr2; 3041 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2784,7 +3055,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2784 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3055 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2785 3056
2786 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3057 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2787 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3058 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2788 vcpu->arch.emulate_ctxt.mode = 3059 vcpu->arch.emulate_ctxt.mode =
2789 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3060 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2790 ? X86EMUL_MODE_REAL : cs_l 3061 ? X86EMUL_MODE_REAL : cs_l
@@ -2862,7 +3133,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2862 return EMULATE_DO_MMIO; 3133 return EMULATE_DO_MMIO;
2863 } 3134 }
2864 3135
2865 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3136 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2866 3137
2867 if (vcpu->mmio_is_write) { 3138 if (vcpu->mmio_is_write) {
2868 vcpu->mmio_needed = 0; 3139 vcpu->mmio_needed = 0;
@@ -2970,8 +3241,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2970 return r; 3241 return r;
2971} 3242}
2972 3243
2973int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3244int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2974 int size, unsigned port)
2975{ 3245{
2976 unsigned long val; 3246 unsigned long val;
2977 3247
@@ -3000,7 +3270,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3000} 3270}
3001EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3271EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3002 3272
3003int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3273int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3004 int size, unsigned long count, int down, 3274 int size, unsigned long count, int down,
3005 gva_t address, int rep, unsigned port) 3275 gva_t address, int rep, unsigned port)
3006{ 3276{
@@ -3073,9 +3343,6 @@ static void bounce_off(void *info)
3073 /* nothing */ 3343 /* nothing */
3074} 3344}
3075 3345
3076static unsigned int ref_freq;
3077static unsigned long tsc_khz_ref;
3078
3079static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3346static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3080 void *data) 3347 void *data)
3081{ 3348{
@@ -3084,14 +3351,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3084 struct kvm_vcpu *vcpu; 3351 struct kvm_vcpu *vcpu;
3085 int i, send_ipi = 0; 3352 int i, send_ipi = 0;
3086 3353
3087 if (!ref_freq)
3088 ref_freq = freq->old;
3089
3090 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3354 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3091 return 0; 3355 return 0;
3092 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3356 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3093 return 0; 3357 return 0;
3094 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3358 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3095 3359
3096 spin_lock(&kvm_lock); 3360 spin_lock(&kvm_lock);
3097 list_for_each_entry(kvm, &vm_list, vm_list) { 3361 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3128,9 +3392,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3128 .notifier_call = kvmclock_cpufreq_notifier 3392 .notifier_call = kvmclock_cpufreq_notifier
3129}; 3393};
3130 3394
3395static void kvm_timer_init(void)
3396{
3397 int cpu;
3398
3399 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3400 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3401 CPUFREQ_TRANSITION_NOTIFIER);
3402 for_each_online_cpu(cpu) {
3403 unsigned long khz = cpufreq_get(cpu);
3404 if (!khz)
3405 khz = tsc_khz;
3406 per_cpu(cpu_tsc_khz, cpu) = khz;
3407 }
3408 } else {
3409 for_each_possible_cpu(cpu)
3410 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3411 }
3412}
3413
3131int kvm_arch_init(void *opaque) 3414int kvm_arch_init(void *opaque)
3132{ 3415{
3133 int r, cpu; 3416 int r;
3134 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3417 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3135 3418
3136 if (kvm_x86_ops) { 3419 if (kvm_x86_ops) {
@@ -3162,13 +3445,7 @@ int kvm_arch_init(void *opaque)
3162 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3445 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3163 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3446 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3164 3447
3165 for_each_possible_cpu(cpu) 3448 kvm_timer_init();
3166 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3167 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3168 tsc_khz_ref = tsc_khz;
3169 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3170 CPUFREQ_TRANSITION_NOTIFIER);
3171 }
3172 3449
3173 return 0; 3450 return 0;
3174 3451
@@ -3296,7 +3573,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3296 unsigned long *rflags) 3573 unsigned long *rflags)
3297{ 3574{
3298 kvm_lmsw(vcpu, msw); 3575 kvm_lmsw(vcpu, msw);
3299 *rflags = kvm_x86_ops->get_rflags(vcpu); 3576 *rflags = kvm_get_rflags(vcpu);
3300} 3577}
3301 3578
3302unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3579unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3334,7 +3611,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3334 switch (cr) { 3611 switch (cr) {
3335 case 0: 3612 case 0:
3336 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3613 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3337 *rflags = kvm_x86_ops->get_rflags(vcpu); 3614 *rflags = kvm_get_rflags(vcpu);
3338 break; 3615 break;
3339 case 2: 3616 case 2:
3340 vcpu->arch.cr2 = val; 3617 vcpu->arch.cr2 = val;
@@ -3454,18 +3731,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3454 * 3731 *
3455 * No need to exit to userspace if we already have an interrupt queued. 3732 * No need to exit to userspace if we already have an interrupt queued.
3456 */ 3733 */
3457static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3734static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3458 struct kvm_run *kvm_run)
3459{ 3735{
3460 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3736 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3461 kvm_run->request_interrupt_window && 3737 vcpu->run->request_interrupt_window &&
3462 kvm_arch_interrupt_allowed(vcpu)); 3738 kvm_arch_interrupt_allowed(vcpu));
3463} 3739}
3464 3740
3465static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3741static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3466 struct kvm_run *kvm_run)
3467{ 3742{
3468 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3743 struct kvm_run *kvm_run = vcpu->run;
3744
3745 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3469 kvm_run->cr8 = kvm_get_cr8(vcpu); 3746 kvm_run->cr8 = kvm_get_cr8(vcpu);
3470 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3747 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3471 if (irqchip_in_kernel(vcpu->kvm)) 3748 if (irqchip_in_kernel(vcpu->kvm))
@@ -3526,7 +3803,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3526 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3803 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3527} 3804}
3528 3805
3529static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3806static void inject_pending_event(struct kvm_vcpu *vcpu)
3530{ 3807{
3531 /* try to reinject previous events if any */ 3808 /* try to reinject previous events if any */
3532 if (vcpu->arch.exception.pending) { 3809 if (vcpu->arch.exception.pending) {
@@ -3562,11 +3839,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3562 } 3839 }
3563} 3840}
3564 3841
3565static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3842static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3566{ 3843{
3567 int r; 3844 int r;
3568 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3845 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3569 kvm_run->request_interrupt_window; 3846 vcpu->run->request_interrupt_window;
3570 3847
3571 if (vcpu->requests) 3848 if (vcpu->requests)
3572 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3849 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3587,12 +3864,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3587 kvm_x86_ops->tlb_flush(vcpu); 3864 kvm_x86_ops->tlb_flush(vcpu);
3588 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3865 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3589 &vcpu->requests)) { 3866 &vcpu->requests)) {
3590 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3867 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3591 r = 0; 3868 r = 0;
3592 goto out; 3869 goto out;
3593 } 3870 }
3594 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3871 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3595 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3872 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3596 r = 0; 3873 r = 0;
3597 goto out; 3874 goto out;
3598 } 3875 }
@@ -3616,7 +3893,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3616 goto out; 3893 goto out;
3617 } 3894 }
3618 3895
3619 inject_pending_event(vcpu, kvm_run); 3896 inject_pending_event(vcpu);
3620 3897
3621 /* enable NMI/IRQ window open exits if needed */ 3898 /* enable NMI/IRQ window open exits if needed */
3622 if (vcpu->arch.nmi_pending) 3899 if (vcpu->arch.nmi_pending)
@@ -3642,7 +3919,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3642 } 3919 }
3643 3920
3644 trace_kvm_entry(vcpu->vcpu_id); 3921 trace_kvm_entry(vcpu->vcpu_id);
3645 kvm_x86_ops->run(vcpu, kvm_run); 3922 kvm_x86_ops->run(vcpu);
3646 3923
3647 /* 3924 /*
3648 * If the guest has used debug registers, at least dr7 3925 * If the guest has used debug registers, at least dr7
@@ -3684,13 +3961,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3684 3961
3685 kvm_lapic_sync_from_vapic(vcpu); 3962 kvm_lapic_sync_from_vapic(vcpu);
3686 3963
3687 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3964 r = kvm_x86_ops->handle_exit(vcpu);
3688out: 3965out:
3689 return r; 3966 return r;
3690} 3967}
3691 3968
3692 3969
3693static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3970static int __vcpu_run(struct kvm_vcpu *vcpu)
3694{ 3971{
3695 int r; 3972 int r;
3696 3973
@@ -3710,7 +3987,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3710 r = 1; 3987 r = 1;
3711 while (r > 0) { 3988 while (r > 0) {
3712 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3989 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3713 r = vcpu_enter_guest(vcpu, kvm_run); 3990 r = vcpu_enter_guest(vcpu);
3714 else { 3991 else {
3715 up_read(&vcpu->kvm->slots_lock); 3992 up_read(&vcpu->kvm->slots_lock);
3716 kvm_vcpu_block(vcpu); 3993 kvm_vcpu_block(vcpu);
@@ -3738,14 +4015,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3738 if (kvm_cpu_has_pending_timer(vcpu)) 4015 if (kvm_cpu_has_pending_timer(vcpu))
3739 kvm_inject_pending_timer_irqs(vcpu); 4016 kvm_inject_pending_timer_irqs(vcpu);
3740 4017
3741 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4018 if (dm_request_for_irq_injection(vcpu)) {
3742 r = -EINTR; 4019 r = -EINTR;
3743 kvm_run->exit_reason = KVM_EXIT_INTR; 4020 vcpu->run->exit_reason = KVM_EXIT_INTR;
3744 ++vcpu->stat.request_irq_exits; 4021 ++vcpu->stat.request_irq_exits;
3745 } 4022 }
3746 if (signal_pending(current)) { 4023 if (signal_pending(current)) {
3747 r = -EINTR; 4024 r = -EINTR;
3748 kvm_run->exit_reason = KVM_EXIT_INTR; 4025 vcpu->run->exit_reason = KVM_EXIT_INTR;
3749 ++vcpu->stat.signal_exits; 4026 ++vcpu->stat.signal_exits;
3750 } 4027 }
3751 if (need_resched()) { 4028 if (need_resched()) {
@@ -3756,7 +4033,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3756 } 4033 }
3757 4034
3758 up_read(&vcpu->kvm->slots_lock); 4035 up_read(&vcpu->kvm->slots_lock);
3759 post_kvm_run_save(vcpu, kvm_run); 4036 post_kvm_run_save(vcpu);
3760 4037
3761 vapic_exit(vcpu); 4038 vapic_exit(vcpu);
3762 4039
@@ -3789,15 +4066,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3789 if (r) 4066 if (r)
3790 goto out; 4067 goto out;
3791 } 4068 }
3792#if CONFIG_HAS_IOMEM
3793 if (vcpu->mmio_needed) { 4069 if (vcpu->mmio_needed) {
3794 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4070 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3795 vcpu->mmio_read_completed = 1; 4071 vcpu->mmio_read_completed = 1;
3796 vcpu->mmio_needed = 0; 4072 vcpu->mmio_needed = 0;
3797 4073
3798 down_read(&vcpu->kvm->slots_lock); 4074 down_read(&vcpu->kvm->slots_lock);
3799 r = emulate_instruction(vcpu, kvm_run, 4075 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3800 vcpu->arch.mmio_fault_cr2, 0,
3801 EMULTYPE_NO_DECODE); 4076 EMULTYPE_NO_DECODE);
3802 up_read(&vcpu->kvm->slots_lock); 4077 up_read(&vcpu->kvm->slots_lock);
3803 if (r == EMULATE_DO_MMIO) { 4078 if (r == EMULATE_DO_MMIO) {
@@ -3808,12 +4083,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3808 goto out; 4083 goto out;
3809 } 4084 }
3810 } 4085 }
3811#endif
3812 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4086 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3813 kvm_register_write(vcpu, VCPU_REGS_RAX, 4087 kvm_register_write(vcpu, VCPU_REGS_RAX,
3814 kvm_run->hypercall.ret); 4088 kvm_run->hypercall.ret);
3815 4089
3816 r = __vcpu_run(vcpu, kvm_run); 4090 r = __vcpu_run(vcpu);
3817 4091
3818out: 4092out:
3819 if (vcpu->sigset_active) 4093 if (vcpu->sigset_active)
@@ -3847,13 +4121,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3847#endif 4121#endif
3848 4122
3849 regs->rip = kvm_rip_read(vcpu); 4123 regs->rip = kvm_rip_read(vcpu);
3850 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4124 regs->rflags = kvm_get_rflags(vcpu);
3851
3852 /*
3853 * Don't leak debug flags in case they were set for guest debugging
3854 */
3855 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3856 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3857 4125
3858 vcpu_put(vcpu); 4126 vcpu_put(vcpu);
3859 4127
@@ -3881,12 +4149,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3881 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4149 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3882 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4150 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3883 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4151 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3884
3885#endif 4152#endif
3886 4153
3887 kvm_rip_write(vcpu, regs->rip); 4154 kvm_rip_write(vcpu, regs->rip);
3888 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4155 kvm_set_rflags(vcpu, regs->rflags);
3889
3890 4156
3891 vcpu->arch.exception.pending = false; 4157 vcpu->arch.exception.pending = false;
3892 4158
@@ -4105,7 +4371,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4105{ 4371{
4106 return (seg != VCPU_SREG_LDTR) && 4372 return (seg != VCPU_SREG_LDTR) &&
4107 (seg != VCPU_SREG_TR) && 4373 (seg != VCPU_SREG_TR) &&
4108 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4374 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4109} 4375}
4110 4376
4111int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4377int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4133,7 +4399,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4133{ 4399{
4134 tss->cr3 = vcpu->arch.cr3; 4400 tss->cr3 = vcpu->arch.cr3;
4135 tss->eip = kvm_rip_read(vcpu); 4401 tss->eip = kvm_rip_read(vcpu);
4136 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4402 tss->eflags = kvm_get_rflags(vcpu);
4137 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4403 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4138 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4404 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4139 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4405 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4157,7 +4423,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4157 kvm_set_cr3(vcpu, tss->cr3); 4423 kvm_set_cr3(vcpu, tss->cr3);
4158 4424
4159 kvm_rip_write(vcpu, tss->eip); 4425 kvm_rip_write(vcpu, tss->eip);
4160 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4426 kvm_set_rflags(vcpu, tss->eflags | 2);
4161 4427
4162 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4428 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4163 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4429 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4195,7 +4461,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4195 struct tss_segment_16 *tss) 4461 struct tss_segment_16 *tss)
4196{ 4462{
4197 tss->ip = kvm_rip_read(vcpu); 4463 tss->ip = kvm_rip_read(vcpu);
4198 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4464 tss->flag = kvm_get_rflags(vcpu);
4199 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4465 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4200 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4466 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4201 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4467 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4210,14 +4476,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4210 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4476 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4211 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4477 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4212 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4478 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4213 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4214} 4479}
4215 4480
4216static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4481static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4217 struct tss_segment_16 *tss) 4482 struct tss_segment_16 *tss)
4218{ 4483{
4219 kvm_rip_write(vcpu, tss->ip); 4484 kvm_rip_write(vcpu, tss->ip);
4220 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4485 kvm_set_rflags(vcpu, tss->flag | 2);
4221 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4486 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4222 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4487 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4223 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4488 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4363,8 +4628,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4363 } 4628 }
4364 4629
4365 if (reason == TASK_SWITCH_IRET) { 4630 if (reason == TASK_SWITCH_IRET) {
4366 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4631 u32 eflags = kvm_get_rflags(vcpu);
4367 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4632 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4368 } 4633 }
4369 4634
4370 /* set back link to prev task only if NT bit is set in eflags 4635 /* set back link to prev task only if NT bit is set in eflags
@@ -4372,11 +4637,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4372 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4637 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4373 old_tss_sel = 0xffff; 4638 old_tss_sel = 0xffff;
4374 4639
4375 /* set back link to prev task only if NT bit is set in eflags
4376 note that old_tss_sel is not used afetr this point */
4377 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4378 old_tss_sel = 0xffff;
4379
4380 if (nseg_desc.type & 8) 4640 if (nseg_desc.type & 8)
4381 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4641 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4382 old_tss_base, &nseg_desc); 4642 old_tss_base, &nseg_desc);
@@ -4385,8 +4645,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4385 old_tss_base, &nseg_desc); 4645 old_tss_base, &nseg_desc);
4386 4646
4387 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4647 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4388 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4648 u32 eflags = kvm_get_rflags(vcpu);
4389 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4649 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4390 } 4650 }
4391 4651
4392 if (reason != TASK_SWITCH_IRET) { 4652 if (reason != TASK_SWITCH_IRET) {
@@ -4438,8 +4698,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4438 4698
4439 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4699 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4440 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4700 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4441 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4701 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4442 load_pdptrs(vcpu, vcpu->arch.cr3); 4702 load_pdptrs(vcpu, vcpu->arch.cr3);
4703 mmu_reset_needed = 1;
4704 }
4443 4705
4444 if (mmu_reset_needed) 4706 if (mmu_reset_needed)
4445 kvm_mmu_reset_context(vcpu); 4707 kvm_mmu_reset_context(vcpu);
@@ -4480,12 +4742,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4480int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4742int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4481 struct kvm_guest_debug *dbg) 4743 struct kvm_guest_debug *dbg)
4482{ 4744{
4745 unsigned long rflags;
4483 int i, r; 4746 int i, r;
4484 4747
4485 vcpu_load(vcpu); 4748 vcpu_load(vcpu);
4486 4749
4487 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4750 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4488 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4751 r = -EBUSY;
4752 if (vcpu->arch.exception.pending)
4753 goto unlock_out;
4754 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4755 kvm_queue_exception(vcpu, DB_VECTOR);
4756 else
4757 kvm_queue_exception(vcpu, BP_VECTOR);
4758 }
4759
4760 /*
4761 * Read rflags as long as potentially injected trace flags are still
4762 * filtered out.
4763 */
4764 rflags = kvm_get_rflags(vcpu);
4765
4766 vcpu->guest_debug = dbg->control;
4767 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4768 vcpu->guest_debug = 0;
4769
4770 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4489 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4771 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4490 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4772 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4491 vcpu->arch.switch_db_regs = 4773 vcpu->arch.switch_db_regs =
@@ -4496,13 +4778,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4496 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4778 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4497 } 4779 }
4498 4780
4499 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4781 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4782 vcpu->arch.singlestep_cs =
4783 get_segment_selector(vcpu, VCPU_SREG_CS);
4784 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4785 }
4786
4787 /*
4788 * Trigger an rflags update that will inject or remove the trace
4789 * flags.
4790 */
4791 kvm_set_rflags(vcpu, rflags);
4792
4793 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4500 4794
4501 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4795 r = 0;
4502 kvm_queue_exception(vcpu, DB_VECTOR);
4503 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4504 kvm_queue_exception(vcpu, BP_VECTOR);
4505 4796
4797unlock_out:
4506 vcpu_put(vcpu); 4798 vcpu_put(vcpu);
4507 4799
4508 return r; 4800 return r;
@@ -4703,14 +4995,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4703 return kvm_x86_ops->vcpu_reset(vcpu); 4995 return kvm_x86_ops->vcpu_reset(vcpu);
4704} 4996}
4705 4997
4706void kvm_arch_hardware_enable(void *garbage) 4998int kvm_arch_hardware_enable(void *garbage)
4707{ 4999{
4708 kvm_x86_ops->hardware_enable(garbage); 5000 /*
5001 * Since this may be called from a hotplug notification,
5002 * we can't get the CPU frequency directly.
5003 */
5004 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5005 int cpu = raw_smp_processor_id();
5006 per_cpu(cpu_tsc_khz, cpu) = 0;
5007 }
5008
5009 kvm_shared_msr_cpu_online();
5010
5011 return kvm_x86_ops->hardware_enable(garbage);
4709} 5012}
4710 5013
4711void kvm_arch_hardware_disable(void *garbage) 5014void kvm_arch_hardware_disable(void *garbage)
4712{ 5015{
4713 kvm_x86_ops->hardware_disable(garbage); 5016 kvm_x86_ops->hardware_disable(garbage);
5017 drop_user_return_notifiers(garbage);
4714} 5018}
4715 5019
4716int kvm_arch_hardware_setup(void) 5020int kvm_arch_hardware_setup(void)
@@ -4948,8 +5252,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4948 return kvm_x86_ops->interrupt_allowed(vcpu); 5252 return kvm_x86_ops->interrupt_allowed(vcpu);
4949} 5253}
4950 5254
5255unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5256{
5257 unsigned long rflags;
5258
5259 rflags = kvm_x86_ops->get_rflags(vcpu);
5260 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5261 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5262 return rflags;
5263}
5264EXPORT_SYMBOL_GPL(kvm_get_rflags);
5265
5266void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5267{
5268 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5269 vcpu->arch.singlestep_cs ==
5270 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5271 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5272 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5273 kvm_x86_ops->set_rflags(vcpu, rflags);
5274}
5275EXPORT_SYMBOL_GPL(kvm_set_rflags);
5276
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5277EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5278EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5279EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4954EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5280EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4955EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5281EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..d406c5239019 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
146 use_gbpages = direct_gbpages; 146 use_gbpages = direct_gbpages;
147#endif 147#endif
148 148
149 set_nx();
150 if (nx_enabled)
151 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
152
153 /* Enable PSE if available */ 149 /* Enable PSE if available */
154 if (cpu_has_pse) 150 if (cpu_has_pse)
155 set_in_cr4(X86_CR4_PSE); 151 set_in_cr4(X86_CR4_PSE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..c973f8e2a6cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
412 pkmap_page_table = pte; 412 pkmap_page_table = pte;
413} 413}
414 414
415static void __init add_one_highpage_init(struct page *page, int pfn) 415static void __init add_one_highpage_init(struct page *page)
416{ 416{
417 ClearPageReserved(page); 417 ClearPageReserved(page);
418 init_page_count(page); 418 init_page_count(page);
@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
445 if (!pfn_valid(node_pfn)) 445 if (!pfn_valid(node_pfn))
446 continue; 446 continue;
447 page = pfn_to_page(node_pfn); 447 page = pfn_to_page(node_pfn);
448 add_one_highpage_init(page, node_pfn); 448 add_one_highpage_init(page);
449 } 449 }
450 450
451 return 0; 451 return 0;
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
703} 703}
704 704
705#ifndef CONFIG_NEED_MULTIPLE_NODES 705#ifndef CONFIG_NEED_MULTIPLE_NODES
706void __init initmem_init(unsigned long start_pfn, 706void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
707 unsigned long end_pfn) 707 int acpi, int k8)
708{ 708{
709#ifdef CONFIG_HIGHMEM 709#ifdef CONFIG_HIGHMEM
710 highstart_pfn = highend_pfn = max_pfn; 710 highstart_pfn = highend_pfn = max_pfn;
@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
997const int rodata_test_data = 0xC3; 997const int rodata_test_data = 0xC3;
998EXPORT_SYMBOL_GPL(rodata_test_data); 998EXPORT_SYMBOL_GPL(rodata_test_data);
999 999
1000static int kernel_set_to_readonly; 1000int kernel_set_to_readonly __read_mostly;
1001 1001
1002void set_kernel_text_rw(void) 1002void set_kernel_text_rw(void)
1003{ 1003{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..5198b9bb34ef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
568} 568}
569 569
570#ifndef CONFIG_NUMA 570#ifndef CONFIG_NUMA
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
572 int acpi, int k8)
572{ 573{
573 unsigned long bootmap_size, bootmap; 574 unsigned long bootmap_size, bootmap;
574 575
@@ -694,12 +695,12 @@ void __init mem_init(void)
694const int rodata_test_data = 0xC3; 695const int rodata_test_data = 0xC3;
695EXPORT_SYMBOL_GPL(rodata_test_data); 696EXPORT_SYMBOL_GPL(rodata_test_data);
696 697
697static int kernel_set_to_readonly; 698int kernel_set_to_readonly;
698 699
699void set_kernel_text_rw(void) 700void set_kernel_text_rw(void)
700{ 701{
701 unsigned long start = PFN_ALIGN(_stext); 702 unsigned long start = PFN_ALIGN(_text);
702 unsigned long end = PFN_ALIGN(__start_rodata); 703 unsigned long end = PFN_ALIGN(__stop___ex_table);
703 704
704 if (!kernel_set_to_readonly) 705 if (!kernel_set_to_readonly)
705 return; 706 return;
@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
707 pr_debug("Set kernel text: %lx - %lx for read write\n", 708 pr_debug("Set kernel text: %lx - %lx for read write\n",
708 start, end); 709 start, end);
709 710
711 /*
712 * Make the kernel identity mapping for text RW. Kernel text
713 * mapping will always be RO. Refer to the comment in
714 * static_protections() in pageattr.c
715 */
710 set_memory_rw(start, (end - start) >> PAGE_SHIFT); 716 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
711} 717}
712 718
713void set_kernel_text_ro(void) 719void set_kernel_text_ro(void)
714{ 720{
715 unsigned long start = PFN_ALIGN(_stext); 721 unsigned long start = PFN_ALIGN(_text);
716 unsigned long end = PFN_ALIGN(__start_rodata); 722 unsigned long end = PFN_ALIGN(__stop___ex_table);
717 723
718 if (!kernel_set_to_readonly) 724 if (!kernel_set_to_readonly)
719 return; 725 return;
@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
721 pr_debug("Set kernel text: %lx - %lx for read only\n", 727 pr_debug("Set kernel text: %lx - %lx for read only\n",
722 start, end); 728 start, end);
723 729
730 /*
731 * Set the kernel identity mapping for text RO.
732 */
724 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 733 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
725} 734}
726 735
727void mark_rodata_ro(void) 736void mark_rodata_ro(void)
728{ 737{
729 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 738 unsigned long start = PFN_ALIGN(_text);
730 unsigned long rodata_start = 739 unsigned long rodata_start =
731 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 740 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
741 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
742 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
743 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
744 unsigned long data_start = (unsigned long) &_sdata;
732 745
733 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 746 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
734 (end - start) >> 10); 747 (end - start) >> 10);
@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
751 printk(KERN_INFO "Testing CPA: again\n"); 764 printk(KERN_INFO "Testing CPA: again\n");
752 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 765 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
753#endif 766#endif
767
768 free_init_pages("unused kernel memory",
769 (unsigned long) page_address(virt_to_page(text_end)),
770 (unsigned long)
771 page_address(virt_to_page(rodata_start)));
772 free_init_pages("unused kernel memory",
773 (unsigned long) page_address(virt_to_page(rodata_end)),
774 (unsigned long) page_address(virt_to_page(data_start)));
754} 775}
755 776
756#endif 777#endif
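set_kernel_text_rw()/set_kernel_text_ro() above now cover [_text, __stop___ex_table) and only touch the kernel identity mapping. A hedged sketch of the kind of caller these helpers exist for; the runtime text patcher shown here is an assumed example, not the actual call site:

static void patch_kernel_text(void (*do_patch)(void))
{
	set_kernel_text_rw();	/* identity mapping becomes writable */
	do_patch();		/* e.g. rewrite a call site in kernel text */
	set_kernel_text_ro();	/* seal it again; the text mapping stayed RO throughout */
}
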
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2feb9bdedaaf..c246d259822d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -281,30 +281,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
281} 281}
282EXPORT_SYMBOL(ioremap_cache); 282EXPORT_SYMBOL(ioremap_cache);
283 283
284static void __iomem *ioremap_default(resource_size_t phys_addr,
285 unsigned long size)
286{
287 unsigned long flags;
288 void __iomem *ret;
289 int err;
290
291 /*
292 * - WB for WB-able memory and no other conflicting mappings
293 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
294 * - Inherit from confliting mappings otherwise
295 */
296 err = reserve_memtype(phys_addr, phys_addr + size,
297 _PAGE_CACHE_WB, &flags);
298 if (err < 0)
299 return NULL;
300
301 ret = __ioremap_caller(phys_addr, size, flags,
302 __builtin_return_address(0));
303
304 free_memtype(phys_addr, phys_addr + size);
305 return ret;
306}
307
308void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, 284void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
309 unsigned long prot_val) 285 unsigned long prot_val)
310{ 286{
@@ -380,7 +356,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
380 if (page_is_ram(start >> PAGE_SHIFT)) 356 if (page_is_ram(start >> PAGE_SHIFT))
381 return __va(phys); 357 return __va(phys);
382 358
383 addr = (void __force *)ioremap_default(start, PAGE_SIZE); 359 addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
384 if (addr) 360 if (addr)
385 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 361 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
386 362
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/k8.h>
26 26
27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29
27static __init int find_northbridge(void) 30static __init int find_northbridge(void)
28{ 31{
29 int num; 32 int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
54 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get boot_cpu_id so can use that to create apicid_to_node
55 * in k8_scan_nodes() 58 * in k8_scan_nodes()
56 */ 59 */
57 /*
58 * Find possible boot-time SMP configuration:
59 */
60#ifdef CONFIG_X86_MPPARSE
61 early_find_smp_config();
62#endif
63#ifdef CONFIG_ACPI
64 /*
65 * Read APIC information from ACPI tables.
66 */
67 early_acpi_boot_init();
68#endif
69#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
70 /* 61 /*
71 * get boot-time SMP configuration: 62 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
76 early_init_lapic_mapping(); 67 early_init_lapic_mapping();
77} 68}
78 69
79int __init k8_scan_nodes(unsigned long start, unsigned long end) 70int __init k8_get_nodes(struct bootnode *physnodes)
80{ 71{
81 unsigned numnodes, cores, bits, apicid_base; 72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81}
82
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
84{
85 unsigned long start = PFN_PHYS(start_pfn);
86 unsigned long end = PFN_PHYS(end_pfn);
87 unsigned numnodes;
82 unsigned long prevbase; 88 unsigned long prevbase;
83 struct bootnode nodes[8]; 89 int i, nb, found = 0;
84 int i, j, nb, found = 0;
85 u32 nodeid, reg; 90 u32 nodeid, reg;
86 91
87 if (!early_pci_allowed()) 92 if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
91 if (nb < 0) 96 if (nb < 0)
92 return nb; 97 return nb;
93 98
94 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 99 pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
95 100
96 reg = read_pci_config(0, nb, 0, 0x60); 101 reg = read_pci_config(0, nb, 0, 0x60);
97 numnodes = ((reg >> 4) & 0xF) + 1; 102 numnodes = ((reg >> 4) & 0xF) + 1;
98 if (numnodes <= 1) 103 if (numnodes <= 1)
99 return -1; 104 return -1;
100 105
101 printk(KERN_INFO "Number of nodes %d\n", numnodes); 106 pr_info("Number of physical nodes %d\n", numnodes);
102 107
103 memset(&nodes, 0, sizeof(nodes));
104 prevbase = 0; 108 prevbase = 0;
105 for (i = 0; i < 8; i++) { 109 for (i = 0; i < 8; i++) {
106 unsigned long base, limit; 110 unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
111 nodeid = limit & 7; 115 nodeid = limit & 7;
112 if ((base & 3) == 0) { 116 if ((base & 3) == 0) {
113 if (i < numnodes) 117 if (i < numnodes)
114 printk("Skipping disabled node %d\n", i); 118 pr_info("Skipping disabled node %d\n", i);
115 continue; 119 continue;
116 } 120 }
117 if (nodeid >= numnodes) { 121 if (nodeid >= numnodes) {
118 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
119 base, limit); 123 base, limit);
120 continue; 124 continue;
121 } 125 }
122 126
123 if (!limit) { 127 if (!limit) {
124 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", 128 pr_info("Skipping node entry %d (base %lx)\n",
125 i, base); 129 i, base);
126 continue; 130 continue;
127 } 131 }
128 if ((base >> 8) & 3 || (limit >> 8) & 3) { 132 if ((base >> 8) & 3 || (limit >> 8) & 3) {
129 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 133 pr_err("Node %d using interleaving mode %lx/%lx\n",
130 nodeid, (base>>8)&3, (limit>>8) & 3); 134 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
131 return -1; 135 return -1;
132 } 136 }
133 if (node_isset(nodeid, node_possible_map)) { 137 if (node_isset(nodeid, nodes_parsed)) {
134 printk(KERN_INFO "Node %d already present. Skipping\n", 138 pr_info("Node %d already present, skipping\n",
135 nodeid); 139 nodeid);
136 continue; 140 continue;
137 } 141 }
138 142
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
141 limit |= (1<<24)-1; 145 limit |= (1<<24)-1;
142 limit++; 146 limit++;
143 147
144 if (limit > max_pfn << PAGE_SHIFT) 148 if (limit > end)
145 limit = max_pfn << PAGE_SHIFT; 149 limit = end;
146 if (limit <= base) 150 if (limit <= base)
147 continue; 151 continue;
148 152
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
154 if (limit > end) 158 if (limit > end)
155 limit = end; 159 limit = end;
156 if (limit == base) { 160 if (limit == base) {
157 printk(KERN_ERR "Empty node %d\n", nodeid); 161 pr_err("Empty node %d\n", nodeid);
158 continue; 162 continue;
159 } 163 }
160 if (limit < base) { 164 if (limit < base) {
161 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 165 pr_err("Node %d bogus settings %lx-%lx.\n",
162 nodeid, base, limit); 166 nodeid, base, limit);
163 continue; 167 continue;
164 } 168 }
165 169
166 /* Could sort here, but pun for now. Should not happen anyroads. */ 170 /* Could sort here, but pun for now. Should not happen anyroads. */
167 if (prevbase > base) { 171 if (prevbase > base) {
168 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 172 pr_err("Node map not sorted %lx,%lx\n",
169 prevbase, base); 173 prevbase, base);
170 return -1; 174 return -1;
171 } 175 }
172 176
173 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 177 pr_info("Node %d MemBase %016lx Limit %016lx\n",
174 nodeid, base, limit); 178 nodeid, base, limit);
175 179
176 found++; 180 found++;
177 181
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
180 184
181 prevbase = base; 185 prevbase = base;
182 186
183 node_set(nodeid, node_possible_map); 187 node_set(nodeid, nodes_parsed);
184 } 188 }
185 189
186 if (!found) 190 if (!found)
187 return -1; 191 return -1;
192 return 0;
193}
194
195int __init k8_scan_nodes(void)
196{
197 unsigned int bits;
198 unsigned int cores;
199 unsigned int apicid_base;
200 int i;
188 201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
189 memnode_shift = compute_hash_shift(nodes, 8, NULL); 204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
190 if (memnode_shift < 0) { 205 if (memnode_shift < 0) {
191 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 206 pr_err("No NUMA node hash function found. Contact maintainer\n");
192 return -1; 207 return -1;
193 } 208 }
194 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 209 pr_info("Using node hash shift of %d\n", memnode_shift);
195 210
196 /* use the coreid bits from early_identify_cpu */ 211 /* use the coreid bits from early_identify_cpu */
197 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
200 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* need to get boot_cpu_id early for system with apicid lifting */
201 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
202 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
203 printk(KERN_INFO "BSP APIC ID: %02x\n", 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
204 boot_cpu_physical_apicid);
205 apicid_base = boot_cpu_physical_apicid; 219 apicid_base = boot_cpu_physical_apicid;
206 } 220 }
207 221
208 for (i = 0; i < 8; i++) { 222 for_each_node_mask(i, node_possible_map) {
209 if (nodes[i].start == nodes[i].end) 223 int j;
210 continue;
211 224
212 e820_register_active_regions(i, 225 e820_register_active_regions(i,
213 nodes[i].start >> PAGE_SHIFT, 226 nodes[i].start >> PAGE_SHIFT,
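The k8topology rework above splits the old k8_scan_nodes(start, end) into three steps. A hedged sketch of the resulting call order, using only the functions introduced in this diff; the wrapper name and error handling are assumptions:

static int __init k8_numa_setup(unsigned long start_pfn, unsigned long end_pfn)
{
	struct bootnode phys[8];

	if (k8_numa_init(start_pfn, end_pfn) < 0)	/* parse northbridge registers into nodes_parsed */
		return -1;
	k8_get_nodes(phys);				/* optional snapshot for numa=fake emulation */
	return k8_scan_nodes();				/* commit node_possible_map, compute memnode_shift */
}
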
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..b20760ca7244 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
347 (ulong) node_remap_end_vaddr[nid]); 347 (ulong) node_remap_end_vaddr[nid]);
348} 348}
349 349
350void __init initmem_init(unsigned long start_pfn, 350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 unsigned long end_pfn) 351 int acpi, int k8)
352{ 352{
353 int nid; 353 int nid;
354 long kva_target_pfn; 354 long kva_target_pfn;
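initmem_init() now carries the acpi/k8 flags on every configuration, including 32-bit NUMA where they are unused. A hedged sketch of the shared declaration; which header it actually lives in is an assumption here:

extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
			 int acpi, int k8);
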
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..83bbc70d11bb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 239 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 241 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) 242 if (nodedata_phys < start || nodedata_phys >= end) {
243 free_bootmem(nodedata_phys, pgdat_size); 243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
244 node_data[nodeid] = NULL; 250 node_data[nodeid] = NULL;
245 return; 251 return;
246 } 252 }
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
306 312
307#ifdef CONFIG_NUMA_EMU 313#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */ 314/* Numa emulation */
315static struct bootnode nodes[MAX_NUMNODES] __initdata;
316static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309static char *cmdline __initdata; 317static char *cmdline __initdata;
310 318
319static int __init setup_physnodes(unsigned long start, unsigned long end,
320 int acpi, int k8)
321{
322 int nr_nodes = 0;
323 int ret = 0;
324 int i;
325
326#ifdef CONFIG_ACPI_NUMA
327 if (acpi)
328 nr_nodes = acpi_get_nodes(physnodes);
329#endif
330#ifdef CONFIG_K8_NUMA
331 if (k8)
332 nr_nodes = k8_get_nodes(physnodes);
333#endif
334 /*
335 * Basic sanity checking on the physical node map: there may be errors
336 * if the SRAT or K8 incorrectly reported the topology or the mem=
337 * kernel parameter is used.
338 */
339 for (i = 0; i < nr_nodes; i++) {
340 if (physnodes[i].start == physnodes[i].end)
341 continue;
342 if (physnodes[i].start > end) {
343 physnodes[i].end = physnodes[i].start;
344 continue;
345 }
346 if (physnodes[i].end < start) {
347 physnodes[i].start = physnodes[i].end;
348 continue;
349 }
350 if (physnodes[i].start < start)
351 physnodes[i].start = start;
352 if (physnodes[i].end > end)
353 physnodes[i].end = end;
354 }
355
356 /*
357 * Remove all nodes that have no memory or were truncated because of the
358 * limited address range.
359 */
360 for (i = 0; i < nr_nodes; i++) {
361 if (physnodes[i].start == physnodes[i].end)
362 continue;
363 physnodes[ret].start = physnodes[i].start;
364 physnodes[ret].end = physnodes[i].end;
365 ret++;
366 }
367
368 /*
369 * If no physical topology was detected, a single node is faked to cover
370 * the entire address space.
371 */
372 if (!ret) {
373 physnodes[ret].start = start;
374 physnodes[ret].end = end;
375 ret = 1;
376 }
377 return ret;
378}
379
311/* 380/*
312 * Setups up nid to range from addr to addr + size. If the end 381 * Setups up nid to range from addr to addr + size. If the end
313 * boundary is greater than max_addr, then max_addr is used instead. 382 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
315 * allocation past addr and -1 otherwise. addr is adjusted to be at 384 * allocation past addr and -1 otherwise. addr is adjusted to be at
316 * the end of the node. 385 * the end of the node.
317 */ 386 */
318static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 387static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319 u64 size, u64 max_addr)
320{ 388{
321 int ret = 0; 389 int ret = 0;
322
323 nodes[nid].start = *addr; 390 nodes[nid].start = *addr;
324 *addr += size; 391 *addr += size;
325 if (*addr >= max_addr) { 392 if (*addr >= max_addr) {
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335} 402}
336 403
337/* 404/*
405 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
406 * to max_addr. The return value is the number of nodes allocated.
407 */
408static int __init split_nodes_interleave(u64 addr, u64 max_addr,
409 int nr_phys_nodes, int nr_nodes)
410{
411 nodemask_t physnode_mask = NODE_MASK_NONE;
412 u64 size;
413 int big;
414 int ret = 0;
415 int i;
416
417 if (nr_nodes <= 0)
418 return -1;
419 if (nr_nodes > MAX_NUMNODES) {
420 pr_info("numa=fake=%d too large, reducing to %d\n",
421 nr_nodes, MAX_NUMNODES);
422 nr_nodes = MAX_NUMNODES;
423 }
424
425 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
426 /*
427 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder.
429 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
431 FAKE_NODE_MIN_SIZE;
432
433 size &= FAKE_NODE_MIN_HASH_MASK;
434 if (!size) {
435 pr_err("Not enough memory for each node. "
436 "NUMA emulation disabled.\n");
437 return -1;
438 }
439
440 for (i = 0; i < nr_phys_nodes; i++)
441 if (physnodes[i].start != physnodes[i].end)
442 node_set(i, physnode_mask);
443
444 /*
445 * Continue to fill physical nodes with fake nodes until there is no
446 * memory left on any of them.
447 */
448 while (nodes_weight(physnode_mask)) {
449 for_each_node_mask(i, physnode_mask) {
450 u64 end = physnodes[i].start + size;
451 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
452
453 if (ret < big)
454 end += FAKE_NODE_MIN_SIZE;
455
456 /*
457 * Continue to add memory to this fake node if its
458 * non-reserved memory is less than the per-node size.
459 */
460 while (end - physnodes[i].start -
461 e820_hole_size(physnodes[i].start, end) < size) {
462 end += FAKE_NODE_MIN_SIZE;
463 if (end > physnodes[i].end) {
464 end = physnodes[i].end;
465 break;
466 }
467 }
468
469 /*
470 * If there won't be at least FAKE_NODE_MIN_SIZE of
471 * non-reserved memory in ZONE_DMA32 for the next node,
472 * this one must extend to the boundary.
473 */
474 if (end < dma32_end && dma32_end - end -
475 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
476 end = dma32_end;
477
478 /*
479 * If there won't be enough non-reserved memory for the
480 * next node, this one must extend to the end of the
481 * physical node.
482 */
483 if (physnodes[i].end - end -
484 e820_hole_size(end, physnodes[i].end) < size)
485 end = physnodes[i].end;
486
487 /*
488 * Avoid allocating more nodes than requested, which can
489 * happen as a result of rounding down each node's size
490 * to FAKE_NODE_MIN_SIZE.
491 */
492 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
493 end = physnodes[i].end;
494
495 if (setup_node_range(ret++, &physnodes[i].start,
496 end - physnodes[i].start,
497 physnodes[i].end) < 0)
498 node_clear(i, physnode_mask);
499 }
500 }
501 return ret;
502}
503
504/*
338 * Splits num_nodes nodes up equally starting at node_start. The return value 505 * Splits num_nodes nodes up equally starting at node_start. The return value
339 * is the number of nodes split up and addr is adjusted to be at the end of the 506 * is the number of nodes split up and addr is adjusted to be at the end of the
340 * last node allocated. 507 * last node allocated.
341 */ 508 */
342static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, 509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
343 u64 max_addr, int node_start,
344 int num_nodes) 510 int num_nodes)
345{ 511{
346 unsigned int big; 512 unsigned int big;
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
388 break; 554 break;
389 } 555 }
390 } 556 }
391 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) 557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
392 break; 558 break;
393 } 559 }
394 return i - node_start + 1; 560 return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
399 * always assigned to a final node and can be asymmetric. Returns the number of 565 * always assigned to a final node and can be asymmetric. Returns the number of
400 * nodes split. 566 * nodes split.
401 */ 567 */
402static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, 568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
403 u64 max_addr, int node_start, u64 size) 569 u64 size)
404{ 570{
405 int i = node_start; 571 int i = node_start;
406 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
407 while (!setup_node_range(i++, nodes, addr, size, max_addr)) 573 while (!setup_node_range(i++, addr, size, max_addr))
408 ; 574 ;
409 return i - node_start; 575 return i - node_start;
410} 576}
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413 * Sets up the system RAM area from start_pfn to last_pfn according to the 579 * Sets up the system RAM area from start_pfn to last_pfn according to the
414 * numa=fake command-line option. 580 * numa=fake command-line option.
415 */ 581 */
416static struct bootnode nodes[MAX_NUMNODES] __initdata; 582static int __init numa_emulation(unsigned long start_pfn,
417 583 unsigned long last_pfn, int acpi, int k8)
418static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419{ 584{
420 u64 size, addr = start_pfn << PAGE_SHIFT; 585 u64 size, addr = start_pfn << PAGE_SHIFT;
421 u64 max_addr = last_pfn << PAGE_SHIFT; 586 u64 max_addr = last_pfn << PAGE_SHIFT;
422 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes;
423 589
424 memset(&nodes, 0, sizeof(nodes)); 590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
425 /* 591 /*
426 * If the numa=fake command-line is just a single number N, split the 592 * If the numa=fake command-line is just a single number N, split the
427 * system RAM into N fake nodes. 593 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
429 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
430 long n = simple_strtol(cmdline, NULL, 0); 596 long n = simple_strtol(cmdline, NULL, 0);
431 597
432 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); 598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
433 if (num_nodes < 0) 600 if (num_nodes < 0)
434 return num_nodes; 601 return num_nodes;
435 goto out; 602 goto out;
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
456 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; 623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
457 if (size) 624 if (size)
458 for (i = 0; i < coeff; i++, num_nodes++) 625 for (i = 0; i < coeff; i++, num_nodes++)
459 if (setup_node_range(num_nodes, nodes, 626 if (setup_node_range(num_nodes, &addr,
460 &addr, size, max_addr) < 0) 627 size, max_addr) < 0)
461 goto done; 628 goto done;
462 if (!*cmdline) 629 if (!*cmdline)
463 break; 630 break;
@@ -473,7 +640,7 @@ done:
473 if (addr < max_addr) { 640 if (addr < max_addr) {
474 if (coeff_flag && coeff < 0) { 641 if (coeff_flag && coeff < 0) {
475 /* Split remaining nodes into num-sized chunks */ 642 /* Split remaining nodes into num-sized chunks */
476 num_nodes += split_nodes_by_size(nodes, &addr, max_addr, 643 num_nodes += split_nodes_by_size(&addr, max_addr,
477 num_nodes, num); 644 num_nodes, num);
478 goto out; 645 goto out;
479 } 646 }
@@ -482,7 +649,7 @@ done:
482 /* Split remaining nodes into coeff chunks */ 649 /* Split remaining nodes into coeff chunks */
483 if (coeff <= 0) 650 if (coeff <= 0)
484 break; 651 break;
485 num_nodes += split_nodes_equally(nodes, &addr, max_addr, 652 num_nodes += split_nodes_equally(&addr, max_addr,
486 num_nodes, coeff); 653 num_nodes, coeff);
487 break; 654 break;
488 case ',': 655 case ',':
@@ -490,8 +657,8 @@ done:
490 break; 657 break;
491 default: 658 default:
492 /* Give one final node */ 659 /* Give one final node */
493 setup_node_range(num_nodes, nodes, &addr, 660 setup_node_range(num_nodes, &addr, max_addr - addr,
494 max_addr - addr, max_addr); 661 max_addr);
495 num_nodes++; 662 num_nodes++;
496 } 663 }
497 } 664 }
@@ -505,14 +672,10 @@ out:
505 } 672 }
506 673
507 /* 674 /*
508 * We need to vacate all active ranges that may have been registered by 675 * We need to vacate all active ranges that may have been registered for
509 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns 676 * the e820 memory map.
510 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511 */ 677 */
512 remove_all_active_ranges(); 678 remove_all_active_ranges();
513#ifdef CONFIG_ACPI_NUMA
514 acpi_numa = -1;
515#endif
516 for_each_node_mask(i, node_possible_map) { 679 for_each_node_mask(i, node_possible_map) {
517 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 680 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518 nodes[i].end >> PAGE_SHIFT); 681 nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
524} 687}
525#endif /* CONFIG_NUMA_EMU */ 688#endif /* CONFIG_NUMA_EMU */
526 689
527void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) 690void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
691 int acpi, int k8)
528{ 692{
529 int i; 693 int i;
530 694
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
532 nodes_clear(node_online_map); 696 nodes_clear(node_online_map);
533 697
534#ifdef CONFIG_NUMA_EMU 698#ifdef CONFIG_NUMA_EMU
535 if (cmdline && !numa_emulation(start_pfn, last_pfn)) 699 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
536 return; 700 return;
537 nodes_clear(node_possible_map); 701 nodes_clear(node_possible_map);
538 nodes_clear(node_online_map); 702 nodes_clear(node_online_map);
539#endif 703#endif
540 704
541#ifdef CONFIG_ACPI_NUMA 705#ifdef CONFIG_ACPI_NUMA
542 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 706 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
543 last_pfn << PAGE_SHIFT)) 707 last_pfn << PAGE_SHIFT))
544 return; 708 return;
545 nodes_clear(node_possible_map); 709 nodes_clear(node_possible_map);
546 nodes_clear(node_online_map); 710 nodes_clear(node_online_map);
547#endif 711#endif
548 712
549#ifdef CONFIG_K8_NUMA 713#ifdef CONFIG_K8_NUMA
550 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 714 if (!numa_off && k8 && !k8_scan_nodes())
551 last_pfn<<PAGE_SHIFT))
552 return; 715 return;
553 nodes_clear(node_possible_map); 716 nodes_clear(node_possible_map);
554 nodes_clear(node_online_map); 717 nodes_clear(node_online_map);
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
601early_param("numa", numa_setup); 764early_param("numa", numa_setup);
602 765
603#ifdef CONFIG_NUMA 766#ifdef CONFIG_NUMA
767
768static __init int find_near_online_node(int node)
769{
770 int n, val;
771 int min_val = INT_MAX;
772 int best_node = -1;
773
774 for_each_online_node(n) {
775 val = node_distance(node, n);
776
777 if (val < min_val) {
778 min_val = val;
779 best_node = n;
780 }
781 }
782
783 return best_node;
784}
785
604/* 786/*
605 * Setup early cpu_to_node. 787 * Setup early cpu_to_node.
606 * 788 *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
632 if (node == NUMA_NO_NODE) 814 if (node == NUMA_NO_NODE)
633 continue; 815 continue;
634 if (!node_online(node)) 816 if (!node_online(node))
635 continue; 817 node = find_near_online_node(node);
636 numa_set_node(cpu, node); 818 numa_set_node(cpu, node);
637 } 819 }
638} 820}
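find_near_online_node() above reroutes a CPU whose firmware-reported node never came online to the nearest node that did. A small user-space model with a made-up distance table; it mirrors the selection loop, not the kernel's node iterators:

#include <limits.h>
#include <stdio.h>

#define NR_NODES 4
static const int online[NR_NODES] = { 1, 0, 1, 1 };	/* node 1 is offline */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 30 },
	{ 20, 20, 10, 20 },
	{ 30, 30, 20, 10 },
};

static int find_near_online_node(int node)
{
	int n, best = -1, min = INT_MAX;

	for (n = 0; n < NR_NODES; n++) {
		if (!online[n])
			continue;
		if (dist[node][n] < min) {
			min = dist[node][n];
			best = n;
		}
	}
	return best;
}

int main(void)
{
	/* CPUs whose SRAT node (1) is offline fall back to node 0 here. */
	printf("node 1 -> node %d\n", find_near_online_node(1));
	return 0;
}
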
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..1d4eb93d333c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 280 pgprot_val(forbidden) |= _PAGE_RW;
281 281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /*
284 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
285 * kernel text mappings for the large page aligned text, rodata sections
286 * will be always read-only. For the kernel identity mappings covering
287 * the holes caused by this alignment can be anything that user asks.
288 *
289 * This will preserve the large page mappings for kernel text/data
290 * at no extra cost.
291 */
292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align))
295 pgprot_val(forbidden) |= _PAGE_RW;
296#endif
297
282 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 298 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
283 299
284 return prot; 300 return prot;
@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
1069 1085
1070int set_memory_x(unsigned long addr, int numpages) 1086int set_memory_x(unsigned long addr, int numpages)
1071{ 1087{
1088 if (!(__supported_pte_mask & _PAGE_NX))
1089 return 0;
1090
1072 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1091 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1073} 1092}
1074EXPORT_SYMBOL(set_memory_x); 1093EXPORT_SYMBOL(set_memory_x);
1075 1094
1076int set_memory_nx(unsigned long addr, int numpages) 1095int set_memory_nx(unsigned long addr, int numpages)
1077{ 1096{
1097 if (!(__supported_pte_mask & _PAGE_NX))
1098 return 0;
1099
1078 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1100 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1079} 1101}
1080EXPORT_SYMBOL(set_memory_nx); 1102EXPORT_SYMBOL(set_memory_nx);
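The pageattr.c hunks above add two guards: static_protections() forces RO for any request inside the hpage-aligned text/rodata image once kernel_set_to_readonly is set, and set_memory_x()/set_memory_nx() become no-ops when the hardware has no NX. A user-space model of the first guard, with example addresses; it only mirrors the within() test:

#include <stdio.h>

static int kernel_set_to_readonly = 1;

static int within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

/* Returns nonzero when the RW bit must be stripped from the request. */
static int text_image_forces_ro(unsigned long addr,
				unsigned long text,
				unsigned long end_rodata_hpage_align)
{
	return kernel_set_to_readonly &&
	       within(addr, text, end_rodata_hpage_align);
}

int main(void)
{
	unsigned long text = 0xffffffff81000000UL;	/* example _text */
	unsigned long end  = 0xffffffff81600000UL;	/* example aligned image end */

	printf("inside image:  %d\n", text_image_forces_ro(text + 0x1000, text, end));
	printf("outside image: %d\n", text_image_forces_ro(end + 0x1000, text, end));
	return 0;
}
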
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..66b55d6e69ed 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23#include <asm/x86_init.h>
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/fcntl.h> 25#include <asm/fcntl.h>
25#include <asm/e820.h> 26#include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
355 * - _PAGE_CACHE_UC_MINUS 356 * - _PAGE_CACHE_UC_MINUS
356 * - _PAGE_CACHE_UC 357 * - _PAGE_CACHE_UC
357 * 358 *
358 * req_type will have a special case value '-1', when requester want to inherit
359 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
360 *
361 * If new_type is NULL, function will return an error if it cannot reserve the 359 * If new_type is NULL, function will return an error if it cannot reserve the
362 * region with req_type. If new_type is non-NULL, function will return 360 * region with req_type. If new_type is non-NULL, function will return
363 * available type in new_type in case of no error. In case of any error 361 * available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
377 if (!pat_enabled) { 375 if (!pat_enabled) {
378 /* This is identical to page table setting without PAT */ 376 /* This is identical to page table setting without PAT */
379 if (new_type) { 377 if (new_type) {
380 if (req_type == -1) 378 if (req_type == _PAGE_CACHE_WC)
381 *new_type = _PAGE_CACHE_WB;
382 else if (req_type == _PAGE_CACHE_WC)
383 *new_type = _PAGE_CACHE_UC_MINUS; 379 *new_type = _PAGE_CACHE_UC_MINUS;
384 else 380 else
385 *new_type = req_type & _PAGE_CACHE_MASK; 381 *new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
388 } 384 }
389 385
390 /* Low ISA region is always mapped WB in page table. No need to track */ 386 /* Low ISA region is always mapped WB in page table. No need to track */
391 if (is_ISA_range(start, end - 1)) { 387 if (x86_platform.is_untracked_pat_range(start, end)) {
392 if (new_type) 388 if (new_type)
393 *new_type = _PAGE_CACHE_WB; 389 *new_type = _PAGE_CACHE_WB;
394 return 0; 390 return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
499 return 0; 495 return 0;
500 496
501 /* Low ISA region is always mapped WB. No need to track */ 497 /* Low ISA region is always mapped WB. No need to track */
502 if (is_ISA_range(start, end - 1)) 498 if (x86_platform.is_untracked_pat_range(start, end))
503 return 0; 499 return 0;
504 500
505 is_range_ram = pat_pagerange_is_ram(start, end); 501 is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
582 int rettype = _PAGE_CACHE_WB; 578 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry; 579 struct memtype *entry;
584 580
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 581 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
586 return rettype; 582 return rettype;
587 583
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 584 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -1018,8 +1014,10 @@ static const struct file_operations memtype_fops = {
1018 1014
1019static int __init pat_memtype_list_init(void) 1015static int __init pat_memtype_list_init(void)
1020{ 1016{
1021 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, 1017 if (pat_enabled) {
1022 NULL, &memtype_fops); 1018 debugfs_create_file("pat_memtype_list", S_IRUSR,
1019 arch_debugfs_dir, NULL, &memtype_fops);
1020 }
1023 return 0; 1021 return 0;
1024} 1022}
1025 1023
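pat.c above now asks x86_platform.is_untracked_pat_range() instead of open-coding is_ISA_range(), so a platform can widen the range PAT never tracks. A hedged sketch of a default hook that keeps the old behaviour; how x86_init actually wires this up is an assumption here:

/* Assumed default: treat only the low ISA hole as untracked, as before. */
static bool default_is_untracked_pat_range(u64 start, u64 end)
{
	return is_ISA_range(start, end - 1);
}
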
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h>
6 7
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata; 8static int disable_nx __cpuinitdata;
11 9
12/* 10/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
22 if (!str) 20 if (!str)
23 return -EINVAL; 21 return -EINVAL;
24 if (!strncmp(str, "on", 2)) { 22 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0; 23 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) { 24 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1; 25 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 } 26 }
27 x86_configure_nx();
31 return 0; 28 return 0;
32} 29}
33early_param("noexec", noexec_setup); 30early_param("noexec", noexec_setup);
34#endif
35 31
36#ifdef CONFIG_X86_PAE 32void __cpuinit x86_configure_nx(void)
37void __init set_nx(void)
38{ 33{
39 unsigned int v[4], l, h; 34 if (cpu_has_nx && !disable_nx)
40 35 __supported_pte_mask |= _PAGE_NX;
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 36 else
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 37 __supported_pte_mask &= ~_PAGE_NX;
38}
43 39
44 if ((v[3] & (1 << 20)) && !disable_nx) { 40void __init x86_report_nx(void)
45 rdmsr(MSR_EFER, l, h); 41{
46 l |= EFER_NX; 42 if (!cpu_has_nx) {
47 wrmsr(MSR_EFER, l, h); 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
48 nx_enabled = 1; 44 "missing in CPU or disabled in BIOS!\n");
49 __supported_pte_mask |= _PAGE_NX; 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) {
48 printk(KERN_INFO "NX (Execute Disable) protection: "
49 "disabled by kernel command line option\n");
50 } else {
51 printk(KERN_INFO "NX (Execute Disable) protection: "
52 "active\n");
50 } 53 }
51 }
52}
53#else 54#else
54void set_nx(void) 55 /* 32bit non-PAE kernel, NX cannot be used */
55{ 56 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
56} 57 "cannot be enabled: non-PAE kernel!\n");
57#endif 58#endif
58 59 }
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67} 60}
68#endif
69
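setup_nx.c above collapses the old set_nx()/check_efer() pair into x86_configure_nx(), which only edits __supported_pte_mask, and x86_report_nx(), which only prints. A hedged sketch of the intended boot-time order; exactly where setup_arch() places these calls is an assumption:

static void __init nx_boot_setup(void)
{
	x86_configure_nx();	/* honour cpu_has_nx and any "noexec=" option */
	x86_report_nx();	/* one message, replacing the printk removed from init.c */
}
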
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 9d7ce96e5a5c..d89075489664 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 290
291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
292 start, end); 292 start, end);
293 e820_register_active_regions(node, start >> PAGE_SHIFT,
294 end >> PAGE_SHIFT);
295 293
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end); 295 update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
338 336
339void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
340 338
339int __init acpi_get_nodes(struct bootnode *physnodes)
340{
341 int i;
342 int ret = 0;
343
344 for_each_node_mask(i, nodes_parsed) {
345 physnodes[ret].start = nodes[i].start;
346 physnodes[ret].end = nodes[i].end;
347 ret++;
348 }
349 return ret;
350}
351
341/* Use the information discovered above to actually set up the nodes. */ 352/* Use the information discovered above to actually set up the nodes. */
342int __init acpi_scan_nodes(unsigned long start, unsigned long end) 353int __init acpi_scan_nodes(unsigned long start, unsigned long end)
343{ 354{
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
350 for (i = 0; i < MAX_NUMNODES; i++) 361 for (i = 0; i < MAX_NUMNODES; i++)
351 cutoff_node(i, start, end); 362 cutoff_node(i, start, end);
352 363
353 if (!nodes_cover_memory(nodes)) {
354 bad_srat();
355 return -1;
356 }
357
358 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 364 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
359 memblk_nodeid); 365 memblk_nodeid);
360 if (memnode_shift < 0) { 366 if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
364 return -1; 370 return -1;
365 } 371 }
366 372
373 for_each_node_mask(i, nodes_parsed)
374 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
375 nodes[i].end >> PAGE_SHIFT);
376 if (!nodes_cover_memory(nodes)) {
377 bad_srat();
378 return -1;
379 }
380
367 /* Account for nodes with cpus and no memory */ 381 /* Account for nodes with cpus and no memory */
368 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 382 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
369 383
@@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
454 for (i = 0; i < num_nodes; i++) 468 for (i = 0; i < num_nodes; i++)
455 if (fake_nodes[i].start != fake_nodes[i].end) 469 if (fake_nodes[i].start != fake_nodes[i].end)
456 node_set(i, nodes_parsed); 470 node_set(i, nodes_parsed);
457 WARN_ON(!nodes_cover_memory(fake_nodes));
458} 471}
459 472
460static int null_slit_node_compare(int a, int b) 473static int null_slit_node_compare(int a, int b)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..65b58e4b0b8b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
8 8
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
11#include <asm/cache.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/uv/uv.h> 13#include <asm/uv/uv.h>
13 14
@@ -43,7 +44,7 @@ union smp_flush_state {
43 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 }; 46 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp; 48} ____cacheline_internodealigned_in_smp;
48 49
49/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
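The tlb.c change above swaps the raw CONFIG_X86_INTERNODE_CACHE_BYTES for INTERNODE_CACHE_BYTES from <asm/cache.h>. A condensed sketch of the padding idiom that line relies on, assuming the same kernel context as the surrounding code:

union flush_state_pad {
	struct {
		spinlock_t lock;		/* the real per-sender fields live here */
	};
	char pad[INTERNODE_CACHE_BYTES];	/* keep each sender on its own internode line */
} ____cacheline_internodealigned_in_smp;
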
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
393 393
394static ctl_table abi_root_table2[] = { 394static ctl_table abi_root_table2[] = {
395 { 395 {
396 .ctl_name = CTL_ABI,
397 .procname = "abi", 396 .procname = "abi",
398 .mode = 0555, 397 .mode = 0555,
399 .child = abi_table2 398 .child = abi_table2
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index dfbf70e65860..c462cea8ef09 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1093,10 +1093,8 @@ asmlinkage void __init xen_start_kernel(void)
1093 1093
1094 __supported_pte_mask |= _PAGE_IOMAP; 1094 __supported_pte_mask |= _PAGE_IOMAP;
1095 1095
1096#ifdef CONFIG_X86_64
1097 /* Work out if we support NX */ 1096 /* Work out if we support NX */
1098 check_efer(); 1097 x86_configure_nx();
1099#endif
1100 1098
1101 xen_setup_features(); 1099 xen_setup_features();
1102 1100
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index fe03eeed7b48..738da0cb0d8b 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void)
73 73
74 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
75 75
76 cpu_set(cpu, cpu_online_map); 76 set_cpu_online(cpu, true);
77 percpu_write(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
78 wmb(); 78 wmb();
79 79