aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig10
-rw-r--r--arch/x86/Kconfig.cpu14
-rw-r--r--arch/x86/Kconfig.debug4
-rw-r--r--arch/x86/Makefile_32.cpu7
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S3
-rw-r--r--arch/x86/boot/video.c6
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S517
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S157
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c333
-rw-r--r--arch/x86/ia32/ia32entry.S5
-rw-r--r--arch/x86/ia32/sys_ia32.c99
-rw-r--r--arch/x86/include/asm/acpi.h3
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h4
-rw-r--r--arch/x86/include/asm/cache.h7
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/desc_defs.h4
-rw-r--r--arch/x86/include/asm/e820.h23
-rw-r--r--arch/x86/include/asm/elf.h20
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hpet.h7
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/i387.h7
-rw-r--r--arch/x86/include/asm/inst.h150
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h2
-rw-r--r--arch/x86/include/asm/k8.h5
-rw-r--r--arch/x86/include/asm/kvm.h30
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h34
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mmzone_32.h2
-rw-r--r--arch/x86/include/asm/mpspec.h11
-rw-r--r--arch/x86/include/asm/page_types.h3
-rw-r--r--arch/x86/include/asm/pci_x86.h20
-rw-r--r--arch/x86/include/asm/percpu.h104
-rw-r--r--arch/x86/include/asm/pgtable.h6
-rw-r--r--arch/x86/include/asm/proto.h17
-rw-r--r--arch/x86/include/asm/sections.h6
-rw-r--r--arch/x86/include/asm/sigcontext.h4
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/include/asm/sys_ia32.h8
-rw-r--r--arch/x86/include/asm/syscalls.h2
-rw-r--r--arch/x86/include/asm/system.h1
-rw-r--r--arch/x86/include/asm/thread_info.h7
-rw-r--r--arch/x86/include/asm/unistd_32.h3
-rw-r--r--arch/x86/include/asm/unistd_64.h2
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/x86_init.h4
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h27
-rw-r--r--arch/x86/kernel/acpi/boot.c3
-rw-r--r--arch/x86/kernel/acpi/sleep.c24
-rw-r--r--arch/x86/kernel/amd_iommu.c50
-rw-r--r--arch/x86/kernel/amd_iommu_init.c12
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/es7000_32.c12
-rw-r--r--arch/x86/kernel/apic/io_apic.c49
-rw-r--r--arch/x86/kernel/apic/nmi.c8
-rw-r--r--arch/x86/kernel/apic/numaq_32.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c19
-rw-r--r--arch/x86/kernel/cpu/common.c10
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c30
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c45
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h24
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c59
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c51
-rw-r--r--arch/x86/kernel/cpu/perf_event.c33
-rw-r--r--arch/x86/kernel/ds.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c33
-rw-r--r--arch/x86/kernel/entry_64.S10
-rw-r--r--arch/x86/kernel/ftrace.c17
-rw-r--r--arch/x86/kernel/head_32.S18
-rw-r--r--arch/x86/kernel/head_64.S3
-rw-r--r--arch/x86/kernel/hpet.c77
-rw-r--r--arch/x86/kernel/hw_breakpoint.c5
-rw-r--r--arch/x86/kernel/irq.c20
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kgdb.c14
-rw-r--r--arch/x86/kernel/kprobes.c4
-rw-r--r--arch/x86/kernel/machine_kexec_32.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c97
-rw-r--r--arch/x86/kernel/microcode_core.c34
-rw-r--r--arch/x86/kernel/microcode_intel.c47
-rw-r--r--arch/x86/kernel/mpparse.c44
-rw-r--r--arch/x86/kernel/process.c23
-rw-r--r--arch/x86/kernel/process_32.c14
-rw-r--r--arch/x86/kernel/process_64.c48
-rw-r--r--arch/x86/kernel/ptrace.c68
-rw-r--r--arch/x86/kernel/quirks.c9
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c1
-rw-r--r--arch/x86/kernel/setup.c106
-rw-r--r--arch/x86/kernel/setup_percpu.c13
-rw-r--r--arch/x86/kernel/signal.c3
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/kernel/sys_i386_32.c27
-rw-r--r--arch/x86/kernel/sys_x86_64.c17
-rw-r--r--arch/x86/kernel/syscall_table_32.S3
-rw-r--r--arch/x86/kernel/tlb_uv.c4
-rw-r--r--arch/x86/kernel/uv_time.c80
-rw-r--r--arch/x86/kernel/visws_quirks.c2
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S38
-rw-r--r--arch/x86/kernel/vsyscall_64.c7
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c159
-rw-r--r--arch/x86/kvm/i8254.c14
-rw-r--r--arch/x86/kvm/i8259.c44
-rw-r--r--arch/x86/kvm/irq.h7
-rw-r--r--arch/x86/kvm/lapic.c8
-rw-r--r--arch/x86/kvm/mmu.c3
-rw-r--r--arch/x86/kvm/paging_tmpl.h1
-rw-r--r--arch/x86/kvm/svm.c395
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c448
-rw-r--r--arch/x86/kvm/x86.c550
-rw-r--r--arch/x86/lib/Makefile4
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_32.c10
-rw-r--r--arch/x86/mm/init_64.c35
-rw-r--r--arch/x86/mm/ioremap.c26
-rw-r--r--arch/x86/mm/k8topology_64.c101
-rw-r--r--arch/x86/mm/kmmio.c46
-rw-r--r--arch/x86/mm/mmio-mod.c71
-rw-r--r--arch/x86/mm/numa_32.c4
-rw-r--r--arch/x86/mm/numa_64.c252
-rw-r--r--arch/x86/mm/pageattr.c22
-rw-r--r--arch/x86/mm/pat.c23
-rw-r--r--arch/x86/mm/setup_nx.c59
-rw-r--r--arch/x86/mm/srat_64.c29
-rw-r--r--arch/x86/mm/tlb.c3
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/acpi.c74
-rw-r--r--arch/x86/pci/amd_bus.c120
-rw-r--r--arch/x86/pci/bus_numa.c101
-rw-r--r--arch/x86/pci/bus_numa.h27
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/early.c7
-rw-r--r--arch/x86/pci/i386.c42
-rw-r--r--arch/x86/pci/intel_bus.c90
-rw-r--r--arch/x86/pci/mmconfig-shared.c356
-rw-r--r--arch/x86/pci/mmconfig_32.c16
-rw-r--r--arch/x86/pci/mmconfig_64.c88
-rw-r--r--arch/x86/tools/test_get_len.c2
-rw-r--r--arch/x86/vdso/vdso32-setup.c1
-rw-r--r--arch/x86/xen/enlighten.c37
-rw-r--r--arch/x86/xen/mmu.c2
-rw-r--r--arch/x86/xen/smp.c44
-rw-r--r--arch/x86/xen/suspend.c17
-rw-r--r--arch/x86/xen/time.c31
-rw-r--r--arch/x86/xen/xen-asm_64.S4
-rw-r--r--arch/x86/xen/xen-ops.h2
168 files changed, 4294 insertions, 2425 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 178084b4377c..32a1918e1b88 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -51,6 +51,7 @@ config X86
51 select HAVE_KERNEL_LZMA 51 select HAVE_KERNEL_LZMA
52 select HAVE_HW_BREAKPOINT 52 select HAVE_HW_BREAKPOINT
53 select HAVE_ARCH_KMEMCHECK 53 select HAVE_ARCH_KMEMCHECK
54 select HAVE_USER_RETURN_NOTIFIER
54 55
55config OUTPUT_FORMAT 56config OUTPUT_FORMAT
56 string 57 string
@@ -1331,7 +1332,9 @@ config MATH_EMULATION
1331 kernel, it won't hurt. 1332 kernel, it won't hurt.
1332 1333
1333config MTRR 1334config MTRR
1334 bool "MTRR (Memory Type Range Register) support" 1335 bool
1336 default y
1337 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1335 ---help--- 1338 ---help---
1336 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1339 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1337 the Memory Type Range Registers (MTRRs) may be used to control 1340 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1397,7 +1400,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1397 1400
1398config X86_PAT 1401config X86_PAT
1399 bool 1402 bool
1400 prompt "x86 PAT support" 1403 default y
1404 prompt "x86 PAT support" if EMBEDDED
1401 depends on MTRR 1405 depends on MTRR
1402 ---help--- 1406 ---help---
1403 Use PAT attributes to setup page level cache control. 1407 Use PAT attributes to setup page level cache control.
@@ -1603,7 +1607,7 @@ config COMPAT_VDSO
1603 depends on X86_32 || IA32_EMULATION 1607 depends on X86_32 || IA32_EMULATION
1604 ---help--- 1608 ---help---
1605 Map the 32-bit VDSO to the predictable old-style address too. 1609 Map the 32-bit VDSO to the predictable old-style address too.
1606 ---help--- 1610
1607 Say N here if you are running a sufficiently recent glibc 1611 Say N here if you are running a sufficiently recent glibc
1608 version (2.3.3 or later), to remove the high-mapped 1612 version (2.3.3 or later), to remove the high-mapped
1609 VDSO mapping and to exclusively use the randomized VDSO. 1613 VDSO mapping and to exclusively use the randomized VDSO.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 5e99762eb5c2..08e442bc3ab9 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
301 301
302# 302#
303# Define implied options from the CPU selection here 303# Define implied options from the CPU selection here
304config X86_L1_CACHE_BYTES 304config X86_INTERNODE_CACHE_SHIFT
305 int 305 int
306 default "128" if MPSC 306 default "12" if X86_VSMP
307 default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 307 default "7" if NUMA
308 308 default X86_L1_CACHE_SHIFT
309config X86_INTERNODE_CACHE_BYTES
310 int
311 default "4096" if X86_VSMP
312 default X86_L1_CACHE_BYTES if !X86_VSMP
313 309
314config X86_CMPXCHG 310config X86_CMPXCHG
315 def_bool X86_64 || (X86_32 && !M386) 311 def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
317config X86_L1_CACHE_SHIFT 313config X86_L1_CACHE_SHIFT
318 int 314 int
319 default "7" if MPENTIUM4 || MPSC 315 default "7" if MPENTIUM4 || MPSC
316 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 317 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 318 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
323 319
324config X86_XADD 320config X86_XADD
325 def_bool y 321 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 731318e5ac1d..bc01e3ebfeb2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -187,8 +187,8 @@ config HAVE_MMIOTRACE_SUPPORT
187 def_bool y 187 def_bool y
188 188
189config X86_DECODER_SELFTEST 189config X86_DECODER_SELFTEST
190 bool "x86 instruction decoder selftest" 190 bool "x86 instruction decoder selftest"
191 depends on DEBUG_KERNEL 191 depends on DEBUG_KERNEL && KPROBES
192 ---help--- 192 ---help---
193 Perform x86 instruction decoder selftests at build time. 193 Perform x86 instruction decoder selftests at build time.
194 This option is useful for checking the sanity of x86 instruction 194 This option is useful for checking the sanity of x86 instruction
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index cbf0776dbec1..1255d953c65d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -46,6 +46,13 @@ cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
46# cpu entries 46# cpu entries
47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
48 48
49# Work around the pentium-mmx code generator madness of gcc4.4.x which
50# does stack alignment by generating horrible code _before_ the mcount
51# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
52# tracer assumptions. For i686, generic, core2 this is set by the
53# compiler anyway
54cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
55
49# Bug fix for binutils: this option is required in order to keep 56# Bug fix for binutils: this option is required in order to keep
50# binutils from generating NOPL instructions against our will. 57# binutils from generating NOPL instructions against our will.
51ifneq ($(CONFIG_X86_P6_NOP),y) 58ifneq ($(CONFIG_X86_P6_NOP),y)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
107 lgdt gdt(%ebp) 107 lgdt gdt(%ebp)
108 108
109 /* Enable PAE mode */ 109 /* Enable PAE mode */
110 xorl %eax, %eax 110 movl $(X86_CR4_PAE), %eax
111 orl $(X86_CR4_PAE), %eax
112 movl %eax, %cr4 111 movl %eax, %cr4
113 112
114 /* 113 /*
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
4 4
5#undef i386 5#undef i386
6 6
7#include <asm/cache.h>
7#include <asm/page_types.h> 8#include <asm/page_types.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
46 *(.data.*) 47 *(.data.*)
47 _edata = . ; 48 _edata = . ;
48 } 49 }
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 50 . = ALIGN(L1_CACHE_BYTES);
50 .bss : { 51 .bss : {
51 _bss = . ; 52 _bss = . ;
52 *(.bss) 53 *(.bss)
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..f767164cd5df 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
27 27
28 boot_params.screen_info.orig_x = oreg.dl; 28 boot_params.screen_info.orig_x = oreg.dl;
29 boot_params.screen_info.orig_y = oreg.dh; 29 boot_params.screen_info.orig_y = oreg.dh;
30
31 if (oreg.ch & 0x20)
32 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
33
34 if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
35 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
30} 36}
31 37
32static void store_video_mode(void) 38static void store_video_mode(void)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..1a58ad89fdf7 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
15 16
16obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o 17obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
17 18
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
24salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
25 26
26aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
28
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..20bb0e1ac681 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#include <linux/linkage.h> 18#include <linux/linkage.h>
19#include <asm/inst.h>
19 20
20.text 21.text
21 22
@@ -122,103 +123,72 @@ ENTRY(aesni_set_key)
122 movups 0x10(%rsi), %xmm2 # other user key 123 movups 0x10(%rsi), %xmm2 # other user key
123 movaps %xmm2, (%rcx) 124 movaps %xmm2, (%rcx)
124 add $0x10, %rcx 125 add $0x10, %rcx
125 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 126 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
126 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
127 call _key_expansion_256a 127 call _key_expansion_256a
128 # aeskeygenassist $0x1, %xmm0, %xmm1 128 AESKEYGENASSIST 0x1 %xmm0 %xmm1
129 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
130 call _key_expansion_256b 129 call _key_expansion_256b
131 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 130 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
132 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
133 call _key_expansion_256a 131 call _key_expansion_256a
134 # aeskeygenassist $0x2, %xmm0, %xmm1 132 AESKEYGENASSIST 0x2 %xmm0 %xmm1
135 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
136 call _key_expansion_256b 133 call _key_expansion_256b
137 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 134 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
138 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
139 call _key_expansion_256a 135 call _key_expansion_256a
140 # aeskeygenassist $0x4, %xmm0, %xmm1 136 AESKEYGENASSIST 0x4 %xmm0 %xmm1
141 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
142 call _key_expansion_256b 137 call _key_expansion_256b
143 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 138 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
144 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
145 call _key_expansion_256a 139 call _key_expansion_256a
146 # aeskeygenassist $0x8, %xmm0, %xmm1 140 AESKEYGENASSIST 0x8 %xmm0 %xmm1
147 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
148 call _key_expansion_256b 141 call _key_expansion_256b
149 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 142 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
150 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
151 call _key_expansion_256a 143 call _key_expansion_256a
152 # aeskeygenassist $0x10, %xmm0, %xmm1 144 AESKEYGENASSIST 0x10 %xmm0 %xmm1
153 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
154 call _key_expansion_256b 145 call _key_expansion_256b
155 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 146 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
156 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
157 call _key_expansion_256a 147 call _key_expansion_256a
158 # aeskeygenassist $0x20, %xmm0, %xmm1 148 AESKEYGENASSIST 0x20 %xmm0 %xmm1
159 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
160 call _key_expansion_256b 149 call _key_expansion_256b
161 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 150 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
162 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
163 call _key_expansion_256a 151 call _key_expansion_256a
164 jmp .Ldec_key 152 jmp .Ldec_key
165.Lenc_key192: 153.Lenc_key192:
166 movq 0x10(%rsi), %xmm2 # other user key 154 movq 0x10(%rsi), %xmm2 # other user key
167 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 155 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
168 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
169 call _key_expansion_192a 156 call _key_expansion_192a
170 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 157 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
171 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
172 call _key_expansion_192b 158 call _key_expansion_192b
173 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 159 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
174 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
175 call _key_expansion_192a 160 call _key_expansion_192a
176 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 161 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
177 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
178 call _key_expansion_192b 162 call _key_expansion_192b
179 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 163 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
180 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
181 call _key_expansion_192a 164 call _key_expansion_192a
182 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 165 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
183 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
184 call _key_expansion_192b 166 call _key_expansion_192b
185 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 167 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
186 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
187 call _key_expansion_192a 168 call _key_expansion_192a
188 # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 169 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
189 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
190 call _key_expansion_192b 170 call _key_expansion_192b
191 jmp .Ldec_key 171 jmp .Ldec_key
192.Lenc_key128: 172.Lenc_key128:
193 # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 173 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
194 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
195 call _key_expansion_128 174 call _key_expansion_128
196 # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 175 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
197 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
198 call _key_expansion_128 176 call _key_expansion_128
199 # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 177 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
200 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
201 call _key_expansion_128 178 call _key_expansion_128
202 # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 179 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
203 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
204 call _key_expansion_128 180 call _key_expansion_128
205 # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 181 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
206 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
207 call _key_expansion_128 182 call _key_expansion_128
208 # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 183 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
209 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
210 call _key_expansion_128 184 call _key_expansion_128
211 # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 185 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
212 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
213 call _key_expansion_128 186 call _key_expansion_128
214 # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 187 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
215 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
216 call _key_expansion_128 188 call _key_expansion_128
217 # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 189 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
218 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
219 call _key_expansion_128 190 call _key_expansion_128
220 # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 191 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
221 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
222 call _key_expansion_128 192 call _key_expansion_128
223.Ldec_key: 193.Ldec_key:
224 sub $0x10, %rcx 194 sub $0x10, %rcx
@@ -231,8 +201,7 @@ ENTRY(aesni_set_key)
231.align 4 201.align 4
232.Ldec_key_loop: 202.Ldec_key_loop:
233 movaps (%rdi), %xmm0 203 movaps (%rdi), %xmm0
234 # aesimc %xmm0, %xmm1 204 AESIMC %xmm0 %xmm1
235 .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
236 movaps %xmm1, (%rsi) 205 movaps %xmm1, (%rsi)
237 add $0x10, %rdi 206 add $0x10, %rdi
238 sub $0x10, %rsi 207 sub $0x10, %rsi
@@ -274,51 +243,37 @@ _aesni_enc1:
274 je .Lenc192 243 je .Lenc192
275 add $0x20, TKEYP 244 add $0x20, TKEYP
276 movaps -0x60(TKEYP), KEY 245 movaps -0x60(TKEYP), KEY
277 # aesenc KEY, STATE 246 AESENC KEY STATE
278 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
279 movaps -0x50(TKEYP), KEY 247 movaps -0x50(TKEYP), KEY
280 # aesenc KEY, STATE 248 AESENC KEY STATE
281 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
282.align 4 249.align 4
283.Lenc192: 250.Lenc192:
284 movaps -0x40(TKEYP), KEY 251 movaps -0x40(TKEYP), KEY
285 # aesenc KEY, STATE 252 AESENC KEY STATE
286 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
287 movaps -0x30(TKEYP), KEY 253 movaps -0x30(TKEYP), KEY
288 # aesenc KEY, STATE 254 AESENC KEY STATE
289 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
290.align 4 255.align 4
291.Lenc128: 256.Lenc128:
292 movaps -0x20(TKEYP), KEY 257 movaps -0x20(TKEYP), KEY
293 # aesenc KEY, STATE 258 AESENC KEY STATE
294 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
295 movaps -0x10(TKEYP), KEY 259 movaps -0x10(TKEYP), KEY
296 # aesenc KEY, STATE 260 AESENC KEY STATE
297 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
298 movaps (TKEYP), KEY 261 movaps (TKEYP), KEY
299 # aesenc KEY, STATE 262 AESENC KEY STATE
300 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
301 movaps 0x10(TKEYP), KEY 263 movaps 0x10(TKEYP), KEY
302 # aesenc KEY, STATE 264 AESENC KEY STATE
303 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
304 movaps 0x20(TKEYP), KEY 265 movaps 0x20(TKEYP), KEY
305 # aesenc KEY, STATE 266 AESENC KEY STATE
306 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
307 movaps 0x30(TKEYP), KEY 267 movaps 0x30(TKEYP), KEY
308 # aesenc KEY, STATE 268 AESENC KEY STATE
309 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
310 movaps 0x40(TKEYP), KEY 269 movaps 0x40(TKEYP), KEY
311 # aesenc KEY, STATE 270 AESENC KEY STATE
312 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
313 movaps 0x50(TKEYP), KEY 271 movaps 0x50(TKEYP), KEY
314 # aesenc KEY, STATE 272 AESENC KEY STATE
315 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
316 movaps 0x60(TKEYP), KEY 273 movaps 0x60(TKEYP), KEY
317 # aesenc KEY, STATE 274 AESENC KEY STATE
318 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
319 movaps 0x70(TKEYP), KEY 275 movaps 0x70(TKEYP), KEY
320 # aesenclast KEY, STATE # last round 276 AESENCLAST KEY STATE
321 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
322 ret 277 ret
323 278
324/* 279/*
@@ -353,135 +308,79 @@ _aesni_enc4:
353 je .L4enc192 308 je .L4enc192
354 add $0x20, TKEYP 309 add $0x20, TKEYP
355 movaps -0x60(TKEYP), KEY 310 movaps -0x60(TKEYP), KEY
356 # aesenc KEY, STATE1 311 AESENC KEY STATE1
357 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 312 AESENC KEY STATE2
358 # aesenc KEY, STATE2 313 AESENC KEY STATE3
359 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 314 AESENC KEY STATE4
360 # aesenc KEY, STATE3
361 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
362 # aesenc KEY, STATE4
363 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
364 movaps -0x50(TKEYP), KEY 315 movaps -0x50(TKEYP), KEY
365 # aesenc KEY, STATE1 316 AESENC KEY STATE1
366 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 317 AESENC KEY STATE2
367 # aesenc KEY, STATE2 318 AESENC KEY STATE3
368 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 319 AESENC KEY STATE4
369 # aesenc KEY, STATE3
370 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
371 # aesenc KEY, STATE4
372 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
373#.align 4 320#.align 4
374.L4enc192: 321.L4enc192:
375 movaps -0x40(TKEYP), KEY 322 movaps -0x40(TKEYP), KEY
376 # aesenc KEY, STATE1 323 AESENC KEY STATE1
377 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 324 AESENC KEY STATE2
378 # aesenc KEY, STATE2 325 AESENC KEY STATE3
379 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 326 AESENC KEY STATE4
380 # aesenc KEY, STATE3
381 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
382 # aesenc KEY, STATE4
383 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
384 movaps -0x30(TKEYP), KEY 327 movaps -0x30(TKEYP), KEY
385 # aesenc KEY, STATE1 328 AESENC KEY STATE1
386 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 329 AESENC KEY STATE2
387 # aesenc KEY, STATE2 330 AESENC KEY STATE3
388 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 331 AESENC KEY STATE4
389 # aesenc KEY, STATE3
390 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
391 # aesenc KEY, STATE4
392 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
393#.align 4 332#.align 4
394.L4enc128: 333.L4enc128:
395 movaps -0x20(TKEYP), KEY 334 movaps -0x20(TKEYP), KEY
396 # aesenc KEY, STATE1 335 AESENC KEY STATE1
397 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 336 AESENC KEY STATE2
398 # aesenc KEY, STATE2 337 AESENC KEY STATE3
399 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 338 AESENC KEY STATE4
400 # aesenc KEY, STATE3
401 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
402 # aesenc KEY, STATE4
403 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
404 movaps -0x10(TKEYP), KEY 339 movaps -0x10(TKEYP), KEY
405 # aesenc KEY, STATE1 340 AESENC KEY STATE1
406 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 341 AESENC KEY STATE2
407 # aesenc KEY, STATE2 342 AESENC KEY STATE3
408 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 343 AESENC KEY STATE4
409 # aesenc KEY, STATE3
410 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
411 # aesenc KEY, STATE4
412 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
413 movaps (TKEYP), KEY 344 movaps (TKEYP), KEY
414 # aesenc KEY, STATE1 345 AESENC KEY STATE1
415 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 346 AESENC KEY STATE2
416 # aesenc KEY, STATE2 347 AESENC KEY STATE3
417 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 348 AESENC KEY STATE4
418 # aesenc KEY, STATE3
419 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
420 # aesenc KEY, STATE4
421 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
422 movaps 0x10(TKEYP), KEY 349 movaps 0x10(TKEYP), KEY
423 # aesenc KEY, STATE1 350 AESENC KEY STATE1
424 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 351 AESENC KEY STATE2
425 # aesenc KEY, STATE2 352 AESENC KEY STATE3
426 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 353 AESENC KEY STATE4
427 # aesenc KEY, STATE3
428 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
429 # aesenc KEY, STATE4
430 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
431 movaps 0x20(TKEYP), KEY 354 movaps 0x20(TKEYP), KEY
432 # aesenc KEY, STATE1 355 AESENC KEY STATE1
433 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 356 AESENC KEY STATE2
434 # aesenc KEY, STATE2 357 AESENC KEY STATE3
435 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 358 AESENC KEY STATE4
436 # aesenc KEY, STATE3
437 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
438 # aesenc KEY, STATE4
439 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
440 movaps 0x30(TKEYP), KEY 359 movaps 0x30(TKEYP), KEY
441 # aesenc KEY, STATE1 360 AESENC KEY STATE1
442 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 361 AESENC KEY STATE2
443 # aesenc KEY, STATE2 362 AESENC KEY STATE3
444 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 363 AESENC KEY STATE4
445 # aesenc KEY, STATE3
446 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
447 # aesenc KEY, STATE4
448 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
449 movaps 0x40(TKEYP), KEY 364 movaps 0x40(TKEYP), KEY
450 # aesenc KEY, STATE1 365 AESENC KEY STATE1
451 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 366 AESENC KEY STATE2
452 # aesenc KEY, STATE2 367 AESENC KEY STATE3
453 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 368 AESENC KEY STATE4
454 # aesenc KEY, STATE3
455 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
456 # aesenc KEY, STATE4
457 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
458 movaps 0x50(TKEYP), KEY 369 movaps 0x50(TKEYP), KEY
459 # aesenc KEY, STATE1 370 AESENC KEY STATE1
460 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 371 AESENC KEY STATE2
461 # aesenc KEY, STATE2 372 AESENC KEY STATE3
462 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 373 AESENC KEY STATE4
463 # aesenc KEY, STATE3
464 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
465 # aesenc KEY, STATE4
466 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
467 movaps 0x60(TKEYP), KEY 374 movaps 0x60(TKEYP), KEY
468 # aesenc KEY, STATE1 375 AESENC KEY STATE1
469 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 376 AESENC KEY STATE2
470 # aesenc KEY, STATE2 377 AESENC KEY STATE3
471 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 378 AESENC KEY STATE4
472 # aesenc KEY, STATE3
473 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
474 # aesenc KEY, STATE4
475 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
476 movaps 0x70(TKEYP), KEY 379 movaps 0x70(TKEYP), KEY
477 # aesenclast KEY, STATE1 # last round 380 AESENCLAST KEY STATE1 # last round
478 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 381 AESENCLAST KEY STATE2
479 # aesenclast KEY, STATE2 382 AESENCLAST KEY STATE3
480 .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 383 AESENCLAST KEY STATE4
481 # aesenclast KEY, STATE3
482 .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
483 # aesenclast KEY, STATE4
484 .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
485 ret 384 ret
486 385
487/* 386/*
@@ -518,51 +417,37 @@ _aesni_dec1:
518 je .Ldec192 417 je .Ldec192
519 add $0x20, TKEYP 418 add $0x20, TKEYP
520 movaps -0x60(TKEYP), KEY 419 movaps -0x60(TKEYP), KEY
521 # aesdec KEY, STATE 420 AESDEC KEY STATE
522 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
523 movaps -0x50(TKEYP), KEY 421 movaps -0x50(TKEYP), KEY
524 # aesdec KEY, STATE 422 AESDEC KEY STATE
525 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
526.align 4 423.align 4
527.Ldec192: 424.Ldec192:
528 movaps -0x40(TKEYP), KEY 425 movaps -0x40(TKEYP), KEY
529 # aesdec KEY, STATE 426 AESDEC KEY STATE
530 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
531 movaps -0x30(TKEYP), KEY 427 movaps -0x30(TKEYP), KEY
532 # aesdec KEY, STATE 428 AESDEC KEY STATE
533 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
534.align 4 429.align 4
535.Ldec128: 430.Ldec128:
536 movaps -0x20(TKEYP), KEY 431 movaps -0x20(TKEYP), KEY
537 # aesdec KEY, STATE 432 AESDEC KEY STATE
538 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
539 movaps -0x10(TKEYP), KEY 433 movaps -0x10(TKEYP), KEY
540 # aesdec KEY, STATE 434 AESDEC KEY STATE
541 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
542 movaps (TKEYP), KEY 435 movaps (TKEYP), KEY
543 # aesdec KEY, STATE 436 AESDEC KEY STATE
544 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
545 movaps 0x10(TKEYP), KEY 437 movaps 0x10(TKEYP), KEY
546 # aesdec KEY, STATE 438 AESDEC KEY STATE
547 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
548 movaps 0x20(TKEYP), KEY 439 movaps 0x20(TKEYP), KEY
549 # aesdec KEY, STATE 440 AESDEC KEY STATE
550 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
551 movaps 0x30(TKEYP), KEY 441 movaps 0x30(TKEYP), KEY
552 # aesdec KEY, STATE 442 AESDEC KEY STATE
553 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
554 movaps 0x40(TKEYP), KEY 443 movaps 0x40(TKEYP), KEY
555 # aesdec KEY, STATE 444 AESDEC KEY STATE
556 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
557 movaps 0x50(TKEYP), KEY 445 movaps 0x50(TKEYP), KEY
558 # aesdec KEY, STATE 446 AESDEC KEY STATE
559 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
560 movaps 0x60(TKEYP), KEY 447 movaps 0x60(TKEYP), KEY
561 # aesdec KEY, STATE 448 AESDEC KEY STATE
562 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
563 movaps 0x70(TKEYP), KEY 449 movaps 0x70(TKEYP), KEY
564 # aesdeclast KEY, STATE # last round 450 AESDECLAST KEY STATE
565 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
566 ret 451 ret
567 452
568/* 453/*
@@ -597,135 +482,79 @@ _aesni_dec4:
597 je .L4dec192 482 je .L4dec192
598 add $0x20, TKEYP 483 add $0x20, TKEYP
599 movaps -0x60(TKEYP), KEY 484 movaps -0x60(TKEYP), KEY
600 # aesdec KEY, STATE1 485 AESDEC KEY STATE1
601 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 486 AESDEC KEY STATE2
602 # aesdec KEY, STATE2 487 AESDEC KEY STATE3
603 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 488 AESDEC KEY STATE4
604 # aesdec KEY, STATE3
605 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
606 # aesdec KEY, STATE4
607 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
608 movaps -0x50(TKEYP), KEY 489 movaps -0x50(TKEYP), KEY
609 # aesdec KEY, STATE1 490 AESDEC KEY STATE1
610 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 491 AESDEC KEY STATE2
611 # aesdec KEY, STATE2 492 AESDEC KEY STATE3
612 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 493 AESDEC KEY STATE4
613 # aesdec KEY, STATE3
614 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
615 # aesdec KEY, STATE4
616 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
617.align 4 494.align 4
618.L4dec192: 495.L4dec192:
619 movaps -0x40(TKEYP), KEY 496 movaps -0x40(TKEYP), KEY
620 # aesdec KEY, STATE1 497 AESDEC KEY STATE1
621 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 498 AESDEC KEY STATE2
622 # aesdec KEY, STATE2 499 AESDEC KEY STATE3
623 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 500 AESDEC KEY STATE4
624 # aesdec KEY, STATE3
625 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
626 # aesdec KEY, STATE4
627 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
628 movaps -0x30(TKEYP), KEY 501 movaps -0x30(TKEYP), KEY
629 # aesdec KEY, STATE1 502 AESDEC KEY STATE1
630 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 503 AESDEC KEY STATE2
631 # aesdec KEY, STATE2 504 AESDEC KEY STATE3
632 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 505 AESDEC KEY STATE4
633 # aesdec KEY, STATE3
634 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
635 # aesdec KEY, STATE4
636 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
637.align 4 506.align 4
638.L4dec128: 507.L4dec128:
639 movaps -0x20(TKEYP), KEY 508 movaps -0x20(TKEYP), KEY
640 # aesdec KEY, STATE1 509 AESDEC KEY STATE1
641 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 510 AESDEC KEY STATE2
642 # aesdec KEY, STATE2 511 AESDEC KEY STATE3
643 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 512 AESDEC KEY STATE4
644 # aesdec KEY, STATE3
645 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
646 # aesdec KEY, STATE4
647 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
648 movaps -0x10(TKEYP), KEY 513 movaps -0x10(TKEYP), KEY
649 # aesdec KEY, STATE1 514 AESDEC KEY STATE1
650 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 515 AESDEC KEY STATE2
651 # aesdec KEY, STATE2 516 AESDEC KEY STATE3
652 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 517 AESDEC KEY STATE4
653 # aesdec KEY, STATE3
654 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
655 # aesdec KEY, STATE4
656 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
657 movaps (TKEYP), KEY 518 movaps (TKEYP), KEY
658 # aesdec KEY, STATE1 519 AESDEC KEY STATE1
659 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 520 AESDEC KEY STATE2
660 # aesdec KEY, STATE2 521 AESDEC KEY STATE3
661 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 522 AESDEC KEY STATE4
662 # aesdec KEY, STATE3
663 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
664 # aesdec KEY, STATE4
665 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
666 movaps 0x10(TKEYP), KEY 523 movaps 0x10(TKEYP), KEY
667 # aesdec KEY, STATE1 524 AESDEC KEY STATE1
668 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 525 AESDEC KEY STATE2
669 # aesdec KEY, STATE2 526 AESDEC KEY STATE3
670 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 527 AESDEC KEY STATE4
671 # aesdec KEY, STATE3
672 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
673 # aesdec KEY, STATE4
674 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
675 movaps 0x20(TKEYP), KEY 528 movaps 0x20(TKEYP), KEY
676 # aesdec KEY, STATE1 529 AESDEC KEY STATE1
677 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 530 AESDEC KEY STATE2
678 # aesdec KEY, STATE2 531 AESDEC KEY STATE3
679 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 532 AESDEC KEY STATE4
680 # aesdec KEY, STATE3
681 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
682 # aesdec KEY, STATE4
683 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
684 movaps 0x30(TKEYP), KEY 533 movaps 0x30(TKEYP), KEY
685 # aesdec KEY, STATE1 534 AESDEC KEY STATE1
686 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 535 AESDEC KEY STATE2
687 # aesdec KEY, STATE2 536 AESDEC KEY STATE3
688 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 537 AESDEC KEY STATE4
689 # aesdec KEY, STATE3
690 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
691 # aesdec KEY, STATE4
692 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
693 movaps 0x40(TKEYP), KEY 538 movaps 0x40(TKEYP), KEY
694 # aesdec KEY, STATE1 539 AESDEC KEY STATE1
695 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 540 AESDEC KEY STATE2
696 # aesdec KEY, STATE2 541 AESDEC KEY STATE3
697 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 542 AESDEC KEY STATE4
698 # aesdec KEY, STATE3
699 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
700 # aesdec KEY, STATE4
701 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
702 movaps 0x50(TKEYP), KEY 543 movaps 0x50(TKEYP), KEY
703 # aesdec KEY, STATE1 544 AESDEC KEY STATE1
704 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 545 AESDEC KEY STATE2
705 # aesdec KEY, STATE2 546 AESDEC KEY STATE3
706 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 547 AESDEC KEY STATE4
707 # aesdec KEY, STATE3
708 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
709 # aesdec KEY, STATE4
710 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
711 movaps 0x60(TKEYP), KEY 548 movaps 0x60(TKEYP), KEY
712 # aesdec KEY, STATE1 549 AESDEC KEY STATE1
713 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 550 AESDEC KEY STATE2
714 # aesdec KEY, STATE2 551 AESDEC KEY STATE3
715 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 552 AESDEC KEY STATE4
716 # aesdec KEY, STATE3
717 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
718 # aesdec KEY, STATE4
719 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
720 movaps 0x70(TKEYP), KEY 553 movaps 0x70(TKEYP), KEY
721 # aesdeclast KEY, STATE1 # last round 554 AESDECLAST KEY STATE1 # last round
722 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 555 AESDECLAST KEY STATE2
723 # aesdeclast KEY, STATE2 556 AESDECLAST KEY STATE3
724 .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 557 AESDECLAST KEY STATE4
725 # aesdeclast KEY, STATE3
726 .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
727 # aesdeclast KEY, STATE4
728 .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
729 ret 558 ret
730 559
731/* 560/*
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..1eb7f90cb7b9
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains accelerated part of ghash
4 * implementation. More information about PCLMULQDQ can be found at:
5 *
6 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/inst.h>
21
22.data
23
24.align 16
25.Lbswap_mask:
26 .octa 0x000102030405060708090a0b0c0d0e0f
27.Lpoly:
28 .octa 0xc2000000000000000000000000000001
29.Ltwo_one:
30 .octa 0x00000001000000000000000000000001
31
32#define DATA %xmm0
33#define SHASH %xmm1
34#define T1 %xmm2
35#define T2 %xmm3
36#define T3 %xmm4
37#define BSWAP %xmm5
38#define IN1 %xmm6
39
40.text
41
42/*
43 * __clmul_gf128mul_ble: internal ABI
44 * input:
45 * DATA: operand1
46 * SHASH: operand2, hash_key << 1 mod poly
47 * output:
48 * DATA: operand1 * operand2 mod poly
49 * changed:
50 * T1
51 * T2
52 * T3
53 */
54__clmul_gf128mul_ble:
55 movaps DATA, T1
56 pshufd $0b01001110, DATA, T2
57 pshufd $0b01001110, SHASH, T3
58 pxor DATA, T2
59 pxor SHASH, T3
60
61 PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
62 PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
63 PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
64 pxor DATA, T2
65 pxor T1, T2 # T2 = a0 * b1 + a1 * b0
66
67 movaps T2, T3
68 pslldq $8, T3
69 psrldq $8, T2
70 pxor T3, DATA
71 pxor T2, T1 # <T1:DATA> is result of
72 # carry-less multiplication
73
74 # first phase of the reduction
75 movaps DATA, T3
76 psllq $1, T3
77 pxor DATA, T3
78 psllq $5, T3
79 pxor DATA, T3
80 psllq $57, T3
81 movaps T3, T2
82 pslldq $8, T2
83 psrldq $8, T3
84 pxor T2, DATA
85 pxor T3, T1
86
87 # second phase of the reduction
88 movaps DATA, T2
89 psrlq $5, T2
90 pxor DATA, T2
91 psrlq $1, T2
92 pxor DATA, T2
93 psrlq $1, T2
94 pxor T2, T1
95 pxor T1, DATA
96 ret
97
98/* void clmul_ghash_mul(char *dst, const be128 *shash) */
99ENTRY(clmul_ghash_mul)
100 movups (%rdi), DATA
101 movups (%rsi), SHASH
102 movaps .Lbswap_mask, BSWAP
103 PSHUFB_XMM BSWAP DATA
104 call __clmul_gf128mul_ble
105 PSHUFB_XMM BSWAP DATA
106 movups DATA, (%rdi)
107 ret
108
109/*
110 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
111 * const be128 *shash);
112 */
113ENTRY(clmul_ghash_update)
114 cmp $16, %rdx
115 jb .Lupdate_just_ret # check length
116 movaps .Lbswap_mask, BSWAP
117 movups (%rdi), DATA
118 movups (%rcx), SHASH
119 PSHUFB_XMM BSWAP DATA
120.align 4
121.Lupdate_loop:
122 movups (%rsi), IN1
123 PSHUFB_XMM BSWAP IN1
124 pxor IN1, DATA
125 call __clmul_gf128mul_ble
126 sub $16, %rdx
127 add $16, %rsi
128 cmp $16, %rdx
129 jge .Lupdate_loop
130 PSHUFB_XMM BSWAP DATA
131 movups DATA, (%rdi)
132.Lupdate_just_ret:
133 ret
134
135/*
136 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
137 *
138 * Calculate hash_key << 1 mod poly
139 */
140ENTRY(clmul_ghash_setkey)
141 movaps .Lbswap_mask, BSWAP
142 movups (%rsi), %xmm0
143 PSHUFB_XMM BSWAP %xmm0
144 movaps %xmm0, %xmm1
145 psllq $1, %xmm0
146 psrlq $63, %xmm1
147 movaps %xmm1, %xmm2
148 pslldq $8, %xmm1
149 psrldq $8, %xmm2
150 por %xmm1, %xmm0
151 # reduction
152 pshufd $0b00100100, %xmm2, %xmm1
153 pcmpeqd .Ltwo_one, %xmm1
154 pand .Lpoly, %xmm1
155 pxor %xmm1, %xmm0
156 movups %xmm0, (%rdi)
157 ret
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..cbcc8d8ea93a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains glue code.
4 *
5 * Copyright (c) 2009 Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License version 2 as published
10 * by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/crypto.h>
17#include <crypto/algapi.h>
18#include <crypto/cryptd.h>
19#include <crypto/gf128mul.h>
20#include <crypto/internal/hash.h>
21#include <asm/i387.h>
22
23#define GHASH_BLOCK_SIZE 16
24#define GHASH_DIGEST_SIZE 16
25
26void clmul_ghash_mul(char *dst, const be128 *shash);
27
28void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
29 const be128 *shash);
30
31void clmul_ghash_setkey(be128 *shash, const u8 *key);
32
33struct ghash_async_ctx {
34 struct cryptd_ahash *cryptd_tfm;
35};
36
37struct ghash_ctx {
38 be128 shash;
39};
40
41struct ghash_desc_ctx {
42 u8 buffer[GHASH_BLOCK_SIZE];
43 u32 bytes;
44};
45
46static int ghash_init(struct shash_desc *desc)
47{
48 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
49
50 memset(dctx, 0, sizeof(*dctx));
51
52 return 0;
53}
54
55static int ghash_setkey(struct crypto_shash *tfm,
56 const u8 *key, unsigned int keylen)
57{
58 struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
59
60 if (keylen != GHASH_BLOCK_SIZE) {
61 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
62 return -EINVAL;
63 }
64
65 clmul_ghash_setkey(&ctx->shash, key);
66
67 return 0;
68}
69
70static int ghash_update(struct shash_desc *desc,
71 const u8 *src, unsigned int srclen)
72{
73 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
74 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
75 u8 *dst = dctx->buffer;
76
77 kernel_fpu_begin();
78 if (dctx->bytes) {
79 int n = min(srclen, dctx->bytes);
80 u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
81
82 dctx->bytes -= n;
83 srclen -= n;
84
85 while (n--)
86 *pos++ ^= *src++;
87
88 if (!dctx->bytes)
89 clmul_ghash_mul(dst, &ctx->shash);
90 }
91
92 clmul_ghash_update(dst, src, srclen, &ctx->shash);
93 kernel_fpu_end();
94
95 if (srclen & 0xf) {
96 src += srclen - (srclen & 0xf);
97 srclen &= 0xf;
98 dctx->bytes = GHASH_BLOCK_SIZE - srclen;
99 while (srclen--)
100 *dst++ ^= *src++;
101 }
102
103 return 0;
104}
105
106static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
107{
108 u8 *dst = dctx->buffer;
109
110 if (dctx->bytes) {
111 u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
112
113 while (dctx->bytes--)
114 *tmp++ ^= 0;
115
116 kernel_fpu_begin();
117 clmul_ghash_mul(dst, &ctx->shash);
118 kernel_fpu_end();
119 }
120
121 dctx->bytes = 0;
122}
123
124static int ghash_final(struct shash_desc *desc, u8 *dst)
125{
126 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
127 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
128 u8 *buf = dctx->buffer;
129
130 ghash_flush(ctx, dctx);
131 memcpy(dst, buf, GHASH_BLOCK_SIZE);
132
133 return 0;
134}
135
136static struct shash_alg ghash_alg = {
137 .digestsize = GHASH_DIGEST_SIZE,
138 .init = ghash_init,
139 .update = ghash_update,
140 .final = ghash_final,
141 .setkey = ghash_setkey,
142 .descsize = sizeof(struct ghash_desc_ctx),
143 .base = {
144 .cra_name = "__ghash",
145 .cra_driver_name = "__ghash-pclmulqdqni",
146 .cra_priority = 0,
147 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
148 .cra_blocksize = GHASH_BLOCK_SIZE,
149 .cra_ctxsize = sizeof(struct ghash_ctx),
150 .cra_module = THIS_MODULE,
151 .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
152 },
153};
154
155static int ghash_async_init(struct ahash_request *req)
156{
157 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
158 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
159 struct ahash_request *cryptd_req = ahash_request_ctx(req);
160 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
161
162 if (!irq_fpu_usable()) {
163 memcpy(cryptd_req, req, sizeof(*req));
164 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
165 return crypto_ahash_init(cryptd_req);
166 } else {
167 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
168 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
169
170 desc->tfm = child;
171 desc->flags = req->base.flags;
172 return crypto_shash_init(desc);
173 }
174}
175
176static int ghash_async_update(struct ahash_request *req)
177{
178 struct ahash_request *cryptd_req = ahash_request_ctx(req);
179
180 if (!irq_fpu_usable()) {
181 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
182 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
183 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
184
185 memcpy(cryptd_req, req, sizeof(*req));
186 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
187 return crypto_ahash_update(cryptd_req);
188 } else {
189 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
190 return shash_ahash_update(req, desc);
191 }
192}
193
194static int ghash_async_final(struct ahash_request *req)
195{
196 struct ahash_request *cryptd_req = ahash_request_ctx(req);
197
198 if (!irq_fpu_usable()) {
199 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
200 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
201 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
202
203 memcpy(cryptd_req, req, sizeof(*req));
204 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
205 return crypto_ahash_final(cryptd_req);
206 } else {
207 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
208 return crypto_shash_final(desc, req->result);
209 }
210}
211
212static int ghash_async_digest(struct ahash_request *req)
213{
214 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
215 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
216 struct ahash_request *cryptd_req = ahash_request_ctx(req);
217 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
218
219 if (!irq_fpu_usable()) {
220 memcpy(cryptd_req, req, sizeof(*req));
221 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
222 return crypto_ahash_digest(cryptd_req);
223 } else {
224 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
225 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
226
227 desc->tfm = child;
228 desc->flags = req->base.flags;
229 return shash_ahash_digest(req, desc);
230 }
231}
232
233static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
234 unsigned int keylen)
235{
236 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
237 struct crypto_ahash *child = &ctx->cryptd_tfm->base;
238 int err;
239
240 crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
241 crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
242 & CRYPTO_TFM_REQ_MASK);
243 err = crypto_ahash_setkey(child, key, keylen);
244 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
245 & CRYPTO_TFM_RES_MASK);
246
247 return 0;
248}
249
250static int ghash_async_init_tfm(struct crypto_tfm *tfm)
251{
252 struct cryptd_ahash *cryptd_tfm;
253 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
254
255 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
256 if (IS_ERR(cryptd_tfm))
257 return PTR_ERR(cryptd_tfm);
258 ctx->cryptd_tfm = cryptd_tfm;
259 crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
260 sizeof(struct ahash_request) +
261 crypto_ahash_reqsize(&cryptd_tfm->base));
262
263 return 0;
264}
265
266static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
267{
268 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
269
270 cryptd_free_ahash(ctx->cryptd_tfm);
271}
272
273static struct ahash_alg ghash_async_alg = {
274 .init = ghash_async_init,
275 .update = ghash_async_update,
276 .final = ghash_async_final,
277 .setkey = ghash_async_setkey,
278 .digest = ghash_async_digest,
279 .halg = {
280 .digestsize = GHASH_DIGEST_SIZE,
281 .base = {
282 .cra_name = "ghash",
283 .cra_driver_name = "ghash-clmulni",
284 .cra_priority = 400,
285 .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
286 .cra_blocksize = GHASH_BLOCK_SIZE,
287 .cra_type = &crypto_ahash_type,
288 .cra_module = THIS_MODULE,
289 .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
290 .cra_init = ghash_async_init_tfm,
291 .cra_exit = ghash_async_exit_tfm,
292 },
293 },
294};
295
296static int __init ghash_pclmulqdqni_mod_init(void)
297{
298 int err;
299
300 if (!cpu_has_pclmulqdq) {
301 printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
302 " detected.\n");
303 return -ENODEV;
304 }
305
306 err = crypto_register_shash(&ghash_alg);
307 if (err)
308 goto err_out;
309 err = crypto_register_ahash(&ghash_async_alg);
310 if (err)
311 goto err_shash;
312
313 return 0;
314
315err_shash:
316 crypto_unregister_shash(&ghash_alg);
317err_out:
318 return err;
319}
320
321static void __exit ghash_pclmulqdqni_mod_exit(void)
322{
323 crypto_unregister_ahash(&ghash_async_alg);
324 crypto_unregister_shash(&ghash_alg);
325}
326
327module_init(ghash_pclmulqdqni_mod_init);
328module_exit(ghash_pclmulqdqni_mod_exit);
329
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
332 "acclerated by PCLMULQDQ-NI");
333MODULE_ALIAS("ghash");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 581b0568fe19..53147ad85b96 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -653,7 +653,7 @@ ia32_sys_call_table:
653 .quad compat_sys_writev 653 .quad compat_sys_writev
654 .quad sys_getsid 654 .quad sys_getsid
655 .quad sys_fdatasync 655 .quad sys_fdatasync
656 .quad sys32_sysctl /* sysctl */ 656 .quad compat_sys_sysctl /* sysctl */
657 .quad sys_mlock /* 150 */ 657 .quad sys_mlock /* 150 */
658 .quad sys_munlock 658 .quad sys_munlock
659 .quad sys_mlockall 659 .quad sys_mlockall
@@ -696,7 +696,7 @@ ia32_sys_call_table:
696 .quad quiet_ni_syscall /* streams2 */ 696 .quad quiet_ni_syscall /* streams2 */
697 .quad stub32_vfork /* 190 */ 697 .quad stub32_vfork /* 190 */
698 .quad compat_sys_getrlimit 698 .quad compat_sys_getrlimit
699 .quad sys32_mmap2 699 .quad sys_mmap_pgoff
700 .quad sys32_truncate64 700 .quad sys32_truncate64
701 .quad sys32_ftruncate64 701 .quad sys32_ftruncate64
702 .quad sys32_stat64 /* 195 */ 702 .quad sys32_stat64 /* 195 */
@@ -841,4 +841,5 @@ ia32_sys_call_table:
841 .quad compat_sys_pwritev 841 .quad compat_sys_pwritev
842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
843 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg
844ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..422572c77923 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -155,9 +155,6 @@ struct mmap_arg_struct {
155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) 155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
156{ 156{
157 struct mmap_arg_struct a; 157 struct mmap_arg_struct a;
158 struct file *file = NULL;
159 unsigned long retval;
160 struct mm_struct *mm ;
161 158
162 if (copy_from_user(&a, arg, sizeof(a))) 159 if (copy_from_user(&a, arg, sizeof(a)))
163 return -EFAULT; 160 return -EFAULT;
@@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
165 if (a.offset & ~PAGE_MASK) 162 if (a.offset & ~PAGE_MASK)
166 return -EINVAL; 163 return -EINVAL;
167 164
168 if (!(a.flags & MAP_ANONYMOUS)) { 165 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
169 file = fget(a.fd);
170 if (!file)
171 return -EBADF;
172 }
173
174 mm = current->mm;
175 down_write(&mm->mmap_sem);
176 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
177 a.offset>>PAGE_SHIFT); 166 a.offset>>PAGE_SHIFT);
178 if (file)
179 fput(file);
180
181 up_write(&mm->mmap_sem);
182
183 return retval;
184} 167}
185 168
186asmlinkage long sys32_mprotect(unsigned long start, size_t len, 169asmlinkage long sys32_mprotect(unsigned long start, size_t len,
@@ -434,62 +417,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
434 return ret; 417 return ret;
435} 418}
436 419
437#ifdef CONFIG_SYSCTL_SYSCALL
438struct sysctl_ia32 {
439 unsigned int name;
440 int nlen;
441 unsigned int oldval;
442 unsigned int oldlenp;
443 unsigned int newval;
444 unsigned int newlen;
445 unsigned int __unused[4];
446};
447
448
449asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
450{
451 struct sysctl_ia32 a32;
452 mm_segment_t old_fs = get_fs();
453 void __user *oldvalp, *newvalp;
454 size_t oldlen;
455 int __user *namep;
456 long ret;
457
458 if (copy_from_user(&a32, args32, sizeof(a32)))
459 return -EFAULT;
460
461 /*
462 * We need to pre-validate these because we have to disable
463 * address checking before calling do_sysctl() because of
464 * OLDLEN but we can't run the risk of the user specifying bad
465 * addresses here. Well, since we're dealing with 32 bit
466 * addresses, we KNOW that access_ok() will always succeed, so
467 * this is an expensive NOP, but so what...
468 */
469 namep = compat_ptr(a32.name);
470 oldvalp = compat_ptr(a32.oldval);
471 newvalp = compat_ptr(a32.newval);
472
473 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
474 || !access_ok(VERIFY_WRITE, namep, 0)
475 || !access_ok(VERIFY_WRITE, oldvalp, 0)
476 || !access_ok(VERIFY_WRITE, newvalp, 0))
477 return -EFAULT;
478
479 set_fs(KERNEL_DS);
480 lock_kernel();
481 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
482 newvalp, (size_t) a32.newlen);
483 unlock_kernel();
484 set_fs(old_fs);
485
486 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
487 return -EFAULT;
488
489 return ret;
490}
491#endif
492
493/* warning: next two assume little endian */ 420/* warning: next two assume little endian */
494asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, 421asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
495 u32 poslo, u32 poshi) 422 u32 poslo, u32 poshi)
@@ -539,30 +466,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
539 return ret; 466 return ret;
540} 467}
541 468
542asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
543 unsigned long prot, unsigned long flags,
544 unsigned long fd, unsigned long pgoff)
545{
546 struct mm_struct *mm = current->mm;
547 unsigned long error;
548 struct file *file = NULL;
549
550 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
551 if (!(flags & MAP_ANONYMOUS)) {
552 file = fget(fd);
553 if (!file)
554 return -EBADF;
555 }
556
557 down_write(&mm->mmap_sem);
558 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
559 up_write(&mm->mmap_sem);
560
561 if (file)
562 fput(file);
563 return error;
564}
565
566asmlinkage long sys32_olduname(struct oldold_utsname __user *name) 469asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
567{ 470{
568 char *arch = "x86_64"; 471 char *arch = "x86_64";
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..60d2b2db0bc5 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
118extern unsigned long acpi_wakeup_address; 118extern unsigned long acpi_wakeup_address;
119 119
120/* early initialization routine */ 120/* early initialization routine */
121extern void acpi_reserve_bootmem(void); 121extern void acpi_reserve_wakeup_memory(void);
122 122
123/* 123/*
124 * Check if the CPU can handle C2 and deeper 124 * Check if the CPU can handle C2 and deeper
@@ -158,6 +158,7 @@ struct bootnode;
158 158
159#ifdef CONFIG_ACPI_NUMA 159#ifdef CONFIG_ACPI_NUMA
160extern int acpi_numa; 160extern int acpi_numa;
161extern int acpi_get_nodes(struct bootnode *physnodes);
161extern int acpi_scan_nodes(unsigned long start, unsigned long end); 162extern int acpi_scan_nodes(unsigned long start, unsigned long end);
162#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 163#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
163extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 164extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index 84786fb9a23b..4d817f9e6e77 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -28,7 +28,9 @@ extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void); 28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid); 29extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); 30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31 31extern int amd_iommu_init_devices(void);
32extern void amd_iommu_uninit_devices(void);
33extern void amd_iommu_init_notifier(void);
32#ifndef CONFIG_AMD_IOMMU_STATS 34#ifndef CONFIG_AMD_IOMMU_STATS
33 35
34static inline void amd_iommu_stats_init(void) { } 36static inline void amd_iommu_stats_init(void) { }
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
9 9
10#define __read_mostly __attribute__((__section__(".data.read_mostly"))) 10#define __read_mostly __attribute__((__section__(".data.read_mostly")))
11 11
12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
14
12#ifdef CONFIG_X86_VSMP 15#ifdef CONFIG_X86_VSMP
13/* vSMP Internode cacheline shift */
14#define INTERNODE_CACHE_SHIFT (12)
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16#define __cacheline_aligned_in_smp \ 17#define __cacheline_aligned_in_smp \
17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 18 __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
18 __page_aligned_data 19 __page_aligned_data
19#endif 20#endif
20#endif 21#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
@@ -176,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
176#ifdef CONFIG_DEBUG_RODATA 177#ifdef CONFIG_DEBUG_RODATA
177void mark_rodata_ro(void); 178void mark_rodata_ro(void);
178extern const int rodata_test_data; 179extern const int rodata_test_data;
180extern int kernel_set_to_readonly;
179void set_kernel_text_rw(void); 181void set_kernel_text_rw(void);
180void set_kernel_text_ro(void); 182void set_kernel_text_ro(void);
181#else 183#else
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9cfc88b97742..613700f27a4a 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -248,6 +248,7 @@ extern const char * const x86_power_flags[32];
248#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 248#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
249#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 249#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
250#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 250#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
251#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
251 252
252#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 253#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
253# define cpu_has_invlpg 1 254# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 9d6684849fd9..278441f39856 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -12,9 +12,9 @@
12#include <linux/types.h> 12#include <linux/types.h>
13 13
14/* 14/*
15 * FIXME: Acessing the desc_struct through its fields is more elegant, 15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code 16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b acessors, and doing this allow us to do it 17 * still touches the a and b accessors, and doing this allow us to do it
18 * incrementally. We keep the signature as a struct, rather than an union, 18 * incrementally. We keep the signature as a struct, rather than an union,
19 * so we can get rid of it transparently in the future -- glommer 19 * so we can get rid of it transparently in the future -- glommer
20 */ 20 */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..761249e396fe 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
61 struct e820entry map[E820_X_MAX]; 61 struct e820entry map[E820_X_MAX];
62}; 62};
63 63
64#define ISA_START_ADDRESS 0xa0000
65#define ISA_END_ADDRESS 0x100000
66
67#define BIOS_BEGIN 0x000a0000
68#define BIOS_END 0x00100000
69
64#ifdef __KERNEL__ 70#ifdef __KERNEL__
65/* see comment in arch/x86/kernel/e820.c */ 71/* see comment in arch/x86/kernel/e820.c */
66extern struct e820map e820; 72extern struct e820map e820;
@@ -126,15 +132,18 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 132extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 133extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 134extern char *default_machine_specific_memory_setup(void);
129#endif /* __KERNEL__ */
130#endif /* __ASSEMBLY__ */
131 135
132#define ISA_START_ADDRESS 0xa0000 136/*
133#define ISA_END_ADDRESS 0x100000 137 * Returns true iff the specified range [s,e) is completely contained inside
134#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) 138 * the ISA region.
139 */
140static inline bool is_ISA_range(u64 s, u64 e)
141{
142 return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
143}
135 144
136#define BIOS_BEGIN 0x000a0000 145#endif /* __KERNEL__ */
137#define BIOS_END 0x00100000 146#endif /* __ASSEMBLY__ */
138 147
139#ifdef __KERNEL__ 148#ifdef __KERNEL__
140#include <linux/ioport.h> 149#include <linux/ioport.h>
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..8a024babe5e6 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
157 157
158#define compat_elf_check_arch(x) elf_check_arch_ia32(x) 158#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
159 159
160static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
161{
162 loadsegment(fs, 0);
163 loadsegment(ds, __USER32_DS);
164 loadsegment(es, __USER32_DS);
165 load_gs_index(0);
166 regs->ip = ip;
167 regs->sp = sp;
168 regs->flags = X86_EFLAGS_IF;
169 regs->cs = __USER32_CS;
170 regs->ss = __USER32_DS;
171}
172
173static inline void elf_common_init(struct thread_struct *t, 160static inline void elf_common_init(struct thread_struct *t,
174 struct pt_regs *regs, const u16 ds) 161 struct pt_regs *regs, const u16 ds)
175{ 162{
@@ -191,11 +178,8 @@ do { \
191#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
192 elf_common_init(&current->thread, regs, __USER_DS) 179 elf_common_init(&current->thread, regs, __USER_DS)
193 180
194#define compat_start_thread(regs, ip, sp) \ 181void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
195do { \ 182#define compat_start_thread start_thread_ia32
196 start_ia32_thread(regs, ip, sp); \
197 set_fs(USER_DS); \
198} while (0)
199 183
200#define COMPAT_SET_PERSONALITY(ex) \ 184#define COMPAT_SET_PERSONALITY(ex) \
201do { \ 185do { \
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index f5693c81a1db..8e8ec663a98f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
34 smp_invalidate_interrupt) 34 smp_invalidate_interrupt)
35#endif 35#endif
36 36
37BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) 37BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
38 38
39/* 39/*
40 * every pentium local APIC has two 'local interrupts', with a 40 * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 108eb6fd1ae7..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_pending_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..5d89fd2a3690 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -65,11 +65,12 @@
65/* hpet memory map physical address */ 65/* hpet memory map physical address */
66extern unsigned long hpet_address; 66extern unsigned long hpet_address;
67extern unsigned long force_hpet_address; 67extern unsigned long force_hpet_address;
68extern u8 hpet_blockid;
68extern int hpet_force_user; 69extern int hpet_force_user;
69extern int is_hpet_enabled(void); 70extern int is_hpet_enabled(void);
70extern int hpet_enable(void); 71extern int hpet_enable(void);
71extern void hpet_disable(void); 72extern void hpet_disable(void);
72extern unsigned long hpet_readl(unsigned long a); 73extern unsigned int hpet_readl(unsigned int a);
73extern void force_hpet_resume(void); 74extern void force_hpet_resume(void);
74 75
75extern void hpet_msi_unmask(unsigned int irq); 76extern void hpet_msi_unmask(unsigned int irq);
@@ -78,9 +79,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
78extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); 79extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
79 80
80#ifdef CONFIG_PCI_MSI 81#ifdef CONFIG_PCI_MSI
81extern int arch_setup_hpet_msi(unsigned int irq); 82extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
82#else 83#else
83static inline int arch_setup_hpet_msi(unsigned int irq) 84static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
84{ 85{
85 return -EINVAL; 86 return -EINVAL;
86} 87}
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6e124269fd4b..08c48a81841f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void perf_pending_interrupt(void);
33 33
@@ -119,7 +119,7 @@ extern void eisa_set_level_irq(unsigned int irq);
119/* SMP */ 119/* SMP */
120extern void smp_apic_timer_interrupt(struct pt_regs *); 120extern void smp_apic_timer_interrupt(struct pt_regs *);
121extern void smp_spurious_interrupt(struct pt_regs *); 121extern void smp_spurious_interrupt(struct pt_regs *);
122extern void smp_generic_interrupt(struct pt_regs *); 122extern void smp_x86_platform_ipi(struct pt_regs *);
123extern void smp_error_interrupt(struct pt_regs *); 123extern void smp_error_interrupt(struct pt_regs *);
124#ifdef CONFIG_X86_IO_APIC 124#ifdef CONFIG_X86_IO_APIC
125extern asmlinkage void smp_irq_move_cleanup_interrupt(void); 125extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb758f2..ebfb8a9e11f7 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
10#ifndef _ASM_X86_I387_H 10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H 11#define _ASM_X86_I387_H
12 12
13#ifndef __ASSEMBLY__
14
13#include <linux/sched.h> 15#include <linux/sched.h>
14#include <linux/kernel_stat.h> 16#include <linux/kernel_stat.h>
15#include <linux/regset.h> 17#include <linux/regset.h>
@@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
411 } 413 }
412} 414}
413 415
416#endif /* __ASSEMBLY__ */
417
418#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
419#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
420
414#endif /* _ASM_X86_I387_H */ 421#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..14cf526091f9
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,150 @@
1/*
2 * Generate .byte code for some instructions not supported by old
3 * binutils.
4 */
5#ifndef X86_ASM_INST_H
6#define X86_ASM_INST_H
7
8#ifdef __ASSEMBLY__
9
10 .macro XMM_NUM opd xmm
11 .ifc \xmm,%xmm0
12 \opd = 0
13 .endif
14 .ifc \xmm,%xmm1
15 \opd = 1
16 .endif
17 .ifc \xmm,%xmm2
18 \opd = 2
19 .endif
20 .ifc \xmm,%xmm3
21 \opd = 3
22 .endif
23 .ifc \xmm,%xmm4
24 \opd = 4
25 .endif
26 .ifc \xmm,%xmm5
27 \opd = 5
28 .endif
29 .ifc \xmm,%xmm6
30 \opd = 6
31 .endif
32 .ifc \xmm,%xmm7
33 \opd = 7
34 .endif
35 .ifc \xmm,%xmm8
36 \opd = 8
37 .endif
38 .ifc \xmm,%xmm9
39 \opd = 9
40 .endif
41 .ifc \xmm,%xmm10
42 \opd = 10
43 .endif
44 .ifc \xmm,%xmm11
45 \opd = 11
46 .endif
47 .ifc \xmm,%xmm12
48 \opd = 12
49 .endif
50 .ifc \xmm,%xmm13
51 \opd = 13
52 .endif
53 .ifc \xmm,%xmm14
54 \opd = 14
55 .endif
56 .ifc \xmm,%xmm15
57 \opd = 15
58 .endif
59 .endm
60
61 .macro PFX_OPD_SIZE
62 .byte 0x66
63 .endm
64
65 .macro PFX_REX opd1 opd2
66 .if (\opd1 | \opd2) & 8
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
68 .endif
69 .endm
70
71 .macro MODRM mod opd1 opd2
72 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
73 .endm
74
75 .macro PSHUFB_XMM xmm1 xmm2
76 XMM_NUM pshufb_opd1 \xmm1
77 XMM_NUM pshufb_opd2 \xmm2
78 PFX_OPD_SIZE
79 PFX_REX pshufb_opd1 pshufb_opd2
80 .byte 0x0f, 0x38, 0x00
81 MODRM 0xc0 pshufb_opd1 pshufb_opd2
82 .endm
83
84 .macro PCLMULQDQ imm8 xmm1 xmm2
85 XMM_NUM clmul_opd1 \xmm1
86 XMM_NUM clmul_opd2 \xmm2
87 PFX_OPD_SIZE
88 PFX_REX clmul_opd1 clmul_opd2
89 .byte 0x0f, 0x3a, 0x44
90 MODRM 0xc0 clmul_opd1 clmul_opd2
91 .byte \imm8
92 .endm
93
94 .macro AESKEYGENASSIST rcon xmm1 xmm2
95 XMM_NUM aeskeygen_opd1 \xmm1
96 XMM_NUM aeskeygen_opd2 \xmm2
97 PFX_OPD_SIZE
98 PFX_REX aeskeygen_opd1 aeskeygen_opd2
99 .byte 0x0f, 0x3a, 0xdf
100 MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
101 .byte \rcon
102 .endm
103
104 .macro AESIMC xmm1 xmm2
105 XMM_NUM aesimc_opd1 \xmm1
106 XMM_NUM aesimc_opd2 \xmm2
107 PFX_OPD_SIZE
108 PFX_REX aesimc_opd1 aesimc_opd2
109 .byte 0x0f, 0x38, 0xdb
110 MODRM 0xc0 aesimc_opd1 aesimc_opd2
111 .endm
112
113 .macro AESENC xmm1 xmm2
114 XMM_NUM aesenc_opd1 \xmm1
115 XMM_NUM aesenc_opd2 \xmm2
116 PFX_OPD_SIZE
117 PFX_REX aesenc_opd1 aesenc_opd2
118 .byte 0x0f, 0x38, 0xdc
119 MODRM 0xc0 aesenc_opd1 aesenc_opd2
120 .endm
121
122 .macro AESENCLAST xmm1 xmm2
123 XMM_NUM aesenclast_opd1 \xmm1
124 XMM_NUM aesenclast_opd2 \xmm2
125 PFX_OPD_SIZE
126 PFX_REX aesenclast_opd1 aesenclast_opd2
127 .byte 0x0f, 0x38, 0xdd
128 MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
129 .endm
130
131 .macro AESDEC xmm1 xmm2
132 XMM_NUM aesdec_opd1 \xmm1
133 XMM_NUM aesdec_opd2 \xmm2
134 PFX_OPD_SIZE
135 PFX_REX aesdec_opd1 aesdec_opd2
136 .byte 0x0f, 0x38, 0xde
137 MODRM 0xc0 aesdec_opd1 aesdec_opd2
138 .endm
139
140 .macro AESDECLAST xmm1 xmm2
141 XMM_NUM aesdeclast_opd1 \xmm1
142 XMM_NUM aesdeclast_opd2 \xmm2
143 PFX_OPD_SIZE
144 PFX_REX aesdeclast_opd1 aesdeclast_opd2
145 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm
148#endif
149
150#endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ffd700ff5dcb..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -37,7 +37,7 @@ extern void fixup_irqs(void);
37extern void irq_force_complete_move(int); 37extern void irq_force_complete_move(int);
38#endif 38#endif
39 39
40extern void (*generic_interrupt_extension)(void); 40extern void (*x86_platform_ipi_callback)(void);
41extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
42extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
43 43
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..6a635bd39867 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,7 +106,7 @@
106/* 106/*
107 * Generic system vector for platform specific use 107 * Generic system vector for platform specific use
108 */ 108 */
109#define GENERIC_INTERRUPT_VECTOR 0xed 109#define X86_PLATFORM_IPI_VECTOR 0xed
110 110
111/* 111/*
112 * Performance monitoring pending work vector: 112 * Performance monitoring pending work vector:
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..f70e60071fe8 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,13 +4,16 @@
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6extern struct pci_device_id k8_nb_ids[]; 6extern struct pci_device_id k8_nb_ids[];
7struct bootnode;
7 8
8extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges; 10extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges; 11extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void); 12extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 13extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 14extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void);
14 17
15#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node) 19static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..950df434763f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE 20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS
22 24
23/* Architectural interrupt line count. */ 25/* Architectural interrupt line count. */
24#define KVM_NR_INTERRUPTS 256 26#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
79#define KVM_IRQCHIP_PIC_MASTER 0 81#define KVM_IRQCHIP_PIC_MASTER 0
80#define KVM_IRQCHIP_PIC_SLAVE 1 82#define KVM_IRQCHIP_PIC_SLAVE 1
81#define KVM_IRQCHIP_IOAPIC 2 83#define KVM_IRQCHIP_IOAPIC 2
84#define KVM_NR_IRQCHIPS 3
82 85
83/* for KVM_GET_REGS and KVM_SET_REGS */ 86/* for KVM_GET_REGS and KVM_SET_REGS */
84struct kvm_regs { 87struct kvm_regs {
@@ -250,4 +253,31 @@ struct kvm_reinject_control {
250 __u8 pit_reinject; 253 __u8 pit_reinject;
251 __u8 reserved[31]; 254 __u8 reserved[31];
252}; 255};
256
257/* for KVM_GET/SET_VCPU_EVENTS */
258struct kvm_vcpu_events {
259 struct {
260 __u8 injected;
261 __u8 nr;
262 __u8 has_error_code;
263 __u8 pad;
264 __u32 error_code;
265 } exception;
266 struct {
267 __u8 injected;
268 __u8 nr;
269 __u8 soft;
270 __u8 pad;
271 } interrupt;
272 struct {
273 __u8 injected;
274 __u8 pending;
275 __u8 masked;
276 __u8 pad;
277 } nmi;
278 __u32 sipi_vector;
279 __u32 flags;
280 __u32 reserved[10];
281};
282
253#endif /* _ASM_X86_KVM_H */ 283#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7c18e1230f54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -129,7 +129,7 @@ struct decode_cache {
129 u8 seg_override; 129 u8 seg_override;
130 unsigned int d; 130 unsigned int d;
131 unsigned long regs[NR_VCPU_REGS]; 131 unsigned long regs[NR_VCPU_REGS];
132 unsigned long eip; 132 unsigned long eip, eip_orig;
133 /* modrm */ 133 /* modrm */
134 u8 modrm; 134 u8 modrm;
135 u8 modrm_mod; 135 u8 modrm_mod;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d83892226f73..4f865e8b8540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
354 unsigned int time_offset; 354 unsigned int time_offset;
355 struct page *time_page; 355 struct page *time_page;
356 356
357 bool singlestep; /* guest is single stepped by KVM */
358 bool nmi_pending; 357 bool nmi_pending;
359 bool nmi_injected; 358 bool nmi_injected;
360 359
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
371 u64 mcg_status; 370 u64 mcg_status;
372 u64 mcg_ctl; 371 u64 mcg_ctl;
373 u64 *mce_banks; 372 u64 *mce_banks;
373
374 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs;
376 unsigned long singlestep_rip;
374}; 377};
375 378
376struct kvm_mem_alias { 379struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
397 struct kvm_pic *vpic; 400 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 401 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 402 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 unsigned int tss_addr; 405 unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
410 gpa_t ept_identity_map_addr; 412 gpa_t ept_identity_map_addr;
411 413
412 unsigned long irq_sources_bitmap; 414 unsigned long irq_sources_bitmap;
413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc; 415 u64 vm_init_tsc;
416 s64 kvmclock_offset;
417
418 struct kvm_xen_hvm_config xen_hvm_config;
415}; 419};
416 420
417struct kvm_vm_stat { 421struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
461struct kvm_x86_ops { 465struct kvm_x86_ops {
462 int (*cpu_has_kvm_support)(void); /* __init */ 466 int (*cpu_has_kvm_support)(void); /* __init */
463 int (*disabled_by_bios)(void); /* __init */ 467 int (*disabled_by_bios)(void); /* __init */
464 void (*hardware_enable)(void *dummy); /* __init */ 468 int (*hardware_enable)(void *dummy);
465 void (*hardware_disable)(void *dummy); 469 void (*hardware_disable)(void *dummy);
466 void (*check_processor_compatibility)(void *rtn); 470 void (*check_processor_compatibility)(void *rtn);
467 int (*hardware_setup)(void); /* __init */ 471 int (*hardware_setup)(void); /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
477 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 481 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 482 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 483
480 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 484 void (*set_guest_debug)(struct kvm_vcpu *vcpu,
481 struct kvm_guest_debug *dbg); 485 struct kvm_guest_debug *dbg);
482 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 486 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
483 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 487 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
484 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 488 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
506 510
507 void (*tlb_flush)(struct kvm_vcpu *vcpu); 511 void (*tlb_flush)(struct kvm_vcpu *vcpu);
508 512
509 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 513 void (*run)(struct kvm_vcpu *vcpu);
510 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 514 int (*handle_exit)(struct kvm_vcpu *vcpu);
511 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 515 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
512 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 516 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
513 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 517 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
519 bool has_error_code, u32 error_code); 523 bool has_error_code, u32 error_code);
520 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 524 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 525 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
526 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
527 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
522 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 528 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 529 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 530 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
568#define EMULTYPE_NO_DECODE (1 << 0) 574#define EMULTYPE_NO_DECODE (1 << 0)
569#define EMULTYPE_TRAP_UD (1 << 1) 575#define EMULTYPE_TRAP_UD (1 << 1)
570#define EMULTYPE_SKIP (1 << 2) 576#define EMULTYPE_SKIP (1 << 2)
571int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 577int emulate_instruction(struct kvm_vcpu *vcpu,
572 unsigned long cr2, u16 error_code, int emulation_type); 578 unsigned long cr2, u16 error_code, int emulation_type);
573void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
574void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
585 591
586struct x86_emulate_ctxt; 592struct x86_emulate_ctxt;
587 593
588int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 594int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
589 int size, unsigned port); 595 int size, unsigned port);
590int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 596int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
591 int size, unsigned long count, int down, 597 int size, unsigned long count, int down,
592 gva_t address, int rep, unsigned port); 598 gva_t address, int rep, unsigned port);
593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 599void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
616int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 622int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
617int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 623int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
618 624
625unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
626void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
627
619void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 628void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 629void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 630void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -802,4 +811,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
802int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 811int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
803int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 812int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
804 813
814void kvm_define_shared_msr(unsigned index, u32 msr);
815void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
816
805#endif /* _ASM_X86_KVM_HOST_H */ 817#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..c24ca9a56458 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -12,6 +12,8 @@ struct device;
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; 12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13 13
14struct microcode_ops { 14struct microcode_ops {
15 void (*init)(struct device *device);
16 void (*fini)(void);
15 enum ucode_state (*request_microcode_user) (int cpu, 17 enum ucode_state (*request_microcode_user) (int cpu,
16 const void __user *buf, size_t size); 18 const void __user *buf, size_t size);
17 19
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ede6998bd92c..91df7c51806c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
47/* 47/*
48 * generic node memory support, the following assumptions apply: 48 * generic node memory support, the following assumptions apply:
49 * 49 *
50 * 1) memory comes in 64Mb contigious chunks which are either present or not 50 * 1) memory comes in 64Mb contiguous chunks which are either present or not
51 * 2) we will not have more than 64Gb in total 51 * 2) we will not have more than 64Gb in total
52 * 52 *
53 * for now assume that 64Gb is max amount of RAM for whole system 53 * for now assume that 64Gb is max amount of RAM for whole system
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 61d90b1331c3..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
71 71
72static inline void find_smp_config(void) 72static inline void find_smp_config(void)
73{ 73{
74 x86_init.mpparse.find_smp_config(1); 74 x86_init.mpparse.find_smp_config();
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80} 75}
81 76
82#ifdef CONFIG_X86_MPPARSE 77#ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else 84# else
90# define default_mpc_oem_bus_info NULL 85# define default_mpc_oem_bus_info NULL
91# endif 86# endif
92extern void default_find_smp_config(unsigned int reserve); 87extern void default_find_smp_config(void);
93extern void default_get_smp_config(unsigned int early); 88extern void default_get_smp_config(unsigned int early);
94#else 89#else
95static inline void early_reserve_e820_mpc_new(void) { } 90static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
97#define default_mpc_apic_id NULL 92#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL 93#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL 94#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop 95#define default_find_smp_config x86_init_noop
101#define default_get_smp_config x86_init_uint_noop 96#define default_get_smp_config x86_init_uint_noop
102#endif 97#endif
103 98
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..642fe34b36a2 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
49extern unsigned long init_memory_mapping(unsigned long start, 49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end); 50 unsigned long end);
51 51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); 52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
53 int acpi, int k8);
53extern void free_initmem(void); 54extern void free_initmem(void);
54 55
55#endif /* !__ASSEMBLY__ */ 56#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988eee3a..b4bf9a942ed0 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -118,11 +118,27 @@ extern int __init pcibios_init(void);
118 118
119/* pci-mmconfig.c */ 119/* pci-mmconfig.c */
120 120
121/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
122#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
123
124struct pci_mmcfg_region {
125 struct list_head list;
126 struct resource res;
127 u64 address;
128 char __iomem *virt;
129 u16 segment;
130 u8 start_bus;
131 u8 end_bus;
132 char name[PCI_MMCFG_RESOURCE_NAME_LEN];
133};
134
121extern int __init pci_mmcfg_arch_init(void); 135extern int __init pci_mmcfg_arch_init(void);
122extern void __init pci_mmcfg_arch_free(void); 136extern void __init pci_mmcfg_arch_free(void);
137extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
138
139extern struct list_head pci_mmcfg_list;
123 140
124extern struct acpi_mcfg_allocation *pci_mmcfg_config; 141#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
125extern int pci_mmcfg_config_num;
126 142
127/* 143/*
128 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space 144 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index b65a36defeb7..0c44196b78ac 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -74,31 +74,31 @@ extern void __bad_percpu_size(void);
74 74
75#define percpu_to_op(op, var, val) \ 75#define percpu_to_op(op, var, val) \
76do { \ 76do { \
77 typedef typeof(var) T__; \ 77 typedef typeof(var) pto_T__; \
78 if (0) { \ 78 if (0) { \
79 T__ tmp__; \ 79 pto_T__ pto_tmp__; \
80 tmp__ = (val); \ 80 pto_tmp__ = (val); \
81 } \ 81 } \
82 switch (sizeof(var)) { \ 82 switch (sizeof(var)) { \
83 case 1: \ 83 case 1: \
84 asm(op "b %1,"__percpu_arg(0) \ 84 asm(op "b %1,"__percpu_arg(0) \
85 : "+m" (var) \ 85 : "+m" (var) \
86 : "qi" ((T__)(val))); \ 86 : "qi" ((pto_T__)(val))); \
87 break; \ 87 break; \
88 case 2: \ 88 case 2: \
89 asm(op "w %1,"__percpu_arg(0) \ 89 asm(op "w %1,"__percpu_arg(0) \
90 : "+m" (var) \ 90 : "+m" (var) \
91 : "ri" ((T__)(val))); \ 91 : "ri" ((pto_T__)(val))); \
92 break; \ 92 break; \
93 case 4: \ 93 case 4: \
94 asm(op "l %1,"__percpu_arg(0) \ 94 asm(op "l %1,"__percpu_arg(0) \
95 : "+m" (var) \ 95 : "+m" (var) \
96 : "ri" ((T__)(val))); \ 96 : "ri" ((pto_T__)(val))); \
97 break; \ 97 break; \
98 case 8: \ 98 case 8: \
99 asm(op "q %1,"__percpu_arg(0) \ 99 asm(op "q %1,"__percpu_arg(0) \
100 : "+m" (var) \ 100 : "+m" (var) \
101 : "re" ((T__)(val))); \ 101 : "re" ((pto_T__)(val))); \
102 break; \ 102 break; \
103 default: __bad_percpu_size(); \ 103 default: __bad_percpu_size(); \
104 } \ 104 } \
@@ -106,31 +106,31 @@ do { \
106 106
107#define percpu_from_op(op, var, constraint) \ 107#define percpu_from_op(op, var, constraint) \
108({ \ 108({ \
109 typeof(var) ret__; \ 109 typeof(var) pfo_ret__; \
110 switch (sizeof(var)) { \ 110 switch (sizeof(var)) { \
111 case 1: \ 111 case 1: \
112 asm(op "b "__percpu_arg(1)",%0" \ 112 asm(op "b "__percpu_arg(1)",%0" \
113 : "=q" (ret__) \ 113 : "=q" (pfo_ret__) \
114 : constraint); \ 114 : constraint); \
115 break; \ 115 break; \
116 case 2: \ 116 case 2: \
117 asm(op "w "__percpu_arg(1)",%0" \ 117 asm(op "w "__percpu_arg(1)",%0" \
118 : "=r" (ret__) \ 118 : "=r" (pfo_ret__) \
119 : constraint); \ 119 : constraint); \
120 break; \ 120 break; \
121 case 4: \ 121 case 4: \
122 asm(op "l "__percpu_arg(1)",%0" \ 122 asm(op "l "__percpu_arg(1)",%0" \
123 : "=r" (ret__) \ 123 : "=r" (pfo_ret__) \
124 : constraint); \ 124 : constraint); \
125 break; \ 125 break; \
126 case 8: \ 126 case 8: \
127 asm(op "q "__percpu_arg(1)",%0" \ 127 asm(op "q "__percpu_arg(1)",%0" \
128 : "=r" (ret__) \ 128 : "=r" (pfo_ret__) \
129 : constraint); \ 129 : constraint); \
130 break; \ 130 break; \
131 default: __bad_percpu_size(); \ 131 default: __bad_percpu_size(); \
132 } \ 132 } \
133 ret__; \ 133 pfo_ret__; \
134}) 134})
135 135
136/* 136/*
@@ -153,6 +153,84 @@ do { \
153#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) 153#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
154#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) 154#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
155 155
156#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
157#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
158#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
159
160#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
161#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
162#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
163#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val)
164#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val)
165#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val)
166#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
167#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
168#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
169#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
170#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
171#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
172#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
173#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
174#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
175
176#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
177#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
178#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
179#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
180#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
181#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
182#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val)
183#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val)
184#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val)
185#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
186#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
187#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
188#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
189#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
190#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
191#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
192#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
193#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
194
195#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val)
196#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val)
197#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val)
198#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
199#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
200#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
201#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
202#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
203#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
204#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
205#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
206#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
207
208/*
209 * Per cpu atomic 64 bit operations are only available under 64 bit.
210 * 32 bit must fall back to generic operations.
211 */
212#ifdef CONFIG_X86_64
213#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
214#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
215#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val)
216#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
217#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
218#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
219
220#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
221#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
222#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val)
223#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
224#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
225#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
226
227#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val)
228#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
229#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
230#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
231
232#endif
233
156/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 234/* This is not atomic against other CPUs -- CPU preemption needs to be off */
157#define x86_test_and_clear_bit_percpu(bit, var) \ 235#define x86_test_and_clear_bit_percpu(bit, var) \
158({ \ 236({ \
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
16 16
17#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
18 18
19#include <asm/x86_init.h>
20
19/* 21/*
20 * ZERO_PAGE is a global shared page that is always zero: used 22 * ZERO_PAGE is a global shared page that is always zero: used
21 * for zero-mapped memory areas etc.. 23 * for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
270 unsigned long new_flags) 272 unsigned long new_flags)
271{ 273{
272 /* 274 /*
273 * PAT type is always WB for ISA. So no need to check. 275 * PAT type is always WB for untracked ranges, so no need to check.
274 */ 276 */
275 if (is_ISA_range(paddr, paddr + size - 1)) 277 if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
276 return 1; 278 return 1;
277 279
278 /* 280 /*
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..4009f6534f52 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,18 +5,19 @@
5 5
6/* misc architecture specific prototypes */ 6/* misc architecture specific prototypes */
7 7
8extern void early_idt_handler(void); 8void early_idt_handler(void);
9 9
10extern void system_call(void); 10void system_call(void);
11extern void syscall_init(void); 11void syscall_init(void);
12 12
13extern void ia32_syscall(void); 13void ia32_syscall(void);
14extern void ia32_cstar_target(void); 14void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void); 15void ia32_sysenter_target(void);
16 16
17extern void syscall32_cpu_init(void); 17void syscall32_cpu_init(void);
18 18
19extern void check_efer(void); 19void x86_configure_nx(void);
20void x86_report_nx(void);
20 21
21extern int reboot_force; 22extern int reboot_force;
22 23
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
2#define _ASM_X86_SECTIONS_H 2#define _ASM_X86_SECTIONS_H
3 3
4#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5#include <asm/uaccess.h>
5 6
6extern char __brk_base[], __brk_limit[]; 7extern char __brk_base[], __brk_limit[];
8extern struct exception_table_entry __stop___ex_table[];
9
10#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
11extern char __end_rodata_hpage_align[];
12#endif
7 13
8#endif /* _ASM_X86_SECTIONS_H */ 14#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 72e5a4491661..04459d25e66e 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -124,7 +124,7 @@ struct sigcontext {
124 * fpstate is really (struct _fpstate *) or (struct _xstate *) 124 * fpstate is really (struct _fpstate *) or (struct _xstate *)
125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved 125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end 126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
127 * of extended memory layout. See comments at the defintion of 127 * of extended memory layout. See comments at the definition of
128 * (struct _fpx_sw_bytes) 128 * (struct _fpx_sw_bytes)
129 */ 129 */
130 void __user *fpstate; /* zero when no FPU/extended context */ 130 void __user *fpstate; /* zero when no FPU/extended context */
@@ -219,7 +219,7 @@ struct sigcontext {
219 * fpstate is really (struct _fpstate *) or (struct _xstate *) 219 * fpstate is really (struct _fpstate *) or (struct _xstate *)
220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved 220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end 221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
222 * of extended memory layout. See comments at the defintion of 222 * of extended memory layout. See comments at the definition of
223 * (struct _fpx_sw_bytes) 223 * (struct _fpx_sw_bytes)
224 */ 224 */
225 void __user *fpstate; /* zero when no FPU/extended context */ 225 void __user *fpstate; /* zero when no FPU/extended context */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..1fecb7e61130 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
57 u16 intercept_dr_write; 57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 58 u32 intercept_exceptions;
59 u64 intercept; 59 u64 intercept;
60 u8 reserved_1[44]; 60 u8 reserved_1[42];
61 u16 pause_filter_count;
61 u64 iopm_base_pa; 62 u64 iopm_base_pa;
62 u64 msrpm_base_pa; 63 u64 msrpm_base_pa;
63 u64 tsc_offset; 64 u64 tsc_offset;
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..4a5a089e1c62 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -51,20 +51,12 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); 51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); 52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
53 53
54#ifdef CONFIG_SYSCTL_SYSCALL
55struct sysctl_ia32;
56asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
57#endif
58
59asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); 54asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
60asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); 55asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
61 56
62asmlinkage long sys32_personality(unsigned long); 57asmlinkage long sys32_personality(unsigned long);
63asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); 58asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
64 59
65asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
66 unsigned long, unsigned long, unsigned long);
67
68struct oldold_utsname; 60struct oldold_utsname;
69struct old_utsname; 61struct old_utsname;
70asmlinkage long sys32_olduname(struct oldold_utsname __user *); 62asmlinkage long sys32_olduname(struct oldold_utsname __user *);
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 372b76edd63f..1bb6e395881c 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -55,8 +55,6 @@ struct sel_arg_struct;
55struct oldold_utsname; 55struct oldold_utsname;
56struct old_utsname; 56struct old_utsname;
57 57
58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
59 unsigned long, unsigned long, unsigned long);
60asmlinkage int old_mmap(struct mmap_arg_struct __user *); 58asmlinkage int old_mmap(struct mmap_arg_struct __user *);
61asmlinkage int old_select(struct sel_arg_struct __user *); 59asmlinkage int old_select(struct sel_arg_struct __user *);
62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); 60asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 022a84386de8..ecb544e65382 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
23struct tss_struct; 23struct tss_struct;
24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
25 struct tss_struct *tss); 25 struct tss_struct *tss);
26extern void show_regs_common(void);
26 27
27#ifdef CONFIG_X86_32 28#ifdef CONFIG_X86_32
28 29
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
84#define TIF_SECCOMP 8 /* secure computing */ 84#define TIF_SECCOMP 8 /* secure computing */
85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
86#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
86#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
87#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
88#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 108#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
108#define _TIF_SECCOMP (1 << TIF_SECCOMP) 109#define _TIF_SECCOMP (1 << TIF_SECCOMP)
109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
110#define _TIF_NOTSC (1 << TIF_NOTSC) 112#define _TIF_NOTSC (1 << TIF_NOTSC)
111#define _TIF_IA32 (1 << TIF_IA32) 113#define _TIF_IA32 (1 << TIF_IA32)
112#define _TIF_FORK (1 << TIF_FORK) 114#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
142 144
143/* Only used for 64 bit */ 145/* Only used for 64 bit */
144#define _TIF_DO_NOTIFY_MASK \ 146#define _TIF_DO_NOTIFY_MASK \
145 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 147 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
148 _TIF_USER_RETURN_NOTIFY)
146 149
147/* flags to check in __switch_to() */ 150/* flags to check in __switch_to() */
148#define _TIF_WORK_CTXSW \ 151#define _TIF_WORK_CTXSW \
149 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 152 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
150 153
151#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 154#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
152#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 155#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
153 156
154#define PREEMPT_ACTIVE 0x10000000 157#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
345 346
346#ifdef __KERNEL__ 347#ifdef __KERNEL__
347 348
348#define NR_syscalls 337 349#define NR_syscalls 338
349 350
350#define __ARCH_WANT_IPC_PARSE_VERSION 351#define __ARCH_WANT_IPC_PARSE_VERSION
351#define __ARCH_WANT_OLD_READDIR 352#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_event_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
664 666
665#ifndef __NO_STUBS 667#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR 668#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 80e2984f521c..b414d2b401f6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,7 +55,7 @@
55#define DESC_STATUS_SOURCE_TIMEOUT 3 55#define DESC_STATUS_SOURCE_TIMEOUT 3
56 56
57/* 57/*
58 * source side threshholds at which message retries print a warning 58 * source side thresholds at which message retries print a warning
59 */ 59 */
60#define SOURCE_TIMEOUT_LIMIT 20 60#define SOURCE_TIMEOUT_LIMIT 20
61#define DESTINATION_TIMEOUT_LIMIT 20 61#define DESTINATION_TIMEOUT_LIMIT 20
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..2b4945419a84 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -56,6 +56,7 @@
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
59#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
59 60
60 61
61#define PIN_BASED_EXT_INTR_MASK 0x00000001 62#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
144 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, 145 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
145 TPR_THRESHOLD = 0x0000401c, 146 TPR_THRESHOLD = 0x0000401c,
146 SECONDARY_VM_EXEC_CONTROL = 0x0000401e, 147 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
148 PLE_GAP = 0x00004020,
149 PLE_WINDOW = 0x00004022,
147 VM_INSTRUCTION_ERROR = 0x00004400, 150 VM_INSTRUCTION_ERROR = 0x00004400,
148 VM_EXIT_REASON = 0x00004402, 151 VM_EXIT_REASON = 0x00004402,
149 VM_EXIT_INTR_INFO = 0x00004404, 152 VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
248#define EXIT_REASON_MSR_READ 31 251#define EXIT_REASON_MSR_READ 31
249#define EXIT_REASON_MSR_WRITE 32 252#define EXIT_REASON_MSR_WRITE 32
250#define EXIT_REASON_MWAIT_INSTRUCTION 36 253#define EXIT_REASON_MWAIT_INSTRUCTION 36
254#define EXIT_REASON_PAUSE_INSTRUCTION 40
251#define EXIT_REASON_MCE_DURING_VMENTRY 41 255#define EXIT_REASON_MCE_DURING_VMENTRY 41
252#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
253#define EXIT_REASON_APIC_ACCESS 44 257#define EXIT_REASON_APIC_ACCESS 44
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index d8e71459f025..ea0e8ea15e15 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc); 26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m); 27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); 28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve); 29 void (*find_smp_config)(void);
30 void (*get_smp_config)(unsigned int early); 30 void (*get_smp_config)(unsigned int early);
31}; 31};
32 32
@@ -125,12 +125,14 @@ struct x86_cpuinit_ops {
125 * @calibrate_tsc: calibrate TSC 125 * @calibrate_tsc: calibrate TSC
126 * @get_wallclock: get time from HW clock like RTC etc. 126 * @get_wallclock: get time from HW clock like RTC etc.
127 * @set_wallclock: set time back to HW clock 127 * @set_wallclock: set time back to HW clock
128 * @is_untracked_pat_range exclude from PAT logic
128 */ 129 */
129struct x86_platform_ops { 130struct x86_platform_ops {
130 unsigned long (*calibrate_tsc)(void); 131 unsigned long (*calibrate_tsc)(void);
131 unsigned long (*get_wallclock)(void); 132 unsigned long (*get_wallclock)(void);
132 int (*set_wallclock)(unsigned long nowtime); 133 int (*set_wallclock)(unsigned long nowtime);
133 void (*iommu_shutdown)(void); 134 void (*iommu_shutdown)(void);
135 bool (*is_untracked_pat_range)(u64 start, u64 end);
134}; 136};
135 137
136extern struct x86_init_ops x86_init; 138extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90c0edf..396ff4cc8ed4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
37extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40enum xen_domain_type {
41 XEN_NATIVE, /* running on bare hardware */
42 XEN_PV_DOMAIN, /* running in a PV domain */
43 XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
44};
45
46#ifdef CONFIG_XEN
47extern enum xen_domain_type xen_domain_type;
48#else
49#define xen_domain_type XEN_NATIVE
50#endif
51
52#define xen_domain() (xen_domain_type != XEN_NATIVE)
53#define xen_pv_domain() (xen_domain() && \
54 xen_domain_type == XEN_PV_DOMAIN)
55#define xen_hvm_domain() (xen_domain() && \
56 xen_domain_type == XEN_HVM_DOMAIN)
57
58#ifdef CONFIG_XEN_DOM0
59#include <xen/interface/xen.h>
60
61#define xen_initial_domain() (xen_pv_domain() && \
62 xen_start_info->flags & SIF_INITDOMAIN)
63#else /* !CONFIG_XEN_DOM0 */
64#define xen_initial_domain() (0)
65#endif /* CONFIG_XEN_DOM0 */
66
67#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 40#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..fb1035cd9a6a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
624 } 624 }
625 625
626 hpet_address = hpet_tbl->address.address; 626 hpet_address = hpet_tbl->address.address;
627 hpet_blockid = hpet_tbl->sequence;
627 628
628 /* 629 /*
629 * Some broken BIOSes advertise HPET at 0x0. We really do not 630 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1123,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1122 if (!acpi_sci_override_gsi) 1123 if (!acpi_sci_override_gsi)
1123 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1124 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
1124 1125
1125 /* Fill in identity legacy mapings where no override */ 1126 /* Fill in identity legacy mappings where no override */
1126 mp_config_acpi_legacy_irqs(); 1127 mp_config_acpi_legacy_irqs();
1127 1128
1128 count = 1129 count =
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..82e508677b91 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 32fb09102a13..b990b5cc9541 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -166,6 +166,43 @@ static void iommu_uninit_device(struct device *dev)
166{ 166{
167 kfree(dev->archdata.iommu); 167 kfree(dev->archdata.iommu);
168} 168}
169
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
169#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
170 207
171/* 208/*
@@ -1587,6 +1624,11 @@ static struct notifier_block device_nb = {
1587 .notifier_call = device_change_notifier, 1624 .notifier_call = device_change_notifier,
1588}; 1625};
1589 1626
1627void amd_iommu_init_notifier(void)
1628{
1629 bus_register_notifier(&pci_bus_type, &device_nb);
1630}
1631
1590/***************************************************************************** 1632/*****************************************************************************
1591 * 1633 *
1592 * The next functions belong to the dma_ops mapping/unmapping code. 1634 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1783,7 +1825,7 @@ retry:
1783 goto out; 1825 goto out;
1784 1826
1785 /* 1827 /*
1786 * aperture was sucessfully enlarged by 128 MB, try 1828 * aperture was successfully enlarged by 128 MB, try
1787 * allocation again 1829 * allocation again
1788 */ 1830 */
1789 goto retry; 1831 goto retry;
@@ -2145,8 +2187,6 @@ static void prealloc_protection_domains(void)
2145 if (!check_device(&dev->dev)) 2187 if (!check_device(&dev->dev))
2146 continue; 2188 continue;
2147 2189
2148 iommu_init_device(&dev->dev);
2149
2150 /* Is there already any domain for it? */ 2190 /* Is there already any domain for it? */
2151 if (domain_for_device(&dev->dev)) 2191 if (domain_for_device(&dev->dev))
2152 continue; 2192 continue;
@@ -2215,8 +2255,6 @@ int __init amd_iommu_init_dma_ops(void)
2215 2255
2216 register_iommu(&amd_iommu_ops); 2256 register_iommu(&amd_iommu_ops);
2217 2257
2218 bus_register_notifier(&pci_bus_type, &device_nb);
2219
2220 amd_iommu_stats_init(); 2258 amd_iommu_stats_init();
2221 2259
2222 return 0; 2260 return 0;
@@ -2490,7 +2528,7 @@ int __init amd_iommu_init_passthrough(void)
2490 struct pci_dev *dev = NULL; 2528 struct pci_dev *dev = NULL;
2491 u16 devid; 2529 u16 devid;
2492 2530
2493 /* allocate passthroug domain */ 2531 /* allocate passthrough domain */
2494 pt_domain = protection_domain_alloc(); 2532 pt_domain = protection_domain_alloc();
2495 if (!pt_domain) 2533 if (!pt_domain)
2496 return -ENOMEM; 2534 return -ENOMEM;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 7ffc39965233..1dca9c34eaeb 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1274,6 +1274,10 @@ static int __init amd_iommu_init(void)
1274 if (ret) 1274 if (ret)
1275 goto free; 1275 goto free;
1276 1276
1277 ret = amd_iommu_init_devices();
1278 if (ret)
1279 goto free;
1280
1277 if (iommu_pass_through) 1281 if (iommu_pass_through)
1278 ret = amd_iommu_init_passthrough(); 1282 ret = amd_iommu_init_passthrough();
1279 else 1283 else
@@ -1281,6 +1285,8 @@ static int __init amd_iommu_init(void)
1281 if (ret) 1285 if (ret)
1282 goto free; 1286 goto free;
1283 1287
1288 amd_iommu_init_notifier();
1289
1284 enable_iommus(); 1290 enable_iommus();
1285 1291
1286 if (iommu_pass_through) 1292 if (iommu_pass_through)
@@ -1296,6 +1302,9 @@ out:
1296 return ret; 1302 return ret;
1297 1303
1298free: 1304free:
1305
1306 amd_iommu_uninit_devices();
1307
1299 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1308 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1300 get_order(MAX_DOMAIN_ID/8)); 1309 get_order(MAX_DOMAIN_ID/8));
1301 1310
@@ -1336,6 +1345,9 @@ void __init amd_iommu_detect(void)
1336 iommu_detected = 1; 1345 iommu_detected = 1;
1337 amd_iommu_detected = 1; 1346 amd_iommu_detected = 1;
1338 x86_init.iommu.iommu_init = amd_iommu_init; 1347 x86_init.iommu.iommu_init = amd_iommu_init;
1348
1349 /* Make sure ACS will be enabled */
1350 pci_request_acs();
1339 } 1351 }
1340} 1352}
1341 1353
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad8c75b9e453..efb2b9cd132c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -647,7 +647,7 @@ static int __init calibrate_APIC_clock(void)
647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
648 648
649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
650 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 650 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
652 calibration_result); 652 calibration_result);
653 653
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index d9acc3bee0f4..e31b9ffe25f5 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -127,7 +127,7 @@ static u32 noop_apic_read(u32 reg)
127 127
128static void noop_apic_write(u32 reg, u32 v) 128static void noop_apic_write(u32 reg, u32 v)
129{ 129{
130 WARN_ON_ONCE((cpu_has_apic || !disable_apic)); 130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131} 131}
132 132
133struct apic apic_noop = { 133struct apic apic_noop = {
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index e85f8fb7f8e7..dd2b5f264643 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
27 * 27 *
28 * http://www.unisys.com 28 * http://www.unisys.com
29 */ 29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
30#include <linux/notifier.h> 33#include <linux/notifier.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/cpumask.h> 35#include <linux/cpumask.h>
@@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr)
223 mip_addr = val; 226 mip_addr = val;
224 mip = (struct mip_reg *)val; 227 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip); 228 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", 229 pr_debug("host_reg = 0x%lx\n",
227 (unsigned long)host_reg); 230 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", 231 pr_debug("mip_reg = 0x%lx\n",
229 (unsigned long)mip_reg); 232 (unsigned long)mip_reg);
230 success++; 233 success++;
231 break; 234 break;
@@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void)
401 if (!es7000_plat) 404 if (!es7000_plat)
402 return; 405 return;
403 406
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); 407 pr_info("Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 408 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC; 409 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID; 410 es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void)
514{ 517{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 518 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516 519
517 printk(KERN_INFO 520 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ? 521 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster", 522 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 523 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c0b4468683f9..d5d498fbee4b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3267,7 +3267,8 @@ void destroy_irq(unsigned int irq)
3267 * MSI message composition 3267 * MSI message composition
3268 */ 3268 */
3269#ifdef CONFIG_PCI_MSI 3269#ifdef CONFIG_PCI_MSI
3270static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3270static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3271 struct msi_msg *msg, u8 hpet_id)
3271{ 3272{
3272 struct irq_cfg *cfg; 3273 struct irq_cfg *cfg;
3273 int err; 3274 int err;
@@ -3301,7 +3302,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3301 irte.dest_id = IRTE_DEST(dest); 3302 irte.dest_id = IRTE_DEST(dest);
3302 3303
3303 /* Set source-id of interrupt request */ 3304 /* Set source-id of interrupt request */
3304 set_msi_sid(&irte, pdev); 3305 if (pdev)
3306 set_msi_sid(&irte, pdev);
3307 else
3308 set_hpet_sid(&irte, hpet_id);
3305 3309
3306 modify_irte(irq, &irte); 3310 modify_irte(irq, &irte);
3307 3311
@@ -3466,7 +3470,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3466 int ret; 3470 int ret;
3467 struct msi_msg msg; 3471 struct msi_msg msg;
3468 3472
3469 ret = msi_compose_msg(dev, irq, &msg); 3473 ret = msi_compose_msg(dev, irq, &msg, -1);
3470 if (ret < 0) 3474 if (ret < 0)
3471 return ret; 3475 return ret;
3472 3476
@@ -3599,7 +3603,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3599 int ret; 3603 int ret;
3600 struct msi_msg msg; 3604 struct msi_msg msg;
3601 3605
3602 ret = msi_compose_msg(NULL, irq, &msg); 3606 ret = msi_compose_msg(NULL, irq, &msg, -1);
3603 if (ret < 0) 3607 if (ret < 0)
3604 return ret; 3608 return ret;
3605 dmar_msi_write(irq, &msg); 3609 dmar_msi_write(irq, &msg);
@@ -3639,6 +3643,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3639 3643
3640#endif /* CONFIG_SMP */ 3644#endif /* CONFIG_SMP */
3641 3645
3646static struct irq_chip ir_hpet_msi_type = {
3647 .name = "IR-HPET_MSI",
3648 .unmask = hpet_msi_unmask,
3649 .mask = hpet_msi_mask,
3650#ifdef CONFIG_INTR_REMAP
3651 .ack = ir_ack_apic_edge,
3652#ifdef CONFIG_SMP
3653 .set_affinity = ir_set_msi_irq_affinity,
3654#endif
3655#endif
3656 .retrigger = ioapic_retrigger_irq,
3657};
3658
3642static struct irq_chip hpet_msi_type = { 3659static struct irq_chip hpet_msi_type = {
3643 .name = "HPET_MSI", 3660 .name = "HPET_MSI",
3644 .unmask = hpet_msi_unmask, 3661 .unmask = hpet_msi_unmask,
@@ -3650,20 +3667,36 @@ static struct irq_chip hpet_msi_type = {
3650 .retrigger = ioapic_retrigger_irq, 3667 .retrigger = ioapic_retrigger_irq,
3651}; 3668};
3652 3669
3653int arch_setup_hpet_msi(unsigned int irq) 3670int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3654{ 3671{
3655 int ret; 3672 int ret;
3656 struct msi_msg msg; 3673 struct msi_msg msg;
3657 struct irq_desc *desc = irq_to_desc(irq); 3674 struct irq_desc *desc = irq_to_desc(irq);
3658 3675
3659 ret = msi_compose_msg(NULL, irq, &msg); 3676 if (intr_remapping_enabled) {
3677 struct intel_iommu *iommu = map_hpet_to_ir(id);
3678 int index;
3679
3680 if (!iommu)
3681 return -1;
3682
3683 index = alloc_irte(iommu, irq, 1);
3684 if (index < 0)
3685 return -1;
3686 }
3687
3688 ret = msi_compose_msg(NULL, irq, &msg, id);
3660 if (ret < 0) 3689 if (ret < 0)
3661 return ret; 3690 return ret;
3662 3691
3663 hpet_msi_write(irq, &msg); 3692 hpet_msi_write(irq, &msg);
3664 desc->status |= IRQ_MOVE_PCNTXT; 3693 desc->status |= IRQ_MOVE_PCNTXT;
3665 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3694 if (irq_remapped(irq))
3666 "edge"); 3695 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3696 handle_edge_irq, "edge");
3697 else
3698 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3699 handle_edge_irq, "edge");
3667 3700
3668 return 0; 3701 return 0;
3669} 3702}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 6389432a9dbf..0159a69396cb 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -361,7 +361,7 @@ void stop_apic_nmi_watchdog(void *unused)
361 */ 361 */
362 362
363static DEFINE_PER_CPU(unsigned, last_irq_sum); 363static DEFINE_PER_CPU(unsigned, last_irq_sum);
364static DEFINE_PER_CPU(local_t, alert_counter); 364static DEFINE_PER_CPU(long, alert_counter);
365static DEFINE_PER_CPU(int, nmi_touch); 365static DEFINE_PER_CPU(int, nmi_touch);
366 366
367void touch_nmi_watchdog(void) 367void touch_nmi_watchdog(void)
@@ -438,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
438 * Ayiee, looks like this CPU is stuck ... 438 * Ayiee, looks like this CPU is stuck ...
439 * wait a few IRQs (5 seconds) before doing the oops ... 439 * wait a few IRQs (5 seconds) before doing the oops ...
440 */ 440 */
441 local_inc(&__get_cpu_var(alert_counter)); 441 __this_cpu_inc(per_cpu_var(alert_counter));
442 if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) 442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz)
443 /* 443 /*
444 * die_nmi will return ONLY if NOTIFY_STOP happens.. 444 * die_nmi will return ONLY if NOTIFY_STOP happens..
445 */ 445 */
@@ -447,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
447 regs, panic_on_timeout); 447 regs, panic_on_timeout);
448 } else { 448 } else {
449 __get_cpu_var(last_irq_sum) = sum; 449 __get_cpu_var(last_irq_sum) = sum;
450 local_set(&__get_cpu_var(alert_counter), 0); 450 __this_cpu_write(per_cpu_var(alert_counter), 0);
451 } 451 }
452 452
453 /* see if the nmi watchdog went off */ 453 /* see if the nmi watchdog went off */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 07cdbdcd7a92..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 130c4b934877..b684bb303cbf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/ipi.h> 31#include <asm/ipi.h>
32#include <asm/smp.h> 32#include <asm/smp.h>
33#include <asm/x86_init.h>
33 34
34DEFINE_PER_CPU(int, x2apic_extra_bits); 35DEFINE_PER_CPU(int, x2apic_extra_bits);
35 36
36static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr;
39
40static inline bool is_GRU_range(u64 start, u64 end)
41{
42 return start >= gru_start_paddr && end <= gru_end_paddr;
43}
44
45static bool uv_is_untracked_pat_range(u64 start, u64 end)
46{
47 return is_ISA_range(start, end) || is_GRU_range(start, end);
48}
37 49
38static int early_get_nodeid(void) 50static int early_get_nodeid(void)
39{ 51{
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 62{
51 if (!strcmp(oem_id, "SGI")) { 63 if (!strcmp(oem_id, "SGI")) {
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
52 if (!strcmp(oem_table_id, "UVL")) 65 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 66 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 67 else if (!strcmp(oem_table_id, "UVX"))
@@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 398 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 399
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 400 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 401 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 402 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
403 gru_start_paddr = ((u64)gru.s.base << shift);
404 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
405
406 }
390} 407}
391 408
392static __init void map_mmr_high(int max_pnode) 409static __init void map_mmr_high(int max_pnode)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a4ec8b647544..20399b7b0c3f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1093,7 +1093,7 @@ static void clear_all_debug_regs(void)
1093 1093
1094void __cpuinit cpu_init(void) 1094void __cpuinit cpu_init(void)
1095{ 1095{
1096 struct orig_ist *orig_ist; 1096 struct orig_ist *oist;
1097 struct task_struct *me; 1097 struct task_struct *me;
1098 struct tss_struct *t; 1098 struct tss_struct *t;
1099 unsigned long v; 1099 unsigned long v;
@@ -1102,7 +1102,7 @@ void __cpuinit cpu_init(void)
1102 1102
1103 cpu = stack_smp_processor_id(); 1103 cpu = stack_smp_processor_id();
1104 t = &per_cpu(init_tss, cpu); 1104 t = &per_cpu(init_tss, cpu);
1105 orig_ist = &per_cpu(orig_ist, cpu); 1105 oist = &per_cpu(orig_ist, cpu);
1106 1106
1107#ifdef CONFIG_NUMA 1107#ifdef CONFIG_NUMA
1108 if (cpu != 0 && percpu_read(node_number) == 0 && 1108 if (cpu != 0 && percpu_read(node_number) == 0 &&
@@ -1136,19 +1136,19 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1136 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1137 barrier();
1138 1138
1139 check_efer(); 1139 x86_configure_nx();
1140 if (cpu != 0) 1140 if (cpu != 0)
1141 enable_x2apic(); 1141 enable_x2apic();
1142 1142
1143 /* 1143 /*
1144 * set up and load the per-CPU TSS 1144 * set up and load the per-CPU TSS
1145 */ 1145 */
1146 if (!orig_ist->ist[0]) { 1146 if (!oist->ist[0]) {
1147 char *estacks = per_cpu(exception_stacks, cpu); 1147 char *estacks = per_cpu(exception_stacks, cpu);
1148 1148
1149 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1149 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1150 estacks += exception_stack_sizes[v]; 1150 estacks += exception_stack_sizes[v];
1151 orig_ist->ist[v] = t->x86_tss.ist[v] = 1151 oist->ist[v] = t->x86_tss.ist[v] =
1152 (unsigned long)estacks; 1152 (unsigned long)estacks;
1153 } 1153 }
1154 } 1154 }
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index dca325c03999..b368cd862997 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,9 +30,9 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpud_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
38 38
@@ -531,7 +531,7 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
531 531
532 /* Already intialized */ 532 /* Already intialized */
533 if (file == CPU_INDEX_BIT) 533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpu_arr[type].init, cpu)) 534 if (per_cpu(cpud_arr[type].init, cpu))
535 return 0; 535 return 0;
536 536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -543,8 +543,8 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
543 priv->reg = reg; 543 priv->reg = reg;
544 priv->file = file; 544 priv->file = file;
545 mutex_lock(&cpu_debug_lock); 545 mutex_lock(&cpu_debug_lock);
546 per_cpu(priv_arr[type], cpu) = priv; 546 per_cpu(cpud_priv_arr[type], cpu) = priv;
547 per_cpu(cpu_priv_count, cpu)++; 547 per_cpu(cpud_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock); 548 mutex_unlock(&cpu_debug_lock);
549 549
550 if (file) 550 if (file)
@@ -552,10 +552,10 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
552 dentry, (void *)priv, &cpu_fops); 552 dentry, (void *)priv, &cpu_fops);
553 else { 553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO, 554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpu_arr[type].dentry, cpu), 555 per_cpu(cpud_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops); 556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock); 557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpu_arr[type].init, cpu) = 1; 558 per_cpu(cpud_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock); 559 mutex_unlock(&cpu_debug_lock);
560 } 560 }
561 561
@@ -615,7 +615,7 @@ static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag)) 615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue; 616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); 617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; 618 per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry;
619 619
620 if (type < CPU_TSS_BIT) 620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry); 621 err = cpu_init_msr(cpu, type, cpu_dentry);
@@ -647,11 +647,11 @@ static int cpu_init_cpu(void)
647 err = cpu_init_allreg(cpu, cpu_dentry); 647 err = cpu_init_allreg(cpu, cpu_dentry);
648 648
649 pr_info("cpu%d(%d) debug files %d\n", 649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); 650 cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu));
651 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { 651 if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n", 652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); 653 per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; 654 per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE; 655 err = -ENFILE;
656 } 656 }
657 if (err) 657 if (err)
@@ -676,8 +676,8 @@ static void __exit cpu_debug_exit(void)
676 debugfs_remove_recursive(cpu_debugfs_dir); 676 debugfs_remove_recursive(cpu_debugfs_dir);
677 677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) 679 for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++)
680 kfree(per_cpu(priv_arr[i], cpu)); 680 kfree(per_cpu(cpud_priv_arr[i], cpu));
681} 681}
682 682
683module_init(cpu_debug_init); 683module_init(cpu_debug_init);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8b581d3905cb..f28decf8dde3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,9 +68,9 @@ struct acpi_cpufreq_data {
68 unsigned int cpu_feature; 68 unsigned int cpu_feature;
69}; 69};
70 70
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
72 72
73static DEFINE_PER_CPU(struct aperfmperf, old_perf); 73static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
74 74
75/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
@@ -214,14 +214,14 @@ static u32 get_cur_val(const struct cpumask *mask)
214 if (unlikely(cpumask_empty(mask))) 214 if (unlikely(cpumask_empty(mask)))
215 return 0; 215 return 0;
216 216
217 switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { 217 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
218 case SYSTEM_INTEL_MSR_CAPABLE: 218 case SYSTEM_INTEL_MSR_CAPABLE:
219 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 219 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
220 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 220 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
221 break; 221 break;
222 case SYSTEM_IO_CAPABLE: 222 case SYSTEM_IO_CAPABLE:
223 cmd.type = SYSTEM_IO_CAPABLE; 223 cmd.type = SYSTEM_IO_CAPABLE;
224 perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; 224 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
225 cmd.addr.io.port = perf->control_register.address; 225 cmd.addr.io.port = perf->control_register.address;
226 cmd.addr.io.bit_width = perf->control_register.bit_width; 226 cmd.addr.io.bit_width = perf->control_register.bit_width;
227 break; 227 break;
@@ -268,8 +268,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) 268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
269 return 0; 269 return 0;
270 270
271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); 271 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
272 per_cpu(old_perf, cpu) = perf; 272 per_cpu(acfreq_old_perf, cpu) = perf;
273 273
274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; 274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
275 275
@@ -278,7 +278,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
278 278
279static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 279static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
280{ 280{
281 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 281 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
282 unsigned int freq; 282 unsigned int freq;
283 unsigned int cached_freq; 283 unsigned int cached_freq;
284 284
@@ -322,7 +322,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
322static int acpi_cpufreq_target(struct cpufreq_policy *policy, 322static int acpi_cpufreq_target(struct cpufreq_policy *policy,
323 unsigned int target_freq, unsigned int relation) 323 unsigned int target_freq, unsigned int relation)
324{ 324{
325 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 325 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
326 struct acpi_processor_performance *perf; 326 struct acpi_processor_performance *perf;
327 struct cpufreq_freqs freqs; 327 struct cpufreq_freqs freqs;
328 struct drv_cmd cmd; 328 struct drv_cmd cmd;
@@ -416,7 +416,7 @@ out:
416 416
417static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 417static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
418{ 418{
419 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 419 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
420 420
421 dprintk("acpi_cpufreq_verify\n"); 421 dprintk("acpi_cpufreq_verify\n");
422 422
@@ -574,7 +574,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
574 return -ENOMEM; 574 return -ENOMEM;
575 575
576 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); 576 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
577 per_cpu(drv_data, cpu) = data; 577 per_cpu(acfreq_data, cpu) = data;
578 578
579 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 579 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
580 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 580 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -725,20 +725,20 @@ err_unreg:
725 acpi_processor_unregister_performance(perf, cpu); 725 acpi_processor_unregister_performance(perf, cpu);
726err_free: 726err_free:
727 kfree(data); 727 kfree(data);
728 per_cpu(drv_data, cpu) = NULL; 728 per_cpu(acfreq_data, cpu) = NULL;
729 729
730 return result; 730 return result;
731} 731}
732 732
733static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 733static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
734{ 734{
735 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 735 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
736 736
737 dprintk("acpi_cpufreq_cpu_exit\n"); 737 dprintk("acpi_cpufreq_cpu_exit\n");
738 738
739 if (data) { 739 if (data) {
740 cpufreq_frequency_table_put_attr(policy->cpu); 740 cpufreq_frequency_table_put_attr(policy->cpu);
741 per_cpu(drv_data, policy->cpu) = NULL; 741 per_cpu(acfreq_data, policy->cpu) = NULL;
742 acpi_processor_unregister_performance(data->acpi_data, 742 acpi_processor_unregister_performance(data->acpi_data,
743 policy->cpu); 743 policy->cpu);
744 kfree(data); 744 kfree(data);
@@ -749,7 +749,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
749 749
750static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 750static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
751{ 751{
752 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 752 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
753 753
754 dprintk("acpi_cpufreq_resume\n"); 754 dprintk("acpi_cpufreq_resume\n");
755 755
@@ -764,14 +764,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
764}; 764};
765 765
766static struct cpufreq_driver acpi_cpufreq_driver = { 766static struct cpufreq_driver acpi_cpufreq_driver = {
767 .verify = acpi_cpufreq_verify, 767 .verify = acpi_cpufreq_verify,
768 .target = acpi_cpufreq_target, 768 .target = acpi_cpufreq_target,
769 .init = acpi_cpufreq_cpu_init, 769 .bios_limit = acpi_processor_get_bios_limit,
770 .exit = acpi_cpufreq_cpu_exit, 770 .init = acpi_cpufreq_cpu_init,
771 .resume = acpi_cpufreq_resume, 771 .exit = acpi_cpufreq_cpu_exit,
772 .name = "acpi-cpufreq", 772 .resume = acpi_cpufreq_resume,
773 .owner = THIS_MODULE, 773 .name = "acpi-cpufreq",
774 .attr = acpi_cpufreq_attr, 774 .owner = THIS_MODULE,
775 .attr = acpi_cpufreq_attr,
775}; 776};
776 777
777static int __init acpi_cpufreq_init(void) 778static int __init acpi_cpufreq_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index cabd2fa3fc93..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..cb01dac267d3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 164 }
165 165
166 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 167 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 168 policy->cur = busfreq * max_multiplier;
169 169
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3f12dabeab52..a9df9441a9a2 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1118static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1119 unsigned targfreq, unsigned relation)
1120{ 1120{
1121 cpumask_t oldmask; 1121 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1123 u32 checkfid;
1124 u32 checkvid; 1124 u32 checkvid;
@@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1131 checkfid = data->currfid;
1132 checkvid = data->currvid; 1132 checkvid = data->currvid;
1133 1133
1134 /* only run on specific CPU from here on */ 1134 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1135 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1136 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1137 return -ENOMEM;
1138
1139 cpumask_copy(oldmask, tsk_cpumask(current));
1140 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1141
1138 if (smp_processor_id() != pol->cpu) { 1142 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1143 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1197 ret = 0;
1194 1198
1195err_out: 1199err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1200 set_cpus_allowed_ptr(current, oldmask);
1201 free_cpumask_var(oldmask);
1197 return ret; 1202 return ret;
1198} 1203}
1199 1204
@@ -1393,14 +1398,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1398};
1394 1399
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1400static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1401 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1402 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1403 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1404 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1405 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1406 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1407 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1408 .owner = THIS_MODULE,
1409 .attr = powernow_k8_attr,
1404}; 1410};
1405 1411
1406/* driver entry point for init */ 1412/* driver entry point for init */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 3ae5a7a3a500..2ce8e0b5cc54 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor; 42static enum speedstep_processor speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..ad0083abfa23 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -34,7 +34,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 35 *********************************************************************/
36 36
37static unsigned int pentium3_get_frequency(unsigned int processor) 37static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 38{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 40 struct {
@@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void)
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */ 229/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 231{
232 switch (processor) { 232 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 233 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 380 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 381 *********************************************************************/
382 382
383unsigned int speedstep_get_freqs(unsigned int processor, 383unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 384 unsigned int *low_speed,
385 unsigned int *high_speed, 385 unsigned int *high_speed,
386 unsigned int *transition_latency, 386 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..04d73c114e49 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -35,7 +35,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor; 38static enum speedstep_processor speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..c900b73f9224 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 263 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 264 for now. */
265 node = apicid_to_node[apicid]; 265 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 266 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 267 node = first_node(node_online_map);
268 else if (!node_online(node)) {
269 /* reuse the value from init_cpu_to_node() */
270 node = cpu_to_node(cpu);
271 }
268 numa_set_node(cpu, node); 272 numa_set_node(cpu, node);
269 273
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); 274 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 0df4c2b7107f..0c06bca2a1dc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
@@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 108 { 0x00, 0, 0}
106}; 109};
107 110
@@ -496,8 +499,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
496#ifdef CONFIG_SYSFS 499#ifdef CONFIG_SYSFS
497 500
498/* pointer to _cpuid4_info array (for each cache leaf) */ 501/* pointer to _cpuid4_info array (for each cache leaf) */
499static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 502static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
500#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 503#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
501 504
502#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
503static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 506static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -510,7 +513,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
510 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 513 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
511 struct cpuinfo_x86 *d; 514 struct cpuinfo_x86 *d;
512 for_each_online_cpu(i) { 515 for_each_online_cpu(i) {
513 if (!per_cpu(cpuid4_info, i)) 516 if (!per_cpu(ici_cpuid4_info, i))
514 continue; 517 continue;
515 d = &cpu_data(i); 518 d = &cpu_data(i);
516 this_leaf = CPUID4_INFO_IDX(i, index); 519 this_leaf = CPUID4_INFO_IDX(i, index);
@@ -532,7 +535,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
532 c->apicid >> index_msb) { 535 c->apicid >> index_msb) {
533 cpumask_set_cpu(i, 536 cpumask_set_cpu(i,
534 to_cpumask(this_leaf->shared_cpu_map)); 537 to_cpumask(this_leaf->shared_cpu_map));
535 if (i != cpu && per_cpu(cpuid4_info, i)) { 538 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
536 sibling_leaf = 539 sibling_leaf =
537 CPUID4_INFO_IDX(i, index); 540 CPUID4_INFO_IDX(i, index);
538 cpumask_set_cpu(cpu, to_cpumask( 541 cpumask_set_cpu(cpu, to_cpumask(
@@ -571,8 +574,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
571 for (i = 0; i < num_cache_leaves; i++) 574 for (i = 0; i < num_cache_leaves; i++)
572 cache_remove_shared_cpu_map(cpu, i); 575 cache_remove_shared_cpu_map(cpu, i);
573 576
574 kfree(per_cpu(cpuid4_info, cpu)); 577 kfree(per_cpu(ici_cpuid4_info, cpu));
575 per_cpu(cpuid4_info, cpu) = NULL; 578 per_cpu(ici_cpuid4_info, cpu) = NULL;
576} 579}
577 580
578static int 581static int
@@ -611,15 +614,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
611 if (num_cache_leaves == 0) 614 if (num_cache_leaves == 0)
612 return -ENOENT; 615 return -ENOENT;
613 616
614 per_cpu(cpuid4_info, cpu) = kzalloc( 617 per_cpu(ici_cpuid4_info, cpu) = kzalloc(
615 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 618 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
616 if (per_cpu(cpuid4_info, cpu) == NULL) 619 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
617 return -ENOMEM; 620 return -ENOMEM;
618 621
619 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 622 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
620 if (retval) { 623 if (retval) {
621 kfree(per_cpu(cpuid4_info, cpu)); 624 kfree(per_cpu(ici_cpuid4_info, cpu));
622 per_cpu(cpuid4_info, cpu) = NULL; 625 per_cpu(ici_cpuid4_info, cpu) = NULL;
623 } 626 }
624 627
625 return retval; 628 return retval;
@@ -631,7 +634,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
631extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ 634extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
632 635
633/* pointer to kobject for cpuX/cache */ 636/* pointer to kobject for cpuX/cache */
634static DEFINE_PER_CPU(struct kobject *, cache_kobject); 637static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
635 638
636struct _index_kobject { 639struct _index_kobject {
637 struct kobject kobj; 640 struct kobject kobj;
@@ -640,8 +643,8 @@ struct _index_kobject {
640}; 643};
641 644
642/* pointer to array of kobjects for cpuX/cache/indexY */ 645/* pointer to array of kobjects for cpuX/cache/indexY */
643static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 646static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
644#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 647#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
645 648
646#define show_one_plus(file_name, object, val) \ 649#define show_one_plus(file_name, object, val) \
647static ssize_t show_##file_name \ 650static ssize_t show_##file_name \
@@ -860,10 +863,10 @@ static struct kobj_type ktype_percpu_entry = {
860 863
861static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) 864static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
862{ 865{
863 kfree(per_cpu(cache_kobject, cpu)); 866 kfree(per_cpu(ici_cache_kobject, cpu));
864 kfree(per_cpu(index_kobject, cpu)); 867 kfree(per_cpu(ici_index_kobject, cpu));
865 per_cpu(cache_kobject, cpu) = NULL; 868 per_cpu(ici_cache_kobject, cpu) = NULL;
866 per_cpu(index_kobject, cpu) = NULL; 869 per_cpu(ici_index_kobject, cpu) = NULL;
867 free_cache_attributes(cpu); 870 free_cache_attributes(cpu);
868} 871}
869 872
@@ -879,14 +882,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
879 return err; 882 return err;
880 883
881 /* Allocate all required memory */ 884 /* Allocate all required memory */
882 per_cpu(cache_kobject, cpu) = 885 per_cpu(ici_cache_kobject, cpu) =
883 kzalloc(sizeof(struct kobject), GFP_KERNEL); 886 kzalloc(sizeof(struct kobject), GFP_KERNEL);
884 if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) 887 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
885 goto err_out; 888 goto err_out;
886 889
887 per_cpu(index_kobject, cpu) = kzalloc( 890 per_cpu(ici_index_kobject, cpu) = kzalloc(
888 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); 891 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
889 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 892 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
890 goto err_out; 893 goto err_out;
891 894
892 return 0; 895 return 0;
@@ -910,7 +913,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
910 if (unlikely(retval < 0)) 913 if (unlikely(retval < 0))
911 return retval; 914 return retval;
912 915
913 retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), 916 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
914 &ktype_percpu_entry, 917 &ktype_percpu_entry,
915 &sys_dev->kobj, "%s", "cache"); 918 &sys_dev->kobj, "%s", "cache");
916 if (retval < 0) { 919 if (retval < 0) {
@@ -924,12 +927,12 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
924 this_object->index = i; 927 this_object->index = i;
925 retval = kobject_init_and_add(&(this_object->kobj), 928 retval = kobject_init_and_add(&(this_object->kobj),
926 &ktype_cache, 929 &ktype_cache,
927 per_cpu(cache_kobject, cpu), 930 per_cpu(ici_cache_kobject, cpu),
928 "index%1lu", i); 931 "index%1lu", i);
929 if (unlikely(retval)) { 932 if (unlikely(retval)) {
930 for (j = 0; j < i; j++) 933 for (j = 0; j < i; j++)
931 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); 934 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
932 kobject_put(per_cpu(cache_kobject, cpu)); 935 kobject_put(per_cpu(ici_cache_kobject, cpu));
933 cpuid4_cache_sysfs_exit(cpu); 936 cpuid4_cache_sysfs_exit(cpu);
934 return retval; 937 return retval;
935 } 938 }
@@ -937,7 +940,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
937 } 940 }
938 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); 941 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
939 942
940 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 943 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
941 return 0; 944 return 0;
942} 945}
943 946
@@ -946,7 +949,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
946 unsigned int cpu = sys_dev->id; 949 unsigned int cpu = sys_dev->id;
947 unsigned long i; 950 unsigned long i;
948 951
949 if (per_cpu(cpuid4_info, cpu) == NULL) 952 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
950 return; 953 return;
951 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) 954 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
952 return; 955 return;
@@ -954,7 +957,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
954 957
955 for (i = 0; i < num_cache_leaves; i++) 958 for (i = 0; i < num_cache_leaves; i++)
956 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); 959 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
957 kobject_put(per_cpu(cache_kobject, cpu)); 960 kobject_put(per_cpu(ici_cache_kobject, cpu));
958 cpuid4_cache_sysfs_exit(cpu); 961 cpuid4_cache_sysfs_exit(cpu);
959} 962}
960 963
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 0bcaa3875863..a8aacd4b513c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1388,13 +1388,14 @@ static void __mcheck_cpu_init_timer(void)
1388 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1389 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
1390 1390
1391 setup_timer(t, mce_start_timer, smp_processor_id());
1392
1391 if (mce_ignore_ce) 1393 if (mce_ignore_ce)
1392 return; 1394 return;
1393 1395
1394 *n = check_interval * HZ; 1396 *n = check_interval * HZ;
1395 if (!*n) 1397 if (!*n)
1396 return; 1398 return;
1397 setup_timer(t, mce_start_timer, smp_processor_id());
1398 t->expires = round_jiffies(jiffies + *n); 1399 t->expires = round_jiffies(jiffies + *n);
1399 add_timer_on(t, smp_processor_id()); 1400 add_timer_on(t, smp_processor_id());
1400} 1401}
@@ -1928,7 +1929,7 @@ error2:
1928 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1929 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1929error: 1930error:
1930 while (--i >= 0) 1931 while (--i >= 0)
1931 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1932 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1932 1933
1933 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1934 sysdev_unregister(&per_cpu(mce_dev, cpu));
1934 1935
@@ -2016,9 +2017,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2016 break; 2017 break;
2017 case CPU_DOWN_FAILED: 2018 case CPU_DOWN_FAILED:
2018 case CPU_DOWN_FAILED_FROZEN: 2019 case CPU_DOWN_FAILED_FROZEN:
2019 t->expires = round_jiffies(jiffies + 2020 if (!mce_ignore_ce && check_interval) {
2021 t->expires = round_jiffies(jiffies +
2020 __get_cpu_var(mce_next_interval)); 2022 __get_cpu_var(mce_next_interval));
2021 add_timer_on(t, cpu); 2023 add_timer_on(t, cpu);
2024 }
2022 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2025 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2023 break; 2026 break;
2024 case CPU_POST_DEAD: 2027 case CPU_POST_DEAD:
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 73c86db5acbe..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
170 return start1 - start2; 170 return start1 - start2;
171} 171}
172 172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
173#define BIOS_BUG_MSG KERN_WARNING \ 208#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 210
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 subtract_range(range, extra_remove_base, 258 subtract_range(range, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 259 extra_remove_base + extra_remove_size - 1);
225 260
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 261 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 262 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end)
265 continue;
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
237 range[i].start, range[i].end + 1); 267 range[i].start, range[i].end + 1);
268 }
238 } 269 }
239 270
240 /* sort the ranges */ 271 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 272 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 273 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 274 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 275 for (i = 0; i < nr_range; i++)
@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 720 continue;
690 if (!size) 721 if (!size)
691 type = MTRR_NUM_TYPES; 722 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 723 num[type]++;
695 } 724 }
696 725
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c1bbed1021d9..45506d5dd8df 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1286,7 +1286,7 @@ x86_perf_event_set_period(struct perf_event *event,
1286 return 0; 1286 return 0;
1287 1287
1288 /* 1288 /*
1289 * If we are way outside a reasoable range then just skip forward: 1289 * If we are way outside a reasonable range then just skip forward:
1290 */ 1290 */
1291 if (unlikely(left <= -period)) { 1291 if (unlikely(left <= -period)) {
1292 left = period; 1292 left = period;
@@ -1632,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1632 1632
1633 data.period = event->hw.last_period; 1633 data.period = event->hw.last_period;
1634 data.addr = 0; 1634 data.addr = 0;
1635 data.raw = NULL;
1635 regs.ip = 0; 1636 regs.ip = 0;
1636 1637
1637 /* 1638 /*
@@ -1749,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1749 u64 val; 1750 u64 val;
1750 1751
1751 data.addr = 0; 1752 data.addr = 0;
1753 data.raw = NULL;
1752 1754
1753 cpuc = &__get_cpu_var(cpu_hw_events); 1755 cpuc = &__get_cpu_var(cpu_hw_events);
1754 1756
@@ -1794,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1794 u64 ack, status; 1796 u64 ack, status;
1795 1797
1796 data.addr = 0; 1798 data.addr = 0;
1799 data.raw = NULL;
1797 1800
1798 cpuc = &__get_cpu_var(cpu_hw_events); 1801 cpuc = &__get_cpu_var(cpu_hw_events);
1799 1802
@@ -1857,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1857 u64 val; 1860 u64 val;
1858 1861
1859 data.addr = 0; 1862 data.addr = 0;
1863 data.raw = NULL;
1860 1864
1861 cpuc = &__get_cpu_var(cpu_hw_events); 1865 cpuc = &__get_cpu_var(cpu_hw_events);
1862 1866
@@ -2062,12 +2066,6 @@ static __init int p6_pmu_init(void)
2062 2066
2063 x86_pmu = p6_pmu; 2067 x86_pmu = p6_pmu;
2064 2068
2065 if (!cpu_has_apic) {
2066 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2067 pr_info("no hardware sampling interrupt available.\n");
2068 x86_pmu.apic = 0;
2069 }
2070
2071 return 0; 2069 return 0;
2072} 2070}
2073 2071
@@ -2159,6 +2157,16 @@ static __init int amd_pmu_init(void)
2159 return 0; 2157 return 0;
2160} 2158}
2161 2159
2160static void __init pmu_check_apic(void)
2161{
2162 if (cpu_has_apic)
2163 return;
2164
2165 x86_pmu.apic = 0;
2166 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2167 pr_info("no hardware sampling interrupt available.\n");
2168}
2169
2162void __init init_hw_perf_events(void) 2170void __init init_hw_perf_events(void)
2163{ 2171{
2164 int err; 2172 int err;
@@ -2180,6 +2188,8 @@ void __init init_hw_perf_events(void)
2180 return; 2188 return;
2181 } 2189 }
2182 2190
2191 pmu_check_apic();
2192
2183 pr_cont("%s PMU driver.\n", x86_pmu.name); 2193 pr_cont("%s PMU driver.\n", x86_pmu.name);
2184 2194
2185 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 2195 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2287,7 +2297,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2287 2297
2288static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 2298static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2289static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 2299static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2290static DEFINE_PER_CPU(int, in_nmi_frame); 2300static DEFINE_PER_CPU(int, in_ignored_frame);
2291 2301
2292 2302
2293static void 2303static void
@@ -2303,8 +2313,9 @@ static void backtrace_warning(void *data, char *msg)
2303 2313
2304static int backtrace_stack(void *data, char *name) 2314static int backtrace_stack(void *data, char *name)
2305{ 2315{
2306 per_cpu(in_nmi_frame, smp_processor_id()) = 2316 per_cpu(in_ignored_frame, smp_processor_id()) =
2307 x86_is_stack_id(NMI_STACK, name); 2317 x86_is_stack_id(NMI_STACK, name) ||
2318 x86_is_stack_id(DEBUG_STACK, name);
2308 2319
2309 return 0; 2320 return 0;
2310} 2321}
@@ -2313,7 +2324,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2313{ 2324{
2314 struct perf_callchain_entry *entry = data; 2325 struct perf_callchain_entry *entry = data;
2315 2326
2316 if (per_cpu(in_nmi_frame, smp_processor_id())) 2327 if (per_cpu(in_ignored_frame, smp_processor_id()))
2317 return; 2328 return;
2318 2329
2319 if (reliable) 2330 if (reliable)
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ef42a038f1a6..1c47390dd0e5 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -265,13 +265,13 @@ struct ds_context {
265 int cpu; 265 int cpu;
266}; 266};
267 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context); 268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269 269
270 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu) 271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{ 272{
273 struct ds_context **p_context = 273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
277 277
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 8e740934bd1f..b13af53883aa 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -103,6 +103,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
103 return NULL; 103 return NULL;
104} 104}
105 105
106static inline int
107in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
108 unsigned long *irq_stack_end)
109{
110 return (stack >= irq_stack && stack < irq_stack_end);
111}
112
113/*
114 * We are returning from the irq stack and go to the previous one.
115 * If the previous stack is also in the irq stack, then bp in the first
116 * frame of the irq stack points to the previous, interrupted one.
117 * Otherwise we have another level of indirection: We first save
118 * the bp of the previous stack, then we switch the stack to the irq one
119 * and save a new bp that links to the previous one.
120 * (See save_args())
121 */
122static inline unsigned long
123fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
124 unsigned long *irq_stack, unsigned long *irq_stack_end)
125{
126#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp;
128
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end))
130 return (unsigned long)frame->next_frame;
131#endif
132 return bp;
133}
134
106/* 135/*
107 * x86-64 can have up to three kernel stacks: 136 * x86-64 can have up to three kernel stacks:
108 * process stack 137 * process stack
@@ -175,7 +204,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
175 irq_stack = irq_stack_end - 204 irq_stack = irq_stack_end -
176 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 205 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
177 206
178 if (stack >= irq_stack && stack < irq_stack_end) { 207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
179 if (ops->stack(data, "IRQ") < 0) 208 if (ops->stack(data, "IRQ") < 0)
180 break; 209 break;
181 bp = print_context_stack(tinfo, stack, bp, 210 bp = print_context_stack(tinfo, stack, bp,
@@ -186,6 +215,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
186 * pointer (index -1 to end) in the IRQ stack: 215 * pointer (index -1 to end) in the IRQ stack:
187 */ 216 */
188 stack = (unsigned long *) (irq_stack_end[-1]); 217 stack = (unsigned long *) (irq_stack_end[-1]);
218 bp = fixup_bp_irq_link(bp, stack, irq_stack,
219 irq_stack_end);
189 irq_stack_end = NULL; 220 irq_stack_end = NULL;
190 ops->stack(data, "EOI"); 221 ops->stack(data, "EOI");
191 continue; 222 continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4deb8fc849dd..673f693fb451 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -977,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
977#endif 977#endif
978apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
979 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
980apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
981 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
982 982
983#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1076,10 +1076,10 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1076 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1077 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1078 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %rbp) 1079 PER_CPU(init_tss, %r12)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5a1b9758fd62..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -189,9 +189,26 @@ static void wait_for_nmi(void)
189 nmi_wait_count++; 189 nmi_wait_count++;
190} 190}
191 191
192static inline int
193within(unsigned long addr, unsigned long start, unsigned long end)
194{
195 return addr >= start && addr < end;
196}
197
192static int 198static int
193do_ftrace_mod_code(unsigned long ip, void *new_code) 199do_ftrace_mod_code(unsigned long ip, void *new_code)
194{ 200{
201 /*
202 * On x86_64, kernel text mappings are mapped read-only with
203 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
204 * of the kernel text mapping to modify the kernel text.
205 *
206 * For 32bit kernels, these mappings are same and we can use
207 * kernel identity mapping to modify code.
208 */
209 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
210 ip = (unsigned long)__va(__pa(ip));
211
195 mod_code_ip = (void *)ip; 212 mod_code_ip = (void *)ip;
196 mod_code_newcode = new_code; 213 mod_code_newcode = new_code;
197 214
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..7fd318bac59c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 22db86a37643..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ba6e65884603 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,7 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */
36#ifdef CONFIG_PCI_MSI 37#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 38static unsigned long hpet_num_timers;
38#endif 39#endif
@@ -47,12 +48,12 @@ struct hpet_dev {
47 char name[10]; 48 char name[10];
48}; 49};
49 50
50unsigned long hpet_readl(unsigned long a) 51inline unsigned int hpet_readl(unsigned int a)
51{ 52{
52 return readl(hpet_virt_address + a); 53 return readl(hpet_virt_address + a);
53} 54}
54 55
55static inline void hpet_writel(unsigned long d, unsigned long a) 56static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 57{
57 writel(d, hpet_virt_address + a); 58 writel(d, hpet_virt_address + a);
58} 59}
@@ -167,7 +168,7 @@ do { \
167 168
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 169static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 170
170static void hpet_reserve_platform_timers(unsigned long id) 171static void hpet_reserve_platform_timers(unsigned int id)
171{ 172{
172 struct hpet __iomem *hpet = hpet_virt_address; 173 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 174 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +206,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 206
206} 207}
207#else 208#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 209static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 210#endif
210 211
211/* 212/*
@@ -246,7 +247,7 @@ static void hpet_reset_counter(void)
246 247
247static void hpet_start_counter(void) 248static void hpet_start_counter(void)
248{ 249{
249 unsigned long cfg = hpet_readl(HPET_CFG); 250 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 251 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 252 hpet_writel(cfg, HPET_CFG);
252} 253}
@@ -271,7 +272,7 @@ static void hpet_resume_counter(void)
271 272
272static void hpet_enable_legacy_int(void) 273static void hpet_enable_legacy_int(void)
273{ 274{
274 unsigned long cfg = hpet_readl(HPET_CFG); 275 unsigned int cfg = hpet_readl(HPET_CFG);
275 276
276 cfg |= HPET_CFG_LEGACY; 277 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 278 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +315,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 315static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 316 struct clock_event_device *evt, int timer)
316{ 317{
317 unsigned long cfg, cmp, now; 318 unsigned int cfg, cmp, now;
318 uint64_t delta; 319 uint64_t delta;
319 320
320 switch (mode) { 321 switch (mode) {
@@ -323,7 +324,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 324 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 325 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 326 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 327 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 328 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 329 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 330 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +340,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 340 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 341 * Publication # 24674)
341 */ 342 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 343 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 344 hpet_start_counter();
344 hpet_print_config(); 345 hpet_print_config();
345 break; 346 break;
@@ -383,13 +384,24 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 384 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 385
385 /* 386 /*
386 * We need to read back the CMP register to make sure that 387 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 388 * implementations (ATI chipsets) which seem to delay the
388 * counter. 389 * transfer of the compare register into the internal compare
390 * logic. With small deltas this might actually be too late as
391 * the counter could already be higher than the compare value
392 * at that point and we would wait for the next hpet interrupt
393 * forever. We found out that reading the CMP register back
394 * forces the transfer so we can rely on the comparison with
395 * the counter register below. If the read back from the
396 * compare register does not match the value we programmed
397 * then we might have a real hardware problem. We can not do
398 * much about it here, but at least alert the user/admin with
399 * a prominent warning.
389 */ 400 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 401 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
402 KERN_WARNING "hpet: compare register read back failed.\n");
391 403
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 404 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 405}
394 406
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 407static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +427,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 427void hpet_msi_unmask(unsigned int irq)
416{ 428{
417 struct hpet_dev *hdev = get_irq_data(irq); 429 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 430 unsigned int cfg;
419 431
420 /* unmask it */ 432 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 433 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +437,7 @@ void hpet_msi_unmask(unsigned int irq)
425 437
426void hpet_msi_mask(unsigned int irq) 438void hpet_msi_mask(unsigned int irq)
427{ 439{
428 unsigned long cfg; 440 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 441 struct hpet_dev *hdev = get_irq_data(irq);
430 442
431 /* mask it */ 443 /* mask it */
@@ -467,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 479
468static int hpet_setup_msi_irq(unsigned int irq) 480static int hpet_setup_msi_irq(unsigned int irq)
469{ 481{
470 if (arch_setup_hpet_msi(irq)) { 482 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 483 destroy_irq(irq);
472 return -EINVAL; 484 return -EINVAL;
473 } 485 }
@@ -584,6 +596,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 596 unsigned int num_timers_used = 0;
585 int i; 597 int i;
586 598
599 if (boot_cpu_has(X86_FEATURE_ARAT))
600 return;
587 id = hpet_readl(HPET_ID); 601 id = hpet_readl(HPET_ID);
588 602
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 603 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +612,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 612
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 613 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 614 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 615 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 616
603 /* Only consider HPET timer with MSI support */ 617 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 618 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +827,7 @@ static int hpet_clocksource_register(void)
813 */ 827 */
814int __init hpet_enable(void) 828int __init hpet_enable(void)
815{ 829{
816 unsigned long id; 830 unsigned int id;
817 int i; 831 int i;
818 832
819 if (!is_hpet_capable()) 833 if (!is_hpet_capable())
@@ -872,10 +886,8 @@ int __init hpet_enable(void)
872 886
873 if (id & HPET_ID_LEGSUP) { 887 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 888 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 889 return 1;
877 } 890 }
878 hpet_msi_capability_lookup(0);
879 return 0; 891 return 0;
880 892
881out_nohpet: 893out_nohpet:
@@ -908,9 +920,17 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 920 if (!hpet_virt_address)
909 return -ENODEV; 921 return -ENODEV;
910 922
923 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
924 hpet_msi_capability_lookup(2);
925 else
926 hpet_msi_capability_lookup(0);
927
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 928 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 929 hpet_print_config();
913 930
931 if (boot_cpu_has(X86_FEATURE_ARAT))
932 return 0;
933
914 for_each_online_cpu(cpu) { 934 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 935 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 936 }
@@ -925,7 +945,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 945void hpet_disable(void)
926{ 946{
927 if (is_hpet_capable()) { 947 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 948 unsigned int cfg = hpet_readl(HPET_CFG);
929 949
930 if (hpet_legacy_int_enabled) { 950 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 951 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +985,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 985static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 986static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 987static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 988static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 989static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 990static unsigned long hpet_pie_limit;
971 991
972static rtc_irq_handler irq_handler; 992static rtc_irq_handler irq_handler;
@@ -1017,7 +1037,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1037 */
1018int hpet_rtc_timer_init(void) 1038int hpet_rtc_timer_init(void)
1019{ 1039{
1020 unsigned long cfg, cnt, delta, flags; 1040 unsigned int cfg, cnt, delta;
1041 unsigned long flags;
1021 1042
1022 if (!is_hpet_enabled()) 1043 if (!is_hpet_enabled())
1023 return 0; 1044 return 0;
@@ -1027,7 +1048,7 @@ int hpet_rtc_timer_init(void)
1027 1048
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1049 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1050 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1051 hpet_default_delta = clc;
1031 } 1052 }
1032 1053
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1054 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1134,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1134 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1135 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1136 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1137 hpet_pie_delta = clc;
1117 } 1138 }
1118 return 1; 1139 return 1;
1119} 1140}
@@ -1127,7 +1148,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1148
1128static void hpet_rtc_timer_reinit(void) 1149static void hpet_rtc_timer_reinit(void)
1129{ 1150{
1130 unsigned long cfg, delta; 1151 unsigned int cfg, delta;
1131 int lost_ints = -1; 1152 int lost_ints = -1;
1132 1153
1133 if (unlikely(!hpet_rtc_flags)) { 1154 if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d42f65ac4927..05d5fec64a94 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -362,8 +362,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
362 return ret; 362 return ret;
363 } 363 }
364 364
365 if (bp->callback) 365 ret = arch_store_info(bp);
366 ret = arch_store_info(bp);
367 366
368 if (ret < 0) 367 if (ret < 0)
369 return ret; 368 return ret;
@@ -519,7 +518,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
519 break; 518 break;
520 } 519 }
521 520
522 (bp->callback)(bp, args->regs); 521 perf_bp_event(bp, args->regs);
523 522
524 rcu_read_unlock(); 523 rcu_read_unlock();
525 } 524 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index fee6cc2b2079..664bcb7384ac 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -187,8 +187,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 20a5b3689463..dd74fe7273b1 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -86,9 +86,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
86 gdb_regs[GDB_DS] = regs->ds; 86 gdb_regs[GDB_DS] = regs->ds;
87 gdb_regs[GDB_ES] = regs->es; 87 gdb_regs[GDB_ES] = regs->es;
88 gdb_regs[GDB_CS] = regs->cs; 88 gdb_regs[GDB_CS] = regs->cs;
89 gdb_regs[GDB_SS] = __KERNEL_DS;
90 gdb_regs[GDB_FS] = 0xFFFF; 89 gdb_regs[GDB_FS] = 0xFFFF;
91 gdb_regs[GDB_GS] = 0xFFFF; 90 gdb_regs[GDB_GS] = 0xFFFF;
91 if (user_mode_vm(regs)) {
92 gdb_regs[GDB_SS] = regs->ss;
93 gdb_regs[GDB_SP] = regs->sp;
94 } else {
95 gdb_regs[GDB_SS] = __KERNEL_DS;
96 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
97 }
92#else 98#else
93 gdb_regs[GDB_R8] = regs->r8; 99 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 100 gdb_regs[GDB_R9] = regs->r9;
@@ -101,8 +107,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 107 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 108 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 109 gdb_regs32[GDB_SS] = regs->ss;
104#endif
105 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
111#endif
106} 112}
107 113
108/** 114/**
@@ -220,8 +226,7 @@ static void kgdb_correct_hw_break(void)
220 dr7 |= ((breakinfo[breakno].len << 2) | 226 dr7 |= ((breakinfo[breakno].len << 2) |
221 breakinfo[breakno].type) << 227 breakinfo[breakno].type) <<
222 ((breakno << 2) + 16); 228 ((breakno << 2) + 16);
223 if (breakno >= 0 && breakno <= 3) 229 set_debugreg(breakinfo[breakno].addr, breakno);
224 set_debugreg(breakinfo[breakno].addr, breakno);
225 230
226 } else { 231 } else {
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 232 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
@@ -395,7 +400,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 400 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 401 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 402 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 403 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 404 raw_smp_processor_id());
401 } 405 }
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1f3186ce213c..5b8c7505b3bc 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -481,7 +481,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
481 481
482/* 482/*
483 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 483 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
484 * remain disabled thorough out this function. 484 * remain disabled throughout this function.
485 */ 485 */
486static int __kprobes kprobe_handler(struct pt_regs *regs) 486static int __kprobes kprobe_handler(struct pt_regs *regs)
487{ 487{
@@ -818,7 +818,7 @@ no_change:
818 818
819/* 819/*
820 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 820 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
821 * remain disabled thoroughout this function. 821 * remain disabled throughout this function.
822 */ 822 */
823static int __kprobes post_kprobe_handler(struct pt_regs *regs) 823static int __kprobes post_kprobe_handler(struct pt_regs *regs)
824{ 824{
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c843f8406da2..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -158,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
158{ 158{
159 int error; 159 int error;
160 160
161 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
162 set_pages_x(image->control_code_page, 1);
163 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
164 if (error) 163 if (error)
165 return error; 164 return error;
@@ -173,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
173 */ 172 */
174void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
175{ 174{
176 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
177 set_pages_nx(image->control_code_page, 1);
178 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
179} 177}
180 178
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index f4c538b681ca..37542b67c57e 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -33,6 +36,9 @@ MODULE_LICENSE("GPL v2");
33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
34#define UCODE_UCODE_TYPE 0x00000001 37#define UCODE_UCODE_TYPE 0x00000001
35 38
39const struct firmware *firmware;
40static int supported_cpu;
41
36struct equiv_cpu_entry { 42struct equiv_cpu_entry {
37 u32 installed_cpu; 43 u32 installed_cpu;
38 u32 fixed_errata_mask; 44 u32 fixed_errata_mask;
@@ -71,17 +77,14 @@ static struct equiv_cpu_entry *equiv_cpu_table;
71 77
72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 78static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
73{ 79{
74 struct cpuinfo_x86 *c = &cpu_data(cpu);
75 u32 dummy; 80 u32 dummy;
76 81
77 memset(csig, 0, sizeof(*csig)); 82 if (!supported_cpu)
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86);
81 return -1; 83 return -1;
82 } 84
85 memset(csig, 0, sizeof(*csig));
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -308,33 +294,26 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
308 294
309static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
310{ 296{
311 const char *fw_name = "amd-ucode/microcode_amd.bin";
312 const struct firmware *firmware;
313 enum ucode_state ret; 297 enum ucode_state ret;
314 298
315 if (request_firmware(&firmware, fw_name, device)) { 299 if (firmware == NULL)
316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
317 return UCODE_NFOUND; 300 return UCODE_NFOUND;
318 }
319 301
320 if (*(u32 *)firmware->data != UCODE_MAGIC) { 302 if (*(u32 *)firmware->data != UCODE_MAGIC) {
321 printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", 303 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
322 *(u32 *)firmware->data); 304 *(u32 *)firmware->data);
323 return UCODE_ERROR; 305 return UCODE_ERROR;
324 } 306 }
325 307
326 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 308 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
327 309
328 release_firmware(firmware);
329
330 return ret; 310 return ret;
331} 311}
332 312
333static enum ucode_state 313static enum ucode_state
334request_microcode_user(int cpu, const void __user *buf, size_t size) 314request_microcode_user(int cpu, const void __user *buf, size_t size)
335{ 315{
336 printk(KERN_INFO "microcode: AMD microcode update via " 316 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
337 "/dev/cpu/microcode not supported\n");
338 return UCODE_ERROR; 317 return UCODE_ERROR;
339} 318}
340 319
@@ -346,7 +325,31 @@ static void microcode_fini_cpu_amd(int cpu)
346 uci->mc = NULL; 325 uci->mc = NULL;
347} 326}
348 327
328void init_microcode_amd(struct device *device)
329{
330 const char *fw_name = "amd-ucode/microcode_amd.bin";
331 struct cpuinfo_x86 *c = &boot_cpu_data;
332
333 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
334
335 if (c->x86 < 0x10) {
336 pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
337 return;
338 }
339 supported_cpu = 1;
340
341 if (request_firmware(&firmware, fw_name, device))
342 pr_err("failed to load file %s\n", fw_name);
343}
344
345void fini_microcode_amd(void)
346{
347 release_firmware(firmware);
348}
349
349static struct microcode_ops microcode_amd_ops = { 350static struct microcode_ops microcode_amd_ops = {
351 .init = init_microcode_amd,
352 .fini = fini_microcode_amd,
350 .request_microcode_user = request_microcode_user, 353 .request_microcode_user = request_microcode_user,
351 .request_microcode_fw = request_microcode_fw, 354 .request_microcode_fw = request_microcode_fw,
352 .collect_cpu_info = collect_cpu_info_amd, 355 .collect_cpu_info = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 2bcad3926edb..844c02c65fcb 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
@@ -209,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
209 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
210 213
211 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
212 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
213 return ret; 216 return ret;
214 } 217 }
215 218
@@ -244,7 +247,7 @@ static int __init microcode_dev_init(void)
244 247
245 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
246 if (error) { 249 if (error) {
247 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
248 return error; 251 return error;
249 } 252 }
250 253
@@ -359,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
359 if (!uci->mc) 362 if (!uci->mc)
360 return UCODE_NFOUND; 363 return UCODE_NFOUND;
361 364
362 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
363 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
364 367
365 return UCODE_OK; 368 return UCODE_OK;
@@ -379,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
379 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
380 383
381 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
382 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
383 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
384 } 387 }
385 388
@@ -391,7 +394,7 @@ static enum ucode_state microcode_update_cpu(int cpu)
391 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 394 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
392 enum ucode_state ustate; 395 enum ucode_state ustate;
393 396
394 if (uci->valid) 397 if (uci->valid && uci->mc)
395 ustate = microcode_resume_cpu(cpu); 398 ustate = microcode_resume_cpu(cpu);
396 else 399 else
397 ustate = microcode_init_cpu(cpu); 400 ustate = microcode_init_cpu(cpu);
@@ -406,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
406 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
407 return 0; 410 return 0;
408 411
409 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
410 413
411 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
412 if (err) 415 if (err)
@@ -425,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
425 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
426 return 0; 429 return 0;
427 430
428 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
429 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
430 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
431 return 0; 434 return 0;
@@ -473,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
473 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
474 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
475 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
476 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
477 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
478 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
479 break; 482 break;
480 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
481 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
482 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
483 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
484 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
485 break; 488 break;
486 case CPU_DEAD: 489 case CPU_DEAD:
487 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -507,7 +510,7 @@ static int __init microcode_init(void)
507 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
508 511
509 if (!microcode_ops) { 512 if (!microcode_ops) {
510 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
511 return -ENODEV; 514 return -ENODEV;
512 } 515 }
513 516
@@ -518,6 +521,9 @@ static int __init microcode_init(void)
518 return PTR_ERR(microcode_pdev); 521 return PTR_ERR(microcode_pdev);
519 } 522 }
520 523
524 if (microcode_ops->init)
525 microcode_ops->init(&microcode_pdev->dev);
526
521 get_online_cpus(); 527 get_online_cpus();
522 mutex_lock(&microcode_mutex); 528 mutex_lock(&microcode_mutex);
523 529
@@ -538,8 +544,7 @@ static int __init microcode_init(void)
538 register_hotcpu_notifier(&mc_cpu_notifier); 544 register_hotcpu_notifier(&mc_cpu_notifier);
539 545
540 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 546 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
541 " <tigran@aivazian.fsnet.co.uk>," 547 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
542 " Peter Oruba\n");
543 548
544 return 0; 549 return 0;
545} 550}
@@ -561,6 +566,9 @@ static void __exit microcode_exit(void)
561 566
562 platform_device_unregister(microcode_pdev); 567 platform_device_unregister(microcode_pdev);
563 568
569 if (microcode_ops->fini)
570 microcode_ops->fini();
571
564 microcode_ops = NULL; 572 microcode_ops = NULL;
565 573
566 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 574 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..ebd193e476ca 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..35a57c963df9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 667 */
668} 668}
669 669
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 671{
672 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 673
685 if (mpf->physptr < end) { 674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 675}
694 676
695static int __init smp_scan_config(unsigned long base, unsigned long length, 677static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 678{
698 unsigned int *bp = phys_to_virt(base); 679 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 680 struct mpf_intel *mpf;
681 unsigned long mem;
700 682
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 683 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 684 bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 699 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 700 mpf, (u64)virt_to_phys(mpf));
719 701
720 if (!reserve) 702 mem = virt_to_phys(mpf);
721 return 1; 703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 704 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 705 smp_reserve_memory(mpf);
726 706
727 return 1; 707 return 1;
728 } 708 }
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 712 return 0;
733} 713}
734 714
735void __init default_find_smp_config(unsigned int reserve) 715void __init default_find_smp_config(void)
736{ 716{
737 unsigned int address; 717 unsigned int address;
738 718
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 724 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 725 * 3) Scan the 64K of bios
746 */ 726 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 727 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 728 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 729 smp_scan_config(0xF0000, 0x10000))
750 return; 730 return;
751 /* 731 /*
752 * If it is an SMP machine we should know now, unless the 732 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 747
768 address = get_bios_ebda(); 748 address = get_bios_ebda();
769 if (address) 749 if (address)
770 smp_scan_config(address, 0x400, reserve); 750 smp_scan_config(address, 0x400);
771} 751}
772 752
773#ifdef CONFIG_X86_IO_APIC 753#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 744508e7cfdd..7a7bd4e3ec49 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,9 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
13#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
14#include <asm/system.h> 17#include <asm/system.h>
@@ -89,6 +92,25 @@ void exit_thread(void)
89 } 92 }
90} 93}
91 94
95void show_regs_common(void)
96{
97 const char *board, *product;
98
99 board = dmi_get_system_info(DMI_BOARD_NAME);
100 if (!board)
101 board = "";
102 product = dmi_get_system_info(DMI_PRODUCT_NAME);
103 if (!product)
104 product = "";
105
106 printk("\n");
107 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
108 current->pid, current->comm, print_tainted(),
109 init_utsname()->release,
110 (int)strcspn(init_utsname()->version, " "),
111 init_utsname()->version, board, product);
112}
113
92void flush_thread(void) 114void flush_thread(void)
93{ 115{
94 struct task_struct *tsk = current; 116 struct task_struct *tsk = current;
@@ -209,6 +231,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
209 */ 231 */
210 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 232 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
211 } 233 }
234 propagate_user_return_notify(prev_p, next_p);
212} 235}
213 236
214int sys_fork(struct pt_regs *regs) 237int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 075580b35682..120b88797a75 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -128,7 +126,6 @@ void __show_regs(struct pt_regs *regs, int all)
128 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
129 unsigned long sp; 127 unsigned long sp;
130 unsigned short ss, gs; 128 unsigned short ss, gs;
131 const char *board;
132 129
133 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
134 sp = regs->sp; 131 sp = regs->sp;
@@ -140,16 +137,7 @@ void __show_regs(struct pt_regs *regs, int all)
140 savesegment(gs, gs); 137 savesegment(gs, gs);
141 } 138 }
142 139
143 printk("\n"); 140 show_regs_common();
144
145 board = dmi_get_system_info(DMI_PRODUCT_NAME);
146 if (!board)
147 board = "";
148 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
149 task_pid_nr(current), current->comm,
150 print_tainted(), init_utsname()->release,
151 (int)strcspn(init_utsname()->version, " "),
152 init_utsname()->version, board);
153 141
154 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 142 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
155 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a98fe88fab64..e5ab0cd0ef36 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -163,18 +161,8 @@ void __show_regs(struct pt_regs *regs, int all)
163 unsigned long d0, d1, d2, d3, d6, d7; 161 unsigned long d0, d1, d2, d3, d6, d7;
164 unsigned int fsindex, gsindex; 162 unsigned int fsindex, gsindex;
165 unsigned int ds, cs, es; 163 unsigned int ds, cs, es;
166 const char *board; 164
167 165 show_regs_common();
168 printk("\n");
169 print_modules();
170 board = dmi_get_system_info(DMI_PRODUCT_NAME);
171 if (!board)
172 board = "";
173 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
174 current->pid, current->comm, print_tainted(),
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version, board);
178 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 166 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
179 printk_address(regs->ip, 1); 167 printk_address(regs->ip, 1);
180 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 168 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
@@ -349,26 +337,42 @@ out:
349 return err; 337 return err;
350} 338}
351 339
352void 340static void
353start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 341start_thread_common(struct pt_regs *regs, unsigned long new_ip,
342 unsigned long new_sp,
343 unsigned int _cs, unsigned int _ss, unsigned int _ds)
354{ 344{
355 loadsegment(fs, 0); 345 loadsegment(fs, 0);
356 loadsegment(es, 0); 346 loadsegment(es, _ds);
357 loadsegment(ds, 0); 347 loadsegment(ds, _ds);
358 load_gs_index(0); 348 load_gs_index(0);
359 regs->ip = new_ip; 349 regs->ip = new_ip;
360 regs->sp = new_sp; 350 regs->sp = new_sp;
361 percpu_write(old_rsp, new_sp); 351 percpu_write(old_rsp, new_sp);
362 regs->cs = __USER_CS; 352 regs->cs = _cs;
363 regs->ss = __USER_DS; 353 regs->ss = _ss;
364 regs->flags = 0x200; 354 regs->flags = X86_EFLAGS_IF;
365 set_fs(USER_DS); 355 set_fs(USER_DS);
366 /* 356 /*
367 * Free the old FP and other extended state 357 * Free the old FP and other extended state
368 */ 358 */
369 free_thread_xstate(current); 359 free_thread_xstate(current);
370} 360}
371EXPORT_SYMBOL_GPL(start_thread); 361
362void
363start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
364{
365 start_thread_common(regs, new_ip, new_sp,
366 __USER_CS, __USER_DS, 0);
367}
368
369#ifdef CONFIG_IA32_EMULATION
370void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
371{
372 start_thread_common(regs, new_ip, new_sp,
373 __USER32_CS, __USER32_DS, __USER32_DS);
374}
375#endif
372 376
373/* 377/*
374 * switch_to(x,y) should switch tasks from x to y. 378 * switch_to(x,y) should switch tasks from x to y.
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 04d182a7cfdb..7079ddaf0731 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -555,7 +555,9 @@ static int genregs_set(struct task_struct *target,
555 return ret; 555 return ret;
556} 556}
557 557
558static void ptrace_triggered(struct perf_event *bp, void *data) 558static void ptrace_triggered(struct perf_event *bp, int nmi,
559 struct perf_sample_data *data,
560 struct pt_regs *regs)
559{ 561{
560 int i; 562 int i;
561 struct thread_struct *thread = &(current->thread); 563 struct thread_struct *thread = &(current->thread);
@@ -593,13 +595,13 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
593 return dr7; 595 return dr7;
594} 596}
595 597
596static struct perf_event * 598static int
597ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, 599ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
598 struct task_struct *tsk, int disabled) 600 struct task_struct *tsk, int disabled)
599{ 601{
600 int err; 602 int err;
601 int gen_len, gen_type; 603 int gen_len, gen_type;
602 DEFINE_BREAKPOINT_ATTR(attr); 604 struct perf_event_attr attr;
603 605
604 /* 606 /*
605 * We shoud have at least an inactive breakpoint at this 607 * We shoud have at least an inactive breakpoint at this
@@ -607,18 +609,18 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
607 * written the address register first 609 * written the address register first
608 */ 610 */
609 if (!bp) 611 if (!bp)
610 return ERR_PTR(-EINVAL); 612 return -EINVAL;
611 613
612 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); 614 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
613 if (err) 615 if (err)
614 return ERR_PTR(err); 616 return err;
615 617
616 attr = bp->attr; 618 attr = bp->attr;
617 attr.bp_len = gen_len; 619 attr.bp_len = gen_len;
618 attr.bp_type = gen_type; 620 attr.bp_type = gen_type;
619 attr.disabled = disabled; 621 attr.disabled = disabled;
620 622
621 return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); 623 return modify_user_hw_breakpoint(bp, &attr);
622} 624}
623 625
624/* 626/*
@@ -656,28 +658,17 @@ restore:
656 if (!second_pass) 658 if (!second_pass)
657 continue; 659 continue;
658 660
659 thread->ptrace_bps[i] = NULL; 661 rc = ptrace_modify_breakpoint(bp, len, type,
660 bp = ptrace_modify_breakpoint(bp, len, type,
661 tsk, 1); 662 tsk, 1);
662 if (IS_ERR(bp)) { 663 if (rc)
663 rc = PTR_ERR(bp);
664 thread->ptrace_bps[i] = NULL;
665 break; 664 break;
666 }
667 thread->ptrace_bps[i] = bp;
668 } 665 }
669 continue; 666 continue;
670 } 667 }
671 668
672 bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); 669 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
673 670 if (rc)
674 /* Incorrect bp, or we have a bug in bp API */
675 if (IS_ERR(bp)) {
676 rc = PTR_ERR(bp);
677 thread->ptrace_bps[i] = NULL;
678 break; 671 break;
679 }
680 thread->ptrace_bps[i] = bp;
681 } 672 }
682 /* 673 /*
683 * Make a second pass to free the remaining unused breakpoints 674 * Make a second pass to free the remaining unused breakpoints
@@ -721,9 +712,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
721{ 712{
722 struct perf_event *bp; 713 struct perf_event *bp;
723 struct thread_struct *t = &tsk->thread; 714 struct thread_struct *t = &tsk->thread;
724 DEFINE_BREAKPOINT_ATTR(attr); 715 struct perf_event_attr attr;
725 716
726 if (!t->ptrace_bps[nr]) { 717 if (!t->ptrace_bps[nr]) {
718 hw_breakpoint_init(&attr);
727 /* 719 /*
728 * Put stub len and type to register (reserve) an inactive but 720 * Put stub len and type to register (reserve) an inactive but
729 * correct bp 721 * correct bp
@@ -734,26 +726,32 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
734 attr.disabled = 1; 726 attr.disabled = 1;
735 727
736 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); 728 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
729
730 /*
731 * CHECKME: the previous code returned -EIO if the addr wasn't
732 * a valid task virtual addr. The new one will return -EINVAL in
733 * this case.
734 * -EINVAL may be what we want for in-kernel breakpoints users,
735 * but -EIO looks better for ptrace, since we refuse a register
736 * writing for the user. And anyway this is the previous
737 * behaviour.
738 */
739 if (IS_ERR(bp))
740 return PTR_ERR(bp);
741
742 t->ptrace_bps[nr] = bp;
737 } else { 743 } else {
744 int err;
745
738 bp = t->ptrace_bps[nr]; 746 bp = t->ptrace_bps[nr];
739 t->ptrace_bps[nr] = NULL;
740 747
741 attr = bp->attr; 748 attr = bp->attr;
742 attr.bp_addr = addr; 749 attr.bp_addr = addr;
743 bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); 750 err = modify_user_hw_breakpoint(bp, &attr);
751 if (err)
752 return err;
744 } 753 }
745 /*
746 * CHECKME: the previous code returned -EIO if the addr wasn't a
747 * valid task virtual addr. The new one will return -EINVAL in this
748 * case.
749 * -EINVAL may be what we want for in-kernel breakpoints users, but
750 * -EIO looks better for ptrace, since we refuse a register writing
751 * for the user. And anyway this is the previous behaviour.
752 */
753 if (IS_ERR(bp))
754 return PTR_ERR(bp);
755 754
756 t->ptrace_bps[nr] = bp;
757 755
758 return 0; 756 return 0;
759} 757}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..18093d7498f0 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 499{
500 struct pci_dev *nb_ht; 500 struct pci_dev *nb_ht;
501 unsigned int devfn; 501 unsigned int devfn;
502 u32 node;
502 u32 val; 503 u32 val;
503 504
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 505 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 508 return;
508 509
509 pci_read_config_dword(nb_ht, 0x60, &val); 510 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 511 node = val & 7;
512 /*
513 * Some hardware may return an invalid node ID,
514 * so check it first:
515 */
516 if (node_online(node))
517 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 518 pci_dev_put(nb_ht);
512} 519}
513 520
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2b97fc5b124e..1545bc0c9845 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -259,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
260 }, 260 },
261 }, 261 },
262 { /* Handle problems with rebooting on ASUS P4S800 */
263 .callback = set_bios_reboot,
264 .ident = "ASUS P4S800",
265 .matches = {
266 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
267 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
268 },
269 },
262 { } 270 { }
263}; 271};
264 272
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..201eab63b05f 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 82e88cdda9bc..946a311a25c9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,6 +106,7 @@
106#include <asm/percpu.h> 106#include <asm/percpu.h>
107#include <asm/topology.h> 107#include <asm/topology.h>
108#include <asm/apicdef.h> 108#include <asm/apicdef.h>
109#include <asm/k8.h>
109#ifdef CONFIG_X86_64 110#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 111#include <asm/numa_64.h>
111#endif 112#endif
@@ -487,42 +488,11 @@ static void __init reserve_early_setup_data(void)
487 488
488#ifdef CONFIG_KEXEC 489#ifdef CONFIG_KEXEC
489 490
490/**
491 * Reserve @size bytes of crashkernel memory at any suitable offset.
492 *
493 * @size: Size of the crashkernel memory to reserve.
494 * Returns the base address on success, and -1ULL on failure.
495 */
496static
497unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
498{
499 const unsigned long long alignment = 16<<20; /* 16M */
500 unsigned long long start = 0LL;
501
502 while (1) {
503 int ret;
504
505 start = find_e820_area(start, ULONG_MAX, size, alignment);
506 if (start == -1ULL)
507 return start;
508
509 /* try to reserve it */
510 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
511 if (ret >= 0)
512 return start;
513
514 start += alignment;
515 }
516}
517
518static inline unsigned long long get_total_mem(void) 491static inline unsigned long long get_total_mem(void)
519{ 492{
520 unsigned long long total; 493 unsigned long long total;
521 494
522 total = max_low_pfn - min_low_pfn; 495 total = max_pfn - min_low_pfn;
523#ifdef CONFIG_HIGHMEM
524 total += highend_pfn - highstart_pfn;
525#endif
526 496
527 return total << PAGE_SHIFT; 497 return total << PAGE_SHIFT;
528} 498}
@@ -542,21 +512,25 @@ static void __init reserve_crashkernel(void)
542 512
543 /* 0 means: find the address automatically */ 513 /* 0 means: find the address automatically */
544 if (crash_base <= 0) { 514 if (crash_base <= 0) {
545 crash_base = find_and_reserve_crashkernel(crash_size); 515 const unsigned long long alignment = 16<<20; /* 16M */
516
517 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
518 alignment);
546 if (crash_base == -1ULL) { 519 if (crash_base == -1ULL) {
547 pr_info("crashkernel reservation failed. " 520 pr_info("crashkernel reservation failed - No suitable area found.\n");
548 "No suitable area found.\n");
549 return; 521 return;
550 } 522 }
551 } else { 523 } else {
552 ret = reserve_bootmem_generic(crash_base, crash_size, 524 unsigned long long start;
553 BOOTMEM_EXCLUSIVE); 525
554 if (ret < 0) { 526 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
555 pr_info("crashkernel reservation failed - " 527 1<<20);
556 "memory is in use\n"); 528 if (start != crash_base) {
529 pr_info("crashkernel reservation failed - memory is in use.\n");
557 return; 530 return;
558 } 531 }
559 } 532 }
533 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
560 534
561 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 535 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
562 "for crashkernel (System RAM: %ldMB)\n", 536 "for crashkernel (System RAM: %ldMB)\n",
@@ -699,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
699 673
700void __init setup_arch(char **cmdline_p) 674void __init setup_arch(char **cmdline_p)
701{ 675{
676 int acpi = 0;
677 int k8 = 0;
678
702#ifdef CONFIG_X86_32 679#ifdef CONFIG_X86_32
703 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 680 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
704 visws_early_detect(); 681 visws_early_detect();
@@ -791,21 +768,18 @@ void __init setup_arch(char **cmdline_p)
791 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 768 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
792 *cmdline_p = command_line; 769 *cmdline_p = command_line;
793 770
794#ifdef CONFIG_X86_64
795 /* 771 /*
796 * Must call this twice: Once just to detect whether hardware doesn't 772 * x86_configure_nx() is called before parse_early_param() to detect
797 * support NX (so that the early EHCI debug console setup can safely 773 * whether hardware doesn't support NX (so that the early EHCI debug
798 * call set_fixmap(), and then again after parsing early parameters to 774 * console setup can safely call set_fixmap()). It may then be called
799 * honor the respective command line option. 775 * again from within noexec_setup() during parsing early parameters
776 * to honor the respective command line option.
800 */ 777 */
801 check_efer(); 778 x86_configure_nx();
802#endif
803 779
804 parse_early_param(); 780 parse_early_param();
805 781
806#ifdef CONFIG_X86_64 782 x86_report_nx();
807 check_efer();
808#endif
809 783
810 /* Must be before kernel pagetables are setup */ 784 /* Must be before kernel pagetables are setup */
811 vmi_activate(); 785 vmi_activate();
@@ -901,6 +875,13 @@ void __init setup_arch(char **cmdline_p)
901 875
902 reserve_brk(); 876 reserve_brk();
903 877
878#ifdef CONFIG_ACPI_SLEEP
879 /*
880 * Reserve low memory region for sleep support.
881 * even before init_memory_mapping
882 */
883 acpi_reserve_wakeup_memory();
884#endif
904 init_gbpages(); 885 init_gbpages();
905 886
906 /* max_pfn_mapped is updated here */ 887 /* max_pfn_mapped is updated here */
@@ -927,6 +908,8 @@ void __init setup_arch(char **cmdline_p)
927 908
928 reserve_initrd(); 909 reserve_initrd();
929 910
911 reserve_crashkernel();
912
930 vsmp_init(); 913 vsmp_init();
931 914
932 io_delay_init(); 915 io_delay_init();
@@ -938,27 +921,24 @@ void __init setup_arch(char **cmdline_p)
938 921
939 early_acpi_boot_init(); 922 early_acpi_boot_init();
940 923
924 /*
925 * Find and reserve possible boot-time SMP configuration:
926 */
927 find_smp_config();
928
941#ifdef CONFIG_ACPI_NUMA 929#ifdef CONFIG_ACPI_NUMA
942 /* 930 /*
943 * Parse SRAT to discover nodes. 931 * Parse SRAT to discover nodes.
944 */ 932 */
945 acpi_numa_init(); 933 acpi = acpi_numa_init();
946#endif 934#endif
947 935
948 initmem_init(0, max_pfn); 936#ifdef CONFIG_K8_NUMA
949 937 if (!acpi)
950#ifdef CONFIG_ACPI_SLEEP 938 k8 = !k8_numa_init(0, max_pfn);
951 /*
952 * Reserve low memory region for sleep support.
953 */
954 acpi_reserve_bootmem();
955#endif 939#endif
956 /*
957 * Find and reserve possible boot-time SMP configuration:
958 */
959 find_smp_config();
960 940
961 reserve_crashkernel(); 941 initmem_init(0, max_pfn, acpi, k8);
962 942
963#ifdef CONFIG_X86_64 943#ifdef CONFIG_X86_64
964 /* 944 /*
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..35abcb8b00e9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
24#else 26#else
25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 200 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 201 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 202 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 203 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 204 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 205 }
205 if (rc < 0) 206 if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index fbf3b07c8567..74fe6d86dc5d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -863,6 +864,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
863 if (current->replacement_session_keyring) 864 if (current->replacement_session_keyring)
864 key_replace_session_keyring(); 865 key_replace_session_keyring();
865 } 866 }
867 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
868 fire_user_return_notifiers();
866 869
867#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
868 clear_thread_flag(TIF_IRET); 871 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 324f2a44c221..29e6744f51e3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -687,7 +687,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 688 };
689 689
690 INIT_WORK(&c_idle.work, do_fork_idle); 690 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 691
692 alternatives_smp_switch(1); 692 alternatives_smp_switch(1);
693 693
@@ -713,6 +713,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 713
714 if (IS_ERR(c_idle.idle)) { 714 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 715 printk("failed fork for CPU %d\n", cpu);
716 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 717 return PTR_ERR(c_idle.idle);
717 } 718 }
718 719
@@ -831,6 +832,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 832 smpboot_restore_warm_reset_vector();
832 } 833 }
833 834
835 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 836 return boot_error;
835} 837}
836 838
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..dee1ff7cba58 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,31 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/* 27/*
53 * Perform the select(nd, in, out, ex, tv) and mmap() system 28 * Perform the select(nd, in, out, ex, tv) and mmap() system
54 * calls. Linux/i386 didn't use to be able to handle more than 29 * calls. Linux/i386 didn't use to be able to handle more than
@@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
77 if (a.offset & ~PAGE_MASK) 52 if (a.offset & ~PAGE_MASK)
78 goto out; 53 goto out;
79 54
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags, 55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT); 56 a.fd, a.offset >> PAGE_SHIFT);
82out: 57out:
83 return err; 58 return err;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..8aa2057efd12 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..15228b5d3eb7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 1740c85e24bb..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade)
817 */ 817 */
818 apicid = blade_to_first_apicid(blade); 818 apicid = blade_to_first_apicid(blade);
819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
820 if ((pa & 0xff) != UV_BAU_MESSAGE) { 820 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 821 ((apicid << 32) | UV_BAU_MESSAGE));
823 }
824 return 0; 822 return 0;
825} 823}
826 824
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..3c84aa001c11 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 315 break;
311 case CLOCK_EVT_MODE_UNUSED: 316 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 317 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 318 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 319 break;
315 } 320 }
316} 321}
317 322
318static void uv_rtc_interrupt(void) 323static void uv_rtc_interrupt(void)
319{ 324{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 325 int cpu = smp_processor_id();
326 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 327
323 if (!ced || !ced->event_handler) 328 if (!ced || !ced->event_handler)
324 return; 329 return;
325 330
326 if (uv_rtc_unset_timer(cpu) != 1) 331 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 332 return;
328 333
329 ced->event_handler(ced); 334 ced->event_handler(ced);
330} 335}
331 336
332static int __init uv_enable_rtc(char *str) 337static int __init uv_enable_evt_rtc(char *str)
333{ 338{
334 uv_rtc_enable = 1; 339 uv_rtc_evt_enable = 1;
335 340
336 return 1; 341 return 1;
337} 342}
338__setup("uvrtc", uv_enable_rtc); 343__setup("uvrtcevt", uv_enable_evt_rtc);
339 344
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 345static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 346{
@@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 355{
351 int rc; 356 int rc;
352 357
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 358 if (!is_uv_system())
354 return -ENODEV; 359 return -ENODEV;
355 360
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 361 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 362 clocksource_uv.shift);
360 363
364 /* If single blade, prefer tsc */
365 if (uv_num_possible_blades() == 1)
366 clocksource_uv.rating = 250;
367
361 rc = clocksource_register(&clocksource_uv); 368 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 369 if (rc)
363 generic_interrupt_extension = NULL; 370 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
371 else
372 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
373 sn_rtc_cycles_per_second/(unsigned long)1E6);
374
375 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 376 return rc;
365 }
366 377
367 /* Setup and register clockevents */ 378 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 379 rc = uv_rtc_allocate_timers();
369 if (rc) { 380 if (rc)
370 clocksource_unregister(&clocksource_uv); 381 goto error;
371 generic_interrupt_extension = NULL; 382
372 return rc; 383 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 384
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 385 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 386 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void)
383 393
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 394 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 395 if (rc) {
386 clocksource_unregister(&clocksource_uv); 396 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 397 uv_rtc_deallocate_timers();
398 goto error;
389 } 399 }
390 400
401 printk(KERN_INFO "UV RTC clockevents registered\n");
402
403 return 0;
404
405error:
406 clocksource_unregister(&clocksource_uv);
407 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
408
391 return rc; 409 return rc;
392} 410}
393arch_initcall(uv_rtc_setup_clock); 411arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index abda6f53e71e..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
198} 198}
199 199
200static void __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(void)
201{ 201{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..74c92bb194df 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 226 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 227 evt->cpumask = cpumask_of(cpu);
228 228
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 230 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 231 clockevents_register_device(evt);
232} 232}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3c68fe2d46cf..f3f2104408d9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..9055e5872ff0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d11c5ff7c65e..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,6 +13,7 @@
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h>
16#include <asm/tsc.h> 17#include <asm/tsc.h>
17#include <asm/iommu.h> 18#include <asm/iommu.h>
18 19
@@ -80,4 +81,5 @@ struct x86_platform_ops x86_platform = {
80 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
81 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
82 .iommu_shutdown = iommu_shutdown_noop, 83 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range,
83}; 85};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..4cd498332466 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
31 ---help--- 32 ---help---
32 Support hosting fully virtualized guest machines using hardware 33 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 34 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..7e8faea4651e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,6 +75,8 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define No64 (1<<28)
78/* Source 2 operand type */ 80/* Source 2 operand type */
79#define Src2None (0<<29) 81#define Src2None (0<<29)
80#define Src2CL (1<<29) 82#define Src2CL (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 94 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 99 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 104 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 109 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 114 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 139 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 140 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 141 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 142 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
143 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 144 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 145 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 146 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 165 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 166 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 167 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 168 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 169 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 170 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 171 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 192 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 193 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 194 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 195 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 196 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 197 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 198 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 205 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 206 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 207 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 208 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 209 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 211 /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 251 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 253 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 254 ImplicitOps | Stack, ImplicitOps | Stack,
255 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 256 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 257 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 258 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 259 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 261 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 262 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 263 ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 622{
614 int rc = 0; 623 int rc = 0;
615 624
625 /* x86 instructions are limited to 15 bytes. */
626 if (eip + size - ctxt->decode.eip_orig > 15)
627 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 628 eip += ctxt->cs_base;
617 while (size--) { 629 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 630 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 883 /* Shadow copy of register state. Committed on successful emulation. */
872 884
873 memset(c, 0, sizeof(struct decode_cache)); 885 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 886 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 887 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 888 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 889
@@ -962,6 +974,11 @@ done_prefixes:
962 } 974 }
963 } 975 }
964 976
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");;
979 return -1;
980 }
981
965 if (c->d & Group) { 982 if (c->d & Group) {
966 group = c->d & GroupMask; 983 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 984 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1186 return rc; 1203 return rc;
1187} 1204}
1188 1205
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{
1208 struct decode_cache *c = &ctxt->decode;
1209 struct kvm_segment segment;
1210
1211 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1212
1213 c->src.val = segment.selector;
1214 emulate_push(ctxt);
1215}
1216
1217static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1218 struct x86_emulate_ops *ops, int seg)
1219{
1220 struct decode_cache *c = &ctxt->decode;
1221 unsigned long selector;
1222 int rc;
1223
1224 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1225 if (rc != 0)
1226 return rc;
1227
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
1229 return rc;
1230}
1231
1232static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1233{
1234 struct decode_cache *c = &ctxt->decode;
1235 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1236 int reg = VCPU_REGS_RAX;
1237
1238 while (reg <= VCPU_REGS_RDI) {
1239 (reg == VCPU_REGS_RSP) ?
1240 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1241
1242 emulate_push(ctxt);
1243 ++reg;
1244 }
1245}
1246
1247static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops)
1249{
1250 struct decode_cache *c = &ctxt->decode;
1251 int rc = 0;
1252 int reg = VCPU_REGS_RDI;
1253
1254 while (reg >= VCPU_REGS_RAX) {
1255 if (reg == VCPU_REGS_RSP) {
1256 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1257 c->op_bytes);
1258 --reg;
1259 }
1260
1261 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1262 if (rc != 0)
1263 break;
1264 --reg;
1265 }
1266 return rc;
1267}
1268
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1269static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1270 struct x86_emulate_ops *ops)
1191{ 1271{
@@ -1707,18 +1787,45 @@ special_insn:
1707 add: /* add */ 1787 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1788 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1789 break;
1790 case 0x06: /* push es */
1791 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1792 break;
1793 case 0x07: /* pop es */
1794 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1795 if (rc != 0)
1796 goto done;
1797 break;
1710 case 0x08 ... 0x0d: 1798 case 0x08 ... 0x0d:
1711 or: /* or */ 1799 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1800 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1801 break;
1802 case 0x0e: /* push cs */
1803 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1804 break;
1714 case 0x10 ... 0x15: 1805 case 0x10 ... 0x15:
1715 adc: /* adc */ 1806 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1807 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1808 break;
1809 case 0x16: /* push ss */
1810 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1811 break;
1812 case 0x17: /* pop ss */
1813 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1814 if (rc != 0)
1815 goto done;
1816 break;
1718 case 0x18 ... 0x1d: 1817 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1818 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1819 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1820 break;
1821 case 0x1e: /* push ds */
1822 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1823 break;
1824 case 0x1f: /* pop ds */
1825 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1826 if (rc != 0)
1827 goto done;
1828 break;
1722 case 0x20 ... 0x25: 1829 case 0x20 ... 0x25:
1723 and: /* and */ 1830 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1831 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
1750 if (rc != 0) 1857 if (rc != 0)
1751 goto done; 1858 goto done;
1752 break; 1859 break;
1860 case 0x60: /* pusha */
1861 emulate_pusha(ctxt);
1862 break;
1863 case 0x61: /* popa */
1864 rc = emulate_popa(ctxt, ops);
1865 if (rc != 0)
1866 goto done;
1867 break;
1753 case 0x63: /* movsxd */ 1868 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 1869 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 1870 goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
1761 break; 1876 break;
1762 case 0x6c: /* insb */ 1877 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 1878 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1879 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 1880 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 1881 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 1882 c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
1777 return 0; 1892 return 0;
1778 case 0x6e: /* outsb */ 1893 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 1894 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1895 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 1896 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 1897 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 1898 c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2185 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2186 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2187 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2189 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2190 port) != 0) {
2076 c->eip = saved_eip; 2191 c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2412 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2413 c->dst.type = OP_NONE;
2299 break; 2414 break;
2415 case 0xa0: /* push fs */
2416 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2417 break;
2418 case 0xa1: /* pop fs */
2419 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2420 if (rc != 0)
2421 goto done;
2422 break;
2300 case 0xa3: 2423 case 0xa3:
2301 bt: /* bt */ 2424 bt: /* bt */
2302 c->dst.type = OP_NONE; 2425 c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2431 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2432 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2433 break;
2434 case 0xa8: /* push gs */
2435 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2436 break;
2437 case 0xa9: /* pop gs */
2438 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2439 if (rc != 0)
2440 goto done;
2441 break;
2311 case 0xab: 2442 case 0xab:
2312 bts: /* bts */ 2443 bts: /* bts */
2313 /* only subword offset */ 2444 /* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 144e7f60b5e2..296aba49472a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -29,6 +29,8 @@
29 * Based on QEMU and Xen. 29 * Based on QEMU and Xen.
30 */ 30 */
31 31
32#define pr_fmt(fmt) "pit: " fmt
33
32#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
33 35
34#include "irq.h" 36#include "irq.h"
@@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
262 264
263static void destroy_pit_timer(struct kvm_timer *pt) 265static void destroy_pit_timer(struct kvm_timer *pt)
264{ 266{
265 pr_debug("pit: execute del timer!\n"); 267 pr_debug("execute del timer!\n");
266 hrtimer_cancel(&pt->timer); 268 hrtimer_cancel(&pt->timer);
267} 269}
268 270
@@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284 286
285 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 287 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
286 288
287 pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); 289 pr_debug("create pit timer, interval is %llu nsec\n", interval);
288 290
289 /* TODO The new value only affected after the retriggered */ 291 /* TODO The new value only affected after the retriggered */
290 hrtimer_cancel(&pt->timer); 292 hrtimer_cancel(&pt->timer);
@@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
309 311
310 WARN_ON(!mutex_is_locked(&ps->lock)); 312 WARN_ON(!mutex_is_locked(&ps->lock));
311 313
312 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 314 pr_debug("load_count val is %d, channel is %d\n", val, channel);
313 315
314 /* 316 /*
315 * The largest possible initial count is 0; this is equivalent 317 * The largest possible initial count is 0; this is equivalent
@@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
395 mutex_lock(&pit_state->lock); 397 mutex_lock(&pit_state->lock);
396 398
397 if (val != 0) 399 if (val != 0)
398 pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", 400 pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
399 (unsigned int)addr, len, val); 401 (unsigned int)addr, len, val);
400 402
401 if (addr == 3) { 403 if (addr == 3) {
402 channel = val >> 6; 404 channel = val >> 6;
@@ -688,10 +690,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 690 struct kvm_vcpu *vcpu;
689 int i; 691 int i;
690 692
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 694 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 695
696 /* 696 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 697 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..d057c0cbd245 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 39 if (s != &s->pics_state->pics[0])
40 irq += 8; 40 irq += 8;
41 /*
42 * We are dropping lock while calling ack notifiers since ack
43 * notifier callbacks for assigned devices call into PIC recursively.
44 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage.
46 */
47 spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock);
42} 50}
43 51
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 184static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 185{
178 s->isr |= 1 << irq; 186 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 187 /*
185 * We don't clear a level sensitive interrupt here 188 * We don't clear a level sensitive interrupt here
186 */ 189 */
187 if (!(s->elcr & (1 << irq))) 190 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 191 s->irr &= ~(1 << irq);
192
193 if (s->auto_eoi) {
194 if (s->rotate_on_auto_eoi)
195 s->priority_add = (irq + 1) & 7;
196 pic_clear_isr(s, irq);
197 }
198
189} 199}
190 200
191int kvm_pic_read_irq(struct kvm *kvm) 201int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
225 235
226void kvm_pic_reset(struct kvm_kpic_state *s) 236void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 237{
228 int irq, irqbase, n; 238 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 239 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 240 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
241 u8 irr = s->irr, isr = s->imr;
231 242
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 243 s->last_irr = 0;
245 s->irr = 0; 244 s->irr = 0;
246 s->imr = 0; 245 s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 255 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 256 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 257 s->init4 = 0;
258
259 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
260 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
261 if (irr & (1 << irq) || isr & (1 << irq)) {
262 pic_clear_isr(s, irq);
263 }
264 }
259} 265}
260 266
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 267static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 304 priority = get_priority(s, s->isr);
299 if (priority != 8) { 305 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 306 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 307 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 308 s->priority_add = (irq + 1) & 7;
309 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 310 pic_update_irq(s->pics_state);
305 } 311 }
306 break; 312 break;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..be399e207d57 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 86
86static inline int irqchip_in_kernel(struct kvm *kvm) 87static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 88{
88 return pic_irqchip(kvm) != NULL; 89 int ret;
90
91 ret = (pic_irqchip(kvm) != NULL);
92 smp_rmb();
93 return ret;
89} 94}
90 95
91void kvm_pic_reset(struct kvm_kpic_state *s); 96void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 23c217692ea9..cd60c0bd1b32 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
37#include "irq.h" 36#include "irq.h"
38#include "trace.h" 37#include "trace.h"
@@ -471,11 +470,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 470 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 471 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 472 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 473 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 475}
480 476
481static void apic_send_ipi(struct kvm_lapic *apic) 477static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +500,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 500 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 501 irq.vector);
506 502
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 503 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 504}
511 505
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 506static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 818b92ad82cf..4c3e5b2314cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2789,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2789 if (r) 2789 if (r)
2790 goto out; 2790 goto out;
2791 2791
2792 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2792 er = emulate_instruction(vcpu, cr2, error_code, 0);
2793 2793
2794 switch (er) { 2794 switch (er) {
2795 case EMULATE_DONE: 2795 case EMULATE_DONE:
@@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2800 case EMULATE_FAIL: 2800 case EMULATE_FAIL:
2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2803 vcpu->run->internal.ndata = 0;
2803 return 0; 2804 return 0;
2804 default: 2805 default:
2805 BUG(); 2806 BUG();
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..a6017132fba8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -467,7 +467,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..1d9b33843c80 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
49#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 50
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 51#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 52#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
53 54
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 55#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 56
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 57static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 59 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 77 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 78 u64 vmcb_msrpm;
87 79
80 /* A VMEXIT is required but not yet emulated */
81 bool exit_required;
82
88 /* cache for intercepts of the guest */ 83 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 84 u16 intercept_cr_read;
90 u16 intercept_cr_write; 85 u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
112 u32 *msrpm; 107 u32 *msrpm;
113 108
114 struct nested_state nested; 109 struct nested_state nested;
110
111 bool nmi_singlestep;
115}; 112};
116 113
117/* enable NPT for AMD64 and X86 with PAE */ 114/* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 283 struct vcpu_svm *svm = to_svm(vcpu);
287 284
288 if (!svm->next_rip) { 285 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 286 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 287 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 288 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 289 return;
@@ -316,75 +313,79 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 313 cpu_svm_disable();
317} 314}
318 315
319static void svm_hardware_enable(void *garbage) 316static int svm_hardware_enable(void *garbage)
320{ 317{
321 318
322 struct svm_cpu_data *svm_data; 319 struct svm_cpu_data *sd;
323 uint64_t efer; 320 uint64_t efer;
324 struct descriptor_table gdt_descr; 321 struct descriptor_table gdt_descr;
325 struct desc_struct *gdt; 322 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 323 int me = raw_smp_processor_id();
327 324
325 rdmsrl(MSR_EFER, efer);
326 if (efer & EFER_SVME)
327 return -EBUSY;
328
328 if (!has_svm()) { 329 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 330 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 331 me);
332 return -EINVAL;
331 } 333 }
332 svm_data = per_cpu(svm_data, me); 334 sd = per_cpu(svm_data, me);
333 335
334 if (!svm_data) { 336 if (!sd) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 337 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 338 me);
337 return; 339 return -EINVAL;
338 } 340 }
339 341
340 svm_data->asid_generation = 1; 342 sd->asid_generation = 1;
341 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 343 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
342 svm_data->next_asid = svm_data->max_asid + 1; 344 sd->next_asid = sd->max_asid + 1;
343 345
344 kvm_get_gdt(&gdt_descr); 346 kvm_get_gdt(&gdt_descr);
345 gdt = (struct desc_struct *)gdt_descr.base; 347 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 348 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 349
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 350 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 351
351 wrmsrl(MSR_VM_HSAVE_PA, 352 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 353
354 return 0;
353} 355}
354 356
355static void svm_cpu_uninit(int cpu) 357static void svm_cpu_uninit(int cpu)
356{ 358{
357 struct svm_cpu_data *svm_data 359 struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
358 = per_cpu(svm_data, raw_smp_processor_id());
359 360
360 if (!svm_data) 361 if (!sd)
361 return; 362 return;
362 363
363 per_cpu(svm_data, raw_smp_processor_id()) = NULL; 364 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
364 __free_page(svm_data->save_area); 365 __free_page(sd->save_area);
365 kfree(svm_data); 366 kfree(sd);
366} 367}
367 368
368static int svm_cpu_init(int cpu) 369static int svm_cpu_init(int cpu)
369{ 370{
370 struct svm_cpu_data *svm_data; 371 struct svm_cpu_data *sd;
371 int r; 372 int r;
372 373
373 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); 374 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
374 if (!svm_data) 375 if (!sd)
375 return -ENOMEM; 376 return -ENOMEM;
376 svm_data->cpu = cpu; 377 sd->cpu = cpu;
377 svm_data->save_area = alloc_page(GFP_KERNEL); 378 sd->save_area = alloc_page(GFP_KERNEL);
378 r = -ENOMEM; 379 r = -ENOMEM;
379 if (!svm_data->save_area) 380 if (!sd->save_area)
380 goto err_1; 381 goto err_1;
381 382
382 per_cpu(svm_data, cpu) = svm_data; 383 per_cpu(svm_data, cpu) = sd;
383 384
384 return 0; 385 return 0;
385 386
386err_1: 387err_1:
387 kfree(svm_data); 388 kfree(sd);
388 return r; 389 return r;
389 390
390} 391}
@@ -476,7 +477,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 477 kvm_enable_efer_bits(EFER_SVME);
477 } 478 }
478 479
479 for_each_online_cpu(cpu) { 480 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 481 r = svm_cpu_init(cpu);
481 if (r) 482 if (r)
482 goto err; 483 goto err;
@@ -510,7 +511,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 511{
511 int cpu; 512 int cpu;
512 513
513 for_each_online_cpu(cpu) 514 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 515 svm_cpu_uninit(cpu);
515 516
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 517 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -625,11 +626,12 @@ static void init_vmcb(struct vcpu_svm *svm)
625 save->rip = 0x0000fff0; 626 save->rip = 0x0000fff0;
626 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 627 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
627 628
628 /* 629 /* This is the guest-visible cr0 value.
629 * cr0 val on cpu init should be 0x60000010, we enable cpu 630 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
630 * cache by default. the orderly way is to enable cache in bios.
631 */ 631 */
632 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 632 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
633 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
634
633 save->cr4 = X86_CR4_PAE; 635 save->cr4 = X86_CR4_PAE;
634 /* rdx = ?? */ 636 /* rdx = ?? */
635 637
@@ -644,8 +646,6 @@ static void init_vmcb(struct vcpu_svm *svm)
644 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| 646 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
645 INTERCEPT_CR3_MASK); 647 INTERCEPT_CR3_MASK);
646 save->g_pat = 0x0007040600070406ULL; 648 save->g_pat = 0x0007040600070406ULL;
647 /* enable caching because the QEMU Bios doesn't enable it */
648 save->cr0 = X86_CR0_ET;
649 save->cr3 = 0; 649 save->cr3 = 0;
650 save->cr4 = 0; 650 save->cr4 = 0;
651 } 651 }
@@ -654,6 +654,11 @@ static void init_vmcb(struct vcpu_svm *svm)
654 svm->nested.vmcb = 0; 654 svm->nested.vmcb = 0;
655 svm->vcpu.arch.hflags = 0; 655 svm->vcpu.arch.hflags = 0;
656 656
657 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
658 control->pause_filter_count = 3000;
659 control->intercept |= (1ULL << INTERCEPT_PAUSE);
660 }
661
657 enable_gif(svm); 662 enable_gif(svm);
658} 663}
659 664
@@ -758,14 +763,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 int i; 763 int i;
759 764
760 if (unlikely(cpu != vcpu->cpu)) { 765 if (unlikely(cpu != vcpu->cpu)) {
761 u64 tsc_this, delta; 766 u64 delta;
762 767
763 /* 768 /*
764 * Make sure that the guest sees a monotonically 769 * Make sure that the guest sees a monotonically
765 * increasing TSC. 770 * increasing TSC.
766 */ 771 */
767 rdtscll(tsc_this); 772 delta = vcpu->arch.host_tsc - native_read_tsc();
768 delta = vcpu->arch.host_tsc - tsc_this;
769 svm->vmcb->control.tsc_offset += delta; 773 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm)) 774 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta; 775 svm->nested.hsave->control.tsc_offset += delta;
@@ -787,7 +791,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
787 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 791 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
788 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 792 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
789 793
790 rdtscll(vcpu->arch.host_tsc); 794 vcpu->arch.host_tsc = native_read_tsc();
791} 795}
792 796
793static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 797static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1045,7 +1049,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1045 svm->vmcb->control.intercept_exceptions &= 1049 svm->vmcb->control.intercept_exceptions &=
1046 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1050 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1047 1051
1048 if (vcpu->arch.singlestep) 1052 if (svm->nmi_singlestep)
1049 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1053 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1050 1054
1051 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1055 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1064,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1060 vcpu->guest_debug = 0; 1064 vcpu->guest_debug = 0;
1061} 1065}
1062 1066
1063static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1067static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1064{ 1068{
1065 int old_debug = vcpu->guest_debug;
1066 struct vcpu_svm *svm = to_svm(vcpu); 1069 struct vcpu_svm *svm = to_svm(vcpu);
1067 1070
1068 vcpu->guest_debug = dbg->control;
1069
1070 update_db_intercept(vcpu);
1071
1072 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1071 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1073 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1072 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1074 else 1073 else
1075 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1074 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1076 1075
1077 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1076 update_db_intercept(vcpu);
1078 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1079 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1080 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1081
1082 return 0;
1083} 1077}
1084 1078
1085static void load_host_msrs(struct kvm_vcpu *vcpu) 1079static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1096,16 +1090,16 @@ static void save_host_msrs(struct kvm_vcpu *vcpu)
1096#endif 1090#endif
1097} 1091}
1098 1092
1099static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) 1093static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1100{ 1094{
1101 if (svm_data->next_asid > svm_data->max_asid) { 1095 if (sd->next_asid > sd->max_asid) {
1102 ++svm_data->asid_generation; 1096 ++sd->asid_generation;
1103 svm_data->next_asid = 1; 1097 sd->next_asid = 1;
1104 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1098 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1105 } 1099 }
1106 1100
1107 svm->asid_generation = svm_data->asid_generation; 1101 svm->asid_generation = sd->asid_generation;
1108 svm->vmcb->control.asid = svm_data->next_asid++; 1102 svm->vmcb->control.asid = sd->next_asid++;
1109} 1103}
1110 1104
1111static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1105static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
@@ -1180,7 +1174,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1180 } 1174 }
1181} 1175}
1182 1176
1183static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1177static int pf_interception(struct vcpu_svm *svm)
1184{ 1178{
1185 u64 fault_address; 1179 u64 fault_address;
1186 u32 error_code; 1180 u32 error_code;
@@ -1194,17 +1188,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1194 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1188 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1195} 1189}
1196 1190
1197static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1191static int db_interception(struct vcpu_svm *svm)
1198{ 1192{
1193 struct kvm_run *kvm_run = svm->vcpu.run;
1194
1199 if (!(svm->vcpu.guest_debug & 1195 if (!(svm->vcpu.guest_debug &
1200 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1196 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1201 !svm->vcpu.arch.singlestep) { 1197 !svm->nmi_singlestep) {
1202 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1198 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1203 return 1; 1199 return 1;
1204 } 1200 }
1205 1201
1206 if (svm->vcpu.arch.singlestep) { 1202 if (svm->nmi_singlestep) {
1207 svm->vcpu.arch.singlestep = false; 1203 svm->nmi_singlestep = false;
1208 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1204 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1209 svm->vmcb->save.rflags &= 1205 svm->vmcb->save.rflags &=
1210 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1206 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,25 +1219,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1223 return 1; 1219 return 1;
1224} 1220}
1225 1221
1226static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1222static int bp_interception(struct vcpu_svm *svm)
1227{ 1223{
1224 struct kvm_run *kvm_run = svm->vcpu.run;
1225
1228 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1226 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1227 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1230 kvm_run->debug.arch.exception = BP_VECTOR; 1228 kvm_run->debug.arch.exception = BP_VECTOR;
1231 return 0; 1229 return 0;
1232} 1230}
1233 1231
1234static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1232static int ud_interception(struct vcpu_svm *svm)
1235{ 1233{
1236 int er; 1234 int er;
1237 1235
1238 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1236 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1239 if (er != EMULATE_DONE) 1237 if (er != EMULATE_DONE)
1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1238 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1241 return 1; 1239 return 1;
1242} 1240}
1243 1241
1244static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1242static int nm_interception(struct vcpu_svm *svm)
1245{ 1243{
1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1244 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) 1245 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1251,7 +1249,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1251 return 1; 1249 return 1;
1252} 1250}
1253 1251
1254static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1252static int mc_interception(struct vcpu_svm *svm)
1255{ 1253{
1256 /* 1254 /*
1257 * On an #MC intercept the MCE handler is not called automatically in 1255 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1262,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1264 return 1; 1262 return 1;
1265} 1263}
1266 1264
1267static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1265static int shutdown_interception(struct vcpu_svm *svm)
1268{ 1266{
1267 struct kvm_run *kvm_run = svm->vcpu.run;
1268
1269 /* 1269 /*
1270 * VMCB is undefined after a SHUTDOWN intercept 1270 * VMCB is undefined after a SHUTDOWN intercept
1271 * so reinitialize it. 1271 * so reinitialize it.
@@ -1277,7 +1277,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1277 return 0; 1277 return 0;
1278} 1278}
1279 1279
1280static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1280static int io_interception(struct vcpu_svm *svm)
1281{ 1281{
1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1283 int size, in, string; 1283 int size, in, string;
@@ -1291,7 +1291,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1291 1291
1292 if (string) { 1292 if (string) {
1293 if (emulate_instruction(&svm->vcpu, 1293 if (emulate_instruction(&svm->vcpu,
1294 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1294 0, 0, 0) == EMULATE_DO_MMIO)
1295 return 0; 1295 return 0;
1296 return 1; 1296 return 1;
1297 } 1297 }
@@ -1301,33 +1301,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1302 1302
1303 skip_emulated_instruction(&svm->vcpu); 1303 skip_emulated_instruction(&svm->vcpu);
1304 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1304 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1305} 1305}
1306 1306
1307static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1307static int nmi_interception(struct vcpu_svm *svm)
1308{ 1308{
1309 return 1; 1309 return 1;
1310} 1310}
1311 1311
1312static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1312static int intr_interception(struct vcpu_svm *svm)
1313{ 1313{
1314 ++svm->vcpu.stat.irq_exits; 1314 ++svm->vcpu.stat.irq_exits;
1315 return 1; 1315 return 1;
1316} 1316}
1317 1317
1318static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1318static int nop_on_interception(struct vcpu_svm *svm)
1319{ 1319{
1320 return 1; 1320 return 1;
1321} 1321}
1322 1322
1323static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1323static int halt_interception(struct vcpu_svm *svm)
1324{ 1324{
1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1326 skip_emulated_instruction(&svm->vcpu); 1326 skip_emulated_instruction(&svm->vcpu);
1327 return kvm_emulate_halt(&svm->vcpu); 1327 return kvm_emulate_halt(&svm->vcpu);
1328} 1328}
1329 1329
1330static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1330static int vmmcall_interception(struct vcpu_svm *svm)
1331{ 1331{
1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1333 skip_emulated_instruction(&svm->vcpu); 1333 skip_emulated_instruction(&svm->vcpu);
@@ -1378,8 +1378,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1378 1378
1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1380 1380
1381 if (nested_svm_exit_handled(svm)) { 1381 if (svm->nested.intercept & 1ULL) {
1382 nsvm_printk("VMexit -> INTR\n"); 1382 /*
1383 * The #vmexit can't be emulated here directly because this
1384 * code path runs with irqs and preemtion disabled. A
1385 * #vmexit emulation might sleep. Only signal request for
1386 * the #vmexit here.
1387 */
1388 svm->nested.exit_required = true;
1389 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1383 return 1; 1390 return 1;
1384 } 1391 }
1385 1392
@@ -1390,10 +1397,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1390{ 1397{
1391 struct page *page; 1398 struct page *page;
1392 1399
1393 down_read(&current->mm->mmap_sem);
1394 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1400 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1395 up_read(&current->mm->mmap_sem);
1396
1397 if (is_error_page(page)) 1401 if (is_error_page(page))
1398 goto error; 1402 goto error;
1399 1403
@@ -1532,14 +1536,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1532 } 1536 }
1533 default: { 1537 default: {
1534 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1538 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1535 nsvm_printk("exit code: 0x%x\n", exit_code);
1536 if (svm->nested.intercept & exit_bits) 1539 if (svm->nested.intercept & exit_bits)
1537 vmexit = NESTED_EXIT_DONE; 1540 vmexit = NESTED_EXIT_DONE;
1538 } 1541 }
1539 } 1542 }
1540 1543
1541 if (vmexit == NESTED_EXIT_DONE) { 1544 if (vmexit == NESTED_EXIT_DONE) {
1542 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1543 nested_svm_vmexit(svm); 1545 nested_svm_vmexit(svm);
1544 } 1546 }
1545 1547
@@ -1584,6 +1586,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1584 struct vmcb *hsave = svm->nested.hsave; 1586 struct vmcb *hsave = svm->nested.hsave;
1585 struct vmcb *vmcb = svm->vmcb; 1587 struct vmcb *vmcb = svm->vmcb;
1586 1588
1589 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1590 vmcb->control.exit_info_1,
1591 vmcb->control.exit_info_2,
1592 vmcb->control.exit_int_info,
1593 vmcb->control.exit_int_info_err);
1594
1587 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1595 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1588 if (!nested_vmcb) 1596 if (!nested_vmcb)
1589 return 1; 1597 return 1;
@@ -1617,6 +1625,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1625 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1618 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1626 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1619 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1627 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1628
1629 /*
1630 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1631 * to make sure that we do not lose injected events. So check event_inj
1632 * here and copy it to exit_int_info if it is valid.
1633 * Exit_int_info and event_inj can't be both valid because the case
1634 * below only happens on a VMRUN instruction intercept which has
1635 * no valid exit_int_info set.
1636 */
1637 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1638 struct vmcb_control_area *nc = &nested_vmcb->control;
1639
1640 nc->exit_int_info = vmcb->control.event_inj;
1641 nc->exit_int_info_err = vmcb->control.event_inj_err;
1642 }
1643
1620 nested_vmcb->control.tlb_ctl = 0; 1644 nested_vmcb->control.tlb_ctl = 0;
1621 nested_vmcb->control.event_inj = 0; 1645 nested_vmcb->control.event_inj = 0;
1622 nested_vmcb->control.event_inj_err = 0; 1646 nested_vmcb->control.event_inj_err = 0;
@@ -1628,10 +1652,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1628 /* Restore the original control entries */ 1652 /* Restore the original control entries */
1629 copy_vmcb_control_area(vmcb, hsave); 1653 copy_vmcb_control_area(vmcb, hsave);
1630 1654
1631 /* Kill any pending exceptions */
1632 if (svm->vcpu.arch.exception.pending == true)
1633 nsvm_printk("WARNING: Pending Exception\n");
1634
1635 kvm_clear_exception_queue(&svm->vcpu); 1655 kvm_clear_exception_queue(&svm->vcpu);
1636 kvm_clear_interrupt_queue(&svm->vcpu); 1656 kvm_clear_interrupt_queue(&svm->vcpu);
1637 1657
@@ -1702,6 +1722,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1702 /* nested_vmcb is our indicator if nested SVM is activated */ 1722 /* nested_vmcb is our indicator if nested SVM is activated */
1703 svm->nested.vmcb = svm->vmcb->save.rax; 1723 svm->nested.vmcb = svm->vmcb->save.rax;
1704 1724
1725 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1726 nested_vmcb->save.rip,
1727 nested_vmcb->control.int_ctl,
1728 nested_vmcb->control.event_inj,
1729 nested_vmcb->control.nested_ctl);
1730
1705 /* Clear internal status */ 1731 /* Clear internal status */
1706 kvm_clear_exception_queue(&svm->vcpu); 1732 kvm_clear_exception_queue(&svm->vcpu);
1707 kvm_clear_interrupt_queue(&svm->vcpu); 1733 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1789,28 +1815,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1789 svm->nested.intercept = nested_vmcb->control.intercept; 1815 svm->nested.intercept = nested_vmcb->control.intercept;
1790 1816
1791 force_new_asid(&svm->vcpu); 1817 force_new_asid(&svm->vcpu);
1792 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1793 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1794 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1818 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1795 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1796 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1797 nested_vmcb->control.int_ctl);
1798 }
1799 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1819 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1800 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1820 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1801 else 1821 else
1802 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1822 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1803 1823
1804 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1805 nested_vmcb->control.exit_int_info,
1806 nested_vmcb->control.int_state);
1807
1808 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1824 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1809 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1825 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1810 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1826 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1811 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1812 nsvm_printk("Injecting Event: 0x%x\n",
1813 nested_vmcb->control.event_inj);
1814 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1827 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1815 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1828 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1816 1829
@@ -1837,7 +1850,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1837 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1850 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1838} 1851}
1839 1852
1840static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1853static int vmload_interception(struct vcpu_svm *svm)
1841{ 1854{
1842 struct vmcb *nested_vmcb; 1855 struct vmcb *nested_vmcb;
1843 1856
@@ -1857,7 +1870,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1857 return 1; 1870 return 1;
1858} 1871}
1859 1872
1860static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1873static int vmsave_interception(struct vcpu_svm *svm)
1861{ 1874{
1862 struct vmcb *nested_vmcb; 1875 struct vmcb *nested_vmcb;
1863 1876
@@ -1877,10 +1890,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1877 return 1; 1890 return 1;
1878} 1891}
1879 1892
1880static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1893static int vmrun_interception(struct vcpu_svm *svm)
1881{ 1894{
1882 nsvm_printk("VMrun\n");
1883
1884 if (nested_svm_check_permissions(svm)) 1895 if (nested_svm_check_permissions(svm))
1885 return 1; 1896 return 1;
1886 1897
@@ -1907,7 +1918,7 @@ failed:
1907 return 1; 1918 return 1;
1908} 1919}
1909 1920
1910static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1921static int stgi_interception(struct vcpu_svm *svm)
1911{ 1922{
1912 if (nested_svm_check_permissions(svm)) 1923 if (nested_svm_check_permissions(svm))
1913 return 1; 1924 return 1;
@@ -1920,7 +1931,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1920 return 1; 1931 return 1;
1921} 1932}
1922 1933
1923static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1934static int clgi_interception(struct vcpu_svm *svm)
1924{ 1935{
1925 if (nested_svm_check_permissions(svm)) 1936 if (nested_svm_check_permissions(svm))
1926 return 1; 1937 return 1;
@@ -1937,10 +1948,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1937 return 1; 1948 return 1;
1938} 1949}
1939 1950
1940static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1951static int invlpga_interception(struct vcpu_svm *svm)
1941{ 1952{
1942 struct kvm_vcpu *vcpu = &svm->vcpu; 1953 struct kvm_vcpu *vcpu = &svm->vcpu;
1943 nsvm_printk("INVLPGA\n"); 1954
1955 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1956 vcpu->arch.regs[VCPU_REGS_RAX]);
1944 1957
1945 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1958 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1946 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1959 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1963,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1950 return 1; 1963 return 1;
1951} 1964}
1952 1965
1953static int invalid_op_interception(struct vcpu_svm *svm, 1966static int skinit_interception(struct vcpu_svm *svm)
1954 struct kvm_run *kvm_run)
1955{ 1967{
1968 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1969
1956 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1970 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1957 return 1; 1971 return 1;
1958} 1972}
1959 1973
1960static int task_switch_interception(struct vcpu_svm *svm, 1974static int invalid_op_interception(struct vcpu_svm *svm)
1961 struct kvm_run *kvm_run) 1975{
1976 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1977 return 1;
1978}
1979
1980static int task_switch_interception(struct vcpu_svm *svm)
1962{ 1981{
1963 u16 tss_selector; 1982 u16 tss_selector;
1964 int reason; 1983 int reason;
@@ -2008,14 +2027,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
2008 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2027 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2009} 2028}
2010 2029
2011static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2030static int cpuid_interception(struct vcpu_svm *svm)
2012{ 2031{
2013 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2032 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2014 kvm_emulate_cpuid(&svm->vcpu); 2033 kvm_emulate_cpuid(&svm->vcpu);
2015 return 1; 2034 return 1;
2016} 2035}
2017 2036
2018static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2037static int iret_interception(struct vcpu_svm *svm)
2019{ 2038{
2020 ++svm->vcpu.stat.nmi_window_exits; 2039 ++svm->vcpu.stat.nmi_window_exits;
2021 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2040 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2023,26 +2042,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2023 return 1; 2042 return 1;
2024} 2043}
2025 2044
2026static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2045static int invlpg_interception(struct vcpu_svm *svm)
2027{ 2046{
2028 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2047 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2029 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2048 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2030 return 1; 2049 return 1;
2031} 2050}
2032 2051
2033static int emulate_on_interception(struct vcpu_svm *svm, 2052static int emulate_on_interception(struct vcpu_svm *svm)
2034 struct kvm_run *kvm_run)
2035{ 2053{
2036 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2054 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2037 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2055 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2038 return 1; 2056 return 1;
2039} 2057}
2040 2058
2041static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2059static int cr8_write_interception(struct vcpu_svm *svm)
2042{ 2060{
2061 struct kvm_run *kvm_run = svm->vcpu.run;
2062
2043 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2063 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2044 /* instruction emulation calls kvm_set_cr8() */ 2064 /* instruction emulation calls kvm_set_cr8() */
2045 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2065 emulate_instruction(&svm->vcpu, 0, 0, 0);
2046 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2066 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2047 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2067 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2048 return 1; 2068 return 1;
@@ -2128,7 +2148,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2128 return 0; 2148 return 0;
2129} 2149}
2130 2150
2131static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2151static int rdmsr_interception(struct vcpu_svm *svm)
2132{ 2152{
2133 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2153 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2134 u64 data; 2154 u64 data;
@@ -2221,7 +2241,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2221 return 0; 2241 return 0;
2222} 2242}
2223 2243
2224static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2244static int wrmsr_interception(struct vcpu_svm *svm)
2225{ 2245{
2226 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2246 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2227 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2247 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2237,17 +2257,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2237 return 1; 2257 return 1;
2238} 2258}
2239 2259
2240static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2260static int msr_interception(struct vcpu_svm *svm)
2241{ 2261{
2242 if (svm->vmcb->control.exit_info_1) 2262 if (svm->vmcb->control.exit_info_1)
2243 return wrmsr_interception(svm, kvm_run); 2263 return wrmsr_interception(svm);
2244 else 2264 else
2245 return rdmsr_interception(svm, kvm_run); 2265 return rdmsr_interception(svm);
2246} 2266}
2247 2267
2248static int interrupt_window_interception(struct vcpu_svm *svm, 2268static int interrupt_window_interception(struct vcpu_svm *svm)
2249 struct kvm_run *kvm_run)
2250{ 2269{
2270 struct kvm_run *kvm_run = svm->vcpu.run;
2271
2251 svm_clear_vintr(svm); 2272 svm_clear_vintr(svm);
2252 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2273 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2253 /* 2274 /*
@@ -2265,8 +2286,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2265 return 1; 2286 return 1;
2266} 2287}
2267 2288
2268static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2289static int pause_interception(struct vcpu_svm *svm)
2269 struct kvm_run *kvm_run) = { 2290{
2291 kvm_vcpu_on_spin(&(svm->vcpu));
2292 return 1;
2293}
2294
2295static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2270 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2296 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2271 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2297 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2272 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2298 [SVM_EXIT_READ_CR4] = emulate_on_interception,
@@ -2301,6 +2327,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2301 [SVM_EXIT_CPUID] = cpuid_interception, 2327 [SVM_EXIT_CPUID] = cpuid_interception,
2302 [SVM_EXIT_IRET] = iret_interception, 2328 [SVM_EXIT_IRET] = iret_interception,
2303 [SVM_EXIT_INVD] = emulate_on_interception, 2329 [SVM_EXIT_INVD] = emulate_on_interception,
2330 [SVM_EXIT_PAUSE] = pause_interception,
2304 [SVM_EXIT_HLT] = halt_interception, 2331 [SVM_EXIT_HLT] = halt_interception,
2305 [SVM_EXIT_INVLPG] = invlpg_interception, 2332 [SVM_EXIT_INVLPG] = invlpg_interception,
2306 [SVM_EXIT_INVLPGA] = invlpga_interception, 2333 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2314,26 +2341,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2314 [SVM_EXIT_VMSAVE] = vmsave_interception, 2341 [SVM_EXIT_VMSAVE] = vmsave_interception,
2315 [SVM_EXIT_STGI] = stgi_interception, 2342 [SVM_EXIT_STGI] = stgi_interception,
2316 [SVM_EXIT_CLGI] = clgi_interception, 2343 [SVM_EXIT_CLGI] = clgi_interception,
2317 [SVM_EXIT_SKINIT] = invalid_op_interception, 2344 [SVM_EXIT_SKINIT] = skinit_interception,
2318 [SVM_EXIT_WBINVD] = emulate_on_interception, 2345 [SVM_EXIT_WBINVD] = emulate_on_interception,
2319 [SVM_EXIT_MONITOR] = invalid_op_interception, 2346 [SVM_EXIT_MONITOR] = invalid_op_interception,
2320 [SVM_EXIT_MWAIT] = invalid_op_interception, 2347 [SVM_EXIT_MWAIT] = invalid_op_interception,
2321 [SVM_EXIT_NPF] = pf_interception, 2348 [SVM_EXIT_NPF] = pf_interception,
2322}; 2349};
2323 2350
2324static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2351static int handle_exit(struct kvm_vcpu *vcpu)
2325{ 2352{
2326 struct vcpu_svm *svm = to_svm(vcpu); 2353 struct vcpu_svm *svm = to_svm(vcpu);
2354 struct kvm_run *kvm_run = vcpu->run;
2327 u32 exit_code = svm->vmcb->control.exit_code; 2355 u32 exit_code = svm->vmcb->control.exit_code;
2328 2356
2329 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2357 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2330 2358
2359 if (unlikely(svm->nested.exit_required)) {
2360 nested_svm_vmexit(svm);
2361 svm->nested.exit_required = false;
2362
2363 return 1;
2364 }
2365
2331 if (is_nested(svm)) { 2366 if (is_nested(svm)) {
2332 int vmexit; 2367 int vmexit;
2333 2368
2334 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2369 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2335 exit_code, svm->vmcb->control.exit_info_1, 2370 svm->vmcb->control.exit_info_1,
2336 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2371 svm->vmcb->control.exit_info_2,
2372 svm->vmcb->control.exit_int_info,
2373 svm->vmcb->control.exit_int_info_err);
2337 2374
2338 vmexit = nested_svm_exit_special(svm); 2375 vmexit = nested_svm_exit_special(svm);
2339 2376
@@ -2383,15 +2420,15 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2383 return 0; 2420 return 0;
2384 } 2421 }
2385 2422
2386 return svm_exit_handlers[exit_code](svm, kvm_run); 2423 return svm_exit_handlers[exit_code](svm);
2387} 2424}
2388 2425
2389static void reload_tss(struct kvm_vcpu *vcpu) 2426static void reload_tss(struct kvm_vcpu *vcpu)
2390{ 2427{
2391 int cpu = raw_smp_processor_id(); 2428 int cpu = raw_smp_processor_id();
2392 2429
2393 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2430 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2394 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ 2431 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
2395 load_TR_desc(); 2432 load_TR_desc();
2396} 2433}
2397 2434
@@ -2399,12 +2436,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
2399{ 2436{
2400 int cpu = raw_smp_processor_id(); 2437 int cpu = raw_smp_processor_id();
2401 2438
2402 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2439 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2403 2440
2404 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 2441 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2405 /* FIXME: handle wraparound of asid_generation */ 2442 /* FIXME: handle wraparound of asid_generation */
2406 if (svm->asid_generation != svm_data->asid_generation) 2443 if (svm->asid_generation != sd->asid_generation)
2407 new_asid(svm, svm_data); 2444 new_asid(svm, sd);
2408} 2445}
2409 2446
2410static void svm_inject_nmi(struct kvm_vcpu *vcpu) 2447static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2460,20 +2497,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2460 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2497 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2461} 2498}
2462 2499
2500static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2501{
2502 struct vcpu_svm *svm = to_svm(vcpu);
2503
2504 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2505}
2506
2507static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2508{
2509 struct vcpu_svm *svm = to_svm(vcpu);
2510
2511 if (masked) {
2512 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2513 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2514 } else {
2515 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2516 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
2517 }
2518}
2519
2463static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2520static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2464{ 2521{
2465 struct vcpu_svm *svm = to_svm(vcpu); 2522 struct vcpu_svm *svm = to_svm(vcpu);
2466 struct vmcb *vmcb = svm->vmcb; 2523 struct vmcb *vmcb = svm->vmcb;
2467 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2524 int ret;
2468 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2525
2469 gif_set(svm) && 2526 if (!gif_set(svm) ||
2470 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2527 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2528 return 0;
2529
2530 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2531
2532 if (is_nested(svm))
2533 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2534
2535 return ret;
2471} 2536}
2472 2537
2473static void enable_irq_window(struct kvm_vcpu *vcpu) 2538static void enable_irq_window(struct kvm_vcpu *vcpu)
2474{ 2539{
2475 struct vcpu_svm *svm = to_svm(vcpu); 2540 struct vcpu_svm *svm = to_svm(vcpu);
2476 nsvm_printk("Trying to open IRQ window\n");
2477 2541
2478 nested_svm_intr(svm); 2542 nested_svm_intr(svm);
2479 2543
@@ -2498,7 +2562,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2498 /* Something prevents NMI from been injected. Single step over 2562 /* Something prevents NMI from been injected. Single step over
2499 possible problem (IRET or exception injection or interrupt 2563 possible problem (IRET or exception injection or interrupt
2500 shadow) */ 2564 shadow) */
2501 vcpu->arch.singlestep = true; 2565 svm->nmi_singlestep = true;
2502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2566 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2503 update_db_intercept(vcpu); 2567 update_db_intercept(vcpu);
2504} 2568}
@@ -2588,13 +2652,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2588#define R "e" 2652#define R "e"
2589#endif 2653#endif
2590 2654
2591static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2655static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2592{ 2656{
2593 struct vcpu_svm *svm = to_svm(vcpu); 2657 struct vcpu_svm *svm = to_svm(vcpu);
2594 u16 fs_selector; 2658 u16 fs_selector;
2595 u16 gs_selector; 2659 u16 gs_selector;
2596 u16 ldt_selector; 2660 u16 ldt_selector;
2597 2661
2662 /*
2663 * A vmexit emulation is required before the vcpu can be executed
2664 * again.
2665 */
2666 if (unlikely(svm->nested.exit_required))
2667 return;
2668
2598 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2669 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2599 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2670 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2600 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2671 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2893,6 +2964,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2893 .queue_exception = svm_queue_exception, 2964 .queue_exception = svm_queue_exception,
2894 .interrupt_allowed = svm_interrupt_allowed, 2965 .interrupt_allowed = svm_interrupt_allowed,
2895 .nmi_allowed = svm_nmi_allowed, 2966 .nmi_allowed = svm_nmi_allowed,
2967 .get_nmi_mask = svm_get_nmi_mask,
2968 .set_nmi_mask = svm_set_nmi_mask,
2896 .enable_nmi_window = enable_nmi_window, 2969 .enable_nmi_window = enable_nmi_window,
2897 .enable_irq_window = enable_irq_window, 2970 .enable_irq_window = enable_irq_window,
2898 .update_cr8_intercept = update_cr8_intercept, 2971 .update_cr8_intercept = update_cr8_intercept,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..816e0449db0b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 349 __entry->coalesced ? " (coalesced)" : "")
350); 350);
351 351
352/*
353 * Tracepoint for nested VMRUN
354 */
355TRACE_EVENT(kvm_nested_vmrun,
356 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
357 __u32 event_inj, bool npt),
358 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
359
360 TP_STRUCT__entry(
361 __field( __u64, rip )
362 __field( __u64, vmcb )
363 __field( __u64, nested_rip )
364 __field( __u32, int_ctl )
365 __field( __u32, event_inj )
366 __field( bool, npt )
367 ),
368
369 TP_fast_assign(
370 __entry->rip = rip;
371 __entry->vmcb = vmcb;
372 __entry->nested_rip = nested_rip;
373 __entry->int_ctl = int_ctl;
374 __entry->event_inj = event_inj;
375 __entry->npt = npt;
376 ),
377
378 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
379 "event_inj: 0x%08x npt: %s\n",
380 __entry->rip, __entry->vmcb, __entry->nested_rip,
381 __entry->int_ctl, __entry->event_inj,
382 __entry->npt ? "on" : "off")
383);
384
385/*
386 * Tracepoint for #VMEXIT while nested
387 */
388TRACE_EVENT(kvm_nested_vmexit,
389 TP_PROTO(__u64 rip, __u32 exit_code,
390 __u64 exit_info1, __u64 exit_info2,
391 __u32 exit_int_info, __u32 exit_int_info_err),
392 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
393 exit_int_info, exit_int_info_err),
394
395 TP_STRUCT__entry(
396 __field( __u64, rip )
397 __field( __u32, exit_code )
398 __field( __u64, exit_info1 )
399 __field( __u64, exit_info2 )
400 __field( __u32, exit_int_info )
401 __field( __u32, exit_int_info_err )
402 ),
403
404 TP_fast_assign(
405 __entry->rip = rip;
406 __entry->exit_code = exit_code;
407 __entry->exit_info1 = exit_info1;
408 __entry->exit_info2 = exit_info2;
409 __entry->exit_int_info = exit_int_info;
410 __entry->exit_int_info_err = exit_int_info_err;
411 ),
412 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
413 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
414 __entry->rip,
415 ftrace_print_symbols_seq(p, __entry->exit_code,
416 kvm_x86_ops->exit_reasons_str),
417 __entry->exit_info1, __entry->exit_info2,
418 __entry->exit_int_info, __entry->exit_int_info_err)
419);
420
421/*
422 * Tracepoint for #VMEXIT reinjected to the guest
423 */
424TRACE_EVENT(kvm_nested_vmexit_inject,
425 TP_PROTO(__u32 exit_code,
426 __u64 exit_info1, __u64 exit_info2,
427 __u32 exit_int_info, __u32 exit_int_info_err),
428 TP_ARGS(exit_code, exit_info1, exit_info2,
429 exit_int_info, exit_int_info_err),
430
431 TP_STRUCT__entry(
432 __field( __u32, exit_code )
433 __field( __u64, exit_info1 )
434 __field( __u64, exit_info2 )
435 __field( __u32, exit_int_info )
436 __field( __u32, exit_int_info_err )
437 ),
438
439 TP_fast_assign(
440 __entry->exit_code = exit_code;
441 __entry->exit_info1 = exit_info1;
442 __entry->exit_info2 = exit_info2;
443 __entry->exit_int_info = exit_int_info;
444 __entry->exit_int_info_err = exit_int_info_err;
445 ),
446
447 TP_printk("reason: %s ext_inf1: 0x%016llx "
448 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
449 ftrace_print_symbols_seq(p, __entry->exit_code,
450 kvm_x86_ops->exit_reasons_str),
451 __entry->exit_info1, __entry->exit_info2,
452 __entry->exit_int_info, __entry->exit_int_info_err)
453);
454
455/*
456 * Tracepoint for nested #vmexit because of interrupt pending
457 */
458TRACE_EVENT(kvm_nested_intr_vmexit,
459 TP_PROTO(__u64 rip),
460 TP_ARGS(rip),
461
462 TP_STRUCT__entry(
463 __field( __u64, rip )
464 ),
465
466 TP_fast_assign(
467 __entry->rip = rip
468 ),
469
470 TP_printk("rip: 0x%016llx\n", __entry->rip)
471);
472
473/*
474 * Tracepoint for nested #vmexit because of interrupt pending
475 */
476TRACE_EVENT(kvm_invlpga,
477 TP_PROTO(__u64 rip, int asid, u64 address),
478 TP_ARGS(rip, asid, address),
479
480 TP_STRUCT__entry(
481 __field( __u64, rip )
482 __field( int, asid )
483 __field( __u64, address )
484 ),
485
486 TP_fast_assign(
487 __entry->rip = rip;
488 __entry->asid = asid;
489 __entry->address = address;
490 ),
491
492 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
493 __entry->rip, __entry->asid, __entry->address)
494);
495
496/*
497 * Tracepoint for nested #vmexit because of interrupt pending
498 */
499TRACE_EVENT(kvm_skinit,
500 TP_PROTO(__u64 rip, __u32 slb),
501 TP_ARGS(rip, slb),
502
503 TP_STRUCT__entry(
504 __field( __u64, rip )
505 __field( __u32, slb )
506 ),
507
508 TP_fast_assign(
509 __entry->rip = rip;
510 __entry->slb = slb;
511 ),
512
513 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
514 __entry->rip, __entry->slb)
515);
516
352#endif /* _TRACE_KVM_H */ 517#endif /* _TRACE_KVM_H */
353 518
354/* This part must be outside protection */ 519/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..d4918d6fc924 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive
67 * executions of PAUSE in a loop. Also indicate if ple enabled.
68 * According to test, this time is usually small than 41 cycles.
69 * ple_window: upper bound on the amount of time a guest is allowed to execute
70 * in a PAUSE loop. Tests indicate that most spinlocks are held for
71 * less than 2^12 cycles
72 * Time is measured based on a counter that runs at the same rate as the TSC,
73 * refer SDM volume 3b section 21.6.13 & 22.1.3.
74 */
75#define KVM_VMX_DEFAULT_PLE_GAP 41
76#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
77static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
78module_param(ple_gap, int, S_IRUGO);
79
80static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
81module_param(ple_window, int, S_IRUGO);
82
64struct vmcs { 83struct vmcs {
65 u32 revision_id; 84 u32 revision_id;
66 u32 abort; 85 u32 abort;
67 char data[0]; 86 char data[0];
68}; 87};
69 88
89struct shared_msr_entry {
90 unsigned index;
91 u64 data;
92 u64 mask;
93};
94
70struct vcpu_vmx { 95struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 96 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 97 struct list_head local_vcpus_link;
@@ -74,13 +99,12 @@ struct vcpu_vmx {
74 int launched; 99 int launched;
75 u8 fail; 100 u8 fail;
76 u32 idt_vectoring_info; 101 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 102 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 103 int nmsrs;
80 int save_nmsrs; 104 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 106 u64 msr_host_kernel_gs_base;
107 u64 msr_guest_kernel_gs_base;
84#endif 108#endif
85 struct vmcs *vmcs; 109 struct vmcs *vmcs;
86 struct { 110 struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 112 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 113 int gs_ldt_reload_needed;
90 int fs_reload_needed; 114 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 115 } host_state;
93 struct { 116 struct {
94 int vm86_active; 117 int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
107 } rmode; 130 } rmode;
108 int vpid; 131 int vpid;
109 bool emulation_required; 132 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 133
112 /* Support for vnmi-less CPUs */ 134 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 135 int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 198 VMX_SEGMENT_FIELD(LDTR),
177}; 199};
178 200
201static u64 host_efer;
202
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 203static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 204
181/* 205/*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 208 */
185static const u32 vmx_msr_index[] = { 209static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 210#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 212#endif
189 MSR_EFER, MSR_K6_STAR, 213 MSR_EFER, MSR_K6_STAR,
190}; 214};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 216
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 217static inline int is_page_fault(u32 intr_info)
210{ 218{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 219 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 328 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 329}
322 330
331static inline int cpu_has_vmx_ple(void)
332{
333 return vmcs_config.cpu_based_2nd_exec_ctrl &
334 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
335}
336
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 338{
325 return flexpriority_enabled && 339 return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 362 int i;
349 363
350 for (i = 0; i < vmx->nmsrs; ++i) 364 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 365 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 366 return i;
353 return -1; 367 return -1;
354} 368}
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 393 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 394}
381 395
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 396static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 397{
384 int i; 398 int i;
385 399
@@ -570,17 +584,12 @@ static void reload_tss(void)
570 load_TR_desc(); 584 load_TR_desc();
571} 585}
572 586
573static void load_transition_efer(struct vcpu_vmx *vmx) 587static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 588{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 589 u64 guest_efer;
578 u64 ignore_bits; 590 u64 ignore_bits;
579 591
580 if (efer_offset < 0) 592 guest_efer = vmx->vcpu.arch.shadow_efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 593
585 /* 594 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 595 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 602 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 603 ignore_bits &= ~(u64)EFER_SCE;
595#endif 604#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 605 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 606 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 607 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 608 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 609 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 610}
613 611
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 612static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 613{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 614 struct vcpu_vmx *vmx = to_vmx(vcpu);
615 int i;
617 616
618 if (vmx->host_state.loaded) 617 if (vmx->host_state.loaded)
619 return; 618 return;
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 649#endif
651 650
652#ifdef CONFIG_X86_64 651#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 652 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 653 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 654 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 655 }
657#endif 656#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 657 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 658 kvm_set_shared_msr(vmx->guest_msrs[i].index,
659 vmx->guest_msrs[i].data,
660 vmx->guest_msrs[i].mask);
660} 661}
661 662
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 663static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 685 local_irq_restore(flags);
685 } 686 }
686 reload_tss(); 687 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 688#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 689 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 690 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
691 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
692 }
693#endif
690} 694}
691 695
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 696static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
877/* 881/*
878 * Swap MSR entry in host/guest MSR entry array. 882 * Swap MSR entry in host/guest MSR entry array.
879 */ 883 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 884static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 885{
883 struct kvm_msr_entry tmp; 886 struct shared_msr_entry tmp;
884 887
885 tmp = vmx->guest_msrs[to]; 888 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 889 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 890 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 891}
892#endif
893 892
894/* 893/*
895 * Set up the vmcs to automatically save and restore system 894 * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 897 */
899static void setup_msrs(struct vcpu_vmx *vmx) 898static void setup_msrs(struct vcpu_vmx *vmx)
900{ 899{
901 int save_nmsrs; 900 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 901 unsigned long *msr_bitmap;
903 902
904 vmx_load_host_state(vmx); 903 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 904 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 905#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 906 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 907 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 908 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 909 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 913 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 914 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 915 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
920 if (index >= 0)
921 move_msr_up(vmx, index, save_nmsrs++);
922 /* 916 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 917 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 918 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
928 move_msr_up(vmx, index, save_nmsrs++); 922 move_msr_up(vmx, index, save_nmsrs++);
929 } 923 }
930#endif 924#endif
931 vmx->save_nmsrs = save_nmsrs; 925 index = __find_msr_index(vmx, MSR_EFER);
926 if (index >= 0 && update_transition_efer(vmx, index))
927 move_msr_up(vmx, index, save_nmsrs++);
932 928
933#ifdef CONFIG_X86_64 929 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 930
939 if (cpu_has_vmx_msr_bitmap()) { 931 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 932 if (is_long_mode(&vmx->vcpu))
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 968static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 969{
978 u64 data; 970 u64 data;
979 struct kvm_msr_entry *msr; 971 struct shared_msr_entry *msr;
980 972
981 if (!pdata) { 973 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 974 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 983 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 984 data = vmcs_readl(GUEST_GS_BASE);
993 break; 985 break;
986 case MSR_KERNEL_GS_BASE:
987 vmx_load_host_state(to_vmx(vcpu));
988 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
989 break;
990#endif
994 case MSR_EFER: 991 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 992 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 993 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 994 data = guest_read_tsc();
999 break; 995 break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1003 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1004 break;
1009 default: 1005 default:
1006 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1007 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1008 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1009 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1025static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1026{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1027 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1028 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1029 u64 host_tsc;
1033 int ret = 0; 1030 int ret = 0;
1034 1031
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1041 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1042 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1043 break;
1044 case MSR_KERNEL_GS_BASE:
1045 vmx_load_host_state(vmx);
1046 vmx->msr_guest_kernel_gs_base = data;
1047 break;
1047#endif 1048#endif
1048 case MSR_IA32_SYSENTER_CS: 1049 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1050 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1098 }
1098} 1099}
1099 1100
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1101static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1102{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1104 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1105 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1106 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1107
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1108 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1109}
1125 1110
1126static __init int cpu_has_kvm_support(void) 1111static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1124 /* locked but not enabled */
1140} 1125}
1141 1126
1142static void hardware_enable(void *garbage) 1127static int hardware_enable(void *garbage)
1143{ 1128{
1144 int cpu = raw_smp_processor_id(); 1129 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1130 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1131 u64 old;
1147 1132
1133 if (read_cr4() & X86_CR4_VMXE)
1134 return -EBUSY;
1135
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1136 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1137 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1138 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1147 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1148 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1149 : "memory", "cc");
1150
1151 ept_sync_global();
1152
1153 return 0;
1162} 1154}
1163 1155
1164static void vmclear_local_vcpus(void) 1156static void vmclear_local_vcpus(void)
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1242 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1243 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1244 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1245 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1254 if (adjust_vmx_controls(min2, opt2, 1247 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1248 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1249 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
1344{ 1337{
1345 int cpu; 1338 int cpu;
1346 1339
1347 for_each_online_cpu(cpu) 1340 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1341 free_vmcs(per_cpu(vmxarea, cpu));
1342 per_cpu(vmxarea, cpu) = NULL;
1343 }
1349} 1344}
1350 1345
1351static __init int alloc_kvm_area(void) 1346static __init int alloc_kvm_area(void)
1352{ 1347{
1353 int cpu; 1348 int cpu;
1354 1349
1355 for_each_online_cpu(cpu) { 1350 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1351 struct vmcs *vmcs;
1357 1352
1358 vmcs = alloc_vmcs_cpu(cpu); 1353 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1389 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1390 kvm_disable_largepages();
1396 1391
1392 if (!cpu_has_vmx_ple())
1393 ple_gap = 0;
1394
1397 return alloc_kvm_area(); 1395 return alloc_kvm_area();
1398} 1396}
1399 1397
@@ -1536,8 +1534,16 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1534static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1535{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1536 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1537 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1538
1539 if (!msr)
1540 return;
1540 1541
1542 /*
1543 * Force kernel_gs_base reloading before EFER changes, as control
1544 * of this msr depends on is_long_mode().
1545 */
1546 vmx_load_host_state(to_vmx(vcpu));
1541 vcpu->arch.shadow_efer = efer; 1547 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1548 if (!msr)
1543 return; 1549 return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1733 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1734 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1735 vcpu->kvm->arch.ept_identity_map_addr;
1736 ept_load_pdptrs(vcpu);
1730 } 1737 }
1731 1738
1732 vmx_flush_tlb(vcpu); 1739 vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2309 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2310 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2311 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2312 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2313 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2314 enable_unrestricted_guest = 0;
2315 }
2307 if (!enable_unrestricted_guest) 2316 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2317 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2318 if (!ple_gap)
2319 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2320 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2321 }
2311 2322
2323 if (ple_gap) {
2324 vmcs_write32(PLE_GAP, ple_gap);
2325 vmcs_write32(PLE_WINDOW, ple_window);
2326 }
2327
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2328 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2329 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2330 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2392 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2393 continue;
2378 data = data_low | ((u64)data_high << 32); 2394 data = data_low | ((u64)data_high << 32);
2379 vmx->host_msrs[j].index = index; 2395 vmx->guest_msrs[j].index = i;
2380 vmx->host_msrs[j].reserved = 0; 2396 vmx->guest_msrs[j].data = 0;
2381 vmx->host_msrs[j].data = data; 2397 vmx->guest_msrs[j].mask = -1ull;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2398 ++vmx->nmsrs;
2384 } 2399 }
2385 2400
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2525 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2527
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2530 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2531 vmx_set_efer(&vmx->vcpu, 0);
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2627 GUEST_INTR_STATE_NMI)); 2642 GUEST_INTR_STATE_NMI));
2628} 2643}
2629 2644
2645static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2646{
2647 if (!cpu_has_virtual_nmis())
2648 return to_vmx(vcpu)->soft_vnmi_blocked;
2649 else
2650 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2651 GUEST_INTR_STATE_NMI);
2652}
2653
2654static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2655{
2656 struct vcpu_vmx *vmx = to_vmx(vcpu);
2657
2658 if (!cpu_has_virtual_nmis()) {
2659 if (vmx->soft_vnmi_blocked != masked) {
2660 vmx->soft_vnmi_blocked = masked;
2661 vmx->vnmi_blocked_time = 0;
2662 }
2663 } else {
2664 if (masked)
2665 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2666 GUEST_INTR_STATE_NMI);
2667 else
2668 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2669 GUEST_INTR_STATE_NMI);
2670 }
2671}
2672
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2673static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2631{ 2674{
2632 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 2675 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2702 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2703 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2704 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2705 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2706 return 1;
2664 /* 2707 /*
2665 * Forward all other exceptions that are valid in real mode. 2708 * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
2710#endif 2753#endif
2711} 2754}
2712 2755
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2756static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2757{
2715 /* already handled by vcpu_run */ 2758 /* already handled by vcpu_run */
2716 return 1; 2759 return 1;
2717} 2760}
2718 2761
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2762static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2763{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2764 struct vcpu_vmx *vmx = to_vmx(vcpu);
2765 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2766 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2767 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2768 u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2772 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2773
2730 if (is_machine_check(intr_info)) 2774 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2775 return handle_machine_check(vcpu);
2732 2776
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2777 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2778 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2779 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2780 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2781 vcpu->run->internal.ndata = 2;
2782 vcpu->run->internal.data[0] = vect_info;
2783 vcpu->run->internal.data[1] = intr_info;
2784 return 0;
2785 }
2737 2786
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2787 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2788 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2793 }
2745 2794
2746 if (is_invalid_opcode(intr_info)) { 2795 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2796 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2797 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2798 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2799 return 1;
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2852 return 0;
2804} 2853}
2805 2854
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2855static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2856{
2809 ++vcpu->stat.irq_exits; 2857 ++vcpu->stat.irq_exits;
2810 return 1; 2858 return 1;
2811} 2859}
2812 2860
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2861static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2862{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2863 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2864 return 0;
2817} 2865}
2818 2866
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2867static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2868{
2821 unsigned long exit_qualification; 2869 unsigned long exit_qualification;
2822 int size, in, string; 2870 int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2875 string = (exit_qualification & 16) != 0;
2828 2876
2829 if (string) { 2877 if (string) {
2830 if (emulate_instruction(vcpu, 2878 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2879 return 0;
2833 return 1; 2880 return 1;
2834 } 2881 }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2885 port = exit_qualification >> 16;
2839 2886
2840 skip_emulated_instruction(vcpu); 2887 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2888 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2889}
2843 2890
2844static void 2891static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2899 hypercall[2] = 0xc1;
2853} 2900}
2854 2901
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2902static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2903{
2857 unsigned long exit_qualification, val; 2904 unsigned long exit_qualification, val;
2858 int cr; 2905 int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 2934 return 1;
2888 if (cr8_prev <= cr8) 2935 if (cr8_prev <= cr8)
2889 return 1; 2936 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2937 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 2938 return 0;
2892 } 2939 }
2893 }; 2940 };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2922 default: 2969 default:
2923 break; 2970 break;
2924 } 2971 }
2925 kvm_run->exit_reason = 0; 2972 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 2973 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 2974 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 2975 return 0;
2929} 2976}
2930 2977
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2978static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 2979{
2933 unsigned long exit_qualification; 2980 unsigned long exit_qualification;
2934 unsigned long val; 2981 unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 2991 * guest debugging itself.
2945 */ 2992 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 2993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 2994 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 2995 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 2996 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 2997 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 2998 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 2999 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3000 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3001 return 0;
2955 } else { 3002 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3003 vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3063 return 1;
3017} 3064}
3018 3065
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3066static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3067{
3021 kvm_emulate_cpuid(vcpu); 3068 kvm_emulate_cpuid(vcpu);
3022 return 1; 3069 return 1;
3023} 3070}
3024 3071
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3072static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3073{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3074 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3075 u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3088 return 1;
3042} 3089}
3043 3090
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3091static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3092{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3093 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3058 return 1; 3105 return 1;
3059} 3106}
3060 3107
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3108static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3109{
3064 return 1; 3110 return 1;
3065} 3111}
3066 3112
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3113static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3114{
3070 u32 cpu_based_vm_exec_control; 3115 u32 cpu_based_vm_exec_control;
3071 3116
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3126 * possible
3082 */ 3127 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3128 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3129 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3130 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3131 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3132 return 0;
3088 } 3133 }
3089 return 1; 3134 return 1;
3090} 3135}
3091 3136
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3137static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3138{
3094 skip_emulated_instruction(vcpu); 3139 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3140 return kvm_emulate_halt(vcpu);
3096} 3141}
3097 3142
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3143static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3144{
3100 skip_emulated_instruction(vcpu); 3145 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3146 kvm_emulate_hypercall(vcpu);
3102 return 1; 3147 return 1;
3103} 3148}
3104 3149
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3150static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3151{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3152 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3153 return 1;
3109} 3154}
3110 3155
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3156static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3157{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3158 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3159
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3162 return 1;
3118} 3163}
3119 3164
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3165static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3166{
3122 skip_emulated_instruction(vcpu); 3167 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3168 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3169 return 1;
3125} 3170}
3126 3171
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3172static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3173{
3129 unsigned long exit_qualification; 3174 unsigned long exit_qualification;
3130 enum emulation_result er; 3175 enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3178 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3179 offset = exit_qualification & 0xffful;
3135 3180
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3181 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3182
3138 if (er != EMULATE_DONE) { 3183 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3184 printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3189 return 1;
3145} 3190}
3146 3191
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3192static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3193{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3194 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3195 unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3243 return 1;
3199} 3244}
3200 3245
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3246static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3247{
3203 unsigned long exit_qualification; 3248 unsigned long exit_qualification;
3204 gpa_t gpa; 3249 gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3264 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3265 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3266 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3267 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3268 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3269 return 0;
3225 } 3270 }
3226 3271
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3335 }
3291} 3336}
3292 3337
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3338static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3339{
3295 u64 sptes[4]; 3340 u64 sptes[4];
3296 int nr_sptes, i; 3341 int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3351 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3352 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3353
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3354 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3355 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3356
3312 return 0; 3357 return 0;
3313} 3358}
3314 3359
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3360static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3361{
3317 u32 cpu_based_vm_exec_control; 3362 u32 cpu_based_vm_exec_control;
3318 3363
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3370 return 1;
3326} 3371}
3327 3372
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3373static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3374{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3375 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3376 enum emulation_result err = EMULATE_DONE;
3333 3377 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3378
3337 while (!guest_state_valid(vcpu)) { 3379 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3380 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3381
3340 if (err == EMULATE_DO_MMIO) 3382 if (err == EMULATE_DO_MMIO) {
3341 break; 3383 ret = 0;
3384 goto out;
3385 }
3342 3386
3343 if (err != EMULATE_DONE) { 3387 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3345 break; 3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0;
3392 ret = 0;
3393 goto out;
3346 } 3394 }
3347 3395
3348 if (signal_pending(current)) 3396 if (signal_pending(current))
3349 break; 3397 goto out;
3350 if (need_resched()) 3398 if (need_resched())
3351 schedule(); 3399 schedule();
3352 } 3400 }
3353 3401
3354 preempt_disable(); 3402 vmx->emulation_required = 0;
3355 local_irq_disable(); 3403out:
3404 return ret;
3405}
3356 3406
3357 vmx->invalid_state_emulation_result = err; 3407/*
3408 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
3409 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
3410 */
3411static int handle_pause(struct kvm_vcpu *vcpu)
3412{
3413 skip_emulated_instruction(vcpu);
3414 kvm_vcpu_on_spin(vcpu);
3415
3416 return 1;
3358} 3417}
3359 3418
3360/* 3419/*
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3422 * to be done to userspace and return 0.
3364 */ 3423 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3424static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3425 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3426 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3427 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3452 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3397}; 3456};
3398 3457
3399static const int kvm_vmx_max_exit_handlers = 3458static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3462 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3463 * assistance.
3405 */ 3464 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3465static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3466{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3467 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3468 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3470
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3471 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3472
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3473 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3474 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3475 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3476
3422 /* Access CR3 don't cause VMExit in paging mode, so we need 3477 /* Access CR3 don't cause VMExit in paging mode, so we need
3423 * to sync with guest real CR3. */ 3478 * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3480 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3481
3427 if (unlikely(vmx->fail)) { 3482 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3483 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3484 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3485 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3486 return 0;
3432 } 3487 }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3514
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3515 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3516 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3517 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3518 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3519 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3520 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3521 }
3467 return 0; 3522 return 0;
3468} 3523}
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3655#define Q "l"
3601#endif 3656#endif
3602 3657
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3658static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3659{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3660 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3661
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3662 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3663 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3664 vmx->entry_time = ktime_get();
3614 3665
3615 /* Handle invalid guest state instead of entering VMX */ 3666 /* Don't enter VMX if guest state is invalid, let the exit handler
3616 if (vmx->emulation_required && emulate_invalid_guest_state) { 3667 start emulation until we arrive back to a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3668 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3669 return;
3619 }
3620 3670
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3671 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3672 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3825 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3826 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3827 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3828 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3829 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3830 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3851 goto uninit_vcpu;
3803 } 3852 }
3804 3853
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3854 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3855 if (!vmx->vmcs)
3811 goto free_msrs; 3856 goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3881free_vmcs:
3837 free_vmcs(vmx->vmcs); 3882 free_vmcs(vmx->vmcs);
3838free_msrs: 3883free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3884 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3885uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3886 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4016 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4017 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4018 .nmi_allowed = vmx_nmi_allowed,
4019 .get_nmi_mask = vmx_get_nmi_mask,
4020 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4021 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4022 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4023 .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
3987 4032
3988static int __init vmx_init(void) 4033static int __init vmx_init(void)
3989{ 4034{
3990 int r; 4035 int r, i;
4036
4037 rdmsrl_safe(MSR_EFER, &host_efer);
4038
4039 for (i = 0; i < NR_VMX_MSR; ++i)
4040 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4041
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4042 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4043 if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4099 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4100 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4101
4052 ept_sync_global();
4053
4054 return 0; 4102 return 0;
4055 4103
4056out3: 4104out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4fc80174191c..9d068966fb2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,6 +37,7 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
40#include <trace/events/kvm.h> 41#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 42#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
@@ -88,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
88int ignore_msrs = 0; 89int ignore_msrs = 0;
89module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 90module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
90 91
92#define KVM_NR_SHARED_MSRS 16
93
94struct kvm_shared_msrs_global {
95 int nr;
96 struct kvm_shared_msr {
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100};
101
102struct kvm_shared_msrs {
103 struct user_return_notifier urn;
104 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS];
106};
107
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
109static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
110
91struct kvm_stats_debugfs_item debugfs_entries[] = { 111struct kvm_stats_debugfs_item debugfs_entries[] = {
92 { "pf_fixed", VCPU_STAT(pf_fixed) }, 112 { "pf_fixed", VCPU_STAT(pf_fixed) },
93 { "pf_guest", VCPU_STAT(pf_guest) }, 113 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -124,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
124 { NULL } 144 { NULL }
125}; 145};
126 146
147static void kvm_on_user_return(struct user_return_notifier *urn)
148{
149 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn);
153
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot];
156 if (global->value != locals->current_value[slot]) {
157 wrmsrl(global->msr, global->value);
158 locals->current_value[slot] = global->value;
159 }
160 }
161 locals->registered = false;
162 user_return_notifier_unregister(urn);
163}
164
165void kvm_define_shared_msr(unsigned slot, u32 msr)
166{
167 int cpu;
168 u64 value;
169
170 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr;
173 rdmsrl_safe(msr, &value);
174 shared_msrs_global.msrs[slot].value = value;
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179
180static void kvm_shared_msr_cpu_online(void)
181{
182 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184
185 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value;
187}
188
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192
193 if (((value ^ smsr->current_value[slot]) & mask) == 0)
194 return;
195 smsr->current_value[slot] = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value);
197 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn);
200 smsr->registered = true;
201 }
202}
203EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
204
205static void drop_user_return_notifiers(void *ignore)
206{
207 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
208
209 if (smsr->registered)
210 kvm_on_user_return(&smsr->urn);
211}
212
127unsigned long segment_base(u16 selector) 213unsigned long segment_base(u16 selector)
128{ 214{
129 struct descriptor_table gdt; 215 struct descriptor_table gdt;
@@ -485,16 +571,19 @@ static inline u32 bit(int bitno)
485 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 571 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
486 * 572 *
487 * This list is modified at module load time to reflect the 573 * This list is modified at module load time to reflect the
488 * capabilities of the host cpu. 574 * capabilities of the host cpu. This capabilities test skips MSRs that are
575 * kvm-specific. Those are put in the beginning of the list.
489 */ 576 */
577
578#define KVM_SAVE_MSRS_BEGIN 2
490static u32 msrs_to_save[] = { 579static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
491 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
492 MSR_K6_STAR, 582 MSR_K6_STAR,
493#ifdef CONFIG_X86_64 583#ifdef CONFIG_X86_64
494 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 584 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
495#endif 585#endif
496 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 586 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
498}; 587};
499 588
500static unsigned num_msrs_to_save; 589static unsigned num_msrs_to_save;
@@ -678,7 +767,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
678 /* With all the info we got, fill in the values */ 767 /* With all the info we got, fill in the values */
679 768
680 vcpu->hv_clock.system_time = ts.tv_nsec + 769 vcpu->hv_clock.system_time = ts.tv_nsec +
681 (NSEC_PER_SEC * (u64)ts.tv_sec); 770 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
771
682 /* 772 /*
683 * The interface expects us to write an even number signaling that the 773 * The interface expects us to write an even number signaling that the
684 * update is finished. Since the guest won't see the intermediate 774 * update is finished. Since the guest won't see the intermediate
@@ -836,6 +926,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
836 return 0; 926 return 0;
837} 927}
838 928
929static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
930{
931 struct kvm *kvm = vcpu->kvm;
932 int lm = is_long_mode(vcpu);
933 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
934 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
935 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
936 : kvm->arch.xen_hvm_config.blob_size_32;
937 u32 page_num = data & ~PAGE_MASK;
938 u64 page_addr = data & PAGE_MASK;
939 u8 *page;
940 int r;
941
942 r = -E2BIG;
943 if (page_num >= blob_size)
944 goto out;
945 r = -ENOMEM;
946 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
947 if (!page)
948 goto out;
949 r = -EFAULT;
950 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
951 goto out_free;
952 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
953 goto out_free;
954 r = 0;
955out_free:
956 kfree(page);
957out:
958 return r;
959}
960
839int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
840{ 962{
841 switch (msr) { 963 switch (msr) {
@@ -951,6 +1073,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
951 "0x%x data 0x%llx\n", msr, data); 1073 "0x%x data 0x%llx\n", msr, data);
952 break; 1074 break;
953 default: 1075 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data);
954 if (!ignore_msrs) { 1078 if (!ignore_msrs) {
955 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1079 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
956 msr, data); 1080 msr, data);
@@ -1225,6 +1349,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1225 case KVM_CAP_PIT2: 1349 case KVM_CAP_PIT2:
1226 case KVM_CAP_PIT_STATE2: 1350 case KVM_CAP_PIT_STATE2:
1227 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1351 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1352 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS:
1228 r = 1; 1355 r = 1;
1229 break; 1356 break;
1230 case KVM_CAP_COALESCED_MMIO: 1357 case KVM_CAP_COALESCED_MMIO:
@@ -1239,8 +1366,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1239 case KVM_CAP_NR_MEMSLOTS: 1366 case KVM_CAP_NR_MEMSLOTS:
1240 r = KVM_MEMORY_SLOTS; 1367 r = KVM_MEMORY_SLOTS;
1241 break; 1368 break;
1242 case KVM_CAP_PV_MMU: 1369 case KVM_CAP_PV_MMU: /* obsolete */
1243 r = !tdp_enabled; 1370 r = 0;
1244 break; 1371 break;
1245 case KVM_CAP_IOMMU: 1372 case KVM_CAP_IOMMU:
1246 r = iommu_found(); 1373 r = iommu_found();
@@ -1327,6 +1454,12 @@ out:
1327void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1454void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1328{ 1455{
1329 kvm_x86_ops->vcpu_load(vcpu, cpu); 1456 kvm_x86_ops->vcpu_load(vcpu, cpu);
1457 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1458 unsigned long khz = cpufreq_quick_get(cpu);
1459 if (!khz)
1460 khz = tsc_khz;
1461 per_cpu(cpu_tsc_khz, cpu) = khz;
1462 }
1330 kvm_request_guest_time_update(vcpu); 1463 kvm_request_guest_time_update(vcpu);
1331} 1464}
1332 1465
@@ -1760,6 +1893,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1760 return 0; 1893 return 0;
1761} 1894}
1762 1895
1896static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1897 struct kvm_vcpu_events *events)
1898{
1899 vcpu_load(vcpu);
1900
1901 events->exception.injected = vcpu->arch.exception.pending;
1902 events->exception.nr = vcpu->arch.exception.nr;
1903 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1904 events->exception.error_code = vcpu->arch.exception.error_code;
1905
1906 events->interrupt.injected = vcpu->arch.interrupt.pending;
1907 events->interrupt.nr = vcpu->arch.interrupt.nr;
1908 events->interrupt.soft = vcpu->arch.interrupt.soft;
1909
1910 events->nmi.injected = vcpu->arch.nmi_injected;
1911 events->nmi.pending = vcpu->arch.nmi_pending;
1912 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1913
1914 events->sipi_vector = vcpu->arch.sipi_vector;
1915
1916 events->flags = 0;
1917
1918 vcpu_put(vcpu);
1919}
1920
1921static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1922 struct kvm_vcpu_events *events)
1923{
1924 if (events->flags)
1925 return -EINVAL;
1926
1927 vcpu_load(vcpu);
1928
1929 vcpu->arch.exception.pending = events->exception.injected;
1930 vcpu->arch.exception.nr = events->exception.nr;
1931 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
1932 vcpu->arch.exception.error_code = events->exception.error_code;
1933
1934 vcpu->arch.interrupt.pending = events->interrupt.injected;
1935 vcpu->arch.interrupt.nr = events->interrupt.nr;
1936 vcpu->arch.interrupt.soft = events->interrupt.soft;
1937 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
1938 kvm_pic_clear_isr_ack(vcpu->kvm);
1939
1940 vcpu->arch.nmi_injected = events->nmi.injected;
1941 vcpu->arch.nmi_pending = events->nmi.pending;
1942 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1943
1944 vcpu->arch.sipi_vector = events->sipi_vector;
1945
1946 vcpu_put(vcpu);
1947
1948 return 0;
1949}
1950
1763long kvm_arch_vcpu_ioctl(struct file *filp, 1951long kvm_arch_vcpu_ioctl(struct file *filp,
1764 unsigned int ioctl, unsigned long arg) 1952 unsigned int ioctl, unsigned long arg)
1765{ 1953{
@@ -1770,6 +1958,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1770 1958
1771 switch (ioctl) { 1959 switch (ioctl) {
1772 case KVM_GET_LAPIC: { 1960 case KVM_GET_LAPIC: {
1961 r = -EINVAL;
1962 if (!vcpu->arch.apic)
1963 goto out;
1773 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1964 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1774 1965
1775 r = -ENOMEM; 1966 r = -ENOMEM;
@@ -1785,6 +1976,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1785 break; 1976 break;
1786 } 1977 }
1787 case KVM_SET_LAPIC: { 1978 case KVM_SET_LAPIC: {
1979 r = -EINVAL;
1980 if (!vcpu->arch.apic)
1981 goto out;
1788 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1982 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1789 r = -ENOMEM; 1983 r = -ENOMEM;
1790 if (!lapic) 1984 if (!lapic)
@@ -1911,6 +2105,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1911 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2105 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1912 break; 2106 break;
1913 } 2107 }
2108 case KVM_GET_VCPU_EVENTS: {
2109 struct kvm_vcpu_events events;
2110
2111 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2112
2113 r = -EFAULT;
2114 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2115 break;
2116 r = 0;
2117 break;
2118 }
2119 case KVM_SET_VCPU_EVENTS: {
2120 struct kvm_vcpu_events events;
2121
2122 r = -EFAULT;
2123 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2124 break;
2125
2126 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2127 break;
2128 }
1914 default: 2129 default:
1915 r = -EINVAL; 2130 r = -EINVAL;
1916 } 2131 }
@@ -2039,9 +2254,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2039 sizeof(struct kvm_pic_state)); 2254 sizeof(struct kvm_pic_state));
2040 break; 2255 break;
2041 case KVM_IRQCHIP_IOAPIC: 2256 case KVM_IRQCHIP_IOAPIC:
2042 memcpy(&chip->chip.ioapic, 2257 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2043 ioapic_irqchip(kvm),
2044 sizeof(struct kvm_ioapic_state));
2045 break; 2258 break;
2046 default: 2259 default:
2047 r = -EINVAL; 2260 r = -EINVAL;
@@ -2071,11 +2284,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2071 spin_unlock(&pic_irqchip(kvm)->lock); 2284 spin_unlock(&pic_irqchip(kvm)->lock);
2072 break; 2285 break;
2073 case KVM_IRQCHIP_IOAPIC: 2286 case KVM_IRQCHIP_IOAPIC:
2074 mutex_lock(&kvm->irq_lock); 2287 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2075 memcpy(ioapic_irqchip(kvm),
2076 &chip->chip.ioapic,
2077 sizeof(struct kvm_ioapic_state));
2078 mutex_unlock(&kvm->irq_lock);
2079 break; 2288 break;
2080 default: 2289 default:
2081 r = -EINVAL; 2290 r = -EINVAL;
@@ -2183,7 +2392,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2183{ 2392{
2184 struct kvm *kvm = filp->private_data; 2393 struct kvm *kvm = filp->private_data;
2185 void __user *argp = (void __user *)arg; 2394 void __user *argp = (void __user *)arg;
2186 int r = -EINVAL; 2395 int r = -ENOTTY;
2187 /* 2396 /*
2188 * This union makes it completely explicit to gcc-3.x 2397 * This union makes it completely explicit to gcc-3.x
2189 * that these two variables' stack usage should be 2398 * that these two variables' stack usage should be
@@ -2245,25 +2454,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2245 if (r) 2454 if (r)
2246 goto out; 2455 goto out;
2247 break; 2456 break;
2248 case KVM_CREATE_IRQCHIP: 2457 case KVM_CREATE_IRQCHIP: {
2458 struct kvm_pic *vpic;
2459
2460 mutex_lock(&kvm->lock);
2461 r = -EEXIST;
2462 if (kvm->arch.vpic)
2463 goto create_irqchip_unlock;
2249 r = -ENOMEM; 2464 r = -ENOMEM;
2250 kvm->arch.vpic = kvm_create_pic(kvm); 2465 vpic = kvm_create_pic(kvm);
2251 if (kvm->arch.vpic) { 2466 if (vpic) {
2252 r = kvm_ioapic_init(kvm); 2467 r = kvm_ioapic_init(kvm);
2253 if (r) { 2468 if (r) {
2254 kfree(kvm->arch.vpic); 2469 kfree(vpic);
2255 kvm->arch.vpic = NULL; 2470 goto create_irqchip_unlock;
2256 goto out;
2257 } 2471 }
2258 } else 2472 } else
2259 goto out; 2473 goto create_irqchip_unlock;
2474 smp_wmb();
2475 kvm->arch.vpic = vpic;
2476 smp_wmb();
2260 r = kvm_setup_default_irq_routing(kvm); 2477 r = kvm_setup_default_irq_routing(kvm);
2261 if (r) { 2478 if (r) {
2479 mutex_lock(&kvm->irq_lock);
2262 kfree(kvm->arch.vpic); 2480 kfree(kvm->arch.vpic);
2263 kfree(kvm->arch.vioapic); 2481 kfree(kvm->arch.vioapic);
2264 goto out; 2482 kvm->arch.vpic = NULL;
2483 kvm->arch.vioapic = NULL;
2484 mutex_unlock(&kvm->irq_lock);
2265 } 2485 }
2486 create_irqchip_unlock:
2487 mutex_unlock(&kvm->lock);
2266 break; 2488 break;
2489 }
2267 case KVM_CREATE_PIT: 2490 case KVM_CREATE_PIT:
2268 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2491 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2269 goto create_pit; 2492 goto create_pit;
@@ -2293,10 +2516,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2293 goto out; 2516 goto out;
2294 if (irqchip_in_kernel(kvm)) { 2517 if (irqchip_in_kernel(kvm)) {
2295 __s32 status; 2518 __s32 status;
2296 mutex_lock(&kvm->irq_lock);
2297 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2519 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2298 irq_event.irq, irq_event.level); 2520 irq_event.irq, irq_event.level);
2299 mutex_unlock(&kvm->irq_lock);
2300 if (ioctl == KVM_IRQ_LINE_STATUS) { 2521 if (ioctl == KVM_IRQ_LINE_STATUS) {
2301 irq_event.status = status; 2522 irq_event.status = status;
2302 if (copy_to_user(argp, &irq_event, 2523 if (copy_to_user(argp, &irq_event,
@@ -2422,6 +2643,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2422 r = 0; 2643 r = 0;
2423 break; 2644 break;
2424 } 2645 }
2646 case KVM_XEN_HVM_CONFIG: {
2647 r = -EFAULT;
2648 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2649 sizeof(struct kvm_xen_hvm_config)))
2650 goto out;
2651 r = -EINVAL;
2652 if (kvm->arch.xen_hvm_config.flags)
2653 goto out;
2654 r = 0;
2655 break;
2656 }
2657 case KVM_SET_CLOCK: {
2658 struct timespec now;
2659 struct kvm_clock_data user_ns;
2660 u64 now_ns;
2661 s64 delta;
2662
2663 r = -EFAULT;
2664 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2665 goto out;
2666
2667 r = -EINVAL;
2668 if (user_ns.flags)
2669 goto out;
2670
2671 r = 0;
2672 ktime_get_ts(&now);
2673 now_ns = timespec_to_ns(&now);
2674 delta = user_ns.clock - now_ns;
2675 kvm->arch.kvmclock_offset = delta;
2676 break;
2677 }
2678 case KVM_GET_CLOCK: {
2679 struct timespec now;
2680 struct kvm_clock_data user_ns;
2681 u64 now_ns;
2682
2683 ktime_get_ts(&now);
2684 now_ns = timespec_to_ns(&now);
2685 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2686 user_ns.flags = 0;
2687
2688 r = -EFAULT;
2689 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2690 goto out;
2691 r = 0;
2692 break;
2693 }
2694
2425 default: 2695 default:
2426 ; 2696 ;
2427 } 2697 }
@@ -2434,7 +2704,8 @@ static void kvm_init_msr_list(void)
2434 u32 dummy[2]; 2704 u32 dummy[2];
2435 unsigned i, j; 2705 unsigned i, j;
2436 2706
2437 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2707 /* skip the first msrs in the list. KVM-specific */
2708 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2438 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2709 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2439 continue; 2710 continue;
2440 if (j < i) 2711 if (j < i)
@@ -2758,13 +3029,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2758} 3029}
2759 3030
2760int emulate_instruction(struct kvm_vcpu *vcpu, 3031int emulate_instruction(struct kvm_vcpu *vcpu,
2761 struct kvm_run *run,
2762 unsigned long cr2, 3032 unsigned long cr2,
2763 u16 error_code, 3033 u16 error_code,
2764 int emulation_type) 3034 int emulation_type)
2765{ 3035{
2766 int r, shadow_mask; 3036 int r, shadow_mask;
2767 struct decode_cache *c; 3037 struct decode_cache *c;
3038 struct kvm_run *run = vcpu->run;
2768 3039
2769 kvm_clear_exception_queue(vcpu); 3040 kvm_clear_exception_queue(vcpu);
2770 vcpu->arch.mmio_fault_cr2 = cr2; 3041 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2784,7 +3055,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2784 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3055 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2785 3056
2786 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3057 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2787 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3058 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2788 vcpu->arch.emulate_ctxt.mode = 3059 vcpu->arch.emulate_ctxt.mode =
2789 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3060 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2790 ? X86EMUL_MODE_REAL : cs_l 3061 ? X86EMUL_MODE_REAL : cs_l
@@ -2862,7 +3133,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2862 return EMULATE_DO_MMIO; 3133 return EMULATE_DO_MMIO;
2863 } 3134 }
2864 3135
2865 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3136 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2866 3137
2867 if (vcpu->mmio_is_write) { 3138 if (vcpu->mmio_is_write) {
2868 vcpu->mmio_needed = 0; 3139 vcpu->mmio_needed = 0;
@@ -2970,8 +3241,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2970 return r; 3241 return r;
2971} 3242}
2972 3243
2973int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3244int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2974 int size, unsigned port)
2975{ 3245{
2976 unsigned long val; 3246 unsigned long val;
2977 3247
@@ -3000,7 +3270,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3000} 3270}
3001EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3271EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3002 3272
3003int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3273int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3004 int size, unsigned long count, int down, 3274 int size, unsigned long count, int down,
3005 gva_t address, int rep, unsigned port) 3275 gva_t address, int rep, unsigned port)
3006{ 3276{
@@ -3073,9 +3343,6 @@ static void bounce_off(void *info)
3073 /* nothing */ 3343 /* nothing */
3074} 3344}
3075 3345
3076static unsigned int ref_freq;
3077static unsigned long tsc_khz_ref;
3078
3079static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3346static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3080 void *data) 3347 void *data)
3081{ 3348{
@@ -3084,14 +3351,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3084 struct kvm_vcpu *vcpu; 3351 struct kvm_vcpu *vcpu;
3085 int i, send_ipi = 0; 3352 int i, send_ipi = 0;
3086 3353
3087 if (!ref_freq)
3088 ref_freq = freq->old;
3089
3090 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3354 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3091 return 0; 3355 return 0;
3092 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3356 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3093 return 0; 3357 return 0;
3094 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3358 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3095 3359
3096 spin_lock(&kvm_lock); 3360 spin_lock(&kvm_lock);
3097 list_for_each_entry(kvm, &vm_list, vm_list) { 3361 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3128,9 +3392,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3128 .notifier_call = kvmclock_cpufreq_notifier 3392 .notifier_call = kvmclock_cpufreq_notifier
3129}; 3393};
3130 3394
3395static void kvm_timer_init(void)
3396{
3397 int cpu;
3398
3399 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3400 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3401 CPUFREQ_TRANSITION_NOTIFIER);
3402 for_each_online_cpu(cpu) {
3403 unsigned long khz = cpufreq_get(cpu);
3404 if (!khz)
3405 khz = tsc_khz;
3406 per_cpu(cpu_tsc_khz, cpu) = khz;
3407 }
3408 } else {
3409 for_each_possible_cpu(cpu)
3410 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3411 }
3412}
3413
3131int kvm_arch_init(void *opaque) 3414int kvm_arch_init(void *opaque)
3132{ 3415{
3133 int r, cpu; 3416 int r;
3134 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3417 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3135 3418
3136 if (kvm_x86_ops) { 3419 if (kvm_x86_ops) {
@@ -3162,13 +3445,7 @@ int kvm_arch_init(void *opaque)
3162 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3445 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3163 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3446 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3164 3447
3165 for_each_possible_cpu(cpu) 3448 kvm_timer_init();
3166 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3167 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3168 tsc_khz_ref = tsc_khz;
3169 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3170 CPUFREQ_TRANSITION_NOTIFIER);
3171 }
3172 3449
3173 return 0; 3450 return 0;
3174 3451
@@ -3296,7 +3573,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3296 unsigned long *rflags) 3573 unsigned long *rflags)
3297{ 3574{
3298 kvm_lmsw(vcpu, msw); 3575 kvm_lmsw(vcpu, msw);
3299 *rflags = kvm_x86_ops->get_rflags(vcpu); 3576 *rflags = kvm_get_rflags(vcpu);
3300} 3577}
3301 3578
3302unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3579unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3334,7 +3611,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3334 switch (cr) { 3611 switch (cr) {
3335 case 0: 3612 case 0:
3336 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3613 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3337 *rflags = kvm_x86_ops->get_rflags(vcpu); 3614 *rflags = kvm_get_rflags(vcpu);
3338 break; 3615 break;
3339 case 2: 3616 case 2:
3340 vcpu->arch.cr2 = val; 3617 vcpu->arch.cr2 = val;
@@ -3454,18 +3731,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3454 * 3731 *
3455 * No need to exit to userspace if we already have an interrupt queued. 3732 * No need to exit to userspace if we already have an interrupt queued.
3456 */ 3733 */
3457static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3734static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3458 struct kvm_run *kvm_run)
3459{ 3735{
3460 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3736 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3461 kvm_run->request_interrupt_window && 3737 vcpu->run->request_interrupt_window &&
3462 kvm_arch_interrupt_allowed(vcpu)); 3738 kvm_arch_interrupt_allowed(vcpu));
3463} 3739}
3464 3740
3465static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3741static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3466 struct kvm_run *kvm_run)
3467{ 3742{
3468 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3743 struct kvm_run *kvm_run = vcpu->run;
3744
3745 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3469 kvm_run->cr8 = kvm_get_cr8(vcpu); 3746 kvm_run->cr8 = kvm_get_cr8(vcpu);
3470 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3747 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3471 if (irqchip_in_kernel(vcpu->kvm)) 3748 if (irqchip_in_kernel(vcpu->kvm))
@@ -3526,7 +3803,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3526 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3803 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3527} 3804}
3528 3805
3529static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3806static void inject_pending_event(struct kvm_vcpu *vcpu)
3530{ 3807{
3531 /* try to reinject previous events if any */ 3808 /* try to reinject previous events if any */
3532 if (vcpu->arch.exception.pending) { 3809 if (vcpu->arch.exception.pending) {
@@ -3562,11 +3839,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3562 } 3839 }
3563} 3840}
3564 3841
3565static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3842static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3566{ 3843{
3567 int r; 3844 int r;
3568 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3845 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3569 kvm_run->request_interrupt_window; 3846 vcpu->run->request_interrupt_window;
3570 3847
3571 if (vcpu->requests) 3848 if (vcpu->requests)
3572 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3849 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3587,12 +3864,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3587 kvm_x86_ops->tlb_flush(vcpu); 3864 kvm_x86_ops->tlb_flush(vcpu);
3588 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3865 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3589 &vcpu->requests)) { 3866 &vcpu->requests)) {
3590 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3867 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3591 r = 0; 3868 r = 0;
3592 goto out; 3869 goto out;
3593 } 3870 }
3594 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3871 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3595 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3872 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3596 r = 0; 3873 r = 0;
3597 goto out; 3874 goto out;
3598 } 3875 }
@@ -3616,7 +3893,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3616 goto out; 3893 goto out;
3617 } 3894 }
3618 3895
3619 inject_pending_event(vcpu, kvm_run); 3896 inject_pending_event(vcpu);
3620 3897
3621 /* enable NMI/IRQ window open exits if needed */ 3898 /* enable NMI/IRQ window open exits if needed */
3622 if (vcpu->arch.nmi_pending) 3899 if (vcpu->arch.nmi_pending)
@@ -3642,7 +3919,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3642 } 3919 }
3643 3920
3644 trace_kvm_entry(vcpu->vcpu_id); 3921 trace_kvm_entry(vcpu->vcpu_id);
3645 kvm_x86_ops->run(vcpu, kvm_run); 3922 kvm_x86_ops->run(vcpu);
3646 3923
3647 /* 3924 /*
3648 * If the guest has used debug registers, at least dr7 3925 * If the guest has used debug registers, at least dr7
@@ -3684,13 +3961,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3684 3961
3685 kvm_lapic_sync_from_vapic(vcpu); 3962 kvm_lapic_sync_from_vapic(vcpu);
3686 3963
3687 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3964 r = kvm_x86_ops->handle_exit(vcpu);
3688out: 3965out:
3689 return r; 3966 return r;
3690} 3967}
3691 3968
3692 3969
3693static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3970static int __vcpu_run(struct kvm_vcpu *vcpu)
3694{ 3971{
3695 int r; 3972 int r;
3696 3973
@@ -3710,7 +3987,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3710 r = 1; 3987 r = 1;
3711 while (r > 0) { 3988 while (r > 0) {
3712 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3989 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3713 r = vcpu_enter_guest(vcpu, kvm_run); 3990 r = vcpu_enter_guest(vcpu);
3714 else { 3991 else {
3715 up_read(&vcpu->kvm->slots_lock); 3992 up_read(&vcpu->kvm->slots_lock);
3716 kvm_vcpu_block(vcpu); 3993 kvm_vcpu_block(vcpu);
@@ -3738,14 +4015,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3738 if (kvm_cpu_has_pending_timer(vcpu)) 4015 if (kvm_cpu_has_pending_timer(vcpu))
3739 kvm_inject_pending_timer_irqs(vcpu); 4016 kvm_inject_pending_timer_irqs(vcpu);
3740 4017
3741 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4018 if (dm_request_for_irq_injection(vcpu)) {
3742 r = -EINTR; 4019 r = -EINTR;
3743 kvm_run->exit_reason = KVM_EXIT_INTR; 4020 vcpu->run->exit_reason = KVM_EXIT_INTR;
3744 ++vcpu->stat.request_irq_exits; 4021 ++vcpu->stat.request_irq_exits;
3745 } 4022 }
3746 if (signal_pending(current)) { 4023 if (signal_pending(current)) {
3747 r = -EINTR; 4024 r = -EINTR;
3748 kvm_run->exit_reason = KVM_EXIT_INTR; 4025 vcpu->run->exit_reason = KVM_EXIT_INTR;
3749 ++vcpu->stat.signal_exits; 4026 ++vcpu->stat.signal_exits;
3750 } 4027 }
3751 if (need_resched()) { 4028 if (need_resched()) {
@@ -3756,7 +4033,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3756 } 4033 }
3757 4034
3758 up_read(&vcpu->kvm->slots_lock); 4035 up_read(&vcpu->kvm->slots_lock);
3759 post_kvm_run_save(vcpu, kvm_run); 4036 post_kvm_run_save(vcpu);
3760 4037
3761 vapic_exit(vcpu); 4038 vapic_exit(vcpu);
3762 4039
@@ -3789,15 +4066,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3789 if (r) 4066 if (r)
3790 goto out; 4067 goto out;
3791 } 4068 }
3792#if CONFIG_HAS_IOMEM
3793 if (vcpu->mmio_needed) { 4069 if (vcpu->mmio_needed) {
3794 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4070 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3795 vcpu->mmio_read_completed = 1; 4071 vcpu->mmio_read_completed = 1;
3796 vcpu->mmio_needed = 0; 4072 vcpu->mmio_needed = 0;
3797 4073
3798 down_read(&vcpu->kvm->slots_lock); 4074 down_read(&vcpu->kvm->slots_lock);
3799 r = emulate_instruction(vcpu, kvm_run, 4075 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3800 vcpu->arch.mmio_fault_cr2, 0,
3801 EMULTYPE_NO_DECODE); 4076 EMULTYPE_NO_DECODE);
3802 up_read(&vcpu->kvm->slots_lock); 4077 up_read(&vcpu->kvm->slots_lock);
3803 if (r == EMULATE_DO_MMIO) { 4078 if (r == EMULATE_DO_MMIO) {
@@ -3808,12 +4083,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3808 goto out; 4083 goto out;
3809 } 4084 }
3810 } 4085 }
3811#endif
3812 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4086 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3813 kvm_register_write(vcpu, VCPU_REGS_RAX, 4087 kvm_register_write(vcpu, VCPU_REGS_RAX,
3814 kvm_run->hypercall.ret); 4088 kvm_run->hypercall.ret);
3815 4089
3816 r = __vcpu_run(vcpu, kvm_run); 4090 r = __vcpu_run(vcpu);
3817 4091
3818out: 4092out:
3819 if (vcpu->sigset_active) 4093 if (vcpu->sigset_active)
@@ -3847,13 +4121,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3847#endif 4121#endif
3848 4122
3849 regs->rip = kvm_rip_read(vcpu); 4123 regs->rip = kvm_rip_read(vcpu);
3850 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4124 regs->rflags = kvm_get_rflags(vcpu);
3851
3852 /*
3853 * Don't leak debug flags in case they were set for guest debugging
3854 */
3855 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3856 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3857 4125
3858 vcpu_put(vcpu); 4126 vcpu_put(vcpu);
3859 4127
@@ -3881,12 +4149,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3881 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4149 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3882 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4150 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3883 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4151 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3884
3885#endif 4152#endif
3886 4153
3887 kvm_rip_write(vcpu, regs->rip); 4154 kvm_rip_write(vcpu, regs->rip);
3888 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4155 kvm_set_rflags(vcpu, regs->rflags);
3889
3890 4156
3891 vcpu->arch.exception.pending = false; 4157 vcpu->arch.exception.pending = false;
3892 4158
@@ -4105,7 +4371,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4105{ 4371{
4106 return (seg != VCPU_SREG_LDTR) && 4372 return (seg != VCPU_SREG_LDTR) &&
4107 (seg != VCPU_SREG_TR) && 4373 (seg != VCPU_SREG_TR) &&
4108 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4374 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4109} 4375}
4110 4376
4111int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4377int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4133,7 +4399,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4133{ 4399{
4134 tss->cr3 = vcpu->arch.cr3; 4400 tss->cr3 = vcpu->arch.cr3;
4135 tss->eip = kvm_rip_read(vcpu); 4401 tss->eip = kvm_rip_read(vcpu);
4136 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4402 tss->eflags = kvm_get_rflags(vcpu);
4137 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4403 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4138 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4404 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4139 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4405 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4157,7 +4423,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4157 kvm_set_cr3(vcpu, tss->cr3); 4423 kvm_set_cr3(vcpu, tss->cr3);
4158 4424
4159 kvm_rip_write(vcpu, tss->eip); 4425 kvm_rip_write(vcpu, tss->eip);
4160 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4426 kvm_set_rflags(vcpu, tss->eflags | 2);
4161 4427
4162 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4428 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4163 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4429 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4195,7 +4461,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4195 struct tss_segment_16 *tss) 4461 struct tss_segment_16 *tss)
4196{ 4462{
4197 tss->ip = kvm_rip_read(vcpu); 4463 tss->ip = kvm_rip_read(vcpu);
4198 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4464 tss->flag = kvm_get_rflags(vcpu);
4199 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4465 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4200 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4466 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4201 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4467 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4210,14 +4476,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4210 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4476 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4211 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4477 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4212 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4478 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4213 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4214} 4479}
4215 4480
4216static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4481static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4217 struct tss_segment_16 *tss) 4482 struct tss_segment_16 *tss)
4218{ 4483{
4219 kvm_rip_write(vcpu, tss->ip); 4484 kvm_rip_write(vcpu, tss->ip);
4220 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4485 kvm_set_rflags(vcpu, tss->flag | 2);
4221 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4486 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4222 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4487 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4223 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4488 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4363,8 +4628,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4363 } 4628 }
4364 4629
4365 if (reason == TASK_SWITCH_IRET) { 4630 if (reason == TASK_SWITCH_IRET) {
4366 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4631 u32 eflags = kvm_get_rflags(vcpu);
4367 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4632 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4368 } 4633 }
4369 4634
4370 /* set back link to prev task only if NT bit is set in eflags 4635 /* set back link to prev task only if NT bit is set in eflags
@@ -4372,11 +4637,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4372 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4637 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4373 old_tss_sel = 0xffff; 4638 old_tss_sel = 0xffff;
4374 4639
4375 /* set back link to prev task only if NT bit is set in eflags
4376 note that old_tss_sel is not used afetr this point */
4377 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4378 old_tss_sel = 0xffff;
4379
4380 if (nseg_desc.type & 8) 4640 if (nseg_desc.type & 8)
4381 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4641 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4382 old_tss_base, &nseg_desc); 4642 old_tss_base, &nseg_desc);
@@ -4385,8 +4645,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4385 old_tss_base, &nseg_desc); 4645 old_tss_base, &nseg_desc);
4386 4646
4387 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4647 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4388 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4648 u32 eflags = kvm_get_rflags(vcpu);
4389 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4649 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4390 } 4650 }
4391 4651
4392 if (reason != TASK_SWITCH_IRET) { 4652 if (reason != TASK_SWITCH_IRET) {
@@ -4438,8 +4698,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4438 4698
4439 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4699 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4440 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4700 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4441 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4701 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4442 load_pdptrs(vcpu, vcpu->arch.cr3); 4702 load_pdptrs(vcpu, vcpu->arch.cr3);
4703 mmu_reset_needed = 1;
4704 }
4443 4705
4444 if (mmu_reset_needed) 4706 if (mmu_reset_needed)
4445 kvm_mmu_reset_context(vcpu); 4707 kvm_mmu_reset_context(vcpu);
@@ -4480,12 +4742,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4480int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4742int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4481 struct kvm_guest_debug *dbg) 4743 struct kvm_guest_debug *dbg)
4482{ 4744{
4745 unsigned long rflags;
4483 int i, r; 4746 int i, r;
4484 4747
4485 vcpu_load(vcpu); 4748 vcpu_load(vcpu);
4486 4749
4487 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4750 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4488 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4751 r = -EBUSY;
4752 if (vcpu->arch.exception.pending)
4753 goto unlock_out;
4754 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4755 kvm_queue_exception(vcpu, DB_VECTOR);
4756 else
4757 kvm_queue_exception(vcpu, BP_VECTOR);
4758 }
4759
4760 /*
4761 * Read rflags as long as potentially injected trace flags are still
4762 * filtered out.
4763 */
4764 rflags = kvm_get_rflags(vcpu);
4765
4766 vcpu->guest_debug = dbg->control;
4767 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4768 vcpu->guest_debug = 0;
4769
4770 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4489 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4771 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4490 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4772 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4491 vcpu->arch.switch_db_regs = 4773 vcpu->arch.switch_db_regs =
@@ -4496,13 +4778,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4496 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4778 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4497 } 4779 }
4498 4780
4499 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4781 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4782 vcpu->arch.singlestep_cs =
4783 get_segment_selector(vcpu, VCPU_SREG_CS);
4784 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4785 }
4786
4787 /*
4788 * Trigger an rflags update that will inject or remove the trace
4789 * flags.
4790 */
4791 kvm_set_rflags(vcpu, rflags);
4792
4793 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4500 4794
4501 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4795 r = 0;
4502 kvm_queue_exception(vcpu, DB_VECTOR);
4503 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4504 kvm_queue_exception(vcpu, BP_VECTOR);
4505 4796
4797unlock_out:
4506 vcpu_put(vcpu); 4798 vcpu_put(vcpu);
4507 4799
4508 return r; 4800 return r;
@@ -4703,14 +4995,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4703 return kvm_x86_ops->vcpu_reset(vcpu); 4995 return kvm_x86_ops->vcpu_reset(vcpu);
4704} 4996}
4705 4997
4706void kvm_arch_hardware_enable(void *garbage) 4998int kvm_arch_hardware_enable(void *garbage)
4707{ 4999{
4708 kvm_x86_ops->hardware_enable(garbage); 5000 /*
5001 * Since this may be called from a hotplug notifcation,
5002 * we can't get the CPU frequency directly.
5003 */
5004 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5005 int cpu = raw_smp_processor_id();
5006 per_cpu(cpu_tsc_khz, cpu) = 0;
5007 }
5008
5009 kvm_shared_msr_cpu_online();
5010
5011 return kvm_x86_ops->hardware_enable(garbage);
4709} 5012}
4710 5013
4711void kvm_arch_hardware_disable(void *garbage) 5014void kvm_arch_hardware_disable(void *garbage)
4712{ 5015{
4713 kvm_x86_ops->hardware_disable(garbage); 5016 kvm_x86_ops->hardware_disable(garbage);
5017 drop_user_return_notifiers(garbage);
4714} 5018}
4715 5019
4716int kvm_arch_hardware_setup(void) 5020int kvm_arch_hardware_setup(void)
@@ -4948,8 +5252,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4948 return kvm_x86_ops->interrupt_allowed(vcpu); 5252 return kvm_x86_ops->interrupt_allowed(vcpu);
4949} 5253}
4950 5254
5255unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5256{
5257 unsigned long rflags;
5258
5259 rflags = kvm_x86_ops->get_rflags(vcpu);
5260 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5261 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5262 return rflags;
5263}
5264EXPORT_SYMBOL_GPL(kvm_get_rflags);
5265
5266void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5267{
5268 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5269 vcpu->arch.singlestep_cs ==
5270 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5271 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5272 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5273 kvm_x86_ops->set_rflags(vcpu, rflags);
5274}
5275EXPORT_SYMBOL_GPL(kvm_set_rflags);
5276
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5277EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5278EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5279EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4954EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5280EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4955EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5281EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a2d6472895fb..45b20e486c2f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -5,7 +5,7 @@
5inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk 5inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
6inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt 6inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
7quiet_cmd_inat_tables = GEN $@ 7quiet_cmd_inat_tables = GEN $@
8 cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ 8 cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
9 9
10$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) 10$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
11 $(call cmd,inat_tables) 11 $(call cmd,inat_tables)
@@ -20,7 +20,7 @@ lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
21lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
22lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-y += insn.o inat.o 23lib-$(CONFIG_KPROBES) += insn.o inat.o
24 24
25obj-y += msr-reg.o msr-reg-export.o 25obj-y += msr-reg.o msr-reg-export.o
26 26
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..d406c5239019 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
146 use_gbpages = direct_gbpages; 146 use_gbpages = direct_gbpages;
147#endif 147#endif
148 148
149 set_nx();
150 if (nx_enabled)
151 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
152
153 /* Enable PSE if available */ 149 /* Enable PSE if available */
154 if (cpu_has_pse) 150 if (cpu_has_pse)
155 set_in_cr4(X86_CR4_PSE); 151 set_in_cr4(X86_CR4_PSE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..c973f8e2a6cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
412 pkmap_page_table = pte; 412 pkmap_page_table = pte;
413} 413}
414 414
415static void __init add_one_highpage_init(struct page *page, int pfn) 415static void __init add_one_highpage_init(struct page *page)
416{ 416{
417 ClearPageReserved(page); 417 ClearPageReserved(page);
418 init_page_count(page); 418 init_page_count(page);
@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
445 if (!pfn_valid(node_pfn)) 445 if (!pfn_valid(node_pfn))
446 continue; 446 continue;
447 page = pfn_to_page(node_pfn); 447 page = pfn_to_page(node_pfn);
448 add_one_highpage_init(page, node_pfn); 448 add_one_highpage_init(page);
449 } 449 }
450 450
451 return 0; 451 return 0;
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
703} 703}
704 704
705#ifndef CONFIG_NEED_MULTIPLE_NODES 705#ifndef CONFIG_NEED_MULTIPLE_NODES
706void __init initmem_init(unsigned long start_pfn, 706void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
707 unsigned long end_pfn) 707 int acpi, int k8)
708{ 708{
709#ifdef CONFIG_HIGHMEM 709#ifdef CONFIG_HIGHMEM
710 highstart_pfn = highend_pfn = max_pfn; 710 highstart_pfn = highend_pfn = max_pfn;
@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
997const int rodata_test_data = 0xC3; 997const int rodata_test_data = 0xC3;
998EXPORT_SYMBOL_GPL(rodata_test_data); 998EXPORT_SYMBOL_GPL(rodata_test_data);
999 999
1000static int kernel_set_to_readonly; 1000int kernel_set_to_readonly __read_mostly;
1001 1001
1002void set_kernel_text_rw(void) 1002void set_kernel_text_rw(void)
1003{ 1003{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..5198b9bb34ef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
568} 568}
569 569
570#ifndef CONFIG_NUMA 570#ifndef CONFIG_NUMA
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
572 int acpi, int k8)
572{ 573{
573 unsigned long bootmap_size, bootmap; 574 unsigned long bootmap_size, bootmap;
574 575
@@ -694,12 +695,12 @@ void __init mem_init(void)
694const int rodata_test_data = 0xC3; 695const int rodata_test_data = 0xC3;
695EXPORT_SYMBOL_GPL(rodata_test_data); 696EXPORT_SYMBOL_GPL(rodata_test_data);
696 697
697static int kernel_set_to_readonly; 698int kernel_set_to_readonly;
698 699
699void set_kernel_text_rw(void) 700void set_kernel_text_rw(void)
700{ 701{
701 unsigned long start = PFN_ALIGN(_stext); 702 unsigned long start = PFN_ALIGN(_text);
702 unsigned long end = PFN_ALIGN(__start_rodata); 703 unsigned long end = PFN_ALIGN(__stop___ex_table);
703 704
704 if (!kernel_set_to_readonly) 705 if (!kernel_set_to_readonly)
705 return; 706 return;
@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
707 pr_debug("Set kernel text: %lx - %lx for read write\n", 708 pr_debug("Set kernel text: %lx - %lx for read write\n",
708 start, end); 709 start, end);
709 710
711 /*
712 * Make the kernel identity mapping for text RW. Kernel text
713 * mapping will always be RO. Refer to the comment in
714 * static_protections() in pageattr.c
715 */
710 set_memory_rw(start, (end - start) >> PAGE_SHIFT); 716 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
711} 717}
712 718
713void set_kernel_text_ro(void) 719void set_kernel_text_ro(void)
714{ 720{
715 unsigned long start = PFN_ALIGN(_stext); 721 unsigned long start = PFN_ALIGN(_text);
716 unsigned long end = PFN_ALIGN(__start_rodata); 722 unsigned long end = PFN_ALIGN(__stop___ex_table);
717 723
718 if (!kernel_set_to_readonly) 724 if (!kernel_set_to_readonly)
719 return; 725 return;
@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
721 pr_debug("Set kernel text: %lx - %lx for read only\n", 727 pr_debug("Set kernel text: %lx - %lx for read only\n",
722 start, end); 728 start, end);
723 729
730 /*
731 * Set the kernel identity mapping for text RO.
732 */
724 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 733 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
725} 734}
726 735
727void mark_rodata_ro(void) 736void mark_rodata_ro(void)
728{ 737{
729 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 738 unsigned long start = PFN_ALIGN(_text);
730 unsigned long rodata_start = 739 unsigned long rodata_start =
731 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 740 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
741 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
742 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
743 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
744 unsigned long data_start = (unsigned long) &_sdata;
732 745
733 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 746 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
734 (end - start) >> 10); 747 (end - start) >> 10);
@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
751 printk(KERN_INFO "Testing CPA: again\n"); 764 printk(KERN_INFO "Testing CPA: again\n");
752 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 765 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
753#endif 766#endif
767
768 free_init_pages("unused kernel memory",
769 (unsigned long) page_address(virt_to_page(text_end)),
770 (unsigned long)
771 page_address(virt_to_page(rodata_start)));
772 free_init_pages("unused kernel memory",
773 (unsigned long) page_address(virt_to_page(rodata_end)),
774 (unsigned long) page_address(virt_to_page(data_start)));
754} 775}
755 776
756#endif 777#endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2feb9bdedaaf..c246d259822d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -281,30 +281,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
281} 281}
282EXPORT_SYMBOL(ioremap_cache); 282EXPORT_SYMBOL(ioremap_cache);
283 283
284static void __iomem *ioremap_default(resource_size_t phys_addr,
285 unsigned long size)
286{
287 unsigned long flags;
288 void __iomem *ret;
289 int err;
290
291 /*
292 * - WB for WB-able memory and no other conflicting mappings
293 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
294 * - Inherit from confliting mappings otherwise
295 */
296 err = reserve_memtype(phys_addr, phys_addr + size,
297 _PAGE_CACHE_WB, &flags);
298 if (err < 0)
299 return NULL;
300
301 ret = __ioremap_caller(phys_addr, size, flags,
302 __builtin_return_address(0));
303
304 free_memtype(phys_addr, phys_addr + size);
305 return ret;
306}
307
308void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, 284void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
309 unsigned long prot_val) 285 unsigned long prot_val)
310{ 286{
@@ -380,7 +356,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
380 if (page_is_ram(start >> PAGE_SHIFT)) 356 if (page_is_ram(start >> PAGE_SHIFT))
381 return __va(phys); 357 return __va(phys);
382 358
383 addr = (void __force *)ioremap_default(start, PAGE_SIZE); 359 addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
384 if (addr) 360 if (addr)
385 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 361 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
386 362
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/k8.h>
26 26
27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29
27static __init int find_northbridge(void) 30static __init int find_northbridge(void)
28{ 31{
29 int num; 32 int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
54 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get boot_cpu_id so can use that to create apicid_to_node
55 * in k8_scan_nodes() 58 * in k8_scan_nodes()
56 */ 59 */
57 /*
58 * Find possible boot-time SMP configuration:
59 */
60#ifdef CONFIG_X86_MPPARSE
61 early_find_smp_config();
62#endif
63#ifdef CONFIG_ACPI
64 /*
65 * Read APIC information from ACPI tables.
66 */
67 early_acpi_boot_init();
68#endif
69#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
70 /* 61 /*
71 * get boot-time SMP configuration: 62 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
76 early_init_lapic_mapping(); 67 early_init_lapic_mapping();
77} 68}
78 69
79int __init k8_scan_nodes(unsigned long start, unsigned long end) 70int __init k8_get_nodes(struct bootnode *physnodes)
80{ 71{
81 unsigned numnodes, cores, bits, apicid_base; 72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81}
82
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
84{
85 unsigned long start = PFN_PHYS(start_pfn);
86 unsigned long end = PFN_PHYS(end_pfn);
87 unsigned numnodes;
82 unsigned long prevbase; 88 unsigned long prevbase;
83 struct bootnode nodes[8]; 89 int i, nb, found = 0;
84 int i, j, nb, found = 0;
85 u32 nodeid, reg; 90 u32 nodeid, reg;
86 91
87 if (!early_pci_allowed()) 92 if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
91 if (nb < 0) 96 if (nb < 0)
92 return nb; 97 return nb;
93 98
94 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 99 pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
95 100
96 reg = read_pci_config(0, nb, 0, 0x60); 101 reg = read_pci_config(0, nb, 0, 0x60);
97 numnodes = ((reg >> 4) & 0xF) + 1; 102 numnodes = ((reg >> 4) & 0xF) + 1;
98 if (numnodes <= 1) 103 if (numnodes <= 1)
99 return -1; 104 return -1;
100 105
101 printk(KERN_INFO "Number of nodes %d\n", numnodes); 106 pr_info("Number of physical nodes %d\n", numnodes);
102 107
103 memset(&nodes, 0, sizeof(nodes));
104 prevbase = 0; 108 prevbase = 0;
105 for (i = 0; i < 8; i++) { 109 for (i = 0; i < 8; i++) {
106 unsigned long base, limit; 110 unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
111 nodeid = limit & 7; 115 nodeid = limit & 7;
112 if ((base & 3) == 0) { 116 if ((base & 3) == 0) {
113 if (i < numnodes) 117 if (i < numnodes)
114 printk("Skipping disabled node %d\n", i); 118 pr_info("Skipping disabled node %d\n", i);
115 continue; 119 continue;
116 } 120 }
117 if (nodeid >= numnodes) { 121 if (nodeid >= numnodes) {
118 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
119 base, limit); 123 base, limit);
120 continue; 124 continue;
121 } 125 }
122 126
123 if (!limit) { 127 if (!limit) {
124 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", 128 pr_info("Skipping node entry %d (base %lx)\n",
125 i, base); 129 i, base);
126 continue; 130 continue;
127 } 131 }
128 if ((base >> 8) & 3 || (limit >> 8) & 3) { 132 if ((base >> 8) & 3 || (limit >> 8) & 3) {
129 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 133 pr_err("Node %d using interleaving mode %lx/%lx\n",
130 nodeid, (base>>8)&3, (limit>>8) & 3); 134 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
131 return -1; 135 return -1;
132 } 136 }
133 if (node_isset(nodeid, node_possible_map)) { 137 if (node_isset(nodeid, nodes_parsed)) {
134 printk(KERN_INFO "Node %d already present. Skipping\n", 138 pr_info("Node %d already present, skipping\n",
135 nodeid); 139 nodeid);
136 continue; 140 continue;
137 } 141 }
138 142
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
141 limit |= (1<<24)-1; 145 limit |= (1<<24)-1;
142 limit++; 146 limit++;
143 147
144 if (limit > max_pfn << PAGE_SHIFT) 148 if (limit > end)
145 limit = max_pfn << PAGE_SHIFT; 149 limit = end;
146 if (limit <= base) 150 if (limit <= base)
147 continue; 151 continue;
148 152
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
154 if (limit > end) 158 if (limit > end)
155 limit = end; 159 limit = end;
156 if (limit == base) { 160 if (limit == base) {
157 printk(KERN_ERR "Empty node %d\n", nodeid); 161 pr_err("Empty node %d\n", nodeid);
158 continue; 162 continue;
159 } 163 }
160 if (limit < base) { 164 if (limit < base) {
161 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 165 pr_err("Node %d bogus settings %lx-%lx.\n",
162 nodeid, base, limit); 166 nodeid, base, limit);
163 continue; 167 continue;
164 } 168 }
165 169
166 /* Could sort here, but pun for now. Should not happen anyroads. */ 170 /* Could sort here, but pun for now. Should not happen anyroads. */
167 if (prevbase > base) { 171 if (prevbase > base) {
168 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 172 pr_err("Node map not sorted %lx,%lx\n",
169 prevbase, base); 173 prevbase, base);
170 return -1; 174 return -1;
171 } 175 }
172 176
173 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 177 pr_info("Node %d MemBase %016lx Limit %016lx\n",
174 nodeid, base, limit); 178 nodeid, base, limit);
175 179
176 found++; 180 found++;
177 181
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
180 184
181 prevbase = base; 185 prevbase = base;
182 186
183 node_set(nodeid, node_possible_map); 187 node_set(nodeid, nodes_parsed);
184 } 188 }
185 189
186 if (!found) 190 if (!found)
187 return -1; 191 return -1;
192 return 0;
193}
194
195int __init k8_scan_nodes(void)
196{
197 unsigned int bits;
198 unsigned int cores;
199 unsigned int apicid_base;
200 int i;
188 201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
189 memnode_shift = compute_hash_shift(nodes, 8, NULL); 204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
190 if (memnode_shift < 0) { 205 if (memnode_shift < 0) {
191 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 206 pr_err("No NUMA node hash function found. Contact maintainer\n");
192 return -1; 207 return -1;
193 } 208 }
194 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 209 pr_info("Using node hash shift of %d\n", memnode_shift);
195 210
196 /* use the coreid bits from early_identify_cpu */ 211 /* use the coreid bits from early_identify_cpu */
197 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
200 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* need to get boot_cpu_id early for system with apicid lifting */
201 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
202 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
203 printk(KERN_INFO "BSP APIC ID: %02x\n", 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
204 boot_cpu_physical_apicid);
205 apicid_base = boot_cpu_physical_apicid; 219 apicid_base = boot_cpu_physical_apicid;
206 } 220 }
207 221
208 for (i = 0; i < 8; i++) { 222 for_each_node_mask(i, node_possible_map) {
209 if (nodes[i].start == nodes[i].end) 223 int j;
210 continue;
211 224
212 e820_register_active_regions(i, 225 e820_register_active_regions(i,
213 nodes[i].start >> PAGE_SHIFT, 226 nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 11a4ad4d6253..c0f6198565eb 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -5,6 +5,8 @@
5 * 2008 Pekka Paalanen <pq@iki.fi> 5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/list.h> 10#include <linux/list.h>
9#include <linux/rculist.h> 11#include <linux/rculist.h>
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
@@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
136 pte_t *pte = lookup_address(f->page, &level); 138 pte_t *pte = lookup_address(f->page, &level);
137 139
138 if (!pte) { 140 if (!pte) {
139 pr_err("kmmio: no pte for page 0x%08lx\n", f->page); 141 pr_err("no pte for page 0x%08lx\n", f->page);
140 return -1; 142 return -1;
141 } 143 }
142 144
@@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
148 clear_pte_presence(pte, clear, &f->old_presence); 150 clear_pte_presence(pte, clear, &f->old_presence);
149 break; 151 break;
150 default: 152 default:
151 pr_err("kmmio: unexpected page level 0x%x.\n", level); 153 pr_err("unexpected page level 0x%x.\n", level);
152 return -1; 154 return -1;
153 } 155 }
154 156
@@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
170static int arm_kmmio_fault_page(struct kmmio_fault_page *f) 172static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
171{ 173{
172 int ret; 174 int ret;
173 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); 175 WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
174 if (f->armed) { 176 if (f->armed) {
175 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", 177 pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
176 f->page, f->count, !!f->old_presence); 178 f->page, f->count, !!f->old_presence);
177 } 179 }
178 ret = clear_page_presence(f, true); 180 ret = clear_page_presence(f, true);
179 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); 181 WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
182 f->page);
180 f->armed = true; 183 f->armed = true;
181 return ret; 184 return ret;
182} 185}
@@ -203,7 +206,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
203 */ 206 */
204/* 207/*
205 * Interrupts are disabled on entry as trap3 is an interrupt gate 208 * Interrupts are disabled on entry as trap3 is an interrupt gate
206 * and they remain disabled thorough out this function. 209 * and they remain disabled throughout this function.
207 */ 210 */
208int kmmio_handler(struct pt_regs *regs, unsigned long addr) 211int kmmio_handler(struct pt_regs *regs, unsigned long addr)
209{ 212{
@@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
240 * condition needs handling by do_page_fault(), the 243 * condition needs handling by do_page_fault(), the
241 * page really not being present is the most common. 244 * page really not being present is the most common.
242 */ 245 */
243 pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", 246 pr_debug("secondary hit for 0x%08lx CPU %d.\n",
244 addr, smp_processor_id()); 247 addr, smp_processor_id());
245 248
246 if (!faultpage->old_presence) 249 if (!faultpage->old_presence)
247 pr_info("kmmio: unexpected secondary hit for " 250 pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
248 "address 0x%08lx on CPU %d.\n", addr, 251 addr, smp_processor_id());
249 smp_processor_id());
250 } else { 252 } else {
251 /* 253 /*
252 * Prevent overwriting already in-flight context. 254 * Prevent overwriting already in-flight context.
253 * This should not happen, let's hope disarming at 255 * This should not happen, let's hope disarming at
254 * least prevents a panic. 256 * least prevents a panic.
255 */ 257 */
256 pr_emerg("kmmio: recursive probe hit on CPU %d, " 258 pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
257 "for address 0x%08lx. Ignoring.\n", 259 smp_processor_id(), addr);
258 smp_processor_id(), addr); 260 pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
259 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
260 ctx->addr);
261 disarm_kmmio_fault_page(faultpage); 261 disarm_kmmio_fault_page(faultpage);
262 } 262 }
263 goto no_kmmio_ctx; 263 goto no_kmmio_ctx;
@@ -302,7 +302,7 @@ no_kmmio:
302 302
303/* 303/*
304 * Interrupts are disabled on entry as trap1 is an interrupt gate 304 * Interrupts are disabled on entry as trap1 is an interrupt gate
305 * and they remain disabled thorough out this function. 305 * and they remain disabled throughout this function.
306 * This must always get called as the pair to kmmio_handler(). 306 * This must always get called as the pair to kmmio_handler().
307 */ 307 */
308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) 308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
@@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
316 * something external causing them (f.e. using a debugger while 316 * something external causing them (f.e. using a debugger while
317 * mmio tracing enabled), or erroneous behaviour 317 * mmio tracing enabled), or erroneous behaviour
318 */ 318 */
319 pr_warning("kmmio: unexpected debug trap on CPU %d.\n", 319 pr_warning("unexpected debug trap on CPU %d.\n",
320 smp_processor_id()); 320 smp_processor_id());
321 goto out; 321 goto out;
322 } 322 }
323 323
@@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
425 list_add_rcu(&p->list, &kmmio_probes); 425 list_add_rcu(&p->list, &kmmio_probes);
426 while (size < size_lim) { 426 while (size < size_lim) {
427 if (add_kmmio_fault_page(p->addr + size)) 427 if (add_kmmio_fault_page(p->addr + size))
428 pr_err("kmmio: Unable to set page fault.\n"); 428 pr_err("Unable to set page fault.\n");
429 size += PAGE_SIZE; 429 size += PAGE_SIZE;
430 } 430 }
431out: 431out:
@@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
490 * 2. remove_kmmio_fault_pages() 490 * 2. remove_kmmio_fault_pages()
491 * Remove the pages from kmmio_page_table. 491 * Remove the pages from kmmio_page_table.
492 * 3. rcu_free_kmmio_fault_pages() 492 * 3. rcu_free_kmmio_fault_pages()
493 * Actally free the kmmio_fault_page structs as with RCU. 493 * Actually free the kmmio_fault_page structs as with RCU.
494 */ 494 */
495void unregister_kmmio_probe(struct kmmio_probe *p) 495void unregister_kmmio_probe(struct kmmio_probe *p)
496{ 496{
@@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
511 511
512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
513 if (!drelease) { 513 if (!drelease) {
514 pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); 514 pr_crit("leaking kmmio_fault_page objects.\n");
515 return; 515 return;
516 } 516 }
517 drelease->release_list = release_list; 517 drelease->release_list = release_list;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 132772a8ec57..4c765e9c4664 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -19,6 +19,9 @@
19 * 19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi. 20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */ 21 */
22
23#define pr_fmt(fmt) "mmiotrace: "
24
22#define DEBUG 1 25#define DEBUG 1
23 26
24#include <linux/module.h> 27#include <linux/module.h>
@@ -36,8 +39,6 @@
36 39
37#include "pf_in.h" 40#include "pf_in.h"
38 41
39#define NAME "mmiotrace: "
40
41struct trap_reason { 42struct trap_reason {
42 unsigned long addr; 43 unsigned long addr;
43 unsigned long ip; 44 unsigned long ip;
@@ -96,17 +97,18 @@ static void print_pte(unsigned long address)
96 pte_t *pte = lookup_address(address, &level); 97 pte_t *pte = lookup_address(address, &level);
97 98
98 if (!pte) { 99 if (!pte) {
99 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", 100 pr_err("Error in %s: no pte for page 0x%08lx\n",
100 __func__, address); 101 __func__, address);
101 return; 102 return;
102 } 103 }
103 104
104 if (level == PG_LEVEL_2M) { 105 if (level == PG_LEVEL_2M) {
105 pr_emerg(NAME "4MB pages are not currently supported: " 106 pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
106 "0x%08lx\n", address); 107 address);
107 BUG(); 108 BUG();
108 } 109 }
109 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, 110 pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
111 address,
110 (unsigned long long)pte_val(*pte), 112 (unsigned long long)pte_val(*pte),
111 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); 113 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
112} 114}
@@ -118,22 +120,21 @@ static void print_pte(unsigned long address)
118static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) 120static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
119{ 121{
120 const struct trap_reason *my_reason = &get_cpu_var(pf_reason); 122 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
121 pr_emerg(NAME "unexpected fault for address: 0x%08lx, " 123 pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
122 "last fault for address: 0x%08lx\n", 124 addr, my_reason->addr);
123 addr, my_reason->addr);
124 print_pte(addr); 125 print_pte(addr);
125 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); 126 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
126 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); 127 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
127#ifdef __i386__ 128#ifdef __i386__
128 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", 129 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
129 regs->ax, regs->bx, regs->cx, regs->dx); 130 regs->ax, regs->bx, regs->cx, regs->dx);
130 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 131 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
131 regs->si, regs->di, regs->bp, regs->sp); 132 regs->si, regs->di, regs->bp, regs->sp);
132#else 133#else
133 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", 134 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
134 regs->ax, regs->cx, regs->dx); 135 regs->ax, regs->cx, regs->dx);
135 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", 136 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
136 regs->si, regs->di, regs->bp, regs->sp); 137 regs->si, regs->di, regs->bp, regs->sp);
137#endif 138#endif
138 put_cpu_var(pf_reason); 139 put_cpu_var(pf_reason);
139 BUG(); 140 BUG();
@@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
213 /* this should always return the active_trace count to 0 */ 214 /* this should always return the active_trace count to 0 */
214 my_reason->active_traces--; 215 my_reason->active_traces--;
215 if (my_reason->active_traces) { 216 if (my_reason->active_traces) {
216 pr_emerg(NAME "unexpected post handler"); 217 pr_emerg("unexpected post handler");
217 BUG(); 218 BUG();
218 } 219 }
219 220
@@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
244 }; 245 };
245 246
246 if (!trace) { 247 if (!trace) {
247 pr_err(NAME "kmalloc failed in ioremap\n"); 248 pr_err("kmalloc failed in ioremap\n");
248 return; 249 return;
249 } 250 }
250 251
@@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
282 if (!is_enabled()) /* recheck and proper locking in *_core() */ 283 if (!is_enabled()) /* recheck and proper locking in *_core() */
283 return; 284 return;
284 285
285 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", 286 pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
286 (unsigned long long)offset, size, addr); 287 (unsigned long long)offset, size, addr);
287 if ((filter_offset) && (offset != filter_offset)) 288 if ((filter_offset) && (offset != filter_offset))
288 return; 289 return;
289 ioremap_trace_core(offset, size, addr); 290 ioremap_trace_core(offset, size, addr);
@@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr)
301 struct remap_trace *tmp; 302 struct remap_trace *tmp;
302 struct remap_trace *found_trace = NULL; 303 struct remap_trace *found_trace = NULL;
303 304
304 pr_debug(NAME "Unmapping %p.\n", addr); 305 pr_debug("Unmapping %p.\n", addr);
305 306
306 spin_lock_irq(&trace_lock); 307 spin_lock_irq(&trace_lock);
307 if (!is_enabled()) 308 if (!is_enabled())
@@ -363,9 +364,8 @@ static void clear_trace_list(void)
363 * Caller also ensures is_enabled() cannot change. 364 * Caller also ensures is_enabled() cannot change.
364 */ 365 */
365 list_for_each_entry(trace, &trace_list, list) { 366 list_for_each_entry(trace, &trace_list, list) {
366 pr_notice(NAME "purging non-iounmapped " 367 pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
367 "trace @0x%08lx, size 0x%lx.\n", 368 trace->probe.addr, trace->probe.len);
368 trace->probe.addr, trace->probe.len);
369 if (!nommiotrace) 369 if (!nommiotrace)
370 unregister_kmmio_probe(&trace->probe); 370 unregister_kmmio_probe(&trace->probe);
371 } 371 }
@@ -387,7 +387,7 @@ static void enter_uniprocessor(void)
387 387
388 if (downed_cpus == NULL && 388 if (downed_cpus == NULL &&
389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { 389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
390 pr_notice(NAME "Failed to allocate mask\n"); 390 pr_notice("Failed to allocate mask\n");
391 goto out; 391 goto out;
392 } 392 }
393 393
@@ -395,20 +395,19 @@ static void enter_uniprocessor(void)
395 cpumask_copy(downed_cpus, cpu_online_mask); 395 cpumask_copy(downed_cpus, cpu_online_mask);
396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); 396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
397 if (num_online_cpus() > 1) 397 if (num_online_cpus() > 1)
398 pr_notice(NAME "Disabling non-boot CPUs...\n"); 398 pr_notice("Disabling non-boot CPUs...\n");
399 put_online_cpus(); 399 put_online_cpus();
400 400
401 for_each_cpu(cpu, downed_cpus) { 401 for_each_cpu(cpu, downed_cpus) {
402 err = cpu_down(cpu); 402 err = cpu_down(cpu);
403 if (!err) 403 if (!err)
404 pr_info(NAME "CPU%d is down.\n", cpu); 404 pr_info("CPU%d is down.\n", cpu);
405 else 405 else
406 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); 406 pr_err("Error taking CPU%d down: %d\n", cpu, err);
407 } 407 }
408out: 408out:
409 if (num_online_cpus() > 1) 409 if (num_online_cpus() > 1)
410 pr_warning(NAME "multiple CPUs still online, " 410 pr_warning("multiple CPUs still online, may miss events.\n");
411 "may miss events.\n");
412} 411}
413 412
414/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, 413/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
@@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void)
420 419
421 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) 420 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
422 return; 421 return;
423 pr_notice(NAME "Re-enabling CPUs...\n"); 422 pr_notice("Re-enabling CPUs...\n");
424 for_each_cpu(cpu, downed_cpus) { 423 for_each_cpu(cpu, downed_cpus) {
425 err = cpu_up(cpu); 424 err = cpu_up(cpu);
426 if (!err) 425 if (!err)
427 pr_info(NAME "enabled CPU%d.\n", cpu); 426 pr_info("enabled CPU%d.\n", cpu);
428 else 427 else
429 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); 428 pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
430 } 429 }
431} 430}
432 431
@@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void)
434static void enter_uniprocessor(void) 433static void enter_uniprocessor(void)
435{ 434{
436 if (num_online_cpus() > 1) 435 if (num_online_cpus() > 1)
437 pr_warning(NAME "multiple CPUs are online, may miss events. " 436 pr_warning("multiple CPUs are online, may miss events. "
438 "Suggest booting with maxcpus=1 kernel argument.\n"); 437 "Suggest booting with maxcpus=1 kernel argument.\n");
439} 438}
440 439
441static void leave_uniprocessor(void) 440static void leave_uniprocessor(void)
@@ -450,13 +449,13 @@ void enable_mmiotrace(void)
450 goto out; 449 goto out;
451 450
452 if (nommiotrace) 451 if (nommiotrace)
453 pr_info(NAME "MMIO tracing disabled.\n"); 452 pr_info("MMIO tracing disabled.\n");
454 kmmio_init(); 453 kmmio_init();
455 enter_uniprocessor(); 454 enter_uniprocessor();
456 spin_lock_irq(&trace_lock); 455 spin_lock_irq(&trace_lock);
457 atomic_inc(&mmiotrace_enabled); 456 atomic_inc(&mmiotrace_enabled);
458 spin_unlock_irq(&trace_lock); 457 spin_unlock_irq(&trace_lock);
459 pr_info(NAME "enabled.\n"); 458 pr_info("enabled.\n");
460out: 459out:
461 mutex_unlock(&mmiotrace_mutex); 460 mutex_unlock(&mmiotrace_mutex);
462} 461}
@@ -475,7 +474,7 @@ void disable_mmiotrace(void)
475 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 474 clear_trace_list(); /* guarantees: no more kmmio callbacks */
476 leave_uniprocessor(); 475 leave_uniprocessor();
477 kmmio_cleanup(); 476 kmmio_cleanup();
478 pr_info(NAME "disabled.\n"); 477 pr_info("disabled.\n");
479out: 478out:
480 mutex_unlock(&mmiotrace_mutex); 479 mutex_unlock(&mmiotrace_mutex);
481} 480}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..b20760ca7244 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
347 (ulong) node_remap_end_vaddr[nid]); 347 (ulong) node_remap_end_vaddr[nid]);
348} 348}
349 349
350void __init initmem_init(unsigned long start_pfn, 350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 unsigned long end_pfn) 351 int acpi, int k8)
352{ 352{
353 int nid; 353 int nid;
354 long kva_target_pfn; 354 long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..83bbc70d11bb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 239 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 241 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) 242 if (nodedata_phys < start || nodedata_phys >= end) {
243 free_bootmem(nodedata_phys, pgdat_size); 243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
244 node_data[nodeid] = NULL; 250 node_data[nodeid] = NULL;
245 return; 251 return;
246 } 252 }
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
306 312
307#ifdef CONFIG_NUMA_EMU 313#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */ 314/* Numa emulation */
315static struct bootnode nodes[MAX_NUMNODES] __initdata;
316static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309static char *cmdline __initdata; 317static char *cmdline __initdata;
310 318
319static int __init setup_physnodes(unsigned long start, unsigned long end,
320 int acpi, int k8)
321{
322 int nr_nodes = 0;
323 int ret = 0;
324 int i;
325
326#ifdef CONFIG_ACPI_NUMA
327 if (acpi)
328 nr_nodes = acpi_get_nodes(physnodes);
329#endif
330#ifdef CONFIG_K8_NUMA
331 if (k8)
332 nr_nodes = k8_get_nodes(physnodes);
333#endif
334 /*
335 * Basic sanity checking on the physical node map: there may be errors
336 * if the SRAT or K8 incorrectly reported the topology or the mem=
337 * kernel parameter is used.
338 */
339 for (i = 0; i < nr_nodes; i++) {
340 if (physnodes[i].start == physnodes[i].end)
341 continue;
342 if (physnodes[i].start > end) {
343 physnodes[i].end = physnodes[i].start;
344 continue;
345 }
346 if (physnodes[i].end < start) {
347 physnodes[i].start = physnodes[i].end;
348 continue;
349 }
350 if (physnodes[i].start < start)
351 physnodes[i].start = start;
352 if (physnodes[i].end > end)
353 physnodes[i].end = end;
354 }
355
356 /*
357 * Remove all nodes that have no memory or were truncated because of the
358 * limited address range.
359 */
360 for (i = 0; i < nr_nodes; i++) {
361 if (physnodes[i].start == physnodes[i].end)
362 continue;
363 physnodes[ret].start = physnodes[i].start;
364 physnodes[ret].end = physnodes[i].end;
365 ret++;
366 }
367
368 /*
369 * If no physical topology was detected, a single node is faked to cover
370 * the entire address space.
371 */
372 if (!ret) {
373 physnodes[ret].start = start;
374 physnodes[ret].end = end;
375 ret = 1;
376 }
377 return ret;
378}
379
311/* 380/*
312 * Setups up nid to range from addr to addr + size. If the end 381 * Setups up nid to range from addr to addr + size. If the end
313 * boundary is greater than max_addr, then max_addr is used instead. 382 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
315 * allocation past addr and -1 otherwise. addr is adjusted to be at 384 * allocation past addr and -1 otherwise. addr is adjusted to be at
316 * the end of the node. 385 * the end of the node.
317 */ 386 */
318static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 387static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319 u64 size, u64 max_addr)
320{ 388{
321 int ret = 0; 389 int ret = 0;
322
323 nodes[nid].start = *addr; 390 nodes[nid].start = *addr;
324 *addr += size; 391 *addr += size;
325 if (*addr >= max_addr) { 392 if (*addr >= max_addr) {
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335} 402}
336 403
337/* 404/*
405 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
406 * to max_addr. The return value is the number of nodes allocated.
407 */
408static int __init split_nodes_interleave(u64 addr, u64 max_addr,
409 int nr_phys_nodes, int nr_nodes)
410{
411 nodemask_t physnode_mask = NODE_MASK_NONE;
412 u64 size;
413 int big;
414 int ret = 0;
415 int i;
416
417 if (nr_nodes <= 0)
418 return -1;
419 if (nr_nodes > MAX_NUMNODES) {
420 pr_info("numa=fake=%d too large, reducing to %d\n",
421 nr_nodes, MAX_NUMNODES);
422 nr_nodes = MAX_NUMNODES;
423 }
424
425 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
426 /*
427 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder.
429 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
431 FAKE_NODE_MIN_SIZE;
432
433 size &= FAKE_NODE_MIN_HASH_MASK;
434 if (!size) {
435 pr_err("Not enough memory for each node. "
436 "NUMA emulation disabled.\n");
437 return -1;
438 }
439
440 for (i = 0; i < nr_phys_nodes; i++)
441 if (physnodes[i].start != physnodes[i].end)
442 node_set(i, physnode_mask);
443
444 /*
445 * Continue to fill physical nodes with fake nodes until there is no
446 * memory left on any of them.
447 */
448 while (nodes_weight(physnode_mask)) {
449 for_each_node_mask(i, physnode_mask) {
450 u64 end = physnodes[i].start + size;
451 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
452
453 if (ret < big)
454 end += FAKE_NODE_MIN_SIZE;
455
456 /*
457 * Continue to add memory to this fake node if its
458 * non-reserved memory is less than the per-node size.
459 */
460 while (end - physnodes[i].start -
461 e820_hole_size(physnodes[i].start, end) < size) {
462 end += FAKE_NODE_MIN_SIZE;
463 if (end > physnodes[i].end) {
464 end = physnodes[i].end;
465 break;
466 }
467 }
468
469 /*
470 * If there won't be at least FAKE_NODE_MIN_SIZE of
471 * non-reserved memory in ZONE_DMA32 for the next node,
472 * this one must extend to the boundary.
473 */
474 if (end < dma32_end && dma32_end - end -
475 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
476 end = dma32_end;
477
478 /*
479 * If there won't be enough non-reserved memory for the
480 * next node, this one must extend to the end of the
481 * physical node.
482 */
483 if (physnodes[i].end - end -
484 e820_hole_size(end, physnodes[i].end) < size)
485 end = physnodes[i].end;
486
487 /*
488 * Avoid allocating more nodes than requested, which can
489 * happen as a result of rounding down each node's size
490 * to FAKE_NODE_MIN_SIZE.
491 */
492 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
493 end = physnodes[i].end;
494
495 if (setup_node_range(ret++, &physnodes[i].start,
496 end - physnodes[i].start,
497 physnodes[i].end) < 0)
498 node_clear(i, physnode_mask);
499 }
500 }
501 return ret;
502}
503
504/*
338 * Splits num_nodes nodes up equally starting at node_start. The return value 505 * Splits num_nodes nodes up equally starting at node_start. The return value
339 * is the number of nodes split up and addr is adjusted to be at the end of the 506 * is the number of nodes split up and addr is adjusted to be at the end of the
340 * last node allocated. 507 * last node allocated.
341 */ 508 */
342static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, 509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
343 u64 max_addr, int node_start,
344 int num_nodes) 510 int num_nodes)
345{ 511{
346 unsigned int big; 512 unsigned int big;
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
388 break; 554 break;
389 } 555 }
390 } 556 }
391 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) 557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
392 break; 558 break;
393 } 559 }
394 return i - node_start + 1; 560 return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
399 * always assigned to a final node and can be asymmetric. Returns the number of 565 * always assigned to a final node and can be asymmetric. Returns the number of
400 * nodes split. 566 * nodes split.
401 */ 567 */
402static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, 568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
403 u64 max_addr, int node_start, u64 size) 569 u64 size)
404{ 570{
405 int i = node_start; 571 int i = node_start;
406 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
407 while (!setup_node_range(i++, nodes, addr, size, max_addr)) 573 while (!setup_node_range(i++, addr, size, max_addr))
408 ; 574 ;
409 return i - node_start; 575 return i - node_start;
410} 576}
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413 * Sets up the system RAM area from start_pfn to last_pfn according to the 579 * Sets up the system RAM area from start_pfn to last_pfn according to the
414 * numa=fake command-line option. 580 * numa=fake command-line option.
415 */ 581 */
416static struct bootnode nodes[MAX_NUMNODES] __initdata; 582static int __init numa_emulation(unsigned long start_pfn,
417 583 unsigned long last_pfn, int acpi, int k8)
418static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419{ 584{
420 u64 size, addr = start_pfn << PAGE_SHIFT; 585 u64 size, addr = start_pfn << PAGE_SHIFT;
421 u64 max_addr = last_pfn << PAGE_SHIFT; 586 u64 max_addr = last_pfn << PAGE_SHIFT;
422 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes;
423 589
424 memset(&nodes, 0, sizeof(nodes)); 590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
425 /* 591 /*
426 * If the numa=fake command-line is just a single number N, split the 592 * If the numa=fake command-line is just a single number N, split the
427 * system RAM into N fake nodes. 593 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
429 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
430 long n = simple_strtol(cmdline, NULL, 0); 596 long n = simple_strtol(cmdline, NULL, 0);
431 597
432 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); 598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
433 if (num_nodes < 0) 600 if (num_nodes < 0)
434 return num_nodes; 601 return num_nodes;
435 goto out; 602 goto out;
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
456 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; 623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
457 if (size) 624 if (size)
458 for (i = 0; i < coeff; i++, num_nodes++) 625 for (i = 0; i < coeff; i++, num_nodes++)
459 if (setup_node_range(num_nodes, nodes, 626 if (setup_node_range(num_nodes, &addr,
460 &addr, size, max_addr) < 0) 627 size, max_addr) < 0)
461 goto done; 628 goto done;
462 if (!*cmdline) 629 if (!*cmdline)
463 break; 630 break;
@@ -473,7 +640,7 @@ done:
473 if (addr < max_addr) { 640 if (addr < max_addr) {
474 if (coeff_flag && coeff < 0) { 641 if (coeff_flag && coeff < 0) {
475 /* Split remaining nodes into num-sized chunks */ 642 /* Split remaining nodes into num-sized chunks */
476 num_nodes += split_nodes_by_size(nodes, &addr, max_addr, 643 num_nodes += split_nodes_by_size(&addr, max_addr,
477 num_nodes, num); 644 num_nodes, num);
478 goto out; 645 goto out;
479 } 646 }
@@ -482,7 +649,7 @@ done:
482 /* Split remaining nodes into coeff chunks */ 649 /* Split remaining nodes into coeff chunks */
483 if (coeff <= 0) 650 if (coeff <= 0)
484 break; 651 break;
485 num_nodes += split_nodes_equally(nodes, &addr, max_addr, 652 num_nodes += split_nodes_equally(&addr, max_addr,
486 num_nodes, coeff); 653 num_nodes, coeff);
487 break; 654 break;
488 case ',': 655 case ',':
@@ -490,8 +657,8 @@ done:
490 break; 657 break;
491 default: 658 default:
492 /* Give one final node */ 659 /* Give one final node */
493 setup_node_range(num_nodes, nodes, &addr, 660 setup_node_range(num_nodes, &addr, max_addr - addr,
494 max_addr - addr, max_addr); 661 max_addr);
495 num_nodes++; 662 num_nodes++;
496 } 663 }
497 } 664 }
@@ -505,14 +672,10 @@ out:
505 } 672 }
506 673
507 /* 674 /*
508 * We need to vacate all active ranges that may have been registered by 675 * We need to vacate all active ranges that may have been registered for
509 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns 676 * the e820 memory map.
510 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511 */ 677 */
512 remove_all_active_ranges(); 678 remove_all_active_ranges();
513#ifdef CONFIG_ACPI_NUMA
514 acpi_numa = -1;
515#endif
516 for_each_node_mask(i, node_possible_map) { 679 for_each_node_mask(i, node_possible_map) {
517 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 680 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518 nodes[i].end >> PAGE_SHIFT); 681 nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
524} 687}
525#endif /* CONFIG_NUMA_EMU */ 688#endif /* CONFIG_NUMA_EMU */
526 689
527void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) 690void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
691 int acpi, int k8)
528{ 692{
529 int i; 693 int i;
530 694
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
532 nodes_clear(node_online_map); 696 nodes_clear(node_online_map);
533 697
534#ifdef CONFIG_NUMA_EMU 698#ifdef CONFIG_NUMA_EMU
535 if (cmdline && !numa_emulation(start_pfn, last_pfn)) 699 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
536 return; 700 return;
537 nodes_clear(node_possible_map); 701 nodes_clear(node_possible_map);
538 nodes_clear(node_online_map); 702 nodes_clear(node_online_map);
539#endif 703#endif
540 704
541#ifdef CONFIG_ACPI_NUMA 705#ifdef CONFIG_ACPI_NUMA
542 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 706 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
543 last_pfn << PAGE_SHIFT)) 707 last_pfn << PAGE_SHIFT))
544 return; 708 return;
545 nodes_clear(node_possible_map); 709 nodes_clear(node_possible_map);
546 nodes_clear(node_online_map); 710 nodes_clear(node_online_map);
547#endif 711#endif
548 712
549#ifdef CONFIG_K8_NUMA 713#ifdef CONFIG_K8_NUMA
550 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 714 if (!numa_off && k8 && !k8_scan_nodes())
551 last_pfn<<PAGE_SHIFT))
552 return; 715 return;
553 nodes_clear(node_possible_map); 716 nodes_clear(node_possible_map);
554 nodes_clear(node_online_map); 717 nodes_clear(node_online_map);
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
601early_param("numa", numa_setup); 764early_param("numa", numa_setup);
602 765
603#ifdef CONFIG_NUMA 766#ifdef CONFIG_NUMA
767
768static __init int find_near_online_node(int node)
769{
770 int n, val;
771 int min_val = INT_MAX;
772 int best_node = -1;
773
774 for_each_online_node(n) {
775 val = node_distance(node, n);
776
777 if (val < min_val) {
778 min_val = val;
779 best_node = n;
780 }
781 }
782
783 return best_node;
784}
785
604/* 786/*
605 * Setup early cpu_to_node. 787 * Setup early cpu_to_node.
606 * 788 *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
632 if (node == NUMA_NO_NODE) 814 if (node == NUMA_NO_NODE)
633 continue; 815 continue;
634 if (!node_online(node)) 816 if (!node_online(node))
635 continue; 817 node = find_near_online_node(node);
636 numa_set_node(cpu, node); 818 numa_set_node(cpu, node);
637 } 819 }
638} 820}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..1d4eb93d333c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 280 pgprot_val(forbidden) |= _PAGE_RW;
281 281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /*
284 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
285 * kernel text mappings for the large page aligned text, rodata sections
286 * will be always read-only. For the kernel identity mappings covering
287 * the holes caused by this alignment can be anything that user asks.
288 *
289 * This will preserve the large page mappings for kernel text/data
290 * at no extra cost.
291 */
292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align))
295 pgprot_val(forbidden) |= _PAGE_RW;
296#endif
297
282 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 298 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
283 299
284 return prot; 300 return prot;
@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
1069 1085
1070int set_memory_x(unsigned long addr, int numpages) 1086int set_memory_x(unsigned long addr, int numpages)
1071{ 1087{
1088 if (!(__supported_pte_mask & _PAGE_NX))
1089 return 0;
1090
1072 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1091 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1073} 1092}
1074EXPORT_SYMBOL(set_memory_x); 1093EXPORT_SYMBOL(set_memory_x);
1075 1094
1076int set_memory_nx(unsigned long addr, int numpages) 1095int set_memory_nx(unsigned long addr, int numpages)
1077{ 1096{
1097 if (!(__supported_pte_mask & _PAGE_NX))
1098 return 0;
1099
1078 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1100 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1079} 1101}
1080EXPORT_SYMBOL(set_memory_nx); 1102EXPORT_SYMBOL(set_memory_nx);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..ae9648eb1c7f 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23#include <asm/x86_init.h>
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/fcntl.h> 25#include <asm/fcntl.h>
25#include <asm/e820.h> 26#include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
355 * - _PAGE_CACHE_UC_MINUS 356 * - _PAGE_CACHE_UC_MINUS
356 * - _PAGE_CACHE_UC 357 * - _PAGE_CACHE_UC
357 * 358 *
358 * req_type will have a special case value '-1', when requester want to inherit
359 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
360 *
361 * If new_type is NULL, function will return an error if it cannot reserve the 359 * If new_type is NULL, function will return an error if it cannot reserve the
362 * region with req_type. If new_type is non-NULL, function will return 360 * region with req_type. If new_type is non-NULL, function will return
363 * available type in new_type in case of no error. In case of any error 361 * available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
377 if (!pat_enabled) { 375 if (!pat_enabled) {
378 /* This is identical to page table setting without PAT */ 376 /* This is identical to page table setting without PAT */
379 if (new_type) { 377 if (new_type) {
380 if (req_type == -1) 378 if (req_type == _PAGE_CACHE_WC)
381 *new_type = _PAGE_CACHE_WB;
382 else if (req_type == _PAGE_CACHE_WC)
383 *new_type = _PAGE_CACHE_UC_MINUS; 379 *new_type = _PAGE_CACHE_UC_MINUS;
384 else 380 else
385 *new_type = req_type & _PAGE_CACHE_MASK; 381 *new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
388 } 384 }
389 385
390 /* Low ISA region is always mapped WB in page table. No need to track */ 386 /* Low ISA region is always mapped WB in page table. No need to track */
391 if (is_ISA_range(start, end - 1)) { 387 if (x86_platform.is_untracked_pat_range(start, end)) {
392 if (new_type) 388 if (new_type)
393 *new_type = _PAGE_CACHE_WB; 389 *new_type = _PAGE_CACHE_WB;
394 return 0; 390 return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
499 return 0; 495 return 0;
500 496
501 /* Low ISA region is always mapped WB. No need to track */ 497 /* Low ISA region is always mapped WB. No need to track */
502 if (is_ISA_range(start, end - 1)) 498 if (x86_platform.is_untracked_pat_range(start, end))
503 return 0; 499 return 0;
504 500
505 is_range_ram = pat_pagerange_is_ram(start, end); 501 is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
582 int rettype = _PAGE_CACHE_WB; 578 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry; 579 struct memtype *entry;
584 580
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 581 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
586 return rettype; 582 return rettype;
587 583
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 584 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -708,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
708 if (!range_is_allowed(pfn, size)) 704 if (!range_is_allowed(pfn, size))
709 return 0; 705 return 0;
710 706
711 if (file->f_flags & O_SYNC) { 707 if (file->f_flags & O_DSYNC)
712 flags = _PAGE_CACHE_UC_MINUS; 708 flags = _PAGE_CACHE_UC_MINUS;
713 }
714 709
715#ifdef CONFIG_X86_32 710#ifdef CONFIG_X86_32
716 /* 711 /*
@@ -1018,8 +1013,10 @@ static const struct file_operations memtype_fops = {
1018 1013
1019static int __init pat_memtype_list_init(void) 1014static int __init pat_memtype_list_init(void)
1020{ 1015{
1021 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, 1016 if (pat_enabled) {
1022 NULL, &memtype_fops); 1017 debugfs_create_file("pat_memtype_list", S_IRUSR,
1018 arch_debugfs_dir, NULL, &memtype_fops);
1019 }
1023 return 0; 1020 return 0;
1024} 1021}
1025 1022
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h>
6 7
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata; 8static int disable_nx __cpuinitdata;
11 9
12/* 10/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
22 if (!str) 20 if (!str)
23 return -EINVAL; 21 return -EINVAL;
24 if (!strncmp(str, "on", 2)) { 22 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0; 23 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) { 24 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1; 25 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 } 26 }
27 x86_configure_nx();
31 return 0; 28 return 0;
32} 29}
33early_param("noexec", noexec_setup); 30early_param("noexec", noexec_setup);
34#endif
35 31
36#ifdef CONFIG_X86_PAE 32void __cpuinit x86_configure_nx(void)
37void __init set_nx(void)
38{ 33{
39 unsigned int v[4], l, h; 34 if (cpu_has_nx && !disable_nx)
40 35 __supported_pte_mask |= _PAGE_NX;
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 36 else
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 37 __supported_pte_mask &= ~_PAGE_NX;
38}
43 39
44 if ((v[3] & (1 << 20)) && !disable_nx) { 40void __init x86_report_nx(void)
45 rdmsr(MSR_EFER, l, h); 41{
46 l |= EFER_NX; 42 if (!cpu_has_nx) {
47 wrmsr(MSR_EFER, l, h); 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
48 nx_enabled = 1; 44 "missing in CPU or disabled in BIOS!\n");
49 __supported_pte_mask |= _PAGE_NX; 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) {
48 printk(KERN_INFO "NX (Execute Disable) protection: "
49 "disabled by kernel command line option\n");
50 } else {
51 printk(KERN_INFO "NX (Execute Disable) protection: "
52 "active\n");
50 } 53 }
51 }
52}
53#else 54#else
54void set_nx(void) 55 /* 32bit non-PAE kernel, NX cannot be used */
55{ 56 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
56} 57 "cannot be enabled: non-PAE kernel!\n");
57#endif 58#endif
58 59 }
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67} 60}
68#endif
69
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 9d7ce96e5a5c..d89075489664 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 290
291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
292 start, end); 292 start, end);
293 e820_register_active_regions(node, start >> PAGE_SHIFT,
294 end >> PAGE_SHIFT);
295 293
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end); 295 update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
338 336
339void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
340 338
339int __init acpi_get_nodes(struct bootnode *physnodes)
340{
341 int i;
342 int ret = 0;
343
344 for_each_node_mask(i, nodes_parsed) {
345 physnodes[ret].start = nodes[i].start;
346 physnodes[ret].end = nodes[i].end;
347 ret++;
348 }
349 return ret;
350}
351
341/* Use the information discovered above to actually set up the nodes. */ 352/* Use the information discovered above to actually set up the nodes. */
342int __init acpi_scan_nodes(unsigned long start, unsigned long end) 353int __init acpi_scan_nodes(unsigned long start, unsigned long end)
343{ 354{
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
350 for (i = 0; i < MAX_NUMNODES; i++) 361 for (i = 0; i < MAX_NUMNODES; i++)
351 cutoff_node(i, start, end); 362 cutoff_node(i, start, end);
352 363
353 if (!nodes_cover_memory(nodes)) {
354 bad_srat();
355 return -1;
356 }
357
358 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 364 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
359 memblk_nodeid); 365 memblk_nodeid);
360 if (memnode_shift < 0) { 366 if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
364 return -1; 370 return -1;
365 } 371 }
366 372
373 for_each_node_mask(i, nodes_parsed)
374 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
375 nodes[i].end >> PAGE_SHIFT);
376 if (!nodes_cover_memory(nodes)) {
377 bad_srat();
378 return -1;
379 }
380
367 /* Account for nodes with cpus and no memory */ 381 /* Account for nodes with cpus and no memory */
368 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 382 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
369 383
@@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
454 for (i = 0; i < num_nodes; i++) 468 for (i = 0; i < num_nodes; i++)
455 if (fake_nodes[i].start != fake_nodes[i].end) 469 if (fake_nodes[i].start != fake_nodes[i].end)
456 node_set(i, nodes_parsed); 470 node_set(i, nodes_parsed);
457 WARN_ON(!nodes_cover_memory(fake_nodes));
458} 471}
459 472
460static int null_slit_node_compare(int a, int b) 473static int null_slit_node_compare(int a, int b)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..65b58e4b0b8b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
8 8
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
11#include <asm/cache.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/uv/uv.h> 13#include <asm/uv/uv.h>
13 14
@@ -43,7 +44,7 @@ union smp_flush_state {
43 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 }; 46 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp; 48} ____cacheline_internodealigned_in_smp;
48 49
49/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e740ea..564b008a51c7 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,3 +15,8 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
18obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o
19
20ifeq ($(CONFIG_PCI_DEBUG),y)
21EXTRA_CFLAGS += -DDEBUG
22endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1014eb4bfc37..959e548a7039 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -7,6 +7,7 @@
7#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
8 8
9struct pci_root_info { 9struct pci_root_info {
10 struct acpi_device *bridge;
10 char *name; 11 char *name;
11 unsigned int res_num; 12 unsigned int res_num;
12 struct resource *res; 13 struct resource *res;
@@ -58,6 +59,30 @@ bus_has_transparent_bridge(struct pci_bus *bus)
58 return false; 59 return false;
59} 60}
60 61
62static void
63align_resource(struct acpi_device *bridge, struct resource *res)
64{
65 int align = (res->flags & IORESOURCE_MEM) ? 16 : 4;
66
67 /*
68 * Host bridge windows are not BARs, but the decoders on the PCI side
69 * that claim this address space have starting alignment and length
70 * constraints, so fix any obvious BIOS goofs.
71 */
72 if (!IS_ALIGNED(res->start, align)) {
73 dev_printk(KERN_DEBUG, &bridge->dev,
74 "host bridge window %pR invalid; "
75 "aligning start to %d-byte boundary\n", res, align);
76 res->start &= ~(align - 1);
77 }
78 if (!IS_ALIGNED(res->end + 1, align)) {
79 dev_printk(KERN_DEBUG, &bridge->dev,
80 "host bridge window %pR invalid; "
81 "aligning end to %d-byte boundary\n", res, align);
82 res->end = ALIGN(res->end, align) - 1;
83 }
84}
85
61static acpi_status 86static acpi_status
62setup_resource(struct acpi_resource *acpi_res, void *data) 87setup_resource(struct acpi_resource *acpi_res, void *data)
63{ 88{
@@ -91,11 +116,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
91 start = addr.minimum + addr.translation_offset; 116 start = addr.minimum + addr.translation_offset;
92 end = start + addr.address_length - 1; 117 end = start + addr.address_length - 1;
93 if (info->res_num >= max_root_bus_resources) { 118 if (info->res_num >= max_root_bus_resources) {
94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " 119 if (pci_probe & PCI_USE__CRS)
95 "from %s for %s due to _CRS returning more than " 120 printk(KERN_WARNING "PCI: Failed to allocate "
96 "%d resource descriptors\n", (unsigned long) start, 121 "0x%lx-0x%lx from %s for %s due to _CRS "
97 (unsigned long) end, root->name, info->name, 122 "returning more than %d resource descriptors\n",
98 max_root_bus_resources); 123 (unsigned long) start, (unsigned long) end,
124 root->name, info->name, max_root_bus_resources);
99 return AE_OK; 125 return AE_OK;
100 } 126 }
101 127
@@ -105,14 +131,28 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
105 res->start = start; 131 res->start = start;
106 res->end = end; 132 res->end = end;
107 res->child = NULL; 133 res->child = NULL;
134 align_resource(info->bridge, res);
135
136 if (!(pci_probe & PCI_USE__CRS)) {
137 dev_printk(KERN_DEBUG, &info->bridge->dev,
138 "host bridge window %pR (ignored)\n", res);
139 return AE_OK;
140 }
108 141
109 if (insert_resource(root, res)) { 142 if (insert_resource(root, res)) {
110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 143 dev_err(&info->bridge->dev,
111 "from %s for %s\n", (unsigned long) res->start, 144 "can't allocate host bridge window %pR\n", res);
112 (unsigned long) res->end, root->name, info->name);
113 } else { 145 } else {
114 info->bus->resource[info->res_num] = res; 146 info->bus->resource[info->res_num] = res;
115 info->res_num++; 147 info->res_num++;
148 if (addr.translation_offset)
149 dev_info(&info->bridge->dev, "host bridge window %pR "
150 "(PCI address [%#llx-%#llx])\n",
151 res, res->start - addr.translation_offset,
152 res->end - addr.translation_offset);
153 else
154 dev_info(&info->bridge->dev,
155 "host bridge window %pR\n", res);
116 } 156 }
117 return AE_OK; 157 return AE_OK;
118} 158}
@@ -124,6 +164,12 @@ get_current_resources(struct acpi_device *device, int busnum,
124 struct pci_root_info info; 164 struct pci_root_info info;
125 size_t size; 165 size_t size;
126 166
167 if (!(pci_probe & PCI_USE__CRS))
168 dev_info(&device->dev,
169 "ignoring host bridge windows from ACPI; "
170 "boot with \"pci=use_crs\" to use them\n");
171
172 info.bridge = device;
127 info.bus = bus; 173 info.bus = bus;
128 info.res_num = 0; 174 info.res_num = 0;
129 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 175 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
@@ -163,8 +209,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
163#endif 209#endif
164 210
165 if (domain && !pci_domains_supported) { 211 if (domain && !pci_domains_supported) {
166 printk(KERN_WARNING "PCI: Multiple domains not supported " 212 printk(KERN_WARNING "pci_bus %04x:%02x: "
167 "(dom %d, bus %d)\n", domain, busnum); 213 "ignored (multiple domains not supported)\n",
214 domain, busnum);
168 return NULL; 215 return NULL;
169 } 216 }
170 217
@@ -188,7 +235,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
188 */ 235 */
189 sd = kzalloc(sizeof(*sd), GFP_KERNEL); 236 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
190 if (!sd) { 237 if (!sd) {
191 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); 238 printk(KERN_WARNING "pci_bus %04x:%02x: "
239 "ignored (out of memory)\n", domain, busnum);
192 return NULL; 240 return NULL;
193 } 241 }
194 242
@@ -209,9 +257,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
209 } else { 257 } else {
210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); 258 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) { 259 if (bus) {
212 if (pci_probe & PCI_USE__CRS) 260 get_current_resources(device, busnum, domain, bus);
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus); 261 bus->subordinate = pci_scan_child_bus(bus);
216 } 262 }
217 } 263 }
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 572ee9782f2a..95ecbd495955 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -6,10 +6,10 @@
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 8#include <asm/pci-direct.h>
9#include <asm/mpspec.h>
10#include <linux/cpumask.h>
11#endif 9#endif
12 10
11#include "bus_numa.h"
12
13/* 13/*
14 * This discovers the pcibus <-> node mapping on AMD K8. 14 * This discovers the pcibus <-> node mapping on AMD K8.
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
@@ -17,67 +17,6 @@
17 17
18#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
19 19
20/*
21 * sub bus (transparent) will use entres from 3 to store extra from root,
22 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
23 */
24#define RES_NUM 16
25struct pci_root_info {
26 char name[12];
27 unsigned int res_num;
28 struct resource res[RES_NUM];
29 int bus_min;
30 int bus_max;
31 int node;
32 int link;
33};
34
35/* 4 at this time, it may become to 32 */
36#define PCI_ROOT_NR 4
37static int pci_root_num;
38static struct pci_root_info pci_root_info[PCI_ROOT_NR];
39
40void x86_pci_root_bus_res_quirks(struct pci_bus *b)
41{
42 int i;
43 int j;
44 struct pci_root_info *info;
45
46 /* don't go for it if _CRS is used already */
47 if (b->resource[0] != &ioport_resource ||
48 b->resource[1] != &iomem_resource)
49 return;
50
51 /* if only one root bus, don't need to anything */
52 if (pci_root_num < 2)
53 return;
54
55 for (i = 0; i < pci_root_num; i++) {
56 if (pci_root_info[i].bus_min == b->number)
57 break;
58 }
59
60 if (i == pci_root_num)
61 return;
62
63 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
64 b->number);
65
66 info = &pci_root_info[i];
67 for (j = 0; j < info->res_num; j++) {
68 struct resource *res;
69 struct resource *root;
70
71 res = &info->res[j];
72 b->resource[j] = res;
73 if (res->flags & IORESOURCE_IO)
74 root = &ioport_resource;
75 else
76 root = &iomem_resource;
77 insert_resource(root, res);
78 }
79}
80
81#define RANGE_NUM 16 20#define RANGE_NUM 16
82 21
83struct res_range { 22struct res_range {
@@ -130,52 +69,6 @@ static void __init update_range(struct res_range *range, size_t start,
130 } 69 }
131} 70}
132 71
133static void __init update_res(struct pci_root_info *info, size_t start,
134 size_t end, unsigned long flags, int merge)
135{
136 int i;
137 struct resource *res;
138
139 if (!merge)
140 goto addit;
141
142 /* try to merge it with old one */
143 for (i = 0; i < info->res_num; i++) {
144 size_t final_start, final_end;
145 size_t common_start, common_end;
146
147 res = &info->res[i];
148 if (res->flags != flags)
149 continue;
150
151 common_start = max((size_t)res->start, start);
152 common_end = min((size_t)res->end, end);
153 if (common_start > common_end + 1)
154 continue;
155
156 final_start = min((size_t)res->start, start);
157 final_end = max((size_t)res->end, end);
158
159 res->start = final_start;
160 res->end = final_end;
161 return;
162 }
163
164addit:
165
166 /* need to add that */
167 if (info->res_num >= RES_NUM)
168 return;
169
170 res = &info->res[info->res_num];
171 res->name = info->name;
172 res->flags = flags;
173 res->start = start;
174 res->end = end;
175 res->child = NULL;
176 info->res_num++;
177}
178
179struct pci_hostbridge_probe { 72struct pci_hostbridge_probe {
180 u32 bus; 73 u32 bus;
181 u32 slot; 74 u32 slot;
@@ -230,7 +123,6 @@ static int __init early_fill_mp_bus_info(void)
230 int j; 123 int j;
231 unsigned bus; 124 unsigned bus;
232 unsigned slot; 125 unsigned slot;
233 int found;
234 int node; 126 int node;
235 int link; 127 int link;
236 int def_node; 128 int def_node;
@@ -247,7 +139,7 @@ static int __init early_fill_mp_bus_info(void)
247 if (!early_pci_allowed()) 139 if (!early_pci_allowed())
248 return -1; 140 return -1;
249 141
250 found = 0; 142 found_all_numa_early = 0;
251 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 143 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
252 u32 id; 144 u32 id;
253 u16 device; 145 u16 device;
@@ -261,12 +153,12 @@ static int __init early_fill_mp_bus_info(void)
261 device = (id>>16) & 0xffff; 153 device = (id>>16) & 0xffff;
262 if (pci_probes[i].vendor == vendor && 154 if (pci_probes[i].vendor == vendor &&
263 pci_probes[i].device == device) { 155 pci_probes[i].device == device) {
264 found = 1; 156 found_all_numa_early = 1;
265 break; 157 break;
266 } 158 }
267 } 159 }
268 160
269 if (!found) 161 if (!found_all_numa_early)
270 return 0; 162 return 0;
271 163
272 pci_root_num = 0; 164 pci_root_num = 0;
@@ -488,7 +380,7 @@ static int __init early_fill_mp_bus_info(void)
488 info = &pci_root_info[i]; 380 info = &pci_root_info[i];
489 res_num = info->res_num; 381 res_num = info->res_num;
490 busnum = info->bus_min; 382 busnum = info->bus_min;
491 printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", 383 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
492 info->bus_min, info->bus_max, info->node, info->link); 384 info->bus_min, info->bus_max, info->node, info->link);
493 for (j = 0; j < res_num; j++) { 385 for (j = 0; j < res_num; j++) {
494 res = &info->res[j]; 386 res = &info->res[j];
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
new file mode 100644
index 000000000000..145df00e0387
--- /dev/null
+++ b/arch/x86/pci/bus_numa.c
@@ -0,0 +1,101 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3
4#include "bus_numa.h"
5
6int pci_root_num;
7struct pci_root_info pci_root_info[PCI_ROOT_NR];
8int found_all_numa_early;
9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{
12 int i;
13 int j;
14 struct pci_root_info *info;
15
16 /* don't go for it if _CRS is used already */
17 if (b->resource[0] != &ioport_resource ||
18 b->resource[1] != &iomem_resource)
19 return;
20
21 if (!pci_root_num)
22 return;
23
24 /* for amd, if only one root bus, don't need to do anything */
25 if (pci_root_num < 2 && found_all_numa_early)
26 return;
27
28 for (i = 0; i < pci_root_num; i++) {
29 if (pci_root_info[i].bus_min == b->number)
30 break;
31 }
32
33 if (i == pci_root_num)
34 return;
35
36 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
37 b->number);
38
39 info = &pci_root_info[i];
40 for (j = 0; j < info->res_num; j++) {
41 struct resource *res;
42 struct resource *root;
43
44 res = &info->res[j];
45 b->resource[j] = res;
46 if (res->flags & IORESOURCE_IO)
47 root = &ioport_resource;
48 else
49 root = &iomem_resource;
50 insert_resource(root, res);
51 }
52}
53
54void __init update_res(struct pci_root_info *info, size_t start,
55 size_t end, unsigned long flags, int merge)
56{
57 int i;
58 struct resource *res;
59
60 if (start > end)
61 return;
62
63 if (!merge)
64 goto addit;
65
66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) {
68 size_t final_start, final_end;
69 size_t common_start, common_end;
70
71 res = &info->res[i];
72 if (res->flags != flags)
73 continue;
74
75 common_start = max((size_t)res->start, start);
76 common_end = min((size_t)res->end, end);
77 if (common_start > common_end + 1)
78 continue;
79
80 final_start = min((size_t)res->start, start);
81 final_end = max((size_t)res->end, end);
82
83 res->start = final_start;
84 res->end = final_end;
85 return;
86 }
87
88addit:
89
90 /* need to add that */
91 if (info->res_num >= RES_NUM)
92 return;
93
94 res = &info->res[info->res_num];
95 res->name = info->name;
96 res->flags = flags;
97 res->start = start;
98 res->end = end;
99 res->child = NULL;
100 info->res_num++;
101}
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
new file mode 100644
index 000000000000..adbc23fe82ac
--- /dev/null
+++ b/arch/x86/pci/bus_numa.h
@@ -0,0 +1,27 @@
1#ifdef CONFIG_X86_64
2
3/*
4 * sub bus (transparent) will use entres from 3 to store extra from
5 * root, so need to make sure we have enough slot there, Should we
6 * increase PCI_BUS_NUM_RESOURCES?
7 */
8#define RES_NUM 16
9struct pci_root_info {
10 char name[12];
11 unsigned int res_num;
12 struct resource res[RES_NUM];
13 int bus_min;
14 int bus_max;
15 int node;
16 int link;
17};
18
19/* 4 at this time, it may become to 32 */
20#define PCI_ROOT_NR 4
21extern int pci_root_num;
22extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
23extern int found_all_numa_early;
24
25extern void update_res(struct pci_root_info *info, size_t start,
26 size_t end, unsigned long flags, int merge);
27#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf26143..d2552c68e94d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -410,8 +410,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
410 return bus; 410 return bus;
411} 411}
412 412
413extern u8 pci_cache_line_size;
414
415int __init pcibios_init(void) 413int __init pcibios_init(void)
416{ 414{
417 struct cpuinfo_x86 *c = &boot_cpu_data; 415 struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -422,15 +420,19 @@ int __init pcibios_init(void)
422 } 420 }
423 421
424 /* 422 /*
425 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 423 * Set PCI cacheline size to that of the CPU if the CPU has reported it.
426 * and P4. It's also good for 386/486s (which actually have 16) 424 * (For older CPUs that don't support cpuid, we se it to 32 bytes
425 * It's also good for 386/486s (which actually have 16)
427 * as quite a few PCI devices do not support smaller values. 426 * as quite a few PCI devices do not support smaller values.
428 */ 427 */
429 pci_cache_line_size = 32 >> 2; 428 if (c->x86_clflush_size > 0) {
430 if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) 429 pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
431 pci_cache_line_size = 64 >> 2; /* K7 & K8 */ 430 printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
432 else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) 431 pci_dfl_cache_line_size << 2);
433 pci_cache_line_size = 128 >> 2; /* P4 */ 432 } else {
433 pci_dfl_cache_line_size = 32 >> 2;
434 printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
435 }
434 436
435 pcibios_resource_survey(); 437 pcibios_resource_survey();
436 438
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index aaf26ae58cd5..d1067d539bee 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
12 u32 v; 12 u32 v;
13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
14 v = inl(0xcfc); 14 v = inl(0xcfc);
15 if (v != 0xffffffff)
16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
17 return v; 15 return v;
18} 16}
19 17
@@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
22 u8 v; 20 u8 v;
23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 21 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
24 v = inb(0xcfc + (offset&3)); 22 v = inb(0xcfc + (offset&3));
25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
26 return v; 23 return v;
27} 24}
28 25
@@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
31 u16 v; 28 u16 v;
32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 29 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
33 v = inw(0xcfc + (offset&2)); 30 v = inw(0xcfc + (offset&2));
34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
35 return v; 31 return v;
36} 32}
37 33
38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 34void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
39 u32 val) 35 u32 val)
40{ 36{
41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 37 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
43 outl(val, 0xcfc); 38 outl(val, 0xcfc);
44} 39}
45 40
46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 41void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
47{ 42{
48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 43 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
50 outb(val, 0xcfc + (offset&3)); 44 outb(val, 0xcfc + (offset&3));
51} 45}
52 46
53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 47void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
54{ 48{
55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
57 outw(val, 0xcfc + (offset&2)); 50 outw(val, 0xcfc + (offset&2));
58} 51}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b22d13b0c71d..5dc9e8c63fcd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -129,7 +129,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
129 continue; 129 continue;
130 if (!r->start || 130 if (!r->start ||
131 pci_claim_resource(dev, idx) < 0) { 131 pci_claim_resource(dev, idx) < 0) {
132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 132 dev_info(&dev->dev,
133 "can't reserve window %pR\n",
134 r);
133 /* 135 /*
134 * Something is wrong with the region. 136 * Something is wrong with the region.
135 * Invalidate the resource to prevent 137 * Invalidate the resource to prevent
@@ -144,16 +146,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
144 } 146 }
145} 147}
146 148
149struct pci_check_idx_range {
150 int start;
151 int end;
152};
153
147static void __init pcibios_allocate_resources(int pass) 154static void __init pcibios_allocate_resources(int pass)
148{ 155{
149 struct pci_dev *dev = NULL; 156 struct pci_dev *dev = NULL;
150 int idx, disabled; 157 int idx, disabled, i;
151 u16 command; 158 u16 command;
152 struct resource *r; 159 struct resource *r;
153 160
161 struct pci_check_idx_range idx_range[] = {
162 { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END },
163#ifdef CONFIG_PCI_IOV
164 { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END },
165#endif
166 };
167
154 for_each_pci_dev(dev) { 168 for_each_pci_dev(dev) {
155 pci_read_config_word(dev, PCI_COMMAND, &command); 169 pci_read_config_word(dev, PCI_COMMAND, &command);
156 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { 170 for (i = 0; i < ARRAY_SIZE(idx_range); i++)
171 for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
157 r = &dev->resource[idx]; 172 r = &dev->resource[idx];
158 if (r->parent) /* Already allocated */ 173 if (r->parent) /* Already allocated */
159 continue; 174 continue;
@@ -164,12 +179,12 @@ static void __init pcibios_allocate_resources(int pass)
164 else 179 else
165 disabled = !(command & PCI_COMMAND_MEMORY); 180 disabled = !(command & PCI_COMMAND_MEMORY);
166 if (pass == disabled) { 181 if (pass == disabled) {
167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", 182 dev_dbg(&dev->dev,
168 (unsigned long long) r->start, 183 "BAR %d: reserving %pr (d=%d, p=%d)\n",
169 (unsigned long long) r->end, 184 idx, r, disabled, pass);
170 r->flags, disabled, pass);
171 if (pci_claim_resource(dev, idx) < 0) { 185 if (pci_claim_resource(dev, idx) < 0) {
172 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 186 dev_info(&dev->dev,
187 "can't reserve %pR\n", r);
173 /* We'll assign a new address later */ 188 /* We'll assign a new address later */
174 r->end -= r->start; 189 r->end -= r->start;
175 r->start = 0; 190 r->start = 0;
@@ -182,7 +197,7 @@ static void __init pcibios_allocate_resources(int pass)
182 /* Turn the ROM off, leave the resource region, 197 /* Turn the ROM off, leave the resource region,
183 * but keep it unregistered. */ 198 * but keep it unregistered. */
184 u32 reg; 199 u32 reg;
185 dev_dbg(&dev->dev, "disabling ROM\n"); 200 dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
186 r->flags &= ~IORESOURCE_ROM_ENABLE; 201 r->flags &= ~IORESOURCE_ROM_ENABLE;
187 pci_read_config_dword(dev, 202 pci_read_config_dword(dev,
188 dev->rom_base_reg, &reg); 203 dev->rom_base_reg, &reg);
@@ -282,6 +297,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
282 return -EINVAL; 297 return -EINVAL;
283 298
284 prot = pgprot_val(vma->vm_page_prot); 299 prot = pgprot_val(vma->vm_page_prot);
300
301 /*
302 * Return error if pat is not enabled and write_combine is requested.
303 * Caller can followup with UC MINUS request and add a WC mtrr if there
304 * is a free mtrr slot.
305 */
306 if (!pat_enabled && write_combine)
307 return -EINVAL;
308
285 if (pat_enabled && write_combine) 309 if (pat_enabled && write_combine)
286 prot |= _PAGE_CACHE_WC; 310 prot |= _PAGE_CACHE_WC;
287 else if (pat_enabled || boot_cpu_data.x86 > 3) 311 else if (pat_enabled || boot_cpu_data.x86 > 3)
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
new file mode 100644
index 000000000000..b7a55dc55d13
--- /dev/null
+++ b/arch/x86/pci/intel_bus.c
@@ -0,0 +1,90 @@
1/*
2 * to read io range from IOH pci conf, need to do it after mmconfig is there
3 */
4
5#include <linux/delay.h>
6#include <linux/dmi.h>
7#include <linux/pci.h>
8#include <linux/init.h>
9#include <asm/pci_x86.h>
10
11#include "bus_numa.h"
12
13static inline void print_ioh_resources(struct pci_root_info *info)
14{
15 int res_num;
16 int busnum;
17 int i;
18
19 printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n",
20 info->bus_min, info->bus_max);
21 res_num = info->res_num;
22 busnum = info->bus_min;
23 for (i = 0; i < res_num; i++) {
24 struct resource *res;
25
26 res = &info->res[i];
27 printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n",
28 busnum, i,
29 (res->flags & IORESOURCE_IO) ? "io port" :
30 "mmio",
31 res->start, res->end);
32 }
33}
34
35#define IOH_LIO 0x108
36#define IOH_LMMIOL 0x10c
37#define IOH_LMMIOH 0x110
38#define IOH_LMMIOH_BASEU 0x114
39#define IOH_LMMIOH_LIMITU 0x118
40#define IOH_LCFGBUS 0x11c
41
42static void __devinit pci_root_bus_res(struct pci_dev *dev)
43{
44 u16 word;
45 u32 dword;
46 struct pci_root_info *info;
47 u16 io_base, io_end;
48 u32 mmiol_base, mmiol_end;
49 u64 mmioh_base, mmioh_end;
50 int bus_base, bus_end;
51
52 if (pci_root_num >= PCI_ROOT_NR) {
53 printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n");
54 return;
55 }
56
57 info = &pci_root_info[pci_root_num];
58 pci_root_num++;
59
60 pci_read_config_word(dev, IOH_LCFGBUS, &word);
61 bus_base = (word & 0xff);
62 bus_end = (word & 0xff00) >> 8;
63 sprintf(info->name, "PCI Bus #%02x", bus_base);
64 info->bus_min = bus_base;
65 info->bus_max = bus_end;
66
67 pci_read_config_word(dev, IOH_LIO, &word);
68 io_base = (word & 0xf0) << (12 - 4);
69 io_end = (word & 0xf000) | 0xfff;
70 update_res(info, io_base, io_end, IORESOURCE_IO, 0);
71
72 pci_read_config_dword(dev, IOH_LMMIOL, &dword);
73 mmiol_base = (dword & 0xff00) << (24 - 8);
74 mmiol_end = (dword & 0xff000000) | 0xffffff;
75 update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0);
76
77 pci_read_config_dword(dev, IOH_LMMIOH, &dword);
78 mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10);
79 mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff);
80 pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword);
81 mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
82 pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
83 mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
84 update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
85
86 print_ioh_resources(info);
87}
88
89/* intel IOH */
90DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 602c172d3bd5..b19d1e54201e 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,48 +15,98 @@
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h> 16#include <linux/sfi_acpi.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/sort.h> 18#include <linux/dmi.h>
19#include <asm/e820.h> 19#include <asm/e820.h>
20#include <asm/pci_x86.h> 20#include <asm/pci_x86.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22 22
23#define PREFIX "PCI: " 23#define PREFIX "PCI: "
24 24
25/* aperture is up to 256MB but BIOS may reserve less */
26#define MMCONFIG_APER_MIN (2 * 1024*1024)
27#define MMCONFIG_APER_MAX (256 * 1024*1024)
28
29/* Indicate if the mmcfg resources have been placed into the resource table. */ 25/* Indicate if the mmcfg resources have been placed into the resource table. */
30static int __initdata pci_mmcfg_resources_inserted; 26static int __initdata pci_mmcfg_resources_inserted;
31 27
32static __init int extend_mmcfg(int num) 28LIST_HEAD(pci_mmcfg_list);
29
30static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
33{ 31{
34 struct acpi_mcfg_allocation *new; 32 if (cfg->res.parent)
35 int new_num = pci_mmcfg_config_num + num; 33 release_resource(&cfg->res);
34 list_del(&cfg->list);
35 kfree(cfg);
36}
36 37
37 new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); 38static __init void free_all_mmcfg(void)
38 if (!new) 39{
39 return -1; 40 struct pci_mmcfg_region *cfg, *tmp;
40 41
41 if (pci_mmcfg_config) { 42 pci_mmcfg_arch_free();
42 memcpy(new, pci_mmcfg_config, 43 list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list)
43 sizeof(pci_mmcfg_config[0]) * new_num); 44 pci_mmconfig_remove(cfg);
44 kfree(pci_mmcfg_config); 45}
46
47static __init void list_add_sorted(struct pci_mmcfg_region *new)
48{
49 struct pci_mmcfg_region *cfg;
50
51 /* keep list sorted by segment and starting bus number */
52 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
53 if (cfg->segment > new->segment ||
54 (cfg->segment == new->segment &&
55 cfg->start_bus >= new->start_bus)) {
56 list_add_tail(&new->list, &cfg->list);
57 return;
58 }
45 } 59 }
46 pci_mmcfg_config = new; 60 list_add_tail(&new->list, &pci_mmcfg_list);
61}
47 62
48 return 0; 63static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
64 int end, u64 addr)
65{
66 struct pci_mmcfg_region *new;
67 int num_buses;
68 struct resource *res;
69
70 if (addr == 0)
71 return NULL;
72
73 new = kzalloc(sizeof(*new), GFP_KERNEL);
74 if (!new)
75 return NULL;
76
77 new->address = addr;
78 new->segment = segment;
79 new->start_bus = start;
80 new->end_bus = end;
81
82 list_add_sorted(new);
83
84 num_buses = end - start + 1;
85 res = &new->res;
86 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
87 res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
88 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
89 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
90 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
91 res->name = new->name;
92
93 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at "
94 "%pR (base %#lx)\n", segment, start, end, &new->res,
95 (unsigned long) addr);
96
97 return new;
49} 98}
50 99
51static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) 100struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
52{ 101{
53 int i = pci_mmcfg_config_num; 102 struct pci_mmcfg_region *cfg;
54 103
55 pci_mmcfg_config_num++; 104 list_for_each_entry(cfg, &pci_mmcfg_list, list)
56 pci_mmcfg_config[i].address = addr; 105 if (cfg->segment == segment &&
57 pci_mmcfg_config[i].pci_segment = segment; 106 cfg->start_bus <= bus && bus <= cfg->end_bus)
58 pci_mmcfg_config[i].start_bus_number = start; 107 return cfg;
59 pci_mmcfg_config[i].end_bus_number = end; 108
109 return NULL;
60} 110}
61 111
62static const char __init *pci_mmcfg_e7520(void) 112static const char __init *pci_mmcfg_e7520(void)
@@ -68,11 +118,9 @@ static const char __init *pci_mmcfg_e7520(void)
68 if (win == 0x0000 || win == 0xf000) 118 if (win == 0x0000 || win == 0xf000)
69 return NULL; 119 return NULL;
70 120
71 if (extend_mmcfg(1) == -1) 121 if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL)
72 return NULL; 122 return NULL;
73 123
74 fill_one_mmcfg(win << 16, 0, 0, 255);
75
76 return "Intel Corporation E7520 Memory Controller Hub"; 124 return "Intel Corporation E7520 Memory Controller Hub";
77} 125}
78 126
@@ -114,11 +162,9 @@ static const char __init *pci_mmcfg_intel_945(void)
114 if ((pciexbar & mask) >= 0xf0000000U) 162 if ((pciexbar & mask) >= 0xf0000000U)
115 return NULL; 163 return NULL;
116 164
117 if (extend_mmcfg(1) == -1) 165 if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL)
118 return NULL; 166 return NULL;
119 167
120 fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
121
122 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; 168 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
123} 169}
124 170
@@ -127,7 +173,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
127 u32 low, high, address; 173 u32 low, high, address;
128 u64 base, msr; 174 u64 base, msr;
129 int i; 175 int i;
130 unsigned segnbits = 0, busnbits; 176 unsigned segnbits = 0, busnbits, end_bus;
131 177
132 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) 178 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
133 return NULL; 179 return NULL;
@@ -161,11 +207,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
161 busnbits = 8; 207 busnbits = 8;
162 } 208 }
163 209
164 if (extend_mmcfg(1 << segnbits) == -1) 210 end_bus = (1 << busnbits) - 1;
165 return NULL;
166
167 for (i = 0; i < (1 << segnbits); i++) 211 for (i = 0; i < (1 << segnbits); i++)
168 fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); 212 if (pci_mmconfig_add(i, 0, end_bus,
213 base + (1<<28) * i) == NULL) {
214 free_all_mmcfg();
215 return NULL;
216 }
169 217
170 return "AMD Family 10h NB"; 218 return "AMD Family 10h NB";
171} 219}
@@ -190,7 +238,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
190 /* 238 /*
191 * do check if amd fam10h already took over 239 * do check if amd fam10h already took over
192 */ 240 */
193 if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) 241 if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked)
194 return NULL; 242 return NULL;
195 243
196 mcp55_checked = true; 244 mcp55_checked = true;
@@ -213,16 +261,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
213 if (!(extcfg & extcfg_enable_mask)) 261 if (!(extcfg & extcfg_enable_mask))
214 continue; 262 continue;
215 263
216 if (extend_mmcfg(1) == -1)
217 continue;
218
219 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; 264 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
220 base = extcfg & extcfg_base_mask[size_index]; 265 base = extcfg & extcfg_base_mask[size_index];
221 /* base could > 4G */ 266 /* base could > 4G */
222 base <<= extcfg_base_lshift; 267 base <<= extcfg_base_lshift;
223 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; 268 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
224 end = start + extcfg_sizebus[size_index] - 1; 269 end = start + extcfg_sizebus[size_index] - 1;
225 fill_one_mmcfg(base, 0, start, end); 270 if (pci_mmconfig_add(0, start, end, base) == NULL)
271 continue;
226 mcp55_mmconf_found++; 272 mcp55_mmconf_found++;
227 } 273 }
228 274
@@ -253,45 +299,27 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
253 0x0369, pci_mmcfg_nvidia_mcp55 }, 299 0x0369, pci_mmcfg_nvidia_mcp55 },
254}; 300};
255 301
256static int __init cmp_mmcfg(const void *x1, const void *x2)
257{
258 const typeof(pci_mmcfg_config[0]) *m1 = x1;
259 const typeof(pci_mmcfg_config[0]) *m2 = x2;
260 int start1, start2;
261
262 start1 = m1->start_bus_number;
263 start2 = m2->start_bus_number;
264
265 return start1 - start2;
266}
267
268static void __init pci_mmcfg_check_end_bus_number(void) 302static void __init pci_mmcfg_check_end_bus_number(void)
269{ 303{
270 int i; 304 struct pci_mmcfg_region *cfg, *cfgx;
271 typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
272
273 /* sort them at first */
274 sort(pci_mmcfg_config, pci_mmcfg_config_num,
275 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
276 305
277 /* last one*/ 306 /* last one*/
278 if (pci_mmcfg_config_num > 0) { 307 cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list);
279 i = pci_mmcfg_config_num - 1; 308 if (cfg)
280 cfg = &pci_mmcfg_config[i]; 309 if (cfg->end_bus < cfg->start_bus)
281 if (cfg->end_bus_number < cfg->start_bus_number) 310 cfg->end_bus = 255;
282 cfg->end_bus_number = 255;
283 }
284 311
285 /* don't overlap please */ 312 if (list_is_singular(&pci_mmcfg_list))
286 for (i = 0; i < pci_mmcfg_config_num - 1; i++) { 313 return;
287 cfg = &pci_mmcfg_config[i];
288 cfgx = &pci_mmcfg_config[i+1];
289 314
290 if (cfg->end_bus_number < cfg->start_bus_number) 315 /* don't overlap please */
291 cfg->end_bus_number = 255; 316 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
317 if (cfg->end_bus < cfg->start_bus)
318 cfg->end_bus = 255;
292 319
293 if (cfg->end_bus_number >= cfgx->start_bus_number) 320 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
294 cfg->end_bus_number = cfgx->start_bus_number - 1; 321 if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus)
322 cfg->end_bus = cfgx->start_bus - 1;
295 } 323 }
296} 324}
297 325
@@ -306,8 +334,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
306 if (!raw_pci_ops) 334 if (!raw_pci_ops)
307 return 0; 335 return 0;
308 336
309 pci_mmcfg_config_num = 0; 337 free_all_mmcfg();
310 pci_mmcfg_config = NULL;
311 338
312 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { 339 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
313 bus = pci_mmcfg_probes[i].bus; 340 bus = pci_mmcfg_probes[i].bus;
@@ -322,45 +349,22 @@ static int __init pci_mmcfg_check_hostbridge(void)
322 name = pci_mmcfg_probes[i].probe(); 349 name = pci_mmcfg_probes[i].probe();
323 350
324 if (name) 351 if (name)
325 printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", 352 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n",
326 name); 353 name);
327 } 354 }
328 355
329 /* some end_bus_number is crazy, fix it */ 356 /* some end_bus_number is crazy, fix it */
330 pci_mmcfg_check_end_bus_number(); 357 pci_mmcfg_check_end_bus_number();
331 358
332 return pci_mmcfg_config_num != 0; 359 return !list_empty(&pci_mmcfg_list);
333} 360}
334 361
335static void __init pci_mmcfg_insert_resources(void) 362static void __init pci_mmcfg_insert_resources(void)
336{ 363{
337#define PCI_MMCFG_RESOURCE_NAME_LEN 24 364 struct pci_mmcfg_region *cfg;
338 int i;
339 struct resource *res;
340 char *names;
341 unsigned num_buses;
342
343 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
344 pci_mmcfg_config_num, GFP_KERNEL);
345 if (!res) {
346 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
347 return;
348 }
349 365
350 names = (void *)&res[pci_mmcfg_config_num]; 366 list_for_each_entry(cfg, &pci_mmcfg_list, list)
351 for (i = 0; i < pci_mmcfg_config_num; i++, res++) { 367 insert_resource(&iomem_resource, &cfg->res);
352 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
353 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
354 res->name = names;
355 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
356 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
357 cfg->start_bus_number, cfg->end_bus_number);
358 res->start = cfg->address + (cfg->start_bus_number << 20);
359 res->end = res->start + (num_buses << 20) - 1;
360 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
361 insert_resource(&iomem_resource, res);
362 names += PCI_MMCFG_RESOURCE_NAME_LEN;
363 }
364 368
365 /* Mark that the resources have been inserted. */ 369 /* Mark that the resources have been inserted. */
366 pci_mmcfg_resources_inserted = 1; 370 pci_mmcfg_resources_inserted = 1;
@@ -437,11 +441,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
437typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 441typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
438 442
439static int __init is_mmconf_reserved(check_reserved_t is_reserved, 443static int __init is_mmconf_reserved(check_reserved_t is_reserved,
440 u64 addr, u64 size, int i, 444 struct pci_mmcfg_region *cfg, int with_e820)
441 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
442{ 445{
446 u64 addr = cfg->res.start;
447 u64 size = resource_size(&cfg->res);
443 u64 old_size = size; 448 u64 old_size = size;
444 int valid = 0; 449 int valid = 0, num_buses;
445 450
446 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 451 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
447 size >>= 1; 452 size >>= 1;
@@ -450,19 +455,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
450 } 455 }
451 456
452 if (size >= (16UL<<20) || size == old_size) { 457 if (size >= (16UL<<20) || size == old_size) {
453 printk(KERN_NOTICE 458 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n",
454 "PCI: MCFG area at %Lx reserved in %s\n", 459 &cfg->res,
455 addr, with_e820?"E820":"ACPI motherboard resources"); 460 with_e820 ? "E820" : "ACPI motherboard resources");
456 valid = 1; 461 valid = 1;
457 462
458 if (old_size != size) { 463 if (old_size != size) {
459 /* update end_bus_number */ 464 /* update end_bus */
460 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); 465 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
461 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " 466 num_buses = cfg->end_bus - cfg->start_bus + 1;
462 "segment %hu buses %u - %u\n", 467 cfg->res.end = cfg->res.start +
463 i, (unsigned long)cfg->address, cfg->pci_segment, 468 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
464 (unsigned int)cfg->start_bus_number, 469 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
465 (unsigned int)cfg->end_bus_number); 470 "PCI MMCONFIG %04x [bus %02x-%02x]",
471 cfg->segment, cfg->start_bus, cfg->end_bus);
472 printk(KERN_INFO PREFIX
473 "MMCONFIG for %04x [bus%02x-%02x] "
474 "at %pR (base %#lx) (size reduced!)\n",
475 cfg->segment, cfg->start_bus, cfg->end_bus,
476 &cfg->res, (unsigned long) cfg->address);
466 } 477 }
467 } 478 }
468 479
@@ -471,45 +482,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
471 482
472static void __init pci_mmcfg_reject_broken(int early) 483static void __init pci_mmcfg_reject_broken(int early)
473{ 484{
474 typeof(pci_mmcfg_config[0]) *cfg; 485 struct pci_mmcfg_region *cfg;
475 int i;
476 486
477 if ((pci_mmcfg_config_num == 0) || 487 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
478 (pci_mmcfg_config == NULL) ||
479 (pci_mmcfg_config[0].address == 0))
480 return;
481
482 for (i = 0; i < pci_mmcfg_config_num; i++) {
483 int valid = 0; 488 int valid = 0;
484 u64 addr, size;
485
486 cfg = &pci_mmcfg_config[i];
487 addr = cfg->start_bus_number;
488 addr <<= 20;
489 addr += cfg->address;
490 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
491 size <<= 20;
492 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
493 "segment %hu buses %u - %u\n",
494 i, (unsigned long)cfg->address, cfg->pci_segment,
495 (unsigned int)cfg->start_bus_number,
496 (unsigned int)cfg->end_bus_number);
497 489
498 if (!early && !acpi_disabled) 490 if (!early && !acpi_disabled)
499 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); 491 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
500 492
501 if (valid) 493 if (valid)
502 continue; 494 continue;
503 495
504 if (!early) 496 if (!early)
505 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 497 printk(KERN_ERR FW_BUG PREFIX
506 " reserved in ACPI motherboard resources\n", 498 "MMCONFIG at %pR not reserved in "
507 cfg->address); 499 "ACPI motherboard resources\n", &cfg->res);
508 500
509 /* Don't try to do this check unless configuration 501 /* Don't try to do this check unless configuration
510 type 1 is available. how about type 2 ?*/ 502 type 1 is available. how about type 2 ?*/
511 if (raw_pci_ops) 503 if (raw_pci_ops)
512 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); 504 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
513 505
514 if (!valid) 506 if (!valid)
515 goto reject; 507 goto reject;
@@ -518,34 +510,41 @@ static void __init pci_mmcfg_reject_broken(int early)
518 return; 510 return;
519 511
520reject: 512reject:
521 printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); 513 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
522 pci_mmcfg_arch_free(); 514 free_all_mmcfg();
523 kfree(pci_mmcfg_config);
524 pci_mmcfg_config = NULL;
525 pci_mmcfg_config_num = 0;
526} 515}
527 516
528static int __initdata known_bridge; 517static int __initdata known_bridge;
529 518
530static int acpi_mcfg_64bit_base_addr __initdata = FALSE; 519static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
520 struct acpi_mcfg_allocation *cfg)
521{
522 int year;
531 523
532/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 524 if (cfg->address < 0xFFFFFFFF)
533struct acpi_mcfg_allocation *pci_mmcfg_config; 525 return 0;
534int pci_mmcfg_config_num;
535 526
536static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
537{
538 if (!strcmp(mcfg->header.oem_id, "SGI")) 527 if (!strcmp(mcfg->header.oem_id, "SGI"))
539 acpi_mcfg_64bit_base_addr = TRUE; 528 return 0;
540 529
541 return 0; 530 if (mcfg->header.revision >= 1) {
531 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
532 year >= 2010)
533 return 0;
534 }
535
536 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
537 "is above 4GB, ignored\n", cfg->pci_segment,
538 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
539 return -EINVAL;
542} 540}
543 541
544static int __init pci_parse_mcfg(struct acpi_table_header *header) 542static int __init pci_parse_mcfg(struct acpi_table_header *header)
545{ 543{
546 struct acpi_table_mcfg *mcfg; 544 struct acpi_table_mcfg *mcfg;
545 struct acpi_mcfg_allocation *cfg_table, *cfg;
547 unsigned long i; 546 unsigned long i;
548 int config_size; 547 int entries;
549 548
550 if (!header) 549 if (!header)
551 return -EINVAL; 550 return -EINVAL;
@@ -553,38 +552,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
553 mcfg = (struct acpi_table_mcfg *)header; 552 mcfg = (struct acpi_table_mcfg *)header;
554 553
555 /* how many config structures do we have */ 554 /* how many config structures do we have */
556 pci_mmcfg_config_num = 0; 555 free_all_mmcfg();
556 entries = 0;
557 i = header->length - sizeof(struct acpi_table_mcfg); 557 i = header->length - sizeof(struct acpi_table_mcfg);
558 while (i >= sizeof(struct acpi_mcfg_allocation)) { 558 while (i >= sizeof(struct acpi_mcfg_allocation)) {
559 ++pci_mmcfg_config_num; 559 entries++;
560 i -= sizeof(struct acpi_mcfg_allocation); 560 i -= sizeof(struct acpi_mcfg_allocation);
561 }; 561 };
562 if (pci_mmcfg_config_num == 0) { 562 if (entries == 0) {
563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
564 return -ENODEV; 564 return -ENODEV;
565 } 565 }
566 566
567 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); 567 cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
568 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); 568 for (i = 0; i < entries; i++) {
569 if (!pci_mmcfg_config) { 569 cfg = &cfg_table[i];
570 printk(KERN_WARNING PREFIX 570 if (acpi_mcfg_check_entry(mcfg, cfg)) {
571 "No memory for MCFG config tables\n"); 571 free_all_mmcfg();
572 return -ENOMEM;
573 }
574
575 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
576
577 acpi_mcfg_oem_check(mcfg);
578
579 for (i = 0; i < pci_mmcfg_config_num; ++i) {
580 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
581 !acpi_mcfg_64bit_base_addr) {
582 printk(KERN_ERR PREFIX
583 "MMCONFIG not in low 4GB of memory\n");
584 kfree(pci_mmcfg_config);
585 pci_mmcfg_config_num = 0;
586 return -ENODEV; 572 return -ENODEV;
587 } 573 }
574
575 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
576 cfg->end_bus_number, cfg->address) == NULL) {
577 printk(KERN_WARNING PREFIX
578 "no memory for MCFG entries\n");
579 free_all_mmcfg();
580 return -ENOMEM;
581 }
588 } 582 }
589 583
590 return 0; 584 return 0;
@@ -614,9 +608,7 @@ static void __init __pci_mmcfg_init(int early)
614 608
615 pci_mmcfg_reject_broken(early); 609 pci_mmcfg_reject_broken(early);
616 610
617 if ((pci_mmcfg_config_num == 0) || 611 if (list_empty(&pci_mmcfg_list))
618 (pci_mmcfg_config == NULL) ||
619 (pci_mmcfg_config[0].address == 0))
620 return; 612 return;
621 613
622 if (pci_mmcfg_arch_init()) 614 if (pci_mmcfg_arch_init())
@@ -648,9 +640,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
648 */ 640 */
649 if ((pci_mmcfg_resources_inserted == 1) || 641 if ((pci_mmcfg_resources_inserted == 1) ||
650 (pci_probe & PCI_PROBE_MMCONF) == 0 || 642 (pci_probe & PCI_PROBE_MMCONF) == 0 ||
651 (pci_mmcfg_config_num == 0) || 643 list_empty(&pci_mmcfg_list))
652 (pci_mmcfg_config == NULL) ||
653 (pci_mmcfg_config[0].address == 0))
654 return 1; 644 return 1;
655 645
656 /* 646 /*
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f10a7e94a84c..90d5fd476ed4 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu;
27 */ 27 */
28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) 28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
29{ 29{
30 struct acpi_mcfg_allocation *cfg; 30 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
31 int cfg_num;
32
33 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
34 cfg = &pci_mmcfg_config[cfg_num];
35 if (cfg->pci_segment == seg &&
36 (cfg->start_bus_number <= bus) &&
37 (cfg->end_bus_number >= bus))
38 return cfg->address;
39 }
40 31
41 /* Fall back to type 0 */ 32 if (cfg)
33 return cfg->address;
42 return 0; 34 return 0;
43} 35}
44 36
@@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
47 */ 39 */
48static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) 40static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
49{ 41{
50 u32 dev_base = base | (bus << 20) | (devfn << 12); 42 u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12);
51 int cpu = smp_processor_id(); 43 int cpu = smp_processor_id();
52 if (dev_base != mmcfg_last_accessed_device || 44 if (dev_base != mmcfg_last_accessed_device ||
53 cpu != mmcfg_last_accessed_cpu) { 45 cpu != mmcfg_last_accessed_cpu) {
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 94349f8b2f96..e783841bd1d7 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -12,38 +12,15 @@
12#include <asm/e820.h> 12#include <asm/e820.h>
13#include <asm/pci_x86.h> 13#include <asm/pci_x86.h>
14 14
15/* Static virtual mapping of the MMCONFIG aperture */ 15#define PREFIX "PCI: "
16struct mmcfg_virt {
17 struct acpi_mcfg_allocation *cfg;
18 char __iomem *virt;
19};
20static struct mmcfg_virt *pci_mmcfg_virt;
21
22static char __iomem *get_virt(unsigned int seg, unsigned bus)
23{
24 struct acpi_mcfg_allocation *cfg;
25 int cfg_num;
26
27 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
28 cfg = pci_mmcfg_virt[cfg_num].cfg;
29 if (cfg->pci_segment == seg &&
30 (cfg->start_bus_number <= bus) &&
31 (cfg->end_bus_number >= bus))
32 return pci_mmcfg_virt[cfg_num].virt;
33 }
34
35 /* Fall back to type 0 */
36 return NULL;
37}
38 16
39static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 17static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
40{ 18{
41 char __iomem *addr; 19 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
42 20
43 addr = get_virt(seg, bus); 21 if (cfg && cfg->virt)
44 if (!addr) 22 return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
45 return NULL; 23 return NULL;
46 return addr + ((bus << 20) | (devfn << 12));
47} 24}
48 25
49static int pci_mmcfg_read(unsigned int seg, unsigned int bus, 26static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
@@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = {
109 .write = pci_mmcfg_write, 86 .write = pci_mmcfg_write,
110}; 87};
111 88
112static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) 89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg)
113{ 90{
114 void __iomem *addr; 91 void __iomem *addr;
115 u64 start, size; 92 u64 start, size;
93 int num_buses;
116 94
117 start = cfg->start_bus_number; 95 start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
118 start <<= 20; 96 num_buses = cfg->end_bus - cfg->start_bus + 1;
119 start += cfg->address; 97 size = PCI_MMCFG_BUS_OFFSET(num_buses);
120 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
121 size <<= 20;
122 addr = ioremap_nocache(start, size); 98 addr = ioremap_nocache(start, size);
123 if (addr) { 99 if (addr)
124 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", 100 addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
125 start, start + size - 1);
126 addr -= cfg->start_bus_number << 20;
127 }
128 return addr; 101 return addr;
129} 102}
130 103
131int __init pci_mmcfg_arch_init(void) 104int __init pci_mmcfg_arch_init(void)
132{ 105{
133 int i; 106 struct pci_mmcfg_region *cfg;
134 pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) *
135 pci_mmcfg_config_num, GFP_KERNEL);
136 if (pci_mmcfg_virt == NULL) {
137 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
138 return 0;
139 }
140 107
141 for (i = 0; i < pci_mmcfg_config_num; ++i) { 108 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
142 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; 109 cfg->virt = mcfg_ioremap(cfg);
143 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); 110 if (!cfg->virt) {
144 if (!pci_mmcfg_virt[i].virt) { 111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
145 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " 112 &cfg->res);
146 "segment %d\n",
147 pci_mmcfg_config[i].pci_segment);
148 pci_mmcfg_arch_free(); 113 pci_mmcfg_arch_free();
149 return 0; 114 return 0;
150 } 115 }
@@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void)
155 120
156void __init pci_mmcfg_arch_free(void) 121void __init pci_mmcfg_arch_free(void)
157{ 122{
158 int i; 123 struct pci_mmcfg_region *cfg;
159
160 if (pci_mmcfg_virt == NULL)
161 return;
162 124
163 for (i = 0; i < pci_mmcfg_config_num; ++i) { 125 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
164 if (pci_mmcfg_virt[i].virt) { 126 if (cfg->virt) {
165 iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); 127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
166 pci_mmcfg_virt[i].virt = NULL; 128 cfg->virt = NULL;
167 pci_mmcfg_virt[i].cfg = NULL;
168 } 129 }
169 } 130 }
170
171 kfree(pci_mmcfg_virt);
172 pci_mmcfg_virt = NULL;
173} 131}
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index d8214dc03fa7..bee8d6ac2691 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -113,7 +113,7 @@ int main(int argc, char **argv)
113 char line[BUFSIZE], sym[BUFSIZE] = "<unknown>"; 113 char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
114 unsigned char insn_buf[16]; 114 unsigned char insn_buf[16];
115 struct insn insn; 115 struct insn insn;
116 int insns = 0, c; 116 int insns = 0;
117 int warnings = 0; 117 int warnings = 0;
118 118
119 parse_args(argc, argv); 119 parse_args(argc, argv);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
393 393
394static ctl_table abi_root_table2[] = { 394static ctl_table abi_root_table2[] = {
395 { 395 {
396 .ctl_name = CTL_ABI,
397 .procname = "abi", 396 .procname = "abi",
398 .mode = 0555, 397 .mode = 0555,
399 .child = abi_table2 398 .child = abi_table2
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index dfbf70e65860..2b26dd5930c6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -27,7 +27,9 @@
27#include <linux/page-flags.h> 27#include <linux/page-flags.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/pci.h>
30 31
32#include <xen/xen.h>
31#include <xen/interface/xen.h> 33#include <xen/interface/xen.h>
32#include <xen/interface/version.h> 34#include <xen/interface/version.h>
33#include <xen/interface/physdev.h> 35#include <xen/interface/physdev.h>
@@ -138,24 +140,23 @@ static void xen_vcpu_setup(int cpu)
138 */ 140 */
139void xen_vcpu_restore(void) 141void xen_vcpu_restore(void)
140{ 142{
141 if (have_vcpu_info_placement) { 143 int cpu;
142 int cpu;
143 144
144 for_each_online_cpu(cpu) { 145 for_each_online_cpu(cpu) {
145 bool other_cpu = (cpu != smp_processor_id()); 146 bool other_cpu = (cpu != smp_processor_id());
146 147
147 if (other_cpu && 148 if (other_cpu &&
148 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) 149 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
149 BUG(); 150 BUG();
150 151
151 xen_vcpu_setup(cpu); 152 xen_setup_runstate_info(cpu);
152 153
153 if (other_cpu && 154 if (have_vcpu_info_placement)
154 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) 155 xen_vcpu_setup(cpu);
155 BUG();
156 }
157 156
158 BUG_ON(!have_vcpu_info_placement); 157 if (other_cpu &&
158 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
159 BUG();
159 } 160 }
160} 161}
161 162
@@ -1093,10 +1094,8 @@ asmlinkage void __init xen_start_kernel(void)
1093 1094
1094 __supported_pte_mask |= _PAGE_IOMAP; 1095 __supported_pte_mask |= _PAGE_IOMAP;
1095 1096
1096#ifdef CONFIG_X86_64
1097 /* Work out if we support NX */ 1097 /* Work out if we support NX */
1098 check_efer(); 1098 x86_configure_nx();
1099#endif
1100 1099
1101 xen_setup_features(); 1100 xen_setup_features();
1102 1101
@@ -1178,10 +1177,16 @@ asmlinkage void __init xen_start_kernel(void)
1178 add_preferred_console("xenboot", 0, NULL); 1177 add_preferred_console("xenboot", 0, NULL);
1179 add_preferred_console("tty", 0, NULL); 1178 add_preferred_console("tty", 0, NULL);
1180 add_preferred_console("hvc", 0, NULL); 1179 add_preferred_console("hvc", 0, NULL);
1180 } else {
1181 /* Make sure ACS will be enabled */
1182 pci_request_acs();
1181 } 1183 }
1184
1182 1185
1183 xen_raw_console_write("about to get started...\n"); 1186 xen_raw_console_write("about to get started...\n");
1184 1187
1188 xen_setup_runstate_info(0);
1189
1185 /* Start the world */ 1190 /* Start the world */
1186#ifdef CONFIG_X86_32 1191#ifdef CONFIG_X86_32
1187 i386_start_kernel(); 1192 i386_start_kernel();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3bf7b1d250ce..bf4cd6bfe959 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn)
185} 185}
186 186
187/* Build the parallel p2m_top_mfn structures */ 187/* Build the parallel p2m_top_mfn structures */
188static void __init xen_build_mfn_list_list(void) 188void xen_build_mfn_list_list(void)
189{ 189{
190 unsigned pfn, idx; 190 unsigned pfn, idx;
191 191
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index fe03eeed7b48..563d20504988 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -35,10 +35,10 @@
35 35
36cpumask_var_t xen_cpu_initialized_map; 36cpumask_var_t xen_cpu_initialized_map;
37 37
38static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, xen_resched_irq);
39static DEFINE_PER_CPU(int, callfunc_irq); 39static DEFINE_PER_CPU(int, xen_callfunc_irq);
40static DEFINE_PER_CPU(int, callfuncsingle_irq); 40static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
41static DEFINE_PER_CPU(int, debug_irq) = -1; 41static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
42 42
43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void)
73 73
74 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
75 75
76 cpu_set(cpu, cpu_online_map); 76 set_cpu_online(cpu, true);
77 percpu_write(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
78 wmb(); 78 wmb();
79 79
@@ -103,7 +103,7 @@ static int xen_smp_intr_init(unsigned int cpu)
103 NULL); 103 NULL);
104 if (rc < 0) 104 if (rc < 0)
105 goto fail; 105 goto fail;
106 per_cpu(resched_irq, cpu) = rc; 106 per_cpu(xen_resched_irq, cpu) = rc;
107 107
108 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); 108 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
109 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, 109 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
@@ -114,7 +114,7 @@ static int xen_smp_intr_init(unsigned int cpu)
114 NULL); 114 NULL);
115 if (rc < 0) 115 if (rc < 0)
116 goto fail; 116 goto fail;
117 per_cpu(callfunc_irq, cpu) = rc; 117 per_cpu(xen_callfunc_irq, cpu) = rc;
118 118
119 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); 119 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
120 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, 120 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
@@ -122,7 +122,7 @@ static int xen_smp_intr_init(unsigned int cpu)
122 debug_name, NULL); 122 debug_name, NULL);
123 if (rc < 0) 123 if (rc < 0)
124 goto fail; 124 goto fail;
125 per_cpu(debug_irq, cpu) = rc; 125 per_cpu(xen_debug_irq, cpu) = rc;
126 126
127 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); 127 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
128 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, 128 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
@@ -133,19 +133,20 @@ static int xen_smp_intr_init(unsigned int cpu)
133 NULL); 133 NULL);
134 if (rc < 0) 134 if (rc < 0)
135 goto fail; 135 goto fail;
136 per_cpu(callfuncsingle_irq, cpu) = rc; 136 per_cpu(xen_callfuncsingle_irq, cpu) = rc;
137 137
138 return 0; 138 return 0;
139 139
140 fail: 140 fail:
141 if (per_cpu(resched_irq, cpu) >= 0) 141 if (per_cpu(xen_resched_irq, cpu) >= 0)
142 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 142 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
143 if (per_cpu(callfunc_irq, cpu) >= 0) 143 if (per_cpu(xen_callfunc_irq, cpu) >= 0)
144 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 144 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
145 if (per_cpu(debug_irq, cpu) >= 0) 145 if (per_cpu(xen_debug_irq, cpu) >= 0)
146 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 146 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
147 if (per_cpu(callfuncsingle_irq, cpu) >= 0) 147 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
148 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); 148 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
149 NULL);
149 150
150 return rc; 151 return rc;
151} 152}
@@ -295,6 +296,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
295 (unsigned long)task_stack_page(idle) - 296 (unsigned long)task_stack_page(idle) -
296 KERNEL_STACK_OFFSET + THREAD_SIZE; 297 KERNEL_STACK_OFFSET + THREAD_SIZE;
297#endif 298#endif
299 xen_setup_runstate_info(cpu);
298 xen_setup_timer(cpu); 300 xen_setup_timer(cpu);
299 xen_init_lock_cpu(cpu); 301 xen_init_lock_cpu(cpu);
300 302
@@ -348,10 +350,10 @@ static void xen_cpu_die(unsigned int cpu)
348 current->state = TASK_UNINTERRUPTIBLE; 350 current->state = TASK_UNINTERRUPTIBLE;
349 schedule_timeout(HZ/10); 351 schedule_timeout(HZ/10);
350 } 352 }
351 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 353 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
352 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 354 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
353 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 355 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
354 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); 356 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
355 xen_uninit_lock_cpu(cpu); 357 xen_uninit_lock_cpu(cpu);
356 xen_teardown_timer(cpu); 358 xen_teardown_timer(cpu);
357 359
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 95be7b434724..987267f79bf5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,4 +1,5 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/clockchips.h>
2 3
3#include <xen/interface/xen.h> 4#include <xen/interface/xen.h>
4#include <xen/grant_table.h> 5#include <xen/grant_table.h>
@@ -27,6 +28,8 @@ void xen_pre_suspend(void)
27 28
28void xen_post_suspend(int suspend_cancelled) 29void xen_post_suspend(int suspend_cancelled)
29{ 30{
31 xen_build_mfn_list_list();
32
30 xen_setup_shared_info(); 33 xen_setup_shared_info();
31 34
32 if (suspend_cancelled) { 35 if (suspend_cancelled) {
@@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled)
44 47
45} 48}
46 49
50static void xen_vcpu_notify_restore(void *data)
51{
52 unsigned long reason = (unsigned long)data;
53
54 /* Boot processor notified via generic timekeeping_resume() */
55 if ( smp_processor_id() == 0)
56 return;
57
58 clockevents_notify(reason, NULL);
59}
60
47void xen_arch_resume(void) 61void xen_arch_resume(void)
48{ 62{
49 /* nothing */ 63 smp_call_function(xen_vcpu_notify_restore,
64 (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
50} 65}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0a5aa44299a5..0d3f07cd1b5f 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -31,14 +31,14 @@
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
35 35
36/* snapshots of runstate info */ 36/* snapshots of runstate info */
37static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); 37static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
38 38
39/* unused ns of stolen and blocked time */ 39/* unused ns of stolen and blocked time */
40static DEFINE_PER_CPU(u64, residual_stolen); 40static DEFINE_PER_CPU(u64, xen_residual_stolen);
41static DEFINE_PER_CPU(u64, residual_blocked); 41static DEFINE_PER_CPU(u64, xen_residual_blocked);
42 42
43/* return an consistent snapshot of 64-bit time/counter value */ 43/* return an consistent snapshot of 64-bit time/counter value */
44static u64 get64(const u64 *p) 44static u64 get64(const u64 *p)
@@ -79,7 +79,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
79 79
80 BUG_ON(preemptible()); 80 BUG_ON(preemptible());
81 81
82 state = &__get_cpu_var(runstate); 82 state = &__get_cpu_var(xen_runstate);
83 83
84 /* 84 /*
85 * The runstate info is always updated by the hypervisor on 85 * The runstate info is always updated by the hypervisor on
@@ -97,14 +97,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
97/* return true when a vcpu could run but has no real cpu to run on */ 97/* return true when a vcpu could run but has no real cpu to run on */
98bool xen_vcpu_stolen(int vcpu) 98bool xen_vcpu_stolen(int vcpu)
99{ 99{
100 return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; 100 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
101} 101}
102 102
103static void setup_runstate_info(int cpu) 103void xen_setup_runstate_info(int cpu)
104{ 104{
105 struct vcpu_register_runstate_memory_area area; 105 struct vcpu_register_runstate_memory_area area;
106 106
107 area.addr.v = &per_cpu(runstate, cpu); 107 area.addr.v = &per_cpu(xen_runstate, cpu);
108 108
109 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, 109 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
110 cpu, &area)) 110 cpu, &area))
@@ -122,7 +122,7 @@ static void do_stolen_accounting(void)
122 122
123 WARN_ON(state.state != RUNSTATE_running); 123 WARN_ON(state.state != RUNSTATE_running);
124 124
125 snap = &__get_cpu_var(runstate_snapshot); 125 snap = &__get_cpu_var(xen_runstate_snapshot);
126 126
127 /* work out how much time the VCPU has not been runn*ing* */ 127 /* work out how much time the VCPU has not been runn*ing* */
128 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; 128 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
@@ -133,24 +133,24 @@ static void do_stolen_accounting(void)
133 133
134 /* Add the appropriate number of ticks of stolen time, 134 /* Add the appropriate number of ticks of stolen time,
135 including any left-overs from last time. */ 135 including any left-overs from last time. */
136 stolen = runnable + offline + __get_cpu_var(residual_stolen); 136 stolen = runnable + offline + __get_cpu_var(xen_residual_stolen);
137 137
138 if (stolen < 0) 138 if (stolen < 0)
139 stolen = 0; 139 stolen = 0;
140 140
141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
142 __get_cpu_var(residual_stolen) = stolen; 142 __get_cpu_var(xen_residual_stolen) = stolen;
143 account_steal_ticks(ticks); 143 account_steal_ticks(ticks);
144 144
145 /* Add the appropriate number of ticks of blocked time, 145 /* Add the appropriate number of ticks of blocked time,
146 including any left-overs from last time. */ 146 including any left-overs from last time. */
147 blocked += __get_cpu_var(residual_blocked); 147 blocked += __get_cpu_var(xen_residual_blocked);
148 148
149 if (blocked < 0) 149 if (blocked < 0)
150 blocked = 0; 150 blocked = 0;
151 151
152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
153 __get_cpu_var(residual_blocked) = blocked; 153 __get_cpu_var(xen_residual_blocked) = blocked;
154 account_idle_ticks(ticks); 154 account_idle_ticks(ticks);
155} 155}
156 156
@@ -434,7 +434,7 @@ void xen_setup_timer(int cpu)
434 name = "<timer kasprintf failed>"; 434 name = "<timer kasprintf failed>";
435 435
436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, 437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
438 name, NULL); 438 name, NULL);
439 439
440 evt = &per_cpu(xen_clock_events, cpu); 440 evt = &per_cpu(xen_clock_events, cpu);
@@ -442,8 +442,6 @@ void xen_setup_timer(int cpu)
442 442
443 evt->cpumask = cpumask_of(cpu); 443 evt->cpumask = cpumask_of(cpu);
444 evt->irq = irq; 444 evt->irq = irq;
445
446 setup_runstate_info(cpu);
447} 445}
448 446
449void xen_teardown_timer(int cpu) 447void xen_teardown_timer(int cpu)
@@ -494,6 +492,7 @@ __init void xen_time_init(void)
494 492
495 setup_force_cpu_cap(X86_FEATURE_TSC); 493 setup_force_cpu_cap(X86_FEATURE_TSC);
496 494
495 xen_setup_runstate_info(cpu);
497 xen_setup_timer(cpu); 496 xen_setup_timer(cpu);
498 xen_setup_cpu_clockevents(); 497 xen_setup_cpu_clockevents();
499} 498}
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 02f496a8dbaa..53adefda4275 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -96,7 +96,7 @@ ENTRY(xen_sysret32)
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
98 98
99 pushq $VGCF_in_syscall 99 pushq $0
1001: jmp hypercall_iret 1001: jmp hypercall_iret
101ENDPATCH(xen_sysret32) 101ENDPATCH(xen_sysret32)
102RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
@@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target)
151ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
153 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
154 pushq $VGCF_in_syscall 154 pushq $0
155 jmp hypercall_iret 155 jmp hypercall_iret
156ENDPROC(xen_syscall32_target) 156ENDPROC(xen_syscall32_target)
157ENDPROC(xen_sysenter_target) 157ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 355fa6b99c9c..f9153a300bce 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
25 25
26void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
27void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void);
28void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
@@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
41 42
42void xen_init_irq_ops(void); 43void xen_init_irq_ops(void);
43void xen_setup_timer(int cpu); 44void xen_setup_timer(int cpu);
45void xen_setup_runstate_info(int cpu);
44void xen_teardown_timer(int cpu); 46void xen_teardown_timer(int cpu);
45cycle_t xen_clocksource_read(void); 47cycle_t xen_clocksource_read(void);
46void xen_setup_cpu_clockevents(void); 48void xen_setup_cpu_clockevents(void);