path: root/arch/x86
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 43
-rw-r--r--  arch/x86/Kconfig.cpu | 2
-rw-r--r--  arch/x86/Kconfig.debug | 4
-rw-r--r--  arch/x86/boot/compressed/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 4
-rw-r--r--  arch/x86/boot/compressed/mkpiggy.c | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 1832
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 540
-rw-r--r--  arch/x86/include/asm/acpi.h | 11
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 13
-rw-r--r--  arch/x86/include/asm/boot.h | 6
-rw-r--r--  arch/x86/include/asm/fixmap.h | 4
-rw-r--r--  arch/x86/include/asm/gpio.h | 5
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/kdebug.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 35
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 100
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 24
-rw-r--r--  arch/x86/include/asm/mach_traps.h | 12
-rw-r--r--  arch/x86/include/asm/nmi.h | 20
-rw-r--r--  arch/x86/include/asm/numa_32.h | 2
-rw-r--r--  arch/x86/include/asm/numa_64.h | 3
-rw-r--r--  arch/x86/include/asm/olpc.h | 10
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt.h | 25
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 3
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 7
-rw-r--r--  arch/x86/include/asm/prom.h | 1
-rw-r--r--  arch/x86/include/asm/svm.h | 57
-rw-r--r--  arch/x86/include/asm/traps.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 15
-rw-r--r--  arch/x86/include/asm/xen/page.h | 16
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 4
-rw-r--r--  arch/x86/kernel/amd_nb.c | 7
-rw-r--r--  arch/x86/kernel/apb_timer.c | 14
-rw-r--r--  arch/x86/kernel/aperture_64.c | 44
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 3
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 5
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 28
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/e820.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 10
-rw-r--r--  arch/x86/kernel/entry_64.S | 39
-rw-r--r--  arch/x86/kernel/head_32.S | 2
-rw-r--r--  arch/x86/kernel/i387.c | 1
-rw-r--r--  arch/x86/kernel/irq.c | 10
-rw-r--r--  arch/x86/kernel/irq_32.c | 7
-rw-r--r--  arch/x86/kernel/kgdb.c | 7
-rw-r--r--  arch/x86/kernel/kvm.c | 317
-rw-r--r--  arch/x86/kernel/kvmclock.c | 13
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 30
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 6
-rw-r--r--  arch/x86/kernel/reboot.c | 5
-rw-r--r--  arch/x86/kernel/rtc.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 4
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 102
-rw-r--r--  arch/x86/kernel/tsc.c | 6
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 367
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 22
-rw-r--r--  arch/x86/kvm/lapic.c | 3
-rw-r--r--  arch/x86/kvm/mmu.c | 376
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 39
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 156
-rw-r--r--  arch/x86/kvm/svm.c | 865
-rw-r--r--  arch/x86/kvm/trace.h | 17
-rw-r--r--  arch/x86/kvm/vmx.c | 156
-rw-r--r--  arch/x86/kvm/x86.c | 474
-rw-r--r--  arch/x86/lguest/Kconfig | 1
-rw-r--r--  arch/x86/lguest/boot.c | 2
-rw-r--r--  arch/x86/mm/amdtopology_64.c | 86
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/mm/numa.c | 22
-rw-r--r--  arch/x86/mm/numa_64.c | 181
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/mm/srat_32.c | 1
-rw-r--r--  arch/x86/mm/srat_64.c | 26
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 3
-rw-r--r--  arch/x86/oprofile/nmi_timer_int.c | 2
-rw-r--r--  arch/x86/pci/amd_bus.c | 33
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 11
-rw-r--r--  arch/x86/pci/common.c | 41
-rw-r--r--  arch/x86/pci/irq.c | 3
-rw-r--r--  arch/x86/platform/mrst/early_printk_mrst.c | 2
-rw-r--r--  arch/x86/platform/olpc/Makefile | 1
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 101
-rw-r--r--  arch/x86/platform/olpc/olpc_dt.c | 183
-rw-r--r--  arch/x86/platform/olpc/olpc_ofw.c | 5
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 2
-rw-r--r--  arch/x86/xen/irq.c | 2
-rw-r--r--  arch/x86/xen/mmu.c | 366
-rw-r--r--  arch/x86/xen/p2m.c | 528
111 files changed, 6190 insertions, 1730 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b6fccb07123e..d5ed94d30aad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -51,6 +51,7 @@ config X86
51 select HAVE_KERNEL_GZIP 51 select HAVE_KERNEL_GZIP
52 select HAVE_KERNEL_BZIP2 52 select HAVE_KERNEL_BZIP2
53 select HAVE_KERNEL_LZMA 53 select HAVE_KERNEL_LZMA
54 select HAVE_KERNEL_XZ
54 select HAVE_KERNEL_LZO 55 select HAVE_KERNEL_LZO
55 select HAVE_HW_BREAKPOINT 56 select HAVE_HW_BREAKPOINT
56 select HAVE_MIXED_BREAKPOINTS_REGS 57 select HAVE_MIXED_BREAKPOINTS_REGS
@@ -65,6 +66,7 @@ config X86
65 select HAVE_SPARSE_IRQ 66 select HAVE_SPARSE_IRQ
66 select GENERIC_IRQ_PROBE 67 select GENERIC_IRQ_PROBE
67 select GENERIC_PENDING_IRQ if SMP 68 select GENERIC_PENDING_IRQ if SMP
69 select USE_GENERIC_SMP_HELPERS if SMP
68 70
69config INSTRUCTION_DECODER 71config INSTRUCTION_DECODER
70 def_bool (KPROBES || PERF_EVENTS) 72 def_bool (KPROBES || PERF_EVENTS)
@@ -203,10 +205,6 @@ config HAVE_INTEL_TXT
203 def_bool y 205 def_bool y
204 depends on EXPERIMENTAL && DMAR && ACPI 206 depends on EXPERIMENTAL && DMAR && ACPI
205 207
206config USE_GENERIC_SMP_HELPERS
207 def_bool y
208 depends on SMP
209
210config X86_32_SMP 208config X86_32_SMP
211 def_bool y 209 def_bool y
212 depends on X86_32 && SMP 210 depends on X86_32 && SMP
@@ -629,11 +627,11 @@ config APB_TIMER
629 as it is off-chip. APB timers are always running regardless of CPU 627 as it is off-chip. APB timers are always running regardless of CPU
630 C states, they are used as per CPU clockevent device when possible. 628 C states, they are used as per CPU clockevent device when possible.
631 629
632# Mark as embedded because too many people got it wrong. 630# Mark as expert because too many people got it wrong.
633# The code disables itself when not needed. 631# The code disables itself when not needed.
634config DMI 632config DMI
635 default y 633 default y
636 bool "Enable DMI scanning" if EMBEDDED 634 bool "Enable DMI scanning" if EXPERT
637 ---help--- 635 ---help---
638 Enabled scanning of DMI to identify machine quirks. Say Y 636 Enabled scanning of DMI to identify machine quirks. Say Y
639 here unless you have verified that your setup is not 637 here unless you have verified that your setup is not
@@ -641,7 +639,7 @@ config DMI
641 BIOS code. 639 BIOS code.
642 640
643config GART_IOMMU 641config GART_IOMMU
644 bool "GART IOMMU support" if EMBEDDED 642 bool "GART IOMMU support" if EXPERT
645 default y 643 default y
646 select SWIOTLB 644 select SWIOTLB
647 depends on X86_64 && PCI && AMD_NB 645 depends on X86_64 && PCI && AMD_NB
@@ -891,7 +889,7 @@ config X86_THERMAL_VECTOR
891 depends on X86_MCE_INTEL 889 depends on X86_MCE_INTEL
892 890
893config VM86 891config VM86
894 bool "Enable VM86 support" if EMBEDDED 892 bool "Enable VM86 support" if EXPERT
895 default y 893 default y
896 depends on X86_32 894 depends on X86_32
897 ---help--- 895 ---help---
@@ -1075,7 +1073,7 @@ endchoice
1075 1073
1076choice 1074choice
1077 depends on EXPERIMENTAL 1075 depends on EXPERIMENTAL
1078 prompt "Memory split" if EMBEDDED 1076 prompt "Memory split" if EXPERT
1079 default VMSPLIT_3G 1077 default VMSPLIT_3G
1080 depends on X86_32 1078 depends on X86_32
1081 ---help--- 1079 ---help---
@@ -1137,7 +1135,7 @@ config ARCH_DMA_ADDR_T_64BIT
1137 def_bool X86_64 || HIGHMEM64G 1135 def_bool X86_64 || HIGHMEM64G
1138 1136
1139config DIRECT_GBPAGES 1137config DIRECT_GBPAGES
1140 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED 1138 bool "Enable 1GB pages for kernel pagetables" if EXPERT
1141 default y 1139 default y
1142 depends on X86_64 1140 depends on X86_64
1143 ---help--- 1141 ---help---
@@ -1371,7 +1369,7 @@ config MATH_EMULATION
1371 1369
1372config MTRR 1370config MTRR
1373 def_bool y 1371 def_bool y
1374 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED 1372 prompt "MTRR (Memory Type Range Register) support" if EXPERT
1375 ---help--- 1373 ---help---
1376 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1374 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1377 the Memory Type Range Registers (MTRRs) may be used to control 1375 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1437,7 +1435,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1437 1435
1438config X86_PAT 1436config X86_PAT
1439 def_bool y 1437 def_bool y
1440 prompt "x86 PAT support" if EMBEDDED 1438 prompt "x86 PAT support" if EXPERT
1441 depends on MTRR 1439 depends on MTRR
1442 ---help--- 1440 ---help---
1443 Use PAT attributes to setup page level cache control. 1441 Use PAT attributes to setup page level cache control.
@@ -1541,7 +1539,7 @@ config KEXEC_JUMP
1541 code in physical address mode via KEXEC 1539 code in physical address mode via KEXEC
1542 1540
1543config PHYSICAL_START 1541config PHYSICAL_START
1544 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1542 hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
1545 default "0x1000000" 1543 default "0x1000000"
1546 ---help--- 1544 ---help---
1547 This gives the physical address where the kernel is loaded. 1545 This gives the physical address where the kernel is loaded.
@@ -1936,13 +1934,19 @@ config PCI_MMCONFIG
1936 depends on X86_64 && PCI && ACPI 1934 depends on X86_64 && PCI && ACPI
1937 1935
1938config PCI_CNB20LE_QUIRK 1936config PCI_CNB20LE_QUIRK
1939 bool "Read CNB20LE Host Bridge Windows" 1937 bool "Read CNB20LE Host Bridge Windows" if EXPERT
1940 depends on PCI 1938 default n
1939 depends on PCI && EXPERIMENTAL
1941 help 1940 help
1942 Read the PCI windows out of the CNB20LE host bridge. This allows 1941 Read the PCI windows out of the CNB20LE host bridge. This allows
1943 PCI hotplug to work on systems with the CNB20LE chipset which do 1942 PCI hotplug to work on systems with the CNB20LE chipset which do
1944 not have ACPI. 1943 not have ACPI.
1945 1944
1945 There's no public spec for this chipset, and this functionality
1946 is known to be incomplete.
1947
1948 You should say N unless you know you need this.
1949
1946config DMAR 1950config DMAR
1947 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1951 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1948 depends on PCI_MSI && ACPI && EXPERIMENTAL 1952 depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -2064,13 +2068,14 @@ config OLPC
2064 bool "One Laptop Per Child support" 2068 bool "One Laptop Per Child support"
2065 select GPIOLIB 2069 select GPIOLIB
2066 select OLPC_OPENFIRMWARE 2070 select OLPC_OPENFIRMWARE
2071 depends on !X86_64 && !X86_PAE
2067 ---help--- 2072 ---help---
2068 Add support for detecting the unique features of the OLPC 2073 Add support for detecting the unique features of the OLPC
2069 XO hardware. 2074 XO hardware.
2070 2075
2071config OLPC_XO1 2076config OLPC_XO1
2072 tristate "OLPC XO-1 support" 2077 tristate "OLPC XO-1 support"
2073 depends on OLPC && PCI 2078 depends on OLPC && MFD_CS5535
2074 ---help--- 2079 ---help---
2075 Add support for non-essential features of the OLPC XO-1 laptop. 2080 Add support for non-essential features of the OLPC XO-1 laptop.
2076 2081
@@ -2078,11 +2083,17 @@ config OLPC_OPENFIRMWARE
2078 bool "Support for OLPC's Open Firmware" 2083 bool "Support for OLPC's Open Firmware"
2079 depends on !X86_64 && !X86_PAE 2084 depends on !X86_64 && !X86_PAE
2080 default n 2085 default n
2086 select OF
2081 help 2087 help
2082 This option adds support for the implementation of Open Firmware 2088 This option adds support for the implementation of Open Firmware
2083 that is used on the OLPC XO-1 Children's Machine. 2089 that is used on the OLPC XO-1 Children's Machine.
2084 If unsure, say N here. 2090 If unsure, say N here.
2085 2091
2092config OLPC_OPENFIRMWARE_DT
2093 bool
2094 default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
2095 select OF_PROMTREE
2096
2086endif # X86_32 2097endif # X86_32
2087 2098
2088config AMD_NB 2099config AMD_NB
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 15588a0ef466..283c5a6a03a6 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -424,7 +424,7 @@ config X86_DEBUGCTLMSR
424 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML 424 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
425 425
426menuconfig PROCESSOR_SELECT 426menuconfig PROCESSOR_SELECT
427 bool "Supported processor vendors" if EMBEDDED 427 bool "Supported processor vendors" if EXPERT
428 ---help--- 428 ---help---
429 This lets you choose what x86 vendor support code your kernel 429 This lets you choose what x86 vendor support code your kernel
430 will include. 430 will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 45143bbcfe5e..615e18810f48 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
31 see errors. Disable this if you want silent bootup. 31 see errors. Disable this if you want silent bootup.
32 32
33config EARLY_PRINTK 33config EARLY_PRINTK
34 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EXPERT
35 default y 35 default y
36 ---help--- 36 ---help---
37 Write kernel log output directly into the VGA buffer or to a serial 37 Write kernel log output directly into the VGA buffer or to a serial
@@ -138,7 +138,7 @@ config DEBUG_NX_TEST
138 138
139config DOUBLEFAULT 139config DOUBLEFAULT
140 default y 140 default y
141 bool "Enable doublefault exception handler" if EMBEDDED 141 bool "Enable doublefault exception handler" if EXPERT
142 depends on X86_32 142 depends on X86_32
143 ---help--- 143 ---help---
144 This option allows trapping of rare doublefault exceptions that 144 This option allows trapping of rare doublefault exceptions that
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
49 $(call if_changed,bzip2) 49 $(call if_changed,bzip2)
50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE 50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
51 $(call if_changed,lzma) 51 $(call if_changed,lzma)
52$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,xzkern)
52$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE 54$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,lzo) 55 $(call if_changed,lzo)
54 56
55suffix-$(CONFIG_KERNEL_GZIP) := gz 57suffix-$(CONFIG_KERNEL_GZIP) := gz
56suffix-$(CONFIG_KERNEL_BZIP2) := bz2 58suffix-$(CONFIG_KERNEL_BZIP2) := bz2
57suffix-$(CONFIG_KERNEL_LZMA) := lzma 59suffix-$(CONFIG_KERNEL_LZMA) := lzma
60suffix-$(CONFIG_KERNEL_XZ) := xz
58suffix-$(CONFIG_KERNEL_LZO) := lzo 61suffix-$(CONFIG_KERNEL_LZO) := lzo
59 62
60quiet_cmd_mkpiggy = MKPIGGY $@ 63quiet_cmd_mkpiggy = MKPIGGY $@
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 325c05294fc4..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols;
139#include "../../../../lib/decompress_unlzma.c" 139#include "../../../../lib/decompress_unlzma.c"
140#endif 140#endif
141 141
142#ifdef CONFIG_KERNEL_XZ
143#include "../../../../lib/decompress_unxz.c"
144#endif
145
142#ifdef CONFIG_KERNEL_LZO 146#ifdef CONFIG_KERNEL_LZO
143#include "../../../../lib/decompress_unlzo.c" 147#include "../../../../lib/decompress_unlzo.c"
144#endif 148#endif
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..646aa78ba5fd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -74,7 +74,7 @@ int main(int argc, char *argv[])
74 74
75 offs = (olen > ilen) ? olen - ilen : 0; 75 offs = (olen > ilen) ? olen - ilen : 0;
76 offs += olen >> 12; /* Add 8 bytes for each 32K block */ 76 offs += olen >> 12; /* Add 8 bytes for each 32K block */
77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ 77 offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ 78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79 79
80 printf(".section \".rodata..compressed\",\"a\",@progbits\n"); 80 printf(".section \".rodata..compressed\",\"a\",@progbits\n");
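The mkpiggy.c change only enlarges the slack that keeps in-place decompression from overwriting compressed input that has not been consumed yet; xz needs a larger worst-case margin (64K + 128 bytes) than the old gzip-derived 32K + 18. A stand-alone sketch of the same arithmetic, with made-up sizes purely for illustration:

```c
#include <stdio.h>

/*
 * Sketch of mkpiggy's slack computation for in-place decompression.
 * olen = uncompressed size, ilen = compressed size (example values only).
 */
int main(void)
{
	unsigned long olen = 20 * 1024 * 1024;	/* e.g. 20 MiB vmlinux.bin */
	unsigned long ilen = 9 * 1024 * 1024;	/* e.g. 9 MiB compressed image */
	unsigned long offs;

	offs = (olen > ilen) ? olen - ilen : 0;	/* growth while unpacking */
	offs += olen >> 12;			/* ~8 bytes per 32K block */
	offs += 64 * 1024 + 128;		/* worst-case xz margin */
	offs = (offs + 4095) & ~4095UL;		/* round up to a 4K boundary */

	printf("extract offset = %lu bytes\n", offs);
	return 0;
}
```

The resulting value is what mkpiggy emits into piggy.S as the extract offset used by the boot stub.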
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
9 * Vinodh Gopal <vinodh.gopal@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir 10 * Kahraman Akdemir
11 * 11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
12 * This program is free software; you can redistribute it and/or modify 26 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 27 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or 28 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
18#include <linux/linkage.h> 32#include <linux/linkage.h>
19#include <asm/inst.h> 33#include <asm/inst.h>
20 34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
21.text 56.text
22 57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
23#define STATE1 %xmm0 91#define STATE1 %xmm0
24#define STATE2 %xmm4 92#define STATE2 %xmm4
25#define STATE3 %xmm5 93#define STATE3 %xmm5
@@ -32,12 +100,16 @@
32#define IN IN1 100#define IN IN1
33#define KEY %xmm2 101#define KEY %xmm2
34#define IV %xmm3 102#define IV %xmm3
103
35#define BSWAP_MASK %xmm10 104#define BSWAP_MASK %xmm10
36#define CTR %xmm11 105#define CTR %xmm11
37#define INC %xmm12 106#define INC %xmm12
38 107
108#ifdef __x86_64__
109#define AREG %rax
39#define KEYP %rdi 110#define KEYP %rdi
40#define OUTP %rsi 111#define OUTP %rsi
112#define UKEYP OUTP
41#define INP %rdx 113#define INP %rdx
42#define LEN %rcx 114#define LEN %rcx
43#define IVP %r8 115#define IVP %r8
@@ -46,6 +118,1588 @@
46#define TKEYP T1 118#define TKEYP T1
47#define T2 %r11 119#define T2 %r11
48#define TCTR_LOW T2 120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
155 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
160 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167 # in in order to perform
168 # independent shifts
169 pslld $31, \TMP2 # packed right shift <<31
170 pslld $30, \TMP3 # packed right shift <<30
171 pslld $25, \TMP4 # packed right shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182 # in in order to perform
183 # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186 psrld $1,\TMP2 # packed left shift >>1
187 psrld $2,\TMP3 # packed left shift >>2
188 psrld $7,\TMP4 # packed left shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
193 pxor \TMP1, \GH # result is in TMP1
194.endm
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
205*/
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
264 AESENC \TMP1, %xmm\index # Round 2
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
268 AESENC \TMP1, %xmm\index # Round 2
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
272 AESENC \TMP1, %xmm\index # Round 2
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
276 AESENC \TMP1, %xmm\index # Round 2
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
280 AESENC \TMP1, %xmm\index # Round 2
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
284 AESENC \TMP1, %xmm\index # Round 2
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
288 AESENC \TMP1, %xmm\index # Round 2
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^3<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
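The bookkeeping described in the comment above this macro (num_initial_blocks = floor(len/16) mod 4, with the remaining full blocks handled four at a time) can be checked with a few lines of C; the payload length below is an arbitrary example:

```c
#include <stdio.h>

/*
 * Split a GCM payload into the 1-3 blocks peeled off by INITIAL_BLOCKS_*,
 * the 4-block groups handled by the parallel loop, and a trailing partial
 * block, as described in the comment above the macro.
 */
int main(void)
{
	unsigned long len = 1500;			/* example payload length */
	unsigned long full_blocks = len / 16;		/* b = floor(a/16) */
	unsigned long initial = full_blocks % 4;	/* peeled off first */
	unsigned long parallel = full_blocks - initial;	/* multiple of 4 */
	unsigned long partial = len % 16;		/* <16 leftover bytes */

	printf("initial blocks: %lu\n", initial);
	printf("4-way iterations: %lu (covering %lu blocks)\n",
	       parallel / 4, parallel);
	printf("partial tail bytes: %lu\n", partial);
	return 0;
}
```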
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502 AESENC \TMP1, %xmm\index # Round 2
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506 AESENC \TMP1, %xmm\index # Round 2
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510 AESENC \TMP1, %xmm\index # Round 2
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514 AESENC \TMP1, %xmm\index # Round 2
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518 AESENC \TMP1, %xmm\index # Round 2
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522 AESENC \TMP1, %xmm\index # Round 2
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526 AESENC \TMP1, %xmm\index # Round 2
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^3<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed right shift << 31
833 pslld $30, \TMP3 # packed right shift << 30
834 pslld $25, \TMP4 # packed right shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847 psrld $1, \TMP2 # packed left shift >>1
848 psrld $2, \TMP3 # packed left shift >>2
849 psrld $7, \TMP4 # packed left shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854 pxor \TMP1, \XMM5 # result is in TMP1
855
856 pxor \XMM5, \XMM1
857.endm
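Functionally, each pass through the macro above produces four counter-mode keystream blocks, XORs them with the input, and folds the resulting ciphertext into the GHASH accumulator. The sketch below restates that behaviour serially: aes_encrypt_block() and ghash_mul() are hypothetical placeholders (the real code inlines the AES rounds and the Karatsuba GHASH), and unlike the macro, which hashes the previous iteration's ciphertext while encrypting the next one, this version hashes each block as soon as it is produced.

```c
#include <stdint.h>

typedef uint8_t block[16];

/* Hypothetical helpers standing in for the AESENC rounds and GHASH_MUL. */
void aes_encrypt_block(const void *key_schedule, const block in, block out);
void ghash_mul(block acc, const block hash_subkey);

/* Big-endian increment of the 32-bit counter in the last 4 IV bytes. */
static void ctr_inc(block ctr)
{
	int i;

	for (i = 15; i >= 12; i--)
		if (++ctr[i])
			break;
}

/*
 * Serial restatement of one 4-block iteration: generate four keystream
 * blocks from consecutive counter values, XOR them with the input, and
 * fold each resulting ciphertext block into the running GHASH state.
 */
static void gcm_enc_4blocks(const void *key_schedule, const block hash_subkey,
			    block ctr, block ghash_acc,
			    const uint8_t in[64], uint8_t out[64])
{
	block ks;
	int j, k;

	for (j = 0; j < 4; j++) {
		ctr_inc(ctr);
		aes_encrypt_block(key_schedule, ctr, ks);

		for (k = 0; k < 16; k++) {
			out[16 * j + k] = in[16 * j + k] ^ ks[k];
			ghash_acc[k] ^= out[16 * j + k];
		}
		ghash_mul(ghash_acc, hash_subkey);
	}
}
```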
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028 pslld $31, \TMP2 # packed right shift << 31
1029 pslld $30, \TMP3 # packed right shift << 30
1030 pslld $25, \TMP4 # packed right shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
1043 psrld $1, \TMP2 # packed left shift >>1
1044 psrld $2, \TMP3 # packed left shift >>2
1045 psrld $7, \TMP4 # packed left shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
1050 pxor \TMP1, \XMM5 # result is in TMP1
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059 # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072 # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087 # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101 # Multiply TMP1 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127 pslld $31, \TMP2 # packed right shifting << 31
1128 pslld $30, \TMP3 # packed right shifting << 30
1129 pslld $25, \TMP4 # packed right shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142 psrld $1, \TMP2 # packed left shift >> 1
1143 psrld $2, \TMP3 # packed left shift >> 2
1144 psrld $7, \TMP4 # packed left shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
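Each of these GHASH macros ends with a "first phase"/"second phase" reduction, which folds the upper 128 bits of the 256-bit Karatsuba product back into the lower half using the polynomial quoted in the function header below:

    g(x) = x^128 + x^127 + x^126 + x^121 + 1,  hence  x^128 = x^127 + x^126 + x^121 + 1  (mod g(x))

so a product H(x)*x^128 + L(x) reduces to L(x) + H(x)*(x^127 + x^126 + x^121 + 1) over GF(2). The shift counts used in the code (1, 2 and 7, together with their 32-bit-lane complements 31, 30 and 25) appear to correspond to those three non-constant terms under the bit-reflected operand order established by the PSHUFB byte swaps; that correspondence is an interpretation, not something spelled out in the comments.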
1151
1152/* Encryption of a single block done*/
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* From the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code also supports 16 bytes; any other size will cause the code to fail.
1257*
1258* TLen:
1259* From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
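
/*
 * To make the iv layout above concrete: a caller assembles the 16-byte
 * pre-counter block j0 as salt || IV || 0x00000001 before invoking this
 * routine. A minimal C sketch (the helper name is hypothetical and not part
 * of this interface; the RFC4106 glue code later in this patch builds the
 * same block inline):
 *
 *	#include <linux/types.h>
 *	#include <linux/string.h>
 *	#include <asm/byteorder.h>
 *
 *	static void rfc4106_build_j0(u8 j0[16], const u8 salt[4], const u8 esp_iv[8])
 *	{
 *		memcpy(j0, salt, 4);                    // salt from the Security Association
 *		memcpy(j0 + 4, esp_iv, 8);              // per-packet IV from the ESP payload
 *		*(__be32 *)(j0 + 12) = cpu_to_be32(1);  // trailing 32-bit counter = 1
 *	}
 */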
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349	# Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1364	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460*       keys are pre-expanded and aligned to 16 bytes. We are using the
1461*       first set of 11 keys in the data structure void *aes_ctx.
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* From the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code also supports 16 bytes; any other size will cause the code to fail.
1518*
1519* TLen:
1520* From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
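/*
 * The AAD handling described above just zero-pads the 8- or 12-byte AAD to a
 * full 128-bit block before it enters GHASH. A minimal C sketch of that
 * padding (illustrative only; the helper name is hypothetical):
 *
 *	#include <linux/types.h>
 *	#include <linux/string.h>
 *
 *	static void pad_aad_block(u8 padded[16], const u8 *aad, unsigned int aad_len)
 *	{
 *		memset(padded, 0, 16);
 *		memcpy(padded, aad, aad_len);   // aad_len is 8 or 12 per RFC4106
 *	}
 */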
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557	pxor	%xmm2, %xmm13		  # %xmm13 holds the HashKey<<1 (mod poly)
1558	movdqa	%xmm13, HashKey(%rsp)	  # store HashKey<<1 (mod poly)
1559	mov	%arg4, %r13		  # save the number of bytes of plaintext/ciphertext
1560 and $-16, %r13
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610	# Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1616 sub $16, %r11
1617 add %r13, %r11
1618	movdqu	(%arg3,%r11,1), %xmm1	  # receive the last <16 byte block
1619 lea SHIFT_MASK+16(%rip), %r12
1620 sub %r13, %r12
1621 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1622 # (%r13 is the number of bytes in plaintext mod 16)
1623 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1624	PSHUFB_XMM	%xmm2, %xmm1	  # shift right 16-r13 bytes
1625 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1626 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1627 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1628 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1629 movdqa SHUF_MASK(%rip), %xmm10
1630 PSHUFB_XMM %xmm10,%xmm0
1631
1632 pxor %xmm0, %xmm8
1633 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1634 # GHASH computation for the last <16 byte block
1635 sub %r13, %r11
1636 add $16, %r11
1637 PSHUFB_XMM %xmm10, %xmm1
1638
1639 # shuffle xmm0 back to output as ciphertext
1640
1641 # Output %r13 bytes
1642 MOVQ_R64_XMM %xmm0, %rax
1643 cmp $8, %r13
1644 jle _less_than_8_bytes_left_encrypt
1645 mov %rax, (%arg2 , %r11, 1)
1646 add $8, %r11
1647 psrldq $8, %xmm0
1648 MOVQ_R64_XMM %xmm0, %rax
1649 sub $8, %r13
1650_less_than_8_bytes_left_encrypt:
1651 mov %al, (%arg2, %r11, 1)
1652 add $1, %r11
1653 shr $8, %rax
1654 sub $1, %r13
1655 jne _less_than_8_bytes_left_encrypt
1656_multiple_of_16_bytes_encrypt:
1657	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1658 shl $3, %r12
1659 movd %r12d, %xmm15 # len(A) in %xmm15
1660	shl	$3, %arg4		  # len(C) in bits (*8)
1661 MOVQ_R64_XMM %arg4, %xmm1
1662 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1663 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1664 pxor %xmm15, %xmm8
1665 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1666 # final GHASH computation
1667 movdqa SHUF_MASK(%rip), %xmm10
1668 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1669
1670 mov %arg5, %rax # %rax = *Y0
1671 movdqu (%rax), %xmm0 # %xmm0 = Y0
1672 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1673 pxor %xmm8, %xmm0
1674_return_T_encrypt:
1675 mov arg9, %r10 # %r10 = authTag
1676 mov arg10, %r11 # %r11 = auth_tag_len
1677 cmp $16, %r11
1678 je _T_16_encrypt
1679 cmp $12, %r11
1680 je _T_12_encrypt
1681_T_8_encrypt:
1682 MOVQ_R64_XMM %xmm0, %rax
1683 mov %rax, (%r10)
1684 jmp _return_T_done_encrypt
1685_T_12_encrypt:
1686 MOVQ_R64_XMM %xmm0, %rax
1687 mov %rax, (%r10)
1688 psrldq $8, %xmm0
1689 movd %xmm0, %eax
1690 mov %eax, 8(%r10)
1691 jmp _return_T_done_encrypt
1692_T_16_encrypt:
1693 movdqu %xmm0, (%r10)
1694_return_T_done_encrypt:
1695 mov %r14, %rsp
1696 pop %r14
1697 pop %r13
1698 pop %r12
1699 ret
1700
1701#endif
1702
49 1703
50_key_expansion_128: 1704_key_expansion_128:
51_key_expansion_256a: 1705_key_expansion_256a:
@@ -55,10 +1709,11 @@ _key_expansion_256a:
55 shufps $0b10001100, %xmm0, %xmm4 1709 shufps $0b10001100, %xmm0, %xmm4
56 pxor %xmm4, %xmm0 1710 pxor %xmm4, %xmm0
57 pxor %xmm1, %xmm0 1711 pxor %xmm1, %xmm0
58 movaps %xmm0, (%rcx) 1712 movaps %xmm0, (TKEYP)
59 add $0x10, %rcx 1713 add $0x10, TKEYP
60 ret 1714 ret
61 1715
1716.align 4
62_key_expansion_192a: 1717_key_expansion_192a:
63 pshufd $0b01010101, %xmm1, %xmm1 1718 pshufd $0b01010101, %xmm1, %xmm1
64 shufps $0b00010000, %xmm0, %xmm4 1719 shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@ _key_expansion_192a:
76 1731
77 movaps %xmm0, %xmm1 1732 movaps %xmm0, %xmm1
78 shufps $0b01000100, %xmm0, %xmm6 1733 shufps $0b01000100, %xmm0, %xmm6
79 movaps %xmm6, (%rcx) 1734 movaps %xmm6, (TKEYP)
80 shufps $0b01001110, %xmm2, %xmm1 1735 shufps $0b01001110, %xmm2, %xmm1
81 movaps %xmm1, 16(%rcx) 1736 movaps %xmm1, 0x10(TKEYP)
82 add $0x20, %rcx 1737 add $0x20, TKEYP
83 ret 1738 ret
84 1739
1740.align 4
85_key_expansion_192b: 1741_key_expansion_192b:
86 pshufd $0b01010101, %xmm1, %xmm1 1742 pshufd $0b01010101, %xmm1, %xmm1
87 shufps $0b00010000, %xmm0, %xmm4 1743 shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@ _key_expansion_192b:
96 pxor %xmm3, %xmm2 1752 pxor %xmm3, %xmm2
97 pxor %xmm5, %xmm2 1753 pxor %xmm5, %xmm2
98 1754
99 movaps %xmm0, (%rcx) 1755 movaps %xmm0, (TKEYP)
100 add $0x10, %rcx 1756 add $0x10, TKEYP
101 ret 1757 ret
102 1758
1759.align 4
103_key_expansion_256b: 1760_key_expansion_256b:
104 pshufd $0b10101010, %xmm1, %xmm1 1761 pshufd $0b10101010, %xmm1, %xmm1
105 shufps $0b00010000, %xmm2, %xmm4 1762 shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@ _key_expansion_256b:
107 shufps $0b10001100, %xmm2, %xmm4 1764 shufps $0b10001100, %xmm2, %xmm4
108 pxor %xmm4, %xmm2 1765 pxor %xmm4, %xmm2
109 pxor %xmm1, %xmm2 1766 pxor %xmm1, %xmm2
110 movaps %xmm2, (%rcx) 1767 movaps %xmm2, (TKEYP)
111 add $0x10, %rcx 1768 add $0x10, TKEYP
112 ret 1769 ret
113 1770
114/* 1771/*
@@ -116,17 +1773,23 @@ _key_expansion_256b:
116 * unsigned int key_len) 1773 * unsigned int key_len)
117 */ 1774 */
118ENTRY(aesni_set_key) 1775ENTRY(aesni_set_key)
119 movups (%rsi), %xmm0 # user key (first 16 bytes) 1776#ifndef __x86_64__
120 movaps %xmm0, (%rdi) 1777 pushl KEYP
121 lea 0x10(%rdi), %rcx # key addr 1778 movl 8(%esp), KEYP # ctx
122 movl %edx, 480(%rdi) 1779 movl 12(%esp), UKEYP # in_key
1780 movl 16(%esp), %edx # key_len
1781#endif
1782 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1783 movaps %xmm0, (KEYP)
1784 lea 0x10(KEYP), TKEYP # key addr
1785 movl %edx, 480(KEYP)
123 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1786 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
124 cmp $24, %dl 1787 cmp $24, %dl
125 jb .Lenc_key128 1788 jb .Lenc_key128
126 je .Lenc_key192 1789 je .Lenc_key192
127 movups 0x10(%rsi), %xmm2 # other user key 1790 movups 0x10(UKEYP), %xmm2 # other user key
128 movaps %xmm2, (%rcx) 1791 movaps %xmm2, (TKEYP)
129 add $0x10, %rcx 1792 add $0x10, TKEYP
130 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1793 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
131 call _key_expansion_256a 1794 call _key_expansion_256a
132 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1795 AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key)
155 call _key_expansion_256a 1818 call _key_expansion_256a
156 jmp .Ldec_key 1819 jmp .Ldec_key
157.Lenc_key192: 1820.Lenc_key192:
158 movq 0x10(%rsi), %xmm2 # other user key 1821 movq 0x10(UKEYP), %xmm2 # other user key
159 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1822 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
160 call _key_expansion_192a 1823 call _key_expansion_192a
161 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1824 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key)
195 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1858 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
196 call _key_expansion_128 1859 call _key_expansion_128
197.Ldec_key: 1860.Ldec_key:
198 sub $0x10, %rcx 1861 sub $0x10, TKEYP
199 movaps (%rdi), %xmm0 1862 movaps (KEYP), %xmm0
200 movaps (%rcx), %xmm1 1863 movaps (TKEYP), %xmm1
201 movaps %xmm0, 240(%rcx) 1864 movaps %xmm0, 240(TKEYP)
202 movaps %xmm1, 240(%rdi) 1865 movaps %xmm1, 240(KEYP)
203 add $0x10, %rdi 1866 add $0x10, KEYP
204 lea 240-16(%rcx), %rsi 1867 lea 240-16(TKEYP), UKEYP
205.align 4 1868.align 4
206.Ldec_key_loop: 1869.Ldec_key_loop:
207 movaps (%rdi), %xmm0 1870 movaps (KEYP), %xmm0
208 AESIMC %xmm0 %xmm1 1871 AESIMC %xmm0 %xmm1
209 movaps %xmm1, (%rsi) 1872 movaps %xmm1, (UKEYP)
210 add $0x10, %rdi 1873 add $0x10, KEYP
211 sub $0x10, %rsi 1874 sub $0x10, UKEYP
212 cmp %rcx, %rdi 1875 cmp TKEYP, KEYP
213 jb .Ldec_key_loop 1876 jb .Ldec_key_loop
214 xor %rax, %rax 1877 xor AREG, AREG
1878#ifndef __x86_64__
1879 popl KEYP
1880#endif
215 ret 1881 ret
216 1882
217/* 1883/*
218 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1884 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
219 */ 1885 */
220ENTRY(aesni_enc) 1886ENTRY(aesni_enc)
1887#ifndef __x86_64__
1888 pushl KEYP
1889 pushl KLEN
1890 movl 12(%esp), KEYP
1891 movl 16(%esp), OUTP
1892 movl 20(%esp), INP
1893#endif
221 movl 480(KEYP), KLEN # key length 1894 movl 480(KEYP), KLEN # key length
222 movups (INP), STATE # input 1895 movups (INP), STATE # input
223 call _aesni_enc1 1896 call _aesni_enc1
224 movups STATE, (OUTP) # output 1897 movups STATE, (OUTP) # output
1898#ifndef __x86_64__
1899 popl KLEN
1900 popl KEYP
1901#endif
225 ret 1902 ret
226 1903
227/* 1904/*
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc)
236 * KEY 1913 * KEY
237 * TKEYP (T1) 1914 * TKEYP (T1)
238 */ 1915 */
1916.align 4
239_aesni_enc1: 1917_aesni_enc1:
240 movaps (KEYP), KEY # key 1918 movaps (KEYP), KEY # key
241 mov KEYP, TKEYP 1919 mov KEYP, TKEYP
@@ -298,6 +1976,7 @@ _aesni_enc1:
298 * KEY 1976 * KEY
299 * TKEYP (T1) 1977 * TKEYP (T1)
300 */ 1978 */
1979.align 4
301_aesni_enc4: 1980_aesni_enc4:
302 movaps (KEYP), KEY # key 1981 movaps (KEYP), KEY # key
303 mov KEYP, TKEYP 1982 mov KEYP, TKEYP
@@ -391,11 +2070,22 @@ _aesni_enc4:
391 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2070 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
392 */ 2071 */
393ENTRY(aesni_dec) 2072ENTRY(aesni_dec)
2073#ifndef __x86_64__
2074 pushl KEYP
2075 pushl KLEN
2076 movl 12(%esp), KEYP
2077 movl 16(%esp), OUTP
2078 movl 20(%esp), INP
2079#endif
394 mov 480(KEYP), KLEN # key length 2080 mov 480(KEYP), KLEN # key length
395 add $240, KEYP 2081 add $240, KEYP
396 movups (INP), STATE # input 2082 movups (INP), STATE # input
397 call _aesni_dec1 2083 call _aesni_dec1
398 movups STATE, (OUTP) #output 2084 movups STATE, (OUTP) #output
2085#ifndef __x86_64__
2086 popl KLEN
2087 popl KEYP
2088#endif
399 ret 2089 ret
400 2090
401/* 2091/*
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec)
410 * KEY 2100 * KEY
411 * TKEYP (T1) 2101 * TKEYP (T1)
412 */ 2102 */
2103.align 4
413_aesni_dec1: 2104_aesni_dec1:
414 movaps (KEYP), KEY # key 2105 movaps (KEYP), KEY # key
415 mov KEYP, TKEYP 2106 mov KEYP, TKEYP
@@ -472,6 +2163,7 @@ _aesni_dec1:
472 * KEY 2163 * KEY
473 * TKEYP (T1) 2164 * TKEYP (T1)
474 */ 2165 */
2166.align 4
475_aesni_dec4: 2167_aesni_dec4:
476 movaps (KEYP), KEY # key 2168 movaps (KEYP), KEY # key
477 mov KEYP, TKEYP 2169 mov KEYP, TKEYP
@@ -566,6 +2258,15 @@ _aesni_dec4:
566 * size_t len) 2258 * size_t len)
567 */ 2259 */
568ENTRY(aesni_ecb_enc) 2260ENTRY(aesni_ecb_enc)
2261#ifndef __x86_64__
2262 pushl LEN
2263 pushl KEYP
2264 pushl KLEN
2265 movl 16(%esp), KEYP
2266 movl 20(%esp), OUTP
2267 movl 24(%esp), INP
2268 movl 28(%esp), LEN
2269#endif
569 test LEN, LEN # check length 2270 test LEN, LEN # check length
570 jz .Lecb_enc_ret 2271 jz .Lecb_enc_ret
571 mov 480(KEYP), KLEN 2272 mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc)
602 cmp $16, LEN 2303 cmp $16, LEN
603 jge .Lecb_enc_loop1 2304 jge .Lecb_enc_loop1
604.Lecb_enc_ret: 2305.Lecb_enc_ret:
2306#ifndef __x86_64__
2307 popl KLEN
2308 popl KEYP
2309 popl LEN
2310#endif
605 ret 2311 ret
606 2312
607/* 2313/*
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc)
609 * size_t len); 2315 * size_t len);
610 */ 2316 */
611ENTRY(aesni_ecb_dec) 2317ENTRY(aesni_ecb_dec)
2318#ifndef __x86_64__
2319 pushl LEN
2320 pushl KEYP
2321 pushl KLEN
2322 movl 16(%esp), KEYP
2323 movl 20(%esp), OUTP
2324 movl 24(%esp), INP
2325 movl 28(%esp), LEN
2326#endif
612 test LEN, LEN 2327 test LEN, LEN
613 jz .Lecb_dec_ret 2328 jz .Lecb_dec_ret
614 mov 480(KEYP), KLEN 2329 mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec)
646 cmp $16, LEN 2361 cmp $16, LEN
647 jge .Lecb_dec_loop1 2362 jge .Lecb_dec_loop1
648.Lecb_dec_ret: 2363.Lecb_dec_ret:
2364#ifndef __x86_64__
2365 popl KLEN
2366 popl KEYP
2367 popl LEN
2368#endif
649 ret 2369 ret
650 2370
651/* 2371/*
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec)
653 * size_t len, u8 *iv) 2373 * size_t len, u8 *iv)
654 */ 2374 */
655ENTRY(aesni_cbc_enc) 2375ENTRY(aesni_cbc_enc)
2376#ifndef __x86_64__
2377 pushl IVP
2378 pushl LEN
2379 pushl KEYP
2380 pushl KLEN
2381 movl 20(%esp), KEYP
2382 movl 24(%esp), OUTP
2383 movl 28(%esp), INP
2384 movl 32(%esp), LEN
2385 movl 36(%esp), IVP
2386#endif
656 cmp $16, LEN 2387 cmp $16, LEN
657 jb .Lcbc_enc_ret 2388 jb .Lcbc_enc_ret
658 mov 480(KEYP), KLEN 2389 mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc)
670 jge .Lcbc_enc_loop 2401 jge .Lcbc_enc_loop
671 movups STATE, (IVP) 2402 movups STATE, (IVP)
672.Lcbc_enc_ret: 2403.Lcbc_enc_ret:
2404#ifndef __x86_64__
2405 popl KLEN
2406 popl KEYP
2407 popl LEN
2408 popl IVP
2409#endif
673 ret 2410 ret
674 2411
675/* 2412/*
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc)
677 * size_t len, u8 *iv) 2414 * size_t len, u8 *iv)
678 */ 2415 */
679ENTRY(aesni_cbc_dec) 2416ENTRY(aesni_cbc_dec)
2417#ifndef __x86_64__
2418 pushl IVP
2419 pushl LEN
2420 pushl KEYP
2421 pushl KLEN
2422 movl 20(%esp), KEYP
2423 movl 24(%esp), OUTP
2424 movl 28(%esp), INP
2425 movl 32(%esp), LEN
2426 movl 36(%esp), IVP
2427#endif
680 cmp $16, LEN 2428 cmp $16, LEN
681 jb .Lcbc_dec_just_ret 2429 jb .Lcbc_dec_just_ret
682 mov 480(KEYP), KLEN 2430 mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec)
690 movaps IN1, STATE1 2438 movaps IN1, STATE1
691 movups 0x10(INP), IN2 2439 movups 0x10(INP), IN2
692 movaps IN2, STATE2 2440 movaps IN2, STATE2
2441#ifdef __x86_64__
693 movups 0x20(INP), IN3 2442 movups 0x20(INP), IN3
694 movaps IN3, STATE3 2443 movaps IN3, STATE3
695 movups 0x30(INP), IN4 2444 movups 0x30(INP), IN4
696 movaps IN4, STATE4 2445 movaps IN4, STATE4
2446#else
2447 movups 0x20(INP), IN1
2448 movaps IN1, STATE3
2449 movups 0x30(INP), IN2
2450 movaps IN2, STATE4
2451#endif
697 call _aesni_dec4 2452 call _aesni_dec4
698 pxor IV, STATE1 2453 pxor IV, STATE1
2454#ifdef __x86_64__
699 pxor IN1, STATE2 2455 pxor IN1, STATE2
700 pxor IN2, STATE3 2456 pxor IN2, STATE3
701 pxor IN3, STATE4 2457 pxor IN3, STATE4
702 movaps IN4, IV 2458 movaps IN4, IV
2459#else
2460 pxor (INP), STATE2
2461 pxor 0x10(INP), STATE3
2462 pxor IN1, STATE4
2463 movaps IN2, IV
2464#endif
703 movups STATE1, (OUTP) 2465 movups STATE1, (OUTP)
704 movups STATE2, 0x10(OUTP) 2466 movups STATE2, 0x10(OUTP)
705 movups STATE3, 0x20(OUTP) 2467 movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec)
727.Lcbc_dec_ret: 2489.Lcbc_dec_ret:
728 movups IV, (IVP) 2490 movups IV, (IVP)
729.Lcbc_dec_just_ret: 2491.Lcbc_dec_just_ret:
2492#ifndef __x86_64__
2493 popl KLEN
2494 popl KEYP
2495 popl LEN
2496 popl IVP
2497#endif
730 ret 2498 ret
731 2499
2500#ifdef __x86_64__
732.align 16 2501.align 16
733.Lbswap_mask: 2502.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2503 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec)
744 * INC: == 1, in little endian 2513 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask 2514 * BSWAP_MASK == endian swapping mask
746 */ 2515 */
2516.align 4
747_aesni_inc_init: 2517_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK 2518 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR 2519 movaps IV, CTR
@@ -768,6 +2538,7 @@ _aesni_inc_init:
768 * CTR: == output IV, in little endian 2538 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR 2539 * TCTR_LOW: == lower qword of CTR
770 */ 2540 */
2541.align 4
771_aesni_inc: 2542_aesni_inc:
772 paddq INC, CTR 2543 paddq INC, CTR
773 add $1, TCTR_LOW 2544 add $1, TCTR_LOW
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc)
839 movups IV, (IVP) 2610 movups IV, (IVP)
840.Lctr_enc_just_ret: 2611.Lctr_enc_just_ret:
841 ret 2612 ret
2613#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..e1e60c7d5813 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
5 * Copyright (C) 2008, Intel Corp. 5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com> 6 * Author: Huang Ying <ying.huang@intel.com>
7 * 7 *
8 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
9 * interface for 64-bit kernels.
10 * Authors: Adrian Hoban <adrian.hoban@intel.com>
11 * Gabriele Paoloni <gabriele.paoloni@intel.com>
12 * Tadeusz Struk (tadeusz.struk@intel.com)
13 * Aidan O'Mahony (aidan.o.mahony@intel.com)
14 * Copyright (c) 2010, Intel Corporation.
15 *
8 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
21#include <crypto/ctr.h> 29#include <crypto/ctr.h>
22#include <asm/i387.h> 30#include <asm/i387.h>
23#include <asm/aes.h> 31#include <asm/aes.h>
32#include <crypto/scatterwalk.h>
33#include <crypto/internal/aead.h>
34#include <linux/workqueue.h>
35#include <linux/spinlock.h>
24 36
25#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) 37#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
26#define HAS_CTR 38#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
42 struct cryptd_ablkcipher *cryptd_tfm; 54 struct cryptd_ablkcipher *cryptd_tfm;
43}; 55};
44 56
45#define AESNI_ALIGN 16 57/* This data is stored at the end of the crypto_tfm struct.
58 * It's a type of per "session" data storage location.
59 * This needs to be 16 byte aligned.
60 */
61struct aesni_rfc4106_gcm_ctx {
62 u8 hash_subkey[16];
63 struct crypto_aes_ctx aes_key_expanded;
64 u8 nonce[4];
65 struct cryptd_aead *cryptd_tfm;
66};
67
68struct aesni_gcm_set_hash_subkey_result {
69 int err;
70 struct completion completion;
71};
72
73struct aesni_hash_subkey_req_data {
74 u8 iv[16];
75 struct aesni_gcm_set_hash_subkey_result result;
76 struct scatterlist sg;
77};
78
79#define AESNI_ALIGN (16)
46#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
81#define RFC4106_HASH_SUBKEY_SIZE 16
47 82
48asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
49 unsigned int key_len); 84 unsigned int key_len);
@@ -59,9 +94,62 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
59 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
61 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97#ifdef CONFIG_X86_64
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 98asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv); 99 const u8 *in, unsigned int len, u8 *iv);
64 100
101/* asmlinkage void aesni_gcm_enc()
102 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
103 * u8 *out, Ciphertext output. Encrypt in-place is allowed.
104 * const u8 *in, Plaintext input
105 * unsigned long plaintext_len, Length of data in bytes for encryption.
106 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
107 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
108 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
109 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
110 * const u8 *aad, Additional Authentication Data (AAD)
111 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
112 * is going to be 8 or 12 bytes
113 * u8 *auth_tag, Authenticated Tag output.
114 * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
115 * Valid values are 16 (most likely), 12 or 8.
116 */
117asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
118 const u8 *in, unsigned long plaintext_len, u8 *iv,
119 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
120 u8 *auth_tag, unsigned long auth_tag_len);
121
122/* asmlinkage void aesni_gcm_dec()
123 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
124 * u8 *out, Plaintext output. Decrypt in-place is allowed.
125 * const u8 *in, Ciphertext input
126 * unsigned long ciphertext_len, Length of data in bytes for decryption.
127 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
128 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
129 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
130 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
131 * const u8 *aad, Additional Authentication Data (AAD)
132 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
133 * to be 8 or 12 bytes
134 * u8 *auth_tag, Authenticated Tag output.
135 * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
136 * Valid values are 16 (most likely), 12 or 8.
137 */
138asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
139 const u8 *in, unsigned long ciphertext_len, u8 *iv,
140 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
141 u8 *auth_tag, unsigned long auth_tag_len);
142
143static inline struct
144aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
145{
146 return
147 (struct aesni_rfc4106_gcm_ctx *)
148 PTR_ALIGN((u8 *)
149 crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
150}
151#endif
152
65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 153static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
66{ 154{
67 unsigned long addr = (unsigned long)raw_ctx; 155 unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = {
324 }, 412 },
325}; 413};
326 414
415#ifdef CONFIG_X86_64
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx, 416static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk) 417 struct blkcipher_walk *walk)
329{ 418{
@@ -389,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = {
389 }, 478 },
390 }, 479 },
391}; 480};
481#endif
392 482
393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 483static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
394 unsigned int key_len) 484 unsigned int key_len)
@@ -536,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = {
536 }, 626 },
537}; 627};
538 628
629#ifdef CONFIG_X86_64
539static int ablk_ctr_init(struct crypto_tfm *tfm) 630static int ablk_ctr_init(struct crypto_tfm *tfm)
540{ 631{
541 struct cryptd_ablkcipher *cryptd_tfm; 632 struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
612 }, 703 },
613}; 704};
614#endif 705#endif
706#endif
615 707
616#ifdef HAS_LRW 708#ifdef HAS_LRW
617static int ablk_lrw_init(struct crypto_tfm *tfm) 709static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +822,424 @@ static struct crypto_alg ablk_xts_alg = {
730}; 822};
731#endif 823#endif
732 824
825#ifdef CONFIG_X86_64
826static int rfc4106_init(struct crypto_tfm *tfm)
827{
828 struct cryptd_aead *cryptd_tfm;
829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
831 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
832 if (IS_ERR(cryptd_tfm))
833 return PTR_ERR(cryptd_tfm);
834 ctx->cryptd_tfm = cryptd_tfm;
835 tfm->crt_aead.reqsize = sizeof(struct aead_request)
836 + crypto_aead_reqsize(&cryptd_tfm->base);
837 return 0;
838}
839
840static void rfc4106_exit(struct crypto_tfm *tfm)
841{
842 struct aesni_rfc4106_gcm_ctx *ctx =
843 (struct aesni_rfc4106_gcm_ctx *)
844 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
845 if (!IS_ERR(ctx->cryptd_tfm))
846 cryptd_free_aead(ctx->cryptd_tfm);
847 return;
848}
849
850static void
851rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
852{
853 struct aesni_gcm_set_hash_subkey_result *result = req->data;
854
855 if (err == -EINPROGRESS)
856 return;
857 result->err = err;
858 complete(&result->completion);
859}
860
861static int
862rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
863{
864 struct crypto_ablkcipher *ctr_tfm;
865 struct ablkcipher_request *req;
866 int ret = -EINVAL;
867 struct aesni_hash_subkey_req_data *req_data;
868
869 ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
870 if (IS_ERR(ctr_tfm))
871 return PTR_ERR(ctr_tfm);
872
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) {
877 crypto_free_ablkcipher(ctr_tfm);
878 return ret;
879 }
880
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) {
883 crypto_free_ablkcipher(ctr_tfm);
884 return -EINVAL;
885 }
886
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) {
889 crypto_free_ablkcipher(ctr_tfm);
890 return -ENOMEM;
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv));
893
894 /* Clear the data in the hash sub key container to zero.*/
895 /* We want to cipher all zeros to create the hash sub key. */
896 memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
897
898 init_completion(&req_data->result.completion);
899 sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
900 ablkcipher_request_set_tfm(req, ctr_tfm);
901 ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
902 CRYPTO_TFM_REQ_MAY_BACKLOG,
903 rfc4106_set_hash_subkey_done,
904 &req_data->result);
905
906 ablkcipher_request_set_crypt(req, &req_data->sg,
907 &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
908
909 ret = crypto_ablkcipher_encrypt(req);
910 if (ret == -EINPROGRESS || ret == -EBUSY) {
911 ret = wait_for_completion_interruptible
912 (&req_data->result.completion);
913 if (!ret)
914 ret = req_data->result.err;
915 }
916 ablkcipher_request_free(req);
917 kfree(req_data);
918 crypto_free_ablkcipher(ctr_tfm);
919 return ret;
920}
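921
922/* rfc4106_set_hash_subkey() above derives the GHASH subkey by encrypting an
923 * all-zero block through ctr(aes) with an all-zero IV, which is equivalent to
924 * H = AES-128_K(0^128). A synchronous sketch of the same derivation
925 * (illustrative only; the helper name is made up and this is not how the
926 * patch does it):
927 *
928 *	#include <linux/types.h>
929 *	#include <linux/crypto.h>
930 *	#include <linux/err.h>
931 *
932 *	static int derive_hash_subkey_sketch(u8 hash_subkey[16],
933 *					     const u8 *key, unsigned int key_len)
934 *	{
935 *		struct crypto_cipher *aes;
936 *		static const u8 zeroes[16];
937 *		int ret;
938 *
939 *		aes = crypto_alloc_cipher("aes", 0, 0);
940 *		if (IS_ERR(aes))
941 *			return PTR_ERR(aes);
942 *		ret = crypto_cipher_setkey(aes, key, key_len);
943 *		if (!ret)
944 *			crypto_cipher_encrypt_one(aes, hash_subkey, zeroes);
945 *		crypto_free_cipher(aes);
946 *		return ret;
947 *	}
948 */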
921
922static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
923 unsigned int key_len)
924{
925 int ret = 0;
926 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
927 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
928 u8 *new_key_mem = NULL;
929
930 if (key_len < 4) {
931 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
932 return -EINVAL;
933 }
934 /*Account for 4 byte nonce at the end.*/
935 key_len -= 4;
936 if (key_len != AES_KEYSIZE_128) {
937 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
938 return -EINVAL;
939 }
940
941 memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
942 /*This must be on a 16 byte boundary!*/
943 if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
944 return -EINVAL;
945
946 if ((unsigned long)key % AESNI_ALIGN) {
947		/*key is not aligned: use an auxiliary aligned pointer*/
948 new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
949 if (!new_key_mem)
950 return -ENOMEM;
951
952 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
953 memcpy(new_key_mem, key, key_len);
954 key = new_key_mem;
955 }
956
957 if (!irq_fpu_usable())
958 ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
959 key, key_len);
960 else {
961 kernel_fpu_begin();
962 ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
963 kernel_fpu_end();
964 }
965 /*This must be on a 16 byte boundary!*/
966 if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
967 ret = -EINVAL;
968 goto exit;
969 }
970 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
971exit:
972 kfree(new_key_mem);
973 return ret;
974}
975
976/* This is the Integrity Check Value (aka the authentication tag) length and can
977 * be 8, 12 or 16 bytes long. */
978static int rfc4106_set_authsize(struct crypto_aead *parent,
979 unsigned int authsize)
980{
981 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
982 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
983
984 switch (authsize) {
985 case 8:
986 case 12:
987 case 16:
988 break;
989 default:
990 return -EINVAL;
991 }
992 crypto_aead_crt(parent)->authsize = authsize;
993 crypto_aead_crt(cryptd_child)->authsize = authsize;
994 return 0;
995}
996
997static int rfc4106_encrypt(struct aead_request *req)
998{
999 int ret;
1000 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1001 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1002 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1003
1004 if (!irq_fpu_usable()) {
1005 struct aead_request *cryptd_req =
1006 (struct aead_request *) aead_request_ctx(req);
1007 memcpy(cryptd_req, req, sizeof(*req));
1008 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1009 return crypto_aead_encrypt(cryptd_req);
1010 } else {
1011 kernel_fpu_begin();
1012 ret = cryptd_child->base.crt_aead.encrypt(req);
1013 kernel_fpu_end();
1014 return ret;
1015 }
1016}
1017
1018static int rfc4106_decrypt(struct aead_request *req)
1019{
1020 int ret;
1021 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1022 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1023 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1024
1025 if (!irq_fpu_usable()) {
1026 struct aead_request *cryptd_req =
1027 (struct aead_request *) aead_request_ctx(req);
1028 memcpy(cryptd_req, req, sizeof(*req));
1029 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1030 return crypto_aead_decrypt(cryptd_req);
1031 } else {
1032 kernel_fpu_begin();
1033 ret = cryptd_child->base.crt_aead.decrypt(req);
1034 kernel_fpu_end();
1035 return ret;
1036 }
1037}
1038
1039static struct crypto_alg rfc4106_alg = {
1040 .cra_name = "rfc4106(gcm(aes))",
1041 .cra_driver_name = "rfc4106-gcm-aesni",
1042 .cra_priority = 400,
1043 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1044 .cra_blocksize = 1,
1045 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1046 .cra_alignmask = 0,
1047 .cra_type = &crypto_nivaead_type,
1048 .cra_module = THIS_MODULE,
1049 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1050 .cra_init = rfc4106_init,
1051 .cra_exit = rfc4106_exit,
1052 .cra_u = {
1053 .aead = {
1054 .setkey = rfc4106_set_key,
1055 .setauthsize = rfc4106_set_authsize,
1056 .encrypt = rfc4106_encrypt,
1057 .decrypt = rfc4106_decrypt,
1058 .geniv = "seqiv",
1059 .ivsize = 8,
1060 .maxauthsize = 16,
1061 },
1062 },
1063};
1064
1065static int __driver_rfc4106_encrypt(struct aead_request *req)
1066{
1067 u8 one_entry_in_sg = 0;
1068 u8 *src, *dst, *assoc;
1069 __be32 counter = cpu_to_be32(1);
1070 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1071 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1072 void *aes_ctx = &(ctx->aes_key_expanded);
1073 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1074 u8 iv_tab[16+AESNI_ALIGN];
1075 u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
1076 struct scatter_walk src_sg_walk;
1077 struct scatter_walk assoc_sg_walk;
1078 struct scatter_walk dst_sg_walk;
1079 unsigned int i;
1080
1081 /* Assuming we are supporting rfc4106 64-bit extended */
1082	/* sequence numbers, we need to have the AAD length equal */
1083 /* to 8 or 12 bytes */
1084 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
1085 return -EINVAL;
1086	/* Build the IV below */
1087 for (i = 0; i < 4; i++)
1088 *(iv+i) = ctx->nonce[i];
1089 for (i = 0; i < 8; i++)
1090 *(iv+4+i) = req->iv[i];
1091 *((__be32 *)(iv+12)) = counter;
1092
1093 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1094 one_entry_in_sg = 1;
1095 scatterwalk_start(&src_sg_walk, req->src);
1096 scatterwalk_start(&assoc_sg_walk, req->assoc);
1097 src = scatterwalk_map(&src_sg_walk, 0);
1098 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1099 dst = src;
1100 if (unlikely(req->src != req->dst)) {
1101 scatterwalk_start(&dst_sg_walk, req->dst);
1102 dst = scatterwalk_map(&dst_sg_walk, 0);
1103 }
1104
1105 } else {
1106 /* Allocate memory for src, dst, assoc */
1107 src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
1108 GFP_ATOMIC);
1109 if (unlikely(!src))
1110 return -ENOMEM;
1111 assoc = (src + req->cryptlen + auth_tag_len);
1112 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1113 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1114 req->assoclen, 0);
1115 dst = src;
1116 }
1117
1118 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
1119 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
1120 + ((unsigned long)req->cryptlen), auth_tag_len);
1121
1122 /* The authTag (aka the Integrity Check Value) needs to be written
1123 * back to the packet. */
1124 if (one_entry_in_sg) {
1125 if (unlikely(req->src != req->dst)) {
1126 scatterwalk_unmap(dst, 0);
1127 scatterwalk_done(&dst_sg_walk, 0, 0);
1128 }
1129 scatterwalk_unmap(src, 0);
1130 scatterwalk_unmap(assoc, 0);
1131 scatterwalk_done(&src_sg_walk, 0, 0);
1132 scatterwalk_done(&assoc_sg_walk, 0, 0);
1133 } else {
1134 scatterwalk_map_and_copy(dst, req->dst, 0,
1135 req->cryptlen + auth_tag_len, 1);
1136 kfree(src);
1137 }
1138 return 0;
1139}
1140
1141static int __driver_rfc4106_decrypt(struct aead_request *req)
1142{
1143 u8 one_entry_in_sg = 0;
1144 u8 *src, *dst, *assoc;
1145 unsigned long tempCipherLen = 0;
1146 __be32 counter = cpu_to_be32(1);
1147 int retval = 0;
1148 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1149 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1150 void *aes_ctx = &(ctx->aes_key_expanded);
1151 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1152 u8 iv_and_authTag[32+AESNI_ALIGN];
1153 u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
1154 u8 *authTag = iv + 16;
1155 struct scatter_walk src_sg_walk;
1156 struct scatter_walk assoc_sg_walk;
1157 struct scatter_walk dst_sg_walk;
1158 unsigned int i;
1159
1160 if (unlikely((req->cryptlen < auth_tag_len) ||
1161 (req->assoclen != 8 && req->assoclen != 12)))
1162 return -EINVAL;
1163 /* Assuming we are supporting rfc4106 64-bit extended */
1164	/* sequence numbers, we need to have the AAD length */
1165 /* equal to 8 or 12 bytes */
1166
1167 tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
1168	/* Build the IV below */
1169 for (i = 0; i < 4; i++)
1170 *(iv+i) = ctx->nonce[i];
1171 for (i = 0; i < 8; i++)
1172 *(iv+4+i) = req->iv[i];
1173 *((__be32 *)(iv+12)) = counter;
1174
1175 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1176 one_entry_in_sg = 1;
1177 scatterwalk_start(&src_sg_walk, req->src);
1178 scatterwalk_start(&assoc_sg_walk, req->assoc);
1179 src = scatterwalk_map(&src_sg_walk, 0);
1180 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1181 dst = src;
1182 if (unlikely(req->src != req->dst)) {
1183 scatterwalk_start(&dst_sg_walk, req->dst);
1184 dst = scatterwalk_map(&dst_sg_walk, 0);
1185 }
1186
1187 } else {
1188 /* Allocate memory for src, dst, assoc */
1189 src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
1190 if (!src)
1191 return -ENOMEM;
1192 assoc = (src + req->cryptlen + auth_tag_len);
1193 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1194 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1195 req->assoclen, 0);
1196 dst = src;
1197 }
1198
1199 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
1200 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1201 authTag, auth_tag_len);
1202
1203 /* Compare generated tag with passed in tag. */
1204 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
1205 -EBADMSG : 0;
1206
1207 if (one_entry_in_sg) {
1208 if (unlikely(req->src != req->dst)) {
1209 scatterwalk_unmap(dst, 0);
1210 scatterwalk_done(&dst_sg_walk, 0, 0);
1211 }
1212 scatterwalk_unmap(src, 0);
1213 scatterwalk_unmap(assoc, 0);
1214 scatterwalk_done(&src_sg_walk, 0, 0);
1215 scatterwalk_done(&assoc_sg_walk, 0, 0);
1216 } else {
1217 scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
1218 kfree(src);
1219 }
1220 return retval;
1221}
1222
1223static struct crypto_alg __rfc4106_alg = {
1224 .cra_name = "__gcm-aes-aesni",
1225 .cra_driver_name = "__driver-gcm-aes-aesni",
1226 .cra_priority = 0,
1227 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1228 .cra_blocksize = 1,
1229 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1230 .cra_alignmask = 0,
1231 .cra_type = &crypto_aead_type,
1232 .cra_module = THIS_MODULE,
1233 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1234 .cra_u = {
1235 .aead = {
1236 .encrypt = __driver_rfc4106_encrypt,
1237 .decrypt = __driver_rfc4106_decrypt,
1238 },
1239 },
1240};
1241#endif
1242
733static int __init aesni_init(void) 1243static int __init aesni_init(void)
734{ 1244{
735 int err; 1245 int err;
@@ -738,6 +1248,7 @@ static int __init aesni_init(void)
738 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); 1248 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
739 return -ENODEV; 1249 return -ENODEV;
740 } 1250 }
1251
741 if ((err = crypto_register_alg(&aesni_alg))) 1252 if ((err = crypto_register_alg(&aesni_alg)))
742 goto aes_err; 1253 goto aes_err;
743 if ((err = crypto_register_alg(&__aesni_alg))) 1254 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1257,24 @@ static int __init aesni_init(void)
746 goto blk_ecb_err; 1257 goto blk_ecb_err;
747 if ((err = crypto_register_alg(&blk_cbc_alg))) 1258 if ((err = crypto_register_alg(&blk_cbc_alg)))
748 goto blk_cbc_err; 1259 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
751 if ((err = crypto_register_alg(&ablk_ecb_alg))) 1260 if ((err = crypto_register_alg(&ablk_ecb_alg)))
752 goto ablk_ecb_err; 1261 goto ablk_ecb_err;
753 if ((err = crypto_register_alg(&ablk_cbc_alg))) 1262 if ((err = crypto_register_alg(&ablk_cbc_alg)))
754 goto ablk_cbc_err; 1263 goto ablk_cbc_err;
1264#ifdef CONFIG_X86_64
1265 if ((err = crypto_register_alg(&blk_ctr_alg)))
1266 goto blk_ctr_err;
755 if ((err = crypto_register_alg(&ablk_ctr_alg))) 1267 if ((err = crypto_register_alg(&ablk_ctr_alg)))
756 goto ablk_ctr_err; 1268 goto ablk_ctr_err;
1269 if ((err = crypto_register_alg(&__rfc4106_alg)))
1270 goto __aead_gcm_err;
1271 if ((err = crypto_register_alg(&rfc4106_alg)))
1272 goto aead_gcm_err;
757#ifdef HAS_CTR 1273#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) 1274 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err; 1275 goto ablk_rfc3686_ctr_err;
760#endif 1276#endif
1277#endif
761#ifdef HAS_LRW 1278#ifdef HAS_LRW
762 if ((err = crypto_register_alg(&ablk_lrw_alg))) 1279 if ((err = crypto_register_alg(&ablk_lrw_alg)))
763 goto ablk_lrw_err; 1280 goto ablk_lrw_err;
@@ -770,7 +1287,6 @@ static int __init aesni_init(void)
770 if ((err = crypto_register_alg(&ablk_xts_alg))) 1287 if ((err = crypto_register_alg(&ablk_xts_alg)))
771 goto ablk_xts_err; 1288 goto ablk_xts_err;
772#endif 1289#endif
773
774 return err; 1290 return err;
775 1291
776#ifdef HAS_XTS 1292#ifdef HAS_XTS
@@ -784,18 +1300,24 @@ ablk_pcbc_err:
784 crypto_unregister_alg(&ablk_lrw_alg); 1300 crypto_unregister_alg(&ablk_lrw_alg);
785ablk_lrw_err: 1301ablk_lrw_err:
786#endif 1302#endif
1303#ifdef CONFIG_X86_64
787#ifdef HAS_CTR 1304#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1305 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err: 1306ablk_rfc3686_ctr_err:
790#endif 1307#endif
1308 crypto_unregister_alg(&rfc4106_alg);
1309aead_gcm_err:
1310 crypto_unregister_alg(&__rfc4106_alg);
1311__aead_gcm_err:
791 crypto_unregister_alg(&ablk_ctr_alg); 1312 crypto_unregister_alg(&ablk_ctr_alg);
792ablk_ctr_err: 1313ablk_ctr_err:
1314 crypto_unregister_alg(&blk_ctr_alg);
1315blk_ctr_err:
1316#endif
793 crypto_unregister_alg(&ablk_cbc_alg); 1317 crypto_unregister_alg(&ablk_cbc_alg);
794ablk_cbc_err: 1318ablk_cbc_err:
795 crypto_unregister_alg(&ablk_ecb_alg); 1319 crypto_unregister_alg(&ablk_ecb_alg);
796ablk_ecb_err: 1320ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
799 crypto_unregister_alg(&blk_cbc_alg); 1321 crypto_unregister_alg(&blk_cbc_alg);
800blk_cbc_err: 1322blk_cbc_err:
801 crypto_unregister_alg(&blk_ecb_alg); 1323 crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +1340,17 @@ static void __exit aesni_exit(void)
818#ifdef HAS_LRW 1340#ifdef HAS_LRW
819 crypto_unregister_alg(&ablk_lrw_alg); 1341 crypto_unregister_alg(&ablk_lrw_alg);
820#endif 1342#endif
1343#ifdef CONFIG_X86_64
821#ifdef HAS_CTR 1344#ifdef HAS_CTR
822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1345 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
823#endif 1346#endif
1347 crypto_unregister_alg(&rfc4106_alg);
1348 crypto_unregister_alg(&__rfc4106_alg);
824 crypto_unregister_alg(&ablk_ctr_alg); 1349 crypto_unregister_alg(&ablk_ctr_alg);
1350 crypto_unregister_alg(&blk_ctr_alg);
1351#endif
825 crypto_unregister_alg(&ablk_cbc_alg); 1352 crypto_unregister_alg(&ablk_cbc_alg);
826 crypto_unregister_alg(&ablk_ecb_alg); 1353 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
828 crypto_unregister_alg(&blk_cbc_alg); 1354 crypto_unregister_alg(&blk_cbc_alg);
829 crypto_unregister_alg(&blk_ecb_alg); 1355 crypto_unregister_alg(&blk_ecb_alg);
830 crypto_unregister_alg(&__aesni_alg); 1356 crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 55d106b5e31b..211ca3f7fd16 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,17 +185,16 @@ struct bootnode;
185 185
186#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
187extern int acpi_numa; 187extern int acpi_numa;
188extern int acpi_get_nodes(struct bootnode *physnodes); 188extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
189 unsigned long end);
189extern int acpi_scan_nodes(unsigned long start, unsigned long end); 190extern int acpi_scan_nodes(unsigned long start, unsigned long end);
190#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 191#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
192
193#ifdef CONFIG_NUMA_EMU
191extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 194extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
192 int num_nodes); 195 int num_nodes);
193#else
194static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
195 int num_nodes)
196{
197}
198#endif 196#endif
197#endif /* CONFIG_ACPI_NUMA */
199 198
200#define acpi_unlazy_tlb(x) leave_mm(x) 199#define acpi_unlazy_tlb(x) leave_mm(x)
201 200
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 6aee50d655d1..64dc82ee19f0 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,16 +3,27 @@
3 3
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6struct amd_nb_bus_dev_range {
7 u8 bus;
8 u8 dev_base;
9 u8 dev_limit;
10};
11
6extern struct pci_device_id amd_nb_misc_ids[]; 12extern struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
7struct bootnode; 14struct bootnode;
8 15
9extern int early_is_amd_nb(u32 value); 16extern int early_is_amd_nb(u32 value);
10extern int amd_cache_northbridges(void); 17extern int amd_cache_northbridges(void);
11extern void amd_flush_garts(void); 18extern void amd_flush_garts(void);
12extern int amd_get_nodes(struct bootnode *nodes);
13extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); 19extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
14extern int amd_scan_nodes(void); 20extern int amd_scan_nodes(void);
15 21
22#ifdef CONFIG_NUMA_EMU
23extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
24extern void amd_get_nodes(struct bootnode *nodes);
25#endif
26
16struct amd_northbridge { 27struct amd_northbridge {
17 struct pci_dev *misc; 28 struct pci_dev *misc;
18}; 29};
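
The new amd_nb_bus_dev_ranges[] table describes the PCI bus/device windows where AMD northbridge functions can live, instead of a single hard-coded range. A sketch of how a caller might walk it, assuming the array is terminated by an entry whose dev_limit is zero (an assumption, not spelled out in the header):

#include <asm/amd_nb.h>

static void example_walk_nb_slots(void)
{
        int i, slot;

        for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
                const struct amd_nb_bus_dev_range *r = &amd_nb_bus_dev_ranges[i];

                for (slot = r->dev_base; slot < r->dev_limit; slot++) {
                        /* probe PCI bus r->bus, device slot here */
                }
        }
}
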
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3b62ab56c7a0..5e1a2eef3e7c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -32,11 +32,7 @@
32#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
33#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
34 34
35#ifdef CONFIG_X86_64 35#define BOOT_HEAP_SIZE 0x8000
36#define BOOT_HEAP_SIZE 0x7000
37#else
38#define BOOT_HEAP_SIZE 0x4000
39#endif
40 36
41#endif /* !CONFIG_KERNEL_BZIP2 */ 37#endif /* !CONFIG_KERNEL_BZIP2 */
42 38
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 0141b234406f..4729b2b63117 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -116,11 +116,11 @@ enum fixed_addresses {
116#endif 116#endif
117 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ 117 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
118 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ 118 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
119 __end_of_permanent_fixed_addresses,
120
121#ifdef CONFIG_X86_MRST 119#ifdef CONFIG_X86_MRST
122 FIX_LNW_VRTC, 120 FIX_LNW_VRTC,
123#endif 121#endif
122 __end_of_permanent_fixed_addresses,
123
124 /* 124 /*
125 * 256 temporary boot-time mappings, used by early_ioremap(), 125 * 256 temporary boot-time mappings, used by early_ioremap(),
126 * before ioremap() is functional. 126 * before ioremap() is functional.
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 49dbfdfa50f9..91d915a65259 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio)
38 return __gpio_cansleep(gpio); 38 return __gpio_cansleep(gpio);
39} 39}
40 40
41/*
42 * Not implemented, yet.
43 */
44static inline int gpio_to_irq(unsigned int gpio) 41static inline int gpio_to_irq(unsigned int gpio)
45{ 42{
46 return -ENOSYS; 43 return __gpio_to_irq(gpio);
47} 44}
48 45
49static inline int irq_to_gpio(unsigned int irq) 46static inline int irq_to_gpio(unsigned int irq)
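
With gpio_to_irq() now forwarding to gpiolib's __gpio_to_irq() instead of returning -ENOSYS, x86 drivers can map a GPIO to an interrupt the generic way. A hedged sketch; the GPIO number and handler are purely illustrative:

#include <linux/gpio.h>
#include <linux/interrupt.h>

#define EXAMPLE_GPIO    7       /* illustrative GPIO number */

static irqreturn_t example_isr(int irq, void *dev_id)
{
        /* acknowledge/handle the event here */
        return IRQ_HANDLED;
}

static int example_setup(void)
{
        int irq, err;

        err = gpio_request(EXAMPLE_GPIO, "example");
        if (err)
                return err;

        irq = gpio_to_irq(EXAMPLE_GPIO);        /* resolved via gpiolib now */
        if (irq < 0) {
                gpio_free(EXAMPLE_GPIO);
                return irq;
        }

        return request_irq(irq, example_isr, IRQF_TRIGGER_FALLING,
                           "example", NULL);
}
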
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ba870bb6dd8e..c704b38c57a2 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,6 +10,9 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
13static inline int irq_canonicalize(int irq) 16static inline int irq_canonicalize(int irq)
14{ 17{
15 return ((irq == 2) ? 9 : irq); 18 return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f23eb2528464..ca242d35e873 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
18 DIE_TRAP, 18 DIE_TRAP,
19 DIE_GPF, 19 DIE_GPF,
20 DIE_CALL, 20 DIE_CALL,
21 DIE_NMI_IPI,
22 DIE_PAGE_FAULT, 21 DIE_PAGE_FAULT,
23 DIE_NMIUNKNOWN, 22 DIE_NMIUNKNOWN,
24}; 23};
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17 17
18struct x86_exception {
19 u8 vector;
20 bool error_code_valid;
21 u16 error_code;
22 bool nested_page_fault;
23 u64 address; /* cr2 or nested page fault gpa */
24};
25
18/* 26/*
19 * x86_emulate_ops: 27 * x86_emulate_ops:
20 * 28 *
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
64 * @bytes: [IN ] Number of bytes to read from memory. 72 * @bytes: [IN ] Number of bytes to read from memory.
65 */ 73 */
66 int (*read_std)(unsigned long addr, void *val, 74 int (*read_std)(unsigned long addr, void *val,
67 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 75 unsigned int bytes, struct kvm_vcpu *vcpu,
76 struct x86_exception *fault);
68 77
69 /* 78 /*
70 * write_std: Write bytes of standard (non-emulated/special) memory. 79 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
74 * @bytes: [IN ] Number of bytes to write to memory. 83 * @bytes: [IN ] Number of bytes to write to memory.
75 */ 84 */
76 int (*write_std)(unsigned long addr, void *val, 85 int (*write_std)(unsigned long addr, void *val,
77 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 86 unsigned int bytes, struct kvm_vcpu *vcpu,
87 struct x86_exception *fault);
78 /* 88 /*
79 * fetch: Read bytes of standard (non-emulated/special) memory. 89 * fetch: Read bytes of standard (non-emulated/special) memory.
80 * Used for instruction fetch. 90 * Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
83 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
84 */ 94 */
85 int (*fetch)(unsigned long addr, void *val, 95 int (*fetch)(unsigned long addr, void *val,
86 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 96 unsigned int bytes, struct kvm_vcpu *vcpu,
97 struct x86_exception *fault);
87 98
88 /* 99 /*
89 * read_emulated: Read bytes from emulated/special memory area. 100 * read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
94 int (*read_emulated)(unsigned long addr, 105 int (*read_emulated)(unsigned long addr,
95 void *val, 106 void *val,
96 unsigned int bytes, 107 unsigned int bytes,
97 unsigned int *error, 108 struct x86_exception *fault,
98 struct kvm_vcpu *vcpu); 109 struct kvm_vcpu *vcpu);
99 110
100 /* 111 /*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
107 int (*write_emulated)(unsigned long addr, 118 int (*write_emulated)(unsigned long addr,
108 const void *val, 119 const void *val,
109 unsigned int bytes, 120 unsigned int bytes,
110 unsigned int *error, 121 struct x86_exception *fault,
111 struct kvm_vcpu *vcpu); 122 struct kvm_vcpu *vcpu);
112 123
113 /* 124 /*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
122 const void *old, 133 const void *old,
123 const void *new, 134 const void *new,
124 unsigned int bytes, 135 unsigned int bytes,
125 unsigned int *error, 136 struct x86_exception *fault,
126 struct kvm_vcpu *vcpu); 137 struct kvm_vcpu *vcpu);
127 138
128 int (*pio_in_emulated)(int size, unsigned short port, void *val, 139 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
159 }; 170 };
160 union { 171 union {
161 unsigned long *reg; 172 unsigned long *reg;
162 unsigned long mem; 173 struct segmented_address {
174 ulong ea;
175 unsigned seg;
176 } mem;
163 } addr; 177 } addr;
164 union { 178 union {
165 unsigned long val; 179 unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
226 240
227 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
228 242
229 int exception; /* exception that happens during emulation or -1 */ 243 bool have_exception;
230 u32 error_code; /* error code for exception */ 244 struct x86_exception exception;
231 bool error_code_valid;
232 245
233 /* decode cache */ 246 /* decode cache */
234 struct decode_cache decode; 247 struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
252#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 265#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
253#endif 266#endif
254 267
255int x86_decode_insn(struct x86_emulate_ctxt *ctxt); 268int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
256#define EMULATION_FAILED -1 269#define EMULATION_FAILED -1
257#define EMULATION_OK 0 270#define EMULATION_OK 0
258#define EMULATION_RESTART 1 271#define EMULATION_RESTART 1
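
The emulator callbacks now report faults through the new struct x86_exception instead of a bare u32 error code, so the vector, error code and faulting address travel together. A sketch of what a read_std-style implementation might look like under the new signature, loosely modelled on the KVM helpers and using kvm_mmu_gva_to_gpa_read() as updated in the kvm_host.h hunk below (not a verbatim copy):

#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>

static int example_read_std(unsigned long addr, void *val, unsigned int bytes,
                            struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
        /* the walker fills fault->vector/error_code/address on failure */
        gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, fault);

        if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;

        if (kvm_read_guest(vcpu->kvm, gpa, val, bytes))
                return X86EMUL_UNHANDLEABLE;

        return X86EMUL_CONTINUE;
}
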
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
83#define KVM_NR_FIXED_MTRR_REGION 88 83#define KVM_NR_FIXED_MTRR_REGION 88
84#define KVM_NR_VAR_MTRR 8 84#define KVM_NR_VAR_MTRR 8
85 85
86#define ASYNC_PF_PER_VCPU 64
87
86extern spinlock_t kvm_lock; 88extern spinlock_t kvm_lock;
87extern struct list_head vm_list; 89extern struct list_head vm_list;
88 90
89struct kvm_vcpu; 91struct kvm_vcpu;
90struct kvm; 92struct kvm;
93struct kvm_async_pf;
91 94
92enum kvm_reg { 95enum kvm_reg {
93 VCPU_REGS_RAX = 0, 96 VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
114 117
115enum kvm_reg_ex { 118enum kvm_reg_ex {
116 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 119 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
120 VCPU_EXREG_CR3,
117}; 121};
118 122
119enum { 123enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 242 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 243 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
240 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 244 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
241 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 245 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
242 void (*inject_page_fault)(struct kvm_vcpu *vcpu); 246 bool prefault);
247 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
248 struct x86_exception *fault);
243 void (*free)(struct kvm_vcpu *vcpu); 249 void (*free)(struct kvm_vcpu *vcpu);
244 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 250 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
245 u32 *error); 251 struct x86_exception *exception);
246 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 252 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
247 void (*prefetch_page)(struct kvm_vcpu *vcpu, 253 void (*prefetch_page)(struct kvm_vcpu *vcpu,
248 struct kvm_mmu_page *page); 254 struct kvm_mmu_page *page);
249 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
250 struct kvm_mmu_page *sp, bool clear_unsync); 256 struct kvm_mmu_page *sp);
251 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
252 hpa_t root_hpa; 258 hpa_t root_hpa;
253 int root_level; 259 int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
315 */ 321 */
316 struct kvm_mmu *walk_mmu; 322 struct kvm_mmu *walk_mmu;
317 323
318 /*
319 * This struct is filled with the necessary information to propagate a
320 * page fault into the guest
321 */
322 struct {
323 u64 address;
324 unsigned error_code;
325 bool nested;
326 } fault;
327
328 /* only needed in kvm_pv_mmu_op() path, but it's hot so 324 /* only needed in kvm_pv_mmu_op() path, but it's hot so
329 * put it here to avoid allocation */ 325 * put it here to avoid allocation */
330 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 326 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
412 u64 hv_vapic; 408 u64 hv_vapic;
413 409
414 cpumask_var_t wbinvd_dirty_mask; 410 cpumask_var_t wbinvd_dirty_mask;
411
412 struct {
413 bool halted;
414 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
415 struct gfn_to_hva_cache data;
416 u64 msr_val;
417 u32 id;
418 bool send_user_only;
419 } apf;
415}; 420};
416 421
417struct kvm_arch { 422struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
456 /* fields used by HYPER-V emulation */ 461 /* fields used by HYPER-V emulation */
457 u64 hv_guest_os_id; 462 u64 hv_guest_os_id;
458 u64 hv_hypercall; 463 u64 hv_hypercall;
464
465 #ifdef CONFIG_KVM_MMU_AUDIT
466 int audit_point;
467 #endif
459}; 468};
460 469
461struct kvm_vm_stat { 470struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
529 struct kvm_segment *var, int seg); 538 struct kvm_segment *var, int seg);
530 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 539 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
531 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); 540 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
541 void (*decache_cr3)(struct kvm_vcpu *vcpu);
532 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 542 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
533 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 543 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
534 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 544 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
582 592
583 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 593 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
584 594
595 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
585 const struct trace_print_flags *exit_reasons_str; 596 const struct trace_print_flags *exit_reasons_str;
586}; 597};
587 598
599struct kvm_arch_async_pf {
600 u32 token;
601 gfn_t gfn;
602 unsigned long cr3;
603 bool direct_map;
604};
605
588extern struct kvm_x86_ops *kvm_x86_ops; 606extern struct kvm_x86_ops *kvm_x86_ops;
589 607
590int kvm_mmu_module_init(void); 608int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
594int kvm_mmu_create(struct kvm_vcpu *vcpu); 612int kvm_mmu_create(struct kvm_vcpu *vcpu);
595int kvm_mmu_setup(struct kvm_vcpu *vcpu); 613int kvm_mmu_setup(struct kvm_vcpu *vcpu);
596void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 614void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
597void kvm_mmu_set_base_ptes(u64 base_pte);
598void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 615void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
599 u64 dirty_mask, u64 nx_mask, u64 x_mask); 616 u64 dirty_mask, u64 nx_mask, u64 x_mask);
600 617
@@ -623,8 +640,15 @@ enum emulation_result {
623#define EMULTYPE_NO_DECODE (1 << 0) 640#define EMULTYPE_NO_DECODE (1 << 0)
624#define EMULTYPE_TRAP_UD (1 << 1) 641#define EMULTYPE_TRAP_UD (1 << 1)
625#define EMULTYPE_SKIP (1 << 2) 642#define EMULTYPE_SKIP (1 << 2)
626int emulate_instruction(struct kvm_vcpu *vcpu, 643int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
627 unsigned long cr2, u16 error_code, int emulation_type); 644 int emulation_type, void *insn, int insn_len);
645
646static inline int emulate_instruction(struct kvm_vcpu *vcpu,
647 int emulation_type)
648{
649 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
650}
651
628void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 652void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
629void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 653void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
630 654
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
650int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 674int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
651int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 675int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
652int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 676int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
653void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 677int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
654int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 678int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
655int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 679int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
656unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 680unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
668void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 692void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
669void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 693void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
670void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 694void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
671void kvm_inject_page_fault(struct kvm_vcpu *vcpu); 695void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
672int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 696int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
673 gfn_t gfn, void *data, int offset, int len, 697 gfn_t gfn, void *data, int offset, int len,
674 u32 access); 698 u32 access);
675void kvm_propagate_fault(struct kvm_vcpu *vcpu); 699void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
676bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 700bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
677 701
678int kvm_pic_set_irq(void *opaque, int irq, int level); 702int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
690int kvm_mmu_load(struct kvm_vcpu *vcpu); 714int kvm_mmu_load(struct kvm_vcpu *vcpu);
691void kvm_mmu_unload(struct kvm_vcpu *vcpu); 715void kvm_mmu_unload(struct kvm_vcpu *vcpu);
692void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 716void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
693gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 717gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
694gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 718 struct x86_exception *exception);
695gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 719gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
696gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 720 struct x86_exception *exception);
721gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
722 struct x86_exception *exception);
723gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
724 struct x86_exception *exception);
697 725
698int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 726int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
699 727
700int kvm_fix_hypercall(struct kvm_vcpu *vcpu); 728int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
701 729
702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); 730int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
731 void *insn, int insn_len);
703void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 732void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
704 733
705void kvm_enable_tdp(void); 734void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
766#define HF_VINTR_MASK (1 << 2) 795#define HF_VINTR_MASK (1 << 2)
767#define HF_NMI_MASK (1 << 3) 796#define HF_NMI_MASK (1 << 3)
768#define HF_IRET_MASK (1 << 4) 797#define HF_IRET_MASK (1 << 4)
798#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
769 799
770/* 800/*
771 * Hardware virtualization extension instructions may fault if a 801 * Hardware virtualization extension instructions may fault if a
772 * reboot turns off virtualization while processes are running. 802 * reboot turns off virtualization while processes are running.
773 * Trap the fault and ignore the instruction if that happens. 803 * Trap the fault and ignore the instruction if that happens.
774 */ 804 */
775asmlinkage void kvm_handle_fault_on_reboot(void); 805asmlinkage void kvm_spurious_fault(void);
806extern bool kvm_rebooting;
776 807
777#define __kvm_handle_fault_on_reboot(insn) \ 808#define __kvm_handle_fault_on_reboot(insn) \
778 "666: " insn "\n\t" \ 809 "666: " insn "\n\t" \
810 "668: \n\t" \
779 ".pushsection .fixup, \"ax\" \n" \ 811 ".pushsection .fixup, \"ax\" \n" \
780 "667: \n\t" \ 812 "667: \n\t" \
813 "cmpb $0, kvm_rebooting \n\t" \
814 "jne 668b \n\t" \
781 __ASM_SIZE(push) " $666b \n\t" \ 815 __ASM_SIZE(push) " $666b \n\t" \
782 "jmp kvm_handle_fault_on_reboot \n\t" \ 816 "call kvm_spurious_fault \n\t" \
783 ".popsection \n\t" \ 817 ".popsection \n\t" \
784 ".pushsection __ex_table, \"a\" \n\t" \ 818 ".pushsection __ex_table, \"a\" \n\t" \
785 _ASM_PTR " 666b, 667b \n\t" \ 819 _ASM_PTR " 666b, 667b \n\t" \
@@ -788,6 +822,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
788#define KVM_ARCH_WANT_MMU_NOTIFIER 822#define KVM_ARCH_WANT_MMU_NOTIFIER
789int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
790int kvm_age_hva(struct kvm *kvm, unsigned long hva); 824int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
791void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 826void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
792int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 827int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
793int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 828int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -799,4 +834,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
799 834
800bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 835bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
801 836
837void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
838 struct kvm_async_pf *work);
839void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
840 struct kvm_async_pf *work);
841void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
842 struct kvm_async_pf *work);
843bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
844extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
845
846void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
847
802#endif /* _ASM_X86_KVM_HOST_H */ 848#endif /* _ASM_X86_KVM_HOST_H */
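
The new per-vcpu apf block sizes its gfns[] array to a power of two, which suits a small open-addressed hash of guest frames with async page faults in flight. A sketch of a lookup over that table; the hash function and linear probing below are illustrative rather than the exact kvm/x86.c code:

#include <linux/kvm_host.h>
#include <linux/hash.h>
#include <linux/log2.h>

#define APF_SLOTS       roundup_pow_of_two(ASYNC_PF_PER_VCPU)

static u32 example_apf_hash(gfn_t gfn)
{
        return hash_32((u32)gfn, ilog2(APF_SLOTS));
}

static bool example_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        u32 key = example_apf_hash(gfn);
        int i;

        for (i = 0; i < APF_SLOTS; i++) {
                if (vcpu->arch.apf.gfns[key] == gfn)
                        return true;
                key = (key + 1) & (APF_SLOTS - 1);      /* linear probe */
        }
        return false;
}
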
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
20 * are available. The use of 0x11 and 0x12 is deprecated 20 * are available. The use of 0x11 and 0x12 is deprecated
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4
23 24
24/* The last 8 bits are used to indicate how to interpret the flags field 25/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored. 26 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ 33/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
35 37
36#define KVM_MAX_MMU_OP_BATCH 32 38#define KVM_MAX_MMU_OP_BATCH 32
37 39
40#define KVM_ASYNC_PF_ENABLED (1 << 0)
41#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
42
38/* Operations for KVM_HC_MMU_OP */ 43/* Operations for KVM_HC_MMU_OP */
39#define KVM_MMU_OP_WRITE_PTE 1 44#define KVM_MMU_OP_WRITE_PTE 1
40#define KVM_MMU_OP_FLUSH_TLB 2 45#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
61 __u64 pt_phys; 66 __u64 pt_phys;
62}; 67};
63 68
69#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
70#define KVM_PV_REASON_PAGE_READY 2
71
72struct kvm_vcpu_pv_apf_data {
73 __u32 reason;
74 __u8 pad[60];
75 __u32 enabled;
76};
77
64#ifdef __KERNEL__ 78#ifdef __KERNEL__
65#include <asm/processor.h> 79#include <asm/processor.h>
66 80
67extern void kvmclock_init(void); 81extern void kvmclock_init(void);
82extern int kvm_register_clock(char *txt);
68 83
69 84
70/* This instruction is vmcall. On non-VT architectures, it will generate a 85/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
160 175
161#ifdef CONFIG_KVM_GUEST 176#ifdef CONFIG_KVM_GUEST
162void __init kvm_guest_init(void); 177void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void);
163#else 181#else
164#define kvm_guest_init() do { } while (0) 182#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0)
184#define kvm_async_pf_task_wake(T) do {} while(0)
185static inline u32 kvm_read_and_reset_pf_reason(void)
186{
187 return 0;
188}
165#endif 189#endif
166 190
167#endif /* __KERNEL__ */ 191#endif /* __KERNEL__ */
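
MSR_KVM_ASYNC_PF_EN together with struct kvm_vcpu_pv_apf_data implies a simple guest-side handshake: hand the hypervisor the physical address of a per-cpu data block with the enable bit set in the same MSR write. A sketch under that assumption; variable names are illustrative:

#include <linux/percpu.h>
#include <asm/kvm_para.h>
#include <asm/msr.h>

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);

static void example_enable_async_pf(void)
{
        u64 pa;

        if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                return;

        pa = __pa(&__get_cpu_var(apf_data));
        wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
        __get_cpu_var(apf_data).enabled = 1;
}
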
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..72a8b52e7dfd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
7 7
8#include <asm/mc146818rtc.h> 8#include <asm/mc146818rtc.h>
9 9
10#define NMI_REASON_PORT 0x61
11
12#define NMI_REASON_SERR 0x80
13#define NMI_REASON_IOCHK 0x40
14#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
15
16#define NMI_REASON_CLEAR_SERR 0x04
17#define NMI_REASON_CLEAR_IOCHK 0x08
18#define NMI_REASON_CLEAR_MASK 0x0f
19
10static inline unsigned char get_nmi_reason(void) 20static inline unsigned char get_nmi_reason(void)
11{ 21{
12 return inb(0x61); 22 return inb(NMI_REASON_PORT);
13} 23}
14 24
15static inline void reassert_nmi(void) 25static inline void reassert_nmi(void)
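
The new constants name the legacy NMI status port and its bits: SERR and IOCHK flag the two motherboard NMI sources, and the NMI_REASON_CLEAR_* bits are meant to be written back to the same port to acknowledge them. A short sketch of decoding the reason byte (the messages are illustrative):

#include <linux/kernel.h>
#include <asm/mach_traps.h>

static void example_classify_platform_nmi(void)
{
        unsigned char reason = get_nmi_reason();

        if (!(reason & NMI_REASON_MASK))
                return;                         /* not a SERR/IOCHK NMI */

        if (reason & NMI_REASON_SERR)
                pr_emerg("NMI: PCI SERR asserted\n");
        if (reason & NMI_REASON_IOCHK)
                pr_emerg("NMI: I/O check error\n");
}
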
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c4021b953510..c76f5b92b840 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -23,6 +23,26 @@ void arch_trigger_all_cpu_backtrace(void);
23#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace 23#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
24#endif 24#endif
25 25
26/*
27 * Define some priorities for the nmi notifier call chain.
28 *
29 * Create a local nmi bit that has a higher priority than
30 * external nmis, because the local ones are more frequent.
31 *
32 * Also setup some default high/normal/low settings for
 33 * subsystems to register with. Using 4 bits to separate
 34 * the priorities. This can go a lot higher if need be.
35 */
36
37#define NMI_LOCAL_SHIFT 16 /* randomly picked */
38#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
39#define NMI_HIGH_PRIOR (1ULL << 8)
40#define NMI_NORMAL_PRIOR (1ULL << 4)
41#define NMI_LOW_PRIOR (1ULL << 0)
42#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
43#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
44#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
45
26void stop_nmi(void); 46void stop_nmi(void);
27void restart_nmi(void); 47void restart_nmi(void);
28 48
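
These priorities order handlers on the NMI notifier chain so that the frequent, CPU-local sources (perf, watchdog) get first look before external-NMI handling. A sketch of a subsystem registering at one of the new levels; the handler body is illustrative:

#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>
#include <asm/nmi.h>

static int example_nmi_notify(struct notifier_block *nb,
                              unsigned long cmd, void *data)
{
        if (cmd != DIE_NMI)
                return NOTIFY_DONE;

        /* ... check hardware state, decide whether this NMI is ours ... */
        return NOTIFY_STOP;             /* consumed, stop the chain */
}

static struct notifier_block example_nmi_nb = {
        .notifier_call  = example_nmi_notify,
        .priority       = NMI_LOCAL_LOW_PRIOR,
};

static int __init example_nmi_init(void)
{
        return register_die_notifier(&example_nmi_nb);
}
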
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b56..b0ef2b449a9d 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
4extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu); 7extern void numa_remove_cpu(int cpu);
6 8
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070e7c26..0493be39607c 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -38,8 +38,9 @@ extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39 39
40#ifdef CONFIG_NUMA_EMU 40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) 41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43void numa_emu_cmdline(char *);
43#endif /* CONFIG_NUMA_EMU */ 44#endif /* CONFIG_NUMA_EMU */
44#else 45#else
45static inline void init_cpu_to_node(void) { } 46static inline void init_cpu_to_node(void) { }
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 42a978c0c1b3..f482010350fb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits);
107/* GPIO assignments */ 107/* GPIO assignments */
108 108
109#define OLPC_GPIO_MIC_AC 1 109#define OLPC_GPIO_MIC_AC 1
110#define OLPC_GPIO_DCON_IRQ geode_gpio(7) 110#define OLPC_GPIO_DCON_STAT0 5
111#define OLPC_GPIO_DCON_STAT1 6
112#define OLPC_GPIO_DCON_IRQ 7
111#define OLPC_GPIO_THRM_ALRM geode_gpio(10) 113#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
112#define OLPC_GPIO_SMB_CLK geode_gpio(14) 114#define OLPC_GPIO_DCON_LOAD 11
113#define OLPC_GPIO_SMB_DATA geode_gpio(15) 115#define OLPC_GPIO_DCON_BLANK 12
116#define OLPC_GPIO_SMB_CLK 14
117#define OLPC_GPIO_SMB_DATA 15
114#define OLPC_GPIO_WORKAUX geode_gpio(24) 118#define OLPC_GPIO_WORKAUX geode_gpio(24)
115#define OLPC_GPIO_LID geode_gpio(26) 119#define OLPC_GPIO_LID geode_gpio(26)
116#define OLPC_GPIO_ECSCI geode_gpio(27) 120#define OLPC_GPIO_ECSCI geode_gpio(27)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 2a8478140bb3..641988efe063 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -8,6 +8,8 @@
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC_OPENFIRMWARE
10 10
11extern bool olpc_ofw_is_installed(void);
12
11/* run an OFW command by calling into the firmware */ 13/* run an OFW command by calling into the firmware */
12#define olpc_ofw(name, args, res) \ 14#define olpc_ofw(name, args, res) \
13 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) 15 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
@@ -26,10 +28,17 @@ extern bool olpc_ofw_present(void);
26 28
27#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC_OPENFIRMWARE */
28 30
31static inline bool olpc_ofw_is_installed(void) { return false; }
29static inline void olpc_ofw_detect(void) { } 32static inline void olpc_ofw_detect(void) { }
30static inline void setup_olpc_ofw_pgd(void) { } 33static inline void setup_olpc_ofw_pgd(void) { }
31static inline bool olpc_ofw_present(void) { return false; } 34static inline bool olpc_ofw_present(void) { return false; }
32 35
33#endif /* !CONFIG_OLPC_OPENFIRMWARE */ 36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
34 37
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT
39extern void olpc_dt_build_devicetree(void);
40#else
41static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */
43
35#endif /* _ASM_X86_OLPC_OFW_H */ 44#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b8..2071a8b2b32f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
435{ 435{
436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
437} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
438 443
439static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep) 445 pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
442 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
443} 448}
444 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
445static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
446{ 457{
447 pteval_t ret; 458 pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
543 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
544} 555}
545 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561#if PAGETABLE_LEVELS >= 3
562 if (sizeof(pmdval_t) > sizeof(long))
563 /* 5 arg words */
564 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
565 else
566 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
567#endif
568}
569#endif
570
546static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 571static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
547{ 572{
548 pmdval_t val = native_pmd_val(pmd); 573 pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
266 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
268 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep); 271 pte_t *ptep);
270 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
271 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
272 278
273 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep); 280 pte_t *ptep);
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 295e2ff18a6a..e2f6a99f14ab 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -20,6 +20,9 @@
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) 20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18) 21#define ARCH_P4_MAX_CCCR (18)
22 22
23#define ARCH_P4_CNTRVAL_BITS (40)
24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
25
23#define P4_ESCR_EVENT_MASK 0x7e000000U 26#define P4_ESCR_EVENT_MASK 0x7e000000U
24#define P4_ESCR_EVENT_SHIFT 25 27#define P4_ESCR_EVENT_SHIFT 25
25#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U 28#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
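
ARCH_P4_CNTRVAL_BITS records that P4 performance counters are 40 bits wide, so software deltas between successive raw reads must wrap modulo 2^40. A one-line sketch of the arithmetic (function name is illustrative):

#include <linux/types.h>
#include <asm/perf_event_p4.h>

static u64 example_p4_counter_delta(u64 prev_raw, u64 new_raw)
{
        /* both values only carry ARCH_P4_CNTRVAL_BITS of state */
        return (new_raw - prev_raw) & ARCH_P4_CNTRVAL_MASK;
}
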
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94c3810..b4389a468fb6 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
93 93
94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, 94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
95 unsigned long adddress) 95 unsigned long address)
96{ 96{
97 ___pmd_free_tlb(tlb, pmd); 97 ___pmd_free_tlb(tlb, pmd);
98} 98}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 105#endif
106 106
107#ifdef CONFIG_SMP
108union split_pmd {
109 struct {
110 u32 pmd_low;
111 u32 pmd_high;
112 };
113 pmd_t pmd;
114};
115static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
116{
117 union split_pmd res, *orig = (union split_pmd *)pmdp;
118
119 /* xchg acts as a barrier before setting of the high bits */
120 res.pmd_low = xchg(&orig->pmd_low, 0);
121 res.pmd_high = orig->pmd_high;
122 orig->pmd_high = 0;
123
124 return res.pmd;
125}
126#else
127#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
128#endif
129
107/* 130/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 131 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 132 * put the 32 bits of offset into the high part.
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
35#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
36#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
38 39
39#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
40 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
59 60
60#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
61#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
62 65
63#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
64#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
94 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
95} 98}
96 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
97static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
98{ 106{
99 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
142 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
143} 151}
144 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{ 171{
147 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
216 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
217} 242}
218 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
219/* 293/*
220 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
221 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
256 return __pte(val); 330 return __pte(val);
257} 331}
258 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
259/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
260#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
261static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
350 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
351 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
352 */ 436 */
353#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
354 438
355/* 439/*
356 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
524 return res; 608 return res;
525} 609}
526 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
527static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
529{ 621{
530 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
531} 623}
532 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
533#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
534/* 632/*
535 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
607 705
608#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address)
609 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
610/* 751/*
611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
612 * 753 *
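
Taken together, the new pmd helpers let a transparent-hugepage fault path build and install a huge pmd much like the pte case. A hedged sketch, loosely modelled on such a path; arguments and error handling are simplified:

#include <linux/mm.h>
#include <asm/pgtable.h>

static void example_install_huge_pmd(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long haddr, pmd_t *pmd,
                                     struct page *page)
{
        pmd_t entry;

        entry = mk_pmd(page, vma->vm_page_prot);        /* base entry */
        entry = pmd_mkhuge(entry);                      /* set _PAGE_PSE */
        if (vma->vm_flags & VM_WRITE)
                entry = pmd_mkwrite(pmd_mkdirty(entry));

        set_pmd_at(mm, haddr, pmd, entry);              /* install the huge pmd */
}
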
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{ 86{
77 *pmdp = pmd; 87#ifdef CONFIG_SMP
78} 88 return native_make_pmd(xchg(&xp->pmd, 0));
79 89#else
80static inline void native_pmd_clear(pmd_t *pmd) 90 /* native_local_pmdp_get_and_clear,
81{ 91 but duplicated because of cyclic dependency */
82 native_set_pmd(pmd, native_make_pmd(0)); 92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c6efecf85a6a..45636cefa186 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
761extern void init_c1e_mask(void); 761extern void init_c1e_mask(void);
762 762
763extern unsigned long boot_option_idle_override; 763extern unsigned long boot_option_idle_override;
764extern unsigned long idle_halt;
765extern unsigned long idle_nomwait;
766extern bool c1e_detected; 764extern bool c1e_detected;
767 765
766enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
767 IDLE_POLL, IDLE_FORCE_MWAIT};
768
768extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
769extern int sysenter_setup(void); 770extern int sysenter_setup(void);
770 771
@@ -901,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
901/* 902/*
902 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 903 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
903 * This is necessary to guarantee that the entire "struct pt_regs" 904 * This is necessary to guarantee that the entire "struct pt_regs"
904 * is accessable even if the CPU haven't stored the SS/ESP registers 905 * is accessible even if the CPU haven't stored the SS/ESP registers
905 * on the stack (interrupt gate does not save these registers 906 * on the stack (interrupt gate does not save these registers
906 * when switching to the same priv ring). 907 * when switching to the same priv ring).
907 * Therefore beware: accessing the ss/esp fields of the 908 * Therefore beware: accessing the ss/esp fields of the
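
The idle_halt/idle_nomwait flags collapse into a single boot_option_idle_override value drawn from the new enum. A sketch of how an "idle=" early parameter could map onto it (loosely modelled on arch/x86/kernel/process.c, not a verbatim copy):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/string.h>
#include <asm/processor.h>

static int __init example_idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll"))
                boot_option_idle_override = IDLE_POLL;
        else if (!strcmp(str, "halt"))
                boot_option_idle_override = IDLE_HALT;
        else if (!strcmp(str, "nomwait"))
                boot_option_idle_override = IDLE_NOMWAIT;
        else if (!strcmp(str, "mwait"))
                boot_option_idle_override = IDLE_FORCE_MWAIT;
        else
                return -EINVAL;

        return 0;
}
early_param("idle", example_idle_setup);
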
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..b4ec95f07518
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1 @@
/* dummy prom.h; here to make linux/of.h's #includes happy */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
47 INTERCEPT_MONITOR, 47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT, 48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND, 49 INTERCEPT_MWAIT_COND,
50 INTERCEPT_XSETBV,
50}; 51};
51 52
52 53
53struct __attribute__ ((__packed__)) vmcb_control_area { 54struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read; 55 u32 intercept_cr;
55 u16 intercept_cr_write; 56 u32 intercept_dr;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 57 u32 intercept_exceptions;
59 u64 intercept; 58 u64 intercept;
60 u8 reserved_1[42]; 59 u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 80 u32 event_inj_err;
82 u64 nested_cr3; 81 u64 nested_cr3;
83 u64 lbr_ctl; 82 u64 lbr_ctl;
84 u64 reserved_5; 83 u32 clean;
84 u32 reserved_5;
85 u64 next_rip; 85 u64 next_rip;
86 u8 reserved_6[816]; 86 u8 insn_len;
87 u8 insn_bytes[15];
88 u8 reserved_6[800];
87}; 89};
88 90
89 91
90#define TLB_CONTROL_DO_NOTHING 0 92#define TLB_CONTROL_DO_NOTHING 0
91#define TLB_CONTROL_FLUSH_ALL_ASID 1 93#define TLB_CONTROL_FLUSH_ALL_ASID 1
94#define TLB_CONTROL_FLUSH_ASID 3
95#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
92 96
93#define V_TPR_MASK 0x0f 97#define V_TPR_MASK 0x0f
94 98
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
204#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK 208#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
205#define SVM_SELECTOR_CODE_MASK (1 << 3) 209#define SVM_SELECTOR_CODE_MASK (1 << 3)
206 210
207#define INTERCEPT_CR0_MASK 1 211#define INTERCEPT_CR0_READ 0
208#define INTERCEPT_CR3_MASK (1 << 3) 212#define INTERCEPT_CR3_READ 3
209#define INTERCEPT_CR4_MASK (1 << 4) 213#define INTERCEPT_CR4_READ 4
210#define INTERCEPT_CR8_MASK (1 << 8) 214#define INTERCEPT_CR8_READ 8
211 215#define INTERCEPT_CR0_WRITE (16 + 0)
212#define INTERCEPT_DR0_MASK 1 216#define INTERCEPT_CR3_WRITE (16 + 3)
213#define INTERCEPT_DR1_MASK (1 << 1) 217#define INTERCEPT_CR4_WRITE (16 + 4)
214#define INTERCEPT_DR2_MASK (1 << 2) 218#define INTERCEPT_CR8_WRITE (16 + 8)
215#define INTERCEPT_DR3_MASK (1 << 3) 219
216#define INTERCEPT_DR4_MASK (1 << 4) 220#define INTERCEPT_DR0_READ 0
217#define INTERCEPT_DR5_MASK (1 << 5) 221#define INTERCEPT_DR1_READ 1
218#define INTERCEPT_DR6_MASK (1 << 6) 222#define INTERCEPT_DR2_READ 2
219#define INTERCEPT_DR7_MASK (1 << 7) 223#define INTERCEPT_DR3_READ 3
224#define INTERCEPT_DR4_READ 4
225#define INTERCEPT_DR5_READ 5
226#define INTERCEPT_DR6_READ 6
227#define INTERCEPT_DR7_READ 7
228#define INTERCEPT_DR0_WRITE (16 + 0)
229#define INTERCEPT_DR1_WRITE (16 + 1)
230#define INTERCEPT_DR2_WRITE (16 + 2)
231#define INTERCEPT_DR3_WRITE (16 + 3)
232#define INTERCEPT_DR4_WRITE (16 + 4)
233#define INTERCEPT_DR5_WRITE (16 + 5)
234#define INTERCEPT_DR6_WRITE (16 + 6)
235#define INTERCEPT_DR7_WRITE (16 + 7)
220 236
221#define SVM_EVTINJ_VEC_MASK 0xff 237#define SVM_EVTINJ_VEC_MASK 0xff
222 238
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 262#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 263#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
248 264
265#define SVM_EXITINFO_REG_MASK 0x0F
266
249#define SVM_EXIT_READ_CR0 0x000 267#define SVM_EXIT_READ_CR0 0x000
250#define SVM_EXIT_READ_CR3 0x003 268#define SVM_EXIT_READ_CR3 0x003
251#define SVM_EXIT_READ_CR4 0x004 269#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
316#define SVM_EXIT_MONITOR 0x08a 334#define SVM_EXIT_MONITOR 0x08a
317#define SVM_EXIT_MWAIT 0x08b 335#define SVM_EXIT_MWAIT 0x08b
318#define SVM_EXIT_MWAIT_COND 0x08c 336#define SVM_EXIT_MWAIT_COND 0x08c
337#define SVM_EXIT_XSETBV 0x08d
319#define SVM_EXIT_NPF 0x400 338#define SVM_EXIT_NPF 0x400
320 339
321#define SVM_EXIT_ERR -1 340#define SVM_EXIT_ERR -1
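Editor's note: the vmcb_control_area hunk above folds the four 16-bit CR/DR intercept words into two 32-bit words, and the INTERCEPT_CR*/DR* symbols change from bit masks to bit positions, with reads in bits 0-15 and writes in bits 16-31. A minimal standalone sketch of the new encoding (the constants are copied from the hunk; the helper and main() are illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Bit positions from the new layout: reads in bits 0-15, writes in 16-31. */
#define INTERCEPT_CR0_READ   0
#define INTERCEPT_CR3_WRITE  (16 + 3)

/* Illustrative helper: set one intercept bit in the combined word. */
static void set_intercept(uint32_t *word, unsigned int bit)
{
	*word |= 1U << bit;
}

int main(void)
{
	uint32_t intercept_cr = 0;

	set_intercept(&intercept_cr, INTERCEPT_CR0_READ);
	set_intercept(&intercept_cr, INTERCEPT_CR3_WRITE);

	/* (1U << 0) | (1U << 19) == 0x00080001 */
	printf("intercept_cr = %#010x\n", intercept_cr);
	return 0;
}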
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
30asmlinkage void stack_segment(void); 30asmlinkage void stack_segment(void);
31asmlinkage void general_protection(void); 31asmlinkage void general_protection(void);
32asmlinkage void page_fault(void); 32asmlinkage void page_fault(void);
33asmlinkage void async_page_fault(void);
33asmlinkage void spurious_interrupt_bug(void); 34asmlinkage void spurious_interrupt_bug(void);
34asmlinkage void coprocessor_error(void); 35asmlinkage void coprocessor_error(void);
35asmlinkage void alignment_check(void); 36asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
66#define PIN_BASED_NMI_EXITING 0x00000008 66#define PIN_BASED_NMI_EXITING 0x00000008
67#define PIN_BASED_VIRTUAL_NMIS 0x00000020 67#define PIN_BASED_VIRTUAL_NMIS 0x00000020
68 68
69#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
69#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 70#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
71#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
70#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 72#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
71#define VM_EXIT_SAVE_IA32_PAT 0x00040000 73#define VM_EXIT_SAVE_IA32_PAT 0x00040000
72#define VM_EXIT_LOAD_IA32_PAT 0x00080000 74#define VM_EXIT_LOAD_IA32_PAT 0x00080000
75#define VM_EXIT_SAVE_IA32_EFER 0x00100000
76#define VM_EXIT_LOAD_IA32_EFER 0x00200000
77#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
73 78
79#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
74#define VM_ENTRY_IA32E_MODE 0x00000200 80#define VM_ENTRY_IA32E_MODE 0x00000200
75#define VM_ENTRY_SMM 0x00000400 81#define VM_ENTRY_SMM 0x00000400
76#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 82#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
83#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
77#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 84#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
85#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
78 86
79/* VMCS Encodings */ 87/* VMCS Encodings */
80enum vmcs_field { 88enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
239#define EXIT_REASON_TASK_SWITCH 9 247#define EXIT_REASON_TASK_SWITCH 9
240#define EXIT_REASON_CPUID 10 248#define EXIT_REASON_CPUID 10
241#define EXIT_REASON_HLT 12 249#define EXIT_REASON_HLT 12
250#define EXIT_REASON_INVD 13
242#define EXIT_REASON_INVLPG 14 251#define EXIT_REASON_INVLPG 14
243#define EXIT_REASON_RDPMC 15 252#define EXIT_REASON_RDPMC 15
244#define EXIT_REASON_RDTSC 16 253#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
296#define GUEST_INTR_STATE_SMI 0x00000004 305#define GUEST_INTR_STATE_SMI 0x00000004
297#define GUEST_INTR_STATE_NMI 0x00000008 306#define GUEST_INTR_STATE_NMI 0x00000008
298 307
308/* GUEST_ACTIVITY_STATE flags */
309#define GUEST_ACTIVITY_ACTIVE 0
310#define GUEST_ACTIVITY_HLT 1
311#define GUEST_ACTIVITY_SHUTDOWN 2
312#define GUEST_ACTIVITY_WAIT_SIPI 3
313
299/* 314/*
300 * Exit Qualifications for MOV for Control Register Access 315 * Exit Qualifications for MOV for Control Register Access
301 */ 316 */
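Editor's note: the vmx.h hunk above only adds constant definitions for VM-exit/VM-entry controls and guest activity states. As a hedged illustration of how such control words are composed and tested, a tiny standalone example using the EFER-related bits copied from the hunk (the check itself is illustrative, not the kernel's code path):

#include <stdint.h>
#include <stdio.h>

/* Constants copied from the hunk above. */
#define VM_EXIT_SAVE_IA32_EFER   0x00100000
#define VM_EXIT_LOAD_IA32_EFER   0x00200000
#define VM_ENTRY_LOAD_IA32_EFER  0x00008000

int main(void)
{
	/* Compose control words that let hardware switch EFER on exit/entry. */
	uint32_t vmexit_ctrl  = VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER;
	uint32_t vmentry_ctrl = VM_ENTRY_LOAD_IA32_EFER;

	if ((vmexit_ctrl & VM_EXIT_SAVE_IA32_EFER) &&
	    (vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER))
		printf("guest EFER is saved on exit and reloaded on entry\n");
	return 0;
}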
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21c..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
42extern unsigned long get_phys_to_machine(unsigned long pfn); 42extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
44 44
45extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49
45static inline unsigned long pfn_to_mfn(unsigned long pfn) 50static inline unsigned long pfn_to_mfn(unsigned long pfn)
46{ 51{
47 unsigned long mfn; 52 unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
72 if (xen_feature(XENFEAT_auto_translated_physmap)) 77 if (xen_feature(XENFEAT_auto_translated_physmap))
73 return mfn; 78 return mfn;
74 79
75 if (unlikely((mfn >> machine_to_phys_order) != 0))
76 return ~0;
77
78 pfn = 0; 80 pfn = 0;
79 /* 81 /*
80 * The array access can fail (e.g., device space beyond end of RAM). 82 * The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
83 */ 85 */
84 __get_user(pfn, &machine_to_phys_mapping[mfn]); 86 __get_user(pfn, &machine_to_phys_mapping[mfn]);
85 87
88 /*
89 * If this appears to be a foreign mfn (because the pfn
90 * doesn't map back to the mfn), then check the local override
91 * table to see if there's a better pfn to use.
92 */
93 if (get_phys_to_machine(pfn) != mfn)
94 pfn = m2p_find_override_pfn(mfn, pfn);
95
86 return pfn; 96 return pfn;
87} 97}
88 98
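Editor's note: the mfn_to_pfn() change above drops the early range check and instead verifies that the p2m entry maps back to the mfn, falling back to the new m2p override table when it does not (the foreign/granted-page case). A minimal userspace sketch of that round-trip check, with toy arrays standing in for the real p2m/m2p machinery and a stub in place of m2p_find_override_pfn() (everything below is illustrative):

#include <stdio.h>

#define NPAGES 4

/* Toy tables standing in for machine_to_phys_mapping / phys_to_machine. */
static unsigned long m2p[NPAGES] = { 3, 2, 1, 0 };   /* mfn -> pfn */
static unsigned long p2m[NPAGES] = { 3, 9, 1, 0 };   /* pfn -> mfn; pfn 1 is stale */

/* Stub for m2p_find_override_pfn(): no local override known here. */
static unsigned long find_override_pfn(unsigned long mfn, unsigned long pfn)
{
	(void)mfn;
	return pfn;
}

static unsigned long mfn_to_pfn(unsigned long mfn)
{
	unsigned long pfn = m2p[mfn];

	/*
	 * If the p2m does not map back to this mfn it may be a foreign
	 * page, so ask the override table for a better answer.
	 */
	if (p2m[pfn] != mfn)
		pfn = find_override_pfn(mfn, pfn);

	return pfn;
}

int main(void)
{
	printf("mfn 2 -> pfn %lu\n", mfn_to_pfn(2));
	return 0;
}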
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ec881c6bfee0..b3a71137983a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -509,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
509 509
510 return 0; 510 return 0;
511} 511}
512EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
512 513
513int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 514int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
514{ 515{
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d2fdb0826df2..57ca77787220 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1086 1086
1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1088 1088
1089 /* Intialize the exclusion range if necessary */ 1089 /* Initialize the exclusion range if necessary */
1090 for_each_iommu(iommu) { 1090 for_each_iommu(iommu) {
1091 if (iommu->exclusion_start && 1091 if (iommu->exclusion_start &&
1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1353 1353
1354/* 1354/*
1355 * Allocates a new protection domain usable for the dma_ops functions. 1355 * Allocates a new protection domain usable for the dma_ops functions.
1356 * It also intializes the page table and the address allocator data 1356 * It also initializes the page table and the address allocator data
1357 * structures required for the dma_ops interface 1357 * structures required for the dma_ops interface
1358 */ 1358 */
1359static struct dma_ops_domain *dma_ops_domain_alloc(void) 1359static struct dma_ops_domain *dma_ops_domain_alloc(void)
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index affacb5e0065..0a99f7198bc3 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,13 @@ struct pci_device_id amd_nb_misc_ids[] = {
20}; 20};
21EXPORT_SYMBOL(amd_nb_misc_ids); 21EXPORT_SYMBOL(amd_nb_misc_ids);
22 22
23const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
24 { 0x00, 0x18, 0x20 },
25 { 0xff, 0x00, 0x20 },
26 { 0xfe, 0x00, 0x20 },
27 { }
28};
29
23struct amd_northbridge_info amd_northbridges; 30struct amd_northbridge_info amd_northbridges;
24EXPORT_SYMBOL(amd_northbridges); 31EXPORT_SYMBOL(amd_northbridges);
25 32
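Editor's note: the amd_nb_bus_dev_ranges[] table added above is terminated by an all-zero sentinel entry, which is what lets the aperture code further down walk it without ARRAY_SIZE(). A small standalone sketch of that idiom (struct and walker are illustrative copies, not the kernel definitions):

#include <stdio.h>

struct bus_dev_range {
	int bus;
	int dev_base;
	int dev_limit;
};

/* Mirrors amd_nb_bus_dev_ranges: a zeroed entry marks the end. */
static const struct bus_dev_range ranges[] = {
	{ 0x00, 0x18, 0x20 },
	{ 0xff, 0x00, 0x20 },
	{ 0xfe, 0x00, 0x20 },
	{ }
};

int main(void)
{
	int i;

	/* Walk until the sentinel's dev_limit (0) ends the loop. */
	for (i = 0; ranges[i].dev_limit; i++)
		printf("bus %#04x: devices %#04x-%#04x\n",
		       ranges[i].bus, ranges[i].dev_base, ranges[i].dev_limit);
	return 0;
}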
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 7c9ab59653e8..51ef31a89be9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev)
313 if (adev->irq == 0) 313 if (adev->irq == 0)
314 return; 314 return;
315 315
316 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
317 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
318 /* APB timer irqs are set up as mp_irqs, timer is edge type */
319 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
320
316 if (system_state == SYSTEM_BOOTING) { 321 if (system_state == SYSTEM_BOOTING) {
317 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
318 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
319 /* APB timer irqs are set up as mp_irqs, timer is edge type */
320 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
321 if (request_irq(adev->irq, apbt_interrupt_handler, 322 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, 323 IRQF_TIMER | IRQF_DISABLED |
323 adev->name, adev)) { 324 IRQF_NOBALANCING,
325 adev->name, adev)) {
324 printk(KERN_ERR "Failed request IRQ for APBT%d\n", 326 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
325 adev->num); 327 adev->num);
326 } 328 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index dcd7c83e1659..5955a7800a96 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -39,18 +39,6 @@ int fallback_aper_force __initdata;
39 39
40int fix_aperture __initdata = 1; 40int fix_aperture __initdata = 1;
41 41
42struct bus_dev_range {
43 int bus;
44 int dev_base;
45 int dev_limit;
46};
47
48static struct bus_dev_range bus_dev_ranges[] __initdata = {
49 { 0x00, 0x18, 0x20},
50 { 0xff, 0x00, 0x20},
51 { 0xfe, 0x00, 0x20}
52};
53
54static struct resource gart_resource = { 42static struct resource gart_resource = {
55 .name = "GART", 43 .name = "GART",
56 .flags = IORESOURCE_MEM, 44 .flags = IORESOURCE_MEM,
@@ -294,13 +282,13 @@ void __init early_gart_iommu_check(void)
294 search_agp_bridge(&agp_aper_order, &valid_agp); 282 search_agp_bridge(&agp_aper_order, &valid_agp);
295 283
296 fix = 0; 284 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 285 for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
298 int bus; 286 int bus;
299 int dev_base, dev_limit; 287 int dev_base, dev_limit;
300 288
301 bus = bus_dev_ranges[i].bus; 289 bus = amd_nb_bus_dev_ranges[i].bus;
302 dev_base = bus_dev_ranges[i].dev_base; 290 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
303 dev_limit = bus_dev_ranges[i].dev_limit; 291 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
304 292
305 for (slot = dev_base; slot < dev_limit; slot++) { 293 for (slot = dev_base; slot < dev_limit; slot++) {
306 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) 294 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -349,13 +337,13 @@ void __init early_gart_iommu_check(void)
349 return; 337 return;
350 338
351 /* disable them all at first */ 339 /* disable them all at first */
352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 340 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
353 int bus; 341 int bus;
354 int dev_base, dev_limit; 342 int dev_base, dev_limit;
355 343
356 bus = bus_dev_ranges[i].bus; 344 bus = amd_nb_bus_dev_ranges[i].bus;
357 dev_base = bus_dev_ranges[i].dev_base; 345 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
358 dev_limit = bus_dev_ranges[i].dev_limit; 346 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
359 347
360 for (slot = dev_base; slot < dev_limit; slot++) { 348 for (slot = dev_base; slot < dev_limit; slot++) {
361 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) 349 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -390,14 +378,14 @@ int __init gart_iommu_hole_init(void)
390 378
391 fix = 0; 379 fix = 0;
392 node = 0; 380 node = 0;
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 381 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
394 int bus; 382 int bus;
395 int dev_base, dev_limit; 383 int dev_base, dev_limit;
396 u32 ctl; 384 u32 ctl;
397 385
398 bus = bus_dev_ranges[i].bus; 386 bus = amd_nb_bus_dev_ranges[i].bus;
399 dev_base = bus_dev_ranges[i].dev_base; 387 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
400 dev_limit = bus_dev_ranges[i].dev_limit; 388 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
401 389
402 for (slot = dev_base; slot < dev_limit; slot++) { 390 for (slot = dev_base; slot < dev_limit; slot++) {
403 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) 391 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -505,7 +493,7 @@ out:
505 } 493 }
506 494
507 /* Fix up the north bridges */ 495 /* Fix up the north bridges */
508 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 496 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
509 int bus, dev_base, dev_limit; 497 int bus, dev_base, dev_limit;
510 498
511 /* 499 /*
@@ -514,9 +502,9 @@ out:
514 */ 502 */
515 u32 ctl = DISTLBWALKPRB | aper_order << 1; 503 u32 ctl = DISTLBWALKPRB | aper_order << 1;
516 504
517 bus = bus_dev_ranges[i].bus; 505 bus = amd_nb_bus_dev_ranges[i].bus;
518 dev_base = bus_dev_ranges[i].dev_base; 506 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
519 dev_limit = bus_dev_ranges[i].dev_limit; 507 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
520 for (slot = dev_base; slot < dev_limit; slot++) { 508 for (slot = dev_base; slot < dev_limit; slot++) {
521 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) 509 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
522 continue; 510 continue;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a51345ba449e..06c196d7e59c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -684,7 +684,7 @@ static int __init calibrate_APIC_clock(void)
684 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 684 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
685 lapic_clockevent.shift); 685 lapic_clockevent.shift);
686 lapic_clockevent.max_delta_ns = 686 lapic_clockevent.max_delta_ns =
687 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); 687 clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
688 lapic_clockevent.min_delta_ns = 688 lapic_clockevent.min_delta_ns =
689 clockevent_delta2ns(0xF, &lapic_clockevent); 689 clockevent_delta2ns(0xF, &lapic_clockevent);
690 690
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 72ec29e1ae06..79fd43ca6f96 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -68,7 +68,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
68 68
69 switch (cmd) { 69 switch (cmd) {
70 case DIE_NMI: 70 case DIE_NMI:
71 case DIE_NMI_IPI:
72 break; 71 break;
73 72
74 default: 73 default:
@@ -96,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
96static __read_mostly struct notifier_block backtrace_notifier = { 95static __read_mostly struct notifier_block backtrace_notifier = {
97 .notifier_call = arch_trigger_all_cpu_backtrace_handler, 96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
98 .next = NULL, 97 .next = NULL,
99 .priority = 1 98 .priority = NMI_LOCAL_LOW_PRIOR,
100}; 99};
101 100
102static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
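Editor's note: this hunk, like several that follow (perf, kgdb, mce-inject, reboot), removes the DIE_NMI_IPI case and replaces magic notifier priorities with the NMI_*_PRIOR constants from <asm/nmi.h>. A hedged kernel-style sketch of what a registered NMI die notifier looks like after this change (the handler and module names are made up; register_die_notifier(), DIE_NMI and NMI_LOCAL_LOW_PRIOR are the interfaces the hunks themselves use):

#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>
#include <asm/nmi.h>

/* Illustrative handler: observe NMIs on the die chain without claiming them. */
static int example_nmi_notify(struct notifier_block *self,
			      unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;

	/* A real consumer that handled the NMI would return NOTIFY_STOP. */
	return NOTIFY_DONE;
}

static struct notifier_block example_nmi_nb = {
	.notifier_call	= example_nmi_notify,
	.priority	= NMI_LOCAL_LOW_PRIOR,	/* run after the real consumers */
};

static int __init example_init(void)
{
	return register_die_notifier(&example_nmi_nb);
}

static void __exit example_exit(void)
{
	unregister_die_notifier(&example_nmi_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");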
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ecca5f41ad2c..bd16b58b8850 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -378,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = {
378 378
379static __cpuinit void set_x2apic_extra_bits(int pnode) 379static __cpuinit void set_x2apic_extra_bits(int pnode)
380{ 380{
381 __this_cpu_write(x2apic_extra_bits, (pnode << 6)); 381 __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
382} 382}
383 383
384/* 384/*
@@ -641,7 +641,7 @@ void __cpuinit uv_cpu_init(void)
641 */ 641 */
642int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 642int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
643{ 643{
644 if (reason != DIE_NMI_IPI) 644 if (reason != DIE_NMIUNKNOWN)
645 return NOTIFY_OK; 645 return NOTIFY_OK;
646 646
647 if (in_crash_kexec) 647 if (in_crash_kexec)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..a77971979564 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <asm/mce.h> 26#include <asm/mce.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
28#include <asm/nmi.h>
28 29
29/* Update fake mce registers on current CPU. */ 30/* Update fake mce registers on current CPU. */
30static void inject_mce(struct mce *m) 31static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
83 struct die_args *args = (struct die_args *)data; 84 struct die_args *args = (struct die_args *)data;
84 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
85 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
87 return NOTIFY_DONE; 88 return NOTIFY_DONE;
88 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
89 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
95 96
96static struct notifier_block mce_raise_nb = { 97static struct notifier_block mce_raise_nb = {
97 .notifier_call = mce_raise_notify, 98 .notifier_call = mce_raise_notify,
98 .priority = 1000, 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
99}; 100};
100 101
101/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 04921017abe0..9d977a2ea693 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1267,7 +1267,6 @@ perf_event_nmi_handler(struct notifier_block *self,
1267 1267
1268 switch (cmd) { 1268 switch (cmd) {
1269 case DIE_NMI: 1269 case DIE_NMI:
1270 case DIE_NMI_IPI:
1271 break; 1270 break;
1272 case DIE_NMIUNKNOWN: 1271 case DIE_NMIUNKNOWN:
1273 this_nmi = percpu_read(irq_stat.__nmi_count); 1272 this_nmi = percpu_read(irq_stat.__nmi_count);
@@ -1317,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1317static __read_mostly struct notifier_block perf_event_nmi_notifier = { 1316static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1318 .notifier_call = perf_event_nmi_handler, 1317 .notifier_call = perf_event_nmi_handler,
1319 .next = NULL, 1318 .next = NULL,
1320 .priority = 1 1319 .priority = NMI_LOCAL_LOW_PRIOR,
1321}; 1320};
1322 1321
1323static struct event_constraint unconstrained; 1322static struct event_constraint unconstrained;
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 81400b93e694..e56b9bfbabd1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,21 @@ out:
753 753
754static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 754static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
755{ 755{
756 int overflow = 0; 756 u64 v;
757 u32 low, high;
758 757
759 rdmsr(hwc->config_base + hwc->idx, low, high); 758 /* an official way for overflow indication */
760 759 rdmsrl(hwc->config_base + hwc->idx, v);
761 /* we need to check high bit for unflagged overflows */ 760 if (v & P4_CCCR_OVF) {
762 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { 761 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
763 overflow = 1; 762 return 1;
764 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
765 ((u64)low) & ~P4_CCCR_OVF);
766 } 763 }
767 764
768 return overflow; 765 /* it might be unflagged overflow */
766 rdmsrl(hwc->event_base + hwc->idx, v);
767 if (!(v & ARCH_P4_CNTRVAL_MASK))
768 return 1;
769
770 return 0;
769} 771}
770 772
771static void p4_pmu_disable_pebs(void) 773static void p4_pmu_disable_pebs(void)
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = {
1152 */ 1154 */
1153 .num_counters = ARCH_P4_MAX_CCCR, 1155 .num_counters = ARCH_P4_MAX_CCCR,
1154 .apic = 1, 1156 .apic = 1,
1155 .cntval_bits = 40, 1157 .cntval_bits = ARCH_P4_CNTRVAL_BITS,
1156 .cntval_mask = (1ULL << 40) - 1, 1158 .cntval_mask = ARCH_P4_CNTRVAL_MASK,
1157 .max_period = (1ULL << 39) - 1, 1159 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
1158 .hw_config = p4_hw_config, 1160 .hw_config = p4_hw_config,
1159 .schedule_events = p4_pmu_schedule_events, 1161 .schedule_events = p4_pmu_schedule_events,
1160 /* 1162 /*
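Editor's note: the p4_pmu_clear_cccr_ovf() rewrite above first trusts the CCCR overflow flag and only then falls back to checking whether the 40-bit counter value reads as zero (an unflagged overflow). A standalone arithmetic sketch of the two checks on plain 64-bit values (constants mirror a 40-bit counter; there is no MSR access here, it is purely illustrative):

#include <stdint.h>
#include <stdio.h>

#define P4_CCCR_OVF		(1ULL << 31)	/* overflow flag in the CCCR */
#define ARCH_P4_CNTRVAL_BITS	40
#define ARCH_P4_CNTRVAL_MASK	((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)

/* Returns 1 if this (cccr, counter) pair indicates an overflow. */
static int counter_overflowed(uint64_t cccr, uint64_t counter)
{
	if (cccr & P4_CCCR_OVF)
		return 1;			/* the documented indication */

	/* Unflagged overflow: the 40-bit counter value reads as zero. */
	if (!(counter & ARCH_P4_CNTRVAL_MASK))
		return 1;

	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       counter_overflowed(P4_CCCR_OVF, 123),	/* 1: flag set */
	       counter_overflowed(0, 0),		/* 1: wrapped to zero */
	       counter_overflowed(0, 42));		/* 0: still counting */
	return 0;
}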
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 8474c998cbd4..df20723a6a1b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -197,14 +197,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 197 */
198void dump_stack(void) 198void dump_stack(void)
199{ 199{
200 unsigned long bp = 0;
201 unsigned long stack; 200 unsigned long stack;
202 201
203#ifdef CONFIG_FRAME_POINTER
204 if (!bp)
205 get_bp(bp);
206#endif
207
208 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 202 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
209 current->pid, current->comm, print_tainted(), 203 current->pid, current->comm, print_tainted(),
210 init_utsname()->release, 204 init_utsname()->release,
@@ -240,6 +234,7 @@ unsigned __kprobes long oops_begin(void)
240 bust_spinlocks(1); 234 bust_spinlocks(1);
241 return flags; 235 return flags;
242} 236}
237EXPORT_SYMBOL_GPL(oops_begin);
243 238
244void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 239void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
245{ 240{
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34d..294f26da0c0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/pfn.h> 15#include <linux/pfn.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/acpi.h>
17#include <linux/firmware-map.h> 18#include <linux/firmware-map.h>
18#include <linux/memblock.h> 19#include <linux/memblock.h>
19 20
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
1406 CFI_ENDPROC 1406 CFI_ENDPROC
1407END(general_protection) 1407END(general_protection)
1408 1408
1409#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault)
1411 RING0_EC_FRAME
1412 pushl $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code
1415 CFI_ENDPROC
1416END(async_page_fault)
1417#endif
1418
1409/* 1419/*
1410 * End of kprobes section 1420 * End of kprobes section
1411 */ 1421 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e3ba417e8697..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -299,17 +299,21 @@ ENDPROC(native_usergs_sysret64)
299ENTRY(save_args) 299ENTRY(save_args)
300 XCPT_FRAME 300 XCPT_FRAME
301 cld 301 cld
302 movq_cfi rdi, RDI+16-ARGOFFSET 302 /*
303 movq_cfi rsi, RSI+16-ARGOFFSET 303 * start from rbp in pt_regs and jump over
304 movq_cfi rdx, RDX+16-ARGOFFSET 304 * return address.
305 movq_cfi rcx, RCX+16-ARGOFFSET 305 */
306 movq_cfi rax, RAX+16-ARGOFFSET 306 movq_cfi rdi, RDI+8-RBP
307 movq_cfi r8, R8+16-ARGOFFSET 307 movq_cfi rsi, RSI+8-RBP
308 movq_cfi r9, R9+16-ARGOFFSET 308 movq_cfi rdx, RDX+8-RBP
309 movq_cfi r10, R10+16-ARGOFFSET 309 movq_cfi rcx, RCX+8-RBP
310 movq_cfi r11, R11+16-ARGOFFSET 310 movq_cfi rax, RAX+8-RBP
311 311 movq_cfi r8, R8+8-RBP
312 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ 312 movq_cfi r9, R9+8-RBP
313 movq_cfi r10, R10+8-RBP
314 movq_cfi r11, R11+8-RBP
315
316 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
313 movq_cfi rbp, 8 /* push %rbp */ 317 movq_cfi rbp, 8 /* push %rbp */
314 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 318 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
315 testl $3, CS(%rdi) 319 testl $3, CS(%rdi)
@@ -782,8 +786,9 @@ END(interrupt)
782 786
783/* 0(%rsp): ~(interrupt number) */ 787/* 0(%rsp): ~(interrupt number) */
784 .macro interrupt func 788 .macro interrupt func
785 subq $ORIG_RAX-ARGOFFSET+8, %rsp 789 /* reserve pt_regs for scratch regs and rbp */
786 CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 790 subq $ORIG_RAX-RBP, %rsp
791 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
787 call save_args 792 call save_args
788 PARTIAL_FRAME 0 793 PARTIAL_FRAME 0
789 call \func 794 call \func
@@ -808,9 +813,14 @@ ret_from_intr:
808 TRACE_IRQS_OFF 813 TRACE_IRQS_OFF
809 decl PER_CPU_VAR(irq_count) 814 decl PER_CPU_VAR(irq_count)
810 leaveq 815 leaveq
816
811 CFI_RESTORE rbp 817 CFI_RESTORE rbp
812 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
813 CFI_ADJUST_CFA_OFFSET -8 819 CFI_ADJUST_CFA_OFFSET -8
820
821 /* we did not save rbx, restore only from ARGOFFSET */
822 addq $8, %rsp
823 CFI_ADJUST_CFA_OFFSET -8
814exit_intr: 824exit_intr:
815 GET_THREAD_INFO(%rcx) 825 GET_THREAD_INFO(%rcx)
816 testl $3,CS-ARGOFFSET(%rsp) 826 testl $3,CS-ARGOFFSET(%rsp)
@@ -1319,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
1319#endif 1329#endif
1320errorentry general_protection do_general_protection 1330errorentry general_protection do_general_protection
1321errorentry page_fault do_page_fault 1331errorentry page_fault do_page_fault
1332#ifdef CONFIG_KVM_GUEST
1333errorentry async_page_fault do_async_page_fault
1334#endif
1322#ifdef CONFIG_X86_MCE 1335#ifdef CONFIG_X86_MCE
1323paranoidzeroentry machine_check *machine_check_vector(%rip) 1336paranoidzeroentry machine_check *machine_check_vector(%rip)
1324#endif 1337#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 9f54b209c378..fc293dc8dc35 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -126,7 +126,7 @@ ENTRY(startup_32)
126 movsl 126 movsl
127 movl pa(boot_params) + NEW_CL_POINTER,%esi 127 movl pa(boot_params) + NEW_CL_POINTER,%esi
128 andl %esi,%esi 128 andl %esi,%esi
129 jz 1f # No comand line 129 jz 1f # No command line
130 movl $pa(boot_command_line),%edi 130 movl $pa(boot_command_line),%edi
131 movl $(COMMAND_LINE_SIZE/4),%ecx 131 movl $(COMMAND_LINE_SIZE/4),%ecx
132 rep 132 rep
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3a43caa3beb7..52945da52a94 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/of.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/ftrace.h> 10#include <linux/ftrace.h>
@@ -275,6 +276,15 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
275 276
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277 278
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
278#ifdef CONFIG_HOTPLUG_CPU 288#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void) 290void fixup_irqs(void)
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 48ff6dcffa02..9974d21048fd 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu)
129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
130 THREAD_FLAGS, 130 THREAD_FLAGS,
131 THREAD_ORDER)); 131 THREAD_ORDER));
132 irqctx->tinfo.task = NULL; 132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu)
140 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
141 THREAD_FLAGS, 140 THREAD_FLAGS,
142 THREAD_ORDER)); 141 THREAD_ORDER));
143 irqctx->tinfo.task = NULL; 142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.exec_domain = NULL;
145 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
146 irqctx->tinfo.preempt_count = 0;
147 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
148 145
149 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index cd21b654dec6..a4130005028a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/nmi.h>
51 52
52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = 53struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53{ 54{
@@ -525,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
525 } 526 }
526 return NOTIFY_DONE; 527 return NOTIFY_DONE;
527 528
528 case DIE_NMI_IPI:
529 /* Just ignore, we will handle the roundup on DIE_NMI. */
530 return NOTIFY_DONE;
531
532 case DIE_NMIUNKNOWN: 529 case DIE_NMIUNKNOWN:
533 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
534 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -606,7 +603,7 @@ static struct notifier_block kgdb_notifier = {
606 /* 603 /*
607 * Lowest-prio notifier priority, we want to be notified last: 604 * Lowest-prio notifier priority, we want to be notified last:
608 */ 605 */
609 .priority = -INT_MAX, 606 .priority = NMI_LOCAL_LOW_PRIOR,
610}; 607};
611 608
612/** 609/**
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
121  /* dummy entry exists -> wake up was delivered ahead of PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * async PF was not yet handled.
209 * Add dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy wait while other cpu
215 * handles async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
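Editor's note: the bulk of the kvm.c addition above is per-token sleeper bookkeeping: kvm_async_pf_task_wait() parks the current task (or halts, if it cannot sleep) in a hash bucket keyed by the host-supplied token, and kvm_async_pf_task_wake() either wakes the matching entry or, if the wakeup raced ahead of the fault, leaves a dummy node for the later wait to consume. A stripped-down userspace sketch of just that token/bucket logic, using plain singly linked lists and printf in place of hlists and sleeping (all names below are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define HASHBITS 3
#define HASHSIZE (1 << HASHBITS)

struct node {
	struct node *next;
	unsigned int token;
	int dummy;			/* wakeup arrived before the fault */
};

static struct node *buckets[HASHSIZE];

static unsigned int bucket_of(unsigned int token)
{
	return token % HASHSIZE;	/* the kernel uses hash_32() */
}

static struct node **find(unsigned int token)
{
	struct node **p;

	for (p = &buckets[bucket_of(token)]; *p; p = &(*p)->next)
		if ((*p)->token == token)
			return p;
	return NULL;
}

/* PAGE_READY notification: wake the sleeper, or record a dummy entry. */
static void task_wake(unsigned int token)
{
	struct node **p = find(token), *n;

	if (p) {			/* a task is already waiting */
		n = *p;
		*p = n->next;
		free(n);
		printf("token %u: woke waiter\n", token);
		return;
	}
	n = calloc(1, sizeof(*n));	/* fault not seen yet: leave a marker */
	if (!n)
		return;			/* the kernel busy-waits in this case */
	n->token = token;
	n->dummy = 1;
	n->next = buckets[bucket_of(token)];
	buckets[bucket_of(token)] = n;
	printf("token %u: queued dummy wakeup\n", token);
}

/* PAGE_NOT_PRESENT fault: return at once if the wakeup was already delivered. */
static void task_wait(unsigned int token)
{
	struct node **p = find(token), *n;

	if (p && (*p)->dummy) {
		n = *p;
		*p = n->next;
		free(n);
		printf("token %u: wakeup already delivered, no sleep\n", token);
		return;
	}
	printf("token %u: would register and sleep here\n", token);
}

int main(void)
{
	task_wake(7);	/* wakeup races ahead of the fault */
	task_wait(7);	/* fault finds the dummy and returns */
	task_wait(9);	/* normal order: fault would sleep */
	return 0;
}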
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 125 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 126};
127 127
128static int kvm_register_clock(char *txt) 128int kvm_register_clock(char *txt)
129{ 129{
130 int cpu = smp_processor_id(); 130 int cpu = smp_processor_id();
131 int low, high, ret; 131 int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
152} 152}
153#endif 153#endif
154 154
155#ifdef CONFIG_SMP
156static void __init kvm_smp_prepare_boot_cpu(void)
157{
158 WARN_ON(kvm_register_clock("primary cpu clock"));
159 native_smp_prepare_boot_cpu();
160}
161#endif
162
163/* 155/*
164 * After the clock is registered, the host will keep writing to the 156 * After the clock is registered, the host will keep writing to the
165 * registered memory location. If the guest happens to shutdown, this memory 157 * registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
206 x86_cpuinit.setup_percpu_clockev = 198 x86_cpuinit.setup_percpu_clockev =
207 kvm_setup_secondary_clock; 199 kvm_setup_secondary_clock;
208#endif 200#endif
209#ifdef CONFIG_SMP
210 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
211#endif
212 machine_ops.shutdown = kvm_shutdown; 201 machine_ops.shutdown = kvm_shutdown;
213#ifdef CONFIG_KEXEC 202#ifdef CONFIG_KEXEC
214 machine_ops.crash_shutdown = kvm_crash_shutdown; 203 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
37 37
38void *module_alloc(unsigned long size) 38void *module_alloc(unsigned long size)
39{ 39{
40 struct vm_struct *area; 40 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL; 41 return NULL;
47 42 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); 43 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
49 if (!area) 44 -1, __builtin_return_address(0));
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
53 PAGE_KERNEL_EXEC);
54} 45}
55 46
56/* Free memory returned from module_alloc */ 47/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
421 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
422 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
423 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
424 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
425 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
426 429
427 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
428 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 09c08a1c706f..d8286ed54ffa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -22,11 +22,6 @@
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/debugreg.h> 23#include <asm/debugreg.h>
24 24
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 25struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 26EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 27
@@ -327,7 +322,7 @@ long sys_execve(const char __user *name,
327/* 322/*
328 * Idle related variables and functions 323 * Idle related variables and functions
329 */ 324 */
330unsigned long boot_option_idle_override = 0; 325unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
331EXPORT_SYMBOL(boot_option_idle_override); 326EXPORT_SYMBOL(boot_option_idle_override);
332 327
333/* 328/*
@@ -386,6 +381,8 @@ void default_idle(void)
386 else 381 else
387 local_irq_enable(); 382 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 383 current_thread_info()->status |= TS_POLLING;
384 trace_power_end(smp_processor_id());
385 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 386 } else {
390 local_irq_enable(); 387 local_irq_enable();
391 /* loop is done by the caller */ 388 /* loop is done by the caller */
@@ -443,8 +440,6 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 440 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 441void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 442{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 trace_cpu_idle((ax>>4)+1, smp_processor_id());
448 if (!need_resched()) { 443 if (!need_resched()) {
449 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 444 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
450 clflush((void *)&current_thread_info()->flags); 445 clflush((void *)&current_thread_info()->flags);
@@ -471,6 +466,8 @@ static void mwait_idle(void)
471 __sti_mwait(0, 0); 466 __sti_mwait(0, 0);
472 else 467 else
473 local_irq_enable(); 468 local_irq_enable();
469 trace_power_end(smp_processor_id());
470 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
474 } else 471 } else
475 local_irq_enable(); 472 local_irq_enable();
476} 473}
@@ -503,7 +500,6 @@ static void poll_idle(void)
503 * 500 *
504 * idle=mwait overrides this decision and forces the usage of mwait. 501 * idle=mwait overrides this decision and forces the usage of mwait.
505 */ 502 */
506static int __cpuinitdata force_mwait;
507 503
508#define MWAIT_INFO 0x05 504#define MWAIT_INFO 0x05
509#define MWAIT_ECX_EXTENDED_INFO 0x01 505#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -513,7 +509,7 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
513{ 509{
514 u32 eax, ebx, ecx, edx; 510 u32 eax, ebx, ecx, edx;
515 511
516 if (force_mwait) 512 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
517 return 1; 513 return 1;
518 514
519 if (c->cpuid_level < MWAIT_INFO) 515 if (c->cpuid_level < MWAIT_INFO)
@@ -633,9 +629,10 @@ static int __init idle_setup(char *str)
633 if (!strcmp(str, "poll")) { 629 if (!strcmp(str, "poll")) {
634 printk("using polling idle threads.\n"); 630 printk("using polling idle threads.\n");
635 pm_idle = poll_idle; 631 pm_idle = poll_idle;
636 } else if (!strcmp(str, "mwait")) 632 boot_option_idle_override = IDLE_POLL;
637 force_mwait = 1; 633 } else if (!strcmp(str, "mwait")) {
638 else if (!strcmp(str, "halt")) { 634 boot_option_idle_override = IDLE_FORCE_MWAIT;
635 } else if (!strcmp(str, "halt")) {
639 /* 636 /*
640 * When the boot option of idle=halt is added, halt is 637 * When the boot option of idle=halt is added, halt is
641 * forced to be used for CPU idle. In such case CPU C2/C3 638 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -644,8 +641,7 @@ static int __init idle_setup(char *str)
644 * the boot_option_idle_override. 641 * the boot_option_idle_override.
645 */ 642 */
646 pm_idle = default_idle; 643 pm_idle = default_idle;
647 idle_halt = 1; 644 boot_option_idle_override = IDLE_HALT;
648 return 0;
649 } else if (!strcmp(str, "nomwait")) { 645 } else if (!strcmp(str, "nomwait")) {
650 /* 646 /*
651 * If the boot option of "idle=nomwait" is added, 647 * If the boot option of "idle=nomwait" is added,
@@ -653,12 +649,10 @@ static int __init idle_setup(char *str)
653 * states. In such case it won't touch the variable 649 * states. In such case it won't touch the variable
654 * of boot_option_idle_override. 650 * of boot_option_idle_override.
655 */ 651 */
656 idle_nomwait = 1; 652 boot_option_idle_override = IDLE_NOMWAIT;
657 return 0;
658 } else 653 } else
659 return -1; 654 return -1;
660 655
661 boot_option_idle_override = 1;
662 return 0; 656 return 0;
663} 657}
664early_param("idle", idle_setup); 658early_param("idle", idle_setup);
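Editor's note: the process.c hunks above replace the separate idle_halt/idle_nomwait/force_mwait flags with a single boot_option_idle_override value (IDLE_NO_OVERRIDE, IDLE_POLL, IDLE_FORCE_MWAIT, IDLE_HALT, IDLE_NOMWAIT, as used in the hunks). A tiny standalone sketch of parsing "idle=" into one such value (the enum below is a stand-in for the real definitions in <asm/processor.h>, whose exact values are not shown in this diff):

#include <stdio.h>
#include <string.h>

/* Stand-in for the override values this series introduces. */
enum idle_boot_override {
	IDLE_NO_OVERRIDE = 0,
	IDLE_POLL,
	IDLE_FORCE_MWAIT,
	IDLE_HALT,
	IDLE_NOMWAIT,
};

static enum idle_boot_override parse_idle(const char *str)
{
	if (!strcmp(str, "poll"))
		return IDLE_POLL;
	if (!strcmp(str, "mwait"))
		return IDLE_FORCE_MWAIT;
	if (!strcmp(str, "halt"))
		return IDLE_HALT;
	if (!strcmp(str, "nomwait"))
		return IDLE_NOMWAIT;
	return IDLE_NO_OVERRIDE;	/* unrecognized: keep the default */
}

int main(void)
{
	printf("idle=halt -> %d\n", parse_idle("halt"));
	return 0;
}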
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4b9befa0e347..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116 trace_power_end(smp_processor_id());
117 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4c818a738396..bd387e8f73b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,10 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145 trace_cpu_idle(PWR_EVENT_EXIT,
146 smp_processor_id());
147
148 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
149 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
150 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8d4815..fc7aae1e2bc7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
18#include <asm/pci_x86.h> 18#include <asm/pci_x86.h>
19#include <asm/virtext.h> 19#include <asm/virtext.h>
20#include <asm/cpu.h> 20#include <asm/cpu.h>
21#include <asm/nmi.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23# include <linux/ctype.h> 24# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
747{ 748{
748 int cpu; 749 int cpu;
749 750
750 if (val != DIE_NMI_IPI) 751 if (val != DIE_NMI)
751 return NOTIFY_OK; 752 return NOTIFY_OK;
752 753
753 cpu = raw_smp_processor_id(); 754 cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
778 779
779static struct notifier_block crash_nmi_nb = { 780static struct notifier_block crash_nmi_nb = {
780 .notifier_call = crash_nmi_callback, 781 .notifier_call = crash_nmi_callback,
782 /* we want to be the first one called */
783 .priority = NMI_LOCAL_HIGH_PRIOR+1,
781}; 784};
782 785
783/* Halt all other CPUs, calling the specified function on each of them 786/* Halt all other CPUs, calling the specified function on each of them
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..6f39cab052d5 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -76,7 +76,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
76 CMOS_WRITE(real_seconds, RTC_SECONDS); 76 CMOS_WRITE(real_seconds, RTC_SECONDS);
77 CMOS_WRITE(real_minutes, RTC_MINUTES); 77 CMOS_WRITE(real_minutes, RTC_MINUTES);
78 } else { 78 } else {
79 printk(KERN_WARNING 79 printk_once(KERN_NOTICE
80 "set_rtc_mmss: can't update from %d to %d\n", 80 "set_rtc_mmss: can't update from %d to %d\n",
81 cmos_minutes, real_minutes); 81 cmos_minutes, real_minutes);
82 retval = -1; 82 retval = -1;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c7149c96d079..763df77343dd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
97 */ 97 */
98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); 98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99 99
100void cpu_hotplug_driver_lock() 100void cpu_hotplug_driver_lock(void)
101{ 101{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex); 102 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103} 103}
104 104
105void cpu_hotplug_driver_unlock() 105void cpu_hotplug_driver_unlock(void)
106{ 106{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex); 107 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108} 108}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 133 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 134 if (!pmd)
135 return -1; 135 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 136 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 137 if (!pte)
138 return -1; 138 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c76aaca5694d..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -84,6 +84,11 @@ EXPORT_SYMBOL_GPL(used_vectors);
84static int ignore_nmis; 84static int ignore_nmis;
85 85
86int unknown_nmi_panic; 86int unknown_nmi_panic;
87/*
 88 * Prevent the NMI reason port (0x61) from being accessed simultaneously;
 89 * this lock can only be used in the NMI handler.
90 */
91static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
87 92
88static inline void conditional_sti(struct pt_regs *regs) 93static inline void conditional_sti(struct pt_regs *regs)
89{ 94{
@@ -310,15 +315,15 @@ static int __init setup_unknown_nmi_panic(char *str)
310__setup("unknown_nmi_panic", setup_unknown_nmi_panic); 315__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
311 316
312static notrace __kprobes void 317static notrace __kprobes void
313mem_parity_error(unsigned char reason, struct pt_regs *regs) 318pci_serr_error(unsigned char reason, struct pt_regs *regs)
314{ 319{
315 printk(KERN_EMERG 320 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
316 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 321 reason, smp_processor_id());
317 reason, smp_processor_id());
318
319 printk(KERN_EMERG
320 "You have some hardware problem, likely on the PCI bus.\n");
321 322
323 /*
324 * On some machines, PCI SERR line is used to report memory
325 * errors. EDAC makes use of it.
326 */
322#if defined(CONFIG_EDAC) 327#if defined(CONFIG_EDAC)
323 if (edac_handler_set()) { 328 if (edac_handler_set()) {
324 edac_atomic_assert_error(); 329 edac_atomic_assert_error();
@@ -329,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
329 if (panic_on_unrecovered_nmi) 334 if (panic_on_unrecovered_nmi)
330 panic("NMI: Not continuing"); 335 panic("NMI: Not continuing");
331 336
332 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 337 pr_emerg("Dazed and confused, but trying to continue\n");
333 338
334 /* Clear and disable the memory parity error line. */ 339 /* Clear and disable the PCI SERR error line. */
335 reason = (reason & 0xf) | 4; 340 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
336 outb(reason, 0x61); 341 outb(reason, NMI_REASON_PORT);
337} 342}
338 343
339static notrace __kprobes void 344static notrace __kprobes void
@@ -341,15 +346,17 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
341{ 346{
342 unsigned long i; 347 unsigned long i;
343 348
344 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 349 pr_emerg(
350 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
351 reason, smp_processor_id());
345 show_registers(regs); 352 show_registers(regs);
346 353
347 if (panic_on_io_nmi) 354 if (panic_on_io_nmi)
348 panic("NMI IOCK error: Not continuing"); 355 panic("NMI IOCK error: Not continuing");
349 356
350 /* Re-enable the IOCK line, wait for a few seconds */ 357 /* Re-enable the IOCK line, wait for a few seconds */
351 reason = (reason & 0xf) | 8; 358 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
352 outb(reason, 0x61); 359 outb(reason, NMI_REASON_PORT);
353 360
354 i = 20000; 361 i = 20000;
355 while (--i) { 362 while (--i) {
@@ -357,8 +364,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
357 udelay(100); 364 udelay(100);
358 } 365 }
359 366
360 reason &= ~8; 367 reason &= ~NMI_REASON_CLEAR_IOCHK;
361 outb(reason, 0x61); 368 outb(reason, NMI_REASON_PORT);
362} 369}
363 370
364static notrace __kprobes void 371static notrace __kprobes void
@@ -377,57 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
377 return; 384 return;
378 } 385 }
379#endif 386#endif
380 printk(KERN_EMERG 387 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
381 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 388 reason, smp_processor_id());
382 reason, smp_processor_id());
383 389
384 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); 390 pr_emerg("Do you have a strange power saving mode enabled?\n");
385 if (unknown_nmi_panic || panic_on_unrecovered_nmi) 391 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
386 panic("NMI: Not continuing"); 392 panic("NMI: Not continuing");
387 393
388 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 394 pr_emerg("Dazed and confused, but trying to continue\n");
389} 395}
390 396
391static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 397static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
392{ 398{
393 unsigned char reason = 0; 399 unsigned char reason = 0;
394 int cpu;
395 400
396 cpu = smp_processor_id(); 401 /*
397 402 * CPU-specific NMI must be processed before non-CPU-specific
398 /* Only the BSP gets external NMIs from the system. */ 403 * NMI, otherwise we may lose it, because the CPU-specific
399 if (!cpu) 404 * NMI can not be detected/processed on other CPUs.
400 reason = get_nmi_reason(); 405 */
406 if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
407 return;
401 408
402 if (!(reason & 0xc0)) { 409 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
403 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 410 raw_spin_lock(&nmi_reason_lock);
404 == NOTIFY_STOP) 411 reason = get_nmi_reason();
405 return;
406 412
407#ifdef CONFIG_X86_LOCAL_APIC 413 if (reason & NMI_REASON_MASK) {
408 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 414 if (reason & NMI_REASON_SERR)
409 == NOTIFY_STOP) 415 pci_serr_error(reason, regs);
410 return; 416 else if (reason & NMI_REASON_IOCHK)
417 io_check_error(reason, regs);
418#ifdef CONFIG_X86_32
419 /*
420 * Reassert NMI in case it became active
421 * meanwhile as it's edge-triggered:
422 */
423 reassert_nmi();
411#endif 424#endif
412 unknown_nmi_error(reason, regs); 425 raw_spin_unlock(&nmi_reason_lock);
413
414 return; 426 return;
415 } 427 }
416 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 428 raw_spin_unlock(&nmi_reason_lock);
417 return;
418 429
419 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 430 unknown_nmi_error(reason, regs);
420 if (reason & 0x80)
421 mem_parity_error(reason, regs);
422 if (reason & 0x40)
423 io_check_error(reason, regs);
424#ifdef CONFIG_X86_32
425 /*
426 * Reassert NMI in case it became active meanwhile
427 * as it's edge-triggered:
428 */
429 reassert_nmi();
430#endif
431} 431}
432 432
433dotraplinkage notrace __kprobes void 433dotraplinkage notrace __kprobes void
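
The traps.c hunks above restructure default_do_nmi(). CPU-specific NMI sources are offered to the DIE_NMI notifier chain first because, as the new comment says, they cannot be detected or processed on another CPU; only then is the chipset's NMI reason port read, and that read is now serialized with nmi_reason_lock since the old BSP-only shortcut is gone and any CPU may reach it. The magic numbers 0x61, 0x80, 0x40, 4 and 8 become NMI_REASON_* constants (asm/mach_traps.h is touched in this same series, per the diffstat), mem_parity_error() is renamed to pci_serr_error(), and the printk(KERN_EMERG ...) calls become pr_emerg(). A compressed user-space model of the resulting control flow, with the notifier chain and the port access stubbed out (not the kernel code):

    #include <stdbool.h>
    #include <stdio.h>
    #include <pthread.h>

    #define NMI_REASON_SERR   0x80  /* PCI SERR, bit 7 of port 0x61 */
    #define NMI_REASON_IOCHK  0x40  /* IOCHK, bit 6 of port 0x61 */
    #define NMI_REASON_MASK   (NMI_REASON_SERR | NMI_REASON_IOCHK)

    static pthread_mutex_t nmi_reason_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Stand-ins for the notifier chain and the port 0x61 read. */
    static bool cpu_specific_nmi_handled(void) { return false; }
    static unsigned char read_nmi_reason(void) { return 0x80; }

    static void do_nmi_model(int cpu)
    {
        unsigned char reason;

        /* 1. CPU-specific sources first: only this CPU can see them. */
        if (cpu_specific_nmi_handled())
            return;

        /* 2. System sources: shared register, so serialize the access. */
        pthread_mutex_lock(&nmi_reason_lock);
        reason = read_nmi_reason();
        if (reason & NMI_REASON_MASK) {
            if (reason & NMI_REASON_SERR)
                printf("CPU %d: PCI SERR, reason %02x\n", cpu, reason);
            else if (reason & NMI_REASON_IOCHK)
                printf("CPU %d: IOCHK, reason %02x\n", cpu, reason);
            pthread_mutex_unlock(&nmi_reason_lock);
            return;
        }
        pthread_mutex_unlock(&nmi_reason_lock);

        /* 3. Nothing claimed it: unknown NMI. */
        printf("CPU %d: unknown NMI\n", cpu);
    }

    int main(void)
    {
        do_nmi_model(0);
        return 0;
    }
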
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 03d2ea82f35a..ffe5755caa8b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
465 465
466 /* hpet or pmtimer available ? */ 466 /* hpet or pmtimer available ? */
467 if (!hpet && !ref1 && !ref2) 467 if (ref1 == ref2)
468 continue; 468 continue;
469 469
470 /* Check, whether the sampling was disturbed by an SMI */ 470 /* Check, whether the sampling was disturbed by an SMI */
@@ -935,7 +935,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
935 tsc_stop = tsc_read_refs(&ref_stop, hpet); 935 tsc_stop = tsc_read_refs(&ref_stop, hpet);
936 936
937 /* hpet or pmtimer available ? */ 937 /* hpet or pmtimer available ? */
938 if (!hpet && !ref_start && !ref_stop) 938 if (ref_start == ref_stop)
939 goto out; 939 goto out;
940 940
941 /* Check, whether the sampling was disturbed by an SMI */ 941 /* Check, whether the sampling was disturbed by an SMI */
@@ -965,7 +965,7 @@ out:
965 965
966static int __init init_tsc_clocksource(void) 966static int __init init_tsc_clocksource(void)
967{ 967{
968 if (!cpu_has_tsc || tsc_disabled > 0) 968 if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
969 return 0; 969 return 0;
970 970
971 if (tsc_clocksource_reliable) 971 if (tsc_clocksource_reliable)
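
Two small robustness fixes in tsc.c above. The usability test for the reference timer becomes ref1 == ref2 (and ref_start == ref_stop in the refinement path): tsc_read_refs() yields identical values when neither HPET nor the ACPI PM timer is present, and also when a reference exists but did not advance between the two reads, so such a sample cannot be used for cross-checking and dividing by the zero delta must be avoided. Separately, init_tsc_clocksource() now also bails out when tsc_khz is zero, i.e. when no TSC frequency was established. The cross-check computation reduced to its skeleton (helper name and numbers invented for the sketch):

    #include <stdio.h>

    /*
     * Skeleton of the reference cross-check used during TSC calibration.
     * ref_hz is the known frequency of the reference timer (HPET or PM timer).
     */
    static unsigned long tsc_khz_from_ref(unsigned long long tsc1, unsigned long long tsc2,
                                          unsigned long long ref1, unsigned long long ref2,
                                          unsigned long long ref_hz)
    {
        if (ref1 == ref2)   /* reference absent or it did not tick: unusable */
            return 0;

        /* TSC cycles over the reference interval, scaled to kHz */
        return (unsigned long)((tsc2 - tsc1) * ref_hz / (ref2 - ref1) / 1000);
    }

    int main(void)
    {
        /* ~2.0 GHz TSC measured against a 14.318 MHz HPET over ~10 ms */
        printf("%lu kHz\n", tsc_khz_from_ref(0, 20000000, 0, 143182, 14318180));
        return 0;
    }
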
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
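
mark_screen_rdonly() walks down to individual PTEs for the 0xA0000 VGA window. With transparent huge pages that range may sit under a single huge PMD with no PTE level beneath it, so the new split_huge_page_pmd() call (effectively a no-op for a regular PMD) first breaks the mapping back into small pages and keeps the pte_offset_map_lock() walk below valid. The general "split before descending" shape on a toy two-level table, purely illustrative (error handling omitted):

    #include <stdio.h>
    #include <stdlib.h>

    #define ENTRIES 512

    /* Toy upper-level entry: either points to a table of small entries
     * or maps the whole block directly ("huge"). */
    struct upper {
        int huge;
        unsigned long huge_base;   /* used when huge */
        unsigned long *small;      /* used when !huge */
    };

    /* No-op unless the entry is huge; otherwise materialize the small level. */
    static void split_huge(struct upper *u)
    {
        int i;

        if (!u->huge)
            return;
        u->small = malloc(ENTRIES * sizeof(*u->small));
        for (i = 0; i < ENTRIES; i++)
            u->small[i] = u->huge_base + i;  /* same translation, finer grain */
        u->huge = 0;
    }

    static unsigned long lookup_small(struct upper *u, int idx)
    {
        split_huge(u);                       /* make the small level exist first */
        return u->small[idx];
    }

    int main(void)
    {
        struct upper u = { .huge = 1, .huge_base = 0xA0000 >> 12 };

        printf("%#lx\n", lookup_small(&u, 3));
        return 0;
    }
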
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select KVM_ASYNC_PF
31 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
32 select KVM_MMIO 33 select KVM_MMIO
33 ---help--- 34 ---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 assigned-dev.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
12 13
13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
14 i8254.o timer.o 15 i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
21 */ 21 */
22 22
23#ifndef __KERNEL__
24#include <stdio.h>
25#include <stdint.h>
26#include <public/xen.h>
27#define DPRINTF(_f, _a ...) printf(_f , ## _a)
28#else
29#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
30#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
31#define DPRINTF(x...) do {} while (0)
32#endif
33#include <linux/module.h> 25#include <linux/module.h>
34#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
35 27
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
418} 410}
419 411
420static inline unsigned long 412static inline unsigned long
421register_address(struct decode_cache *c, unsigned long base, unsigned long reg) 413register_address(struct decode_cache *c, unsigned long reg)
422{ 414{
423 return base + address_mask(c, reg); 415 return address_mask(c, reg);
424} 416}
425 417
426static inline void 418static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
452 return ops->get_cached_segment_base(seg, ctxt->vcpu); 444 return ops->get_cached_segment_base(seg, ctxt->vcpu);
453} 445}
454 446
455static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 447static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
456 struct x86_emulate_ops *ops, 448 struct x86_emulate_ops *ops,
457 struct decode_cache *c) 449 struct decode_cache *c)
458{ 450{
459 if (!c->has_seg_override) 451 if (!c->has_seg_override)
460 return 0; 452 return 0;
461 453
462 return seg_base(ctxt, ops, c->seg_override); 454 return c->seg_override;
463} 455}
464 456
465static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 457static ulong linear(struct x86_emulate_ctxt *ctxt,
466 struct x86_emulate_ops *ops) 458 struct segmented_address addr)
467{ 459{
468 return seg_base(ctxt, ops, VCPU_SREG_ES); 460 struct decode_cache *c = &ctxt->decode;
469} 461 ulong la;
470
471static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
472 struct x86_emulate_ops *ops)
473{
474 return seg_base(ctxt, ops, VCPU_SREG_SS);
475}
476 462
477static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 463 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
478 u32 error, bool valid) 464 if (c->ad_bytes != 8)
479{ 465 la &= (u32)-1;
480 ctxt->exception = vec; 466 return la;
481 ctxt->error_code = error;
482 ctxt->error_code_valid = valid;
483} 467}
484 468
485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 469static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
470 u32 error, bool valid)
486{ 471{
487 emulate_exception(ctxt, GP_VECTOR, err, true); 472 ctxt->exception.vector = vec;
473 ctxt->exception.error_code = error;
474 ctxt->exception.error_code_valid = valid;
475 return X86EMUL_PROPAGATE_FAULT;
488} 476}
489 477
490static void emulate_pf(struct x86_emulate_ctxt *ctxt) 478static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
491{ 479{
492 emulate_exception(ctxt, PF_VECTOR, 0, true); 480 return emulate_exception(ctxt, GP_VECTOR, err, true);
493} 481}
494 482
495static void emulate_ud(struct x86_emulate_ctxt *ctxt) 483static int emulate_ud(struct x86_emulate_ctxt *ctxt)
496{ 484{
497 emulate_exception(ctxt, UD_VECTOR, 0, false); 485 return emulate_exception(ctxt, UD_VECTOR, 0, false);
498} 486}
499 487
500static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 488static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
501{ 489{
502 emulate_exception(ctxt, TS_VECTOR, err, true); 490 return emulate_exception(ctxt, TS_VECTOR, err, true);
503} 491}
504 492
505static int emulate_de(struct x86_emulate_ctxt *ctxt) 493static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{ 494{
507 emulate_exception(ctxt, DE_VECTOR, 0, false); 495 return emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509} 496}
510 497
511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 498static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
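
The hunk above is the core of the emulator's addressing rework. The removed per-segment helpers (seg_override_base, es_base, ss_base) folded a segment base into the effective address at decode time; now an operand carries a segmented address, an {ea, seg} pair, and linear() applies the segment base, truncating to 32 bits unless the address size is 8 bytes, only when the access is made. A self-contained sketch of that translation (the segment-base table here is made up):

    #include <stdio.h>
    #include <stdint.h>

    enum seg { SEG_ES, SEG_CS, SEG_SS, SEG_DS, SEG_FS, SEG_GS, NR_SEG };

    struct segmented_address {
        uint64_t ea;    /* effective address within the segment */
        enum seg seg;   /* which segment register it is relative to */
    };

    /* Illustrative segment bases; real ones come from the guest's descriptors. */
    static const uint64_t seg_base[NR_SEG] = {
        [SEG_DS] = 0x10000, [SEG_SS] = 0x20000,
    };

    static uint64_t linear(struct segmented_address addr, int ad_bytes)
    {
        uint64_t la = seg_base[addr.seg] + addr.ea;

        if (ad_bytes != 8)          /* 16/32-bit address size wraps at 4G */
            la &= 0xffffffffu;
        return la;
    }

    int main(void)
    {
        struct segmented_address a = { .ea = 0x1234, .seg = SEG_DS };

        printf("%#llx\n", (unsigned long long)linear(a, 4));  /* 0x11234 */
        return 0;
    }
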
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
520 cur_size = fc->end - fc->start; 507 cur_size = fc->end - fc->start;
521 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 508 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
522 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 509 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
523 size, ctxt->vcpu, NULL); 510 size, ctxt->vcpu, &ctxt->exception);
524 if (rc != X86EMUL_CONTINUE) 511 if (rc != X86EMUL_CONTINUE)
525 return rc; 512 return rc;
526 fc->end += size; 513 fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
564 551
565static int read_descriptor(struct x86_emulate_ctxt *ctxt, 552static int read_descriptor(struct x86_emulate_ctxt *ctxt,
566 struct x86_emulate_ops *ops, 553 struct x86_emulate_ops *ops,
567 ulong addr, 554 struct segmented_address addr,
568 u16 *size, unsigned long *address, int op_bytes) 555 u16 *size, unsigned long *address, int op_bytes)
569{ 556{
570 int rc; 557 int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
572 if (op_bytes == 2) 559 if (op_bytes == 2)
573 op_bytes = 3; 560 op_bytes = 3;
574 *address = 0; 561 *address = 0;
575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); 562 rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
563 ctxt->vcpu, &ctxt->exception);
576 if (rc != X86EMUL_CONTINUE) 564 if (rc != X86EMUL_CONTINUE)
577 return rc; 565 return rc;
578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); 566 addr.ea += 2;
567 rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
568 ctxt->vcpu, &ctxt->exception);
579 return rc; 569 return rc;
580} 570}
581 571
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
768 break; 758 break;
769 } 759 }
770 } 760 }
771 op->addr.mem = modrm_ea; 761 op->addr.mem.ea = modrm_ea;
772done: 762done:
773 return rc; 763 return rc;
774} 764}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
783 op->type = OP_MEM; 773 op->type = OP_MEM;
784 switch (c->ad_bytes) { 774 switch (c->ad_bytes) {
785 case 2: 775 case 2:
786 op->addr.mem = insn_fetch(u16, 2, c->eip); 776 op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
787 break; 777 break;
788 case 4: 778 case 4:
789 op->addr.mem = insn_fetch(u32, 4, c->eip); 779 op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
790 break; 780 break;
791 case 8: 781 case 8:
792 op->addr.mem = insn_fetch(u64, 8, c->eip); 782 op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
793 break; 783 break;
794 } 784 }
795done: 785done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
808 else if (c->src.bytes == 4) 798 else if (c->src.bytes == 4)
809 sv = (s32)c->src.val & (s32)mask; 799 sv = (s32)c->src.val & (s32)mask;
810 800
811 c->dst.addr.mem += (sv >> 3); 801 c->dst.addr.mem.ea += (sv >> 3);
812 } 802 }
813 803
814 /* only subword offset */ 804 /* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
821{ 811{
822 int rc; 812 int rc;
823 struct read_cache *mc = &ctxt->decode.mem_read; 813 struct read_cache *mc = &ctxt->decode.mem_read;
824 u32 err;
825 814
826 while (size) { 815 while (size) {
827 int n = min(size, 8u); 816 int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
829 if (mc->pos < mc->end) 818 if (mc->pos < mc->end)
830 goto read_cached; 819 goto read_cached;
831 820
832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 821 rc = ops->read_emulated(addr, mc->data + mc->end, n,
833 ctxt->vcpu); 822 &ctxt->exception, ctxt->vcpu);
834 if (rc == X86EMUL_PROPAGATE_FAULT)
835 emulate_pf(ctxt);
836 if (rc != X86EMUL_CONTINUE) 823 if (rc != X86EMUL_CONTINUE)
837 return rc; 824 return rc;
838 mc->end += n; 825 mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
907 struct desc_ptr dt; 894 struct desc_ptr dt;
908 u16 index = selector >> 3; 895 u16 index = selector >> 3;
909 int ret; 896 int ret;
910 u32 err;
911 ulong addr; 897 ulong addr;
912 898
913 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 899 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
914 900
915 if (dt.size < index * 8 + 7) { 901 if (dt.size < index * 8 + 7)
916 emulate_gp(ctxt, selector & 0xfffc); 902 return emulate_gp(ctxt, selector & 0xfffc);
917 return X86EMUL_PROPAGATE_FAULT;
918 }
919 addr = dt.address + index * 8; 903 addr = dt.address + index * 8;
920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 904 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
921 if (ret == X86EMUL_PROPAGATE_FAULT) 905 &ctxt->exception);
922 emulate_pf(ctxt);
923 906
924 return ret; 907 return ret;
925} 908}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
931{ 914{
932 struct desc_ptr dt; 915 struct desc_ptr dt;
933 u16 index = selector >> 3; 916 u16 index = selector >> 3;
934 u32 err;
935 ulong addr; 917 ulong addr;
936 int ret; 918 int ret;
937 919
938 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 920 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
939 921
940 if (dt.size < index * 8 + 7) { 922 if (dt.size < index * 8 + 7)
941 emulate_gp(ctxt, selector & 0xfffc); 923 return emulate_gp(ctxt, selector & 0xfffc);
942 return X86EMUL_PROPAGATE_FAULT;
943 }
944 924
945 addr = dt.address + index * 8; 925 addr = dt.address + index * 8;
946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 926 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
947 if (ret == X86EMUL_PROPAGATE_FAULT) 927 &ctxt->exception);
948 emulate_pf(ctxt);
949 928
950 return ret; 929 return ret;
951} 930}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1092{ 1071{
1093 int rc; 1072 int rc;
1094 struct decode_cache *c = &ctxt->decode; 1073 struct decode_cache *c = &ctxt->decode;
1095 u32 err;
1096 1074
1097 switch (c->dst.type) { 1075 switch (c->dst.type) {
1098 case OP_REG: 1076 case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1101 case OP_MEM: 1079 case OP_MEM:
1102 if (c->lock_prefix) 1080 if (c->lock_prefix)
1103 rc = ops->cmpxchg_emulated( 1081 rc = ops->cmpxchg_emulated(
1104 c->dst.addr.mem, 1082 linear(ctxt, c->dst.addr.mem),
1105 &c->dst.orig_val, 1083 &c->dst.orig_val,
1106 &c->dst.val, 1084 &c->dst.val,
1107 c->dst.bytes, 1085 c->dst.bytes,
1108 &err, 1086 &ctxt->exception,
1109 ctxt->vcpu); 1087 ctxt->vcpu);
1110 else 1088 else
1111 rc = ops->write_emulated( 1089 rc = ops->write_emulated(
1112 c->dst.addr.mem, 1090 linear(ctxt, c->dst.addr.mem),
1113 &c->dst.val, 1091 &c->dst.val,
1114 c->dst.bytes, 1092 c->dst.bytes,
1115 &err, 1093 &ctxt->exception,
1116 ctxt->vcpu); 1094 ctxt->vcpu);
1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1118 emulate_pf(ctxt);
1119 if (rc != X86EMUL_CONTINUE) 1095 if (rc != X86EMUL_CONTINUE)
1120 return rc; 1096 return rc;
1121 break; 1097 break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1137 c->dst.bytes = c->op_bytes; 1113 c->dst.bytes = c->op_bytes;
1138 c->dst.val = c->src.val; 1114 c->dst.val = c->src.val;
1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), 1116 c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1141 c->regs[VCPU_REGS_RSP]); 1117 c->dst.addr.mem.seg = VCPU_SREG_SS;
1142} 1118}
1143 1119
1144static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1120static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1147{ 1123{
1148 struct decode_cache *c = &ctxt->decode; 1124 struct decode_cache *c = &ctxt->decode;
1149 int rc; 1125 int rc;
1126 struct segmented_address addr;
1150 1127
1151 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1128 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1152 c->regs[VCPU_REGS_RSP]), 1129 addr.seg = VCPU_SREG_SS;
1153 dest, len); 1130 rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
1154 if (rc != X86EMUL_CONTINUE) 1131 if (rc != X86EMUL_CONTINUE)
1155 return rc; 1132 return rc;
1156 1133
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1184 change_mask |= EFLG_IF; 1161 change_mask |= EFLG_IF;
1185 break; 1162 break;
1186 case X86EMUL_MODE_VM86: 1163 case X86EMUL_MODE_VM86:
1187 if (iopl < 3) { 1164 if (iopl < 3)
1188 emulate_gp(ctxt, 0); 1165 return emulate_gp(ctxt, 0);
1189 return X86EMUL_PROPAGATE_FAULT;
1190 }
1191 change_mask |= EFLG_IF; 1166 change_mask |= EFLG_IF;
1192 break; 1167 break;
1193 default: /* real mode */ 1168 default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1198 *(unsigned long *)dest = 1173 *(unsigned long *)dest =
1199 (ctxt->eflags & ~change_mask) | (val & change_mask); 1174 (ctxt->eflags & ~change_mask) | (val & change_mask);
1200 1175
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1204 return rc; 1176 return rc;
1205} 1177}
1206 1178
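
Many of the surrounding hunks apply one mechanical transformation: emulate_exception() and its wrappers (emulate_gp, emulate_ud, emulate_ts, ...) now record the fault in ctxt->exception and return X86EMUL_PROPAGATE_FAULT, so a call site shrinks from a three-line raise/set-rc/return block to a single return emulate_gp(ctxt, 0). The pattern in isolation (names shortened, illustrative):

    #include <stdio.h>

    #define EMUL_CONTINUE         0
    #define EMUL_PROPAGATE_FAULT  1

    struct exception { int vector; unsigned error_code; int has_error_code; };
    struct ctxt { struct exception exception; };

    /* Record the fault once, and hand back the status the caller must return. */
    static int raise_fault(struct ctxt *c, int vector, unsigned err, int has_err)
    {
        c->exception.vector = vector;
        c->exception.error_code = err;
        c->exception.has_error_code = has_err;
        return EMUL_PROPAGATE_FAULT;
    }

    static int raise_gp(struct ctxt *c, unsigned err) { return raise_fault(c, 13, err, 1); }

    /* A caller now propagates the fault in one line. */
    static int check_iopl(struct ctxt *c, int iopl)
    {
        if (iopl < 3)
            return raise_gp(c, 0);
        return EMUL_CONTINUE;
    }

    int main(void)
    {
        struct ctxt c = { 0 };

        printf("%d (vector %d)\n", check_iopl(&c, 0), c.exception.vector);
        return 0;
    }
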
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1287 gva_t cs_addr; 1259 gva_t cs_addr;
1288 gva_t eip_addr; 1260 gva_t eip_addr;
1289 u16 cs, eip; 1261 u16 cs, eip;
1290 u32 err;
1291 1262
1292 /* TODO: Add limit checks */ 1263 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags; 1264 c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1317 eip_addr = dt.address + (irq << 2); 1288 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2; 1289 cs_addr = dt.address + (irq << 2) + 2;
1319 1290
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); 1291 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
1321 if (rc != X86EMUL_CONTINUE) 1292 if (rc != X86EMUL_CONTINUE)
1322 return rc; 1293 return rc;
1323 1294
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); 1295 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
1325 if (rc != X86EMUL_CONTINUE) 1296 if (rc != X86EMUL_CONTINUE)
1326 return rc; 1297 return rc;
1327 1298
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1370 if (rc != X86EMUL_CONTINUE) 1341 if (rc != X86EMUL_CONTINUE)
1371 return rc; 1342 return rc;
1372 1343
1373 if (temp_eip & ~0xffff) { 1344 if (temp_eip & ~0xffff)
1374 emulate_gp(ctxt, 0); 1345 return emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377 1346
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1347 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379 1348
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1624 1593
1625 /* syscall is not available in real mode */ 1594 /* syscall is not available in real mode */
1626 if (ctxt->mode == X86EMUL_MODE_REAL || 1595 if (ctxt->mode == X86EMUL_MODE_REAL ||
1627 ctxt->mode == X86EMUL_MODE_VM86) { 1596 ctxt->mode == X86EMUL_MODE_VM86)
1628 emulate_ud(ctxt); 1597 return emulate_ud(ctxt);
1629 return X86EMUL_PROPAGATE_FAULT;
1630 }
1631 1598
1632 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1599 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1633 1600
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1678 u16 cs_sel, ss_sel; 1645 u16 cs_sel, ss_sel;
1679 1646
1680 /* inject #GP if in real mode */ 1647 /* inject #GP if in real mode */
1681 if (ctxt->mode == X86EMUL_MODE_REAL) { 1648 if (ctxt->mode == X86EMUL_MODE_REAL)
1682 emulate_gp(ctxt, 0); 1649 return emulate_gp(ctxt, 0);
1683 return X86EMUL_PROPAGATE_FAULT;
1684 }
1685 1650
1686 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1651 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1687 * Therefore, we inject an #UD. 1652 * Therefore, we inject an #UD.
1688 */ 1653 */
1689 if (ctxt->mode == X86EMUL_MODE_PROT64) { 1654 if (ctxt->mode == X86EMUL_MODE_PROT64)
1690 emulate_ud(ctxt); 1655 return emulate_ud(ctxt);
1691 return X86EMUL_PROPAGATE_FAULT;
1692 }
1693 1656
1694 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1657 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1695 1658
1696 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1659 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1697 switch (ctxt->mode) { 1660 switch (ctxt->mode) {
1698 case X86EMUL_MODE_PROT32: 1661 case X86EMUL_MODE_PROT32:
1699 if ((msr_data & 0xfffc) == 0x0) { 1662 if ((msr_data & 0xfffc) == 0x0)
1700 emulate_gp(ctxt, 0); 1663 return emulate_gp(ctxt, 0);
1701 return X86EMUL_PROPAGATE_FAULT;
1702 }
1703 break; 1664 break;
1704 case X86EMUL_MODE_PROT64: 1665 case X86EMUL_MODE_PROT64:
1705 if (msr_data == 0x0) { 1666 if (msr_data == 0x0)
1706 emulate_gp(ctxt, 0); 1667 return emulate_gp(ctxt, 0);
1707 return X86EMUL_PROPAGATE_FAULT;
1708 }
1709 break; 1668 break;
1710 } 1669 }
1711 1670
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1745 1704
1746 /* inject #GP if in real mode or Virtual 8086 mode */ 1705 /* inject #GP if in real mode or Virtual 8086 mode */
1747 if (ctxt->mode == X86EMUL_MODE_REAL || 1706 if (ctxt->mode == X86EMUL_MODE_REAL ||
1748 ctxt->mode == X86EMUL_MODE_VM86) { 1707 ctxt->mode == X86EMUL_MODE_VM86)
1749 emulate_gp(ctxt, 0); 1708 return emulate_gp(ctxt, 0);
1750 return X86EMUL_PROPAGATE_FAULT;
1751 }
1752 1709
1753 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1710 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1754 1711
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1763 switch (usermode) { 1720 switch (usermode) {
1764 case X86EMUL_MODE_PROT32: 1721 case X86EMUL_MODE_PROT32:
1765 cs_sel = (u16)(msr_data + 16); 1722 cs_sel = (u16)(msr_data + 16);
1766 if ((msr_data & 0xfffc) == 0x0) { 1723 if ((msr_data & 0xfffc) == 0x0)
1767 emulate_gp(ctxt, 0); 1724 return emulate_gp(ctxt, 0);
1768 return X86EMUL_PROPAGATE_FAULT;
1769 }
1770 ss_sel = (u16)(msr_data + 24); 1725 ss_sel = (u16)(msr_data + 24);
1771 break; 1726 break;
1772 case X86EMUL_MODE_PROT64: 1727 case X86EMUL_MODE_PROT64:
1773 cs_sel = (u16)(msr_data + 32); 1728 cs_sel = (u16)(msr_data + 32);
1774 if (msr_data == 0x0) { 1729 if (msr_data == 0x0)
1775 emulate_gp(ctxt, 0); 1730 return emulate_gp(ctxt, 0);
1776 return X86EMUL_PROPAGATE_FAULT;
1777 }
1778 ss_sel = cs_sel + 8; 1731 ss_sel = cs_sel + 8;
1779 cs.d = 0; 1732 cs.d = 0;
1780 cs.l = 1; 1733 cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1934{ 1887{
1935 struct tss_segment_16 tss_seg; 1888 struct tss_segment_16 tss_seg;
1936 int ret; 1889 int ret;
1937 u32 err, new_tss_base = get_desc_base(new_desc); 1890 u32 new_tss_base = get_desc_base(new_desc);
1938 1891
1939 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1892 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1940 &err); 1893 &ctxt->exception);
1941 if (ret == X86EMUL_PROPAGATE_FAULT) { 1894 if (ret != X86EMUL_CONTINUE)
1942 /* FIXME: need to provide precise fault address */ 1895 /* FIXME: need to provide precise fault address */
1943 emulate_pf(ctxt);
1944 return ret; 1896 return ret;
1945 }
1946 1897
1947 save_state_to_tss16(ctxt, ops, &tss_seg); 1898 save_state_to_tss16(ctxt, ops, &tss_seg);
1948 1899
1949 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1900 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1950 &err); 1901 &ctxt->exception);
1951 if (ret == X86EMUL_PROPAGATE_FAULT) { 1902 if (ret != X86EMUL_CONTINUE)
1952 /* FIXME: need to provide precise fault address */ 1903 /* FIXME: need to provide precise fault address */
1953 emulate_pf(ctxt);
1954 return ret; 1904 return ret;
1955 }
1956 1905
1957 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1906 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1958 &err); 1907 &ctxt->exception);
1959 if (ret == X86EMUL_PROPAGATE_FAULT) { 1908 if (ret != X86EMUL_CONTINUE)
1960 /* FIXME: need to provide precise fault address */ 1909 /* FIXME: need to provide precise fault address */
1961 emulate_pf(ctxt);
1962 return ret; 1910 return ret;
1963 }
1964 1911
1965 if (old_tss_sel != 0xffff) { 1912 if (old_tss_sel != 0xffff) {
1966 tss_seg.prev_task_link = old_tss_sel; 1913 tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1968 ret = ops->write_std(new_tss_base, 1915 ret = ops->write_std(new_tss_base,
1969 &tss_seg.prev_task_link, 1916 &tss_seg.prev_task_link,
1970 sizeof tss_seg.prev_task_link, 1917 sizeof tss_seg.prev_task_link,
1971 ctxt->vcpu, &err); 1918 ctxt->vcpu, &ctxt->exception);
1972 if (ret == X86EMUL_PROPAGATE_FAULT) { 1919 if (ret != X86EMUL_CONTINUE)
1973 /* FIXME: need to provide precise fault address */ 1920 /* FIXME: need to provide precise fault address */
1974 emulate_pf(ctxt);
1975 return ret; 1921 return ret;
1976 }
1977 } 1922 }
1978 1923
1979 return load_state_from_tss16(ctxt, ops, &tss_seg); 1924 return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2013 struct decode_cache *c = &ctxt->decode; 1958 struct decode_cache *c = &ctxt->decode;
2014 int ret; 1959 int ret;
2015 1960
2016 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 1961 if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
2017 emulate_gp(ctxt, 0); 1962 return emulate_gp(ctxt, 0);
2018 return X86EMUL_PROPAGATE_FAULT;
2019 }
2020 c->eip = tss->eip; 1963 c->eip = tss->eip;
2021 ctxt->eflags = tss->eflags | 2; 1964 ctxt->eflags = tss->eflags | 2;
2022 c->regs[VCPU_REGS_RAX] = tss->eax; 1965 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2076{ 2019{
2077 struct tss_segment_32 tss_seg; 2020 struct tss_segment_32 tss_seg;
2078 int ret; 2021 int ret;
2079 u32 err, new_tss_base = get_desc_base(new_desc); 2022 u32 new_tss_base = get_desc_base(new_desc);
2080 2023
2081 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2024 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2082 &err); 2025 &ctxt->exception);
2083 if (ret == X86EMUL_PROPAGATE_FAULT) { 2026 if (ret != X86EMUL_CONTINUE)
2084 /* FIXME: need to provide precise fault address */ 2027 /* FIXME: need to provide precise fault address */
2085 emulate_pf(ctxt);
2086 return ret; 2028 return ret;
2087 }
2088 2029
2089 save_state_to_tss32(ctxt, ops, &tss_seg); 2030 save_state_to_tss32(ctxt, ops, &tss_seg);
2090 2031
2091 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2032 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2092 &err); 2033 &ctxt->exception);
2093 if (ret == X86EMUL_PROPAGATE_FAULT) { 2034 if (ret != X86EMUL_CONTINUE)
2094 /* FIXME: need to provide precise fault address */ 2035 /* FIXME: need to provide precise fault address */
2095 emulate_pf(ctxt);
2096 return ret; 2036 return ret;
2097 }
2098 2037
2099 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2038 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2100 &err); 2039 &ctxt->exception);
2101 if (ret == X86EMUL_PROPAGATE_FAULT) { 2040 if (ret != X86EMUL_CONTINUE)
2102 /* FIXME: need to provide precise fault address */ 2041 /* FIXME: need to provide precise fault address */
2103 emulate_pf(ctxt);
2104 return ret; 2042 return ret;
2105 }
2106 2043
2107 if (old_tss_sel != 0xffff) { 2044 if (old_tss_sel != 0xffff) {
2108 tss_seg.prev_task_link = old_tss_sel; 2045 tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2110 ret = ops->write_std(new_tss_base, 2047 ret = ops->write_std(new_tss_base,
2111 &tss_seg.prev_task_link, 2048 &tss_seg.prev_task_link,
2112 sizeof tss_seg.prev_task_link, 2049 sizeof tss_seg.prev_task_link,
2113 ctxt->vcpu, &err); 2050 ctxt->vcpu, &ctxt->exception);
2114 if (ret == X86EMUL_PROPAGATE_FAULT) { 2051 if (ret != X86EMUL_CONTINUE)
2115 /* FIXME: need to provide precise fault address */ 2052 /* FIXME: need to provide precise fault address */
2116 emulate_pf(ctxt);
2117 return ret; 2053 return ret;
2118 }
2119 } 2054 }
2120 2055
2121 return load_state_from_tss32(ctxt, ops, &tss_seg); 2056 return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2146 2081
2147 if (reason != TASK_SWITCH_IRET) { 2082 if (reason != TASK_SWITCH_IRET) {
2148 if ((tss_selector & 3) > next_tss_desc.dpl || 2083 if ((tss_selector & 3) > next_tss_desc.dpl ||
2149 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2084 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
2150 emulate_gp(ctxt, 0); 2085 return emulate_gp(ctxt, 0);
2151 return X86EMUL_PROPAGATE_FAULT;
2152 }
2153 } 2086 }
2154 2087
2155 desc_limit = desc_limit_scaled(&next_tss_desc); 2088 desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2231 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2164 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2232} 2165}
2233 2166
2234static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2167static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2235 int reg, struct operand *op) 2168 int reg, struct operand *op)
2236{ 2169{
2237 struct decode_cache *c = &ctxt->decode; 2170 struct decode_cache *c = &ctxt->decode;
2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2171 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2239 2172
2240 register_address_increment(c, &c->regs[reg], df * op->bytes); 2173 register_address_increment(c, &c->regs[reg], df * op->bytes);
2241 op->addr.mem = register_address(c, base, c->regs[reg]); 2174 op->addr.mem.ea = register_address(c, c->regs[reg]);
2175 op->addr.mem.seg = seg;
2242} 2176}
2243 2177
2244static int em_push(struct x86_emulate_ctxt *ctxt) 2178static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2369 struct decode_cache *c = &ctxt->decode; 2303 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0; 2304 u64 tsc = 0;
2371 2305
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { 2306 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
2373 emulate_gp(ctxt, 0); 2307 return emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); 2308 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2309 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2310 c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2647 2579
2648 op->type = OP_IMM; 2580 op->type = OP_IMM;
2649 op->bytes = size; 2581 op->bytes = size;
2650 op->addr.mem = c->eip; 2582 op->addr.mem.ea = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */ 2583 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) { 2584 switch (op->bytes) {
2653 case 1: 2585 case 1:
@@ -2678,7 +2610,7 @@ done:
2678} 2610}
2679 2611
2680int 2612int
2681x86_decode_insn(struct x86_emulate_ctxt *ctxt) 2613x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2682{ 2614{
2683 struct x86_emulate_ops *ops = ctxt->ops; 2615 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode; 2616 struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2689 struct operand memop = { .type = OP_NONE }; 2621 struct operand memop = { .type = OP_NONE };
2690 2622
2691 c->eip = ctxt->eip; 2623 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip; 2624 c->fetch.start = c->eip;
2625 c->fetch.end = c->fetch.start + insn_len;
2626 if (insn_len > 0)
2627 memcpy(c->fetch.data, insn, insn_len);
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); 2628 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694 2629
2695 switch (mode) { 2630 switch (mode) {
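
x86_decode_insn() now receives the instruction bytes and their length and copies them straight into the fetch cache, so when the caller already has the bytes (for instance because they were captured when the guest exited) decode does not need to read guest memory again; do_fetch_insn_byte() only falls back to ops->fetch() for offsets beyond fetch.end. A sketch of such a pre-seeded fetch cache (the callback and the NOP-filled "guest memory" are stand-ins):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define FETCH_MAX 15    /* longest x86 instruction */

    struct fetch_cache {
        uint8_t data[FETCH_MAX];
        unsigned long start, end;   /* [start, end) is already cached */
    };

    /* Stand-in for ops->fetch(): pulls bytes from "guest memory". */
    static void slow_fetch(unsigned long addr, void *dst, unsigned len)
    {
        (void)addr;
        memset(dst, 0x90, len);     /* pretend the guest is full of NOPs */
    }

    static void decode_init(struct fetch_cache *fc, unsigned long rip,
                            const uint8_t *insn, int insn_len)
    {
        fc->start = rip;
        fc->end = rip + insn_len;
        if (insn_len > 0)
            memcpy(fc->data, insn, insn_len);   /* pre-seed the cache */
    }

    static uint8_t fetch_byte(struct fetch_cache *fc, unsigned long rip)
    {
        if (rip >= fc->end) {                   /* only then touch guest memory */
            slow_fetch(rip, fc->data + (fc->end - fc->start), 1);
            fc->end++;
        }
        return fc->data[rip - fc->start];
    }

    int main(void)
    {
        struct fetch_cache fc;
        const uint8_t insn[] = { 0x0f, 0x01 };

        decode_init(&fc, 0x1000, insn, sizeof(insn));
        printf("%02x %02x %02x\n", fetch_byte(&fc, 0x1000),
               fetch_byte(&fc, 0x1001), fetch_byte(&fc, 0x1002));
        return 0;
    }
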
@@ -2803,10 +2738,8 @@ done_prefixes:
2803 c->execute = opcode.u.execute; 2738 c->execute = opcode.u.execute;
2804 2739
2805 /* Unrecognised? */ 2740 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) { 2741 if (c->d == 0 || (c->d & Undefined))
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1; 2742 return -1;
2809 }
2810 2743
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8; 2745 c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
2831 if (!c->has_seg_override) 2764 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS); 2765 set_seg_override(c, VCPU_SREG_DS);
2833 2766
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) 2767 memop.addr.mem.seg = seg_override(ctxt, ops, c);
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836 2768
2837 if (memop.type == OP_MEM && c->ad_bytes != 8) 2769 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem; 2770 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
2839 2771
2840 if (memop.type == OP_MEM && c->rip_relative) 2772 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip; 2773 memop.addr.mem.ea += c->eip;
2842 2774
2843 /* 2775 /*
2844 * Decode and fetch the source operand: register, memory 2776 * Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
2890 case SrcSI: 2822 case SrcSI:
2891 c->src.type = OP_MEM; 2823 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2824 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem = 2825 c->src.addr.mem.ea =
2894 register_address(c, seg_override_base(ctxt, ops, c), 2826 register_address(c, c->regs[VCPU_REGS_RSI]);
2895 c->regs[VCPU_REGS_RSI]); 2827 c->src.addr.mem.seg = seg_override(ctxt, ops, c),
2896 c->src.val = 0; 2828 c->src.val = 0;
2897 break; 2829 break;
2898 case SrcImmFAddr: 2830 case SrcImmFAddr:
2899 c->src.type = OP_IMM; 2831 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip; 2832 c->src.addr.mem.ea = c->eip;
2901 c->src.bytes = c->op_bytes + 2; 2833 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 2834 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break; 2835 break;
@@ -2944,7 +2876,7 @@ done_prefixes:
2944 break; 2876 break;
2945 case DstImmUByte: 2877 case DstImmUByte:
2946 c->dst.type = OP_IMM; 2878 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip; 2879 c->dst.addr.mem.ea = c->eip;
2948 c->dst.bytes = 1; 2880 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip); 2881 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break; 2882 break;
@@ -2969,9 +2901,9 @@ done_prefixes:
2969 case DstDI: 2901 case DstDI:
2970 c->dst.type = OP_MEM; 2902 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2903 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem = 2904 c->dst.addr.mem.ea =
2973 register_address(c, es_base(ctxt, ops), 2905 register_address(c, c->regs[VCPU_REGS_RDI]);
2974 c->regs[VCPU_REGS_RDI]); 2906 c->dst.addr.mem.seg = VCPU_SREG_ES;
2975 c->dst.val = 0; 2907 c->dst.val = 0;
2976 break; 2908 break;
2977 case ImplicitOps: 2909 case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3020 ctxt->decode.mem_read.pos = 0; 2952 ctxt->decode.mem_read.pos = 0;
3021 2953
3022 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2954 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
3023 emulate_ud(ctxt); 2955 rc = emulate_ud(ctxt);
3024 goto done; 2956 goto done;
3025 } 2957 }
3026 2958
3027 /* LOCK prefix is allowed only with some instructions */ 2959 /* LOCK prefix is allowed only with some instructions */
3028 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2960 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
3029 emulate_ud(ctxt); 2961 rc = emulate_ud(ctxt);
3030 goto done; 2962 goto done;
3031 } 2963 }
3032 2964
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 2965 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt); 2966 rc = emulate_ud(ctxt);
3035 goto done; 2967 goto done;
3036 } 2968 }
3037 2969
3038 /* Privileged instruction can be executed only in CPL=0 */ 2970 /* Privileged instruction can be executed only in CPL=0 */
3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2971 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
3040 emulate_gp(ctxt, 0); 2972 rc = emulate_gp(ctxt, 0);
3041 goto done; 2973 goto done;
3042 } 2974 }
3043 2975
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3050 } 2982 }
3051 2983
3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 2984 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
3053 rc = read_emulated(ctxt, ops, c->src.addr.mem, 2985 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
3054 c->src.valptr, c->src.bytes); 2986 c->src.valptr, c->src.bytes);
3055 if (rc != X86EMUL_CONTINUE) 2987 if (rc != X86EMUL_CONTINUE)
3056 goto done; 2988 goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3058 } 2990 }
3059 2991
3060 if (c->src2.type == OP_MEM) { 2992 if (c->src2.type == OP_MEM) {
3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem, 2993 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
3062 &c->src2.val, c->src2.bytes); 2994 &c->src2.val, c->src2.bytes);
3063 if (rc != X86EMUL_CONTINUE) 2995 if (rc != X86EMUL_CONTINUE)
3064 goto done; 2996 goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3070 3002
3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3003 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3072 /* optimisation - avoid slow emulated read if Mov */ 3004 /* optimisation - avoid slow emulated read if Mov */
3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem, 3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
3074 &c->dst.val, c->dst.bytes); 3006 &c->dst.val, c->dst.bytes);
3075 if (rc != X86EMUL_CONTINUE) 3007 if (rc != X86EMUL_CONTINUE)
3076 goto done; 3008 goto done;
@@ -3215,13 +3147,13 @@ special_insn:
3215 break; 3147 break;
3216 case 0x8c: /* mov r/m, sreg */ 3148 case 0x8c: /* mov r/m, sreg */
3217 if (c->modrm_reg > VCPU_SREG_GS) { 3149 if (c->modrm_reg > VCPU_SREG_GS) {
3218 emulate_ud(ctxt); 3150 rc = emulate_ud(ctxt);
3219 goto done; 3151 goto done;
3220 } 3152 }
3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3153 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
3222 break; 3154 break;
3223 case 0x8d: /* lea r16/r32, m */ 3155 case 0x8d: /* lea r16/r32, m */
3224 c->dst.val = c->src.addr.mem; 3156 c->dst.val = c->src.addr.mem.ea;
3225 break; 3157 break;
3226 case 0x8e: { /* mov seg, r/m16 */ 3158 case 0x8e: { /* mov seg, r/m16 */
3227 uint16_t sel; 3159 uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
3230 3162
3231 if (c->modrm_reg == VCPU_SREG_CS || 3163 if (c->modrm_reg == VCPU_SREG_CS ||
3232 c->modrm_reg > VCPU_SREG_GS) { 3164 c->modrm_reg > VCPU_SREG_GS) {
3233 emulate_ud(ctxt); 3165 rc = emulate_ud(ctxt);
3234 goto done; 3166 goto done;
3235 } 3167 }
3236 3168
@@ -3268,7 +3200,6 @@ special_insn:
3268 break; 3200 break;
3269 case 0xa6 ... 0xa7: /* cmps */ 3201 case 0xa6 ... 0xa7: /* cmps */
3270 c->dst.type = OP_NONE; /* Disable writeback. */ 3202 c->dst.type = OP_NONE; /* Disable writeback. */
3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
3272 goto cmp; 3203 goto cmp;
3273 case 0xa8 ... 0xa9: /* test ax, imm */ 3204 case 0xa8 ... 0xa9: /* test ax, imm */
3274 goto test; 3205 goto test;
@@ -3363,7 +3294,7 @@ special_insn:
3363 do_io_in: 3294 do_io_in:
3364 c->dst.bytes = min(c->dst.bytes, 4u); 3295 c->dst.bytes = min(c->dst.bytes, 4u);
3365 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3296 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3366 emulate_gp(ctxt, 0); 3297 rc = emulate_gp(ctxt, 0);
3367 goto done; 3298 goto done;
3368 } 3299 }
3369 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3300 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
3377 c->src.bytes = min(c->src.bytes, 4u); 3308 c->src.bytes = min(c->src.bytes, 4u);
3378 if (!emulator_io_permited(ctxt, ops, c->dst.val, 3309 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) { 3310 c->src.bytes)) {
3380 emulate_gp(ctxt, 0); 3311 rc = emulate_gp(ctxt, 0);
3381 goto done; 3312 goto done;
3382 } 3313 }
3383 ops->pio_out_emulated(c->src.bytes, c->dst.val, 3314 ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
3402 break; 3333 break;
3403 case 0xfa: /* cli */ 3334 case 0xfa: /* cli */
3404 if (emulator_bad_iopl(ctxt, ops)) { 3335 if (emulator_bad_iopl(ctxt, ops)) {
3405 emulate_gp(ctxt, 0); 3336 rc = emulate_gp(ctxt, 0);
3406 goto done; 3337 goto done;
3407 } else 3338 } else
3408 ctxt->eflags &= ~X86_EFLAGS_IF; 3339 ctxt->eflags &= ~X86_EFLAGS_IF;
3409 break; 3340 break;
3410 case 0xfb: /* sti */ 3341 case 0xfb: /* sti */
3411 if (emulator_bad_iopl(ctxt, ops)) { 3342 if (emulator_bad_iopl(ctxt, ops)) {
3412 emulate_gp(ctxt, 0); 3343 rc = emulate_gp(ctxt, 0);
3413 goto done; 3344 goto done;
3414 } else { 3345 } else {
3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3346 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
3449 c->dst.type = saved_dst_type; 3380 c->dst.type = saved_dst_type;
3450 3381
3451 if ((c->d & SrcMask) == SrcSI) 3382 if ((c->d & SrcMask) == SrcSI)
3452 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 3383 string_addr_inc(ctxt, seg_override(ctxt, ops, c),
3453 VCPU_REGS_RSI, &c->src); 3384 VCPU_REGS_RSI, &c->src);
3454 3385
3455 if ((c->d & DstMask) == DstDI) 3386 if ((c->d & DstMask) == DstDI)
3456 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 3387 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3457 &c->dst); 3388 &c->dst);
3458 3389
3459 if (c->rep_prefix && (c->d & String)) { 3390 if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
3482 ctxt->eip = c->eip; 3413 ctxt->eip = c->eip;
3483 3414
3484done: 3415done:
3416 if (rc == X86EMUL_PROPAGATE_FAULT)
3417 ctxt->have_exception = true;
3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3418 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3486 3419
3487twobyte_insn: 3420twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
3544 break; 3477 break;
3545 case 5: /* not defined */ 3478 case 5: /* not defined */
3546 emulate_ud(ctxt); 3479 emulate_ud(ctxt);
3480 rc = X86EMUL_PROPAGATE_FAULT;
3547 goto done; 3481 goto done;
3548 case 7: /* invlpg*/ 3482 case 7: /* invlpg*/
3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem); 3483 emulate_invlpg(ctxt->vcpu,
3484 linear(ctxt, c->src.addr.mem));
3550 /* Disable writeback. */ 3485 /* Disable writeback. */
3551 c->dst.type = OP_NONE; 3486 c->dst.type = OP_NONE;
3552 break; 3487 break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
3573 case 5 ... 7: 3508 case 5 ... 7:
3574 case 9 ... 15: 3509 case 9 ... 15:
3575 emulate_ud(ctxt); 3510 emulate_ud(ctxt);
3511 rc = X86EMUL_PROPAGATE_FAULT;
3576 goto done; 3512 goto done;
3577 } 3513 }
3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3514 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3517 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3582 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3518 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3583 emulate_ud(ctxt); 3519 emulate_ud(ctxt);
3520 rc = X86EMUL_PROPAGATE_FAULT;
3584 goto done; 3521 goto done;
3585 } 3522 }
3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); 3523 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
3588 case 0x22: /* mov reg, cr */ 3525 case 0x22: /* mov reg, cr */
3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 3526 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3590 emulate_gp(ctxt, 0); 3527 emulate_gp(ctxt, 0);
3528 rc = X86EMUL_PROPAGATE_FAULT;
3591 goto done; 3529 goto done;
3592 } 3530 }
3593 c->dst.type = OP_NONE; 3531 c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
3596 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3534 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3597 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3535 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3598 emulate_ud(ctxt); 3536 emulate_ud(ctxt);
3537 rc = X86EMUL_PROPAGATE_FAULT;
3599 goto done; 3538 goto done;
3600 } 3539 }
3601 3540
@@ -3604,6 +3543,7 @@ twobyte_insn:
3604 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3543 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3605 /* #UD condition is already handled by the code above */ 3544 /* #UD condition is already handled by the code above */
3606 emulate_gp(ctxt, 0); 3545 emulate_gp(ctxt, 0);
3546 rc = X86EMUL_PROPAGATE_FAULT;
3607 goto done; 3547 goto done;
3608 } 3548 }
3609 3549
@@ -3615,6 +3555,7 @@ twobyte_insn:
3615 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3555 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3616 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3556 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3617 emulate_gp(ctxt, 0); 3557 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT;
3618 goto done; 3559 goto done;
3619 } 3560 }
3620 rc = X86EMUL_CONTINUE; 3561 rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
3623 /* rdmsr */ 3564 /* rdmsr */
3624 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3565 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3625 emulate_gp(ctxt, 0); 3566 emulate_gp(ctxt, 0);
3567 rc = X86EMUL_PROPAGATE_FAULT;
3626 goto done; 3568 goto done;
3627 } else { 3569 } else {
3628 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3570 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
3785 goto writeback; 3727 goto writeback;
3786 3728
3787cannot_emulate: 3729cannot_emulate:
3788 DPRINTF("Cannot emulate %02x\n", c->b);
3789 return -1; 3730 return -1;
3790} 3731}
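
Taken together, the emulate.c changes funnel fault reporting through ctxt->exception: the read/write/fetch callbacks fill that caller-owned structure instead of a bare u32 error code, the local u32 err temporaries disappear, and the single done: exit sets ctxt->have_exception when rc is X86EMUL_PROPAGATE_FAULT, so the caller can inject the exception once rather than each call site invoking emulate_pf() itself. A minimal picture of a callback signature built around a caller-owned fault record (types are illustrative):

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define EMUL_CONTINUE        0
    #define EMUL_PROPAGATE_FAULT 1

    struct x86_exception_model { int vector; uint32_t error_code; bool valid; };

    /* The callback reports *what* faulted by filling the caller's record. */
    typedef int (*read_fn)(uint64_t addr, void *dst, unsigned len,
                           struct x86_exception_model *fault);

    static int faulting_read(uint64_t addr, void *dst, unsigned len,
                             struct x86_exception_model *fault)
    {
        (void)addr; (void)dst; (void)len;
        fault->vector = 14;         /* #PF */
        fault->error_code = 0;
        fault->valid = true;
        return EMUL_PROPAGATE_FAULT;
    }

    static int emulate_one(read_fn read, struct x86_exception_model *exc,
                           bool *have_exception)
    {
        uint8_t byte;
        int rc = read(0x1000, &byte, 1, exc);

        /* ... further steps would simply propagate rc ... */

        if (rc == EMUL_PROPAGATE_FAULT)   /* flag it once, at the single exit */
            *have_exception = true;
        return rc;
    }

    int main(void)
    {
        struct x86_exception_model exc = { 0 };
        bool have_exception = false;

        emulate_one(faulting_read, &exc, &have_exception);
        printf("exception=%d vector=%d\n", have_exception, exc.vector);
        return 0;
    }
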
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
73 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
74} 74}
75 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
76static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
77{ 84{
78 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
84 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
85} 92}
86 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
87#endif 109#endif
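
kvm_read_cr3() follows the lazy-cache convention already used in this header: a per-register availability bit in regs_avail says whether vcpu->arch.cr3 is current, and when it is not, the decache_cr3 callback refreshes it before the value is returned. The helpers below it wrap the HF_GUEST_MASK bit of hflags so nested-virtualization code can write is_guest_mode(vcpu) instead of open-coding the mask. Both idioms in a compact user-space form (field names and flag values invented for the sketch):

    #include <stdio.h>
    #include <stdint.h>

    #define REG_CR3_AVAIL  (1u << 0)
    #define HF_GUEST_FLAG  (1u << 4)

    struct vcpu_model {
        uint32_t regs_avail;    /* which cached values are current */
        uint32_t hflags;
        uint64_t cr3;           /* cached copy */
    };

    /* Stand-in for the decache callback: refresh the cache from "hardware". */
    static void decache_cr3(struct vcpu_model *v)
    {
        v->cr3 = 0x1234000;
        v->regs_avail |= REG_CR3_AVAIL;
    }

    static uint64_t read_cr3(struct vcpu_model *v)
    {
        if (!(v->regs_avail & REG_CR3_AVAIL))
            decache_cr3(v);     /* only hit the slow path when stale */
        return v->cr3;
    }

    static void enter_guest_mode(struct vcpu_model *v) { v->hflags |= HF_GUEST_FLAG; }
    static void leave_guest_mode(struct vcpu_model *v) { v->hflags &= ~HF_GUEST_FLAG; }
    static int  is_guest_mode(struct vcpu_model *v)    { return v->hflags & HF_GUEST_FLAG; }

    int main(void)
    {
        struct vcpu_model v = { 0 };

        enter_guest_mode(&v);
        printf("cr3=%#llx guest=%d\n",
               (unsigned long long)read_cr3(&v), !!is_guest_mode(&v));
        leave_guest_mode(&v);
        return 0;
    }
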
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
277 277
278 if (old_ppr != ppr) { 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr); 279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 } 282 }
282} 283}
283 284
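
The lapic.c change requests KVM_REQ_EVENT only when the processor priority register went down. Lowering PPR can unmask an interrupt that was previously blocked, so pending interrupts must be re-evaluated; a PPR that stays equal or rises cannot make anything newly deliverable, and skipping the request avoids needless re-evaluation work. In sketch form (a toy update helper, not the KVM code):

    #include <stdio.h>

    /* Re-check pending interrupts only when the priority threshold drops. */
    static int update_ppr(unsigned *ppr, unsigned new_ppr)
    {
        unsigned old = *ppr;

        *ppr = new_ppr;
        return new_ppr < old;   /* 1 = caller should request re-evaluation */
    }

    int main(void)
    {
        unsigned ppr = 0x80;

        printf("%d %d\n", update_ppr(&ppr, 0x90), update_ppr(&ppr, 0x40));
        return 0;
    }
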
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04aee8301..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
194 196
195static u64 __read_mostly shadow_trap_nonpresent_pte; 197static u64 __read_mostly shadow_trap_nonpresent_pte;
196static u64 __read_mostly shadow_notrap_nonpresent_pte; 198static u64 __read_mostly shadow_notrap_nonpresent_pte;
197static u64 __read_mostly shadow_base_present_pte;
198static u64 __read_mostly shadow_nx_mask; 199static u64 __read_mostly shadow_nx_mask;
199static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 200static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
200static u64 __read_mostly shadow_user_mask; 201static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
213} 214}
214EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 215EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
215 216
216void kvm_mmu_set_base_ptes(u64 base_pte)
217{
218 shadow_base_present_pte = base_pte;
219}
220EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
221
222void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 217void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223 u64 dirty_mask, u64 nx_mask, u64 x_mask) 218 u64 dirty_mask, u64 nx_mask, u64 x_mask)
224{ 219{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
482} 477}
483 478
484/* 479/*
485 * Return the pointer to the largepage write count for a given 480 * Return the pointer to the large page information for a given gfn,
486 * gfn, handling slots that are not large page aligned. 481 * handling slots that are not large page aligned.
487 */ 482 */
488static int *slot_largepage_idx(gfn_t gfn, 483static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
489 struct kvm_memory_slot *slot, 484 struct kvm_memory_slot *slot,
490 int level) 485 int level)
491{ 486{
492 unsigned long idx; 487 unsigned long idx;
493 488
494 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 489 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
495 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 490 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
496 return &slot->lpage_info[level - 2][idx].write_count; 491 return &slot->lpage_info[level - 2][idx];
497} 492}
498 493
499static void account_shadowed(struct kvm *kvm, gfn_t gfn) 494static void account_shadowed(struct kvm *kvm, gfn_t gfn)
500{ 495{
501 struct kvm_memory_slot *slot; 496 struct kvm_memory_slot *slot;
502 int *write_count; 497 struct kvm_lpage_info *linfo;
503 int i; 498 int i;
504 499
505 slot = gfn_to_memslot(kvm, gfn); 500 slot = gfn_to_memslot(kvm, gfn);
506 for (i = PT_DIRECTORY_LEVEL; 501 for (i = PT_DIRECTORY_LEVEL;
507 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 502 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
508 write_count = slot_largepage_idx(gfn, slot, i); 503 linfo = lpage_info_slot(gfn, slot, i);
509 *write_count += 1; 504 linfo->write_count += 1;
510 } 505 }
511} 506}
512 507
513static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 508static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
514{ 509{
515 struct kvm_memory_slot *slot; 510 struct kvm_memory_slot *slot;
516 int *write_count; 511 struct kvm_lpage_info *linfo;
517 int i; 512 int i;
518 513
519 slot = gfn_to_memslot(kvm, gfn); 514 slot = gfn_to_memslot(kvm, gfn);
520 for (i = PT_DIRECTORY_LEVEL; 515 for (i = PT_DIRECTORY_LEVEL;
521 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 516 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
522 write_count = slot_largepage_idx(gfn, slot, i); 517 linfo = lpage_info_slot(gfn, slot, i);
523 *write_count -= 1; 518 linfo->write_count -= 1;
524 WARN_ON(*write_count < 0); 519 WARN_ON(linfo->write_count < 0);
525 } 520 }
526} 521}
527 522
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
530 int level) 525 int level)
531{ 526{
532 struct kvm_memory_slot *slot; 527 struct kvm_memory_slot *slot;
533 int *largepage_idx; 528 struct kvm_lpage_info *linfo;
534 529
535 slot = gfn_to_memslot(kvm, gfn); 530 slot = gfn_to_memslot(kvm, gfn);
536 if (slot) { 531 if (slot) {
537 largepage_idx = slot_largepage_idx(gfn, slot, level); 532 linfo = lpage_info_slot(gfn, slot, level);
538 return *largepage_idx; 533 return linfo->write_count;
539 } 534 }
540 535
541 return 1; 536 return 1;
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
559 return ret; 554 return ret;
560} 555}
561 556
562static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
563{ 558{
564 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
565 int host_level, level, max_level;
566
567 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
568 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
569 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
570 569
571 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
572 571
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
590static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 589static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
591{ 590{
592 struct kvm_memory_slot *slot; 591 struct kvm_memory_slot *slot;
593 unsigned long idx; 592 struct kvm_lpage_info *linfo;
594 593
595 slot = gfn_to_memslot(kvm, gfn); 594 slot = gfn_to_memslot(kvm, gfn);
596 if (likely(level == PT_PAGE_TABLE_LEVEL)) 595 if (likely(level == PT_PAGE_TABLE_LEVEL))
597 return &slot->rmap[gfn - slot->base_gfn]; 596 return &slot->rmap[gfn - slot->base_gfn];
598 597
599 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 598 linfo = lpage_info_slot(gfn, slot, level);
600 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
601 599
602 return &slot->lpage_info[level - 2][idx].rmap_pde; 600 return &linfo->rmap_pde;
603} 601}
604 602
605/* 603/*
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
887 end = start + (memslot->npages << PAGE_SHIFT); 885 end = start + (memslot->npages << PAGE_SHIFT);
888 if (hva >= start && hva < end) { 886 if (hva >= start && hva < end) {
889 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 887 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
888 gfn_t gfn = memslot->base_gfn + gfn_offset;
890 889
891 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 890 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
892 891
893 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 892 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
894 unsigned long idx; 893 struct kvm_lpage_info *linfo;
895 int sh; 894
896 895 linfo = lpage_info_slot(gfn, memslot,
897 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 896 PT_DIRECTORY_LEVEL + j);
898 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 897 ret |= handler(kvm, &linfo->rmap_pde, data);
899 (memslot->base_gfn >> sh);
900 ret |= handler(kvm,
901 &memslot->lpage_info[j][idx].rmap_pde,
902 data);
903 } 898 }
904 trace_kvm_age_page(hva, memslot, ret); 899 trace_kvm_age_page(hva, memslot, ret);
905 retval |= ret; 900 retval |= ret;
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
950 return young; 945 return young;
951} 946}
952 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
953#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
954 978
955static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
970 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
971} 995}
972 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
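
kvm_test_age_hva() gives the MMU notifier a read-only variant of the aging walk: unlike kvm_age_rmapp(), the new kvm_test_age_rmapp() reports the accessed bit without clearing it, and bails out entirely when the hardware provides no accessed bit. A toy contrast of the two behaviours, with an invented bit layout (TOY_ACCESSED is not the real mask, which comes from kvm_mmu_set_mask_ptes()):

#include <stdio.h>
#include <stdint.h>

#define TOY_ACCESSED	(1ULL << 5)	/* invented bit position */

/* kvm_age_rmapp-style: report the bit and clear it, so the page looks
 * "old" on the next pass. */
static int test_and_clear_young(uint64_t *spte)
{
	int young = !!(*spte & TOY_ACCESSED);

	*spte &= ~TOY_ACCESSED;
	return young;
}

/* kvm_test_age_rmapp-style: report only, leaving the bit for the real
 * aging pass. */
static int test_young(uint64_t spte)
{
	return !!(spte & TOY_ACCESSED);
}

int main(void)
{
	uint64_t spte = TOY_ACCESSED;

	printf("test=%d\n", test_young(spte));			/* 1, bit kept */
	printf("clear=%d\n", test_and_clear_young(&spte));	/* 1, bit gone */
	printf("test=%d\n", test_young(spte));			/* 0 */
	return 0;
}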
973#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
974static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
975{ 1004{
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1161} 1190}
1162 1191
1163static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1192static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1164 struct kvm_mmu_page *sp, bool clear_unsync) 1193 struct kvm_mmu_page *sp)
1165{ 1194{
1166 return 1; 1195 return 1;
1167} 1196}
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1291 if (clear_unsync) 1320 if (clear_unsync)
1292 kvm_unlink_unsync_page(vcpu->kvm, sp); 1321 kvm_unlink_unsync_page(vcpu->kvm, sp);
1293 1322
1294 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1323 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1295 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1324 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1296 return 1; 1325 return 1;
1297 } 1326 }
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1332 continue; 1361 continue;
1333 1362
1334 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1363 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1364 kvm_unlink_unsync_page(vcpu->kvm, s);
1335 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1365 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1336 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1366 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1337 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1367 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1338 continue; 1368 continue;
1339 } 1369 }
1340 kvm_unlink_unsync_page(vcpu->kvm, s);
1341 flush = true; 1370 flush = true;
1342 } 1371 }
1343 1372
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1963 unsigned pte_access, int user_fault, 1992 unsigned pte_access, int user_fault,
1964 int write_fault, int dirty, int level, 1993 int write_fault, int dirty, int level,
1965 gfn_t gfn, pfn_t pfn, bool speculative, 1994 gfn_t gfn, pfn_t pfn, bool speculative,
1966 bool can_unsync, bool reset_host_protection) 1995 bool can_unsync, bool host_writable)
1967{ 1996{
1968 u64 spte; 1997 u64 spte, entry = *sptep;
1969 int ret = 0; 1998 int ret = 0;
1970 1999
1971 /* 2000 /*
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 * whether the guest actually used the pte (in order to detect 2002 * whether the guest actually used the pte (in order to detect
1974 * demand paging). 2003 * demand paging).
1975 */ 2004 */
1976 spte = shadow_base_present_pte; 2005 spte = PT_PRESENT_MASK;
1977 if (!speculative) 2006 if (!speculative)
1978 spte |= shadow_accessed_mask; 2007 spte |= shadow_accessed_mask;
1979 if (!dirty) 2008 if (!dirty)
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1990 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2019 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1991 kvm_is_mmio_pfn(pfn)); 2020 kvm_is_mmio_pfn(pfn));
1992 2021
1993 if (reset_host_protection) 2022 if (host_writable)
1994 spte |= SPTE_HOST_WRITEABLE; 2023 spte |= SPTE_HOST_WRITEABLE;
2024 else
2025 pte_access &= ~ACC_WRITE_MASK;
1995 2026
1996 spte |= (u64)pfn << PAGE_SHIFT; 2027 spte |= (u64)pfn << PAGE_SHIFT;
1997 2028
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2036 2067
2037set_pte: 2068set_pte:
2038 update_spte(sptep, spte); 2069 update_spte(sptep, spte);
2070 /*
2071 * If we overwrite a writable spte with a read-only one we
2072 * should flush remote TLBs. Otherwise rmap_write_protect
2073 * will find a read-only spte, even though the writable spte
2074 * might be cached on a CPU's TLB.
2075 */
2076 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2077 kvm_flush_remote_tlbs(vcpu->kvm);
2039done: 2078done:
2040 return ret; 2079 return ret;
2041} 2080}
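
The check added after update_spte() exists because rmap_write_protect() trusts what it reads from the spte: if a writable spte is silently replaced by a read-only one, another CPU may still hold the writable translation in its TLB. A minimal sketch of the downgrade predicate, with an invented writable-bit position (not the real PT_WRITABLE_MASK):

#include <stdio.h>
#include <stdint.h>

#define TOY_WRITABLE	(1ULL << 1)	/* invented bit position */

static int is_writable(uint64_t pte)
{
	return !!(pte & TOY_WRITABLE);
}

/* Mirrors the condition added at the end of set_spte(): only a downgrade
 * from writable to read-only forces a remote TLB flush. */
static int needs_remote_flush(uint64_t old_spte, uint64_t new_spte)
{
	return is_writable(old_spte) && !is_writable(new_spte);
}

int main(void)
{
	printf("%d\n", needs_remote_flush(TOY_WRITABLE, 0));	/* 1 */
	printf("%d\n", needs_remote_flush(0, TOY_WRITABLE));	/* 0 */
	return 0;
}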
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 int user_fault, int write_fault, int dirty, 2084 int user_fault, int write_fault, int dirty,
2046 int *ptwrite, int level, gfn_t gfn, 2085 int *ptwrite, int level, gfn_t gfn,
2047 pfn_t pfn, bool speculative, 2086 pfn_t pfn, bool speculative,
2048 bool reset_host_protection) 2087 bool host_writable)
2049{ 2088{
2050 int was_rmapped = 0; 2089 int was_rmapped = 0;
2051 int rmap_count; 2090 int rmap_count;
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2080 2119
2081 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2120 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2082 dirty, level, gfn, pfn, speculative, true, 2121 dirty, level, gfn, pfn, speculative, true,
2083 reset_host_protection)) { 2122 host_writable)) {
2084 if (write_fault) 2123 if (write_fault)
2085 *ptwrite = 1; 2124 *ptwrite = 1;
2086 kvm_mmu_flush_tlb(vcpu); 2125 kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2211} 2250}
2212 2251
2213static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2252static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2214 int level, gfn_t gfn, pfn_t pfn) 2253 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2254 bool prefault)
2215{ 2255{
2216 struct kvm_shadow_walk_iterator iterator; 2256 struct kvm_shadow_walk_iterator iterator;
2217 struct kvm_mmu_page *sp; 2257 struct kvm_mmu_page *sp;
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2220 2260
2221 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2261 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2222 if (iterator.level == level) { 2262 if (iterator.level == level) {
2223 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2263 unsigned pte_access = ACC_ALL;
2264
2265 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2224 0, write, 1, &pt_write, 2266 0, write, 1, &pt_write,
2225 level, gfn, pfn, false, true); 2267 level, gfn, pfn, prefault, map_writable);
2226 direct_pte_prefetch(vcpu, iterator.sptep); 2268 direct_pte_prefetch(vcpu, iterator.sptep);
2227 ++vcpu->stat.pf_fixed; 2269 ++vcpu->stat.pf_fixed;
2228 break; 2270 break;
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2277 return 1; 2319 return 1;
2278} 2320}
2279 2321
2280static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this were a
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't start splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn from tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
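
Refcount juggling aside, the core of transparent_hugepage_adjust() is realigning gfn and pfn to the head of the 2MB region so a single large spte can map it. A standalone sketch of the mask arithmetic, assuming 512 4KB pages per huge page as on x86 (the values below are illustrative):

#include <stdio.h>

#define PAGES_PER_2M_HPAGE	512UL	/* 2MB / 4KB, as on x86 */

int main(void)
{
	unsigned long mask = PAGES_PER_2M_HPAGE - 1;
	unsigned long gfn = 0x1234, pfn = 0xa234;

	/* The VM_BUG_ON in the patch: guest and host offsets inside the
	 * huge page must agree before any adjustment makes sense. */
	if ((gfn & mask) != (pfn & mask))
		return 1;

	/* Round both down to the head of the 2MB region, as the patch does
	 * before mapping it with a single large spte. */
	gfn &= ~mask;
	pfn &= ~mask;
	printf("gfn=0x%lx pfn=0x%lx\n", gfn, pfn);	/* 0x1200 / 0xa200 */
	return 0;
}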
2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2366
2367static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2368 bool prefault)
2281{ 2369{
2282 int r; 2370 int r;
2283 int level; 2371 int level;
2372 int force_pt_level;
2284 pfn_t pfn; 2373 pfn_t pfn;
2285 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2375 bool map_writable;
2286 2376
2287 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2288 2378 if (likely(!force_pt_level)) {
2289 /* 2379 level = mapping_level(vcpu, gfn);
2290 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2291 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2292 */ 2382 * 2mb pages at maximum. Therefore check if the level
2293 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2294 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2295 2387
2296 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2297 2391
2298 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2299 smp_rmb(); 2393 smp_rmb();
2300 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2394
2395 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2396 return 0;
2301 2397
2302 /* mmio */ 2398 /* mmio */
2303 if (is_error_pfn(pfn)) 2399 if (is_error_pfn(pfn))
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2307 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2308 goto out_unlock; 2404 goto out_unlock;
2309 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2310 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2409 prefault);
2311 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
2312 2411
2313 2412
@@ -2530,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2530 hpa_t root = vcpu->arch.mmu.root_hpa; 2629 hpa_t root = vcpu->arch.mmu.root_hpa;
2531 sp = page_header(root); 2630 sp = page_header(root);
2532 mmu_sync_children(vcpu, sp); 2631 mmu_sync_children(vcpu, sp);
2632 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2533 return; 2633 return;
2534 } 2634 }
2535 for (i = 0; i < 4; ++i) { 2635 for (i = 0; i < 4; ++i) {
@@ -2552,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2552} 2652}
2553 2653
2554static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2654static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2555 u32 access, u32 *error) 2655 u32 access, struct x86_exception *exception)
2556{ 2656{
2557 if (error) 2657 if (exception)
2558 *error = 0; 2658 exception->error_code = 0;
2559 return vaddr; 2659 return vaddr;
2560} 2660}
2561 2661
2562static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 2662static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2563 u32 access, u32 *error) 2663 u32 access,
2664 struct x86_exception *exception)
2564{ 2665{
2565 if (error) 2666 if (exception)
2566 *error = 0; 2667 exception->error_code = 0;
2567 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2668 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2568} 2669}
2569 2670
2570static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2671static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2571 u32 error_code) 2672 u32 error_code, bool prefault)
2572{ 2673{
2573 gfn_t gfn; 2674 gfn_t gfn;
2574 int r; 2675 int r;
@@ -2584,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2584 gfn = gva >> PAGE_SHIFT; 2685 gfn = gva >> PAGE_SHIFT;
2585 2686
2586 return nonpaging_map(vcpu, gva & PAGE_MASK, 2687 return nonpaging_map(vcpu, gva & PAGE_MASK,
2587 error_code & PFERR_WRITE_MASK, gfn); 2688 error_code & PFERR_WRITE_MASK, gfn, prefault);
2689}
2690
2691static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2692{
2693 struct kvm_arch_async_pf arch;
2694
2695 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2696 arch.gfn = gfn;
2697 arch.direct_map = vcpu->arch.mmu.direct_map;
2698 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2699
2700 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2701}
2702
2703static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2704{
2705 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2706 kvm_event_needs_reinjection(vcpu)))
2707 return false;
2708
2709 return kvm_x86_ops->interrupt_allowed(vcpu);
2588} 2710}
2589 2711
2590static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2712static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2591 u32 error_code) 2713 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2714{
2715 bool async;
2716
2717 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2718
2719 if (!async)
2720 return false; /* *pfn has correct page already */
2721
2722 put_page(pfn_to_page(*pfn));
2723
2724 if (!prefault && can_do_async_pf(vcpu)) {
2725 trace_kvm_try_async_get_page(gva, gfn);
2726 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2727 trace_kvm_async_pf_doublefault(gva, gfn);
2728 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2729 return true;
2730 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2731 return true;
2732 }
2733
2734 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2735
2736 return false;
2737}
2738
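
try_async_pf() becomes the gatekeeper for every fault path: a non-blocking pfn lookup first, then an attempt to queue an async page fault so the vCPU can keep running, and only as a last resort a blocking fault-in. A compressed sketch of that decision flow; the helpers below (lookup_pfn_nonblocking, queue_async_fault, lookup_pfn_blocking) are invented stand-ins for gfn_to_pfn_async(), kvm_arch_setup_async_pf() and gfn_to_pfn_prot(), with bodies chosen only to make the sketch run:

#include <stdio.h>
#include <stdbool.h>

static void lookup_pfn_nonblocking(unsigned long gfn, bool *async)
{
	(void)gfn;
	*async = true;		/* pretend the page is not resident */
}

static bool queue_async_fault(unsigned long gfn)
{
	(void)gfn;
	return true;		/* pretend the async work was queued */
}

static void lookup_pfn_blocking(unsigned long gfn)
{
	(void)gfn;
}

/*
 * Same shape as try_async_pf(): return true when the fault was handed off
 * for asynchronous completion (caller bails out), false when a pfn is
 * already available for mapping.
 */
static bool try_async_fault(unsigned long gfn, bool prefault)
{
	bool async;

	lookup_pfn_nonblocking(gfn, &async);
	if (!async)
		return false;		/* fast path: page already resident */

	if (!prefault && queue_async_fault(gfn))
		return true;		/* vCPU keeps running; fault completes later */

	lookup_pfn_blocking(gfn);	/* last resort: synchronous fault-in */
	return false;
}

int main(void)
{
	printf("queued=%d\n", try_async_fault(0x1000, false));	/* 1 */
	return 0;
}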
2739static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2740 bool prefault)
2592{ 2741{
2593 pfn_t pfn; 2742 pfn_t pfn;
2594 int r; 2743 int r;
2595 int level; 2744 int level;
2745 int force_pt_level;
2596 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2597 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2748 int write = error_code & PFERR_WRITE_MASK;
2749 bool map_writable;
2598 2750
2599 ASSERT(vcpu); 2751 ASSERT(vcpu);
2600 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2752 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2603,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2603 if (r) 2755 if (r)
2604 return r; 2756 return r;
2605 2757
2606 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2607 2759 if (likely(!force_pt_level)) {
2608 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2609 2764
2610 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2611 smp_rmb(); 2766 smp_rmb();
2612 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2767
2768 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2769 return 0;
2770
2771 /* mmio */
2613 if (is_error_pfn(pfn)) 2772 if (is_error_pfn(pfn))
2614 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2773 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2615 spin_lock(&vcpu->kvm->mmu_lock); 2774 spin_lock(&vcpu->kvm->mmu_lock);
2616 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2617 goto out_unlock; 2776 goto out_unlock;
2618 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2619 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2778 if (likely(!force_pt_level))
2620 level, gfn, pfn); 2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2780 r = __direct_map(vcpu, gpa, write, map_writable,
2781 level, gfn, pfn, prefault);
2621 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
2622 2783
2623 return r; 2784 return r;
@@ -2659,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2659 2820
2660static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2661{ 2822{
2662 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2663 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2664} 2825}
2665 2826
2666static unsigned long get_cr3(struct kvm_vcpu *vcpu) 2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2667{ 2828{
2668 return vcpu->arch.cr3; 2829 return kvm_read_cr3(vcpu);
2669} 2830}
2670 2831
2671static void inject_page_fault(struct kvm_vcpu *vcpu) 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2833 struct x86_exception *fault)
2672{ 2834{
2673 vcpu->arch.mmu.inject_page_fault(vcpu); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2674} 2836}
2675 2837
2676static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2816,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2816{ 2978{
2817 struct kvm_mmu *context = vcpu->arch.walk_mmu; 2979 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2818 2980
2981 context->base_role.word = 0;
2819 context->new_cr3 = nonpaging_new_cr3; 2982 context->new_cr3 = nonpaging_new_cr3;
2820 context->page_fault = tdp_page_fault; 2983 context->page_fault = tdp_page_fault;
2821 context->free = nonpaging_free; 2984 context->free = nonpaging_free;
@@ -3008,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3008 return; 3171 return;
3009 } 3172 }
3010 3173
3011 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
3012 return;
3013
3014 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
3015 if (!sp->role.cr4_pae) 3175 if (!sp->role.cr4_pae)
3016 paging32_update_pte(vcpu, sp, spte, new); 3176 paging32_update_pte(vcpu, sp, spte, new);
@@ -3264,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3264 } 3424 }
3265} 3425}
3266 3426
3267int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3427int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3428 void *insn, int insn_len)
3268{ 3429{
3269 int r; 3430 int r;
3270 enum emulation_result er; 3431 enum emulation_result er;
3271 3432
3272 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3433 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3273 if (r < 0) 3434 if (r < 0)
3274 goto out; 3435 goto out;
3275 3436
@@ -3282,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
3282 if (r) 3443 if (r)
3283 goto out; 3444 goto out;
3284 3445
3285 er = emulate_instruction(vcpu, cr2, error_code, 0); 3446 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3286 3447
3287 switch (er) { 3448 switch (er) {
3288 case EMULATE_DONE: 3449 case EMULATE_DONE:
@@ -3377,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3377 if (!test_bit(slot, sp->slot_bitmap)) 3538 if (!test_bit(slot, sp->slot_bitmap))
3378 continue; 3539 continue;
3379 3540
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3380 pt = sp->spt; 3544 pt = sp->spt;
3381 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3382 /* avoid RMW */ 3546 /* avoid RMW */
3383 if (is_writable_pte(pt[i])) 3547 if (is_writable_pte(pt[i]))
3384 pt[i] &= ~PT_WRITABLE_MASK; 3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3385 } 3549 }
3386 kvm_flush_remote_tlbs(kvm); 3550 kvm_flush_remote_tlbs(kvm);
3387} 3551}
@@ -3463,13 +3627,6 @@ static void mmu_destroy_caches(void)
3463 kmem_cache_destroy(mmu_page_header_cache); 3627 kmem_cache_destroy(mmu_page_header_cache);
3464} 3628}
3465 3629
3466void kvm_mmu_module_exit(void)
3467{
3468 mmu_destroy_caches();
3469 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3470 unregister_shrinker(&mmu_shrinker);
3471}
3472
3473int kvm_mmu_module_init(void) 3630int kvm_mmu_module_init(void)
3474{ 3631{
3475 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3632 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3566,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3566 3723
3567static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3724static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3568{ 3725{
3569 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3726 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3570 return 1; 3727 return 1;
3571} 3728}
3572 3729
@@ -3662,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3662} 3819}
3663EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3820EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3664 3821
3665#ifdef CONFIG_KVM_MMU_AUDIT
3666#include "mmu_audit.c"
3667#else
3668static void mmu_audit_disable(void) { }
3669#endif
3670
3671void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3822void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3672{ 3823{
3673 ASSERT(vcpu); 3824 ASSERT(vcpu);
@@ -3675,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3675 destroy_kvm_mmu(vcpu); 3826 destroy_kvm_mmu(vcpu);
3676 free_mmu_pages(vcpu); 3827 free_mmu_pages(vcpu);
3677 mmu_free_memory_caches(vcpu); 3828 mmu_free_memory_caches(vcpu);
3829}
3830
3831#ifdef CONFIG_KVM_MMU_AUDIT
3832#include "mmu_audit.c"
3833#else
3834static void mmu_audit_disable(void) { }
3835#endif
3836
3837void kvm_mmu_module_exit(void)
3838{
3839 mmu_destroy_caches();
3840 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3841 unregister_shrinker(&mmu_shrinker);
3678 mmu_audit_disable(); 3842 mmu_audit_disable();
3679} 3843}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
19 19
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21 21
22static int audit_point; 22#define audit_printk(kvm, fmt, args...) \
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \ 23 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args) 24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
27 25
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); 26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29 27
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97 95
98 if (sp->unsync) { 96 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) { 97 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level); 98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
101 return; 100 return;
102 } 101 }
103 102
104 if (*sptep == shadow_notrap_nonpresent_pte) { 103 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp); 104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return; 106 return;
107 } 107 }
108 } 108 }
109 109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { 110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp); 111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
112 return; 113 return;
113 } 114 }
114 115
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
125 126
126 hpa = pfn << PAGE_SHIFT; 127 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn", 129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep); 130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
130} 132}
131 133
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
142 if (!gfn_to_memslot(kvm, gfn)) { 144 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit()) 145 if (!printk_ratelimit())
144 return; 146 return;
145 audit_printk("no memslot for gfn %llx\n", gfn); 147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n", 148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn); 149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack(); 150 dump_stack();
149 return; 151 return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
153 if (!*rmapp) { 155 if (!*rmapp) {
154 if (!printk_ratelimit()) 156 if (!printk_ratelimit())
155 return; 157 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep); 158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
157 dump_stack(); 160 dump_stack();
158 } 161 }
159} 162}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{ 171{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170 173
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync) 174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp); 175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
173} 177}
174 178
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) 179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
202 spte = rmap_next(kvm, rmapp, NULL); 206 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) { 207 while (spte) {
204 if (is_writable_pte(*spte)) 208 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn " 209 audit_printk(kvm, "shadow page has writable "
206 "%llx role %x\n", sp->gfn, sp->role.word); 210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte); 212 spte = rmap_next(kvm, rmapp, spte);
208 } 213 }
209} 214}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
238 if (!__ratelimit(&ratelimit_state)) 243 if (!__ratelimit(&ratelimit_state))
239 return; 244 return;
240 245
241 audit_point = point; 246 vcpu->kvm->arch.audit_point = point;
242 audit_all_active_sps(vcpu->kvm); 247 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu); 248 audit_vcpu_spte(vcpu);
244} 249}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
72 unsigned pt_access; 72 unsigned pt_access;
73 unsigned pte_access; 73 unsigned pte_access;
74 gfn_t gfn; 74 gfn_t gfn;
75 u32 error_code; 75 struct x86_exception fault;
76}; 76};
77 77
78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
266 return 1; 266 return 1;
267 267
268error: 268error:
269 walker->error_code = 0; 269 walker->fault.vector = PF_VECTOR;
270 walker->fault.error_code_valid = true;
271 walker->fault.error_code = 0;
270 if (present) 272 if (present)
271 walker->error_code |= PFERR_PRESENT_MASK; 273 walker->fault.error_code |= PFERR_PRESENT_MASK;
272 274
273 walker->error_code |= write_fault | user_fault; 275 walker->fault.error_code |= write_fault | user_fault;
274 276
275 if (fetch_fault && mmu->nx) 277 if (fetch_fault && mmu->nx)
276 walker->error_code |= PFERR_FETCH_MASK; 278 walker->fault.error_code |= PFERR_FETCH_MASK;
277 if (rsvd_fault) 279 if (rsvd_fault)
278 walker->error_code |= PFERR_RSVD_MASK; 280 walker->fault.error_code |= PFERR_RSVD_MASK;
279 281
280 vcpu->arch.fault.address = addr; 282 walker->fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code; 283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
282 284
283 trace_kvm_mmu_walker_error(walker->error_code); 285 trace_kvm_mmu_walker_error(walker->fault.error_code);
284 return 0; 286 return 0;
285} 287}
286 288
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
299 addr, access); 301 addr, access);
300} 302}
301 303
304static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
305 struct kvm_mmu_page *sp, u64 *spte,
306 pt_element_t gpte)
307{
308 u64 nonpresent = shadow_trap_nonpresent_pte;
309
310 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
311 goto no_present;
312
313 if (!is_present_gpte(gpte)) {
314 if (!sp->unsync)
315 nonpresent = shadow_notrap_nonpresent_pte;
316 goto no_present;
317 }
318
319 if (!(gpte & PT_ACCESSED_MASK))
320 goto no_present;
321
322 return false;
323
324no_present:
325 drop_spte(vcpu->kvm, spte, nonpresent);
326 return true;
327}
328
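
FNAME(prefetch_invalid_gpte)() folds the three reasons a guest pte must not be prefetched (reserved bits set, not present, not yet accessed) into one predicate now shared by update_pte, pte_prefetch and sync_page. A toy version of the ordering, with invented bit positions (the GPTE_* values below are not the real PT_* masks):

#include <stdio.h>
#include <stdint.h>

#define GPTE_PRESENT	(1ULL << 0)
#define GPTE_ACCESSED	(1ULL << 5)
#define GPTE_RSVD	(1ULL << 51)

/* Same ordering as the new helper: reserved bits first, then present,
 * then accessed; any failure means the spte gets dropped instead of
 * prefetched. */
static int gpte_ok_to_prefetch(uint64_t gpte)
{
	if (gpte & GPTE_RSVD)
		return 0;
	if (!(gpte & GPTE_PRESENT))
		return 0;
	if (!(gpte & GPTE_ACCESSED))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", gpte_ok_to_prefetch(GPTE_PRESENT | GPTE_ACCESSED));	/* 1 */
	printf("%d\n", gpte_ok_to_prefetch(GPTE_PRESENT));			/* 0 */
	return 0;
}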
302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
303 u64 *spte, const void *pte) 330 u64 *spte, const void *pte)
304{ 331{
305 pt_element_t gpte; 332 pt_element_t gpte;
306 unsigned pte_access; 333 unsigned pte_access;
307 pfn_t pfn; 334 pfn_t pfn;
308 u64 new_spte;
309 335
310 gpte = *(const pt_element_t *)pte; 336 gpte = *(const pt_element_t *)pte;
311 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 337 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
312 if (!is_present_gpte(gpte)) {
313 if (sp->unsync)
314 new_spte = shadow_trap_nonpresent_pte;
315 else
316 new_spte = shadow_notrap_nonpresent_pte;
317 __set_spte(spte, new_spte);
318 }
319 return; 338 return;
320 } 339
321 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
322 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
323 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 return; 348 return;
330 kvm_get_pfn(pfn); 349 kvm_get_pfn(pfn);
331 /* 350 /*
332 * we call mmu_set_spte() with reset_host_protection = true because that 351 * we call mmu_set_spte() with host_writable = true because that
333 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
334 */ 353 */
335 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep) 383 u64 *sptep)
365{ 384{
366 struct kvm_mmu_page *sp; 385 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes; 386 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte; 387 u64 *spte;
370 int i; 388 int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 413
396 gpte = gptep[i]; 414 gpte = gptep[i];
397 415
398 if (!is_present_gpte(gpte) || 416 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue; 417 continue;
407 418
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 419 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
427static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 438static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
428 struct guest_walker *gw, 439 struct guest_walker *gw,
429 int user_fault, int write_fault, int hlevel, 440 int user_fault, int write_fault, int hlevel,
430 int *ptwrite, pfn_t pfn) 441 int *ptwrite, pfn_t pfn, bool map_writable,
442 bool prefault)
431{ 443{
432 unsigned access = gw->pt_access; 444 unsigned access = gw->pt_access;
433 struct kvm_mmu_page *sp = NULL; 445 struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
501 513
502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 514 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
503 user_fault, write_fault, dirty, ptwrite, it.level, 515 user_fault, write_fault, dirty, ptwrite, it.level,
504 gw->gfn, pfn, false, true); 516 gw->gfn, pfn, prefault, map_writable);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 517 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
506 518
507 return it.sptep; 519 return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
527 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 539 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
528 * a negative value on error. 540 * a negative value on error.
529 */ 541 */
530static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 542static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
531 u32 error_code) 543 bool prefault)
532{ 544{
533 int write_fault = error_code & PFERR_WRITE_MASK; 545 int write_fault = error_code & PFERR_WRITE_MASK;
534 int user_fault = error_code & PFERR_USER_MASK; 546 int user_fault = error_code & PFERR_USER_MASK;
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
538 int r; 550 int r;
539 pfn_t pfn; 551 pfn_t pfn;
540 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
541 unsigned long mmu_seq; 554 unsigned long mmu_seq;
555 bool map_writable;
542 556
543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 557 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
544 558
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
556 */ 570 */
557 if (!r) { 571 if (!r) {
558 pgprintk("%s: guest page fault\n", __func__); 572 pgprintk("%s: guest page fault\n", __func__);
559 inject_page_fault(vcpu); 573 if (!prefault) {
560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 574 inject_page_fault(vcpu, &walker.fault);
575 /* reset fork detector */
576 vcpu->arch.last_pt_write_count = 0;
577 }
561 return 0; 578 return 0;
562 } 579 }
563 580
564 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
565 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
566 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
567 } 588 }
568 589
569 mmu_seq = vcpu->kvm->mmu_notifier_seq; 590 mmu_seq = vcpu->kvm->mmu_notifier_seq;
570 smp_rmb(); 591 smp_rmb();
571 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 592
593 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
594 &map_writable))
595 return 0;
572 596
573 /* mmio */ 597 /* mmio */
574 if (is_error_pfn(pfn)) 598 if (is_error_pfn(pfn))
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
580 604
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
582 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
584 level, &write_pt, pfn); 610 level, &write_pt, pfn, map_writable, prefault);
585 (void)sptep; 611 (void)sptep;
586 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 612 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
587 sptep, *sptep, write_pt); 613 sptep, *sptep, write_pt);
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
661} 687}
662 688
663static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 689static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
664 u32 *error) 690 struct x86_exception *exception)
665{ 691{
666 struct guest_walker walker; 692 struct guest_walker walker;
667 gpa_t gpa = UNMAPPED_GVA; 693 gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
672 if (r) { 698 if (r) {
673 gpa = gfn_to_gpa(walker.gfn); 699 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK; 700 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error) 701 } else if (exception)
676 *error = walker.error_code; 702 *exception = walker.fault;
677 703
678 return gpa; 704 return gpa;
679} 705}
680 706
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 707static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error) 708 u32 access,
709 struct x86_exception *exception)
683{ 710{
684 struct guest_walker walker; 711 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA; 712 gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
690 if (r) { 717 if (r) {
691 gpa = gfn_to_gpa(walker.gfn); 718 gpa = gfn_to_gpa(walker.gfn);
692 gpa |= vaddr & ~PAGE_MASK; 719 gpa |= vaddr & ~PAGE_MASK;
693 } else if (error) 720 } else if (exception)
694 *error = walker.error_code; 721 *exception = walker.fault;
695 722
696 return gpa; 723 return gpa;
697} 724}
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
730 * Using the cached information from sp->gfns is safe because: 757 * Using the cached information from sp->gfns is safe because:
731 * - The spte has a reference to the struct page, so the pfn for a given gfn 758 * - The spte has a reference to the struct page, so the pfn for a given gfn
732 * can't change unless all sptes pointing to it are nuked first. 759 * can't change unless all sptes pointing to it are nuked first.
760 *
761 * Note:
	762 * We should flush all TLBs if a spte is dropped even though the guest is
	763 * responsible for it. Otherwise kvm_mmu_notifier_invalidate_page and
	764 * kvm_mmu_notifier_invalidate_range_start may see that the mapped page is
	765 * no longer used by the guest and skip the flush, so the guest could still
	766 * access the freed pages through stale TLB entries.
	767 * We increase kvm->tlbs_dirty to delay the TLB flush in this case.
733 */ 768 */
734static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 769static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
735 bool clear_unsync)
736{ 770{
737 int i, offset, nr_present; 771 int i, offset, nr_present;
738 bool reset_host_protection; 772 bool host_writable;
739 gpa_t first_pte_gpa; 773 gpa_t first_pte_gpa;
740 774
741 offset = nr_present = 0; 775 offset = nr_present = 0;
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
764 return -EINVAL; 798 return -EINVAL;
765 799
766 gfn = gpte_to_gfn(gpte); 800 gfn = gpte_to_gfn(gpte);
767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
769 || !(gpte & PT_ACCESSED_MASK)) {
770 u64 nonpresent;
771 801
772 if (is_present_gpte(gpte) || !clear_unsync) 802 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
773 nonpresent = shadow_trap_nonpresent_pte; 803 vcpu->kvm->tlbs_dirty++;
774 else 804 continue;
775 nonpresent = shadow_notrap_nonpresent_pte; 805 }
776 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 806
807 if (gfn != sp->gfns[i]) {
808 drop_spte(vcpu->kvm, &sp->spt[i],
809 shadow_trap_nonpresent_pte);
810 vcpu->kvm->tlbs_dirty++;
777 continue; 811 continue;
778 } 812 }
779 813
780 nr_present++; 814 nr_present++;
781 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 815 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
782 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 816 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
783 pte_access &= ~ACC_WRITE_MASK; 817
784 reset_host_protection = 0;
785 } else {
786 reset_host_protection = 1;
787 }
788 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 818 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
789 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 819 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
790 spte_to_pfn(sp->spt[i]), true, false, 820 spte_to_pfn(sp->spt[i]), true, false,
791 reset_host_protection); 821 host_writable);
792 } 822 }
793 823
794 return !nr_present; 824 return !nr_present;
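
The tlbs_dirty counter referenced in the note above turns every spte dropped on the guest's behalf into a deferred flush: the drop only bumps the counter, and the next flush point covers all accumulated drops at once. A toy model of that bookkeeping (struct toy_kvm and its fields are invented for illustration; only the counter idea matches the patch):

#include <stdio.h>

struct toy_kvm {
	unsigned long tlbs_dirty;
	unsigned long flushes;
};

/* Dropping an spte on the guest's behalf just records the debt... */
static void drop_spte_deferred(struct toy_kvm *kvm)
{
	kvm->tlbs_dirty++;
}

/* ...and the next flush point pays it off with a single flush. */
static void flush_if_needed(struct toy_kvm *kvm)
{
	if (kvm->tlbs_dirty) {
		kvm->flushes++;
		kvm->tlbs_dirty = 0;
	}
}

int main(void)
{
	struct toy_kvm kvm = { 0, 0 };

	drop_spte_deferred(&kvm);
	drop_spte_deferred(&kvm);
	flush_if_needed(&kvm);			/* one flush covers both drops */
	printf("flushes=%lu\n", kvm.flushes);	/* 1 */
	return 0;
}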
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7c2ca4..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
97 unsigned long vmexit_rax; 102 unsigned long vmexit_rax;
98 103
99 /* cache for intercepts of the guest */ 104 /* cache for intercepts of the guest */
100 u16 intercept_cr_read; 105 u32 intercept_cr;
101 u16 intercept_cr_write; 106 u32 intercept_dr;
102 u16 intercept_dr_read;
103 u16 intercept_dr_write;
104 u32 intercept_exceptions; 107 u32 intercept_exceptions;
105 u64 intercept; 108 u64 intercept;
106 109
@@ -123,7 +126,12 @@ struct vcpu_svm {
123 u64 next_rip; 126 u64 next_rip;
124 127
125 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 128 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
126 u64 host_gs_base; 129 struct {
130 u16 fs;
131 u16 gs;
132 u16 ldt;
133 u64 gs_base;
134 } host;
127 135
128 u32 *msrpm; 136 u32 *msrpm;
129 137
@@ -133,6 +141,7 @@ struct vcpu_svm {
133 141
134 unsigned int3_injected; 142 unsigned int3_injected;
135 unsigned long int3_rip; 143 unsigned long int3_rip;
144 u32 apf_reason;
136}; 145};
137 146
138#define MSR_INVALID 0xffffffffU 147#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
180static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 189static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
181 bool has_error_code, u32 error_code); 190 bool has_error_code, u32 error_code);
182 191
192enum {
193 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
194 pause filter count */
195 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
196 VMCB_ASID, /* ASID */
197 VMCB_INTR, /* int_ctl, int_vector */
198 VMCB_NPT, /* npt_en, nCR3, gPAT */
199 VMCB_CR, /* CR0, CR3, CR4, EFER */
200 VMCB_DR, /* DR6, DR7 */
201 VMCB_DT, /* GDT, IDT */
202 VMCB_SEG, /* CS, DS, SS, ES, CPL */
203 VMCB_CR2, /* CR2 only */
204 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
205 VMCB_DIRTY_MAX,
206};
207
208/* TPR and CR2 are always written before VMRUN */
209#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
210
211static inline void mark_all_dirty(struct vmcb *vmcb)
212{
213 vmcb->control.clean = 0;
214}
215
216static inline void mark_all_clean(struct vmcb *vmcb)
217{
218 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
219 & ~VMCB_ALWAYS_DIRTY_MASK;
220}
221
222static inline void mark_dirty(struct vmcb *vmcb, int bit)
223{
224 vmcb->control.clean &= ~(1 << bit);
225}
226
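
The clean-bits machinery above implements the VMCB clean-bits optimization advertised by SVM_FEATURE_VMCB_CLEAN: each set bit tells the CPU that the corresponding VMCB area was not modified since the last VMRUN and need not be re-read. A toy version of the bookkeeping with a shortened, invented enum (the TOY_* names are not the patch's VMCB_* values; only the masking pattern matches):

#include <stdio.h>
#include <stdint.h>

enum { TOY_INTERCEPTS, TOY_ASID, TOY_CR, TOY_CR2, TOY_DIRTY_MAX };

/* CR2 (and int_ctl in the real patch) is rewritten before every VMRUN,
 * so its clean bit is never advertised. */
#define TOY_ALWAYS_DIRTY	(1u << TOY_CR2)

static uint32_t clean;

static void mark_all_clean(void)
{
	clean = ((1u << TOY_DIRTY_MAX) - 1) & ~TOY_ALWAYS_DIRTY;
}

static void mark_dirty(int bit)
{
	clean &= ~(1u << bit);
}

int main(void)
{
	mark_all_clean();
	mark_dirty(TOY_CR);		/* e.g. after an EFER/CR0 update */
	printf("clean=0x%x\n", clean);	/* 0x3: CR and CR2 areas must be re-read */
	return 0;
}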
183static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 227static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
184{ 228{
185 return container_of(vcpu, struct vcpu_svm, vcpu); 229 return container_of(vcpu, struct vcpu_svm, vcpu);
186} 230}
187 231
188static inline bool is_nested(struct vcpu_svm *svm) 232static void recalc_intercepts(struct vcpu_svm *svm)
233{
234 struct vmcb_control_area *c, *h;
235 struct nested_state *g;
236
237 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
238
239 if (!is_guest_mode(&svm->vcpu))
240 return;
241
242 c = &svm->vmcb->control;
243 h = &svm->nested.hsave->control;
244 g = &svm->nested;
245
246 c->intercept_cr = h->intercept_cr | g->intercept_cr;
247 c->intercept_dr = h->intercept_dr | g->intercept_dr;
248 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
249 c->intercept = h->intercept | g->intercept;
250}
251
252static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
253{
254 if (is_guest_mode(&svm->vcpu))
255 return svm->nested.hsave;
256 else
257 return svm->vmcb;
258}
259
260static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
261{
262 struct vmcb *vmcb = get_host_vmcb(svm);
263
264 vmcb->control.intercept_cr |= (1U << bit);
265
266 recalc_intercepts(svm);
267}
268
269static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
270{
271 struct vmcb *vmcb = get_host_vmcb(svm);
272
273 vmcb->control.intercept_cr &= ~(1U << bit);
274
275 recalc_intercepts(svm);
276}
277
278static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
279{
280 struct vmcb *vmcb = get_host_vmcb(svm);
281
282 return vmcb->control.intercept_cr & (1U << bit);
283}
284
285static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
286{
287 struct vmcb *vmcb = get_host_vmcb(svm);
288
289 vmcb->control.intercept_dr |= (1U << bit);
290
291 recalc_intercepts(svm);
292}
293
294static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
295{
296 struct vmcb *vmcb = get_host_vmcb(svm);
297
298 vmcb->control.intercept_dr &= ~(1U << bit);
299
300 recalc_intercepts(svm);
301}
302
303static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
304{
305 struct vmcb *vmcb = get_host_vmcb(svm);
306
307 vmcb->control.intercept_exceptions |= (1U << bit);
308
309 recalc_intercepts(svm);
310}
311
312static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
189{ 313{
190 return svm->nested.vmcb; 314 struct vmcb *vmcb = get_host_vmcb(svm);
315
316 vmcb->control.intercept_exceptions &= ~(1U << bit);
317
318 recalc_intercepts(svm);
319}
320
321static inline void set_intercept(struct vcpu_svm *svm, int bit)
322{
323 struct vmcb *vmcb = get_host_vmcb(svm);
324
325 vmcb->control.intercept |= (1ULL << bit);
326
327 recalc_intercepts(svm);
328}
329
330static inline void clr_intercept(struct vcpu_svm *svm, int bit)
331{
332 struct vmcb *vmcb = get_host_vmcb(svm);
333
334 vmcb->control.intercept &= ~(1ULL << bit);
335
336 recalc_intercepts(svm);
191} 337}
192 338
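
recalc_intercepts() is what makes the set_*/clr_* helpers above safe while a nested guest runs: the helpers edit the host VMCB returned by get_host_vmcb(), and the intercepts that actually take effect are then recomputed as the union of the host's (hsave) and the L1 hypervisor's requests. A toy reduction of that rule (the function and variable names below are invented for illustration):

#include <stdio.h>
#include <stdint.h>

/*
 * Outside guest mode the host's intercepts apply as-is; while a nested
 * guest runs, the effective set is the union of both.
 */
static uint32_t effective_intercepts(uint32_t host, uint32_t nested,
				     int guest_mode)
{
	return guest_mode ? (host | nested) : host;
}

int main(void)
{
	uint32_t host = 0x5, nested = 0x2;

	printf("0x%x\n", effective_intercepts(host, nested, 0));	/* 0x5 */
	printf("0x%x\n", effective_intercepts(host, nested, 1));	/* 0x7 */
	return 0;
}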
193static inline void enable_gif(struct vcpu_svm *svm) 339static inline void enable_gif(struct vcpu_svm *svm)
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
264 410
265#define MAX_INST_SIZE 15 411#define MAX_INST_SIZE 15
266 412
267static inline u32 svm_has(u32 feat)
268{
269 return svm_features & feat;
270}
271
272static inline void clgi(void) 413static inline void clgi(void)
273{ 414{
274 asm volatile (__ex(SVM_CLGI)); 415 asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
284 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 425 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
285} 426}
286 427
287static inline void force_new_asid(struct kvm_vcpu *vcpu)
288{
289 to_svm(vcpu)->asid_generation--;
290}
291
292static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
293{
294 force_new_asid(vcpu);
295}
296
297static int get_npt_level(void) 428static int get_npt_level(void)
298{ 429{
299#ifdef CONFIG_X86_64 430#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
310 efer &= ~EFER_LME; 441 efer &= ~EFER_LME;
311 442
312 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 443 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
444 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
313} 445}
314 446
315static int is_external_interrupt(u32 info) 447static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
347 svm->next_rip = svm->vmcb->control.next_rip; 479 svm->next_rip = svm->vmcb->control.next_rip;
348 480
349 if (!svm->next_rip) { 481 if (!svm->next_rip) {
350 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 482 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
351 EMULATE_DONE) 483 EMULATE_DONE)
352 printk(KERN_DEBUG "%s: NOP\n", __func__); 484 printk(KERN_DEBUG "%s: NOP\n", __func__);
353 return; 485 return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
374 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 506 nested_svm_check_exception(svm, nr, has_error_code, error_code))
375 return; 507 return;
376 508
377 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 509 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
378 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 510 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
379 511
380 /* 512 /*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
670 802
671 svm_features = cpuid_edx(SVM_CPUID_FUNC); 803 svm_features = cpuid_edx(SVM_CPUID_FUNC);
672 804
673 if (!svm_has(SVM_FEATURE_NPT)) 805 if (!boot_cpu_has(X86_FEATURE_NPT))
674 npt_enabled = false; 806 npt_enabled = false;
675 807
676 if (npt_enabled && !npt) { 808 if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
725 struct vcpu_svm *svm = to_svm(vcpu); 857 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0; 858 u64 g_tsc_offset = 0;
727 859
728 if (is_nested(svm)) { 860 if (is_guest_mode(vcpu)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset - 861 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset; 862 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset; 863 svm->nested.hsave->control.tsc_offset = offset;
732 } 864 }
733 865
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 866 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
867
868 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
735} 869}
736 870
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 871static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
739 struct vcpu_svm *svm = to_svm(vcpu); 873 struct vcpu_svm *svm = to_svm(vcpu);
740 874
741 svm->vmcb->control.tsc_offset += adjustment; 875 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm)) 876 if (is_guest_mode(vcpu))
743 svm->nested.hsave->control.tsc_offset += adjustment; 877 svm->nested.hsave->control.tsc_offset += adjustment;
878 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
744} 879}
745 880
746static void init_vmcb(struct vcpu_svm *svm) 881static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
749 struct vmcb_save_area *save = &svm->vmcb->save; 884 struct vmcb_save_area *save = &svm->vmcb->save;
750 885
751 svm->vcpu.fpu_active = 1; 886 svm->vcpu.fpu_active = 1;
887 svm->vcpu.arch.hflags = 0;
752 888
753 control->intercept_cr_read = INTERCEPT_CR0_MASK | 889 set_cr_intercept(svm, INTERCEPT_CR0_READ);
754 INTERCEPT_CR3_MASK | 890 set_cr_intercept(svm, INTERCEPT_CR3_READ);
755 INTERCEPT_CR4_MASK; 891 set_cr_intercept(svm, INTERCEPT_CR4_READ);
756 892 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
757 control->intercept_cr_write = INTERCEPT_CR0_MASK | 893 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
758 INTERCEPT_CR3_MASK | 894 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
759 INTERCEPT_CR4_MASK | 895 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
760 INTERCEPT_CR8_MASK; 896
761 897 set_dr_intercept(svm, INTERCEPT_DR0_READ);
762 control->intercept_dr_read = INTERCEPT_DR0_MASK | 898 set_dr_intercept(svm, INTERCEPT_DR1_READ);
763 INTERCEPT_DR1_MASK | 899 set_dr_intercept(svm, INTERCEPT_DR2_READ);
764 INTERCEPT_DR2_MASK | 900 set_dr_intercept(svm, INTERCEPT_DR3_READ);
765 INTERCEPT_DR3_MASK | 901 set_dr_intercept(svm, INTERCEPT_DR4_READ);
766 INTERCEPT_DR4_MASK | 902 set_dr_intercept(svm, INTERCEPT_DR5_READ);
767 INTERCEPT_DR5_MASK | 903 set_dr_intercept(svm, INTERCEPT_DR6_READ);
768 INTERCEPT_DR6_MASK | 904 set_dr_intercept(svm, INTERCEPT_DR7_READ);
769 INTERCEPT_DR7_MASK; 905
770 906 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
771 control->intercept_dr_write = INTERCEPT_DR0_MASK | 907 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
772 INTERCEPT_DR1_MASK | 908 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
773 INTERCEPT_DR2_MASK | 909 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
774 INTERCEPT_DR3_MASK | 910 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
775 INTERCEPT_DR4_MASK | 911 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
776 INTERCEPT_DR5_MASK | 912 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
777 INTERCEPT_DR6_MASK | 913 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
778 INTERCEPT_DR7_MASK; 914
779 915 set_exception_intercept(svm, PF_VECTOR);
780 control->intercept_exceptions = (1 << PF_VECTOR) | 916 set_exception_intercept(svm, UD_VECTOR);
781 (1 << UD_VECTOR) | 917 set_exception_intercept(svm, MC_VECTOR);
782 (1 << MC_VECTOR); 918
783 919 set_intercept(svm, INTERCEPT_INTR);
784 920 set_intercept(svm, INTERCEPT_NMI);
785 control->intercept = (1ULL << INTERCEPT_INTR) | 921 set_intercept(svm, INTERCEPT_SMI);
786 (1ULL << INTERCEPT_NMI) | 922 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
787 (1ULL << INTERCEPT_SMI) | 923 set_intercept(svm, INTERCEPT_CPUID);
788 (1ULL << INTERCEPT_SELECTIVE_CR0) | 924 set_intercept(svm, INTERCEPT_INVD);
789 (1ULL << INTERCEPT_CPUID) | 925 set_intercept(svm, INTERCEPT_HLT);
790 (1ULL << INTERCEPT_INVD) | 926 set_intercept(svm, INTERCEPT_INVLPG);
791 (1ULL << INTERCEPT_HLT) | 927 set_intercept(svm, INTERCEPT_INVLPGA);
792 (1ULL << INTERCEPT_INVLPG) | 928 set_intercept(svm, INTERCEPT_IOIO_PROT);
793 (1ULL << INTERCEPT_INVLPGA) | 929 set_intercept(svm, INTERCEPT_MSR_PROT);
794 (1ULL << INTERCEPT_IOIO_PROT) | 930 set_intercept(svm, INTERCEPT_TASK_SWITCH);
795 (1ULL << INTERCEPT_MSR_PROT) | 931 set_intercept(svm, INTERCEPT_SHUTDOWN);
796 (1ULL << INTERCEPT_TASK_SWITCH) | 932 set_intercept(svm, INTERCEPT_VMRUN);
797 (1ULL << INTERCEPT_SHUTDOWN) | 933 set_intercept(svm, INTERCEPT_VMMCALL);
798 (1ULL << INTERCEPT_VMRUN) | 934 set_intercept(svm, INTERCEPT_VMLOAD);
799 (1ULL << INTERCEPT_VMMCALL) | 935 set_intercept(svm, INTERCEPT_VMSAVE);
800 (1ULL << INTERCEPT_VMLOAD) | 936 set_intercept(svm, INTERCEPT_STGI);
801 (1ULL << INTERCEPT_VMSAVE) | 937 set_intercept(svm, INTERCEPT_CLGI);
802 (1ULL << INTERCEPT_STGI) | 938 set_intercept(svm, INTERCEPT_SKINIT);
803 (1ULL << INTERCEPT_CLGI) | 939 set_intercept(svm, INTERCEPT_WBINVD);
804 (1ULL << INTERCEPT_SKINIT) | 940 set_intercept(svm, INTERCEPT_MONITOR);
805 (1ULL << INTERCEPT_WBINVD) | 941 set_intercept(svm, INTERCEPT_MWAIT);
806 (1ULL << INTERCEPT_MONITOR) | 942 set_intercept(svm, INTERCEPT_XSETBV);
807 (1ULL << INTERCEPT_MWAIT);
808 943
809 control->iopm_base_pa = iopm_base; 944 control->iopm_base_pa = iopm_base;
810 control->msrpm_base_pa = __pa(svm->msrpm); 945 control->msrpm_base_pa = __pa(svm->msrpm);
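
init_vmcb() now goes through small accessor functions instead of open-coding the intercept bitmasks. A sketch of the accessors as assumed here (the real definitions sit above this hunk): they always edit the host VMCB, which is nested.hsave while an L2 guest runs, and then re-merge host and guest intercepts.

static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
        if (is_guest_mode(&svm->vcpu))
                return svm->nested.hsave;
        else
                return svm->vmcb;
}

static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
{
        struct vmcb *vmcb = get_host_vmcb(svm);

        vmcb->control.intercept_cr |= (1U << bit);

        recalc_intercepts(svm);
}

static inline void set_intercept(struct vcpu_svm *svm, int bit)
{
        struct vmcb *vmcb = get_host_vmcb(svm);

        vmcb->control.intercept |= (1ULL << bit);

        recalc_intercepts(svm);
}

The clr_*() variants clear the same bits with &= ~, and the dr/exception helpers follow the identical pattern on intercept_dr and intercept_exceptions.
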
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
855 if (npt_enabled) { 990 if (npt_enabled) {
856 /* Setup VMCB for Nested Paging */ 991 /* Setup VMCB for Nested Paging */
857 control->nested_ctl = 1; 992 control->nested_ctl = 1;
858 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 993 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
859 (1ULL << INTERCEPT_INVLPG)); 994 clr_intercept(svm, INTERCEPT_INVLPG);
860 control->intercept_exceptions &= ~(1 << PF_VECTOR); 995 clr_exception_intercept(svm, PF_VECTOR);
861 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 996 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
862 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 997 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
863 save->g_pat = 0x0007040600070406ULL; 998 save->g_pat = 0x0007040600070406ULL;
864 save->cr3 = 0; 999 save->cr3 = 0;
865 save->cr4 = 0; 1000 save->cr4 = 0;
866 } 1001 }
867 force_new_asid(&svm->vcpu); 1002 svm->asid_generation = 0;
868 1003
869 svm->nested.vmcb = 0; 1004 svm->nested.vmcb = 0;
870 svm->vcpu.arch.hflags = 0; 1005 svm->vcpu.arch.hflags = 0;
871 1006
872 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1007 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
873 control->pause_filter_count = 3000; 1008 control->pause_filter_count = 3000;
874 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1009 set_intercept(svm, INTERCEPT_PAUSE);
875 } 1010 }
876 1011
1012 mark_all_dirty(svm->vmcb);
1013
877 enable_gif(svm); 1014 enable_gif(svm);
878} 1015}
879 1016
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
990 1127
991 if (unlikely(cpu != vcpu->cpu)) { 1128 if (unlikely(cpu != vcpu->cpu)) {
992 svm->asid_generation = 0; 1129 svm->asid_generation = 0;
1130 mark_all_dirty(svm->vmcb);
993 } 1131 }
994 1132
1133#ifdef CONFIG_X86_64
1134 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1135#endif
1136 savesegment(fs, svm->host.fs);
1137 savesegment(gs, svm->host.gs);
1138 svm->host.ldt = kvm_read_ldt();
1139
995 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1140 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
996 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1141 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
997} 1142}
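
fs, gs, the LDT and MSR_GS_BASE are now captured once per heavyweight context switch in svm_vcpu_load() instead of on every VMRUN. The accesses above imply new per-vcpu fields roughly like the following (the struct change itself is outside this hunk, so treat this as an assumption):

struct vcpu_svm {
        /* ... existing fields ... */
        struct {
                u16 fs;
                u16 gs;
                u16 ldt;
                u64 gs_base;
        } host;
};

svm_vcpu_put() and the tail of svm_vcpu_run() below restore from this cache.
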
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1002 int i; 1147 int i;
1003 1148
1004 ++vcpu->stat.host_state_reload; 1149 ++vcpu->stat.host_state_reload;
1150 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1155#else
1156 loadsegment(gs, svm->host.gs);
1157#endif
1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1007} 1160}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1021 switch (reg) { 1174 switch (reg) {
1022 case VCPU_EXREG_PDPTR: 1175 case VCPU_EXREG_PDPTR:
1023 BUG_ON(!npt_enabled); 1176 BUG_ON(!npt_enabled);
1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 1177 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1025 break; 1178 break;
1026 default: 1179 default:
1027 BUG(); 1180 BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1030 1183
1031static void svm_set_vintr(struct vcpu_svm *svm) 1184static void svm_set_vintr(struct vcpu_svm *svm)
1032{ 1185{
1033 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1186 set_intercept(svm, INTERCEPT_VINTR);
1034} 1187}
1035 1188
1036static void svm_clear_vintr(struct vcpu_svm *svm) 1189static void svm_clear_vintr(struct vcpu_svm *svm)
1037{ 1190{
1038 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1191 clr_intercept(svm, INTERCEPT_VINTR);
1039} 1192}
1040 1193
1041static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1194static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1150 1303
1151 svm->vmcb->save.idtr.limit = dt->size; 1304 svm->vmcb->save.idtr.limit = dt->size;
1152 svm->vmcb->save.idtr.base = dt->address ; 1305 svm->vmcb->save.idtr.base = dt->address ;
1306 mark_dirty(svm->vmcb, VMCB_DT);
1153} 1307}
1154 1308
1155static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1309static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1166 1320
1167 svm->vmcb->save.gdtr.limit = dt->size; 1321 svm->vmcb->save.gdtr.limit = dt->size;
1168 svm->vmcb->save.gdtr.base = dt->address ; 1322 svm->vmcb->save.gdtr.base = dt->address ;
1323 mark_dirty(svm->vmcb, VMCB_DT);
1169} 1324}
1170 1325
1171static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1326static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1327{
1173} 1328}
1174 1329
1330static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1331{
1332}
1333
1175static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1334static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1176{ 1335{
1177} 1336}
1178 1337
1179static void update_cr0_intercept(struct vcpu_svm *svm) 1338static void update_cr0_intercept(struct vcpu_svm *svm)
1180{ 1339{
1181 struct vmcb *vmcb = svm->vmcb;
1182 ulong gcr0 = svm->vcpu.arch.cr0; 1340 ulong gcr0 = svm->vcpu.arch.cr0;
1183 u64 *hcr0 = &svm->vmcb->save.cr0; 1341 u64 *hcr0 = &svm->vmcb->save.cr0;
1184 1342
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1188 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1346 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1189 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1347 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1190 1348
1349 mark_dirty(svm->vmcb, VMCB_CR);
1191 1350
1192 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1351 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1193 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1352 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1194 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1353 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1195 if (is_nested(svm)) {
1196 struct vmcb *hsave = svm->nested.hsave;
1197
1198 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1199 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1200 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1201 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1202 }
1203 } else { 1354 } else {
1204 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1355 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1205 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1356 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1206 if (is_nested(svm)) {
1207 struct vmcb *hsave = svm->nested.hsave;
1208
1209 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1210 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1211 }
1212 } 1357 }
1213} 1358}
1214 1359
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1216{ 1361{
1217 struct vcpu_svm *svm = to_svm(vcpu); 1362 struct vcpu_svm *svm = to_svm(vcpu);
1218 1363
1219 if (is_nested(svm)) { 1364 if (is_guest_mode(vcpu)) {
1220 /* 1365 /*
1221 * We are here because we run in nested mode, the host kvm 1366 * We are here because we run in nested mode, the host kvm
1222 * intercepts cr0 writes but the l1 hypervisor does not. 1367 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1268 */ 1413 */
1269 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1414 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1270 svm->vmcb->save.cr0 = cr0; 1415 svm->vmcb->save.cr0 = cr0;
1416 mark_dirty(svm->vmcb, VMCB_CR);
1271 update_cr0_intercept(svm); 1417 update_cr0_intercept(svm);
1272} 1418}
1273 1419
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1277 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1423 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1278 1424
1279 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1425 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1280 force_new_asid(vcpu); 1426 svm_flush_tlb(vcpu);
1281 1427
1282 vcpu->arch.cr4 = cr4; 1428 vcpu->arch.cr4 = cr4;
1283 if (!npt_enabled) 1429 if (!npt_enabled)
1284 cr4 |= X86_CR4_PAE; 1430 cr4 |= X86_CR4_PAE;
1285 cr4 |= host_cr4_mce; 1431 cr4 |= host_cr4_mce;
1286 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1432 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1433 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1287} 1434}
1288 1435
1289static void svm_set_segment(struct kvm_vcpu *vcpu, 1436static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1312 = (svm->vmcb->save.cs.attrib 1459 = (svm->vmcb->save.cs.attrib
1313 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1460 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1314 1461
1462 mark_dirty(svm->vmcb, VMCB_SEG);
1315} 1463}
1316 1464
1317static void update_db_intercept(struct kvm_vcpu *vcpu) 1465static void update_db_intercept(struct kvm_vcpu *vcpu)
1318{ 1466{
1319 struct vcpu_svm *svm = to_svm(vcpu); 1467 struct vcpu_svm *svm = to_svm(vcpu);
1320 1468
1321 svm->vmcb->control.intercept_exceptions &= 1469 clr_exception_intercept(svm, DB_VECTOR);
1322 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1470 clr_exception_intercept(svm, BP_VECTOR);
1323 1471
1324 if (svm->nmi_singlestep) 1472 if (svm->nmi_singlestep)
1325 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1473 set_exception_intercept(svm, DB_VECTOR);
1326 1474
1327 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1475 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1328 if (vcpu->guest_debug & 1476 if (vcpu->guest_debug &
1329 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1477 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1330 svm->vmcb->control.intercept_exceptions |= 1478 set_exception_intercept(svm, DB_VECTOR);
1331 1 << DB_VECTOR;
1332 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1479 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1333 svm->vmcb->control.intercept_exceptions |= 1480 set_exception_intercept(svm, BP_VECTOR);
1334 1 << BP_VECTOR;
1335 } else 1481 } else
1336 vcpu->guest_debug = 0; 1482 vcpu->guest_debug = 0;
1337} 1483}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1345 else 1491 else
1346 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1492 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1347 1493
1348 update_db_intercept(vcpu); 1494 mark_dirty(svm->vmcb, VMCB_DR);
1349}
1350
1351static void load_host_msrs(struct kvm_vcpu *vcpu)
1352{
1353#ifdef CONFIG_X86_64
1354 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1355#endif
1356}
1357 1495
1358static void save_host_msrs(struct kvm_vcpu *vcpu) 1496 update_db_intercept(vcpu);
1359{
1360#ifdef CONFIG_X86_64
1361 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1362#endif
1363} 1497}
1364 1498
1365static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1499static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1372 1506
1373 svm->asid_generation = sd->asid_generation; 1507 svm->asid_generation = sd->asid_generation;
1374 svm->vmcb->control.asid = sd->next_asid++; 1508 svm->vmcb->control.asid = sd->next_asid++;
1509
1510 mark_dirty(svm->vmcb, VMCB_ASID);
1375} 1511}
1376 1512
1377static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1513static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
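
Only the tail of new_asid() is visible in this hunk; the elided top presumably still recycles the per-CPU ASID pool, so the whole function would read roughly as follows (sketch):

static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
        if (sd->next_asid > sd->max_asid) {
                ++sd->asid_generation;
                sd->next_asid = 1;
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
        }

        svm->asid_generation = sd->asid_generation;
        svm->vmcb->control.asid = sd->next_asid++;

        mark_dirty(svm->vmcb, VMCB_ASID);
}

The added mark_dirty(VMCB_ASID) is what tells the hardware that the ASID field changed now that clean bits are in use.
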
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1379 struct vcpu_svm *svm = to_svm(vcpu); 1515 struct vcpu_svm *svm = to_svm(vcpu);
1380 1516
1381 svm->vmcb->save.dr7 = value; 1517 svm->vmcb->save.dr7 = value;
1518 mark_dirty(svm->vmcb, VMCB_DR);
1382} 1519}
1383 1520
1384static int pf_interception(struct vcpu_svm *svm) 1521static int pf_interception(struct vcpu_svm *svm)
1385{ 1522{
1386 u64 fault_address; 1523 u64 fault_address = svm->vmcb->control.exit_info_2;
1387 u32 error_code; 1524 u32 error_code;
1525 int r = 1;
1388 1526
1389 fault_address = svm->vmcb->control.exit_info_2; 1527 switch (svm->apf_reason) {
1390 error_code = svm->vmcb->control.exit_info_1; 1528 default:
1391 1529 error_code = svm->vmcb->control.exit_info_1;
1392 trace_kvm_page_fault(fault_address, error_code); 1530
1393 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1531 trace_kvm_page_fault(fault_address, error_code);
1394 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1532 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1395 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1533 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1534 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1535 svm->vmcb->control.insn_bytes,
1536 svm->vmcb->control.insn_len);
1537 break;
1538 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1539 svm->apf_reason = 0;
1540 local_irq_disable();
1541 kvm_async_pf_task_wait(fault_address);
1542 local_irq_enable();
1543 break;
1544 case KVM_PV_REASON_PAGE_READY:
1545 svm->apf_reason = 0;
1546 local_irq_disable();
1547 kvm_async_pf_task_wake(fault_address);
1548 local_irq_enable();
1549 break;
1550 }
1551 return r;
1396} 1552}
1397 1553
1398static int db_interception(struct vcpu_svm *svm) 1554static int db_interception(struct vcpu_svm *svm)
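
pf_interception() now dispatches on svm->apf_reason, which svm_vcpu_run() below fills via kvm_read_and_reset_pf_reason() after a #PF exit; this covers the case where KVM itself runs as a paravirtualized guest and the outer hypervisor delivers an async page fault while the nested guest is executing. The KVM_PV_REASON_* values come from the async-page-fault ABI shared between a KVM host and its guests (asm/kvm_para.h in this series, shown here as an assumption):

#define KVM_PV_REASON_PAGE_NOT_PRESENT  1
#define KVM_PV_REASON_PAGE_READY        2

struct kvm_vcpu_pv_apf_data {
        __u32 reason;
        __u8 pad[60];
        __u32 enabled;
};

A non-zero reason marks the fault as a paravirtual notification, so it is routed to kvm_async_pf_task_wait()/wake() instead of the MMU; only the default case goes through kvm_mmu_page_fault(), now with the decode-assist instruction bytes passed along.
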
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
1440{ 1596{
1441 int er; 1597 int er;
1442 1598
1443 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1599 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1444 if (er != EMULATE_DONE) 1600 if (er != EMULATE_DONE)
1445 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1601 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1446 return 1; 1602 return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
1449static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1605static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1450{ 1606{
1451 struct vcpu_svm *svm = to_svm(vcpu); 1607 struct vcpu_svm *svm = to_svm(vcpu);
1452 u32 excp;
1453
1454 if (is_nested(svm)) {
1455 u32 h_excp, n_excp;
1456
1457 h_excp = svm->nested.hsave->control.intercept_exceptions;
1458 n_excp = svm->nested.intercept_exceptions;
1459 h_excp &= ~(1 << NM_VECTOR);
1460 excp = h_excp | n_excp;
1461 } else {
1462 excp = svm->vmcb->control.intercept_exceptions;
1463 excp &= ~(1 << NM_VECTOR);
1464 }
1465 1608
1466 svm->vmcb->control.intercept_exceptions = excp; 1609 clr_exception_intercept(svm, NM_VECTOR);
1467 1610
1468 svm->vcpu.fpu_active = 1; 1611 svm->vcpu.fpu_active = 1;
1469 update_cr0_intercept(svm); 1612 update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
1570 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1713 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1571 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1714 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1572 if (string || in) 1715 if (string || in)
1573 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1716 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1574 1717
1575 port = io_info >> 16; 1718 port = io_info >> 16;
1576 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1719 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1624 struct vcpu_svm *svm = to_svm(vcpu); 1767 struct vcpu_svm *svm = to_svm(vcpu);
1625 1768
1626 svm->vmcb->control.nested_cr3 = root; 1769 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu); 1770 mark_dirty(svm->vmcb, VMCB_NPT);
1771 svm_flush_tlb(vcpu);
1628} 1772}
1629 1773
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) 1774static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1775 struct x86_exception *fault)
1631{ 1776{
1632 struct vcpu_svm *svm = to_svm(vcpu); 1777 struct vcpu_svm *svm = to_svm(vcpu);
1633 1778
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 1779 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0; 1780 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; 1781 svm->vmcb->control.exit_info_1 = fault->error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; 1782 svm->vmcb->control.exit_info_2 = fault->address;
1638 1783
1639 nested_svm_vmexit(svm); 1784 nested_svm_vmexit(svm);
1640} 1785}
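
nested_svm_inject_npf_exit() now receives the fault description explicitly instead of fishing it out of vcpu->arch.fault. The descriptor is the new struct x86_exception; its layout is assumed here to be roughly (asm/kvm_emulate.h in this series):

struct x86_exception {
        u8 vector;
        bool error_code_valid;
        u16 error_code;
        bool nested_page_fault;
        u64 address;            /* cr2, or the gpa of a nested page fault */
};

exit_info_1/2 of the synthesized SVM_EXIT_NPF are filled straight from error_code and address.
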
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1680{ 1825{
1681 int vmexit; 1826 int vmexit;
1682 1827
1683 if (!is_nested(svm)) 1828 if (!is_guest_mode(&svm->vcpu))
1684 return 0; 1829 return 0;
1685 1830
1686 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1831 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1698/* This function returns true if it is safe to enable the irq window */ 1843
1699static inline bool nested_svm_intr(struct vcpu_svm *svm) 1844static inline bool nested_svm_intr(struct vcpu_svm *svm)
1700{ 1845{
1701 if (!is_nested(svm)) 1846 if (!is_guest_mode(&svm->vcpu))
1702 return true; 1847 return true;
1703 1848
1704 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1849 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1737/* This function returns true if it is safe to enable the nmi window */ 1882
1738static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1883static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1739{ 1884{
1740 if (!is_nested(svm)) 1885 if (!is_guest_mode(&svm->vcpu))
1741 return true; 1886 return true;
1742 1887
1743 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1888 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1836 return NESTED_EXIT_HOST; 1981 return NESTED_EXIT_HOST;
1837 break; 1982 break;
1838 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1983 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1839 /* When we're shadowing, trap PFs */ 1984 /* When we're shadowing, trap PFs, but not async PF */
1840 if (!npt_enabled) 1985 if (!npt_enabled && svm->apf_reason == 0)
1841 return NESTED_EXIT_HOST; 1986 return NESTED_EXIT_HOST;
1842 break; 1987 break;
1843 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 1988 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1865 case SVM_EXIT_IOIO: 2010 case SVM_EXIT_IOIO:
1866 vmexit = nested_svm_intercept_ioio(svm); 2011 vmexit = nested_svm_intercept_ioio(svm);
1867 break; 2012 break;
1868 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2013 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1869 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2014 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1870 if (svm->nested.intercept_cr_read & cr_bits) 2015 if (svm->nested.intercept_cr & bit)
1871 vmexit = NESTED_EXIT_DONE; 2016 vmexit = NESTED_EXIT_DONE;
1872 break; 2017 break;
1873 } 2018 }
1874 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 2019 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1875 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 2020 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1876 if (svm->nested.intercept_cr_write & cr_bits) 2021 if (svm->nested.intercept_dr & bit)
1877 vmexit = NESTED_EXIT_DONE;
1878 break;
1879 }
1880 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1881 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1882 if (svm->nested.intercept_dr_read & dr_bits)
1883 vmexit = NESTED_EXIT_DONE;
1884 break;
1885 }
1886 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1887 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1888 if (svm->nested.intercept_dr_write & dr_bits)
1889 vmexit = NESTED_EXIT_DONE; 2022 vmexit = NESTED_EXIT_DONE;
1890 break; 2023 break;
1891 } 2024 }
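
The single range check works because intercept_cr and intercept_dr now pack read and write intercepts into one 32-bit word: bits 0-15 carry the read intercepts and bits 16-31 the writes, mirroring the spacing of the SVM_EXIT_READ_*/SVM_EXIT_WRITE_* exit codes. The bit names are presumably defined along these lines in asm/svm.h:

enum {
        INTERCEPT_CR0_READ      = 0,
        INTERCEPT_CR3_READ      = 3,
        INTERCEPT_CR4_READ      = 4,
        INTERCEPT_CR8_READ      = 8,
        INTERCEPT_CR0_WRITE     = 16 + 0,
        INTERCEPT_CR3_WRITE     = 16 + 3,
        INTERCEPT_CR4_WRITE     = 16 + 4,
        INTERCEPT_CR8_WRITE     = 16 + 8,
};

intercept_dr uses the same read-low/write-high split, and the dump_vmcb() and tracing changes below print the two halves with "& 0xffff" and ">> 16" for the same reason.
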
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1893 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2026 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1894 if (svm->nested.intercept_exceptions & excp_bits) 2027 if (svm->nested.intercept_exceptions & excp_bits)
1895 vmexit = NESTED_EXIT_DONE; 2028 vmexit = NESTED_EXIT_DONE;
2029 /* async page fault always causes vmexit */
2030 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2031 svm->apf_reason != 0)
2032 vmexit = NESTED_EXIT_DONE;
1896 break; 2033 break;
1897 } 2034 }
1898 case SVM_EXIT_ERR: { 2035 case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1926 struct vmcb_control_area *dst = &dst_vmcb->control; 2063 struct vmcb_control_area *dst = &dst_vmcb->control;
1927 struct vmcb_control_area *from = &from_vmcb->control; 2064 struct vmcb_control_area *from = &from_vmcb->control;
1928 2065
1929 dst->intercept_cr_read = from->intercept_cr_read; 2066 dst->intercept_cr = from->intercept_cr;
1930 dst->intercept_cr_write = from->intercept_cr_write; 2067 dst->intercept_dr = from->intercept_dr;
1931 dst->intercept_dr_read = from->intercept_dr_read;
1932 dst->intercept_dr_write = from->intercept_dr_write;
1933 dst->intercept_exceptions = from->intercept_exceptions; 2068 dst->intercept_exceptions = from->intercept_exceptions;
1934 dst->intercept = from->intercept; 2069 dst->intercept = from->intercept;
1935 dst->iopm_base_pa = from->iopm_base_pa; 2070 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1970 if (!nested_vmcb) 2105 if (!nested_vmcb)
1971 return 1; 2106 return 1;
1972 2107
1973 /* Exit nested SVM mode */ 2108 /* Exit Guest-Mode */
2109 leave_guest_mode(&svm->vcpu);
1974 svm->nested.vmcb = 0; 2110 svm->nested.vmcb = 0;
1975 2111
1976 /* Give the current vmcb to the guest */ 2112 /* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1984 nested_vmcb->save.idtr = vmcb->save.idtr; 2120 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer; 2121 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2122 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2123 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1988 nested_vmcb->save.cr2 = vmcb->save.cr2; 2124 nested_vmcb->save.cr2 = vmcb->save.cr2;
1989 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2125 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1990 nested_vmcb->save.rflags = vmcb->save.rflags; 2126 nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2061 svm->vmcb->save.cpl = 0; 2197 svm->vmcb->save.cpl = 0;
2062 svm->vmcb->control.exit_int_info = 0; 2198 svm->vmcb->control.exit_int_info = 0;
2063 2199
2200 mark_all_dirty(svm->vmcb);
2201
2064 nested_svm_unmap(page); 2202 nested_svm_unmap(page);
2065 2203
2066 nested_svm_uninit_mmu_context(&svm->vcpu); 2204 nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2148 nested_vmcb->control.event_inj, 2286 nested_vmcb->control.event_inj,
2149 nested_vmcb->control.nested_ctl); 2287 nested_vmcb->control.nested_ctl);
2150 2288
2151 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, 2289 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2152 nested_vmcb->control.intercept_cr_write, 2290 nested_vmcb->control.intercept_cr >> 16,
2153 nested_vmcb->control.intercept_exceptions, 2291 nested_vmcb->control.intercept_exceptions,
2154 nested_vmcb->control.intercept); 2292 nested_vmcb->control.intercept);
2155 2293
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2177 if (npt_enabled) 2315 if (npt_enabled)
2178 hsave->save.cr3 = vmcb->save.cr3; 2316 hsave->save.cr3 = vmcb->save.cr3;
2179 else 2317 else
2180 hsave->save.cr3 = svm->vcpu.arch.cr3; 2318 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2181 2319
2182 copy_vmcb_control_area(hsave, vmcb); 2320 copy_vmcb_control_area(hsave, vmcb);
2183 2321
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2229 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 2367 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
2230 2368
2231 /* cache intercepts */ 2369 /* cache intercepts */
2232 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2370 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
2233 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; 2371 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
2234 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
2235 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
2236 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 2372 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2237 svm->nested.intercept = nested_vmcb->control.intercept; 2373 svm->nested.intercept = nested_vmcb->control.intercept;
2238 2374
2239 force_new_asid(&svm->vcpu); 2375 svm_flush_tlb(&svm->vcpu);
2240 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 2376 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2241 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 2377 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2242 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 2378 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2245 2381
2246 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 2382 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2247 /* We only want the cr8 intercept bits of the guest */ 2383 /* We only want the cr8 intercept bits of the guest */
2248 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; 2384 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2249 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2385 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2250 } 2386 }
2251 2387
2252 /* We don't want to see VMMCALLs from a nested guest */ 2388 /* We don't want to see VMMCALLs from a nested guest */
2253 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); 2389 clr_intercept(svm, INTERCEPT_VMMCALL);
2254
2255 /*
2256 * We don't want a nested guest to be more powerful than the guest, so
2257 * all intercepts are ORed
2258 */
2259 svm->vmcb->control.intercept_cr_read |=
2260 nested_vmcb->control.intercept_cr_read;
2261 svm->vmcb->control.intercept_cr_write |=
2262 nested_vmcb->control.intercept_cr_write;
2263 svm->vmcb->control.intercept_dr_read |=
2264 nested_vmcb->control.intercept_dr_read;
2265 svm->vmcb->control.intercept_dr_write |=
2266 nested_vmcb->control.intercept_dr_write;
2267 svm->vmcb->control.intercept_exceptions |=
2268 nested_vmcb->control.intercept_exceptions;
2269
2270 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2271 2390
2272 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 2391 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2273 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2392 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2278 2397
2279 nested_svm_unmap(page); 2398 nested_svm_unmap(page);
2280 2399
2281 /* nested_vmcb is our indicator if nested SVM is activated */ 2400 /* Enter Guest-Mode */
2401 enter_guest_mode(&svm->vcpu);
2402
2403 /*
2404 * Merge guest and host intercepts - must be called with vcpu in
2405 * guest-mode to take effect here
2406 */
2407 recalc_intercepts(svm);
2408
2282 svm->nested.vmcb = vmcb_gpa; 2409 svm->nested.vmcb = vmcb_gpa;
2283 2410
2284 enable_gif(svm); 2411 enable_gif(svm);
2285 2412
2413 mark_all_dirty(svm->vmcb);
2414
2286 return true; 2415 return true;
2287} 2416}
2288 2417
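
recalc_intercepts() is the other half of the intercept accessors: once the vcpu is in guest mode, the active VMCB has to intercept everything either the host or the L1 guest asked for, which is why the old explicit OR block above could be dropped. A sketch of the assumed helper:

static void recalc_intercepts(struct vcpu_svm *svm)
{
        struct vmcb_control_area *c, *h;
        struct nested_state *g;

        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

        if (!is_guest_mode(&svm->vcpu))
                return;

        c = &svm->vmcb->control;
        h = &svm->nested.hsave->control;
        g = &svm->nested;

        c->intercept_cr         = h->intercept_cr | g->intercept_cr;
        c->intercept_dr         = h->intercept_dr | g->intercept_dr;
        c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
        c->intercept            = h->intercept | g->intercept;
}
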
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
2400 svm_clear_vintr(svm); 2529 svm_clear_vintr(svm);
2401 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2530 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2402 2531
2532 mark_dirty(svm->vmcb, VMCB_INTR);
2533
2403 return 1; 2534 return 1;
2404} 2535}
2405 2536
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
2426 return 1; 2557 return 1;
2427} 2558}
2428 2559
2560static int xsetbv_interception(struct vcpu_svm *svm)
2561{
2562 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2563 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2564
2565 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2566 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2567 skip_emulated_instruction(&svm->vcpu);
2568 }
2569
2570 return 1;
2571}
2572
2429static int invalid_op_interception(struct vcpu_svm *svm) 2573static int invalid_op_interception(struct vcpu_svm *svm)
2430{ 2574{
2431 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2575 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
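
XSETBV passes the new XCR value in EDX:EAX and the XCR index in ECX, so the handler can forward straight to kvm_set_xcr(). kvm_read_edx_eax() is assumed to be a small helper along these lines (added to kvm_cache_regs.h in this series):

static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
{
        return (u64)kvm_register_read(vcpu, VCPU_REGS_RAX) |
               ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32);
}

The "+ 3" when advancing next_rip matches the three-byte 0F 01 D1 encoding of XSETBV.
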
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
2507static int iret_interception(struct vcpu_svm *svm) 2651static int iret_interception(struct vcpu_svm *svm)
2508{ 2652{
2509 ++svm->vcpu.stat.nmi_window_exits; 2653 ++svm->vcpu.stat.nmi_window_exits;
2510 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 2654 clr_intercept(svm, INTERCEPT_IRET);
2511 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2655 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2512 return 1; 2656 return 1;
2513} 2657}
2514 2658
2515static int invlpg_interception(struct vcpu_svm *svm) 2659static int invlpg_interception(struct vcpu_svm *svm)
2516{ 2660{
2517 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2661 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2662 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2663
2664 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2665 skip_emulated_instruction(&svm->vcpu);
2666 return 1;
2518} 2667}
2519 2668
2520static int emulate_on_interception(struct vcpu_svm *svm) 2669static int emulate_on_interception(struct vcpu_svm *svm)
2521{ 2670{
2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2671 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2672}
2673
2674#define CR_VALID (1ULL << 63)
2675
2676static int cr_interception(struct vcpu_svm *svm)
2677{
2678 int reg, cr;
2679 unsigned long val;
2680 int err;
2681
2682 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2683 return emulate_on_interception(svm);
2684
2685 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2686 return emulate_on_interception(svm);
2687
2688 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2689 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2690
2691 err = 0;
2692 if (cr >= 16) { /* mov to cr */
2693 cr -= 16;
2694 val = kvm_register_read(&svm->vcpu, reg);
2695 switch (cr) {
2696 case 0:
2697 err = kvm_set_cr0(&svm->vcpu, val);
2698 break;
2699 case 3:
2700 err = kvm_set_cr3(&svm->vcpu, val);
2701 break;
2702 case 4:
2703 err = kvm_set_cr4(&svm->vcpu, val);
2704 break;
2705 case 8:
2706 err = kvm_set_cr8(&svm->vcpu, val);
2707 break;
2708 default:
2709 WARN(1, "unhandled write to CR%d", cr);
2710 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2711 return 1;
2712 }
2713 } else { /* mov from cr */
2714 switch (cr) {
2715 case 0:
2716 val = kvm_read_cr0(&svm->vcpu);
2717 break;
2718 case 2:
2719 val = svm->vcpu.arch.cr2;
2720 break;
2721 case 3:
2722 val = kvm_read_cr3(&svm->vcpu);
2723 break;
2724 case 4:
2725 val = kvm_read_cr4(&svm->vcpu);
2726 break;
2727 case 8:
2728 val = kvm_get_cr8(&svm->vcpu);
2729 break;
2730 default:
2731 WARN(1, "unhandled read from CR%d", cr);
2732 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2733 return 1;
2734 }
2735 kvm_register_write(&svm->vcpu, reg, val);
2736 }
2737 kvm_complete_insn_gp(&svm->vcpu, err);
2738
2739 return 1;
2523} 2740}
2524 2741
2525static int cr0_write_interception(struct vcpu_svm *svm) 2742static int cr0_write_interception(struct vcpu_svm *svm)
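
With decode assists the CPU reports which general-purpose register the MOV to/from CRn used, so cr_interception() can skip the instruction emulator entirely. Bit 63 of EXITINFO1 (the CR_VALID define just above) flags valid decode information, and the low bits hold the GPR operand number; SVM_EXITINFO_REG_MASK is assumed to come from asm/svm.h as something like:

#define SVM_EXITINFO_REG_MASK   0x0F    /* GPR operand number in EXITINFO1 */

Because the write exit codes sit 16 slots after the read ones, "cr >= 16" distinguishes MOV-to-CR from MOV-from-CR; dr_interception() below plays the same trick for the debug registers.
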
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2527 struct kvm_vcpu *vcpu = &svm->vcpu; 2744 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r; 2745 int r;
2529 2746
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0); 2747 r = cr_interception(svm);
2531 2748
2532 if (svm->nested.vmexit_rip) { 2749 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); 2750 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2536 svm->nested.vmexit_rip = 0; 2753 svm->nested.vmexit_rip = 0;
2537 } 2754 }
2538 2755
2539 return r == EMULATE_DONE; 2756 return r;
2757}
2758
2759static int dr_interception(struct vcpu_svm *svm)
2760{
2761 int reg, dr;
2762 unsigned long val;
2763 int err;
2764
2765 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2766 return emulate_on_interception(svm);
2767
2768 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2769 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2770
2771 if (dr >= 16) { /* mov to DRn */
2772 val = kvm_register_read(&svm->vcpu, reg);
2773 kvm_set_dr(&svm->vcpu, dr - 16, val);
2774 } else {
2775 err = kvm_get_dr(&svm->vcpu, dr, &val);
2776 if (!err)
2777 kvm_register_write(&svm->vcpu, reg, val);
2778 }
2779
2780 return 1;
2540} 2781}
2541 2782
2542static int cr8_write_interception(struct vcpu_svm *svm) 2783static int cr8_write_interception(struct vcpu_svm *svm)
2543{ 2784{
2544 struct kvm_run *kvm_run = svm->vcpu.run; 2785 struct kvm_run *kvm_run = svm->vcpu.run;
2786 int r;
2545 2787
2546 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2788 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2547 /* instruction emulation calls kvm_set_cr8() */ 2789 /* instruction emulation calls kvm_set_cr8() */
2548 emulate_instruction(&svm->vcpu, 0, 0, 0); 2790 r = cr_interception(svm);
2549 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2791 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2550 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2792 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2551 return 1; 2793 return r;
2552 } 2794 }
2553 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2795 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2554 return 1; 2796 return r;
2555 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2797 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2556 return 0; 2798 return 0;
2557} 2799}
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2562 2804
2563 switch (ecx) { 2805 switch (ecx) {
2564 case MSR_IA32_TSC: { 2806 case MSR_IA32_TSC: {
2565 u64 tsc_offset; 2807 struct vmcb *vmcb = get_host_vmcb(svm);
2566 2808
2567 if (is_nested(svm)) 2809 *data = vmcb->control.tsc_offset + native_read_tsc();
2568 tsc_offset = svm->nested.hsave->control.tsc_offset;
2569 else
2570 tsc_offset = svm->vmcb->control.tsc_offset;
2571
2572 *data = tsc_offset + native_read_tsc();
2573 break; 2810 break;
2574 } 2811 }
2575 case MSR_STAR: 2812 case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2714 svm->vmcb->save.sysenter_esp = data; 2951 svm->vmcb->save.sysenter_esp = data;
2715 break; 2952 break;
2716 case MSR_IA32_DEBUGCTLMSR: 2953 case MSR_IA32_DEBUGCTLMSR:
2717 if (!svm_has(SVM_FEATURE_LBRV)) { 2954 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2718 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 2955 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2719 __func__, data); 2956 __func__, data);
2720 break; 2957 break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2723 return 1; 2960 return 1;
2724 2961
2725 svm->vmcb->save.dbgctl = data; 2962 svm->vmcb->save.dbgctl = data;
2963 mark_dirty(svm->vmcb, VMCB_LBR);
2726 if (data & (1ULL<<0)) 2964 if (data & (1ULL<<0))
2727 svm_enable_lbrv(svm); 2965 svm_enable_lbrv(svm);
2728 else 2966 else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3013 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2776 svm_clear_vintr(svm); 3014 svm_clear_vintr(svm);
2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3015 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3016 mark_dirty(svm->vmcb, VMCB_INTR);
2778 /* 3017 /*
2779 * If the user space waits to inject interrupts, exit as soon as 3018 * If the user space waits to inject interrupts, exit as soon as
2780 * possible 3019 * possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
2797} 3036}
2798 3037
2799static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3038static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2800 [SVM_EXIT_READ_CR0] = emulate_on_interception, 3039 [SVM_EXIT_READ_CR0] = cr_interception,
2801 [SVM_EXIT_READ_CR3] = emulate_on_interception, 3040 [SVM_EXIT_READ_CR3] = cr_interception,
2802 [SVM_EXIT_READ_CR4] = emulate_on_interception, 3041 [SVM_EXIT_READ_CR4] = cr_interception,
2803 [SVM_EXIT_READ_CR8] = emulate_on_interception, 3042 [SVM_EXIT_READ_CR8] = cr_interception,
2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3043 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception, 3044 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 3045 [SVM_EXIT_WRITE_CR3] = cr_interception,
2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 3046 [SVM_EXIT_WRITE_CR4] = cr_interception,
2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3047 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2809 [SVM_EXIT_READ_DR0] = emulate_on_interception, 3048 [SVM_EXIT_READ_DR0] = dr_interception,
2810 [SVM_EXIT_READ_DR1] = emulate_on_interception, 3049 [SVM_EXIT_READ_DR1] = dr_interception,
2811 [SVM_EXIT_READ_DR2] = emulate_on_interception, 3050 [SVM_EXIT_READ_DR2] = dr_interception,
2812 [SVM_EXIT_READ_DR3] = emulate_on_interception, 3051 [SVM_EXIT_READ_DR3] = dr_interception,
2813 [SVM_EXIT_READ_DR4] = emulate_on_interception, 3052 [SVM_EXIT_READ_DR4] = dr_interception,
2814 [SVM_EXIT_READ_DR5] = emulate_on_interception, 3053 [SVM_EXIT_READ_DR5] = dr_interception,
2815 [SVM_EXIT_READ_DR6] = emulate_on_interception, 3054 [SVM_EXIT_READ_DR6] = dr_interception,
2816 [SVM_EXIT_READ_DR7] = emulate_on_interception, 3055 [SVM_EXIT_READ_DR7] = dr_interception,
2817 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 3056 [SVM_EXIT_WRITE_DR0] = dr_interception,
2818 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 3057 [SVM_EXIT_WRITE_DR1] = dr_interception,
2819 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 3058 [SVM_EXIT_WRITE_DR2] = dr_interception,
2820 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 3059 [SVM_EXIT_WRITE_DR3] = dr_interception,
2821 [SVM_EXIT_WRITE_DR4] = emulate_on_interception, 3060 [SVM_EXIT_WRITE_DR4] = dr_interception,
2822 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 3061 [SVM_EXIT_WRITE_DR5] = dr_interception,
2823 [SVM_EXIT_WRITE_DR6] = emulate_on_interception, 3062 [SVM_EXIT_WRITE_DR6] = dr_interception,
2824 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 3063 [SVM_EXIT_WRITE_DR7] = dr_interception,
2825 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3064 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2826 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3065 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2827 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3066 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2854 [SVM_EXIT_WBINVD] = emulate_on_interception, 3093 [SVM_EXIT_WBINVD] = emulate_on_interception,
2855 [SVM_EXIT_MONITOR] = invalid_op_interception, 3094 [SVM_EXIT_MONITOR] = invalid_op_interception,
2856 [SVM_EXIT_MWAIT] = invalid_op_interception, 3095 [SVM_EXIT_MWAIT] = invalid_op_interception,
3096 [SVM_EXIT_XSETBV] = xsetbv_interception,
2857 [SVM_EXIT_NPF] = pf_interception, 3097 [SVM_EXIT_NPF] = pf_interception,
2858}; 3098};
2859 3099
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2864 struct vmcb_save_area *save = &svm->vmcb->save; 3104 struct vmcb_save_area *save = &svm->vmcb->save;
2865 3105
2866 pr_err("VMCB Control Area:\n"); 3106 pr_err("VMCB Control Area:\n");
2867 pr_err("cr_read: %04x\n", control->intercept_cr_read); 3107 pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
2868 pr_err("cr_write: %04x\n", control->intercept_cr_write); 3108 pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
2869 pr_err("dr_read: %04x\n", control->intercept_dr_read); 3109 pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
2870 pr_err("dr_write: %04x\n", control->intercept_dr_write); 3110 pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
2871 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3111 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2872 pr_err("intercepts: %016llx\n", control->intercept); 3112 pr_err("intercepts: %016llx\n", control->intercept);
2873 pr_err("pause filter count: %d\n", control->pause_filter_count); 3113 pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2950 3190
2951} 3191}
2952 3192
3193static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3194{
3195 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3196
3197 *info1 = control->exit_info_1;
3198 *info2 = control->exit_info_2;
3199}
3200
2953static int handle_exit(struct kvm_vcpu *vcpu) 3201static int handle_exit(struct kvm_vcpu *vcpu)
2954{ 3202{
2955 struct vcpu_svm *svm = to_svm(vcpu); 3203 struct vcpu_svm *svm = to_svm(vcpu);
2956 struct kvm_run *kvm_run = vcpu->run; 3204 struct kvm_run *kvm_run = vcpu->run;
2957 u32 exit_code = svm->vmcb->control.exit_code; 3205 u32 exit_code = svm->vmcb->control.exit_code;
2958 3206
2959 trace_kvm_exit(exit_code, vcpu); 3207 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
2960 3208
2961 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) 3209 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2962 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3210 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2963 if (npt_enabled) 3211 if (npt_enabled)
2964 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3212 vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2970 return 1; 3218 return 1;
2971 } 3219 }
2972 3220
2973 if (is_nested(svm)) { 3221 if (is_guest_mode(vcpu)) {
2974 int vmexit; 3222 int vmexit;
2975 3223
2976 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 3224 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
3033 3281
3034 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 3282 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3035 3283
3036 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3037 /* FIXME: handle wraparound of asid_generation */ 3284 /* FIXME: handle wraparound of asid_generation */
3038 if (svm->asid_generation != sd->asid_generation) 3285 if (svm->asid_generation != sd->asid_generation)
3039 new_asid(svm, sd); 3286 new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3045 3292
3046 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3293 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3047 vcpu->arch.hflags |= HF_NMI_MASK; 3294 vcpu->arch.hflags |= HF_NMI_MASK;
3048 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3295 set_intercept(svm, INTERCEPT_IRET);
3049 ++vcpu->stat.nmi_injections; 3296 ++vcpu->stat.nmi_injections;
3050} 3297}
3051 3298
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3058 control->int_ctl &= ~V_INTR_PRIO_MASK; 3305 control->int_ctl &= ~V_INTR_PRIO_MASK;
3059 control->int_ctl |= V_IRQ_MASK | 3306 control->int_ctl |= V_IRQ_MASK |
3060 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 3307 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3308 mark_dirty(svm->vmcb, VMCB_INTR);
3061} 3309}
3062 3310
3063static void svm_set_irq(struct kvm_vcpu *vcpu) 3311static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3077{ 3325{
3078 struct vcpu_svm *svm = to_svm(vcpu); 3326 struct vcpu_svm *svm = to_svm(vcpu);
3079 3327
3080 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3328 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3081 return; 3329 return;
3082 3330
3083 if (irr == -1) 3331 if (irr == -1)
3084 return; 3332 return;
3085 3333
3086 if (tpr >= irr) 3334 if (tpr >= irr)
3087 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 3335 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3088} 3336}
3089 3337
3090static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3338static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3112 3360
3113 if (masked) { 3361 if (masked) {
3114 svm->vcpu.arch.hflags |= HF_NMI_MASK; 3362 svm->vcpu.arch.hflags |= HF_NMI_MASK;
3115 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3363 set_intercept(svm, INTERCEPT_IRET);
3116 } else { 3364 } else {
3117 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 3365 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3118 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 3366 clr_intercept(svm, INTERCEPT_IRET);
3119 } 3367 }
3120} 3368}
3121 3369
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3131 3379
3132 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3380 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
3133 3381
3134 if (is_nested(svm)) 3382 if (is_guest_mode(vcpu))
3135 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3383 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3136 3384
3137 return ret; 3385 return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3177 3425
3178static void svm_flush_tlb(struct kvm_vcpu *vcpu) 3426static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3179{ 3427{
3180 force_new_asid(vcpu); 3428 struct vcpu_svm *svm = to_svm(vcpu);
3429
3430 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3431 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3432 else
3433 svm->asid_generation--;
3181} 3434}
3182 3435
3183static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 3436static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
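
Flush-by-ASID lets the host flush only the current guest's TLB entries instead of retiring the ASID and paying for a full flush later. The tlb_ctl encodings used here are assumed to be (asm/svm.h, with the last two added for this feature):

#define TLB_CONTROL_DO_NOTHING          0
#define TLB_CONTROL_FLUSH_ALL_ASID      1
#define TLB_CONTROL_FLUSH_ASID          3
#define TLB_CONTROL_FLUSH_ASID_LOCAL    7

Without FLUSHBYASID the old behaviour is preserved by decrementing asid_generation, which makes pre_svm_run() assign a fresh ASID on the next entry so stale translations are never reused.
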
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3188{ 3441{
3189 struct vcpu_svm *svm = to_svm(vcpu); 3442 struct vcpu_svm *svm = to_svm(vcpu);
3190 3443
3191 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3444 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3192 return; 3445 return;
3193 3446
3194 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 3447 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3195 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3448 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3196 kvm_set_cr8(vcpu, cr8); 3449 kvm_set_cr8(vcpu, cr8);
3197 } 3450 }
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3202 struct vcpu_svm *svm = to_svm(vcpu); 3455 struct vcpu_svm *svm = to_svm(vcpu);
3203 u64 cr8; 3456 u64 cr8;
3204 3457
3205 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3458 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3206 return; 3459 return;
3207 3460
3208 cr8 = kvm_get_cr8(vcpu); 3461 cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3289static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3542static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3290{ 3543{
3291 struct vcpu_svm *svm = to_svm(vcpu); 3544 struct vcpu_svm *svm = to_svm(vcpu);
3292 u16 fs_selector;
3293 u16 gs_selector;
3294 u16 ldt_selector;
3295 3545
3296 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3546 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3297 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3547 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3308 3558
3309 sync_lapic_to_cr8(vcpu); 3559 sync_lapic_to_cr8(vcpu);
3310 3560
3311 save_host_msrs(vcpu);
3312 savesegment(fs, fs_selector);
3313 savesegment(gs, gs_selector);
3314 ldt_selector = kvm_read_ldt();
3315 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3561 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3316 3562
3317 clgi(); 3563 clgi();
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3389#endif 3635#endif
3390 ); 3636 );
3391 3637
3392 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3393 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3394 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3396
3397 load_host_msrs(vcpu);
3398 kvm_load_ldt(ldt_selector);
3399 loadsegment(fs, fs_selector);
3400#ifdef CONFIG_X86_64 3638#ifdef CONFIG_X86_64
3401 load_gs_index(gs_selector); 3639 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3402 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3403#else 3640#else
3404 loadsegment(gs, gs_selector); 3641 loadsegment(fs, svm->host.fs);
3405#endif 3642#endif
3406 3643
3407 reload_tss(vcpu); 3644 reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3410 3647
3411 stgi(); 3648 stgi();
3412 3649
3650 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3651 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3652 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3653 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3654
3413 sync_cr8_to_lapic(vcpu); 3655 sync_cr8_to_lapic(vcpu);
3414 3656
3415 svm->next_rip = 0; 3657 svm->next_rip = 0;
3416 3658
3659 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3660
3661 /* if exit due to PF check for async PF */
3662 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3663 svm->apf_reason = kvm_read_and_reset_pf_reason();
3664
3417 if (npt_enabled) { 3665 if (npt_enabled) {
3418 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3666 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3419 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3667 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3426 if (unlikely(svm->vmcb->control.exit_code == 3674 if (unlikely(svm->vmcb->control.exit_code ==
3427 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3675 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3428 svm_handle_mce(svm); 3676 svm_handle_mce(svm);
3677
3678 mark_all_clean(svm->vmcb);
3429} 3679}
3430 3680
3431#undef R 3681#undef R
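
Two details at the tail of svm_vcpu_run() are worth noting: the async-page-fault reason is latched immediately after the world switch, and mark_all_clean() declares the whole VMCB unmodified again at the end of the run, so any later change must go through mark_dirty() or the accessor helpers to be noticed. kvm_read_and_reset_pf_reason() is assumed to look roughly like this (added to arch/x86/kernel/kvm.c in this series):

u32 kvm_read_and_reset_pf_reason(void)
{
        u32 reason = 0;

        if (__get_cpu_var(apf_reason).enabled) {
                reason = __get_cpu_var(apf_reason).reason;
                __get_cpu_var(apf_reason).reason = 0;
        }

        return reason;
}
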
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3435 struct vcpu_svm *svm = to_svm(vcpu); 3685 struct vcpu_svm *svm = to_svm(vcpu);
3436 3686
3437 svm->vmcb->save.cr3 = root; 3687 svm->vmcb->save.cr3 = root;
3438 force_new_asid(vcpu); 3688 mark_dirty(svm->vmcb, VMCB_CR);
3689 svm_flush_tlb(vcpu);
3439} 3690}
3440 3691
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3692static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3443 struct vcpu_svm *svm = to_svm(vcpu); 3694 struct vcpu_svm *svm = to_svm(vcpu);
3444 3695
3445 svm->vmcb->control.nested_cr3 = root; 3696 svm->vmcb->control.nested_cr3 = root;
3697 mark_dirty(svm->vmcb, VMCB_NPT);
3446 3698
3447 /* Also sync guest cr3 here in case we live migrate */ 3699 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3; 3700 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3701 mark_dirty(svm->vmcb, VMCB_CR);
3449 3702
3450 force_new_asid(vcpu); 3703 svm_flush_tlb(vcpu);
3451} 3704}
3452 3705
3453static int is_disabled(void) 3706static int is_disabled(void)
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3747static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3495{ 3748{
3496 switch (func) { 3749 switch (func) {
3497 case 0x00000001:
3498 /* Mask out xsave bit as long as it is not supported by SVM */
3499 entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
3500 break;
3501 case 0x80000001: 3750 case 0x80000001:
3502 if (nested) 3751 if (nested)
3503 entry->ecx |= (1 << 2); /* Set SVM bit */ 3752 entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3511 additional features */ 3760 additional features */
3512 3761
3513 /* Support next_rip if host supports it */ 3762 /* Support next_rip if host supports it */
3514 if (svm_has(SVM_FEATURE_NRIP)) 3763 if (boot_cpu_has(X86_FEATURE_NRIPS))
3515 entry->edx |= SVM_FEATURE_NRIP; 3764 entry->edx |= SVM_FEATURE_NRIP;
3516 3765
3517 /* Support NPT for the guest if enabled */ 3766 /* Support NPT for the guest if enabled */
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
3571 { SVM_EXIT_WBINVD, "wbinvd" }, 3820 { SVM_EXIT_WBINVD, "wbinvd" },
3572 { SVM_EXIT_MONITOR, "monitor" }, 3821 { SVM_EXIT_MONITOR, "monitor" },
3573 { SVM_EXIT_MWAIT, "mwait" }, 3822 { SVM_EXIT_MWAIT, "mwait" },
3823 { SVM_EXIT_XSETBV, "xsetbv" },
3574 { SVM_EXIT_NPF, "npf" }, 3824 { SVM_EXIT_NPF, "npf" },
3575 { -1, NULL } 3825 { -1, NULL }
3576}; 3826};
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3594{ 3844{
3595 struct vcpu_svm *svm = to_svm(vcpu); 3845 struct vcpu_svm *svm = to_svm(vcpu);
3596 3846
3597 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3847 set_exception_intercept(svm, NM_VECTOR);
3598 if (is_nested(svm))
3599 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3600 update_cr0_intercept(svm); 3848 update_cr0_intercept(svm);
3601} 3849}
3602 3850
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3627 .get_cpl = svm_get_cpl, 3875 .get_cpl = svm_get_cpl,
3628 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 3876 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3629 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, 3877 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
3878 .decache_cr3 = svm_decache_cr3,
3630 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 3879 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3631 .set_cr0 = svm_set_cr0, 3880 .set_cr0 = svm_set_cr0,
3632 .set_cr3 = svm_set_cr3, 3881 .set_cr3 = svm_set_cr3,
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
3667 .get_tdp_level = get_npt_level, 3916 .get_tdp_level = get_npt_level,
3668 .get_mt_mask = svm_get_mt_mask, 3917 .get_mt_mask = svm_get_mt_mask,
3669 3918
3919 .get_exit_info = svm_get_exit_info,
3670 .exit_reasons_str = svm_exit_reasons_str, 3920 .exit_reasons_str = svm_exit_reasons_str,
3921
3671 .get_lpage_level = svm_get_lpage_level, 3922 .get_lpage_level = svm_get_lpage_level,
3672 3923
3673 .cpuid_update = svm_cpuid_update, 3924 .cpuid_update = svm_cpuid_update,
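/*
 * Illustrative sketch, not taken from this patch: the .get_exit_info hook
 * wired up above feeds the extended kvm_exit tracepoint in trace.h below,
 * but svm_get_exit_info() itself falls outside this excerpt.  Going by the
 * VMX counterpart shown later (vmx_get_exit_info() reporting
 * EXIT_QUALIFICATION and VM_EXIT_INTR_INFO), and assuming the usual
 * exit_info_1/exit_info_2 fields of struct vmcb_control_area, an SVM-side
 * stand-in could look like:
 */
static void svm_get_exit_info_sketch(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

	*info1 = control->exit_info_1;	/* first VMCB exit-information word */
	*info2 = control->exit_info_2;	/* second VMCB exit-information word */
}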
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) 178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) 179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
180 180
181#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2
183
181/* 184/*
182 * Tracepoint for kvm guest exit: 185 * Tracepoint for kvm guest exit:
183 */ 186 */
184TRACE_EVENT(kvm_exit, 187TRACE_EVENT(kvm_exit,
185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), 188 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
186 TP_ARGS(exit_reason, vcpu), 189 TP_ARGS(exit_reason, vcpu, isa),
187 190
188 TP_STRUCT__entry( 191 TP_STRUCT__entry(
189 __field( unsigned int, exit_reason ) 192 __field( unsigned int, exit_reason )
190 __field( unsigned long, guest_rip ) 193 __field( unsigned long, guest_rip )
194 __field( u32, isa )
195 __field( u64, info1 )
196 __field( u64, info2 )
191 ), 197 ),
192 198
193 TP_fast_assign( 199 TP_fast_assign(
194 __entry->exit_reason = exit_reason; 200 __entry->exit_reason = exit_reason;
195 __entry->guest_rip = kvm_rip_read(vcpu); 201 __entry->guest_rip = kvm_rip_read(vcpu);
202 __entry->isa = isa;
203 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
204 &__entry->info2);
196 ), 205 ),
197 206
198 TP_printk("reason %s rip 0x%lx", 207 TP_printk("reason %s rip 0x%lx info %llx %llx",
199 ftrace_print_symbols_seq(p, __entry->exit_reason, 208 ftrace_print_symbols_seq(p, __entry->exit_reason,
200 kvm_x86_ops->exit_reasons_str), 209 kvm_x86_ops->exit_reasons_str),
201 __entry->guest_rip) 210 __entry->guest_rip, __entry->info1, __entry->info2)
202); 211);
203 212
204/* 213/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fcbe9515c5..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69static int __read_mostly vmm_exclusive = 1; 69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 70module_param(vmm_exclusive, bool, S_IRUGO);
71 71
72static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO);
74
72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
74#define KVM_GUEST_CR0_MASK \ 77#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
177static u64 construct_eptp(unsigned long root_hpa); 180static u64 construct_eptp(unsigned long root_hpa);
178static void kvm_cpu_vmxon(u64 addr); 181static void kvm_cpu_vmxon(u64 addr);
179static void kvm_cpu_vmxoff(void); 182static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
180 184
181static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
182static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
188static unsigned long *vmx_msr_bitmap_legacy; 192static unsigned long *vmx_msr_bitmap_legacy;
189static unsigned long *vmx_msr_bitmap_longmode; 193static unsigned long *vmx_msr_bitmap_longmode;
190 194
195static bool cpu_has_load_ia32_efer;
196
191static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 197static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
192static DEFINE_SPINLOCK(vmx_vpid_lock); 198static DEFINE_SPINLOCK(vmx_vpid_lock);
193 199
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
472 u8 error; 478 u8 error;
473 479
474 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 480 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
475 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 481 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
476 : "cc", "memory"); 482 : "cc", "memory");
477 if (error) 483 if (error)
478 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 484 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
485 u8 error; 491 u8 error;
486 492
487 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 493 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
488 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 494 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
489 : "cc", "memory"); 495 : "cc", "memory");
490 if (error) 496 if (error)
491 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 497 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
565 571
566static unsigned long vmcs_readl(unsigned long field) 572static unsigned long vmcs_readl(unsigned long field)
567{ 573{
568 unsigned long value; 574 unsigned long value = 0;
569 575
570 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 576 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
571 : "=a"(value) : "d"(field) : "cc"); 577 : "+a"(value) : "d"(field) : "cc");
572 return value; 578 return value;
573} 579}
574 580
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
661 unsigned i; 667 unsigned i;
662 struct msr_autoload *m = &vmx->msr_autoload; 668 struct msr_autoload *m = &vmx->msr_autoload;
663 669
670 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
671 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
672 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
673 return;
674 }
675
664 for (i = 0; i < m->nr; ++i) 676 for (i = 0; i < m->nr; ++i)
665 if (m->guest[i].index == msr) 677 if (m->guest[i].index == msr)
666 break; 678 break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
680 unsigned i; 692 unsigned i;
681 struct msr_autoload *m = &vmx->msr_autoload; 693 struct msr_autoload *m = &vmx->msr_autoload;
682 694
695 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
696 vmcs_write64(GUEST_IA32_EFER, guest_val);
697 vmcs_write64(HOST_IA32_EFER, host_val);
698 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
699 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
700 return;
701 }
702
683 for (i = 0; i < m->nr; ++i) 703 for (i = 0; i < m->nr; ++i)
684 if (m->guest[i].index == msr) 704 if (m->guest[i].index == msr)
685 break; 705 break;
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1009 vmx_set_interrupt_shadow(vcpu, 0); 1029 vmx_set_interrupt_shadow(vcpu, 0);
1010} 1030}
1011 1031
1032static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1033{
1034 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1035 * explicitly skip the instruction because if the HLT state is set, then
1036 * the instruction is already executing and RIP has already been
1037 * advanced. */
1038 if (!yield_on_hlt &&
1039 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1040 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1041}
1042
1012static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1043static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1013 bool has_error_code, u32 error_code, 1044 bool has_error_code, u32 error_code,
1014 bool reinject) 1045 bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1035 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1066 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1036 1067
1037 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1068 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1069 vmx_clear_hlt(vcpu);
1038} 1070}
1039 1071
1040static bool vmx_rdtscp_supported(void) 1072static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
1305 && tboot_enabled()) 1337 && tboot_enabled())
1306 return 1; 1338 return 1;
1307 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1308 && !tboot_enabled()) 1340 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n");
1309 return 1; 1343 return 1;
1344 }
1310 } 1345 }
1311 1346
1312 return 0; 1347 return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1400 return 0; 1435 return 0;
1401} 1436}
1402 1437
1438static __init bool allow_1_setting(u32 msr, u32 ctl)
1439{
1440 u32 vmx_msr_low, vmx_msr_high;
1441
1442 rdmsr(msr, vmx_msr_low, vmx_msr_high);
1443 return vmx_msr_high & ctl;
1444}
1445
1403static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 1446static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1404{ 1447{
1405 u32 vmx_msr_low, vmx_msr_high; 1448 u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1416 &_pin_based_exec_control) < 0) 1459 &_pin_based_exec_control) < 0)
1417 return -EIO; 1460 return -EIO;
1418 1461
1419 min = CPU_BASED_HLT_EXITING | 1462 min =
1420#ifdef CONFIG_X86_64 1463#ifdef CONFIG_X86_64
1421 CPU_BASED_CR8_LOAD_EXITING | 1464 CPU_BASED_CR8_LOAD_EXITING |
1422 CPU_BASED_CR8_STORE_EXITING | 1465 CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1429 CPU_BASED_MWAIT_EXITING | 1472 CPU_BASED_MWAIT_EXITING |
1430 CPU_BASED_MONITOR_EXITING | 1473 CPU_BASED_MONITOR_EXITING |
1431 CPU_BASED_INVLPG_EXITING; 1474 CPU_BASED_INVLPG_EXITING;
1475
1476 if (yield_on_hlt)
1477 min |= CPU_BASED_HLT_EXITING;
1478
1432 opt = CPU_BASED_TPR_SHADOW | 1479 opt = CPU_BASED_TPR_SHADOW |
1433 CPU_BASED_USE_MSR_BITMAPS | 1480 CPU_BASED_USE_MSR_BITMAPS |
1434 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1481 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1510 vmcs_conf->vmexit_ctrl = _vmexit_control; 1557 vmcs_conf->vmexit_ctrl = _vmexit_control;
1511 vmcs_conf->vmentry_ctrl = _vmentry_control; 1558 vmcs_conf->vmentry_ctrl = _vmentry_control;
1512 1559
1560 cpu_has_load_ia32_efer =
1561 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1562 VM_ENTRY_LOAD_IA32_EFER)
1563 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1564 VM_EXIT_LOAD_IA32_EFER);
1565
1513 return 0; 1566 return 0;
1514} 1567}
1515 1568
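/*
 * Illustrative note, not from this patch: allow_1_setting() above relies on
 * the VMX capability-MSR convention that bits 63:32 report the allowed
 * 1-settings of a control (a control bit may only be set if its bit is set
 * in the high word), while bits 31:0 report the allowed 0-settings (a control
 * bit must be set if its bit is set in the low word).  A companion check for
 * such required-1 bits -- hypothetical name, same pattern -- would be:
 */
static __init bool required_1_setting_sketch(u32 msr, u32 ctl)
{
	u32 vmx_msr_low, vmx_msr_high;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);
	return vmx_msr_low & ctl;	/* set in the low word => must be 1 */
}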
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1683 save->limit = vmcs_read32(sf->limit); 1736 save->limit = vmcs_read32(sf->limit);
1684 save->ar = vmcs_read32(sf->ar_bytes); 1737 save->ar = vmcs_read32(sf->ar_bytes);
1685 vmcs_write16(sf->selector, save->base >> 4); 1738 vmcs_write16(sf->selector, save->base >> 4);
1686 vmcs_write32(sf->base, save->base & 0xfffff); 1739 vmcs_write32(sf->base, save->base & 0xffff0);
1687 vmcs_write32(sf->limit, 0xffff); 1740 vmcs_write32(sf->limit, 0xffff);
1688 vmcs_write32(sf->ar_bytes, 0xf3); 1741 vmcs_write32(sf->ar_bytes, 0xf3);
1742 if (save->base & 0xf)
1743 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1744 " aligned when entering protected mode (seg=%d)",
1745 seg);
1689} 1746}
1690 1747
1691static void enter_rmode(struct kvm_vcpu *vcpu) 1748static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1814 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 1871 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1815} 1872}
1816 1873
1874static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
1875{
1876 if (enable_ept && is_paging(vcpu))
1877 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
1878 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1879}
1880
1817static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1881static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1818{ 1882{
1819 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 1883 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1857 unsigned long cr0, 1921 unsigned long cr0,
1858 struct kvm_vcpu *vcpu) 1922 struct kvm_vcpu *vcpu)
1859{ 1923{
1924 vmx_decache_cr3(vcpu);
1860 if (!(cr0 & X86_CR0_PG)) { 1925 if (!(cr0 & X86_CR0_PG)) {
1861 /* From paging/starting to nonpaging */ 1926 /* From paging/starting to nonpaging */
1862 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1927 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1937 if (enable_ept) { 2002 if (enable_ept) {
1938 eptp = construct_eptp(cr3); 2003 eptp = construct_eptp(cr3);
1939 vmcs_write64(EPT_POINTER, eptp); 2004 vmcs_write64(EPT_POINTER, eptp);
1940 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 2005 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
1941 vcpu->kvm->arch.ept_identity_map_addr; 2006 vcpu->kvm->arch.ept_identity_map_addr;
1942 ept_load_pdptrs(vcpu); 2007 ept_load_pdptrs(vcpu);
1943 } 2008 }
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2725 vmcs_writel(GUEST_IDTR_BASE, 0); 2790 vmcs_writel(GUEST_IDTR_BASE, 0);
2726 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 2791 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2727 2792
2728 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 2793 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2729 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2794 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2730 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2795 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2731 2796
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2787 return; 2852 return;
2788 } 2853 }
2789 2854
2855 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
2856 enable_irq_window(vcpu);
2857 return;
2858 }
2790 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2859 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2791 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 2860 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2792 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2861 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2814 } else 2883 } else
2815 intr |= INTR_TYPE_EXT_INTR; 2884 intr |= INTR_TYPE_EXT_INTR;
2816 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 2885 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2886 vmx_clear_hlt(vcpu);
2817} 2887}
2818 2888
2819static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2889static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2841 } 2911 }
2842 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2912 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2843 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2913 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2914 vmx_clear_hlt(vcpu);
2844} 2915}
2845 2916
2846static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 2917static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2849 return 0; 2920 return 0;
2850 2921
2851 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2922 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2852 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); 2923 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
2924 | GUEST_INTR_STATE_NMI));
2853} 2925}
2854 2926
2855static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 2927static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2910 * Cause the #SS fault with 0 error code in VM86 mode. 2982 * Cause the #SS fault with 0 error code in VM86 mode.
2911 */ 2983 */
2912 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2984 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2913 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) 2985 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2914 return 1; 2986 return 1;
2915 /* 2987 /*
2916 * Forward all other exceptions that are valid in real mode. 2988 * Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3007 } 3079 }
3008 3080
3009 if (is_invalid_opcode(intr_info)) { 3081 if (is_invalid_opcode(intr_info)) {
3010 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); 3082 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3011 if (er != EMULATE_DONE) 3083 if (er != EMULATE_DONE)
3012 kvm_queue_exception(vcpu, UD_VECTOR); 3084 kvm_queue_exception(vcpu, UD_VECTOR);
3013 return 1; 3085 return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3026 3098
3027 if (kvm_event_needs_reinjection(vcpu)) 3099 if (kvm_event_needs_reinjection(vcpu))
3028 kvm_mmu_unprotect_page_virt(vcpu, cr2); 3100 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3029 return kvm_mmu_page_fault(vcpu, cr2, error_code); 3101 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3030 } 3102 }
3031 3103
3032 if (vmx->rmode.vm86_active && 3104 if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3098 ++vcpu->stat.io_exits; 3170 ++vcpu->stat.io_exits;
3099 3171
3100 if (string || in) 3172 if (string || in)
3101 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3173 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3102 3174
3103 port = exit_qualification >> 16; 3175 port = exit_qualification >> 16;
3104 size = (exit_qualification & 7) + 1; 3176 size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3118 hypercall[2] = 0xc1; 3190 hypercall[2] = 0xc1;
3119} 3191}
3120 3192
3121static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3122{
3123 if (err)
3124 kvm_inject_gp(vcpu, 0);
3125 else
3126 skip_emulated_instruction(vcpu);
3127}
3128
3129static int handle_cr(struct kvm_vcpu *vcpu) 3193static int handle_cr(struct kvm_vcpu *vcpu)
3130{ 3194{
3131 unsigned long exit_qualification, val; 3195 unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3143 switch (cr) { 3207 switch (cr) {
3144 case 0: 3208 case 0:
3145 err = kvm_set_cr0(vcpu, val); 3209 err = kvm_set_cr0(vcpu, val);
3146 complete_insn_gp(vcpu, err); 3210 kvm_complete_insn_gp(vcpu, err);
3147 return 1; 3211 return 1;
3148 case 3: 3212 case 3:
3149 err = kvm_set_cr3(vcpu, val); 3213 err = kvm_set_cr3(vcpu, val);
3150 complete_insn_gp(vcpu, err); 3214 kvm_complete_insn_gp(vcpu, err);
3151 return 1; 3215 return 1;
3152 case 4: 3216 case 4:
3153 err = kvm_set_cr4(vcpu, val); 3217 err = kvm_set_cr4(vcpu, val);
3154 complete_insn_gp(vcpu, err); 3218 kvm_complete_insn_gp(vcpu, err);
3155 return 1; 3219 return 1;
3156 case 8: { 3220 case 8: {
3157 u8 cr8_prev = kvm_get_cr8(vcpu); 3221 u8 cr8_prev = kvm_get_cr8(vcpu);
3158 u8 cr8 = kvm_register_read(vcpu, reg); 3222 u8 cr8 = kvm_register_read(vcpu, reg);
3159 kvm_set_cr8(vcpu, cr8); 3223 err = kvm_set_cr8(vcpu, cr8);
3160 skip_emulated_instruction(vcpu); 3224 kvm_complete_insn_gp(vcpu, err);
3161 if (irqchip_in_kernel(vcpu->kvm)) 3225 if (irqchip_in_kernel(vcpu->kvm))
3162 return 1; 3226 return 1;
3163 if (cr8_prev <= cr8) 3227 if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3176 case 1: /*mov from cr*/ 3240 case 1: /*mov from cr*/
3177 switch (cr) { 3241 switch (cr) {
3178 case 3: 3242 case 3:
3179 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 3243 val = kvm_read_cr3(vcpu);
3180 trace_kvm_cr_read(cr, vcpu->arch.cr3); 3244 kvm_register_write(vcpu, reg, val);
3245 trace_kvm_cr_read(cr, val);
3181 skip_emulated_instruction(vcpu); 3246 skip_emulated_instruction(vcpu);
3182 return 1; 3247 return 1;
3183 case 8: 3248 case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3349 return 1; 3414 return 1;
3350} 3415}
3351 3416
3417static int handle_invd(struct kvm_vcpu *vcpu)
3418{
3419 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3420}
3421
3352static int handle_invlpg(struct kvm_vcpu *vcpu) 3422static int handle_invlpg(struct kvm_vcpu *vcpu)
3353{ 3423{
3354 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3424 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
3377 3447
3378static int handle_apic_access(struct kvm_vcpu *vcpu) 3448static int handle_apic_access(struct kvm_vcpu *vcpu)
3379{ 3449{
3380 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3450 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3381} 3451}
3382 3452
3383static int handle_task_switch(struct kvm_vcpu *vcpu) 3453static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
3476 3546
3477 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3547 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3478 trace_kvm_page_fault(gpa, exit_qualification); 3548 trace_kvm_page_fault(gpa, exit_qualification);
3479 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3549 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3480} 3550}
3481 3551
3482static u64 ept_rsvd_mask(u64 spte, int level) 3552static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3592 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) 3662 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3593 return handle_interrupt_window(&vmx->vcpu); 3663 return handle_interrupt_window(&vmx->vcpu);
3594 3664
3595 err = emulate_instruction(vcpu, 0, 0, 0); 3665 err = emulate_instruction(vcpu, 0);
3596 3666
3597 if (err == EMULATE_DO_MMIO) { 3667 if (err == EMULATE_DO_MMIO) {
3598 ret = 0; 3668 ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3649 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 3719 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3650 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 3720 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3651 [EXIT_REASON_HLT] = handle_halt, 3721 [EXIT_REASON_HLT] = handle_halt,
3722 [EXIT_REASON_INVD] = handle_invd,
3652 [EXIT_REASON_INVLPG] = handle_invlpg, 3723 [EXIT_REASON_INVLPG] = handle_invlpg,
3653 [EXIT_REASON_VMCALL] = handle_vmcall, 3724 [EXIT_REASON_VMCALL] = handle_vmcall,
3654 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 3725 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3676static const int kvm_vmx_max_exit_handlers = 3747static const int kvm_vmx_max_exit_handlers =
3677 ARRAY_SIZE(kvm_vmx_exit_handlers); 3748 ARRAY_SIZE(kvm_vmx_exit_handlers);
3678 3749
3750static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3751{
3752 *info1 = vmcs_readl(EXIT_QUALIFICATION);
3753 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3754}
3755
3679/* 3756/*
3680 * The guest has exited. See if we can fix it or if we need userspace 3757 * The guest has exited. See if we can fix it or if we need userspace
3681 * assistance. 3758 * assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3686 u32 exit_reason = vmx->exit_reason; 3763 u32 exit_reason = vmx->exit_reason;
3687 u32 vectoring_info = vmx->idt_vectoring_info; 3764 u32 vectoring_info = vmx->idt_vectoring_info;
3688 3765
3689 trace_kvm_exit(exit_reason, vcpu); 3766 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3690 3767
3691 /* If guest state is invalid, start emulating */ 3768 /* If guest state is invalid, start emulating */
3692 if (vmx->emulation_required && emulate_invalid_guest_state) 3769 if (vmx->emulation_required && emulate_invalid_guest_state)
3693 return handle_invalid_guest_state(vcpu); 3770 return handle_invalid_guest_state(vcpu);
3694 3771
3695 /* Access CR3 don't cause VMExit in paging mode, so we need
3696 * to sync with guest real CR3. */
3697 if (enable_ept && is_paging(vcpu))
3698 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3699
3700 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3772 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3701 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3773 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3702 vcpu->run->fail_entry.hardware_entry_failure_reason 3774 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4013 ); 4085 );
4014 4086
4015 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4087 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4016 | (1 << VCPU_EXREG_PDPTR)); 4088 | (1 << VCPU_EXREG_PDPTR)
4089 | (1 << VCPU_EXREG_CR3));
4017 vcpu->arch.regs_dirty = 0; 4090 vcpu->arch.regs_dirty = 0;
4018 4091
4019 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4092 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4280 .get_cpl = vmx_get_cpl, 4353 .get_cpl = vmx_get_cpl,
4281 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4354 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4282 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 4355 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
4356 .decache_cr3 = vmx_decache_cr3,
4283 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4357 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
4284 .set_cr0 = vmx_set_cr0, 4358 .set_cr0 = vmx_set_cr0,
4285 .set_cr3 = vmx_set_cr3, 4359 .set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
4320 .get_tdp_level = get_ept_level, 4394 .get_tdp_level = get_ept_level,
4321 .get_mt_mask = vmx_get_mt_mask, 4395 .get_mt_mask = vmx_get_mt_mask,
4322 4396
4397 .get_exit_info = vmx_get_exit_info,
4323 .exit_reasons_str = vmx_exit_reasons_str, 4398 .exit_reasons_str = vmx_exit_reasons_str,
4399
4324 .get_lpage_level = vmx_get_lpage_level, 4400 .get_lpage_level = vmx_get_lpage_level,
4325 4401
4326 .cpuid_update = vmx_cpuid_update, 4402 .cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
4396 4472
4397 if (enable_ept) { 4473 if (enable_ept) {
4398 bypass_guest_pf = 0; 4474 bypass_guest_pf = 0;
4399 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
4400 VMX_EPT_WRITABLE_MASK);
4401 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 4475 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4402 VMX_EPT_EXECUTABLE_MASK); 4476 VMX_EPT_EXECUTABLE_MASK);
4403 kvm_enable_tdp(); 4477 kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46a368cb651e..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 156
156u64 __read_mostly host_xcr0; 157u64 __read_mostly host_xcr0;
157 158
159static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
160{
161 int i;
162 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
163 vcpu->arch.apf.gfns[i] = ~0;
164}
165
158static void kvm_on_user_return(struct user_return_notifier *urn) 166static void kvm_on_user_return(struct user_return_notifier *urn)
159{ 167{
160 unsigned slot; 168 unsigned slot;
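/*
 * Illustrative sketch, not taken from this patch: the reset above marks every
 * slot of a power-of-two sized array as empty, with ~0 as the "no gfn"
 * marker.  The patch's own lookup/insert helpers for this table are outside
 * the excerpt; a generic open-addressed insert over the same array -- purely
 * hypothetical names, hash_32() from the <linux/hash.h> include added above --
 * might look like:
 */
static u32 apf_hash_slot_sketch(gfn_t gfn)
{
	return hash_32((u32)gfn, ilog2(roundup_pow_of_two(ASYNC_PF_PER_VCPU)));
}

static void apf_hash_add_sketch(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u32 key = apf_hash_slot_sketch(gfn);

	/*
	 * Linear probing; an all-ones entry is a free slot.  The sketch
	 * assumes the caller never tracks more than ASYNC_PF_PER_VCPU
	 * outstanding faults, so a free slot always exists.
	 */
	while (vcpu->arch.apf.gfns[key] != ~0)
		key = (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);

	vcpu->arch.apf.gfns[key] = gfn;
}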
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
326} 334}
327EXPORT_SYMBOL_GPL(kvm_requeue_exception); 335EXPORT_SYMBOL_GPL(kvm_requeue_exception);
328 336
329void kvm_inject_page_fault(struct kvm_vcpu *vcpu) 337void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
330{ 338{
331 unsigned error_code = vcpu->arch.fault.error_code; 339 if (err)
340 kvm_inject_gp(vcpu, 0);
341 else
342 kvm_x86_ops->skip_emulated_instruction(vcpu);
343}
344EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
332 345
346void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347{
333 ++vcpu->stat.pf_guest; 348 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = vcpu->arch.fault.address; 349 vcpu->arch.cr2 = fault->address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 350 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
336} 351}
337 352
338void kvm_propagate_fault(struct kvm_vcpu *vcpu) 353void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
339{ 354{
340 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) 355 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
341 vcpu->arch.nested_mmu.inject_page_fault(vcpu); 356 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
342 else 357 else
343 vcpu->arch.mmu.inject_page_fault(vcpu); 358 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
344
345 vcpu->arch.fault.nested = false;
346} 359}
347 360
348void kvm_inject_nmi(struct kvm_vcpu *vcpu) 361void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
460 (unsigned long *)&vcpu->arch.regs_avail)) 473 (unsigned long *)&vcpu->arch.regs_avail))
461 return true; 474 return true;
462 475
463 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; 476 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
464 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); 477 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
465 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 478 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
466 PFERR_USER_MASK | PFERR_WRITE_MASK); 479 PFERR_USER_MASK | PFERR_WRITE_MASK);
467 if (r < 0) 480 if (r < 0)
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
506 } else 519 } else
507#endif 520#endif
508 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 521 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
509 vcpu->arch.cr3)) 522 kvm_read_cr3(vcpu)))
510 return 1; 523 return 1;
511 } 524 }
512 525
513 kvm_x86_ops->set_cr0(vcpu, cr0); 526 kvm_x86_ops->set_cr0(vcpu, cr0);
514 527
528 if ((cr0 ^ old_cr0) & X86_CR0_PG)
529 kvm_clear_async_pf_completion_queue(vcpu);
530
515 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
516 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
517 return 0; 533 return 0;
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
595 return 1; 611 return 1;
596 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
597 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
598 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
599 return 1; 616 return 1;
600 617
601 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
615 632
616int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
617{ 634{
618 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
619 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
620 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
621 return 0; 638 return 0;
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
650 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
651 return 1; 668 return 1;
652 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
653 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
654 return 0; 672 return 0;
655} 673}
656EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
657 675
658int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
659{ 677{
660 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
661 return 1; 679 return 1;
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
665 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
666 return 0; 684 return 0;
667} 685}
668
669void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
670{
671 if (__kvm_set_cr8(vcpu, cr8))
672 kvm_inject_gp(vcpu, 0);
673}
674EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
675 687
676unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
775 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
776 */ 788 */
777 789
778#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
779static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
780 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
781 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
782 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
783 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
784 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
785 MSR_STAR, 797 MSR_STAR,
786#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
830 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
831 843
832 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
833 kvm_mmu_reset_context(vcpu);
834 845
835 /* Update reserved bits */ 846 /* Update reserved bits */
836 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1418 return 0; 1429 return 0;
1419} 1430}
1420 1431
1432static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1433{
1434 gpa_t gpa = data & ~0x3f;
1435
1436	/* Bits 2:5 are reserved, should be zero */
1437 if (data & 0x3c)
1438 return 1;
1439
1440 vcpu->arch.apf.msr_val = data;
1441
1442 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1443 kvm_clear_async_pf_completion_queue(vcpu);
1444 kvm_async_pf_hash_reset(vcpu);
1445 return 0;
1446 }
1447
1448 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1449 return 1;
1450
1451 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1452 kvm_async_pf_wakeup_all(vcpu);
1453 return 0;
1454}
1455
1421int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1422{ 1457{
1423 switch (msr) { 1458 switch (msr) {
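/*
 * Illustrative guest-side sketch, not part of this patch: the handler above
 * treats bits 63:6 of the MSR value as the gpa of a 64-byte-aligned async-PF
 * data area, rejects bits 2:5, keys the enable on KVM_ASYNC_PF_ENABLED and
 * "notify even in kernel mode" on KVM_ASYNC_PF_SEND_ALWAYS (their exact bit
 * positions come from kvm_para.h, which this excerpt does not show).  A guest
 * switching the feature on would then do something like (hypothetical helper):
 */
static void example_enable_async_pf(u64 apf_area_gpa)
{
	u64 val = (apf_area_gpa & ~0x3fULL)	/* low six bits must stay clear */
		  | KVM_ASYNC_PF_SEND_ALWAYS	/* also notify while in kernel mode */
		  | KVM_ASYNC_PF_ENABLED;	/* turn async page faults on */

	wrmsrl(MSR_KVM_ASYNC_PF_EN, val);
}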
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1499 } 1534 }
1500 break; 1535 break;
1501 } 1536 }
1537 case MSR_KVM_ASYNC_PF_EN:
1538 if (kvm_pv_enable_async_pf(vcpu, data))
1539 return 1;
1540 break;
1502 case MSR_IA32_MCG_CTL: 1541 case MSR_IA32_MCG_CTL:
1503 case MSR_IA32_MCG_STATUS: 1542 case MSR_IA32_MCG_STATUS:
1504 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1543 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1775 case MSR_KVM_SYSTEM_TIME_NEW: 1814 case MSR_KVM_SYSTEM_TIME_NEW:
1776 data = vcpu->arch.time; 1815 data = vcpu->arch.time;
1777 break; 1816 break;
1817 case MSR_KVM_ASYNC_PF_EN:
1818 data = vcpu->arch.apf.msr_val;
1819 break;
1778 case MSR_IA32_P5_MC_ADDR: 1820 case MSR_IA32_P5_MC_ADDR:
1779 case MSR_IA32_P5_MC_TYPE: 1821 case MSR_IA32_P5_MC_TYPE:
1780 case MSR_IA32_MCG_CAP: 1822 case MSR_IA32_MCG_CAP:
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1904 case KVM_CAP_NOP_IO_DELAY: 1946 case KVM_CAP_NOP_IO_DELAY:
1905 case KVM_CAP_MP_STATE: 1947 case KVM_CAP_MP_STATE:
1906 case KVM_CAP_SYNC_MMU: 1948 case KVM_CAP_SYNC_MMU:
1949 case KVM_CAP_USER_NMI:
1907 case KVM_CAP_REINJECT_CONTROL: 1950 case KVM_CAP_REINJECT_CONTROL:
1908 case KVM_CAP_IRQ_INJECT_STATUS: 1951 case KVM_CAP_IRQ_INJECT_STATUS:
1909 case KVM_CAP_ASSIGN_DEV_IRQ: 1952 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1922 case KVM_CAP_DEBUGREGS: 1965 case KVM_CAP_DEBUGREGS:
1923 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1966 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1924 case KVM_CAP_XSAVE: 1967 case KVM_CAP_XSAVE:
1968 case KVM_CAP_ASYNC_PF:
1925 r = 1; 1969 r = 1;
1926 break; 1970 break;
1927 case KVM_CAP_COALESCED_MMIO: 1971 case KVM_CAP_COALESCED_MMIO:
@@ -2185,6 +2229,11 @@ out:
2185 return r; 2229 return r;
2186} 2230}
2187 2231
2232static void cpuid_mask(u32 *word, int wordnum)
2233{
2234 *word &= boot_cpu_data.x86_capability[wordnum];
2235}
2236
2188static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2237static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2189 u32 index) 2238 u32 index)
2190{ 2239{
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2259 break; 2308 break;
2260 case 1: 2309 case 1:
2261 entry->edx &= kvm_supported_word0_x86_features; 2310 entry->edx &= kvm_supported_word0_x86_features;
2311 cpuid_mask(&entry->edx, 0);
2262 entry->ecx &= kvm_supported_word4_x86_features; 2312 entry->ecx &= kvm_supported_word4_x86_features;
2313 cpuid_mask(&entry->ecx, 4);
2263 /* we support x2apic emulation even if host does not support 2314 /* we support x2apic emulation even if host does not support
2264 * it since we emulate x2apic in software */ 2315 * it since we emulate x2apic in software */
2265 entry->ecx |= F(X2APIC); 2316 entry->ecx |= F(X2APIC);
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2350 break; 2401 break;
2351 case 0x80000001: 2402 case 0x80000001:
2352 entry->edx &= kvm_supported_word1_x86_features; 2403 entry->edx &= kvm_supported_word1_x86_features;
2404 cpuid_mask(&entry->edx, 1);
2353 entry->ecx &= kvm_supported_word6_x86_features; 2405 entry->ecx &= kvm_supported_word6_x86_features;
2406 cpuid_mask(&entry->ecx, 6);
2354 break; 2407 break;
2355 } 2408 }
2356 2409
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3169 struct kvm_memslots *slots, *old_slots; 3222 struct kvm_memslots *slots, *old_slots;
3170 unsigned long *dirty_bitmap; 3223 unsigned long *dirty_bitmap;
3171 3224
3172 r = -ENOMEM; 3225 dirty_bitmap = memslot->dirty_bitmap_head;
3173 dirty_bitmap = vmalloc(n); 3226 if (memslot->dirty_bitmap == dirty_bitmap)
3174 if (!dirty_bitmap) 3227 dirty_bitmap += n / sizeof(long);
3175 goto out;
3176 memset(dirty_bitmap, 0, n); 3228 memset(dirty_bitmap, 0, n);
3177 3229
3178 r = -ENOMEM; 3230 r = -ENOMEM;
3179 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3231 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
3180 if (!slots) { 3232 if (!slots)
3181 vfree(dirty_bitmap);
3182 goto out; 3233 goto out;
3183 }
3184 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3234 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
3185 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3235 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3236 slots->generation++;
3186 3237
3187 old_slots = kvm->memslots; 3238 old_slots = kvm->memslots;
3188 rcu_assign_pointer(kvm->memslots, slots); 3239 rcu_assign_pointer(kvm->memslots, slots);
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3195 spin_unlock(&kvm->mmu_lock); 3246 spin_unlock(&kvm->mmu_lock);
3196 3247
3197 r = -EFAULT; 3248 r = -EFAULT;
3198 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
3199 vfree(dirty_bitmap);
3200 goto out; 3250 goto out;
3201 }
3202 vfree(dirty_bitmap);
3203 } else { 3251 } else {
3204 r = -EFAULT; 3252 r = -EFAULT;
3205 if (clear_user(log->dirty_bitmap, n)) 3253 if (clear_user(log->dirty_bitmap, n))
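/*
 * Illustrative sketch, not from this patch: rather than vmalloc()ing a
 * scratch bitmap on every GET_DIRTY_LOG call, the slot now carries one
 * allocation of twice the bitmap size anchored at dirty_bitmap_head, and the
 * code above flips to whichever half is not currently published.  In
 * isolation the flip is just (hypothetical helper):
 */
static unsigned long *pick_spare_half(unsigned long *head,
				      unsigned long *live, long bytes)
{
	unsigned long *spare = head;

	if (live == head)			/* first half is live ... */
		spare += bytes / sizeof(long);	/* ... so hand out the second */

	return spare;				/* else the first half is free */
}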
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3266 if (vpic) { 3314 if (vpic) {
3267 r = kvm_ioapic_init(kvm); 3315 r = kvm_ioapic_init(kvm);
3268 if (r) { 3316 if (r) {
3317 mutex_lock(&kvm->slots_lock);
3269 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3318 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3270 &vpic->dev); 3319 &vpic->dev);
3320 mutex_unlock(&kvm->slots_lock);
3271 kfree(vpic); 3321 kfree(vpic);
3272 goto create_irqchip_unlock; 3322 goto create_irqchip_unlock;
3273 } 3323 }
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3278 smp_wmb(); 3328 smp_wmb();
3279 r = kvm_setup_default_irq_routing(kvm); 3329 r = kvm_setup_default_irq_routing(kvm);
3280 if (r) { 3330 if (r) {
3331 mutex_lock(&kvm->slots_lock);
3281 mutex_lock(&kvm->irq_lock); 3332 mutex_lock(&kvm->irq_lock);
3282 kvm_ioapic_destroy(kvm); 3333 kvm_ioapic_destroy(kvm);
3283 kvm_destroy_pic(kvm); 3334 kvm_destroy_pic(kvm);
3284 mutex_unlock(&kvm->irq_lock); 3335 mutex_unlock(&kvm->irq_lock);
3336 mutex_unlock(&kvm->slots_lock);
3285 } 3337 }
3286 create_irqchip_unlock: 3338 create_irqchip_unlock:
3287 mutex_unlock(&kvm->lock); 3339 mutex_unlock(&kvm->lock);
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3557static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3609static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3558{ 3610{
3559 gpa_t t_gpa; 3611 gpa_t t_gpa;
3560 u32 error; 3612 struct x86_exception exception;
3561 3613
3562 BUG_ON(!mmu_is_nested(vcpu)); 3614 BUG_ON(!mmu_is_nested(vcpu));
3563 3615
3564 /* NPT walks are always user-walks */ 3616 /* NPT walks are always user-walks */
3565 access |= PFERR_USER_MASK; 3617 access |= PFERR_USER_MASK;
3566 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); 3618 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3567 if (t_gpa == UNMAPPED_GVA)
3568 vcpu->arch.fault.nested = true;
3569 3619
3570 return t_gpa; 3620 return t_gpa;
3571} 3621}
3572 3622
3573gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3623gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3624 struct x86_exception *exception)
3574{ 3625{
3575 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3626 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3576 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3627 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3577} 3628}
3578 3629
3579 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3630 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3631 struct x86_exception *exception)
3580{ 3632{
3581 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3633 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3582 access |= PFERR_FETCH_MASK; 3634 access |= PFERR_FETCH_MASK;
3583 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3635 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3584} 3636}
3585 3637
3586gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3638gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3639 struct x86_exception *exception)
3587{ 3640{
3588 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3641 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3589 access |= PFERR_WRITE_MASK; 3642 access |= PFERR_WRITE_MASK;
3590 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3643 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3591} 3644}
3592 3645
3593/* uses this to access any guest's mapped memory without checking CPL */ 3646/* uses this to access any guest's mapped memory without checking CPL */
3594gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3647gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3648 struct x86_exception *exception)
3595{ 3649{
3596 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); 3650 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3597} 3651}
3598 3652
3599static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3653static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3600 struct kvm_vcpu *vcpu, u32 access, 3654 struct kvm_vcpu *vcpu, u32 access,
3601 u32 *error) 3655 struct x86_exception *exception)
3602{ 3656{
3603 void *data = val; 3657 void *data = val;
3604 int r = X86EMUL_CONTINUE; 3658 int r = X86EMUL_CONTINUE;
3605 3659
3606 while (bytes) { 3660 while (bytes) {
3607 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3661 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3608 error); 3662 exception);
3609 unsigned offset = addr & (PAGE_SIZE-1); 3663 unsigned offset = addr & (PAGE_SIZE-1);
3610 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3664 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3611 int ret; 3665 int ret;
3612 3666
3613 if (gpa == UNMAPPED_GVA) { 3667 if (gpa == UNMAPPED_GVA)
3614 r = X86EMUL_PROPAGATE_FAULT; 3668 return X86EMUL_PROPAGATE_FAULT;
3615 goto out;
3616 }
3617 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3669 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3618 if (ret < 0) { 3670 if (ret < 0) {
3619 r = X86EMUL_IO_NEEDED; 3671 r = X86EMUL_IO_NEEDED;
@@ -3630,31 +3682,35 @@ out:
3630 3682
3631/* used for instruction fetching */ 3683/* used for instruction fetching */
3632static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3684static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3633 struct kvm_vcpu *vcpu, u32 *error) 3685 struct kvm_vcpu *vcpu,
3686 struct x86_exception *exception)
3634{ 3687{
3635 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3636 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3689 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3637 access | PFERR_FETCH_MASK, error); 3690 access | PFERR_FETCH_MASK,
3691 exception);
3638} 3692}
3639 3693
3640static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3694static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3641 struct kvm_vcpu *vcpu, u32 *error) 3695 struct kvm_vcpu *vcpu,
3696 struct x86_exception *exception)
3642{ 3697{
3643 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3698 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3644 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3699 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3645 error); 3700 exception);
3646} 3701}
3647 3702
3648static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3703static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3649 struct kvm_vcpu *vcpu, u32 *error) 3704 struct kvm_vcpu *vcpu,
3705 struct x86_exception *exception)
3650{ 3706{
3651 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3707 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3652} 3708}
3653 3709
3654static int kvm_write_guest_virt_system(gva_t addr, void *val, 3710static int kvm_write_guest_virt_system(gva_t addr, void *val,
3655 unsigned int bytes, 3711 unsigned int bytes,
3656 struct kvm_vcpu *vcpu, 3712 struct kvm_vcpu *vcpu,
3657 u32 *error) 3713 struct x86_exception *exception)
3658{ 3714{
3659 void *data = val; 3715 void *data = val;
3660 int r = X86EMUL_CONTINUE; 3716 int r = X86EMUL_CONTINUE;
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3662 while (bytes) { 3718 while (bytes) {
3663 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3719 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3664 PFERR_WRITE_MASK, 3720 PFERR_WRITE_MASK,
3665 error); 3721 exception);
3666 unsigned offset = addr & (PAGE_SIZE-1); 3722 unsigned offset = addr & (PAGE_SIZE-1);
3667 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3723 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3668 int ret; 3724 int ret;
3669 3725
3670 if (gpa == UNMAPPED_GVA) { 3726 if (gpa == UNMAPPED_GVA)
3671 r = X86EMUL_PROPAGATE_FAULT; 3727 return X86EMUL_PROPAGATE_FAULT;
3672 goto out;
3673 }
3674 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3728 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3675 if (ret < 0) { 3729 if (ret < 0) {
3676 r = X86EMUL_IO_NEEDED; 3730 r = X86EMUL_IO_NEEDED;
@@ -3688,7 +3742,7 @@ out:
3688static int emulator_read_emulated(unsigned long addr, 3742static int emulator_read_emulated(unsigned long addr,
3689 void *val, 3743 void *val,
3690 unsigned int bytes, 3744 unsigned int bytes,
3691 unsigned int *error_code, 3745 struct x86_exception *exception,
3692 struct kvm_vcpu *vcpu) 3746 struct kvm_vcpu *vcpu)
3693{ 3747{
3694 gpa_t gpa; 3748 gpa_t gpa;
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
3701 return X86EMUL_CONTINUE; 3755 return X86EMUL_CONTINUE;
3702 } 3756 }
3703 3757
3704 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3758 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3705 3759
3706 if (gpa == UNMAPPED_GVA) 3760 if (gpa == UNMAPPED_GVA)
3707 return X86EMUL_PROPAGATE_FAULT; 3761 return X86EMUL_PROPAGATE_FAULT;
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
3710 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3764 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3711 goto mmio; 3765 goto mmio;
3712 3766
3713 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3767 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
3714 == X86EMUL_CONTINUE) 3768 == X86EMUL_CONTINUE)
3715 return X86EMUL_CONTINUE; 3769 return X86EMUL_CONTINUE;
3716 3770
3717mmio: 3771mmio:
@@ -3735,7 +3789,7 @@ mmio:
3735} 3789}
3736 3790
3737int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3791int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3738 const void *val, int bytes) 3792 const void *val, int bytes)
3739{ 3793{
3740 int ret; 3794 int ret;
3741 3795
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3749static int emulator_write_emulated_onepage(unsigned long addr, 3803static int emulator_write_emulated_onepage(unsigned long addr,
3750 const void *val, 3804 const void *val,
3751 unsigned int bytes, 3805 unsigned int bytes,
3752 unsigned int *error_code, 3806 struct x86_exception *exception,
3753 struct kvm_vcpu *vcpu) 3807 struct kvm_vcpu *vcpu)
3754{ 3808{
3755 gpa_t gpa; 3809 gpa_t gpa;
3756 3810
3757 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3811 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3758 3812
3759 if (gpa == UNMAPPED_GVA) 3813 if (gpa == UNMAPPED_GVA)
3760 return X86EMUL_PROPAGATE_FAULT; 3814 return X86EMUL_PROPAGATE_FAULT;
@@ -3787,7 +3841,7 @@ mmio:
3787int emulator_write_emulated(unsigned long addr, 3841int emulator_write_emulated(unsigned long addr,
3788 const void *val, 3842 const void *val,
3789 unsigned int bytes, 3843 unsigned int bytes,
3790 unsigned int *error_code, 3844 struct x86_exception *exception,
3791 struct kvm_vcpu *vcpu) 3845 struct kvm_vcpu *vcpu)
3792{ 3846{
3793 /* Crossing a page boundary? */ 3847 /* Crossing a page boundary? */
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
3795 int rc, now; 3849 int rc, now;
3796 3850
3797 now = -addr & ~PAGE_MASK; 3851 now = -addr & ~PAGE_MASK;
3798 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3852 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3799 vcpu); 3853 vcpu);
3800 if (rc != X86EMUL_CONTINUE) 3854 if (rc != X86EMUL_CONTINUE)
3801 return rc; 3855 return rc;
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
3803 val += now; 3857 val += now;
3804 bytes -= now; 3858 bytes -= now;
3805 } 3859 }
3806 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3860 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3807 vcpu); 3861 vcpu);
3808} 3862}
3809 3863
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3821 const void *old, 3875 const void *old,
3822 const void *new, 3876 const void *new,
3823 unsigned int bytes, 3877 unsigned int bytes,
3824 unsigned int *error_code, 3878 struct x86_exception *exception,
3825 struct kvm_vcpu *vcpu) 3879 struct kvm_vcpu *vcpu)
3826{ 3880{
3827 gpa_t gpa; 3881 gpa_t gpa;
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3879emul_write: 3933emul_write:
3880 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3934 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3881 3935
3882 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3936 return emulator_write_emulated(addr, new, bytes, exception, vcpu);
3883} 3937}
3884 3938
3885static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3904 if (vcpu->arch.pio.count) 3958 if (vcpu->arch.pio.count)
3905 goto data_avail; 3959 goto data_avail;
3906 3960
3907 trace_kvm_pio(0, port, size, 1); 3961 trace_kvm_pio(0, port, size, count);
3908 3962
3909 vcpu->arch.pio.port = port; 3963 vcpu->arch.pio.port = port;
3910 vcpu->arch.pio.in = 1; 3964 vcpu->arch.pio.in = 1;
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3932 const void *val, unsigned int count, 3986 const void *val, unsigned int count,
3933 struct kvm_vcpu *vcpu) 3987 struct kvm_vcpu *vcpu)
3934{ 3988{
3935 trace_kvm_pio(1, port, size, 1); 3989 trace_kvm_pio(1, port, size, count);
3936 3990
3937 vcpu->arch.pio.port = port; 3991 vcpu->arch.pio.port = port;
3938 vcpu->arch.pio.in = 0; 3992 vcpu->arch.pio.in = 0;
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3973 return X86EMUL_CONTINUE; 4027 return X86EMUL_CONTINUE;
3974 4028
3975 if (kvm_x86_ops->has_wbinvd_exit()) { 4029 if (kvm_x86_ops->has_wbinvd_exit()) {
3976 preempt_disable(); 4030 int cpu = get_cpu();
4031
4032 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3977 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4033 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3978 wbinvd_ipi, NULL, 1); 4034 wbinvd_ipi, NULL, 1);
3979 preempt_enable(); 4035 put_cpu();
3980 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4036 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3981 } 4037 } else
3982 wbinvd(); 4038 wbinvd();
3983 return X86EMUL_CONTINUE; 4039 return X86EMUL_CONTINUE;
3984} 4040}
3985EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4041EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4019 value = vcpu->arch.cr2; 4075 value = vcpu->arch.cr2;
4020 break; 4076 break;
4021 case 3: 4077 case 3:
4022 value = vcpu->arch.cr3; 4078 value = kvm_read_cr3(vcpu);
4023 break; 4079 break;
4024 case 4: 4080 case 4:
4025 value = kvm_read_cr4(vcpu); 4081 value = kvm_read_cr4(vcpu);
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4053 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4109 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4054 break; 4110 break;
4055 case 8: 4111 case 8:
4056 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4112 res = kvm_set_cr8(vcpu, val);
4057 break; 4113 break;
4058 default: 4114 default:
4059 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4115 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4206static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4262static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4207{ 4263{
4208 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4264 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4209 if (ctxt->exception == PF_VECTOR) 4265 if (ctxt->exception.vector == PF_VECTOR)
4210 kvm_propagate_fault(vcpu); 4266 kvm_propagate_fault(vcpu, &ctxt->exception);
4211 else if (ctxt->error_code_valid) 4267 else if (ctxt->exception.error_code_valid)
4212 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4268 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4269 ctxt->exception.error_code);
4213 else 4270 else
4214 kvm_queue_exception(vcpu, ctxt->exception); 4271 kvm_queue_exception(vcpu, ctxt->exception.vector);
4215} 4272}
4216 4273
4217static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4274static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4267 4324
4268static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4325static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4269{ 4326{
4327 int r = EMULATE_DONE;
4328
4270 ++vcpu->stat.insn_emulation_fail; 4329 ++vcpu->stat.insn_emulation_fail;
4271 trace_kvm_emulate_insn_failed(vcpu); 4330 trace_kvm_emulate_insn_failed(vcpu);
4272 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4331 if (!is_guest_mode(vcpu)) {
4273 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4332 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4274 vcpu->run->internal.ndata = 0; 4333 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4334 vcpu->run->internal.ndata = 0;
4335 r = EMULATE_FAIL;
4336 }
4275 kvm_queue_exception(vcpu, UD_VECTOR); 4337 kvm_queue_exception(vcpu, UD_VECTOR);
4276 return EMULATE_FAIL; 4338
4339 return r;
4277} 4340}
4278 4341
4279static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4342static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4302 return false; 4365 return false;
4303} 4366}
4304 4367
4305int emulate_instruction(struct kvm_vcpu *vcpu, 4368int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4306 unsigned long cr2, 4369 unsigned long cr2,
4307 u16 error_code, 4370 int emulation_type,
4308 int emulation_type) 4371 void *insn,
4372 int insn_len)
4309{ 4373{
4310 int r; 4374 int r;
4311 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4375 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4323 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4387 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4324 init_emulate_ctxt(vcpu); 4388 init_emulate_ctxt(vcpu);
4325 vcpu->arch.emulate_ctxt.interruptibility = 0; 4389 vcpu->arch.emulate_ctxt.interruptibility = 0;
4326 vcpu->arch.emulate_ctxt.exception = -1; 4390 vcpu->arch.emulate_ctxt.have_exception = false;
4327 vcpu->arch.emulate_ctxt.perm_ok = false; 4391 vcpu->arch.emulate_ctxt.perm_ok = false;
4328 4392
4329 r = x86_decode_insn(&vcpu->arch.emulate_ctxt); 4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4330 if (r == X86EMUL_PROPAGATE_FAULT) 4394 if (r == X86EMUL_PROPAGATE_FAULT)
4331 goto done; 4395 goto done;
4332 4396
@@ -4389,7 +4453,7 @@ restart:
4389 } 4453 }
4390 4454
4391done: 4455done:
4392 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4456 if (vcpu->arch.emulate_ctxt.have_exception) {
4393 inject_emulated_exception(vcpu); 4457 inject_emulated_exception(vcpu);
4394 r = EMULATE_DONE; 4458 r = EMULATE_DONE;
4395 } else if (vcpu->arch.pio.count) { 4459 } else if (vcpu->arch.pio.count) {
@@ -4413,7 +4477,7 @@ done:
4413 4477
4414 return r; 4478 return r;
4415} 4479}
4416EXPORT_SYMBOL_GPL(emulate_instruction); 4480EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4417 4481
4418int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4482int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4419{ 4483{
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
4653 4717
4654 kvm_x86_ops = ops; 4718 kvm_x86_ops = ops;
4655 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4719 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4656 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4657 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4720 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4658 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4721 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4659 4722
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5116 vcpu->fpu_active = 0; 5179 vcpu->fpu_active = 0;
5117 kvm_x86_ops->fpu_deactivate(vcpu); 5180 kvm_x86_ops->fpu_deactivate(vcpu);
5118 } 5181 }
5182 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5183 /* Page is swapped out. Do synthetic halt */
5184 vcpu->arch.apf.halted = true;
5185 r = 1;
5186 goto out;
5187 }
5119 } 5188 }
5120 5189
5121 r = kvm_mmu_reload(vcpu); 5190 r = kvm_mmu_reload(vcpu);
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5244 5313
5245 r = 1; 5314 r = 1;
5246 while (r > 0) { 5315 while (r > 0) {
5247 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5316 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5317 !vcpu->arch.apf.halted)
5248 r = vcpu_enter_guest(vcpu); 5318 r = vcpu_enter_guest(vcpu);
5249 else { 5319 else {
5250 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5257 vcpu->arch.mp_state = 5327 vcpu->arch.mp_state =
5258 KVM_MP_STATE_RUNNABLE; 5328 KVM_MP_STATE_RUNNABLE;
5259 case KVM_MP_STATE_RUNNABLE: 5329 case KVM_MP_STATE_RUNNABLE:
5330 vcpu->arch.apf.halted = false;
5260 break; 5331 break;
5261 case KVM_MP_STATE_SIPI_RECEIVED: 5332 case KVM_MP_STATE_SIPI_RECEIVED:
5262 default: 5333 default:
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5278 vcpu->run->exit_reason = KVM_EXIT_INTR; 5349 vcpu->run->exit_reason = KVM_EXIT_INTR;
5279 ++vcpu->stat.request_irq_exits; 5350 ++vcpu->stat.request_irq_exits;
5280 } 5351 }
5352
5353 kvm_check_async_pf_completion(vcpu);
5354
5281 if (signal_pending(current)) { 5355 if (signal_pending(current)) {
5282 r = -EINTR; 5356 r = -EINTR;
5283 vcpu->run->exit_reason = KVM_EXIT_INTR; 5357 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5302 int r; 5376 int r;
5303 sigset_t sigsaved; 5377 sigset_t sigsaved;
5304 5378
5379 if (!tsk_used_math(current) && init_fpu(current))
5380 return -ENOMEM;
5381
5305 if (vcpu->sigset_active) 5382 if (vcpu->sigset_active)
5306 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5383 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5307 5384
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5313 } 5390 }
5314 5391
5315 /* re-sync apic's tpr */ 5392 /* re-sync apic's tpr */
5316 if (!irqchip_in_kernel(vcpu->kvm)) 5393 if (!irqchip_in_kernel(vcpu->kvm)) {
5317 kvm_set_cr8(vcpu, kvm_run->cr8); 5394 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5395 r = -EINVAL;
5396 goto out;
5397 }
5398 }
5318 5399
5319 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5400 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
5320 if (vcpu->mmio_needed) { 5401 if (vcpu->mmio_needed) {
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5323 vcpu->mmio_needed = 0; 5404 vcpu->mmio_needed = 0;
5324 } 5405 }
5325 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5326 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 5407 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5327 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5328 if (r != EMULATE_DONE) { 5409 if (r != EMULATE_DONE) {
5329 r = 0; 5410 r = 0;
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5436 5517
5437 sregs->cr0 = kvm_read_cr0(vcpu); 5518 sregs->cr0 = kvm_read_cr0(vcpu);
5438 sregs->cr2 = vcpu->arch.cr2; 5519 sregs->cr2 = vcpu->arch.cr2;
5439 sregs->cr3 = vcpu->arch.cr3; 5520 sregs->cr3 = kvm_read_cr3(vcpu);
5440 sregs->cr4 = kvm_read_cr4(vcpu); 5521 sregs->cr4 = kvm_read_cr4(vcpu);
5441 sregs->cr8 = kvm_get_cr8(vcpu); 5522 sregs->cr8 = kvm_get_cr8(vcpu);
5442 sregs->efer = vcpu->arch.efer; 5523 sregs->efer = vcpu->arch.efer;
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5504 kvm_x86_ops->set_gdt(vcpu, &dt); 5585 kvm_x86_ops->set_gdt(vcpu, &dt);
5505 5586
5506 vcpu->arch.cr2 = sregs->cr2; 5587 vcpu->arch.cr2 = sregs->cr2;
5507 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5588 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5508 vcpu->arch.cr3 = sregs->cr3; 5589 vcpu->arch.cr3 = sregs->cr3;
5590 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5509 5591
5510 kvm_set_cr8(vcpu, sregs->cr8); 5592 kvm_set_cr8(vcpu, sregs->cr8);
5511 5593
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5522 if (sregs->cr4 & X86_CR4_OSXSAVE) 5604 if (sregs->cr4 & X86_CR4_OSXSAVE)
5523 update_cpuid(vcpu); 5605 update_cpuid(vcpu);
5524 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5525 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5526 mmu_reset_needed = 1; 5608 mmu_reset_needed = 1;
5527 } 5609 }
5528 5610
@@ -5773,6 +5855,8 @@ free_vcpu:
5773 5855
5774void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5856void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5775{ 5857{
5858 vcpu->arch.apf.msr_val = 0;
5859
5776 vcpu_load(vcpu); 5860 vcpu_load(vcpu);
5777 kvm_mmu_unload(vcpu); 5861 kvm_mmu_unload(vcpu);
5778 vcpu_put(vcpu); 5862 vcpu_put(vcpu);
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5792 vcpu->arch.dr7 = DR7_FIXED_1; 5876 vcpu->arch.dr7 = DR7_FIXED_1;
5793 5877
5794 kvm_make_request(KVM_REQ_EVENT, vcpu); 5878 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0;
5880
5881 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false;
5795 5884
5796 return kvm_x86_ops->vcpu_reset(vcpu); 5885 return kvm_x86_ops->vcpu_reset(vcpu);
5797} 5886}
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5881 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5970 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5882 goto fail_free_mce_banks; 5971 goto fail_free_mce_banks;
5883 5972
5973 kvm_async_pf_hash_reset(vcpu);
5974
5884 return 0; 5975 return 0;
5885fail_free_mce_banks: 5976fail_free_mce_banks:
5886 kfree(vcpu->arch.mce_banks); 5977 kfree(vcpu->arch.mce_banks);
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5906 free_page((unsigned long)vcpu->arch.pio_data); 5997 free_page((unsigned long)vcpu->arch.pio_data);
5907} 5998}
5908 5999
5909struct kvm *kvm_arch_create_vm(void) 6000int kvm_arch_init_vm(struct kvm *kvm)
5910{ 6001{
5911 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5912
5913 if (!kvm)
5914 return ERR_PTR(-ENOMEM);
5915
5916 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6002 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5917 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6003 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5918 6004
@@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
5921 6007
5922 spin_lock_init(&kvm->arch.tsc_write_lock); 6008 spin_lock_init(&kvm->arch.tsc_write_lock);
5923 6009
5924 return kvm; 6010 return 0;
5925} 6011}
5926 6012
5927static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6013static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5939 /* 6025 /*
5940 * Unpin any mmu pages first. 6026 * Unpin any mmu pages first.
5941 */ 6027 */
5942 kvm_for_each_vcpu(i, vcpu, kvm) 6028 kvm_for_each_vcpu(i, vcpu, kvm) {
6029 kvm_clear_async_pf_completion_queue(vcpu);
5943 kvm_unload_vcpu_mmu(vcpu); 6030 kvm_unload_vcpu_mmu(vcpu);
6031 }
5944 kvm_for_each_vcpu(i, vcpu, kvm) 6032 kvm_for_each_vcpu(i, vcpu, kvm)
5945 kvm_arch_vcpu_free(vcpu); 6033 kvm_arch_vcpu_free(vcpu);
5946 6034
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5964 kfree(kvm->arch.vpic); 6052 kfree(kvm->arch.vpic);
5965 kfree(kvm->arch.vioapic); 6053 kfree(kvm->arch.vioapic);
5966 kvm_free_vcpus(kvm); 6054 kvm_free_vcpus(kvm);
5967 kvm_free_physmem(kvm);
5968 if (kvm->arch.apic_access_page) 6055 if (kvm->arch.apic_access_page)
5969 put_page(kvm->arch.apic_access_page); 6056 put_page(kvm->arch.apic_access_page);
5970 if (kvm->arch.ept_identity_pagetable) 6057 if (kvm->arch.ept_identity_pagetable)
5971 put_page(kvm->arch.ept_identity_pagetable); 6058 put_page(kvm->arch.ept_identity_pagetable);
5972 cleanup_srcu_struct(&kvm->srcu);
5973 kfree(kvm);
5974} 6059}
5975 6060
5976int kvm_arch_prepare_memory_region(struct kvm *kvm, 6061int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
6051 6136
6052int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6137int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6053{ 6138{
6054 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6139 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6140 !vcpu->arch.apf.halted)
6141 || !list_empty_careful(&vcpu->async_pf.done)
6055 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6142 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6056 || vcpu->arch.nmi_pending || 6143 || vcpu->arch.nmi_pending ||
6057 (kvm_arch_interrupt_allowed(vcpu) && 6144 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6110} 6197}
6111EXPORT_SYMBOL_GPL(kvm_set_rflags); 6198EXPORT_SYMBOL_GPL(kvm_set_rflags);
6112 6199
6200void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6201{
6202 int r;
6203
6204 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6205 is_error_page(work->page))
6206 return;
6207
6208 r = kvm_mmu_reload(vcpu);
6209 if (unlikely(r))
6210 return;
6211
6212 if (!vcpu->arch.mmu.direct_map &&
6213 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6214 return;
6215
6216 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6217}
6218
6219static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6220{
6221 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6222}
6223
6224static inline u32 kvm_async_pf_next_probe(u32 key)
6225{
6226 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6227}
6228
6229static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6230{
6231 u32 key = kvm_async_pf_hash_fn(gfn);
6232
6233 while (vcpu->arch.apf.gfns[key] != ~0)
6234 key = kvm_async_pf_next_probe(key);
6235
6236 vcpu->arch.apf.gfns[key] = gfn;
6237}
6238
6239static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6240{
6241 int i;
6242 u32 key = kvm_async_pf_hash_fn(gfn);
6243
6244 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6245 (vcpu->arch.apf.gfns[key] != gfn &&
6246 vcpu->arch.apf.gfns[key] != ~0); i++)
6247 key = kvm_async_pf_next_probe(key);
6248
6249 return key;
6250}
6251
6252bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6253{
6254 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6255}
6256
6257static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6258{
6259 u32 i, j, k;
6260
6261 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6262 while (true) {
6263 vcpu->arch.apf.gfns[i] = ~0;
6264 do {
6265 j = kvm_async_pf_next_probe(j);
6266 if (vcpu->arch.apf.gfns[j] == ~0)
6267 return;
6268 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6269 /*
6270 * k lies cyclically in ]i,j]
6271 * | i.k.j |
6272 * |....j i.k.| or |.k..j i...|
6273 */
6274 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6275 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6276 i = j;
6277 }
6278}
6279
6280static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6281{
6282
6283 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6284 sizeof(val));
6285}
6286
6287void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6288 struct kvm_async_pf *work)
6289{
6290 struct x86_exception fault;
6291
6292 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6293 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6294
6295 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6296 (vcpu->arch.apf.send_user_only &&
6297 kvm_x86_ops->get_cpl(vcpu) == 0))
6298 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6299 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6300 fault.vector = PF_VECTOR;
6301 fault.error_code_valid = true;
6302 fault.error_code = 0;
6303 fault.nested_page_fault = false;
6304 fault.address = work->arch.token;
6305 kvm_inject_page_fault(vcpu, &fault);
6306 }
6307}
6308
6309void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6310 struct kvm_async_pf *work)
6311{
6312 struct x86_exception fault;
6313
6314 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6315 if (is_error_page(work->page))
6316 work->arch.token = ~0; /* broadcast wakeup */
6317 else
6318 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6319
6320 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6321 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6322 fault.vector = PF_VECTOR;
6323 fault.error_code_valid = true;
6324 fault.error_code = 0;
6325 fault.nested_page_fault = false;
6326 fault.address = work->arch.token;
6327 kvm_inject_page_fault(vcpu, &fault);
6328 }
6329 vcpu->arch.apf.halted = false;
6330}
6331
6332bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6333{
6334 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6335 return true;
6336 else
6337 return !kvm_event_needs_reinjection(vcpu) &&
6338 kvm_x86_ops->interrupt_allowed(vcpu);
6339}
6340
6113EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6341EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
6114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6342EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6115EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6343EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
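The new kvm_add_async_pf_gfn()/kvm_find_async_pf_gfn()/kvm_del_async_pf_gfn() helpers keep the outstanding async-PF gfns in a small open-addressed hash table: insertion probes linearly for a free slot, and deletion walks the probe chain and pulls entries back across the hole so later lookups never stop at a premature empty slot (the "k lies cyclically in ]i,j]" invariant in the comment). A compilable user-space sketch of the same scheme; the table size and the trivial gfn-mod-table hash are assumptions chosen so collisions are easy to trigger, whereas the kernel sizes the table from ASYNC_PF_PER_VCPU and hashes with hash_32():

#include <stdio.h>

#define NSLOTS 64                       /* power of two, like the kernel table */
#define EMPTY  (~0ULL)                  /* ~0 marks a free slot */

static unsigned long long slots[NSLOTS];

/* assumption: plain modulo hash so collisions are easy to demonstrate */
static unsigned int hash_gfn(unsigned long long gfn)
{
        return (unsigned int)(gfn & (NSLOTS - 1));
}

static unsigned int next_probe(unsigned int key)
{
        return (key + 1) & (NSLOTS - 1);
}

static void add_gfn(unsigned long long gfn)     /* kvm_add_async_pf_gfn() */
{
        unsigned int key = hash_gfn(gfn);

        while (slots[key] != EMPTY)
                key = next_probe(key);
        slots[key] = gfn;
}

static unsigned int gfn_slot(unsigned long long gfn)    /* kvm_async_pf_gfn_slot() */
{
        unsigned int key = hash_gfn(gfn);
        int i;

        for (i = 0; i < NSLOTS &&
                    slots[key] != gfn && slots[key] != EMPTY; i++)
                key = next_probe(key);
        return key;
}

static void del_gfn(unsigned long long gfn)     /* kvm_del_async_pf_gfn() */
{
        unsigned int i, j, k;

        i = j = gfn_slot(gfn);
        while (1) {
                slots[i] = EMPTY;
                do {
                        j = next_probe(j);
                        if (slots[j] == EMPTY)
                                return;
                        k = hash_gfn(slots[j]);
                        /* keep scanning while k lies cyclically in ]i,j] */
                } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
                /* slots[j] would become unreachable across the new hole;
                 * pull it back into slot i and continue from there */
                slots[i] = slots[j];
                i = j;
        }
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < NSLOTS; i++)
                slots[i] = EMPTY;

        add_gfn(0x1000);                /* all three hash to slot 0 here */
        add_gfn(0x1040);
        add_gfn(0x2000);
        del_gfn(0x1000);                /* must not break the probe chain */

        printf("0x1040 found: %d\n", slots[gfn_slot(0x1040)] == 0x1040);
        printf("0x2000 found: %d\n", slots[gfn_slot(0x2000)] == 0x2000);
        return 0;
}

Simply blanking the deleted slot would leave later entries in the probe chain unreachable, which is why deletion has to relocate entries instead of just clearing one.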
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc3..6e121a2a49e1 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 select VIRTUALIZATION
5 select VIRTIO 6 select VIRTIO
6 select VIRTIO_RING 7 select VIRTIO_RING
7 select VIRTIO_CONSOLE 8 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4996cf5f73a0..eba687f0cc0c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -824,7 +824,7 @@ static void __init lguest_init_IRQ(void)
824 824
825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
826 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 826 /* Some systems map "vectors" to interrupts weirdly. Not us! */
827 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 827 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
828 if (i != SYSCALL_VECTOR) 828 if (i != SYSCALL_VECTOR)
829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
830 } 830 }
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 08a0069b87a5..f21962c435ed 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -27,6 +27,7 @@
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8]; 29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8];
30static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; 31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
31 32
32static __init int find_northbridge(void) 33static __init int find_northbridge(void)
@@ -68,19 +69,6 @@ static __init void early_get_boot_cpu_id(void)
68#endif 69#endif
69} 70}
70 71
71int __init amd_get_nodes(struct bootnode *physnodes)
72{
73 int i;
74 int ret = 0;
75
76 for_each_node_mask(i, nodes_parsed) {
77 physnodes[ret].start = nodes[i].start;
78 physnodes[ret].end = nodes[i].end;
79 ret++;
80 }
81 return ret;
82}
83
84int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) 72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
85{ 73{
86 unsigned long start = PFN_PHYS(start_pfn); 74 unsigned long start = PFN_PHYS(start_pfn);
@@ -113,7 +101,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
113 base = read_pci_config(0, nb, 1, 0x40 + i*8); 101 base = read_pci_config(0, nb, 1, 0x40 + i*8);
114 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 102 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
115 103
116 nodeid = limit & 7; 104 nodeids[i] = nodeid = limit & 7;
117 if ((base & 3) == 0) { 105 if ((base & 3) == 0) {
118 if (i < numnodes) 106 if (i < numnodes)
119 pr_info("Skipping disabled node %d\n", i); 107 pr_info("Skipping disabled node %d\n", i);
@@ -193,6 +181,76 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
193 return 0; 181 return 0;
194} 182}
195 183
184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211
212/*
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
214 * setup to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226
227 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
196int __init amd_scan_nodes(void) 254int __init amd_scan_nodes(void)
197{ 255{
198 unsigned int bits; 256 unsigned int bits;
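The new amd_fake_nodes() builds a fake apicid-to-node table for NUMA emulation: each emulated node is located on a physical node via its start address (find_node_by_addr()), and the first emulated node found on each physical node claims that node's local APIC ids, so node_distance() keeps reflecting the real topology. A user-space sketch of that mapping; the memory layout, core count, array sizes and an apicid_base of 0 are made-up illustration values, not a real topology:

#include <stdio.h>

struct bootnode { unsigned long long start, end; };

#define NR_PHYS   2
#define NR_EMU    4
#define CORE_BITS 1                     /* stand-in for x86_coreid_bits */
#define NR_APIC   (NR_PHYS << CORE_BITS)

/* assumed layout: two 4GB physical nodes, four 2GB emulated nodes */
static const struct bootnode phys[NR_PHYS] = {
        { 0x000000000ULL, 0x100000000ULL },
        { 0x100000000ULL, 0x200000000ULL },
};
static const struct bootnode emu[NR_EMU] = {
        { 0x000000000ULL, 0x080000000ULL },
        { 0x080000000ULL, 0x100000000ULL },
        { 0x100000000ULL, 0x180000000ULL },
        { 0x180000000ULL, 0x200000000ULL },
};

static int fake_apicid_to_node[NR_APIC];

/* same lookup as find_node_by_addr() in the hunk above */
static int find_phys_node(unsigned long long addr)
{
        int i;

        for (i = 0; i < NR_PHYS; i++)
                if (addr >= phys[i].start && addr < phys[i].end)
                        return i;
        return -1;
}

int main(void)
{
        int i, j, cores = 1 << CORE_BITS;

        for (i = 0; i < NR_APIC; i++)
                fake_apicid_to_node[i] = -1;    /* NUMA_NO_NODE */

        /* core of amd_fake_nodes(): the first emulated node carved out
         * of each physical node claims that node's apicids */
        for (i = 0; i < NR_EMU; i++) {
                int nid = find_phys_node(emu[i].start);
                int index;

                if (nid < 0)
                        continue;
                index = nid << CORE_BITS;
                if (fake_apicid_to_node[index] != -1)
                        continue;
                for (j = 0; j < cores; j++)
                        fake_apicid_to_node[index + j] = i;
        }

        for (i = 0; i < NR_APIC; i++)
                printf("apicid %d -> emulated node %d\n",
                       i, fake_apicid_to_node[i]);
        return 0;
}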
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f89b5bb4e93f..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
45#include <asm/bugs.h> 45#include <asm/bugs.h>
46#include <asm/tlb.h> 46#include <asm/tlb.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/olpc_ofw.h>
48#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
49#include <asm/sections.h> 50#include <asm/sections.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
@@ -715,6 +716,7 @@ void __init paging_init(void)
715 /* 716 /*
716 * NOTE: at this point the bootmem allocator is fully available. 717 * NOTE: at this point the bootmem allocator is fully available.
717 */ 718 */
719 olpc_dt_build_devicetree();
718 sparse_init(); 720 sparse_init();
719 zone_sizes_init(); 721 zone_sizes_init();
720} 722}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
2#include <linux/topology.h> 2#include <linux/topology.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <asm/numa.h>
6#include <asm/acpi.h>
7
8int __initdata numa_off;
9
10static __init int numa_setup(char *opt)
11{
12 if (!opt)
13 return -EINVAL;
14 if (!strncmp(opt, "off", 3))
15 numa_off = 1;
16#ifdef CONFIG_NUMA_EMU
17 if (!strncmp(opt, "fake=", 5))
18 numa_emu_cmdline(opt + 5);
19#endif
20#ifdef CONFIG_ACPI_NUMA
21 if (!strncmp(opt, "noacpi", 6))
22 acpi_numa = -1;
23#endif
24 return 0;
25}
26early_param("numa", numa_setup);
5 27
6/* 28/*
7 * Which logical CPUs are on which nodes 29 * Which logical CPUs are on which nodes
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7762a517d69d..95ea1551eebc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31}; 31};
32 32
33int numa_off __initdata;
34static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
35static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
36 35
@@ -260,30 +259,35 @@ void __init numa_init_array(void)
260#ifdef CONFIG_NUMA_EMU 259#ifdef CONFIG_NUMA_EMU
261/* Numa emulation */ 260/* Numa emulation */
262static struct bootnode nodes[MAX_NUMNODES] __initdata; 261static struct bootnode nodes[MAX_NUMNODES] __initdata;
263static struct bootnode physnodes[MAX_NUMNODES] __initdata; 262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
264static char *cmdline __initdata; 263static char *cmdline __initdata;
265 264
265void __init numa_emu_cmdline(char *str)
266{
267 cmdline = str;
268}
269
266static int __init setup_physnodes(unsigned long start, unsigned long end, 270static int __init setup_physnodes(unsigned long start, unsigned long end,
267 int acpi, int amd) 271 int acpi, int amd)
268{ 272{
269 int nr_nodes = 0;
270 int ret = 0; 273 int ret = 0;
271 int i; 274 int i;
272 275
276 memset(physnodes, 0, sizeof(physnodes));
273#ifdef CONFIG_ACPI_NUMA 277#ifdef CONFIG_ACPI_NUMA
274 if (acpi) 278 if (acpi)
275 nr_nodes = acpi_get_nodes(physnodes); 279 acpi_get_nodes(physnodes, start, end);
276#endif 280#endif
277#ifdef CONFIG_AMD_NUMA 281#ifdef CONFIG_AMD_NUMA
278 if (amd) 282 if (amd)
279 nr_nodes = amd_get_nodes(physnodes); 283 amd_get_nodes(physnodes);
280#endif 284#endif
281 /* 285 /*
282 * Basic sanity checking on the physical node map: there may be errors 286 * Basic sanity checking on the physical node map: there may be errors
283 * if the SRAT or AMD code incorrectly reported the topology or the mem= 287 * if the SRAT or AMD code incorrectly reported the topology or the mem=
284 * kernel parameter is used. 288 * kernel parameter is used.
285 */ 289 */
286 for (i = 0; i < nr_nodes; i++) { 290 for (i = 0; i < MAX_NUMNODES; i++) {
287 if (physnodes[i].start == physnodes[i].end) 291 if (physnodes[i].start == physnodes[i].end)
288 continue; 292 continue;
289 if (physnodes[i].start > end) { 293 if (physnodes[i].start > end) {
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
298 physnodes[i].start = start; 302 physnodes[i].start = start;
299 if (physnodes[i].end > end) 303 if (physnodes[i].end > end)
300 physnodes[i].end = end; 304 physnodes[i].end = end;
301 }
302
303 /*
304 * Remove all nodes that have no memory or were truncated because of the
305 * limited address range.
306 */
307 for (i = 0; i < nr_nodes; i++) {
308 if (physnodes[i].start == physnodes[i].end)
309 continue;
310 physnodes[ret].start = physnodes[i].start;
311 physnodes[ret].end = physnodes[i].end;
312 ret++; 305 ret++;
313 } 306 }
314 307
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
324 return ret; 317 return ret;
325} 318}
326 319
320static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
321{
322 int i;
323
324 BUG_ON(acpi && amd);
325#ifdef CONFIG_ACPI_NUMA
326 if (acpi)
327 acpi_fake_nodes(nodes, nr_nodes);
328#endif
329#ifdef CONFIG_AMD_NUMA
330 if (amd)
331 amd_fake_nodes(nodes, nr_nodes);
332#endif
333 if (!acpi && !amd)
334 for (i = 0; i < nr_cpu_ids; i++)
335 numa_set_node(i, 0);
336}
337
327/* 338/*
328 * Setups up nid to range from addr to addr + size. If the end 339 * Setups up nid to range from addr to addr + size. If the end
329 * boundary is greater than max_addr, then max_addr is used instead. 340 * boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
352 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 363 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
353 * to max_addr. The return value is the number of nodes allocated. 364 * to max_addr. The return value is the number of nodes allocated.
354 */ 365 */
355static int __init split_nodes_interleave(u64 addr, u64 max_addr, 366static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
356 int nr_phys_nodes, int nr_nodes)
357{ 367{
358 nodemask_t physnode_mask = NODE_MASK_NONE; 368 nodemask_t physnode_mask = NODE_MASK_NONE;
359 u64 size; 369 u64 size;
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
384 return -1; 394 return -1;
385 } 395 }
386 396
387 for (i = 0; i < nr_phys_nodes; i++) 397 for (i = 0; i < MAX_NUMNODES; i++)
388 if (physnodes[i].start != physnodes[i].end) 398 if (physnodes[i].start != physnodes[i].end)
389 node_set(i, physnode_mask); 399 node_set(i, physnode_mask);
390 400
@@ -553,11 +563,9 @@ static int __init numa_emulation(unsigned long start_pfn,
553{ 563{
554 u64 addr = start_pfn << PAGE_SHIFT; 564 u64 addr = start_pfn << PAGE_SHIFT;
555 u64 max_addr = last_pfn << PAGE_SHIFT; 565 u64 max_addr = last_pfn << PAGE_SHIFT;
556 int num_phys_nodes;
557 int num_nodes; 566 int num_nodes;
558 int i; 567 int i;
559 568
560 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
561 /* 569 /*
562 * If the numa=fake command-line contains a 'M' or 'G', it represents 570 * If the numa=fake command-line contains a 'M' or 'G', it represents
563 * the fixed node size. Otherwise, if it is just a single number N, 571 * the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn,
572 unsigned long n; 580 unsigned long n;
573 581
574 n = simple_strtoul(cmdline, NULL, 0); 582 n = simple_strtoul(cmdline, NULL, 0);
575 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); 583 num_nodes = split_nodes_interleave(addr, max_addr, n);
576 } 584 }
577 585
578 if (num_nodes < 0) 586 if (num_nodes < 0)
@@ -595,7 +603,8 @@ static int __init numa_emulation(unsigned long start_pfn,
595 nodes[i].end >> PAGE_SHIFT); 603 nodes[i].end >> PAGE_SHIFT);
596 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 604 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
597 } 605 }
598 acpi_fake_nodes(nodes, num_nodes); 606 setup_physnodes(addr, max_addr, acpi, amd);
607 fake_physnodes(acpi, amd, num_nodes);
599 numa_init_array(); 608 numa_init_array();
600 return 0; 609 return 0;
601} 610}
@@ -610,8 +619,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
610 nodes_clear(node_online_map); 619 nodes_clear(node_online_map);
611 620
612#ifdef CONFIG_NUMA_EMU 621#ifdef CONFIG_NUMA_EMU
622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
613 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) 624 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
614 return; 625 return;
626 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
627 acpi, amd);
615 nodes_clear(node_possible_map); 628 nodes_clear(node_possible_map);
616 nodes_clear(node_online_map); 629 nodes_clear(node_online_map);
617#endif 630#endif
@@ -661,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
661 return pages; 674 return pages;
662} 675}
663 676
664static __init int numa_setup(char *opt)
665{
666 if (!opt)
667 return -EINVAL;
668 if (!strncmp(opt, "off", 3))
669 numa_off = 1;
670#ifdef CONFIG_NUMA_EMU
671 if (!strncmp(opt, "fake=", 5))
672 cmdline = opt + 5;
673#endif
674#ifdef CONFIG_ACPI_NUMA
675 if (!strncmp(opt, "noacpi", 6))
676 acpi_numa = -1;
677#endif
678 return 0;
679}
680early_param("numa", numa_setup);
681
682#ifdef CONFIG_NUMA 677#ifdef CONFIG_NUMA
683 678
684static __init int find_near_online_node(int node) 679static __init int find_near_online_node(int node)
@@ -767,6 +762,7 @@ void __cpuinit numa_clear_node(int cpu)
767 762
768#ifndef CONFIG_DEBUG_PER_CPU_MAPS 763#ifndef CONFIG_DEBUG_PER_CPU_MAPS
769 764
765#ifndef CONFIG_NUMA_EMU
770void __cpuinit numa_add_cpu(int cpu) 766void __cpuinit numa_add_cpu(int cpu)
771{ 767{
772 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 768 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -776,34 +772,115 @@ void __cpuinit numa_remove_cpu(int cpu)
776{ 772{
777 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 773 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
778} 774}
775#else
776void __cpuinit numa_add_cpu(int cpu)
777{
778 unsigned long addr;
779 u16 apicid;
780 int physnid;
781 int nid = NUMA_NO_NODE;
782
783 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
784 if (apicid != BAD_APICID)
785 nid = apicid_to_node[apicid];
786 if (nid == NUMA_NO_NODE)
787 nid = early_cpu_to_node(cpu);
788 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
789
790 /*
791 * Use the starting address of the emulated node to find which physical
792 * node it is allocated on.
793 */
794 addr = node_start_pfn(nid) << PAGE_SHIFT;
795 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
796 if (addr >= physnodes[physnid].start &&
797 addr < physnodes[physnid].end)
798 break;
799
800 /*
801 * Map the cpu to each emulated node that is allocated on the physical
802 * node of the cpu's apic id.
803 */
804 for_each_online_node(nid) {
805 addr = node_start_pfn(nid) << PAGE_SHIFT;
806 if (addr >= physnodes[physnid].start &&
807 addr < physnodes[physnid].end)
808 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
809 }
810}
811
812void __cpuinit numa_remove_cpu(int cpu)
813{
814 int i;
815
816 for_each_online_node(i)
817 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
818}
819#endif /* !CONFIG_NUMA_EMU */
779 820
780#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 821#else /* CONFIG_DEBUG_PER_CPU_MAPS */
822static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
823{
824 int node = early_cpu_to_node(cpu);
825 struct cpumask *mask;
826 char buf[64];
827
828 mask = node_to_cpumask_map[node];
829 if (!mask) {
830 pr_err("node_to_cpumask_map[%i] NULL\n", node);
831 dump_stack();
832 return NULL;
833 }
834
835 cpulist_scnprintf(buf, sizeof(buf), mask);
836 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
837 enable ? "numa_add_cpu" : "numa_remove_cpu",
838 cpu, node, buf);
839 return mask;
840}
781 841
782/* 842/*
783 * --------- debug versions of the numa functions --------- 843 * --------- debug versions of the numa functions ---------
784 */ 844 */
845#ifndef CONFIG_NUMA_EMU
785static void __cpuinit numa_set_cpumask(int cpu, int enable) 846static void __cpuinit numa_set_cpumask(int cpu, int enable)
786{ 847{
787 int node = early_cpu_to_node(cpu);
788 struct cpumask *mask; 848 struct cpumask *mask;
789 char buf[64];
790 849
791 mask = node_to_cpumask_map[node]; 850 mask = debug_cpumask_set_cpu(cpu, enable);
792 if (mask == NULL) { 851 if (!mask)
793 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
794 dump_stack();
795 return; 852 return;
796 }
797 853
798 if (enable) 854 if (enable)
799 cpumask_set_cpu(cpu, mask); 855 cpumask_set_cpu(cpu, mask);
800 else 856 else
801 cpumask_clear_cpu(cpu, mask); 857 cpumask_clear_cpu(cpu, mask);
858}
859#else
860static void __cpuinit numa_set_cpumask(int cpu, int enable)
861{
862 int node = early_cpu_to_node(cpu);
863 struct cpumask *mask;
864 int i;
802 865
803 cpulist_scnprintf(buf, sizeof(buf), mask); 866 for_each_online_node(i) {
804 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 867 unsigned long addr;
805 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 868
869 addr = node_start_pfn(i) << PAGE_SHIFT;
870 if (addr < physnodes[node].start ||
871 addr >= physnodes[node].end)
872 continue;
873 mask = debug_cpumask_set_cpu(cpu, enable);
874 if (!mask)
875 return;
876
877 if (enable)
878 cpumask_set_cpu(cpu, mask);
879 else
880 cpumask_clear_cpu(cpu, mask);
881 }
806} 882}
883#endif /* CONFIG_NUMA_EMU */
807 884
808void __cpuinit numa_add_cpu(int cpu) 885void __cpuinit numa_add_cpu(int cpu)
809{ 886{
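With NUMA emulation the new numa_add_cpu() can no longer set the CPU in a single node's cpumask: it locates the physical node the CPU's home (emulated) node was carved from, then marks the CPU in every emulated node allocated on that same physical node. A small user-space sketch of the two loops; the node layout and the one-word cpumask are illustration assumptions standing in for physnodes[], node_start_pfn() and node_to_cpumask_map:

#include <stdio.h>

struct range { unsigned long long start, end; };

#define NR_PHYS 2
#define NR_EMU  4

/* assumed layout: two 4GB physical nodes, four 2GB emulated nodes */
static const struct range physnodes[NR_PHYS] = {
        { 0x000000000ULL, 0x100000000ULL },
        { 0x100000000ULL, 0x200000000ULL },
};
static const unsigned long long emu_start[NR_EMU] = {
        0x000000000ULL, 0x080000000ULL, 0x100000000ULL, 0x180000000ULL,
};

/* one bit per CPU, one word per emulated node (node_to_cpumask_map) */
static unsigned int node_to_cpumask[NR_EMU];

/* mirrors the CONFIG_NUMA_EMU numa_add_cpu() added above */
static void numa_add_cpu(int cpu, int home_emu_node)
{
        unsigned long long addr = emu_start[home_emu_node];
        int physnid, nid;

        /* which physical node was the CPU's emulated node allocated on? */
        for (physnid = 0; physnid < NR_PHYS; physnid++)
                if (addr >= physnodes[physnid].start &&
                    addr < physnodes[physnid].end)
                        break;
        if (physnid == NR_PHYS)
                return;

        /* set the CPU in every emulated node on that physical node */
        for (nid = 0; nid < NR_EMU; nid++)
                if (emu_start[nid] >= physnodes[physnid].start &&
                    emu_start[nid] < physnodes[physnid].end)
                        node_to_cpumask[nid] |= 1u << cpu;
}

int main(void)
{
        int nid;

        numa_add_cpu(0, 0);     /* CPU0 lives on emulated node 0 */
        numa_add_cpu(1, 3);     /* CPU1 lives on emulated node 3 */

        for (nid = 0; nid < NR_EMU; nid++)
                printf("emulated node %d: cpumask %#x\n",
                       nid, node_to_cpumask[nid]);
        return 0;
}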
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
320 return changed; 320 return changed;
321} 321}
322 322
323#ifdef CONFIG_TRANSPARENT_HUGEPAGE
324int pmdp_set_access_flags(struct vm_area_struct *vma,
325 unsigned long address, pmd_t *pmdp,
326 pmd_t entry, int dirty)
327{
328 int changed = !pmd_same(*pmdp, entry);
329
330 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
331
332 if (changed && dirty) {
333 *pmdp = entry;
334 pmd_update_defer(vma->vm_mm, address, pmdp);
335 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
336 }
337
338 return changed;
339}
340#endif
341
323int ptep_test_and_clear_young(struct vm_area_struct *vma, 342int ptep_test_and_clear_young(struct vm_area_struct *vma,
324 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
325{ 344{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
335 return ret; 354 return ret;
336} 355}
337 356
357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
358int pmdp_test_and_clear_young(struct vm_area_struct *vma,
359 unsigned long addr, pmd_t *pmdp)
360{
361 int ret = 0;
362
363 if (pmd_young(*pmdp))
364 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
365 (unsigned long *)pmdp);
366
367 if (ret)
368 pmd_update(vma->vm_mm, addr, pmdp);
369
370 return ret;
371}
372#endif
373
338int ptep_clear_flush_young(struct vm_area_struct *vma, 374int ptep_clear_flush_young(struct vm_area_struct *vma,
339 unsigned long address, pte_t *ptep) 375 unsigned long address, pte_t *ptep)
340{ 376{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
347 return young; 383 return young;
348} 384}
349 385
386#ifdef CONFIG_TRANSPARENT_HUGEPAGE
387int pmdp_clear_flush_young(struct vm_area_struct *vma,
388 unsigned long address, pmd_t *pmdp)
389{
390 int young;
391
392 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
393
394 young = pmdp_test_and_clear_young(vma, address, pmdp);
395 if (young)
396 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
397
398 return young;
399}
400
401void pmdp_splitting_flush(struct vm_area_struct *vma,
402 unsigned long address, pmd_t *pmdp)
403{
404 int set;
405 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
406 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
407 (unsigned long *)pmdp);
408 if (set) {
409 pmd_update(vma->vm_mm, address, pmdp);
410 /* need tlb flush only to serialize against gup-fast */
411 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
412 }
413}
414#endif
415
350/** 416/**
351 * reserve_top_address - reserves a hole in the top of kernel address space 417 * reserve_top_address - reserves a hole in the top of kernel address space
352 * @reserve - size of hole to reserve 418 * @reserve - size of hole to reserve
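pmdp_test_and_clear_young() above reports whether the huge-pmd accessed bit was set while atomically clearing it, via test_and_clear_bit() on _PAGE_BIT_ACCESSED. A user-space stand-in for that primitive, assuming the GCC/Clang __sync builtins; the real helper additionally calls pmd_update() so paravirt backends see the change:

#include <stdio.h>

#define PAGE_BIT_ACCESSED 5     /* _PAGE_BIT_ACCESSED on x86 */

/* user-space stand-in for test_and_clear_bit(): atomically clear the
 * bit and report whether it was set before */
static int test_and_clear_bit(int nr, unsigned long *addr)
{
        unsigned long mask = 1UL << nr;
        unsigned long old = __sync_fetch_and_and(addr, ~mask);

        return (old & mask) != 0;
}

int main(void)
{
        unsigned long pmd = (1UL << PAGE_BIT_ACCESSED) | 0x1;   /* accessed + present */

        /* first pass of the aging code: young, and the bit is cleared */
        printf("young: %d\n", test_and_clear_bit(PAGE_BIT_ACCESSED, &pmd));
        /* nothing re-set the bit, so a second pass reports not young */
        printf("young: %d\n", test_and_clear_bit(PAGE_BIT_ACCESSED, &pmd));
        return 0;
}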
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index f16434568a51..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_APICID];
61 61
62int numa_off __initdata;
63int acpi_numa __initdata; 62int acpi_numa __initdata;
64 63
65static __init void bad_srat(void) 64static __init void bad_srat(void)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 171a0aacb99a..603d285d1daa 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -349,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
349 349
350void __init acpi_numa_arch_fixup(void) {} 350void __init acpi_numa_arch_fixup(void) {}
351 351
352int __init acpi_get_nodes(struct bootnode *physnodes) 352#ifdef CONFIG_NUMA_EMU
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
353{ 355{
354 int i; 356 int i;
355 int ret = 0;
356 357
357 for_each_node_mask(i, nodes_parsed) { 358 for_each_node_mask(i, nodes_parsed) {
358 physnodes[ret].start = nodes[i].start; 359 cutoff_node(i, start, end);
359 physnodes[ret].end = nodes[i].end; 360 physnodes[i].start = nodes[i].start;
360 ret++; 361 physnodes[i].end = nodes[i].end;
361 } 362 }
362 return ret;
363} 363}
364#endif /* CONFIG_NUMA_EMU */
364 365
365/* Use the information discovered above to actually set up the nodes. */ 366/* Use the information discovered above to actually set up the nodes. */
366int __init acpi_scan_nodes(unsigned long start, unsigned long end) 367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -505,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
505{ 506{
506 int i, j; 507 int i, j;
507 508
508 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
509 "topology.\n");
510 for (i = 0; i < num_nodes; i++) { 509 for (i = 0; i < num_nodes; i++) {
511 int nid, pxm; 510 int nid, pxm;
512 511
@@ -526,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
526 fake_apicid_to_node[j] == NUMA_NO_NODE) 525 fake_apicid_to_node[j] == NUMA_NO_NODE)
527 fake_apicid_to_node[j] = i; 526 fake_apicid_to_node[j] = i;
528 } 527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
531 * have a corresponding emulated node, it should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
529 for (i = 0; i < num_nodes; i++) 539 for (i = 0; i < num_nodes; i++)
530 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); 540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
531 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); 541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f24a8533bcdf..e2b7b0c06cdf 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
65 65
66 switch (val) { 66 switch (val) {
67 case DIE_NMI: 67 case DIE_NMI:
68 case DIE_NMI_IPI:
69 if (ctr_running) 68 if (ctr_running)
70 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); 69 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
71 else if (!nmi_enabled) 70 else if (!nmi_enabled)
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
361static struct notifier_block profile_exceptions_nb = { 360static struct notifier_block profile_exceptions_nb = {
362 .notifier_call = profile_exceptions_notify, 361 .notifier_call = profile_exceptions_notify,
363 .next = NULL, 362 .next = NULL,
364 .priority = 2 363 .priority = NMI_LOCAL_LOW_PRIOR,
365}; 364};
366 365
367static void nmi_cpu_restore_registers(struct op_msrs *msrs) 366static void nmi_cpu_restore_registers(struct op_msrs *msrs)
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index 0636dd93cef8..720bf5a53c51 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
38static struct notifier_block profile_timer_exceptions_nb = { 38static struct notifier_block profile_timer_exceptions_nb = {
39 .notifier_call = profile_timer_exceptions_notify, 39 .notifier_call = profile_timer_exceptions_notify,
40 .next = NULL, 40 .next = NULL,
41 .priority = 0 41 .priority = NMI_LOW_PRIOR,
42}; 42};
43 43
44static int timer_start(void) 44static int timer_start(void)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index fc1e8fe07e5c..e27dffbbb1a7 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h> 5#include <linux/range.h>
6 6
7#include <asm/amd_nb.h>
7#include <asm/pci_x86.h> 8#include <asm/pci_x86.h>
8 9
9#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = {
378 .notifier_call = amd_cpu_notify, 379 .notifier_call = amd_cpu_notify,
379}; 380};
380 381
382static void __init pci_enable_pci_io_ecs(void)
383{
384#ifdef CONFIG_AMD_NB
385 unsigned int i, n;
386
387 for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
388 u8 bus = amd_nb_bus_dev_ranges[i].bus;
389 u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
390 u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
391
392 for (; slot < limit; ++slot) {
393 u32 val = read_pci_config(bus, slot, 3, 0);
394
395 if (!early_is_amd_nb(val))
396 continue;
397
398 val = read_pci_config(bus, slot, 3, 0x8c);
399 if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
400 val |= ENABLE_CF8_EXT_CFG >> 32;
401 write_pci_config(bus, slot, 3, 0x8c, val);
402 }
403 ++n;
404 }
405 }
406 pr_info("Extended Config Space enabled on %u nodes\n", n);
407#endif
408}
409
381static int __init pci_io_ecs_init(void) 410static int __init pci_io_ecs_init(void)
382{ 411{
383 int cpu; 412 int cpu;
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void)
386 if (boot_cpu_data.x86 < 0x10) 415 if (boot_cpu_data.x86 < 0x10)
387 return 0; 416 return 0;
388 417
418 /* Try the PCI method first. */
419 if (early_pci_allowed())
420 pci_enable_pci_io_ecs();
421
389 register_cpu_notifier(&amd_cpu_notifier); 422 register_cpu_notifier(&amd_cpu_notifier);
390 for_each_online_cpu(cpu) 423 for_each_online_cpu(cpu)
391 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, 424 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfbd..ab8269b0da29 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
9 * option) any later version. 9 * option) any later version.
10 */ 10 */
11 11
12#include <linux/acpi.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/dmi.h> 14#include <linux/dmi.h>
14#include <linux/pci.h> 15#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
25 u8 fbus, lbus; 26 u8 fbus, lbus;
26 int i; 27 int i;
27 28
29#ifdef CONFIG_ACPI
28 /* 30 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use 31 * We should get host bridge information from ACPI unless the BIOS
30 * this information if ACPI _CRS was used. Therefore, we don't bother 32 * doesn't support it.
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */ 33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
34 37
35 info = &pci_root_info[pci_root_num]; 38 info = &pci_root_info[pci_root_num];
36 pci_root_num++; 39 pci_root_num++;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f7c8a399978c..5fe75026ecc2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
22 22
23unsigned int pci_early_dump_regs; 23unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 24static int pci_bf_sort;
25static int smbios_type_b1_flag;
25int pci_routeirq; 26int pci_routeirq;
26int noioapicquirk; 27int noioapicquirk;
27#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS 28#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
185 return 0; 186 return 0;
186} 187}
187 188
189static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
190 void *private_data)
191{
192 u8 *d = (u8 *)dm + 4;
193
194 if (dm->type != 0xB1)
195 return;
196 switch (((*(u32 *)d) >> 9) & 0x03) {
197 case 0x00:
198 printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
199 break;
200 case 0x01: /* set pci=bfsort */
201 smbios_type_b1_flag = 1;
202 break;
203 case 0x02: /* do not set pci=bfsort */
204 smbios_type_b1_flag = 2;
205 break;
206 default:
207 break;
208 }
209}
210
211static int __devinit find_sort_method(const struct dmi_system_id *d)
212{
213 dmi_walk(read_dmi_type_b1, NULL);
214
215 if (smbios_type_b1_flag == 1) {
216 set_bf_sort(d);
217 return 0;
218 }
219 return -1;
220}
221
188/* 222/*
189 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 223 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
190 */ 224 */
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
213 }, 247 },
214#endif /* __i386__ */ 248#endif /* __i386__ */
215 { 249 {
250 .callback = find_sort_method,
251 .ident = "Dell System",
252 .matches = {
253 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
254 },
255 },
256 {
216 .callback = set_bf_sort, 257 .callback = set_bf_sort,
217 .ident = "Dell PowerEdge 1950", 258 .ident = "Dell PowerEdge 1950",
218 .matches = { 259 .matches = {
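read_dmi_type_b1() above decides whether to turn on pci=bfsort on Dell systems by looking at a 2-bit field at bit 9 of the word that follows the 4-byte DMI record header. A small user-space sketch of that decode; the record bytes are fabricated for illustration, and memcpy() replaces the kernel's direct u32 cast (both assume the little-endian layout that holds on x86):

#include <stdio.h>
#include <string.h>

/* decode bits 9-10 of the word after the 4-byte DMI header, as
 * read_dmi_type_b1() does with ((*(u32 *)d) >> 9) & 0x03 */
static int decode_b1_flag(const unsigned char *record)
{
        unsigned int word;

        memcpy(&word, record + 4, sizeof(word));        /* d = (u8 *)dm + 4 */
        return (word >> 9) & 0x03;
}

int main(void)
{
        /* fabricated type-0xB1 record: header, then a flag word whose
         * bits 9-10 are 01, i.e. "set pci=bfsort" */
        const unsigned char rec[8] = { 0xB1, 0x08, 0x00, 0x00,
                                       0x00, 0x02, 0x00, 0x00 };

        printf("bfsort flag = %d\n", decode_b1_flag(rec));      /* prints 1 */
        return 0;
}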
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9f9bfb705cf9..87e6c8323117 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: 592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
593 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
593 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..25bfdbb5b130 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -103,7 +103,7 @@ struct dw_spi_reg {
103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; 103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
104 104
105static u32 *pclk_spi0; 105static u32 *pclk_spi0;
106/* Always contains an accessable address, start with 0 */ 106/* Always contains an accessible address, start with 0 */
107static struct dw_spi_reg *pspi; 107static struct dw_spi_reg *pspi;
108 108
109static struct kmsg_dumper dw_dumper; 109static struct kmsg_dumper dw_dumper;
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c31b8fcb5a86..e797428b163b 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index f5442c03abc3..127775696d6c 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Support for features of the OLPC XO-1 laptop 2 * Support for features of the OLPC XO-1 laptop
3 * 3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
4 * Copyright (C) 2010 One Laptop per Child 5 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc. 6 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc. 7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
@@ -12,8 +13,6 @@
12 */ 13 */
13 14
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/pci_ids.h>
17#include <linux/platform_device.h> 16#include <linux/platform_device.h>
18#include <linux/pm.h> 17#include <linux/pm.h>
19 18
@@ -22,9 +21,6 @@
22 21
23#define DRV_NAME "olpc-xo1" 22#define DRV_NAME "olpc-xo1"
24 23
25#define PMS_BAR 4
26#define ACPI_BAR 5
27
28/* PMC registers (PMS block) */ 24/* PMC registers (PMS block) */
29#define PM_SCLK 0x10 25#define PM_SCLK 0x10
30#define PM_IN_SLPCTL 0x20 26#define PM_IN_SLPCTL 0x20
@@ -57,65 +53,67 @@ static void xo1_power_off(void)
57 outl(0x00002000, acpi_base + PM1_CNT); 53 outl(0x00002000, acpi_base + PM1_CNT);
58} 54}
59 55
60/* Read the base addresses from the PCI BAR info */ 56static int __devinit olpc_xo1_probe(struct platform_device *pdev)
61static int __devinit setup_bases(struct pci_dev *pdev)
62{ 57{
63 int r; 58 struct resource *res;
64 59
65 r = pci_enable_device_io(pdev); 60 /* don't run on non-XOs */
66 if (r) { 61 if (!machine_is_olpc())
67 dev_err(&pdev->dev, "can't enable device IO\n"); 62 return -ENODEV;
68 return r;
69 }
70 63
71 r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); 64 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
72 if (r) { 65 if (!res) {
73 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); 66 dev_err(&pdev->dev, "can't fetch device resource info\n");
74 return r; 67 return -EIO;
75 } 68 }
76 69
77 r = pci_request_region(pdev, PMS_BAR, DRV_NAME); 70 if (!request_region(res->start, resource_size(res), DRV_NAME)) {
78 if (r) { 71 dev_err(&pdev->dev, "can't request region\n");
79 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); 72 return -EIO;
80 pci_release_region(pdev, ACPI_BAR);
81 return r;
82 } 73 }
83 74
84 acpi_base = pci_resource_start(pdev, ACPI_BAR); 75 if (strcmp(pdev->name, "cs5535-pms") == 0)
85 pms_base = pci_resource_start(pdev, PMS_BAR); 76 pms_base = res->start;
77 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
78 acpi_base = res->start;
79
80 /* If we have both addresses, we can override the poweroff hook */
81 if (pms_base && acpi_base) {
82 pm_power_off = xo1_power_off;
83 printk(KERN_INFO "OLPC XO-1 support registered\n");
84 }
86 85
87 return 0; 86 return 0;
88} 87}
89 88
90static int __devinit olpc_xo1_probe(struct platform_device *pdev) 89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
91{ 90{
92 struct pci_dev *pcidev; 91 struct resource *r;
93 int r;
94
95 pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
96 NULL);
97 if (!pdev)
98 return -ENODEV;
99
100 r = setup_bases(pcidev);
101 if (r)
102 return r;
103 92
104 pm_power_off = xo1_power_off; 93 r = platform_get_resource(pdev, IORESOURCE_IO, 0);
94 release_region(r->start, resource_size(r));
105 95
106 printk(KERN_INFO "OLPC XO-1 support registered\n"); 96 if (strcmp(pdev->name, "cs5535-pms") == 0)
107 return 0; 97 pms_base = 0;
108} 98 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
99 acpi_base = 0;
109 100
110static int __devexit olpc_xo1_remove(struct platform_device *pdev)
111{
112 pm_power_off = NULL; 101 pm_power_off = NULL;
113 return 0; 102 return 0;
114} 103}
115 104
116static struct platform_driver olpc_xo1_driver = { 105static struct platform_driver cs5535_pms_drv = {
106 .driver = {
107 .name = "cs5535-pms",
108 .owner = THIS_MODULE,
109 },
110 .probe = olpc_xo1_probe,
111 .remove = __devexit_p(olpc_xo1_remove),
112};
113
114static struct platform_driver cs5535_acpi_drv = {
117 .driver = { 115 .driver = {
118 .name = DRV_NAME, 116 .name = "cs5535-acpi",
119 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
120 }, 118 },
121 .probe = olpc_xo1_probe, 119 .probe = olpc_xo1_probe,
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = {
124 122
125static int __init olpc_xo1_init(void) 123static int __init olpc_xo1_init(void)
126{ 124{
127 return platform_driver_register(&olpc_xo1_driver); 125 int r;
126
127 r = platform_driver_register(&cs5535_pms_drv);
128 if (r)
129 return r;
130
131 r = platform_driver_register(&cs5535_acpi_drv);
132 if (r)
133 platform_driver_unregister(&cs5535_pms_drv);
134
135 return r;
128} 136}
129 137
130static void __exit olpc_xo1_exit(void) 138static void __exit olpc_xo1_exit(void)
131{ 139{
132 platform_driver_unregister(&olpc_xo1_driver); 140 platform_driver_unregister(&cs5535_acpi_drv);
141 platform_driver_unregister(&cs5535_pms_drv);
133} 142}
134 143
135MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); 144MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
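
The rewrite above converts olpc-xo1 from a PCI driver that claimed BARs 4 and 5 of the CS5536 ISA bridge into two platform drivers, "cs5535-pms" and "cs5535-acpi", each of which records the I/O base of the device it binds to and installs xo1_power_off() once both bases are known. The matching platform devices are expected to be registered elsewhere (presumably by a driver for the CS5535/CS5536 companion chip; that code is not part of this diff). A hypothetical sketch of how such a device could be created so that olpc_xo1_probe() binds to it; the init function name and the I/O addresses below are invented for the example:

#include <linux/err.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>

static struct resource example_pms_res = {
	.start = 0x1000,			/* hypothetical PMS I/O base */
	.end   = 0x1000 + 0x7f,
	.flags = IORESOURCE_IO,
};

static int __init example_register_cs5535_pms(void)
{
	struct platform_device *pdev;

	/* The name must match cs5535_pms_drv above for the probe to run. */
	pdev = platform_device_register_simple("cs5535-pms", -1,
					       &example_pms_res, 1);
	return IS_ERR(pdev) ? PTR_ERR(pdev) : 0;
}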
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000000..dab874647530
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,183 @@
1/*
2 * OLPC-specific OFW device tree support code.
3 *
4 * Paul Mackerras August 1996.
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
8 * {engebret|bergner}@us.ibm.com
9 *
10 * Adapted for sparc by David S. Miller davem@davemloft.net
11 * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bootmem.h>
21#include <linux/of.h>
22#include <linux/of_pdt.h>
23#include <asm/olpc_ofw.h>
24
25static phandle __init olpc_dt_getsibling(phandle node)
26{
27 const void *args[] = { (void *)node };
28 void *res[] = { &node };
29
30 if ((s32)node == -1)
31 return 0;
32
33 if (olpc_ofw("peer", args, res) || (s32)node == -1)
34 return 0;
35
36 return node;
37}
38
39static phandle __init olpc_dt_getchild(phandle node)
40{
41 const void *args[] = { (void *)node };
42 void *res[] = { &node };
43
44 if ((s32)node == -1)
45 return 0;
46
47 if (olpc_ofw("child", args, res) || (s32)node == -1) {
48 pr_err("PROM: %s: fetching child failed!\n", __func__);
49 return 0;
50 }
51
52 return node;
53}
54
55static int __init olpc_dt_getproplen(phandle node, const char *prop)
56{
57 const void *args[] = { (void *)node, prop };
58 int len;
59 void *res[] = { &len };
60
61 if ((s32)node == -1)
62 return -1;
63
64 if (olpc_ofw("getproplen", args, res)) {
65 pr_err("PROM: %s: getproplen failed!\n", __func__);
66 return -1;
67 }
68
69 return len;
70}
71
72static int __init olpc_dt_getproperty(phandle node, const char *prop,
73 char *buf, int bufsize)
74{
75 int plen;
76
77 plen = olpc_dt_getproplen(node, prop);
78 if (plen > bufsize || plen < 1) {
79 return -1;
80 } else {
81 const void *args[] = { (void *)node, prop, buf, (void *)plen };
82 void *res[] = { &plen };
83
84 if (olpc_ofw("getprop", args, res)) {
85 pr_err("PROM: %s: getprop failed!\n", __func__);
86 return -1;
87 }
88 }
89
90 return plen;
91}
92
93static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
94{
95 const void *args[] = { (void *)node, prev, buf };
96 int success;
97 void *res[] = { &success };
98
99 buf[0] = '\0';
100
101 if ((s32)node == -1)
102 return -1;
103
104 if (olpc_ofw("nextprop", args, res) || success != 1)
105 return -1;
106
107 return 0;
108}
109
110static int __init olpc_dt_pkg2path(phandle node, char *buf,
111 const int buflen, int *len)
112{
113 const void *args[] = { (void *)node, buf, (void *)buflen };
114 void *res[] = { len };
115
116 if ((s32)node == -1)
117 return -1;
118
119 if (olpc_ofw("package-to-path", args, res) || *len < 1)
120 return -1;
121
122 return 0;
123}
124
125static unsigned int prom_early_allocated __initdata;
126
127void * __init prom_early_alloc(unsigned long size)
128{
129 static u8 *mem;
130 static size_t free_mem;
131 void *res;
132
133 if (free_mem < size) {
134 const size_t chunk_size = max(PAGE_SIZE, size);
135
136 /*
137	 * To minimize the number of allocations, grab at least
138 * PAGE_SIZE of memory (that's an arbitrary choice that's
139 * fast enough on the platforms we care about while minimizing
140 * wasted bootmem) and hand off chunks of it to callers.
141 */
142 res = alloc_bootmem(chunk_size);
143 if (!res)
144 return NULL;
145 prom_early_allocated += chunk_size;
146 memset(res, 0, chunk_size);
147 free_mem = chunk_size;
148 mem = res;
149 }
150
151 /* allocate from the local cache */
152 free_mem -= size;
153 res = mem;
154 mem += size;
155 return res;
156}
157
158static struct of_pdt_ops prom_olpc_ops __initdata = {
159 .nextprop = olpc_dt_nextprop,
160 .getproplen = olpc_dt_getproplen,
161 .getproperty = olpc_dt_getproperty,
162 .getchild = olpc_dt_getchild,
163 .getsibling = olpc_dt_getsibling,
164 .pkg2path = olpc_dt_pkg2path,
165};
166
167void __init olpc_dt_build_devicetree(void)
168{
169 phandle root;
170
171 if (!olpc_ofw_is_installed())
172 return;
173
174 root = olpc_dt_getsibling(0);
175 if (!root) {
176 pr_err("PROM: unable to get root node from OFW!\n");
177 return;
178 }
179 of_pdt_build_devicetree(root, &prom_olpc_ops);
180
181 pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
182 prom_early_allocated);
183}
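
The prom_early_alloc() helper above amortizes bootmem allocations by carving small requests out of a cached chunk of at least PAGE_SIZE and only going back to alloc_bootmem() when the cache runs dry. An annotated illustration of that behaviour as it would read inside olpc_dt.c, assuming PAGE_SIZE == 4096 (the sequence of sizes is made up for the example):

static void __init prom_early_alloc_demo(void)
{
	void *a, *b, *c;

	a = prom_early_alloc(24);	/* cache empty: grabs a zeroed 4096-byte chunk, returns its start */
	b = prom_early_alloc(100);	/* served from the remaining 4072 bytes of the same chunk */
	c = prom_early_alloc(8192);	/* larger than the 3972 bytes left: grabs a fresh
					 * max(PAGE_SIZE, 8192)-byte chunk; the leftover of the
					 * first chunk is simply abandoned */

	pr_debug("demo allocations at %p %p %p\n", a, b, c);
}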
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 787320464379..e7604f62870d 100644
--- a/arch/x86/platform/olpc/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -110,3 +110,8 @@ void __init olpc_ofw_detect(void)
110 (unsigned long)olpc_ofw_cif, (-start) >> 20); 110 (unsigned long)olpc_ofw_cif, (-start) >> 20);
111 reserve_top_address(-start); 111 reserve_top_address(-start);
112} 112}
113
114bool __init olpc_ofw_is_installed(void)
115{
116 return olpc_ofw_cif != NULL;
117}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7e8d3bc80af6..50542efe45fb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1194,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void)
1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1195 1195
1196 local_irq_disable(); 1196 local_irq_disable();
1197 early_boot_irqs_off(); 1197 early_boot_irqs_disabled = true;
1198 1198
1199 memblock_init(); 1199 memblock_init();
1200 1200
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..6a6fe8939645 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
126#endif 126#endif
127}; 127};
128 128
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops(void)
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 132 x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fde..5e92b61ad574 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
173 */ 173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
204
205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
208
209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
210
211/* Placeholders for holes in the address space */
212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215
216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219
220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222
223static inline unsigned p2m_top_index(unsigned long pfn)
224{
225 BUG_ON(pfn >= MAX_P2M_PFN);
226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232}
233
234static inline unsigned p2m_index(unsigned long pfn)
235{
236 return pfn % P2M_PER_PAGE;
237}
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
295 * tree should alreay be completely allocated.
296 */
297void xen_build_mfn_list_list(void)
298{
299 unsigned long pfn;
300
301 /* Pre-initialize p2m_top_mfn to be completely missing */
302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
308
309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 }
315
316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 unsigned topidx = p2m_top_index(pfn);
318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing, just update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 }
353}
354
355void xen_setup_mfn_list_list(void)
356{
357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358
359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 virt_to_mfn(p2m_top_mfn);
361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362}
363
364/* Set up p2m_top to point to the domain-builder provided p2m pages */
365void __init xen_build_dynamic_phys_to_machine(void)
366{
367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
390
391 if (p2m_top[topidx] == p2m_mid_missing) {
392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
400}
401
402unsigned long get_phys_to_machine(unsigned long pfn)
403{
404 unsigned topidx, mididx, idx;
405
406 if (unlikely(pfn >= MAX_P2M_PFN))
407 return INVALID_P2M_ENTRY;
408
409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
411 idx = p2m_index(pfn);
412
413 return p2m_top[topidx][mididx][idx];
414}
415EXPORT_SYMBOL_GPL(get_phys_to_machine);
416
417static void *alloc_p2m_page(void)
418{
419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420}
421
422static void free_p2m_page(void *p)
423{
424 free_page((unsigned long)p);
425}
426
427/*
428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
439
440 topidx = p2m_top_index(pfn);
441 mididx = p2m_mid_index(pfn);
442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
456 }
457
458 top_mfn_p = &p2m_top_mfn[topidx];
459 mid_mfn = p2m_top_mfn_p[topidx];
460
461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462
463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
499}
500
501/* Try to install p2m mapping; fail if intermediate bits missing */
502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503{
504 unsigned topidx, mididx, idx;
505
506 if (unlikely(pfn >= MAX_P2M_PFN)) {
507 BUG_ON(mfn != INVALID_P2M_ENTRY);
508 return true;
509 }
510
511 topidx = p2m_top_index(pfn);
512 mididx = p2m_mid_index(pfn);
513 idx = p2m_index(pfn);
514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
519
520 return true;
521}
522
523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524{
525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 return true;
528 }
529
530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
531 if (!alloc_p2m(pfn))
532 return false;
533
534 if (!__set_phys_to_machine(pfn, mfn))
535 return false;
536 }
537
538 return true;
539}
540
541unsigned long arbitrary_virt_to_mfn(void *vaddr) 176unsigned long arbitrary_virt_to_mfn(void *vaddr)
542{ 177{
543 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
566 offset = address & ~PAGE_MASK; 201 offset = address & ~PAGE_MASK;
567 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 202 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
568} 203}
204EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
569 205
570void make_lowmem_page_readonly(void *vaddr) 206void make_lowmem_page_readonly(void *vaddr)
571{ 207{
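
The bulk of the removal above is the p2m machinery, which reappears, with some additions, in the new arch/x86/xen/p2m.c below. As a reference for the three-level layout described in its header comment, the index arithmetic on 64-bit (PAGE_SIZE == 4096, sizeof(unsigned long) == 8, so P2M_PER_PAGE == P2M_MID_PER_PAGE == P2M_TOP_PER_PAGE == 512) works out as in this worked example (not part of the patch):

/*
 * For pfn == 0x12345 (74565):
 *
 *	topidx = 74565 / (512 * 512)	= 0
 *	mididx = (74565 / 512) % 512	= 145
 *	idx    = 74565 % 512		= 325
 *
 * so get_phys_to_machine() reads the mfn from p2m_top[0][145][325].
 * MAX_P2M_PFN is 512 * 512 * 512 = 134217728 pfns, i.e. 512 GiB of
 * pseudo-physical address space.
 */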
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..ddc81a06edb9
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,528 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/list.h>
31#include <linux/hash.h>
32#include <linux/sched.h>
33
34#include <asm/cache.h>
35#include <asm/setup.h>
36
37#include <asm/xen/page.h>
38#include <asm/xen/hypercall.h>
39#include <asm/xen/hypervisor.h>
40
41#include "xen-ops.h"
42
43static void __init m2p_override_init(void);
44
45unsigned long xen_max_p2m_pfn __read_mostly;
46
47#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
48#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
49#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
50
51#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
52
53/* Placeholders for holes in the address space */
54static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
55static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
56static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
57
58static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64
65static inline unsigned p2m_top_index(unsigned long pfn)
66{
67 BUG_ON(pfn >= MAX_P2M_PFN);
68 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
69}
70
71static inline unsigned p2m_mid_index(unsigned long pfn)
72{
73 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
74}
75
76static inline unsigned p2m_index(unsigned long pfn)
77{
78 return pfn % P2M_PER_PAGE;
79}
80
81static void p2m_top_init(unsigned long ***top)
82{
83 unsigned i;
84
85 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
86 top[i] = p2m_mid_missing;
87}
88
89static void p2m_top_mfn_init(unsigned long *top)
90{
91 unsigned i;
92
93 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
94 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
95}
96
97static void p2m_top_mfn_p_init(unsigned long **top)
98{
99 unsigned i;
100
101 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
102 top[i] = p2m_mid_missing_mfn;
103}
104
105static void p2m_mid_init(unsigned long **mid)
106{
107 unsigned i;
108
109 for (i = 0; i < P2M_MID_PER_PAGE; i++)
110 mid[i] = p2m_missing;
111}
112
113static void p2m_mid_mfn_init(unsigned long *mid)
114{
115 unsigned i;
116
117 for (i = 0; i < P2M_MID_PER_PAGE; i++)
118 mid[i] = virt_to_mfn(p2m_missing);
119}
120
121static void p2m_init(unsigned long *p2m)
122{
123 unsigned i;
124
125 for (i = 0; i < P2M_MID_PER_PAGE; i++)
126 p2m[i] = INVALID_P2M_ENTRY;
127}
128
129/*
130 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
131 *
132 * This is called both at boot time, and after resuming from suspend:
133 * - At boot time we're called very early, and must use extend_brk()
134 * to allocate memory.
135 *
136 * - After resume we're called from within stop_machine, but the mfn
137	 * tree should already be completely allocated.
138 */
139void xen_build_mfn_list_list(void)
140{
141 unsigned long pfn;
142
143 /* Pre-initialize p2m_top_mfn to be completely missing */
144 if (p2m_top_mfn == NULL) {
145 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
146 p2m_mid_mfn_init(p2m_mid_missing_mfn);
147
148 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
149 p2m_top_mfn_p_init(p2m_top_mfn_p);
150
151 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
152 p2m_top_mfn_init(p2m_top_mfn);
153 } else {
154		/* Reinitialise, mfns all change after migration */
155 p2m_mid_mfn_init(p2m_mid_missing_mfn);
156 }
157
158 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
159 unsigned topidx = p2m_top_index(pfn);
160 unsigned mididx = p2m_mid_index(pfn);
161 unsigned long **mid;
162 unsigned long *mid_mfn_p;
163
164 mid = p2m_top[topidx];
165 mid_mfn_p = p2m_top_mfn_p[topidx];
166
167 /* Don't bother allocating any mfn mid levels if
168 * they're just missing, just update the stored mfn,
169 * since all could have changed over a migrate.
170 */
171 if (mid == p2m_mid_missing) {
172 BUG_ON(mididx);
173 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
174 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
175 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
176 continue;
177 }
178
179 if (mid_mfn_p == p2m_mid_missing_mfn) {
180 /*
181 * XXX boot-time only! We should never find
182 * missing parts of the mfn tree after
183 * runtime. extend_brk() will BUG if we call
184 * it too late.
185 */
186 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
187 p2m_mid_mfn_init(mid_mfn_p);
188
189 p2m_top_mfn_p[topidx] = mid_mfn_p;
190 }
191
192 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
193 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
194 }
195}
196
197void xen_setup_mfn_list_list(void)
198{
199 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
200
201 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
202 virt_to_mfn(p2m_top_mfn);
203 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
204}
205
206/* Set up p2m_top to point to the domain-builder provided p2m pages */
207void __init xen_build_dynamic_phys_to_machine(void)
208{
209 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
210 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
211 unsigned long pfn;
212
213 xen_max_p2m_pfn = max_pfn;
214
215 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
216 p2m_init(p2m_missing);
217
218 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
219 p2m_mid_init(p2m_mid_missing);
220
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top);
223
224 /*
225 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just
227 * need to graft that into our tree structure.
228 */
229 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
230 unsigned topidx = p2m_top_index(pfn);
231 unsigned mididx = p2m_mid_index(pfn);
232
233 if (p2m_top[topidx] == p2m_mid_missing) {
234 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
235 p2m_mid_init(mid);
236
237 p2m_top[topidx] = mid;
238 }
239
240		/*
241		 * As long as mfn_list has enough entries to completely
242		 * fill a p2m page, pointing into the array is fine. If it
243		 * does not, the entries beyond the last pfn would be
244		 * undefined, and whatever lies past the array should not be
245		 * overwritten with invalid markers, so allocate a fresh
246		 * page, initialize it and copy in only the valid part.
247		 */
248 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
249 unsigned long p2midx;
250 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
251 p2m_init(p2m);
252
253 for (p2midx = 0; pfn + p2midx < max_pfn; p2midx++) {
254 p2m[p2midx] = mfn_list[pfn + p2midx];
255 }
256 p2m_top[topidx][mididx] = p2m;
257 } else
258 p2m_top[topidx][mididx] = &mfn_list[pfn];
259 }
260
261 m2p_override_init();
262}
263
264unsigned long get_phys_to_machine(unsigned long pfn)
265{
266 unsigned topidx, mididx, idx;
267
268 if (unlikely(pfn >= MAX_P2M_PFN))
269 return INVALID_P2M_ENTRY;
270
271 topidx = p2m_top_index(pfn);
272 mididx = p2m_mid_index(pfn);
273 idx = p2m_index(pfn);
274
275 return p2m_top[topidx][mididx][idx];
276}
277EXPORT_SYMBOL_GPL(get_phys_to_machine);
278
279static void *alloc_p2m_page(void)
280{
281 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
282}
283
284static void free_p2m_page(void *p)
285{
286 free_page((unsigned long)p);
287}
288
289/*
290 * Fully allocate the p2m structure for a given pfn. We need to check
291 * that both the top and mid levels are allocated, and make sure the
292 * parallel mfn tree is kept in sync. We may race with other cpus, so
293 * the new pages are installed with cmpxchg; if we lose the race then
294 * simply free the page we allocated and use the one that's there.
295 */
296static bool alloc_p2m(unsigned long pfn)
297{
298 unsigned topidx, mididx;
299 unsigned long ***top_p, **mid;
300 unsigned long *top_mfn_p, *mid_mfn;
301
302 topidx = p2m_top_index(pfn);
303 mididx = p2m_mid_index(pfn);
304
305 top_p = &p2m_top[topidx];
306 mid = *top_p;
307
308 if (mid == p2m_mid_missing) {
309 /* Mid level is missing, allocate a new one */
310 mid = alloc_p2m_page();
311 if (!mid)
312 return false;
313
314 p2m_mid_init(mid);
315
316 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
317 free_p2m_page(mid);
318 }
319
320 top_mfn_p = &p2m_top_mfn[topidx];
321 mid_mfn = p2m_top_mfn_p[topidx];
322
323 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
324
325 if (mid_mfn == p2m_mid_missing_mfn) {
326 /* Separately check the mid mfn level */
327 unsigned long missing_mfn;
328 unsigned long mid_mfn_mfn;
329
330 mid_mfn = alloc_p2m_page();
331 if (!mid_mfn)
332 return false;
333
334 p2m_mid_mfn_init(mid_mfn);
335
336 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
337 mid_mfn_mfn = virt_to_mfn(mid_mfn);
338 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
339 free_p2m_page(mid_mfn);
340 else
341 p2m_top_mfn_p[topidx] = mid_mfn;
342 }
343
344 if (p2m_top[topidx][mididx] == p2m_missing) {
345 /* p2m leaf page is missing */
346 unsigned long *p2m;
347
348 p2m = alloc_p2m_page();
349 if (!p2m)
350 return false;
351
352 p2m_init(p2m);
353
354 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
355 free_p2m_page(p2m);
356 else
357 mid_mfn[mididx] = virt_to_mfn(p2m);
358 }
359
360 return true;
361}
362
363/* Try to install p2m mapping; fail if intermediate bits missing */
364bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
365{
366 unsigned topidx, mididx, idx;
367
368 if (unlikely(pfn >= MAX_P2M_PFN)) {
369 BUG_ON(mfn != INVALID_P2M_ENTRY);
370 return true;
371 }
372
373 topidx = p2m_top_index(pfn);
374 mididx = p2m_mid_index(pfn);
375 idx = p2m_index(pfn);
376
377 if (p2m_top[topidx][mididx] == p2m_missing)
378 return mfn == INVALID_P2M_ENTRY;
379
380 p2m_top[topidx][mididx][idx] = mfn;
381
382 return true;
383}
384
385bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
386{
387 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
388 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
389 return true;
390 }
391
392 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
393 if (!alloc_p2m(pfn))
394 return false;
395
396 if (!__set_phys_to_machine(pfn, mfn))
397 return false;
398 }
399
400 return true;
401}
402
403#define M2P_OVERRIDE_HASH_SHIFT 10
404#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
405
406static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
407static DEFINE_SPINLOCK(m2p_override_lock);
408
409static void __init m2p_override_init(void)
410{
411 unsigned i;
412
413 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
414 sizeof(unsigned long));
415
416 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
417 INIT_LIST_HEAD(&m2p_overrides[i]);
418}
419
420static unsigned long mfn_hash(unsigned long mfn)
421{
422 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
423}
424
425/* Add an MFN override for a particular page */
426int m2p_add_override(unsigned long mfn, struct page *page)
427{
428 unsigned long flags;
429 unsigned long pfn;
430 unsigned long address;
431 unsigned level;
432 pte_t *ptep = NULL;
433
434 pfn = page_to_pfn(page);
435 if (!PageHighMem(page)) {
436 address = (unsigned long)__va(pfn << PAGE_SHIFT);
437 ptep = lookup_address(address, &level);
438
439 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
440 "m2p_add_override: pfn %lx not mapped", pfn))
441 return -EINVAL;
442 }
443
444 page->private = mfn;
445 page->index = pfn_to_mfn(pfn);
446
447 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
448 if (!PageHighMem(page))
449 /* Just zap old mapping for now */
450 pte_clear(&init_mm, address, ptep);
451
452 spin_lock_irqsave(&m2p_override_lock, flags);
453 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
454 spin_unlock_irqrestore(&m2p_override_lock, flags);
455
456 return 0;
457}
458
459int m2p_remove_override(struct page *page)
460{
461 unsigned long flags;
462 unsigned long mfn;
463 unsigned long pfn;
464 unsigned long address;
465 unsigned level;
466 pte_t *ptep = NULL;
467
468 pfn = page_to_pfn(page);
469 mfn = get_phys_to_machine(pfn);
470 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
471 return -EINVAL;
472
473 if (!PageHighMem(page)) {
474 address = (unsigned long)__va(pfn << PAGE_SHIFT);
475 ptep = lookup_address(address, &level);
476
477 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
478 "m2p_remove_override: pfn %lx not mapped", pfn))
479 return -EINVAL;
480 }
481
482 spin_lock_irqsave(&m2p_override_lock, flags);
483 list_del(&page->lru);
484 spin_unlock_irqrestore(&m2p_override_lock, flags);
485 __set_phys_to_machine(pfn, page->index);
486
487 if (!PageHighMem(page))
488 set_pte_at(&init_mm, address, ptep,
489 pfn_pte(pfn, PAGE_KERNEL));
490 /* No tlb flush necessary because the caller already
491 * left the pte unmapped. */
492
493 return 0;
494}
495
496struct page *m2p_find_override(unsigned long mfn)
497{
498 unsigned long flags;
499 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
500 struct page *p, *ret;
501
502 ret = NULL;
503
504 spin_lock_irqsave(&m2p_override_lock, flags);
505
506 list_for_each_entry(p, bucket, lru) {
507 if (p->private == mfn) {
508 ret = p;
509 break;
510 }
511 }
512
513 spin_unlock_irqrestore(&m2p_override_lock, flags);
514
515 return ret;
516}
517
518unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
519{
520 struct page *p = m2p_find_override(mfn);
521 unsigned long ret = pfn;
522
523 if (p)
524 ret = page_to_pfn(p);
525
526 return ret;
527}
528EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
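
Besides relocating the existing p2m code, the new file adds the m2p override machinery above (m2p_add_override(), m2p_remove_override(), m2p_find_override() and m2p_find_override_pfn()). It lets code that has mapped a foreign machine frame into a local page, for example a granted page from another domain, make the reverse mfn-to-pfn lookup resolve to that local page for the lifetime of the mapping. A hypothetical sketch of the intended call pattern (the caller and its context are invented; only the helpers themselves come from the patch):

static int example_use_foreign_page(struct page *page, unsigned long foreign_mfn)
{
	int ret;

	/*
	 * Register the override: the p2m entry for this pfn is set to
	 * FOREIGN_FRAME(foreign_mfn) and the page is hashed by mfn.
	 */
	ret = m2p_add_override(foreign_mfn, page);
	if (ret)
		return ret;

	/*
	 * While the override is in place, m2p_find_override_pfn(foreign_mfn, pfn)
	 * returns page_to_pfn(page) instead of the pfn passed in.
	 */

	/* Drop the override and restore the original p2m entry. */
	return m2p_remove_override(page);
}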