Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 24
-rw-r--r--  arch/x86/Kconfig.cpu | 3
-rw-r--r--  arch/x86/boot/compressed/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 4
-rw-r--r--  arch/x86/boot/compressed/mkpiggy.c | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 1832
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 540
-rw-r--r--  arch/x86/include/asm/boot.h | 6
-rw-r--r--  arch/x86/include/asm/debugreg.h | 2
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 12
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/kdebug.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 35
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 100
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 24
-rw-r--r--  arch/x86/include/asm/mach_traps.h | 12
-rw-r--r--  arch/x86/include/asm/nmi.h | 20
-rw-r--r--  arch/x86/include/asm/olpc.h | 10
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt.h | 25
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/percpu.h | 158
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 3
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 10
-rw-r--r--  arch/x86/include/asm/prom.h | 1
-rw-r--r--  arch/x86/include/asm/svm.h | 57
-rw-r--r--  arch/x86/include/asm/traps.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 15
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 35
-rw-r--r--  arch/x86/include/asm/xen/page.h | 16
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 7
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 3
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 4
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 10
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 20
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 30
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 28
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/e820.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 10
-rw-r--r--  arch/x86/kernel/entry_64.S | 39
-rw-r--r--  arch/x86/kernel/ftrace.c | 6
-rw-r--r--  arch/x86/kernel/head_32.S | 2
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 12
-rw-r--r--  arch/x86/kernel/i387.c | 1
-rw-r--r--  arch/x86/kernel/irq.c | 16
-rw-r--r--  arch/x86/kernel/irq_32.c | 4
-rw-r--r--  arch/x86/kernel/kgdb.c | 7
-rw-r--r--  arch/x86/kernel/kprobes.c | 14
-rw-r--r--  arch/x86/kernel/kvm.c | 317
-rw-r--r--  arch/x86/kernel/kvmclock.c | 13
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 34
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 6
-rw-r--r--  arch/x86/kernel/reboot.c | 5
-rw-r--r--  arch/x86/kernel/rtc.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 14
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 102
-rw-r--r--  arch/x86/kernel/tsc.c | 2
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 367
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 22
-rw-r--r--  arch/x86/kvm/lapic.c | 3
-rw-r--r--  arch/x86/kvm/mmu.c | 376
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 39
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 156
-rw-r--r--  arch/x86/kvm/svm.c | 865
-rw-r--r--  arch/x86/kvm/trace.h | 17
-rw-r--r--  arch/x86/kvm/vmx.c | 156
-rw-r--r--  arch/x86/kvm/x86.c | 482
-rw-r--r--  arch/x86/lib/delay.c | 2
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 5
-rw-r--r--  arch/x86/oprofile/nmi_timer_int.c | 2
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 8
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 11
-rw-r--r--  arch/x86/pci/common.c | 41
-rw-r--r--  arch/x86/pci/irq.c | 3
-rw-r--r--  arch/x86/platform/mrst/early_printk_mrst.c | 2
-rw-r--r--  arch/x86/platform/olpc/Makefile | 1
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 101
-rw-r--r--  arch/x86/platform/olpc/olpc_dt.c | 183
-rw-r--r--  arch/x86/platform/olpc/olpc_ofw.c | 5
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 44
-rw-r--r--  arch/x86/xen/mmu.c | 366
-rw-r--r--  arch/x86/xen/multicalls.h | 2
-rw-r--r--  arch/x86/xen/p2m.c | 510
-rw-r--r--  arch/x86/xen/spinlock.c | 8
-rw-r--r--  arch/x86/xen/time.c | 8
111 files changed, 6138 insertions, 1698 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 184bc8872799..3ed5ad92b029 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -51,6 +51,7 @@ config X86
51 select HAVE_KERNEL_GZIP 51 select HAVE_KERNEL_GZIP
52 select HAVE_KERNEL_BZIP2 52 select HAVE_KERNEL_BZIP2
53 select HAVE_KERNEL_LZMA 53 select HAVE_KERNEL_LZMA
54 select HAVE_KERNEL_XZ
54 select HAVE_KERNEL_LZO 55 select HAVE_KERNEL_LZO
55 select HAVE_HW_BREAKPOINT 56 select HAVE_HW_BREAKPOINT
56 select HAVE_MIXED_BREAKPOINTS_REGS 57 select HAVE_MIXED_BREAKPOINTS_REGS
@@ -65,6 +66,7 @@ config X86
65 select HAVE_SPARSE_IRQ 66 select HAVE_SPARSE_IRQ
66 select GENERIC_IRQ_PROBE 67 select GENERIC_IRQ_PROBE
67 select GENERIC_PENDING_IRQ if SMP 68 select GENERIC_PENDING_IRQ if SMP
69 select USE_GENERIC_SMP_HELPERS if SMP
68 70
69config INSTRUCTION_DECODER 71config INSTRUCTION_DECODER
70 def_bool (KPROBES || PERF_EVENTS) 72 def_bool (KPROBES || PERF_EVENTS)
@@ -203,10 +205,6 @@ config HAVE_INTEL_TXT
203 def_bool y 205 def_bool y
204 depends on EXPERIMENTAL && DMAR && ACPI 206 depends on EXPERIMENTAL && DMAR && ACPI
205 207
206config USE_GENERIC_SMP_HELPERS
207 def_bool y
208 depends on SMP
209
210config X86_32_SMP 208config X86_32_SMP
211 def_bool y 209 def_bool y
212 depends on X86_32 && SMP 210 depends on X86_32 && SMP
@@ -1936,13 +1934,19 @@ config PCI_MMCONFIG
1936 depends on X86_64 && PCI && ACPI 1934 depends on X86_64 && PCI && ACPI
1937 1935
1938config PCI_CNB20LE_QUIRK 1936config PCI_CNB20LE_QUIRK
1939 bool "Read CNB20LE Host Bridge Windows" 1937 bool "Read CNB20LE Host Bridge Windows" if EMBEDDED
1940 depends on PCI 1938 default n
1939 depends on PCI && EXPERIMENTAL
1941 help 1940 help
1942 Read the PCI windows out of the CNB20LE host bridge. This allows 1941 Read the PCI windows out of the CNB20LE host bridge. This allows
1943 PCI hotplug to work on systems with the CNB20LE chipset which do 1942 PCI hotplug to work on systems with the CNB20LE chipset which do
1944 not have ACPI. 1943 not have ACPI.
1945 1944
1945 There's no public spec for this chipset, and this functionality
1946 is known to be incomplete.
1947
1948 You should say N unless you know you need this.
1949
1946config DMAR 1950config DMAR
1947 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1951 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1948 depends on PCI_MSI && ACPI && EXPERIMENTAL 1952 depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -2071,7 +2075,7 @@ config OLPC
2071 2075
2072config OLPC_XO1 2076config OLPC_XO1
2073 tristate "OLPC XO-1 support" 2077 tristate "OLPC XO-1 support"
2074 depends on OLPC && PCI 2078 depends on OLPC && MFD_CS5535
2075 ---help--- 2079 ---help---
2076 Add support for non-essential features of the OLPC XO-1 laptop. 2080 Add support for non-essential features of the OLPC XO-1 laptop.
2077 2081
@@ -2079,11 +2083,17 @@ config OLPC_OPENFIRMWARE
2079 bool "Support for OLPC's Open Firmware" 2083 bool "Support for OLPC's Open Firmware"
2080 depends on !X86_64 && !X86_PAE 2084 depends on !X86_64 && !X86_PAE
2081 default n 2085 default n
2086 select OF
2082 help 2087 help
2083 This option adds support for the implementation of Open Firmware 2088 This option adds support for the implementation of Open Firmware
2084 that is used on the OLPC XO-1 Children's Machine. 2089 that is used on the OLPC XO-1 Children's Machine.
2085 If unsure, say N here. 2090 If unsure, say N here.
2086 2091
2092config OLPC_OPENFIRMWARE_DT
2093 bool
2094 default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
2095 select OF_PROMTREE
2096
2087endif # X86_32 2097endif # X86_32
2088 2098
2089config AMD_NB 2099config AMD_NB
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ac9069890cd..15588a0ef466 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
310config X86_CMPXCHG 310config X86_CMPXCHG
311 def_bool X86_64 || (X86_32 && !M386) 311 def_bool X86_64 || (X86_32 && !M386)
312 312
313config CMPXCHG_LOCAL
314 def_bool X86_64 || (X86_32 && !M386)
315
313config X86_L1_CACHE_SHIFT 316config X86_L1_CACHE_SHIFT
314 int 317 int
315 default "7" if MPENTIUM4 || MPSC 318 default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
49 $(call if_changed,bzip2) 49 $(call if_changed,bzip2)
50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE 50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
51 $(call if_changed,lzma) 51 $(call if_changed,lzma)
52$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,xzkern)
52$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE 54$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,lzo) 55 $(call if_changed,lzo)
54 56
55suffix-$(CONFIG_KERNEL_GZIP) := gz 57suffix-$(CONFIG_KERNEL_GZIP) := gz
56suffix-$(CONFIG_KERNEL_BZIP2) := bz2 58suffix-$(CONFIG_KERNEL_BZIP2) := bz2
57suffix-$(CONFIG_KERNEL_LZMA) := lzma 59suffix-$(CONFIG_KERNEL_LZMA) := lzma
60suffix-$(CONFIG_KERNEL_XZ) := xz
58suffix-$(CONFIG_KERNEL_LZO) := lzo 61suffix-$(CONFIG_KERNEL_LZO) := lzo
59 62
60quiet_cmd_mkpiggy = MKPIGGY $@ 63quiet_cmd_mkpiggy = MKPIGGY $@
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 325c05294fc4..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols;
139#include "../../../../lib/decompress_unlzma.c" 139#include "../../../../lib/decompress_unlzma.c"
140#endif 140#endif
141 141
142#ifdef CONFIG_KERNEL_XZ
143#include "../../../../lib/decompress_unxz.c"
144#endif
145
142#ifdef CONFIG_KERNEL_LZO 146#ifdef CONFIG_KERNEL_LZO
143#include "../../../../lib/decompress_unlzo.c" 147#include "../../../../lib/decompress_unlzo.c"
144#endif 148#endif
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..646aa78ba5fd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -74,7 +74,7 @@ int main(int argc, char *argv[])
74 74
75 offs = (olen > ilen) ? olen - ilen : 0; 75 offs = (olen > ilen) ? olen - ilen : 0;
76 offs += olen >> 12; /* Add 8 bytes for each 32K block */ 76 offs += olen >> 12; /* Add 8 bytes for each 32K block */
77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ 77 offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ 78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79 79
80 printf(".section \".rodata..compressed\",\"a\",@progbits\n"); 80 printf(".section \".rodata..compressed\",\"a\",@progbits\n");
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
9 * Vinodh Gopal <vinodh.gopal@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir 10 * Kahraman Akdemir
11 * 11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
12 * This program is free software; you can redistribute it and/or modify 26 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 27 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or 28 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
18#include <linux/linkage.h> 32#include <linux/linkage.h>
19#include <asm/inst.h> 33#include <asm/inst.h>
20 34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
21.text 56.text
22 57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
23#define STATE1 %xmm0 91#define STATE1 %xmm0
24#define STATE2 %xmm4 92#define STATE2 %xmm4
25#define STATE3 %xmm5 93#define STATE3 %xmm5
@@ -32,12 +100,16 @@
32#define IN IN1 100#define IN IN1
33#define KEY %xmm2 101#define KEY %xmm2
34#define IV %xmm3 102#define IV %xmm3
103
35#define BSWAP_MASK %xmm10 104#define BSWAP_MASK %xmm10
36#define CTR %xmm11 105#define CTR %xmm11
37#define INC %xmm12 106#define INC %xmm12
38 107
108#ifdef __x86_64__
109#define AREG %rax
39#define KEYP %rdi 110#define KEYP %rdi
40#define OUTP %rsi 111#define OUTP %rsi
112#define UKEYP OUTP
41#define INP %rdx 113#define INP %rdx
42#define LEN %rcx 114#define LEN %rcx
43#define IVP %r8 115#define IVP %r8
@@ -46,6 +118,1588 @@
46#define TKEYP T1 118#define TKEYP T1
47#define T2 %r11 119#define T2 %r11
48#define TCTR_LOW T2 120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
155 pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
160 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167 # in order to perform
168 # independent shifts
169 pslld $31, \TMP2 # packed left shift <<31
170 pslld $30, \TMP3 # packed left shift <<30
171 pslld $25, \TMP4 # packed left shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182 # in order to perform
183 # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186 psrld $1,\TMP2 # packed right shift >>1
187 psrld $2,\TMP3 # packed right shift >>2
188 psrld $7,\TMP4 # packed right shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
193 pxor \TMP1, \GH # result is in GH
194.endm
195
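For readers of the macro above, the Karatsuba split its comments refer to, writing the 128-bit, bit-reflected inputs as A = a_1 x^{64} + a_0 and B = b_1 x^{64} + b_0 over GF(2), is

    A \cdot B = a_1 b_1 \, x^{128}
              + \left[ (a_1 + a_0)(b_1 + b_0) + a_1 b_1 + a_0 b_0 \right] x^{64}
              + a_0 b_0

so the three PCLMULQDQ invocations supply a_1 b_1, a_0 b_0 and (a_1 + a_0)(b_1 + b_0), and the two "phase" blocks then reduce the 256-bit product modulo g(x) = x^{128} + x^{127} + x^{126} + x^{121} + 1. Because the operands are bit-reflected, the routine effectively returns A*B*x mod poly, as the header comment notes.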
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
205*/
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
264 AESENC \TMP1, %xmm\index # Round 3
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
268 AESENC \TMP1, %xmm\index # Round 4
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
272 AESENC \TMP1, %xmm\index # Round 5
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
276 AESENC \TMP1, %xmm\index # Round 6
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
280 AESENC \TMP1, %xmm\index # Round 7
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
284 AESENC \TMP1, %xmm\index # Round 8
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
288 AESENC \TMP1, %xmm\index # Round 9
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502 AESENC \TMP1, %xmm\index # Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506 AESENC \TMP1, %xmm\index # Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510 AESENC \TMP1, %xmm\index # Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514 AESENC \TMP1, %xmm\index # Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518 AESENC \TMP1, %xmm\index # Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522 AESENC \TMP1, %xmm\index # Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526 AESENC \TMP1, %xmm\index # Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
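Restating the arithmetic from the two macro headers above in C (a hypothetical helper, not part of this patch; it only mirrors the "b = floor(a/16), num_initial_blocks = b mod 4" rule):

/*
 * Number of whole 16-byte blocks handled before the four-blocks-at-a-time
 * main loop: with a = total plaintext bytes and b = floor(a/16) full
 * blocks, b mod 4 blocks are encrypted and GHASHed up front.
 */
static unsigned int num_initial_blocks(unsigned long plaintext_len)
{
	unsigned long full_blocks = plaintext_len / 16;	/* b = floor(a/16) */

	return full_blocks % 4;				/* 0, 1, 2 or 3 */
}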
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed left shift << 31
833 pslld $30, \TMP3 # packed left shift << 30
834 pslld $25, \TMP4 # packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847 psrld $1, \TMP2 # packed right shift >>1
848 psrld $2, \TMP3 # packed right shift >>2
849 psrld $7, \TMP4 # packed right shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854 pxor \TMP1, \XMM5 # result is in XMM5
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028 pslld $31, \TMP2 # packed left shift << 31
1029 pslld $30, \TMP3 # packed left shift << 30
1030 pslld $25, \TMP4 # packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
1043 psrld $1, \TMP2 # packed right shift >>1
1044 psrld $2, \TMP3 # packed right shift >>2
1045 psrld $7, \TMP4 # packed right shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
1050 pxor \TMP1, \XMM5 # result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059 # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072 # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087 # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101 # Multiply TMP1 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127 pslld $31, \TMP2 # packed left shifting << 31
1128 pslld $30, \TMP3 # packed left shifting << 30
1129 pslld $25, \TMP4 # packed left shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142 psrld $1, \TMP2 # packed right shift >> 1
1143 psrld $2, \TMP3 # packed right shift >> 2
1144 psrld $7, \TMP4 # packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
1151
1152/* Encryption of a single block */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349 # Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1364 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
1397	mov arg8, %r12 # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
1400 shl $3, %arg4 # len(C) in bits (*128)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
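
A note on the tail above: once fewer than 16 bytes of ciphertext remain, the
_zero_cipher_left_decrypt path encrypts one more counter block E(K, Yn) and
uses the SHIFT_MASK/ALL_F tables to shift and mask so that only the valid
bytes are produced (the masked ciphertext block is also folded into GHASH).
On the data path this amounts to the following sketch; it is an illustration
only, not kernel code, and the helper name is made up:

/*
 * Illustration of the _zero_cipher_left_decrypt data path above: XOR the
 * remaining src bytes with the keystream block E(K, Yn) and emit only len
 * bytes. The GHASH update on the masked block is omitted here.
 */
static void gcm_final_partial_block(unsigned char *dst, const unsigned char *src,
				    const unsigned char keystream[16],
				    unsigned int len /* 1..15 */)
{
	unsigned int i;

	for (i = 0; i < len; i++)
		dst[i] = src[i] ^ keystream[i];
}
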
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. we are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code also supports 16, but it will fail for other sizes.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
1558 movdqa %xmm13, HashKey(%rsp)
1559 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
1560 and $-16, %r13
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610	# Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1616 sub $16, %r11
1617 add %r13, %r11
1618 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1619 lea SHIFT_MASK+16(%rip), %r12
1620 sub %r13, %r12
1621 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1622 # (%r13 is the number of bytes in plaintext mod 16)
1623 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1624	PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
1625 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1626 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1627 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1628 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1629 movdqa SHUF_MASK(%rip), %xmm10
1630 PSHUFB_XMM %xmm10,%xmm0
1631
1632 pxor %xmm0, %xmm8
1633 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1634 # GHASH computation for the last <16 byte block
1635 sub %r13, %r11
1636 add $16, %r11
1637 PSHUFB_XMM %xmm10, %xmm1
1638
1639 # shuffle xmm0 back to output as ciphertext
1640
1641 # Output %r13 bytes
1642 MOVQ_R64_XMM %xmm0, %rax
1643 cmp $8, %r13
1644 jle _less_than_8_bytes_left_encrypt
1645 mov %rax, (%arg2 , %r11, 1)
1646 add $8, %r11
1647 psrldq $8, %xmm0
1648 MOVQ_R64_XMM %xmm0, %rax
1649 sub $8, %r13
1650_less_than_8_bytes_left_encrypt:
1651 mov %al, (%arg2, %r11, 1)
1652 add $1, %r11
1653 shr $8, %rax
1654 sub $1, %r13
1655 jne _less_than_8_bytes_left_encrypt
1656_multiple_of_16_bytes_encrypt:
1657	mov arg8, %r12 # %r12 = aadLen (number of bytes)
1658 shl $3, %r12
1659 movd %r12d, %xmm15 # len(A) in %xmm15
1660 shl $3, %arg4 # len(C) in bits (*128)
1661 MOVQ_R64_XMM %arg4, %xmm1
1662 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1663 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1664 pxor %xmm15, %xmm8
1665 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1666 # final GHASH computation
1667 movdqa SHUF_MASK(%rip), %xmm10
1668 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1669
1670 mov %arg5, %rax # %rax = *Y0
1671 movdqu (%rax), %xmm0 # %xmm0 = Y0
1672 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1673 pxor %xmm8, %xmm0
1674_return_T_encrypt:
1675 mov arg9, %r10 # %r10 = authTag
1676 mov arg10, %r11 # %r11 = auth_tag_len
1677 cmp $16, %r11
1678 je _T_16_encrypt
1679 cmp $12, %r11
1680 je _T_12_encrypt
1681_T_8_encrypt:
1682 MOVQ_R64_XMM %xmm0, %rax
1683 mov %rax, (%r10)
1684 jmp _return_T_done_encrypt
1685_T_12_encrypt:
1686 MOVQ_R64_XMM %xmm0, %rax
1687 mov %rax, (%r10)
1688 psrldq $8, %xmm0
1689 movd %xmm0, %eax
1690 mov %eax, 8(%r10)
1691 jmp _return_T_done_encrypt
1692_T_16_encrypt:
1693 movdqu %xmm0, (%r10)
1694_return_T_done_encrypt:
1695 mov %r14, %rsp
1696 pop %r14
1697 pop %r13
1698 pop %r12
1699 ret
1700
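
The _multiple_of_16_bytes_encrypt tail just above (and its decrypt twin) folds
len(A) || len(C) into the GHASH state before the tag is produced, using
shl $3, movd/MOVQ_R64_XMM, pslldq and pxor. At the spec level that 16-byte
length block looks like the sketch below; the helper is illustrative only and
not part of this patch:

/*
 * Illustrative sketch: the final GHASH input block, i.e. the AAD length and
 * the text length, each expressed in bits as a 64-bit big-endian value.
 * The byte swap done with SHUF_MASK in the assembly corresponds to the
 * big-endian byte order used here.
 */
#include <stdint.h>

static void gcm_length_block(uint8_t out[16], uint64_t aad_bytes,
			     uint64_t text_bytes)
{
	uint64_t abits = aad_bytes * 8;		/* shl $3 in the assembly */
	uint64_t cbits = text_bytes * 8;
	int i;

	for (i = 0; i < 8; i++) {
		out[7 - i]  = (uint8_t)(abits >> (8 * i));	/* len(A) */
		out[15 - i] = (uint8_t)(cbits >> (8 * i));	/* len(C) */
	}
}
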
1701#endif
1702
49 1703
50_key_expansion_128: 1704_key_expansion_128:
51_key_expansion_256a: 1705_key_expansion_256a:
@@ -55,10 +1709,11 @@ _key_expansion_256a:
55 shufps $0b10001100, %xmm0, %xmm4 1709 shufps $0b10001100, %xmm0, %xmm4
56 pxor %xmm4, %xmm0 1710 pxor %xmm4, %xmm0
57 pxor %xmm1, %xmm0 1711 pxor %xmm1, %xmm0
58 movaps %xmm0, (%rcx) 1712 movaps %xmm0, (TKEYP)
59 add $0x10, %rcx 1713 add $0x10, TKEYP
60 ret 1714 ret
61 1715
1716.align 4
62_key_expansion_192a: 1717_key_expansion_192a:
63 pshufd $0b01010101, %xmm1, %xmm1 1718 pshufd $0b01010101, %xmm1, %xmm1
64 shufps $0b00010000, %xmm0, %xmm4 1719 shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@ _key_expansion_192a:
76 1731
77 movaps %xmm0, %xmm1 1732 movaps %xmm0, %xmm1
78 shufps $0b01000100, %xmm0, %xmm6 1733 shufps $0b01000100, %xmm0, %xmm6
79 movaps %xmm6, (%rcx) 1734 movaps %xmm6, (TKEYP)
80 shufps $0b01001110, %xmm2, %xmm1 1735 shufps $0b01001110, %xmm2, %xmm1
81 movaps %xmm1, 16(%rcx) 1736 movaps %xmm1, 0x10(TKEYP)
82 add $0x20, %rcx 1737 add $0x20, TKEYP
83 ret 1738 ret
84 1739
1740.align 4
85_key_expansion_192b: 1741_key_expansion_192b:
86 pshufd $0b01010101, %xmm1, %xmm1 1742 pshufd $0b01010101, %xmm1, %xmm1
87 shufps $0b00010000, %xmm0, %xmm4 1743 shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@ _key_expansion_192b:
96 pxor %xmm3, %xmm2 1752 pxor %xmm3, %xmm2
97 pxor %xmm5, %xmm2 1753 pxor %xmm5, %xmm2
98 1754
99 movaps %xmm0, (%rcx) 1755 movaps %xmm0, (TKEYP)
100 add $0x10, %rcx 1756 add $0x10, TKEYP
101 ret 1757 ret
102 1758
1759.align 4
103_key_expansion_256b: 1760_key_expansion_256b:
104 pshufd $0b10101010, %xmm1, %xmm1 1761 pshufd $0b10101010, %xmm1, %xmm1
105 shufps $0b00010000, %xmm2, %xmm4 1762 shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@ _key_expansion_256b:
107 shufps $0b10001100, %xmm2, %xmm4 1764 shufps $0b10001100, %xmm2, %xmm4
108 pxor %xmm4, %xmm2 1765 pxor %xmm4, %xmm2
109 pxor %xmm1, %xmm2 1766 pxor %xmm1, %xmm2
110 movaps %xmm2, (%rcx) 1767 movaps %xmm2, (TKEYP)
111 add $0x10, %rcx 1768 add $0x10, TKEYP
112 ret 1769 ret
113 1770
114/* 1771/*
@@ -116,17 +1773,23 @@ _key_expansion_256b:
116 * unsigned int key_len) 1773 * unsigned int key_len)
117 */ 1774 */
118ENTRY(aesni_set_key) 1775ENTRY(aesni_set_key)
119 movups (%rsi), %xmm0 # user key (first 16 bytes) 1776#ifndef __x86_64__
120 movaps %xmm0, (%rdi) 1777 pushl KEYP
121 lea 0x10(%rdi), %rcx # key addr 1778 movl 8(%esp), KEYP # ctx
122 movl %edx, 480(%rdi) 1779 movl 12(%esp), UKEYP # in_key
1780 movl 16(%esp), %edx # key_len
1781#endif
1782 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1783 movaps %xmm0, (KEYP)
1784 lea 0x10(KEYP), TKEYP # key addr
1785 movl %edx, 480(KEYP)
123 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1786 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
124 cmp $24, %dl 1787 cmp $24, %dl
125 jb .Lenc_key128 1788 jb .Lenc_key128
126 je .Lenc_key192 1789 je .Lenc_key192
127 movups 0x10(%rsi), %xmm2 # other user key 1790 movups 0x10(UKEYP), %xmm2 # other user key
128 movaps %xmm2, (%rcx) 1791 movaps %xmm2, (TKEYP)
129 add $0x10, %rcx 1792 add $0x10, TKEYP
130 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1793 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
131 call _key_expansion_256a 1794 call _key_expansion_256a
132 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1795 AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key)
155 call _key_expansion_256a 1818 call _key_expansion_256a
156 jmp .Ldec_key 1819 jmp .Ldec_key
157.Lenc_key192: 1820.Lenc_key192:
158 movq 0x10(%rsi), %xmm2 # other user key 1821 movq 0x10(UKEYP), %xmm2 # other user key
159 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1822 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
160 call _key_expansion_192a 1823 call _key_expansion_192a
161 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1824 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key)
195 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1858 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
196 call _key_expansion_128 1859 call _key_expansion_128
197.Ldec_key: 1860.Ldec_key:
198 sub $0x10, %rcx 1861 sub $0x10, TKEYP
199 movaps (%rdi), %xmm0 1862 movaps (KEYP), %xmm0
200 movaps (%rcx), %xmm1 1863 movaps (TKEYP), %xmm1
201 movaps %xmm0, 240(%rcx) 1864 movaps %xmm0, 240(TKEYP)
202 movaps %xmm1, 240(%rdi) 1865 movaps %xmm1, 240(KEYP)
203 add $0x10, %rdi 1866 add $0x10, KEYP
204 lea 240-16(%rcx), %rsi 1867 lea 240-16(TKEYP), UKEYP
205.align 4 1868.align 4
206.Ldec_key_loop: 1869.Ldec_key_loop:
207 movaps (%rdi), %xmm0 1870 movaps (KEYP), %xmm0
208 AESIMC %xmm0 %xmm1 1871 AESIMC %xmm0 %xmm1
209 movaps %xmm1, (%rsi) 1872 movaps %xmm1, (UKEYP)
210 add $0x10, %rdi 1873 add $0x10, KEYP
211 sub $0x10, %rsi 1874 sub $0x10, UKEYP
212 cmp %rcx, %rdi 1875 cmp TKEYP, KEYP
213 jb .Ldec_key_loop 1876 jb .Ldec_key_loop
214 xor %rax, %rax 1877 xor AREG, AREG
1878#ifndef __x86_64__
1879 popl KEYP
1880#endif
215 ret 1881 ret
216 1882
217/* 1883/*
218 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1884 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
219 */ 1885 */
220ENTRY(aesni_enc) 1886ENTRY(aesni_enc)
1887#ifndef __x86_64__
1888 pushl KEYP
1889 pushl KLEN
1890 movl 12(%esp), KEYP
1891 movl 16(%esp), OUTP
1892 movl 20(%esp), INP
1893#endif
221 movl 480(KEYP), KLEN # key length 1894 movl 480(KEYP), KLEN # key length
222 movups (INP), STATE # input 1895 movups (INP), STATE # input
223 call _aesni_enc1 1896 call _aesni_enc1
224 movups STATE, (OUTP) # output 1897 movups STATE, (OUTP) # output
1898#ifndef __x86_64__
1899 popl KLEN
1900 popl KEYP
1901#endif
225 ret 1902 ret
226 1903
227/* 1904/*
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc)
236 * KEY 1913 * KEY
237 * TKEYP (T1) 1914 * TKEYP (T1)
238 */ 1915 */
1916.align 4
239_aesni_enc1: 1917_aesni_enc1:
240 movaps (KEYP), KEY # key 1918 movaps (KEYP), KEY # key
241 mov KEYP, TKEYP 1919 mov KEYP, TKEYP
@@ -298,6 +1976,7 @@ _aesni_enc1:
298 * KEY 1976 * KEY
299 * TKEYP (T1) 1977 * TKEYP (T1)
300 */ 1978 */
1979.align 4
301_aesni_enc4: 1980_aesni_enc4:
302 movaps (KEYP), KEY # key 1981 movaps (KEYP), KEY # key
303 mov KEYP, TKEYP 1982 mov KEYP, TKEYP
@@ -391,11 +2070,22 @@ _aesni_enc4:
391 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2070 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
392 */ 2071 */
393ENTRY(aesni_dec) 2072ENTRY(aesni_dec)
2073#ifndef __x86_64__
2074 pushl KEYP
2075 pushl KLEN
2076 movl 12(%esp), KEYP
2077 movl 16(%esp), OUTP
2078 movl 20(%esp), INP
2079#endif
394 mov 480(KEYP), KLEN # key length 2080 mov 480(KEYP), KLEN # key length
395 add $240, KEYP 2081 add $240, KEYP
396 movups (INP), STATE # input 2082 movups (INP), STATE # input
397 call _aesni_dec1 2083 call _aesni_dec1
398 movups STATE, (OUTP) #output 2084 movups STATE, (OUTP) #output
2085#ifndef __x86_64__
2086 popl KLEN
2087 popl KEYP
2088#endif
399 ret 2089 ret
400 2090
401/* 2091/*
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec)
410 * KEY 2100 * KEY
411 * TKEYP (T1) 2101 * TKEYP (T1)
412 */ 2102 */
2103.align 4
413_aesni_dec1: 2104_aesni_dec1:
414 movaps (KEYP), KEY # key 2105 movaps (KEYP), KEY # key
415 mov KEYP, TKEYP 2106 mov KEYP, TKEYP
@@ -472,6 +2163,7 @@ _aesni_dec1:
472 * KEY 2163 * KEY
473 * TKEYP (T1) 2164 * TKEYP (T1)
474 */ 2165 */
2166.align 4
475_aesni_dec4: 2167_aesni_dec4:
476 movaps (KEYP), KEY # key 2168 movaps (KEYP), KEY # key
477 mov KEYP, TKEYP 2169 mov KEYP, TKEYP
@@ -566,6 +2258,15 @@ _aesni_dec4:
566 * size_t len) 2258 * size_t len)
567 */ 2259 */
568ENTRY(aesni_ecb_enc) 2260ENTRY(aesni_ecb_enc)
2261#ifndef __x86_64__
2262 pushl LEN
2263 pushl KEYP
2264 pushl KLEN
2265 movl 16(%esp), KEYP
2266 movl 20(%esp), OUTP
2267 movl 24(%esp), INP
2268 movl 28(%esp), LEN
2269#endif
569 test LEN, LEN # check length 2270 test LEN, LEN # check length
570 jz .Lecb_enc_ret 2271 jz .Lecb_enc_ret
571 mov 480(KEYP), KLEN 2272 mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc)
602 cmp $16, LEN 2303 cmp $16, LEN
603 jge .Lecb_enc_loop1 2304 jge .Lecb_enc_loop1
604.Lecb_enc_ret: 2305.Lecb_enc_ret:
2306#ifndef __x86_64__
2307 popl KLEN
2308 popl KEYP
2309 popl LEN
2310#endif
605 ret 2311 ret
606 2312
607/* 2313/*
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc)
609 * size_t len); 2315 * size_t len);
610 */ 2316 */
611ENTRY(aesni_ecb_dec) 2317ENTRY(aesni_ecb_dec)
2318#ifndef __x86_64__
2319 pushl LEN
2320 pushl KEYP
2321 pushl KLEN
2322 movl 16(%esp), KEYP
2323 movl 20(%esp), OUTP
2324 movl 24(%esp), INP
2325 movl 28(%esp), LEN
2326#endif
612 test LEN, LEN 2327 test LEN, LEN
613 jz .Lecb_dec_ret 2328 jz .Lecb_dec_ret
614 mov 480(KEYP), KLEN 2329 mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec)
646 cmp $16, LEN 2361 cmp $16, LEN
647 jge .Lecb_dec_loop1 2362 jge .Lecb_dec_loop1
648.Lecb_dec_ret: 2363.Lecb_dec_ret:
2364#ifndef __x86_64__
2365 popl KLEN
2366 popl KEYP
2367 popl LEN
2368#endif
649 ret 2369 ret
650 2370
651/* 2371/*
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec)
653 * size_t len, u8 *iv) 2373 * size_t len, u8 *iv)
654 */ 2374 */
655ENTRY(aesni_cbc_enc) 2375ENTRY(aesni_cbc_enc)
2376#ifndef __x86_64__
2377 pushl IVP
2378 pushl LEN
2379 pushl KEYP
2380 pushl KLEN
2381 movl 20(%esp), KEYP
2382 movl 24(%esp), OUTP
2383 movl 28(%esp), INP
2384 movl 32(%esp), LEN
2385 movl 36(%esp), IVP
2386#endif
656 cmp $16, LEN 2387 cmp $16, LEN
657 jb .Lcbc_enc_ret 2388 jb .Lcbc_enc_ret
658 mov 480(KEYP), KLEN 2389 mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc)
670 jge .Lcbc_enc_loop 2401 jge .Lcbc_enc_loop
671 movups STATE, (IVP) 2402 movups STATE, (IVP)
672.Lcbc_enc_ret: 2403.Lcbc_enc_ret:
2404#ifndef __x86_64__
2405 popl KLEN
2406 popl KEYP
2407 popl LEN
2408 popl IVP
2409#endif
673 ret 2410 ret
674 2411
675/* 2412/*
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc)
677 * size_t len, u8 *iv) 2414 * size_t len, u8 *iv)
678 */ 2415 */
679ENTRY(aesni_cbc_dec) 2416ENTRY(aesni_cbc_dec)
2417#ifndef __x86_64__
2418 pushl IVP
2419 pushl LEN
2420 pushl KEYP
2421 pushl KLEN
2422 movl 20(%esp), KEYP
2423 movl 24(%esp), OUTP
2424 movl 28(%esp), INP
2425 movl 32(%esp), LEN
2426 movl 36(%esp), IVP
2427#endif
680 cmp $16, LEN 2428 cmp $16, LEN
681 jb .Lcbc_dec_just_ret 2429 jb .Lcbc_dec_just_ret
682 mov 480(KEYP), KLEN 2430 mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec)
690 movaps IN1, STATE1 2438 movaps IN1, STATE1
691 movups 0x10(INP), IN2 2439 movups 0x10(INP), IN2
692 movaps IN2, STATE2 2440 movaps IN2, STATE2
2441#ifdef __x86_64__
693 movups 0x20(INP), IN3 2442 movups 0x20(INP), IN3
694 movaps IN3, STATE3 2443 movaps IN3, STATE3
695 movups 0x30(INP), IN4 2444 movups 0x30(INP), IN4
696 movaps IN4, STATE4 2445 movaps IN4, STATE4
2446#else
2447 movups 0x20(INP), IN1
2448 movaps IN1, STATE3
2449 movups 0x30(INP), IN2
2450 movaps IN2, STATE4
2451#endif
697 call _aesni_dec4 2452 call _aesni_dec4
698 pxor IV, STATE1 2453 pxor IV, STATE1
2454#ifdef __x86_64__
699 pxor IN1, STATE2 2455 pxor IN1, STATE2
700 pxor IN2, STATE3 2456 pxor IN2, STATE3
701 pxor IN3, STATE4 2457 pxor IN3, STATE4
702 movaps IN4, IV 2458 movaps IN4, IV
2459#else
2460 pxor (INP), STATE2
2461 pxor 0x10(INP), STATE3
2462 pxor IN1, STATE4
2463 movaps IN2, IV
2464#endif
703 movups STATE1, (OUTP) 2465 movups STATE1, (OUTP)
704 movups STATE2, 0x10(OUTP) 2466 movups STATE2, 0x10(OUTP)
705 movups STATE3, 0x20(OUTP) 2467 movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec)
727.Lcbc_dec_ret: 2489.Lcbc_dec_ret:
728 movups IV, (IVP) 2490 movups IV, (IVP)
729.Lcbc_dec_just_ret: 2491.Lcbc_dec_just_ret:
2492#ifndef __x86_64__
2493 popl KLEN
2494 popl KEYP
2495 popl LEN
2496 popl IVP
2497#endif
730 ret 2498 ret
731 2499
2500#ifdef __x86_64__
732.align 16 2501.align 16
733.Lbswap_mask: 2502.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2503 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec)
744 * INC: == 1, in little endian 2513 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask 2514 * BSWAP_MASK == endian swapping mask
746 */ 2515 */
2516.align 4
747_aesni_inc_init: 2517_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK 2518 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR 2519 movaps IV, CTR
@@ -768,6 +2538,7 @@ _aesni_inc_init:
768 * CTR: == output IV, in little endian 2538 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR 2539 * TCTR_LOW: == lower qword of CTR
770 */ 2540 */
2541.align 4
771_aesni_inc: 2542_aesni_inc:
772 paddq INC, CTR 2543 paddq INC, CTR
773 add $1, TCTR_LOW 2544 add $1, TCTR_LOW
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc)
839 movups IV, (IVP) 2610 movups IV, (IVP)
840.Lctr_enc_just_ret: 2611.Lctr_enc_just_ret:
841 ret 2612 ret
2613#endif
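
For reference, the iv argument documented above ENTRY(aesni_gcm_enc) and
ENTRY(aesni_gcm_dec) is the GCM pre-counter block j0: the 4-byte salt from the
Security Association, the 8-byte IV from the ESP payload, then the big-endian
32-bit value 1. A minimal sketch of that layout follows; the helper name is
hypothetical, and the glue code below builds the same block inline from
ctx->nonce, req->iv and a counter of cpu_to_be32(1):

/*
 * Sketch only: assemble the 16-byte pre-counter block j0 exactly as the
 * comment blocks above describe it.
 */
#include <string.h>

static void rfc4106_build_j0(unsigned char j0[16], const unsigned char salt[4],
			     const unsigned char esp_iv[8])
{
	memcpy(j0, salt, 4);		/* 4-byte salt from the SA */
	memcpy(j0 + 4, esp_iv, 8);	/* 8-byte IV from the ESP payload */
	j0[12] = 0x00;			/* trailing 32-bit counter ...  */
	j0[13] = 0x00;			/* ... 0x00000001, big-endian   */
	j0[14] = 0x00;
	j0[15] = 0x01;
}
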
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..e1e60c7d5813 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
5 * Copyright (C) 2008, Intel Corp. 5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com> 6 * Author: Huang Ying <ying.huang@intel.com>
7 * 7 *
8 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
9 * interface for 64-bit kernels.
10 * Authors: Adrian Hoban <adrian.hoban@intel.com>
11 * Gabriele Paoloni <gabriele.paoloni@intel.com>
12 * Tadeusz Struk (tadeusz.struk@intel.com)
13 * Aidan O'Mahony (aidan.o.mahony@intel.com)
14 * Copyright (c) 2010, Intel Corporation.
15 *
8 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
21#include <crypto/ctr.h> 29#include <crypto/ctr.h>
22#include <asm/i387.h> 30#include <asm/i387.h>
23#include <asm/aes.h> 31#include <asm/aes.h>
32#include <crypto/scatterwalk.h>
33#include <crypto/internal/aead.h>
34#include <linux/workqueue.h>
35#include <linux/spinlock.h>
24 36
25#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) 37#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
26#define HAS_CTR 38#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
42 struct cryptd_ablkcipher *cryptd_tfm; 54 struct cryptd_ablkcipher *cryptd_tfm;
43}; 55};
44 56
45#define AESNI_ALIGN 16 57/* This data is stored at the end of the crypto_tfm struct.
58 * It's a type of per "session" data storage location.
59 * This needs to be 16 byte aligned.
60 */
61struct aesni_rfc4106_gcm_ctx {
62 u8 hash_subkey[16];
63 struct crypto_aes_ctx aes_key_expanded;
64 u8 nonce[4];
65 struct cryptd_aead *cryptd_tfm;
66};
67
68struct aesni_gcm_set_hash_subkey_result {
69 int err;
70 struct completion completion;
71};
72
73struct aesni_hash_subkey_req_data {
74 u8 iv[16];
75 struct aesni_gcm_set_hash_subkey_result result;
76 struct scatterlist sg;
77};
78
79#define AESNI_ALIGN (16)
46#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
81#define RFC4106_HASH_SUBKEY_SIZE 16
47 82
48asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
49 unsigned int key_len); 84 unsigned int key_len);
@@ -59,9 +94,62 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
59 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
61 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97#ifdef CONFIG_X86_64
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 98asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv); 99 const u8 *in, unsigned int len, u8 *iv);
64 100
101/* asmlinkage void aesni_gcm_enc()
102 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
103 * u8 *out, Ciphertext output. Encrypt in-place is allowed.
104 * const u8 *in, Plaintext input
105 * unsigned long plaintext_len, Length of data in bytes for encryption.
106 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
107 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
108 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
109 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
110 * const u8 *aad, Additional Authentication Data (AAD)
111 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
112 * is going to be 8 or 12 bytes
113 * u8 *auth_tag, Authenticated Tag output.
114 * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
115 * Valid values are 16 (most likely), 12 or 8.
116 */
117asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
118 const u8 *in, unsigned long plaintext_len, u8 *iv,
119 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
120 u8 *auth_tag, unsigned long auth_tag_len);
121
122/* asmlinkage void aesni_gcm_dec()
123 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
124 * u8 *out, Plaintext output. Decrypt in-place is allowed.
125 * const u8 *in, Ciphertext input
126 * unsigned long ciphertext_len, Length of data in bytes for decryption.
127 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
128 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
129 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
130 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
131 * const u8 *aad, Additional Authentication Data (AAD)
132 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
133 * to be 8 or 12 bytes
134 * u8 *auth_tag, Authenticated Tag output.
135 * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
136 * Valid values are 16 (most likely), 12 or 8.
137 */
138asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
139 const u8 *in, unsigned long ciphertext_len, u8 *iv,
140 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
141 u8 *auth_tag, unsigned long auth_tag_len);
142
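
Both entry points operate on contiguous buffers and clobber %xmm state, so a
direct caller has to own the FPU; in this driver that is arranged by
rfc4106_encrypt()/rfc4106_decrypt() further down, which wrap the __driver_*
helpers in kernel_fpu_begin()/kernel_fpu_end(). A hedged sketch of a direct
call, with purely illustrative buffer and parameter names:

/*
 * Hedged sketch, not part of the driver: a direct aesni_gcm_enc() call on
 * contiguous buffers with a full 16-byte ICV. j0 is the pre-counter block
 * described above; hash_subkey is H as produced by rfc4106_set_hash_subkey().
 */
static void gcm_enc_direct_sketch(struct crypto_aes_ctx *key_sched, u8 *dst,
				  const u8 *src, unsigned long len, u8 *j0,
				  u8 *hash_subkey, const u8 *aad,
				  unsigned long aad_len, u8 *tag)
{
	kernel_fpu_begin();
	aesni_gcm_enc(key_sched, dst, src, len, j0, hash_subkey,
		      aad, aad_len, tag, 16);
	kernel_fpu_end();
}
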
143static inline struct
144aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
145{
146 return
147 (struct aesni_rfc4106_gcm_ctx *)
148 PTR_ALIGN((u8 *)
149 crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
150}
151#endif
152
65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 153static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
66{ 154{
67 unsigned long addr = (unsigned long)raw_ctx; 155 unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = {
324 }, 412 },
325}; 413};
326 414
415#ifdef CONFIG_X86_64
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx, 416static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk) 417 struct blkcipher_walk *walk)
329{ 418{
@@ -389,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = {
389 }, 478 },
390 }, 479 },
391}; 480};
481#endif
392 482
393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 483static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
394 unsigned int key_len) 484 unsigned int key_len)
@@ -536,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = {
536 }, 626 },
537}; 627};
538 628
629#ifdef CONFIG_X86_64
539static int ablk_ctr_init(struct crypto_tfm *tfm) 630static int ablk_ctr_init(struct crypto_tfm *tfm)
540{ 631{
541 struct cryptd_ablkcipher *cryptd_tfm; 632 struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
612 }, 703 },
613}; 704};
614#endif 705#endif
706#endif
615 707
616#ifdef HAS_LRW 708#ifdef HAS_LRW
617static int ablk_lrw_init(struct crypto_tfm *tfm) 709static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +822,424 @@ static struct crypto_alg ablk_xts_alg = {
730}; 822};
731#endif 823#endif
732 824
825#ifdef CONFIG_X86_64
826static int rfc4106_init(struct crypto_tfm *tfm)
827{
828 struct cryptd_aead *cryptd_tfm;
829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
831 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
832 if (IS_ERR(cryptd_tfm))
833 return PTR_ERR(cryptd_tfm);
834 ctx->cryptd_tfm = cryptd_tfm;
835 tfm->crt_aead.reqsize = sizeof(struct aead_request)
836 + crypto_aead_reqsize(&cryptd_tfm->base);
837 return 0;
838}
839
840static void rfc4106_exit(struct crypto_tfm *tfm)
841{
842 struct aesni_rfc4106_gcm_ctx *ctx =
843 (struct aesni_rfc4106_gcm_ctx *)
844 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
845 if (!IS_ERR(ctx->cryptd_tfm))
846 cryptd_free_aead(ctx->cryptd_tfm);
847 return;
848}
849
850static void
851rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
852{
853 struct aesni_gcm_set_hash_subkey_result *result = req->data;
854
855 if (err == -EINPROGRESS)
856 return;
857 result->err = err;
858 complete(&result->completion);
859}
860
861static int
862rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
863{
864 struct crypto_ablkcipher *ctr_tfm;
865 struct ablkcipher_request *req;
866 int ret = -EINVAL;
867 struct aesni_hash_subkey_req_data *req_data;
868
869 ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
870 if (IS_ERR(ctr_tfm))
871 return PTR_ERR(ctr_tfm);
872
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) {
877 crypto_free_ablkcipher(ctr_tfm);
878 return ret;
879 }
880
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) {
883 crypto_free_ablkcipher(ctr_tfm);
884 return -EINVAL;
885 }
886
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) {
889 crypto_free_ablkcipher(ctr_tfm);
890 return -ENOMEM;
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv));
893
894 /* Clear the data in the hash sub key container to zero.*/
895 /* We want to cipher all zeros to create the hash sub key. */
896 memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
897
898 init_completion(&req_data->result.completion);
899 sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
900 ablkcipher_request_set_tfm(req, ctr_tfm);
901 ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
902 CRYPTO_TFM_REQ_MAY_BACKLOG,
903 rfc4106_set_hash_subkey_done,
904 &req_data->result);
905
906 ablkcipher_request_set_crypt(req, &req_data->sg,
907 &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
908
909 ret = crypto_ablkcipher_encrypt(req);
910 if (ret == -EINPROGRESS || ret == -EBUSY) {
911 ret = wait_for_completion_interruptible
912 (&req_data->result.completion);
913 if (!ret)
914 ret = req_data->result.err;
915 }
916 ablkcipher_request_free(req);
917 kfree(req_data);
918 crypto_free_ablkcipher(ctr_tfm);
919 return ret;
920}
921
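
The routine above derives the GHASH subkey by running ctr(aes) over a zeroed
16-byte buffer with an all-zero IV, which is exactly one block-cipher call on
the zero block, i.e. H = E_K(0^128). A conceptual sketch (not the code path
used here; kernel_fpu_begin()/kernel_fpu_end() handling is omitted):

/*
 * Conceptual sketch only: the value computed by rfc4106_set_hash_subkey()
 * is H = E_K(0^128), shown here as a single block encryption with the
 * already-expanded key.
 */
static void derive_hash_subkey_sketch(struct crypto_aes_ctx *ctx,
				      u8 hash_subkey[16])
{
	static const u8 zeroes[16];		/* all-zero input block */

	aesni_enc(ctx, hash_subkey, zeroes);	/* H = E_K(0^128) */
}
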
922static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
923 unsigned int key_len)
924{
925 int ret = 0;
926 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
927 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
928 u8 *new_key_mem = NULL;
929
930 if (key_len < 4) {
931 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
932 return -EINVAL;
933 }
934 /*Account for 4 byte nonce at the end.*/
935 key_len -= 4;
936 if (key_len != AES_KEYSIZE_128) {
937 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
938 return -EINVAL;
939 }
940
941 memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
942 /*This must be on a 16 byte boundary!*/
943 if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
944 return -EINVAL;
945
946 if ((unsigned long)key % AESNI_ALIGN) {
947		/* key is not aligned: use an auxiliary aligned pointer */
948 new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
949 if (!new_key_mem)
950 return -ENOMEM;
951
952 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
953 memcpy(new_key_mem, key, key_len);
954 key = new_key_mem;
955 }
956
957 if (!irq_fpu_usable())
958 ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
959 key, key_len);
960 else {
961 kernel_fpu_begin();
962 ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
963 kernel_fpu_end();
964 }
965 /*This must be on a 16 byte boundary!*/
966 if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
967 ret = -EINVAL;
968 goto exit;
969 }
970 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
971exit:
972 kfree(new_key_mem);
973 return ret;
974}
975
976/* This is the Integrity Check Value (aka the authentication tag) length and can
977 * be 8, 12 or 16 bytes long. */
978static int rfc4106_set_authsize(struct crypto_aead *parent,
979 unsigned int authsize)
980{
981 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
982 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
983
984 switch (authsize) {
985 case 8:
986 case 12:
987 case 16:
988 break;
989 default:
990 return -EINVAL;
991 }
992 crypto_aead_crt(parent)->authsize = authsize;
993 crypto_aead_crt(cryptd_child)->authsize = authsize;
994 return 0;
995}
996
997static int rfc4106_encrypt(struct aead_request *req)
998{
999 int ret;
1000 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1001 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1002 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1003
1004 if (!irq_fpu_usable()) {
1005 struct aead_request *cryptd_req =
1006 (struct aead_request *) aead_request_ctx(req);
1007 memcpy(cryptd_req, req, sizeof(*req));
1008 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1009 return crypto_aead_encrypt(cryptd_req);
1010 } else {
1011 kernel_fpu_begin();
1012 ret = cryptd_child->base.crt_aead.encrypt(req);
1013 kernel_fpu_end();
1014 return ret;
1015 }
1016}
1017
1018static int rfc4106_decrypt(struct aead_request *req)
1019{
1020 int ret;
1021 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1022 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1023 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1024
1025 if (!irq_fpu_usable()) {
1026 struct aead_request *cryptd_req =
1027 (struct aead_request *) aead_request_ctx(req);
1028 memcpy(cryptd_req, req, sizeof(*req));
1029 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1030 return crypto_aead_decrypt(cryptd_req);
1031 } else {
1032 kernel_fpu_begin();
1033 ret = cryptd_child->base.crt_aead.decrypt(req);
1034 kernel_fpu_end();
1035 return ret;
1036 }
1037}
1038
1039static struct crypto_alg rfc4106_alg = {
1040 .cra_name = "rfc4106(gcm(aes))",
1041 .cra_driver_name = "rfc4106-gcm-aesni",
1042 .cra_priority = 400,
1043 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1044 .cra_blocksize = 1,
1045 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1046 .cra_alignmask = 0,
1047 .cra_type = &crypto_nivaead_type,
1048 .cra_module = THIS_MODULE,
1049 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1050 .cra_init = rfc4106_init,
1051 .cra_exit = rfc4106_exit,
1052 .cra_u = {
1053 .aead = {
1054 .setkey = rfc4106_set_key,
1055 .setauthsize = rfc4106_set_authsize,
1056 .encrypt = rfc4106_encrypt,
1057 .decrypt = rfc4106_decrypt,
1058 .geniv = "seqiv",
1059 .ivsize = 8,
1060 .maxauthsize = 16,
1061 },
1062 },
1063};
1064
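
A hedged usage sketch of the algorithm registered above, through the kernel
AEAD API (error handling trimmed): the key handed to setkey is the 16-byte AES
key followed by the 4-byte nonce/salt that rfc4106_set_key() strips off, and
the authsize must be one of the ICV lengths accepted by rfc4106_set_authsize():

/*
 * Usage sketch, not part of this patch: bind to rfc4106(gcm(aes)), set a
 * 20-byte key (16-byte AES key + 4-byte salt) and request the full 16-byte
 * ICV. 8 and 12 are also valid authsizes.
 */
static int rfc4106_bind_sketch(const u8 key_and_salt[20])
{
	struct crypto_aead *tfm;
	int err;

	tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key_and_salt, 20);
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);

	crypto_free_aead(tfm);
	return err;
}
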
1065static int __driver_rfc4106_encrypt(struct aead_request *req)
1066{
1067 u8 one_entry_in_sg = 0;
1068 u8 *src, *dst, *assoc;
1069 __be32 counter = cpu_to_be32(1);
1070 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1071 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1072 void *aes_ctx = &(ctx->aes_key_expanded);
1073 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1074 u8 iv_tab[16+AESNI_ALIGN];
1075 u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
1076 struct scatter_walk src_sg_walk;
1077 struct scatter_walk assoc_sg_walk;
1078 struct scatter_walk dst_sg_walk;
1079 unsigned int i;
1080
1081	/* Assuming we are supporting rfc4106 64-bit extended */
1082	/* sequence numbers, we need to have the AAD length equal */
1083	/* to 8 or 12 bytes. */
1084 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
1085 return -EINVAL;
1086 /* IV below built */
1087 for (i = 0; i < 4; i++)
1088 *(iv+i) = ctx->nonce[i];
1089 for (i = 0; i < 8; i++)
1090 *(iv+4+i) = req->iv[i];
1091 *((__be32 *)(iv+12)) = counter;
1092
1093 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1094 one_entry_in_sg = 1;
1095 scatterwalk_start(&src_sg_walk, req->src);
1096 scatterwalk_start(&assoc_sg_walk, req->assoc);
1097 src = scatterwalk_map(&src_sg_walk, 0);
1098 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1099 dst = src;
1100 if (unlikely(req->src != req->dst)) {
1101 scatterwalk_start(&dst_sg_walk, req->dst);
1102 dst = scatterwalk_map(&dst_sg_walk, 0);
1103 }
1104
1105 } else {
1106 /* Allocate memory for src, dst, assoc */
1107 src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
1108 GFP_ATOMIC);
1109 if (unlikely(!src))
1110 return -ENOMEM;
1111 assoc = (src + req->cryptlen + auth_tag_len);
1112 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1113 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1114 req->assoclen, 0);
1115 dst = src;
1116 }
1117
1118 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
1119 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
1120 + ((unsigned long)req->cryptlen), auth_tag_len);
1121
1122 /* The authTag (aka the Integrity Check Value) needs to be written
1123 * back to the packet. */
1124 if (one_entry_in_sg) {
1125 if (unlikely(req->src != req->dst)) {
1126 scatterwalk_unmap(dst, 0);
1127 scatterwalk_done(&dst_sg_walk, 0, 0);
1128 }
1129 scatterwalk_unmap(src, 0);
1130 scatterwalk_unmap(assoc, 0);
1131 scatterwalk_done(&src_sg_walk, 0, 0);
1132 scatterwalk_done(&assoc_sg_walk, 0, 0);
1133 } else {
1134 scatterwalk_map_and_copy(dst, req->dst, 0,
1135 req->cryptlen + auth_tag_len, 1);
1136 kfree(src);
1137 }
1138 return 0;
1139}
1140
1141static int __driver_rfc4106_decrypt(struct aead_request *req)
1142{
1143 u8 one_entry_in_sg = 0;
1144 u8 *src, *dst, *assoc;
1145 unsigned long tempCipherLen = 0;
1146 __be32 counter = cpu_to_be32(1);
1147 int retval = 0;
1148 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1149 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1150 void *aes_ctx = &(ctx->aes_key_expanded);
1151 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1152 u8 iv_and_authTag[32+AESNI_ALIGN];
1153 u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
1154 u8 *authTag = iv + 16;
1155 struct scatter_walk src_sg_walk;
1156 struct scatter_walk assoc_sg_walk;
1157 struct scatter_walk dst_sg_walk;
1158 unsigned int i;
1159
1160 if (unlikely((req->cryptlen < auth_tag_len) ||
1161 (req->assoclen != 8 && req->assoclen != 12)))
1162 return -EINVAL;
1163	/* Assuming we are supporting rfc4106 64-bit extended */
1164	/* sequence numbers, we need to have the AAD length */
1165	/* equal to 8 or 12 bytes. */
1166
1167 tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
1168 /* IV below built */
1169 for (i = 0; i < 4; i++)
1170 *(iv+i) = ctx->nonce[i];
1171 for (i = 0; i < 8; i++)
1172 *(iv+4+i) = req->iv[i];
1173 *((__be32 *)(iv+12)) = counter;
1174
1175 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1176 one_entry_in_sg = 1;
1177 scatterwalk_start(&src_sg_walk, req->src);
1178 scatterwalk_start(&assoc_sg_walk, req->assoc);
1179 src = scatterwalk_map(&src_sg_walk, 0);
1180 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1181 dst = src;
1182 if (unlikely(req->src != req->dst)) {
1183 scatterwalk_start(&dst_sg_walk, req->dst);
1184 dst = scatterwalk_map(&dst_sg_walk, 0);
1185 }
1186
1187 } else {
1188 /* Allocate memory for src, dst, assoc */
1189 src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
1190 if (!src)
1191 return -ENOMEM;
1192		assoc = (src + req->cryptlen);	/* AAD copied right after the ciphertext+ICV, within the allocation */
1193 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1194 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1195 req->assoclen, 0);
1196 dst = src;
1197 }
1198
1199 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
1200 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1201 authTag, auth_tag_len);
1202
1203 /* Compare generated tag with passed in tag. */
1204 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
1205 -EBADMSG : 0;
1206
1207 if (one_entry_in_sg) {
1208 if (unlikely(req->src != req->dst)) {
1209 scatterwalk_unmap(dst, 0);
1210 scatterwalk_done(&dst_sg_walk, 0, 0);
1211 }
1212 scatterwalk_unmap(src, 0);
1213 scatterwalk_unmap(assoc, 0);
1214 scatterwalk_done(&src_sg_walk, 0, 0);
1215 scatterwalk_done(&assoc_sg_walk, 0, 0);
1216 } else {
1217 scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
1218 kfree(src);
1219 }
1220 return retval;
1221}
1222
1223static struct crypto_alg __rfc4106_alg = {
1224 .cra_name = "__gcm-aes-aesni",
1225 .cra_driver_name = "__driver-gcm-aes-aesni",
1226 .cra_priority = 0,
1227 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1228 .cra_blocksize = 1,
1229 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1230 .cra_alignmask = 0,
1231 .cra_type = &crypto_aead_type,
1232 .cra_module = THIS_MODULE,
1233 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1234 .cra_u = {
1235 .aead = {
1236 .encrypt = __driver_rfc4106_encrypt,
1237 .decrypt = __driver_rfc4106_decrypt,
1238 },
1239 },
1240};
1241#endif
1242
733static int __init aesni_init(void) 1243static int __init aesni_init(void)
734{ 1244{
735 int err; 1245 int err;
@@ -738,6 +1248,7 @@ static int __init aesni_init(void)
738 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); 1248 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
739 return -ENODEV; 1249 return -ENODEV;
740 } 1250 }
1251
741 if ((err = crypto_register_alg(&aesni_alg))) 1252 if ((err = crypto_register_alg(&aesni_alg)))
742 goto aes_err; 1253 goto aes_err;
743 if ((err = crypto_register_alg(&__aesni_alg))) 1254 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1257,24 @@ static int __init aesni_init(void)
746 goto blk_ecb_err; 1257 goto blk_ecb_err;
747 if ((err = crypto_register_alg(&blk_cbc_alg))) 1258 if ((err = crypto_register_alg(&blk_cbc_alg)))
748 goto blk_cbc_err; 1259 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
751 if ((err = crypto_register_alg(&ablk_ecb_alg))) 1260 if ((err = crypto_register_alg(&ablk_ecb_alg)))
752 goto ablk_ecb_err; 1261 goto ablk_ecb_err;
753 if ((err = crypto_register_alg(&ablk_cbc_alg))) 1262 if ((err = crypto_register_alg(&ablk_cbc_alg)))
754 goto ablk_cbc_err; 1263 goto ablk_cbc_err;
1264#ifdef CONFIG_X86_64
1265 if ((err = crypto_register_alg(&blk_ctr_alg)))
1266 goto blk_ctr_err;
755 if ((err = crypto_register_alg(&ablk_ctr_alg))) 1267 if ((err = crypto_register_alg(&ablk_ctr_alg)))
756 goto ablk_ctr_err; 1268 goto ablk_ctr_err;
1269 if ((err = crypto_register_alg(&__rfc4106_alg)))
1270 goto __aead_gcm_err;
1271 if ((err = crypto_register_alg(&rfc4106_alg)))
1272 goto aead_gcm_err;
757#ifdef HAS_CTR 1273#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) 1274 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err; 1275 goto ablk_rfc3686_ctr_err;
760#endif 1276#endif
1277#endif
761#ifdef HAS_LRW 1278#ifdef HAS_LRW
762 if ((err = crypto_register_alg(&ablk_lrw_alg))) 1279 if ((err = crypto_register_alg(&ablk_lrw_alg)))
763 goto ablk_lrw_err; 1280 goto ablk_lrw_err;
@@ -770,7 +1287,6 @@ static int __init aesni_init(void)
770 if ((err = crypto_register_alg(&ablk_xts_alg))) 1287 if ((err = crypto_register_alg(&ablk_xts_alg)))
771 goto ablk_xts_err; 1288 goto ablk_xts_err;
772#endif 1289#endif
773
774 return err; 1290 return err;
775 1291
776#ifdef HAS_XTS 1292#ifdef HAS_XTS
@@ -784,18 +1300,24 @@ ablk_pcbc_err:
784 crypto_unregister_alg(&ablk_lrw_alg); 1300 crypto_unregister_alg(&ablk_lrw_alg);
785ablk_lrw_err: 1301ablk_lrw_err:
786#endif 1302#endif
1303#ifdef CONFIG_X86_64
787#ifdef HAS_CTR 1304#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1305 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err: 1306ablk_rfc3686_ctr_err:
790#endif 1307#endif
1308 crypto_unregister_alg(&rfc4106_alg);
1309aead_gcm_err:
1310 crypto_unregister_alg(&__rfc4106_alg);
1311__aead_gcm_err:
791 crypto_unregister_alg(&ablk_ctr_alg); 1312 crypto_unregister_alg(&ablk_ctr_alg);
792ablk_ctr_err: 1313ablk_ctr_err:
1314 crypto_unregister_alg(&blk_ctr_alg);
1315blk_ctr_err:
1316#endif
793 crypto_unregister_alg(&ablk_cbc_alg); 1317 crypto_unregister_alg(&ablk_cbc_alg);
794ablk_cbc_err: 1318ablk_cbc_err:
795 crypto_unregister_alg(&ablk_ecb_alg); 1319 crypto_unregister_alg(&ablk_ecb_alg);
796ablk_ecb_err: 1320ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
799 crypto_unregister_alg(&blk_cbc_alg); 1321 crypto_unregister_alg(&blk_cbc_alg);
800blk_cbc_err: 1322blk_cbc_err:
801 crypto_unregister_alg(&blk_ecb_alg); 1323 crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +1340,17 @@ static void __exit aesni_exit(void)
818#ifdef HAS_LRW 1340#ifdef HAS_LRW
819 crypto_unregister_alg(&ablk_lrw_alg); 1341 crypto_unregister_alg(&ablk_lrw_alg);
820#endif 1342#endif
1343#ifdef CONFIG_X86_64
821#ifdef HAS_CTR 1344#ifdef HAS_CTR
822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1345 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
823#endif 1346#endif
1347 crypto_unregister_alg(&rfc4106_alg);
1348 crypto_unregister_alg(&__rfc4106_alg);
824 crypto_unregister_alg(&ablk_ctr_alg); 1349 crypto_unregister_alg(&ablk_ctr_alg);
1350 crypto_unregister_alg(&blk_ctr_alg);
1351#endif
825 crypto_unregister_alg(&ablk_cbc_alg); 1352 crypto_unregister_alg(&ablk_cbc_alg);
826 crypto_unregister_alg(&ablk_ecb_alg); 1353 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
828 crypto_unregister_alg(&blk_cbc_alg); 1354 crypto_unregister_alg(&blk_cbc_alg);
829 crypto_unregister_alg(&blk_ecb_alg); 1355 crypto_unregister_alg(&blk_ecb_alg);
830 crypto_unregister_alg(&__aesni_alg); 1356 crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3b62ab56c7a0..5e1a2eef3e7c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -32,11 +32,7 @@
32#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
33#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
34 34
35#ifdef CONFIG_X86_64 35#define BOOT_HEAP_SIZE 0x8000
36#define BOOT_HEAP_SIZE 0x7000
37#else
38#define BOOT_HEAP_SIZE 0x4000
39#endif
40 36
41#endif /* !CONFIG_KERNEL_BZIP2 */ 37#endif /* !CONFIG_KERNEL_BZIP2 */
42 38
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b81002f23614..078ad0caefc6 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable(void)
94 94
95static inline int hw_breakpoint_active(void) 95static inline int hw_breakpoint_active(void)
96{ 96{
97 return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; 97 return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
98} 98}
99 99
100extern void aout_dump_debugregs(struct user *dump); 100extern void aout_dump_debugregs(struct user *dump);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index ff2546ce7178..7a15153c675d 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,6 +20,9 @@
20#ifndef _ASM_X86_HYPERVISOR_H 20#ifndef _ASM_X86_HYPERVISOR_H
21#define _ASM_X86_HYPERVISOR_H 21#define _ASM_X86_HYPERVISOR_H
22 22
23#include <asm/kvm_para.h>
24#include <asm/xen/hypervisor.h>
25
23extern void init_hypervisor(struct cpuinfo_x86 *c); 26extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void); 27extern void init_hypervisor_platform(void);
25 28
@@ -47,4 +50,13 @@ extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 50extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48extern const struct hypervisor_x86 x86_hyper_xen_hvm; 51extern const struct hypervisor_x86 x86_hyper_xen_hvm;
49 52
53static inline bool hypervisor_x2apic_available(void)
54{
55 if (kvm_para_available())
56 return true;
57 if (xen_x2apic_para_available())
58 return true;
59 return false;
60}
61
50#endif 62#endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ba870bb6dd8e..c704b38c57a2 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,6 +10,9 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
13static inline int irq_canonicalize(int irq) 16static inline int irq_canonicalize(int irq)
14{ 17{
15 return ((irq == 2) ? 9 : irq); 18 return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f23eb2528464..ca242d35e873 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
18 DIE_TRAP, 18 DIE_TRAP,
19 DIE_GPF, 19 DIE_GPF,
20 DIE_CALL, 20 DIE_CALL,
21 DIE_NMI_IPI,
22 DIE_PAGE_FAULT, 21 DIE_PAGE_FAULT,
23 DIE_NMIUNKNOWN, 22 DIE_NMIUNKNOWN,
24}; 23};
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17 17
18struct x86_exception {
19 u8 vector;
20 bool error_code_valid;
21 u16 error_code;
22 bool nested_page_fault;
23 u64 address; /* cr2 or nested page fault gpa */
24};
25
18/* 26/*
19 * x86_emulate_ops: 27 * x86_emulate_ops:
20 * 28 *
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
64 * @bytes: [IN ] Number of bytes to read from memory. 72 * @bytes: [IN ] Number of bytes to read from memory.
65 */ 73 */
66 int (*read_std)(unsigned long addr, void *val, 74 int (*read_std)(unsigned long addr, void *val,
67 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 75 unsigned int bytes, struct kvm_vcpu *vcpu,
76 struct x86_exception *fault);
68 77
69 /* 78 /*
70 * write_std: Write bytes of standard (non-emulated/special) memory. 79 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
74 * @bytes: [IN ] Number of bytes to write to memory. 83 * @bytes: [IN ] Number of bytes to write to memory.
75 */ 84 */
76 int (*write_std)(unsigned long addr, void *val, 85 int (*write_std)(unsigned long addr, void *val,
77 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 86 unsigned int bytes, struct kvm_vcpu *vcpu,
87 struct x86_exception *fault);
78 /* 88 /*
79 * fetch: Read bytes of standard (non-emulated/special) memory. 89 * fetch: Read bytes of standard (non-emulated/special) memory.
80 * Used for instruction fetch. 90 * Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
83 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
84 */ 94 */
85 int (*fetch)(unsigned long addr, void *val, 95 int (*fetch)(unsigned long addr, void *val,
86 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 96 unsigned int bytes, struct kvm_vcpu *vcpu,
97 struct x86_exception *fault);
87 98
88 /* 99 /*
89 * read_emulated: Read bytes from emulated/special memory area. 100 * read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
94 int (*read_emulated)(unsigned long addr, 105 int (*read_emulated)(unsigned long addr,
95 void *val, 106 void *val,
96 unsigned int bytes, 107 unsigned int bytes,
97 unsigned int *error, 108 struct x86_exception *fault,
98 struct kvm_vcpu *vcpu); 109 struct kvm_vcpu *vcpu);
99 110
100 /* 111 /*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
107 int (*write_emulated)(unsigned long addr, 118 int (*write_emulated)(unsigned long addr,
108 const void *val, 119 const void *val,
109 unsigned int bytes, 120 unsigned int bytes,
110 unsigned int *error, 121 struct x86_exception *fault,
111 struct kvm_vcpu *vcpu); 122 struct kvm_vcpu *vcpu);
112 123
113 /* 124 /*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
122 const void *old, 133 const void *old,
123 const void *new, 134 const void *new,
124 unsigned int bytes, 135 unsigned int bytes,
125 unsigned int *error, 136 struct x86_exception *fault,
126 struct kvm_vcpu *vcpu); 137 struct kvm_vcpu *vcpu);
127 138
128 int (*pio_in_emulated)(int size, unsigned short port, void *val, 139 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
159 }; 170 };
160 union { 171 union {
161 unsigned long *reg; 172 unsigned long *reg;
162 unsigned long mem; 173 struct segmented_address {
174 ulong ea;
175 unsigned seg;
176 } mem;
163 } addr; 177 } addr;
164 union { 178 union {
165 unsigned long val; 179 unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
226 240
227 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
228 242
229 int exception; /* exception that happens during emulation or -1 */ 243 bool have_exception;
230 u32 error_code; /* error code for exception */ 244 struct x86_exception exception;
231 bool error_code_valid;
232 245
233 /* decode cache */ 246 /* decode cache */
234 struct decode_cache decode; 247 struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
252#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 265#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
253#endif 266#endif
254 267
255int x86_decode_insn(struct x86_emulate_ctxt *ctxt); 268int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
256#define EMULATION_FAILED -1 269#define EMULATION_FAILED -1
257#define EMULATION_OK 0 270#define EMULATION_OK 0
258#define EMULATION_RESTART 1 271#define EMULATION_RESTART 1
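
The kvm_emulate.h hunks above replace the bare u32 *error out-parameter of every emulator callback with a struct x86_exception *fault, so a failing access can report a vector, an error code and a faulting address (cr2 or nested page fault gpa) in one step. Below is a minimal userspace sketch of that calling convention; the struct layout apart from the address field and the read_std_sketch() helper are assumptions for illustration, not copied from the kernel.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct x86_exception_sketch {
	uint8_t  vector;            /* e.g. 14 for #PF */
	bool     error_code_valid;
	uint16_t error_code;
	uint64_t address;           /* cr2 or nested page fault gpa */
};

/* Pretend guest-memory read that always fails with a page fault. */
static int read_std_sketch(uint64_t addr, void *val, unsigned int bytes,
			   struct x86_exception_sketch *fault)
{
	(void)val; (void)bytes;
	fault->vector = 14;
	fault->error_code_valid = true;
	fault->error_code = 0x2;    /* sample: write to a not-present page */
	fault->address = addr;
	return -1;                  /* emulation failure, caller injects */
}

int main(void)
{
	struct x86_exception_sketch fault = { 0 };
	char buf[8];

	if (read_std_sketch(0x1000, buf, sizeof(buf), &fault) < 0)
		printf("inject vector %u ec=%#x addr=%#llx\n",
		       fault.vector, (unsigned)fault.error_code,
		       (unsigned long long)fault.address);
	return 0;
}
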
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
83#define KVM_NR_FIXED_MTRR_REGION 88 83#define KVM_NR_FIXED_MTRR_REGION 88
84#define KVM_NR_VAR_MTRR 8 84#define KVM_NR_VAR_MTRR 8
85 85
86#define ASYNC_PF_PER_VCPU 64
87
86extern spinlock_t kvm_lock; 88extern spinlock_t kvm_lock;
87extern struct list_head vm_list; 89extern struct list_head vm_list;
88 90
89struct kvm_vcpu; 91struct kvm_vcpu;
90struct kvm; 92struct kvm;
93struct kvm_async_pf;
91 94
92enum kvm_reg { 95enum kvm_reg {
93 VCPU_REGS_RAX = 0, 96 VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
114 117
115enum kvm_reg_ex { 118enum kvm_reg_ex {
116 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 119 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
120 VCPU_EXREG_CR3,
117}; 121};
118 122
119enum { 123enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 242 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 243 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
240 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 244 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
241 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 245 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
242 void (*inject_page_fault)(struct kvm_vcpu *vcpu); 246 bool prefault);
247 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
248 struct x86_exception *fault);
243 void (*free)(struct kvm_vcpu *vcpu); 249 void (*free)(struct kvm_vcpu *vcpu);
244 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 250 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
245 u32 *error); 251 struct x86_exception *exception);
246 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 252 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
247 void (*prefetch_page)(struct kvm_vcpu *vcpu, 253 void (*prefetch_page)(struct kvm_vcpu *vcpu,
248 struct kvm_mmu_page *page); 254 struct kvm_mmu_page *page);
249 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
250 struct kvm_mmu_page *sp, bool clear_unsync); 256 struct kvm_mmu_page *sp);
251 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
252 hpa_t root_hpa; 258 hpa_t root_hpa;
253 int root_level; 259 int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
315 */ 321 */
316 struct kvm_mmu *walk_mmu; 322 struct kvm_mmu *walk_mmu;
317 323
318 /*
319 * This struct is filled with the necessary information to propagate a
320 * page fault into the guest
321 */
322 struct {
323 u64 address;
324 unsigned error_code;
325 bool nested;
326 } fault;
327
328 /* only needed in kvm_pv_mmu_op() path, but it's hot so 324 /* only needed in kvm_pv_mmu_op() path, but it's hot so
329 * put it here to avoid allocation */ 325 * put it here to avoid allocation */
330 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 326 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
412 u64 hv_vapic; 408 u64 hv_vapic;
413 409
414 cpumask_var_t wbinvd_dirty_mask; 410 cpumask_var_t wbinvd_dirty_mask;
411
412 struct {
413 bool halted;
414 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
415 struct gfn_to_hva_cache data;
416 u64 msr_val;
417 u32 id;
418 bool send_user_only;
419 } apf;
415}; 420};
416 421
417struct kvm_arch { 422struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
456 /* fields used by HYPER-V emulation */ 461 /* fields used by HYPER-V emulation */
457 u64 hv_guest_os_id; 462 u64 hv_guest_os_id;
458 u64 hv_hypercall; 463 u64 hv_hypercall;
464
465 #ifdef CONFIG_KVM_MMU_AUDIT
466 int audit_point;
467 #endif
459}; 468};
460 469
461struct kvm_vm_stat { 470struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
529 struct kvm_segment *var, int seg); 538 struct kvm_segment *var, int seg);
530 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 539 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
531 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); 540 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
541 void (*decache_cr3)(struct kvm_vcpu *vcpu);
532 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 542 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
533 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 543 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
534 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 544 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
582 592
583 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 593 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
584 594
595 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
585 const struct trace_print_flags *exit_reasons_str; 596 const struct trace_print_flags *exit_reasons_str;
586}; 597};
587 598
599struct kvm_arch_async_pf {
600 u32 token;
601 gfn_t gfn;
602 unsigned long cr3;
603 bool direct_map;
604};
605
588extern struct kvm_x86_ops *kvm_x86_ops; 606extern struct kvm_x86_ops *kvm_x86_ops;
589 607
590int kvm_mmu_module_init(void); 608int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
594int kvm_mmu_create(struct kvm_vcpu *vcpu); 612int kvm_mmu_create(struct kvm_vcpu *vcpu);
595int kvm_mmu_setup(struct kvm_vcpu *vcpu); 613int kvm_mmu_setup(struct kvm_vcpu *vcpu);
596void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 614void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
597void kvm_mmu_set_base_ptes(u64 base_pte);
598void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 615void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
599 u64 dirty_mask, u64 nx_mask, u64 x_mask); 616 u64 dirty_mask, u64 nx_mask, u64 x_mask);
600 617
@@ -623,8 +640,15 @@ enum emulation_result {
623#define EMULTYPE_NO_DECODE (1 << 0) 640#define EMULTYPE_NO_DECODE (1 << 0)
624#define EMULTYPE_TRAP_UD (1 << 1) 641#define EMULTYPE_TRAP_UD (1 << 1)
625#define EMULTYPE_SKIP (1 << 2) 642#define EMULTYPE_SKIP (1 << 2)
626int emulate_instruction(struct kvm_vcpu *vcpu, 643int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
627 unsigned long cr2, u16 error_code, int emulation_type); 644 int emulation_type, void *insn, int insn_len);
645
646static inline int emulate_instruction(struct kvm_vcpu *vcpu,
647 int emulation_type)
648{
649 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
650}
651
628void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 652void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
629void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 653void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
630 654
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
650int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 674int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
651int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 675int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
652int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 676int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
653void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 677int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
654int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 678int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
655int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 679int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
656unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 680unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
668void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 692void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
669void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 693void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
670void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 694void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
671void kvm_inject_page_fault(struct kvm_vcpu *vcpu); 695void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
672int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 696int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
673 gfn_t gfn, void *data, int offset, int len, 697 gfn_t gfn, void *data, int offset, int len,
674 u32 access); 698 u32 access);
675void kvm_propagate_fault(struct kvm_vcpu *vcpu); 699void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
676bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 700bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
677 701
678int kvm_pic_set_irq(void *opaque, int irq, int level); 702int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
690int kvm_mmu_load(struct kvm_vcpu *vcpu); 714int kvm_mmu_load(struct kvm_vcpu *vcpu);
691void kvm_mmu_unload(struct kvm_vcpu *vcpu); 715void kvm_mmu_unload(struct kvm_vcpu *vcpu);
692void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 716void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
693gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 717gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
694gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 718 struct x86_exception *exception);
695gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 719gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
696gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 720 struct x86_exception *exception);
721gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
722 struct x86_exception *exception);
723gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
724 struct x86_exception *exception);
697 725
698int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 726int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
699 727
700int kvm_fix_hypercall(struct kvm_vcpu *vcpu); 728int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
701 729
702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); 730int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
731 void *insn, int insn_len);
703void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 732void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
704 733
705void kvm_enable_tdp(void); 734void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
766#define HF_VINTR_MASK (1 << 2) 795#define HF_VINTR_MASK (1 << 2)
767#define HF_NMI_MASK (1 << 3) 796#define HF_NMI_MASK (1 << 3)
768#define HF_IRET_MASK (1 << 4) 797#define HF_IRET_MASK (1 << 4)
798#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
769 799
770/* 800/*
771 * Hardware virtualization extension instructions may fault if a 801 * Hardware virtualization extension instructions may fault if a
772 * reboot turns off virtualization while processes are running. 802 * reboot turns off virtualization while processes are running.
773 * Trap the fault and ignore the instruction if that happens. 803 * Trap the fault and ignore the instruction if that happens.
774 */ 804 */
775asmlinkage void kvm_handle_fault_on_reboot(void); 805asmlinkage void kvm_spurious_fault(void);
806extern bool kvm_rebooting;
776 807
777#define __kvm_handle_fault_on_reboot(insn) \ 808#define __kvm_handle_fault_on_reboot(insn) \
778 "666: " insn "\n\t" \ 809 "666: " insn "\n\t" \
810 "668: \n\t" \
779 ".pushsection .fixup, \"ax\" \n" \ 811 ".pushsection .fixup, \"ax\" \n" \
780 "667: \n\t" \ 812 "667: \n\t" \
813 "cmpb $0, kvm_rebooting \n\t" \
814 "jne 668b \n\t" \
781 __ASM_SIZE(push) " $666b \n\t" \ 815 __ASM_SIZE(push) " $666b \n\t" \
782 "jmp kvm_handle_fault_on_reboot \n\t" \ 816 "call kvm_spurious_fault \n\t" \
783 ".popsection \n\t" \ 817 ".popsection \n\t" \
784 ".pushsection __ex_table, \"a\" \n\t" \ 818 ".pushsection __ex_table, \"a\" \n\t" \
785 _ASM_PTR " 666b, 667b \n\t" \ 819 _ASM_PTR " 666b, 667b \n\t" \
@@ -788,6 +822,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
788#define KVM_ARCH_WANT_MMU_NOTIFIER 822#define KVM_ARCH_WANT_MMU_NOTIFIER
789int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
790int kvm_age_hva(struct kvm *kvm, unsigned long hva); 824int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
791void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 826void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
792int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 827int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
793int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 828int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -799,4 +834,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
799 834
800bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 835bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
801 836
837void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
838 struct kvm_async_pf *work);
839void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
840 struct kvm_async_pf *work);
841void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
842 struct kvm_async_pf *work);
843bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
844extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
845
846void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
847
802#endif /* _ASM_X86_KVM_HOST_H */ 848#endif /* _ASM_X86_KVM_HOST_H */
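
The new apf block in struct kvm_vcpu_arch keeps a small power-of-two table of guest frame numbers with async page faults in flight (gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]). The sketch below shows one plausible way such a table is used, with open addressing and gfn 0 as the empty marker; the hash and probing details are assumptions for illustration, not the KVM implementation. kvm_find_async_pf_gfn(), declared above, is presumably the lookup half of this bookkeeping.

#include <stdint.h>
#include <stdio.h>

#define ASYNC_PF_PER_VCPU 64            /* already a power of two */
typedef uint64_t gfn_t;

static gfn_t gfns[ASYNC_PF_PER_VCPU];   /* 0 means "slot empty" */

static unsigned int slot(gfn_t gfn)
{
	return (unsigned int)(gfn * 2654435761u) & (ASYNC_PF_PER_VCPU - 1);
}

/* Assumes fewer than ASYNC_PF_PER_VCPU faults are outstanding. */
static void add_gfn(gfn_t gfn)
{
	unsigned int i = slot(gfn);

	while (gfns[i])                 /* linear probing */
		i = (i + 1) & (ASYNC_PF_PER_VCPU - 1);
	gfns[i] = gfn;
}

static int gfn_pending(gfn_t gfn)
{
	unsigned int i = slot(gfn);
	unsigned int n;

	for (n = 0; n < ASYNC_PF_PER_VCPU; n++) {
		if (gfns[i] == gfn)
			return 1;
		if (!gfns[i])
			return 0;
		i = (i + 1) & (ASYNC_PF_PER_VCPU - 1);
	}
	return 0;
}

int main(void)
{
	add_gfn(0x1234);
	printf("0x1234 pending: %d\n", gfn_pending(0x1234));
	printf("0x5678 pending: %d\n", gfn_pending(0x5678));
	return 0;
}
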
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
20 * are available. The use of 0x11 and 0x12 is deprecated 20 * are available. The use of 0x11 and 0x12 is deprecated
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4
23 24
24/* The last 8 bits are used to indicate how to interpret the flags field 25/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored. 26 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ 33/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
35 37
36#define KVM_MAX_MMU_OP_BATCH 32 38#define KVM_MAX_MMU_OP_BATCH 32
37 39
40#define KVM_ASYNC_PF_ENABLED (1 << 0)
41#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
42
38/* Operations for KVM_HC_MMU_OP */ 43/* Operations for KVM_HC_MMU_OP */
39#define KVM_MMU_OP_WRITE_PTE 1 44#define KVM_MMU_OP_WRITE_PTE 1
40#define KVM_MMU_OP_FLUSH_TLB 2 45#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
61 __u64 pt_phys; 66 __u64 pt_phys;
62}; 67};
63 68
69#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
70#define KVM_PV_REASON_PAGE_READY 2
71
72struct kvm_vcpu_pv_apf_data {
73 __u32 reason;
74 __u8 pad[60];
75 __u32 enabled;
76};
77
64#ifdef __KERNEL__ 78#ifdef __KERNEL__
65#include <asm/processor.h> 79#include <asm/processor.h>
66 80
67extern void kvmclock_init(void); 81extern void kvmclock_init(void);
82extern int kvm_register_clock(char *txt);
68 83
69 84
70/* This instruction is vmcall. On non-VT architectures, it will generate a 85/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
160 175
161#ifdef CONFIG_KVM_GUEST 176#ifdef CONFIG_KVM_GUEST
162void __init kvm_guest_init(void); 177void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void);
163#else 181#else
164#define kvm_guest_init() do { } while (0) 182#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0)
184#define kvm_async_pf_task_wake(T) do {} while(0)
185static inline u32 kvm_read_and_reset_pf_reason(void)
186{
187 return 0;
188}
165#endif 189#endif
166 190
167#endif /* __KERNEL__ */ 191#endif /* __KERNEL__ */
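
MSR_KVM_ASYNC_PF_EN and struct kvm_vcpu_pv_apf_data above define the guest side of async page faults: a 64-byte shared block whose physical address, together with the enable flags, is written into the MSR, and whose reason field later tells the guest why it was interrupted. The sketch below only illustrates that layout; wrmsr_sketch() and the fixed physical address are stand-ins, and the real kvm_read_and_reset_pf_reason() in the guest kernel is more careful than this.

#include <stdint.h>
#include <stdio.h>

#define MSR_KVM_ASYNC_PF_EN		0x4b564d02
#define KVM_ASYNC_PF_ENABLED		(1 << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS	(1 << 1)
#define KVM_PV_REASON_PAGE_NOT_PRESENT	1
#define KVM_PV_REASON_PAGE_READY	2

struct kvm_vcpu_pv_apf_data {
	uint32_t reason;
	uint8_t  pad[60];
	uint32_t enabled;
};

static void wrmsr_sketch(uint32_t msr, uint64_t val)
{
	printf("wrmsr(%#x, %#llx)\n", (unsigned)msr, (unsigned long long)val);
}

static uint32_t read_and_reset_reason(struct kvm_vcpu_pv_apf_data *d)
{
	uint32_t reason = d->reason;

	d->reason = 0;
	return reason;
}

int main(void)
{
	static struct kvm_vcpu_pv_apf_data apf;
	uint64_t pa = 0x1000;		/* stand-in physical address */

	wrmsr_sketch(MSR_KVM_ASYNC_PF_EN,
		     pa | KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_SEND_ALWAYS);

	apf.reason = KVM_PV_REASON_PAGE_NOT_PRESENT;	/* as if set by the host */
	if (read_and_reset_reason(&apf) == KVM_PV_REASON_PAGE_NOT_PRESENT)
		printf("async #PF: sleep until the PAGE_READY token arrives\n");
	return 0;
}
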
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..72a8b52e7dfd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
7 7
8#include <asm/mc146818rtc.h> 8#include <asm/mc146818rtc.h>
9 9
10#define NMI_REASON_PORT 0x61
11
12#define NMI_REASON_SERR 0x80
13#define NMI_REASON_IOCHK 0x40
14#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
15
16#define NMI_REASON_CLEAR_SERR 0x04
17#define NMI_REASON_CLEAR_IOCHK 0x08
18#define NMI_REASON_CLEAR_MASK 0x0f
19
10static inline unsigned char get_nmi_reason(void) 20static inline unsigned char get_nmi_reason(void)
11{ 21{
12 return inb(0x61); 22 return inb(NMI_REASON_PORT);
13} 23}
14 24
15static inline void reassert_nmi(void) 25static inline void reassert_nmi(void)
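
The mach_traps.h change gives the magic values read from I/O port 0x61 proper names. A small sketch of decoding such a reason byte with the new constants, using a hard-coded sample value rather than a real port read:

#include <stdio.h>

#define NMI_REASON_SERR   0x80
#define NMI_REASON_IOCHK  0x40
#define NMI_REASON_MASK   (NMI_REASON_SERR | NMI_REASON_IOCHK)

static void decode_nmi_reason(unsigned char reason)
{
	if (!(reason & NMI_REASON_MASK)) {
		printf("no known reason bits set (%#x)\n", (unsigned)reason);
		return;
	}
	if (reason & NMI_REASON_SERR)
		printf("SERR# asserted: system error on the bus\n");
	if (reason & NMI_REASON_IOCHK)
		printf("IOCHK: I/O channel check\n");
}

int main(void)
{
	decode_nmi_reason(0x80);	/* sample byte as if read from port 0x61 */
	return 0;
}
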
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c4021b953510..c76f5b92b840 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -23,6 +23,26 @@ void arch_trigger_all_cpu_backtrace(void);
23#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace 23#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
24#endif 24#endif
25 25
26/*
27 * Define some priorities for the nmi notifier call chain.
28 *
29 * Create a local nmi bit that has a higher priority than
30 * external nmis, because the local ones are more frequent.
31 *
32 * Also set up some default high/normal/low settings for
33 * subsystems to register with. Using 4 bits to separate
34 * the priorities. This can go a lot higher if need be.
35 */
36
37#define NMI_LOCAL_SHIFT 16 /* randomly picked */
38#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
39#define NMI_HIGH_PRIOR (1ULL << 8)
40#define NMI_NORMAL_PRIOR (1ULL << 4)
41#define NMI_LOW_PRIOR (1ULL << 0)
42#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
43#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
44#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
45
26void stop_nmi(void); 46void stop_nmi(void);
27void restart_nmi(void); 47void restart_nmi(void);
28 48
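
The nmi.h comment above explains the intent of the priority constants: local NMIs should run before external ones, so NMI_LOCAL_BIT sits well above the high/normal/low levels. The few lines below just demonstrate the resulting ordering numerically; the comparison stands in for the kernel's notifier chain, which runs higher-priority entries first.

#include <stdio.h>

#define NMI_LOCAL_SHIFT		16
#define NMI_LOCAL_BIT		(1ULL << NMI_LOCAL_SHIFT)
#define NMI_HIGH_PRIOR		(1ULL << 8)
#define NMI_NORMAL_PRIOR	(1ULL << 4)
#define NMI_LOCAL_NORMAL_PRIOR	(NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)

int main(void)
{
	/* A local handler at "normal" level still sorts ahead of an
	 * external handler registered at "high" level. */
	printf("local/normal %#llx beats external/high %#llx: %d\n",
	       NMI_LOCAL_NORMAL_PRIOR, NMI_HIGH_PRIOR,
	       NMI_LOCAL_NORMAL_PRIOR > NMI_HIGH_PRIOR);
	return 0;
}
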
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 42a978c0c1b3..f482010350fb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits);
107/* GPIO assignments */ 107/* GPIO assignments */
108 108
109#define OLPC_GPIO_MIC_AC 1 109#define OLPC_GPIO_MIC_AC 1
110#define OLPC_GPIO_DCON_IRQ geode_gpio(7) 110#define OLPC_GPIO_DCON_STAT0 5
111#define OLPC_GPIO_DCON_STAT1 6
112#define OLPC_GPIO_DCON_IRQ 7
111#define OLPC_GPIO_THRM_ALRM geode_gpio(10) 113#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
112#define OLPC_GPIO_SMB_CLK geode_gpio(14) 114#define OLPC_GPIO_DCON_LOAD 11
113#define OLPC_GPIO_SMB_DATA geode_gpio(15) 115#define OLPC_GPIO_DCON_BLANK 12
116#define OLPC_GPIO_SMB_CLK 14
117#define OLPC_GPIO_SMB_DATA 15
114#define OLPC_GPIO_WORKAUX geode_gpio(24) 118#define OLPC_GPIO_WORKAUX geode_gpio(24)
115#define OLPC_GPIO_LID geode_gpio(26) 119#define OLPC_GPIO_LID geode_gpio(26)
116#define OLPC_GPIO_ECSCI geode_gpio(27) 120#define OLPC_GPIO_ECSCI geode_gpio(27)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 2a8478140bb3..641988efe063 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -8,6 +8,8 @@
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC_OPENFIRMWARE
10 10
11extern bool olpc_ofw_is_installed(void);
12
11/* run an OFW command by calling into the firmware */ 13/* run an OFW command by calling into the firmware */
12#define olpc_ofw(name, args, res) \ 14#define olpc_ofw(name, args, res) \
13 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) 15 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
@@ -26,10 +28,17 @@ extern bool olpc_ofw_present(void);
26 28
27#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC_OPENFIRMWARE */
28 30
31static inline bool olpc_ofw_is_installed(void) { return false; }
29static inline void olpc_ofw_detect(void) { } 32static inline void olpc_ofw_detect(void) { }
30static inline void setup_olpc_ofw_pgd(void) { } 33static inline void setup_olpc_ofw_pgd(void) { }
31static inline bool olpc_ofw_present(void) { return false; } 34static inline bool olpc_ofw_present(void) { return false; }
32 35
33#endif /* !CONFIG_OLPC_OPENFIRMWARE */ 36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
34 37
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT
39extern void olpc_dt_build_devicetree(void);
40#else
41static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */
43
35#endif /* _ASM_X86_OLPC_OFW_H */ 44#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b8..2071a8b2b32f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
435{ 435{
436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
437} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
438 443
439static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep) 445 pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
442 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
443} 448}
444 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
445static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
446{ 457{
447 pteval_t ret; 458 pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
543 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
544} 555}
545 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561#if PAGETABLE_LEVELS >= 3
562 if (sizeof(pmdval_t) > sizeof(long))
563 /* 5 arg words */
564 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
565 else
566 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
567#endif
568}
569#endif
570
546static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 571static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
547{ 572{
548 pmdval_t val = native_pmd_val(pmd); 573 pmdval_t val = native_pmd_val(pmd);
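
The set_pmd_at() wrapper above has to special-case PAE: as the /* 5 arg words */ note hints, a 64-bit pmd passed by value on a 32-bit kernel occupies two argument words, so mm, addr, pmdp and pmd no longer fit the four-word PVOP_VCALL4 convention and the op is called directly instead. The sketch below only reproduces that word counting under assumed 32-bit PAE type sizes; it is not kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Stand-ins for a 32-bit PAE build. */
	typedef uint32_t long_word_t;	/* sizeof(long) on i386 */
	typedef uint64_t pmdval_t;	/* 64-bit PAE pmd value */

	unsigned int arg_words = 3	/* mm, addr, pmdp */
		+ sizeof(pmdval_t) / sizeof(long_word_t);	/* pmd by value */

	if (sizeof(pmdval_t) > sizeof(long_word_t))
		printf("%u arg words: call pv_mmu_ops.set_pmd_at() directly\n",
		       arg_words);
	else
		printf("%u arg words: PVOP_VCALL4 is enough\n", arg_words);
	return 0;
}
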
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
266 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
268 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep); 271 pte_t *ptep);
270 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
271 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
272 278
273 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep); 280 pte_t *ptep);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index f899e01a8ac9..8ee45167e817 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,125 @@ do { \
230}) 230})
231 231
232/* 232/*
233 * Add return operation
234 */
235#define percpu_add_return_op(var, val) \
236({ \
237 typeof(var) paro_ret__ = val; \
238 switch (sizeof(var)) { \
239 case 1: \
240 asm("xaddb %0, "__percpu_arg(1) \
241 : "+q" (paro_ret__), "+m" (var) \
242 : : "memory"); \
243 break; \
244 case 2: \
245 asm("xaddw %0, "__percpu_arg(1) \
246 : "+r" (paro_ret__), "+m" (var) \
247 : : "memory"); \
248 break; \
249 case 4: \
250 asm("xaddl %0, "__percpu_arg(1) \
251 : "+r" (paro_ret__), "+m" (var) \
252 : : "memory"); \
253 break; \
254 case 8: \
255 asm("xaddq %0, "__percpu_arg(1) \
256 : "+re" (paro_ret__), "+m" (var) \
257 : : "memory"); \
258 break; \
259 default: __bad_percpu_size(); \
260 } \
261 paro_ret__ += val; \
262 paro_ret__; \
263})
264
265/*
266 * xchg is implemented using cmpxchg without a lock prefix. xchg is
267 * expensive due to the implied lock prefix. The processor cannot prefetch
268 * cachelines if xchg is used.
269 */
270#define percpu_xchg_op(var, nval) \
271({ \
272 typeof(var) pxo_ret__; \
273 typeof(var) pxo_new__ = (nval); \
274 switch (sizeof(var)) { \
275 case 1: \
276 asm("\n1:mov "__percpu_arg(1)",%%al" \
277 "\n\tcmpxchgb %2, "__percpu_arg(1) \
278 "\n\tjnz 1b" \
279 : "=a" (pxo_ret__), "+m" (var) \
280 : "q" (pxo_new__) \
281 : "memory"); \
282 break; \
283 case 2: \
284 asm("\n1:mov "__percpu_arg(1)",%%ax" \
285 "\n\tcmpxchgw %2, "__percpu_arg(1) \
286 "\n\tjnz 1b" \
287 : "=a" (pxo_ret__), "+m" (var) \
288 : "r" (pxo_new__) \
289 : "memory"); \
290 break; \
291 case 4: \
292 asm("\n1:mov "__percpu_arg(1)",%%eax" \
293 "\n\tcmpxchgl %2, "__percpu_arg(1) \
294 "\n\tjnz 1b" \
295 : "=a" (pxo_ret__), "+m" (var) \
296 : "r" (pxo_new__) \
297 : "memory"); \
298 break; \
299 case 8: \
300 asm("\n1:mov "__percpu_arg(1)",%%rax" \
301 "\n\tcmpxchgq %2, "__percpu_arg(1) \
302 "\n\tjnz 1b" \
303 : "=a" (pxo_ret__), "+m" (var) \
304 : "r" (pxo_new__) \
305 : "memory"); \
306 break; \
307 default: __bad_percpu_size(); \
308 } \
309 pxo_ret__; \
310})
311
312/*
313 * cmpxchg has no such implied lock semantics; as a result it is much
314 * more efficient for cpu local operations.
315 */
316#define percpu_cmpxchg_op(var, oval, nval) \
317({ \
318 typeof(var) pco_ret__; \
319 typeof(var) pco_old__ = (oval); \
320 typeof(var) pco_new__ = (nval); \
321 switch (sizeof(var)) { \
322 case 1: \
323 asm("cmpxchgb %2, "__percpu_arg(1) \
324 : "=a" (pco_ret__), "+m" (var) \
325 : "q" (pco_new__), "0" (pco_old__) \
326 : "memory"); \
327 break; \
328 case 2: \
329 asm("cmpxchgw %2, "__percpu_arg(1) \
330 : "=a" (pco_ret__), "+m" (var) \
331 : "r" (pco_new__), "0" (pco_old__) \
332 : "memory"); \
333 break; \
334 case 4: \
335 asm("cmpxchgl %2, "__percpu_arg(1) \
336 : "=a" (pco_ret__), "+m" (var) \
337 : "r" (pco_new__), "0" (pco_old__) \
338 : "memory"); \
339 break; \
340 case 8: \
341 asm("cmpxchgq %2, "__percpu_arg(1) \
342 : "=a" (pco_ret__), "+m" (var) \
343 : "r" (pco_new__), "0" (pco_old__) \
344 : "memory"); \
345 break; \
346 default: __bad_percpu_size(); \
347 } \
348 pco_ret__; \
349})
350
351/*
233 * percpu_read() makes gcc load the percpu variable every time it is 352 * percpu_read() makes gcc load the percpu variable every time it is
234 * accessed while percpu_read_stable() allows the value to be cached. 353 * accessed while percpu_read_stable() allows the value to be cached.
235 * percpu_read_stable() is more efficient and can be used if its value 354 * percpu_read_stable() is more efficient and can be used if its value
@@ -267,6 +386,12 @@ do { \
267#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 386#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
268#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 387#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
269#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 388#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
389/*
390 * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
391 * faster than an xchg with forced lock semantics.
392 */
393#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
394#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
270 395
271#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 396#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
272#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 397#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -286,6 +411,11 @@ do { \
286#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 411#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
287#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 412#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
288#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 413#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
414#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
415#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
416#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
417#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
418#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
289 419
290#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) 420#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
291#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) 421#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -299,6 +429,31 @@ do { \
299#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 429#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
300#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 430#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
301#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 431#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
432#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
433#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
434#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
435#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
436#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
437
438#ifndef CONFIG_M386
439#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
440#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
441#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
442#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
443#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
444#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
445
446#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
447#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
448#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
449#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
450#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
451#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
452
453#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
454#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
455#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
456#endif /* !CONFIG_M386 */
302 457
303/* 458/*
304 * Per cpu atomic 64 bit operations are only available under 64 bit. 459 * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -311,6 +466,7 @@ do { \
311#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 466#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
312#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 467#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
313#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 468#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
469#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
314 470
315#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 471#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
316#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 472#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -318,12 +474,12 @@ do { \
318#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 474#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
319#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 475#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
320#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 476#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
477#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
321 478
322#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 479#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
323#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 480#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
324#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 481#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
325#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 482#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
326
327#endif 483#endif
328 484
329/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 485/* This is not atomic against other CPUs -- CPU preemption needs to be off */
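
percpu_add_return_op() above is built on xadd, and percpu_xchg_op() deliberately uses a mov/cmpxchg retry loop because, as its comment says, a real xchg always carries an implied lock prefix. The userspace sketch below (x86-only inline asm, GCC/Clang) shows the same two patterns on ordinary memory; it is only an illustration — the kernel macros operate on %gs-relative per-cpu variables and can omit the lock prefix entirely, whereas the __atomic builtin used here emits a locked cmpxchg.

#include <stdio.h>

static long add_return(long *p, long val)
{
	long prev = val;

	/* xadd leaves the old value of *p in prev and adds val to *p. */
	asm volatile("xadd %0, %1"
		     : "+r" (prev), "+m" (*p)
		     : : "memory");
	return prev + val;		/* new value, like percpu_add_return_op() */
}

static long xchg_via_cmpxchg(long *p, long new_val)
{
	long old = *p;

	/* Retry until the compare-and-exchange sees an unchanged old value;
	 * this mirrors the "mov; cmpxchg; jnz 1b" loop in percpu_xchg_op(). */
	while (!__atomic_compare_exchange_n(p, &old, new_val, 0,
					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;
	return old;
}

int main(void)
{
	long v = 10;
	long sum = add_return(&v, 5);		/* v == 15, sum == 15 */
	long old = xchg_via_cmpxchg(&v, 42);	/* v == 42, old == 15 */

	printf("sum=%ld old=%ld v=%ld\n", sum, old, v);
	return 0;
}
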
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 295e2ff18a6a..e2f6a99f14ab 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -20,6 +20,9 @@
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) 20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18) 21#define ARCH_P4_MAX_CCCR (18)
22 22
23#define ARCH_P4_CNTRVAL_BITS (40)
24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
25
23#define P4_ESCR_EVENT_MASK 0x7e000000U 26#define P4_ESCR_EVENT_MASK 0x7e000000U
24#define P4_ESCR_EVENT_SHIFT 25 27#define P4_ESCR_EVENT_SHIFT 25
25#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U 28#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94c3810..b4389a468fb6 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
93 93
94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, 94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
95 unsigned long adddress) 95 unsigned long address)
96{ 96{
97 ___pmd_free_tlb(tlb, pmd); 97 ___pmd_free_tlb(tlb, pmd);
98} 98}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 105#endif
106 106
107#ifdef CONFIG_SMP
108union split_pmd {
109 struct {
110 u32 pmd_low;
111 u32 pmd_high;
112 };
113 pmd_t pmd;
114};
115static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
116{
117 union split_pmd res, *orig = (union split_pmd *)pmdp;
118
119 /* xchg acts as a barrier before setting of the high bits */
120 res.pmd_low = xchg(&orig->pmd_low, 0);
121 res.pmd_high = orig->pmd_high;
122 orig->pmd_high = 0;
123
124 return res.pmd;
125}
126#else
127#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
128#endif
129
107/* 130/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 131 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 132 * put the 32 bits of offset into the high part.
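
On PAE a pmd is 64 bits wide, so native_pmdp_get_and_clear() above clears the entry in two steps: an atomic xchg of the low half (which, per the comment, also acts as the barrier) followed by a plain read-and-clear of the high half. The hardware-set accessed/dirty bits live in the low word, so once the low half has been exchanged to zero the entry is no longer present and the high half can be cleared non-atomically. A userspace rendering of that split-word pattern, using a generic 64-bit value rather than a pmd:

#include <stdint.h>
#include <stdio.h>

union split64 {
	struct {
		uint32_t low;	/* little-endian: low half first */
		uint32_t high;
	};
	uint64_t val;
};

static uint64_t get_and_clear(union split64 *p)
{
	union split64 res;

	/* The atomic exchange of the low word also orders the accesses
	 * to the high word that follow it. */
	res.low  = __atomic_exchange_n(&p->low, 0, __ATOMIC_SEQ_CST);
	res.high = p->high;
	p->high  = 0;
	return res.val;
}

int main(void)
{
	union split64 e = { .val = 0x1234567890abcdefULL };
	uint64_t old = get_and_clear(&e);

	printf("old=%#llx now=%#llx\n",
	       (unsigned long long)old, (unsigned long long)e.val);
	return 0;
}
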
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
35#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
36#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
38 39
39#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
40 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
59 60
60#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
61#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
62 65
63#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
64#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
94 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
95} 98}
96 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
97static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
98{ 106{
99 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
142 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
143} 151}
144 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{ 171{
147 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
216 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
217} 242}
218 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
219/* 293/*
220 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
221 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
256 return __pte(val); 330 return __pte(val);
257} 331}
258 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
259/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
260#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
261static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
350 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
351 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
352 */ 436 */
353#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
354 438
355/* 439/*
356 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
524 return res; 608 return res;
525} 609}
526 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
527static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
529{ 621{
530 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
531} 623}
532 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
533#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
534/* 632/*
535 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
607 705
608#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address)
609 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
610/* 751/*
611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
612 * 753 *
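
The pmd helpers added to pgtable.h mirror their pte counterparts: pmd_set_flags()/pmd_clear_flags() toggle individual bits, and pmd_modify() keeps everything covered by _HPAGE_CHG_MASK (the pfn plus the accessed, dirty and PSE bits) while taking the remaining protection bits from the new pgprot. The sketch below replays that logic on plain integers; the bit positions match the usual low x86 flag bits but the pfn mask is a simplified stand-in.

#include <stdint.h>
#include <stdio.h>

#define F_PRESENT	(1u << 0)
#define F_RW		(1u << 1)
#define F_ACCESSED	(1u << 5)
#define F_DIRTY		(1u << 6)
#define F_PSE		(1u << 7)
#define PFN_MASK	0xfffff000u			/* simplified */
#define HPAGE_CHG_MASK	(PFN_MASK | F_ACCESSED | F_DIRTY | F_PSE)

static uint32_t set_flags(uint32_t pmd, uint32_t set)	  { return pmd | set; }
static uint32_t clear_flags(uint32_t pmd, uint32_t clear) { return pmd & ~clear; }

static uint32_t modify(uint32_t pmd, uint32_t newprot)
{
	uint32_t val = pmd & HPAGE_CHG_MASK;	/* keep pfn + sticky bits */

	val |= newprot & ~HPAGE_CHG_MASK;	/* rest comes from newprot */
	return val;
}

int main(void)
{
	uint32_t pmd = 0x12345000u | F_PRESENT | F_RW | F_PSE | F_DIRTY;

	pmd = clear_flags(pmd, F_RW);		/* like pmd_wrprotect() */
	pmd = set_flags(pmd, F_ACCESSED);	/* like pmd_mkyoung()   */
	pmd = modify(pmd, F_PRESENT);		/* make it read-only    */
	printf("pmd=%#x\n", pmd);
	return 0;
}
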
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{ 86{
77 *pmdp = pmd; 87#ifdef CONFIG_SMP
78} 88 return native_make_pmd(xchg(&xp->pmd, 0));
79 89#else
80static inline void native_pmd_clear(pmd_t *pmd) 90 /* native_local_pmdp_get_and_clear,
81{ 91 but duplicated because of cyclic dependency */
82 native_set_pmd(pmd, native_make_pmd(0)); 92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cae9c3cb95cf..45636cefa186 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -141,10 +141,9 @@ extern __u32 cpu_caps_set[NCAPINTS];
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
143#define cpu_data(cpu) per_cpu(cpu_info, cpu) 143#define cpu_data(cpu) per_cpu(cpu_info, cpu)
144#define current_cpu_data __get_cpu_var(cpu_info)
145#else 144#else
145#define cpu_info boot_cpu_data
146#define cpu_data(cpu) boot_cpu_data 146#define cpu_data(cpu) boot_cpu_data
147#define current_cpu_data boot_cpu_data
148#endif 147#endif
149 148
150extern const struct seq_operations cpuinfo_op; 149extern const struct seq_operations cpuinfo_op;
@@ -762,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
762extern void init_c1e_mask(void); 761extern void init_c1e_mask(void);
763 762
764extern unsigned long boot_option_idle_override; 763extern unsigned long boot_option_idle_override;
765extern unsigned long idle_halt;
766extern unsigned long idle_nomwait;
767extern bool c1e_detected; 764extern bool c1e_detected;
768 765
766enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
767 IDLE_POLL, IDLE_FORCE_MWAIT};
768
769extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
770extern int sysenter_setup(void); 770extern int sysenter_setup(void);
771 771
@@ -902,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
902/* 902/*
903 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 903 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
904 * This is necessary to guarantee that the entire "struct pt_regs" 904 * This is necessary to guarantee that the entire "struct pt_regs"
905 * is accessable even if the CPU haven't stored the SS/ESP registers 905 * is accessible even if the CPU haven't stored the SS/ESP registers
906 * on the stack (interrupt gate does not save these registers 906 * on the stack (interrupt gate does not save these registers
907 * when switching to the same priv ring). 907 * when switching to the same priv ring).
908 * Therefore beware: accessing the ss/esp fields of the 908 * Therefore beware: accessing the ss/esp fields of the
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..b4ec95f07518
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1 @@
/* dummy prom.h; here to make linux/of.h's #includes happy */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
47 INTERCEPT_MONITOR, 47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT, 48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND, 49 INTERCEPT_MWAIT_COND,
50 INTERCEPT_XSETBV,
50}; 51};
51 52
52 53
53struct __attribute__ ((__packed__)) vmcb_control_area { 54struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read; 55 u32 intercept_cr;
55 u16 intercept_cr_write; 56 u32 intercept_dr;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 57 u32 intercept_exceptions;
59 u64 intercept; 58 u64 intercept;
60 u8 reserved_1[42]; 59 u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 80 u32 event_inj_err;
82 u64 nested_cr3; 81 u64 nested_cr3;
83 u64 lbr_ctl; 82 u64 lbr_ctl;
84 u64 reserved_5; 83 u32 clean;
84 u32 reserved_5;
85 u64 next_rip; 85 u64 next_rip;
86 u8 reserved_6[816]; 86 u8 insn_len;
87 u8 insn_bytes[15];
88 u8 reserved_6[800];
87}; 89};
88 90
89 91
90#define TLB_CONTROL_DO_NOTHING 0 92#define TLB_CONTROL_DO_NOTHING 0
91#define TLB_CONTROL_FLUSH_ALL_ASID 1 93#define TLB_CONTROL_FLUSH_ALL_ASID 1
94#define TLB_CONTROL_FLUSH_ASID 3
95#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
92 96
93#define V_TPR_MASK 0x0f 97#define V_TPR_MASK 0x0f
94 98
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
204#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK 208#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
205#define SVM_SELECTOR_CODE_MASK (1 << 3) 209#define SVM_SELECTOR_CODE_MASK (1 << 3)
206 210
207#define INTERCEPT_CR0_MASK 1 211#define INTERCEPT_CR0_READ 0
208#define INTERCEPT_CR3_MASK (1 << 3) 212#define INTERCEPT_CR3_READ 3
209#define INTERCEPT_CR4_MASK (1 << 4) 213#define INTERCEPT_CR4_READ 4
210#define INTERCEPT_CR8_MASK (1 << 8) 214#define INTERCEPT_CR8_READ 8
211 215#define INTERCEPT_CR0_WRITE (16 + 0)
212#define INTERCEPT_DR0_MASK 1 216#define INTERCEPT_CR3_WRITE (16 + 3)
213#define INTERCEPT_DR1_MASK (1 << 1) 217#define INTERCEPT_CR4_WRITE (16 + 4)
214#define INTERCEPT_DR2_MASK (1 << 2) 218#define INTERCEPT_CR8_WRITE (16 + 8)
215#define INTERCEPT_DR3_MASK (1 << 3) 219
216#define INTERCEPT_DR4_MASK (1 << 4) 220#define INTERCEPT_DR0_READ 0
217#define INTERCEPT_DR5_MASK (1 << 5) 221#define INTERCEPT_DR1_READ 1
218#define INTERCEPT_DR6_MASK (1 << 6) 222#define INTERCEPT_DR2_READ 2
219#define INTERCEPT_DR7_MASK (1 << 7) 223#define INTERCEPT_DR3_READ 3
224#define INTERCEPT_DR4_READ 4
225#define INTERCEPT_DR5_READ 5
226#define INTERCEPT_DR6_READ 6
227#define INTERCEPT_DR7_READ 7
228#define INTERCEPT_DR0_WRITE (16 + 0)
229#define INTERCEPT_DR1_WRITE (16 + 1)
230#define INTERCEPT_DR2_WRITE (16 + 2)
231#define INTERCEPT_DR3_WRITE (16 + 3)
232#define INTERCEPT_DR4_WRITE (16 + 4)
233#define INTERCEPT_DR5_WRITE (16 + 5)
234#define INTERCEPT_DR6_WRITE (16 + 6)
235#define INTERCEPT_DR7_WRITE (16 + 7)
220 236
221#define SVM_EVTINJ_VEC_MASK 0xff 237#define SVM_EVTINJ_VEC_MASK 0xff
222 238
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 262#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 263#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
248 264
265#define SVM_EXITINFO_REG_MASK 0x0F
266
249#define SVM_EXIT_READ_CR0 0x000 267#define SVM_EXIT_READ_CR0 0x000
250#define SVM_EXIT_READ_CR3 0x003 268#define SVM_EXIT_READ_CR3 0x003
251#define SVM_EXIT_READ_CR4 0x004 269#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
316#define SVM_EXIT_MONITOR 0x08a 334#define SVM_EXIT_MONITOR 0x08a
317#define SVM_EXIT_MWAIT 0x08b 335#define SVM_EXIT_MWAIT 0x08b
318#define SVM_EXIT_MWAIT_COND 0x08c 336#define SVM_EXIT_MWAIT_COND 0x08c
337#define SVM_EXIT_XSETBV 0x08d
319#define SVM_EXIT_NPF 0x400 338#define SVM_EXIT_NPF 0x400
320 339
321#define SVM_EXIT_ERR -1 340#define SVM_EXIT_ERR -1
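The svm.h hunk above folds the four 16-bit CR/DR intercept words into two 32-bit vectors and turns the old *_MASK macros into plain bit positions: reads occupy bits 0-15 and the matching writes sit 16 bits higher. The standalone sketch below shows how code would set and test bits in the combined field; the constants are copied from the patched header, while the struct and helper names are illustrative only, not the kernel's own accessors.

/* Minimal sketch of the new combined CR-intercept layout: read
 * intercepts live in bits 0-15, write intercepts in bits 16-31.
 * Constants are copied from the patched svm.h; the helper names
 * are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define INTERCEPT_CR0_READ   0
#define INTERCEPT_CR3_READ   3
#define INTERCEPT_CR0_WRITE  (16 + 0)
#define INTERCEPT_CR3_WRITE  (16 + 3)

struct vmcb_control_sketch {
	uint32_t intercept_cr;	/* replaces intercept_cr_read/_write */
	uint32_t intercept_dr;	/* replaces intercept_dr_read/_write */
};

static void set_cr_intercept(struct vmcb_control_sketch *c, int bit)
{
	c->intercept_cr |= 1U << bit;
}

static int is_cr_intercepted(const struct vmcb_control_sketch *c, int bit)
{
	return (c->intercept_cr >> bit) & 1;
}

int main(void)
{
	struct vmcb_control_sketch c = { 0, 0 };

	set_cr_intercept(&c, INTERCEPT_CR0_WRITE);
	set_cr_intercept(&c, INTERCEPT_CR3_READ);

	printf("intercept_cr = %#010x\n", c.intercept_cr);
	printf("CR0 write intercepted: %d\n",
	       is_cr_intercepted(&c, INTERCEPT_CR0_WRITE));
	printf("CR0 read intercepted:  %d\n",
	       is_cr_intercepted(&c, INTERCEPT_CR0_READ));
	return 0;
}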
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
30asmlinkage void stack_segment(void); 30asmlinkage void stack_segment(void);
31asmlinkage void general_protection(void); 31asmlinkage void general_protection(void);
32asmlinkage void page_fault(void); 32asmlinkage void page_fault(void);
33asmlinkage void async_page_fault(void);
33asmlinkage void spurious_interrupt_bug(void); 34asmlinkage void spurious_interrupt_bug(void);
34asmlinkage void coprocessor_error(void); 35asmlinkage void coprocessor_error(void);
35asmlinkage void alignment_check(void); 36asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
66#define PIN_BASED_NMI_EXITING 0x00000008 66#define PIN_BASED_NMI_EXITING 0x00000008
67#define PIN_BASED_VIRTUAL_NMIS 0x00000020 67#define PIN_BASED_VIRTUAL_NMIS 0x00000020
68 68
69#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
69#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 70#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
71#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
70#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 72#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
71#define VM_EXIT_SAVE_IA32_PAT 0x00040000 73#define VM_EXIT_SAVE_IA32_PAT 0x00040000
72#define VM_EXIT_LOAD_IA32_PAT 0x00080000 74#define VM_EXIT_LOAD_IA32_PAT 0x00080000
75#define VM_EXIT_SAVE_IA32_EFER 0x00100000
76#define VM_EXIT_LOAD_IA32_EFER 0x00200000
77#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
73 78
79#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
74#define VM_ENTRY_IA32E_MODE 0x00000200 80#define VM_ENTRY_IA32E_MODE 0x00000200
75#define VM_ENTRY_SMM 0x00000400 81#define VM_ENTRY_SMM 0x00000400
76#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 82#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
83#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
77#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 84#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
85#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
78 86
79/* VMCS Encodings */ 87/* VMCS Encodings */
80enum vmcs_field { 88enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
239#define EXIT_REASON_TASK_SWITCH 9 247#define EXIT_REASON_TASK_SWITCH 9
240#define EXIT_REASON_CPUID 10 248#define EXIT_REASON_CPUID 10
241#define EXIT_REASON_HLT 12 249#define EXIT_REASON_HLT 12
250#define EXIT_REASON_INVD 13
242#define EXIT_REASON_INVLPG 14 251#define EXIT_REASON_INVLPG 14
243#define EXIT_REASON_RDPMC 15 252#define EXIT_REASON_RDPMC 15
244#define EXIT_REASON_RDTSC 16 253#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
296#define GUEST_INTR_STATE_SMI 0x00000004 305#define GUEST_INTR_STATE_SMI 0x00000004
297#define GUEST_INTR_STATE_NMI 0x00000008 306#define GUEST_INTR_STATE_NMI 0x00000008
298 307
308/* GUEST_ACTIVITY_STATE flags */
309#define GUEST_ACTIVITY_ACTIVE 0
310#define GUEST_ACTIVITY_HLT 1
311#define GUEST_ACTIVITY_SHUTDOWN 2
312#define GUEST_ACTIVITY_WAIT_SIPI 3
313
299/* 314/*
300 * Exit Qualifications for MOV for Control Register Access 315 * Exit Qualifications for MOV for Control Register Access
301 */ 316 */
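The vmx.h additions above define extra VM-entry/VM-exit control bits (debug controls, IA32_EFER and IA32_PERF_GLOBAL_CTRL load/save, the preemption timer) plus the guest activity states reported in the VMCS GUEST_ACTIVITY_STATE field. The short standalone sketch below only exercises the new constants, composing an entry-controls mask and decoding an activity state; the constant values are taken from the patch, everything else (the decode helper, the demo program) is illustrative.

/* Demo of the control bits and activity states added to vmx.h.
 * Values are copied from the patch; the rest is a sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define VM_ENTRY_LOAD_DEBUG_CONTROLS        0x00000002
#define VM_ENTRY_IA32E_MODE                 0x00000200
#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_EFER             0x00008000

#define GUEST_ACTIVITY_ACTIVE    0
#define GUEST_ACTIVITY_HLT       1
#define GUEST_ACTIVITY_SHUTDOWN  2
#define GUEST_ACTIVITY_WAIT_SIPI 3

static const char *activity_name(uint32_t state)
{
	switch (state) {
	case GUEST_ACTIVITY_ACTIVE:    return "active";
	case GUEST_ACTIVITY_HLT:       return "hlt";
	case GUEST_ACTIVITY_SHUTDOWN:  return "shutdown";
	case GUEST_ACTIVITY_WAIT_SIPI: return "wait-for-SIPI";
	default:                       return "unknown";
	}
}

int main(void)
{
	/* a 64-bit guest that wants EFER switched atomically on entry */
	uint32_t entry_ctls = VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER;

	printf("entry controls    = %#010x\n", entry_ctls);
	printf("guest activity 1  = %s\n", activity_name(GUEST_ACTIVITY_HLT));
	return 0;
}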
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 396ff4cc8ed4..66d0fff1ee84 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,4 +37,39 @@
37extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40#include <asm/processor.h>
41
42static inline uint32_t xen_cpuid_base(void)
43{
44 uint32_t base, eax, ebx, ecx, edx;
45 char signature[13];
46
47 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
48 cpuid(base, &eax, &ebx, &ecx, &edx);
49 *(uint32_t *)(signature + 0) = ebx;
50 *(uint32_t *)(signature + 4) = ecx;
51 *(uint32_t *)(signature + 8) = edx;
52 signature[12] = 0;
53
54 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
55 return base;
56 }
57
58 return 0;
59}
60
61#ifdef CONFIG_XEN
62extern bool xen_hvm_need_lapic(void);
63
64static inline bool xen_x2apic_para_available(void)
65{
66 return xen_hvm_need_lapic();
67}
68#else
69static inline bool xen_x2apic_para_available(void)
70{
71 return (xen_cpuid_base() != 0);
72}
73#endif
74
40#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 75#endif /* _ASM_X86_XEN_HYPERVISOR_H */
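xen_cpuid_base() above scans the hypervisor CPUID leaves from 0x40000000 upward in steps of 0x100, rebuilds the 12-byte vendor signature from EBX/ECX/EDX, and only accepts a base leaf that advertises at least two further leaves. The sketch below redoes the same probe from userspace with GCC's <cpuid.h>; it assumes an x86 machine and a GCC/Clang toolchain, and is meant only to illustrate the detection logic, not to replace the in-kernel helper.

/* Userspace re-implementation of the xen_cpuid_base() probe shown
 * above, using GCC's <cpuid.h> instead of the kernel's cpuid().
 */
#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t probe_xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char signature[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(signature + 0, &ebx, 4);
		memcpy(signature + 4, &ecx, 4);
		memcpy(signature + 8, &edx, 4);
		signature[12] = '\0';

		/* eax reports the highest leaf in this block; require at
		 * least two extra leaves, as the kernel helper does. */
		if (!strcmp(signature, "XenVMMXenVMM") && (eax - base) >= 2)
			return base;
	}
	return 0;
}

int main(void)
{
	uint32_t base = probe_xen_cpuid_base();

	if (base)
		printf("Xen detected, CPUID base leaf %#x\n", base);
	else
		printf("not running on Xen (or signature hidden)\n");
	return 0;
}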
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21c..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
42extern unsigned long get_phys_to_machine(unsigned long pfn); 42extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
44 44
45extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49
45static inline unsigned long pfn_to_mfn(unsigned long pfn) 50static inline unsigned long pfn_to_mfn(unsigned long pfn)
46{ 51{
47 unsigned long mfn; 52 unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
72 if (xen_feature(XENFEAT_auto_translated_physmap)) 77 if (xen_feature(XENFEAT_auto_translated_physmap))
73 return mfn; 78 return mfn;
74 79
75 if (unlikely((mfn >> machine_to_phys_order) != 0))
76 return ~0;
77
78 pfn = 0; 80 pfn = 0;
79 /* 81 /*
80 * The array access can fail (e.g., device space beyond end of RAM). 82 * The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
83 */ 85 */
84 __get_user(pfn, &machine_to_phys_mapping[mfn]); 86 __get_user(pfn, &machine_to_phys_mapping[mfn]);
85 87
88 /*
89 * If this appears to be a foreign mfn (because the pfn
90 * doesn't map back to the mfn), then check the local override
91 * table to see if there's a better pfn to use.
92 */
93 if (get_phys_to_machine(pfn) != mfn)
94 pfn = m2p_find_override_pfn(mfn, pfn);
95
86 return pfn; 96 return pfn;
87} 97}
88 98
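The xen/page.h change makes mfn_to_pfn() detect foreign machine frames by round-tripping the looked-up pfn through the p2m table: if it does not map back to the original mfn, the new m2p override table is consulted instead. The toy program below models only that consistency check with tiny in-memory arrays; the table contents and helper names are invented for illustration and are not the real m2p_* API.

/* Toy model of the round-trip check added to mfn_to_pfn(): if the pfn
 * read from the m2p table does not map back to the same mfn in the
 * p2m, fall back to a local override table.  Everything here is a
 * simplified stand-in.
 */
#include <stdio.h>

#define NPAGES  8
#define INVALID 0xffU

/* toy machine->pseudo-physical and pseudo-physical->machine tables */
static const unsigned m2p[NPAGES] = { 0, 1, 2, 3, 4, 5, 0, 7 };
static const unsigned p2m[NPAGES] = { 0, 1, 2, 3, 4, 5, 6, 7 };
/* override: mfn -> locally granted pfn (INVALID means no override) */
static const unsigned m2p_override[NPAGES] = {
	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, 3, INVALID
};

static unsigned toy_mfn_to_pfn(unsigned mfn)
{
	unsigned pfn = m2p[mfn];

	/* foreign page: p2m does not map back, so prefer the override */
	if (p2m[pfn] != mfn && m2p_override[mfn] != INVALID)
		pfn = m2p_override[mfn];
	return pfn;
}

int main(void)
{
	printf("mfn 2 -> pfn %u (mapping round-trips, no override)\n",
	       toy_mfn_to_pfn(2));
	printf("mfn 6 -> pfn %u (foreign frame, pfn from override table)\n",
	       toy_mfn_to_pfn(6));
	return 0;
}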
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ec881c6bfee0..b3a71137983a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -509,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
509 509
510 return 0; 510 return 0;
511} 511}
512EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
512 513
513int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 514int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
514{ 515{
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d2fdb0826df2..57ca77787220 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1086 1086
1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1088 1088
1089 /* Intialize the exclusion range if necessary */ 1089 /* Initialize the exclusion range if necessary */
1090 for_each_iommu(iommu) { 1090 for_each_iommu(iommu) {
1091 if (iommu->exclusion_start && 1091 if (iommu->exclusion_start &&
1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1353 1353
1354/* 1354/*
1355 * Allocates a new protection domain usable for the dma_ops functions. 1355 * Allocates a new protection domain usable for the dma_ops functions.
1356 * It also intializes the page table and the address allocator data 1356 * It also initializes the page table and the address allocator data
1357 * structures required for the dma_ops interface 1357 * structures required for the dma_ops interface
1358 */ 1358 */
1359static struct dma_ops_domain *dma_ops_domain_alloc(void) 1359static struct dma_ops_domain *dma_ops_domain_alloc(void)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1efd3789e3d4..06c196d7e59c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,8 +49,8 @@
49#include <asm/mtrr.h> 49#include <asm/mtrr.h>
50#include <asm/smp.h> 50#include <asm/smp.h>
51#include <asm/mce.h> 51#include <asm/mce.h>
52#include <asm/kvm_para.h>
53#include <asm/tsc.h> 52#include <asm/tsc.h>
53#include <asm/hypervisor.h>
54 54
55unsigned int num_processors; 55unsigned int num_processors;
56 56
@@ -516,7 +516,7 @@ static void __cpuinit setup_APIC_timer(void)
516{ 516{
517 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 517 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
518 518
519 if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) { 519 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
520 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 520 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
521 /* Make LAPIC timer preferable over percpu HPET */ 521 /* Make LAPIC timer preferable over percpu HPET */
522 lapic_clockevent.rating = 150; 522 lapic_clockevent.rating = 150;
@@ -1476,7 +1476,8 @@ void __init enable_IR_x2apic(void)
1476 /* IR is required if there is APIC ID > 255 even when running 1476 /* IR is required if there is APIC ID > 255 even when running
1477 * under KVM 1477 * under KVM
1478 */ 1478 */
1479 if (max_physical_apicid > 255 || !kvm_para_available()) 1479 if (max_physical_apicid > 255 ||
1480 !hypervisor_x2apic_available())
1480 goto nox2apic; 1481 goto nox2apic;
1481 /* 1482 /*
1482 * without IR all CPUs can be addressed by IOAPIC/MSI 1483 * without IR all CPUs can be addressed by IOAPIC/MSI
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 72ec29e1ae06..79fd43ca6f96 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -68,7 +68,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
68 68
69 switch (cmd) { 69 switch (cmd) {
70 case DIE_NMI: 70 case DIE_NMI:
71 case DIE_NMI_IPI:
72 break; 71 break;
73 72
74 default: 73 default:
@@ -96,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
96static __read_mostly struct notifier_block backtrace_notifier = { 95static __read_mostly struct notifier_block backtrace_notifier = {
97 .notifier_call = arch_trigger_all_cpu_backtrace_handler, 96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
98 .next = NULL, 97 .next = NULL,
99 .priority = 1 98 .priority = NMI_LOCAL_LOW_PRIOR,
100}; 99};
101 100
102static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
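Together with the asm/nmi.h rework listed in the diffstat, the hw_nmi.c hunk drops the DIE_NMI_IPI case and replaces the magic priority number with the symbolic NMI_LOCAL_LOW_PRIOR. A hypothetical die-notifier written against the updated API might look roughly like the module sketch below; it assumes a 2.6.38-era tree and is a non-authoritative illustration of the registration pattern, not production code.

/* Sketch of an NMI die-notifier after this series: no DIE_NMI_IPI
 * case, symbolic priority from <asm/nmi.h>.  Module boilerplate is
 * illustrative only.
 */
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>
#include <asm/nmi.h>

static unsigned long demo_nmi_seen;

static int demo_nmi_notify(struct notifier_block *self,
			   unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;

	/* keep the NMI path trivial: just count, no printk here */
	demo_nmi_seen++;
	return NOTIFY_DONE;	/* let lower-priority handlers run too */
}

static struct notifier_block demo_nmi_nb = {
	.notifier_call = demo_nmi_notify,
	.priority      = NMI_LOCAL_LOW_PRIOR,	/* after the PMU etc. */
};

static int __init demo_init(void)
{
	return register_die_notifier(&demo_nmi_nb);
}

static void __exit demo_exit(void)
{
	unregister_die_notifier(&demo_nmi_nb);
	pr_info("demo: %lu NMIs observed\n", demo_nmi_seen);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");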
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 52735a710c30..697dc34b7b87 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2329,7 +2329,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2329 unsigned int irr; 2329 unsigned int irr;
2330 struct irq_desc *desc; 2330 struct irq_desc *desc;
2331 struct irq_cfg *cfg; 2331 struct irq_cfg *cfg;
2332 irq = __get_cpu_var(vector_irq)[vector]; 2332 irq = __this_cpu_read(vector_irq[vector]);
2333 2333
2334 if (irq == -1) 2334 if (irq == -1)
2335 continue; 2335 continue;
@@ -2363,7 +2363,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2363 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); 2363 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2364 goto unlock; 2364 goto unlock;
2365 } 2365 }
2366 __get_cpu_var(vector_irq)[vector] = -1; 2366 __this_cpu_write(vector_irq[vector], -1);
2367unlock: 2367unlock:
2368 raw_spin_unlock(&desc->lock); 2368 raw_spin_unlock(&desc->lock);
2369 } 2369 }
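Most of the remaining arch/x86 hunks in this series are mechanical conversions from the __get_cpu_var() lvalue to the __this_cpu_read()/__this_cpu_write() (and _or/_and/_sub/_inc) operations, which the asm/percpu.h changes let the compiler emit as single %gs-relative instructions instead of first materialising a per-CPU pointer. Below is a small kernel-style sketch of the before/after pattern with invented variable names; it is only a comparison aid, not code from the series.

/* Per-CPU access pattern before and after this series.  Names are
 * invented; both functions exist only to contrast the two styles.
 */
#include <linux/percpu.h>

struct demo_state {
	int enabled;
	unsigned long count;
};

static DEFINE_PER_CPU(struct demo_state, demo_state);

static void demo_old_style(void)
{
	struct demo_state *s = &__get_cpu_var(demo_state);	/* pointer first */

	if (s->enabled)
		s->count++;
}

static void demo_new_style(void)
{
	if (__this_cpu_read(demo_state.enabled))	/* one %gs-relative insn */
		__this_cpu_inc(demo_state.count);
}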
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f4f9e95aa151..bd16b58b8850 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -120,8 +120,8 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
120 else if (!strcmp(oem_table_id, "UVX")) 120 else if (!strcmp(oem_table_id, "UVX"))
121 uv_system_type = UV_X2APIC; 121 uv_system_type = UV_X2APIC;
122 else if (!strcmp(oem_table_id, "UVH")) { 122 else if (!strcmp(oem_table_id, "UVH")) {
123 __get_cpu_var(x2apic_extra_bits) = 123 __this_cpu_write(x2apic_extra_bits,
124 pnodeid << uvh_apicid.s.pnode_shift; 124 pnodeid << uvh_apicid.s.pnode_shift);
125 uv_system_type = UV_NON_UNIQUE_APIC; 125 uv_system_type = UV_NON_UNIQUE_APIC;
126 uv_set_apicid_hibit(); 126 uv_set_apicid_hibit();
127 return 1; 127 return 1;
@@ -286,7 +286,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
286 unsigned int id; 286 unsigned int id;
287 287
288 WARN_ON(preemptible() && num_online_cpus() > 1); 288 WARN_ON(preemptible() && num_online_cpus() > 1);
289 id = x | __get_cpu_var(x2apic_extra_bits); 289 id = x | __this_cpu_read(x2apic_extra_bits);
290 290
291 return id; 291 return id;
292} 292}
@@ -378,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = {
378 378
379static __cpuinit void set_x2apic_extra_bits(int pnode) 379static __cpuinit void set_x2apic_extra_bits(int pnode)
380{ 380{
381 __get_cpu_var(x2apic_extra_bits) = (pnode << uvh_apicid.s.pnode_shift); 381 __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
382} 382}
383 383
384/* 384/*
@@ -641,7 +641,7 @@ void __cpuinit uv_cpu_init(void)
641 */ 641 */
642int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 642int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
643{ 643{
644 if (reason != DIE_NMI_IPI) 644 if (reason != DIE_NMIUNKNOWN)
645 return NOTIFY_OK; 645 return NOTIFY_OK;
646 646
647 if (in_crash_kexec) 647 if (in_crash_kexec)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9e093f8fe78c..7c7bedb83c5a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -668,7 +668,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383);
668 668
669bool cpu_has_amd_erratum(const int *erratum) 669bool cpu_has_amd_erratum(const int *erratum)
670{ 670{
671 struct cpuinfo_x86 *cpu = &current_cpu_data; 671 struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
672 int osvw_id = *erratum++; 672 int osvw_id = *erratum++;
673 u32 range; 673 u32 range;
674 u32 ms; 674 u32 ms;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 491977baf6c0..35c7e65e59be 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -521,7 +521,7 @@ static void check_supported_cpu(void *_rc)
521 521
522 *rc = -ENODEV; 522 *rc = -ENODEV;
523 523
524 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) 524 if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
525 return; 525 return;
526 526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
@@ -1377,7 +1377,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1377static void query_values_on_cpu(void *_err) 1377static void query_values_on_cpu(void *_err)
1378{ 1378{
1379 int *err = _err; 1379 int *err = _err;
1380 struct powernow_k8_data *data = __get_cpu_var(powernow_data); 1380 struct powernow_k8_data *data = __this_cpu_read(powernow_data);
1381 1381
1382 *err = query_current_values_with_pending_wait(data); 1382 *err = query_current_values_with_pending_wait(data);
1383} 1383}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 9ecf81f9b90f..7283e98deaae 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -265,7 +265,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
265 line_size = l2.line_size; 265 line_size = l2.line_size;
266 lines_per_tag = l2.lines_per_tag; 266 lines_per_tag = l2.lines_per_tag;
267 /* cpu_data has errata corrections for K7 applied */ 267 /* cpu_data has errata corrections for K7 applied */
268 size_in_kb = current_cpu_data.x86_cache_size; 268 size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
269 break; 269 break;
270 case 3: 270 case 3:
271 if (!l3.val) 271 if (!l3.val)
@@ -287,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
287 eax->split.type = types[leaf]; 287 eax->split.type = types[leaf];
288 eax->split.level = levels[leaf]; 288 eax->split.level = levels[leaf];
289 eax->split.num_threads_sharing = 0; 289 eax->split.num_threads_sharing = 0;
290 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 290 eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
291 291
292 292
293 if (assoc == 0xffff) 293 if (assoc == 0xffff)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..a77971979564 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <asm/mce.h> 26#include <asm/mce.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
28#include <asm/nmi.h>
28 29
29/* Update fake mce registers on current CPU. */ 30/* Update fake mce registers on current CPU. */
30static void inject_mce(struct mce *m) 31static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
83 struct die_args *args = (struct die_args *)data; 84 struct die_args *args = (struct die_args *)data;
84 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
85 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
87 return NOTIFY_DONE; 88 return NOTIFY_DONE;
88 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
89 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
95 96
96static struct notifier_block mce_raise_nb = { 97static struct notifier_block mce_raise_nb = {
97 .notifier_call = mce_raise_notify, 98 .notifier_call = mce_raise_notify,
98 .priority = 1000, 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
99}; 100};
100 101
101/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a35b72d7c03..d916183b7f9c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -326,7 +326,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
326 326
327static int msr_to_offset(u32 msr) 327static int msr_to_offset(u32 msr)
328{ 328{
329 unsigned bank = __get_cpu_var(injectm.bank); 329 unsigned bank = __this_cpu_read(injectm.bank);
330 330
331 if (msr == rip_msr) 331 if (msr == rip_msr)
332 return offsetof(struct mce, ip); 332 return offsetof(struct mce, ip);
@@ -346,7 +346,7 @@ static u64 mce_rdmsrl(u32 msr)
346{ 346{
347 u64 v; 347 u64 v;
348 348
349 if (__get_cpu_var(injectm).finished) { 349 if (__this_cpu_read(injectm.finished)) {
350 int offset = msr_to_offset(msr); 350 int offset = msr_to_offset(msr);
351 351
352 if (offset < 0) 352 if (offset < 0)
@@ -369,7 +369,7 @@ static u64 mce_rdmsrl(u32 msr)
369 369
370static void mce_wrmsrl(u32 msr, u64 v) 370static void mce_wrmsrl(u32 msr, u64 v)
371{ 371{
372 if (__get_cpu_var(injectm).finished) { 372 if (__this_cpu_read(injectm.finished)) {
373 int offset = msr_to_offset(msr); 373 int offset = msr_to_offset(msr);
374 374
375 if (offset >= 0) 375 if (offset >= 0)
@@ -1159,7 +1159,7 @@ static void mce_start_timer(unsigned long data)
1159 1159
1160 WARN_ON(smp_processor_id() != data); 1160 WARN_ON(smp_processor_id() != data);
1161 1161
1162 if (mce_available(&current_cpu_data)) { 1162 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1163 machine_check_poll(MCP_TIMESTAMP, 1163 machine_check_poll(MCP_TIMESTAMP,
1164 &__get_cpu_var(mce_poll_banks)); 1164 &__get_cpu_var(mce_poll_banks));
1165 } 1165 }
@@ -1767,7 +1767,7 @@ static int mce_shutdown(struct sys_device *dev)
1767static int mce_resume(struct sys_device *dev) 1767static int mce_resume(struct sys_device *dev)
1768{ 1768{
1769 __mcheck_cpu_init_generic(); 1769 __mcheck_cpu_init_generic();
1770 __mcheck_cpu_init_vendor(&current_cpu_data); 1770 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1771 1771
1772 return 0; 1772 return 0;
1773} 1773}
@@ -1775,7 +1775,7 @@ static int mce_resume(struct sys_device *dev)
1775static void mce_cpu_restart(void *data) 1775static void mce_cpu_restart(void *data)
1776{ 1776{
1777 del_timer_sync(&__get_cpu_var(mce_timer)); 1777 del_timer_sync(&__get_cpu_var(mce_timer));
1778 if (!mce_available(&current_cpu_data)) 1778 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1779 return; 1779 return;
1780 __mcheck_cpu_init_generic(); 1780 __mcheck_cpu_init_generic();
1781 __mcheck_cpu_init_timer(); 1781 __mcheck_cpu_init_timer();
@@ -1790,7 +1790,7 @@ static void mce_restart(void)
1790/* Toggle features for corrected errors */ 1790/* Toggle features for corrected errors */
1791static void mce_disable_ce(void *all) 1791static void mce_disable_ce(void *all)
1792{ 1792{
1793 if (!mce_available(&current_cpu_data)) 1793 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1794 return; 1794 return;
1795 if (all) 1795 if (all)
1796 del_timer_sync(&__get_cpu_var(mce_timer)); 1796 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1799,7 +1799,7 @@ static void mce_disable_ce(void *all)
1799 1799
1800static void mce_enable_ce(void *all) 1800static void mce_enable_ce(void *all)
1801{ 1801{
1802 if (!mce_available(&current_cpu_data)) 1802 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1803 return; 1803 return;
1804 cmci_reenable(); 1804 cmci_reenable();
1805 cmci_recheck(); 1805 cmci_recheck();
@@ -2022,7 +2022,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2022 unsigned long action = *(unsigned long *)h; 2022 unsigned long action = *(unsigned long *)h;
2023 int i; 2023 int i;
2024 2024
2025 if (!mce_available(&current_cpu_data)) 2025 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2026 return; 2026 return;
2027 2027
2028 if (!(action & CPU_TASKS_FROZEN)) 2028 if (!(action & CPU_TASKS_FROZEN))
@@ -2040,7 +2040,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2040 unsigned long action = *(unsigned long *)h; 2040 unsigned long action = *(unsigned long *)h;
2041 int i; 2041 int i;
2042 2042
2043 if (!mce_available(&current_cpu_data)) 2043 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2044 return; 2044 return;
2045 2045
2046 if (!(action & CPU_TASKS_FROZEN)) 2046 if (!(action & CPU_TASKS_FROZEN))
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 6fcd0936194f..8694ef56459d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ void cmci_recheck(void)
130 unsigned long flags; 130 unsigned long flags;
131 int banks; 131 int banks;
132 132
133 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks)) 133 if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
134 return; 134 return;
135 local_irq_save(flags); 135 local_irq_save(flags);
136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0a360d146596..9d977a2ea693 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -997,8 +997,7 @@ x86_perf_event_set_period(struct perf_event *event)
997 997
998static void x86_pmu_enable_event(struct perf_event *event) 998static void x86_pmu_enable_event(struct perf_event *event)
999{ 999{
1000 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1000 if (__this_cpu_read(cpu_hw_events.enabled))
1001 if (cpuc->enabled)
1002 __x86_pmu_enable_event(&event->hw, 1001 __x86_pmu_enable_event(&event->hw,
1003 ARCH_PERFMON_EVENTSEL_ENABLE); 1002 ARCH_PERFMON_EVENTSEL_ENABLE);
1004} 1003}
@@ -1268,11 +1267,10 @@ perf_event_nmi_handler(struct notifier_block *self,
1268 1267
1269 switch (cmd) { 1268 switch (cmd) {
1270 case DIE_NMI: 1269 case DIE_NMI:
1271 case DIE_NMI_IPI:
1272 break; 1270 break;
1273 case DIE_NMIUNKNOWN: 1271 case DIE_NMIUNKNOWN:
1274 this_nmi = percpu_read(irq_stat.__nmi_count); 1272 this_nmi = percpu_read(irq_stat.__nmi_count);
1275 if (this_nmi != __get_cpu_var(pmu_nmi).marked) 1273 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1276 /* let the kernel handle the unknown nmi */ 1274 /* let the kernel handle the unknown nmi */
1277 return NOTIFY_DONE; 1275 return NOTIFY_DONE;
1278 /* 1276 /*
@@ -1296,8 +1294,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1296 this_nmi = percpu_read(irq_stat.__nmi_count); 1294 this_nmi = percpu_read(irq_stat.__nmi_count);
1297 if ((handled > 1) || 1295 if ((handled > 1) ||
1298 /* the next nmi could be a back-to-back nmi */ 1296 /* the next nmi could be a back-to-back nmi */
1299 ((__get_cpu_var(pmu_nmi).marked == this_nmi) && 1297 ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1300 (__get_cpu_var(pmu_nmi).handled > 1))) { 1298 (__this_cpu_read(pmu_nmi.handled) > 1))) {
1301 /* 1299 /*
1302 * We could have two subsequent back-to-back nmis: The 1300 * We could have two subsequent back-to-back nmis: The
1303 * first handles more than one counter, the 2nd 1301 * first handles more than one counter, the 2nd
@@ -1308,8 +1306,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1308 * handling more than one counter. We will mark the 1306 * handling more than one counter. We will mark the
1309 * next (3rd) and then drop it if unhandled. 1307 * next (3rd) and then drop it if unhandled.
1310 */ 1308 */
1311 __get_cpu_var(pmu_nmi).marked = this_nmi + 1; 1309 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1312 __get_cpu_var(pmu_nmi).handled = handled; 1310 __this_cpu_write(pmu_nmi.handled, handled);
1313 } 1311 }
1314 1312
1315 return NOTIFY_STOP; 1313 return NOTIFY_STOP;
@@ -1318,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1318static __read_mostly struct notifier_block perf_event_nmi_notifier = { 1316static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1319 .notifier_call = perf_event_nmi_handler, 1317 .notifier_call = perf_event_nmi_handler,
1320 .next = NULL, 1318 .next = NULL,
1321 .priority = 1 1319 .priority = NMI_LOCAL_LOW_PRIOR,
1322}; 1320};
1323 1321
1324static struct event_constraint unconstrained; 1322static struct event_constraint unconstrained;
@@ -1484,11 +1482,9 @@ static inline void x86_pmu_read(struct perf_event *event)
1484 */ 1482 */
1485static void x86_pmu_start_txn(struct pmu *pmu) 1483static void x86_pmu_start_txn(struct pmu *pmu)
1486{ 1484{
1487 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1488
1489 perf_pmu_disable(pmu); 1485 perf_pmu_disable(pmu);
1490 cpuc->group_flag |= PERF_EVENT_TXN; 1486 __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1491 cpuc->n_txn = 0; 1487 __this_cpu_write(cpu_hw_events.n_txn, 0);
1492} 1488}
1493 1489
1494/* 1490/*
@@ -1498,14 +1494,12 @@ static void x86_pmu_start_txn(struct pmu *pmu)
1498 */ 1494 */
1499static void x86_pmu_cancel_txn(struct pmu *pmu) 1495static void x86_pmu_cancel_txn(struct pmu *pmu)
1500{ 1496{
1501 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1497 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1502
1503 cpuc->group_flag &= ~PERF_EVENT_TXN;
1504 /* 1498 /*
1505 * Truncate the collected events. 1499 * Truncate the collected events.
1506 */ 1500 */
1507 cpuc->n_added -= cpuc->n_txn; 1501 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1508 cpuc->n_events -= cpuc->n_txn; 1502 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1509 perf_pmu_enable(pmu); 1503 perf_pmu_enable(pmu);
1510} 1504}
1511 1505
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 24e390e40f2e..008835c1d79c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -649,7 +649,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
649 struct hw_perf_event *hwc = &event->hw; 649 struct hw_perf_event *hwc = &event->hw;
650 650
651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
652 if (!__get_cpu_var(cpu_hw_events).enabled) 652 if (!__this_cpu_read(cpu_hw_events.enabled))
653 return; 653 return;
654 654
655 intel_pmu_enable_bts(hwc->config); 655 intel_pmu_enable_bts(hwc->config);
@@ -679,7 +679,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event)
679 679
680static void intel_pmu_reset(void) 680static void intel_pmu_reset(void)
681{ 681{
682 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; 682 struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
683 unsigned long flags; 683 unsigned long flags;
684 int idx; 684 int idx;
685 685
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 81400b93e694..e56b9bfbabd1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,21 @@ out:
753 753
754static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 754static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
755{ 755{
756 int overflow = 0; 756 u64 v;
757 u32 low, high;
758 757
759 rdmsr(hwc->config_base + hwc->idx, low, high); 758 /* an official way for overflow indication */
760 759 rdmsrl(hwc->config_base + hwc->idx, v);
761 /* we need to check high bit for unflagged overflows */ 760 if (v & P4_CCCR_OVF) {
762 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { 761 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
763 overflow = 1; 762 return 1;
764 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
765 ((u64)low) & ~P4_CCCR_OVF);
766 } 763 }
767 764
768 return overflow; 765 /* it might be unflagged overflow */
766 rdmsrl(hwc->event_base + hwc->idx, v);
767 if (!(v & ARCH_P4_CNTRVAL_MASK))
768 return 1;
769
770 return 0;
769} 771}
770 772
771static void p4_pmu_disable_pebs(void) 773static void p4_pmu_disable_pebs(void)
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = {
1152 */ 1154 */
1153 .num_counters = ARCH_P4_MAX_CCCR, 1155 .num_counters = ARCH_P4_MAX_CCCR,
1154 .apic = 1, 1156 .apic = 1,
1155 .cntval_bits = 40, 1157 .cntval_bits = ARCH_P4_CNTRVAL_BITS,
1156 .cntval_mask = (1ULL << 40) - 1, 1158 .cntval_mask = ARCH_P4_CNTRVAL_MASK,
1157 .max_period = (1ULL << 39) - 1, 1159 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
1158 .hw_config = p4_hw_config, 1160 .hw_config = p4_hw_config,
1159 .schedule_events = p4_pmu_schedule_events, 1161 .schedule_events = p4_pmu_schedule_events,
1160 /* 1162 /*
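The rewritten p4_pmu_clear_cccr_ovf() above first trusts the CCCR's OVF flag (clearing it when set) and only then falls back to treating a counter whose 40 value bits read back as zero as an unflagged overflow. The standalone sketch below replays those two checks on plain variables; ARCH_P4_CNTRVAL_* match the definitions referenced in the hunk, and P4_CCCR_OVF is assumed to be bit 31 as in perf_event_p4.h.

/* Two-step overflow test from the rewritten p4_pmu_clear_cccr_ovf(),
 * with MSR reads/writes replaced by plain variables.
 */
#include <stdint.h>
#include <stdio.h>

#define P4_CCCR_OVF		(1ULL << 31)	/* assumed, per perf_event_p4.h */
#define ARCH_P4_CNTRVAL_BITS	40
#define ARCH_P4_CNTRVAL_MASK	((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)

static int clear_cccr_ovf(uint64_t *cccr, uint64_t cntr)
{
	if (*cccr & P4_CCCR_OVF) {
		*cccr &= ~P4_CCCR_OVF;	/* acknowledge the flagged overflow */
		return 1;
	}

	/* unflagged overflow: the 40-bit counter value reads as zero */
	if (!(cntr & ARCH_P4_CNTRVAL_MASK))
		return 1;

	return 0;
}

int main(void)
{
	uint64_t cccr = P4_CCCR_OVF;

	printf("flagged:   %d\n", clear_cccr_ovf(&cccr, 0x12345));
	printf("cccr now:  %#llx\n", (unsigned long long)cccr);
	printf("unflagged: %d\n", clear_cccr_ovf(&cccr, 0));
	printf("none:      %d\n", clear_cccr_ovf(&cccr, 0x12345));
	return 0;
}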
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 8474c998cbd4..df20723a6a1b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -197,14 +197,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 197 */
198void dump_stack(void) 198void dump_stack(void)
199{ 199{
200 unsigned long bp = 0;
201 unsigned long stack; 200 unsigned long stack;
202 201
203#ifdef CONFIG_FRAME_POINTER
204 if (!bp)
205 get_bp(bp);
206#endif
207
208 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 202 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
209 current->pid, current->comm, print_tainted(), 203 current->pid, current->comm, print_tainted(),
210 init_utsname()->release, 204 init_utsname()->release,
@@ -240,6 +234,7 @@ unsigned __kprobes long oops_begin(void)
240 bust_spinlocks(1); 234 bust_spinlocks(1);
241 return flags; 235 return flags;
242} 236}
237EXPORT_SYMBOL_GPL(oops_begin);
243 238
244void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 239void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
245{ 240{
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34d..294f26da0c0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/pfn.h> 15#include <linux/pfn.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/acpi.h>
17#include <linux/firmware-map.h> 18#include <linux/firmware-map.h>
18#include <linux/memblock.h> 19#include <linux/memblock.h>
19 20
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
1406 CFI_ENDPROC 1406 CFI_ENDPROC
1407END(general_protection) 1407END(general_protection)
1408 1408
1409#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault)
1411 RING0_EC_FRAME
1412 pushl $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code
1415 CFI_ENDPROC
1416END(async_page_fault)
1417#endif
1418
1409/* 1419/*
1410 * End of kprobes section 1420 * End of kprobes section
1411 */ 1421 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e3ba417e8697..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -299,17 +299,21 @@ ENDPROC(native_usergs_sysret64)
299ENTRY(save_args) 299ENTRY(save_args)
300 XCPT_FRAME 300 XCPT_FRAME
301 cld 301 cld
302 movq_cfi rdi, RDI+16-ARGOFFSET 302 /*
303 movq_cfi rsi, RSI+16-ARGOFFSET 303 * start from rbp in pt_regs and jump over
304 movq_cfi rdx, RDX+16-ARGOFFSET 304 * return address.
305 movq_cfi rcx, RCX+16-ARGOFFSET 305 */
306 movq_cfi rax, RAX+16-ARGOFFSET 306 movq_cfi rdi, RDI+8-RBP
307 movq_cfi r8, R8+16-ARGOFFSET 307 movq_cfi rsi, RSI+8-RBP
308 movq_cfi r9, R9+16-ARGOFFSET 308 movq_cfi rdx, RDX+8-RBP
309 movq_cfi r10, R10+16-ARGOFFSET 309 movq_cfi rcx, RCX+8-RBP
310 movq_cfi r11, R11+16-ARGOFFSET 310 movq_cfi rax, RAX+8-RBP
311 311 movq_cfi r8, R8+8-RBP
312 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ 312 movq_cfi r9, R9+8-RBP
313 movq_cfi r10, R10+8-RBP
314 movq_cfi r11, R11+8-RBP
315
316 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
313 movq_cfi rbp, 8 /* push %rbp */ 317 movq_cfi rbp, 8 /* push %rbp */
314 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 318 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
315 testl $3, CS(%rdi) 319 testl $3, CS(%rdi)
@@ -782,8 +786,9 @@ END(interrupt)
782 786
783/* 0(%rsp): ~(interrupt number) */ 787/* 0(%rsp): ~(interrupt number) */
784 .macro interrupt func 788 .macro interrupt func
785 subq $ORIG_RAX-ARGOFFSET+8, %rsp 789 /* reserve pt_regs for scratch regs and rbp */
786 CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 790 subq $ORIG_RAX-RBP, %rsp
791 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
787 call save_args 792 call save_args
788 PARTIAL_FRAME 0 793 PARTIAL_FRAME 0
789 call \func 794 call \func
@@ -808,9 +813,14 @@ ret_from_intr:
808 TRACE_IRQS_OFF 813 TRACE_IRQS_OFF
809 decl PER_CPU_VAR(irq_count) 814 decl PER_CPU_VAR(irq_count)
810 leaveq 815 leaveq
816
811 CFI_RESTORE rbp 817 CFI_RESTORE rbp
812 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
813 CFI_ADJUST_CFA_OFFSET -8 819 CFI_ADJUST_CFA_OFFSET -8
820
821 /* we did not save rbx, restore only from ARGOFFSET */
822 addq $8, %rsp
823 CFI_ADJUST_CFA_OFFSET -8
814exit_intr: 824exit_intr:
815 GET_THREAD_INFO(%rcx) 825 GET_THREAD_INFO(%rcx)
816 testl $3,CS-ARGOFFSET(%rsp) 826 testl $3,CS-ARGOFFSET(%rsp)
@@ -1319,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
1319#endif 1329#endif
1320errorentry general_protection do_general_protection 1330errorentry general_protection do_general_protection
1321errorentry page_fault do_page_fault 1331errorentry page_fault do_page_fault
1332#ifdef CONFIG_KVM_GUEST
1333errorentry async_page_fault do_async_page_fault
1334#endif
1322#ifdef CONFIG_X86_MCE 1335#ifdef CONFIG_X86_MCE
1323paranoidzeroentry machine_check *machine_check_vector(%rip) 1336paranoidzeroentry machine_check *machine_check_vector(%rip)
1324#endif 1337#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 298448656b60..382eb2936d4d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -170,9 +170,9 @@ static void ftrace_mod_code(void)
170 170
171void ftrace_nmi_enter(void) 171void ftrace_nmi_enter(void)
172{ 172{
173 __get_cpu_var(save_modifying_code) = modifying_code; 173 __this_cpu_write(save_modifying_code, modifying_code);
174 174
175 if (!__get_cpu_var(save_modifying_code)) 175 if (!__this_cpu_read(save_modifying_code))
176 return; 176 return;
177 177
178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
@@ -186,7 +186,7 @@ void ftrace_nmi_enter(void)
186 186
187void ftrace_nmi_exit(void) 187void ftrace_nmi_exit(void)
188{ 188{
189 if (!__get_cpu_var(save_modifying_code)) 189 if (!__this_cpu_read(save_modifying_code))
190 return; 190 return;
191 191
192 /* Finish all executions before clearing nmi_running */ 192 /* Finish all executions before clearing nmi_running */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 9f54b209c378..fc293dc8dc35 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -126,7 +126,7 @@ ENTRY(startup_32)
126 movsl 126 movsl
127 movl pa(boot_params) + NEW_CL_POINTER,%esi 127 movl pa(boot_params) + NEW_CL_POINTER,%esi
128 andl %esi,%esi 128 andl %esi,%esi
129 jz 1f # No comand line 129 jz 1f # No command line
130 movl $pa(boot_command_line),%edi 130 movl $pa(boot_command_line),%edi
131 movl $(COMMAND_LINE_SIZE/4),%ecx 131 movl $(COMMAND_LINE_SIZE/4),%ecx
132 rep 132 rep
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 42c594254507..02f07634d265 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
122 return -EBUSY; 122 return -EBUSY;
123 123
124 set_debugreg(info->address, i); 124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address; 125 __this_cpu_write(cpu_debugreg[i], info->address);
126 126
127 dr7 = &__get_cpu_var(cpu_dr7); 127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type); 128 *dr7 |= encode_dr7(i, info->len, info->type);
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
397 397
398void hw_breakpoint_restore(void) 398void hw_breakpoint_restore(void)
399{ 399{
400 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); 400 set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
401 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); 401 set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
402 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); 402 set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
403 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); 403 set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
404 set_debugreg(current->thread.debugreg6, 6); 404 set_debugreg(current->thread.debugreg6, 6);
405 set_debugreg(__get_cpu_var(cpu_dr7), 7); 405 set_debugreg(__this_cpu_read(cpu_dr7), 7);
406} 406}
407EXPORT_SYMBOL_GPL(hw_breakpoint_restore); 407EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
408 408
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 83ec0175f986..52945da52a94 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/of.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/ftrace.h> 10#include <linux/ftrace.h>
@@ -234,7 +235,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
234 exit_idle(); 235 exit_idle();
235 irq_enter(); 236 irq_enter();
236 237
237 irq = __get_cpu_var(vector_irq)[vector]; 238 irq = __this_cpu_read(vector_irq[vector]);
238 239
239 if (!handle_irq(irq, regs)) { 240 if (!handle_irq(irq, regs)) {
240 ack_APIC_irq(); 241 ack_APIC_irq();
@@ -275,6 +276,15 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
275 276
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277 278
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
278#ifdef CONFIG_HOTPLUG_CPU 288#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void) 290void fixup_irqs(void)
@@ -350,12 +360,12 @@ void fixup_irqs(void)
350 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 360 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
351 unsigned int irr; 361 unsigned int irr;
352 362
353 if (__get_cpu_var(vector_irq)[vector] < 0) 363 if (__this_cpu_read(vector_irq[vector]) < 0)
354 continue; 364 continue;
355 365
356 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 366 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
357 if (irr & (1 << (vector % 32))) { 367 if (irr & (1 << (vector % 32))) {
358 irq = __get_cpu_var(vector_irq)[vector]; 368 irq = __this_cpu_read(vector_irq[vector]);
359 369
360 data = irq_get_irq_data(irq); 370 data = irq_get_irq_data(irq);
361 raw_spin_lock(&desc->lock); 371 raw_spin_lock(&desc->lock);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 96656f207751..48ff6dcffa02 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -79,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
79 u32 *isp, arg1, arg2; 79 u32 *isp, arg1, arg2;
80 80
81 curctx = (union irq_ctx *) current_thread_info(); 81 curctx = (union irq_ctx *) current_thread_info();
82 irqctx = __get_cpu_var(hardirq_ctx); 82 irqctx = __this_cpu_read(hardirq_ctx);
83 83
84 /* 84 /*
85 * this is where we switch to the IRQ stack. However, if we are 85 * this is where we switch to the IRQ stack. However, if we are
@@ -166,7 +166,7 @@ asmlinkage void do_softirq(void)
166 166
167 if (local_softirq_pending()) { 167 if (local_softirq_pending()) {
168 curctx = current_thread_info(); 168 curctx = current_thread_info();
169 irqctx = __get_cpu_var(softirq_ctx); 169 irqctx = __this_cpu_read(softirq_ctx);
170 irqctx->tinfo.task = curctx->task; 170 irqctx->tinfo.task = curctx->task;
171 irqctx->tinfo.previous_esp = current_stack_pointer; 171 irqctx->tinfo.previous_esp = current_stack_pointer;
172 172
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index cd21b654dec6..a4130005028a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/nmi.h>
51 52
52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = 53struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53{ 54{
@@ -525,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
525 } 526 }
526 return NOTIFY_DONE; 527 return NOTIFY_DONE;
527 528
528 case DIE_NMI_IPI:
529 /* Just ignore, we will handle the roundup on DIE_NMI. */
530 return NOTIFY_DONE;
531
532 case DIE_NMIUNKNOWN: 529 case DIE_NMIUNKNOWN:
533 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
534 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -606,7 +603,7 @@ static struct notifier_block kgdb_notifier = {
606 /* 603 /*
607 * Lowest-prio notifier priority, we want to be notified last: 604 * Lowest-prio notifier priority, we want to be notified last:
608 */ 605 */
609 .priority = -INT_MAX, 606 .priority = NMI_LOCAL_LOW_PRIOR,
610}; 607};
611 608
612/** 609/**
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5940282bd2f9..d91c477b3f62 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -403,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
403 403
404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
405{ 405{
406 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 406 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
407 kcb->kprobe_status = kcb->prev_kprobe.status; 407 kcb->kprobe_status = kcb->prev_kprobe.status;
408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; 408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; 409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -412,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
413 struct kprobe_ctlblk *kcb) 413 struct kprobe_ctlblk *kcb)
414{ 414{
415 __get_cpu_var(current_kprobe) = p; 415 __this_cpu_write(current_kprobe, p);
416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags 416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); 417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
418 if (is_IF_modifier(p->ainsn.insn)) 418 if (is_IF_modifier(p->ainsn.insn))
@@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
586 preempt_enable_no_resched(); 586 preempt_enable_no_resched();
587 return 1; 587 return 1;
588 } else if (kprobe_running()) { 588 } else if (kprobe_running()) {
589 p = __get_cpu_var(current_kprobe); 589 p = __this_cpu_read(current_kprobe);
590 if (p->break_handler && p->break_handler(p, regs)) { 590 if (p->break_handler && p->break_handler(p, regs)) {
591 setup_singlestep(p, regs, kcb, 0); 591 setup_singlestep(p, regs, kcb, 0);
592 return 1; 592 return 1;
@@ -759,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
759 759
760 orig_ret_address = (unsigned long)ri->ret_addr; 760 orig_ret_address = (unsigned long)ri->ret_addr;
761 if (ri->rp && ri->rp->handler) { 761 if (ri->rp && ri->rp->handler) {
762 __get_cpu_var(current_kprobe) = &ri->rp->kp; 762 __this_cpu_write(current_kprobe, &ri->rp->kp);
763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
764 ri->ret_addr = correct_ret_addr; 764 ri->ret_addr = correct_ret_addr;
765 ri->rp->handler(ri, regs); 765 ri->rp->handler(ri, regs);
766 __get_cpu_var(current_kprobe) = NULL; 766 __this_cpu_write(current_kprobe, NULL);
767 } 767 }
768 768
769 recycle_rp_inst(ri, &empty_rp); 769 recycle_rp_inst(ri, &empty_rp);
@@ -1202,10 +1202,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1202 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; 1202 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1203 regs->orig_ax = ~0UL; 1203 regs->orig_ax = ~0UL;
1204 1204
1205 __get_cpu_var(current_kprobe) = &op->kp; 1205 __this_cpu_write(current_kprobe, &op->kp);
1206 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 1206 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1207 opt_pre_handler(&op->kp, regs); 1207 opt_pre_handler(&op->kp, regs);
1208 __get_cpu_var(current_kprobe) = NULL; 1208 __this_cpu_write(current_kprobe, NULL);
1209 } 1209 }
1210 preempt_enable_no_resched(); 1210 preempt_enable_no_resched();
1211} 1211}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
121 /* dummy entry exist -> wake up was delivered ahead of PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * async PF was not yet handled.
209 * Add dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy wait while other cpu
215 * handles async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
 457		printk(KERN_INFO"KVM set up async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
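
/*
 * [Editor's sketch, not part of the patch] How the value written to
 * MSR_KVM_ASYNC_PF_EN in kvm_guest_cpu_init() above is put together: the
 * physical address of this CPU's apf_reason slot, OR'ed with an enable bit
 * and, on preemptible kernels, with KVM_ASYNC_PF_SEND_ALWAYS.  The flags
 * can share the register with the address only because the slot is
 * suitably aligned; the bit positions below are assumptions, the real
 * definitions are in <asm/kvm_para.h>, not shown here.
 */
#include <stdint.h>
#include <stdio.h>

#define ASYNC_PF_ENABLED	(1ULL << 0)	/* assumed bit position */
#define ASYNC_PF_SEND_ALWAYS	(1ULL << 1)	/* assumed bit position */

static uint64_t apf_msr_value(uint64_t apf_reason_pa, int preemptible)
{
	uint64_t val = apf_reason_pa;		/* __pa() of the per-CPU slot */

	if (preemptible)			/* CONFIG_PREEMPT in the hunk above */
		val |= ASYNC_PF_SEND_ALWAYS;
	return val | ASYNC_PF_ENABLED;
}

int main(void)
{
	printf("MSR_KVM_ASYNC_PF_EN <- %#llx\n",
	       (unsigned long long)apf_msr_value(0x1234000ULL, 1));
	return 0;
}
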
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 125 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 126};
127 127
128static int kvm_register_clock(char *txt) 128int kvm_register_clock(char *txt)
129{ 129{
130 int cpu = smp_processor_id(); 130 int cpu = smp_processor_id();
131 int low, high, ret; 131 int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
152} 152}
153#endif 153#endif
154 154
155#ifdef CONFIG_SMP
156static void __init kvm_smp_prepare_boot_cpu(void)
157{
158 WARN_ON(kvm_register_clock("primary cpu clock"));
159 native_smp_prepare_boot_cpu();
160}
161#endif
162
163/* 155/*
164 * After the clock is registered, the host will keep writing to the 156 * After the clock is registered, the host will keep writing to the
165 * registered memory location. If the guest happens to shutdown, this memory 157 * registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
206 x86_cpuinit.setup_percpu_clockev = 198 x86_cpuinit.setup_percpu_clockev =
207 kvm_setup_secondary_clock; 199 kvm_setup_secondary_clock;
208#endif 200#endif
209#ifdef CONFIG_SMP
210 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
211#endif
212 machine_ops.shutdown = kvm_shutdown; 201 machine_ops.shutdown = kvm_shutdown;
213#ifdef CONFIG_KEXEC 202#ifdef CONFIG_KEXEC
214 machine_ops.crash_shutdown = kvm_crash_shutdown; 203 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
37 37
38void *module_alloc(unsigned long size) 38void *module_alloc(unsigned long size)
39{ 39{
40 struct vm_struct *area; 40 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL; 41 return NULL;
47 42 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); 43 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
49 if (!area) 44 -1, __builtin_return_address(0));
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
53 PAGE_KERNEL_EXEC);
54} 45}
55 46
56/* Free memory returned from module_alloc */ 47/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
421 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
422 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
423 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
424 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
425 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
426 429
427 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
428 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c852041bfc3d..d8286ed54ffa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -22,11 +22,6 @@
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/debugreg.h> 23#include <asm/debugreg.h>
24 24
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 25struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 26EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 27
@@ -327,7 +322,7 @@ long sys_execve(const char __user *name,
327/* 322/*
328 * Idle related variables and functions 323 * Idle related variables and functions
329 */ 324 */
330unsigned long boot_option_idle_override = 0; 325unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
331EXPORT_SYMBOL(boot_option_idle_override); 326EXPORT_SYMBOL(boot_option_idle_override);
332 327
333/* 328/*
@@ -386,6 +381,8 @@ void default_idle(void)
386 else 381 else
387 local_irq_enable(); 382 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 383 current_thread_info()->status |= TS_POLLING;
384 trace_power_end(smp_processor_id());
385 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 386 } else {
390 local_irq_enable(); 387 local_irq_enable();
391 /* loop is done by the caller */ 388 /* loop is done by the caller */
@@ -443,10 +440,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 440 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 441void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 442{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 trace_cpu_idle((ax>>4)+1, smp_processor_id());
448 if (!need_resched()) { 443 if (!need_resched()) {
449 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 444 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
450 clflush((void *)&current_thread_info()->flags); 445 clflush((void *)&current_thread_info()->flags);
451 446
452 __monitor((void *)&current_thread_info()->flags, 0, 0); 447 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -462,7 +457,7 @@ static void mwait_idle(void)
462 if (!need_resched()) { 457 if (!need_resched()) {
463 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 458 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
464 trace_cpu_idle(1, smp_processor_id()); 459 trace_cpu_idle(1, smp_processor_id());
465 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 460 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
466 clflush((void *)&current_thread_info()->flags); 461 clflush((void *)&current_thread_info()->flags);
467 462
468 __monitor((void *)&current_thread_info()->flags, 0, 0); 463 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -471,6 +466,8 @@ static void mwait_idle(void)
471 __sti_mwait(0, 0); 466 __sti_mwait(0, 0);
472 else 467 else
473 local_irq_enable(); 468 local_irq_enable();
469 trace_power_end(smp_processor_id());
470 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
474 } else 471 } else
475 local_irq_enable(); 472 local_irq_enable();
476} 473}
@@ -503,7 +500,6 @@ static void poll_idle(void)
503 * 500 *
504 * idle=mwait overrides this decision and forces the usage of mwait. 501 * idle=mwait overrides this decision and forces the usage of mwait.
505 */ 502 */
506static int __cpuinitdata force_mwait;
507 503
508#define MWAIT_INFO 0x05 504#define MWAIT_INFO 0x05
509#define MWAIT_ECX_EXTENDED_INFO 0x01 505#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -513,7 +509,7 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
513{ 509{
514 u32 eax, ebx, ecx, edx; 510 u32 eax, ebx, ecx, edx;
515 511
516 if (force_mwait) 512 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
517 return 1; 513 return 1;
518 514
519 if (c->cpuid_level < MWAIT_INFO) 515 if (c->cpuid_level < MWAIT_INFO)
@@ -633,9 +629,10 @@ static int __init idle_setup(char *str)
633 if (!strcmp(str, "poll")) { 629 if (!strcmp(str, "poll")) {
634 printk("using polling idle threads.\n"); 630 printk("using polling idle threads.\n");
635 pm_idle = poll_idle; 631 pm_idle = poll_idle;
636 } else if (!strcmp(str, "mwait")) 632 boot_option_idle_override = IDLE_POLL;
637 force_mwait = 1; 633 } else if (!strcmp(str, "mwait")) {
638 else if (!strcmp(str, "halt")) { 634 boot_option_idle_override = IDLE_FORCE_MWAIT;
635 } else if (!strcmp(str, "halt")) {
639 /* 636 /*
640 * When the boot option of idle=halt is added, halt is 637 * When the boot option of idle=halt is added, halt is
641 * forced to be used for CPU idle. In such case CPU C2/C3 638 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -644,8 +641,7 @@ static int __init idle_setup(char *str)
644 * the boot_option_idle_override. 641 * the boot_option_idle_override.
645 */ 642 */
646 pm_idle = default_idle; 643 pm_idle = default_idle;
647 idle_halt = 1; 644 boot_option_idle_override = IDLE_HALT;
648 return 0;
649 } else if (!strcmp(str, "nomwait")) { 645 } else if (!strcmp(str, "nomwait")) {
650 /* 646 /*
651 * If the boot option of "idle=nomwait" is added, 647 * If the boot option of "idle=nomwait" is added,
@@ -653,12 +649,10 @@ static int __init idle_setup(char *str)
653 * states. In such case it won't touch the variable 649 * states. In such case it won't touch the variable
654 * of boot_option_idle_override. 650 * of boot_option_idle_override.
655 */ 651 */
656 idle_nomwait = 1; 652 boot_option_idle_override = IDLE_NOMWAIT;
657 return 0;
658 } else 653 } else
659 return -1; 654 return -1;
660 655
661 boot_option_idle_override = 1;
662 return 0; 656 return 0;
663} 657}
664early_param("idle", idle_setup); 658early_param("idle", idle_setup);
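
/*
 * [Editor's sketch, not part of the patch] The process.c hunks above fold
 * the separate idle_halt/idle_nomwait/force_mwait flags into a single
 * boot_option_idle_override value.  The enum below only mirrors the names
 * that appear in the hunks (the real definition is in an asm header, not
 * shown here), and parse_idle_option() is a rough user-space equivalent
 * of the string matching in idle_setup().
 */
#include <stdio.h>
#include <string.h>

enum idle_boot_override {
	IDLE_NO_OVERRIDE = 0,
	IDLE_HALT,
	IDLE_NOMWAIT,
	IDLE_POLL,
	IDLE_FORCE_MWAIT,
};

static enum idle_boot_override parse_idle_option(const char *str)
{
	if (!strcmp(str, "poll"))
		return IDLE_POLL;
	if (!strcmp(str, "mwait"))
		return IDLE_FORCE_MWAIT;
	if (!strcmp(str, "halt"))
		return IDLE_HALT;
	if (!strcmp(str, "nomwait"))
		return IDLE_NOMWAIT;
	return IDLE_NO_OVERRIDE;
}

int main(void)
{
	printf("idle=nomwait -> %d\n", parse_idle_option("nomwait"));
	return 0;
}
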
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4b9befa0e347..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116 trace_power_end(smp_processor_id());
117 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4c818a738396..bd387e8f73b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,10 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145 trace_cpu_idle(PWR_EVENT_EXIT,
146 smp_processor_id());
147
148 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
149 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
150 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8d4815..fc7aae1e2bc7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
18#include <asm/pci_x86.h> 18#include <asm/pci_x86.h>
19#include <asm/virtext.h> 19#include <asm/virtext.h>
20#include <asm/cpu.h> 20#include <asm/cpu.h>
21#include <asm/nmi.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23# include <linux/ctype.h> 24# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
747{ 748{
748 int cpu; 749 int cpu;
749 750
750 if (val != DIE_NMI_IPI) 751 if (val != DIE_NMI)
751 return NOTIFY_OK; 752 return NOTIFY_OK;
752 753
753 cpu = raw_smp_processor_id(); 754 cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
778 779
779static struct notifier_block crash_nmi_nb = { 780static struct notifier_block crash_nmi_nb = {
780 .notifier_call = crash_nmi_callback, 781 .notifier_call = crash_nmi_callback,
782 /* we want to be the first one called */
783 .priority = NMI_LOCAL_HIGH_PRIOR+1,
781}; 784};
782 785
783/* Halt all other CPUs, calling the specified function on each of them 786/* Halt all other CPUs, calling the specified function on each of them
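
/*
 * [Editor's sketch, not part of the patch] Why the crash NMI notifier
 * above sets .priority = NMI_LOCAL_HIGH_PRIOR + 1: notifier chains invoke
 * callbacks in descending priority order, so a higher value runs earlier.
 * A toy model with made-up priority numbers:
 */
#include <stdio.h>

struct notifier { int priority; const char *name; struct notifier *next; };

static struct notifier *chain;

static void register_notifier(struct notifier *nb)
{
	struct notifier **p = &chain;

	while (*p && (*p)->priority > nb->priority)	/* keep highest priority first */
		p = &(*p)->next;
	nb->next = *p;
	*p = nb;
}

static void call_chain(void)
{
	struct notifier *n;

	for (n = chain; n; n = n->next)
		printf("calling %s (prio %d)\n", n->name, n->priority);
}

int main(void)
{
	struct notifier apic_nmi = { .priority = 10, .name = "local APIC NMI handler" };
	struct notifier crash    = { .priority = 11, .name = "crash_nmi_callback" };

	register_notifier(&apic_nmi);
	register_notifier(&crash);
	call_chain();			/* crash_nmi_callback is reported first */
	return 0;
}
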
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..6f39cab052d5 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -76,7 +76,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
76 CMOS_WRITE(real_seconds, RTC_SECONDS); 76 CMOS_WRITE(real_seconds, RTC_SECONDS);
77 CMOS_WRITE(real_minutes, RTC_MINUTES); 77 CMOS_WRITE(real_minutes, RTC_MINUTES);
78 } else { 78 } else {
79 printk(KERN_WARNING 79 printk_once(KERN_NOTICE
80 "set_rtc_mmss: can't update from %d to %d\n", 80 "set_rtc_mmss: can't update from %d to %d\n",
81 cmos_minutes, real_minutes); 81 cmos_minutes, real_minutes);
82 retval = -1; 82 retval = -1;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5fdc0950da1d..763df77343dd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -427,7 +427,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
427 427
428 cpumask_set_cpu(cpu, c->llc_shared_map); 428 cpumask_set_cpu(cpu, c->llc_shared_map);
429 429
430 if (current_cpu_data.x86_max_cores == 1) { 430 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
431 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 431 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
432 c->booted_cores = 1; 432 c->booted_cores = 1;
433 return; 433 return;
@@ -1089,7 +1089,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1089 1089
1090 preempt_disable(); 1090 preempt_disable();
1091 smp_cpu_index_default(); 1091 smp_cpu_index_default();
1092 current_cpu_data = boot_cpu_data; 1092 memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info));
1093 cpumask_copy(cpu_callin_mask, cpumask_of(0)); 1093 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1094 mb(); 1094 mb();
1095 /* 1095 /*
@@ -1383,7 +1383,7 @@ void play_dead_common(void)
1383 1383
1384 mb(); 1384 mb();
1385 /* Ack it */ 1385 /* Ack it */
1386 __get_cpu_var(cpu_state) = CPU_DEAD; 1386 __this_cpu_write(cpu_state, CPU_DEAD);
1387 1387
1388 /* 1388 /*
1389 * With physical CPU hotplug, we should halt the cpu 1389 * With physical CPU hotplug, we should halt the cpu
@@ -1403,11 +1403,11 @@ static inline void mwait_play_dead(void)
1403 int i; 1403 int i;
1404 void *mwait_ptr; 1404 void *mwait_ptr;
1405 1405
1406 if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT)) 1406 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT))
1407 return; 1407 return;
1408 if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH)) 1408 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
1409 return; 1409 return;
1410 if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) 1410 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1411 return; 1411 return;
1412 1412
1413 eax = CPUID_MWAIT_LEAF; 1413 eax = CPUID_MWAIT_LEAF;
@@ -1458,7 +1458,7 @@ static inline void mwait_play_dead(void)
1458 1458
1459static inline void hlt_play_dead(void) 1459static inline void hlt_play_dead(void)
1460{ 1460{
1461 if (current_cpu_data.x86 >= 4) 1461 if (__this_cpu_read(cpu_info.x86) >= 4)
1462 wbinvd(); 1462 wbinvd();
1463 1463
1464 while (1) { 1464 while (1) {
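
/*
 * [Editor's sketch, not part of the patch] The smpboot.c hunks above
 * replace uses of current_cpu_data (a full lookup of this CPU's cpu_info)
 * with __this_cpu_read()/__this_cpu_ptr(), which read a single field or
 * take a pointer to the current CPU's copy directly.  A rough user-space
 * analogy using GCC/Clang thread-local storage in place of per-CPU data:
 */
#include <stdio.h>

struct cpuinfo { int x86; int x86_max_cores; int cpuid_level; };

static __thread struct cpuinfo cpu_info;	/* stands in for the per-CPU variable */

#define this_cpu_read(field)	(cpu_info.field)	/* ~ __this_cpu_read(cpu_info.field) */
#define this_cpu_ptr()		(&cpu_info)		/* ~ __this_cpu_ptr(&cpu_info)       */

int main(void)
{
	struct cpuinfo *c = this_cpu_ptr();

	c->x86 = 6;
	c->x86_max_cores = 4;
	printf("family %d, max cores %d\n",
	       this_cpu_read(x86), this_cpu_read(x86_max_cores));
	return 0;
}
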
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 133 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 134 if (!pmd)
135 return -1; 135 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 136 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 137 if (!pte)
138 return -1; 138 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c76aaca5694d..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -84,6 +84,11 @@ EXPORT_SYMBOL_GPL(used_vectors);
84static int ignore_nmis; 84static int ignore_nmis;
85 85
86int unknown_nmi_panic; 86int unknown_nmi_panic;
87/*
 88 * Prevent the NMI reason port (0x61) from being accessed concurrently;
 89 * this lock can only be taken from within the NMI handler.
90 */
91static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
87 92
88static inline void conditional_sti(struct pt_regs *regs) 93static inline void conditional_sti(struct pt_regs *regs)
89{ 94{
@@ -310,15 +315,15 @@ static int __init setup_unknown_nmi_panic(char *str)
310__setup("unknown_nmi_panic", setup_unknown_nmi_panic); 315__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
311 316
312static notrace __kprobes void 317static notrace __kprobes void
313mem_parity_error(unsigned char reason, struct pt_regs *regs) 318pci_serr_error(unsigned char reason, struct pt_regs *regs)
314{ 319{
315 printk(KERN_EMERG 320 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
316 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 321 reason, smp_processor_id());
317 reason, smp_processor_id());
318
319 printk(KERN_EMERG
320 "You have some hardware problem, likely on the PCI bus.\n");
321 322
323 /*
324 * On some machines, PCI SERR line is used to report memory
325 * errors. EDAC makes use of it.
326 */
322#if defined(CONFIG_EDAC) 327#if defined(CONFIG_EDAC)
323 if (edac_handler_set()) { 328 if (edac_handler_set()) {
324 edac_atomic_assert_error(); 329 edac_atomic_assert_error();
@@ -329,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
329 if (panic_on_unrecovered_nmi) 334 if (panic_on_unrecovered_nmi)
330 panic("NMI: Not continuing"); 335 panic("NMI: Not continuing");
331 336
332 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 337 pr_emerg("Dazed and confused, but trying to continue\n");
333 338
334 /* Clear and disable the memory parity error line. */ 339 /* Clear and disable the PCI SERR error line. */
335 reason = (reason & 0xf) | 4; 340 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
336 outb(reason, 0x61); 341 outb(reason, NMI_REASON_PORT);
337} 342}
338 343
339static notrace __kprobes void 344static notrace __kprobes void
@@ -341,15 +346,17 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
341{ 346{
342 unsigned long i; 347 unsigned long i;
343 348
344 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 349 pr_emerg(
350 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
351 reason, smp_processor_id());
345 show_registers(regs); 352 show_registers(regs);
346 353
347 if (panic_on_io_nmi) 354 if (panic_on_io_nmi)
348 panic("NMI IOCK error: Not continuing"); 355 panic("NMI IOCK error: Not continuing");
349 356
350 /* Re-enable the IOCK line, wait for a few seconds */ 357 /* Re-enable the IOCK line, wait for a few seconds */
351 reason = (reason & 0xf) | 8; 358 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
352 outb(reason, 0x61); 359 outb(reason, NMI_REASON_PORT);
353 360
354 i = 20000; 361 i = 20000;
355 while (--i) { 362 while (--i) {
@@ -357,8 +364,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
357 udelay(100); 364 udelay(100);
358 } 365 }
359 366
360 reason &= ~8; 367 reason &= ~NMI_REASON_CLEAR_IOCHK;
361 outb(reason, 0x61); 368 outb(reason, NMI_REASON_PORT);
362} 369}
363 370
364static notrace __kprobes void 371static notrace __kprobes void
@@ -377,57 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
377 return; 384 return;
378 } 385 }
379#endif 386#endif
380 printk(KERN_EMERG 387 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
381 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 388 reason, smp_processor_id());
382 reason, smp_processor_id());
383 389
384 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); 390 pr_emerg("Do you have a strange power saving mode enabled?\n");
385 if (unknown_nmi_panic || panic_on_unrecovered_nmi) 391 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
386 panic("NMI: Not continuing"); 392 panic("NMI: Not continuing");
387 393
388 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 394 pr_emerg("Dazed and confused, but trying to continue\n");
389} 395}
390 396
391static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 397static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
392{ 398{
393 unsigned char reason = 0; 399 unsigned char reason = 0;
394 int cpu;
395 400
396 cpu = smp_processor_id(); 401 /*
397 402 * CPU-specific NMI must be processed before non-CPU-specific
398 /* Only the BSP gets external NMIs from the system. */ 403 * NMI, otherwise we may lose it, because the CPU-specific
399 if (!cpu) 404 * NMI can not be detected/processed on other CPUs.
400 reason = get_nmi_reason(); 405 */
406 if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
407 return;
401 408
402 if (!(reason & 0xc0)) { 409 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
403 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 410 raw_spin_lock(&nmi_reason_lock);
404 == NOTIFY_STOP) 411 reason = get_nmi_reason();
405 return;
406 412
407#ifdef CONFIG_X86_LOCAL_APIC 413 if (reason & NMI_REASON_MASK) {
408 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 414 if (reason & NMI_REASON_SERR)
409 == NOTIFY_STOP) 415 pci_serr_error(reason, regs);
410 return; 416 else if (reason & NMI_REASON_IOCHK)
417 io_check_error(reason, regs);
418#ifdef CONFIG_X86_32
419 /*
420 * Reassert NMI in case it became active
421 * meanwhile as it's edge-triggered:
422 */
423 reassert_nmi();
411#endif 424#endif
412 unknown_nmi_error(reason, regs); 425 raw_spin_unlock(&nmi_reason_lock);
413
414 return; 426 return;
415 } 427 }
416 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 428 raw_spin_unlock(&nmi_reason_lock);
417 return;
418 429
419 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 430 unknown_nmi_error(reason, regs);
420 if (reason & 0x80)
421 mem_parity_error(reason, regs);
422 if (reason & 0x40)
423 io_check_error(reason, regs);
424#ifdef CONFIG_X86_32
425 /*
426 * Reassert NMI in case it became active meanwhile
427 * as it's edge-triggered:
428 */
429 reassert_nmi();
430#endif
431} 431}
432 432
433dotraplinkage notrace __kprobes void 433dotraplinkage notrace __kprobes void
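
/*
 * [Editor's sketch, not part of the patch] The traps.c hunks above replace
 * the old magic numbers for the NMI status/control port with named
 * constants.  The values below are inferred from the removed lines
 * ((reason & 0xf) | 4, | 8, reason & 0x80 / 0x40, port 0x61); the real
 * definitions live in an asm header, not shown here.
 */
#include <stdio.h>

#define NMI_REASON_PORT		0x61

#define NMI_REASON_SERR		0x80	/* PCI SERR# reported (was mem_parity_error) */
#define NMI_REASON_IOCHK	0x40	/* IOCHK# reported                           */
#define NMI_REASON_MASK		(NMI_REASON_SERR | NMI_REASON_IOCHK)

#define NMI_REASON_CLEAR_MASK	0x0f	/* preserve the low control bits             */
#define NMI_REASON_CLEAR_SERR	0x04	/* set to clear and disable SERR             */
#define NMI_REASON_CLEAR_IOCHK	0x08	/* set to clear and disable IOCHK            */

/* Stub for the real port write, just to make the sequence visible. */
static void outb(unsigned char val, unsigned short port)
{
	printf("outb(0x%02x, 0x%02x)\n", val, port);
}

/* The clear-and-disable dance from pci_serr_error(), with named bits. */
static void clear_serr(unsigned char reason)
{
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
	outb(reason, NMI_REASON_PORT);
}

int main(void)
{
	clear_serr(NMI_REASON_SERR);
	return 0;
}
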
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ae09f970e626..ffe5755caa8b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -659,7 +659,7 @@ void restore_sched_clock_state(void)
659 659
660 local_irq_save(flags); 660 local_irq_save(flags);
661 661
662 __get_cpu_var(cyc2ns_offset) = 0; 662 __this_cpu_write(cyc2ns_offset, 0);
663 offset = cyc2ns_suspend - sched_clock(); 663 offset = cyc2ns_suspend - sched_clock();
664 664
665 for_each_possible_cpu(cpu) 665 for_each_possible_cpu(cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select KVM_ASYNC_PF
31 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
32 select KVM_MMIO 33 select KVM_MMIO
33 ---help--- 34 ---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 assigned-dev.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
12 13
13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
14 i8254.o timer.o 15 i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
21 */ 21 */
22 22
23#ifndef __KERNEL__
24#include <stdio.h>
25#include <stdint.h>
26#include <public/xen.h>
27#define DPRINTF(_f, _a ...) printf(_f , ## _a)
28#else
29#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
30#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
31#define DPRINTF(x...) do {} while (0)
32#endif
33#include <linux/module.h> 25#include <linux/module.h>
34#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
35 27
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
418} 410}
419 411
420static inline unsigned long 412static inline unsigned long
421register_address(struct decode_cache *c, unsigned long base, unsigned long reg) 413register_address(struct decode_cache *c, unsigned long reg)
422{ 414{
423 return base + address_mask(c, reg); 415 return address_mask(c, reg);
424} 416}
425 417
426static inline void 418static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
452 return ops->get_cached_segment_base(seg, ctxt->vcpu); 444 return ops->get_cached_segment_base(seg, ctxt->vcpu);
453} 445}
454 446
455static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 447static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
456 struct x86_emulate_ops *ops, 448 struct x86_emulate_ops *ops,
457 struct decode_cache *c) 449 struct decode_cache *c)
458{ 450{
459 if (!c->has_seg_override) 451 if (!c->has_seg_override)
460 return 0; 452 return 0;
461 453
462 return seg_base(ctxt, ops, c->seg_override); 454 return c->seg_override;
463} 455}
464 456
465static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 457static ulong linear(struct x86_emulate_ctxt *ctxt,
466 struct x86_emulate_ops *ops) 458 struct segmented_address addr)
467{ 459{
468 return seg_base(ctxt, ops, VCPU_SREG_ES); 460 struct decode_cache *c = &ctxt->decode;
469} 461 ulong la;
470
471static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
472 struct x86_emulate_ops *ops)
473{
474 return seg_base(ctxt, ops, VCPU_SREG_SS);
475}
476 462
477static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 463 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
478 u32 error, bool valid) 464 if (c->ad_bytes != 8)
479{ 465 la &= (u32)-1;
480 ctxt->exception = vec; 466 return la;
481 ctxt->error_code = error;
482 ctxt->error_code_valid = valid;
483} 467}
484 468
485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 469static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
470 u32 error, bool valid)
486{ 471{
487 emulate_exception(ctxt, GP_VECTOR, err, true); 472 ctxt->exception.vector = vec;
473 ctxt->exception.error_code = error;
474 ctxt->exception.error_code_valid = valid;
475 return X86EMUL_PROPAGATE_FAULT;
488} 476}
489 477
490static void emulate_pf(struct x86_emulate_ctxt *ctxt) 478static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
491{ 479{
492 emulate_exception(ctxt, PF_VECTOR, 0, true); 480 return emulate_exception(ctxt, GP_VECTOR, err, true);
493} 481}
494 482
495static void emulate_ud(struct x86_emulate_ctxt *ctxt) 483static int emulate_ud(struct x86_emulate_ctxt *ctxt)
496{ 484{
497 emulate_exception(ctxt, UD_VECTOR, 0, false); 485 return emulate_exception(ctxt, UD_VECTOR, 0, false);
498} 486}
499 487
500static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 488static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
501{ 489{
502 emulate_exception(ctxt, TS_VECTOR, err, true); 490 return emulate_exception(ctxt, TS_VECTOR, err, true);
503} 491}
504 492
505static int emulate_de(struct x86_emulate_ctxt *ctxt) 493static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{ 494{
507 emulate_exception(ctxt, DE_VECTOR, 0, false); 495 return emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509} 496}
510 497
511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 498static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
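
/*
 * [Editor's sketch, not part of the patch] The emulate.c hunks above make
 * memory operands carry a (segment, effective address) pair and compute
 * the linear address in one place, linear(), instead of adding assorted
 * *_base() helpers at each call site; faults are now reported by returning
 * X86EMUL_PROPAGATE_FAULT with the details recorded in ctxt->exception.
 * Below is a user-space model of the address part; the segment bases are
 * made up here, in the emulator they come from
 * ops->get_cached_segment_base().
 */
#include <stdio.h>

enum { SREG_ES, SREG_CS, SREG_SS, SREG_DS, NR_SREGS };

struct segmented_address {
	unsigned long ea;	/* effective address within the segment */
	unsigned seg;		/* segment register index               */
};

static unsigned long seg_base[NR_SREGS] = { 0x10000, 0x0, 0x20000, 0x30000 };

static unsigned long linear(struct segmented_address addr, int ad_bytes)
{
	unsigned long la = seg_base[addr.seg] + addr.ea;

	if (ad_bytes != 8)	/* 16/32-bit address size truncates to 32 bits */
		la &= 0xffffffffUL;
	return la;
}

int main(void)
{
	struct segmented_address a = { .ea = 0x1234, .seg = SREG_DS };

	printf("linear = %#lx\n", linear(a, 4));
	return 0;
}
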
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
520 cur_size = fc->end - fc->start; 507 cur_size = fc->end - fc->start;
521 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 508 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
522 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 509 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
523 size, ctxt->vcpu, NULL); 510 size, ctxt->vcpu, &ctxt->exception);
524 if (rc != X86EMUL_CONTINUE) 511 if (rc != X86EMUL_CONTINUE)
525 return rc; 512 return rc;
526 fc->end += size; 513 fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
564 551
565static int read_descriptor(struct x86_emulate_ctxt *ctxt, 552static int read_descriptor(struct x86_emulate_ctxt *ctxt,
566 struct x86_emulate_ops *ops, 553 struct x86_emulate_ops *ops,
567 ulong addr, 554 struct segmented_address addr,
568 u16 *size, unsigned long *address, int op_bytes) 555 u16 *size, unsigned long *address, int op_bytes)
569{ 556{
570 int rc; 557 int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
572 if (op_bytes == 2) 559 if (op_bytes == 2)
573 op_bytes = 3; 560 op_bytes = 3;
574 *address = 0; 561 *address = 0;
575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); 562 rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
563 ctxt->vcpu, &ctxt->exception);
576 if (rc != X86EMUL_CONTINUE) 564 if (rc != X86EMUL_CONTINUE)
577 return rc; 565 return rc;
578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); 566 addr.ea += 2;
567 rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
568 ctxt->vcpu, &ctxt->exception);
579 return rc; 569 return rc;
580} 570}
581 571
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
768 break; 758 break;
769 } 759 }
770 } 760 }
771 op->addr.mem = modrm_ea; 761 op->addr.mem.ea = modrm_ea;
772done: 762done:
773 return rc; 763 return rc;
774} 764}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
783 op->type = OP_MEM; 773 op->type = OP_MEM;
784 switch (c->ad_bytes) { 774 switch (c->ad_bytes) {
785 case 2: 775 case 2:
786 op->addr.mem = insn_fetch(u16, 2, c->eip); 776 op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
787 break; 777 break;
788 case 4: 778 case 4:
789 op->addr.mem = insn_fetch(u32, 4, c->eip); 779 op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
790 break; 780 break;
791 case 8: 781 case 8:
792 op->addr.mem = insn_fetch(u64, 8, c->eip); 782 op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
793 break; 783 break;
794 } 784 }
795done: 785done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
808 else if (c->src.bytes == 4) 798 else if (c->src.bytes == 4)
809 sv = (s32)c->src.val & (s32)mask; 799 sv = (s32)c->src.val & (s32)mask;
810 800
811 c->dst.addr.mem += (sv >> 3); 801 c->dst.addr.mem.ea += (sv >> 3);
812 } 802 }
813 803
814 /* only subword offset */ 804 /* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
821{ 811{
822 int rc; 812 int rc;
823 struct read_cache *mc = &ctxt->decode.mem_read; 813 struct read_cache *mc = &ctxt->decode.mem_read;
824 u32 err;
825 814
826 while (size) { 815 while (size) {
827 int n = min(size, 8u); 816 int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
829 if (mc->pos < mc->end) 818 if (mc->pos < mc->end)
830 goto read_cached; 819 goto read_cached;
831 820
832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 821 rc = ops->read_emulated(addr, mc->data + mc->end, n,
833 ctxt->vcpu); 822 &ctxt->exception, ctxt->vcpu);
834 if (rc == X86EMUL_PROPAGATE_FAULT)
835 emulate_pf(ctxt);
836 if (rc != X86EMUL_CONTINUE) 823 if (rc != X86EMUL_CONTINUE)
837 return rc; 824 return rc;
838 mc->end += n; 825 mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
907 struct desc_ptr dt; 894 struct desc_ptr dt;
908 u16 index = selector >> 3; 895 u16 index = selector >> 3;
909 int ret; 896 int ret;
910 u32 err;
911 ulong addr; 897 ulong addr;
912 898
913 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 899 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
914 900
915 if (dt.size < index * 8 + 7) { 901 if (dt.size < index * 8 + 7)
916 emulate_gp(ctxt, selector & 0xfffc); 902 return emulate_gp(ctxt, selector & 0xfffc);
917 return X86EMUL_PROPAGATE_FAULT;
918 }
919 addr = dt.address + index * 8; 903 addr = dt.address + index * 8;
920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 904 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
921 if (ret == X86EMUL_PROPAGATE_FAULT) 905 &ctxt->exception);
922 emulate_pf(ctxt);
923 906
924 return ret; 907 return ret;
925} 908}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
931{ 914{
932 struct desc_ptr dt; 915 struct desc_ptr dt;
933 u16 index = selector >> 3; 916 u16 index = selector >> 3;
934 u32 err;
935 ulong addr; 917 ulong addr;
936 int ret; 918 int ret;
937 919
938 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 920 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
939 921
940 if (dt.size < index * 8 + 7) { 922 if (dt.size < index * 8 + 7)
941 emulate_gp(ctxt, selector & 0xfffc); 923 return emulate_gp(ctxt, selector & 0xfffc);
942 return X86EMUL_PROPAGATE_FAULT;
943 }
944 924
945 addr = dt.address + index * 8; 925 addr = dt.address + index * 8;
946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 926 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
947 if (ret == X86EMUL_PROPAGATE_FAULT) 927 &ctxt->exception);
948 emulate_pf(ctxt);
949 928
950 return ret; 929 return ret;
951} 930}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1092{ 1071{
1093 int rc; 1072 int rc;
1094 struct decode_cache *c = &ctxt->decode; 1073 struct decode_cache *c = &ctxt->decode;
1095 u32 err;
1096 1074
1097 switch (c->dst.type) { 1075 switch (c->dst.type) {
1098 case OP_REG: 1076 case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1101 case OP_MEM: 1079 case OP_MEM:
1102 if (c->lock_prefix) 1080 if (c->lock_prefix)
1103 rc = ops->cmpxchg_emulated( 1081 rc = ops->cmpxchg_emulated(
1104 c->dst.addr.mem, 1082 linear(ctxt, c->dst.addr.mem),
1105 &c->dst.orig_val, 1083 &c->dst.orig_val,
1106 &c->dst.val, 1084 &c->dst.val,
1107 c->dst.bytes, 1085 c->dst.bytes,
1108 &err, 1086 &ctxt->exception,
1109 ctxt->vcpu); 1087 ctxt->vcpu);
1110 else 1088 else
1111 rc = ops->write_emulated( 1089 rc = ops->write_emulated(
1112 c->dst.addr.mem, 1090 linear(ctxt, c->dst.addr.mem),
1113 &c->dst.val, 1091 &c->dst.val,
1114 c->dst.bytes, 1092 c->dst.bytes,
1115 &err, 1093 &ctxt->exception,
1116 ctxt->vcpu); 1094 ctxt->vcpu);
1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1118 emulate_pf(ctxt);
1119 if (rc != X86EMUL_CONTINUE) 1095 if (rc != X86EMUL_CONTINUE)
1120 return rc; 1096 return rc;
1121 break; 1097 break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1137 c->dst.bytes = c->op_bytes; 1113 c->dst.bytes = c->op_bytes;
1138 c->dst.val = c->src.val; 1114 c->dst.val = c->src.val;
1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), 1116 c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1141 c->regs[VCPU_REGS_RSP]); 1117 c->dst.addr.mem.seg = VCPU_SREG_SS;
1142} 1118}
1143 1119
1144static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1120static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1147{ 1123{
1148 struct decode_cache *c = &ctxt->decode; 1124 struct decode_cache *c = &ctxt->decode;
1149 int rc; 1125 int rc;
1126 struct segmented_address addr;
1150 1127
1151 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1128 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1152 c->regs[VCPU_REGS_RSP]), 1129 addr.seg = VCPU_SREG_SS;
1153 dest, len); 1130 rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
1154 if (rc != X86EMUL_CONTINUE) 1131 if (rc != X86EMUL_CONTINUE)
1155 return rc; 1132 return rc;
1156 1133
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1184 change_mask |= EFLG_IF; 1161 change_mask |= EFLG_IF;
1185 break; 1162 break;
1186 case X86EMUL_MODE_VM86: 1163 case X86EMUL_MODE_VM86:
1187 if (iopl < 3) { 1164 if (iopl < 3)
1188 emulate_gp(ctxt, 0); 1165 return emulate_gp(ctxt, 0);
1189 return X86EMUL_PROPAGATE_FAULT;
1190 }
1191 change_mask |= EFLG_IF; 1166 change_mask |= EFLG_IF;
1192 break; 1167 break;
1193 default: /* real mode */ 1168 default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1198 *(unsigned long *)dest = 1173 *(unsigned long *)dest =
1199 (ctxt->eflags & ~change_mask) | (val & change_mask); 1174 (ctxt->eflags & ~change_mask) | (val & change_mask);
1200 1175
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1204 return rc; 1176 return rc;
1205} 1177}
1206 1178
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1287 gva_t cs_addr; 1259 gva_t cs_addr;
1288 gva_t eip_addr; 1260 gva_t eip_addr;
1289 u16 cs, eip; 1261 u16 cs, eip;
1290 u32 err;
1291 1262
1292 /* TODO: Add limit checks */ 1263 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags; 1264 c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1317 eip_addr = dt.address + (irq << 2); 1288 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2; 1289 cs_addr = dt.address + (irq << 2) + 2;
1319 1290
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); 1291 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
1321 if (rc != X86EMUL_CONTINUE) 1292 if (rc != X86EMUL_CONTINUE)
1322 return rc; 1293 return rc;
1323 1294
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); 1295 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
1325 if (rc != X86EMUL_CONTINUE) 1296 if (rc != X86EMUL_CONTINUE)
1326 return rc; 1297 return rc;
1327 1298
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1370 if (rc != X86EMUL_CONTINUE) 1341 if (rc != X86EMUL_CONTINUE)
1371 return rc; 1342 return rc;
1372 1343
1373 if (temp_eip & ~0xffff) { 1344 if (temp_eip & ~0xffff)
1374 emulate_gp(ctxt, 0); 1345 return emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377 1346
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1347 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379 1348
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1624 1593
1625 /* syscall is not available in real mode */ 1594 /* syscall is not available in real mode */
1626 if (ctxt->mode == X86EMUL_MODE_REAL || 1595 if (ctxt->mode == X86EMUL_MODE_REAL ||
1627 ctxt->mode == X86EMUL_MODE_VM86) { 1596 ctxt->mode == X86EMUL_MODE_VM86)
1628 emulate_ud(ctxt); 1597 return emulate_ud(ctxt);
1629 return X86EMUL_PROPAGATE_FAULT;
1630 }
1631 1598
1632 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1599 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1633 1600
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1678 u16 cs_sel, ss_sel; 1645 u16 cs_sel, ss_sel;
1679 1646
1680 /* inject #GP if in real mode */ 1647 /* inject #GP if in real mode */
1681 if (ctxt->mode == X86EMUL_MODE_REAL) { 1648 if (ctxt->mode == X86EMUL_MODE_REAL)
1682 emulate_gp(ctxt, 0); 1649 return emulate_gp(ctxt, 0);
1683 return X86EMUL_PROPAGATE_FAULT;
1684 }
1685 1650
1686 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1651 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1687 * Therefore, we inject an #UD. 1652 * Therefore, we inject an #UD.
1688 */ 1653 */
1689 if (ctxt->mode == X86EMUL_MODE_PROT64) { 1654 if (ctxt->mode == X86EMUL_MODE_PROT64)
1690 emulate_ud(ctxt); 1655 return emulate_ud(ctxt);
1691 return X86EMUL_PROPAGATE_FAULT;
1692 }
1693 1656
1694 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1657 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1695 1658
1696 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1659 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1697 switch (ctxt->mode) { 1660 switch (ctxt->mode) {
1698 case X86EMUL_MODE_PROT32: 1661 case X86EMUL_MODE_PROT32:
1699 if ((msr_data & 0xfffc) == 0x0) { 1662 if ((msr_data & 0xfffc) == 0x0)
1700 emulate_gp(ctxt, 0); 1663 return emulate_gp(ctxt, 0);
1701 return X86EMUL_PROPAGATE_FAULT;
1702 }
1703 break; 1664 break;
1704 case X86EMUL_MODE_PROT64: 1665 case X86EMUL_MODE_PROT64:
1705 if (msr_data == 0x0) { 1666 if (msr_data == 0x0)
1706 emulate_gp(ctxt, 0); 1667 return emulate_gp(ctxt, 0);
1707 return X86EMUL_PROPAGATE_FAULT;
1708 }
1709 break; 1668 break;
1710 } 1669 }
1711 1670
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1745 1704
1746 /* inject #GP if in real mode or Virtual 8086 mode */ 1705 /* inject #GP if in real mode or Virtual 8086 mode */
1747 if (ctxt->mode == X86EMUL_MODE_REAL || 1706 if (ctxt->mode == X86EMUL_MODE_REAL ||
1748 ctxt->mode == X86EMUL_MODE_VM86) { 1707 ctxt->mode == X86EMUL_MODE_VM86)
1749 emulate_gp(ctxt, 0); 1708 return emulate_gp(ctxt, 0);
1750 return X86EMUL_PROPAGATE_FAULT;
1751 }
1752 1709
1753 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1710 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1754 1711
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1763 switch (usermode) { 1720 switch (usermode) {
1764 case X86EMUL_MODE_PROT32: 1721 case X86EMUL_MODE_PROT32:
1765 cs_sel = (u16)(msr_data + 16); 1722 cs_sel = (u16)(msr_data + 16);
1766 if ((msr_data & 0xfffc) == 0x0) { 1723 if ((msr_data & 0xfffc) == 0x0)
1767 emulate_gp(ctxt, 0); 1724 return emulate_gp(ctxt, 0);
1768 return X86EMUL_PROPAGATE_FAULT;
1769 }
1770 ss_sel = (u16)(msr_data + 24); 1725 ss_sel = (u16)(msr_data + 24);
1771 break; 1726 break;
1772 case X86EMUL_MODE_PROT64: 1727 case X86EMUL_MODE_PROT64:
1773 cs_sel = (u16)(msr_data + 32); 1728 cs_sel = (u16)(msr_data + 32);
1774 if (msr_data == 0x0) { 1729 if (msr_data == 0x0)
1775 emulate_gp(ctxt, 0); 1730 return emulate_gp(ctxt, 0);
1776 return X86EMUL_PROPAGATE_FAULT;
1777 }
1778 ss_sel = cs_sel + 8; 1731 ss_sel = cs_sel + 8;
1779 cs.d = 0; 1732 cs.d = 0;
1780 cs.l = 1; 1733 cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1934{ 1887{
1935 struct tss_segment_16 tss_seg; 1888 struct tss_segment_16 tss_seg;
1936 int ret; 1889 int ret;
1937 u32 err, new_tss_base = get_desc_base(new_desc); 1890 u32 new_tss_base = get_desc_base(new_desc);
1938 1891
1939 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1892 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1940 &err); 1893 &ctxt->exception);
1941 if (ret == X86EMUL_PROPAGATE_FAULT) { 1894 if (ret != X86EMUL_CONTINUE)
1942 /* FIXME: need to provide precise fault address */ 1895 /* FIXME: need to provide precise fault address */
1943 emulate_pf(ctxt);
1944 return ret; 1896 return ret;
1945 }
1946 1897
1947 save_state_to_tss16(ctxt, ops, &tss_seg); 1898 save_state_to_tss16(ctxt, ops, &tss_seg);
1948 1899
1949 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1900 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1950 &err); 1901 &ctxt->exception);
1951 if (ret == X86EMUL_PROPAGATE_FAULT) { 1902 if (ret != X86EMUL_CONTINUE)
1952 /* FIXME: need to provide precise fault address */ 1903 /* FIXME: need to provide precise fault address */
1953 emulate_pf(ctxt);
1954 return ret; 1904 return ret;
1955 }
1956 1905
1957 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1906 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1958 &err); 1907 &ctxt->exception);
1959 if (ret == X86EMUL_PROPAGATE_FAULT) { 1908 if (ret != X86EMUL_CONTINUE)
1960 /* FIXME: need to provide precise fault address */ 1909 /* FIXME: need to provide precise fault address */
1961 emulate_pf(ctxt);
1962 return ret; 1910 return ret;
1963 }
1964 1911
1965 if (old_tss_sel != 0xffff) { 1912 if (old_tss_sel != 0xffff) {
1966 tss_seg.prev_task_link = old_tss_sel; 1913 tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1968 ret = ops->write_std(new_tss_base, 1915 ret = ops->write_std(new_tss_base,
1969 &tss_seg.prev_task_link, 1916 &tss_seg.prev_task_link,
1970 sizeof tss_seg.prev_task_link, 1917 sizeof tss_seg.prev_task_link,
1971 ctxt->vcpu, &err); 1918 ctxt->vcpu, &ctxt->exception);
1972 if (ret == X86EMUL_PROPAGATE_FAULT) { 1919 if (ret != X86EMUL_CONTINUE)
1973 /* FIXME: need to provide precise fault address */ 1920 /* FIXME: need to provide precise fault address */
1974 emulate_pf(ctxt);
1975 return ret; 1921 return ret;
1976 }
1977 } 1922 }
1978 1923
1979 return load_state_from_tss16(ctxt, ops, &tss_seg); 1924 return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2013 struct decode_cache *c = &ctxt->decode; 1958 struct decode_cache *c = &ctxt->decode;
2014 int ret; 1959 int ret;
2015 1960
2016 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 1961 if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
2017 emulate_gp(ctxt, 0); 1962 return emulate_gp(ctxt, 0);
2018 return X86EMUL_PROPAGATE_FAULT;
2019 }
2020 c->eip = tss->eip; 1963 c->eip = tss->eip;
2021 ctxt->eflags = tss->eflags | 2; 1964 ctxt->eflags = tss->eflags | 2;
2022 c->regs[VCPU_REGS_RAX] = tss->eax; 1965 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2076{ 2019{
2077 struct tss_segment_32 tss_seg; 2020 struct tss_segment_32 tss_seg;
2078 int ret; 2021 int ret;
2079 u32 err, new_tss_base = get_desc_base(new_desc); 2022 u32 new_tss_base = get_desc_base(new_desc);
2080 2023
2081 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2024 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2082 &err); 2025 &ctxt->exception);
2083 if (ret == X86EMUL_PROPAGATE_FAULT) { 2026 if (ret != X86EMUL_CONTINUE)
2084 /* FIXME: need to provide precise fault address */ 2027 /* FIXME: need to provide precise fault address */
2085 emulate_pf(ctxt);
2086 return ret; 2028 return ret;
2087 }
2088 2029
2089 save_state_to_tss32(ctxt, ops, &tss_seg); 2030 save_state_to_tss32(ctxt, ops, &tss_seg);
2090 2031
2091 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2032 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2092 &err); 2033 &ctxt->exception);
2093 if (ret == X86EMUL_PROPAGATE_FAULT) { 2034 if (ret != X86EMUL_CONTINUE)
2094 /* FIXME: need to provide precise fault address */ 2035 /* FIXME: need to provide precise fault address */
2095 emulate_pf(ctxt);
2096 return ret; 2036 return ret;
2097 }
2098 2037
2099 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2038 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2100 &err); 2039 &ctxt->exception);
2101 if (ret == X86EMUL_PROPAGATE_FAULT) { 2040 if (ret != X86EMUL_CONTINUE)
2102 /* FIXME: need to provide precise fault address */ 2041 /* FIXME: need to provide precise fault address */
2103 emulate_pf(ctxt);
2104 return ret; 2042 return ret;
2105 }
2106 2043
2107 if (old_tss_sel != 0xffff) { 2044 if (old_tss_sel != 0xffff) {
2108 tss_seg.prev_task_link = old_tss_sel; 2045 tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2110 ret = ops->write_std(new_tss_base, 2047 ret = ops->write_std(new_tss_base,
2111 &tss_seg.prev_task_link, 2048 &tss_seg.prev_task_link,
2112 sizeof tss_seg.prev_task_link, 2049 sizeof tss_seg.prev_task_link,
2113 ctxt->vcpu, &err); 2050 ctxt->vcpu, &ctxt->exception);
2114 if (ret == X86EMUL_PROPAGATE_FAULT) { 2051 if (ret != X86EMUL_CONTINUE)
2115 /* FIXME: need to provide precise fault address */ 2052 /* FIXME: need to provide precise fault address */
2116 emulate_pf(ctxt);
2117 return ret; 2053 return ret;
2118 }
2119 } 2054 }
2120 2055
2121 return load_state_from_tss32(ctxt, ops, &tss_seg); 2056 return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2146 2081
2147 if (reason != TASK_SWITCH_IRET) { 2082 if (reason != TASK_SWITCH_IRET) {
2148 if ((tss_selector & 3) > next_tss_desc.dpl || 2083 if ((tss_selector & 3) > next_tss_desc.dpl ||
2149 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2084 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
2150 emulate_gp(ctxt, 0); 2085 return emulate_gp(ctxt, 0);
2151 return X86EMUL_PROPAGATE_FAULT;
2152 }
2153 } 2086 }
2154 2087
2155 desc_limit = desc_limit_scaled(&next_tss_desc); 2088 desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2231 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2164 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2232} 2165}
2233 2166
2234static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2167static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2235 int reg, struct operand *op) 2168 int reg, struct operand *op)
2236{ 2169{
2237 struct decode_cache *c = &ctxt->decode; 2170 struct decode_cache *c = &ctxt->decode;
2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2171 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2239 2172
2240 register_address_increment(c, &c->regs[reg], df * op->bytes); 2173 register_address_increment(c, &c->regs[reg], df * op->bytes);
2241 op->addr.mem = register_address(c, base, c->regs[reg]); 2174 op->addr.mem.ea = register_address(c, c->regs[reg]);
2175 op->addr.mem.seg = seg;
2242} 2176}
2243 2177
2244static int em_push(struct x86_emulate_ctxt *ctxt) 2178static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2369 struct decode_cache *c = &ctxt->decode; 2303 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0; 2304 u64 tsc = 0;
2371 2305
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { 2306 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
2373 emulate_gp(ctxt, 0); 2307 return emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); 2308 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2309 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2310 c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2647 2579
2648 op->type = OP_IMM; 2580 op->type = OP_IMM;
2649 op->bytes = size; 2581 op->bytes = size;
2650 op->addr.mem = c->eip; 2582 op->addr.mem.ea = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */ 2583 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) { 2584 switch (op->bytes) {
2653 case 1: 2585 case 1:
@@ -2678,7 +2610,7 @@ done:
2678} 2610}
2679 2611
2680int 2612int
2681x86_decode_insn(struct x86_emulate_ctxt *ctxt) 2613x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2682{ 2614{
2683 struct x86_emulate_ops *ops = ctxt->ops; 2615 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode; 2616 struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2689 struct operand memop = { .type = OP_NONE }; 2621 struct operand memop = { .type = OP_NONE };
2690 2622
2691 c->eip = ctxt->eip; 2623 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip; 2624 c->fetch.start = c->eip;
2625 c->fetch.end = c->fetch.start + insn_len;
2626 if (insn_len > 0)
2627 memcpy(c->fetch.data, insn, insn_len);
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); 2628 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694 2629
2695 switch (mode) { 2630 switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
2803 c->execute = opcode.u.execute; 2738 c->execute = opcode.u.execute;
2804 2739
2805 /* Unrecognised? */ 2740 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) { 2741 if (c->d == 0 || (c->d & Undefined))
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1; 2742 return -1;
2809 }
2810 2743
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8; 2745 c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
2831 if (!c->has_seg_override) 2764 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS); 2765 set_seg_override(c, VCPU_SREG_DS);
2833 2766
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) 2767 memop.addr.mem.seg = seg_override(ctxt, ops, c);
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836 2768
2837 if (memop.type == OP_MEM && c->ad_bytes != 8) 2769 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem; 2770 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
2839 2771
2840 if (memop.type == OP_MEM && c->rip_relative) 2772 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip; 2773 memop.addr.mem.ea += c->eip;
2842 2774
2843 /* 2775 /*
2844 * Decode and fetch the source operand: register, memory 2776 * Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
2890 case SrcSI: 2822 case SrcSI:
2891 c->src.type = OP_MEM; 2823 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2824 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem = 2825 c->src.addr.mem.ea =
2894 register_address(c, seg_override_base(ctxt, ops, c), 2826 register_address(c, c->regs[VCPU_REGS_RSI]);
2895 c->regs[VCPU_REGS_RSI]); 2827 c->src.addr.mem.seg = seg_override(ctxt, ops, c),
2896 c->src.val = 0; 2828 c->src.val = 0;
2897 break; 2829 break;
2898 case SrcImmFAddr: 2830 case SrcImmFAddr:
2899 c->src.type = OP_IMM; 2831 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip; 2832 c->src.addr.mem.ea = c->eip;
2901 c->src.bytes = c->op_bytes + 2; 2833 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 2834 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break; 2835 break;
@@ -2944,7 +2876,7 @@ done_prefixes:
2944 break; 2876 break;
2945 case DstImmUByte: 2877 case DstImmUByte:
2946 c->dst.type = OP_IMM; 2878 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip; 2879 c->dst.addr.mem.ea = c->eip;
2948 c->dst.bytes = 1; 2880 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip); 2881 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break; 2882 break;
@@ -2969,9 +2901,9 @@ done_prefixes:
2969 case DstDI: 2901 case DstDI:
2970 c->dst.type = OP_MEM; 2902 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2903 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem = 2904 c->dst.addr.mem.ea =
2973 register_address(c, es_base(ctxt, ops), 2905 register_address(c, c->regs[VCPU_REGS_RDI]);
2974 c->regs[VCPU_REGS_RDI]); 2906 c->dst.addr.mem.seg = VCPU_SREG_ES;
2975 c->dst.val = 0; 2907 c->dst.val = 0;
2976 break; 2908 break;
2977 case ImplicitOps: 2909 case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3020 ctxt->decode.mem_read.pos = 0; 2952 ctxt->decode.mem_read.pos = 0;
3021 2953
3022 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2954 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
3023 emulate_ud(ctxt); 2955 rc = emulate_ud(ctxt);
3024 goto done; 2956 goto done;
3025 } 2957 }
3026 2958
3027 /* LOCK prefix is allowed only with some instructions */ 2959 /* LOCK prefix is allowed only with some instructions */
3028 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2960 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
3029 emulate_ud(ctxt); 2961 rc = emulate_ud(ctxt);
3030 goto done; 2962 goto done;
3031 } 2963 }
3032 2964
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 2965 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt); 2966 rc = emulate_ud(ctxt);
3035 goto done; 2967 goto done;
3036 } 2968 }
3037 2969
3038 /* Privileged instruction can be executed only in CPL=0 */ 2970 /* Privileged instruction can be executed only in CPL=0 */
3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2971 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
3040 emulate_gp(ctxt, 0); 2972 rc = emulate_gp(ctxt, 0);
3041 goto done; 2973 goto done;
3042 } 2974 }
3043 2975
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3050 } 2982 }
3051 2983
3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 2984 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
3053 rc = read_emulated(ctxt, ops, c->src.addr.mem, 2985 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
3054 c->src.valptr, c->src.bytes); 2986 c->src.valptr, c->src.bytes);
3055 if (rc != X86EMUL_CONTINUE) 2987 if (rc != X86EMUL_CONTINUE)
3056 goto done; 2988 goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3058 } 2990 }
3059 2991
3060 if (c->src2.type == OP_MEM) { 2992 if (c->src2.type == OP_MEM) {
3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem, 2993 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
3062 &c->src2.val, c->src2.bytes); 2994 &c->src2.val, c->src2.bytes);
3063 if (rc != X86EMUL_CONTINUE) 2995 if (rc != X86EMUL_CONTINUE)
3064 goto done; 2996 goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3070 3002
3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3003 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3072 /* optimisation - avoid slow emulated read if Mov */ 3004 /* optimisation - avoid slow emulated read if Mov */
3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem, 3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
3074 &c->dst.val, c->dst.bytes); 3006 &c->dst.val, c->dst.bytes);
3075 if (rc != X86EMUL_CONTINUE) 3007 if (rc != X86EMUL_CONTINUE)
3076 goto done; 3008 goto done;
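In the read_emulated() hunks above, memory operands are now read through linear(ctxt, addr): the segment base is applied when the access happens rather than being folded into the operand during decode. A toy sketch of that translation — the segment-base table and its values are hypothetical, not the emulator's.

#include <stdio.h>

struct segmented_address {
        unsigned long ea;
        unsigned int seg;
};

/* hypothetical flat segment-base table, purely for illustration */
static const unsigned long seg_base[6] = { 0, 0, 0, 0x10000, 0, 0 };

/* linear address = base of the selected segment + effective address */
static unsigned long linearize(struct segmented_address addr)
{
        return seg_base[addr.seg] + addr.ea;
}

int main(void)
{
        struct segmented_address src = { .ea = 0x20, .seg = 3 };

        printf("linear = %#lx\n", linearize(src));      /* prints 0x10020 */
        return 0;
}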
@@ -3215,13 +3147,13 @@ special_insn:
3215 break; 3147 break;
3216 case 0x8c: /* mov r/m, sreg */ 3148 case 0x8c: /* mov r/m, sreg */
3217 if (c->modrm_reg > VCPU_SREG_GS) { 3149 if (c->modrm_reg > VCPU_SREG_GS) {
3218 emulate_ud(ctxt); 3150 rc = emulate_ud(ctxt);
3219 goto done; 3151 goto done;
3220 } 3152 }
3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3153 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
3222 break; 3154 break;
3223 case 0x8d: /* lea r16/r32, m */ 3155 case 0x8d: /* lea r16/r32, m */
3224 c->dst.val = c->src.addr.mem; 3156 c->dst.val = c->src.addr.mem.ea;
3225 break; 3157 break;
3226 case 0x8e: { /* mov seg, r/m16 */ 3158 case 0x8e: { /* mov seg, r/m16 */
3227 uint16_t sel; 3159 uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
3230 3162
3231 if (c->modrm_reg == VCPU_SREG_CS || 3163 if (c->modrm_reg == VCPU_SREG_CS ||
3232 c->modrm_reg > VCPU_SREG_GS) { 3164 c->modrm_reg > VCPU_SREG_GS) {
3233 emulate_ud(ctxt); 3165 rc = emulate_ud(ctxt);
3234 goto done; 3166 goto done;
3235 } 3167 }
3236 3168
@@ -3268,7 +3200,6 @@ special_insn:
3268 break; 3200 break;
3269 case 0xa6 ... 0xa7: /* cmps */ 3201 case 0xa6 ... 0xa7: /* cmps */
3270 c->dst.type = OP_NONE; /* Disable writeback. */ 3202 c->dst.type = OP_NONE; /* Disable writeback. */
3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
3272 goto cmp; 3203 goto cmp;
3273 case 0xa8 ... 0xa9: /* test ax, imm */ 3204 case 0xa8 ... 0xa9: /* test ax, imm */
3274 goto test; 3205 goto test;
@@ -3363,7 +3294,7 @@ special_insn:
3363 do_io_in: 3294 do_io_in:
3364 c->dst.bytes = min(c->dst.bytes, 4u); 3295 c->dst.bytes = min(c->dst.bytes, 4u);
3365 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3296 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3366 emulate_gp(ctxt, 0); 3297 rc = emulate_gp(ctxt, 0);
3367 goto done; 3298 goto done;
3368 } 3299 }
3369 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3300 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
3377 c->src.bytes = min(c->src.bytes, 4u); 3308 c->src.bytes = min(c->src.bytes, 4u);
3378 if (!emulator_io_permited(ctxt, ops, c->dst.val, 3309 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) { 3310 c->src.bytes)) {
3380 emulate_gp(ctxt, 0); 3311 rc = emulate_gp(ctxt, 0);
3381 goto done; 3312 goto done;
3382 } 3313 }
3383 ops->pio_out_emulated(c->src.bytes, c->dst.val, 3314 ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
3402 break; 3333 break;
3403 case 0xfa: /* cli */ 3334 case 0xfa: /* cli */
3404 if (emulator_bad_iopl(ctxt, ops)) { 3335 if (emulator_bad_iopl(ctxt, ops)) {
3405 emulate_gp(ctxt, 0); 3336 rc = emulate_gp(ctxt, 0);
3406 goto done; 3337 goto done;
3407 } else 3338 } else
3408 ctxt->eflags &= ~X86_EFLAGS_IF; 3339 ctxt->eflags &= ~X86_EFLAGS_IF;
3409 break; 3340 break;
3410 case 0xfb: /* sti */ 3341 case 0xfb: /* sti */
3411 if (emulator_bad_iopl(ctxt, ops)) { 3342 if (emulator_bad_iopl(ctxt, ops)) {
3412 emulate_gp(ctxt, 0); 3343 rc = emulate_gp(ctxt, 0);
3413 goto done; 3344 goto done;
3414 } else { 3345 } else {
3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3346 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
3449 c->dst.type = saved_dst_type; 3380 c->dst.type = saved_dst_type;
3450 3381
3451 if ((c->d & SrcMask) == SrcSI) 3382 if ((c->d & SrcMask) == SrcSI)
3452 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 3383 string_addr_inc(ctxt, seg_override(ctxt, ops, c),
3453 VCPU_REGS_RSI, &c->src); 3384 VCPU_REGS_RSI, &c->src);
3454 3385
3455 if ((c->d & DstMask) == DstDI) 3386 if ((c->d & DstMask) == DstDI)
3456 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 3387 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3457 &c->dst); 3388 &c->dst);
3458 3389
3459 if (c->rep_prefix && (c->d & String)) { 3390 if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
3482 ctxt->eip = c->eip; 3413 ctxt->eip = c->eip;
3483 3414
3484done: 3415done:
3416 if (rc == X86EMUL_PROPAGATE_FAULT)
3417 ctxt->have_exception = true;
3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3418 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3486 3419
3487twobyte_insn: 3420twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
3544 break; 3477 break;
3545 case 5: /* not defined */ 3478 case 5: /* not defined */
3546 emulate_ud(ctxt); 3479 emulate_ud(ctxt);
3480 rc = X86EMUL_PROPAGATE_FAULT;
3547 goto done; 3481 goto done;
3548 case 7: /* invlpg*/ 3482 case 7: /* invlpg*/
3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem); 3483 emulate_invlpg(ctxt->vcpu,
3484 linear(ctxt, c->src.addr.mem));
3550 /* Disable writeback. */ 3485 /* Disable writeback. */
3551 c->dst.type = OP_NONE; 3486 c->dst.type = OP_NONE;
3552 break; 3487 break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
3573 case 5 ... 7: 3508 case 5 ... 7:
3574 case 9 ... 15: 3509 case 9 ... 15:
3575 emulate_ud(ctxt); 3510 emulate_ud(ctxt);
3511 rc = X86EMUL_PROPAGATE_FAULT;
3576 goto done; 3512 goto done;
3577 } 3513 }
3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3514 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3517 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3582 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3518 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3583 emulate_ud(ctxt); 3519 emulate_ud(ctxt);
3520 rc = X86EMUL_PROPAGATE_FAULT;
3584 goto done; 3521 goto done;
3585 } 3522 }
3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); 3523 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
3588 case 0x22: /* mov reg, cr */ 3525 case 0x22: /* mov reg, cr */
3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 3526 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3590 emulate_gp(ctxt, 0); 3527 emulate_gp(ctxt, 0);
3528 rc = X86EMUL_PROPAGATE_FAULT;
3591 goto done; 3529 goto done;
3592 } 3530 }
3593 c->dst.type = OP_NONE; 3531 c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
3596 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3534 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3597 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3535 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3598 emulate_ud(ctxt); 3536 emulate_ud(ctxt);
3537 rc = X86EMUL_PROPAGATE_FAULT;
3599 goto done; 3538 goto done;
3600 } 3539 }
3601 3540
@@ -3604,6 +3543,7 @@ twobyte_insn:
3604 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3543 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3605 /* #UD condition is already handled by the code above */ 3544 /* #UD condition is already handled by the code above */
3606 emulate_gp(ctxt, 0); 3545 emulate_gp(ctxt, 0);
3546 rc = X86EMUL_PROPAGATE_FAULT;
3607 goto done; 3547 goto done;
3608 } 3548 }
3609 3549
@@ -3615,6 +3555,7 @@ twobyte_insn:
3615 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3555 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3616 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3556 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3617 emulate_gp(ctxt, 0); 3557 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT;
3618 goto done; 3559 goto done;
3619 } 3560 }
3620 rc = X86EMUL_CONTINUE; 3561 rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
3623 /* rdmsr */ 3564 /* rdmsr */
3624 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3565 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3625 emulate_gp(ctxt, 0); 3566 emulate_gp(ctxt, 0);
3567 rc = X86EMUL_PROPAGATE_FAULT;
3626 goto done; 3568 goto done;
3627 } else { 3569 } else {
3628 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3570 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
3785 goto writeback; 3727 goto writeback;
3786 3728
3787cannot_emulate: 3729cannot_emulate:
3788 DPRINTF("Cannot emulate %02x\n", c->b);
3789 return -1; 3730 return -1;
3790} 3731}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
73 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
74} 74}
75 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
76static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
77{ 84{
78 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
84 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
85} 92}
86 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
87#endif 109#endif
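kvm_read_cr3() above follows the same lazy-caching pattern as the other cached registers: a per-register availability bit is tested, and only on a miss is the value decached from the hardware control structure. A standalone sketch of the pattern — the bitmap layout and the decache callback are stand-ins, not KVM's.

#include <stdio.h>

enum { REG_CR3 = 0 };   /* stand-in register index */

struct vcpu_sketch {
        unsigned long regs_avail;       /* one bit per lazily cached register */
        unsigned long cr3;
};

static void decache_cr3(struct vcpu_sketch *v)
{
        v->cr3 = 0x1234000;             /* pretend this reads the VMCS/VMCB */
        v->regs_avail |= 1UL << REG_CR3;
}

static unsigned long read_cr3(struct vcpu_sketch *v)
{
        if (!(v->regs_avail & (1UL << REG_CR3)))
                decache_cr3(v);         /* miss: refresh from hardware state */
        return v->cr3;
}

int main(void)
{
        struct vcpu_sketch v = { 0, 0 };

        printf("cr3 = %#lx\n", read_cr3(&v));   /* first read fills the cache */
        printf("cr3 = %#lx\n", read_cr3(&v));   /* second read is a cache hit */
        return 0;
}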
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
277 277
278 if (old_ppr != ppr) { 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr); 279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 } 282 }
282} 283}
283 284
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04aee8301..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
194 196
195static u64 __read_mostly shadow_trap_nonpresent_pte; 197static u64 __read_mostly shadow_trap_nonpresent_pte;
196static u64 __read_mostly shadow_notrap_nonpresent_pte; 198static u64 __read_mostly shadow_notrap_nonpresent_pte;
197static u64 __read_mostly shadow_base_present_pte;
198static u64 __read_mostly shadow_nx_mask; 199static u64 __read_mostly shadow_nx_mask;
199static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 200static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
200static u64 __read_mostly shadow_user_mask; 201static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
213} 214}
214EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 215EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
215 216
216void kvm_mmu_set_base_ptes(u64 base_pte)
217{
218 shadow_base_present_pte = base_pte;
219}
220EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
221
222void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 217void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223 u64 dirty_mask, u64 nx_mask, u64 x_mask) 218 u64 dirty_mask, u64 nx_mask, u64 x_mask)
224{ 219{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
482} 477}
483 478
484/* 479/*
485 * Return the pointer to the largepage write count for a given 480 * Return the pointer to the large page information for a given gfn,
486 * gfn, handling slots that are not large page aligned. 481 * handling slots that are not large page aligned.
487 */ 482 */
488static int *slot_largepage_idx(gfn_t gfn, 483static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
489 struct kvm_memory_slot *slot, 484 struct kvm_memory_slot *slot,
490 int level) 485 int level)
491{ 486{
492 unsigned long idx; 487 unsigned long idx;
493 488
494 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 489 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
495 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 490 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
496 return &slot->lpage_info[level - 2][idx].write_count; 491 return &slot->lpage_info[level - 2][idx];
497} 492}
498 493
499static void account_shadowed(struct kvm *kvm, gfn_t gfn) 494static void account_shadowed(struct kvm *kvm, gfn_t gfn)
500{ 495{
501 struct kvm_memory_slot *slot; 496 struct kvm_memory_slot *slot;
502 int *write_count; 497 struct kvm_lpage_info *linfo;
503 int i; 498 int i;
504 499
505 slot = gfn_to_memslot(kvm, gfn); 500 slot = gfn_to_memslot(kvm, gfn);
506 for (i = PT_DIRECTORY_LEVEL; 501 for (i = PT_DIRECTORY_LEVEL;
507 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 502 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
508 write_count = slot_largepage_idx(gfn, slot, i); 503 linfo = lpage_info_slot(gfn, slot, i);
509 *write_count += 1; 504 linfo->write_count += 1;
510 } 505 }
511} 506}
512 507
513static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 508static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
514{ 509{
515 struct kvm_memory_slot *slot; 510 struct kvm_memory_slot *slot;
516 int *write_count; 511 struct kvm_lpage_info *linfo;
517 int i; 512 int i;
518 513
519 slot = gfn_to_memslot(kvm, gfn); 514 slot = gfn_to_memslot(kvm, gfn);
520 for (i = PT_DIRECTORY_LEVEL; 515 for (i = PT_DIRECTORY_LEVEL;
521 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 516 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
522 write_count = slot_largepage_idx(gfn, slot, i); 517 linfo = lpage_info_slot(gfn, slot, i);
523 *write_count -= 1; 518 linfo->write_count -= 1;
524 WARN_ON(*write_count < 0); 519 WARN_ON(linfo->write_count < 0);
525 } 520 }
526} 521}
527 522
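lpage_info_slot() above returns the whole per-large-page record instead of a bare write_count pointer; the index arithmetic itself is unchanged: shift both the gfn and the slot's base gfn down by the level's huge-page shift and subtract. A sketch of that computation with illustrative shift values (9 bits per level, i.e. 4K -> 2M -> 1G).

#include <stdio.h>

/* illustrative per-level shifts: level 1 = 4K, 2 = 2M, 3 = 1G */
static const unsigned int hpage_gfn_shift[] = { 0, 9, 18 };

static unsigned long lpage_index(unsigned long gfn, unsigned long base_gfn,
                                 int level)
{
        unsigned int sh = hpage_gfn_shift[level - 1];

        /* offset of the huge page containing gfn within the memslot */
        return (gfn >> sh) - (base_gfn >> sh);
}

int main(void)
{
        /* gfn 0x1234 in a slot starting at gfn 0x1000, at the 2M level */
        printf("idx = %lu\n", lpage_index(0x1234, 0x1000, 2));  /* prints 1 */
        return 0;
}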
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
530 int level) 525 int level)
531{ 526{
532 struct kvm_memory_slot *slot; 527 struct kvm_memory_slot *slot;
533 int *largepage_idx; 528 struct kvm_lpage_info *linfo;
534 529
535 slot = gfn_to_memslot(kvm, gfn); 530 slot = gfn_to_memslot(kvm, gfn);
536 if (slot) { 531 if (slot) {
537 largepage_idx = slot_largepage_idx(gfn, slot, level); 532 linfo = lpage_info_slot(gfn, slot, level);
538 return *largepage_idx; 533 return linfo->write_count;
539 } 534 }
540 535
541 return 1; 536 return 1;
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
559 return ret; 554 return ret;
560} 555}
561 556
562static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
563{ 558{
564 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
565 int host_level, level, max_level;
566
567 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
568 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
569 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
570 569
571 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
572 571
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
590static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 589static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
591{ 590{
592 struct kvm_memory_slot *slot; 591 struct kvm_memory_slot *slot;
593 unsigned long idx; 592 struct kvm_lpage_info *linfo;
594 593
595 slot = gfn_to_memslot(kvm, gfn); 594 slot = gfn_to_memslot(kvm, gfn);
596 if (likely(level == PT_PAGE_TABLE_LEVEL)) 595 if (likely(level == PT_PAGE_TABLE_LEVEL))
597 return &slot->rmap[gfn - slot->base_gfn]; 596 return &slot->rmap[gfn - slot->base_gfn];
598 597
599 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 598 linfo = lpage_info_slot(gfn, slot, level);
600 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
601 599
602 return &slot->lpage_info[level - 2][idx].rmap_pde; 600 return &linfo->rmap_pde;
603} 601}
604 602
605/* 603/*
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
887 end = start + (memslot->npages << PAGE_SHIFT); 885 end = start + (memslot->npages << PAGE_SHIFT);
888 if (hva >= start && hva < end) { 886 if (hva >= start && hva < end) {
889 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 887 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
888 gfn_t gfn = memslot->base_gfn + gfn_offset;
890 889
891 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 890 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
892 891
893 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 892 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
894 unsigned long idx; 893 struct kvm_lpage_info *linfo;
895 int sh; 894
896 895 linfo = lpage_info_slot(gfn, memslot,
897 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 896 PT_DIRECTORY_LEVEL + j);
898 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 897 ret |= handler(kvm, &linfo->rmap_pde, data);
899 (memslot->base_gfn >> sh);
900 ret |= handler(kvm,
901 &memslot->lpage_info[j][idx].rmap_pde,
902 data);
903 } 898 }
904 trace_kvm_age_page(hva, memslot, ret); 899 trace_kvm_age_page(hva, memslot, ret);
905 retval |= ret; 900 retval |= ret;
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
950 return young; 945 return young;
951} 946}
952 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
953#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
954 978
955static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
970 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
971} 995}
972 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
973#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
974static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
975{ 1004{
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1161} 1190}
1162 1191
1163static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1192static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1164 struct kvm_mmu_page *sp, bool clear_unsync) 1193 struct kvm_mmu_page *sp)
1165{ 1194{
1166 return 1; 1195 return 1;
1167} 1196}
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1291 if (clear_unsync) 1320 if (clear_unsync)
1292 kvm_unlink_unsync_page(vcpu->kvm, sp); 1321 kvm_unlink_unsync_page(vcpu->kvm, sp);
1293 1322
1294 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1323 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1295 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1324 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1296 return 1; 1325 return 1;
1297 } 1326 }
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1332 continue; 1361 continue;
1333 1362
1334 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1363 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1364 kvm_unlink_unsync_page(vcpu->kvm, s);
1335 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1365 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1336 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1366 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1337 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1367 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1338 continue; 1368 continue;
1339 } 1369 }
1340 kvm_unlink_unsync_page(vcpu->kvm, s);
1341 flush = true; 1370 flush = true;
1342 } 1371 }
1343 1372
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1963 unsigned pte_access, int user_fault, 1992 unsigned pte_access, int user_fault,
1964 int write_fault, int dirty, int level, 1993 int write_fault, int dirty, int level,
1965 gfn_t gfn, pfn_t pfn, bool speculative, 1994 gfn_t gfn, pfn_t pfn, bool speculative,
1966 bool can_unsync, bool reset_host_protection) 1995 bool can_unsync, bool host_writable)
1967{ 1996{
1968 u64 spte; 1997 u64 spte, entry = *sptep;
1969 int ret = 0; 1998 int ret = 0;
1970 1999
1971 /* 2000 /*
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 * whether the guest actually used the pte (in order to detect 2002 * whether the guest actually used the pte (in order to detect
1974 * demand paging). 2003 * demand paging).
1975 */ 2004 */
1976 spte = shadow_base_present_pte; 2005 spte = PT_PRESENT_MASK;
1977 if (!speculative) 2006 if (!speculative)
1978 spte |= shadow_accessed_mask; 2007 spte |= shadow_accessed_mask;
1979 if (!dirty) 2008 if (!dirty)
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1990 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2019 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1991 kvm_is_mmio_pfn(pfn)); 2020 kvm_is_mmio_pfn(pfn));
1992 2021
1993 if (reset_host_protection) 2022 if (host_writable)
1994 spte |= SPTE_HOST_WRITEABLE; 2023 spte |= SPTE_HOST_WRITEABLE;
2024 else
2025 pte_access &= ~ACC_WRITE_MASK;
1995 2026
1996 spte |= (u64)pfn << PAGE_SHIFT; 2027 spte |= (u64)pfn << PAGE_SHIFT;
1997 2028
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2036 2067
2037set_pte: 2068set_pte:
2038 update_spte(sptep, spte); 2069 update_spte(sptep, spte);
2070 /*
2071 * If we overwrite a writable spte with a read-only one we
2072 * should flush remote TLBs. Otherwise rmap_write_protect
2073 * will find a read-only spte, even though the writable spte
2074 * might be cached on a CPU's TLB.
2075 */
2076 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2077 kvm_flush_remote_tlbs(vcpu->kvm);
2039done: 2078done:
2040 return ret; 2079 return ret;
2041} 2080}
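The set_spte() hunk above snapshots the old spte in `entry` and, after the update, flushes remote TLBs whenever write permission was removed, since another CPU may still hold the old writable translation. A minimal sketch of that old-vs-new writability check; the writable bit below is a stand-in mask.

#include <stdbool.h>
#include <stdio.h>

#define SPTE_WRITABLE (1ULL << 1)       /* stand-in for the writable bit */

static bool is_writable(unsigned long long spte)
{
        return spte & SPTE_WRITABLE;
}

/* returns true when remote TLBs must be flushed after the update */
static bool update_spte_needs_flush(unsigned long long *sptep,
                                    unsigned long long new_spte)
{
        unsigned long long entry = *sptep;      /* snapshot before the update */

        *sptep = new_spte;
        /* write permission removed: a stale writable TLB entry may remain */
        return is_writable(entry) && !is_writable(*sptep);
}

int main(void)
{
        unsigned long long spte = SPTE_WRITABLE | 0x1000;

        printf("flush = %d\n", update_spte_needs_flush(&spte, 0x1000)); /* 1 */
        return 0;
}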
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 int user_fault, int write_fault, int dirty, 2084 int user_fault, int write_fault, int dirty,
2046 int *ptwrite, int level, gfn_t gfn, 2085 int *ptwrite, int level, gfn_t gfn,
2047 pfn_t pfn, bool speculative, 2086 pfn_t pfn, bool speculative,
2048 bool reset_host_protection) 2087 bool host_writable)
2049{ 2088{
2050 int was_rmapped = 0; 2089 int was_rmapped = 0;
2051 int rmap_count; 2090 int rmap_count;
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2080 2119
2081 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2120 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2082 dirty, level, gfn, pfn, speculative, true, 2121 dirty, level, gfn, pfn, speculative, true,
2083 reset_host_protection)) { 2122 host_writable)) {
2084 if (write_fault) 2123 if (write_fault)
2085 *ptwrite = 1; 2124 *ptwrite = 1;
2086 kvm_mmu_flush_tlb(vcpu); 2125 kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2211} 2250}
2212 2251
2213static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2252static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2214 int level, gfn_t gfn, pfn_t pfn) 2253 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2254 bool prefault)
2215{ 2255{
2216 struct kvm_shadow_walk_iterator iterator; 2256 struct kvm_shadow_walk_iterator iterator;
2217 struct kvm_mmu_page *sp; 2257 struct kvm_mmu_page *sp;
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2220 2260
2221 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2261 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2222 if (iterator.level == level) { 2262 if (iterator.level == level) {
2223 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2263 unsigned pte_access = ACC_ALL;
2264
2265 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2224 0, write, 1, &pt_write, 2266 0, write, 1, &pt_write,
2225 level, gfn, pfn, false, true); 2267 level, gfn, pfn, prefault, map_writable);
2226 direct_pte_prefetch(vcpu, iterator.sptep); 2268 direct_pte_prefetch(vcpu, iterator.sptep);
2227 ++vcpu->stat.pf_fixed; 2269 ++vcpu->stat.pf_fixed;
2228 break; 2270 break;
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2277 return 1; 2319 return 1;
2278} 2320}
2279 2321
2280static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this were a
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't become splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn to tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2366
2367static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2368 bool prefault)
2281{ 2369{
2282 int r; 2370 int r;
2283 int level; 2371 int level;
2372 int force_pt_level;
2284 pfn_t pfn; 2373 pfn_t pfn;
2285 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2375 bool map_writable;
2286 2376
2287 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2288 2378 if (likely(!force_pt_level)) {
2289 /* 2379 level = mapping_level(vcpu, gfn);
2290 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2291 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2292 */ 2382 * 2mb pages at maximum. Therefore check if the level
2293 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2294 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2295 2387
2296 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2297 2391
2298 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2299 smp_rmb(); 2393 smp_rmb();
2300 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2394
2395 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2396 return 0;
2301 2397
2302 /* mmio */ 2398 /* mmio */
2303 if (is_error_pfn(pfn)) 2399 if (is_error_pfn(pfn))
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2307 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2308 goto out_unlock; 2404 goto out_unlock;
2309 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2310 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2409 prefault);
2311 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
2312 2411
2313 2412
@@ -2530,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2530 hpa_t root = vcpu->arch.mmu.root_hpa; 2629 hpa_t root = vcpu->arch.mmu.root_hpa;
2531 sp = page_header(root); 2630 sp = page_header(root);
2532 mmu_sync_children(vcpu, sp); 2631 mmu_sync_children(vcpu, sp);
2632 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2533 return; 2633 return;
2534 } 2634 }
2535 for (i = 0; i < 4; ++i) { 2635 for (i = 0; i < 4; ++i) {
@@ -2552,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2552} 2652}
2553 2653
2554static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2654static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2555 u32 access, u32 *error) 2655 u32 access, struct x86_exception *exception)
2556{ 2656{
2557 if (error) 2657 if (exception)
2558 *error = 0; 2658 exception->error_code = 0;
2559 return vaddr; 2659 return vaddr;
2560} 2660}
2561 2661
2562static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 2662static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2563 u32 access, u32 *error) 2663 u32 access,
2664 struct x86_exception *exception)
2564{ 2665{
2565 if (error) 2666 if (exception)
2566 *error = 0; 2667 exception->error_code = 0;
2567 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2668 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2568} 2669}
2569 2670
2570static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2671static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2571 u32 error_code) 2672 u32 error_code, bool prefault)
2572{ 2673{
2573 gfn_t gfn; 2674 gfn_t gfn;
2574 int r; 2675 int r;
@@ -2584,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2584 gfn = gva >> PAGE_SHIFT; 2685 gfn = gva >> PAGE_SHIFT;
2585 2686
2586 return nonpaging_map(vcpu, gva & PAGE_MASK, 2687 return nonpaging_map(vcpu, gva & PAGE_MASK,
2587 error_code & PFERR_WRITE_MASK, gfn); 2688 error_code & PFERR_WRITE_MASK, gfn, prefault);
2689}
2690
2691static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2692{
2693 struct kvm_arch_async_pf arch;
2694
2695 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2696 arch.gfn = gfn;
2697 arch.direct_map = vcpu->arch.mmu.direct_map;
2698 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2699
2700 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2701}
2702
2703static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2704{
2705 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2706 kvm_event_needs_reinjection(vcpu)))
2707 return false;
2708
2709 return kvm_x86_ops->interrupt_allowed(vcpu);
2588} 2710}
2589 2711
2590static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2712static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2591 u32 error_code) 2713 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2714{
2715 bool async;
2716
2717 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2718
2719 if (!async)
2720 return false; /* *pfn has correct page already */
2721
2722 put_page(pfn_to_page(*pfn));
2723
2724 if (!prefault && can_do_async_pf(vcpu)) {
2725 trace_kvm_try_async_get_page(gva, gfn);
2726 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2727 trace_kvm_async_pf_doublefault(gva, gfn);
2728 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2729 return true;
2730 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2731 return true;
2732 }
2733
2734 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2735
2736 return false;
2737}
2738
2739static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2740 bool prefault)
2592{ 2741{
2593 pfn_t pfn; 2742 pfn_t pfn;
2594 int r; 2743 int r;
2595 int level; 2744 int level;
2745 int force_pt_level;
2596 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2597 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2748 int write = error_code & PFERR_WRITE_MASK;
2749 bool map_writable;
2598 2750
2599 ASSERT(vcpu); 2751 ASSERT(vcpu);
2600 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2752 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
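try_async_pf() above is the heart of the async page-fault path: the gfn-to-pfn lookup is first attempted without blocking; if it would have to wait (for example, the page is swapped out) and this is not a prefault, an async work item is queued and the vCPU keeps running, otherwise the code falls back to a synchronous lookup. A compressed, standalone sketch of that decision flow — every helper below is a stand-in invented for the sketch, not a KVM function.

#include <stdbool.h>
#include <stdio.h>

static void lookup_pfn_nonblocking(unsigned long gfn, unsigned long *pfn,
                                   bool *would_block)
{
        *pfn = gfn + 0x1000;            /* pretend translation          */
        *would_block = (gfn == 0x42);   /* pretend this gfn is swapped out */
}

static bool queue_async_pf(unsigned long gfn)
{
        printf("queued async fault for gfn %#lx\n", gfn);
        return true;
}

/* returns true when the fault will be completed asynchronously */
static bool try_async_pf(bool prefault, unsigned long gfn, unsigned long *pfn)
{
        bool would_block;

        lookup_pfn_nonblocking(gfn, pfn, &would_block);
        if (!would_block)
                return false;           /* *pfn is already usable */

        if (!prefault && queue_async_pf(gfn))
                return true;            /* guest keeps running meanwhile */

        /* fall back to a synchronous (possibly blocking) lookup */
        lookup_pfn_nonblocking(gfn, pfn, &would_block);
        return false;
}

int main(void)
{
        unsigned long pfn;

        printf("async = %d\n", try_async_pf(false, 0x42, &pfn));
        return 0;
}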
@@ -2603,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2603 if (r) 2755 if (r)
2604 return r; 2756 return r;
2605 2757
2606 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2607 2759 if (likely(!force_pt_level)) {
2608 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2609 2764
2610 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2611 smp_rmb(); 2766 smp_rmb();
2612 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2767
2768 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2769 return 0;
2770
2771 /* mmio */
2613 if (is_error_pfn(pfn)) 2772 if (is_error_pfn(pfn))
2614 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2773 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2615 spin_lock(&vcpu->kvm->mmu_lock); 2774 spin_lock(&vcpu->kvm->mmu_lock);
2616 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2617 goto out_unlock; 2776 goto out_unlock;
2618 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2619 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2778 if (likely(!force_pt_level))
2620 level, gfn, pfn); 2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2780 r = __direct_map(vcpu, gpa, write, map_writable,
2781 level, gfn, pfn, prefault);
2621 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
2622 2783
2623 return r; 2784 return r;
@@ -2659,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2659 2820
2660static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2661{ 2822{
2662 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2663 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2664} 2825}
2665 2826
2666static unsigned long get_cr3(struct kvm_vcpu *vcpu) 2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2667{ 2828{
2668 return vcpu->arch.cr3; 2829 return kvm_read_cr3(vcpu);
2669} 2830}
2670 2831
2671static void inject_page_fault(struct kvm_vcpu *vcpu) 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2833 struct x86_exception *fault)
2672{ 2834{
2673 vcpu->arch.mmu.inject_page_fault(vcpu); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2674} 2836}
2675 2837
2676static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2816,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2816{ 2978{
2817 struct kvm_mmu *context = vcpu->arch.walk_mmu; 2979 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2818 2980
2981 context->base_role.word = 0;
2819 context->new_cr3 = nonpaging_new_cr3; 2982 context->new_cr3 = nonpaging_new_cr3;
2820 context->page_fault = tdp_page_fault; 2983 context->page_fault = tdp_page_fault;
2821 context->free = nonpaging_free; 2984 context->free = nonpaging_free;
@@ -3008,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3008 return; 3171 return;
3009 } 3172 }
3010 3173
3011 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
3012 return;
3013
3014 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
3015 if (!sp->role.cr4_pae) 3175 if (!sp->role.cr4_pae)
3016 paging32_update_pte(vcpu, sp, spte, new); 3176 paging32_update_pte(vcpu, sp, spte, new);
@@ -3264,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3264 } 3424 }
3265} 3425}
3266 3426
3267int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3427int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3428 void *insn, int insn_len)
3268{ 3429{
3269 int r; 3430 int r;
3270 enum emulation_result er; 3431 enum emulation_result er;
3271 3432
3272 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3433 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3273 if (r < 0) 3434 if (r < 0)
3274 goto out; 3435 goto out;
3275 3436
@@ -3282,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
3282 if (r) 3443 if (r)
3283 goto out; 3444 goto out;
3284 3445
3285 er = emulate_instruction(vcpu, cr2, error_code, 0); 3446 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3286 3447
3287 switch (er) { 3448 switch (er) {
3288 case EMULATE_DONE: 3449 case EMULATE_DONE:
@@ -3377,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3377 if (!test_bit(slot, sp->slot_bitmap)) 3538 if (!test_bit(slot, sp->slot_bitmap))
3378 continue; 3539 continue;
3379 3540
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3380 pt = sp->spt; 3544 pt = sp->spt;
3381 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3382 /* avoid RMW */ 3546 /* avoid RMW */
3383 if (is_writable_pte(pt[i])) 3547 if (is_writable_pte(pt[i]))
3384 pt[i] &= ~PT_WRITABLE_MASK; 3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3385 } 3549 }
3386 kvm_flush_remote_tlbs(kvm); 3550 kvm_flush_remote_tlbs(kvm);
3387} 3551}
@@ -3463,13 +3627,6 @@ static void mmu_destroy_caches(void)
3463 kmem_cache_destroy(mmu_page_header_cache); 3627 kmem_cache_destroy(mmu_page_header_cache);
3464} 3628}
3465 3629
3466void kvm_mmu_module_exit(void)
3467{
3468 mmu_destroy_caches();
3469 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3470 unregister_shrinker(&mmu_shrinker);
3471}
3472
3473int kvm_mmu_module_init(void) 3630int kvm_mmu_module_init(void)
3474{ 3631{
3475 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3632 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3566,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3566 3723
3567static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3724static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3568{ 3725{
3569 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3726 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3570 return 1; 3727 return 1;
3571} 3728}
3572 3729
@@ -3662,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3662} 3819}
3663EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3820EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3664 3821
3665#ifdef CONFIG_KVM_MMU_AUDIT
3666#include "mmu_audit.c"
3667#else
3668static void mmu_audit_disable(void) { }
3669#endif
3670
3671void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3822void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3672{ 3823{
3673 ASSERT(vcpu); 3824 ASSERT(vcpu);
@@ -3675,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3675 destroy_kvm_mmu(vcpu); 3826 destroy_kvm_mmu(vcpu);
3676 free_mmu_pages(vcpu); 3827 free_mmu_pages(vcpu);
3677 mmu_free_memory_caches(vcpu); 3828 mmu_free_memory_caches(vcpu);
3829}
3830
3831#ifdef CONFIG_KVM_MMU_AUDIT
3832#include "mmu_audit.c"
3833#else
3834static void mmu_audit_disable(void) { }
3835#endif
3836
3837void kvm_mmu_module_exit(void)
3838{
3839 mmu_destroy_caches();
3840 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3841 unregister_shrinker(&mmu_shrinker);
3678 mmu_audit_disable(); 3842 mmu_audit_disable();
3679} 3843}
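The tail of the mmu.c diff moves kvm_mmu_module_exit() and the conditional mmu_audit.c include to the end of the file: the exit path calls mmu_audit_disable(), so the real implementation (or its empty stub) must be visible before it. A tiny sketch of that conditional-include-with-stub pattern, using invented CONFIG_FOO_AUDIT/foo_* names.

#include <stdio.h>

/* #define CONFIG_FOO_AUDIT 1 */        /* flip on to pull in the real code */

#ifdef CONFIG_FOO_AUDIT
#include "foo_audit.c"                  /* would provide foo_audit_disable() */
#else
static void foo_audit_disable(void) { } /* stub keeps the caller below happy */
#endif

static void foo_module_exit(void)
{
        foo_audit_disable();            /* must already be declared here */
        printf("module exit\n");
}

int main(void)
{
        foo_module_exit();
        return 0;
}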
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
19 19
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21 21
22static int audit_point; 22#define audit_printk(kvm, fmt, args...) \
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \ 23 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args) 24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
27 25
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); 26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29 27
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97 95
98 if (sp->unsync) { 96 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) { 97 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level); 98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
101 return; 100 return;
102 } 101 }
103 102
104 if (*sptep == shadow_notrap_nonpresent_pte) { 103 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp); 104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return; 106 return;
107 } 107 }
108 } 108 }
109 109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { 110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp); 111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
112 return; 113 return;
113 } 114 }
114 115
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
125 126
126 hpa = pfn << PAGE_SHIFT; 127 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn", 129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep); 130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
130} 132}
131 133
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
142 if (!gfn_to_memslot(kvm, gfn)) { 144 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit()) 145 if (!printk_ratelimit())
144 return; 146 return;
145 audit_printk("no memslot for gfn %llx\n", gfn); 147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n", 148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn); 149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack(); 150 dump_stack();
149 return; 151 return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
153 if (!*rmapp) { 155 if (!*rmapp) {
154 if (!printk_ratelimit()) 156 if (!printk_ratelimit())
155 return; 157 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep); 158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
157 dump_stack(); 160 dump_stack();
158 } 161 }
159} 162}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{ 171{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170 173
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync) 174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp); 175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
173} 177}
174 178
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) 179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
202 spte = rmap_next(kvm, rmapp, NULL); 206 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) { 207 while (spte) {
204 if (is_writable_pte(*spte)) 208 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn " 209 audit_printk(kvm, "shadow page has writable "
206 "%llx role %x\n", sp->gfn, sp->role.word); 210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte); 212 spte = rmap_next(kvm, rmapp, spte);
208 } 213 }
209} 214}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
238 if (!__ratelimit(&ratelimit_state)) 243 if (!__ratelimit(&ratelimit_state))
239 return; 244 return;
240 245
241 audit_point = point; 246 vcpu->kvm->arch.audit_point = point;
242 audit_all_active_sps(vcpu->kvm); 247 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu); 248 audit_vcpu_spte(vcpu);
244} 249}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
72 unsigned pt_access; 72 unsigned pt_access;
73 unsigned pte_access; 73 unsigned pte_access;
74 gfn_t gfn; 74 gfn_t gfn;
75 u32 error_code; 75 struct x86_exception fault;
76}; 76};
77 77
78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
266 return 1; 266 return 1;
267 267
268error: 268error:
269 walker->error_code = 0; 269 walker->fault.vector = PF_VECTOR;
270 walker->fault.error_code_valid = true;
271 walker->fault.error_code = 0;
270 if (present) 272 if (present)
271 walker->error_code |= PFERR_PRESENT_MASK; 273 walker->fault.error_code |= PFERR_PRESENT_MASK;
272 274
273 walker->error_code |= write_fault | user_fault; 275 walker->fault.error_code |= write_fault | user_fault;
274 276
275 if (fetch_fault && mmu->nx) 277 if (fetch_fault && mmu->nx)
276 walker->error_code |= PFERR_FETCH_MASK; 278 walker->fault.error_code |= PFERR_FETCH_MASK;
277 if (rsvd_fault) 279 if (rsvd_fault)
278 walker->error_code |= PFERR_RSVD_MASK; 280 walker->fault.error_code |= PFERR_RSVD_MASK;
279 281
280 vcpu->arch.fault.address = addr; 282 walker->fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code; 283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
282 284
283 trace_kvm_mmu_walker_error(walker->error_code); 285 trace_kvm_mmu_walker_error(walker->fault.error_code);
284 return 0; 286 return 0;
285} 287}
286 288
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
299 addr, access); 301 addr, access);
300} 302}
301 303
304static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
305 struct kvm_mmu_page *sp, u64 *spte,
306 pt_element_t gpte)
307{
308 u64 nonpresent = shadow_trap_nonpresent_pte;
309
310 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
311 goto no_present;
312
313 if (!is_present_gpte(gpte)) {
314 if (!sp->unsync)
315 nonpresent = shadow_notrap_nonpresent_pte;
316 goto no_present;
317 }
318
319 if (!(gpte & PT_ACCESSED_MASK))
320 goto no_present;
321
322 return false;
323
324no_present:
325 drop_spte(vcpu->kvm, spte, nonpresent);
326 return true;
327}
328
302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
303 u64 *spte, const void *pte) 330 u64 *spte, const void *pte)
304{ 331{
305 pt_element_t gpte; 332 pt_element_t gpte;
306 unsigned pte_access; 333 unsigned pte_access;
307 pfn_t pfn; 334 pfn_t pfn;
308 u64 new_spte;
309 335
310 gpte = *(const pt_element_t *)pte; 336 gpte = *(const pt_element_t *)pte;
311 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 337 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
312 if (!is_present_gpte(gpte)) {
313 if (sp->unsync)
314 new_spte = shadow_trap_nonpresent_pte;
315 else
316 new_spte = shadow_notrap_nonpresent_pte;
317 __set_spte(spte, new_spte);
318 }
319 return; 338 return;
320 } 339
321 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
322 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
323 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
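FNAME(prefetch_invalid_gpte)(), introduced in the hunk above, folds the three "is this guest PTE worth shadowing" tests — reserved bits set, not present, accessed bit clear — into one helper that also drops the stale spte. A standalone sketch of the same check order, with invented bit masks.

#include <stdbool.h>
#include <stdio.h>

/* invented bit layout, just for the sketch */
#define GPTE_PRESENT  (1ULL << 0)
#define GPTE_ACCESSED (1ULL << 5)
#define GPTE_RSVD     (1ULL << 51)      /* pretend reserved bit */

/* true when the gpte should not be prefetched into the shadow table */
static bool gpte_invalid_for_prefetch(unsigned long long gpte)
{
        if (gpte & GPTE_RSVD)
                return true;            /* reserved bits set: always reject  */
        if (!(gpte & GPTE_PRESENT))
                return true;            /* not present in the guest          */
        if (!(gpte & GPTE_ACCESSED))
                return true;            /* never accessed: not worth mapping */
        return false;
}

int main(void)
{
        printf("%d\n", gpte_invalid_for_prefetch(GPTE_PRESENT));                 /* 1 */
        printf("%d\n", gpte_invalid_for_prefetch(GPTE_PRESENT | GPTE_ACCESSED)); /* 0 */
        return 0;
}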
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 return; 348 return;
330 kvm_get_pfn(pfn); 349 kvm_get_pfn(pfn);
331 /* 350 /*
332 * we call mmu_set_spte() with reset_host_protection = true because 351 * we call mmu_set_spte() with host_writable = true because
333 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
334 */ 353 */
335 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep) 383 u64 *sptep)
365{ 384{
366 struct kvm_mmu_page *sp; 385 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes; 386 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte; 387 u64 *spte;
370 int i; 388 int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 413
396 gpte = gptep[i]; 414 gpte = gptep[i];
397 415
398 if (!is_present_gpte(gpte) || 416 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue; 417 continue;
407 418
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 419 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
427static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 438static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
428 struct guest_walker *gw, 439 struct guest_walker *gw,
429 int user_fault, int write_fault, int hlevel, 440 int user_fault, int write_fault, int hlevel,
430 int *ptwrite, pfn_t pfn) 441 int *ptwrite, pfn_t pfn, bool map_writable,
442 bool prefault)
431{ 443{
432 unsigned access = gw->pt_access; 444 unsigned access = gw->pt_access;
433 struct kvm_mmu_page *sp = NULL; 445 struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
501 513
502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 514 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
503 user_fault, write_fault, dirty, ptwrite, it.level, 515 user_fault, write_fault, dirty, ptwrite, it.level,
504 gw->gfn, pfn, false, true); 516 gw->gfn, pfn, prefault, map_writable);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 517 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
506 518
507 return it.sptep; 519 return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
527 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 539 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
528 * a negative value on error. 540 * a negative value on error.
529 */ 541 */
530static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 542static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
531 u32 error_code) 543 bool prefault)
532{ 544{
533 int write_fault = error_code & PFERR_WRITE_MASK; 545 int write_fault = error_code & PFERR_WRITE_MASK;
534 int user_fault = error_code & PFERR_USER_MASK; 546 int user_fault = error_code & PFERR_USER_MASK;
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
538 int r; 550 int r;
539 pfn_t pfn; 551 pfn_t pfn;
540 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
541 unsigned long mmu_seq; 554 unsigned long mmu_seq;
555 bool map_writable;
542 556
543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 557 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
544 558
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
556 */ 570 */
557 if (!r) { 571 if (!r) {
558 pgprintk("%s: guest page fault\n", __func__); 572 pgprintk("%s: guest page fault\n", __func__);
559 inject_page_fault(vcpu); 573 if (!prefault) {
560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 574 inject_page_fault(vcpu, &walker.fault);
575 /* reset fork detector */
576 vcpu->arch.last_pt_write_count = 0;
577 }
561 return 0; 578 return 0;
562 } 579 }
563 580
564 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
565 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
566 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
567 } 588 }
568 589
569 mmu_seq = vcpu->kvm->mmu_notifier_seq; 590 mmu_seq = vcpu->kvm->mmu_notifier_seq;
570 smp_rmb(); 591 smp_rmb();
571 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 592
593 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
594 &map_writable))
595 return 0;
572 596
573 /* mmio */ 597 /* mmio */
574 if (is_error_pfn(pfn)) 598 if (is_error_pfn(pfn))
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
580 604
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
582 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
584 level, &write_pt, pfn); 610 level, &write_pt, pfn, map_writable, prefault);
585 (void)sptep; 611 (void)sptep;
586 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 612 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
587 sptep, *sptep, write_pt); 613 sptep, *sptep, write_pt);
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
661} 687}
662 688
663static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 689static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
664 u32 *error) 690 struct x86_exception *exception)
665{ 691{
666 struct guest_walker walker; 692 struct guest_walker walker;
667 gpa_t gpa = UNMAPPED_GVA; 693 gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
672 if (r) { 698 if (r) {
673 gpa = gfn_to_gpa(walker.gfn); 699 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK; 700 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error) 701 } else if (exception)
676 *error = walker.error_code; 702 *exception = walker.fault;
677 703
678 return gpa; 704 return gpa;
679} 705}
680 706
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 707static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error) 708 u32 access,
709 struct x86_exception *exception)
683{ 710{
684 struct guest_walker walker; 711 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA; 712 gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
690 if (r) { 717 if (r) {
691 gpa = gfn_to_gpa(walker.gfn); 718 gpa = gfn_to_gpa(walker.gfn);
692 gpa |= vaddr & ~PAGE_MASK; 719 gpa |= vaddr & ~PAGE_MASK;
693 } else if (error) 720 } else if (exception)
694 *error = walker.error_code; 721 *exception = walker.fault;
695 722
696 return gpa; 723 return gpa;
697} 724}
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
730 * Using the cached information from sp->gfns is safe because: 757 * Using the cached information from sp->gfns is safe because:
731 * - The spte has a reference to the struct page, so the pfn for a given gfn 758 * - The spte has a reference to the struct page, so the pfn for a given gfn
732 * can't change unless all sptes pointing to it are nuked first. 759 * can't change unless all sptes pointing to it are nuked first.
760 *
761 * Note:
762 * We should flush all TLBs if a spte is dropped, even though the guest is
763 * responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and
764 * kvm_mmu_notifier_invalidate_range_start may find that the mapped page is
765 * no longer used by the guest and skip the TLB flush, which would let the
766 * guest access already-freed pages.
767 * We increase kvm->tlbs_dirty to delay the TLB flush in this case.
733 */ 768 */
734static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 769static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
735 bool clear_unsync)
736{ 770{
737 int i, offset, nr_present; 771 int i, offset, nr_present;
738 bool reset_host_protection; 772 bool host_writable;
739 gpa_t first_pte_gpa; 773 gpa_t first_pte_gpa;
740 774
741 offset = nr_present = 0; 775 offset = nr_present = 0;
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
764 return -EINVAL; 798 return -EINVAL;
765 799
766 gfn = gpte_to_gfn(gpte); 800 gfn = gpte_to_gfn(gpte);
767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
769 || !(gpte & PT_ACCESSED_MASK)) {
770 u64 nonpresent;
771 801
772 if (is_present_gpte(gpte) || !clear_unsync) 802 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
773 nonpresent = shadow_trap_nonpresent_pte; 803 vcpu->kvm->tlbs_dirty++;
774 else 804 continue;
775 nonpresent = shadow_notrap_nonpresent_pte; 805 }
776 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 806
807 if (gfn != sp->gfns[i]) {
808 drop_spte(vcpu->kvm, &sp->spt[i],
809 shadow_trap_nonpresent_pte);
810 vcpu->kvm->tlbs_dirty++;
777 continue; 811 continue;
778 } 812 }
779 813
780 nr_present++; 814 nr_present++;
781 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 815 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
782 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 816 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
783 pte_access &= ~ACC_WRITE_MASK; 817
784 reset_host_protection = 0;
785 } else {
786 reset_host_protection = 1;
787 }
788 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 818 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
789 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 819 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
790 spte_to_pfn(sp->spt[i]), true, false, 820 spte_to_pfn(sp->spt[i]), true, false,
791 reset_host_protection); 821 host_writable);
792 } 822 }
793 823
794 return !nr_present; 824 return !nr_present;
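For context, the kvm->tlbs_dirty counter incremented in sync_page above is consumed on the flush side; a minimal sketch of the expected consumer (simplified along the lines of the virt/kvm/kvm_main.c counterpart of this series, not part of this hunk):

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int dirty_count = kvm->tlbs_dirty;

	/* Pair with the increments above before requesting the remote flush. */
	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	/* Only clear what was observed; a racing increment forces another flush. */
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}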
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7c2ca4..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
97 unsigned long vmexit_rax; 102 unsigned long vmexit_rax;
98 103
99 /* cache for intercepts of the guest */ 104 /* cache for intercepts of the guest */
100 u16 intercept_cr_read; 105 u32 intercept_cr;
101 u16 intercept_cr_write; 106 u32 intercept_dr;
102 u16 intercept_dr_read;
103 u16 intercept_dr_write;
104 u32 intercept_exceptions; 107 u32 intercept_exceptions;
105 u64 intercept; 108 u64 intercept;
106 109
@@ -123,7 +126,12 @@ struct vcpu_svm {
123 u64 next_rip; 126 u64 next_rip;
124 127
125 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 128 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
126 u64 host_gs_base; 129 struct {
130 u16 fs;
131 u16 gs;
132 u16 ldt;
133 u64 gs_base;
134 } host;
127 135
128 u32 *msrpm; 136 u32 *msrpm;
129 137
@@ -133,6 +141,7 @@ struct vcpu_svm {
133 141
134 unsigned int3_injected; 142 unsigned int3_injected;
135 unsigned long int3_rip; 143 unsigned long int3_rip;
144 u32 apf_reason;
136}; 145};
137 146
138#define MSR_INVALID 0xffffffffU 147#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
180static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 189static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
181 bool has_error_code, u32 error_code); 190 bool has_error_code, u32 error_code);
182 191
192enum {
193 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
194 pause filter count */
195 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
196 VMCB_ASID, /* ASID */
197 VMCB_INTR, /* int_ctl, int_vector */
198 VMCB_NPT, /* npt_en, nCR3, gPAT */
199 VMCB_CR, /* CR0, CR3, CR4, EFER */
200 VMCB_DR, /* DR6, DR7 */
201 VMCB_DT, /* GDT, IDT */
202 VMCB_SEG, /* CS, DS, SS, ES, CPL */
203 VMCB_CR2, /* CR2 only */
204 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
205 VMCB_DIRTY_MAX,
206};
207
208/* TPR and CR2 are always written before VMRUN */
209#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
210
211static inline void mark_all_dirty(struct vmcb *vmcb)
212{
213 vmcb->control.clean = 0;
214}
215
216static inline void mark_all_clean(struct vmcb *vmcb)
217{
218 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
219 & ~VMCB_ALWAYS_DIRTY_MASK;
220}
221
222static inline void mark_dirty(struct vmcb *vmcb, int bit)
223{
224 vmcb->control.clean &= ~(1 << bit);
225}
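/*
 * Illustrative pattern, not part of this hunk: every write to a cached VMCB
 * field is expected to be paired with mark_dirty() for the bit covering it,
 * e.g. (as done for DR7 further down in this patch):
 *
 *	svm->vmcb->save.dr7 = value;
 *	mark_dirty(svm->vmcb, VMCB_DR);
 *
 * mark_all_dirty() is the conservative fallback when the whole VMCB may have
 * changed (fresh VMCB, vcpu migration to another cpu, nested transitions).
 */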
226
183static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 227static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
184{ 228{
185 return container_of(vcpu, struct vcpu_svm, vcpu); 229 return container_of(vcpu, struct vcpu_svm, vcpu);
186} 230}
187 231
188static inline bool is_nested(struct vcpu_svm *svm) 232static void recalc_intercepts(struct vcpu_svm *svm)
233{
234 struct vmcb_control_area *c, *h;
235 struct nested_state *g;
236
237 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
238
239 if (!is_guest_mode(&svm->vcpu))
240 return;
241
242 c = &svm->vmcb->control;
243 h = &svm->nested.hsave->control;
244 g = &svm->nested;
245
246 c->intercept_cr = h->intercept_cr | g->intercept_cr;
247 c->intercept_dr = h->intercept_dr | g->intercept_dr;
248 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
249 c->intercept = h->intercept | g->intercept;
250}
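/*
 * Note: the OR-merge above keeps the old rule that a nested guest can never be
 * more powerful than its host; neither level can lose an intercept it asked
 * for, and the effective set seen by hardware is always host | guest.
 */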
251
252static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
253{
254 if (is_guest_mode(&svm->vcpu))
255 return svm->nested.hsave;
256 else
257 return svm->vmcb;
258}
259
260static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
261{
262 struct vmcb *vmcb = get_host_vmcb(svm);
263
264 vmcb->control.intercept_cr |= (1U << bit);
265
266 recalc_intercepts(svm);
267}
268
269static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
270{
271 struct vmcb *vmcb = get_host_vmcb(svm);
272
273 vmcb->control.intercept_cr &= ~(1U << bit);
274
275 recalc_intercepts(svm);
276}
277
278static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
279{
280 struct vmcb *vmcb = get_host_vmcb(svm);
281
282 return vmcb->control.intercept_cr & (1U << bit);
283}
284
285static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
286{
287 struct vmcb *vmcb = get_host_vmcb(svm);
288
289 vmcb->control.intercept_dr |= (1U << bit);
290
291 recalc_intercepts(svm);
292}
293
294static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
295{
296 struct vmcb *vmcb = get_host_vmcb(svm);
297
298 vmcb->control.intercept_dr &= ~(1U << bit);
299
300 recalc_intercepts(svm);
301}
302
303static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
304{
305 struct vmcb *vmcb = get_host_vmcb(svm);
306
307 vmcb->control.intercept_exceptions |= (1U << bit);
308
309 recalc_intercepts(svm);
310}
311
312static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
189{ 313{
190 return svm->nested.vmcb; 314 struct vmcb *vmcb = get_host_vmcb(svm);
315
316 vmcb->control.intercept_exceptions &= ~(1U << bit);
317
318 recalc_intercepts(svm);
319}
320
321static inline void set_intercept(struct vcpu_svm *svm, int bit)
322{
323 struct vmcb *vmcb = get_host_vmcb(svm);
324
325 vmcb->control.intercept |= (1ULL << bit);
326
327 recalc_intercepts(svm);
328}
329
330static inline void clr_intercept(struct vcpu_svm *svm, int bit)
331{
332 struct vmcb *vmcb = get_host_vmcb(svm);
333
334 vmcb->control.intercept &= ~(1ULL << bit);
335
336 recalc_intercepts(svm);
191} 337}
192 338
193static inline void enable_gif(struct vcpu_svm *svm) 339static inline void enable_gif(struct vcpu_svm *svm)
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
264 410
265#define MAX_INST_SIZE 15 411#define MAX_INST_SIZE 15
266 412
267static inline u32 svm_has(u32 feat)
268{
269 return svm_features & feat;
270}
271
272static inline void clgi(void) 413static inline void clgi(void)
273{ 414{
274 asm volatile (__ex(SVM_CLGI)); 415 asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
284 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 425 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
285} 426}
286 427
287static inline void force_new_asid(struct kvm_vcpu *vcpu)
288{
289 to_svm(vcpu)->asid_generation--;
290}
291
292static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
293{
294 force_new_asid(vcpu);
295}
296
297static int get_npt_level(void) 428static int get_npt_level(void)
298{ 429{
299#ifdef CONFIG_X86_64 430#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
310 efer &= ~EFER_LME; 441 efer &= ~EFER_LME;
311 442
312 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 443 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
444 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
313} 445}
314 446
315static int is_external_interrupt(u32 info) 447static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
347 svm->next_rip = svm->vmcb->control.next_rip; 479 svm->next_rip = svm->vmcb->control.next_rip;
348 480
349 if (!svm->next_rip) { 481 if (!svm->next_rip) {
350 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 482 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
351 EMULATE_DONE) 483 EMULATE_DONE)
352 printk(KERN_DEBUG "%s: NOP\n", __func__); 484 printk(KERN_DEBUG "%s: NOP\n", __func__);
353 return; 485 return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
374 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 506 nested_svm_check_exception(svm, nr, has_error_code, error_code))
375 return; 507 return;
376 508
377 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 509 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
378 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 510 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
379 511
380 /* 512 /*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
670 802
671 svm_features = cpuid_edx(SVM_CPUID_FUNC); 803 svm_features = cpuid_edx(SVM_CPUID_FUNC);
672 804
673 if (!svm_has(SVM_FEATURE_NPT)) 805 if (!boot_cpu_has(X86_FEATURE_NPT))
674 npt_enabled = false; 806 npt_enabled = false;
675 807
676 if (npt_enabled && !npt) { 808 if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
725 struct vcpu_svm *svm = to_svm(vcpu); 857 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0; 858 u64 g_tsc_offset = 0;
727 859
728 if (is_nested(svm)) { 860 if (is_guest_mode(vcpu)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset - 861 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset; 862 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset; 863 svm->nested.hsave->control.tsc_offset = offset;
732 } 864 }
733 865
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 866 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
867
868 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
735} 869}
736 870
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 871static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
739 struct vcpu_svm *svm = to_svm(vcpu); 873 struct vcpu_svm *svm = to_svm(vcpu);
740 874
741 svm->vmcb->control.tsc_offset += adjustment; 875 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm)) 876 if (is_guest_mode(vcpu))
743 svm->nested.hsave->control.tsc_offset += adjustment; 877 svm->nested.hsave->control.tsc_offset += adjustment;
878 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
744} 879}
745 880
746static void init_vmcb(struct vcpu_svm *svm) 881static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
749 struct vmcb_save_area *save = &svm->vmcb->save; 884 struct vmcb_save_area *save = &svm->vmcb->save;
750 885
751 svm->vcpu.fpu_active = 1; 886 svm->vcpu.fpu_active = 1;
887 svm->vcpu.arch.hflags = 0;
752 888
753 control->intercept_cr_read = INTERCEPT_CR0_MASK | 889 set_cr_intercept(svm, INTERCEPT_CR0_READ);
754 INTERCEPT_CR3_MASK | 890 set_cr_intercept(svm, INTERCEPT_CR3_READ);
755 INTERCEPT_CR4_MASK; 891 set_cr_intercept(svm, INTERCEPT_CR4_READ);
756 892 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
757 control->intercept_cr_write = INTERCEPT_CR0_MASK | 893 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
758 INTERCEPT_CR3_MASK | 894 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
759 INTERCEPT_CR4_MASK | 895 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
760 INTERCEPT_CR8_MASK; 896
761 897 set_dr_intercept(svm, INTERCEPT_DR0_READ);
762 control->intercept_dr_read = INTERCEPT_DR0_MASK | 898 set_dr_intercept(svm, INTERCEPT_DR1_READ);
763 INTERCEPT_DR1_MASK | 899 set_dr_intercept(svm, INTERCEPT_DR2_READ);
764 INTERCEPT_DR2_MASK | 900 set_dr_intercept(svm, INTERCEPT_DR3_READ);
765 INTERCEPT_DR3_MASK | 901 set_dr_intercept(svm, INTERCEPT_DR4_READ);
766 INTERCEPT_DR4_MASK | 902 set_dr_intercept(svm, INTERCEPT_DR5_READ);
767 INTERCEPT_DR5_MASK | 903 set_dr_intercept(svm, INTERCEPT_DR6_READ);
768 INTERCEPT_DR6_MASK | 904 set_dr_intercept(svm, INTERCEPT_DR7_READ);
769 INTERCEPT_DR7_MASK; 905
770 906 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
771 control->intercept_dr_write = INTERCEPT_DR0_MASK | 907 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
772 INTERCEPT_DR1_MASK | 908 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
773 INTERCEPT_DR2_MASK | 909 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
774 INTERCEPT_DR3_MASK | 910 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
775 INTERCEPT_DR4_MASK | 911 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
776 INTERCEPT_DR5_MASK | 912 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
777 INTERCEPT_DR6_MASK | 913 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
778 INTERCEPT_DR7_MASK; 914
779 915 set_exception_intercept(svm, PF_VECTOR);
780 control->intercept_exceptions = (1 << PF_VECTOR) | 916 set_exception_intercept(svm, UD_VECTOR);
781 (1 << UD_VECTOR) | 917 set_exception_intercept(svm, MC_VECTOR);
782 (1 << MC_VECTOR); 918
783 919 set_intercept(svm, INTERCEPT_INTR);
784 920 set_intercept(svm, INTERCEPT_NMI);
785 control->intercept = (1ULL << INTERCEPT_INTR) | 921 set_intercept(svm, INTERCEPT_SMI);
786 (1ULL << INTERCEPT_NMI) | 922 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
787 (1ULL << INTERCEPT_SMI) | 923 set_intercept(svm, INTERCEPT_CPUID);
788 (1ULL << INTERCEPT_SELECTIVE_CR0) | 924 set_intercept(svm, INTERCEPT_INVD);
789 (1ULL << INTERCEPT_CPUID) | 925 set_intercept(svm, INTERCEPT_HLT);
790 (1ULL << INTERCEPT_INVD) | 926 set_intercept(svm, INTERCEPT_INVLPG);
791 (1ULL << INTERCEPT_HLT) | 927 set_intercept(svm, INTERCEPT_INVLPGA);
792 (1ULL << INTERCEPT_INVLPG) | 928 set_intercept(svm, INTERCEPT_IOIO_PROT);
793 (1ULL << INTERCEPT_INVLPGA) | 929 set_intercept(svm, INTERCEPT_MSR_PROT);
794 (1ULL << INTERCEPT_IOIO_PROT) | 930 set_intercept(svm, INTERCEPT_TASK_SWITCH);
795 (1ULL << INTERCEPT_MSR_PROT) | 931 set_intercept(svm, INTERCEPT_SHUTDOWN);
796 (1ULL << INTERCEPT_TASK_SWITCH) | 932 set_intercept(svm, INTERCEPT_VMRUN);
797 (1ULL << INTERCEPT_SHUTDOWN) | 933 set_intercept(svm, INTERCEPT_VMMCALL);
798 (1ULL << INTERCEPT_VMRUN) | 934 set_intercept(svm, INTERCEPT_VMLOAD);
799 (1ULL << INTERCEPT_VMMCALL) | 935 set_intercept(svm, INTERCEPT_VMSAVE);
800 (1ULL << INTERCEPT_VMLOAD) | 936 set_intercept(svm, INTERCEPT_STGI);
801 (1ULL << INTERCEPT_VMSAVE) | 937 set_intercept(svm, INTERCEPT_CLGI);
802 (1ULL << INTERCEPT_STGI) | 938 set_intercept(svm, INTERCEPT_SKINIT);
803 (1ULL << INTERCEPT_CLGI) | 939 set_intercept(svm, INTERCEPT_WBINVD);
804 (1ULL << INTERCEPT_SKINIT) | 940 set_intercept(svm, INTERCEPT_MONITOR);
805 (1ULL << INTERCEPT_WBINVD) | 941 set_intercept(svm, INTERCEPT_MWAIT);
806 (1ULL << INTERCEPT_MONITOR) | 942 set_intercept(svm, INTERCEPT_XSETBV);
807 (1ULL << INTERCEPT_MWAIT);
808 943
809 control->iopm_base_pa = iopm_base; 944 control->iopm_base_pa = iopm_base;
810 control->msrpm_base_pa = __pa(svm->msrpm); 945 control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
855 if (npt_enabled) { 990 if (npt_enabled) {
856 /* Setup VMCB for Nested Paging */ 991 /* Setup VMCB for Nested Paging */
857 control->nested_ctl = 1; 992 control->nested_ctl = 1;
858 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 993 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
859 (1ULL << INTERCEPT_INVLPG)); 994 clr_intercept(svm, INTERCEPT_INVLPG);
860 control->intercept_exceptions &= ~(1 << PF_VECTOR); 995 clr_exception_intercept(svm, PF_VECTOR);
861 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 996 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
862 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 997 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
863 save->g_pat = 0x0007040600070406ULL; 998 save->g_pat = 0x0007040600070406ULL;
864 save->cr3 = 0; 999 save->cr3 = 0;
865 save->cr4 = 0; 1000 save->cr4 = 0;
866 } 1001 }
867 force_new_asid(&svm->vcpu); 1002 svm->asid_generation = 0;
868 1003
869 svm->nested.vmcb = 0; 1004 svm->nested.vmcb = 0;
870 svm->vcpu.arch.hflags = 0; 1005 svm->vcpu.arch.hflags = 0;
871 1006
872 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1007 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
873 control->pause_filter_count = 3000; 1008 control->pause_filter_count = 3000;
874 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1009 set_intercept(svm, INTERCEPT_PAUSE);
875 } 1010 }
876 1011
1012 mark_all_dirty(svm->vmcb);
1013
877 enable_gif(svm); 1014 enable_gif(svm);
878} 1015}
879 1016
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
990 1127
991 if (unlikely(cpu != vcpu->cpu)) { 1128 if (unlikely(cpu != vcpu->cpu)) {
992 svm->asid_generation = 0; 1129 svm->asid_generation = 0;
1130 mark_all_dirty(svm->vmcb);
993 } 1131 }
994 1132
1133#ifdef CONFIG_X86_64
1134 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1135#endif
1136 savesegment(fs, svm->host.fs);
1137 savesegment(gs, svm->host.gs);
1138 svm->host.ldt = kvm_read_ldt();
1139
995 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1140 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
996 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1141 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
997} 1142}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1002 int i; 1147 int i;
1003 1148
1004 ++vcpu->stat.host_state_reload; 1149 ++vcpu->stat.host_state_reload;
1150 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1155#else
1156 loadsegment(gs, svm->host.gs);
1157#endif
1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1007} 1160}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1021 switch (reg) { 1174 switch (reg) {
1022 case VCPU_EXREG_PDPTR: 1175 case VCPU_EXREG_PDPTR:
1023 BUG_ON(!npt_enabled); 1176 BUG_ON(!npt_enabled);
1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 1177 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1025 break; 1178 break;
1026 default: 1179 default:
1027 BUG(); 1180 BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1030 1183
1031static void svm_set_vintr(struct vcpu_svm *svm) 1184static void svm_set_vintr(struct vcpu_svm *svm)
1032{ 1185{
1033 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1186 set_intercept(svm, INTERCEPT_VINTR);
1034} 1187}
1035 1188
1036static void svm_clear_vintr(struct vcpu_svm *svm) 1189static void svm_clear_vintr(struct vcpu_svm *svm)
1037{ 1190{
1038 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1191 clr_intercept(svm, INTERCEPT_VINTR);
1039} 1192}
1040 1193
1041static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1194static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1150 1303
1151 svm->vmcb->save.idtr.limit = dt->size; 1304 svm->vmcb->save.idtr.limit = dt->size;
1152 svm->vmcb->save.idtr.base = dt->address ; 1305 svm->vmcb->save.idtr.base = dt->address ;
1306 mark_dirty(svm->vmcb, VMCB_DT);
1153} 1307}
1154 1308
1155static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1309static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1166 1320
1167 svm->vmcb->save.gdtr.limit = dt->size; 1321 svm->vmcb->save.gdtr.limit = dt->size;
1168 svm->vmcb->save.gdtr.base = dt->address ; 1322 svm->vmcb->save.gdtr.base = dt->address ;
1323 mark_dirty(svm->vmcb, VMCB_DT);
1169} 1324}
1170 1325
1171static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1326static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1327{
1173} 1328}
1174 1329
1330static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1331{
1332}
1333
1175static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1334static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1176{ 1335{
1177} 1336}
1178 1337
1179static void update_cr0_intercept(struct vcpu_svm *svm) 1338static void update_cr0_intercept(struct vcpu_svm *svm)
1180{ 1339{
1181 struct vmcb *vmcb = svm->vmcb;
1182 ulong gcr0 = svm->vcpu.arch.cr0; 1340 ulong gcr0 = svm->vcpu.arch.cr0;
1183 u64 *hcr0 = &svm->vmcb->save.cr0; 1341 u64 *hcr0 = &svm->vmcb->save.cr0;
1184 1342
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1188 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1346 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1189 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1347 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1190 1348
1349 mark_dirty(svm->vmcb, VMCB_CR);
1191 1350
1192 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1351 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1193 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1352 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1194 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1353 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1195 if (is_nested(svm)) {
1196 struct vmcb *hsave = svm->nested.hsave;
1197
1198 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1199 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1200 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1201 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1202 }
1203 } else { 1354 } else {
1204 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1355 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1205 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1356 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1206 if (is_nested(svm)) {
1207 struct vmcb *hsave = svm->nested.hsave;
1208
1209 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1210 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1211 }
1212 } 1357 }
1213} 1358}
1214 1359
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1216{ 1361{
1217 struct vcpu_svm *svm = to_svm(vcpu); 1362 struct vcpu_svm *svm = to_svm(vcpu);
1218 1363
1219 if (is_nested(svm)) { 1364 if (is_guest_mode(vcpu)) {
1220 /* 1365 /*
1221 * We are here because we run in nested mode, the host kvm 1366 * We are here because we run in nested mode, the host kvm
1222 * intercepts cr0 writes but the l1 hypervisor does not. 1367 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1268 */ 1413 */
1269 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1414 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1270 svm->vmcb->save.cr0 = cr0; 1415 svm->vmcb->save.cr0 = cr0;
1416 mark_dirty(svm->vmcb, VMCB_CR);
1271 update_cr0_intercept(svm); 1417 update_cr0_intercept(svm);
1272} 1418}
1273 1419
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1277 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1423 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1278 1424
1279 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1425 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1280 force_new_asid(vcpu); 1426 svm_flush_tlb(vcpu);
1281 1427
1282 vcpu->arch.cr4 = cr4; 1428 vcpu->arch.cr4 = cr4;
1283 if (!npt_enabled) 1429 if (!npt_enabled)
1284 cr4 |= X86_CR4_PAE; 1430 cr4 |= X86_CR4_PAE;
1285 cr4 |= host_cr4_mce; 1431 cr4 |= host_cr4_mce;
1286 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1432 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1433 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1287} 1434}
1288 1435
1289static void svm_set_segment(struct kvm_vcpu *vcpu, 1436static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1312 = (svm->vmcb->save.cs.attrib 1459 = (svm->vmcb->save.cs.attrib
1313 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1460 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1314 1461
1462 mark_dirty(svm->vmcb, VMCB_SEG);
1315} 1463}
1316 1464
1317static void update_db_intercept(struct kvm_vcpu *vcpu) 1465static void update_db_intercept(struct kvm_vcpu *vcpu)
1318{ 1466{
1319 struct vcpu_svm *svm = to_svm(vcpu); 1467 struct vcpu_svm *svm = to_svm(vcpu);
1320 1468
1321 svm->vmcb->control.intercept_exceptions &= 1469 clr_exception_intercept(svm, DB_VECTOR);
1322 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1470 clr_exception_intercept(svm, BP_VECTOR);
1323 1471
1324 if (svm->nmi_singlestep) 1472 if (svm->nmi_singlestep)
1325 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1473 set_exception_intercept(svm, DB_VECTOR);
1326 1474
1327 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1475 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1328 if (vcpu->guest_debug & 1476 if (vcpu->guest_debug &
1329 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1477 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1330 svm->vmcb->control.intercept_exceptions |= 1478 set_exception_intercept(svm, DB_VECTOR);
1331 1 << DB_VECTOR;
1332 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1479 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1333 svm->vmcb->control.intercept_exceptions |= 1480 set_exception_intercept(svm, BP_VECTOR);
1334 1 << BP_VECTOR;
1335 } else 1481 } else
1336 vcpu->guest_debug = 0; 1482 vcpu->guest_debug = 0;
1337} 1483}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1345 else 1491 else
1346 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1492 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1347 1493
1348 update_db_intercept(vcpu); 1494 mark_dirty(svm->vmcb, VMCB_DR);
1349}
1350
1351static void load_host_msrs(struct kvm_vcpu *vcpu)
1352{
1353#ifdef CONFIG_X86_64
1354 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1355#endif
1356}
1357 1495
1358static void save_host_msrs(struct kvm_vcpu *vcpu) 1496 update_db_intercept(vcpu);
1359{
1360#ifdef CONFIG_X86_64
1361 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1362#endif
1363} 1497}
1364 1498
1365static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1499static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1372 1506
1373 svm->asid_generation = sd->asid_generation; 1507 svm->asid_generation = sd->asid_generation;
1374 svm->vmcb->control.asid = sd->next_asid++; 1508 svm->vmcb->control.asid = sd->next_asid++;
1509
1510 mark_dirty(svm->vmcb, VMCB_ASID);
1375} 1511}
1376 1512
1377static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1513static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1379 struct vcpu_svm *svm = to_svm(vcpu); 1515 struct vcpu_svm *svm = to_svm(vcpu);
1380 1516
1381 svm->vmcb->save.dr7 = value; 1517 svm->vmcb->save.dr7 = value;
1518 mark_dirty(svm->vmcb, VMCB_DR);
1382} 1519}
1383 1520
1384static int pf_interception(struct vcpu_svm *svm) 1521static int pf_interception(struct vcpu_svm *svm)
1385{ 1522{
1386 u64 fault_address; 1523 u64 fault_address = svm->vmcb->control.exit_info_2;
1387 u32 error_code; 1524 u32 error_code;
1525 int r = 1;
1388 1526
1389 fault_address = svm->vmcb->control.exit_info_2; 1527 switch (svm->apf_reason) {
1390 error_code = svm->vmcb->control.exit_info_1; 1528 default:
1391 1529 error_code = svm->vmcb->control.exit_info_1;
1392 trace_kvm_page_fault(fault_address, error_code); 1530
1393 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1531 trace_kvm_page_fault(fault_address, error_code);
1394 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1532 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1395 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1533 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1534 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1535 svm->vmcb->control.insn_bytes,
1536 svm->vmcb->control.insn_len);
1537 break;
1538 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1539 svm->apf_reason = 0;
1540 local_irq_disable();
1541 kvm_async_pf_task_wait(fault_address);
1542 local_irq_enable();
1543 break;
1544 case KVM_PV_REASON_PAGE_READY:
1545 svm->apf_reason = 0;
1546 local_irq_disable();
1547 kvm_async_pf_task_wake(fault_address);
1548 local_irq_enable();
1549 break;
1550 }
1551 return r;
1396} 1552}
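/*
 * Hedged sketch, assumed from the rest of this series (not shown in this
 * excerpt): svm->apf_reason is presumably latched right after VMRUN whenever
 * the exit was a #PF intercept, using the per-cpu reason word from kvm_para.h:
 *
 *	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR))
 *		svm->apf_reason = kvm_read_and_reset_pf_reason();
 */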
1397 1553
1398static int db_interception(struct vcpu_svm *svm) 1554static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
1440{ 1596{
1441 int er; 1597 int er;
1442 1598
1443 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1599 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1444 if (er != EMULATE_DONE) 1600 if (er != EMULATE_DONE)
1445 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1601 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1446 return 1; 1602 return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
1449static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1605static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1450{ 1606{
1451 struct vcpu_svm *svm = to_svm(vcpu); 1607 struct vcpu_svm *svm = to_svm(vcpu);
1452 u32 excp;
1453
1454 if (is_nested(svm)) {
1455 u32 h_excp, n_excp;
1456
1457 h_excp = svm->nested.hsave->control.intercept_exceptions;
1458 n_excp = svm->nested.intercept_exceptions;
1459 h_excp &= ~(1 << NM_VECTOR);
1460 excp = h_excp | n_excp;
1461 } else {
1462 excp = svm->vmcb->control.intercept_exceptions;
1463 excp &= ~(1 << NM_VECTOR);
1464 }
1465 1608
1466 svm->vmcb->control.intercept_exceptions = excp; 1609 clr_exception_intercept(svm, NM_VECTOR);
1467 1610
1468 svm->vcpu.fpu_active = 1; 1611 svm->vcpu.fpu_active = 1;
1469 update_cr0_intercept(svm); 1612 update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
1570 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1713 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1571 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1714 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1572 if (string || in) 1715 if (string || in)
1573 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1716 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1574 1717
1575 port = io_info >> 16; 1718 port = io_info >> 16;
1576 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1719 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1624 struct vcpu_svm *svm = to_svm(vcpu); 1767 struct vcpu_svm *svm = to_svm(vcpu);
1625 1768
1626 svm->vmcb->control.nested_cr3 = root; 1769 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu); 1770 mark_dirty(svm->vmcb, VMCB_NPT);
1771 svm_flush_tlb(vcpu);
1628} 1772}
1629 1773
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) 1774static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1775 struct x86_exception *fault)
1631{ 1776{
1632 struct vcpu_svm *svm = to_svm(vcpu); 1777 struct vcpu_svm *svm = to_svm(vcpu);
1633 1778
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 1779 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0; 1780 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; 1781 svm->vmcb->control.exit_info_1 = fault->error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; 1782 svm->vmcb->control.exit_info_2 = fault->address;
1638 1783
1639 nested_svm_vmexit(svm); 1784 nested_svm_vmexit(svm);
1640} 1785}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1680{ 1825{
1681 int vmexit; 1826 int vmexit;
1682 1827
1683 if (!is_nested(svm)) 1828 if (!is_guest_mode(&svm->vcpu))
1684 return 0; 1829 return 0;
1685 1830
1686 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1831 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1698/* This function returns true if it is safe to enable the irq window */ 1843/* This function returns true if it is safe to enable the irq window */
1699static inline bool nested_svm_intr(struct vcpu_svm *svm) 1844static inline bool nested_svm_intr(struct vcpu_svm *svm)
1700{ 1845{
1701 if (!is_nested(svm)) 1846 if (!is_guest_mode(&svm->vcpu))
1702 return true; 1847 return true;
1703 1848
1704 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1849 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1737/* This function returns true if it is safe to enable the nmi window */ 1882/* This function returns true if it is safe to enable the nmi window */
1738static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1883static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1739{ 1884{
1740 if (!is_nested(svm)) 1885 if (!is_guest_mode(&svm->vcpu))
1741 return true; 1886 return true;
1742 1887
1743 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1888 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1836 return NESTED_EXIT_HOST; 1981 return NESTED_EXIT_HOST;
1837 break; 1982 break;
1838 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1983 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1839 /* When we're shadowing, trap PFs */ 1984 /* When we're shadowing, trap PFs, but not async PF */
1840 if (!npt_enabled) 1985 if (!npt_enabled && svm->apf_reason == 0)
1841 return NESTED_EXIT_HOST; 1986 return NESTED_EXIT_HOST;
1842 break; 1987 break;
1843 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 1988 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1865 case SVM_EXIT_IOIO: 2010 case SVM_EXIT_IOIO:
1866 vmexit = nested_svm_intercept_ioio(svm); 2011 vmexit = nested_svm_intercept_ioio(svm);
1867 break; 2012 break;
1868 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2013 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1869 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2014 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1870 if (svm->nested.intercept_cr_read & cr_bits) 2015 if (svm->nested.intercept_cr & bit)
1871 vmexit = NESTED_EXIT_DONE; 2016 vmexit = NESTED_EXIT_DONE;
1872 break; 2017 break;
1873 } 2018 }
1874 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 2019 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1875 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 2020 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1876 if (svm->nested.intercept_cr_write & cr_bits) 2021 if (svm->nested.intercept_dr & bit)
1877 vmexit = NESTED_EXIT_DONE;
1878 break;
1879 }
1880 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1881 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1882 if (svm->nested.intercept_dr_read & dr_bits)
1883 vmexit = NESTED_EXIT_DONE;
1884 break;
1885 }
1886 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1887 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1888 if (svm->nested.intercept_dr_write & dr_bits)
1889 vmexit = NESTED_EXIT_DONE; 2022 vmexit = NESTED_EXIT_DONE;
1890 break; 2023 break;
1891 } 2024 }
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1893 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2026 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1894 if (svm->nested.intercept_exceptions & excp_bits) 2027 if (svm->nested.intercept_exceptions & excp_bits)
1895 vmexit = NESTED_EXIT_DONE; 2028 vmexit = NESTED_EXIT_DONE;
2029 /* an async page fault always causes a vmexit */
2030 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2031 svm->apf_reason != 0)
2032 vmexit = NESTED_EXIT_DONE;
1896 break; 2033 break;
1897 } 2034 }
1898 case SVM_EXIT_ERR: { 2035 case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1926 struct vmcb_control_area *dst = &dst_vmcb->control; 2063 struct vmcb_control_area *dst = &dst_vmcb->control;
1927 struct vmcb_control_area *from = &from_vmcb->control; 2064 struct vmcb_control_area *from = &from_vmcb->control;
1928 2065
1929 dst->intercept_cr_read = from->intercept_cr_read; 2066 dst->intercept_cr = from->intercept_cr;
1930 dst->intercept_cr_write = from->intercept_cr_write; 2067 dst->intercept_dr = from->intercept_dr;
1931 dst->intercept_dr_read = from->intercept_dr_read;
1932 dst->intercept_dr_write = from->intercept_dr_write;
1933 dst->intercept_exceptions = from->intercept_exceptions; 2068 dst->intercept_exceptions = from->intercept_exceptions;
1934 dst->intercept = from->intercept; 2069 dst->intercept = from->intercept;
1935 dst->iopm_base_pa = from->iopm_base_pa; 2070 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1970 if (!nested_vmcb) 2105 if (!nested_vmcb)
1971 return 1; 2106 return 1;
1972 2107
1973 /* Exit nested SVM mode */ 2108 /* Exit Guest-Mode */
2109 leave_guest_mode(&svm->vcpu);
1974 svm->nested.vmcb = 0; 2110 svm->nested.vmcb = 0;
1975 2111
1976 /* Give the current vmcb to the guest */ 2112 /* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1984 nested_vmcb->save.idtr = vmcb->save.idtr; 2120 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer; 2121 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2122 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2123 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1988 nested_vmcb->save.cr2 = vmcb->save.cr2; 2124 nested_vmcb->save.cr2 = vmcb->save.cr2;
1989 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2125 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1990 nested_vmcb->save.rflags = vmcb->save.rflags; 2126 nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2061 svm->vmcb->save.cpl = 0; 2197 svm->vmcb->save.cpl = 0;
2062 svm->vmcb->control.exit_int_info = 0; 2198 svm->vmcb->control.exit_int_info = 0;
2063 2199
2200 mark_all_dirty(svm->vmcb);
2201
2064 nested_svm_unmap(page); 2202 nested_svm_unmap(page);
2065 2203
2066 nested_svm_uninit_mmu_context(&svm->vcpu); 2204 nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2148 nested_vmcb->control.event_inj, 2286 nested_vmcb->control.event_inj,
2149 nested_vmcb->control.nested_ctl); 2287 nested_vmcb->control.nested_ctl);
2150 2288
2151 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, 2289 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2152 nested_vmcb->control.intercept_cr_write, 2290 nested_vmcb->control.intercept_cr >> 16,
2153 nested_vmcb->control.intercept_exceptions, 2291 nested_vmcb->control.intercept_exceptions,
2154 nested_vmcb->control.intercept); 2292 nested_vmcb->control.intercept);
2155 2293
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2177 if (npt_enabled) 2315 if (npt_enabled)
2178 hsave->save.cr3 = vmcb->save.cr3; 2316 hsave->save.cr3 = vmcb->save.cr3;
2179 else 2317 else
2180 hsave->save.cr3 = svm->vcpu.arch.cr3; 2318 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2181 2319
2182 copy_vmcb_control_area(hsave, vmcb); 2320 copy_vmcb_control_area(hsave, vmcb);
2183 2321
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2229 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 2367 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
2230 2368
2231 /* cache intercepts */ 2369 /* cache intercepts */
2232 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2370 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
2233 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; 2371 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
2234 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
2235 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
2236 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 2372 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2237 svm->nested.intercept = nested_vmcb->control.intercept; 2373 svm->nested.intercept = nested_vmcb->control.intercept;
2238 2374
2239 force_new_asid(&svm->vcpu); 2375 svm_flush_tlb(&svm->vcpu);
2240 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 2376 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2241 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 2377 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2242 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 2378 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2245 2381
2246 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 2382 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2247 /* We only want the cr8 intercept bits of the guest */ 2383 /* We only want the cr8 intercept bits of the guest */
2248 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; 2384 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2249 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2385 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2250 } 2386 }
2251 2387
2252 /* We don't want to see VMMCALLs from a nested guest */ 2388 /* We don't want to see VMMCALLs from a nested guest */
2253 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); 2389 clr_intercept(svm, INTERCEPT_VMMCALL);
2254
2255 /*
2256 * We don't want a nested guest to be more powerful than the guest, so
2257 * all intercepts are ORed
2258 */
2259 svm->vmcb->control.intercept_cr_read |=
2260 nested_vmcb->control.intercept_cr_read;
2261 svm->vmcb->control.intercept_cr_write |=
2262 nested_vmcb->control.intercept_cr_write;
2263 svm->vmcb->control.intercept_dr_read |=
2264 nested_vmcb->control.intercept_dr_read;
2265 svm->vmcb->control.intercept_dr_write |=
2266 nested_vmcb->control.intercept_dr_write;
2267 svm->vmcb->control.intercept_exceptions |=
2268 nested_vmcb->control.intercept_exceptions;
2269
2270 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2271 2390
2272 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 2391 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2273 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2392 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2278 2397
2279 nested_svm_unmap(page); 2398 nested_svm_unmap(page);
2280 2399
2281 /* nested_vmcb is our indicator if nested SVM is activated */ 2400 /* Enter Guest-Mode */
2401 enter_guest_mode(&svm->vcpu);
2402
2403 /*
2404 * Merge guest and host intercepts - must be called with vcpu in
2405 * guest-mode to take affect here
2406 */
2407 recalc_intercepts(svm);
2408
2282 svm->nested.vmcb = vmcb_gpa; 2409 svm->nested.vmcb = vmcb_gpa;
2283 2410
2284 enable_gif(svm); 2411 enable_gif(svm);
2285 2412
2413 mark_all_dirty(svm->vmcb);
2414
2286 return true; 2415 return true;
2287} 2416}
2288 2417
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
2400 svm_clear_vintr(svm); 2529 svm_clear_vintr(svm);
2401 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2530 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2402 2531
2532 mark_dirty(svm->vmcb, VMCB_INTR);
2533
2403 return 1; 2534 return 1;
2404} 2535}
2405 2536
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
2426 return 1; 2557 return 1;
2427} 2558}
2428 2559
2560static int xsetbv_interception(struct vcpu_svm *svm)
2561{
2562 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2563 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2564
2565 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
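		/* XSETBV is a fixed three-byte instruction, hence rip + 3 below. */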
2566 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2567 skip_emulated_instruction(&svm->vcpu);
2568 }
2569
2570 return 1;
2571}
2572
2429static int invalid_op_interception(struct vcpu_svm *svm) 2573static int invalid_op_interception(struct vcpu_svm *svm)
2430{ 2574{
2431 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2575 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
2507static int iret_interception(struct vcpu_svm *svm) 2651static int iret_interception(struct vcpu_svm *svm)
2508{ 2652{
2509 ++svm->vcpu.stat.nmi_window_exits; 2653 ++svm->vcpu.stat.nmi_window_exits;
2510 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 2654 clr_intercept(svm, INTERCEPT_IRET);
2511 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2655 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2512 return 1; 2656 return 1;
2513} 2657}
2514 2658
2515static int invlpg_interception(struct vcpu_svm *svm) 2659static int invlpg_interception(struct vcpu_svm *svm)
2516{ 2660{
2517 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2661 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2662 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2663
2664 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2665 skip_emulated_instruction(&svm->vcpu);
2666 return 1;
2518} 2667}
2519 2668
2520static int emulate_on_interception(struct vcpu_svm *svm) 2669static int emulate_on_interception(struct vcpu_svm *svm)
2521{ 2670{
2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2671 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2672}
2673
2674#define CR_VALID (1ULL << 63)
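/*
 * With decode assists, exit_info_1 carries the MOV-CR details: bits 3:0 hold
 * the GPR number (masked with SVM_EXITINFO_REG_MASK below) and bit 63
 * (CR_VALID) marks that information as valid; without it the handler falls
 * back to full instruction emulation.
 */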
2675
2676static int cr_interception(struct vcpu_svm *svm)
2677{
2678 int reg, cr;
2679 unsigned long val;
2680 int err;
2681
2682 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2683 return emulate_on_interception(svm);
2684
2685 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2686 return emulate_on_interception(svm);
2687
2688 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2689 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2690
2691 err = 0;
2692 if (cr >= 16) { /* mov to cr */
2693 cr -= 16;
2694 val = kvm_register_read(&svm->vcpu, reg);
2695 switch (cr) {
2696 case 0:
2697 err = kvm_set_cr0(&svm->vcpu, val);
2698 break;
2699 case 3:
2700 err = kvm_set_cr3(&svm->vcpu, val);
2701 break;
2702 case 4:
2703 err = kvm_set_cr4(&svm->vcpu, val);
2704 break;
2705 case 8:
2706 err = kvm_set_cr8(&svm->vcpu, val);
2707 break;
2708 default:
2709 WARN(1, "unhandled write to CR%d", cr);
2710 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2711 return 1;
2712 }
2713 } else { /* mov from cr */
2714 switch (cr) {
2715 case 0:
2716 val = kvm_read_cr0(&svm->vcpu);
2717 break;
2718 case 2:
2719 val = svm->vcpu.arch.cr2;
2720 break;
2721 case 3:
2722 val = kvm_read_cr3(&svm->vcpu);
2723 break;
2724 case 4:
2725 val = kvm_read_cr4(&svm->vcpu);
2726 break;
2727 case 8:
2728 val = kvm_get_cr8(&svm->vcpu);
2729 break;
2730 default:
2731 WARN(1, "unhandled read from CR%d", cr);
2732 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2733 return 1;
2734 }
2735 kvm_register_write(&svm->vcpu, reg, val);
2736 }
2737 kvm_complete_insn_gp(&svm->vcpu, err);
2738
2739 return 1;
2523} 2740}
2524 2741
2525static int cr0_write_interception(struct vcpu_svm *svm) 2742static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2527 struct kvm_vcpu *vcpu = &svm->vcpu; 2744 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r; 2745 int r;
2529 2746
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0); 2747 r = cr_interception(svm);
2531 2748
2532 if (svm->nested.vmexit_rip) { 2749 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); 2750 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2536 svm->nested.vmexit_rip = 0; 2753 svm->nested.vmexit_rip = 0;
2537 } 2754 }
2538 2755
2539 return r == EMULATE_DONE; 2756 return r;
2757}
2758
2759static int dr_interception(struct vcpu_svm *svm)
2760{
2761 int reg, dr;
2762 unsigned long val;
2763 int err;
2764
2765 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2766 return emulate_on_interception(svm);
2767
2768 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2769 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2770
2771 if (dr >= 16) { /* mov to DRn */
2772 val = kvm_register_read(&svm->vcpu, reg);
2773 kvm_set_dr(&svm->vcpu, dr - 16, val);
2774 } else {
2775 err = kvm_get_dr(&svm->vcpu, dr, &val);
2776 if (!err)
2777 kvm_register_write(&svm->vcpu, reg, val);
2778 }
2779
2780 return 1;
2540} 2781}
2541 2782
2542static int cr8_write_interception(struct vcpu_svm *svm) 2783static int cr8_write_interception(struct vcpu_svm *svm)
2543{ 2784{
2544 struct kvm_run *kvm_run = svm->vcpu.run; 2785 struct kvm_run *kvm_run = svm->vcpu.run;
2786 int r;
2545 2787
2546 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2788 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2547 /* instruction emulation calls kvm_set_cr8() */ 2789 /* instruction emulation calls kvm_set_cr8() */
2548 emulate_instruction(&svm->vcpu, 0, 0, 0); 2790 r = cr_interception(svm);
2549 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2791 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2550 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2792 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2551 return 1; 2793 return r;
2552 } 2794 }
2553 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2795 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2554 return 1; 2796 return r;
2555 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2797 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2556 return 0; 2798 return 0;
2557} 2799}
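cr8_write_interception() above, and update_cr8_intercept()/handle_exit() further down, switch to the CR-specific variants of the same helper family. A minimal sketch under the same assumptions (a single intercept_cr word with read bits in the low half and write bits in the high half, which also matches the dump_vmcb() change later in this diff):

static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_cr |= (1U << bit);
	recalc_intercepts(svm);
}

static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept_cr &= ~(1U << bit);
	recalc_intercepts(svm);
}

static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
{
	return get_host_vmcb(svm)->control.intercept_cr & (1U << bit);
}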
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2562 2804
2563 switch (ecx) { 2805 switch (ecx) {
2564 case MSR_IA32_TSC: { 2806 case MSR_IA32_TSC: {
2565 u64 tsc_offset; 2807 struct vmcb *vmcb = get_host_vmcb(svm);
2566 2808
2567 if (is_nested(svm)) 2809 *data = vmcb->control.tsc_offset + native_read_tsc();
2568 tsc_offset = svm->nested.hsave->control.tsc_offset;
2569 else
2570 tsc_offset = svm->vmcb->control.tsc_offset;
2571
2572 *data = tsc_offset + native_read_tsc();
2573 break; 2810 break;
2574 } 2811 }
2575 case MSR_STAR: 2812 case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2714 svm->vmcb->save.sysenter_esp = data; 2951 svm->vmcb->save.sysenter_esp = data;
2715 break; 2952 break;
2716 case MSR_IA32_DEBUGCTLMSR: 2953 case MSR_IA32_DEBUGCTLMSR:
2717 if (!svm_has(SVM_FEATURE_LBRV)) { 2954 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2718 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 2955 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2719 __func__, data); 2956 __func__, data);
2720 break; 2957 break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2723 return 1; 2960 return 1;
2724 2961
2725 svm->vmcb->save.dbgctl = data; 2962 svm->vmcb->save.dbgctl = data;
2963 mark_dirty(svm->vmcb, VMCB_LBR);
2726 if (data & (1ULL<<0)) 2964 if (data & (1ULL<<0))
2727 svm_enable_lbrv(svm); 2965 svm_enable_lbrv(svm);
2728 else 2966 else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3013 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2776 svm_clear_vintr(svm); 3014 svm_clear_vintr(svm);
2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3015 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3016 mark_dirty(svm->vmcb, VMCB_INTR);
2778 /* 3017 /*
2779 * If the user space waits to inject interrupts, exit as soon as 3018 * If the user space waits to inject interrupts, exit as soon as
2780 * possible 3019 * possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
2797} 3036}
2798 3037
2799static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3038static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2800 [SVM_EXIT_READ_CR0] = emulate_on_interception, 3039 [SVM_EXIT_READ_CR0] = cr_interception,
2801 [SVM_EXIT_READ_CR3] = emulate_on_interception, 3040 [SVM_EXIT_READ_CR3] = cr_interception,
2802 [SVM_EXIT_READ_CR4] = emulate_on_interception, 3041 [SVM_EXIT_READ_CR4] = cr_interception,
2803 [SVM_EXIT_READ_CR8] = emulate_on_interception, 3042 [SVM_EXIT_READ_CR8] = cr_interception,
2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3043 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception, 3044 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 3045 [SVM_EXIT_WRITE_CR3] = cr_interception,
2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 3046 [SVM_EXIT_WRITE_CR4] = cr_interception,
2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3047 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2809 [SVM_EXIT_READ_DR0] = emulate_on_interception, 3048 [SVM_EXIT_READ_DR0] = dr_interception,
2810 [SVM_EXIT_READ_DR1] = emulate_on_interception, 3049 [SVM_EXIT_READ_DR1] = dr_interception,
2811 [SVM_EXIT_READ_DR2] = emulate_on_interception, 3050 [SVM_EXIT_READ_DR2] = dr_interception,
2812 [SVM_EXIT_READ_DR3] = emulate_on_interception, 3051 [SVM_EXIT_READ_DR3] = dr_interception,
2813 [SVM_EXIT_READ_DR4] = emulate_on_interception, 3052 [SVM_EXIT_READ_DR4] = dr_interception,
2814 [SVM_EXIT_READ_DR5] = emulate_on_interception, 3053 [SVM_EXIT_READ_DR5] = dr_interception,
2815 [SVM_EXIT_READ_DR6] = emulate_on_interception, 3054 [SVM_EXIT_READ_DR6] = dr_interception,
2816 [SVM_EXIT_READ_DR7] = emulate_on_interception, 3055 [SVM_EXIT_READ_DR7] = dr_interception,
2817 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 3056 [SVM_EXIT_WRITE_DR0] = dr_interception,
2818 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 3057 [SVM_EXIT_WRITE_DR1] = dr_interception,
2819 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 3058 [SVM_EXIT_WRITE_DR2] = dr_interception,
2820 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 3059 [SVM_EXIT_WRITE_DR3] = dr_interception,
2821 [SVM_EXIT_WRITE_DR4] = emulate_on_interception, 3060 [SVM_EXIT_WRITE_DR4] = dr_interception,
2822 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 3061 [SVM_EXIT_WRITE_DR5] = dr_interception,
2823 [SVM_EXIT_WRITE_DR6] = emulate_on_interception, 3062 [SVM_EXIT_WRITE_DR6] = dr_interception,
2824 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 3063 [SVM_EXIT_WRITE_DR7] = dr_interception,
2825 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3064 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2826 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3065 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2827 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3066 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2854 [SVM_EXIT_WBINVD] = emulate_on_interception, 3093 [SVM_EXIT_WBINVD] = emulate_on_interception,
2855 [SVM_EXIT_MONITOR] = invalid_op_interception, 3094 [SVM_EXIT_MONITOR] = invalid_op_interception,
2856 [SVM_EXIT_MWAIT] = invalid_op_interception, 3095 [SVM_EXIT_MWAIT] = invalid_op_interception,
3096 [SVM_EXIT_XSETBV] = xsetbv_interception,
2857 [SVM_EXIT_NPF] = pf_interception, 3097 [SVM_EXIT_NPF] = pf_interception,
2858}; 3098};
2859 3099
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2864 struct vmcb_save_area *save = &svm->vmcb->save; 3104 struct vmcb_save_area *save = &svm->vmcb->save;
2865 3105
2866 pr_err("VMCB Control Area:\n"); 3106 pr_err("VMCB Control Area:\n");
2867 pr_err("cr_read: %04x\n", control->intercept_cr_read); 3107 pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
2868 pr_err("cr_write: %04x\n", control->intercept_cr_write); 3108 pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
2869 pr_err("dr_read: %04x\n", control->intercept_dr_read); 3109 pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
2870 pr_err("dr_write: %04x\n", control->intercept_dr_write); 3110 pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
2871 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3111 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2872 pr_err("intercepts: %016llx\n", control->intercept); 3112 pr_err("intercepts: %016llx\n", control->intercept);
2873 pr_err("pause filter count: %d\n", control->pause_filter_count); 3113 pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2950 3190
2951} 3191}
2952 3192
3193static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3194{
3195 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3196
3197 *info1 = control->exit_info_1;
3198 *info2 = control->exit_info_2;
3199}
3200
2953static int handle_exit(struct kvm_vcpu *vcpu) 3201static int handle_exit(struct kvm_vcpu *vcpu)
2954{ 3202{
2955 struct vcpu_svm *svm = to_svm(vcpu); 3203 struct vcpu_svm *svm = to_svm(vcpu);
2956 struct kvm_run *kvm_run = vcpu->run; 3204 struct kvm_run *kvm_run = vcpu->run;
2957 u32 exit_code = svm->vmcb->control.exit_code; 3205 u32 exit_code = svm->vmcb->control.exit_code;
2958 3206
2959 trace_kvm_exit(exit_code, vcpu); 3207 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
2960 3208
2961 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) 3209 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2962 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3210 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2963 if (npt_enabled) 3211 if (npt_enabled)
2964 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3212 vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2970 return 1; 3218 return 1;
2971 } 3219 }
2972 3220
2973 if (is_nested(svm)) { 3221 if (is_guest_mode(vcpu)) {
2974 int vmexit; 3222 int vmexit;
2975 3223
2976 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 3224 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
3033 3281
3034 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 3282 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3035 3283
3036 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3037 /* FIXME: handle wraparound of asid_generation */ 3284 /* FIXME: handle wraparound of asid_generation */
3038 if (svm->asid_generation != sd->asid_generation) 3285 if (svm->asid_generation != sd->asid_generation)
3039 new_asid(svm, sd); 3286 new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3045 3292
3046 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3293 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3047 vcpu->arch.hflags |= HF_NMI_MASK; 3294 vcpu->arch.hflags |= HF_NMI_MASK;
3048 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3295 set_intercept(svm, INTERCEPT_IRET);
3049 ++vcpu->stat.nmi_injections; 3296 ++vcpu->stat.nmi_injections;
3050} 3297}
3051 3298
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3058 control->int_ctl &= ~V_INTR_PRIO_MASK; 3305 control->int_ctl &= ~V_INTR_PRIO_MASK;
3059 control->int_ctl |= V_IRQ_MASK | 3306 control->int_ctl |= V_IRQ_MASK |
3060 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 3307 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3308 mark_dirty(svm->vmcb, VMCB_INTR);
3061} 3309}
3062 3310
3063static void svm_set_irq(struct kvm_vcpu *vcpu) 3311static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3077{ 3325{
3078 struct vcpu_svm *svm = to_svm(vcpu); 3326 struct vcpu_svm *svm = to_svm(vcpu);
3079 3327
3080 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3328 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3081 return; 3329 return;
3082 3330
3083 if (irr == -1) 3331 if (irr == -1)
3084 return; 3332 return;
3085 3333
3086 if (tpr >= irr) 3334 if (tpr >= irr)
3087 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 3335 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3088} 3336}
3089 3337
3090static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3338static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3112 3360
3113 if (masked) { 3361 if (masked) {
3114 svm->vcpu.arch.hflags |= HF_NMI_MASK; 3362 svm->vcpu.arch.hflags |= HF_NMI_MASK;
3115 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3363 set_intercept(svm, INTERCEPT_IRET);
3116 } else { 3364 } else {
3117 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 3365 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3118 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 3366 clr_intercept(svm, INTERCEPT_IRET);
3119 } 3367 }
3120} 3368}
3121 3369
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3131 3379
3132 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3380 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
3133 3381
3134 if (is_nested(svm)) 3382 if (is_guest_mode(vcpu))
3135 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3383 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3136 3384
3137 return ret; 3385 return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3177 3425
3178static void svm_flush_tlb(struct kvm_vcpu *vcpu) 3426static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3179{ 3427{
3180 force_new_asid(vcpu); 3428 struct vcpu_svm *svm = to_svm(vcpu);
3429
3430 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3431 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3432 else
3433 svm->asid_generation--;
3181} 3434}
3182 3435
3183static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 3436static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3188{ 3441{
3189 struct vcpu_svm *svm = to_svm(vcpu); 3442 struct vcpu_svm *svm = to_svm(vcpu);
3190 3443
3191 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3444 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3192 return; 3445 return;
3193 3446
3194 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 3447 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3195 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3448 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3196 kvm_set_cr8(vcpu, cr8); 3449 kvm_set_cr8(vcpu, cr8);
3197 } 3450 }
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3202 struct vcpu_svm *svm = to_svm(vcpu); 3455 struct vcpu_svm *svm = to_svm(vcpu);
3203 u64 cr8; 3456 u64 cr8;
3204 3457
3205 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3458 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3206 return; 3459 return;
3207 3460
3208 cr8 = kvm_get_cr8(vcpu); 3461 cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3289static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3542static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3290{ 3543{
3291 struct vcpu_svm *svm = to_svm(vcpu); 3544 struct vcpu_svm *svm = to_svm(vcpu);
3292 u16 fs_selector;
3293 u16 gs_selector;
3294 u16 ldt_selector;
3295 3545
3296 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3546 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3297 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3547 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3308 3558
3309 sync_lapic_to_cr8(vcpu); 3559 sync_lapic_to_cr8(vcpu);
3310 3560
3311 save_host_msrs(vcpu);
3312 savesegment(fs, fs_selector);
3313 savesegment(gs, gs_selector);
3314 ldt_selector = kvm_read_ldt();
3315 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3561 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3316 3562
3317 clgi(); 3563 clgi();
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3389#endif 3635#endif
3390 ); 3636 );
3391 3637
3392 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3393 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3394 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3396
3397 load_host_msrs(vcpu);
3398 kvm_load_ldt(ldt_selector);
3399 loadsegment(fs, fs_selector);
3400#ifdef CONFIG_X86_64 3638#ifdef CONFIG_X86_64
3401 load_gs_index(gs_selector); 3639 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3402 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3403#else 3640#else
3404 loadsegment(gs, gs_selector); 3641 loadsegment(fs, svm->host.fs);
3405#endif 3642#endif
3406 3643
3407 reload_tss(vcpu); 3644 reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3410 3647
3411 stgi(); 3648 stgi();
3412 3649
3650 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3651 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3652 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3653 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3654
3413 sync_cr8_to_lapic(vcpu); 3655 sync_cr8_to_lapic(vcpu);
3414 3656
3415 svm->next_rip = 0; 3657 svm->next_rip = 0;
3416 3658
3659 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3660
 3661	/* If the exit was due to a #PF, check for an async PF reason */
3662 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3663 svm->apf_reason = kvm_read_and_reset_pf_reason();
3664
3417 if (npt_enabled) { 3665 if (npt_enabled) {
3418 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3666 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3419 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3667 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3426 if (unlikely(svm->vmcb->control.exit_code == 3674 if (unlikely(svm->vmcb->control.exit_code ==
3427 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3675 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3428 svm_handle_mce(svm); 3676 svm_handle_mce(svm);
3677
3678 mark_all_clean(svm->vmcb);
3429} 3679}
3430 3680
3431#undef R 3681#undef R
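svm_vcpu_run() now finishes by calling mark_all_clean(), while the state-changing paths above (interrupt injection, LBR writes, CR3/NPT updates) tag the field group they touched with mark_dirty(). The helpers are defined earlier in the patch; a sketch of the assumed implementation on top of the VMCB clean-bits field (the VMCB_DIRTY_MAX and VMCB_ALWAYS_DIRTY_MASK names are assumptions):

static inline void mark_all_dirty(struct vmcb *vmcb)
{
	vmcb->control.clean = 0;
}

static inline void mark_all_clean(struct vmcb *vmcb)
{
	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
			       & ~VMCB_ALWAYS_DIRTY_MASK;
}

static inline void mark_dirty(struct vmcb *vmcb, int bit)
{
	vmcb->control.clean &= ~(1 << bit);
}

The hardware is then free to skip reloading any field group whose clean bit is still set on the next VMRUN.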
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3435 struct vcpu_svm *svm = to_svm(vcpu); 3685 struct vcpu_svm *svm = to_svm(vcpu);
3436 3686
3437 svm->vmcb->save.cr3 = root; 3687 svm->vmcb->save.cr3 = root;
3438 force_new_asid(vcpu); 3688 mark_dirty(svm->vmcb, VMCB_CR);
3689 svm_flush_tlb(vcpu);
3439} 3690}
3440 3691
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3692static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3443 struct vcpu_svm *svm = to_svm(vcpu); 3694 struct vcpu_svm *svm = to_svm(vcpu);
3444 3695
3445 svm->vmcb->control.nested_cr3 = root; 3696 svm->vmcb->control.nested_cr3 = root;
3697 mark_dirty(svm->vmcb, VMCB_NPT);
3446 3698
3447 /* Also sync guest cr3 here in case we live migrate */ 3699 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3; 3700 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3701 mark_dirty(svm->vmcb, VMCB_CR);
3449 3702
3450 force_new_asid(vcpu); 3703 svm_flush_tlb(vcpu);
3451} 3704}
3452 3705
3453static int is_disabled(void) 3706static int is_disabled(void)
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3747static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3495{ 3748{
3496 switch (func) { 3749 switch (func) {
3497 case 0x00000001:
3498 /* Mask out xsave bit as long as it is not supported by SVM */
3499 entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
3500 break;
3501 case 0x80000001: 3750 case 0x80000001:
3502 if (nested) 3751 if (nested)
3503 entry->ecx |= (1 << 2); /* Set SVM bit */ 3752 entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3511 additional features */ 3760 additional features */
3512 3761
3513 /* Support next_rip if host supports it */ 3762 /* Support next_rip if host supports it */
3514 if (svm_has(SVM_FEATURE_NRIP)) 3763 if (boot_cpu_has(X86_FEATURE_NRIPS))
3515 entry->edx |= SVM_FEATURE_NRIP; 3764 entry->edx |= SVM_FEATURE_NRIP;
3516 3765
3517 /* Support NPT for the guest if enabled */ 3766 /* Support NPT for the guest if enabled */
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
3571 { SVM_EXIT_WBINVD, "wbinvd" }, 3820 { SVM_EXIT_WBINVD, "wbinvd" },
3572 { SVM_EXIT_MONITOR, "monitor" }, 3821 { SVM_EXIT_MONITOR, "monitor" },
3573 { SVM_EXIT_MWAIT, "mwait" }, 3822 { SVM_EXIT_MWAIT, "mwait" },
3823 { SVM_EXIT_XSETBV, "xsetbv" },
3574 { SVM_EXIT_NPF, "npf" }, 3824 { SVM_EXIT_NPF, "npf" },
3575 { -1, NULL } 3825 { -1, NULL }
3576}; 3826};
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3594{ 3844{
3595 struct vcpu_svm *svm = to_svm(vcpu); 3845 struct vcpu_svm *svm = to_svm(vcpu);
3596 3846
3597 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3847 set_exception_intercept(svm, NM_VECTOR);
3598 if (is_nested(svm))
3599 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3600 update_cr0_intercept(svm); 3848 update_cr0_intercept(svm);
3601} 3849}
3602 3850
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3627 .get_cpl = svm_get_cpl, 3875 .get_cpl = svm_get_cpl,
3628 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 3876 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3629 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, 3877 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
3878 .decache_cr3 = svm_decache_cr3,
3630 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 3879 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3631 .set_cr0 = svm_set_cr0, 3880 .set_cr0 = svm_set_cr0,
3632 .set_cr3 = svm_set_cr3, 3881 .set_cr3 = svm_set_cr3,
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
3667 .get_tdp_level = get_npt_level, 3916 .get_tdp_level = get_npt_level,
3668 .get_mt_mask = svm_get_mt_mask, 3917 .get_mt_mask = svm_get_mt_mask,
3669 3918
3919 .get_exit_info = svm_get_exit_info,
3670 .exit_reasons_str = svm_exit_reasons_str, 3920 .exit_reasons_str = svm_exit_reasons_str,
3921
3671 .get_lpage_level = svm_get_lpage_level, 3922 .get_lpage_level = svm_get_lpage_level,
3672 3923
3673 .cpuid_update = svm_cpuid_update, 3924 .cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) 178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) 179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
180 180
181#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2
183
181/* 184/*
182 * Tracepoint for kvm guest exit: 185 * Tracepoint for kvm guest exit:
183 */ 186 */
184TRACE_EVENT(kvm_exit, 187TRACE_EVENT(kvm_exit,
185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), 188 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
186 TP_ARGS(exit_reason, vcpu), 189 TP_ARGS(exit_reason, vcpu, isa),
187 190
188 TP_STRUCT__entry( 191 TP_STRUCT__entry(
189 __field( unsigned int, exit_reason ) 192 __field( unsigned int, exit_reason )
190 __field( unsigned long, guest_rip ) 193 __field( unsigned long, guest_rip )
194 __field( u32, isa )
195 __field( u64, info1 )
196 __field( u64, info2 )
191 ), 197 ),
192 198
193 TP_fast_assign( 199 TP_fast_assign(
194 __entry->exit_reason = exit_reason; 200 __entry->exit_reason = exit_reason;
195 __entry->guest_rip = kvm_rip_read(vcpu); 201 __entry->guest_rip = kvm_rip_read(vcpu);
202 __entry->isa = isa;
203 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
204 &__entry->info2);
196 ), 205 ),
197 206
198 TP_printk("reason %s rip 0x%lx", 207 TP_printk("reason %s rip 0x%lx info %llx %llx",
199 ftrace_print_symbols_seq(p, __entry->exit_reason, 208 ftrace_print_symbols_seq(p, __entry->exit_reason,
200 kvm_x86_ops->exit_reasons_str), 209 kvm_x86_ops->exit_reasons_str),
201 __entry->guest_rip) 210 __entry->guest_rip, __entry->info1, __entry->info2)
202); 211);
203 212
204/* 213/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fcbe9515c5..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69static int __read_mostly vmm_exclusive = 1; 69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 70module_param(vmm_exclusive, bool, S_IRUGO);
71 71
72static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO);
74
72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
74#define KVM_GUEST_CR0_MASK \ 77#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
177static u64 construct_eptp(unsigned long root_hpa); 180static u64 construct_eptp(unsigned long root_hpa);
178static void kvm_cpu_vmxon(u64 addr); 181static void kvm_cpu_vmxon(u64 addr);
179static void kvm_cpu_vmxoff(void); 182static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
180 184
181static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
182static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
188static unsigned long *vmx_msr_bitmap_legacy; 192static unsigned long *vmx_msr_bitmap_legacy;
189static unsigned long *vmx_msr_bitmap_longmode; 193static unsigned long *vmx_msr_bitmap_longmode;
190 194
195static bool cpu_has_load_ia32_efer;
196
191static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 197static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
192static DEFINE_SPINLOCK(vmx_vpid_lock); 198static DEFINE_SPINLOCK(vmx_vpid_lock);
193 199
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
472 u8 error; 478 u8 error;
473 479
474 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 480 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
475 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 481 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
476 : "cc", "memory"); 482 : "cc", "memory");
477 if (error) 483 if (error)
478 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 484 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
485 u8 error; 491 u8 error;
486 492
487 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 493 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
488 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 494 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
489 : "cc", "memory"); 495 : "cc", "memory");
490 if (error) 496 if (error)
491 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 497 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
565 571
566static unsigned long vmcs_readl(unsigned long field) 572static unsigned long vmcs_readl(unsigned long field)
567{ 573{
568 unsigned long value; 574 unsigned long value = 0;
569 575
570 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 576 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
571 : "=a"(value) : "d"(field) : "cc"); 577 : "+a"(value) : "d"(field) : "cc");
572 return value; 578 return value;
573} 579}
574 580
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
661 unsigned i; 667 unsigned i;
662 struct msr_autoload *m = &vmx->msr_autoload; 668 struct msr_autoload *m = &vmx->msr_autoload;
663 669
670 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
671 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
672 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
673 return;
674 }
675
664 for (i = 0; i < m->nr; ++i) 676 for (i = 0; i < m->nr; ++i)
665 if (m->guest[i].index == msr) 677 if (m->guest[i].index == msr)
666 break; 678 break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
680 unsigned i; 692 unsigned i;
681 struct msr_autoload *m = &vmx->msr_autoload; 693 struct msr_autoload *m = &vmx->msr_autoload;
682 694
695 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
696 vmcs_write64(GUEST_IA32_EFER, guest_val);
697 vmcs_write64(HOST_IA32_EFER, host_val);
698 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
699 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
700 return;
701 }
702
683 for (i = 0; i < m->nr; ++i) 703 for (i = 0; i < m->nr; ++i)
684 if (m->guest[i].index == msr) 704 if (m->guest[i].index == msr)
685 break; 705 break;
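Both atomic-switch helpers above now special-case MSR_EFER when the CPU supports the dedicated VM-entry/VM-exit EFER load controls, so EFER no longer occupies a slot in the MSR autoload area. The vmcs_set_bits()/vmcs_clear_bits() wrappers they rely on already exist in vmx.c and are simple read-modify-write helpers, roughly:

static void vmcs_clear_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) & ~mask);
}

static void vmcs_set_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) | mask);
}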
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1009 vmx_set_interrupt_shadow(vcpu, 0); 1029 vmx_set_interrupt_shadow(vcpu, 0);
1010} 1030}
1011 1031
1032static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1033{
1034 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1035 * explicitly skip the instruction because if the HLT state is set, then
1036 * the instruction is already executing and RIP has already been
1037 * advanced. */
1038 if (!yield_on_hlt &&
1039 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1040 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1041}
1042
1012static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1043static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1013 bool has_error_code, u32 error_code, 1044 bool has_error_code, u32 error_code,
1014 bool reinject) 1045 bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1035 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1066 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1036 1067
1037 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1068 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1069 vmx_clear_hlt(vcpu);
1038} 1070}
1039 1071
1040static bool vmx_rdtscp_supported(void) 1072static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
1305 && tboot_enabled()) 1337 && tboot_enabled())
1306 return 1; 1338 return 1;
1307 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1308 && !tboot_enabled()) 1340 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n");
1309 return 1; 1343 return 1;
1344 }
1310 } 1345 }
1311 1346
1312 return 0; 1347 return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1400 return 0; 1435 return 0;
1401} 1436}
1402 1437
1438static __init bool allow_1_setting(u32 msr, u32 ctl)
1439{
1440 u32 vmx_msr_low, vmx_msr_high;
1441
1442 rdmsr(msr, vmx_msr_low, vmx_msr_high);
1443 return vmx_msr_high & ctl;
1444}
1445
1403static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 1446static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1404{ 1447{
1405 u32 vmx_msr_low, vmx_msr_high; 1448 u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1416 &_pin_based_exec_control) < 0) 1459 &_pin_based_exec_control) < 0)
1417 return -EIO; 1460 return -EIO;
1418 1461
1419 min = CPU_BASED_HLT_EXITING | 1462 min =
1420#ifdef CONFIG_X86_64 1463#ifdef CONFIG_X86_64
1421 CPU_BASED_CR8_LOAD_EXITING | 1464 CPU_BASED_CR8_LOAD_EXITING |
1422 CPU_BASED_CR8_STORE_EXITING | 1465 CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1429 CPU_BASED_MWAIT_EXITING | 1472 CPU_BASED_MWAIT_EXITING |
1430 CPU_BASED_MONITOR_EXITING | 1473 CPU_BASED_MONITOR_EXITING |
1431 CPU_BASED_INVLPG_EXITING; 1474 CPU_BASED_INVLPG_EXITING;
1475
1476 if (yield_on_hlt)
1477 min |= CPU_BASED_HLT_EXITING;
1478
1432 opt = CPU_BASED_TPR_SHADOW | 1479 opt = CPU_BASED_TPR_SHADOW |
1433 CPU_BASED_USE_MSR_BITMAPS | 1480 CPU_BASED_USE_MSR_BITMAPS |
1434 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1481 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1510 vmcs_conf->vmexit_ctrl = _vmexit_control; 1557 vmcs_conf->vmexit_ctrl = _vmexit_control;
1511 vmcs_conf->vmentry_ctrl = _vmentry_control; 1558 vmcs_conf->vmentry_ctrl = _vmentry_control;
1512 1559
1560 cpu_has_load_ia32_efer =
1561 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1562 VM_ENTRY_LOAD_IA32_EFER)
1563 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1564 VM_EXIT_LOAD_IA32_EFER);
1565
1513 return 0; 1566 return 0;
1514} 1567}
1515 1568
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1683 save->limit = vmcs_read32(sf->limit); 1736 save->limit = vmcs_read32(sf->limit);
1684 save->ar = vmcs_read32(sf->ar_bytes); 1737 save->ar = vmcs_read32(sf->ar_bytes);
1685 vmcs_write16(sf->selector, save->base >> 4); 1738 vmcs_write16(sf->selector, save->base >> 4);
1686 vmcs_write32(sf->base, save->base & 0xfffff); 1739 vmcs_write32(sf->base, save->base & 0xffff0);
1687 vmcs_write32(sf->limit, 0xffff); 1740 vmcs_write32(sf->limit, 0xffff);
1688 vmcs_write32(sf->ar_bytes, 0xf3); 1741 vmcs_write32(sf->ar_bytes, 0xf3);
1742 if (save->base & 0xf)
1743 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1744 " aligned when entering protected mode (seg=%d)",
1745 seg);
1689} 1746}
1690 1747
1691static void enter_rmode(struct kvm_vcpu *vcpu) 1748static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1814 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 1871 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1815} 1872}
1816 1873
1874static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
1875{
1876 if (enable_ept && is_paging(vcpu))
1877 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
1878 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1879}
1880
1817static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1881static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1818{ 1882{
1819 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 1883 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1857 unsigned long cr0, 1921 unsigned long cr0,
1858 struct kvm_vcpu *vcpu) 1922 struct kvm_vcpu *vcpu)
1859{ 1923{
1924 vmx_decache_cr3(vcpu);
1860 if (!(cr0 & X86_CR0_PG)) { 1925 if (!(cr0 & X86_CR0_PG)) {
1861 /* From paging/starting to nonpaging */ 1926 /* From paging/starting to nonpaging */
1862 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1927 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1937 if (enable_ept) { 2002 if (enable_ept) {
1938 eptp = construct_eptp(cr3); 2003 eptp = construct_eptp(cr3);
1939 vmcs_write64(EPT_POINTER, eptp); 2004 vmcs_write64(EPT_POINTER, eptp);
1940 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 2005 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
1941 vcpu->kvm->arch.ept_identity_map_addr; 2006 vcpu->kvm->arch.ept_identity_map_addr;
1942 ept_load_pdptrs(vcpu); 2007 ept_load_pdptrs(vcpu);
1943 } 2008 }
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2725 vmcs_writel(GUEST_IDTR_BASE, 0); 2790 vmcs_writel(GUEST_IDTR_BASE, 0);
2726 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 2791 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2727 2792
2728 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 2793 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2729 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2794 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2730 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2795 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2731 2796
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2787 return; 2852 return;
2788 } 2853 }
2789 2854
2855 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
2856 enable_irq_window(vcpu);
2857 return;
2858 }
2790 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2859 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2791 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 2860 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2792 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2861 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2814 } else 2883 } else
2815 intr |= INTR_TYPE_EXT_INTR; 2884 intr |= INTR_TYPE_EXT_INTR;
2816 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 2885 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2886 vmx_clear_hlt(vcpu);
2817} 2887}
2818 2888
2819static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2889static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2841 } 2911 }
2842 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2912 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2843 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2913 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2914 vmx_clear_hlt(vcpu);
2844} 2915}
2845 2916
2846static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 2917static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2849 return 0; 2920 return 0;
2850 2921
2851 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2922 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2852 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); 2923 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
2924 | GUEST_INTR_STATE_NMI));
2853} 2925}
2854 2926
2855static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 2927static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2910 * Cause the #SS fault with 0 error code in VM86 mode. 2982 * Cause the #SS fault with 0 error code in VM86 mode.
2911 */ 2983 */
2912 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2984 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2913 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) 2985 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2914 return 1; 2986 return 1;
2915 /* 2987 /*
2916 * Forward all other exceptions that are valid in real mode. 2988 * Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3007 } 3079 }
3008 3080
3009 if (is_invalid_opcode(intr_info)) { 3081 if (is_invalid_opcode(intr_info)) {
3010 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); 3082 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3011 if (er != EMULATE_DONE) 3083 if (er != EMULATE_DONE)
3012 kvm_queue_exception(vcpu, UD_VECTOR); 3084 kvm_queue_exception(vcpu, UD_VECTOR);
3013 return 1; 3085 return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3026 3098
3027 if (kvm_event_needs_reinjection(vcpu)) 3099 if (kvm_event_needs_reinjection(vcpu))
3028 kvm_mmu_unprotect_page_virt(vcpu, cr2); 3100 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3029 return kvm_mmu_page_fault(vcpu, cr2, error_code); 3101 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3030 } 3102 }
3031 3103
3032 if (vmx->rmode.vm86_active && 3104 if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3098 ++vcpu->stat.io_exits; 3170 ++vcpu->stat.io_exits;
3099 3171
3100 if (string || in) 3172 if (string || in)
3101 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3173 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3102 3174
3103 port = exit_qualification >> 16; 3175 port = exit_qualification >> 16;
3104 size = (exit_qualification & 7) + 1; 3176 size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3118 hypercall[2] = 0xc1; 3190 hypercall[2] = 0xc1;
3119} 3191}
3120 3192
3121static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3122{
3123 if (err)
3124 kvm_inject_gp(vcpu, 0);
3125 else
3126 skip_emulated_instruction(vcpu);
3127}
3128
3129static int handle_cr(struct kvm_vcpu *vcpu) 3193static int handle_cr(struct kvm_vcpu *vcpu)
3130{ 3194{
3131 unsigned long exit_qualification, val; 3195 unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3143 switch (cr) { 3207 switch (cr) {
3144 case 0: 3208 case 0:
3145 err = kvm_set_cr0(vcpu, val); 3209 err = kvm_set_cr0(vcpu, val);
3146 complete_insn_gp(vcpu, err); 3210 kvm_complete_insn_gp(vcpu, err);
3147 return 1; 3211 return 1;
3148 case 3: 3212 case 3:
3149 err = kvm_set_cr3(vcpu, val); 3213 err = kvm_set_cr3(vcpu, val);
3150 complete_insn_gp(vcpu, err); 3214 kvm_complete_insn_gp(vcpu, err);
3151 return 1; 3215 return 1;
3152 case 4: 3216 case 4:
3153 err = kvm_set_cr4(vcpu, val); 3217 err = kvm_set_cr4(vcpu, val);
3154 complete_insn_gp(vcpu, err); 3218 kvm_complete_insn_gp(vcpu, err);
3155 return 1; 3219 return 1;
3156 case 8: { 3220 case 8: {
3157 u8 cr8_prev = kvm_get_cr8(vcpu); 3221 u8 cr8_prev = kvm_get_cr8(vcpu);
3158 u8 cr8 = kvm_register_read(vcpu, reg); 3222 u8 cr8 = kvm_register_read(vcpu, reg);
3159 kvm_set_cr8(vcpu, cr8); 3223 err = kvm_set_cr8(vcpu, cr8);
3160 skip_emulated_instruction(vcpu); 3224 kvm_complete_insn_gp(vcpu, err);
3161 if (irqchip_in_kernel(vcpu->kvm)) 3225 if (irqchip_in_kernel(vcpu->kvm))
3162 return 1; 3226 return 1;
3163 if (cr8_prev <= cr8) 3227 if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3176 case 1: /*mov from cr*/ 3240 case 1: /*mov from cr*/
3177 switch (cr) { 3241 switch (cr) {
3178 case 3: 3242 case 3:
3179 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 3243 val = kvm_read_cr3(vcpu);
3180 trace_kvm_cr_read(cr, vcpu->arch.cr3); 3244 kvm_register_write(vcpu, reg, val);
3245 trace_kvm_cr_read(cr, val);
3181 skip_emulated_instruction(vcpu); 3246 skip_emulated_instruction(vcpu);
3182 return 1; 3247 return 1;
3183 case 8: 3248 case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3349 return 1; 3414 return 1;
3350} 3415}
3351 3416
3417static int handle_invd(struct kvm_vcpu *vcpu)
3418{
3419 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3420}
3421
3352static int handle_invlpg(struct kvm_vcpu *vcpu) 3422static int handle_invlpg(struct kvm_vcpu *vcpu)
3353{ 3423{
3354 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3424 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
3377 3447
3378static int handle_apic_access(struct kvm_vcpu *vcpu) 3448static int handle_apic_access(struct kvm_vcpu *vcpu)
3379{ 3449{
3380 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3450 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3381} 3451}
3382 3452
3383static int handle_task_switch(struct kvm_vcpu *vcpu) 3453static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
3476 3546
3477 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3547 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3478 trace_kvm_page_fault(gpa, exit_qualification); 3548 trace_kvm_page_fault(gpa, exit_qualification);
3479 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3549 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3480} 3550}
3481 3551
3482static u64 ept_rsvd_mask(u64 spte, int level) 3552static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3592 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) 3662 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3593 return handle_interrupt_window(&vmx->vcpu); 3663 return handle_interrupt_window(&vmx->vcpu);
3594 3664
3595 err = emulate_instruction(vcpu, 0, 0, 0); 3665 err = emulate_instruction(vcpu, 0);
3596 3666
3597 if (err == EMULATE_DO_MMIO) { 3667 if (err == EMULATE_DO_MMIO) {
3598 ret = 0; 3668 ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3649 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 3719 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3650 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 3720 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3651 [EXIT_REASON_HLT] = handle_halt, 3721 [EXIT_REASON_HLT] = handle_halt,
3722 [EXIT_REASON_INVD] = handle_invd,
3652 [EXIT_REASON_INVLPG] = handle_invlpg, 3723 [EXIT_REASON_INVLPG] = handle_invlpg,
3653 [EXIT_REASON_VMCALL] = handle_vmcall, 3724 [EXIT_REASON_VMCALL] = handle_vmcall,
3654 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 3725 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3676static const int kvm_vmx_max_exit_handlers = 3747static const int kvm_vmx_max_exit_handlers =
3677 ARRAY_SIZE(kvm_vmx_exit_handlers); 3748 ARRAY_SIZE(kvm_vmx_exit_handlers);
3678 3749
3750static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3751{
3752 *info1 = vmcs_readl(EXIT_QUALIFICATION);
3753 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3754}
3755
3679/* 3756/*
3680 * The guest has exited. See if we can fix it or if we need userspace 3757 * The guest has exited. See if we can fix it or if we need userspace
3681 * assistance. 3758 * assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3686 u32 exit_reason = vmx->exit_reason; 3763 u32 exit_reason = vmx->exit_reason;
3687 u32 vectoring_info = vmx->idt_vectoring_info; 3764 u32 vectoring_info = vmx->idt_vectoring_info;
3688 3765
3689 trace_kvm_exit(exit_reason, vcpu); 3766 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3690 3767
3691 /* If guest state is invalid, start emulating */ 3768 /* If guest state is invalid, start emulating */
3692 if (vmx->emulation_required && emulate_invalid_guest_state) 3769 if (vmx->emulation_required && emulate_invalid_guest_state)
3693 return handle_invalid_guest_state(vcpu); 3770 return handle_invalid_guest_state(vcpu);
3694 3771
3695 /* Access CR3 don't cause VMExit in paging mode, so we need
3696 * to sync with guest real CR3. */
3697 if (enable_ept && is_paging(vcpu))
3698 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3699
3700 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3772 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3701 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3773 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3702 vcpu->run->fail_entry.hardware_entry_failure_reason 3774 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4013 ); 4085 );
4014 4086
4015 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4087 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4016 | (1 << VCPU_EXREG_PDPTR)); 4088 | (1 << VCPU_EXREG_PDPTR)
4089 | (1 << VCPU_EXREG_CR3));
4017 vcpu->arch.regs_dirty = 0; 4090 vcpu->arch.regs_dirty = 0;
4018 4091
4019 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4092 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4280 .get_cpl = vmx_get_cpl, 4353 .get_cpl = vmx_get_cpl,
4281 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4354 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4282 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 4355 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
4356 .decache_cr3 = vmx_decache_cr3,
4283 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4357 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
4284 .set_cr0 = vmx_set_cr0, 4358 .set_cr0 = vmx_set_cr0,
4285 .set_cr3 = vmx_set_cr3, 4359 .set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
4320 .get_tdp_level = get_ept_level, 4394 .get_tdp_level = get_ept_level,
4321 .get_mt_mask = vmx_get_mt_mask, 4395 .get_mt_mask = vmx_get_mt_mask,
4322 4396
4397 .get_exit_info = vmx_get_exit_info,
4323 .exit_reasons_str = vmx_exit_reasons_str, 4398 .exit_reasons_str = vmx_exit_reasons_str,
4399
4324 .get_lpage_level = vmx_get_lpage_level, 4400 .get_lpage_level = vmx_get_lpage_level,
4325 4401
4326 .cpuid_update = vmx_cpuid_update, 4402 .cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
4396 4472
4397 if (enable_ept) { 4473 if (enable_ept) {
4398 bypass_guest_pf = 0; 4474 bypass_guest_pf = 0;
4399 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
4400 VMX_EPT_WRITABLE_MASK);
4401 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 4475 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4402 VMX_EPT_EXECUTABLE_MASK); 4476 VMX_EPT_EXECUTABLE_MASK);
4403 kvm_enable_tdp(); 4477 kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b989e1f1e5d3..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 156
156u64 __read_mostly host_xcr0; 157u64 __read_mostly host_xcr0;
157 158
159static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
160{
161 int i;
162 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
163 vcpu->arch.apf.gfns[i] = ~0;
164}
165
158static void kvm_on_user_return(struct user_return_notifier *urn) 166static void kvm_on_user_return(struct user_return_notifier *urn)
159{ 167{
160 unsigned slot; 168 unsigned slot;
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
326} 334}
327EXPORT_SYMBOL_GPL(kvm_requeue_exception); 335EXPORT_SYMBOL_GPL(kvm_requeue_exception);
328 336
329void kvm_inject_page_fault(struct kvm_vcpu *vcpu) 337void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
330{ 338{
331 unsigned error_code = vcpu->arch.fault.error_code; 339 if (err)
340 kvm_inject_gp(vcpu, 0);
341 else
342 kvm_x86_ops->skip_emulated_instruction(vcpu);
343}
344EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
332 345
346void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347{
333 ++vcpu->stat.pf_guest; 348 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = vcpu->arch.fault.address; 349 vcpu->arch.cr2 = fault->address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 350 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
336} 351}
337 352
338void kvm_propagate_fault(struct kvm_vcpu *vcpu) 353void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
339{ 354{
340 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) 355 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
341 vcpu->arch.nested_mmu.inject_page_fault(vcpu); 356 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
342 else 357 else
343 vcpu->arch.mmu.inject_page_fault(vcpu); 358 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
344
345 vcpu->arch.fault.nested = false;
346} 359}
347 360
348void kvm_inject_nmi(struct kvm_vcpu *vcpu) 361void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
460 (unsigned long *)&vcpu->arch.regs_avail)) 473 (unsigned long *)&vcpu->arch.regs_avail))
461 return true; 474 return true;
462 475
463 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; 476 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
464 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); 477 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
465 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 478 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
466 PFERR_USER_MASK | PFERR_WRITE_MASK); 479 PFERR_USER_MASK | PFERR_WRITE_MASK);
467 if (r < 0) 480 if (r < 0)
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
506 } else 519 } else
507#endif 520#endif
508 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 521 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
509 vcpu->arch.cr3)) 522 kvm_read_cr3(vcpu)))
510 return 1; 523 return 1;
511 } 524 }
512 525
513 kvm_x86_ops->set_cr0(vcpu, cr0); 526 kvm_x86_ops->set_cr0(vcpu, cr0);
514 527
528 if ((cr0 ^ old_cr0) & X86_CR0_PG)
529 kvm_clear_async_pf_completion_queue(vcpu);
530
515 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
516 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
517 return 0; 533 return 0;
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
595 return 1; 611 return 1;
596 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
597 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
598 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
599 return 1; 616 return 1;
600 617
601 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
615 632
616int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
617{ 634{
618 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
619 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
620 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
621 return 0; 638 return 0;
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
650 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
651 return 1; 668 return 1;
652 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
653 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
654 return 0; 672 return 0;
655} 673}
656EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
657 675
658int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
659{ 677{
660 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
661 return 1; 679 return 1;
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
665 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
666 return 0; 684 return 0;
667} 685}
668
669void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
670{
671 if (__kvm_set_cr8(vcpu, cr8))
672 kvm_inject_gp(vcpu, 0);
673}
674EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
675 687
676unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
775 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
776 */ 788 */
777 789
778#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
779static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
780 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
781 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
782 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
783 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
784 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
785 MSR_STAR, 797 MSR_STAR,
786#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
830 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
831 843
832 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
833 kvm_mmu_reset_context(vcpu);
834 845
835 /* Update reserved bits */ 846 /* Update reserved bits */
836 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -976,7 +987,7 @@ static inline u64 nsec_to_cycles(u64 nsec)
976 if (kvm_tsc_changes_freq()) 987 if (kvm_tsc_changes_freq())
977 printk_once(KERN_WARNING 988 printk_once(KERN_WARNING
978 "kvm: unreliable cycle conversion on adjustable rate TSC\n"); 989 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
979 ret = nsec * __get_cpu_var(cpu_tsc_khz); 990 ret = nsec * __this_cpu_read(cpu_tsc_khz);
980 do_div(ret, USEC_PER_SEC); 991 do_div(ret, USEC_PER_SEC);
981 return ret; 992 return ret;
982} 993}
@@ -1061,7 +1072,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1061 local_irq_save(flags); 1072 local_irq_save(flags);
1062 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); 1073 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1063 kernel_ns = get_kernel_ns(); 1074 kernel_ns = get_kernel_ns();
1064 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1075 this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
1065 1076
1066 if (unlikely(this_tsc_khz == 0)) { 1077 if (unlikely(this_tsc_khz == 0)) {
1067 local_irq_restore(flags); 1078 local_irq_restore(flags);
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1418 return 0; 1429 return 0;
1419} 1430}
1420 1431
1432static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1433{
1434 gpa_t gpa = data & ~0x3f;
1435
1436	/* Bits 2:5 are reserved, should be zero */
1437 if (data & 0x3c)
1438 return 1;
1439
1440 vcpu->arch.apf.msr_val = data;
1441
1442 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1443 kvm_clear_async_pf_completion_queue(vcpu);
1444 kvm_async_pf_hash_reset(vcpu);
1445 return 0;
1446 }
1447
1448 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1449 return 1;
1450
1451 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1452 kvm_async_pf_wakeup_all(vcpu);
1453 return 0;
1454}
1455
1421int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1422{ 1457{
1423 switch (msr) { 1458 switch (msr) {
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1499 } 1534 }
1500 break; 1535 break;
1501 } 1536 }
1537 case MSR_KVM_ASYNC_PF_EN:
1538 if (kvm_pv_enable_async_pf(vcpu, data))
1539 return 1;
1540 break;
1502 case MSR_IA32_MCG_CTL: 1541 case MSR_IA32_MCG_CTL:
1503 case MSR_IA32_MCG_STATUS: 1542 case MSR_IA32_MCG_STATUS:
1504 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1543 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1775 case MSR_KVM_SYSTEM_TIME_NEW: 1814 case MSR_KVM_SYSTEM_TIME_NEW:
1776 data = vcpu->arch.time; 1815 data = vcpu->arch.time;
1777 break; 1816 break;
1817 case MSR_KVM_ASYNC_PF_EN:
1818 data = vcpu->arch.apf.msr_val;
1819 break;
1778 case MSR_IA32_P5_MC_ADDR: 1820 case MSR_IA32_P5_MC_ADDR:
1779 case MSR_IA32_P5_MC_TYPE: 1821 case MSR_IA32_P5_MC_TYPE:
1780 case MSR_IA32_MCG_CAP: 1822 case MSR_IA32_MCG_CAP:
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1904 case KVM_CAP_NOP_IO_DELAY: 1946 case KVM_CAP_NOP_IO_DELAY:
1905 case KVM_CAP_MP_STATE: 1947 case KVM_CAP_MP_STATE:
1906 case KVM_CAP_SYNC_MMU: 1948 case KVM_CAP_SYNC_MMU:
1949 case KVM_CAP_USER_NMI:
1907 case KVM_CAP_REINJECT_CONTROL: 1950 case KVM_CAP_REINJECT_CONTROL:
1908 case KVM_CAP_IRQ_INJECT_STATUS: 1951 case KVM_CAP_IRQ_INJECT_STATUS:
1909 case KVM_CAP_ASSIGN_DEV_IRQ: 1952 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1922 case KVM_CAP_DEBUGREGS: 1965 case KVM_CAP_DEBUGREGS:
1923 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1966 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1924 case KVM_CAP_XSAVE: 1967 case KVM_CAP_XSAVE:
1968 case KVM_CAP_ASYNC_PF:
1925 r = 1; 1969 r = 1;
1926 break; 1970 break;
1927 case KVM_CAP_COALESCED_MMIO: 1971 case KVM_CAP_COALESCED_MMIO:
@@ -2185,6 +2229,11 @@ out:
2185 return r; 2229 return r;
2186} 2230}
2187 2231
2232static void cpuid_mask(u32 *word, int wordnum)
2233{
2234 *word &= boot_cpu_data.x86_capability[wordnum];
2235}
2236
2188static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2237static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2189 u32 index) 2238 u32 index)
2190{ 2239{
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2259 break; 2308 break;
2260 case 1: 2309 case 1:
2261 entry->edx &= kvm_supported_word0_x86_features; 2310 entry->edx &= kvm_supported_word0_x86_features;
2311 cpuid_mask(&entry->edx, 0);
2262 entry->ecx &= kvm_supported_word4_x86_features; 2312 entry->ecx &= kvm_supported_word4_x86_features;
2313 cpuid_mask(&entry->ecx, 4);
2263 /* we support x2apic emulation even if host does not support 2314 /* we support x2apic emulation even if host does not support
2264 * it since we emulate x2apic in software */ 2315 * it since we emulate x2apic in software */
2265 entry->ecx |= F(X2APIC); 2316 entry->ecx |= F(X2APIC);
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2350 break; 2401 break;
2351 case 0x80000001: 2402 case 0x80000001:
2352 entry->edx &= kvm_supported_word1_x86_features; 2403 entry->edx &= kvm_supported_word1_x86_features;
2404 cpuid_mask(&entry->edx, 1);
2353 entry->ecx &= kvm_supported_word6_x86_features; 2405 entry->ecx &= kvm_supported_word6_x86_features;
2406 cpuid_mask(&entry->ecx, 6);
2354 break; 2407 break;
2355 } 2408 }
2356 2409
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3169 struct kvm_memslots *slots, *old_slots; 3222 struct kvm_memslots *slots, *old_slots;
3170 unsigned long *dirty_bitmap; 3223 unsigned long *dirty_bitmap;
3171 3224
3172 r = -ENOMEM; 3225 dirty_bitmap = memslot->dirty_bitmap_head;
3173 dirty_bitmap = vmalloc(n); 3226 if (memslot->dirty_bitmap == dirty_bitmap)
3174 if (!dirty_bitmap) 3227 dirty_bitmap += n / sizeof(long);
3175 goto out;
3176 memset(dirty_bitmap, 0, n); 3228 memset(dirty_bitmap, 0, n);
3177 3229
3178 r = -ENOMEM; 3230 r = -ENOMEM;
3179 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3231 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
3180 if (!slots) { 3232 if (!slots)
3181 vfree(dirty_bitmap);
3182 goto out; 3233 goto out;
3183 }
3184 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3234 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
3185 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3235 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3236 slots->generation++;
3186 3237
3187 old_slots = kvm->memslots; 3238 old_slots = kvm->memslots;
3188 rcu_assign_pointer(kvm->memslots, slots); 3239 rcu_assign_pointer(kvm->memslots, slots);
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3195 spin_unlock(&kvm->mmu_lock); 3246 spin_unlock(&kvm->mmu_lock);
3196 3247
3197 r = -EFAULT; 3248 r = -EFAULT;
3198 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
3199 vfree(dirty_bitmap);
3200 goto out; 3250 goto out;
3201 }
3202 vfree(dirty_bitmap);
3203 } else { 3251 } else {
3204 r = -EFAULT; 3252 r = -EFAULT;
3205 if (clear_user(log->dirty_bitmap, n)) 3253 if (clear_user(log->dirty_bitmap, n))
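The two dirty-log hunks above drop the per-call vmalloc() of a scratch bitmap in favour of double buffering: the memslot keeps one allocation of twice the bitmap size (dirty_bitmap_head), and each GET_DIRTY_LOG call zeroes the idle half, publishes it as the live bitmap, and copies the previously live half out to user space. The following is a minimal single-threaded user-space sketch of just that flip; the struct and function names are illustrative rather than the KVM ones, and the real code publishes the switch through a new memslots array under SRCU instead of a plain store.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_slot {
	unsigned long *dirty_bitmap;      /* half currently being dirtied */
	unsigned long *dirty_bitmap_head; /* start of a 2 * nlongs allocation */
	size_t nlongs;
};

/* zero the idle half, make it live, and return the half to copy out */
static unsigned long *flip_dirty_bitmap(struct demo_slot *s)
{
	unsigned long *old = s->dirty_bitmap;
	unsigned long *fresh = s->dirty_bitmap_head;

	if (old == fresh)                 /* first half was live, switch to second */
		fresh += s->nlongs;

	memset(fresh, 0, s->nlongs * sizeof(*fresh));
	s->dirty_bitmap = fresh;          /* KVM publishes this via a new memslots array */
	return old;
}

int main(void)
{
	struct demo_slot s = { .nlongs = 4 };
	unsigned long *snapshot;

	s.dirty_bitmap_head = calloc(2 * s.nlongs, sizeof(unsigned long));
	if (!s.dirty_bitmap_head)
		return 1;
	s.dirty_bitmap = s.dirty_bitmap_head;

	s.dirty_bitmap[0] = 0xffUL;       /* pretend some pages were dirtied */
	snapshot = flip_dirty_bitmap(&s);
	printf("snapshot[0]=%#lx, live half is now the %s half\n", snapshot[0],
	       s.dirty_bitmap == s.dirty_bitmap_head ? "first" : "second");

	free(s.dirty_bitmap_head);
	return 0;
}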
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3266 if (vpic) { 3314 if (vpic) {
3267 r = kvm_ioapic_init(kvm); 3315 r = kvm_ioapic_init(kvm);
3268 if (r) { 3316 if (r) {
3317 mutex_lock(&kvm->slots_lock);
3269 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3318 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3270 &vpic->dev); 3319 &vpic->dev);
3320 mutex_unlock(&kvm->slots_lock);
3271 kfree(vpic); 3321 kfree(vpic);
3272 goto create_irqchip_unlock; 3322 goto create_irqchip_unlock;
3273 } 3323 }
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3278 smp_wmb(); 3328 smp_wmb();
3279 r = kvm_setup_default_irq_routing(kvm); 3329 r = kvm_setup_default_irq_routing(kvm);
3280 if (r) { 3330 if (r) {
3331 mutex_lock(&kvm->slots_lock);
3281 mutex_lock(&kvm->irq_lock); 3332 mutex_lock(&kvm->irq_lock);
3282 kvm_ioapic_destroy(kvm); 3333 kvm_ioapic_destroy(kvm);
3283 kvm_destroy_pic(kvm); 3334 kvm_destroy_pic(kvm);
3284 mutex_unlock(&kvm->irq_lock); 3335 mutex_unlock(&kvm->irq_lock);
3336 mutex_unlock(&kvm->slots_lock);
3285 } 3337 }
3286 create_irqchip_unlock: 3338 create_irqchip_unlock:
3287 mutex_unlock(&kvm->lock); 3339 mutex_unlock(&kvm->lock);
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3557static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3609static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3558{ 3610{
3559 gpa_t t_gpa; 3611 gpa_t t_gpa;
3560 u32 error; 3612 struct x86_exception exception;
3561 3613
3562 BUG_ON(!mmu_is_nested(vcpu)); 3614 BUG_ON(!mmu_is_nested(vcpu));
3563 3615
3564 /* NPT walks are always user-walks */ 3616 /* NPT walks are always user-walks */
3565 access |= PFERR_USER_MASK; 3617 access |= PFERR_USER_MASK;
3566 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); 3618 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3567 if (t_gpa == UNMAPPED_GVA)
3568 vcpu->arch.fault.nested = true;
3569 3619
3570 return t_gpa; 3620 return t_gpa;
3571} 3621}
3572 3622
3573gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3623gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3624 struct x86_exception *exception)
3574{ 3625{
3575 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3626 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3576 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3627 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3577} 3628}
3578 3629
3579 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3630 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3631 struct x86_exception *exception)
3580{ 3632{
3581 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3633 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3582 access |= PFERR_FETCH_MASK; 3634 access |= PFERR_FETCH_MASK;
3583 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3635 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3584} 3636}
3585 3637
3586gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3638gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3639 struct x86_exception *exception)
3587{ 3640{
3588 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3641 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3589 access |= PFERR_WRITE_MASK; 3642 access |= PFERR_WRITE_MASK;
3590 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3643 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3591} 3644}
3592 3645
3593/* uses this to access any guest's mapped memory without checking CPL */ 3646/* uses this to access any guest's mapped memory without checking CPL */
3594gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3647gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3648 struct x86_exception *exception)
3595{ 3649{
3596 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); 3650 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3597} 3651}
3598 3652
3599static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3653static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3600 struct kvm_vcpu *vcpu, u32 access, 3654 struct kvm_vcpu *vcpu, u32 access,
3601 u32 *error) 3655 struct x86_exception *exception)
3602{ 3656{
3603 void *data = val; 3657 void *data = val;
3604 int r = X86EMUL_CONTINUE; 3658 int r = X86EMUL_CONTINUE;
3605 3659
3606 while (bytes) { 3660 while (bytes) {
3607 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3661 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3608 error); 3662 exception);
3609 unsigned offset = addr & (PAGE_SIZE-1); 3663 unsigned offset = addr & (PAGE_SIZE-1);
3610 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3664 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3611 int ret; 3665 int ret;
3612 3666
3613 if (gpa == UNMAPPED_GVA) { 3667 if (gpa == UNMAPPED_GVA)
3614 r = X86EMUL_PROPAGATE_FAULT; 3668 return X86EMUL_PROPAGATE_FAULT;
3615 goto out;
3616 }
3617 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3669 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3618 if (ret < 0) { 3670 if (ret < 0) {
3619 r = X86EMUL_IO_NEEDED; 3671 r = X86EMUL_IO_NEEDED;
@@ -3630,31 +3682,35 @@ out:
3630 3682
3631/* used for instruction fetching */ 3683/* used for instruction fetching */
3632static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3684static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3633 struct kvm_vcpu *vcpu, u32 *error) 3685 struct kvm_vcpu *vcpu,
3686 struct x86_exception *exception)
3634{ 3687{
3635 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3636 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3689 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3637 access | PFERR_FETCH_MASK, error); 3690 access | PFERR_FETCH_MASK,
3691 exception);
3638} 3692}
3639 3693
3640static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3694static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3641 struct kvm_vcpu *vcpu, u32 *error) 3695 struct kvm_vcpu *vcpu,
3696 struct x86_exception *exception)
3642{ 3697{
3643 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3698 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3644 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3699 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3645 error); 3700 exception);
3646} 3701}
3647 3702
3648static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3703static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3649 struct kvm_vcpu *vcpu, u32 *error) 3704 struct kvm_vcpu *vcpu,
3705 struct x86_exception *exception)
3650{ 3706{
3651 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3707 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3652} 3708}
3653 3709
3654static int kvm_write_guest_virt_system(gva_t addr, void *val, 3710static int kvm_write_guest_virt_system(gva_t addr, void *val,
3655 unsigned int bytes, 3711 unsigned int bytes,
3656 struct kvm_vcpu *vcpu, 3712 struct kvm_vcpu *vcpu,
3657 u32 *error) 3713 struct x86_exception *exception)
3658{ 3714{
3659 void *data = val; 3715 void *data = val;
3660 int r = X86EMUL_CONTINUE; 3716 int r = X86EMUL_CONTINUE;
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3662 while (bytes) { 3718 while (bytes) {
3663 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3719 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3664 PFERR_WRITE_MASK, 3720 PFERR_WRITE_MASK,
3665 error); 3721 exception);
3666 unsigned offset = addr & (PAGE_SIZE-1); 3722 unsigned offset = addr & (PAGE_SIZE-1);
3667 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3723 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3668 int ret; 3724 int ret;
3669 3725
3670 if (gpa == UNMAPPED_GVA) { 3726 if (gpa == UNMAPPED_GVA)
3671 r = X86EMUL_PROPAGATE_FAULT; 3727 return X86EMUL_PROPAGATE_FAULT;
3672 goto out;
3673 }
3674 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3728 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3675 if (ret < 0) { 3729 if (ret < 0) {
3676 r = X86EMUL_IO_NEEDED; 3730 r = X86EMUL_IO_NEEDED;
@@ -3688,7 +3742,7 @@ out:
3688static int emulator_read_emulated(unsigned long addr, 3742static int emulator_read_emulated(unsigned long addr,
3689 void *val, 3743 void *val,
3690 unsigned int bytes, 3744 unsigned int bytes,
3691 unsigned int *error_code, 3745 struct x86_exception *exception,
3692 struct kvm_vcpu *vcpu) 3746 struct kvm_vcpu *vcpu)
3693{ 3747{
3694 gpa_t gpa; 3748 gpa_t gpa;
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
3701 return X86EMUL_CONTINUE; 3755 return X86EMUL_CONTINUE;
3702 } 3756 }
3703 3757
3704 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3758 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3705 3759
3706 if (gpa == UNMAPPED_GVA) 3760 if (gpa == UNMAPPED_GVA)
3707 return X86EMUL_PROPAGATE_FAULT; 3761 return X86EMUL_PROPAGATE_FAULT;
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
3710 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3764 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3711 goto mmio; 3765 goto mmio;
3712 3766
3713 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3767 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
3714 == X86EMUL_CONTINUE) 3768 == X86EMUL_CONTINUE)
3715 return X86EMUL_CONTINUE; 3769 return X86EMUL_CONTINUE;
3716 3770
3717mmio: 3771mmio:
@@ -3735,7 +3789,7 @@ mmio:
3735} 3789}
3736 3790
3737int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3791int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3738 const void *val, int bytes) 3792 const void *val, int bytes)
3739{ 3793{
3740 int ret; 3794 int ret;
3741 3795
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3749static int emulator_write_emulated_onepage(unsigned long addr, 3803static int emulator_write_emulated_onepage(unsigned long addr,
3750 const void *val, 3804 const void *val,
3751 unsigned int bytes, 3805 unsigned int bytes,
3752 unsigned int *error_code, 3806 struct x86_exception *exception,
3753 struct kvm_vcpu *vcpu) 3807 struct kvm_vcpu *vcpu)
3754{ 3808{
3755 gpa_t gpa; 3809 gpa_t gpa;
3756 3810
3757 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3811 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3758 3812
3759 if (gpa == UNMAPPED_GVA) 3813 if (gpa == UNMAPPED_GVA)
3760 return X86EMUL_PROPAGATE_FAULT; 3814 return X86EMUL_PROPAGATE_FAULT;
@@ -3787,7 +3841,7 @@ mmio:
3787int emulator_write_emulated(unsigned long addr, 3841int emulator_write_emulated(unsigned long addr,
3788 const void *val, 3842 const void *val,
3789 unsigned int bytes, 3843 unsigned int bytes,
3790 unsigned int *error_code, 3844 struct x86_exception *exception,
3791 struct kvm_vcpu *vcpu) 3845 struct kvm_vcpu *vcpu)
3792{ 3846{
3793 /* Crossing a page boundary? */ 3847 /* Crossing a page boundary? */
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
3795 int rc, now; 3849 int rc, now;
3796 3850
3797 now = -addr & ~PAGE_MASK; 3851 now = -addr & ~PAGE_MASK;
3798 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3852 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3799 vcpu); 3853 vcpu);
3800 if (rc != X86EMUL_CONTINUE) 3854 if (rc != X86EMUL_CONTINUE)
3801 return rc; 3855 return rc;
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
3803 val += now; 3857 val += now;
3804 bytes -= now; 3858 bytes -= now;
3805 } 3859 }
3806 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3860 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3807 vcpu); 3861 vcpu);
3808} 3862}
3809 3863
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3821 const void *old, 3875 const void *old,
3822 const void *new, 3876 const void *new,
3823 unsigned int bytes, 3877 unsigned int bytes,
3824 unsigned int *error_code, 3878 struct x86_exception *exception,
3825 struct kvm_vcpu *vcpu) 3879 struct kvm_vcpu *vcpu)
3826{ 3880{
3827 gpa_t gpa; 3881 gpa_t gpa;
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3879emul_write: 3933emul_write:
3880 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3934 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3881 3935
3882 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3936 return emulator_write_emulated(addr, new, bytes, exception, vcpu);
3883} 3937}
3884 3938
3885static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3904 if (vcpu->arch.pio.count) 3958 if (vcpu->arch.pio.count)
3905 goto data_avail; 3959 goto data_avail;
3906 3960
3907 trace_kvm_pio(0, port, size, 1); 3961 trace_kvm_pio(0, port, size, count);
3908 3962
3909 vcpu->arch.pio.port = port; 3963 vcpu->arch.pio.port = port;
3910 vcpu->arch.pio.in = 1; 3964 vcpu->arch.pio.in = 1;
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3932 const void *val, unsigned int count, 3986 const void *val, unsigned int count,
3933 struct kvm_vcpu *vcpu) 3987 struct kvm_vcpu *vcpu)
3934{ 3988{
3935 trace_kvm_pio(1, port, size, 1); 3989 trace_kvm_pio(1, port, size, count);
3936 3990
3937 vcpu->arch.pio.port = port; 3991 vcpu->arch.pio.port = port;
3938 vcpu->arch.pio.in = 0; 3992 vcpu->arch.pio.in = 0;
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3973 return X86EMUL_CONTINUE; 4027 return X86EMUL_CONTINUE;
3974 4028
3975 if (kvm_x86_ops->has_wbinvd_exit()) { 4029 if (kvm_x86_ops->has_wbinvd_exit()) {
3976 preempt_disable(); 4030 int cpu = get_cpu();
4031
4032 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3977 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4033 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3978 wbinvd_ipi, NULL, 1); 4034 wbinvd_ipi, NULL, 1);
3979 preempt_enable(); 4035 put_cpu();
3980 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4036 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3981 } 4037 } else
3982 wbinvd(); 4038 wbinvd();
3983 return X86EMUL_CONTINUE; 4039 return X86EMUL_CONTINUE;
3984} 4040}
3985EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4041EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4019 value = vcpu->arch.cr2; 4075 value = vcpu->arch.cr2;
4020 break; 4076 break;
4021 case 3: 4077 case 3:
4022 value = vcpu->arch.cr3; 4078 value = kvm_read_cr3(vcpu);
4023 break; 4079 break;
4024 case 4: 4080 case 4:
4025 value = kvm_read_cr4(vcpu); 4081 value = kvm_read_cr4(vcpu);
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4053 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4109 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4054 break; 4110 break;
4055 case 8: 4111 case 8:
4056 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4112 res = kvm_set_cr8(vcpu, val);
4057 break; 4113 break;
4058 default: 4114 default:
4059 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4115 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4206static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4262static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4207{ 4263{
4208 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4264 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4209 if (ctxt->exception == PF_VECTOR) 4265 if (ctxt->exception.vector == PF_VECTOR)
4210 kvm_propagate_fault(vcpu); 4266 kvm_propagate_fault(vcpu, &ctxt->exception);
4211 else if (ctxt->error_code_valid) 4267 else if (ctxt->exception.error_code_valid)
4212 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4268 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4269 ctxt->exception.error_code);
4213 else 4270 else
4214 kvm_queue_exception(vcpu, ctxt->exception); 4271 kvm_queue_exception(vcpu, ctxt->exception.vector);
4215} 4272}
4216 4273
4217static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4274static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4267 4324
4268static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4325static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4269{ 4326{
4327 int r = EMULATE_DONE;
4328
4270 ++vcpu->stat.insn_emulation_fail; 4329 ++vcpu->stat.insn_emulation_fail;
4271 trace_kvm_emulate_insn_failed(vcpu); 4330 trace_kvm_emulate_insn_failed(vcpu);
4272 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4331 if (!is_guest_mode(vcpu)) {
4273 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4332 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4274 vcpu->run->internal.ndata = 0; 4333 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4334 vcpu->run->internal.ndata = 0;
4335 r = EMULATE_FAIL;
4336 }
4275 kvm_queue_exception(vcpu, UD_VECTOR); 4337 kvm_queue_exception(vcpu, UD_VECTOR);
4276 return EMULATE_FAIL; 4338
4339 return r;
4277} 4340}
4278 4341
4279static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4342static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4302 return false; 4365 return false;
4303} 4366}
4304 4367
4305int emulate_instruction(struct kvm_vcpu *vcpu, 4368int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4306 unsigned long cr2, 4369 unsigned long cr2,
4307 u16 error_code, 4370 int emulation_type,
4308 int emulation_type) 4371 void *insn,
4372 int insn_len)
4309{ 4373{
4310 int r; 4374 int r;
4311 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4375 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4323 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4387 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4324 init_emulate_ctxt(vcpu); 4388 init_emulate_ctxt(vcpu);
4325 vcpu->arch.emulate_ctxt.interruptibility = 0; 4389 vcpu->arch.emulate_ctxt.interruptibility = 0;
4326 vcpu->arch.emulate_ctxt.exception = -1; 4390 vcpu->arch.emulate_ctxt.have_exception = false;
4327 vcpu->arch.emulate_ctxt.perm_ok = false; 4391 vcpu->arch.emulate_ctxt.perm_ok = false;
4328 4392
4329 r = x86_decode_insn(&vcpu->arch.emulate_ctxt); 4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4330 if (r == X86EMUL_PROPAGATE_FAULT) 4394 if (r == X86EMUL_PROPAGATE_FAULT)
4331 goto done; 4395 goto done;
4332 4396
@@ -4389,7 +4453,7 @@ restart:
4389 } 4453 }
4390 4454
4391done: 4455done:
4392 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4456 if (vcpu->arch.emulate_ctxt.have_exception) {
4393 inject_emulated_exception(vcpu); 4457 inject_emulated_exception(vcpu);
4394 r = EMULATE_DONE; 4458 r = EMULATE_DONE;
4395 } else if (vcpu->arch.pio.count) { 4459 } else if (vcpu->arch.pio.count) {
@@ -4413,7 +4477,7 @@ done:
4413 4477
4414 return r; 4478 return r;
4415} 4479}
4416EXPORT_SYMBOL_GPL(emulate_instruction); 4480EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4417 4481
4418int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4482int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4419{ 4483{
@@ -4427,7 +4491,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4427 4491
4428static void tsc_bad(void *info) 4492static void tsc_bad(void *info)
4429{ 4493{
4430 __get_cpu_var(cpu_tsc_khz) = 0; 4494 __this_cpu_write(cpu_tsc_khz, 0);
4431} 4495}
4432 4496
4433static void tsc_khz_changed(void *data) 4497static void tsc_khz_changed(void *data)
@@ -4441,7 +4505,7 @@ static void tsc_khz_changed(void *data)
4441 khz = cpufreq_quick_get(raw_smp_processor_id()); 4505 khz = cpufreq_quick_get(raw_smp_processor_id());
4442 if (!khz) 4506 if (!khz)
4443 khz = tsc_khz; 4507 khz = tsc_khz;
4444 __get_cpu_var(cpu_tsc_khz) = khz; 4508 __this_cpu_write(cpu_tsc_khz, khz);
4445} 4509}
4446 4510
4447static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4511static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
4653 4717
4654 kvm_x86_ops = ops; 4718 kvm_x86_ops = ops;
4655 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4719 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4656 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4657 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4720 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4658 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4721 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4659 4722
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5116 vcpu->fpu_active = 0; 5179 vcpu->fpu_active = 0;
5117 kvm_x86_ops->fpu_deactivate(vcpu); 5180 kvm_x86_ops->fpu_deactivate(vcpu);
5118 } 5181 }
5182 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5183 /* Page is swapped out. Do synthetic halt */
5184 vcpu->arch.apf.halted = true;
5185 r = 1;
5186 goto out;
5187 }
5119 } 5188 }
5120 5189
5121 r = kvm_mmu_reload(vcpu); 5190 r = kvm_mmu_reload(vcpu);
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5244 5313
5245 r = 1; 5314 r = 1;
5246 while (r > 0) { 5315 while (r > 0) {
5247 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5316 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5317 !vcpu->arch.apf.halted)
5248 r = vcpu_enter_guest(vcpu); 5318 r = vcpu_enter_guest(vcpu);
5249 else { 5319 else {
5250 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5257 vcpu->arch.mp_state = 5327 vcpu->arch.mp_state =
5258 KVM_MP_STATE_RUNNABLE; 5328 KVM_MP_STATE_RUNNABLE;
5259 case KVM_MP_STATE_RUNNABLE: 5329 case KVM_MP_STATE_RUNNABLE:
5330 vcpu->arch.apf.halted = false;
5260 break; 5331 break;
5261 case KVM_MP_STATE_SIPI_RECEIVED: 5332 case KVM_MP_STATE_SIPI_RECEIVED:
5262 default: 5333 default:
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5278 vcpu->run->exit_reason = KVM_EXIT_INTR; 5349 vcpu->run->exit_reason = KVM_EXIT_INTR;
5279 ++vcpu->stat.request_irq_exits; 5350 ++vcpu->stat.request_irq_exits;
5280 } 5351 }
5352
5353 kvm_check_async_pf_completion(vcpu);
5354
5281 if (signal_pending(current)) { 5355 if (signal_pending(current)) {
5282 r = -EINTR; 5356 r = -EINTR;
5283 vcpu->run->exit_reason = KVM_EXIT_INTR; 5357 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5302 int r; 5376 int r;
5303 sigset_t sigsaved; 5377 sigset_t sigsaved;
5304 5378
5379 if (!tsk_used_math(current) && init_fpu(current))
5380 return -ENOMEM;
5381
5305 if (vcpu->sigset_active) 5382 if (vcpu->sigset_active)
5306 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5383 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5307 5384
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5313 } 5390 }
5314 5391
5315 /* re-sync apic's tpr */ 5392 /* re-sync apic's tpr */
5316 if (!irqchip_in_kernel(vcpu->kvm)) 5393 if (!irqchip_in_kernel(vcpu->kvm)) {
5317 kvm_set_cr8(vcpu, kvm_run->cr8); 5394 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5395 r = -EINVAL;
5396 goto out;
5397 }
5398 }
5318 5399
5319 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5400 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
5320 if (vcpu->mmio_needed) { 5401 if (vcpu->mmio_needed) {
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5323 vcpu->mmio_needed = 0; 5404 vcpu->mmio_needed = 0;
5324 } 5405 }
5325 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5326 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 5407 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5327 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5328 if (r != EMULATE_DONE) { 5409 if (r != EMULATE_DONE) {
5329 r = 0; 5410 r = 0;
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5436 5517
5437 sregs->cr0 = kvm_read_cr0(vcpu); 5518 sregs->cr0 = kvm_read_cr0(vcpu);
5438 sregs->cr2 = vcpu->arch.cr2; 5519 sregs->cr2 = vcpu->arch.cr2;
5439 sregs->cr3 = vcpu->arch.cr3; 5520 sregs->cr3 = kvm_read_cr3(vcpu);
5440 sregs->cr4 = kvm_read_cr4(vcpu); 5521 sregs->cr4 = kvm_read_cr4(vcpu);
5441 sregs->cr8 = kvm_get_cr8(vcpu); 5522 sregs->cr8 = kvm_get_cr8(vcpu);
5442 sregs->efer = vcpu->arch.efer; 5523 sregs->efer = vcpu->arch.efer;
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5504 kvm_x86_ops->set_gdt(vcpu, &dt); 5585 kvm_x86_ops->set_gdt(vcpu, &dt);
5505 5586
5506 vcpu->arch.cr2 = sregs->cr2; 5587 vcpu->arch.cr2 = sregs->cr2;
5507 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5588 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5508 vcpu->arch.cr3 = sregs->cr3; 5589 vcpu->arch.cr3 = sregs->cr3;
5590 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5509 5591
5510 kvm_set_cr8(vcpu, sregs->cr8); 5592 kvm_set_cr8(vcpu, sregs->cr8);
5511 5593
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5522 if (sregs->cr4 & X86_CR4_OSXSAVE) 5604 if (sregs->cr4 & X86_CR4_OSXSAVE)
5523 update_cpuid(vcpu); 5605 update_cpuid(vcpu);
5524 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5525 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5526 mmu_reset_needed = 1; 5608 mmu_reset_needed = 1;
5527 } 5609 }
5528 5610
@@ -5773,6 +5855,8 @@ free_vcpu:
5773 5855
5774void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5856void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5775{ 5857{
5858 vcpu->arch.apf.msr_val = 0;
5859
5776 vcpu_load(vcpu); 5860 vcpu_load(vcpu);
5777 kvm_mmu_unload(vcpu); 5861 kvm_mmu_unload(vcpu);
5778 vcpu_put(vcpu); 5862 vcpu_put(vcpu);
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5792 vcpu->arch.dr7 = DR7_FIXED_1; 5876 vcpu->arch.dr7 = DR7_FIXED_1;
5793 5877
5794 kvm_make_request(KVM_REQ_EVENT, vcpu); 5878 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0;
5880
5881 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false;
5795 5884
5796 return kvm_x86_ops->vcpu_reset(vcpu); 5885 return kvm_x86_ops->vcpu_reset(vcpu);
5797} 5886}
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5881 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5970 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5882 goto fail_free_mce_banks; 5971 goto fail_free_mce_banks;
5883 5972
5973 kvm_async_pf_hash_reset(vcpu);
5974
5884 return 0; 5975 return 0;
5885fail_free_mce_banks: 5976fail_free_mce_banks:
5886 kfree(vcpu->arch.mce_banks); 5977 kfree(vcpu->arch.mce_banks);
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5906 free_page((unsigned long)vcpu->arch.pio_data); 5997 free_page((unsigned long)vcpu->arch.pio_data);
5907} 5998}
5908 5999
5909struct kvm *kvm_arch_create_vm(void) 6000int kvm_arch_init_vm(struct kvm *kvm)
5910{ 6001{
5911 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5912
5913 if (!kvm)
5914 return ERR_PTR(-ENOMEM);
5915
5916 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6002 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5917 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6003 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5918 6004
@@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
5921 6007
5922 spin_lock_init(&kvm->arch.tsc_write_lock); 6008 spin_lock_init(&kvm->arch.tsc_write_lock);
5923 6009
5924 return kvm; 6010 return 0;
5925} 6011}
5926 6012
5927static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6013static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5939 /* 6025 /*
5940 * Unpin any mmu pages first. 6026 * Unpin any mmu pages first.
5941 */ 6027 */
5942 kvm_for_each_vcpu(i, vcpu, kvm) 6028 kvm_for_each_vcpu(i, vcpu, kvm) {
6029 kvm_clear_async_pf_completion_queue(vcpu);
5943 kvm_unload_vcpu_mmu(vcpu); 6030 kvm_unload_vcpu_mmu(vcpu);
6031 }
5944 kvm_for_each_vcpu(i, vcpu, kvm) 6032 kvm_for_each_vcpu(i, vcpu, kvm)
5945 kvm_arch_vcpu_free(vcpu); 6033 kvm_arch_vcpu_free(vcpu);
5946 6034
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5964 kfree(kvm->arch.vpic); 6052 kfree(kvm->arch.vpic);
5965 kfree(kvm->arch.vioapic); 6053 kfree(kvm->arch.vioapic);
5966 kvm_free_vcpus(kvm); 6054 kvm_free_vcpus(kvm);
5967 kvm_free_physmem(kvm);
5968 if (kvm->arch.apic_access_page) 6055 if (kvm->arch.apic_access_page)
5969 put_page(kvm->arch.apic_access_page); 6056 put_page(kvm->arch.apic_access_page);
5970 if (kvm->arch.ept_identity_pagetable) 6057 if (kvm->arch.ept_identity_pagetable)
5971 put_page(kvm->arch.ept_identity_pagetable); 6058 put_page(kvm->arch.ept_identity_pagetable);
5972 cleanup_srcu_struct(&kvm->srcu);
5973 kfree(kvm);
5974} 6059}
5975 6060
5976int kvm_arch_prepare_memory_region(struct kvm *kvm, 6061int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
6051 6136
6052int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6137int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6053{ 6138{
6054 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6139 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6140 !vcpu->arch.apf.halted)
6141 || !list_empty_careful(&vcpu->async_pf.done)
6055 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6142 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6056 || vcpu->arch.nmi_pending || 6143 || vcpu->arch.nmi_pending ||
6057 (kvm_arch_interrupt_allowed(vcpu) && 6144 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6110} 6197}
6111EXPORT_SYMBOL_GPL(kvm_set_rflags); 6198EXPORT_SYMBOL_GPL(kvm_set_rflags);
6112 6199
6200void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6201{
6202 int r;
6203
6204 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6205 is_error_page(work->page))
6206 return;
6207
6208 r = kvm_mmu_reload(vcpu);
6209 if (unlikely(r))
6210 return;
6211
6212 if (!vcpu->arch.mmu.direct_map &&
6213 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6214 return;
6215
6216 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6217}
6218
6219static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6220{
6221 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6222}
6223
6224static inline u32 kvm_async_pf_next_probe(u32 key)
6225{
6226 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6227}
6228
6229static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6230{
6231 u32 key = kvm_async_pf_hash_fn(gfn);
6232
6233 while (vcpu->arch.apf.gfns[key] != ~0)
6234 key = kvm_async_pf_next_probe(key);
6235
6236 vcpu->arch.apf.gfns[key] = gfn;
6237}
6238
6239static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6240{
6241 int i;
6242 u32 key = kvm_async_pf_hash_fn(gfn);
6243
6244 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6245 (vcpu->arch.apf.gfns[key] != gfn &&
6246 vcpu->arch.apf.gfns[key] != ~0); i++)
6247 key = kvm_async_pf_next_probe(key);
6248
6249 return key;
6250}
6251
6252bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6253{
6254 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6255}
6256
6257static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6258{
6259 u32 i, j, k;
6260
6261 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6262 while (true) {
6263 vcpu->arch.apf.gfns[i] = ~0;
6264 do {
6265 j = kvm_async_pf_next_probe(j);
6266 if (vcpu->arch.apf.gfns[j] == ~0)
6267 return;
6268 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6269 /*
6270 * k lies cyclically in ]i,j]
6271 * | i.k.j |
6272 * |....j i.k.| or |.k..j i...|
6273 */
6274 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6275 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6276 i = j;
6277 }
6278}
6279
6280static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6281{
6282
6283 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6284 sizeof(val));
6285}
6286
6287void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6288 struct kvm_async_pf *work)
6289{
6290 struct x86_exception fault;
6291
6292 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6293 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6294
6295 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6296 (vcpu->arch.apf.send_user_only &&
6297 kvm_x86_ops->get_cpl(vcpu) == 0))
6298 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6299 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6300 fault.vector = PF_VECTOR;
6301 fault.error_code_valid = true;
6302 fault.error_code = 0;
6303 fault.nested_page_fault = false;
6304 fault.address = work->arch.token;
6305 kvm_inject_page_fault(vcpu, &fault);
6306 }
6307}
6308
6309void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6310 struct kvm_async_pf *work)
6311{
6312 struct x86_exception fault;
6313
6314 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6315 if (is_error_page(work->page))
6316 work->arch.token = ~0; /* broadcast wakeup */
6317 else
6318 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6319
6320 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6321 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6322 fault.vector = PF_VECTOR;
6323 fault.error_code_valid = true;
6324 fault.error_code = 0;
6325 fault.nested_page_fault = false;
6326 fault.address = work->arch.token;
6327 kvm_inject_page_fault(vcpu, &fault);
6328 }
6329 vcpu->arch.apf.halted = false;
6330}
6331
6332bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6333{
6334 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6335 return true;
6336 else
6337 return !kvm_event_needs_reinjection(vcpu) &&
6338 kvm_x86_ops->interrupt_allowed(vcpu);
6339}
6340
6113EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6341EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
6114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6342EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6115EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6343EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
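The async page fault code added above tracks outstanding guest GFNs in a small open-addressed hash table (kvm_add_async_pf_gfn / kvm_async_pf_gfn_slot / kvm_del_async_pf_gfn), where ~0 marks an empty slot and deletion has to re-pack the probe cluster so later lookups still find colliding entries. Below is a minimal user-space sketch of that probe-and-repack scheme; the table size and the multiplicative hash are stand-ins (the kernel sizes the table by ASYNC_PF_PER_VCPU and hashes with hash_32()), so treat it as an illustration rather than the KVM code itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE 64                     /* power of two, stands in for ASYNC_PF_PER_VCPU */
#define EMPTY      (~(uint64_t)0)         /* ~0 marks an unused slot, as in the kernel table */

static uint64_t slots[TABLE_SIZE];

static uint32_t hash_fn(uint64_t gfn)
{
	/* simplified stand-in for hash_32(); any mix into [0, TABLE_SIZE) works here */
	return (uint32_t)(gfn * 2654435761u) & (TABLE_SIZE - 1);
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);

	while (slots[key] != EMPTY)
		key = next_probe(key);
	slots[key] = gfn;
}

static uint32_t gfn_slot(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);
	int i;

	for (i = 0; i < TABLE_SIZE &&
		    slots[key] != gfn && slots[key] != EMPTY; i++)
		key = next_probe(key);
	return key;
}

static bool find_gfn(uint64_t gfn)
{
	return slots[gfn_slot(gfn)] == gfn;
}

static void del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = gfn_slot(gfn);
	while (true) {
		slots[i] = EMPTY;
		do {
			j = next_probe(j);
			if (slots[j] == EMPTY)
				return;
			k = hash_fn(slots[j]);
			/* k in ]i, j] cyclically: slots[j] stays reachable, keep scanning */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		/* slots[j] would become unreachable across the hole at i: move it back */
		slots[i] = slots[j];
		i = j;
	}
}

int main(void)
{
	uint64_t g;

	for (g = 0; g < TABLE_SIZE; g++)
		slots[g] = EMPTY;

	add_gfn(0x100);
	add_gfn(0x100 + TABLE_SIZE);      /* forced collision: same home slot */
	del_gfn(0x100);                   /* deletion must keep the collider findable */
	printf("collider still found: %d\n", find_gfn(0x100 + TABLE_SIZE));
	return 0;
}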
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..fc45ba887d05 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops)
121 asm("mull %%edx" 121 asm("mull %%edx"
122 :"=d" (xloops), "=&a" (d0) 122 :"=d" (xloops), "=&a" (d0)
123 :"1" (xloops), "0" 123 :"1" (xloops), "0"
124 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); 124 (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
125 125
126 __delay(++xloops); 126 __delay(++xloops);
127} 127}
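The conversions from __get_cpu_var()/current_cpu_data to the this_cpu accessors recur throughout this series (cpu_tsc_khz in tsc_bad()/tsc_khz_changed() above, loops_per_jiffy here, switch_index and cpu_info in the oprofile code below) and all follow one pattern: instead of computing this CPU's per-CPU address and then dereferencing it, the access becomes a single segment-relative load or store on x86. A minimal sketch with a made-up per-CPU variable (pcpu_demo is hypothetical; the accessors are the ones from <linux/percpu.h>):

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned int, pcpu_demo);

/* old style: form the per-CPU address, then load through it */
static unsigned int demo_read_old(void)
{
	return __get_cpu_var(pcpu_demo);
}

/*
 * new style: one segment-relative load; the caller must already have dealt
 * with preemption (or not care which CPU's instance it hits), exactly as
 * was the case with __get_cpu_var()
 */
static unsigned int demo_read_new(void)
{
	return __this_cpu_read(pcpu_demo);
}

static void demo_write_new(unsigned int val)
{
	__this_cpu_write(pcpu_demo, val);
}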
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f89b5bb4e93f..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
45#include <asm/bugs.h> 45#include <asm/bugs.h>
46#include <asm/tlb.h> 46#include <asm/tlb.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/olpc_ofw.h>
48#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
49#include <asm/sections.h> 50#include <asm/sections.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
@@ -715,6 +716,7 @@ void __init paging_init(void)
715 /* 716 /*
716 * NOTE: at this point the bootmem allocator is fully available. 717 * NOTE: at this point the bootmem allocator is fully available.
717 */ 718 */
719 olpc_dt_build_devicetree();
718 sparse_init(); 720 sparse_init();
719 zone_sizes_init(); 721 zone_sizes_init();
720} 722}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
320 return changed; 320 return changed;
321} 321}
322 322
323#ifdef CONFIG_TRANSPARENT_HUGEPAGE
324int pmdp_set_access_flags(struct vm_area_struct *vma,
325 unsigned long address, pmd_t *pmdp,
326 pmd_t entry, int dirty)
327{
328 int changed = !pmd_same(*pmdp, entry);
329
330 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
331
332 if (changed && dirty) {
333 *pmdp = entry;
334 pmd_update_defer(vma->vm_mm, address, pmdp);
335 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
336 }
337
338 return changed;
339}
340#endif
341
323int ptep_test_and_clear_young(struct vm_area_struct *vma, 342int ptep_test_and_clear_young(struct vm_area_struct *vma,
324 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
325{ 344{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
335 return ret; 354 return ret;
336} 355}
337 356
357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
358int pmdp_test_and_clear_young(struct vm_area_struct *vma,
359 unsigned long addr, pmd_t *pmdp)
360{
361 int ret = 0;
362
363 if (pmd_young(*pmdp))
364 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
365 (unsigned long *)pmdp);
366
367 if (ret)
368 pmd_update(vma->vm_mm, addr, pmdp);
369
370 return ret;
371}
372#endif
373
338int ptep_clear_flush_young(struct vm_area_struct *vma, 374int ptep_clear_flush_young(struct vm_area_struct *vma,
339 unsigned long address, pte_t *ptep) 375 unsigned long address, pte_t *ptep)
340{ 376{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
347 return young; 383 return young;
348} 384}
349 385
386#ifdef CONFIG_TRANSPARENT_HUGEPAGE
387int pmdp_clear_flush_young(struct vm_area_struct *vma,
388 unsigned long address, pmd_t *pmdp)
389{
390 int young;
391
392 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
393
394 young = pmdp_test_and_clear_young(vma, address, pmdp);
395 if (young)
396 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
397
398 return young;
399}
400
401void pmdp_splitting_flush(struct vm_area_struct *vma,
402 unsigned long address, pmd_t *pmdp)
403{
404 int set;
405 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
406 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
407 (unsigned long *)pmdp);
408 if (set) {
409 pmd_update(vma->vm_mm, address, pmdp);
410 /* need tlb flush only to serialize against gup-fast */
411 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
412 }
413}
414#endif
415
350/** 416/**
351 * reserve_top_address - reserves a hole in the top of kernel address space 417 * reserve_top_address - reserves a hole in the top of kernel address space
352 * @reserve - size of hole to reserve 418 * @reserve - size of hole to reserve
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 358c8b9c96a7..e2b7b0c06cdf 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
65 65
66 switch (val) { 66 switch (val) {
67 case DIE_NMI: 67 case DIE_NMI:
68 case DIE_NMI_IPI:
69 if (ctr_running) 68 if (ctr_running)
70 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); 69 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
71 else if (!nmi_enabled) 70 else if (!nmi_enabled)
@@ -143,7 +142,7 @@ static inline int has_mux(void)
143 142
144inline int op_x86_phys_to_virt(int phys) 143inline int op_x86_phys_to_virt(int phys)
145{ 144{
146 return __get_cpu_var(switch_index) + phys; 145 return __this_cpu_read(switch_index) + phys;
147} 146}
148 147
149inline int op_x86_virt_to_phys(int virt) 148inline int op_x86_virt_to_phys(int virt)
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
361static struct notifier_block profile_exceptions_nb = { 360static struct notifier_block profile_exceptions_nb = {
362 .notifier_call = profile_exceptions_notify, 361 .notifier_call = profile_exceptions_notify,
363 .next = NULL, 362 .next = NULL,
364 .priority = 2 363 .priority = NMI_LOCAL_LOW_PRIOR,
365}; 364};
366 365
367static void nmi_cpu_restore_registers(struct op_msrs *msrs) 366static void nmi_cpu_restore_registers(struct op_msrs *msrs)
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index 0636dd93cef8..720bf5a53c51 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
38static struct notifier_block profile_timer_exceptions_nb = { 38static struct notifier_block profile_timer_exceptions_nb = {
39 .notifier_call = profile_timer_exceptions_notify, 39 .notifier_call = profile_timer_exceptions_notify,
40 .next = NULL, 40 .next = NULL,
41 .priority = 0 41 .priority = NMI_LOW_PRIOR,
42}; 42};
43 43
44static int timer_start(void) 44static int timer_start(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index d769cda54082..94b745045e45 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -95,8 +95,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
95 * counter width: 95 * counter width:
96 */ 96 */
97 if (!(eax.split.version_id == 0 && 97 if (!(eax.split.version_id == 0 &&
98 current_cpu_data.x86 == 6 && 98 __this_cpu_read(cpu_info.x86) == 6 &&
99 current_cpu_data.x86_model == 15)) { 99 __this_cpu_read(cpu_info.x86_model) == 15)) {
100 100
101 if (counter_width < eax.split.bit_width) 101 if (counter_width < eax.split.bit_width)
102 counter_width = eax.split.bit_width; 102 counter_width = eax.split.bit_width;
@@ -235,8 +235,8 @@ static void arch_perfmon_setup_counters(void)
235 eax.full = cpuid_eax(0xa); 235 eax.full = cpuid_eax(0xa);
236 236
237 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ 237 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
238 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 238 if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 &&
239 current_cpu_data.x86_model == 15) { 239 __this_cpu_read(cpu_info.x86_model) == 15) {
240 eax.split.version_id = 2; 240 eax.split.version_id = 2;
241 eax.split.num_counters = 2; 241 eax.split.num_counters = 2;
242 eax.split.bit_width = 40; 242 eax.split.bit_width = 40;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfbd..ab8269b0da29 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
9 * option) any later version. 9 * option) any later version.
10 */ 10 */
11 11
12#include <linux/acpi.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/dmi.h> 14#include <linux/dmi.h>
14#include <linux/pci.h> 15#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
25 u8 fbus, lbus; 26 u8 fbus, lbus;
26 int i; 27 int i;
27 28
29#ifdef CONFIG_ACPI
28 /* 30 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use 31 * We should get host bridge information from ACPI unless the BIOS
30 * this information if ACPI _CRS was used. Therefore, we don't bother 32 * doesn't support it.
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */ 33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
34 37
35 info = &pci_root_info[pci_root_num]; 38 info = &pci_root_info[pci_root_num];
36 pci_root_num++; 39 pci_root_num++;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f7c8a399978c..5fe75026ecc2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
22 22
23unsigned int pci_early_dump_regs; 23unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 24static int pci_bf_sort;
25static int smbios_type_b1_flag;
25int pci_routeirq; 26int pci_routeirq;
26int noioapicquirk; 27int noioapicquirk;
27#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS 28#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
185 return 0; 186 return 0;
186} 187}
187 188
189static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
190 void *private_data)
191{
192 u8 *d = (u8 *)dm + 4;
193
194 if (dm->type != 0xB1)
195 return;
196 switch (((*(u32 *)d) >> 9) & 0x03) {
197 case 0x00:
198 printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
199 break;
200 case 0x01: /* set pci=bfsort */
201 smbios_type_b1_flag = 1;
202 break;
203 case 0x02: /* do not set pci=bfsort */
204 smbios_type_b1_flag = 2;
205 break;
206 default:
207 break;
208 }
209}
210
211static int __devinit find_sort_method(const struct dmi_system_id *d)
212{
213 dmi_walk(read_dmi_type_b1, NULL);
214
215 if (smbios_type_b1_flag == 1) {
216 set_bf_sort(d);
217 return 0;
218 }
219 return -1;
220}
221
188/* 222/*
189 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 223 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
190 */ 224 */
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
213 }, 247 },
214#endif /* __i386__ */ 248#endif /* __i386__ */
215 { 249 {
250 .callback = find_sort_method,
251 .ident = "Dell System",
252 .matches = {
253 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
254 },
255 },
256 {
216 .callback = set_bf_sort, 257 .callback = set_bf_sort,
217 .ident = "Dell PowerEdge 1950", 258 .ident = "Dell PowerEdge 1950",
218 .matches = { 259 .matches = {
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9f9bfb705cf9..87e6c8323117 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: 592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
593 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
593 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..25bfdbb5b130 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -103,7 +103,7 @@ struct dw_spi_reg {
103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; 103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
104 104
105static u32 *pclk_spi0; 105static u32 *pclk_spi0;
106/* Always contains an accessable address, start with 0 */ 106/* Always contains an accessible address, start with 0 */
107static struct dw_spi_reg *pspi; 107static struct dw_spi_reg *pspi;
108 108
109static struct kmsg_dumper dw_dumper; 109static struct kmsg_dumper dw_dumper;
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c31b8fcb5a86..e797428b163b 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index f5442c03abc3..127775696d6c 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Support for features of the OLPC XO-1 laptop 2 * Support for features of the OLPC XO-1 laptop
3 * 3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
4 * Copyright (C) 2010 One Laptop per Child 5 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc. 6 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc. 7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
@@ -12,8 +13,6 @@
12 */ 13 */
13 14
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/pci_ids.h>
17#include <linux/platform_device.h> 16#include <linux/platform_device.h>
18#include <linux/pm.h> 17#include <linux/pm.h>
19 18
@@ -22,9 +21,6 @@
22 21
23#define DRV_NAME "olpc-xo1" 22#define DRV_NAME "olpc-xo1"
24 23
25#define PMS_BAR 4
26#define ACPI_BAR 5
27
28/* PMC registers (PMS block) */ 24/* PMC registers (PMS block) */
29#define PM_SCLK 0x10 25#define PM_SCLK 0x10
30#define PM_IN_SLPCTL 0x20 26#define PM_IN_SLPCTL 0x20
@@ -57,65 +53,67 @@ static void xo1_power_off(void)
57 outl(0x00002000, acpi_base + PM1_CNT); 53 outl(0x00002000, acpi_base + PM1_CNT);
58} 54}
59 55
60/* Read the base addresses from the PCI BAR info */ 56static int __devinit olpc_xo1_probe(struct platform_device *pdev)
61static int __devinit setup_bases(struct pci_dev *pdev)
62{ 57{
63 int r; 58 struct resource *res;
64 59
65 r = pci_enable_device_io(pdev); 60 /* don't run on non-XOs */
66 if (r) { 61 if (!machine_is_olpc())
67 dev_err(&pdev->dev, "can't enable device IO\n"); 62 return -ENODEV;
68 return r;
69 }
70 63
71 r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); 64 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
72 if (r) { 65 if (!res) {
73 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); 66 dev_err(&pdev->dev, "can't fetch device resource info\n");
74 return r; 67 return -EIO;
75 } 68 }
76 69
77 r = pci_request_region(pdev, PMS_BAR, DRV_NAME); 70 if (!request_region(res->start, resource_size(res), DRV_NAME)) {
78 if (r) { 71 dev_err(&pdev->dev, "can't request region\n");
79 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); 72 return -EIO;
80 pci_release_region(pdev, ACPI_BAR);
81 return r;
82 } 73 }
83 74
84 acpi_base = pci_resource_start(pdev, ACPI_BAR); 75 if (strcmp(pdev->name, "cs5535-pms") == 0)
85 pms_base = pci_resource_start(pdev, PMS_BAR); 76 pms_base = res->start;
77 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
78 acpi_base = res->start;
79
80 /* If we have both addresses, we can override the poweroff hook */
81 if (pms_base && acpi_base) {
82 pm_power_off = xo1_power_off;
83 printk(KERN_INFO "OLPC XO-1 support registered\n");
84 }
86 85
87 return 0; 86 return 0;
88} 87}
89 88
90static int __devinit olpc_xo1_probe(struct platform_device *pdev) 89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
91{ 90{
92 struct pci_dev *pcidev; 91 struct resource *r;
93 int r;
94
95 pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
96 NULL);
97 if (!pdev)
98 return -ENODEV;
99
100 r = setup_bases(pcidev);
101 if (r)
102 return r;
103 92
104 pm_power_off = xo1_power_off; 93 r = platform_get_resource(pdev, IORESOURCE_IO, 0);
94 release_region(r->start, resource_size(r));
105 95
106 printk(KERN_INFO "OLPC XO-1 support registered\n"); 96 if (strcmp(pdev->name, "cs5535-pms") == 0)
107 return 0; 97 pms_base = 0;
108} 98 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
99 acpi_base = 0;
109 100
110static int __devexit olpc_xo1_remove(struct platform_device *pdev)
111{
112 pm_power_off = NULL; 101 pm_power_off = NULL;
113 return 0; 102 return 0;
114} 103}
115 104
116static struct platform_driver olpc_xo1_driver = { 105static struct platform_driver cs5535_pms_drv = {
106 .driver = {
107 .name = "cs5535-pms",
108 .owner = THIS_MODULE,
109 },
110 .probe = olpc_xo1_probe,
111 .remove = __devexit_p(olpc_xo1_remove),
112};
113
114static struct platform_driver cs5535_acpi_drv = {
117 .driver = { 115 .driver = {
118 .name = DRV_NAME, 116 .name = "cs5535-acpi",
119 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
120 }, 118 },
121 .probe = olpc_xo1_probe, 119 .probe = olpc_xo1_probe,
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = {
124 122
125static int __init olpc_xo1_init(void) 123static int __init olpc_xo1_init(void)
126{ 124{
127 return platform_driver_register(&olpc_xo1_driver); 125 int r;
126
127 r = platform_driver_register(&cs5535_pms_drv);
128 if (r)
129 return r;
130
131 r = platform_driver_register(&cs5535_acpi_drv);
132 if (r)
133 platform_driver_unregister(&cs5535_pms_drv);
134
135 return r;
128} 136}
129 137
130static void __exit olpc_xo1_exit(void) 138static void __exit olpc_xo1_exit(void)
131{ 139{
132 platform_driver_unregister(&olpc_xo1_driver); 140 platform_driver_unregister(&cs5535_acpi_drv);
141 platform_driver_unregister(&cs5535_pms_drv);
133} 142}
134 143
135MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); 144MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
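The probe() above no longer claims PCI BARs itself; it binds to two platform devices named "cs5535-pms" and "cs5535-acpi" and reads the I/O base from their resources. Those devices are registered elsewhere, by whatever code owns the CS5535/CS5536 companion chip; a hedged sketch of what such a registration could look like, with an invented I/O base:

	#include <linux/err.h>
	#include <linux/ioport.h>
	#include <linux/platform_device.h>

	/* Illustration only: 0x9c00 is a made-up example base, and the real
	 * registration belongs to the companion-chip (MFD) code, not here. */
	static struct resource example_pms_res = {
		.start = 0x9c00,
		.end   = 0x9c00 + 0x7f,
		.flags = IORESOURCE_IO,
	};

	static int __init register_example_cs5535_pms(void)
	{
		struct platform_device *pdev;

		pdev = platform_device_register_simple("cs5535-pms", -1,
						       &example_pms_res, 1);
		return IS_ERR(pdev) ? PTR_ERR(pdev) : 0;
	}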
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000000..dab874647530
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,183 @@
1/*
2 * OLPC-specific OFW device tree support code.
3 *
4 * Paul Mackerras August 1996.
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
8 * {engebret|bergner}@us.ibm.com
9 *
10 * Adapted for sparc by David S. Miller davem@davemloft.net
11 * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bootmem.h>
21#include <linux/of.h>
22#include <linux/of_pdt.h>
23#include <asm/olpc_ofw.h>
24
25static phandle __init olpc_dt_getsibling(phandle node)
26{
27 const void *args[] = { (void *)node };
28 void *res[] = { &node };
29
30 if ((s32)node == -1)
31 return 0;
32
33 if (olpc_ofw("peer", args, res) || (s32)node == -1)
34 return 0;
35
36 return node;
37}
38
39static phandle __init olpc_dt_getchild(phandle node)
40{
41 const void *args[] = { (void *)node };
42 void *res[] = { &node };
43
44 if ((s32)node == -1)
45 return 0;
46
47 if (olpc_ofw("child", args, res) || (s32)node == -1) {
48 pr_err("PROM: %s: fetching child failed!\n", __func__);
49 return 0;
50 }
51
52 return node;
53}
54
55static int __init olpc_dt_getproplen(phandle node, const char *prop)
56{
57 const void *args[] = { (void *)node, prop };
58 int len;
59 void *res[] = { &len };
60
61 if ((s32)node == -1)
62 return -1;
63
64 if (olpc_ofw("getproplen", args, res)) {
65 pr_err("PROM: %s: getproplen failed!\n", __func__);
66 return -1;
67 }
68
69 return len;
70}
71
72static int __init olpc_dt_getproperty(phandle node, const char *prop,
73 char *buf, int bufsize)
74{
75 int plen;
76
77 plen = olpc_dt_getproplen(node, prop);
78 if (plen > bufsize || plen < 1) {
79 return -1;
80 } else {
81 const void *args[] = { (void *)node, prop, buf, (void *)plen };
82 void *res[] = { &plen };
83
84 if (olpc_ofw("getprop", args, res)) {
85 pr_err("PROM: %s: getprop failed!\n", __func__);
86 return -1;
87 }
88 }
89
90 return plen;
91}
92
93static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
94{
95 const void *args[] = { (void *)node, prev, buf };
96 int success;
97 void *res[] = { &success };
98
99 buf[0] = '\0';
100
101 if ((s32)node == -1)
102 return -1;
103
104 if (olpc_ofw("nextprop", args, res) || success != 1)
105 return -1;
106
107 return 0;
108}
109
110static int __init olpc_dt_pkg2path(phandle node, char *buf,
111 const int buflen, int *len)
112{
113 const void *args[] = { (void *)node, buf, (void *)buflen };
114 void *res[] = { len };
115
116 if ((s32)node == -1)
117 return -1;
118
119 if (olpc_ofw("package-to-path", args, res) || *len < 1)
120 return -1;
121
122 return 0;
123}
124
125static unsigned int prom_early_allocated __initdata;
126
127void * __init prom_early_alloc(unsigned long size)
128{
129 static u8 *mem;
130 static size_t free_mem;
131 void *res;
132
133 if (free_mem < size) {
134 const size_t chunk_size = max(PAGE_SIZE, size);
135
136 /*
137		 * To minimize the number of allocations, grab at least
138 * PAGE_SIZE of memory (that's an arbitrary choice that's
139 * fast enough on the platforms we care about while minimizing
140 * wasted bootmem) and hand off chunks of it to callers.
141 */
142 res = alloc_bootmem(chunk_size);
143 if (!res)
144 return NULL;
145 prom_early_allocated += chunk_size;
146 memset(res, 0, chunk_size);
147 free_mem = chunk_size;
148 mem = res;
149 }
150
151 /* allocate from the local cache */
152 free_mem -= size;
153 res = mem;
154 mem += size;
155 return res;
156}
157
158static struct of_pdt_ops prom_olpc_ops __initdata = {
159 .nextprop = olpc_dt_nextprop,
160 .getproplen = olpc_dt_getproplen,
161 .getproperty = olpc_dt_getproperty,
162 .getchild = olpc_dt_getchild,
163 .getsibling = olpc_dt_getsibling,
164 .pkg2path = olpc_dt_pkg2path,
165};
166
167void __init olpc_dt_build_devicetree(void)
168{
169 phandle root;
170
171 if (!olpc_ofw_is_installed())
172 return;
173
174 root = olpc_dt_getsibling(0);
175 if (!root) {
176 pr_err("PROM: unable to get root node from OFW!\n");
177 return;
178 }
179 of_pdt_build_devicetree(root, &prom_olpc_ops);
180
181 pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
182 prom_early_allocated);
183}
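prom_early_alloc() above is a simple never-freeing bump allocator: it carves small requests out of one bootmem chunk and only calls alloc_bootmem() again when the current chunk runs short. A user-space sketch of the same behaviour, with malloc() standing in for alloc_bootmem() and 4 KiB assumed for PAGE_SIZE:

	#include <stdlib.h>
	#include <string.h>

	#define SKETCH_PAGE_SIZE 4096UL

	static void *sketch_early_alloc(size_t size)
	{
		static unsigned char *mem;
		static size_t free_mem;
		void *res;

		if (free_mem < size) {
			/* grab a fresh, zeroed chunk of at least one page */
			size_t chunk = size > SKETCH_PAGE_SIZE ? size : SKETCH_PAGE_SIZE;

			mem = malloc(chunk);
			if (!mem)
				return NULL;
			memset(mem, 0, chunk);
			free_mem = chunk;
		}

		/* hand out the next carve-out from the current chunk */
		res = mem;
		mem += size;
		free_mem -= size;
		return res;
	}

Two successive 64-byte property allocations therefore come out of the same 4 KiB chunk; only a request larger than the remaining space triggers another bootmem call.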
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 787320464379..e7604f62870d 100644
--- a/arch/x86/platform/olpc/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -110,3 +110,8 @@ void __init olpc_ofw_detect(void)
110 (unsigned long)olpc_ofw_cif, (-start) >> 20); 110 (unsigned long)olpc_ofw_cif, (-start) >> 20);
111 reserve_top_address(-start); 111 reserve_top_address(-start);
112} 112}
113
114bool __init olpc_ofw_is_installed(void)
115{
116 return olpc_ofw_cif != NULL;
117}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 44dcad43989d..7e8d3bc80af6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -574,8 +574,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
574 574
575 preempt_disable(); 575 preempt_disable();
576 576
577 start = __get_cpu_var(idt_desc).address; 577 start = __this_cpu_read(idt_desc.address);
578 end = start + __get_cpu_var(idt_desc).size + 1; 578 end = start + __this_cpu_read(idt_desc.size) + 1;
579 579
580 xen_mc_flush(); 580 xen_mc_flush();
581 581
@@ -1174,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void)
1174 1174
1175 xen_smp_init(); 1175 xen_smp_init();
1176 1176
1177#ifdef CONFIG_ACPI_NUMA
1178 /*
1179	 * The pages we get from Xen are not related to machine pages, so
1180 * any NUMA information the kernel tries to get from ACPI will
1181 * be meaningless. Prevent it from trying.
1182 */
1183 acpi_numa = -1;
1184#endif
1185
1177 pgd = (pgd_t *)xen_start_info->pt_base; 1186 pgd = (pgd_t *)xen_start_info->pt_base;
1178 1187
1179 if (!xen_initial_domain()) 1188 if (!xen_initial_domain())
@@ -1256,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void)
1256#endif 1265#endif
1257} 1266}
1258 1267
1259static uint32_t xen_cpuid_base(void)
1260{
1261 uint32_t base, eax, ebx, ecx, edx;
1262 char signature[13];
1263
1264 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1265 cpuid(base, &eax, &ebx, &ecx, &edx);
1266 *(uint32_t *)(signature + 0) = ebx;
1267 *(uint32_t *)(signature + 4) = ecx;
1268 *(uint32_t *)(signature + 8) = edx;
1269 signature[12] = 0;
1270
1271 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
1272 return base;
1273 }
1274
1275 return 0;
1276}
1277
1278static int init_hvm_pv_info(int *major, int *minor) 1268static int init_hvm_pv_info(int *major, int *minor)
1279{ 1269{
1280 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1270 uint32_t eax, ebx, ecx, edx, pages, msr, base;
@@ -1384,6 +1374,18 @@ static bool __init xen_hvm_platform(void)
1384 return true; 1374 return true;
1385} 1375}
1386 1376
1377bool xen_hvm_need_lapic(void)
1378{
1379 if (xen_pv_domain())
1380 return false;
1381 if (!xen_hvm_domain())
1382 return false;
1383 if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
1384 return false;
1385 return true;
1386}
1387EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
1388
1387const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { 1389const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
1388 .name = "Xen HVM", 1390 .name = "Xen HVM",
1389 .detect = xen_hvm_platform, 1391 .detect = xen_hvm_platform,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fde..5e92b61ad574 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
173 */ 173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
204
205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
208
209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
210
211/* Placeholders for holes in the address space */
212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215
216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219
220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222
223static inline unsigned p2m_top_index(unsigned long pfn)
224{
225 BUG_ON(pfn >= MAX_P2M_PFN);
226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232}
233
234static inline unsigned p2m_index(unsigned long pfn)
235{
236 return pfn % P2M_PER_PAGE;
237}
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
295 * tree should already be completely allocated.
296 */
297void xen_build_mfn_list_list(void)
298{
299 unsigned long pfn;
300
301 /* Pre-initialize p2m_top_mfn to be completely missing */
302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
308
309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 }
315
316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 unsigned topidx = p2m_top_index(pfn);
318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing, just update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 }
353}
354
355void xen_setup_mfn_list_list(void)
356{
357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358
359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 virt_to_mfn(p2m_top_mfn);
361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362}
363
364/* Set up p2m_top to point to the domain-builder provided p2m pages */
365void __init xen_build_dynamic_phys_to_machine(void)
366{
367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
390
391 if (p2m_top[topidx] == p2m_mid_missing) {
392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
400}
401
402unsigned long get_phys_to_machine(unsigned long pfn)
403{
404 unsigned topidx, mididx, idx;
405
406 if (unlikely(pfn >= MAX_P2M_PFN))
407 return INVALID_P2M_ENTRY;
408
409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
411 idx = p2m_index(pfn);
412
413 return p2m_top[topidx][mididx][idx];
414}
415EXPORT_SYMBOL_GPL(get_phys_to_machine);
416
417static void *alloc_p2m_page(void)
418{
419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420}
421
422static void free_p2m_page(void *p)
423{
424 free_page((unsigned long)p);
425}
426
427/*
428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
439
440 topidx = p2m_top_index(pfn);
441 mididx = p2m_mid_index(pfn);
442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
456 }
457
458 top_mfn_p = &p2m_top_mfn[topidx];
459 mid_mfn = p2m_top_mfn_p[topidx];
460
461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462
463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
499}
500
501/* Try to install p2m mapping; fail if intermediate bits missing */
502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503{
504 unsigned topidx, mididx, idx;
505
506 if (unlikely(pfn >= MAX_P2M_PFN)) {
507 BUG_ON(mfn != INVALID_P2M_ENTRY);
508 return true;
509 }
510
511 topidx = p2m_top_index(pfn);
512 mididx = p2m_mid_index(pfn);
513 idx = p2m_index(pfn);
514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
519
520 return true;
521}
522
523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524{
525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 return true;
528 }
529
530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
531 if (!alloc_p2m(pfn))
532 return false;
533
534 if (!__set_phys_to_machine(pfn, mfn))
535 return false;
536 }
537
538 return true;
539}
540
541unsigned long arbitrary_virt_to_mfn(void *vaddr) 176unsigned long arbitrary_virt_to_mfn(void *vaddr)
542{ 177{
543 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
566 offset = address & ~PAGE_MASK; 201 offset = address & ~PAGE_MASK;
567 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 202 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
568} 203}
204EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
569 205
570void make_lowmem_page_readonly(void *vaddr) 206void make_lowmem_page_readonly(void *vaddr)
571{ 207{
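The EXPORT_SYMBOL_GPL added above makes arbitrary_virt_to_machine() available to modules; a hypothetical caller (the buffer and its eventual use are placeholders, not a real in-tree user) would look roughly like:

	#include <linux/module.h>
	#include <asm/xen/page.h>

	/* Sketch: translate a kernel virtual address into a Xen machine
	 * address before handing it to the hypervisor. */
	static phys_addr_t example_buf_to_maddr(void *buf)
	{
		xmaddr_t maddr = arbitrary_virt_to_machine(buf);

		return maddr.maddr;	/* machine address, not pseudo-physical */
	}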
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 9e565da5d1f7..4ec8035e3216 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void)
22 unsigned long flags; 22 unsigned long flags;
23 /* need to disable interrupts until this entry is complete */ 23 /* need to disable interrupts until this entry is complete */
24 local_irq_save(flags); 24 local_irq_save(flags);
25 __get_cpu_var(xen_mc_irq_flags) = flags; 25 __this_cpu_write(xen_mc_irq_flags, flags);
26} 26}
27 27
28static inline struct multicall_space xen_mc_entry(size_t args) 28static inline struct multicall_space xen_mc_entry(size_t args)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..8f2251d2a3f8
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,510 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively.
26 */
27
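A quick worked example of the limits described in the comment above, assuming 4 KiB pages:

	/*
	 *   64-bit: P2M_PER_PAGE = 4096 / sizeof(unsigned long) = 512,
	 *           so MAX_P2M_PFN = 512 * 512 * 512 = 134,217,728 frames,
	 *           i.e. 512 GiB of pseudo-physical address space.
	 *
	 *   32-bit: P2M_PER_PAGE = 4096 / 4 = 1024,
	 *           so MAX_P2M_PFN = 1024 * 1024 * 1024 frames,
	 *           i.e. 4 TiB of pseudo-physical address space.
	 */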
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/list.h>
31#include <linux/hash.h>
32#include <linux/sched.h>
33
34#include <asm/cache.h>
35#include <asm/setup.h>
36
37#include <asm/xen/page.h>
38#include <asm/xen/hypercall.h>
39#include <asm/xen/hypervisor.h>
40
41#include "xen-ops.h"
42
43static void __init m2p_override_init(void);
44
45unsigned long xen_max_p2m_pfn __read_mostly;
46
47#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
48#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
49#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
50
51#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
52
53/* Placeholders for holes in the address space */
54static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
55static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
56static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
57
58static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64
65static inline unsigned p2m_top_index(unsigned long pfn)
66{
67 BUG_ON(pfn >= MAX_P2M_PFN);
68 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
69}
70
71static inline unsigned p2m_mid_index(unsigned long pfn)
72{
73 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
74}
75
76static inline unsigned p2m_index(unsigned long pfn)
77{
78 return pfn % P2M_PER_PAGE;
79}
80
81static void p2m_top_init(unsigned long ***top)
82{
83 unsigned i;
84
85 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
86 top[i] = p2m_mid_missing;
87}
88
89static void p2m_top_mfn_init(unsigned long *top)
90{
91 unsigned i;
92
93 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
94 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
95}
96
97static void p2m_top_mfn_p_init(unsigned long **top)
98{
99 unsigned i;
100
101 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
102 top[i] = p2m_mid_missing_mfn;
103}
104
105static void p2m_mid_init(unsigned long **mid)
106{
107 unsigned i;
108
109 for (i = 0; i < P2M_MID_PER_PAGE; i++)
110 mid[i] = p2m_missing;
111}
112
113static void p2m_mid_mfn_init(unsigned long *mid)
114{
115 unsigned i;
116
117 for (i = 0; i < P2M_MID_PER_PAGE; i++)
118 mid[i] = virt_to_mfn(p2m_missing);
119}
120
121static void p2m_init(unsigned long *p2m)
122{
123 unsigned i;
124
125 for (i = 0; i < P2M_MID_PER_PAGE; i++)
126 p2m[i] = INVALID_P2M_ENTRY;
127}
128
129/*
130 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
131 *
132 * This is called both at boot time, and after resuming from suspend:
133 * - At boot time we're called very early, and must use extend_brk()
134 * to allocate memory.
135 *
136 * - After resume we're called from within stop_machine, but the mfn
137 * tree should already be completely allocated.
138 */
139void xen_build_mfn_list_list(void)
140{
141 unsigned long pfn;
142
143 /* Pre-initialize p2m_top_mfn to be completely missing */
144 if (p2m_top_mfn == NULL) {
145 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
146 p2m_mid_mfn_init(p2m_mid_missing_mfn);
147
148 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
149 p2m_top_mfn_p_init(p2m_top_mfn_p);
150
151 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
152 p2m_top_mfn_init(p2m_top_mfn);
153 } else {
154 /* Reinitialise, mfn's all change after migration */
155 p2m_mid_mfn_init(p2m_mid_missing_mfn);
156 }
157
158 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
159 unsigned topidx = p2m_top_index(pfn);
160 unsigned mididx = p2m_mid_index(pfn);
161 unsigned long **mid;
162 unsigned long *mid_mfn_p;
163
164 mid = p2m_top[topidx];
165 mid_mfn_p = p2m_top_mfn_p[topidx];
166
167 /* Don't bother allocating any mfn mid levels if
168 * they're just missing, just update the stored mfn,
169 * since all could have changed over a migrate.
170 */
171 if (mid == p2m_mid_missing) {
172 BUG_ON(mididx);
173 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
174 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
175 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
176 continue;
177 }
178
179 if (mid_mfn_p == p2m_mid_missing_mfn) {
180 /*
181 * XXX boot-time only! We should never find
182 * missing parts of the mfn tree after
183 * runtime. extend_brk() will BUG if we call
184 * it too late.
185 */
186 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
187 p2m_mid_mfn_init(mid_mfn_p);
188
189 p2m_top_mfn_p[topidx] = mid_mfn_p;
190 }
191
192 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
193 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
194 }
195}
196
197void xen_setup_mfn_list_list(void)
198{
199 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
200
201 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
202 virt_to_mfn(p2m_top_mfn);
203 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
204}
205
206/* Set up p2m_top to point to the domain-builder provided p2m pages */
207void __init xen_build_dynamic_phys_to_machine(void)
208{
209 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
210 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
211 unsigned long pfn;
212
213 xen_max_p2m_pfn = max_pfn;
214
215 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
216 p2m_init(p2m_missing);
217
218 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
219 p2m_mid_init(p2m_mid_missing);
220
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top);
223
224 /*
225 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just
227 * need to graft that into our tree structure.
228 */
229 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
230 unsigned topidx = p2m_top_index(pfn);
231 unsigned mididx = p2m_mid_index(pfn);
232
233 if (p2m_top[topidx] == p2m_mid_missing) {
234 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
235 p2m_mid_init(mid);
236
237 p2m_top[topidx] = mid;
238 }
239
240 p2m_top[topidx][mididx] = &mfn_list[pfn];
241 }
242
243 m2p_override_init();
244}
245
246unsigned long get_phys_to_machine(unsigned long pfn)
247{
248 unsigned topidx, mididx, idx;
249
250 if (unlikely(pfn >= MAX_P2M_PFN))
251 return INVALID_P2M_ENTRY;
252
253 topidx = p2m_top_index(pfn);
254 mididx = p2m_mid_index(pfn);
255 idx = p2m_index(pfn);
256
257 return p2m_top[topidx][mididx][idx];
258}
259EXPORT_SYMBOL_GPL(get_phys_to_machine);
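To make the three-level lookup above concrete, here is one worked index split on 64-bit (P2M_PER_PAGE = P2M_MID_PER_PAGE = 512), using an arbitrary example pfn:

	/*
	 * pfn = 0x12345 (74565):
	 *   topidx = 74565 / (512 * 512) = 0
	 *   mididx = (74565 / 512) % 512 = 145
	 *   idx    = 74565 % 512         = 325
	 *
	 * so the mfn lives at p2m_top[0][145][325]; if either intermediate
	 * level is still the shared "missing" page, the read returns
	 * INVALID_P2M_ENTRY instead.
	 */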
260
261static void *alloc_p2m_page(void)
262{
263 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
264}
265
266static void free_p2m_page(void *p)
267{
268 free_page((unsigned long)p);
269}
270
271/*
272 * Fully allocate the p2m structure for a given pfn. We need to check
273 * that both the top and mid levels are allocated, and make sure the
274 * parallel mfn tree is kept in sync. We may race with other cpus, so
275 * the new pages are installed with cmpxchg; if we lose the race then
276 * simply free the page we allocated and use the one that's there.
277 */
278static bool alloc_p2m(unsigned long pfn)
279{
280 unsigned topidx, mididx;
281 unsigned long ***top_p, **mid;
282 unsigned long *top_mfn_p, *mid_mfn;
283
284 topidx = p2m_top_index(pfn);
285 mididx = p2m_mid_index(pfn);
286
287 top_p = &p2m_top[topidx];
288 mid = *top_p;
289
290 if (mid == p2m_mid_missing) {
291 /* Mid level is missing, allocate a new one */
292 mid = alloc_p2m_page();
293 if (!mid)
294 return false;
295
296 p2m_mid_init(mid);
297
298 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
299 free_p2m_page(mid);
300 }
301
302 top_mfn_p = &p2m_top_mfn[topidx];
303 mid_mfn = p2m_top_mfn_p[topidx];
304
305 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
306
307 if (mid_mfn == p2m_mid_missing_mfn) {
308 /* Separately check the mid mfn level */
309 unsigned long missing_mfn;
310 unsigned long mid_mfn_mfn;
311
312 mid_mfn = alloc_p2m_page();
313 if (!mid_mfn)
314 return false;
315
316 p2m_mid_mfn_init(mid_mfn);
317
318 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
319 mid_mfn_mfn = virt_to_mfn(mid_mfn);
320 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
321 free_p2m_page(mid_mfn);
322 else
323 p2m_top_mfn_p[topidx] = mid_mfn;
324 }
325
326 if (p2m_top[topidx][mididx] == p2m_missing) {
327 /* p2m leaf page is missing */
328 unsigned long *p2m;
329
330 p2m = alloc_p2m_page();
331 if (!p2m)
332 return false;
333
334 p2m_init(p2m);
335
336 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
337 free_p2m_page(p2m);
338 else
339 mid_mfn[mididx] = virt_to_mfn(p2m);
340 }
341
342 return true;
343}
344
345/* Try to install p2m mapping; fail if intermediate bits missing */
346bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
347{
348 unsigned topidx, mididx, idx;
349
350 if (unlikely(pfn >= MAX_P2M_PFN)) {
351 BUG_ON(mfn != INVALID_P2M_ENTRY);
352 return true;
353 }
354
355 topidx = p2m_top_index(pfn);
356 mididx = p2m_mid_index(pfn);
357 idx = p2m_index(pfn);
358
359 if (p2m_top[topidx][mididx] == p2m_missing)
360 return mfn == INVALID_P2M_ENTRY;
361
362 p2m_top[topidx][mididx][idx] = mfn;
363
364 return true;
365}
366
367bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
368{
369 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
370 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
371 return true;
372 }
373
374 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
375 if (!alloc_p2m(pfn))
376 return false;
377
378 if (!__set_phys_to_machine(pfn, mfn))
379 return false;
380 }
381
382 return true;
383}
384
385#define M2P_OVERRIDE_HASH_SHIFT 10
386#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
387
388static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
389static DEFINE_SPINLOCK(m2p_override_lock);
390
391static void __init m2p_override_init(void)
392{
393 unsigned i;
394
395 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
396 sizeof(unsigned long));
397
398 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
399 INIT_LIST_HEAD(&m2p_overrides[i]);
400}
401
402static unsigned long mfn_hash(unsigned long mfn)
403{
404 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
405}
406
407/* Add an MFN override for a particular page */
408int m2p_add_override(unsigned long mfn, struct page *page)
409{
410 unsigned long flags;
411 unsigned long pfn;
412 unsigned long address;
413 unsigned level;
414 pte_t *ptep = NULL;
415
416 pfn = page_to_pfn(page);
417 if (!PageHighMem(page)) {
418 address = (unsigned long)__va(pfn << PAGE_SHIFT);
419 ptep = lookup_address(address, &level);
420
421 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
422 "m2p_add_override: pfn %lx not mapped", pfn))
423 return -EINVAL;
424 }
425
426 page->private = mfn;
427 page->index = pfn_to_mfn(pfn);
428
429 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
430 if (!PageHighMem(page))
431 /* Just zap old mapping for now */
432 pte_clear(&init_mm, address, ptep);
433
434 spin_lock_irqsave(&m2p_override_lock, flags);
435 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
436 spin_unlock_irqrestore(&m2p_override_lock, flags);
437
438 return 0;
439}
440
441int m2p_remove_override(struct page *page)
442{
443 unsigned long flags;
444 unsigned long mfn;
445 unsigned long pfn;
446 unsigned long address;
447 unsigned level;
448 pte_t *ptep = NULL;
449
450 pfn = page_to_pfn(page);
451 mfn = get_phys_to_machine(pfn);
452 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
453 return -EINVAL;
454
455 if (!PageHighMem(page)) {
456 address = (unsigned long)__va(pfn << PAGE_SHIFT);
457 ptep = lookup_address(address, &level);
458
459 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
460 "m2p_remove_override: pfn %lx not mapped", pfn))
461 return -EINVAL;
462 }
463
464 spin_lock_irqsave(&m2p_override_lock, flags);
465 list_del(&page->lru);
466 spin_unlock_irqrestore(&m2p_override_lock, flags);
467 __set_phys_to_machine(pfn, page->index);
468
469 if (!PageHighMem(page))
470 set_pte_at(&init_mm, address, ptep,
471 pfn_pte(pfn, PAGE_KERNEL));
472 /* No tlb flush necessary because the caller already
473 * left the pte unmapped. */
474
475 return 0;
476}
477
478struct page *m2p_find_override(unsigned long mfn)
479{
480 unsigned long flags;
481 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
482 struct page *p, *ret;
483
484 ret = NULL;
485
486 spin_lock_irqsave(&m2p_override_lock, flags);
487
488 list_for_each_entry(p, bucket, lru) {
489 if (p->private == mfn) {
490 ret = p;
491 break;
492 }
493 }
494
495 spin_unlock_irqrestore(&m2p_override_lock, flags);
496
497 return ret;
498}
499
500unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
501{
502 struct page *p = m2p_find_override(mfn);
503 unsigned long ret = pfn;
504
505 if (p)
506 ret = page_to_pfn(p);
507
508 return ret;
509}
510EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
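The override table above is meant to be driven by the code that maps foreign (granted) pages into the domain; a hypothetical caller, with error handling trimmed and not taken from any real in-tree user, would follow this shape:

	#include <linux/mm.h>
	#include <asm/xen/page.h>

	/* Sketch: record that a local page currently backs a foreign mfn, so
	 * m2p lookups resolve to the local pfn, then drop the override when
	 * the mapping is torn down. */
	static int example_track_foreign_page(unsigned long foreign_mfn,
					      struct page *page)
	{
		int ret;

		ret = m2p_add_override(foreign_mfn, page);
		if (ret)
			return ret;

		/* reverse lookup now finds the local pfn, not the hint */
		WARN_ON(m2p_find_override_pfn(foreign_mfn, 0) != page_to_pfn(page));

		return m2p_remove_override(page);
	}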
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23e061b9327b..cc9b1e182fcf 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
159{ 159{
160 struct xen_spinlock *prev; 160 struct xen_spinlock *prev;
161 161
162 prev = __get_cpu_var(lock_spinners); 162 prev = __this_cpu_read(lock_spinners);
163 __get_cpu_var(lock_spinners) = xl; 163 __this_cpu_write(lock_spinners, xl);
164 164
165 wmb(); /* set lock of interest before count */ 165 wmb(); /* set lock of interest before count */
166 166
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
179 asm(LOCK_PREFIX " decw %0" 179 asm(LOCK_PREFIX " decw %0"
180 : "+m" (xl->spinners) : : "memory"); 180 : "+m" (xl->spinners) : : "memory");
181 wmb(); /* decrement count before restoring lock */ 181 wmb(); /* decrement count before restoring lock */
182 __get_cpu_var(lock_spinners) = prev; 182 __this_cpu_write(lock_spinners, prev);
183} 183}
184 184
185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) 185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
186{ 186{
187 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 187 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
188 struct xen_spinlock *prev; 188 struct xen_spinlock *prev;
189 int irq = __get_cpu_var(lock_kicker_irq); 189 int irq = __this_cpu_read(lock_kicker_irq);
190 int ret; 190 int ret;
191 u64 start; 191 u64 start;
192 192
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 5da5e53fb94c..067759e3d6a5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -135,24 +135,24 @@ static void do_stolen_accounting(void)
135 135
136 /* Add the appropriate number of ticks of stolen time, 136 /* Add the appropriate number of ticks of stolen time,
137 including any left-overs from last time. */ 137 including any left-overs from last time. */
138 stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); 138 stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
139 139
140 if (stolen < 0) 140 if (stolen < 0)
141 stolen = 0; 141 stolen = 0;
142 142
143 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 143 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
144 __get_cpu_var(xen_residual_stolen) = stolen; 144 __this_cpu_write(xen_residual_stolen, stolen);
145 account_steal_ticks(ticks); 145 account_steal_ticks(ticks);
146 146
147 /* Add the appropriate number of ticks of blocked time, 147 /* Add the appropriate number of ticks of blocked time,
148 including any left-overs from last time. */ 148 including any left-overs from last time. */
149 blocked += __get_cpu_var(xen_residual_blocked); 149 blocked += __this_cpu_read(xen_residual_blocked);
150 150
151 if (blocked < 0) 151 if (blocked < 0)
152 blocked = 0; 152 blocked = 0;
153 153
154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
155 __get_cpu_var(xen_residual_blocked) = blocked; 155 __this_cpu_write(xen_residual_blocked, blocked);
156 account_idle_ticks(ticks); 156 account_idle_ticks(ticks);
157} 157}
158 158
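Finally, the __get_cpu_var() to __this_cpu_read()/__this_cpu_write() conversions scattered through this diff (perf_event, enlighten.c, multicalls.h, spinlock.c, time.c) all follow the same pattern; a minimal illustration with an invented per-cpu variable:

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, example_residual);

	static void example_accumulate(unsigned long delta)
	{
		/* old form:  __get_cpu_var(example_residual) += delta;
		 * new form below: this_cpu operations let x86 emit a single
		 * segment-prefixed instruction instead of first computing
		 * the per-cpu address.  __this_cpu_add(example_residual,
		 * delta) would be the even shorter equivalent. */
		__this_cpu_write(example_residual,
				 __this_cpu_read(example_residual) + delta);
	}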