Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig.debug | 1
-rw-r--r--  arch/x86/Makefile | 6
-rw-r--r--  arch/x86/boot/Makefile | 4
-rw-r--r--  arch/x86/boot/boot.h | 1
-rw-r--r--  arch/x86/boot/video-mode.c | 2
-rw-r--r--  arch/x86/boot/video.c | 2
-rw-r--r--  arch/x86/crypto/Makefile | 8
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2
-rw-r--r--  arch/x86/crypto/sha1_ni_asm.S | 302
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c | 314
-rw-r--r--  arch/x86/crypto/sha256_ni_asm.S | 353
-rw-r--r--  arch/x86/crypto/sha256_ssse3_glue.c | 329
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c | 249
-rw-r--r--  arch/x86/entry/entry_64.S | 19
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r--  arch/x86/include/asm/highmem.h | 1
-rw-r--r--  arch/x86/include/asm/i8259.h | 1
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 10
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 10
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 65
-rw-r--r--  arch/x86/include/asm/msr-index.h | 10
-rw-r--r--  arch/x86/include/asm/page_types.h | 16
-rw-r--r--  arch/x86/include/asm/pgtable.h | 10
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 14
-rw-r--r--  arch/x86/include/asm/vmx.h | 6
-rw-r--r--  arch/x86/include/asm/x86_init.h | 1
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 5
-rw-r--r--  arch/x86/include/asm/xen/page.h | 8
-rw-r--r--  arch/x86/include/uapi/asm/hyperv.h | 18
-rw-r--r--  arch/x86/include/uapi/asm/svm.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 22
-rw-r--r--  arch/x86/kernel/apic/vector.c | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 13
-rw-r--r--  arch/x86/kernel/cpu/common.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 5
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_rapl.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_msr.c | 7
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 11
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 1
-rw-r--r--  arch/x86/kernel/ftrace.c | 4
-rw-r--r--  arch/x86/kernel/head_64.S | 8
-rw-r--r--  arch/x86/kernel/i8259.c | 29
-rw-r--r--  arch/x86/kernel/irq_work.c | 2
-rw-r--r--  arch/x86/kernel/kvmclock.c | 46
-rw-r--r--  arch/x86/kernel/livepatch.c | 9
-rw-r--r--  arch/x86/kernel/mcount_64.S | 6
-rw-r--r--  arch/x86/kernel/pci-dma.c | 2
-rw-r--r--  arch/x86/kernel/pmem.c | 12
-rw-r--r--  arch/x86/kernel/setup.c | 4
-rw-r--r--  arch/x86/kernel/signal.c | 17
-rw-r--r--  arch/x86/kernel/smpboot.c | 9
-rw-r--r--  arch/x86/kernel/verify_cpu.S | 12
-rw-r--r--  arch/x86/kvm/Kconfig | 2
-rw-r--r--  arch/x86/kvm/assigned-dev.c | 62
-rw-r--r--  arch/x86/kvm/cpuid.c | 2
-rw-r--r--  arch/x86/kvm/cpuid.h | 37
-rw-r--r--  arch/x86/kvm/emulate.c | 35
-rw-r--r--  arch/x86/kvm/hyperv.c | 31
-rw-r--r--  arch/x86/kvm/i8254.c | 4
-rw-r--r--  arch/x86/kvm/ioapic.c | 29
-rw-r--r--  arch/x86/kvm/ioapic.h | 15
-rw-r--r--  arch/x86/kvm/irq.c | 40
-rw-r--r--  arch/x86/kvm/irq.h | 27
-rw-r--r--  arch/x86/kvm/irq_comm.c | 129
-rw-r--r--  arch/x86/kvm/lapic.c | 131
-rw-r--r--  arch/x86/kvm/lapic.h | 7
-rw-r--r--  arch/x86/kvm/mmu.c | 111
-rw-r--r--  arch/x86/kvm/mmu.h | 6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 22
-rw-r--r--  arch/x86/kvm/svm.c | 181
-rw-r--r--  arch/x86/kvm/trace.h | 51
-rw-r--r--  arch/x86/kvm/vmx.c | 814
-rw-r--r--  arch/x86/kvm/x86.c | 442
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 19
-rw-r--r--  arch/x86/mm/highmem_32.c | 14
-rw-r--r--  arch/x86/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 2
-rw-r--r--  arch/x86/mm/mpx.c | 53
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 2
-rw-r--r--  arch/x86/pci/acpi.c | 296
-rw-r--r--  arch/x86/pci/bus_numa.c | 13
-rw-r--r--  arch/x86/pci/common.c | 8
-rw-r--r--  arch/x86/pci/legacy.c | 2
-rw-r--r--  arch/x86/um/signal.c | 18
-rw-r--r--  arch/x86/um/stub_32.S | 1
-rw-r--r--  arch/x86/um/stub_64.S | 18
-rw-r--r--  arch/x86/xen/enlighten.c | 15
-rw-r--r--  arch/x86/xen/grant-table.c | 2
-rw-r--r--  arch/x86/xen/mmu.c | 10
-rw-r--r--  arch/x86/xen/p2m.c | 19
-rw-r--r--  arch/x86/xen/setup.c | 9
-rw-r--r--  arch/x86/xen/suspend.c | 20
101 files changed, 3512 insertions, 1215 deletions
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 3e0baf726eef..137dfa96aa14 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -113,7 +113,6 @@ config DEBUG_RODATA_TEST
113config DEBUG_WX 113config DEBUG_WX
114 bool "Warn on W+X mappings at boot" 114 bool "Warn on W+X mappings at boot"
115 depends on DEBUG_RODATA 115 depends on DEBUG_RODATA
116 default y
117 select X86_PTDUMP_CORE 116 select X86_PTDUMP_CORE
118 ---help--- 117 ---help---
119 Generate a warning if any W+X mappings are found at boot. 118 Generate a warning if any W+X mappings are found at boot.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2dfaa72260b4..4086abca0b32 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -171,9 +171,11 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
171asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) 171asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
172avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) 172avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
173avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) 173avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
174sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
175sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
174 176
175KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) 177KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
176KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) 178KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
177 179
178LDFLAGS := -m elf_$(UTS_MACHINE) 180LDFLAGS := -m elf_$(UTS_MACHINE)
179 181
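
The two new as-instr probes above test whether the assembler can encode the SHA-NI instructions (sha1msg1 / sha256msg1); when the probe succeeds, -DCONFIG_AS_SHA1_NI=1 and -DCONFIG_AS_SHA256_NI=1 are added to both KBUILD_AFLAGS and KBUILD_CFLAGS. A minimal sketch of how such a define is consumed from C, mirroring the glue code later in this commit:

#ifdef CONFIG_AS_SHA1_NI
asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
				  unsigned int rounds);
#endif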
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0d553e54171b..2ee62dba0373 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,13 +9,13 @@
9# Changed by many, many contributors over the years. 9# Changed by many, many contributors over the years.
10# 10#
11 11
12KASAN_SANITIZE := n
13
12# If you want to preset the SVGA mode, uncomment the next line and 14# If you want to preset the SVGA mode, uncomment the next line and
13# set SVGA_MODE to whatever number you want. 15# set SVGA_MODE to whatever number you want.
14# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 16# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
15# The number is the same as you would ordinarily press at bootup. 17# The number is the same as you would ordinarily press at bootup.
16 18
17KASAN_SANITIZE := n
18
19SVGA_MODE := -DSVGA_MODE=NORMAL_VGA 19SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
20 20
21targets := vmlinux.bin setup.bin setup.elf bzImage 21targets := vmlinux.bin setup.bin setup.elf bzImage
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 0033e96c3f09..9011a88353de 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -23,7 +23,6 @@
23#include <stdarg.h> 23#include <stdarg.h>
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/edd.h> 25#include <linux/edd.h>
26#include <asm/boot.h>
27#include <asm/setup.h> 26#include <asm/setup.h>
28#include "bitops.h" 27#include "bitops.h"
29#include "ctype.h" 28#include "ctype.h"
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index aa8a96b052e3..95c7a818c0ed 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -19,6 +19,8 @@
19#include "video.h" 19#include "video.h"
20#include "vesa.h" 20#include "vesa.h"
21 21
22#include <uapi/asm/boot.h>
23
22/* 24/*
23 * Common variables 25 * Common variables
24 */ 26 */
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 05111bb8d018..77780e386e9b 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -13,6 +13,8 @@
13 * Select video mode 13 * Select video mode
14 */ 14 */
15 15
16#include <uapi/asm/boot.h>
17
16#include "boot.h" 18#include "boot.h"
17#include "video.h" 19#include "video.h"
18#include "vesa.h" 20#include "vesa.h"
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9a2838cf0591..b9b912a44d61 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,8 @@
5avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) 5avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
6avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ 6avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
7 $(comma)4)$(comma)%ymm2,yes,no) 7 $(comma)4)$(comma)%ymm2,yes,no)
8sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
9sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
8 10
9obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o 11obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
10 12
@@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes)
91sha1-ssse3-y += sha1_avx2_x86_64_asm.o 93sha1-ssse3-y += sha1_avx2_x86_64_asm.o
92poly1305-x86_64-y += poly1305-avx2-x86_64.o 94poly1305-x86_64-y += poly1305-avx2-x86_64.o
93endif 95endif
96ifeq ($(sha1_ni_supported),yes)
97sha1-ssse3-y += sha1_ni_asm.o
98endif
94crc32c-intel-y := crc32c-intel_glue.o 99crc32c-intel-y := crc32c-intel_glue.o
95crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o 100crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
96crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o 101crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
97sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o 102sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
103ifeq ($(sha256_ni_supported),yes)
104sha256-ssse3-y += sha256_ni_asm.o
105endif
98sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o 106sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
99crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o 107crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 225be06edc80..4fe27e074194 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -330,7 +330,7 @@ ENDPROC(crc_pcl)
330 ## PCLMULQDQ tables 330 ## PCLMULQDQ tables
331 ## Table is 128 entries x 2 words (8 bytes) each 331 ## Table is 128 entries x 2 words (8 bytes) each
332 ################################################################ 332 ################################################################
333.section .rotata, "a", %progbits 333.section .rodata, "a", %progbits
334.align 8 334.align 8
335K_table: 335K_table:
336 .long 0x493c7d27, 0x00000001 336 .long 0x493c7d27, 0x00000001
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 000000000000..874a651b9e7d
--- /dev/null
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,302 @@
1/*
2 * Intel SHA Extensions optimized implementation of a SHA-1 update function
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2015 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Sean Gulley <sean.m.gulley@intel.com>
22 * Tim Chen <tim.c.chen@linux.intel.com>
23 *
24 * BSD LICENSE
25 *
26 * Copyright(c) 2015 Intel Corporation.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * * Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * * Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in
36 * the documentation and/or other materials provided with the
37 * distribution.
38 * * Neither the name of Intel Corporation nor the names of its
39 * contributors may be used to endorse or promote products derived
40 * from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 *
54 */
55
56#include <linux/linkage.h>
57
58#define DIGEST_PTR %rdi /* 1st arg */
59#define DATA_PTR %rsi /* 2nd arg */
60#define NUM_BLKS %rdx /* 3rd arg */
61
62#define RSPSAVE %rax
63
64/* gcc conversion */
65#define FRAME_SIZE 32 /* space for 2x16 bytes */
66
67#define ABCD %xmm0
68#define E0 %xmm1 /* Need two E's b/c they ping pong */
69#define E1 %xmm2
70#define MSG0 %xmm3
71#define MSG1 %xmm4
72#define MSG2 %xmm5
73#define MSG3 %xmm6
74#define SHUF_MASK %xmm7
75
76
77/*
78 * Intel SHA Extensions optimized implementation of a SHA-1 update function
79 *
80 * The function takes a pointer to the current hash values, a pointer to the
81 * input data, and a number of 64 byte blocks to process. Once all blocks have
82 * been processed, the digest pointer is updated with the resulting hash value.
83 * The function only processes complete blocks, there is no functionality to
84 * store partial blocks. All message padding and hash value initialization must
85 * be done outside the update function.
86 *
87 * The indented lines in the loop are instructions related to rounds processing.
88 * The non-indented lines are instructions related to the message schedule.
89 *
90 * void sha1_ni_transform(uint32_t *digest, const void *data,
91 uint32_t numBlocks)
92 * digest : pointer to digest
93 * data: pointer to input data
94 * numBlocks: Number of blocks to process
95 */
96.text
97.align 32
98ENTRY(sha1_ni_transform)
99 mov %rsp, RSPSAVE
100 sub $FRAME_SIZE, %rsp
101 and $~0xF, %rsp
102
103 shl $6, NUM_BLKS /* convert to bytes */
104 jz .Ldone_hash
105 add DATA_PTR, NUM_BLKS /* pointer to end of data */
106
107 /* load initial hash values */
108 pinsrd $3, 1*16(DIGEST_PTR), E0
109 movdqu 0*16(DIGEST_PTR), ABCD
110 pand UPPER_WORD_MASK(%rip), E0
111 pshufd $0x1B, ABCD, ABCD
112
113 movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
114
115.Lloop0:
116 /* Save hash values for addition after rounds */
117 movdqa E0, (0*16)(%rsp)
118 movdqa ABCD, (1*16)(%rsp)
119
120 /* Rounds 0-3 */
121 movdqu 0*16(DATA_PTR), MSG0
122 pshufb SHUF_MASK, MSG0
123 paddd MSG0, E0
124 movdqa ABCD, E1
125 sha1rnds4 $0, E0, ABCD
126
127 /* Rounds 4-7 */
128 movdqu 1*16(DATA_PTR), MSG1
129 pshufb SHUF_MASK, MSG1
130 sha1nexte MSG1, E1
131 movdqa ABCD, E0
132 sha1rnds4 $0, E1, ABCD
133 sha1msg1 MSG1, MSG0
134
135 /* Rounds 8-11 */
136 movdqu 2*16(DATA_PTR), MSG2
137 pshufb SHUF_MASK, MSG2
138 sha1nexte MSG2, E0
139 movdqa ABCD, E1
140 sha1rnds4 $0, E0, ABCD
141 sha1msg1 MSG2, MSG1
142 pxor MSG2, MSG0
143
144 /* Rounds 12-15 */
145 movdqu 3*16(DATA_PTR), MSG3
146 pshufb SHUF_MASK, MSG3
147 sha1nexte MSG3, E1
148 movdqa ABCD, E0
149 sha1msg2 MSG3, MSG0
150 sha1rnds4 $0, E1, ABCD
151 sha1msg1 MSG3, MSG2
152 pxor MSG3, MSG1
153
154 /* Rounds 16-19 */
155 sha1nexte MSG0, E0
156 movdqa ABCD, E1
157 sha1msg2 MSG0, MSG1
158 sha1rnds4 $0, E0, ABCD
159 sha1msg1 MSG0, MSG3
160 pxor MSG0, MSG2
161
162 /* Rounds 20-23 */
163 sha1nexte MSG1, E1
164 movdqa ABCD, E0
165 sha1msg2 MSG1, MSG2
166 sha1rnds4 $1, E1, ABCD
167 sha1msg1 MSG1, MSG0
168 pxor MSG1, MSG3
169
170 /* Rounds 24-27 */
171 sha1nexte MSG2, E0
172 movdqa ABCD, E1
173 sha1msg2 MSG2, MSG3
174 sha1rnds4 $1, E0, ABCD
175 sha1msg1 MSG2, MSG1
176 pxor MSG2, MSG0
177
178 /* Rounds 28-31 */
179 sha1nexte MSG3, E1
180 movdqa ABCD, E0
181 sha1msg2 MSG3, MSG0
182 sha1rnds4 $1, E1, ABCD
183 sha1msg1 MSG3, MSG2
184 pxor MSG3, MSG1
185
186 /* Rounds 32-35 */
187 sha1nexte MSG0, E0
188 movdqa ABCD, E1
189 sha1msg2 MSG0, MSG1
190 sha1rnds4 $1, E0, ABCD
191 sha1msg1 MSG0, MSG3
192 pxor MSG0, MSG2
193
194 /* Rounds 36-39 */
195 sha1nexte MSG1, E1
196 movdqa ABCD, E0
197 sha1msg2 MSG1, MSG2
198 sha1rnds4 $1, E1, ABCD
199 sha1msg1 MSG1, MSG0
200 pxor MSG1, MSG3
201
202 /* Rounds 40-43 */
203 sha1nexte MSG2, E0
204 movdqa ABCD, E1
205 sha1msg2 MSG2, MSG3
206 sha1rnds4 $2, E0, ABCD
207 sha1msg1 MSG2, MSG1
208 pxor MSG2, MSG0
209
210 /* Rounds 44-47 */
211 sha1nexte MSG3, E1
212 movdqa ABCD, E0
213 sha1msg2 MSG3, MSG0
214 sha1rnds4 $2, E1, ABCD
215 sha1msg1 MSG3, MSG2
216 pxor MSG3, MSG1
217
218 /* Rounds 48-51 */
219 sha1nexte MSG0, E0
220 movdqa ABCD, E1
221 sha1msg2 MSG0, MSG1
222 sha1rnds4 $2, E0, ABCD
223 sha1msg1 MSG0, MSG3
224 pxor MSG0, MSG2
225
226 /* Rounds 52-55 */
227 sha1nexte MSG1, E1
228 movdqa ABCD, E0
229 sha1msg2 MSG1, MSG2
230 sha1rnds4 $2, E1, ABCD
231 sha1msg1 MSG1, MSG0
232 pxor MSG1, MSG3
233
234 /* Rounds 56-59 */
235 sha1nexte MSG2, E0
236 movdqa ABCD, E1
237 sha1msg2 MSG2, MSG3
238 sha1rnds4 $2, E0, ABCD
239 sha1msg1 MSG2, MSG1
240 pxor MSG2, MSG0
241
242 /* Rounds 60-63 */
243 sha1nexte MSG3, E1
244 movdqa ABCD, E0
245 sha1msg2 MSG3, MSG0
246 sha1rnds4 $3, E1, ABCD
247 sha1msg1 MSG3, MSG2
248 pxor MSG3, MSG1
249
250 /* Rounds 64-67 */
251 sha1nexte MSG0, E0
252 movdqa ABCD, E1
253 sha1msg2 MSG0, MSG1
254 sha1rnds4 $3, E0, ABCD
255 sha1msg1 MSG0, MSG3
256 pxor MSG0, MSG2
257
258 /* Rounds 68-71 */
259 sha1nexte MSG1, E1
260 movdqa ABCD, E0
261 sha1msg2 MSG1, MSG2
262 sha1rnds4 $3, E1, ABCD
263 pxor MSG1, MSG3
264
265 /* Rounds 72-75 */
266 sha1nexte MSG2, E0
267 movdqa ABCD, E1
268 sha1msg2 MSG2, MSG3
269 sha1rnds4 $3, E0, ABCD
270
271 /* Rounds 76-79 */
272 sha1nexte MSG3, E1
273 movdqa ABCD, E0
274 sha1rnds4 $3, E1, ABCD
275
276 /* Add current hash values with previously saved */
277 sha1nexte (0*16)(%rsp), E0
278 paddd (1*16)(%rsp), ABCD
279
280 /* Increment data pointer and loop if more to process */
281 add $64, DATA_PTR
282 cmp NUM_BLKS, DATA_PTR
283 jne .Lloop0
284
285 /* Write hash values back in the correct order */
286 pshufd $0x1B, ABCD, ABCD
287 movdqu ABCD, 0*16(DIGEST_PTR)
288 pextrd $3, E0, 1*16(DIGEST_PTR)
289
290.Ldone_hash:
291 mov RSPSAVE, %rsp
292
293 ret
294ENDPROC(sha1_ni_transform)
295
296.data
297
298.align 64
299PSHUFFLE_BYTE_FLIP_MASK:
300 .octa 0x000102030405060708090a0b0c0d0e0f
301UPPER_WORD_MASK:
302 .octa 0xFFFFFFFF000000000000000000000000
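
The header comment of sha1_ni_transform() above spells out its calling contract: it consumes only whole 64-byte blocks, and message padding plus initial hash values are the caller's responsibility. A hedged sketch of a caller honoring that contract (sha1_ni_hash_blocks() is a hypothetical helper, not part of the patch; the real glue code drives the transform through sha1_base_do_update()):

#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/fpu/api.h>

asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
				  unsigned int rounds);

static void sha1_ni_hash_blocks(u32 digest[5], const u8 *data, size_t len)
{
	size_t blocks = len / 64;	/* 64 == SHA1_BLOCK_SIZE */

	if (!blocks)
		return;

	kernel_fpu_begin();		/* the transform clobbers XMM state */
	sha1_ni_transform(digest, (const char *)data, blocks);
	kernel_fpu_end();

	/* any tail of len % 64 bytes must be buffered and padded by the caller */
}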
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 00212c32d4db..dd14616b7739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -31,24 +31,11 @@
31#include <crypto/sha1_base.h> 31#include <crypto/sha1_base.h>
32#include <asm/fpu/api.h> 32#include <asm/fpu/api.h>
33 33
34typedef void (sha1_transform_fn)(u32 *digest, const char *data,
35 unsigned int rounds);
34 36
35asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, 37static int sha1_update(struct shash_desc *desc, const u8 *data,
36 unsigned int rounds); 38 unsigned int len, sha1_transform_fn *sha1_xform)
37#ifdef CONFIG_AS_AVX
38asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
39 unsigned int rounds);
40#endif
41#ifdef CONFIG_AS_AVX2
42#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
43
44asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
45 unsigned int rounds);
46#endif
47
48static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
49
50static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
51 unsigned int len)
52{ 39{
53 struct sha1_state *sctx = shash_desc_ctx(desc); 40 struct sha1_state *sctx = shash_desc_ctx(desc);
54 41
@@ -61,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
61 48
62 kernel_fpu_begin(); 49 kernel_fpu_begin();
63 sha1_base_do_update(desc, data, len, 50 sha1_base_do_update(desc, data, len,
64 (sha1_block_fn *)sha1_transform_asm); 51 (sha1_block_fn *)sha1_xform);
65 kernel_fpu_end(); 52 kernel_fpu_end();
66 53
67 return 0; 54 return 0;
68} 55}
69 56
70static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, 57static int sha1_finup(struct shash_desc *desc, const u8 *data,
71 unsigned int len, u8 *out) 58 unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
72{ 59{
73 if (!irq_fpu_usable()) 60 if (!irq_fpu_usable())
74 return crypto_sha1_finup(desc, data, len, out); 61 return crypto_sha1_finup(desc, data, len, out);
@@ -76,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
76 kernel_fpu_begin(); 63 kernel_fpu_begin();
77 if (len) 64 if (len)
78 sha1_base_do_update(desc, data, len, 65 sha1_base_do_update(desc, data, len,
79 (sha1_block_fn *)sha1_transform_asm); 66 (sha1_block_fn *)sha1_xform);
80 sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); 67 sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform);
81 kernel_fpu_end(); 68 kernel_fpu_end();
82 69
83 return sha1_base_finish(desc, out); 70 return sha1_base_finish(desc, out);
84} 71}
85 72
86/* Add padding and return the message digest. */ 73asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
87static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) 74 unsigned int rounds);
75
76static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
77 unsigned int len)
88{ 78{
89 return sha1_ssse3_finup(desc, NULL, 0, out); 79 return sha1_update(desc, data, len,
80 (sha1_transform_fn *) sha1_transform_ssse3);
90} 81}
91 82
92#ifdef CONFIG_AS_AVX2 83static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
93static void sha1_apply_transform_avx2(u32 *digest, const char *data, 84 unsigned int len, u8 *out)
94 unsigned int rounds)
95{ 85{
96 /* Select the optimal transform based on data block size */ 86 return sha1_finup(desc, data, len, out,
97 if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) 87 (sha1_transform_fn *) sha1_transform_ssse3);
98 sha1_transform_avx2(digest, data, rounds); 88}
99 else 89
100 sha1_transform_avx(digest, data, rounds); 90/* Add padding and return the message digest. */
91static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
92{
93 return sha1_ssse3_finup(desc, NULL, 0, out);
101} 94}
102#endif
103 95
104static struct shash_alg alg = { 96static struct shash_alg sha1_ssse3_alg = {
105 .digestsize = SHA1_DIGEST_SIZE, 97 .digestsize = SHA1_DIGEST_SIZE,
106 .init = sha1_base_init, 98 .init = sha1_base_init,
107 .update = sha1_ssse3_update, 99 .update = sha1_ssse3_update,
@@ -110,7 +102,7 @@ static struct shash_alg alg = {
110 .descsize = sizeof(struct sha1_state), 102 .descsize = sizeof(struct sha1_state),
111 .base = { 103 .base = {
112 .cra_name = "sha1", 104 .cra_name = "sha1",
113 .cra_driver_name= "sha1-ssse3", 105 .cra_driver_name = "sha1-ssse3",
114 .cra_priority = 150, 106 .cra_priority = 150,
115 .cra_flags = CRYPTO_ALG_TYPE_SHASH, 107 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
116 .cra_blocksize = SHA1_BLOCK_SIZE, 108 .cra_blocksize = SHA1_BLOCK_SIZE,
@@ -118,8 +110,60 @@ static struct shash_alg alg = {
118 } 110 }
119}; 111};
120 112
113static int register_sha1_ssse3(void)
114{
115 if (boot_cpu_has(X86_FEATURE_SSSE3))
116 return crypto_register_shash(&sha1_ssse3_alg);
117 return 0;
118}
119
120static void unregister_sha1_ssse3(void)
121{
122 if (boot_cpu_has(X86_FEATURE_SSSE3))
123 crypto_unregister_shash(&sha1_ssse3_alg);
124}
125
121#ifdef CONFIG_AS_AVX 126#ifdef CONFIG_AS_AVX
122static bool __init avx_usable(void) 127asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
128 unsigned int rounds);
129
130static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
131 unsigned int len)
132{
133 return sha1_update(desc, data, len,
134 (sha1_transform_fn *) sha1_transform_avx);
135}
136
137static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
138 unsigned int len, u8 *out)
139{
140 return sha1_finup(desc, data, len, out,
141 (sha1_transform_fn *) sha1_transform_avx);
142}
143
144static int sha1_avx_final(struct shash_desc *desc, u8 *out)
145{
146 return sha1_avx_finup(desc, NULL, 0, out);
147}
148
149static struct shash_alg sha1_avx_alg = {
150 .digestsize = SHA1_DIGEST_SIZE,
151 .init = sha1_base_init,
152 .update = sha1_avx_update,
153 .final = sha1_avx_final,
154 .finup = sha1_avx_finup,
155 .descsize = sizeof(struct sha1_state),
156 .base = {
157 .cra_name = "sha1",
158 .cra_driver_name = "sha1-avx",
159 .cra_priority = 160,
160 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
161 .cra_blocksize = SHA1_BLOCK_SIZE,
162 .cra_module = THIS_MODULE,
163 }
164};
165
166static bool avx_usable(void)
123{ 167{
124 if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { 168 if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
125 if (cpu_has_avx) 169 if (cpu_has_avx)
@@ -130,55 +174,197 @@ static bool __init avx_usable(void)
130 return true; 174 return true;
131} 175}
132 176
133#ifdef CONFIG_AS_AVX2 177static int register_sha1_avx(void)
134static bool __init avx2_usable(void) 178{
179 if (avx_usable())
180 return crypto_register_shash(&sha1_avx_alg);
181 return 0;
182}
183
184static void unregister_sha1_avx(void)
135{ 185{
136 if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && 186 if (avx_usable())
137 boot_cpu_has(X86_FEATURE_BMI2)) 187 crypto_unregister_shash(&sha1_avx_alg);
188}
189
190#else /* CONFIG_AS_AVX */
191static inline int register_sha1_avx(void) { return 0; }
192static inline void unregister_sha1_avx(void) { }
193#endif /* CONFIG_AS_AVX */
194
195
196#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
197#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
198
199asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
200 unsigned int rounds);
201
202static bool avx2_usable(void)
203{
204 if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
205 && boot_cpu_has(X86_FEATURE_BMI1)
206 && boot_cpu_has(X86_FEATURE_BMI2))
138 return true; 207 return true;
139 208
140 return false; 209 return false;
141} 210}
211
212static void sha1_apply_transform_avx2(u32 *digest, const char *data,
213 unsigned int rounds)
214{
215 /* Select the optimal transform based on data block size */
216 if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
217 sha1_transform_avx2(digest, data, rounds);
218 else
219 sha1_transform_avx(digest, data, rounds);
220}
221
222static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
223 unsigned int len)
224{
225 return sha1_update(desc, data, len,
226 (sha1_transform_fn *) sha1_apply_transform_avx2);
227}
228
229static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
230 unsigned int len, u8 *out)
231{
232 return sha1_finup(desc, data, len, out,
233 (sha1_transform_fn *) sha1_apply_transform_avx2);
234}
235
236static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
237{
238 return sha1_avx2_finup(desc, NULL, 0, out);
239}
240
241static struct shash_alg sha1_avx2_alg = {
242 .digestsize = SHA1_DIGEST_SIZE,
243 .init = sha1_base_init,
244 .update = sha1_avx2_update,
245 .final = sha1_avx2_final,
246 .finup = sha1_avx2_finup,
247 .descsize = sizeof(struct sha1_state),
248 .base = {
249 .cra_name = "sha1",
250 .cra_driver_name = "sha1-avx2",
251 .cra_priority = 170,
252 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
253 .cra_blocksize = SHA1_BLOCK_SIZE,
254 .cra_module = THIS_MODULE,
255 }
256};
257
258static int register_sha1_avx2(void)
259{
260 if (avx2_usable())
261 return crypto_register_shash(&sha1_avx2_alg);
262 return 0;
263}
264
265static void unregister_sha1_avx2(void)
266{
267 if (avx2_usable())
268 crypto_unregister_shash(&sha1_avx2_alg);
269}
270
271#else
272static inline int register_sha1_avx2(void) { return 0; }
273static inline void unregister_sha1_avx2(void) { }
142#endif 274#endif
275
276#ifdef CONFIG_AS_SHA1_NI
277asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
278 unsigned int rounds);
279
280static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
281 unsigned int len)
282{
283 return sha1_update(desc, data, len,
284 (sha1_transform_fn *) sha1_ni_transform);
285}
286
287static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
288 unsigned int len, u8 *out)
289{
290 return sha1_finup(desc, data, len, out,
291 (sha1_transform_fn *) sha1_ni_transform);
292}
293
294static int sha1_ni_final(struct shash_desc *desc, u8 *out)
295{
296 return sha1_ni_finup(desc, NULL, 0, out);
297}
298
299static struct shash_alg sha1_ni_alg = {
300 .digestsize = SHA1_DIGEST_SIZE,
301 .init = sha1_base_init,
302 .update = sha1_ni_update,
303 .final = sha1_ni_final,
304 .finup = sha1_ni_finup,
305 .descsize = sizeof(struct sha1_state),
306 .base = {
307 .cra_name = "sha1",
308 .cra_driver_name = "sha1-ni",
309 .cra_priority = 250,
310 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
311 .cra_blocksize = SHA1_BLOCK_SIZE,
312 .cra_module = THIS_MODULE,
313 }
314};
315
316static int register_sha1_ni(void)
317{
318 if (boot_cpu_has(X86_FEATURE_SHA_NI))
319 return crypto_register_shash(&sha1_ni_alg);
320 return 0;
321}
322
323static void unregister_sha1_ni(void)
324{
325 if (boot_cpu_has(X86_FEATURE_SHA_NI))
326 crypto_unregister_shash(&sha1_ni_alg);
327}
328
329#else
330static inline int register_sha1_ni(void) { return 0; }
331static inline void unregister_sha1_ni(void) { }
143#endif 332#endif
144 333
145static int __init sha1_ssse3_mod_init(void) 334static int __init sha1_ssse3_mod_init(void)
146{ 335{
147 char *algo_name; 336 if (register_sha1_ssse3())
337 goto fail;
148 338
149 /* test for SSSE3 first */ 339 if (register_sha1_avx()) {
150 if (cpu_has_ssse3) { 340 unregister_sha1_ssse3();
151 sha1_transform_asm = sha1_transform_ssse3; 341 goto fail;
152 algo_name = "SSSE3";
153 } 342 }
154 343
155#ifdef CONFIG_AS_AVX 344 if (register_sha1_avx2()) {
156 /* allow AVX to override SSSE3, it's a little faster */ 345 unregister_sha1_avx();
157 if (avx_usable()) { 346 unregister_sha1_ssse3();
158 sha1_transform_asm = sha1_transform_avx; 347 goto fail;
159 algo_name = "AVX";
160#ifdef CONFIG_AS_AVX2
161 /* allow AVX2 to override AVX, it's a little faster */
162 if (avx2_usable()) {
163 sha1_transform_asm = sha1_apply_transform_avx2;
164 algo_name = "AVX2";
165 }
166#endif
167 } 348 }
168#endif
169 349
170 if (sha1_transform_asm) { 350 if (register_sha1_ni()) {
171 pr_info("Using %s optimized SHA-1 implementation\n", algo_name); 351 unregister_sha1_avx2();
172 return crypto_register_shash(&alg); 352 unregister_sha1_avx();
353 unregister_sha1_ssse3();
354 goto fail;
173 } 355 }
174 pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
175 356
357 return 0;
358fail:
176 return -ENODEV; 359 return -ENODEV;
177} 360}
178 361
179static void __exit sha1_ssse3_mod_fini(void) 362static void __exit sha1_ssse3_mod_fini(void)
180{ 363{
181 crypto_unregister_shash(&alg); 364 unregister_sha1_ni();
365 unregister_sha1_avx2();
366 unregister_sha1_avx();
367 unregister_sha1_ssse3();
182} 368}
183 369
184module_init(sha1_ssse3_mod_init); 370module_init(sha1_ssse3_mod_init);
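
With this restructuring each SIMD variant is registered as its own shash algorithm under the shared name "sha1", distinguished only by cra_driver_name and cra_priority (150 for SSSE3, 160 for AVX, 170 for AVX2, 250 for SHA-NI), so the crypto API resolves "sha1" to the fastest implementation the CPU and assembler support. A hedged usage sketch (sha1_pick_demo() is illustrative, not part of the patch) that allocates "sha1" and reports which driver won the priority selection:

#include <crypto/hash.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/printk.h>

static int sha1_pick_demo(void)
{
	struct crypto_shash *tfm = crypto_alloc_shash("sha1", 0, 0);

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* e.g. "sha1-ni" on SHA-NI hardware, "sha1-avx2" on AVX2-only CPUs */
	pr_info("sha1 backed by %s\n",
		crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)));

	crypto_free_shash(tfm);
	return 0;
}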
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 000000000000..748cdf21a938
--- /dev/null
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,353 @@
1/*
2 * Intel SHA Extensions optimized implementation of a SHA-256 update function
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2015 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Sean Gulley <sean.m.gulley@intel.com>
22 * Tim Chen <tim.c.chen@linux.intel.com>
23 *
24 * BSD LICENSE
25 *
26 * Copyright(c) 2015 Intel Corporation.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * * Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * * Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in
36 * the documentation and/or other materials provided with the
37 * distribution.
38 * * Neither the name of Intel Corporation nor the names of its
39 * contributors may be used to endorse or promote products derived
40 * from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 *
54 */
55
56#include <linux/linkage.h>
57
58#define DIGEST_PTR %rdi /* 1st arg */
59#define DATA_PTR %rsi /* 2nd arg */
60#define NUM_BLKS %rdx /* 3rd arg */
61
62#define SHA256CONSTANTS %rax
63
64#define MSG %xmm0
65#define STATE0 %xmm1
66#define STATE1 %xmm2
67#define MSGTMP0 %xmm3
68#define MSGTMP1 %xmm4
69#define MSGTMP2 %xmm5
70#define MSGTMP3 %xmm6
71#define MSGTMP4 %xmm7
72
73#define SHUF_MASK %xmm8
74
75#define ABEF_SAVE %xmm9
76#define CDGH_SAVE %xmm10
77
78/*
79 * Intel SHA Extensions optimized implementation of a SHA-256 update function
80 *
81 * The function takes a pointer to the current hash values, a pointer to the
82 * input data, and a number of 64 byte blocks to process. Once all blocks have
83 * been processed, the digest pointer is updated with the resulting hash value.
84 * The function only processes complete blocks, there is no functionality to
85 * store partial blocks. All message padding and hash value initialization must
86 * be done outside the update function.
87 *
88 * The indented lines in the loop are instructions related to rounds processing.
89 * The non-indented lines are instructions related to the message schedule.
90 *
91 * void sha256_ni_transform(uint32_t *digest, const void *data,
92 uint32_t numBlocks);
93 * digest : pointer to digest
94 * data: pointer to input data
95 * numBlocks: Number of blocks to process
96 */
97
98.text
99.align 32
100ENTRY(sha256_ni_transform)
101
102 shl $6, NUM_BLKS /* convert to bytes */
103 jz .Ldone_hash
104 add DATA_PTR, NUM_BLKS /* pointer to end of data */
105
106 /*
107 * load initial hash values
108 * Need to reorder these appropriately
109 * DCBA, HGFE -> ABEF, CDGH
110 */
111 movdqu 0*16(DIGEST_PTR), STATE0
112 movdqu 1*16(DIGEST_PTR), STATE1
113
114 pshufd $0xB1, STATE0, STATE0 /* CDAB */
115 pshufd $0x1B, STATE1, STATE1 /* EFGH */
116 movdqa STATE0, MSGTMP4
117 palignr $8, STATE1, STATE0 /* ABEF */
118 pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
119
120 movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
121 lea K256(%rip), SHA256CONSTANTS
122
123.Lloop0:
124 /* Save hash values for addition after rounds */
125 movdqa STATE0, ABEF_SAVE
126 movdqa STATE1, CDGH_SAVE
127
128 /* Rounds 0-3 */
129 movdqu 0*16(DATA_PTR), MSG
130 pshufb SHUF_MASK, MSG
131 movdqa MSG, MSGTMP0
132 paddd 0*16(SHA256CONSTANTS), MSG
133 sha256rnds2 STATE0, STATE1
134 pshufd $0x0E, MSG, MSG
135 sha256rnds2 STATE1, STATE0
136
137 /* Rounds 4-7 */
138 movdqu 1*16(DATA_PTR), MSG
139 pshufb SHUF_MASK, MSG
140 movdqa MSG, MSGTMP1
141 paddd 1*16(SHA256CONSTANTS), MSG
142 sha256rnds2 STATE0, STATE1
143 pshufd $0x0E, MSG, MSG
144 sha256rnds2 STATE1, STATE0
145 sha256msg1 MSGTMP1, MSGTMP0
146
147 /* Rounds 8-11 */
148 movdqu 2*16(DATA_PTR), MSG
149 pshufb SHUF_MASK, MSG
150 movdqa MSG, MSGTMP2
151 paddd 2*16(SHA256CONSTANTS), MSG
152 sha256rnds2 STATE0, STATE1
153 pshufd $0x0E, MSG, MSG
154 sha256rnds2 STATE1, STATE0
155 sha256msg1 MSGTMP2, MSGTMP1
156
157 /* Rounds 12-15 */
158 movdqu 3*16(DATA_PTR), MSG
159 pshufb SHUF_MASK, MSG
160 movdqa MSG, MSGTMP3
161 paddd 3*16(SHA256CONSTANTS), MSG
162 sha256rnds2 STATE0, STATE1
163 movdqa MSGTMP3, MSGTMP4
164 palignr $4, MSGTMP2, MSGTMP4
165 paddd MSGTMP4, MSGTMP0
166 sha256msg2 MSGTMP3, MSGTMP0
167 pshufd $0x0E, MSG, MSG
168 sha256rnds2 STATE1, STATE0
169 sha256msg1 MSGTMP3, MSGTMP2
170
171 /* Rounds 16-19 */
172 movdqa MSGTMP0, MSG
173 paddd 4*16(SHA256CONSTANTS), MSG
174 sha256rnds2 STATE0, STATE1
175 movdqa MSGTMP0, MSGTMP4
176 palignr $4, MSGTMP3, MSGTMP4
177 paddd MSGTMP4, MSGTMP1
178 sha256msg2 MSGTMP0, MSGTMP1
179 pshufd $0x0E, MSG, MSG
180 sha256rnds2 STATE1, STATE0
181 sha256msg1 MSGTMP0, MSGTMP3
182
183 /* Rounds 20-23 */
184 movdqa MSGTMP1, MSG
185 paddd 5*16(SHA256CONSTANTS), MSG
186 sha256rnds2 STATE0, STATE1
187 movdqa MSGTMP1, MSGTMP4
188 palignr $4, MSGTMP0, MSGTMP4
189 paddd MSGTMP4, MSGTMP2
190 sha256msg2 MSGTMP1, MSGTMP2
191 pshufd $0x0E, MSG, MSG
192 sha256rnds2 STATE1, STATE0
193 sha256msg1 MSGTMP1, MSGTMP0
194
195 /* Rounds 24-27 */
196 movdqa MSGTMP2, MSG
197 paddd 6*16(SHA256CONSTANTS), MSG
198 sha256rnds2 STATE0, STATE1
199 movdqa MSGTMP2, MSGTMP4
200 palignr $4, MSGTMP1, MSGTMP4
201 paddd MSGTMP4, MSGTMP3
202 sha256msg2 MSGTMP2, MSGTMP3
203 pshufd $0x0E, MSG, MSG
204 sha256rnds2 STATE1, STATE0
205 sha256msg1 MSGTMP2, MSGTMP1
206
207 /* Rounds 28-31 */
208 movdqa MSGTMP3, MSG
209 paddd 7*16(SHA256CONSTANTS), MSG
210 sha256rnds2 STATE0, STATE1
211 movdqa MSGTMP3, MSGTMP4
212 palignr $4, MSGTMP2, MSGTMP4
213 paddd MSGTMP4, MSGTMP0
214 sha256msg2 MSGTMP3, MSGTMP0
215 pshufd $0x0E, MSG, MSG
216 sha256rnds2 STATE1, STATE0
217 sha256msg1 MSGTMP3, MSGTMP2
218
219 /* Rounds 32-35 */
220 movdqa MSGTMP0, MSG
221 paddd 8*16(SHA256CONSTANTS), MSG
222 sha256rnds2 STATE0, STATE1
223 movdqa MSGTMP0, MSGTMP4
224 palignr $4, MSGTMP3, MSGTMP4
225 paddd MSGTMP4, MSGTMP1
226 sha256msg2 MSGTMP0, MSGTMP1
227 pshufd $0x0E, MSG, MSG
228 sha256rnds2 STATE1, STATE0
229 sha256msg1 MSGTMP0, MSGTMP3
230
231 /* Rounds 36-39 */
232 movdqa MSGTMP1, MSG
233 paddd 9*16(SHA256CONSTANTS), MSG
234 sha256rnds2 STATE0, STATE1
235 movdqa MSGTMP1, MSGTMP4
236 palignr $4, MSGTMP0, MSGTMP4
237 paddd MSGTMP4, MSGTMP2
238 sha256msg2 MSGTMP1, MSGTMP2
239 pshufd $0x0E, MSG, MSG
240 sha256rnds2 STATE1, STATE0
241 sha256msg1 MSGTMP1, MSGTMP0
242
243 /* Rounds 40-43 */
244 movdqa MSGTMP2, MSG
245 paddd 10*16(SHA256CONSTANTS), MSG
246 sha256rnds2 STATE0, STATE1
247 movdqa MSGTMP2, MSGTMP4
248 palignr $4, MSGTMP1, MSGTMP4
249 paddd MSGTMP4, MSGTMP3
250 sha256msg2 MSGTMP2, MSGTMP3
251 pshufd $0x0E, MSG, MSG
252 sha256rnds2 STATE1, STATE0
253 sha256msg1 MSGTMP2, MSGTMP1
254
255 /* Rounds 44-47 */
256 movdqa MSGTMP3, MSG
257 paddd 11*16(SHA256CONSTANTS), MSG
258 sha256rnds2 STATE0, STATE1
259 movdqa MSGTMP3, MSGTMP4
260 palignr $4, MSGTMP2, MSGTMP4
261 paddd MSGTMP4, MSGTMP0
262 sha256msg2 MSGTMP3, MSGTMP0
263 pshufd $0x0E, MSG, MSG
264 sha256rnds2 STATE1, STATE0
265 sha256msg1 MSGTMP3, MSGTMP2
266
267 /* Rounds 48-51 */
268 movdqa MSGTMP0, MSG
269 paddd 12*16(SHA256CONSTANTS), MSG
270 sha256rnds2 STATE0, STATE1
271 movdqa MSGTMP0, MSGTMP4
272 palignr $4, MSGTMP3, MSGTMP4
273 paddd MSGTMP4, MSGTMP1
274 sha256msg2 MSGTMP0, MSGTMP1
275 pshufd $0x0E, MSG, MSG
276 sha256rnds2 STATE1, STATE0
277 sha256msg1 MSGTMP0, MSGTMP3
278
279 /* Rounds 52-55 */
280 movdqa MSGTMP1, MSG
281 paddd 13*16(SHA256CONSTANTS), MSG
282 sha256rnds2 STATE0, STATE1
283 movdqa MSGTMP1, MSGTMP4
284 palignr $4, MSGTMP0, MSGTMP4
285 paddd MSGTMP4, MSGTMP2
286 sha256msg2 MSGTMP1, MSGTMP2
287 pshufd $0x0E, MSG, MSG
288 sha256rnds2 STATE1, STATE0
289
290 /* Rounds 56-59 */
291 movdqa MSGTMP2, MSG
292 paddd 14*16(SHA256CONSTANTS), MSG
293 sha256rnds2 STATE0, STATE1
294 movdqa MSGTMP2, MSGTMP4
295 palignr $4, MSGTMP1, MSGTMP4
296 paddd MSGTMP4, MSGTMP3
297 sha256msg2 MSGTMP2, MSGTMP3
298 pshufd $0x0E, MSG, MSG
299 sha256rnds2 STATE1, STATE0
300
301 /* Rounds 60-63 */
302 movdqa MSGTMP3, MSG
303 paddd 15*16(SHA256CONSTANTS), MSG
304 sha256rnds2 STATE0, STATE1
305 pshufd $0x0E, MSG, MSG
306 sha256rnds2 STATE1, STATE0
307
308 /* Add current hash values with previously saved */
309 paddd ABEF_SAVE, STATE0
310 paddd CDGH_SAVE, STATE1
311
312 /* Increment data pointer and loop if more to process */
313 add $64, DATA_PTR
314 cmp NUM_BLKS, DATA_PTR
315 jne .Lloop0
316
317 /* Write hash values back in the correct order */
318 pshufd $0x1B, STATE0, STATE0 /* FEBA */
319 pshufd $0xB1, STATE1, STATE1 /* DCHG */
320 movdqa STATE0, MSGTMP4
321 pblendw $0xF0, STATE1, STATE0 /* DCBA */
322 palignr $8, MSGTMP4, STATE1 /* HGFE */
323
324 movdqu STATE0, 0*16(DIGEST_PTR)
325 movdqu STATE1, 1*16(DIGEST_PTR)
326
327.Ldone_hash:
328
329 ret
330ENDPROC(sha256_ni_transform)
331
332.data
333.align 64
334K256:
335 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
336 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
337 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
338 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
339 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
340 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
341 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
342 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
343 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
344 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
345 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
346 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
347 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
348 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
349 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
350 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
351
352PSHUFFLE_BYTE_FLIP_MASK:
353 .octa 0x0c0d0e0f08090a0b0405060700010203
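
As the comments in sha256_ni_transform() note, the digest buffer keeps the eight state words in standard a..h order; the DCBA/HGFE to ABEF/CDGH reshuffle demanded by sha256rnds2 happens entirely inside the routine. A small illustrative sketch of that caller-visible state (not part of the patch): digest[0..7] starts from the FIPS 180-4 initial hash values, which sha256_base_init() in the glue code installs before the first call.

#include <linux/types.h>

/* SHA-256 initial hash values, in the digest[0..7] order the transform expects */
static const u32 sha256_init_state[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};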
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 0e0e85aea634..5f4d6086dc59 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -42,19 +42,10 @@
42 42
43asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, 43asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
44 u64 rounds); 44 u64 rounds);
45#ifdef CONFIG_AS_AVX 45typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
46asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
47 u64 rounds);
48#endif
49#ifdef CONFIG_AS_AVX2
50asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
51 u64 rounds);
52#endif
53
54static void (*sha256_transform_asm)(u32 *, const char *, u64);
55 46
56static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, 47static int sha256_update(struct shash_desc *desc, const u8 *data,
57 unsigned int len) 48 unsigned int len, sha256_transform_fn *sha256_xform)
58{ 49{
59 struct sha256_state *sctx = shash_desc_ctx(desc); 50 struct sha256_state *sctx = shash_desc_ctx(desc);
60 51
@@ -67,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
67 58
68 kernel_fpu_begin(); 59 kernel_fpu_begin();
69 sha256_base_do_update(desc, data, len, 60 sha256_base_do_update(desc, data, len,
70 (sha256_block_fn *)sha256_transform_asm); 61 (sha256_block_fn *)sha256_xform);
71 kernel_fpu_end(); 62 kernel_fpu_end();
72 63
73 return 0; 64 return 0;
74} 65}
75 66
76static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, 67static int sha256_finup(struct shash_desc *desc, const u8 *data,
77 unsigned int len, u8 *out) 68 unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
78{ 69{
79 if (!irq_fpu_usable()) 70 if (!irq_fpu_usable())
80 return crypto_sha256_finup(desc, data, len, out); 71 return crypto_sha256_finup(desc, data, len, out);
@@ -82,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
82 kernel_fpu_begin(); 73 kernel_fpu_begin();
83 if (len) 74 if (len)
84 sha256_base_do_update(desc, data, len, 75 sha256_base_do_update(desc, data, len,
85 (sha256_block_fn *)sha256_transform_asm); 76 (sha256_block_fn *)sha256_xform);
86 sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); 77 sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
87 kernel_fpu_end(); 78 kernel_fpu_end();
88 79
89 return sha256_base_finish(desc, out); 80 return sha256_base_finish(desc, out);
90} 81}
91 82
83static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
84 unsigned int len)
85{
86 return sha256_update(desc, data, len, sha256_transform_ssse3);
87}
88
89static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
90 unsigned int len, u8 *out)
91{
92 return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
93}
94
92/* Add padding and return the message digest. */ 95/* Add padding and return the message digest. */
93static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) 96static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
94{ 97{
95 return sha256_ssse3_finup(desc, NULL, 0, out); 98 return sha256_ssse3_finup(desc, NULL, 0, out);
96} 99}
97 100
98static struct shash_alg algs[] = { { 101static struct shash_alg sha256_ssse3_algs[] = { {
99 .digestsize = SHA256_DIGEST_SIZE, 102 .digestsize = SHA256_DIGEST_SIZE,
100 .init = sha256_base_init, 103 .init = sha256_base_init,
101 .update = sha256_ssse3_update, 104 .update = sha256_ssse3_update,
@@ -127,8 +130,75 @@ static struct shash_alg algs[] = { {
127 } 130 }
128} }; 131} };
129 132
133static int register_sha256_ssse3(void)
134{
135 if (boot_cpu_has(X86_FEATURE_SSSE3))
136 return crypto_register_shashes(sha256_ssse3_algs,
137 ARRAY_SIZE(sha256_ssse3_algs));
138 return 0;
139}
140
141static void unregister_sha256_ssse3(void)
142{
143 if (boot_cpu_has(X86_FEATURE_SSSE3))
144 crypto_unregister_shashes(sha256_ssse3_algs,
145 ARRAY_SIZE(sha256_ssse3_algs));
146}
147
130#ifdef CONFIG_AS_AVX 148#ifdef CONFIG_AS_AVX
131static bool __init avx_usable(void) 149asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
150 u64 rounds);
151
152static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
153 unsigned int len)
154{
155 return sha256_update(desc, data, len, sha256_transform_avx);
156}
157
158static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
159 unsigned int len, u8 *out)
160{
161 return sha256_finup(desc, data, len, out, sha256_transform_avx);
162}
163
164static int sha256_avx_final(struct shash_desc *desc, u8 *out)
165{
166 return sha256_avx_finup(desc, NULL, 0, out);
167}
168
169static struct shash_alg sha256_avx_algs[] = { {
170 .digestsize = SHA256_DIGEST_SIZE,
171 .init = sha256_base_init,
172 .update = sha256_avx_update,
173 .final = sha256_avx_final,
174 .finup = sha256_avx_finup,
175 .descsize = sizeof(struct sha256_state),
176 .base = {
177 .cra_name = "sha256",
178 .cra_driver_name = "sha256-avx",
179 .cra_priority = 160,
180 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
181 .cra_blocksize = SHA256_BLOCK_SIZE,
182 .cra_module = THIS_MODULE,
183 }
184}, {
185 .digestsize = SHA224_DIGEST_SIZE,
186 .init = sha224_base_init,
187 .update = sha256_avx_update,
188 .final = sha256_avx_final,
189 .finup = sha256_avx_finup,
190 .descsize = sizeof(struct sha256_state),
191 .base = {
192 .cra_name = "sha224",
193 .cra_driver_name = "sha224-avx",
194 .cra_priority = 160,
195 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
196 .cra_blocksize = SHA224_BLOCK_SIZE,
197 .cra_module = THIS_MODULE,
198 }
199} };
200
201static bool avx_usable(void)
132{ 202{
133 if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { 203 if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
134 if (cpu_has_avx) 204 if (cpu_has_avx)
@@ -138,47 +208,216 @@ static bool __init avx_usable(void)
138 208
139 return true; 209 return true;
140} 210}
141#endif
142 211
143static int __init sha256_ssse3_mod_init(void) 212static int register_sha256_avx(void)
144{ 213{
145 /* test for SSSE3 first */ 214 if (avx_usable())
146 if (cpu_has_ssse3) 215 return crypto_register_shashes(sha256_avx_algs,
147 sha256_transform_asm = sha256_transform_ssse3; 216 ARRAY_SIZE(sha256_avx_algs));
217 return 0;
218}
148 219
149#ifdef CONFIG_AS_AVX 220static void unregister_sha256_avx(void)
150 /* allow AVX to override SSSE3, it's a little faster */ 221{
151 if (avx_usable()) { 222 if (avx_usable())
152#ifdef CONFIG_AS_AVX2 223 crypto_unregister_shashes(sha256_avx_algs,
153 if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) 224 ARRAY_SIZE(sha256_avx_algs));
154 sha256_transform_asm = sha256_transform_rorx; 225}
155 else 226
227#else
228static inline int register_sha256_avx(void) { return 0; }
229static inline void unregister_sha256_avx(void) { }
156#endif 230#endif
157 sha256_transform_asm = sha256_transform_avx; 231
232#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
233asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
234 u64 rounds);
235
236static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
237 unsigned int len)
238{
239 return sha256_update(desc, data, len, sha256_transform_rorx);
240}
241
242static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
243 unsigned int len, u8 *out)
244{
245 return sha256_finup(desc, data, len, out, sha256_transform_rorx);
246}
247
248static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
249{
250 return sha256_avx2_finup(desc, NULL, 0, out);
251}
252
253static struct shash_alg sha256_avx2_algs[] = { {
254 .digestsize = SHA256_DIGEST_SIZE,
255 .init = sha256_base_init,
256 .update = sha256_avx2_update,
257 .final = sha256_avx2_final,
258 .finup = sha256_avx2_finup,
259 .descsize = sizeof(struct sha256_state),
260 .base = {
261 .cra_name = "sha256",
262 .cra_driver_name = "sha256-avx2",
263 .cra_priority = 170,
264 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
265 .cra_blocksize = SHA256_BLOCK_SIZE,
266 .cra_module = THIS_MODULE,
158 } 267 }
159#endif 268}, {
269 .digestsize = SHA224_DIGEST_SIZE,
270 .init = sha224_base_init,
271 .update = sha256_avx2_update,
272 .final = sha256_avx2_final,
273 .finup = sha256_avx2_finup,
274 .descsize = sizeof(struct sha256_state),
275 .base = {
276 .cra_name = "sha224",
277 .cra_driver_name = "sha224-avx2",
278 .cra_priority = 170,
279 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
280 .cra_blocksize = SHA224_BLOCK_SIZE,
281 .cra_module = THIS_MODULE,
282 }
283} };
160 284
161 if (sha256_transform_asm) { 285static bool avx2_usable(void)
162#ifdef CONFIG_AS_AVX 286{
163 if (sha256_transform_asm == sha256_transform_avx) 287 if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
164 pr_info("Using AVX optimized SHA-256 implementation\n"); 288 boot_cpu_has(X86_FEATURE_BMI2))
165#ifdef CONFIG_AS_AVX2 289 return true;
166 else if (sha256_transform_asm == sha256_transform_rorx) 290
167 pr_info("Using AVX2 optimized SHA-256 implementation\n"); 291 return false;
292}
293
294static int register_sha256_avx2(void)
295{
296 if (avx2_usable())
297 return crypto_register_shashes(sha256_avx2_algs,
298 ARRAY_SIZE(sha256_avx2_algs));
299 return 0;
300}
301
302static void unregister_sha256_avx2(void)
303{
304 if (avx2_usable())
305 crypto_unregister_shashes(sha256_avx2_algs,
306 ARRAY_SIZE(sha256_avx2_algs));
307}
308
309#else
310static inline int register_sha256_avx2(void) { return 0; }
311static inline void unregister_sha256_avx2(void) { }
168#endif 312#endif
169 else 313
314#ifdef CONFIG_AS_SHA256_NI
315asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
316 u64 rounds); /*unsigned int rounds);*/
317
318static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
319 unsigned int len)
320{
321 return sha256_update(desc, data, len, sha256_ni_transform);
322}
323
324static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
325 unsigned int len, u8 *out)
326{
327 return sha256_finup(desc, data, len, out, sha256_ni_transform);
328}
329
330static int sha256_ni_final(struct shash_desc *desc, u8 *out)
331{
332 return sha256_ni_finup(desc, NULL, 0, out);
333}
334
335static struct shash_alg sha256_ni_algs[] = { {
336 .digestsize = SHA256_DIGEST_SIZE,
337 .init = sha256_base_init,
338 .update = sha256_ni_update,
339 .final = sha256_ni_final,
340 .finup = sha256_ni_finup,
341 .descsize = sizeof(struct sha256_state),
342 .base = {
343 .cra_name = "sha256",
344 .cra_driver_name = "sha256-ni",
345 .cra_priority = 250,
346 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
347 .cra_blocksize = SHA256_BLOCK_SIZE,
348 .cra_module = THIS_MODULE,
349 }
350}, {
351 .digestsize = SHA224_DIGEST_SIZE,
352 .init = sha224_base_init,
353 .update = sha256_ni_update,
354 .final = sha256_ni_final,
355 .finup = sha256_ni_finup,
356 .descsize = sizeof(struct sha256_state),
357 .base = {
358 .cra_name = "sha224",
359 .cra_driver_name = "sha224-ni",
360 .cra_priority = 250,
361 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
362 .cra_blocksize = SHA224_BLOCK_SIZE,
363 .cra_module = THIS_MODULE,
364 }
365} };
366
367static int register_sha256_ni(void)
368{
369 if (boot_cpu_has(X86_FEATURE_SHA_NI))
370 return crypto_register_shashes(sha256_ni_algs,
371 ARRAY_SIZE(sha256_ni_algs));
372 return 0;
373}
374
375static void unregister_sha256_ni(void)
376{
377 if (boot_cpu_has(X86_FEATURE_SHA_NI))
378 crypto_unregister_shashes(sha256_ni_algs,
379 ARRAY_SIZE(sha256_ni_algs));
380}
381
382#else
383static inline int register_sha256_ni(void) { return 0; }
384static inline void unregister_sha256_ni(void) { }
170#endif 385#endif
171 pr_info("Using SSSE3 optimized SHA-256 implementation\n"); 386
172 return crypto_register_shashes(algs, ARRAY_SIZE(algs)); 387static int __init sha256_ssse3_mod_init(void)
388{
389 if (register_sha256_ssse3())
390 goto fail;
391
392 if (register_sha256_avx()) {
393 unregister_sha256_ssse3();
394 goto fail;
173 } 395 }
174 pr_info("Neither AVX nor SSSE3 is available/usable.\n");
175 396
397 if (register_sha256_avx2()) {
398 unregister_sha256_avx();
399 unregister_sha256_ssse3();
400 goto fail;
401 }
402
403 if (register_sha256_ni()) {
404 unregister_sha256_avx2();
405 unregister_sha256_avx();
406 unregister_sha256_ssse3();
407 goto fail;
408 }
409
410 return 0;
411fail:
176 return -ENODEV; 412 return -ENODEV;
177} 413}
178 414
179static void __exit sha256_ssse3_mod_fini(void) 415static void __exit sha256_ssse3_mod_fini(void)
180{ 416{
181 crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); 417 unregister_sha256_ni();
418 unregister_sha256_avx2();
419 unregister_sha256_avx();
420 unregister_sha256_ssse3();
182} 421}
183 422
184module_init(sha256_ssse3_mod_init); 423module_init(sha256_ssse3_mod_init);
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0c8c38c101ac..34e5083d6f36 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -41,19 +41,11 @@
41 41
42asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, 42asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
43 u64 rounds); 43 u64 rounds);
44#ifdef CONFIG_AS_AVX
45asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
46 u64 rounds);
47#endif
48#ifdef CONFIG_AS_AVX2
49asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
50 u64 rounds);
51#endif
52 44
53static void (*sha512_transform_asm)(u64 *, const char *, u64); 45typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds);
54 46
55static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 47static int sha512_update(struct shash_desc *desc, const u8 *data,
56 unsigned int len) 48 unsigned int len, sha512_transform_fn *sha512_xform)
57{ 49{
58 struct sha512_state *sctx = shash_desc_ctx(desc); 50 struct sha512_state *sctx = shash_desc_ctx(desc);
59 51
@@ -66,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
66 58
67 kernel_fpu_begin(); 59 kernel_fpu_begin();
68 sha512_base_do_update(desc, data, len, 60 sha512_base_do_update(desc, data, len,
69 (sha512_block_fn *)sha512_transform_asm); 61 (sha512_block_fn *)sha512_xform);
70 kernel_fpu_end(); 62 kernel_fpu_end();
71 63
72 return 0; 64 return 0;
73} 65}
74 66
75static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, 67static int sha512_finup(struct shash_desc *desc, const u8 *data,
76 unsigned int len, u8 *out) 68 unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
77{ 69{
78 if (!irq_fpu_usable()) 70 if (!irq_fpu_usable())
79 return crypto_sha512_finup(desc, data, len, out); 71 return crypto_sha512_finup(desc, data, len, out);
@@ -81,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
81 kernel_fpu_begin(); 73 kernel_fpu_begin();
82 if (len) 74 if (len)
83 sha512_base_do_update(desc, data, len, 75 sha512_base_do_update(desc, data, len,
84 (sha512_block_fn *)sha512_transform_asm); 76 (sha512_block_fn *)sha512_xform);
85 sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); 77 sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform);
86 kernel_fpu_end(); 78 kernel_fpu_end();
87 79
88 return sha512_base_finish(desc, out); 80 return sha512_base_finish(desc, out);
89} 81}
90 82
83static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
84 unsigned int len)
85{
86 return sha512_update(desc, data, len, sha512_transform_ssse3);
87}
88
89static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
90 unsigned int len, u8 *out)
91{
92 return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
93}
94
91/* Add padding and return the message digest. */ 95/* Add padding and return the message digest. */
92static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) 96static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
93{ 97{
94 return sha512_ssse3_finup(desc, NULL, 0, out); 98 return sha512_ssse3_finup(desc, NULL, 0, out);
95} 99}
96 100
97static struct shash_alg algs[] = { { 101static struct shash_alg sha512_ssse3_algs[] = { {
98 .digestsize = SHA512_DIGEST_SIZE, 102 .digestsize = SHA512_DIGEST_SIZE,
99 .init = sha512_base_init, 103 .init = sha512_base_init,
100 .update = sha512_ssse3_update, 104 .update = sha512_ssse3_update,
@@ -126,8 +130,25 @@ static struct shash_alg algs[] = { {
126 } 130 }
127} }; 131} };
128 132
133static int register_sha512_ssse3(void)
134{
135 if (boot_cpu_has(X86_FEATURE_SSSE3))
136 return crypto_register_shashes(sha512_ssse3_algs,
137 ARRAY_SIZE(sha512_ssse3_algs));
138 return 0;
139}
140
141static void unregister_sha512_ssse3(void)
142{
143 if (boot_cpu_has(X86_FEATURE_SSSE3))
144 crypto_unregister_shashes(sha512_ssse3_algs,
145 ARRAY_SIZE(sha512_ssse3_algs));
146}
147
129#ifdef CONFIG_AS_AVX 148#ifdef CONFIG_AS_AVX
130static bool __init avx_usable(void) 149asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
150 u64 rounds);
151static bool avx_usable(void)
131{ 152{
132 if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { 153 if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
133 if (cpu_has_avx) 154 if (cpu_has_avx)
@@ -137,47 +158,185 @@ static bool __init avx_usable(void)
137 158
138 return true; 159 return true;
139} 160}
140#endif
141 161
142static int __init sha512_ssse3_mod_init(void) 162static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
163 unsigned int len)
143{ 164{
144 /* test for SSSE3 first */ 165 return sha512_update(desc, data, len, sha512_transform_avx);
145 if (cpu_has_ssse3) 166}
146 sha512_transform_asm = sha512_transform_ssse3;
147 167
148#ifdef CONFIG_AS_AVX 168static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
149 /* allow AVX to override SSSE3, it's a little faster */ 169 unsigned int len, u8 *out)
150 if (avx_usable()) { 170{
151#ifdef CONFIG_AS_AVX2 171 return sha512_finup(desc, data, len, out, sha512_transform_avx);
152 if (boot_cpu_has(X86_FEATURE_AVX2)) 172}
153 sha512_transform_asm = sha512_transform_rorx; 173
154 else 174/* Add padding and return the message digest. */
155#endif 175static int sha512_avx_final(struct shash_desc *desc, u8 *out)
156 sha512_transform_asm = sha512_transform_avx; 176{
177 return sha512_avx_finup(desc, NULL, 0, out);
178}
179
180static struct shash_alg sha512_avx_algs[] = { {
181 .digestsize = SHA512_DIGEST_SIZE,
182 .init = sha512_base_init,
183 .update = sha512_avx_update,
184 .final = sha512_avx_final,
185 .finup = sha512_avx_finup,
186 .descsize = sizeof(struct sha512_state),
187 .base = {
188 .cra_name = "sha512",
189 .cra_driver_name = "sha512-avx",
190 .cra_priority = 160,
191 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
192 .cra_blocksize = SHA512_BLOCK_SIZE,
193 .cra_module = THIS_MODULE,
157 } 194 }
158#endif 195}, {
196 .digestsize = SHA384_DIGEST_SIZE,
197 .init = sha384_base_init,
198 .update = sha512_avx_update,
199 .final = sha512_avx_final,
200 .finup = sha512_avx_finup,
201 .descsize = sizeof(struct sha512_state),
202 .base = {
203 .cra_name = "sha384",
204 .cra_driver_name = "sha384-avx",
205 .cra_priority = 160,
206 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
207 .cra_blocksize = SHA384_BLOCK_SIZE,
208 .cra_module = THIS_MODULE,
209 }
210} };
159 211
160 if (sha512_transform_asm) { 212static int register_sha512_avx(void)
161#ifdef CONFIG_AS_AVX 213{
162 if (sha512_transform_asm == sha512_transform_avx) 214 if (avx_usable())
163 pr_info("Using AVX optimized SHA-512 implementation\n"); 215 return crypto_register_shashes(sha512_avx_algs,
164#ifdef CONFIG_AS_AVX2 216 ARRAY_SIZE(sha512_avx_algs));
165 else if (sha512_transform_asm == sha512_transform_rorx) 217 return 0;
166 pr_info("Using AVX2 optimized SHA-512 implementation\n"); 218}
219
220static void unregister_sha512_avx(void)
221{
222 if (avx_usable())
223 crypto_unregister_shashes(sha512_avx_algs,
224 ARRAY_SIZE(sha512_avx_algs));
225}
226#else
227static inline int register_sha512_avx(void) { return 0; }
228static inline void unregister_sha512_avx(void) { }
167#endif 229#endif
168 else 230
231#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
232asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
233 u64 rounds);
234
235static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
236 unsigned int len)
237{
238 return sha512_update(desc, data, len, sha512_transform_rorx);
239}
240
241static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
242 unsigned int len, u8 *out)
243{
244 return sha512_finup(desc, data, len, out, sha512_transform_rorx);
245}
246
247/* Add padding and return the message digest. */
248static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
249{
250 return sha512_avx2_finup(desc, NULL, 0, out);
251}
252
253static struct shash_alg sha512_avx2_algs[] = { {
254 .digestsize = SHA512_DIGEST_SIZE,
255 .init = sha512_base_init,
256 .update = sha512_avx2_update,
257 .final = sha512_avx2_final,
258 .finup = sha512_avx2_finup,
259 .descsize = sizeof(struct sha512_state),
260 .base = {
261 .cra_name = "sha512",
262 .cra_driver_name = "sha512-avx2",
263 .cra_priority = 170,
264 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
265 .cra_blocksize = SHA512_BLOCK_SIZE,
266 .cra_module = THIS_MODULE,
267 }
268}, {
269 .digestsize = SHA384_DIGEST_SIZE,
270 .init = sha384_base_init,
271 .update = sha512_avx2_update,
272 .final = sha512_avx2_final,
273 .finup = sha512_avx2_finup,
274 .descsize = sizeof(struct sha512_state),
275 .base = {
276 .cra_name = "sha384",
277 .cra_driver_name = "sha384-avx2",
278 .cra_priority = 170,
279 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
280 .cra_blocksize = SHA384_BLOCK_SIZE,
281 .cra_module = THIS_MODULE,
282 }
283} };
284
285static bool avx2_usable(void)
286{
287 if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
288 boot_cpu_has(X86_FEATURE_BMI2))
289 return true;
290
291 return false;
292}
293
294static int register_sha512_avx2(void)
295{
296 if (avx2_usable())
297 return crypto_register_shashes(sha512_avx2_algs,
298 ARRAY_SIZE(sha512_avx2_algs));
299 return 0;
300}
301
302static void unregister_sha512_avx2(void)
303{
304 if (avx2_usable())
305 crypto_unregister_shashes(sha512_avx2_algs,
306 ARRAY_SIZE(sha512_avx2_algs));
307}
308#else
309static inline int register_sha512_avx2(void) { return 0; }
310static inline void unregister_sha512_avx2(void) { }
169#endif 311#endif
170 pr_info("Using SSSE3 optimized SHA-512 implementation\n"); 312
171 return crypto_register_shashes(algs, ARRAY_SIZE(algs)); 313static int __init sha512_ssse3_mod_init(void)
314{
315
316 if (register_sha512_ssse3())
317 goto fail;
318
319 if (register_sha512_avx()) {
320 unregister_sha512_ssse3();
321 goto fail;
172 } 322 }
173 pr_info("Neither AVX nor SSSE3 is available/usable.\n");
174 323
324 if (register_sha512_avx2()) {
325 unregister_sha512_avx();
326 unregister_sha512_ssse3();
327 goto fail;
328 }
329
330 return 0;
331fail:
175 return -ENODEV; 332 return -ENODEV;
176} 333}
177 334
178static void __exit sha512_ssse3_mod_fini(void) 335static void __exit sha512_ssse3_mod_fini(void)
179{ 336{
180 crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); 337 unregister_sha512_avx2();
338 unregister_sha512_avx();
339 unregister_sha512_ssse3();
181} 340}
182 341
183module_init(sha512_ssse3_mod_init); 342module_init(sha512_ssse3_mod_init);
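Editor's note: the per-variant cra_priority values in the hunks above (170 for "sha512-avx2", 160 for "sha512-avx") are what lets all implementations coexist; the crypto core hands out the highest-priority usable driver when a caller asks for "sha512" by name. A hedged sketch of what a kernel user of that API might do (not part of this patch, and assuming the long-standing crypto_alloc_shash()/crypto_tfm_alg_driver_name() helpers):

/* Sketch only: observing which registered driver backs "sha512". */
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/printk.h>

static void show_sha512_driver(void)
{
	struct crypto_shash *tfm = crypto_alloc_shash("sha512", 0, 0);

	if (IS_ERR(tfm))
		return;
	/* Expected to print "sha512-avx2" (prio 170) when AVX2+BMI2 are
	 * usable, otherwise "sha512-avx" (160) or a lower-priority entry. */
	pr_info("sha512 backed by %s\n",
		crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)));
	crypto_free_shash(tfm);
}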
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 53616ca03244..a55697d19824 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -509,6 +509,17 @@ END(irq_entries_start)
509 * tracking that we're in kernel mode. 509 * tracking that we're in kernel mode.
510 */ 510 */
511 SWAPGS 511 SWAPGS
512
513 /*
514 * We need to tell lockdep that IRQs are off. We can't do this until
515 * we fix gsbase, and we should do it before enter_from_user_mode
 516 * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
517 * the simplest way to handle it is to just call it twice if
518 * we enter from user mode. There's no reason to optimize this since
519 * TRACE_IRQS_OFF is a no-op if lockdep is off.
520 */
521 TRACE_IRQS_OFF
522
512#ifdef CONFIG_CONTEXT_TRACKING 523#ifdef CONFIG_CONTEXT_TRACKING
513 call enter_from_user_mode 524 call enter_from_user_mode
514#endif 525#endif
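Editor's note: the comment in the hunk above leans on TRACE_IRQS_OFF being idempotent, so issuing it both in the entry stub and again on the common path is harmless. A toy model (not kernel code, names invented) of why a second call is a no-op:

/* Toy model of an idempotent "IRQs are now off" notification. */
#include <stdbool.h>
#include <stdio.h>

static bool hardirqs_off;	/* lockdep's view of the IRQ state */

static void trace_irqs_off(void)
{
	if (hardirqs_off)	/* already recorded: second call is a no-op */
		return;
	hardirqs_off = true;
	puts("lockdep: hardirqs off");
}

int main(void)
{
	trace_irqs_off();	/* entry stub, right after fixing gsbase  */
	trace_irqs_off();	/* common path calls it again: still fine */
	return 0;
}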
@@ -1049,12 +1060,18 @@ ENTRY(error_entry)
1049 SWAPGS 1060 SWAPGS
1050 1061
1051.Lerror_entry_from_usermode_after_swapgs: 1062.Lerror_entry_from_usermode_after_swapgs:
1063 /*
1064 * We need to tell lockdep that IRQs are off. We can't do this until
1065 * we fix gsbase, and we should do it before enter_from_user_mode
1066 * (which can take locks).
1067 */
1068 TRACE_IRQS_OFF
1052#ifdef CONFIG_CONTEXT_TRACKING 1069#ifdef CONFIG_CONTEXT_TRACKING
1053 call enter_from_user_mode 1070 call enter_from_user_mode
1054#endif 1071#endif
1072 ret
1055 1073
1056.Lerror_entry_done: 1074.Lerror_entry_done:
1057
1058 TRACE_IRQS_OFF 1075 TRACE_IRQS_OFF
1059 ret 1076 ret
1060 1077
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index caa2c712d1e7..f17705e1332c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -382,3 +382,4 @@
382373 i386 shutdown sys_shutdown 382373 i386 shutdown sys_shutdown
383374 i386 userfaultfd sys_userfaultfd 383374 i386 userfaultfd sys_userfaultfd
384375 i386 membarrier sys_membarrier 384375 i386 membarrier sys_membarrier
385376 i386 mlock2 sys_mlock2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842fdf1f6..314a90bfc09c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,7 @@
331322 64 execveat stub_execveat 331322 64 execveat stub_execveat
332323 common userfaultfd sys_userfaultfd 332323 common userfaultfd sys_userfaultfd
333324 common membarrier sys_membarrier 333324 common membarrier sys_membarrier
334325 common mlock2 sys_mlock2
334 335
335# 336#
336# x32-specific system call numbers start at 512 to avoid cache impact 337# x32-specific system call numbers start at 512 to avoid cache impact
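Editor's note: the two table entries above wire up the new mlock2 syscall as number 376 on i386 and 325 on x86-64. As a hedged userspace example (not part of the patch; the fallback definitions of __NR_mlock2 and MLOCK_ONFAULT are taken from the table entry and the generic uapi headers, since C libraries of that era shipped no wrapper), it can be invoked via syscall(2):

/* Example only: calling the new mlock2 syscall directly. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_mlock2
#define __NR_mlock2 325		/* x86-64 number added by this table entry */
#endif
#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01	/* lock pages as they are faulted in */
#endif

int main(void)
{
	size_t len = 4096;
	void *buf = malloc(len);

	if (!buf)
		return 1;
	if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT) != 0)
		perror("mlock2");
	free(buf);
	return 0;
}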
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 04e9d023168f..1c0b43724ce3 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page);
68void __kunmap_atomic(void *kvaddr); 68void __kunmap_atomic(void *kvaddr);
69void *kmap_atomic_pfn(unsigned long pfn); 69void *kmap_atomic_pfn(unsigned long pfn);
70void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); 70void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
71struct page *kmap_atomic_to_page(void *ptr);
72 71
73#define flush_cache_kmaps() do { } while (0) 72#define flush_cache_kmaps() do { } while (0)
74 73
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index ccffa53750a8..39bcefc20de7 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,6 +60,7 @@ struct legacy_pic {
60 void (*mask_all)(void); 60 void (*mask_all)(void);
61 void (*restore_mask)(void); 61 void (*restore_mask)(void);
62 void (*init)(int auto_eoi); 62 void (*init)(int auto_eoi);
63 int (*probe)(void);
63 int (*irq_pending)(unsigned int irq); 64 int (*irq_pending)(unsigned int irq);
64 void (*make_irq)(unsigned int irq); 65 void (*make_irq)(unsigned int irq);
65}; 66};
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 046c7fb1ca43..a210eba2727c 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@ enum irq_remap_cap {
33 IRQ_POSTING_CAP = 0, 33 IRQ_POSTING_CAP = 0,
34}; 34};
35 35
36struct vcpu_data {
37 u64 pi_desc_addr; /* Physical address of PI Descriptor */
38 u32 vector; /* Guest vector of the interrupt */
39};
40
36#ifdef CONFIG_IRQ_REMAP 41#ifdef CONFIG_IRQ_REMAP
37 42
38extern bool irq_remapping_cap(enum irq_remap_cap cap); 43extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
58 return x86_vector_domain; 63 return x86_vector_domain;
59} 64}
60 65
61struct vcpu_data {
62 u64 pi_desc_addr; /* Physical address of PI Descriptor */
63 u32 vector; /* Guest vector of the interrupt */
64};
65
66#else /* CONFIG_IRQ_REMAP */ 66#else /* CONFIG_IRQ_REMAP */
67 67
68static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } 68static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e16466ec473c..e9cd7befcb76 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -112,6 +112,16 @@ struct x86_emulate_ops {
112 struct x86_exception *fault); 112 struct x86_exception *fault);
113 113
114 /* 114 /*
115 * read_phys: Read bytes of standard (non-emulated/special) memory.
116 * Used for descriptor reading.
117 * @addr: [IN ] Physical address from which to read.
118 * @val: [OUT] Value read from memory.
119 * @bytes: [IN ] Number of bytes to read from memory.
120 */
121 int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
122 void *val, unsigned int bytes);
123
124 /*
115 * write_std: Write bytes of standard (non-emulated/special) memory. 125 * write_std: Write bytes of standard (non-emulated/special) memory.
116 * Used for descriptor writing. 126 * Used for descriptor writing.
117 * @addr: [IN ] Linear address to which to write. 127 * @addr: [IN ] Linear address to which to write.
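Editor's note: the new read_phys op above only documents the contract (copy @bytes from a guest-physical @addr into @val, for descriptor reads). Purely as an illustration, and not the implementation this series adds to x86.c, a backend could satisfy it along these lines, assuming the usual emul_to_vcpu() and kvm_read_guest() helpers are available:

/* Sketch only: one possible read_phys backend. */
static int demo_read_phys(struct x86_emulate_ctxt *ctxt, unsigned long addr,
			  void *val, unsigned int bytes)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	/* copy from guest physical memory; fail the emulation otherwise */
	if (kvm_read_guest(vcpu->kvm, addr, val, bytes))
		return X86EMUL_UNHANDLEABLE;
	return X86EMUL_CONTINUE;
}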
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a36ee704c30..30cfd64295a0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -24,6 +24,7 @@
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/pvclock_gtod.h> 25#include <linux/pvclock_gtod.h>
26#include <linux/clocksource.h> 26#include <linux/clocksource.h>
27#include <linux/irqbypass.h>
27 28
28#include <asm/pvclock-abi.h> 29#include <asm/pvclock-abi.h>
29#include <asm/desc.h> 30#include <asm/desc.h>
@@ -176,6 +177,8 @@ enum {
176 */ 177 */
177#define KVM_APIC_PV_EOI_PENDING 1 178#define KVM_APIC_PV_EOI_PENDING 1
178 179
180struct kvm_kernel_irq_routing_entry;
181
179/* 182/*
180 * We don't want allocation failures within the mmu code, so we preallocate 183 * We don't want allocation failures within the mmu code, so we preallocate
181 * enough memory for a single page fault in a cache. 184 * enough memory for a single page fault in a cache.
@@ -374,6 +377,7 @@ struct kvm_mtrr {
374/* Hyper-V per vcpu emulation context */ 377/* Hyper-V per vcpu emulation context */
375struct kvm_vcpu_hv { 378struct kvm_vcpu_hv {
376 u64 hv_vapic; 379 u64 hv_vapic;
380 s64 runtime_offset;
377}; 381};
378 382
379struct kvm_vcpu_arch { 383struct kvm_vcpu_arch {
@@ -396,6 +400,7 @@ struct kvm_vcpu_arch {
396 u64 efer; 400 u64 efer;
397 u64 apic_base; 401 u64 apic_base;
398 struct kvm_lapic *apic; /* kernel irqchip context */ 402 struct kvm_lapic *apic; /* kernel irqchip context */
403 u64 eoi_exit_bitmap[4];
399 unsigned long apic_attention; 404 unsigned long apic_attention;
400 int32_t apic_arb_prio; 405 int32_t apic_arb_prio;
401 int mp_state; 406 int mp_state;
@@ -500,6 +505,7 @@ struct kvm_vcpu_arch {
500 u32 virtual_tsc_mult; 505 u32 virtual_tsc_mult;
501 u32 virtual_tsc_khz; 506 u32 virtual_tsc_khz;
502 s64 ia32_tsc_adjust_msr; 507 s64 ia32_tsc_adjust_msr;
508 u64 tsc_scaling_ratio;
503 509
504 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 510 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
505 unsigned nmi_pending; /* NMI queued after currently running handler */ 511 unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -573,6 +579,9 @@ struct kvm_vcpu_arch {
573 struct { 579 struct {
574 bool pv_unhalted; 580 bool pv_unhalted;
575 } pv; 581 } pv;
582
583 int pending_ioapic_eoi;
584 int pending_external_vector;
576}; 585};
577 586
578struct kvm_lpage_info { 587struct kvm_lpage_info {
@@ -683,6 +692,9 @@ struct kvm_arch {
683 u32 bsp_vcpu_id; 692 u32 bsp_vcpu_id;
684 693
685 u64 disabled_quirks; 694 u64 disabled_quirks;
695
696 bool irqchip_split;
697 u8 nr_reserved_ioapic_pins;
686}; 698};
687 699
688struct kvm_vm_stat { 700struct kvm_vm_stat {
@@ -766,7 +778,7 @@ struct kvm_x86_ops {
766 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 778 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
767 void (*vcpu_put)(struct kvm_vcpu *vcpu); 779 void (*vcpu_put)(struct kvm_vcpu *vcpu);
768 780
769 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); 781 void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
770 int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); 782 int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
771 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); 783 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
772 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 784 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -819,10 +831,10 @@ struct kvm_x86_ops {
819 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 831 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
820 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 832 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
821 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 833 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
822 int (*vm_has_apicv)(struct kvm *kvm); 834 int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
823 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 835 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
824 void (*hwapic_isr_update)(struct kvm *kvm, int isr); 836 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
825 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 837 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
826 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 838 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
827 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); 839 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
828 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 840 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -833,7 +845,7 @@ struct kvm_x86_ops {
833 int (*get_lpage_level)(void); 845 int (*get_lpage_level)(void);
834 bool (*rdtscp_supported)(void); 846 bool (*rdtscp_supported)(void);
835 bool (*invpcid_supported)(void); 847 bool (*invpcid_supported)(void);
836 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); 848 void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment);
837 849
838 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 850 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
839 851
@@ -841,11 +853,9 @@ struct kvm_x86_ops {
841 853
842 bool (*has_wbinvd_exit)(void); 854 bool (*has_wbinvd_exit)(void);
843 855
844 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
845 u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); 856 u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
846 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 857 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
847 858
848 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
849 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); 859 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
850 860
851 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 861 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
@@ -887,6 +897,20 @@ struct kvm_x86_ops {
887 gfn_t offset, unsigned long mask); 897 gfn_t offset, unsigned long mask);
888 /* pmu operations of sub-arch */ 898 /* pmu operations of sub-arch */
889 const struct kvm_pmu_ops *pmu_ops; 899 const struct kvm_pmu_ops *pmu_ops;
900
901 /*
902 * Architecture specific hooks for vCPU blocking due to
903 * HLT instruction.
904 * Returns for .pre_block():
905 * - 0 means continue to block the vCPU.
 906 * - 1 means we cannot block the vCPU since some event
 907 * happened during this period, for example the 'ON' bit in
 908 * the posted-interrupts descriptor was set.
909 */
910 int (*pre_block)(struct kvm_vcpu *vcpu);
911 void (*post_block)(struct kvm_vcpu *vcpu);
912 int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
913 uint32_t guest_irq, bool set);
890}; 914};
891 915
892struct kvm_arch_async_pf { 916struct kvm_arch_async_pf {
@@ -898,17 +922,6 @@ struct kvm_arch_async_pf {
898 922
899extern struct kvm_x86_ops *kvm_x86_ops; 923extern struct kvm_x86_ops *kvm_x86_ops;
900 924
901static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
902 s64 adjustment)
903{
904 kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
905}
906
907static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
908{
909 kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
910}
911
912int kvm_mmu_module_init(void); 925int kvm_mmu_module_init(void);
913void kvm_mmu_module_exit(void); 926void kvm_mmu_module_exit(void);
914 927
@@ -961,10 +974,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
961 974
962/* control of guest tsc rate supported? */ 975/* control of guest tsc rate supported? */
963extern bool kvm_has_tsc_control; 976extern bool kvm_has_tsc_control;
964/* minimum supported tsc_khz for guests */
965extern u32 kvm_min_guest_tsc_khz;
966/* maximum supported tsc_khz for guests */ 977/* maximum supported tsc_khz for guests */
967extern u32 kvm_max_guest_tsc_khz; 978extern u32 kvm_max_guest_tsc_khz;
979/* number of bits of the fractional part of the TSC scaling ratio */
980extern u8 kvm_tsc_scaling_ratio_frac_bits;
981/* maximum allowed value of TSC scaling ratio */
982extern u64 kvm_max_tsc_scaling_ratio;
968 983
969enum emulation_result { 984enum emulation_result {
970 EMULATE_DONE, /* no further processing */ 985 EMULATE_DONE, /* no further processing */
@@ -1210,6 +1225,9 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
1210void kvm_define_shared_msr(unsigned index, u32 msr); 1225void kvm_define_shared_msr(unsigned index, u32 msr);
1211int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 1226int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
1212 1227
1228u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
1229u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
1230
1213unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); 1231unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
1214bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 1232bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
1215 1233
@@ -1231,4 +1249,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
1231bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); 1249bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
1232bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); 1250bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
1233 1251
1252bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
1253 struct kvm_vcpu **dest_vcpu);
1254
1255void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
1256 struct kvm_lapic_irq *irq);
1257
1258static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
1259static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
1260
1234#endif /* _ASM_X86_KVM_HOST_H */ 1261#endif /* _ASM_X86_KVM_HOST_H */
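Editor's note: the new tsc_scaling_ratio field, kvm_tsc_scaling_ratio_frac_bits and kvm_scale_tsc() declarations above describe a fixed-point multiplier applied to the host TSC. A standalone sketch of that arithmetic (the 48 fractional bits are an assumption taken from the VMX TSC_MULTIPLIER definition, not from this header; unsigned __int128 is a GCC extension):

/* Standalone model of the multiply-then-shift behind TSC scaling. */
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_tsc(uint64_t tsc, uint64_t ratio, unsigned frac_bits)
{
	/* 64x64 -> 128-bit multiply, then drop the fractional bits */
	return (uint64_t)(((unsigned __int128)tsc * ratio) >> frac_bits);
}

int main(void)
{
	unsigned frac_bits = 48;
	/* a guest running at half the host TSC frequency: ratio = 0.5 */
	uint64_t ratio = 1ull << (frac_bits - 1);

	printf("%llu\n",
	       (unsigned long long)scale_tsc(1000000, ratio, frac_bits));
	/* prints 500000 */
	return 0;
}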
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b8c14bb7fc8f..690b4027e17c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -35,7 +35,7 @@
35#define MSR_IA32_PERFCTR0 0x000000c1 35#define MSR_IA32_PERFCTR0 0x000000c1
36#define MSR_IA32_PERFCTR1 0x000000c2 36#define MSR_IA32_PERFCTR1 0x000000c2
37#define MSR_FSB_FREQ 0x000000cd 37#define MSR_FSB_FREQ 0x000000cd
38#define MSR_NHM_PLATFORM_INFO 0x000000ce 38#define MSR_PLATFORM_INFO 0x000000ce
39 39
40#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 40#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
41#define NHM_C3_AUTO_DEMOTE (1UL << 25) 41#define NHM_C3_AUTO_DEMOTE (1UL << 25)
@@ -44,7 +44,6 @@
44#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) 44#define SNB_C1_AUTO_UNDEMOTE (1UL << 27)
45#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) 45#define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
46 46
47#define MSR_PLATFORM_INFO 0x000000ce
48#define MSR_MTRRcap 0x000000fe 47#define MSR_MTRRcap 0x000000fe
49#define MSR_IA32_BBL_CR_CTL 0x00000119 48#define MSR_IA32_BBL_CR_CTL 0x00000119
50#define MSR_IA32_BBL_CR_CTL3 0x0000011e 49#define MSR_IA32_BBL_CR_CTL3 0x0000011e
@@ -206,6 +205,13 @@
206#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 205#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
207#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 206#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
208 207
208/* Config TDP MSRs */
209#define MSR_CONFIG_TDP_NOMINAL 0x00000648
210#define MSR_CONFIG_TDP_LEVEL1 0x00000649
211#define MSR_CONFIG_TDP_LEVEL2 0x0000064A
212#define MSR_CONFIG_TDP_CONTROL 0x0000064B
213#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
214
209/* Hardware P state interface */ 215/* Hardware P state interface */
210#define MSR_PPERF 0x0000064e 216#define MSR_PPERF 0x0000064e
211#define MSR_PERF_LIMIT_REASONS 0x0000064f 217#define MSR_PERF_LIMIT_REASONS 0x0000064f
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index c5b7fb2774d0..cc071c6f7d4d 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -9,19 +9,21 @@
9#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) 9#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
10#define PAGE_MASK (~(PAGE_SIZE-1)) 10#define PAGE_MASK (~(PAGE_SIZE-1))
11 11
12#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
13#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
14
15#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
16#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
17
12#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) 18#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
13#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 19#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
14 20
15/* Cast PAGE_MASK to a signed type so that it is sign-extended if 21/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
16 virtual addresses are 32-bits but physical addresses are larger 22 virtual addresses are 32-bits but physical addresses are larger
17 (ie, 32-bit PAE). */ 23 (ie, 32-bit PAE). */
18#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) 24#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
19 25#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
20#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) 26#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
21#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
22
23#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
24#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25 27
26#define HPAGE_SHIFT PMD_SHIFT 28#define HPAGE_SHIFT PMD_SHIFT
27#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) 29#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
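Editor's note: the comment changed above ("Cast *PAGE_MASK to a signed type so that it is sign-extended...") is the whole point of the new PHYSICAL_PMD/PUD_PAGE_MASK definitions. A standalone demonstration of the effect for the 32-bit PAE case (the types and the 52-bit physical mask are illustrative, modelled with explicit fixed-width integers):

/* Why the mask is cast to signed before ANDing with the physical mask. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t page_mask = ~(uint32_t)(4096 - 1);	/* 0xfffff000        */
	uint64_t phys_mask = (1ull << 52) - 1;		/* PAE-style mask    */

	/* Without the signed cast the high physical bits are lost: */
	uint64_t wrong = (uint64_t)page_mask & phys_mask;
	/* Casting to a signed type first sign-extends the mask to 64 bits: */
	uint64_t right = (uint64_t)(int64_t)(int32_t)page_mask & phys_mask;

	printf("wrong: %#" PRIx64 "\nright: %#" PRIx64 "\n", wrong, right);
	/* wrong: 0xfffff000   right: 0xffffffffff000 */
	return 0;
}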
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e99cbe814ea8..d3eee663c41f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -322,6 +322,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
322 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); 322 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
323} 323}
324 324
325static inline pte_t pte_clear_soft_dirty(pte_t pte)
326{
327 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
328}
329
330static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
331{
332 return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
333}
334
325#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ 335#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
326 336
327/* 337/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index dd5b0aa9dd2f..a471cadb9630 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -279,17 +279,14 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
279static inline pudval_t pud_pfn_mask(pud_t pud) 279static inline pudval_t pud_pfn_mask(pud_t pud)
280{ 280{
281 if (native_pud_val(pud) & _PAGE_PSE) 281 if (native_pud_val(pud) & _PAGE_PSE)
282 return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK; 282 return PHYSICAL_PUD_PAGE_MASK;
283 else 283 else
284 return PTE_PFN_MASK; 284 return PTE_PFN_MASK;
285} 285}
286 286
287static inline pudval_t pud_flags_mask(pud_t pud) 287static inline pudval_t pud_flags_mask(pud_t pud)
288{ 288{
289 if (native_pud_val(pud) & _PAGE_PSE) 289 return ~pud_pfn_mask(pud);
290 return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK);
291 else
292 return ~PTE_PFN_MASK;
293} 290}
294 291
295static inline pudval_t pud_flags(pud_t pud) 292static inline pudval_t pud_flags(pud_t pud)
@@ -300,17 +297,14 @@ static inline pudval_t pud_flags(pud_t pud)
300static inline pmdval_t pmd_pfn_mask(pmd_t pmd) 297static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
301{ 298{
302 if (native_pmd_val(pmd) & _PAGE_PSE) 299 if (native_pmd_val(pmd) & _PAGE_PSE)
303 return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK; 300 return PHYSICAL_PMD_PAGE_MASK;
304 else 301 else
305 return PTE_PFN_MASK; 302 return PTE_PFN_MASK;
306} 303}
307 304
308static inline pmdval_t pmd_flags_mask(pmd_t pmd) 305static inline pmdval_t pmd_flags_mask(pmd_t pmd)
309{ 306{
310 if (native_pmd_val(pmd) & _PAGE_PSE) 307 return ~pmd_pfn_mask(pmd);
311 return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK);
312 else
313 return ~PTE_PFN_MASK;
314} 308}
315 309
316static inline pmdval_t pmd_flags(pmd_t pmd) 310static inline pmdval_t pmd_flags(pmd_t pmd)
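Editor's note: the simplification above relies on every bit of a pmd/pud entry being either an address bit or a flag bit, so the flags mask is just the complement of the pfn mask. A standalone sketch of that split for a 2 MiB PSE entry (all constants here are illustrative, not taken from the headers):

/* Complementary pfn/flags masks, mirroring the simplified helpers. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_PSE	(1ull << 7)
#define PMD_PAGE_MASK	(~((1ull << 21) - 1))		/* 2 MiB aligned    */
#define PHYS_MASK	((1ull << 46) - 1)		/* sample MAXPHYADDR */
#define PTE_PFN_MASK	(PHYS_MASK & ~((1ull << 12) - 1))

static uint64_t pmd_pfn_mask(uint64_t pmd)
{
	return (pmd & PAGE_PSE) ? (PMD_PAGE_MASK & PHYS_MASK) : PTE_PFN_MASK;
}

static uint64_t pmd_flags_mask(uint64_t pmd)
{
	return ~pmd_pfn_mask(pmd);	/* the whole simplification */
}

int main(void)
{
	uint64_t pmd = 0x40000000ull | PAGE_PSE | 0x63;	/* addr | PSE | flags */

	printf("addr  %#" PRIx64 "\n", pmd & pmd_pfn_mask(pmd));
	printf("flags %#" PRIx64 "\n", pmd & pmd_flags_mask(pmd));
	return 0;
}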
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca61aee..14c63c7e8337 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,8 @@
72#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 72#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
73#define SECONDARY_EXEC_ENABLE_PML 0x00020000 73#define SECONDARY_EXEC_ENABLE_PML 0x00020000
74#define SECONDARY_EXEC_XSAVES 0x00100000 74#define SECONDARY_EXEC_XSAVES 0x00100000
75 75#define SECONDARY_EXEC_PCOMMIT 0x00200000
76#define SECONDARY_EXEC_TSC_SCALING 0x02000000
76 77
77#define PIN_BASED_EXT_INTR_MASK 0x00000001 78#define PIN_BASED_EXT_INTR_MASK 0x00000001
78#define PIN_BASED_NMI_EXITING 0x00000008 79#define PIN_BASED_NMI_EXITING 0x00000008
@@ -167,6 +168,8 @@ enum vmcs_field {
167 VMWRITE_BITMAP = 0x00002028, 168 VMWRITE_BITMAP = 0x00002028,
168 XSS_EXIT_BITMAP = 0x0000202C, 169 XSS_EXIT_BITMAP = 0x0000202C,
169 XSS_EXIT_BITMAP_HIGH = 0x0000202D, 170 XSS_EXIT_BITMAP_HIGH = 0x0000202D,
171 TSC_MULTIPLIER = 0x00002032,
172 TSC_MULTIPLIER_HIGH = 0x00002033,
170 GUEST_PHYSICAL_ADDRESS = 0x00002400, 173 GUEST_PHYSICAL_ADDRESS = 0x00002400,
171 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 174 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
172 VMCS_LINK_POINTER = 0x00002800, 175 VMCS_LINK_POINTER = 0x00002800,
@@ -416,6 +419,7 @@ enum vmcs_field {
416#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 419#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
417#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 420#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
418 421
422#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
419#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ 423#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
420#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ 424#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
421 425
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 10002a46c593..1ae89a2721d6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_PLATFORM_H 1#ifndef _ASM_X86_PLATFORM_H
2#define _ASM_X86_PLATFORM_H 2#define _ASM_X86_PLATFORM_H
3 3
4#include <asm/pgtable_types.h>
5#include <asm/bootparam.h> 4#include <asm/bootparam.h>
6 5
7struct mpc_bus; 6struct mpc_bus;
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d866959e5685..8b2d4bea9962 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void)
57} 57}
58#endif 58#endif
59 59
60#ifdef CONFIG_HOTPLUG_CPU
61void xen_arch_register_cpu(int num);
62void xen_arch_unregister_cpu(int num);
63#endif
64
60#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 65#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 0679e11d2cf7..f5fb840b43e8 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -12,7 +12,7 @@
12#include <asm/pgtable.h> 12#include <asm/pgtable.h>
13 13
14#include <xen/interface/xen.h> 14#include <xen/interface/xen.h>
15#include <xen/grant_table.h> 15#include <xen/interface/grant_table.h>
16#include <xen/features.h> 16#include <xen/features.h>
17 17
18/* Xen machine address */ 18/* Xen machine address */
@@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr;
43extern unsigned long xen_p2m_size; 43extern unsigned long xen_p2m_size;
44extern unsigned long xen_max_p2m_pfn; 44extern unsigned long xen_max_p2m_pfn;
45 45
46extern int xen_alloc_p2m_entry(unsigned long pfn);
47
46extern unsigned long get_phys_to_machine(unsigned long pfn); 48extern unsigned long get_phys_to_machine(unsigned long pfn);
47extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 49extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
48extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 50extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -296,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr);
296#define xen_unmap(cookie) iounmap((cookie)) 298#define xen_unmap(cookie) iounmap((cookie))
297 299
298static inline bool xen_arch_need_swiotlb(struct device *dev, 300static inline bool xen_arch_need_swiotlb(struct device *dev,
299 unsigned long pfn, 301 phys_addr_t phys,
300 unsigned long bfn) 302 dma_addr_t dev_addr)
301{ 303{
302 return false; 304 return false;
303} 305}
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index f0412c50c47b..040d4083c24f 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -153,6 +153,12 @@
153/* MSR used to provide vcpu index */ 153/* MSR used to provide vcpu index */
154#define HV_X64_MSR_VP_INDEX 0x40000002 154#define HV_X64_MSR_VP_INDEX 0x40000002
155 155
156/* MSR used to reset the guest OS. */
157#define HV_X64_MSR_RESET 0x40000003
158
159/* MSR used to provide vcpu runtime in 100ns units */
160#define HV_X64_MSR_VP_RUNTIME 0x40000010
161
156/* MSR used to read the per-partition time reference counter */ 162/* MSR used to read the per-partition time reference counter */
157#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 163#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
158 164
@@ -251,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
251 __s64 tsc_offset; 257 __s64 tsc_offset;
252} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; 258} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
253 259
260/* Define the number of synthetic interrupt sources. */
261#define HV_SYNIC_SINT_COUNT (16)
262/* Define the expected SynIC version. */
263#define HV_SYNIC_VERSION_1 (0x1)
264
265#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
266#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
267#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
268#define HV_SYNIC_SINT_MASKED (1ULL << 16)
269#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
270#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
271
254#endif 272#endif
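Editor's note: the new HV_SYNIC_SINT_* definitions above describe the layout of a synthetic-interrupt-source register (8-bit vector, masked bit, auto-EOI bit). A standalone sketch decoding one such value (the sample register value is made up):

/* Decoding a SynIC SINT value with the bit definitions added above. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HV_SYNIC_SINT_MASKED		(1ull << 16)
#define HV_SYNIC_SINT_AUTO_EOI		(1ull << 17)
#define HV_SYNIC_SINT_VECTOR_MASK	(0xffull)

int main(void)
{
	uint64_t sint = 0x00000000000200e3ull;	/* hypothetical value */

	printf("vector   %" PRIu64 "\n", sint & HV_SYNIC_SINT_VECTOR_MASK);
	printf("masked   %d\n", !!(sint & HV_SYNIC_SINT_MASKED));
	printf("auto-EOI %d\n", !!(sint & HV_SYNIC_SINT_AUTO_EOI));
	return 0;
}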
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index b5d7640abc5d..8a4add8e4639 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -100,6 +100,7 @@
100 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ 100 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
101 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ 101 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
102 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ 102 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
103 { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
103 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ 104 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
104 { SVM_EXIT_INTR, "interrupt" }, \ 105 { SVM_EXIT_INTR, "interrupt" }, \
105 { SVM_EXIT_NMI, "nmi" }, \ 106 { SVM_EXIT_NMI, "nmi" }, \
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee272618f..5b15d94a33f8 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,6 +78,7 @@
78#define EXIT_REASON_PML_FULL 62 78#define EXIT_REASON_PML_FULL 62
79#define EXIT_REASON_XSAVES 63 79#define EXIT_REASON_XSAVES 63
80#define EXIT_REASON_XRSTORS 64 80#define EXIT_REASON_XRSTORS 64
81#define EXIT_REASON_PCOMMIT 65
81 82
82#define VMX_EXIT_REASONS \ 83#define VMX_EXIT_REASONS \
83 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ 84 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
126 { EXIT_REASON_INVVPID, "INVVPID" }, \ 127 { EXIT_REASON_INVVPID, "INVVPID" }, \
127 { EXIT_REASON_INVPCID, "INVPCID" }, \ 128 { EXIT_REASON_INVPCID, "INVPCID" }, \
128 { EXIT_REASON_XSAVES, "XSAVES" }, \ 129 { EXIT_REASON_XSAVES, "XSAVES" }, \
129 { EXIT_REASON_XRSTORS, "XRSTORS" } 130 { EXIT_REASON_XRSTORS, "XRSTORS" }, \
131 { EXIT_REASON_PCOMMIT, "PCOMMIT" }
130 132
131#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 133#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
132#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 134#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ded848c20e05..e75907601a41 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
976{ 976{
977 int count; 977 int count;
978 int x2count = 0; 978 int x2count = 0;
979 int ret;
980 struct acpi_subtable_proc madt_proc[2];
979 981
980 if (!cpu_has_apic) 982 if (!cpu_has_apic)
981 return -ENODEV; 983 return -ENODEV;
@@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void)
999 acpi_parse_sapic, MAX_LOCAL_APIC); 1001 acpi_parse_sapic, MAX_LOCAL_APIC);
1000 1002
1001 if (!count) { 1003 if (!count) {
1002 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, 1004 memset(madt_proc, 0, sizeof(madt_proc));
1003 acpi_parse_x2apic, MAX_LOCAL_APIC); 1005 madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
1004 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, 1006 madt_proc[0].handler = acpi_parse_lapic;
1005 acpi_parse_lapic, MAX_LOCAL_APIC); 1007 madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
1008 madt_proc[1].handler = acpi_parse_x2apic;
1009 ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
1010 sizeof(struct acpi_table_madt),
1011 madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
1012 if (ret < 0) {
1013 printk(KERN_ERR PREFIX
1014 "Error parsing LAPIC/X2APIC entries\n");
1015 return ret;
1016 }
1017
1018 x2count = madt_proc[0].count;
1019 count = madt_proc[1].count;
1006 } 1020 }
1007 if (!count && !x2count) { 1021 if (!count && !x2count) {
1008 printk(KERN_ERR PREFIX "No LAPIC entries present\n"); 1022 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 836d11b92811..861bc59c8f25 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -361,7 +361,11 @@ int __init arch_probe_nr_irqs(void)
361 if (nr < nr_irqs) 361 if (nr < nr_irqs)
362 nr_irqs = nr; 362 nr_irqs = nr;
363 363
364 return nr_legacy_irqs(); 364 /*
 365 * We don't know whether a PIC is present at this point, so we need
 366 * to call probe() to get the right number of legacy IRQs.
367 */
368 return legacy_pic->probe();
365} 369}
366 370
367#ifdef CONFIG_X86_IO_APIC 371#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4a70fc6d400a..a8816b325162 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
352#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
353 unsigned bits; 353 unsigned bits;
354 int cpu = smp_processor_id(); 354 int cpu = smp_processor_id();
355 unsigned int socket_id, core_complex_id;
355 356
356 bits = c->x86_coreid_bits; 357 bits = c->x86_coreid_bits;
357 /* Low order bits define the core id (index of core in socket) */ 358 /* Low order bits define the core id (index of core in socket) */
@@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
361 /* use socket ID also for last level cache */ 362 /* use socket ID also for last level cache */
362 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 363 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
363 amd_get_topology(c); 364 amd_get_topology(c);
365
366 /*
367 * Fix percpu cpu_llc_id here as LLC topology is different
368 * for Fam17h systems.
369 */
370 if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
371 return;
372
373 socket_id = (c->apicid >> bits) - 1;
374 core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3;
375
376 per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id;
364#endif 377#endif
365} 378}
366 379
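Editor's note: the Fam17h hunk above derives the LLC id from the APIC id: the high bits (above x86_coreid_bits) select the socket, and dividing the low bits by 8 selects the core complex, since a CCX groups eight SMT threads behind one L3. A standalone walk-through with made-up sample values (the APIC id and bit count are not from the patch):

/* Worked example of the Fam17h cpu_llc_id computation. */
#include <stdio.h>

int main(void)
{
	unsigned int apicid = 0x2b;	/* hypothetical CPU, second socket */
	unsigned int bits   = 4;	/* hypothetical x86_coreid_bits    */

	unsigned int socket_id       = (apicid >> bits) - 1;
	/* low 3 bits pick the thread within a CCX, so >>3 picks the CCX */
	unsigned int core_complex_id = (apicid & ((1 << bits) - 1)) >> 3;
	unsigned int llc_id          = (socket_id << 3) | core_complex_id;

	printf("socket %u, ccx %u -> llc_id %u\n",
	       socket_id, core_complex_id, llc_id);
	return 0;
}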
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4ddd780aeac9..c2b7522cbf35 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap);
273 273
274static __always_inline void setup_smap(struct cpuinfo_x86 *c) 274static __always_inline void setup_smap(struct cpuinfo_x86 *c)
275{ 275{
276 unsigned long eflags; 276 unsigned long eflags = native_save_fl();
277 277
278 /* This should have been cleared long ago */ 278 /* This should have been cleared long ago */
279 raw_local_save_flags(eflags);
280 BUG_ON(eflags & X86_EFLAGS_AC); 279 BUG_ON(eflags & X86_EFLAGS_AC);
281 280
282 if (cpu_has(c, X86_FEATURE_SMAP)) { 281 if (cpu_has(c, X86_FEATURE_SMAP)) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 98a13db5f4be..209ac1e7d1f0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
97 switch (c->x86_model) { 97 switch (c->x86_model) {
98 case 0x27: /* Penwell */ 98 case 0x27: /* Penwell */
99 case 0x35: /* Cloverview */ 99 case 0x35: /* Cloverview */
100 case 0x4a: /* Merrifield */
100 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); 101 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
101 break; 102 break;
102 default: 103 default:
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 7fc27f1cca58..b3e94ef461fd 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -698,3 +698,4 @@ int __init microcode_init(void)
698 return error; 698 return error;
699 699
700} 700}
701late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4562cf070c27..2bf79d7c97df 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput 6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian 10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
11 * 11 *
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 499f533dd3cc..d0e35ebb2adb 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -5,7 +5,7 @@
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput 6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian 10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
11 * 11 *
@@ -387,7 +387,7 @@ struct cpu_hw_events {
387/* Check flags and event code/umask, and set the HSW N/A flag */ 387/* Check flags and event code/umask, and set the HSW N/A flag */
388#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ 388#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
389 __EVENT_CONSTRAINT(code, n, \ 389 __EVENT_CONSTRAINT(code, n, \
390 INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ 390 INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
391 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) 391 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
392 392
393 393
@@ -627,6 +627,7 @@ struct x86_perf_task_context {
627 u64 lbr_from[MAX_LBR_ENTRIES]; 627 u64 lbr_from[MAX_LBR_ENTRIES];
628 u64 lbr_to[MAX_LBR_ENTRIES]; 628 u64 lbr_to[MAX_LBR_ENTRIES];
629 u64 lbr_info[MAX_LBR_ENTRIES]; 629 u64 lbr_info[MAX_LBR_ENTRIES];
630 int tos;
630 int lbr_callstack_users; 631 int lbr_callstack_users;
631 int lbr_stack_state; 632 int lbr_stack_state;
632}; 633};
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f63360be2238..e2a430021e46 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -232,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = {
232 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 232 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
233 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 233 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
234 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ 234 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
235 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ 235 INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
236 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 236 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
237 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 237 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
238 /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ 238 /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8ed391..a316ca96f1b6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
298static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) 298static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
299{ 299{
300 if (event->attach_state & PERF_ATTACH_TASK) 300 if (event->attach_state & PERF_ATTACH_TASK)
301 return perf_cgroup_from_task(event->hw.target); 301 return perf_cgroup_from_task(event->hw.target, event->ctx);
302 302
303 return event->cgrp; 303 return event->cgrp;
304} 304}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index bfd0b717e944..659f01e165d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -239,7 +239,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
239 } 239 }
240 240
241 mask = x86_pmu.lbr_nr - 1; 241 mask = x86_pmu.lbr_nr - 1;
242 tos = intel_pmu_lbr_tos(); 242 tos = task_ctx->tos;
243 for (i = 0; i < tos; i++) { 243 for (i = 0; i < tos; i++) {
244 lbr_idx = (tos - i) & mask; 244 lbr_idx = (tos - i) & mask;
245 wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); 245 wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
@@ -247,6 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
247 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 247 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
248 wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); 248 wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
249 } 249 }
250 wrmsrl(x86_pmu.lbr_tos, tos);
250 task_ctx->lbr_stack_state = LBR_NONE; 251 task_ctx->lbr_stack_state = LBR_NONE;
251} 252}
252 253
@@ -270,6 +271,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
270 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 271 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
271 rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); 272 rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
272 } 273 }
274 task_ctx->tos = tos;
273 task_ctx->lbr_stack_state = LBR_VALID; 275 task_ctx->lbr_stack_state = LBR_VALID;
274} 276}
275 277
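Editor's note: the LBR hunks above save the top-of-stack pointer into task_ctx->tos and restore both the entries and the TOS MSR, walking the ring newest-first with lbr_idx = (tos - i) & mask. A standalone sketch of that ring walk (the stack depth and saved TOS are illustrative):

/* The LBR ring-buffer index walk used by the save/restore paths. */
#include <stdio.h>

int main(void)
{
	unsigned int lbr_nr = 16;		/* number of LBR MSR pairs */
	unsigned int mask   = lbr_nr - 1;
	unsigned int tos    = 3;		/* saved task_ctx->tos     */
	unsigned int i;

	for (i = 0; i < tos; i++) {
		unsigned int lbr_idx = (tos - i) & mask;
		printf("slot %u -> MSR index %u\n", i, lbr_idx);
	}
	/* restoring also rewrites the TOS MSR so new records continue
	 * from the same position (the wrmsrl(x86_pmu.lbr_tos, tos) above) */
	return 0;
}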
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 81431c0f0614..ed446bdcbf31 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -107,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
107static struct kobj_attribute format_attr_##_var = \ 107static struct kobj_attribute format_attr_##_var = \
108 __ATTR(_name, 0444, __rapl_##_var##_show, NULL) 108 __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
109 109
110#define RAPL_EVENT_DESC(_name, _config) \
111{ \
112 .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
113 .config = _config, \
114}
115
116#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ 110#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
117 111
118#define RAPL_EVENT_ATTR_STR(_name, v, str) \ 112#define RAPL_EVENT_ATTR_STR(_name, v, str) \
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index f32ac13934f2..ec863b9a9f78 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -163,10 +163,9 @@ again:
163 goto again; 163 goto again;
164 164
165 delta = now - prev; 165 delta = now - prev;
166 if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { 166 if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
167 delta <<= 32; 167 delta = sign_extend64(delta, 31);
168 delta >>= 32; /* sign extend */ 168
169 }
170 local64_add(now - prev, &event->count); 169 local64_add(now - prev, &event->count);
171} 170}
172 171
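Editor's note: the change above replaces the open-coded "<<32; >>32" with sign_extend64(delta, 31) because the SMI count MSR is only 32 bits wide, so the raw 64-bit subtraction must be folded back into a small delta when the counter wraps. A standalone model of that helper with sample values (the counter readings are made up):

/* Model of sign_extend64(value, 31) applied to a wrapped 32-bit counter. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static int64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;
	return (int64_t)(value << shift) >> shift;	/* same idea as <<32 >>32 */
}

int main(void)
{
	uint64_t prev  = 0xfffffffe;	/* reading just before the 32-bit wrap */
	uint64_t now   = 0x00000001;	/* raw MSR value read later            */
	uint64_t delta = now - prev;	/* 0xffffffff00000003 without the fix  */

	printf("%" PRId64 "\n", sign_extend64(delta, 31));	/* prints 3 */
	return 0;
}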
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index ef29b742cea7..31c6a60505e6 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
385 */ 385 */
386void fpu__init_prepare_fx_sw_frame(void) 386void fpu__init_prepare_fx_sw_frame(void)
387{ 387{
388 int fsave_header_size = sizeof(struct fregs_state);
389 int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; 388 int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
390 389
391 if (config_enabled(CONFIG_X86_32))
392 size += fsave_header_size;
393
394 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; 390 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
395 fx_sw_reserved.extended_size = size; 391 fx_sw_reserved.extended_size = size;
396 fx_sw_reserved.xfeatures = xfeatures_mask; 392 fx_sw_reserved.xfeatures = xfeatures_mask;
397 fx_sw_reserved.xstate_size = xstate_size; 393 fx_sw_reserved.xstate_size = xstate_size;
398 394
399 if (config_enabled(CONFIG_IA32_EMULATION)) { 395 if (config_enabled(CONFIG_IA32_EMULATION) ||
396 config_enabled(CONFIG_X86_32)) {
397 int fsave_header_size = sizeof(struct fregs_state);
398
400 fx_sw_reserved_ia32 = fx_sw_reserved; 399 fx_sw_reserved_ia32 = fx_sw_reserved;
401 fx_sw_reserved_ia32.extended_size += fsave_header_size; 400 fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
402 } 401 }
403} 402}
404 403
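Editor's note: the hunk above sizes the extended signal frame as xstate_size plus the trailing magic word, and adds the legacy fsave header only for the 32-bit and IA32-compat frames. A small arithmetic sketch of that sizing (the byte counts are illustrative stand-ins, not the real xstate layout):

/* Illustrative signal-frame sizing, mirroring fpu__init_prepare_fx_sw_frame. */
#include <stdio.h>

int main(void)
{
	unsigned int xstate_size       = 832;	/* hypothetical xstate size   */
	unsigned int magic2_size       = 4;	/* FP_XSTATE_MAGIC2 trailer   */
	unsigned int fsave_header_size = 112;	/* legacy i387 fregs area     */

	unsigned int size = xstate_size + magic2_size;

	printf("64-bit frame: %u bytes\n", size);
	/* 32-bit and IA32-compat frames prepend the legacy fsave header */
	printf("ia32 frame:   %u bytes\n", size + fsave_header_size);
	return 0;
}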
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6454f2731b56..70fc312221fc 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -694,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
694 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 694 if (!boot_cpu_has(X86_FEATURE_XSAVE))
695 return NULL; 695 return NULL;
696 696
697 xsave = &current->thread.fpu.state.xsave;
698 /* 697 /*
699 * We should not ever be requesting features that we 698 * We should not ever be requesting features that we
700 * have not enabled. Remember that pcntxt_mask is 699 * have not enabled. Remember that pcntxt_mask is
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8b7b0a51e742..311bcf338f07 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -556,6 +556,7 @@ void ftrace_replace_code(int enable)
556 run_sync(); 556 run_sync();
557 557
558 report = "updating code"; 558 report = "updating code";
559 count = 0;
559 560
560 for_ftrace_rec_iter(iter) { 561 for_ftrace_rec_iter(iter) {
561 rec = ftrace_rec_iter_record(iter); 562 rec = ftrace_rec_iter_record(iter);
@@ -563,11 +564,13 @@ void ftrace_replace_code(int enable)
563 ret = add_update(rec, enable); 564 ret = add_update(rec, enable);
564 if (ret) 565 if (ret)
565 goto remove_breakpoints; 566 goto remove_breakpoints;
567 count++;
566 } 568 }
567 569
568 run_sync(); 570 run_sync();
569 571
570 report = "removing breakpoints"; 572 report = "removing breakpoints";
573 count = 0;
571 574
572 for_ftrace_rec_iter(iter) { 575 for_ftrace_rec_iter(iter) {
573 rec = ftrace_rec_iter_record(iter); 576 rec = ftrace_rec_iter_record(iter);
@@ -575,6 +578,7 @@ void ftrace_replace_code(int enable)
575 ret = finish_update(rec, enable); 578 ret = finish_update(rec, enable);
576 if (ret) 579 if (ret)
577 goto remove_breakpoints; 580 goto remove_breakpoints;
581 count++;
578 } 582 }
579 583
580 run_sync(); 584 run_sync();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8a73f2..ffdc0e860390 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -65,6 +65,9 @@ startup_64:
65 * tables and then reload them. 65 * tables and then reload them.
66 */ 66 */
67 67
68 /* Sanitize CPU configuration */
69 call verify_cpu
70
68 /* 71 /*
69 * Compute the delta between the address I am compiled to run at and the 72 * Compute the delta between the address I am compiled to run at and the
70 * address I am actually running at. 73 * address I am actually running at.
@@ -174,6 +177,9 @@ ENTRY(secondary_startup_64)
174 * after the boot processor executes this code. 177 * after the boot processor executes this code.
175 */ 178 */
176 179
180 /* Sanitize CPU configuration */
181 call verify_cpu
182
177 movq $(init_level4_pgt - __START_KERNEL_map), %rax 183 movq $(init_level4_pgt - __START_KERNEL_map), %rax
1781: 1841:
179 185
@@ -288,6 +294,8 @@ ENTRY(secondary_startup_64)
288 pushq %rax # target address in negative space 294 pushq %rax # target address in negative space
289 lretq 295 lretq
290 296
297#include "verify_cpu.S"
298
291#ifdef CONFIG_HOTPLUG_CPU 299#ifdef CONFIG_HOTPLUG_CPU
292/* 300/*
293 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set 301 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 16cb827a5b27..be22f5a2192e 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -295,16 +295,11 @@ static void unmask_8259A(void)
295 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 295 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
296} 296}
297 297
298static void init_8259A(int auto_eoi) 298static int probe_8259A(void)
299{ 299{
300 unsigned long flags; 300 unsigned long flags;
301 unsigned char probe_val = ~(1 << PIC_CASCADE_IR); 301 unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
302 unsigned char new_val; 302 unsigned char new_val;
303
304 i8259A_auto_eoi = auto_eoi;
305
306 raw_spin_lock_irqsave(&i8259A_lock, flags);
307
308 /* 303 /*
309 * Check to see if we have a PIC. 304 * Check to see if we have a PIC.
310 * Mask all except the cascade and read 305 * Mask all except the cascade and read
@@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi)
312 * have a PIC, we will read 0xff as opposed to the 307 * have a PIC, we will read 0xff as opposed to the
313 * value we wrote. 308 * value we wrote.
314 */ 309 */
310 raw_spin_lock_irqsave(&i8259A_lock, flags);
311
315 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 312 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
316 outb(probe_val, PIC_MASTER_IMR); 313 outb(probe_val, PIC_MASTER_IMR);
317 new_val = inb(PIC_MASTER_IMR); 314 new_val = inb(PIC_MASTER_IMR);
318 if (new_val != probe_val) { 315 if (new_val != probe_val) {
319 printk(KERN_INFO "Using NULL legacy PIC\n"); 316 printk(KERN_INFO "Using NULL legacy PIC\n");
320 legacy_pic = &null_legacy_pic; 317 legacy_pic = &null_legacy_pic;
321 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
322 return;
323 } 318 }
324 319
320 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
321 return nr_legacy_irqs();
322}
323
324static void init_8259A(int auto_eoi)
325{
326 unsigned long flags;
327
328 i8259A_auto_eoi = auto_eoi;
329
330 raw_spin_lock_irqsave(&i8259A_lock, flags);
331
325 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 332 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
326 333
327 /* 334 /*
@@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
379{ 386{
380 return 0; 387 return 0;
381} 388}
389static int legacy_pic_probe(void)
390{
391 return 0;
392}
382 393
383struct legacy_pic null_legacy_pic = { 394struct legacy_pic null_legacy_pic = {
384 .nr_legacy_irqs = 0, 395 .nr_legacy_irqs = 0,
@@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = {
388 .mask_all = legacy_pic_noop, 399 .mask_all = legacy_pic_noop,
389 .restore_mask = legacy_pic_noop, 400 .restore_mask = legacy_pic_noop,
390 .init = legacy_pic_int_noop, 401 .init = legacy_pic_int_noop,
402 .probe = legacy_pic_probe,
391 .irq_pending = legacy_pic_irq_pending_noop, 403 .irq_pending = legacy_pic_irq_pending_noop,
392 .make_irq = legacy_pic_uint_noop, 404 .make_irq = legacy_pic_uint_noop,
393}; 405};
@@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = {
400 .mask_all = mask_8259A, 412 .mask_all = mask_8259A,
401 .restore_mask = unmask_8259A, 413 .restore_mask = unmask_8259A,
402 .init = init_8259A, 414 .init = init_8259A,
415 .probe = probe_8259A,
403 .irq_pending = i8259A_irq_pending, 416 .irq_pending = i8259A_irq_pending,
404 .make_irq = make_8259A_irq, 417 .make_irq = make_8259A_irq,
405}; 418};
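The probe_8259A() split above keeps the classic read-back probe: write a mask pattern to the master IMR and check whether it comes back, since a missing PIC reads as 0xff. A hedged sketch of the same idea against a stubbed port (stub_outb/stub_inb are stand-ins for real port I/O, not the kernel's accessors):

    #include <stdint.h>
    #include <stdio.h>

    #define PIC_MASTER_IMR  0x21
    #define PIC_CASCADE_IR  2

    /* Fake "hardware" for the sketch: a present PIC latches writes, a missing
     * one floats the bus so every read returns 0xff. */
    static int pic_present;
    static uint8_t imr_latch;

    static void stub_outb(uint8_t val, uint16_t port)
    {
            (void)port;
            if (pic_present)
                    imr_latch = val;
    }

    static uint8_t stub_inb(uint16_t port)
    {
            (void)port;
            return pic_present ? imr_latch : 0xff;
    }

    static int probe_pic(void)
    {
            uint8_t probe_val = (uint8_t)~(1 << PIC_CASCADE_IR);

            stub_outb(probe_val, PIC_MASTER_IMR);
            return stub_inb(PIC_MASTER_IMR) == probe_val;   /* 1: PIC answered */
    }

    int main(void)
    {
            printf("without PIC: %s\n", probe_pic() ? "found" : "not found");
            pic_present = 1;
            printf("with PIC:    %s\n", probe_pic() ? "found" : "not found");
            return 0;
    }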
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index dc5fa6a1e8d6..3512ba607361 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * x86 specific code for irq_work 2 * x86 specific code for irq_work
3 * 3 *
4 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
5 */ 5 */
6 6
7#include <linux/kernel.h> 7#include <linux/kernel.h>
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2c7aafa70702..2bd81e302427 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,7 @@
32static int kvmclock = 1; 32static int kvmclock = 1;
33static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 33static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
34static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 34static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
35static cycle_t kvm_sched_clock_offset;
35 36
36static int parse_no_kvmclock(char *arg) 37static int parse_no_kvmclock(char *arg)
37{ 38{
@@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
92 return kvm_clock_read(); 93 return kvm_clock_read();
93} 94}
94 95
96static cycle_t kvm_sched_clock_read(void)
97{
98 return kvm_clock_read() - kvm_sched_clock_offset;
99}
100
101static inline void kvm_sched_clock_init(bool stable)
102{
103 if (!stable) {
104 pv_time_ops.sched_clock = kvm_clock_read;
105 return;
106 }
107
108 kvm_sched_clock_offset = kvm_clock_read();
109 pv_time_ops.sched_clock = kvm_sched_clock_read;
110 set_sched_clock_stable();
111
112 printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
113 kvm_sched_clock_offset);
114
115 BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
116 sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
117}
118
95/* 119/*
96 * If we don't do that, there is the possibility that the guest 120 * If we don't do that, there is the possibility that the guest
97 * will calibrate under heavy load - thus, getting a lower lpj - 121 * will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@ void __init kvmclock_init(void)
248 memblock_free(mem, size); 272 memblock_free(mem, size);
249 return; 273 return;
250 } 274 }
251 pv_time_ops.sched_clock = kvm_clock_read; 275
276 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
277 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
278
279 cpu = get_cpu();
280 vcpu_time = &hv_clock[cpu].pvti;
281 flags = pvclock_read_flags(vcpu_time);
282
283 kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
284 put_cpu();
285
252 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 286 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
253 x86_platform.get_wallclock = kvm_get_wallclock; 287 x86_platform.get_wallclock = kvm_get_wallclock;
254 x86_platform.set_wallclock = kvm_set_wallclock; 288 x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@ void __init kvmclock_init(void)
265 kvm_get_preset_lpj(); 299 kvm_get_preset_lpj();
266 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); 300 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
267 pv_info.name = "KVM"; 301 pv_info.name = "KVM";
268
269 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
270 pvclock_set_flags(~0);
271
272 cpu = get_cpu();
273 vcpu_time = &hv_clock[cpu].pvti;
274 flags = pvclock_read_flags(vcpu_time);
275 if (flags & PVCLOCK_COUNTS_FROM_ZERO)
276 set_sched_clock_stable();
277 put_cpu();
278} 302}
279 303
280int __init kvm_setup_vsyscall_timeinfo(void) 304int __init kvm_setup_vsyscall_timeinfo(void)
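The kvmclock change above makes sched_clock() count from clock registration by latching the current reading as an offset and subtracting it on every call. A generic userspace sketch of that offset-at-init pattern (clock_now() is a stand-in for kvm_clock_read(), not the kernel function):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static uint64_t sched_clock_offset;

    /* Stand-in for kvm_clock_read(): any monotonic nanosecond counter works. */
    static uint64_t clock_now(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    static void sched_clock_init(void)
    {
            /* Latch the current reading so sched_clock() starts near zero. */
            sched_clock_offset = clock_now();
    }

    static uint64_t sched_clock(void)
    {
            return clock_now() - sched_clock_offset;
    }

    int main(void)
    {
            sched_clock_init();
            printf("sched_clock right after init: %llu ns\n",
                   (unsigned long long)sched_clock());
            return 0;
    }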
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
index ff3c3101d003..d1d35ccffed3 100644
--- a/arch/x86/kernel/livepatch.c
+++ b/arch/x86/kernel/livepatch.c
@@ -42,7 +42,6 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
42 bool readonly; 42 bool readonly;
43 unsigned long val; 43 unsigned long val;
44 unsigned long core = (unsigned long)mod->module_core; 44 unsigned long core = (unsigned long)mod->module_core;
45 unsigned long core_ro_size = mod->core_ro_size;
46 unsigned long core_size = mod->core_size; 45 unsigned long core_size = mod->core_size;
47 46
48 switch (type) { 47 switch (type) {
@@ -70,10 +69,12 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
70 /* loc does not point to any symbol inside the module */ 69 /* loc does not point to any symbol inside the module */
71 return -EINVAL; 70 return -EINVAL;
72 71
73 if (loc < core + core_ro_size) 72 readonly = false;
73
74#ifdef CONFIG_DEBUG_SET_MODULE_RONX
75 if (loc < core + mod->core_ro_size)
74 readonly = true; 76 readonly = true;
75 else 77#endif
76 readonly = false;
77 78
78 /* determine if the relocation spans a page boundary */ 79 /* determine if the relocation spans a page boundary */
79 numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; 80 numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
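The numpages calculation retained above decides whether the relocation target spans one page or two. A small worked check of the same expression with a fixed 4 KiB page size:

    #include <stdio.h>

    #define PAGE_SIZE       4096UL
    #define PAGE_MASK       (~(PAGE_SIZE - 1))

    /* Mirrors the expression in the hunk above. */
    static int numpages(unsigned long loc, unsigned long size)
    {
            return ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
    }

    int main(void)
    {
            /* A 4-byte write well inside a page vs. one straddling a boundary. */
            printf("0x1000, 4 bytes: %d page(s)\n", numpages(0x1000, 4));  /* 1 */
            printf("0x1ffe, 4 bytes: %d page(s)\n", numpages(0x1ffe, 4));  /* 2 */
            return 0;
    }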
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index 94ea120fa21f..87e1762e2bca 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -278,6 +278,12 @@ trace:
278 /* save_mcount_regs fills in first two parameters */ 278 /* save_mcount_regs fills in first two parameters */
279 save_mcount_regs 279 save_mcount_regs
280 280
281 /*
282 * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
283 * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
284 * ip and parent ip are used and the list function is called when
285 * function tracing is enabled.
286 */
281 call *ftrace_trace_function 287 call *ftrace_trace_function
282 288
283 restore_mcount_regs 289 restore_mcount_regs
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cd99433b8ba1..6ba014c61d62 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
90again: 90again:
91 page = NULL; 91 page = NULL;
92 /* CMA can be used only in the context which permits sleeping */ 92 /* CMA can be used only in the context which permits sleeping */
93 if (flag & __GFP_WAIT) { 93 if (gfpflags_allow_blocking(flag)) {
94 page = dma_alloc_from_contiguous(dev, count, get_order(size)); 94 page = dma_alloc_from_contiguous(dev, count, get_order(size));
95 if (page && page_to_phys(page) + size > dma_mask) { 95 if (page && page_to_phys(page) + size > dma_mask) {
96 dma_release_from_contiguous(dev, page, count); 96 dma_release_from_contiguous(dev, page, count);
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 4f00b63d7ff3..14415aff1813 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -4,10 +4,22 @@
4 */ 4 */
5#include <linux/platform_device.h> 5#include <linux/platform_device.h>
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/ioport.h>
8
9static int found(u64 start, u64 end, void *data)
10{
11 return 1;
12}
7 13
8static __init int register_e820_pmem(void) 14static __init int register_e820_pmem(void)
9{ 15{
16 char *pmem = "Persistent Memory (legacy)";
10 struct platform_device *pdev; 17 struct platform_device *pdev;
18 int rc;
19
20 rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
21 if (rc <= 0)
22 return 0;
11 23
12 /* 24 /*
13 * See drivers/nvdimm/e820.c for the implementation, this is 25 * See drivers/nvdimm/e820.c for the implementation, this is
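The pmem hunk above only registers the legacy platform device when a walk of the iomem resources finds at least one matching range; the callback returns 1 so the walk stops with a positive result. A self-contained sketch of that walk-and-bail pattern (the table and walker below are stand-ins, not the kernel's walk_iomem_res()):

    #include <stdio.h>
    #include <string.h>

    struct res { const char *name; unsigned long long start, end; };

    /* Stand-in resource table for the sketch. */
    static const struct res table[] = {
            { "System RAM",                 0x0,         0x7fffffff  },
            { "Persistent Memory (legacy)", 0x100000000, 0x17fffffff },
    };

    /* Walk entries matching @name; stop early when the callback returns nonzero. */
    static int walk_res(const char *name,
                        int (*fn)(unsigned long long, unsigned long long, void *),
                        void *data)
    {
            int rc = 0;

            for (size_t i = 0; i < sizeof(table) / sizeof(table[0]) && !rc; i++)
                    if (!strcmp(table[i].name, name))
                            rc = fn(table[i].start, table[i].end, data);
            return rc;
    }

    static int found(unsigned long long start, unsigned long long end, void *data)
    {
            (void)start; (void)end; (void)data;
            return 1;       /* one hit is enough; abort the walk */
    }

    int main(void)
    {
            if (walk_res("Persistent Memory (legacy)", found, NULL) <= 0)
                    printf("no legacy pmem, skip registration\n");
            else
                    printf("legacy pmem present, register the device\n");
            return 0;
    }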
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a1e4da98c8f0..d2bbe343fda7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1188,7 +1188,7 @@ void __init setup_arch(char **cmdline_p)
1188 */ 1188 */
1189 clone_pgd_range(initial_page_table, 1189 clone_pgd_range(initial_page_table,
1190 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 1190 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
1191 KERNEL_PGD_PTRS); 1191 min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
1192#endif 1192#endif
1193 1193
1194 tboot_probe(); 1194 tboot_probe();
@@ -1250,8 +1250,6 @@ void __init setup_arch(char **cmdline_p)
1250 if (efi_enabled(EFI_BOOT)) 1250 if (efi_enabled(EFI_BOOT))
1251 efi_apply_memmap_quirks(); 1251 efi_apply_memmap_quirks();
1252#endif 1252#endif
1253
1254 microcode_init();
1255} 1253}
1256 1254
1257#ifdef CONFIG_X86_32 1255#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b7ffb7c00075..cb6282c3638f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -690,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
690 signal_setup_done(failed, ksig, stepping); 690 signal_setup_done(failed, ksig, stepping);
691} 691}
692 692
693#ifdef CONFIG_X86_32 693static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
694#define NR_restart_syscall __NR_restart_syscall 694{
695#else /* !CONFIG_X86_32 */ 695#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64)
696#define NR_restart_syscall \ 696 return __NR_restart_syscall;
697 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall 697#else /* !CONFIG_X86_32 && CONFIG_X86_64 */
698#endif /* CONFIG_X86_32 */ 698 return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall :
699 __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
700#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */
701}
699 702
700/* 703/*
701 * Note that 'init' is a special process: it doesn't get signals it doesn't 704 * Note that 'init' is a special process: it doesn't get signals it doesn't
@@ -724,7 +727,7 @@ void do_signal(struct pt_regs *regs)
724 break; 727 break;
725 728
726 case -ERESTART_RESTARTBLOCK: 729 case -ERESTART_RESTARTBLOCK:
727 regs->ax = NR_restart_syscall; 730 regs->ax = get_nr_restart_syscall(regs);
728 regs->ip -= 2; 731 regs->ip -= 2;
729 break; 732 break;
730 } 733 }
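get_nr_restart_syscall() above keeps the x32 marker bit from the interrupted syscall when picking the restart number. A tiny sketch with the constants written out (the values shown are the ones commonly used on x86-64/x32 and are stated here for illustration):

    #include <stdio.h>

    #define __NR_restart_syscall    219             /* x86-64 native number */
    #define __X32_SYSCALL_BIT       0x40000000UL    /* set in x32 syscall numbers */

    static unsigned long restart_nr(unsigned long orig_ax)
    {
            /* Keep the x32 marker bit if the interrupted syscall carried it. */
            return __NR_restart_syscall | (orig_ax & __X32_SYSCALL_BIT);
    }

    int main(void)
    {
            printf("native task: %#lx\n", restart_nr(219));             /* 0xdb */
            printf("x32 task:    %#lx\n", restart_nr(0x40000001UL));    /* 0x400000db */
            return 0;
    }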
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4df777710ab7..f2281e9cfdbe 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid)
509 */ 509 */
510#define UDELAY_10MS_DEFAULT 10000 510#define UDELAY_10MS_DEFAULT 10000
511 511
512static unsigned int init_udelay = INT_MAX; 512static unsigned int init_udelay = UINT_MAX;
513 513
514static int __init cpu_init_udelay(char *str) 514static int __init cpu_init_udelay(char *str)
515{ 515{
@@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay);
522static void __init smp_quirk_init_udelay(void) 522static void __init smp_quirk_init_udelay(void)
523{ 523{
524 /* if cmdline changed it from default, leave it alone */ 524 /* if cmdline changed it from default, leave it alone */
525 if (init_udelay != INT_MAX) 525 if (init_udelay != UINT_MAX)
526 return; 526 return;
527 527
528 /* if modern processor, use no delay */ 528 /* if modern processor, use no delay */
529 if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || 529 if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
530 ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) 530 ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
531 init_udelay = 0; 531 init_udelay = 0;
532 532 return;
533 }
533 /* else, use legacy delay */ 534 /* else, use legacy delay */
534 init_udelay = UDELAY_10MS_DEFAULT; 535 init_udelay = UDELAY_10MS_DEFAULT;
535} 536}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index b9242bacbe59..4cf401f581e7 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -34,10 +34,11 @@
34#include <asm/msr-index.h> 34#include <asm/msr-index.h>
35 35
36verify_cpu: 36verify_cpu:
37 pushfl # Save caller passed flags 37 pushf # Save caller passed flags
38 pushl $0 # Kill any dangerous flags 38 push $0 # Kill any dangerous flags
39 popfl 39 popf
40 40
41#ifndef __x86_64__
41 pushfl # standard way to check for cpuid 42 pushfl # standard way to check for cpuid
42 popl %eax 43 popl %eax
43 movl %eax,%ebx 44 movl %eax,%ebx
@@ -48,6 +49,7 @@ verify_cpu:
48 popl %eax 49 popl %eax
49 cmpl %eax,%ebx 50 cmpl %eax,%ebx
50 jz verify_cpu_no_longmode # cpu has no cpuid 51 jz verify_cpu_no_longmode # cpu has no cpuid
52#endif
51 53
52 movl $0x0,%eax # See if cpuid 1 is implemented 54 movl $0x0,%eax # See if cpuid 1 is implemented
53 cpuid 55 cpuid
@@ -130,10 +132,10 @@ verify_cpu_sse_test:
130 jmp verify_cpu_sse_test # try again 132 jmp verify_cpu_sse_test # try again
131 133
132verify_cpu_no_longmode: 134verify_cpu_no_longmode:
133 popfl # Restore caller passed flags 135 popf # Restore caller passed flags
134 movl $1,%eax 136 movl $1,%eax
135 ret 137 ret
136verify_cpu_sse_ok: 138verify_cpu_sse_ok:
137 popfl # Restore caller passed flags 139 popf # Restore caller passed flags
138 xorl %eax, %eax 140 xorl %eax, %eax
139 ret 141 ret
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d8a1d56276e1..639a6e34500c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
28 select ANON_INODES 28 select ANON_INODES
29 select HAVE_KVM_IRQCHIP 29 select HAVE_KVM_IRQCHIP
30 select HAVE_KVM_IRQFD 30 select HAVE_KVM_IRQFD
31 select IRQ_BYPASS_MANAGER
32 select HAVE_KVM_IRQ_BYPASS
31 select HAVE_KVM_IRQ_ROUTING 33 select HAVE_KVM_IRQ_ROUTING
32 select HAVE_KVM_EVENTFD 34 select HAVE_KVM_EVENTFD
33 select KVM_APIC_ARCHITECTURE 35 select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index d090ecf08809..9dc091acd5fb 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include "irq.h" 22#include "irq.h"
23#include "assigned-dev.h" 23#include "assigned-dev.h"
24#include "trace/events/kvm.h"
24 25
25struct kvm_assigned_dev_kernel { 26struct kvm_assigned_dev_kernel {
26 struct kvm_irq_ack_notifier ack_notifier; 27 struct kvm_irq_ack_notifier ack_notifier;
@@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
131 return IRQ_HANDLED; 132 return IRQ_HANDLED;
132} 133}
133 134
134#ifdef __KVM_HAVE_MSI 135/*
136 * Deliver an IRQ in an atomic context if we can, or return a failure,
137 * user can retry in a process context.
138 * Return value:
139 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
140 * Other values - No need to retry.
141 */
142static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
143 int level)
144{
145 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
146 struct kvm_kernel_irq_routing_entry *e;
147 int ret = -EINVAL;
148 int idx;
149
150 trace_kvm_set_irq(irq, level, irq_source_id);
151
152 /*
153 * Injection into either PIC or IOAPIC might need to scan all CPUs,
154 * which would need to be retried from thread context; when same GSI
155 * is connected to both PIC and IOAPIC, we'd have to report a
156 * partial failure here.
157 * Since there's no easy way to do this, we only support injecting MSI
158 * which is limited to 1:1 GSI mapping.
159 */
160 idx = srcu_read_lock(&kvm->irq_srcu);
161 if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
162 e = &entries[0];
163 ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
164 irq, level);
165 }
166 srcu_read_unlock(&kvm->irq_srcu, idx);
167 return ret;
168}
169
170
135static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) 171static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
136{ 172{
137 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 173 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
150 186
151 return IRQ_HANDLED; 187 return IRQ_HANDLED;
152} 188}
153#endif
154 189
155#ifdef __KVM_HAVE_MSIX
156static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) 190static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
157{ 191{
158 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 192 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
183 217
184 return IRQ_HANDLED; 218 return IRQ_HANDLED;
185} 219}
186#endif
187 220
188/* Ack the irq line for an assigned device */ 221/* Ack the irq line for an assigned device */
189static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 222static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
386 return 0; 419 return 0;
387} 420}
388 421
389#ifdef __KVM_HAVE_MSI
390static int assigned_device_enable_host_msi(struct kvm *kvm, 422static int assigned_device_enable_host_msi(struct kvm *kvm,
391 struct kvm_assigned_dev_kernel *dev) 423 struct kvm_assigned_dev_kernel *dev)
392{ 424{
@@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
408 440
409 return 0; 441 return 0;
410} 442}
411#endif
412 443
413#ifdef __KVM_HAVE_MSIX
414static int assigned_device_enable_host_msix(struct kvm *kvm, 444static int assigned_device_enable_host_msix(struct kvm *kvm,
415 struct kvm_assigned_dev_kernel *dev) 445 struct kvm_assigned_dev_kernel *dev)
416{ 446{
@@ -443,8 +473,6 @@ err:
443 return r; 473 return r;
444} 474}
445 475
446#endif
447
448static int assigned_device_enable_guest_intx(struct kvm *kvm, 476static int assigned_device_enable_guest_intx(struct kvm *kvm,
449 struct kvm_assigned_dev_kernel *dev, 477 struct kvm_assigned_dev_kernel *dev,
450 struct kvm_assigned_irq *irq) 478 struct kvm_assigned_irq *irq)
@@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
454 return 0; 482 return 0;
455} 483}
456 484
457#ifdef __KVM_HAVE_MSI
458static int assigned_device_enable_guest_msi(struct kvm *kvm, 485static int assigned_device_enable_guest_msi(struct kvm *kvm,
459 struct kvm_assigned_dev_kernel *dev, 486 struct kvm_assigned_dev_kernel *dev,
460 struct kvm_assigned_irq *irq) 487 struct kvm_assigned_irq *irq)
@@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
463 dev->ack_notifier.gsi = -1; 490 dev->ack_notifier.gsi = -1;
464 return 0; 491 return 0;
465} 492}
466#endif
467 493
468#ifdef __KVM_HAVE_MSIX
469static int assigned_device_enable_guest_msix(struct kvm *kvm, 494static int assigned_device_enable_guest_msix(struct kvm *kvm,
470 struct kvm_assigned_dev_kernel *dev, 495 struct kvm_assigned_dev_kernel *dev,
471 struct kvm_assigned_irq *irq) 496 struct kvm_assigned_irq *irq)
@@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
474 dev->ack_notifier.gsi = -1; 499 dev->ack_notifier.gsi = -1;
475 return 0; 500 return 0;
476} 501}
477#endif
478 502
479static int assign_host_irq(struct kvm *kvm, 503static int assign_host_irq(struct kvm *kvm,
480 struct kvm_assigned_dev_kernel *dev, 504 struct kvm_assigned_dev_kernel *dev,
@@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
492 case KVM_DEV_IRQ_HOST_INTX: 516 case KVM_DEV_IRQ_HOST_INTX:
493 r = assigned_device_enable_host_intx(kvm, dev); 517 r = assigned_device_enable_host_intx(kvm, dev);
494 break; 518 break;
495#ifdef __KVM_HAVE_MSI
496 case KVM_DEV_IRQ_HOST_MSI: 519 case KVM_DEV_IRQ_HOST_MSI:
497 r = assigned_device_enable_host_msi(kvm, dev); 520 r = assigned_device_enable_host_msi(kvm, dev);
498 break; 521 break;
499#endif
500#ifdef __KVM_HAVE_MSIX
501 case KVM_DEV_IRQ_HOST_MSIX: 522 case KVM_DEV_IRQ_HOST_MSIX:
502 r = assigned_device_enable_host_msix(kvm, dev); 523 r = assigned_device_enable_host_msix(kvm, dev);
503 break; 524 break;
504#endif
505 default: 525 default:
506 r = -EINVAL; 526 r = -EINVAL;
507 } 527 }
@@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
534 case KVM_DEV_IRQ_GUEST_INTX: 554 case KVM_DEV_IRQ_GUEST_INTX:
535 r = assigned_device_enable_guest_intx(kvm, dev, irq); 555 r = assigned_device_enable_guest_intx(kvm, dev, irq);
536 break; 556 break;
537#ifdef __KVM_HAVE_MSI
538 case KVM_DEV_IRQ_GUEST_MSI: 557 case KVM_DEV_IRQ_GUEST_MSI:
539 r = assigned_device_enable_guest_msi(kvm, dev, irq); 558 r = assigned_device_enable_guest_msi(kvm, dev, irq);
540 break; 559 break;
541#endif
542#ifdef __KVM_HAVE_MSIX
543 case KVM_DEV_IRQ_GUEST_MSIX: 560 case KVM_DEV_IRQ_GUEST_MSIX:
544 r = assigned_device_enable_guest_msix(kvm, dev, irq); 561 r = assigned_device_enable_guest_msix(kvm, dev, irq);
545 break; 562 break;
546#endif
547 default: 563 default:
548 r = -EINVAL; 564 r = -EINVAL;
549 } 565 }
@@ -826,7 +842,6 @@ out:
826} 842}
827 843
828 844
829#ifdef __KVM_HAVE_MSIX
830static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 845static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
831 struct kvm_assigned_msix_nr *entry_nr) 846 struct kvm_assigned_msix_nr *entry_nr)
832{ 847{
@@ -906,7 +921,6 @@ msix_entry_out:
906 921
907 return r; 922 return r;
908} 923}
909#endif
910 924
911static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, 925static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
912 struct kvm_assigned_pci_dev *assigned_dev) 926 struct kvm_assigned_pci_dev *assigned_dev)
@@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1012 goto out; 1026 goto out;
1013 break; 1027 break;
1014 } 1028 }
1015#ifdef __KVM_HAVE_MSIX
1016 case KVM_ASSIGN_SET_MSIX_NR: { 1029 case KVM_ASSIGN_SET_MSIX_NR: {
1017 struct kvm_assigned_msix_nr entry_nr; 1030 struct kvm_assigned_msix_nr entry_nr;
1018 r = -EFAULT; 1031 r = -EFAULT;
@@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
1033 goto out; 1046 goto out;
1034 break; 1047 break;
1035 } 1048 }
1036#endif
1037 case KVM_ASSIGN_SET_INTX_MASK: { 1049 case KVM_ASSIGN_SET_INTX_MASK: {
1038 struct kvm_assigned_pci_dev assigned_dev; 1050 struct kvm_assigned_pci_dev assigned_dev;
1039 1051
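The kvm_set_irq_inatomic() comment moved in above describes a general contract: try to deliver from atomic context, and return -EWOULDBLOCK so the caller retries from a thread when the fast path does not apply. A stripped-down sketch of that contract, with all KVM types and locking replaced by stand-ins:

    #include <errno.h>
    #include <stdio.h>

    /* Stand-in event: only "simple" events can be handled without sleeping. */
    struct event { int simple; int payload; };

    static int deliver_fast(const struct event *ev)
    {
            if (!ev->simple)
                    return -EWOULDBLOCK;    /* caller must retry from thread context */
            printf("delivered %d from the hard-IRQ path\n", ev->payload);
            return 0;
    }

    static void deliver_slow(const struct event *ev)
    {
            printf("delivered %d from the threaded handler\n", ev->payload);
    }

    static void hard_irq(const struct event *ev)
    {
            if (deliver_fast(ev) == -EWOULDBLOCK)
                    deliver_slow(ev);       /* in the kernel this runs in a thread */
    }

    int main(void)
    {
            struct event msi  = { .simple = 1, .payload = 1 };
            struct event intx = { .simple = 0, .payload = 2 };

            hard_irq(&msi);
            hard_irq(&intx);
            return 0;
    }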
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 156441bcaac8..6525e926f566 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
348 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 348 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
349 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | 349 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
350 F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | 350 F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
351 F(AVX512CD); 351 F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
352 352
353 /* cpuid 0xD.1.eax */ 353 /* cpuid 0xD.1.eax */
354 const u32 kvm_supported_word10_x86_features = 354 const u32 kvm_supported_word10_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9cef6ae..06332cb7e7d1 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
133 best = kvm_find_cpuid_entry(vcpu, 7, 0); 133 best = kvm_find_cpuid_entry(vcpu, 7, 0);
134 return best && (best->ebx & bit(X86_FEATURE_MPX)); 134 return best && (best->ebx & bit(X86_FEATURE_MPX));
135} 135}
136
137static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
138{
139 struct kvm_cpuid_entry2 *best;
140
141 best = kvm_find_cpuid_entry(vcpu, 7, 0);
142 return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
143}
144
145static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
146{
147 struct kvm_cpuid_entry2 *best;
148
149 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
150 return best && (best->edx & bit(X86_FEATURE_RDTSCP));
151}
152
153/*
154 * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
155 */
156#define BIT_NRIPS 3
157
158static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
159{
160 struct kvm_cpuid_entry2 *best;
161
162 best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
163
164 /*
165 * NRIPS is a scattered cpuid feature, so we can't use
166 * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
167 * position 8, not 3).
168 */
169 return best && (best->edx & bit(BIT_NRIPS));
170}
171#undef BIT_NRIPS
172
136#endif 173#endif
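guest_cpuid_has_nrips() above reads the NRIPS bit straight out of CPUID leaf 0x8000000a EDX because the feature is scattered. A host-side sketch doing the same raw check with the GCC/Clang cpuid intrinsic (prints a notice on CPUs that do not expose the SVM leaf):

    #include <cpuid.h>
    #include <stdio.h>

    #define BIT_NRIPS 3     /* CPUID 0x8000000a, EDX bit 3, as in the hunk above */

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
                    printf("leaf 0x8000000a not available (no SVM)\n");
                    return 0;
            }
            printf("NRIPS (next-RIP save): %s\n",
                   (edx & (1u << BIT_NRIPS)) ? "present" : "absent");
            return 0;
    }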
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9da95b9daf8d..1505587d06e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
2272#define GET_SMSTATE(type, smbase, offset) \ 2272#define GET_SMSTATE(type, smbase, offset) \
2273 ({ \ 2273 ({ \
2274 type __val; \ 2274 type __val; \
2275 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ 2275 int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
2276 sizeof(__val), NULL); \ 2276 sizeof(__val)); \
2277 if (r != X86EMUL_CONTINUE) \ 2277 if (r != X86EMUL_CONTINUE) \
2278 return X86EMUL_UNHANDLEABLE; \ 2278 return X86EMUL_UNHANDLEABLE; \
2279 __val; \ 2279 __val; \
@@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
2484 2484
2485 /* 2485 /*
2486 * Get back to real mode, to prepare a safe state in which to load 2486 * Get back to real mode, to prepare a safe state in which to load
2487 * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed 2487 * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
2488 * to read_std/write_std are not virtual. 2488 * supports long mode.
2489 *
2490 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
2491 */ 2489 */
2490 cr4 = ctxt->ops->get_cr(ctxt, 4);
2491 if (emulator_has_longmode(ctxt)) {
2492 struct desc_struct cs_desc;
2493
2494 /* Zero CR4.PCIDE before CR0.PG. */
2495 if (cr4 & X86_CR4_PCIDE) {
2496 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
2497 cr4 &= ~X86_CR4_PCIDE;
2498 }
2499
2500 /* A 32-bit code segment is required to clear EFER.LMA. */
2501 memset(&cs_desc, 0, sizeof(cs_desc));
2502 cs_desc.type = 0xb;
2503 cs_desc.s = cs_desc.g = cs_desc.p = 1;
2504 ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
2505 }
2506
2507 /* For the 64-bit case, this will clear EFER.LMA. */
2492 cr0 = ctxt->ops->get_cr(ctxt, 0); 2508 cr0 = ctxt->ops->get_cr(ctxt, 0);
2493 if (cr0 & X86_CR0_PE) 2509 if (cr0 & X86_CR0_PE)
2494 ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); 2510 ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
2495 cr4 = ctxt->ops->get_cr(ctxt, 4); 2511
2512 /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
2496 if (cr4 & X86_CR4_PAE) 2513 if (cr4 & X86_CR4_PAE)
2497 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); 2514 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
2515
2516 /* And finally go back to 32-bit mode. */
2498 efer = 0; 2517 efer = 0;
2499 ctxt->ops->set_msr(ctxt, MSR_EFER, efer); 2518 ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
2500 2519
@@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
4455 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, 4474 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
4456 /* 0xA8 - 0xAF */ 4475 /* 0xA8 - 0xAF */
4457 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 4476 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
4458 II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), 4477 II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
4459 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4478 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
4460 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), 4479 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
4461 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), 4480 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a8160d2ae362..62cf8c915e95 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
41 case HV_X64_MSR_TIME_REF_COUNT: 41 case HV_X64_MSR_TIME_REF_COUNT:
42 case HV_X64_MSR_CRASH_CTL: 42 case HV_X64_MSR_CRASH_CTL:
43 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 43 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
44 case HV_X64_MSR_RESET:
44 r = true; 45 r = true;
45 break; 46 break;
46 } 47 }
@@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
163 data); 164 data);
164 case HV_X64_MSR_CRASH_CTL: 165 case HV_X64_MSR_CRASH_CTL:
165 return kvm_hv_msr_set_crash_ctl(vcpu, data, host); 166 return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
167 case HV_X64_MSR_RESET:
168 if (data == 1) {
169 vcpu_debug(vcpu, "hyper-v reset requested\n");
170 kvm_make_request(KVM_REQ_HV_RESET, vcpu);
171 }
172 break;
166 default: 173 default:
167 vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", 174 vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
168 msr, data); 175 msr, data);
@@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
171 return 0; 178 return 0;
172} 179}
173 180
174static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 181/* Calculate cpu time spent by current task in 100ns units */
182static u64 current_task_runtime_100ns(void)
183{
184 cputime_t utime, stime;
185
186 task_cputime_adjusted(current, &utime, &stime);
187 return div_u64(cputime_to_nsecs(utime + stime), 100);
188}
189
190static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
175{ 191{
176 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; 192 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
177 193
@@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
205 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); 221 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
206 case HV_X64_MSR_TPR: 222 case HV_X64_MSR_TPR:
207 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 223 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
224 case HV_X64_MSR_VP_RUNTIME:
225 if (!host)
226 return 1;
227 hv->runtime_offset = data - current_task_runtime_100ns();
228 break;
208 default: 229 default:
209 vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", 230 vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
210 msr, data); 231 msr, data);
@@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
241 pdata); 262 pdata);
242 case HV_X64_MSR_CRASH_CTL: 263 case HV_X64_MSR_CRASH_CTL:
243 return kvm_hv_msr_get_crash_ctl(vcpu, pdata); 264 return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
265 case HV_X64_MSR_RESET:
266 data = 0;
267 break;
244 default: 268 default:
245 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 269 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
246 return 1; 270 return 1;
@@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
277 case HV_X64_MSR_APIC_ASSIST_PAGE: 301 case HV_X64_MSR_APIC_ASSIST_PAGE:
278 data = hv->hv_vapic; 302 data = hv->hv_vapic;
279 break; 303 break;
304 case HV_X64_MSR_VP_RUNTIME:
305 data = current_task_runtime_100ns() + hv->runtime_offset;
306 break;
280 default: 307 default:
281 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 308 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
282 return 1; 309 return 1;
@@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
295 mutex_unlock(&vcpu->kvm->lock); 322 mutex_unlock(&vcpu->kvm->lock);
296 return r; 323 return r;
297 } else 324 } else
298 return kvm_hv_set_msr(vcpu, msr, data); 325 return kvm_hv_set_msr(vcpu, msr, data, host);
299} 326}
300 327
301int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 328int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
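current_task_runtime_100ns() above folds task CPU time into the 100 ns units that HV_X64_MSR_VP_RUNTIME reports, and the MSR write path stores an offset relative to it. A rough userspace analogue using the POSIX per-process CPU clock (the clock source is a stand-in for the kernel's task_cputime_adjusted()):

    #include <inttypes.h>
    #include <stdio.h>
    #include <time.h>

    /* Hyper-V reports VP run time in 100 ns units. */
    static uint64_t runtime_100ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
            return ((uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec) / 100;
    }

    int main(void)
    {
            uint64_t offset = 0;    /* a host write would set: data - runtime_100ns() */

            /* Burn a little CPU so the counter moves. */
            for (volatile unsigned long i = 0; i < 10000000; i++)
                    ;
            printf("VP_RUNTIME would read: %" PRIu64 " (100 ns units)\n",
                   runtime_100ns() + offset);
            return 0;
    }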
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f90952f64e79..08116ff227cc 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -35,6 +35,7 @@
35#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37 37
38#include "ioapic.h"
38#include "irq.h" 39#include "irq.h"
39#include "i8254.h" 40#include "i8254.h"
40#include "x86.h" 41#include "x86.h"
@@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
333 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 334 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
334 s64 interval; 335 s64 interval;
335 336
336 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) 337 if (!ioapic_in_kernel(kvm) ||
338 ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
337 return; 339 return;
338 340
339 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 341 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f79105bb5..88d0a92d3f94 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
233} 233}
234 234
235 235
236static void update_handled_vectors(struct kvm_ioapic *ioapic) 236void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
237{
238 DECLARE_BITMAP(handled_vectors, 256);
239 int i;
240
241 memset(handled_vectors, 0, sizeof(handled_vectors));
242 for (i = 0; i < IOAPIC_NUM_PINS; ++i)
243 __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
244 memcpy(ioapic->handled_vectors, handled_vectors,
245 sizeof(handled_vectors));
246 smp_wmb();
247}
248
249void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
250 u32 *tmr)
251{ 237{
252 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 238 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
253 union kvm_ioapic_redirect_entry *e; 239 union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
260 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || 246 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
261 index == RTC_GSI) { 247 index == RTC_GSI) {
262 if (kvm_apic_match_dest(vcpu, NULL, 0, 248 if (kvm_apic_match_dest(vcpu, NULL, 0,
263 e->fields.dest_id, e->fields.dest_mode)) { 249 e->fields.dest_id, e->fields.dest_mode) ||
250 (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
251 kvm_apic_pending_eoi(vcpu, e->fields.vector)))
264 __set_bit(e->fields.vector, 252 __set_bit(e->fields.vector,
265 (unsigned long *)eoi_exit_bitmap); 253 (unsigned long *)eoi_exit_bitmap);
266 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
267 __set_bit(e->fields.vector,
268 (unsigned long *)tmr);
269 }
270 } 254 }
271 } 255 }
272 spin_unlock(&ioapic->lock); 256 spin_unlock(&ioapic->lock);
@@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
315 e->bits |= (u32) val; 299 e->bits |= (u32) val;
316 e->fields.remote_irr = 0; 300 e->fields.remote_irr = 0;
317 } 301 }
318 update_handled_vectors(ioapic);
319 mask_after = e->fields.mask; 302 mask_after = e->fields.mask;
320 if (mask_before != mask_after) 303 if (mask_before != mask_after)
321 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); 304 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
599 ioapic->id = 0; 582 ioapic->id = 0;
600 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); 583 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
601 rtc_irq_eoi_tracking_reset(ioapic); 584 rtc_irq_eoi_tracking_reset(ioapic);
602 update_handled_vectors(ioapic);
603} 585}
604 586
605static const struct kvm_io_device_ops ioapic_mmio_ops = { 587static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
628 if (ret < 0) { 610 if (ret < 0) {
629 kvm->arch.vioapic = NULL; 611 kvm->arch.vioapic = NULL;
630 kfree(ioapic); 612 kfree(ioapic);
613 return ret;
631 } 614 }
632 615
616 kvm_vcpu_request_scan_ioapic(kvm);
633 return ret; 617 return ret;
634} 618}
635 619
@@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
666 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 650 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
667 ioapic->irr = 0; 651 ioapic->irr = 0;
668 ioapic->irr_delivered = 0; 652 ioapic->irr_delivered = 0;
669 update_handled_vectors(ioapic);
670 kvm_vcpu_request_scan_ioapic(kvm); 653 kvm_vcpu_request_scan_ioapic(kvm);
671 kvm_ioapic_inject_all(ioapic, state->irr); 654 kvm_ioapic_inject_all(ioapic, state->irr);
672 spin_unlock(&ioapic->lock); 655 spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4e6256..084617d37c74 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -9,6 +9,7 @@ struct kvm;
9struct kvm_vcpu; 9struct kvm_vcpu;
10 10
11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS 11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
12#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
12#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ 13#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
13#define IOAPIC_EDGE_TRIG 0 14#define IOAPIC_EDGE_TRIG 0
14#define IOAPIC_LEVEL_TRIG 1 15#define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@ struct kvm_ioapic {
73 struct kvm *kvm; 74 struct kvm *kvm;
74 void (*ack_notifier)(void *opaque, int irq); 75 void (*ack_notifier)(void *opaque, int irq);
75 spinlock_t lock; 76 spinlock_t lock;
76 DECLARE_BITMAP(handled_vectors, 256);
77 struct rtc_status rtc_status; 77 struct rtc_status rtc_status;
78 struct delayed_work eoi_inject; 78 struct delayed_work eoi_inject;
79 u32 irq_eoi[IOAPIC_NUM_PINS]; 79 u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
98 return kvm->arch.vioapic; 98 return kvm->arch.vioapic;
99} 99}
100 100
101static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) 101static inline int ioapic_in_kernel(struct kvm *kvm)
102{ 102{
103 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 103 int ret;
104 smp_rmb(); 104
105 return test_bit(vector, ioapic->handled_vectors); 105 ret = (ioapic_irqchip(kvm) != NULL);
106 return ret;
106} 107}
107 108
108void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 109void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
120 struct kvm_lapic_irq *irq, unsigned long *dest_map); 121 struct kvm_lapic_irq *irq, unsigned long *dest_map);
121int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 122int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
122int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 123int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
123void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, 124void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
124 u32 *tmr); 125void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
125 126
126#endif 127#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50a05a..097060e33bd6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
38EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
39 39
40/* 40/*
41 * check if there is a pending userspace external interrupt
42 */
43static int pending_userspace_extint(struct kvm_vcpu *v)
44{
45 return v->arch.pending_external_vector != -1;
46}
47
48/*
41 * check if there is pending interrupt from 49 * check if there is pending interrupt from
42 * non-APIC source without intack. 50 * non-APIC source without intack.
43 */ 51 */
44static int kvm_cpu_has_extint(struct kvm_vcpu *v) 52static int kvm_cpu_has_extint(struct kvm_vcpu *v)
45{ 53{
46 if (kvm_apic_accept_pic_intr(v)) 54 u8 accept = kvm_apic_accept_pic_intr(v);
47 return pic_irqchip(v->kvm)->output; /* PIC */ 55
48 else 56 if (accept) {
57 if (irqchip_split(v->kvm))
58 return pending_userspace_extint(v);
59 else
60 return pic_irqchip(v->kvm)->output;
61 } else
49 return 0; 62 return 0;
50} 63}
51 64
@@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
57 */ 70 */
58int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) 71int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
59{ 72{
60 if (!irqchip_in_kernel(v->kvm)) 73 if (!lapic_in_kernel(v))
61 return v->arch.interrupt.pending; 74 return v->arch.interrupt.pending;
62 75
63 if (kvm_cpu_has_extint(v)) 76 if (kvm_cpu_has_extint(v))
64 return 1; 77 return 1;
65 78
66 if (kvm_apic_vid_enabled(v->kvm)) 79 if (kvm_vcpu_apic_vid_enabled(v))
67 return 0; 80 return 0;
68 81
69 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ 82 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
75 */ 88 */
76int kvm_cpu_has_interrupt(struct kvm_vcpu *v) 89int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
77{ 90{
78 if (!irqchip_in_kernel(v->kvm)) 91 if (!lapic_in_kernel(v))
79 return v->arch.interrupt.pending; 92 return v->arch.interrupt.pending;
80 93
81 if (kvm_cpu_has_extint(v)) 94 if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
91 */ 104 */
92static int kvm_cpu_get_extint(struct kvm_vcpu *v) 105static int kvm_cpu_get_extint(struct kvm_vcpu *v)
93{ 106{
94 if (kvm_cpu_has_extint(v)) 107 if (kvm_cpu_has_extint(v)) {
95 return kvm_pic_read_irq(v->kvm); /* PIC */ 108 if (irqchip_split(v->kvm)) {
96 return -1; 109 int vector = v->arch.pending_external_vector;
110
111 v->arch.pending_external_vector = -1;
112 return vector;
113 } else
114 return kvm_pic_read_irq(v->kvm); /* PIC */
115 } else
116 return -1;
97} 117}
98 118
99/* 119/*
@@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
103{ 123{
104 int vector; 124 int vector;
105 125
106 if (!irqchip_in_kernel(v->kvm)) 126 if (!lapic_in_kernel(v))
107 return v->arch.interrupt.nr; 127 return v->arch.interrupt.nr;
108 128
109 vector = kvm_cpu_get_extint(v); 129 vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 3d782a2c336a..ae5c78f2337d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
83 return kvm->arch.vpic; 83 return kvm->arch.vpic;
84} 84}
85 85
86static inline int pic_in_kernel(struct kvm *kvm)
87{
88 int ret;
89
90 ret = (pic_irqchip(kvm) != NULL);
91 return ret;
92}
93
94static inline int irqchip_split(struct kvm *kvm)
95{
96 return kvm->arch.irqchip_split;
97}
98
86static inline int irqchip_in_kernel(struct kvm *kvm) 99static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 100{
88 struct kvm_pic *vpic = pic_irqchip(kvm); 101 struct kvm_pic *vpic = pic_irqchip(kvm);
102 bool ret;
103
104 ret = (vpic != NULL);
105 ret |= irqchip_split(kvm);
89 106
90 /* Read vpic before kvm->irq_routing. */ 107 /* Read vpic before kvm->irq_routing. */
91 smp_rmb(); 108 smp_rmb();
92 return vpic != NULL; 109 return ret;
110}
111
112static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
113{
114 /* Same as irqchip_in_kernel(vcpu->kvm), but with less
115 * pointer chasing and no unnecessary memory barriers.
116 */
117 return vcpu->arch.apic != NULL;
93} 118}
94 119
95void kvm_pic_reset(struct kvm_kpic_state *s); 120void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e5b58c..84b96d319909 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
91 return r; 91 return r;
92} 92}
93 93
94static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, 94void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
95 struct kvm_lapic_irq *irq) 95 struct kvm_lapic_irq *irq)
96{ 96{
97 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); 97 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
98 98
@@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
108 irq->level = 1; 108 irq->level = 1;
109 irq->shorthand = 0; 109 irq->shorthand = 0;
110} 110}
111EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
111 112
112int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 113int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
113 struct kvm *kvm, int irq_source_id, int level, bool line_status) 114 struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -123,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
123} 124}
124 125
125 126
126static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, 127int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
127 struct kvm *kvm) 128 struct kvm *kvm, int irq_source_id, int level,
129 bool line_status)
128{ 130{
129 struct kvm_lapic_irq irq; 131 struct kvm_lapic_irq irq;
130 int r; 132 int r;
131 133
134 if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
135 return -EWOULDBLOCK;
136
132 kvm_set_msi_irq(e, &irq); 137 kvm_set_msi_irq(e, &irq);
133 138
134 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) 139 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@@ -137,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
137 return -EWOULDBLOCK; 142 return -EWOULDBLOCK;
138} 143}
139 144
140/*
141 * Deliver an IRQ in an atomic context if we can, or return a failure,
142 * user can retry in a process context.
143 * Return value:
144 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
145 * Other values - No need to retry.
146 */
147int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
148{
149 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
150 struct kvm_kernel_irq_routing_entry *e;
151 int ret = -EINVAL;
152 int idx;
153
154 trace_kvm_set_irq(irq, level, irq_source_id);
155
156 /*
157 * Injection into either PIC or IOAPIC might need to scan all CPUs,
158 * which would need to be retried from thread context; when same GSI
159 * is connected to both PIC and IOAPIC, we'd have to report a
160 * partial failure here.
161 * Since there's no easy way to do this, we only support injecting MSI
162 * which is limited to 1:1 GSI mapping.
163 */
164 idx = srcu_read_lock(&kvm->irq_srcu);
165 if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
166 e = &entries[0];
167 if (likely(e->type == KVM_IRQ_ROUTING_MSI))
168 ret = kvm_set_msi_inatomic(e, kvm);
169 else
170 ret = -EWOULDBLOCK;
171 }
172 srcu_read_unlock(&kvm->irq_srcu, idx);
173 return ret;
174}
175
176int kvm_request_irq_source_id(struct kvm *kvm) 145int kvm_request_irq_source_id(struct kvm *kvm)
177{ 146{
178 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; 147 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -208,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
208 goto unlock; 177 goto unlock;
209 } 178 }
210 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); 179 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
211 if (!irqchip_in_kernel(kvm)) 180 if (!ioapic_in_kernel(kvm))
212 goto unlock; 181 goto unlock;
213 182
214 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); 183 kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -297,6 +266,33 @@ out:
297 return r; 266 return r;
298} 267}
299 268
269bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
270 struct kvm_vcpu **dest_vcpu)
271{
272 int i, r = 0;
273 struct kvm_vcpu *vcpu;
274
275 if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
276 return true;
277
278 kvm_for_each_vcpu(i, vcpu, kvm) {
279 if (!kvm_apic_present(vcpu))
280 continue;
281
282 if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
283 irq->dest_id, irq->dest_mode))
284 continue;
285
286 if (++r == 2)
287 return false;
288
289 *dest_vcpu = vcpu;
290 }
291
292 return r == 1;
293}
294EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
295
300#define IOAPIC_ROUTING_ENTRY(irq) \ 296#define IOAPIC_ROUTING_ENTRY(irq) \
301 { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ 297 { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
302 .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } 298 .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
328 return kvm_set_irq_routing(kvm, default_routing, 324 return kvm_set_irq_routing(kvm, default_routing,
329 ARRAY_SIZE(default_routing), 0); 325 ARRAY_SIZE(default_routing), 0);
330} 326}
327
328static const struct kvm_irq_routing_entry empty_routing[] = {};
329
330int kvm_setup_empty_irq_routing(struct kvm *kvm)
331{
332 return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
333}
334
335void kvm_arch_irq_routing_update(struct kvm *kvm)
336{
337 if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
338 return;
339 kvm_make_scan_ioapic_request(kvm);
340}
341
342void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
343{
344 struct kvm *kvm = vcpu->kvm;
345 struct kvm_kernel_irq_routing_entry *entry;
346 struct kvm_irq_routing_table *table;
347 u32 i, nr_ioapic_pins;
348 int idx;
349
350 /* kvm->irq_routing must be read after clearing
351 * KVM_SCAN_IOAPIC. */
352 smp_mb();
353 idx = srcu_read_lock(&kvm->irq_srcu);
354 table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
355 nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
356 kvm->arch.nr_reserved_ioapic_pins);
357 for (i = 0; i < nr_ioapic_pins; ++i) {
358 hlist_for_each_entry(entry, &table->map[i], link) {
359 u32 dest_id, dest_mode;
360 bool level;
361
362 if (entry->type != KVM_IRQ_ROUTING_MSI)
363 continue;
364 dest_id = (entry->msi.address_lo >> 12) & 0xff;
365 dest_mode = (entry->msi.address_lo >> 2) & 0x1;
366 level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
367 if (level && kvm_apic_match_dest(vcpu, NULL, 0,
368 dest_id, dest_mode)) {
369 u32 vector = entry->msi.data & 0xff;
370
371 __set_bit(vector,
372 (unsigned long *) eoi_exit_bitmap);
373 }
374 }
375 }
376 srcu_read_unlock(&kvm->irq_srcu, idx);
377}
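kvm_scan_ioapic_routes() above extracts the destination ID, destination mode, trigger mode and vector from an MSI route with fixed shifts. A standalone decoder using the same bit positions (MSI_DATA_TRIGGER_LEVEL taken as bit 15, matching the x86 MSI layout assumed here) makes the field layout explicit:

    #include <stdint.h>
    #include <stdio.h>

    #define MSI_DATA_TRIGGER_LEVEL  (1u << 15)

    struct msi_fields { uint32_t dest_id, dest_mode, vector; int level; };

    /* Same shifts as the scan loop above. */
    static struct msi_fields msi_decode(uint32_t address_lo, uint32_t data)
    {
            struct msi_fields f;

            f.dest_id   = (address_lo >> 12) & 0xff;    /* APIC destination ID   */
            f.dest_mode = (address_lo >> 2) & 0x1;      /* 0 physical, 1 logical */
            f.vector    = data & 0xff;
            f.level     = !!(data & MSI_DATA_TRIGGER_LEVEL);
            return f;
    }

    int main(void)
    {
            /* Address 0xfee01000: dest 0x01, physical; data 0x8031: level, vector 0x31. */
            struct msi_fields f = msi_decode(0xfee01000, 0x8031);

            printf("dest_id=%#x dest_mode=%u vector=%#x level=%d\n",
                   f.dest_id, f.dest_mode, f.vector, f.level);
            return 0;
    }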
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8d9013c5e1ee..4d30b865be30 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,7 @@ out:
209 if (old) 209 if (old)
210 kfree_rcu(old, rcu); 210 kfree_rcu(old, rcu);
211 211
212 kvm_vcpu_request_scan_ioapic(kvm); 212 kvm_make_scan_ioapic_request(kvm);
213} 213}
214 214
215static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) 215static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
348 struct kvm_lapic *apic = vcpu->arch.apic; 348 struct kvm_lapic *apic = vcpu->arch.apic;
349 349
350 __kvm_apic_update_irr(pir, apic->regs); 350 __kvm_apic_update_irr(pir, apic->regs);
351
352 kvm_make_request(KVM_REQ_EVENT, vcpu);
351} 353}
352EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 354EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
353 355
@@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
390 392
391 vcpu = apic->vcpu; 393 vcpu = apic->vcpu;
392 394
393 if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { 395 if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
394 /* try to update RVI */ 396 /* try to update RVI */
395 apic_clear_vector(vec, apic->regs + APIC_IRR); 397 apic_clear_vector(vec, apic->regs + APIC_IRR);
396 kvm_make_request(KVM_REQ_EVENT, vcpu); 398 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
551 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 553 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
552} 554}
553 555
554void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
555{
556 struct kvm_lapic *apic = vcpu->arch.apic;
557 int i;
558
559 for (i = 0; i < 8; i++)
560 apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
561}
562
563static void apic_update_ppr(struct kvm_lapic *apic) 556static void apic_update_ppr(struct kvm_lapic *apic)
564{ 557{
565 u32 tpr, isrv, ppr, old_ppr; 558 u32 tpr, isrv, ppr, old_ppr;
@@ -764,6 +757,65 @@ out:
764 return ret; 757 return ret;
765} 758}
766 759
760bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
761 struct kvm_vcpu **dest_vcpu)
762{
763 struct kvm_apic_map *map;
764 bool ret = false;
765 struct kvm_lapic *dst = NULL;
766
767 if (irq->shorthand)
768 return false;
769
770 rcu_read_lock();
771 map = rcu_dereference(kvm->arch.apic_map);
772
773 if (!map)
774 goto out;
775
776 if (irq->dest_mode == APIC_DEST_PHYSICAL) {
777 if (irq->dest_id == 0xFF)
778 goto out;
779
780 if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
781 goto out;
782
783 dst = map->phys_map[irq->dest_id];
784 if (dst && kvm_apic_present(dst->vcpu))
785 *dest_vcpu = dst->vcpu;
786 else
787 goto out;
788 } else {
789 u16 cid;
790 unsigned long bitmap = 1;
791 int i, r = 0;
792
793 if (!kvm_apic_logical_map_valid(map))
794 goto out;
795
796 apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
797
798 if (cid >= ARRAY_SIZE(map->logical_map))
799 goto out;
800
801 for_each_set_bit(i, &bitmap, 16) {
802 dst = map->logical_map[cid][i];
803 if (++r == 2)
804 goto out;
805 }
806
807 if (dst && kvm_apic_present(dst->vcpu))
808 *dest_vcpu = dst->vcpu;
809 else
810 goto out;
811 }
812
813 ret = true;
814out:
815 rcu_read_unlock();
816 return ret;
817}
818
767/* 819/*
768 * Add a pending IRQ into lapic. 820 * Add a pending IRQ into lapic.
769 * Return 1 if successfully added and 0 if discarded. 821 * Return 1 if successfully added and 0 if discarded.
@@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
781 case APIC_DM_LOWEST: 833 case APIC_DM_LOWEST:
782 vcpu->arch.apic_arb_prio++; 834 vcpu->arch.apic_arb_prio++;
783 case APIC_DM_FIXED: 835 case APIC_DM_FIXED:
836 if (unlikely(trig_mode && !level))
837 break;
838
784 /* FIXME add logic for vcpu on reset */ 839 /* FIXME add logic for vcpu on reset */
785 if (unlikely(!apic_enabled(apic))) 840 if (unlikely(!apic_enabled(apic)))
786 break; 841 break;
@@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
790 if (dest_map) 845 if (dest_map)
791 __set_bit(vcpu->vcpu_id, dest_map); 846 __set_bit(vcpu->vcpu_id, dest_map);
792 847
848 if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
849 if (trig_mode)
850 apic_set_vector(vector, apic->regs + APIC_TMR);
851 else
852 apic_clear_vector(vector, apic->regs + APIC_TMR);
853 }
854
793 if (kvm_x86_ops->deliver_posted_interrupt) 855 if (kvm_x86_ops->deliver_posted_interrupt)
794 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 856 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
795 else { 857 else {
@@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
868 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 930 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
869} 931}
870 932
933static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
934{
935 return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
936}
937
871static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) 938static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
872{ 939{
873 if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 940 int trigger_mode;
874 int trigger_mode; 941
875 if (apic_test_vector(vector, apic->regs + APIC_TMR)) 942 /* Eoi the ioapic only if the ioapic doesn't own the vector. */
876 trigger_mode = IOAPIC_LEVEL_TRIG; 943 if (!kvm_ioapic_handles_vector(apic, vector))
877 else 944 return;
878 trigger_mode = IOAPIC_EDGE_TRIG; 945
879 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); 946 /* Request a KVM exit to inform the userspace IOAPIC. */
947 if (irqchip_split(apic->vcpu->kvm)) {
948 apic->vcpu->arch.pending_ioapic_eoi = vector;
949 kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
950 return;
880 } 951 }
952
953 if (apic_test_vector(vector, apic->regs + APIC_TMR))
954 trigger_mode = IOAPIC_LEVEL_TRIG;
955 else
956 trigger_mode = IOAPIC_EDGE_TRIG;
957
958 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
881} 959}
882 960
883static int apic_set_eoi(struct kvm_lapic *apic) 961static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1172,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
1172 1250
1173 tsc_deadline = apic->lapic_timer.expired_tscdeadline; 1251 tsc_deadline = apic->lapic_timer.expired_tscdeadline;
1174 apic->lapic_timer.expired_tscdeadline = 0; 1252 apic->lapic_timer.expired_tscdeadline = 0;
1175 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); 1253 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1176 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); 1254 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
1177 1255
1178 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ 1256 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@@ -1240,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
1240 local_irq_save(flags); 1318 local_irq_save(flags);
1241 1319
1242 now = apic->lapic_timer.timer.base->get_time(); 1320 now = apic->lapic_timer.timer.base->get_time();
1243 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); 1321 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1244 if (likely(tscdeadline > guest_tsc)) { 1322 if (likely(tscdeadline > guest_tsc)) {
1245 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1323 ns = (tscdeadline - guest_tsc) * 1000000ULL;
1246 do_div(ns, this_tsc_khz); 1324 do_div(ns, this_tsc_khz);
@@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
1615 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 1693 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
1616 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1694 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1617 } 1695 }
1618 apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); 1696 apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
1619 apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; 1697 apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
1620 apic->highest_isr_cache = -1; 1698 apic->highest_isr_cache = -1;
1621 update_divide_count(apic); 1699 update_divide_count(apic);
@@ -1838,7 +1916,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1838 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, 1916 kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
1839 apic_find_highest_isr(apic)); 1917 apic_find_highest_isr(apic));
1840 kvm_make_request(KVM_REQ_EVENT, vcpu); 1918 kvm_make_request(KVM_REQ_EVENT, vcpu);
1841 kvm_rtc_eoi_tracking_restore_one(vcpu); 1919 if (ioapic_in_kernel(vcpu->kvm))
1920 kvm_rtc_eoi_tracking_restore_one(vcpu);
1921
1922 vcpu->arch.apic_arb_prio = 0;
1842} 1923}
1843 1924
1844void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1925void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +2003,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
1922 /* Cache not set: could be safe but we don't bother. */ 2003 /* Cache not set: could be safe but we don't bother. */
1923 apic->highest_isr_cache == -1 || 2004 apic->highest_isr_cache == -1 ||
1924 /* Need EOI to update ioapic. */ 2005 /* Need EOI to update ioapic. */
1925 kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { 2006 kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
1926 /* 2007 /*
1927 * PV EOI was disabled by apic_sync_pv_eoi_from_guest 2008 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
1928 * so we need not do anything here. 2009 * so we need not do anything here.
@@ -1978,7 +2059,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1978 struct kvm_lapic *apic = vcpu->arch.apic; 2059 struct kvm_lapic *apic = vcpu->arch.apic;
1979 u32 reg = (msr - APIC_BASE_MSR) << 4; 2060 u32 reg = (msr - APIC_BASE_MSR) << 4;
1980 2061
1981 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) 2062 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
1982 return 1; 2063 return 1;
1983 2064
1984 if (reg == APIC_ICR2) 2065 if (reg == APIC_ICR2)
@@ -1995,7 +2076,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1995 struct kvm_lapic *apic = vcpu->arch.apic; 2076 struct kvm_lapic *apic = vcpu->arch.apic;
1996 u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; 2077 u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
1997 2078
1998 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) 2079 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
1999 return 1; 2080 return 1;
2000 2081
2001 if (reg == APIC_DFR || reg == APIC_ICR2) { 2082 if (reg == APIC_DFR || reg == APIC_ICR2) {
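In the logical-destination branch of the kvm_intr_is_single_vcpu_fast() helper added above, the fast path is taken only when exactly one vCPU is targeted: the loop gives up as soon as a second set bit shows up in the (at most 16-bit) cluster bitmap. A rough standalone sketch of that check with the APIC map abstracted away (names are illustrative):

/* ---- standalone sketch, not part of the patch ---- */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return true iff exactly one bit is set; report its index in *dest. */
static bool single_dest(uint16_t bitmap, int *dest)
{
	int i, hits = 0;

	for (i = 0; i < 16; i++) {
		if (!(bitmap & (1u << i)))
			continue;
		if (++hits == 2)
			return false;	/* more than one CPU targeted */
		*dest = i;
	}
	return hits == 1;
}

int main(void)
{
	int dest = -1;

	printf("%d\n", single_dest(0x0008, &dest));	/* 1: only bit 3 set */
	printf("%d\n", single_dest(0x0009, &dest));	/* 0: bits 0 and 3 set */
	return 0;
}
/* ---- end sketch ---- */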
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 764037991d26..fde8e35d5850 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
57u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 57u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
58void kvm_apic_set_version(struct kvm_vcpu *vcpu); 58void kvm_apic_set_version(struct kvm_vcpu *vcpu);
59 59
60void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
61void __kvm_apic_update_irr(u32 *pir, void *regs); 60void __kvm_apic_update_irr(u32 *pir, void *regs);
62void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); 61void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
63int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 62int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144 return apic->vcpu->arch.apic_base & X2APIC_ENABLE; 143 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
145} 144}
146 145
147static inline bool kvm_apic_vid_enabled(struct kvm *kvm) 146static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
148{ 147{
149 return kvm_x86_ops->vm_has_apicv(kvm); 148 return kvm_x86_ops->cpu_uses_apicv(vcpu);
150} 149}
151 150
152static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) 151static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
169 168
170void wait_lapic_expire(struct kvm_vcpu *vcpu); 169void wait_lapic_expire(struct kvm_vcpu *vcpu);
171 170
171bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
172 struct kvm_vcpu **dest_vcpu);
172#endif 173#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff606f507913..e7c2c1428a69 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
818 kvm->arch.indirect_shadow_pages--; 818 kvm->arch.indirect_shadow_pages--;
819} 819}
820 820
821static int has_wrprotected_page(struct kvm_vcpu *vcpu, 821static int __has_wrprotected_page(gfn_t gfn, int level,
822 gfn_t gfn, 822 struct kvm_memory_slot *slot)
823 int level)
824{ 823{
825 struct kvm_memory_slot *slot;
826 struct kvm_lpage_info *linfo; 824 struct kvm_lpage_info *linfo;
827 825
828 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
829 if (slot) { 826 if (slot) {
830 linfo = lpage_info_slot(gfn, slot, level); 827 linfo = lpage_info_slot(gfn, slot, level);
831 return linfo->write_count; 828 return linfo->write_count;
@@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
834 return 1; 831 return 1;
835} 832}
836 833
834static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
835{
836 struct kvm_memory_slot *slot;
837
838 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
839 return __has_wrprotected_page(gfn, level, slot);
840}
841
837static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 842static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
838{ 843{
839 unsigned long page_size; 844 unsigned long page_size;
@@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
851 return ret; 856 return ret;
852} 857}
853 858
859static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
860 bool no_dirty_log)
861{
862 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
863 return false;
864 if (no_dirty_log && slot->dirty_bitmap)
865 return false;
866
867 return true;
868}
869
854static struct kvm_memory_slot * 870static struct kvm_memory_slot *
855gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, 871gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
856 bool no_dirty_log) 872 bool no_dirty_log)
@@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
858 struct kvm_memory_slot *slot; 874 struct kvm_memory_slot *slot;
859 875
860 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 876 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
861 if (!slot || slot->flags & KVM_MEMSLOT_INVALID || 877 if (!memslot_valid_for_gpte(slot, no_dirty_log))
862 (no_dirty_log && slot->dirty_bitmap))
863 slot = NULL; 878 slot = NULL;
864 879
865 return slot; 880 return slot;
866} 881}
867 882
868static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) 883static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
869{ 884 bool *force_pt_level)
870 return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
871}
872
873static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
874{ 885{
875 int host_level, level, max_level; 886 int host_level, level, max_level;
887 struct kvm_memory_slot *slot;
888
889 if (unlikely(*force_pt_level))
890 return PT_PAGE_TABLE_LEVEL;
891
892 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
893 *force_pt_level = !memslot_valid_for_gpte(slot, true);
894 if (unlikely(*force_pt_level))
895 return PT_PAGE_TABLE_LEVEL;
876 896
877 host_level = host_mapping_level(vcpu->kvm, large_gfn); 897 host_level = host_mapping_level(vcpu->kvm, large_gfn);
878 898
@@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
882 max_level = min(kvm_x86_ops->get_lpage_level(), host_level); 902 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
883 903
884 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 904 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
885 if (has_wrprotected_page(vcpu, large_gfn, level)) 905 if (__has_wrprotected_page(large_gfn, level, slot))
886 break; 906 break;
887 907
888 return level - 1; 908 return level - 1;
@@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2962{ 2982{
2963 int r; 2983 int r;
2964 int level; 2984 int level;
2965 int force_pt_level; 2985 bool force_pt_level = false;
2966 pfn_t pfn; 2986 pfn_t pfn;
2967 unsigned long mmu_seq; 2987 unsigned long mmu_seq;
2968 bool map_writable, write = error_code & PFERR_WRITE_MASK; 2988 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2969 2989
2970 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2990 level = mapping_level(vcpu, gfn, &force_pt_level);
2971 if (likely(!force_pt_level)) { 2991 if (likely(!force_pt_level)) {
2972 level = mapping_level(vcpu, gfn);
2973 /* 2992 /*
2974 * This path builds a PAE pagetable - so we can map 2993 * This path builds a PAE pagetable - so we can map
2975 * 2mb pages at maximum. Therefore check if the level 2994 * 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2979 level = PT_DIRECTORY_LEVEL; 2998 level = PT_DIRECTORY_LEVEL;
2980 2999
2981 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3000 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2982 } else 3001 }
2983 level = PT_PAGE_TABLE_LEVEL;
2984 3002
2985 if (fast_page_fault(vcpu, v, level, error_code)) 3003 if (fast_page_fault(vcpu, v, level, error_code))
2986 return 0; 3004 return 0;
@@ -3341,7 +3359,7 @@ exit:
3341 return reserved; 3359 return reserved;
3342} 3360}
3343 3361
3344int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3362int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3345{ 3363{
3346 u64 spte; 3364 u64 spte;
3347 bool reserved; 3365 bool reserved;
@@ -3350,7 +3368,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3350 return RET_MMIO_PF_EMULATE; 3368 return RET_MMIO_PF_EMULATE;
3351 3369
3352 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); 3370 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3353 if (unlikely(reserved)) 3371 if (WARN_ON(reserved))
3354 return RET_MMIO_PF_BUG; 3372 return RET_MMIO_PF_BUG;
3355 3373
3356 if (is_mmio_spte(spte)) { 3374 if (is_mmio_spte(spte)) {
@@ -3374,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3374 */ 3392 */
3375 return RET_MMIO_PF_RETRY; 3393 return RET_MMIO_PF_RETRY;
3376} 3394}
3377EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); 3395EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
3378
3379static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
3380 u32 error_code, bool direct)
3381{
3382 int ret;
3383
3384 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
3385 WARN_ON(ret == RET_MMIO_PF_BUG);
3386 return ret;
3387}
3388 3396
3389static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 3397static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3390 u32 error_code, bool prefault) 3398 u32 error_code, bool prefault)
@@ -3395,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3395 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 3403 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3396 3404
3397 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3405 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3398 r = handle_mmio_page_fault(vcpu, gva, error_code, true); 3406 r = handle_mmio_page_fault(vcpu, gva, true);
3399 3407
3400 if (likely(r != RET_MMIO_PF_INVALID)) 3408 if (likely(r != RET_MMIO_PF_INVALID))
3401 return r; 3409 return r;
@@ -3427,7 +3435,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
3427 3435
3428static bool can_do_async_pf(struct kvm_vcpu *vcpu) 3436static bool can_do_async_pf(struct kvm_vcpu *vcpu)
3429{ 3437{
3430 if (unlikely(!irqchip_in_kernel(vcpu->kvm) || 3438 if (unlikely(!lapic_in_kernel(vcpu) ||
3431 kvm_event_needs_reinjection(vcpu))) 3439 kvm_event_needs_reinjection(vcpu)))
3432 return false; 3440 return false;
3433 3441
@@ -3476,7 +3484,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3476 pfn_t pfn; 3484 pfn_t pfn;
3477 int r; 3485 int r;
3478 int level; 3486 int level;
3479 int force_pt_level; 3487 bool force_pt_level;
3480 gfn_t gfn = gpa >> PAGE_SHIFT; 3488 gfn_t gfn = gpa >> PAGE_SHIFT;
3481 unsigned long mmu_seq; 3489 unsigned long mmu_seq;
3482 int write = error_code & PFERR_WRITE_MASK; 3490 int write = error_code & PFERR_WRITE_MASK;
@@ -3485,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3485 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3493 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3486 3494
3487 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3495 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3488 r = handle_mmio_page_fault(vcpu, gpa, error_code, true); 3496 r = handle_mmio_page_fault(vcpu, gpa, true);
3489 3497
3490 if (likely(r != RET_MMIO_PF_INVALID)) 3498 if (likely(r != RET_MMIO_PF_INVALID))
3491 return r; 3499 return r;
@@ -3495,20 +3503,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3495 if (r) 3503 if (r)
3496 return r; 3504 return r;
3497 3505
3498 if (mapping_level_dirty_bitmap(vcpu, gfn) || 3506 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
3499 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) 3507 PT_DIRECTORY_LEVEL);
3500 force_pt_level = 1; 3508 level = mapping_level(vcpu, gfn, &force_pt_level);
3501 else
3502 force_pt_level = 0;
3503
3504 if (likely(!force_pt_level)) { 3509 if (likely(!force_pt_level)) {
3505 level = mapping_level(vcpu, gfn);
3506 if (level > PT_DIRECTORY_LEVEL && 3510 if (level > PT_DIRECTORY_LEVEL &&
3507 !check_hugepage_cache_consistency(vcpu, gfn, level)) 3511 !check_hugepage_cache_consistency(vcpu, gfn, level))
3508 level = PT_DIRECTORY_LEVEL; 3512 level = PT_DIRECTORY_LEVEL;
3509 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3513 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3510 } else 3514 }
3511 level = PT_PAGE_TABLE_LEVEL;
3512 3515
3513 if (fast_page_fault(vcpu, gpa, level, error_code)) 3516 if (fast_page_fault(vcpu, gpa, level, error_code))
3514 return 0; 3517 return 0;
@@ -3706,7 +3709,7 @@ static void
3706__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 3709__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
3707 int maxphyaddr, bool execonly) 3710 int maxphyaddr, bool execonly)
3708{ 3711{
3709 int pte; 3712 u64 bad_mt_xwr;
3710 3713
3711 rsvd_check->rsvd_bits_mask[0][3] = 3714 rsvd_check->rsvd_bits_mask[0][3] =
3712 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 3715 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3727,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
3724 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); 3727 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3725 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 3728 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
3726 3729
3727 for (pte = 0; pte < 64; pte++) { 3730 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
3728 int rwx_bits = pte & 7; 3731 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
3729 int mt = pte >> 3; 3732 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
3730 if (mt == 0x2 || mt == 0x3 || mt == 0x7 || 3733 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
3731 rwx_bits == 0x2 || rwx_bits == 0x6 || 3734 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
3732 (rwx_bits == 0x4 && !execonly)) 3735 if (!execonly) {
3733 rsvd_check->bad_mt_xwr |= (1ull << pte); 3736 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
3737 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
3734 } 3738 }
3739 rsvd_check->bad_mt_xwr = bad_mt_xwr;
3735} 3740}
3736 3741
3737static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 3742static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
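The rewritten __reset_rsvds_bits_mask_ept() above replaces the old 64-iteration loop with a precomputed bitmap: bit N of bad_mt_xwr is set when an EPT PTE whose low six bits equal N (XWR in bits 0..2, memory type in bits 3..5) is a reserved combination. A standalone sketch of the same construction and of how a PTE would be tested against it; REPEAT_BYTE is reimplemented locally rather than pulled from the kernel headers:

/* ---- standalone sketch, not part of the patch ---- */
#include <stdint.h>
#include <stdio.h>

/* Copy one byte value across all eight bytes of a u64. */
#define REPEAT_BYTE(x)	((~0ull / 0xff) * (x))

int main(void)
{
	uint64_t bad_mt_xwr;

	bad_mt_xwr  = 0xFFull << (2 * 8);	/* memory type 2 is reserved */
	bad_mt_xwr |= 0xFFull << (3 * 8);	/* memory type 3 is reserved */
	bad_mt_xwr |= 0xFFull << (7 * 8);	/* memory type 7 is reserved */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* XWR == 010 (write-only) */
	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* XWR == 110 (write+exec, no read) */
	/* Without exec-only support, XWR == 100 would be reserved as well. */

	/* Checking a PTE is then a single bit test on its low six bits. */
	uint64_t pte = (6 << 3) | 0x2;		/* WB memory type, write-only */
	printf("reserved: %d\n", !!(bad_mt_xwr & (1ull << (pte & 0x3f))));
	return 0;
}
/* ---- end sketch ---- */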
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e4202e41d535..55ffb7b0f95e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,13 +56,13 @@ void
56reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 56reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
57 57
58/* 58/*
59 * Return values of handle_mmio_page_fault_common: 59 * Return values of handle_mmio_page_fault:
60 * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction 60 * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
61 * directly. 61 * directly.
62 * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page 62 * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
63 * fault path update the mmio spte. 63 * fault path update the mmio spte.
64 * RET_MMIO_PF_RETRY: let CPU fault again on the address. 64 * RET_MMIO_PF_RETRY: let CPU fault again on the address.
65 * RET_MMIO_PF_BUG: bug is detected. 65 * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
66 */ 66 */
67enum { 67enum {
68 RET_MMIO_PF_EMULATE = 1, 68 RET_MMIO_PF_EMULATE = 1,
@@ -71,7 +71,7 @@ enum {
71 RET_MMIO_PF_BUG = -1 71 RET_MMIO_PF_BUG = -1
72}; 72};
73 73
74int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 74int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
75void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); 75void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
76void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); 76void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
77 77
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 736e6ab8784d..3058a22a658d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -698,15 +698,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
698 int r; 698 int r;
699 pfn_t pfn; 699 pfn_t pfn;
700 int level = PT_PAGE_TABLE_LEVEL; 700 int level = PT_PAGE_TABLE_LEVEL;
701 int force_pt_level; 701 bool force_pt_level = false;
702 unsigned long mmu_seq; 702 unsigned long mmu_seq;
703 bool map_writable, is_self_change_mapping; 703 bool map_writable, is_self_change_mapping;
704 704
705 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 705 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
706 706
707 if (unlikely(error_code & PFERR_RSVD_MASK)) { 707 if (unlikely(error_code & PFERR_RSVD_MASK)) {
708 r = handle_mmio_page_fault(vcpu, addr, error_code, 708 r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
709 mmu_is_nested(vcpu));
710 if (likely(r != RET_MMIO_PF_INVALID)) 709 if (likely(r != RET_MMIO_PF_INVALID))
711 return r; 710 return r;
712 711
@@ -743,15 +742,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
743 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, 742 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
744 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); 743 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
745 744
746 if (walker.level >= PT_DIRECTORY_LEVEL) 745 if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
747 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) 746 level = mapping_level(vcpu, walker.gfn, &force_pt_level);
748 || is_self_change_mapping; 747 if (likely(!force_pt_level)) {
749 else 748 level = min(walker.level, level);
750 force_pt_level = 1; 749 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
751 if (!force_pt_level) { 750 }
752 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 751 } else
753 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 752 force_pt_level = true;
754 }
755 753
756 mmu_seq = vcpu->kvm->mmu_notifier_seq; 754 mmu_seq = vcpu->kvm->mmu_notifier_seq;
757 smp_rmb(); 755 smp_rmb();
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f9ed1ff0632..83a1c643f9a5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -158,7 +158,8 @@ struct vcpu_svm {
158 unsigned long int3_rip; 158 unsigned long int3_rip;
159 u32 apf_reason; 159 u32 apf_reason;
160 160
161 u64 tsc_ratio; 161 /* cached guest cpuid flags for faster access */
162 bool nrips_enabled : 1;
162}; 163};
163 164
164static DEFINE_PER_CPU(u64, current_tsc_ratio); 165static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -211,7 +212,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
211static int nested_svm_vmexit(struct vcpu_svm *svm); 212static int nested_svm_vmexit(struct vcpu_svm *svm);
212static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 213static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
213 bool has_error_code, u32 error_code); 214 bool has_error_code, u32 error_code);
214static u64 __scale_tsc(u64 ratio, u64 tsc);
215 215
216enum { 216enum {
217 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, 217 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -891,20 +891,9 @@ static __init int svm_hardware_setup(void)
891 kvm_enable_efer_bits(EFER_FFXSR); 891 kvm_enable_efer_bits(EFER_FFXSR);
892 892
893 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 893 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
894 u64 max;
895
896 kvm_has_tsc_control = true; 894 kvm_has_tsc_control = true;
897 895 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
898 /* 896 kvm_tsc_scaling_ratio_frac_bits = 32;
899 * Make sure the user can only configure tsc_khz values that
900 * fit into a signed integer.
901 * A min value is not calculated needed because it will always
902 * be 1 on all machines and a value of 0 is used to disable
903 * tsc-scaling for the vcpu.
904 */
905 max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
906
907 kvm_max_guest_tsc_khz = max;
908 } 897 }
909 898
910 if (nested) { 899 if (nested) {
@@ -968,68 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
968 seg->base = 0; 957 seg->base = 0;
969} 958}
970 959
971static u64 __scale_tsc(u64 ratio, u64 tsc)
972{
973 u64 mult, frac, _tsc;
974
975 mult = ratio >> 32;
976 frac = ratio & ((1ULL << 32) - 1);
977
978 _tsc = tsc;
979 _tsc *= mult;
980 _tsc += (tsc >> 32) * frac;
981 _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
982
983 return _tsc;
984}
985
986static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
987{
988 struct vcpu_svm *svm = to_svm(vcpu);
989 u64 _tsc = tsc;
990
991 if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
992 _tsc = __scale_tsc(svm->tsc_ratio, tsc);
993
994 return _tsc;
995}
996
997static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
998{
999 struct vcpu_svm *svm = to_svm(vcpu);
1000 u64 ratio;
1001 u64 khz;
1002
1003 /* Guest TSC same frequency as host TSC? */
1004 if (!scale) {
1005 svm->tsc_ratio = TSC_RATIO_DEFAULT;
1006 return;
1007 }
1008
1009 /* TSC scaling supported? */
1010 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1011 if (user_tsc_khz > tsc_khz) {
1012 vcpu->arch.tsc_catchup = 1;
1013 vcpu->arch.tsc_always_catchup = 1;
1014 } else
1015 WARN(1, "user requested TSC rate below hardware speed\n");
1016 return;
1017 }
1018
1019 khz = user_tsc_khz;
1020
1021 /* TSC scaling required - calculate ratio */
1022 ratio = khz << 32;
1023 do_div(ratio, tsc_khz);
1024
1025 if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
1026 WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
1027 user_tsc_khz);
1028 return;
1029 }
1030 svm->tsc_ratio = ratio;
1031}
1032
1033static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) 960static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1034{ 961{
1035 struct vcpu_svm *svm = to_svm(vcpu); 962 struct vcpu_svm *svm = to_svm(vcpu);
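The scaling that the deleted __scale_tsc() performed is plain 32.32 fixed-point arithmetic, which common code now supplies through kvm_scale_tsc() with kvm_tsc_scaling_ratio_frac_bits set to 32 in svm_hardware_setup() above. A minimal userspace sketch of the same math (the frequencies are made-up example values):

/* ---- standalone sketch, not part of the patch ---- */
#include <stdint.h>
#include <stdio.h>

/*
 * Multiply a TSC value by a 32.32 fixed-point ratio.  Splitting the
 * multiplication into three partial products keeps every intermediate
 * result within 64 bits for realistic TSC values.
 */
static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
{
	uint64_t mult = ratio >> 32;		/* integer part */
	uint64_t frac = ratio & 0xffffffffull;	/* fractional part */

	return tsc * mult
	     + (tsc >> 32) * frac
	     + (((tsc & 0xffffffffull) * frac) >> 32);
}

int main(void)
{
	/* Guest runs at 1.5 GHz on a 3.0 GHz host: ratio is 0.5 in 32.32. */
	uint64_t host_khz = 3000000, guest_khz = 1500000;
	uint64_t ratio = (guest_khz << 32) / host_khz;

	/* 6e9 host cycles (2 s at 3 GHz) scale to 3e9 guest cycles. */
	printf("scaled: %llu\n",
	       (unsigned long long)scale_tsc(ratio, 6000000000ull));
	return 0;
}
/* ---- end sketch ---- */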
@@ -1056,16 +983,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1056 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 983 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1057} 984}
1058 985
1059static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 986static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
1060{ 987{
1061 struct vcpu_svm *svm = to_svm(vcpu); 988 struct vcpu_svm *svm = to_svm(vcpu);
1062 989
1063 if (host) {
1064 if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
1065 WARN_ON(adjustment < 0);
1066 adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
1067 }
1068
1069 svm->vmcb->control.tsc_offset += adjustment; 990 svm->vmcb->control.tsc_offset += adjustment;
1070 if (is_guest_mode(vcpu)) 991 if (is_guest_mode(vcpu))
1071 svm->nested.hsave->control.tsc_offset += adjustment; 992 svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1077,16 +998,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
1077 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 998 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1078} 999}
1079 1000
1080static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 1001static void init_vmcb(struct vcpu_svm *svm)
1081{
1082 u64 tsc;
1083
1084 tsc = svm_scale_tsc(vcpu, rdtsc());
1085
1086 return target_tsc - tsc;
1087}
1088
1089static void init_vmcb(struct vcpu_svm *svm, bool init_event)
1090{ 1002{
1091 struct vmcb_control_area *control = &svm->vmcb->control; 1003 struct vmcb_control_area *control = &svm->vmcb->control;
1092 struct vmcb_save_area *save = &svm->vmcb->save; 1004 struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1107,6 +1019,8 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
1107 set_exception_intercept(svm, PF_VECTOR); 1019 set_exception_intercept(svm, PF_VECTOR);
1108 set_exception_intercept(svm, UD_VECTOR); 1020 set_exception_intercept(svm, UD_VECTOR);
1109 set_exception_intercept(svm, MC_VECTOR); 1021 set_exception_intercept(svm, MC_VECTOR);
1022 set_exception_intercept(svm, AC_VECTOR);
1023 set_exception_intercept(svm, DB_VECTOR);
1110 1024
1111 set_intercept(svm, INTERCEPT_INTR); 1025 set_intercept(svm, INTERCEPT_INTR);
1112 set_intercept(svm, INTERCEPT_NMI); 1026 set_intercept(svm, INTERCEPT_NMI);
@@ -1157,8 +1071,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
1157 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1071 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1158 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1072 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1159 1073
1160 if (!init_event) 1074 svm_set_efer(&svm->vcpu, 0);
1161 svm_set_efer(&svm->vcpu, 0);
1162 save->dr6 = 0xffff0ff0; 1075 save->dr6 = 0xffff0ff0;
1163 kvm_set_rflags(&svm->vcpu, 2); 1076 kvm_set_rflags(&svm->vcpu, 2);
1164 save->rip = 0x0000fff0; 1077 save->rip = 0x0000fff0;
@@ -1212,7 +1125,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1212 if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) 1125 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1213 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 1126 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1214 } 1127 }
1215 init_vmcb(svm, init_event); 1128 init_vmcb(svm);
1216 1129
1217 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1130 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1218 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1131 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1233,8 +1146,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1233 goto out; 1146 goto out;
1234 } 1147 }
1235 1148
1236 svm->tsc_ratio = TSC_RATIO_DEFAULT;
1237
1238 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 1149 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1239 if (err) 1150 if (err)
1240 goto free_svm; 1151 goto free_svm;
@@ -1268,7 +1179,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1268 clear_page(svm->vmcb); 1179 clear_page(svm->vmcb);
1269 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1180 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1270 svm->asid_generation = 0; 1181 svm->asid_generation = 0;
1271 init_vmcb(svm, false); 1182 init_vmcb(svm);
1272 1183
1273 svm_init_osvw(&svm->vcpu); 1184 svm_init_osvw(&svm->vcpu);
1274 1185
@@ -1320,10 +1231,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1320 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1231 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1321 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1232 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1322 1233
1323 if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && 1234 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1324 svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) { 1235 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1325 __this_cpu_write(current_tsc_ratio, svm->tsc_ratio); 1236 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1326 wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); 1237 __this_cpu_write(current_tsc_ratio, tsc_ratio);
1238 wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1239 }
1327 } 1240 }
1328} 1241}
1329 1242
@@ -1642,20 +1555,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1642 mark_dirty(svm->vmcb, VMCB_SEG); 1555 mark_dirty(svm->vmcb, VMCB_SEG);
1643} 1556}
1644 1557
1645static void update_db_bp_intercept(struct kvm_vcpu *vcpu) 1558static void update_bp_intercept(struct kvm_vcpu *vcpu)
1646{ 1559{
1647 struct vcpu_svm *svm = to_svm(vcpu); 1560 struct vcpu_svm *svm = to_svm(vcpu);
1648 1561
1649 clr_exception_intercept(svm, DB_VECTOR);
1650 clr_exception_intercept(svm, BP_VECTOR); 1562 clr_exception_intercept(svm, BP_VECTOR);
1651 1563
1652 if (svm->nmi_singlestep)
1653 set_exception_intercept(svm, DB_VECTOR);
1654
1655 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1564 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1656 if (vcpu->guest_debug &
1657 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1658 set_exception_intercept(svm, DB_VECTOR);
1659 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1565 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1660 set_exception_intercept(svm, BP_VECTOR); 1566 set_exception_intercept(svm, BP_VECTOR);
1661 } else 1567 } else
@@ -1761,7 +1667,6 @@ static int db_interception(struct vcpu_svm *svm)
1761 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1667 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1762 svm->vmcb->save.rflags &= 1668 svm->vmcb->save.rflags &=
1763 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1669 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1764 update_db_bp_intercept(&svm->vcpu);
1765 } 1670 }
1766 1671
1767 if (svm->vcpu.guest_debug & 1672 if (svm->vcpu.guest_debug &
@@ -1796,6 +1701,12 @@ static int ud_interception(struct vcpu_svm *svm)
1796 return 1; 1701 return 1;
1797} 1702}
1798 1703
1704static int ac_interception(struct vcpu_svm *svm)
1705{
1706 kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
1707 return 1;
1708}
1709
1799static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1710static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1800{ 1711{
1801 struct vcpu_svm *svm = to_svm(vcpu); 1712 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1890,7 +1801,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
1890 * so reinitialize it. 1801 * so reinitialize it.
1891 */ 1802 */
1892 clear_page(svm->vmcb); 1803 clear_page(svm->vmcb);
1893 init_vmcb(svm, false); 1804 init_vmcb(svm);
1894 1805
1895 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 1806 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1896 return 0; 1807 return 0;
@@ -2365,7 +2276,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2365 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2276 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
2366 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2277 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
2367 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2278 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2368 nested_vmcb->control.next_rip = vmcb->control.next_rip; 2279
2280 if (svm->nrips_enabled)
2281 nested_vmcb->control.next_rip = vmcb->control.next_rip;
2369 2282
2370 /* 2283 /*
2371 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2284 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3060,7 +2973,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3060 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2973 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3061 /* instruction emulation calls kvm_set_cr8() */ 2974 /* instruction emulation calls kvm_set_cr8() */
3062 r = cr_interception(svm); 2975 r = cr_interception(svm);
3063 if (irqchip_in_kernel(svm->vcpu.kvm)) 2976 if (lapic_in_kernel(&svm->vcpu))
3064 return r; 2977 return r;
3065 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2978 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3066 return r; 2979 return r;
@@ -3071,8 +2984,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3071static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2984static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3072{ 2985{
3073 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 2986 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3074 return vmcb->control.tsc_offset + 2987 return vmcb->control.tsc_offset + host_tsc;
3075 svm_scale_tsc(vcpu, host_tsc);
3076} 2988}
3077 2989
3078static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2990static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3082,7 +2994,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3082 switch (msr_info->index) { 2994 switch (msr_info->index) {
3083 case MSR_IA32_TSC: { 2995 case MSR_IA32_TSC: {
3084 msr_info->data = svm->vmcb->control.tsc_offset + 2996 msr_info->data = svm->vmcb->control.tsc_offset +
3085 svm_scale_tsc(vcpu, rdtsc()); 2997 kvm_scale_tsc(vcpu, rdtsc());
3086 2998
3087 break; 2999 break;
3088 } 3000 }
@@ -3294,24 +3206,11 @@ static int msr_interception(struct vcpu_svm *svm)
3294 3206
3295static int interrupt_window_interception(struct vcpu_svm *svm) 3207static int interrupt_window_interception(struct vcpu_svm *svm)
3296{ 3208{
3297 struct kvm_run *kvm_run = svm->vcpu.run;
3298
3299 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3209 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3300 svm_clear_vintr(svm); 3210 svm_clear_vintr(svm);
3301 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3211 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3302 mark_dirty(svm->vmcb, VMCB_INTR); 3212 mark_dirty(svm->vmcb, VMCB_INTR);
3303 ++svm->vcpu.stat.irq_window_exits; 3213 ++svm->vcpu.stat.irq_window_exits;
3304 /*
3305 * If the user space waits to inject interrupts, exit as soon as
3306 * possible
3307 */
3308 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3309 kvm_run->request_interrupt_window &&
3310 !kvm_cpu_has_interrupt(&svm->vcpu)) {
3311 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3312 return 0;
3313 }
3314
3315 return 1; 3214 return 1;
3316} 3215}
3317 3216
@@ -3371,6 +3270,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3371 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3270 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
3372 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 3271 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
3373 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3272 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
3273 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
3374 [SVM_EXIT_INTR] = intr_interception, 3274 [SVM_EXIT_INTR] = intr_interception,
3375 [SVM_EXIT_NMI] = nmi_interception, 3275 [SVM_EXIT_NMI] = nmi_interception,
3376 [SVM_EXIT_SMI] = nop_on_interception, 3276 [SVM_EXIT_SMI] = nop_on_interception,
@@ -3659,12 +3559,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3659 return; 3559 return;
3660} 3560}
3661 3561
3662static int svm_vm_has_apicv(struct kvm *kvm) 3562static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
3663{ 3563{
3664 return 0; 3564 return 0;
3665} 3565}
3666 3566
3667static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 3567static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
3668{ 3568{
3669 return; 3569 return;
3670} 3570}
@@ -3754,7 +3654,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
3754 */ 3654 */
3755 svm->nmi_singlestep = true; 3655 svm->nmi_singlestep = true;
3756 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3656 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3757 update_db_bp_intercept(vcpu);
3758} 3657}
3759 3658
3760static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 3659static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4098,6 +3997,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4098 3997
4099static void svm_cpuid_update(struct kvm_vcpu *vcpu) 3998static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4100{ 3999{
4000 struct vcpu_svm *svm = to_svm(vcpu);
4001
4002 /* Update nrips enabled cache */
4003 svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
4101} 4004}
4102 4005
4103static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 4006static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -4376,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4376 .vcpu_load = svm_vcpu_load, 4279 .vcpu_load = svm_vcpu_load,
4377 .vcpu_put = svm_vcpu_put, 4280 .vcpu_put = svm_vcpu_put,
4378 4281
4379 .update_db_bp_intercept = update_db_bp_intercept, 4282 .update_bp_intercept = update_bp_intercept,
4380 .get_msr = svm_get_msr, 4283 .get_msr = svm_get_msr,
4381 .set_msr = svm_set_msr, 4284 .set_msr = svm_set_msr,
4382 .get_segment_base = svm_get_segment_base, 4285 .get_segment_base = svm_get_segment_base,
@@ -4425,7 +4328,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4425 .enable_irq_window = enable_irq_window, 4328 .enable_irq_window = enable_irq_window,
4426 .update_cr8_intercept = update_cr8_intercept, 4329 .update_cr8_intercept = update_cr8_intercept,
4427 .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, 4330 .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4428 .vm_has_apicv = svm_vm_has_apicv, 4331 .cpu_uses_apicv = svm_cpu_uses_apicv,
4429 .load_eoi_exitmap = svm_load_eoi_exitmap, 4332 .load_eoi_exitmap = svm_load_eoi_exitmap,
4430 .sync_pir_to_irr = svm_sync_pir_to_irr, 4333 .sync_pir_to_irr = svm_sync_pir_to_irr,
4431 4334
@@ -4448,11 +4351,9 @@ static struct kvm_x86_ops svm_x86_ops = {
4448 4351
4449 .has_wbinvd_exit = svm_has_wbinvd_exit, 4352 .has_wbinvd_exit = svm_has_wbinvd_exit,
4450 4353
4451 .set_tsc_khz = svm_set_tsc_khz,
4452 .read_tsc_offset = svm_read_tsc_offset, 4354 .read_tsc_offset = svm_read_tsc_offset,
4453 .write_tsc_offset = svm_write_tsc_offset, 4355 .write_tsc_offset = svm_write_tsc_offset,
4454 .adjust_tsc_offset = svm_adjust_tsc_offset, 4356 .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
4455 .compute_tsc_offset = svm_compute_tsc_offset,
4456 .read_l1_tsc = svm_read_l1_tsc, 4357 .read_l1_tsc = svm_read_l1_tsc,
4457 4358
4458 .set_tdp_cr3 = set_tdp_cr3, 4359 .set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4eae7c35ddf5..120302511802 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio,
129); 129);
130 130
131/* 131/*
132 * Tracepoint for fast mmio.
133 */
134TRACE_EVENT(kvm_fast_mmio,
135 TP_PROTO(u64 gpa),
136 TP_ARGS(gpa),
137
138 TP_STRUCT__entry(
139 __field(u64, gpa)
140 ),
141
142 TP_fast_assign(
143 __entry->gpa = gpa;
144 ),
145
146 TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
147);
148
149/*
132 * Tracepoint for cpuid. 150 * Tracepoint for cpuid.
133 */ 151 */
134TRACE_EVENT(kvm_cpuid, 152TRACE_EVENT(kvm_cpuid,
@@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm,
974 __entry->smbase) 992 __entry->smbase)
975); 993);
976 994
995/*
996 * Tracepoint for VT-d posted-interrupts.
997 */
998TRACE_EVENT(kvm_pi_irte_update,
999 TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
1000 unsigned int gvec, u64 pi_desc_addr, bool set),
1001 TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
1002
1003 TP_STRUCT__entry(
1004 __field( unsigned int, vcpu_id )
1005 __field( unsigned int, gsi )
1006 __field( unsigned int, gvec )
1007 __field( u64, pi_desc_addr )
1008 __field( bool, set )
1009 ),
1010
1011 TP_fast_assign(
1012 __entry->vcpu_id = vcpu_id;
1013 __entry->gsi = gsi;
1014 __entry->gvec = gvec;
1015 __entry->pi_desc_addr = pi_desc_addr;
1016 __entry->set = set;
1017 ),
1018
1019 TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
1020 "gvec: 0x%x, pi_desc_addr: 0x%llx",
1021 __entry->set ? "enabled and being updated" : "disabled",
1022 __entry->vcpu_id,
1023 __entry->gsi,
1024 __entry->gvec,
1025 __entry->pi_desc_addr)
1026);
1027
977#endif /* _TRACE_KVM_H */ 1028#endif /* _TRACE_KVM_H */
978 1029
979#undef TRACE_INCLUDE_PATH 1030#undef TRACE_INCLUDE_PATH
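The kvm_pi_irte_update format string above is easier to read when expanded; a trivial userspace reproduction of the line TP_printk() would emit (the values are made up):

/* ---- standalone sketch, not part of the patch ---- */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int vcpu_id = 2, gsi = 0x18, gvec = 0x31;
	uint64_t pi_desc_addr = 0x12345000ull;
	bool set = true;

	printf("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
	       "gvec: 0x%x, pi_desc_addr: 0x%llx\n",
	       set ? "enabled and being updated" : "disabled",
	       vcpu_id, gsi, gvec, (unsigned long long)pi_desc_addr);
	return 0;
}
/* ---- end sketch ---- */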
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64566ab..af823a388c19 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -35,6 +35,7 @@
35#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
36#include "x86.h" 36#include "x86.h"
37 37
38#include <asm/cpu.h>
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/desc.h> 40#include <asm/desc.h>
40#include <asm/vmx.h> 41#include <asm/vmx.h>
@@ -45,6 +46,7 @@
45#include <asm/debugreg.h> 46#include <asm/debugreg.h>
46#include <asm/kexec.h> 47#include <asm/kexec.h>
47#include <asm/apic.h> 48#include <asm/apic.h>
49#include <asm/irq_remapping.h>
48 50
49#include "trace.h" 51#include "trace.h"
50#include "pmu.h" 52#include "pmu.h"
@@ -105,6 +107,8 @@ static u64 __read_mostly host_xss;
105static bool __read_mostly enable_pml = 1; 107static bool __read_mostly enable_pml = 1;
106module_param_named(pml, enable_pml, bool, S_IRUGO); 108module_param_named(pml, enable_pml, bool, S_IRUGO);
107 109
110#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
111
108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) 112#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) 113#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
110#define KVM_VM_CR0_ALWAYS_ON \ 114#define KVM_VM_CR0_ALWAYS_ON \
@@ -424,6 +428,9 @@ struct nested_vmx {
424 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 428 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
425 u64 vmcs01_debugctl; 429 u64 vmcs01_debugctl;
426 430
431 u16 vpid02;
432 u16 last_vpid;
433
427 u32 nested_vmx_procbased_ctls_low; 434 u32 nested_vmx_procbased_ctls_low;
428 u32 nested_vmx_procbased_ctls_high; 435 u32 nested_vmx_procbased_ctls_high;
429 u32 nested_vmx_true_procbased_ctls_low; 436 u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +447,33 @@ struct nested_vmx {
440 u32 nested_vmx_misc_low; 447 u32 nested_vmx_misc_low;
441 u32 nested_vmx_misc_high; 448 u32 nested_vmx_misc_high;
442 u32 nested_vmx_ept_caps; 449 u32 nested_vmx_ept_caps;
450 u32 nested_vmx_vpid_caps;
443}; 451};
444 452
445#define POSTED_INTR_ON 0 453#define POSTED_INTR_ON 0
454#define POSTED_INTR_SN 1
455
446/* Posted-Interrupt Descriptor */ 456/* Posted-Interrupt Descriptor */
447struct pi_desc { 457struct pi_desc {
448 u32 pir[8]; /* Posted interrupt requested */ 458 u32 pir[8]; /* Posted interrupt requested */
449 u32 control; /* bit 0 of control is outstanding notification bit */ 459 union {
450 u32 rsvd[7]; 460 struct {
461 /* bit 256 - Outstanding Notification */
462 u16 on : 1,
463 /* bit 257 - Suppress Notification */
464 sn : 1,
465 /* bit 271:258 - Reserved */
466 rsvd_1 : 14;
467 /* bit 279:272 - Notification Vector */
468 u8 nv;
469 /* bit 287:280 - Reserved */
470 u8 rsvd_2;
471 /* bit 319:288 - Notification Destination */
472 u32 ndst;
473 };
474 u64 control;
475 };
476 u32 rsvd[6];
451} __aligned(64); 477} __aligned(64);
452 478
453static bool pi_test_and_set_on(struct pi_desc *pi_desc) 479static bool pi_test_and_set_on(struct pi_desc *pi_desc)
@@ -467,6 +493,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
467 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 493 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
468} 494}
469 495
496static inline void pi_clear_sn(struct pi_desc *pi_desc)
497{
498 return clear_bit(POSTED_INTR_SN,
499 (unsigned long *)&pi_desc->control);
500}
501
502static inline void pi_set_sn(struct pi_desc *pi_desc)
503{
504 return set_bit(POSTED_INTR_SN,
505 (unsigned long *)&pi_desc->control);
506}
507
508static inline int pi_test_on(struct pi_desc *pi_desc)
509{
510 return test_bit(POSTED_INTR_ON,
511 (unsigned long *)&pi_desc->control);
512}
513
514static inline int pi_test_sn(struct pi_desc *pi_desc)
515{
516 return test_bit(POSTED_INTR_SN,
517 (unsigned long *)&pi_desc->control);
518}
519
470struct vcpu_vmx { 520struct vcpu_vmx {
471 struct kvm_vcpu vcpu; 521 struct kvm_vcpu vcpu;
472 unsigned long host_rsp; 522 unsigned long host_rsp;
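The helpers added above treat the descriptor's 64-bit control word as a bit array, with the outstanding-notification bit at position 0 (POSTED_INTR_ON) and the new suppress-notification bit at position 1 (POSTED_INTR_SN). A standalone sketch of the same operations, with GCC/Clang atomic builtins standing in for the kernel's bitops and a stand-in struct instead of the real pi_desc:

/* ---- standalone sketch, not part of the patch ---- */
#include <stdint.h>
#include <stdio.h>

#define POSTED_INTR_ON	0	/* bit 0: outstanding notification */
#define POSTED_INTR_SN	1	/* bit 1: suppress notification */

struct pi_control { uint64_t control; };	/* stand-in for pi_desc */

static int pi_test_and_set_on(struct pi_control *pi)
{
	uint64_t old = __atomic_fetch_or(&pi->control, 1ull << POSTED_INTR_ON,
					 __ATOMIC_SEQ_CST);
	return !!(old & (1ull << POSTED_INTR_ON));
}

static void pi_set_sn(struct pi_control *pi)
{
	__atomic_fetch_or(&pi->control, 1ull << POSTED_INTR_SN,
			  __ATOMIC_SEQ_CST);
}

static void pi_clear_sn(struct pi_control *pi)
{
	__atomic_fetch_and(&pi->control, ~(1ull << POSTED_INTR_SN),
			   __ATOMIC_SEQ_CST);
}

int main(void)
{
	struct pi_control pi = { 0 };

	pi_set_sn(&pi);
	printf("ON was already set: %d, control=%#llx\n",
	       pi_test_and_set_on(&pi), (unsigned long long)pi.control);
	pi_clear_sn(&pi);
	printf("control=%#llx\n", (unsigned long long)pi.control);
	return 0;
}
/* ---- end sketch ---- */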
@@ -532,8 +582,6 @@ struct vcpu_vmx {
532 s64 vnmi_blocked_time; 582 s64 vnmi_blocked_time;
533 u32 exit_reason; 583 u32 exit_reason;
534 584
535 bool rdtscp_enabled;
536
537 /* Posted interrupt descriptor */ 585 /* Posted interrupt descriptor */
538 struct pi_desc pi_desc; 586 struct pi_desc pi_desc;
539 587
@@ -563,6 +611,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
563 return container_of(vcpu, struct vcpu_vmx, vcpu); 611 return container_of(vcpu, struct vcpu_vmx, vcpu);
564} 612}
565 613
614static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
615{
616 return &(to_vmx(vcpu)->pi_desc);
617}
618
566#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) 619#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
567#define FIELD(number, name) [number] = VMCS12_OFFSET(name) 620#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
568#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 621#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -809,7 +862,7 @@ static void kvm_cpu_vmxon(u64 addr);
809static void kvm_cpu_vmxoff(void); 862static void kvm_cpu_vmxoff(void);
810static bool vmx_mpx_supported(void); 863static bool vmx_mpx_supported(void);
811static bool vmx_xsaves_supported(void); 864static bool vmx_xsaves_supported(void);
812static int vmx_vm_has_apicv(struct kvm *kvm); 865static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 866static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
814static void vmx_set_segment(struct kvm_vcpu *vcpu, 867static void vmx_set_segment(struct kvm_vcpu *vcpu,
815 struct kvm_segment *var, int seg); 868 struct kvm_segment *var, int seg);
@@ -831,6 +884,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
831static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 884static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
832static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 885static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
833 886
887/*
888 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
889 * can find which vCPU should be woken up.
890 */
891static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
892static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
893
834static unsigned long *vmx_io_bitmap_a; 894static unsigned long *vmx_io_bitmap_a;
835static unsigned long *vmx_io_bitmap_b; 895static unsigned long *vmx_io_bitmap_b;
836static unsigned long *vmx_msr_bitmap_legacy; 896static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1006,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
946 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 1006 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
947} 1007}
948 1008
949static inline bool vm_need_tpr_shadow(struct kvm *kvm) 1009static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
950{ 1010{
951 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 1011 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
952} 1012}
953 1013
954static inline bool cpu_has_secondary_exec_ctrls(void) 1014static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1043,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
983 1043
984static inline bool cpu_has_vmx_posted_intr(void) 1044static inline bool cpu_has_vmx_posted_intr(void)
985{ 1045{
986 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 1046 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1047 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
987} 1048}
988 1049
989static inline bool cpu_has_vmx_apicv(void) 1050static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1123,9 @@ static inline bool cpu_has_vmx_ple(void)
1062 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1123 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1063} 1124}
1064 1125
1065static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) 1126static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1066{ 1127{
1067 return flexpriority_enabled && irqchip_in_kernel(kvm); 1128 return flexpriority_enabled && lapic_in_kernel(vcpu);
1068} 1129}
1069 1130
1070static inline bool cpu_has_vmx_vpid(void) 1131static inline bool cpu_has_vmx_vpid(void)
@@ -1113,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void)
1113 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; 1174 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1114} 1175}
1115 1176
1177static inline bool cpu_has_vmx_tsc_scaling(void)
1178{
1179 return vmcs_config.cpu_based_2nd_exec_ctrl &
1180 SECONDARY_EXEC_TSC_SCALING;
1181}
1182
1116static inline bool report_flexpriority(void) 1183static inline bool report_flexpriority(void)
1117{ 1184{
1118 return flexpriority_enabled; 1185 return flexpriority_enabled;
@@ -1157,6 +1224,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1157 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 1224 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158} 1225}
1159 1226
1227static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1228{
1229 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1230}
1231
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) 1232static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{ 1233{
1162 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); 1234 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1409,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1337 __loaded_vmcs_clear, loaded_vmcs, 1); 1409 __loaded_vmcs_clear, loaded_vmcs, 1);
1338} 1410}
1339 1411
1340static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1412static inline void vpid_sync_vcpu_single(int vpid)
1341{ 1413{
1342 if (vmx->vpid == 0) 1414 if (vpid == 0)
1343 return; 1415 return;
1344 1416
1345 if (cpu_has_vmx_invvpid_single()) 1417 if (cpu_has_vmx_invvpid_single())
1346 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 1418 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
1347} 1419}
1348 1420
1349static inline void vpid_sync_vcpu_global(void) 1421static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1424,10 @@ static inline void vpid_sync_vcpu_global(void)
1352 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); 1424 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1353} 1425}
1354 1426
1355static inline void vpid_sync_context(struct vcpu_vmx *vmx) 1427static inline void vpid_sync_context(int vpid)
1356{ 1428{
1357 if (cpu_has_vmx_invvpid_single()) 1429 if (cpu_has_vmx_invvpid_single())
1358 vpid_sync_vcpu_single(vmx); 1430 vpid_sync_vcpu_single(vpid);
1359 else 1431 else
1360 vpid_sync_vcpu_global(); 1432 vpid_sync_vcpu_global();
1361} 1433}
@@ -1567,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1567 u32 eb; 1639 u32 eb;
1568 1640
1569 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 1641 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1570 (1u << NM_VECTOR) | (1u << DB_VECTOR); 1642 (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
1571 if ((vcpu->guest_debug & 1643 if ((vcpu->guest_debug &
1572 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 1644 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1573 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 1645 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1895,6 +1967,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
1895 preempt_enable(); 1967 preempt_enable();
1896} 1968}
1897 1969
1970static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1971{
1972 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1973 struct pi_desc old, new;
1974 unsigned int dest;
1975
1976 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
1977 !irq_remapping_cap(IRQ_POSTING_CAP))
1978 return;
1979
1980 do {
1981 old.control = new.control = pi_desc->control;
1982
1983 /*
 1984	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
 1985	 * are two possible cases:
 1986	 * 1. After running 'pre_block', a context switch
 1987	 *    happened.  In this case, 'sn' was set in
 1988	 *    vmx_vcpu_put(), so we need to clear it here.
 1989	 * 2. After running 'pre_block', we were blocked and
 1990	 *    then woken up by something else.  In this case,
 1991	 *    we don't need to do anything: 'pi_post_block'
 1992	 *    handles it for us.  We cannot tell here whether
 1993	 *    it is case #1 or case #2 (and may not need to),
 1994	 *    so we also clear 'sn' here, which is harmless
 1995	 *    for case #2.
1996 */
1997 if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
1998 if (vcpu->cpu != cpu) {
1999 dest = cpu_physical_id(cpu);
2000
2001 if (x2apic_enabled())
2002 new.ndst = dest;
2003 else
2004 new.ndst = (dest << 8) & 0xFF00;
2005 }
2006
2007 /* set 'NV' to 'notification vector' */
2008 new.nv = POSTED_INTR_VECTOR;
2009 }
2010
2011 /* Allow posting non-urgent interrupts */
2012 new.sn = 0;
2013 } while (cmpxchg(&pi_desc->control, old.control,
2014 new.control) != old.control);
2015}
1898/* 2016/*
1899 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 2017 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1900 * vcpu mutex is already taken. 2018 * vcpu mutex is already taken.
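
vmx_vcpu_pi_load() above updates the posted-interrupt descriptor with a compare-and-swap retry loop, so the rewrite of the NDST/NV/SN fields stays atomic with respect to the IOMMU posting interrupts into the same descriptor. A compact userspace sketch of that read-modify-retry idiom on a 64-bit control word, using GCC's __atomic builtins; the field layout below is simplified and is not the real struct pi_desc:

/*
 * Sketch of the cmpxchg retry loop used on the posted-interrupt
 * descriptor: recompute the new control word from a snapshot and
 * retry until nobody raced with us.  Simplified layout only.
 */
#include <stdint.h>
#include <stdio.h>

struct pi_ctl {
	uint64_t control;   /* packs ndst/sn/nv in the real descriptor */
};

#define CTL_SN      (1ull << 0)                      /* suppress notification */
#define CTL_NV(v)   (((uint64_t)(v) & 0xff) << 8)    /* notification vector */
#define CTL_NV_MASK CTL_NV(0xff)

static void pi_switch_to_running(struct pi_ctl *pi, int notif_vec)
{
	uint64_t old, new;

	old = __atomic_load_n(&pi->control, __ATOMIC_RELAXED);
	do {
		new = old;
		new &= ~(CTL_SN | CTL_NV_MASK);   /* clear SN, reset vector */
		new |= CTL_NV(notif_vec);
	} while (!__atomic_compare_exchange_n(&pi->control, &old, new,
					      0, __ATOMIC_SEQ_CST,
					      __ATOMIC_RELAXED));
}

int main(void)
{
	struct pi_ctl pi = { .control = CTL_SN | CTL_NV(0xf2) };

	pi_switch_to_running(&pi, 0xf2);
	printf("control = 0x%llx\n", (unsigned long long)pi.control); /* 0xf200 */
	return 0;
}
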
@@ -1943,12 +2061,35 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1943 2061
1944 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 2062 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1945 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 2063 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2064
2065 /* Setup TSC multiplier */
2066 if (cpu_has_vmx_tsc_scaling())
2067 vmcs_write64(TSC_MULTIPLIER,
2068 vcpu->arch.tsc_scaling_ratio);
2069
1946 vmx->loaded_vmcs->cpu = cpu; 2070 vmx->loaded_vmcs->cpu = cpu;
1947 } 2071 }
2072
2073 vmx_vcpu_pi_load(vcpu, cpu);
2074}
2075
2076static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2077{
2078 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2079
2080 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2081 !irq_remapping_cap(IRQ_POSTING_CAP))
2082 return;
2083
2084 /* Set SN when the vCPU is preempted */
2085 if (vcpu->preempted)
2086 pi_set_sn(pi_desc);
1948} 2087}
1949 2088
1950static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 2089static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1951{ 2090{
2091 vmx_vcpu_pi_put(vcpu);
2092
1952 __vmx_load_host_state(to_vmx(vcpu)); 2093 __vmx_load_host_state(to_vmx(vcpu));
1953 if (!vmm_exclusive) { 2094 if (!vmm_exclusive) {
1954 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); 2095 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2348,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2207 if (index >= 0) 2348 if (index >= 0)
2208 move_msr_up(vmx, index, save_nmsrs++); 2349 move_msr_up(vmx, index, save_nmsrs++);
2209 index = __find_msr_index(vmx, MSR_TSC_AUX); 2350 index = __find_msr_index(vmx, MSR_TSC_AUX);
2210 if (index >= 0 && vmx->rdtscp_enabled) 2351 if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
2211 move_msr_up(vmx, index, save_nmsrs++); 2352 move_msr_up(vmx, index, save_nmsrs++);
2212 /* 2353 /*
2213 * MSR_STAR is only needed on long mode guests, and only 2354 * MSR_STAR is only needed on long mode guests, and only
@@ -2230,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2230 2371
2231/* 2372/*
2232 * reads and returns guest's timestamp counter "register" 2373 * reads and returns guest's timestamp counter "register"
2233 * guest_tsc = host_tsc + tsc_offset -- 21.3 2374 * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
2375 * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
2234 */ 2376 */
2235static u64 guest_read_tsc(void) 2377static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
2236{ 2378{
2237 u64 host_tsc, tsc_offset; 2379 u64 host_tsc, tsc_offset;
2238 2380
2239 host_tsc = rdtsc(); 2381 host_tsc = rdtsc();
2240 tsc_offset = vmcs_read64(TSC_OFFSET); 2382 tsc_offset = vmcs_read64(TSC_OFFSET);
2241 return host_tsc + tsc_offset; 2383 return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
2242} 2384}
2243 2385
2244/* 2386/*
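
guest_read_tsc() above now applies the VMX TSC multiplier, i.e. guest_tsc = ((host_tsc * ratio) >> 48) + tsc_offset, with a 48-bit fractional ratio (kvm_tsc_scaling_ratio_frac_bits is set to 48 later in this patch). A standalone sketch of that fixed-point arithmetic; the helper names are illustrative and not the kernel's kvm_scale_tsc()/mul_u64_u64_shr():

/*
 * Standalone model of VMX TSC scaling with a 48-bit fractional
 * multiplier.  Illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define TSC_RATIO_FRAC_BITS 48

/* ratio = (guest_khz << frac_bits) / host_khz */
static uint64_t make_ratio(uint64_t guest_khz, uint64_t host_khz)
{
	return (uint64_t)(((unsigned __int128)guest_khz
			   << TSC_RATIO_FRAC_BITS) / host_khz);
}

/* 64x64->128 multiply, then drop the fractional bits */
static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio)
{
	return (uint64_t)(((unsigned __int128)host_tsc * ratio)
			  >> TSC_RATIO_FRAC_BITS);
}

int main(void)
{
	uint64_t ratio = make_ratio(2000000, 3000000); /* 2 GHz guest on 3 GHz host */
	uint64_t host_tsc = 3000000000ull;             /* ~1 s of host cycles */
	int64_t tsc_offset = 0;

	/* prints roughly 2000000000: one guest-second worth of cycles */
	printf("guest TSC = %llu\n",
	       (unsigned long long)(scale_tsc(host_tsc, ratio) + tsc_offset));
	return 0;
}
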
@@ -2255,22 +2397,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2255 return host_tsc + tsc_offset; 2397 return host_tsc + tsc_offset;
2256} 2398}
2257 2399
2258/*
2259 * Engage any workarounds for mis-matched TSC rates. Currently limited to
2260 * software catchup for faster rates on slower CPUs.
2261 */
2262static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2263{
2264 if (!scale)
2265 return;
2266
2267 if (user_tsc_khz > tsc_khz) {
2268 vcpu->arch.tsc_catchup = 1;
2269 vcpu->arch.tsc_always_catchup = 1;
2270 } else
2271 WARN(1, "user requested TSC rate below hardware speed\n");
2272}
2273
2274static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 2400static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
2275{ 2401{
2276 return vmcs_read64(TSC_OFFSET); 2402 return vmcs_read64(TSC_OFFSET);
@@ -2302,7 +2428,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2302 } 2428 }
2303} 2429}
2304 2430
2305static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 2431static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
2306{ 2432{
2307 u64 offset = vmcs_read64(TSC_OFFSET); 2433 u64 offset = vmcs_read64(TSC_OFFSET);
2308 2434
@@ -2315,11 +2441,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
2315 offset + adjustment); 2441 offset + adjustment);
2316} 2442}
2317 2443
2318static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2319{
2320 return target_tsc - rdtsc();
2321}
2322
2323static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2444static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2324{ 2445{
2325 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); 2446 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2377,7 +2498,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2377 vmx->nested.nested_vmx_pinbased_ctls_high |= 2498 vmx->nested.nested_vmx_pinbased_ctls_high |=
2378 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2499 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2379 PIN_BASED_VMX_PREEMPTION_TIMER; 2500 PIN_BASED_VMX_PREEMPTION_TIMER;
2380 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) 2501 if (vmx_cpu_uses_apicv(&vmx->vcpu))
2381 vmx->nested.nested_vmx_pinbased_ctls_high |= 2502 vmx->nested.nested_vmx_pinbased_ctls_high |=
2382 PIN_BASED_POSTED_INTR; 2503 PIN_BASED_POSTED_INTR;
2383 2504
@@ -2471,10 +2592,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2471 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2592 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2472 SECONDARY_EXEC_RDTSCP | 2593 SECONDARY_EXEC_RDTSCP |
2473 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2594 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2595 SECONDARY_EXEC_ENABLE_VPID |
2474 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2596 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2475 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2597 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2476 SECONDARY_EXEC_WBINVD_EXITING | 2598 SECONDARY_EXEC_WBINVD_EXITING |
2477 SECONDARY_EXEC_XSAVES; 2599 SECONDARY_EXEC_XSAVES |
2600 SECONDARY_EXEC_PCOMMIT;
2478 2601
2479 if (enable_ept) { 2602 if (enable_ept) {
2480 /* nested EPT: emulate EPT also to L1 */ 2603 /* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2616,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2493 } else 2616 } else
2494 vmx->nested.nested_vmx_ept_caps = 0; 2617 vmx->nested.nested_vmx_ept_caps = 0;
2495 2618
2619 if (enable_vpid)
2620 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2621 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
2622 else
2623 vmx->nested.nested_vmx_vpid_caps = 0;
2624
2496 if (enable_unrestricted_guest) 2625 if (enable_unrestricted_guest)
2497 vmx->nested.nested_vmx_secondary_ctls_high |= 2626 vmx->nested.nested_vmx_secondary_ctls_high |=
2498 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2627 SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2737,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2608 break; 2737 break;
2609 case MSR_IA32_VMX_EPT_VPID_CAP: 2738 case MSR_IA32_VMX_EPT_VPID_CAP:
2610 /* Currently, no nested vpid support */ 2739 /* Currently, no nested vpid support */
2611 *pdata = vmx->nested.nested_vmx_ept_caps; 2740 *pdata = vmx->nested.nested_vmx_ept_caps |
2741 ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
2612 break; 2742 break;
2613 default: 2743 default:
2614 return 1; 2744 return 1;
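
The MSR_IA32_VMX_EPT_VPID_CAP change above reports the nested VPID capabilities in the upper 32 bits of the MSR, with the EPT capabilities staying in the lower half. A tiny worked example of the packing and unpacking; the capability values are made up for illustration:

/* Pack/unpack sketch for a 64-bit MSR holding two 32-bit cap words. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ept_caps  = 0x06334141;   /* illustrative value */
	uint32_t vpid_caps = 0x00000401;   /* illustrative value */
	uint64_t msr = ept_caps | ((uint64_t)vpid_caps << 32);

	printf("MSR       = 0x%016llx\n", (unsigned long long)msr);
	printf("EPT caps  = 0x%08x\n", (uint32_t)msr);          /* low half  */
	printf("VPID caps = 0x%08x\n", (uint32_t)(msr >> 32));  /* high half */
	return 0;
}
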
@@ -2642,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2642 case MSR_EFER: 2772 case MSR_EFER:
2643 return kvm_get_msr_common(vcpu, msr_info); 2773 return kvm_get_msr_common(vcpu, msr_info);
2644 case MSR_IA32_TSC: 2774 case MSR_IA32_TSC:
2645 msr_info->data = guest_read_tsc(); 2775 msr_info->data = guest_read_tsc(vcpu);
2646 break; 2776 break;
2647 case MSR_IA32_SYSENTER_CS: 2777 case MSR_IA32_SYSENTER_CS:
2648 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2778 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -2673,7 +2803,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2673 msr_info->data = vcpu->arch.ia32_xss; 2803 msr_info->data = vcpu->arch.ia32_xss;
2674 break; 2804 break;
2675 case MSR_TSC_AUX: 2805 case MSR_TSC_AUX:
2676 if (!to_vmx(vcpu)->rdtscp_enabled) 2806 if (!guest_cpuid_has_rdtscp(vcpu))
2677 return 1; 2807 return 1;
2678 /* Otherwise falls through */ 2808 /* Otherwise falls through */
2679 default: 2809 default:
@@ -2779,7 +2909,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2779 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 2909 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
2780 break; 2910 break;
2781 case MSR_TSC_AUX: 2911 case MSR_TSC_AUX:
2782 if (!vmx->rdtscp_enabled) 2912 if (!guest_cpuid_has_rdtscp(vcpu))
2783 return 1; 2913 return 1;
2784 /* Check reserved bit, higher 32 bits should be zero */ 2914 /* Check reserved bit, higher 32 bits should be zero */
2785 if ((data >> 32) != 0) 2915 if ((data >> 32) != 0)
@@ -2874,6 +3004,8 @@ static int hardware_enable(void)
2874 return -EBUSY; 3004 return -EBUSY;
2875 3005
2876 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 3006 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3007 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3008 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
2877 3009
2878 /* 3010 /*
2879 * Now we can enable the vmclear operation in kdump 3011 * Now we can enable the vmclear operation in kdump
@@ -3015,7 +3147,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3015 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3147 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3016 SECONDARY_EXEC_SHADOW_VMCS | 3148 SECONDARY_EXEC_SHADOW_VMCS |
3017 SECONDARY_EXEC_XSAVES | 3149 SECONDARY_EXEC_XSAVES |
3018 SECONDARY_EXEC_ENABLE_PML; 3150 SECONDARY_EXEC_ENABLE_PML |
3151 SECONDARY_EXEC_PCOMMIT |
3152 SECONDARY_EXEC_TSC_SCALING;
3019 if (adjust_vmx_controls(min2, opt2, 3153 if (adjust_vmx_controls(min2, opt2,
3020 MSR_IA32_VMX_PROCBASED_CTLS2, 3154 MSR_IA32_VMX_PROCBASED_CTLS2,
3021 &_cpu_based_2nd_exec_control) < 0) 3155 &_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3575,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
3441 3575
3442#endif 3576#endif
3443 3577
3444static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 3578static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
3445{ 3579{
3446 vpid_sync_context(to_vmx(vcpu)); 3580 vpid_sync_context(vpid);
3447 if (enable_ept) { 3581 if (enable_ept) {
3448 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3582 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3449 return; 3583 return;
@@ -3451,6 +3585,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3451 } 3585 }
3452} 3586}
3453 3587
3588static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3589{
3590 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
3591}
3592
3454static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 3593static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3455{ 3594{
3456 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 3595 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3783,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3644 if (!is_paging(vcpu)) { 3783 if (!is_paging(vcpu)) {
3645 hw_cr4 &= ~X86_CR4_PAE; 3784 hw_cr4 &= ~X86_CR4_PAE;
3646 hw_cr4 |= X86_CR4_PSE; 3785 hw_cr4 |= X86_CR4_PSE;
3647 /*
3648 * SMEP/SMAP is disabled if CPU is in non-paging mode
3649 * in hardware. However KVM always uses paging mode to
3650 * emulate guest non-paging mode with TDP.
3651 * To emulate this behavior, SMEP/SMAP needs to be
3652 * manually disabled when guest switches to non-paging
3653 * mode.
3654 */
3655 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
3656 } else if (!(cr4 & X86_CR4_PAE)) { 3786 } else if (!(cr4 & X86_CR4_PAE)) {
3657 hw_cr4 &= ~X86_CR4_PAE; 3787 hw_cr4 &= ~X86_CR4_PAE;
3658 } 3788 }
3659 } 3789 }
3660 3790
3791 if (!enable_unrestricted_guest && !is_paging(vcpu))
3792 /*
3793 * SMEP/SMAP is disabled if CPU is in non-paging mode in
3794 * hardware. However KVM always uses paging mode without
3795 * unrestricted guest.
3796 * To emulate this behavior, SMEP/SMAP needs to be manually
3797 * disabled when guest switches to non-paging mode.
3798 */
3799 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
3800
3661 vmcs_writel(CR4_READ_SHADOW, cr4); 3801 vmcs_writel(CR4_READ_SHADOW, cr4);
3662 vmcs_writel(GUEST_CR4, hw_cr4); 3802 vmcs_writel(GUEST_CR4, hw_cr4);
3663 return 0; 3803 return 0;
@@ -4146,29 +4286,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
4146 return r; 4286 return r;
4147} 4287}
4148 4288
4149static void allocate_vpid(struct vcpu_vmx *vmx) 4289static int allocate_vpid(void)
4150{ 4290{
4151 int vpid; 4291 int vpid;
4152 4292
4153 vmx->vpid = 0;
4154 if (!enable_vpid) 4293 if (!enable_vpid)
4155 return; 4294 return 0;
4156 spin_lock(&vmx_vpid_lock); 4295 spin_lock(&vmx_vpid_lock);
4157 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4296 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4158 if (vpid < VMX_NR_VPIDS) { 4297 if (vpid < VMX_NR_VPIDS)
4159 vmx->vpid = vpid;
4160 __set_bit(vpid, vmx_vpid_bitmap); 4298 __set_bit(vpid, vmx_vpid_bitmap);
4161 } 4299 else
4300 vpid = 0;
4162 spin_unlock(&vmx_vpid_lock); 4301 spin_unlock(&vmx_vpid_lock);
4302 return vpid;
4163} 4303}
4164 4304
4165static void free_vpid(struct vcpu_vmx *vmx) 4305static void free_vpid(int vpid)
4166{ 4306{
4167 if (!enable_vpid) 4307 if (!enable_vpid || vpid == 0)
4168 return; 4308 return;
4169 spin_lock(&vmx_vpid_lock); 4309 spin_lock(&vmx_vpid_lock);
4170 if (vmx->vpid != 0) 4310 __clear_bit(vpid, vmx_vpid_bitmap);
4171 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
4172 spin_unlock(&vmx_vpid_lock); 4311 spin_unlock(&vmx_vpid_lock);
4173} 4312}
4174 4313
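
allocate_vpid()/free_vpid() above now return and take a bare VPID, handing IDs out of a shared bitmap under vmx_vpid_lock, with VPID 0 reserved to mean "none". A minimal userspace sketch of that allocate-from-bitmap pattern; a linear scan stands in for find_first_zero_bit() and the lock is omitted:

/*
 * Userspace sketch of bitmap-based ID allocation with ID 0 reserved,
 * mirroring allocate_vpid()/free_vpid() above (minus the spinlock).
 */
#include <stdio.h>

#define NR_VPIDS 64   /* the real VMX_NR_VPIDS is 1 << 16 */

static unsigned long long vpid_bitmap = 1ull << 0;   /* VPID 0 reserved */

static int allocate_vpid(void)
{
	int vpid;

	for (vpid = 0; vpid < NR_VPIDS; vpid++)
		if (!(vpid_bitmap & (1ull << vpid))) {
			vpid_bitmap |= 1ull << vpid;
			return vpid;
		}
	return 0;   /* exhausted: fall back to "no VPID" */
}

static void free_vpid(int vpid)
{
	if (vpid == 0)
		return;
	vpid_bitmap &= ~(1ull << vpid);
}

int main(void)
{
	int a = allocate_vpid(), b = allocate_vpid();

	printf("allocated %d and %d\n", a, b);        /* 1 and 2 */
	free_vpid(a);
	printf("reallocated %d\n", allocate_vpid());  /* 1 again */
	return 0;
}
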
@@ -4323,9 +4462,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
4323 msr, MSR_TYPE_W); 4462 msr, MSR_TYPE_W);
4324} 4463}
4325 4464
4326static int vmx_vm_has_apicv(struct kvm *kvm) 4465static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
4327{ 4466{
4328 return enable_apicv && irqchip_in_kernel(kvm); 4467 return enable_apicv && lapic_in_kernel(vcpu);
4329} 4468}
4330 4469
4331static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4470static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4508,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
4369{ 4508{
4370#ifdef CONFIG_SMP 4509#ifdef CONFIG_SMP
4371 if (vcpu->mode == IN_GUEST_MODE) { 4510 if (vcpu->mode == IN_GUEST_MODE) {
4511 struct vcpu_vmx *vmx = to_vmx(vcpu);
4512
4513 /*
 4514		 * Currently, we don't support urgent interrupts;
 4515		 * all interrupts are recognized as non-urgent
 4516		 * interrupts, so we cannot post interrupts when
 4517		 * 'SN' is set.
 4518		 *
 4519		 * If the vcpu is in guest mode, it means it is
 4520		 * running instead of being scheduled out and
 4521		 * waiting in the run queue, and that is currently
 4522		 * the only case when 'SN' is set, so warn if
 4523		 * 'SN' is set.
4524 */
4525 WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
4526
4372 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 4527 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4373 POSTED_INTR_VECTOR); 4528 POSTED_INTR_VECTOR);
4374 return true; 4529 return true;
@@ -4505,7 +4660,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4505{ 4660{
4506 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4661 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4507 4662
4508 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4663 if (!vmx_cpu_uses_apicv(&vmx->vcpu))
4509 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4664 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4510 return pin_based_exec_ctrl; 4665 return pin_based_exec_ctrl;
4511} 4666}
@@ -4517,7 +4672,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4517 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4672 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4518 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4673 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4519 4674
4520 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { 4675 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
4521 exec_control &= ~CPU_BASED_TPR_SHADOW; 4676 exec_control &= ~CPU_BASED_TPR_SHADOW;
4522#ifdef CONFIG_X86_64 4677#ifdef CONFIG_X86_64
4523 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4678 exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4689,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4534static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4689static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4535{ 4690{
4536 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4691 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4537 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 4692 if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
4538 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4693 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4539 if (vmx->vpid == 0) 4694 if (vmx->vpid == 0)
4540 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4695 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4703,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4548 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4703 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4549 if (!ple_gap) 4704 if (!ple_gap)
4550 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4705 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4551 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4706 if (!vmx_cpu_uses_apicv(&vmx->vcpu))
4552 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4707 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4553 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4708 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4554 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4709 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4713,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4558 a current VMCS12 4713 a current VMCS12
4559 */ 4714 */
4560 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4715 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4561 /* PML is enabled/disabled in creating/destorying vcpu */ 4716
4562 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4717 if (!enable_pml)
4718 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4719
 4720	/* Currently, we allow the L1 guest to run the pcommit instruction directly. */
4721 exec_control &= ~SECONDARY_EXEC_PCOMMIT;
4563 4722
4564 return exec_control; 4723 return exec_control;
4565} 4724}
@@ -4604,12 +4763,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4604 4763
4605 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4764 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4606 4765
4607 if (cpu_has_secondary_exec_ctrls()) { 4766 if (cpu_has_secondary_exec_ctrls())
4608 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 4767 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4609 vmx_secondary_exec_control(vmx)); 4768 vmx_secondary_exec_control(vmx));
4610 }
4611 4769
4612 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { 4770 if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
4613 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4771 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4614 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4772 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4615 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4773 vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4911,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4753 4911
4754 if (cpu_has_vmx_tpr_shadow() && !init_event) { 4912 if (cpu_has_vmx_tpr_shadow() && !init_event) {
4755 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4913 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4756 if (vm_need_tpr_shadow(vcpu->kvm)) 4914 if (cpu_need_tpr_shadow(vcpu))
4757 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4915 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4758 __pa(vcpu->arch.apic->regs)); 4916 __pa(vcpu->arch.apic->regs));
4759 vmcs_write32(TPR_THRESHOLD, 0); 4917 vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4919,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4761 4919
4762 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4920 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4763 4921
4764 if (vmx_vm_has_apicv(vcpu->kvm)) 4922 if (vmx_cpu_uses_apicv(vcpu))
4765 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 4923 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4766 4924
4767 if (vmx->vpid != 0) 4925 if (vmx->vpid != 0)
@@ -4771,12 +4929,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4771 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 4929 vmx_set_cr0(vcpu, cr0); /* enter rmode */
4772 vmx->vcpu.arch.cr0 = cr0; 4930 vmx->vcpu.arch.cr0 = cr0;
4773 vmx_set_cr4(vcpu, 0); 4931 vmx_set_cr4(vcpu, 0);
4774 if (!init_event) 4932 vmx_set_efer(vcpu, 0);
4775 vmx_set_efer(vcpu, 0);
4776 vmx_fpu_activate(vcpu); 4933 vmx_fpu_activate(vcpu);
4777 update_exception_bitmap(vcpu); 4934 update_exception_bitmap(vcpu);
4778 4935
4779 vpid_sync_context(vmx); 4936 vpid_sync_context(vmx->vpid);
4780} 4937}
4781 4938
4782/* 4939/*
@@ -5104,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5104 return handle_rmode_exception(vcpu, ex_no, error_code); 5261 return handle_rmode_exception(vcpu, ex_no, error_code);
5105 5262
5106 switch (ex_no) { 5263 switch (ex_no) {
5264 case AC_VECTOR:
5265 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5266 return 1;
5107 case DB_VECTOR: 5267 case DB_VECTOR:
5108 dr6 = vmcs_readl(EXIT_QUALIFICATION); 5268 dr6 = vmcs_readl(EXIT_QUALIFICATION);
5109 if (!(vcpu->guest_debug & 5269 if (!(vcpu->guest_debug &
@@ -5296,7 +5456,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
5296 u8 cr8 = (u8)val; 5456 u8 cr8 = (u8)val;
5297 err = kvm_set_cr8(vcpu, cr8); 5457 err = kvm_set_cr8(vcpu, cr8);
5298 kvm_complete_insn_gp(vcpu, err); 5458 kvm_complete_insn_gp(vcpu, err);
5299 if (irqchip_in_kernel(vcpu->kvm)) 5459 if (lapic_in_kernel(vcpu))
5300 return 1; 5460 return 1;
5301 if (cr8_prev <= cr8) 5461 if (cr8_prev <= cr8)
5302 return 1; 5462 return 1;
@@ -5510,17 +5670,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5510 kvm_make_request(KVM_REQ_EVENT, vcpu); 5670 kvm_make_request(KVM_REQ_EVENT, vcpu);
5511 5671
5512 ++vcpu->stat.irq_window_exits; 5672 ++vcpu->stat.irq_window_exits;
5513
5514 /*
5515 * If the user space waits to inject interrupts, exit as soon as
5516 * possible
5517 */
5518 if (!irqchip_in_kernel(vcpu->kvm) &&
5519 vcpu->run->request_interrupt_window &&
5520 !kvm_cpu_has_interrupt(vcpu)) {
5521 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
5522 return 0;
5523 }
5524 return 1; 5673 return 1;
5525} 5674}
5526 5675
@@ -5753,10 +5902,11 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5753 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5902 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5754 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5903 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5755 skip_emulated_instruction(vcpu); 5904 skip_emulated_instruction(vcpu);
5905 trace_kvm_fast_mmio(gpa);
5756 return 1; 5906 return 1;
5757 } 5907 }
5758 5908
5759 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5909 ret = handle_mmio_page_fault(vcpu, gpa, true);
5760 if (likely(ret == RET_MMIO_PF_EMULATE)) 5910 if (likely(ret == RET_MMIO_PF_EMULATE))
5761 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 5911 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
5762 EMULATE_DONE; 5912 EMULATE_DONE;
@@ -5910,6 +6060,25 @@ static void update_ple_window_actual_max(void)
5910 ple_window_grow, INT_MIN); 6060 ple_window_grow, INT_MIN);
5911} 6061}
5912 6062
6063/*
6064 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
6065 */
6066static void wakeup_handler(void)
6067{
6068 struct kvm_vcpu *vcpu;
6069 int cpu = smp_processor_id();
6070
6071 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6072 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
6073 blocked_vcpu_list) {
6074 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6075
6076 if (pi_test_on(pi_desc) == 1)
6077 kvm_vcpu_kick(vcpu);
6078 }
6079 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6080}
6081
5913static __init int hardware_setup(void) 6082static __init int hardware_setup(void)
5914{ 6083{
5915 int r = -ENOMEM, i, msr; 6084 int r = -ENOMEM, i, msr;
@@ -6028,6 +6197,12 @@ static __init int hardware_setup(void)
6028 if (!cpu_has_vmx_apicv()) 6197 if (!cpu_has_vmx_apicv())
6029 enable_apicv = 0; 6198 enable_apicv = 0;
6030 6199
6200 if (cpu_has_vmx_tsc_scaling()) {
6201 kvm_has_tsc_control = true;
6202 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
6203 kvm_tsc_scaling_ratio_frac_bits = 48;
6204 }
6205
6031 if (enable_apicv) 6206 if (enable_apicv)
6032 kvm_x86_ops->update_cr8_intercept = NULL; 6207 kvm_x86_ops->update_cr8_intercept = NULL;
6033 else { 6208 else {
@@ -6096,6 +6271,8 @@ static __init int hardware_setup(void)
6096 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 6271 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6097 } 6272 }
6098 6273
6274 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
6275
6099 return alloc_kvm_area(); 6276 return alloc_kvm_area();
6100 6277
6101out8: 6278out8:
@@ -6627,7 +6804,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
6627 6804
6628static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 6805static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6629{ 6806{
6630 u32 exec_control;
6631 if (vmx->nested.current_vmptr == -1ull) 6807 if (vmx->nested.current_vmptr == -1ull)
6632 return; 6808 return;
6633 6809
@@ -6640,9 +6816,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6640 they were modified */ 6816 they were modified */
6641 copy_shadow_to_vmcs12(vmx); 6817 copy_shadow_to_vmcs12(vmx);
6642 vmx->nested.sync_shadow_vmcs = false; 6818 vmx->nested.sync_shadow_vmcs = false;
6643 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6819 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
6644 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 6820 SECONDARY_EXEC_SHADOW_VMCS);
6645 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6646 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6821 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6647 } 6822 }
6648 vmx->nested.posted_intr_nv = -1; 6823 vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6837,7 @@ static void free_nested(struct vcpu_vmx *vmx)
6662 return; 6837 return;
6663 6838
6664 vmx->nested.vmxon = false; 6839 vmx->nested.vmxon = false;
6840 free_vpid(vmx->nested.vpid02);
6665 nested_release_vmcs12(vmx); 6841 nested_release_vmcs12(vmx);
6666 if (enable_shadow_vmcs) 6842 if (enable_shadow_vmcs)
6667 free_vmcs(vmx->nested.current_shadow_vmcs); 6843 free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7214,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7038{ 7214{
7039 struct vcpu_vmx *vmx = to_vmx(vcpu); 7215 struct vcpu_vmx *vmx = to_vmx(vcpu);
7040 gpa_t vmptr; 7216 gpa_t vmptr;
7041 u32 exec_control;
7042 7217
7043 if (!nested_vmx_check_permission(vcpu)) 7218 if (!nested_vmx_check_permission(vcpu))
7044 return 1; 7219 return 1;
@@ -7070,9 +7245,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7070 vmx->nested.current_vmcs12 = new_vmcs12; 7245 vmx->nested.current_vmcs12 = new_vmcs12;
7071 vmx->nested.current_vmcs12_page = page; 7246 vmx->nested.current_vmcs12_page = page;
7072 if (enable_shadow_vmcs) { 7247 if (enable_shadow_vmcs) {
7073 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7248 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7074 exec_control |= SECONDARY_EXEC_SHADOW_VMCS; 7249 SECONDARY_EXEC_SHADOW_VMCS);
7075 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7076 vmcs_write64(VMCS_LINK_POINTER, 7250 vmcs_write64(VMCS_LINK_POINTER,
7077 __pa(vmx->nested.current_shadow_vmcs)); 7251 __pa(vmx->nested.current_shadow_vmcs));
7078 vmx->nested.sync_shadow_vmcs = true; 7252 vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7352,58 @@ static int handle_invept(struct kvm_vcpu *vcpu)
7178 7352
7179static int handle_invvpid(struct kvm_vcpu *vcpu) 7353static int handle_invvpid(struct kvm_vcpu *vcpu)
7180{ 7354{
7181 kvm_queue_exception(vcpu, UD_VECTOR); 7355 struct vcpu_vmx *vmx = to_vmx(vcpu);
7356 u32 vmx_instruction_info;
7357 unsigned long type, types;
7358 gva_t gva;
7359 struct x86_exception e;
7360 int vpid;
7361
7362 if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7363 SECONDARY_EXEC_ENABLE_VPID) ||
7364 !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
7365 kvm_queue_exception(vcpu, UD_VECTOR);
7366 return 1;
7367 }
7368
7369 if (!nested_vmx_check_permission(vcpu))
7370 return 1;
7371
7372 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7373 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7374
7375 types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
7376
7377 if (!(types & (1UL << type))) {
7378 nested_vmx_failValid(vcpu,
7379 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7380 return 1;
7381 }
7382
 7383	/* According to the Intel VMX instruction reference, the memory
7384 * operand is read even if it isn't needed (e.g., for type==global)
7385 */
7386 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7387 vmx_instruction_info, false, &gva))
7388 return 1;
7389 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
7390 sizeof(u32), &e)) {
7391 kvm_inject_page_fault(vcpu, &e);
7392 return 1;
7393 }
7394
7395 switch (type) {
7396 case VMX_VPID_EXTENT_ALL_CONTEXT:
7397 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
7398 nested_vmx_succeed(vcpu);
7399 break;
7400 default:
7401 /* Trap single context invalidation invvpid calls */
7402 BUG_ON(1);
7403 break;
7404 }
7405
7406 skip_emulated_instruction(vcpu);
7182 return 1; 7407 return 1;
7183} 7408}
7184 7409
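
handle_invvpid() above validates the requested invalidation type against the supported-type bits, extracted as (vpid_caps >> 8) & 0x7. A small worked example of that check; the type numbers below are illustrative stand-ins for the VMX_VPID_EXTENT_* constants:

/*
 * Sketch of the supported-type check in handle_invvpid():
 * types = (vpid_caps >> 8) & 0x7, then test bit 'type'.
 */
#include <stdio.h>

#define INVVPID_TYPE_SINGLE_CONTEXT 1   /* illustrative numbering */
#define INVVPID_TYPE_ALL_CONTEXT    2

static int invvpid_type_supported(unsigned int vpid_caps, unsigned long type)
{
	unsigned long types = (vpid_caps >> 8) & 0x7;

	/* guard against out-of-range types, then test the bit */
	return type < 3 && (types & (1ul << type));
}

int main(void)
{
	/* caps advertising only the all-context extent (illustrative) */
	unsigned int caps = 1u << (8 + INVVPID_TYPE_ALL_CONTEXT);

	printf("all-context supported:    %d\n",
	       invvpid_type_supported(caps, INVVPID_TYPE_ALL_CONTEXT));
	printf("single-context supported: %d\n",
	       invvpid_type_supported(caps, INVVPID_TYPE_SINGLE_CONTEXT));
	return 0;
}
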
@@ -7207,6 +7432,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
7207 return 1; 7432 return 1;
7208} 7433}
7209 7434
7435static int handle_pcommit(struct kvm_vcpu *vcpu)
7436{
 7437	/* We never trap the pcommit instruction for the L1 guest. */
7438 WARN_ON(1);
7439 return 1;
7440}
7441
7210/* 7442/*
7211 * The exit handlers return 1 if the exit was handled fully and guest execution 7443 * The exit handlers return 1 if the exit was handled fully and guest execution
7212 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7444 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7489,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7257 [EXIT_REASON_XSAVES] = handle_xsaves, 7489 [EXIT_REASON_XSAVES] = handle_xsaves,
7258 [EXIT_REASON_XRSTORS] = handle_xrstors, 7490 [EXIT_REASON_XRSTORS] = handle_xrstors,
7259 [EXIT_REASON_PML_FULL] = handle_pml_full, 7491 [EXIT_REASON_PML_FULL] = handle_pml_full,
7492 [EXIT_REASON_PCOMMIT] = handle_pcommit,
7260}; 7493};
7261 7494
7262static const int kvm_vmx_max_exit_handlers = 7495static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7791,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7558 * the XSS exit bitmap in vmcs12. 7791 * the XSS exit bitmap in vmcs12.
7559 */ 7792 */
7560 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 7793 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
7794 case EXIT_REASON_PCOMMIT:
7795 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
7561 default: 7796 default:
7562 return true; 7797 return true;
7563 } 7798 }
@@ -7569,10 +7804,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7569 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7804 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7570} 7805}
7571 7806
7572static int vmx_enable_pml(struct vcpu_vmx *vmx) 7807static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
7573{ 7808{
7574 struct page *pml_pg; 7809 struct page *pml_pg;
7575 u32 exec_control;
7576 7810
7577 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); 7811 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7578 if (!pml_pg) 7812 if (!pml_pg)
@@ -7583,24 +7817,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
7583 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 7817 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7584 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 7818 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7585 7819
7586 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7587 exec_control |= SECONDARY_EXEC_ENABLE_PML;
7588 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7589
7590 return 0; 7820 return 0;
7591} 7821}
7592 7822
7593static void vmx_disable_pml(struct vcpu_vmx *vmx) 7823static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
7594{ 7824{
7595 u32 exec_control; 7825 if (vmx->pml_pg) {
7596 7826 __free_page(vmx->pml_pg);
7597 ASSERT(vmx->pml_pg); 7827 vmx->pml_pg = NULL;
7598 __free_page(vmx->pml_pg); 7828 }
7599 vmx->pml_pg = NULL;
7600
7601 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7602 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7603 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7604} 7829}
7605 7830
7606static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 7831static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7782,6 +8007,9 @@ static void dump_vmcs(void)
7782 vmcs_read32(IDT_VECTORING_INFO_FIELD), 8007 vmcs_read32(IDT_VECTORING_INFO_FIELD),
7783 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 8008 vmcs_read32(IDT_VECTORING_ERROR_CODE));
7784 pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); 8009 pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
8010 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
8011 pr_err("TSC Multiplier = 0x%016lx\n",
8012 vmcs_readl(TSC_MULTIPLIER));
7785 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) 8013 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
7786 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 8014 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
7787 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 8015 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
@@ -7924,10 +8152,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
7924 * apicv 8152 * apicv
7925 */ 8153 */
7926 if (!cpu_has_vmx_virtualize_x2apic_mode() || 8154 if (!cpu_has_vmx_virtualize_x2apic_mode() ||
7927 !vmx_vm_has_apicv(vcpu->kvm)) 8155 !vmx_cpu_uses_apicv(vcpu))
7928 return; 8156 return;
7929 8157
7930 if (!vm_need_tpr_shadow(vcpu->kvm)) 8158 if (!cpu_need_tpr_shadow(vcpu))
7931 return; 8159 return;
7932 8160
7933 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8161 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8257,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
8029 } 8257 }
8030} 8258}
8031 8259
8032static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 8260static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
8033{ 8261{
8034 if (!vmx_vm_has_apicv(vcpu->kvm)) 8262 u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
8263 if (!vmx_cpu_uses_apicv(vcpu))
8035 return; 8264 return;
8036 8265
8037 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 8266 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8477,8 +8706,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
8477 struct vcpu_vmx *vmx = to_vmx(vcpu); 8706 struct vcpu_vmx *vmx = to_vmx(vcpu);
8478 8707
8479 if (enable_pml) 8708 if (enable_pml)
8480 vmx_disable_pml(vmx); 8709 vmx_destroy_pml_buffer(vmx);
8481 free_vpid(vmx); 8710 free_vpid(vmx->vpid);
8482 leave_guest_mode(vcpu); 8711 leave_guest_mode(vcpu);
8483 vmx_load_vmcs01(vcpu); 8712 vmx_load_vmcs01(vcpu);
8484 free_nested(vmx); 8713 free_nested(vmx);
@@ -8497,7 +8726,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8497 if (!vmx) 8726 if (!vmx)
8498 return ERR_PTR(-ENOMEM); 8727 return ERR_PTR(-ENOMEM);
8499 8728
8500 allocate_vpid(vmx); 8729 vmx->vpid = allocate_vpid();
8501 8730
8502 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 8731 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
8503 if (err) 8732 if (err)
@@ -8530,7 +8759,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8530 put_cpu(); 8759 put_cpu();
8531 if (err) 8760 if (err)
8532 goto free_vmcs; 8761 goto free_vmcs;
8533 if (vm_need_virtualize_apic_accesses(kvm)) { 8762 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
8534 err = alloc_apic_access_page(kvm); 8763 err = alloc_apic_access_page(kvm);
8535 if (err) 8764 if (err)
8536 goto free_vmcs; 8765 goto free_vmcs;
@@ -8545,8 +8774,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8545 goto free_vmcs; 8774 goto free_vmcs;
8546 } 8775 }
8547 8776
8548 if (nested) 8777 if (nested) {
8549 nested_vmx_setup_ctls_msrs(vmx); 8778 nested_vmx_setup_ctls_msrs(vmx);
8779 vmx->nested.vpid02 = allocate_vpid();
8780 }
8550 8781
8551 vmx->nested.posted_intr_nv = -1; 8782 vmx->nested.posted_intr_nv = -1;
8552 vmx->nested.current_vmptr = -1ull; 8783 vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8790,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8559 * for the guest, etc. 8790 * for the guest, etc.
8560 */ 8791 */
8561 if (enable_pml) { 8792 if (enable_pml) {
8562 err = vmx_enable_pml(vmx); 8793 err = vmx_create_pml_buffer(vmx);
8563 if (err) 8794 if (err)
8564 goto free_vmcs; 8795 goto free_vmcs;
8565 } 8796 }
@@ -8567,13 +8798,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8567 return &vmx->vcpu; 8798 return &vmx->vcpu;
8568 8799
8569free_vmcs: 8800free_vmcs:
8801 free_vpid(vmx->nested.vpid02);
8570 free_loaded_vmcs(vmx->loaded_vmcs); 8802 free_loaded_vmcs(vmx->loaded_vmcs);
8571free_msrs: 8803free_msrs:
8572 kfree(vmx->guest_msrs); 8804 kfree(vmx->guest_msrs);
8573uninit_vcpu: 8805uninit_vcpu:
8574 kvm_vcpu_uninit(&vmx->vcpu); 8806 kvm_vcpu_uninit(&vmx->vcpu);
8575free_vcpu: 8807free_vcpu:
8576 free_vpid(vmx); 8808 free_vpid(vmx->vpid);
8577 kmem_cache_free(kvm_vcpu_cache, vmx); 8809 kmem_cache_free(kvm_vcpu_cache, vmx);
8578 return ERR_PTR(err); 8810 return ERR_PTR(err);
8579} 8811}
@@ -8648,49 +8880,67 @@ static int vmx_get_lpage_level(void)
8648 return PT_PDPE_LEVEL; 8880 return PT_PDPE_LEVEL;
8649} 8881}
8650 8882
8883static void vmcs_set_secondary_exec_control(u32 new_ctl)
8884{
8885 /*
8886 * These bits in the secondary execution controls field
 8887	 * are dynamic; the others are mostly based on the hypervisor
8888 * architecture and the guest's CPUID. Do not touch the
8889 * dynamic bits.
8890 */
8891 u32 mask =
8892 SECONDARY_EXEC_SHADOW_VMCS |
8893 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
8894 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8895
8896 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8897
8898 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8899 (new_ctl & ~mask) | (cur_ctl & mask));
8900}
8901
8651static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 8902static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8652{ 8903{
8653 struct kvm_cpuid_entry2 *best; 8904 struct kvm_cpuid_entry2 *best;
8654 struct vcpu_vmx *vmx = to_vmx(vcpu); 8905 struct vcpu_vmx *vmx = to_vmx(vcpu);
8655 u32 exec_control; 8906 u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
8656 8907
8657 vmx->rdtscp_enabled = false;
8658 if (vmx_rdtscp_supported()) { 8908 if (vmx_rdtscp_supported()) {
8659 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8909 bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
8660 if (exec_control & SECONDARY_EXEC_RDTSCP) { 8910 if (!rdtscp_enabled)
8661 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 8911 secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
8662 if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) 8912
8663 vmx->rdtscp_enabled = true; 8913 if (nested) {
8664 else { 8914 if (rdtscp_enabled)
8665 exec_control &= ~SECONDARY_EXEC_RDTSCP; 8915 vmx->nested.nested_vmx_secondary_ctls_high |=
8666 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8916 SECONDARY_EXEC_RDTSCP;
8667 exec_control); 8917 else
8668 } 8918 vmx->nested.nested_vmx_secondary_ctls_high &=
8919 ~SECONDARY_EXEC_RDTSCP;
8669 } 8920 }
8670 if (nested && !vmx->rdtscp_enabled)
8671 vmx->nested.nested_vmx_secondary_ctls_high &=
8672 ~SECONDARY_EXEC_RDTSCP;
8673 } 8921 }
8674 8922
8675 /* Exposing INVPCID only when PCID is exposed */ 8923 /* Exposing INVPCID only when PCID is exposed */
8676 best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 8924 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
8677 if (vmx_invpcid_supported() && 8925 if (vmx_invpcid_supported() &&
8678 best && (best->ebx & bit(X86_FEATURE_INVPCID)) && 8926 (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
8679 guest_cpuid_has_pcid(vcpu)) { 8927 !guest_cpuid_has_pcid(vcpu))) {
8680 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8928 secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
8681 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; 8929
8682 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8683 exec_control);
8684 } else {
8685 if (cpu_has_secondary_exec_ctrls()) {
8686 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8687 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
8688 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8689 exec_control);
8690 }
8691 if (best) 8930 if (best)
8692 best->ebx &= ~bit(X86_FEATURE_INVPCID); 8931 best->ebx &= ~bit(X86_FEATURE_INVPCID);
8693 } 8932 }
8933
8934 vmcs_set_secondary_exec_control(secondary_exec_ctl);
8935
8936 if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
8937 if (guest_cpuid_has_pcommit(vcpu))
8938 vmx->nested.nested_vmx_secondary_ctls_high |=
8939 SECONDARY_EXEC_PCOMMIT;
8940 else
8941 vmx->nested.nested_vmx_secondary_ctls_high &=
8942 ~SECONDARY_EXEC_PCOMMIT;
8943 }
8694} 8944}
8695 8945
8696static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 8946static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
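
vmcs_set_secondary_exec_control() above merges the freshly computed controls with the live VMCS value, keeping only the "dynamic" bits from the current value: merged = (new & ~mask) | (cur & mask). A tiny worked example with made-up bit values:

/* Worked example of the (new & ~mask) | (cur & mask) merge above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mask    = 0x00000007;   /* pretend these 3 bits are "dynamic" */
	uint32_t cur_ctl = 0x000000f5;   /* live value: dynamic bits = 101 */
	uint32_t new_ctl = 0x0000ff02;   /* recomputed value: dynamic bits = 010 */

	uint32_t merged = (new_ctl & ~mask) | (cur_ctl & mask);

	/* dynamic bits keep the live value (101); the rest follow new_ctl */
	printf("merged = 0x%08x\n", merged);   /* 0x0000ff05 */
	return 0;
}
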
@@ -9298,13 +9548,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9298 9548
9299 if (cpu_has_secondary_exec_ctrls()) { 9549 if (cpu_has_secondary_exec_ctrls()) {
9300 exec_control = vmx_secondary_exec_control(vmx); 9550 exec_control = vmx_secondary_exec_control(vmx);
9301 if (!vmx->rdtscp_enabled) 9551
9302 exec_control &= ~SECONDARY_EXEC_RDTSCP;
9303 /* Take the following fields only from vmcs12 */ 9552 /* Take the following fields only from vmcs12 */
9304 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9553 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
9305 SECONDARY_EXEC_RDTSCP | 9554 SECONDARY_EXEC_RDTSCP |
9306 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9555 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
9307 SECONDARY_EXEC_APIC_REGISTER_VIRT); 9556 SECONDARY_EXEC_APIC_REGISTER_VIRT |
9557 SECONDARY_EXEC_PCOMMIT);
9308 if (nested_cpu_has(vmcs12, 9558 if (nested_cpu_has(vmcs12,
9309 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9559 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
9310 exec_control |= vmcs12->secondary_vm_exec_control; 9560 exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9573,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9323 vmcs_write64(APIC_ACCESS_ADDR, 9573 vmcs_write64(APIC_ACCESS_ADDR,
9324 page_to_phys(vmx->nested.apic_access_page)); 9574 page_to_phys(vmx->nested.apic_access_page));
9325 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && 9575 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9326 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { 9576 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9327 exec_control |= 9577 exec_control |=
9328 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9578 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9329 kvm_vcpu_reload_apic_access_page(vcpu); 9579 kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9683,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9433 9683
9434 if (enable_vpid) { 9684 if (enable_vpid) {
9435 /* 9685 /*
9436 * Trivially support vpid by letting L2s share their parent 9686 * There is no direct mapping between vpid02 and vpid12, the
9437 * L1's vpid. TODO: move to a more elaborate solution, giving 9687 * vpid02 is per-vCPU for L0 and reused while the value of
9438 * each L2 its own vpid and exposing the vpid feature to L1. 9688 * vpid12 is changed w/ one invvpid during nested vmentry.
9689 * The vpid12 is allocated by L1 for L2, so it will not
 9690	 * influence the global bitmap (for vpid01 and vpid02 allocation)
 9691	 * even if L1 spawns a lot of nested vCPUs.
9439 */ 9692 */
9440 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 9693 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
9441 vmx_flush_tlb(vcpu); 9694 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
9695 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
9696 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
9697 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
9698 }
9699 } else {
9700 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
9701 vmx_flush_tlb(vcpu);
9702 }
9703
9442 } 9704 }
9443 9705
9444 if (nested_cpu_has_ept(vmcs12)) { 9706 if (nested_cpu_has_ept(vmcs12)) {
@@ -10278,6 +10540,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10278 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 10540 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10279} 10541}
10280 10542
10543/*
10544 * This routine does the following things for a vCPU that is going
10545 * to be blocked, if VT-d PI is enabled:
10546 * - Store the vCPU on the wakeup list, so that when an interrupt
10547 *   arrives we can find the right vCPU to wake up.
10548 * - Change the Posted-interrupt descriptor as below:
10549 * 'NDST' <-- vcpu->pre_pcpu
10550 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
10551 * - If 'ON' is set during this process, which means at least one
10552 *   interrupt is posted for this vCPU, we cannot block it; in
10553 *   that case, return 1.  Otherwise, return 0.
10554 *
10555 */
10556static int vmx_pre_block(struct kvm_vcpu *vcpu)
10557{
10558 unsigned long flags;
10559 unsigned int dest;
10560 struct pi_desc old, new;
10561 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
10562
10563 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
10564 !irq_remapping_cap(IRQ_POSTING_CAP))
10565 return 0;
10566
10567 vcpu->pre_pcpu = vcpu->cpu;
10568 spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
10569 vcpu->pre_pcpu), flags);
10570 list_add_tail(&vcpu->blocked_vcpu_list,
10571 &per_cpu(blocked_vcpu_on_cpu,
10572 vcpu->pre_pcpu));
10573 spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
10574 vcpu->pre_pcpu), flags);
10575
10576 do {
10577 old.control = new.control = pi_desc->control;
10578
10579 /*
10580 * We should not block the vCPU if
10581 * an interrupt is posted for it.
10582 */
10583 if (pi_test_on(pi_desc) == 1) {
10584 spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
10585 vcpu->pre_pcpu), flags);
10586 list_del(&vcpu->blocked_vcpu_list);
10587 spin_unlock_irqrestore(
10588 &per_cpu(blocked_vcpu_on_cpu_lock,
10589 vcpu->pre_pcpu), flags);
10590 vcpu->pre_pcpu = -1;
10591
10592 return 1;
10593 }
10594
10595 WARN((pi_desc->sn == 1),
10596 "Warning: SN field of posted-interrupts "
10597 "is set before blocking\n");
10598
10599 /*
10600		 * Since the vCPU can be preempted during this process,
10601		 * vcpu->cpu could differ from pre_pcpu, so we need to
10602		 * set pre_pcpu as the destination of the wakeup
10603		 * notification event; then we can find the right vCPU
10604		 * to wake up in the wakeup handler if interrupts arrive
10605		 * while the vCPU is in the blocked state.
10606 */
10607 dest = cpu_physical_id(vcpu->pre_pcpu);
10608
10609 if (x2apic_enabled())
10610 new.ndst = dest;
10611 else
10612 new.ndst = (dest << 8) & 0xFF00;
10613
10614 /* set 'NV' to 'wakeup vector' */
10615 new.nv = POSTED_INTR_WAKEUP_VECTOR;
10616 } while (cmpxchg(&pi_desc->control, old.control,
10617 new.control) != old.control);
10618
10619 return 0;
10620}
10621
10622static void vmx_post_block(struct kvm_vcpu *vcpu)
10623{
10624 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
10625 struct pi_desc old, new;
10626 unsigned int dest;
10627 unsigned long flags;
10628
10629 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
10630 !irq_remapping_cap(IRQ_POSTING_CAP))
10631 return;
10632
10633 do {
10634 old.control = new.control = pi_desc->control;
10635
10636 dest = cpu_physical_id(vcpu->cpu);
10637
10638 if (x2apic_enabled())
10639 new.ndst = dest;
10640 else
10641 new.ndst = (dest << 8) & 0xFF00;
10642
10643 /* Allow posting non-urgent interrupts */
10644 new.sn = 0;
10645
10646 /* set 'NV' to 'notification vector' */
10647 new.nv = POSTED_INTR_VECTOR;
10648 } while (cmpxchg(&pi_desc->control, old.control,
10649 new.control) != old.control);
10650
10651	if (vcpu->pre_pcpu != -1) {
10652 spin_lock_irqsave(
10653 &per_cpu(blocked_vcpu_on_cpu_lock,
10654 vcpu->pre_pcpu), flags);
10655 list_del(&vcpu->blocked_vcpu_list);
10656 spin_unlock_irqrestore(
10657 &per_cpu(blocked_vcpu_on_cpu_lock,
10658 vcpu->pre_pcpu), flags);
10659 vcpu->pre_pcpu = -1;
10660 }
10661}
10662
10663/*
10664 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
10665 *
10666 * @kvm: kvm
10667 * @host_irq: host irq of the interrupt
10668 * @guest_irq: gsi of the interrupt
10669 * @set: set or unset PI
10670 * returns 0 on success, < 0 on failure
10671 */
10672static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
10673 uint32_t guest_irq, bool set)
10674{
10675 struct kvm_kernel_irq_routing_entry *e;
10676 struct kvm_irq_routing_table *irq_rt;
10677 struct kvm_lapic_irq irq;
10678 struct kvm_vcpu *vcpu;
10679 struct vcpu_data vcpu_info;
10680 int idx, ret = -EINVAL;
10681
10682 if (!kvm_arch_has_assigned_device(kvm) ||
10683 !irq_remapping_cap(IRQ_POSTING_CAP))
10684 return 0;
10685
10686 idx = srcu_read_lock(&kvm->irq_srcu);
10687 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
10688 BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
10689
10690 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
10691 if (e->type != KVM_IRQ_ROUTING_MSI)
10692 continue;
10693 /*
10694 * VT-d PI cannot support posting multicast/broadcast
10695		 * interrupts to a vCPU; we still use interrupt remapping
10696		 * for these kinds of interrupts.
10697		 *
10698		 * For lowest-priority interrupts, we only support
10699		 * those with a single CPU as the destination, e.g. the
10700		 * user configures the interrupts via /proc/irq or uses
10701		 * irqbalance to make the interrupts single-CPU.
10702		 *
10703		 * Full lowest-priority interrupt support will be added later.
10704 */
10705
10706 kvm_set_msi_irq(e, &irq);
10707 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
10708 continue;
10709
10710 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
10711 vcpu_info.vector = irq.vector;
10712
10713 trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
10714 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
10715
10716 if (set)
10717 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
10718 else {
10719 /* suppress notification event before unposting */
10720 pi_set_sn(vcpu_to_pi_desc(vcpu));
10721 ret = irq_set_vcpu_affinity(host_irq, NULL);
10722 pi_clear_sn(vcpu_to_pi_desc(vcpu));
10723 }
10724
10725 if (ret < 0) {
10726 printk(KERN_INFO "%s: failed to update PI IRTE\n",
10727 __func__);
10728 goto out;
10729 }
10730 }
10731
10732 ret = 0;
10733out:
10734 srcu_read_unlock(&kvm->irq_srcu, idx);
10735 return ret;
10736}
10737
10281static struct kvm_x86_ops vmx_x86_ops = { 10738static struct kvm_x86_ops vmx_x86_ops = {
10282 .cpu_has_kvm_support = cpu_has_kvm_support, 10739 .cpu_has_kvm_support = cpu_has_kvm_support,
10283 .disabled_by_bios = vmx_disabled_by_bios, 10740 .disabled_by_bios = vmx_disabled_by_bios,
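
vmx_pre_block()/vmx_post_block() above rewrite the descriptor's NDST field: the full APIC ID in x2APIC mode, or the 8-bit ID shifted into bits 15:8 for xAPIC mode, matching the encoding used earlier in vmx_vcpu_pi_load(). A tiny illustration of that encoding (not the kernel helpers):

/* Encoding of the posted-interrupt NDST field as used above. */
#include <stdint.h>
#include <stdio.h>

static uint32_t pi_ndst(unsigned int apic_id, int x2apic)
{
	/* x2APIC: full 32-bit ID; xAPIC: 8-bit ID in bits 15:8 */
	return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
}

int main(void)
{
	printf("xAPIC  CPU 3  -> NDST 0x%04x\n", pi_ndst(3, 0));   /* 0x0300 */
	printf("x2APIC CPU 42 -> NDST 0x%08x\n", pi_ndst(42, 1));  /* 0x2a   */
	return 0;
}
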
@@ -10297,7 +10754,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
10297 .vcpu_load = vmx_vcpu_load, 10754 .vcpu_load = vmx_vcpu_load,
10298 .vcpu_put = vmx_vcpu_put, 10755 .vcpu_put = vmx_vcpu_put,
10299 10756
10300 .update_db_bp_intercept = update_exception_bitmap, 10757 .update_bp_intercept = update_exception_bitmap,
10301 .get_msr = vmx_get_msr, 10758 .get_msr = vmx_get_msr,
10302 .set_msr = vmx_set_msr, 10759 .set_msr = vmx_set_msr,
10303 .get_segment_base = vmx_get_segment_base, 10760 .get_segment_base = vmx_get_segment_base,
@@ -10347,7 +10804,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
10347 .update_cr8_intercept = update_cr8_intercept, 10804 .update_cr8_intercept = update_cr8_intercept,
10348 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 10805 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
10349 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 10806 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
10350 .vm_has_apicv = vmx_vm_has_apicv, 10807 .cpu_uses_apicv = vmx_cpu_uses_apicv,
10351 .load_eoi_exitmap = vmx_load_eoi_exitmap, 10808 .load_eoi_exitmap = vmx_load_eoi_exitmap,
10352 .hwapic_irr_update = vmx_hwapic_irr_update, 10809 .hwapic_irr_update = vmx_hwapic_irr_update,
10353 .hwapic_isr_update = vmx_hwapic_isr_update, 10810 .hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10371,11 +10828,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
10371 10828
10372 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 10829 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
10373 10830
10374 .set_tsc_khz = vmx_set_tsc_khz,
10375 .read_tsc_offset = vmx_read_tsc_offset, 10831 .read_tsc_offset = vmx_read_tsc_offset,
10376 .write_tsc_offset = vmx_write_tsc_offset, 10832 .write_tsc_offset = vmx_write_tsc_offset,
10377 .adjust_tsc_offset = vmx_adjust_tsc_offset, 10833 .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
10378 .compute_tsc_offset = vmx_compute_tsc_offset,
10379 .read_l1_tsc = vmx_read_l1_tsc, 10834 .read_l1_tsc = vmx_read_l1_tsc,
10380 10835
10381 .set_tdp_cr3 = vmx_set_cr3, 10836 .set_tdp_cr3 = vmx_set_cr3,
@@ -10394,7 +10849,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
10394 .flush_log_dirty = vmx_flush_log_dirty, 10849 .flush_log_dirty = vmx_flush_log_dirty,
10395 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10850 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
10396 10851
10852 .pre_block = vmx_pre_block,
10853 .post_block = vmx_post_block,
10854
10397 .pmu_ops = &intel_pmu_ops, 10855 .pmu_ops = &intel_pmu_ops,
10856
10857 .update_pi_irte = vmx_update_pi_irte,
10398}; 10858};
10399 10859
10400static int __init vmx_init(void) 10860static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bda65690788e..eed32283d22c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,8 @@
51#include <linux/pci.h> 51#include <linux/pci.h>
52#include <linux/timekeeper_internal.h> 52#include <linux/timekeeper_internal.h>
53#include <linux/pvclock_gtod.h> 53#include <linux/pvclock_gtod.h>
54#include <linux/kvm_irqfd.h>
55#include <linux/irqbypass.h>
54#include <trace/events/kvm.h> 56#include <trace/events/kvm.h>
55 57
56#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
64#include <asm/fpu/internal.h> /* Ugh! */ 66#include <asm/fpu/internal.h> /* Ugh! */
65#include <asm/pvclock.h> 67#include <asm/pvclock.h>
66#include <asm/div64.h> 68#include <asm/div64.h>
69#include <asm/irq_remapping.h>
67 70
68#define MAX_IO_MSRS 256 71#define MAX_IO_MSRS 256
69#define KVM_MAX_MCE_BANKS 32 72#define KVM_MAX_MCE_BANKS 32
@@ -90,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
90static void process_nmi(struct kvm_vcpu *vcpu); 93static void process_nmi(struct kvm_vcpu *vcpu);
91static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 94static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
92 95
93struct kvm_x86_ops *kvm_x86_ops; 96struct kvm_x86_ops *kvm_x86_ops __read_mostly;
94EXPORT_SYMBOL_GPL(kvm_x86_ops); 97EXPORT_SYMBOL_GPL(kvm_x86_ops);
95 98
96static bool ignore_msrs = 0; 99static bool __read_mostly ignore_msrs = 0;
97module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); 100module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
98 101
99unsigned int min_timer_period_us = 500; 102unsigned int min_timer_period_us = 500;
@@ -102,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
102static bool __read_mostly kvmclock_periodic_sync = true; 105static bool __read_mostly kvmclock_periodic_sync = true;
103module_param(kvmclock_periodic_sync, bool, S_IRUGO); 106module_param(kvmclock_periodic_sync, bool, S_IRUGO);
104 107
105bool kvm_has_tsc_control; 108bool __read_mostly kvm_has_tsc_control;
106EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 109EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
107u32 kvm_max_guest_tsc_khz; 110u32 __read_mostly kvm_max_guest_tsc_khz;
108EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 111EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
112u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
113EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
114u64 __read_mostly kvm_max_tsc_scaling_ratio;
115EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
116static u64 __read_mostly kvm_default_tsc_scaling_ratio;
109 117
110/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ 118/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
111static u32 tsc_tolerance_ppm = 250; 119static u32 __read_mostly tsc_tolerance_ppm = 250;
112module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 120module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
113 121
114/* lapic timer advance (tscdeadline mode only) in nanoseconds */ 122/* lapic timer advance (tscdeadline mode only) in nanoseconds */
115unsigned int lapic_timer_advance_ns = 0; 123unsigned int __read_mostly lapic_timer_advance_ns = 0;
116module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); 124module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
117 125
118static bool backwards_tsc_observed = false; 126static bool __read_mostly backwards_tsc_observed = false;
119 127
120#define KVM_NR_SHARED_MSRS 16 128#define KVM_NR_SHARED_MSRS 16
121 129
@@ -622,7 +630,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
622 if ((cr0 ^ old_cr0) & update_bits) 630 if ((cr0 ^ old_cr0) & update_bits)
623 kvm_mmu_reset_context(vcpu); 631 kvm_mmu_reset_context(vcpu);
624 632
625 if ((cr0 ^ old_cr0) & X86_CR0_CD) 633 if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
634 kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
635 !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
626 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); 636 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
627 637
628 return 0; 638 return 0;
@@ -789,7 +799,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
789{ 799{
790 if (cr8 & CR8_RESERVED_BITS) 800 if (cr8 & CR8_RESERVED_BITS)
791 return 1; 801 return 1;
792 if (irqchip_in_kernel(vcpu->kvm)) 802 if (lapic_in_kernel(vcpu))
793 kvm_lapic_set_tpr(vcpu, cr8); 803 kvm_lapic_set_tpr(vcpu, cr8);
794 else 804 else
795 vcpu->arch.cr8 = cr8; 805 vcpu->arch.cr8 = cr8;
@@ -799,7 +809,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
799 809
800unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 810unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
801{ 811{
802 if (irqchip_in_kernel(vcpu->kvm)) 812 if (lapic_in_kernel(vcpu))
803 return kvm_lapic_get_cr8(vcpu); 813 return kvm_lapic_get_cr8(vcpu);
804 else 814 else
805 return vcpu->arch.cr8; 815 return vcpu->arch.cr8;
@@ -953,6 +963,9 @@ static u32 emulated_msrs[] = {
953 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 963 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
954 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 964 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
955 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 965 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
966 HV_X64_MSR_RESET,
967 HV_X64_MSR_VP_INDEX,
968 HV_X64_MSR_VP_RUNTIME,
956 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 969 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
957 MSR_KVM_PV_EOI_EN, 970 MSR_KVM_PV_EOI_EN,
958 971
@@ -1241,14 +1254,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1241 return v; 1254 return v;
1242} 1255}
1243 1256
1244static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) 1257static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1258{
1259 u64 ratio;
1260
1261 /* Guest TSC same frequency as host TSC? */
1262 if (!scale) {
1263 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1264 return 0;
1265 }
1266
1267 /* TSC scaling supported? */
1268 if (!kvm_has_tsc_control) {
1269 if (user_tsc_khz > tsc_khz) {
1270 vcpu->arch.tsc_catchup = 1;
1271 vcpu->arch.tsc_always_catchup = 1;
1272 return 0;
1273 } else {
1274 WARN(1, "user requested TSC rate below hardware speed\n");
1275 return -1;
1276 }
1277 }
1278
1279 /* TSC scaling required - calculate ratio */
1280 ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1281 user_tsc_khz, tsc_khz);
1282
1283 if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1284 WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1285 user_tsc_khz);
1286 return -1;
1287 }
1288
1289 vcpu->arch.tsc_scaling_ratio = ratio;
1290 return 0;
1291}
1292
1293static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1245{ 1294{
1246 u32 thresh_lo, thresh_hi; 1295 u32 thresh_lo, thresh_hi;
1247 int use_scaling = 0; 1296 int use_scaling = 0;
1248 1297
1249 /* tsc_khz can be zero if TSC calibration fails */ 1298 /* tsc_khz can be zero if TSC calibration fails */
1250 if (this_tsc_khz == 0) 1299 if (this_tsc_khz == 0) {
1251 return; 1300 /* set tsc_scaling_ratio to a safe value */
1301 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1302 return -1;
1303 }
1252 1304
1253 /* Compute a scale to convert nanoseconds in TSC cycles */ 1305 /* Compute a scale to convert nanoseconds in TSC cycles */
1254 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1306 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
@@ -1268,7 +1320,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1268 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); 1320 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1269 use_scaling = 1; 1321 use_scaling = 1;
1270 } 1322 }
1271 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); 1323 return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1272} 1324}
1273 1325
1274static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1326static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1314,6 +1366,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1314 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; 1366 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1315} 1367}
1316 1368
1369/*
1370 * Multiply tsc by a fixed point number represented by ratio.
1371 *
1372 * The most significant 64-N bits (mult) of ratio represent the
1373 * integral part of the fixed point number; the remaining N bits
1374 * (frac) represent the fractional part, i.e. ratio represents a fixed
1375 * point number (mult + frac * 2^(-N)).
1376 *
1377 * N equals kvm_tsc_scaling_ratio_frac_bits.
1378 */
1379static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1380{
1381 return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1382}
1383
1384u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1385{
1386 u64 _tsc = tsc;
1387 u64 ratio = vcpu->arch.tsc_scaling_ratio;
1388
1389 if (ratio != kvm_default_tsc_scaling_ratio)
1390 _tsc = __scale_tsc(ratio, tsc);
1391
1392 return _tsc;
1393}
1394EXPORT_SYMBOL_GPL(kvm_scale_tsc);
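For illustration only (not part of the patch): a standalone user-space sketch of the fixed-point scaling performed by __scale_tsc(), assuming 48 fractional bits as advertised by VMX (SVM uses 32); the mul_u64_u64_shr() below is a stand-in for the kernel helper of the same name.

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 48	/* kvm_tsc_scaling_ratio_frac_bits on VMX (assumed) */

/* user-space stand-in for the kernel's mul_u64_u64_shr() */
static uint64_t mul_u64_u64_shr(uint64_t a, uint64_t b, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

int main(void)
{
	uint64_t host_khz = 2600000, guest_khz = 2000000;
	/* ratio = guest_khz / host_khz in 16.48 fixed point, as in set_tsc_khz() */
	uint64_t ratio = (uint64_t)(((unsigned __int128)guest_khz << FRAC_BITS) / host_khz);
	uint64_t host_tsc = 26000000000ULL;	/* 10 s worth of 2.6 GHz host cycles */
	uint64_t guest_tsc = mul_u64_u64_shr(host_tsc, ratio, FRAC_BITS);

	/* prints roughly 20000000000, i.e. 10 s at the slower 2 GHz guest rate */
	printf("ratio=%#llx guest_tsc=%llu\n",
	       (unsigned long long)ratio, (unsigned long long)guest_tsc);
	return 0;
}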
1395
1396static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1397{
1398 u64 tsc;
1399
1400 tsc = kvm_scale_tsc(vcpu, rdtsc());
1401
1402 return target_tsc - tsc;
1403}
1404
1405u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1406{
1407 return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc));
1408}
1409EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1410
1317void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) 1411void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1318{ 1412{
1319 struct kvm *kvm = vcpu->kvm; 1413 struct kvm *kvm = vcpu->kvm;
@@ -1325,7 +1419,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1325 u64 data = msr->data; 1419 u64 data = msr->data;
1326 1420
1327 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1421 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1328 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1422 offset = kvm_compute_tsc_offset(vcpu, data);
1329 ns = get_kernel_ns(); 1423 ns = get_kernel_ns();
1330 elapsed = ns - kvm->arch.last_tsc_nsec; 1424 elapsed = ns - kvm->arch.last_tsc_nsec;
1331 1425
@@ -1382,7 +1476,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1382 } else { 1476 } else {
1383 u64 delta = nsec_to_cycles(vcpu, elapsed); 1477 u64 delta = nsec_to_cycles(vcpu, elapsed);
1384 data += delta; 1478 data += delta;
1385 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1479 offset = kvm_compute_tsc_offset(vcpu, data);
1386 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1480 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1387 } 1481 }
1388 matched = true; 1482 matched = true;
@@ -1439,6 +1533,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1439 1533
1440EXPORT_SYMBOL_GPL(kvm_write_tsc); 1534EXPORT_SYMBOL_GPL(kvm_write_tsc);
1441 1535
1536static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1537 s64 adjustment)
1538{
1539 kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
1540}
1541
1542static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1543{
1544 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1545 WARN_ON(adjustment < 0);
1546 adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1547 kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
1548}
1549
1442#ifdef CONFIG_X86_64 1550#ifdef CONFIG_X86_64
1443 1551
1444static cycle_t read_tsc(void) 1552static cycle_t read_tsc(void)
@@ -1600,7 +1708,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
1600 1708
1601static int kvm_guest_time_update(struct kvm_vcpu *v) 1709static int kvm_guest_time_update(struct kvm_vcpu *v)
1602{ 1710{
1603 unsigned long flags, this_tsc_khz; 1711 unsigned long flags, this_tsc_khz, tgt_tsc_khz;
1604 struct kvm_vcpu_arch *vcpu = &v->arch; 1712 struct kvm_vcpu_arch *vcpu = &v->arch;
1605 struct kvm_arch *ka = &v->kvm->arch; 1713 struct kvm_arch *ka = &v->kvm->arch;
1606 s64 kernel_ns; 1714 s64 kernel_ns;
@@ -1637,7 +1745,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1637 kernel_ns = get_kernel_ns(); 1745 kernel_ns = get_kernel_ns();
1638 } 1746 }
1639 1747
1640 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); 1748 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
1641 1749
1642 /* 1750 /*
1643 * We may have to catch up the TSC to match elapsed wall clock 1751 * We may have to catch up the TSC to match elapsed wall clock
@@ -1663,7 +1771,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1663 return 0; 1771 return 0;
1664 1772
1665 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { 1773 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1666 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, 1774 tgt_tsc_khz = kvm_has_tsc_control ?
1775 vcpu->virtual_tsc_khz : this_tsc_khz;
1776 kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
1667 &vcpu->hv_clock.tsc_shift, 1777 &vcpu->hv_clock.tsc_shift,
1668 &vcpu->hv_clock.tsc_to_system_mul); 1778 &vcpu->hv_clock.tsc_to_system_mul);
1669 vcpu->hw_tsc_khz = this_tsc_khz; 1779 vcpu->hw_tsc_khz = this_tsc_khz;
@@ -1898,6 +2008,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1898 2008
1899static void record_steal_time(struct kvm_vcpu *vcpu) 2009static void record_steal_time(struct kvm_vcpu *vcpu)
1900{ 2010{
2011 accumulate_steal_time(vcpu);
2012
1901 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 2013 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1902 return; 2014 return;
1903 2015
@@ -2048,12 +2160,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2048 if (!(data & KVM_MSR_ENABLED)) 2160 if (!(data & KVM_MSR_ENABLED))
2049 break; 2161 break;
2050 2162
2051 vcpu->arch.st.last_steal = current->sched_info.run_delay;
2052
2053 preempt_disable();
2054 accumulate_steal_time(vcpu);
2055 preempt_enable();
2056
2057 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 2163 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2058 2164
2059 break; 2165 break;
@@ -2449,6 +2555,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2449 case KVM_CAP_ENABLE_CAP_VM: 2555 case KVM_CAP_ENABLE_CAP_VM:
2450 case KVM_CAP_DISABLE_QUIRKS: 2556 case KVM_CAP_DISABLE_QUIRKS:
2451 case KVM_CAP_SET_BOOT_CPU_ID: 2557 case KVM_CAP_SET_BOOT_CPU_ID:
2558 case KVM_CAP_SPLIT_IRQCHIP:
2452#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2559#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2453 case KVM_CAP_ASSIGN_DEV_IRQ: 2560 case KVM_CAP_ASSIGN_DEV_IRQ:
2454 case KVM_CAP_PCI_2_3: 2561 case KVM_CAP_PCI_2_3:
@@ -2612,7 +2719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2612 if (tsc_delta < 0) 2719 if (tsc_delta < 0)
2613 mark_tsc_unstable("KVM discovered backwards TSC"); 2720 mark_tsc_unstable("KVM discovered backwards TSC");
2614 if (check_tsc_unstable()) { 2721 if (check_tsc_unstable()) {
2615 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, 2722 u64 offset = kvm_compute_tsc_offset(vcpu,
2616 vcpu->arch.last_guest_tsc); 2723 vcpu->arch.last_guest_tsc);
2617 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2724 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2618 vcpu->arch.tsc_catchup = 1; 2725 vcpu->arch.tsc_catchup = 1;
@@ -2628,7 +2735,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2628 vcpu->cpu = cpu; 2735 vcpu->cpu = cpu;
2629 } 2736 }
2630 2737
2631 accumulate_steal_time(vcpu);
2632 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 2738 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2633} 2739}
2634 2740
@@ -2657,17 +2763,50 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2657 return 0; 2763 return 0;
2658} 2764}
2659 2765
2766static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
2767{
2768 return (!lapic_in_kernel(vcpu) ||
2769 kvm_apic_accept_pic_intr(vcpu));
2770}
2771
2772/*
2773 * Check whether userspace requested an interrupt window and whether
2774 * the interrupt window is open.
2775 *
2776 * No need to exit to userspace if we already have an interrupt queued.
2777 */
2778static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
2779{
2780 return kvm_arch_interrupt_allowed(vcpu) &&
2781 !kvm_cpu_has_interrupt(vcpu) &&
2782 !kvm_event_needs_reinjection(vcpu) &&
2783 kvm_cpu_accept_dm_intr(vcpu);
2784}
2785
2660static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2786static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2661 struct kvm_interrupt *irq) 2787 struct kvm_interrupt *irq)
2662{ 2788{
2663 if (irq->irq >= KVM_NR_INTERRUPTS) 2789 if (irq->irq >= KVM_NR_INTERRUPTS)
2664 return -EINVAL; 2790 return -EINVAL;
2665 if (irqchip_in_kernel(vcpu->kvm)) 2791
2792 if (!irqchip_in_kernel(vcpu->kvm)) {
2793 kvm_queue_interrupt(vcpu, irq->irq, false);
2794 kvm_make_request(KVM_REQ_EVENT, vcpu);
2795 return 0;
2796 }
2797
2798 /*
2799 * With in-kernel LAPIC, we only use this to inject EXTINT, so
2800 * fail for in-kernel 8259.
2801 */
2802 if (pic_in_kernel(vcpu->kvm))
2666 return -ENXIO; 2803 return -ENXIO;
2667 2804
2668 kvm_queue_interrupt(vcpu, irq->irq, false); 2805 if (vcpu->arch.pending_external_vector != -1)
2669 kvm_make_request(KVM_REQ_EVENT, vcpu); 2806 return -EEXIST;
2670 2807
2808 vcpu->arch.pending_external_vector = irq->irq;
2809 kvm_make_request(KVM_REQ_EVENT, vcpu);
2671 return 0; 2810 return 0;
2672} 2811}
2673 2812
@@ -3176,7 +3315,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3176 struct kvm_vapic_addr va; 3315 struct kvm_vapic_addr va;
3177 3316
3178 r = -EINVAL; 3317 r = -EINVAL;
3179 if (!irqchip_in_kernel(vcpu->kvm)) 3318 if (!lapic_in_kernel(vcpu))
3180 goto out; 3319 goto out;
3181 r = -EFAULT; 3320 r = -EFAULT;
3182 if (copy_from_user(&va, argp, sizeof va)) 3321 if (copy_from_user(&va, argp, sizeof va))
@@ -3303,9 +3442,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3303 if (user_tsc_khz == 0) 3442 if (user_tsc_khz == 0)
3304 user_tsc_khz = tsc_khz; 3443 user_tsc_khz = tsc_khz;
3305 3444
3306 kvm_set_tsc_khz(vcpu, user_tsc_khz); 3445 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
3446 r = 0;
3307 3447
3308 r = 0;
3309 goto out; 3448 goto out;
3310 } 3449 }
3311 case KVM_GET_TSC_KHZ: { 3450 case KVM_GET_TSC_KHZ: {
@@ -3425,41 +3564,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3425 3564
3426static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3565static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3427{ 3566{
3428 int r = 0;
3429
3430 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3567 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3431 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 3568 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3432 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3569 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3433 return r; 3570 return 0;
3434} 3571}
3435 3572
3436static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3573static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3437{ 3574{
3438 int r = 0;
3439
3440 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3575 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3441 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 3576 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3442 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 3577 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3443 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3578 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3444 return r; 3579 return 0;
3445} 3580}
3446 3581
3447static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3582static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3448{ 3583{
3449 int r = 0;
3450
3451 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3584 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3452 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 3585 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3453 sizeof(ps->channels)); 3586 sizeof(ps->channels));
3454 ps->flags = kvm->arch.vpit->pit_state.flags; 3587 ps->flags = kvm->arch.vpit->pit_state.flags;
3455 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3588 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3456 memset(&ps->reserved, 0, sizeof(ps->reserved)); 3589 memset(&ps->reserved, 0, sizeof(ps->reserved));
3457 return r; 3590 return 0;
3458} 3591}
3459 3592
3460static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3593static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3461{ 3594{
3462 int r = 0, start = 0; 3595 int start = 0;
3463 u32 prev_legacy, cur_legacy; 3596 u32 prev_legacy, cur_legacy;
3464 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3597 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3465 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 3598 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3471,7 +3604,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3471 kvm->arch.vpit->pit_state.flags = ps->flags; 3604 kvm->arch.vpit->pit_state.flags = ps->flags;
3472 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 3605 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3473 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3606 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3474 return r; 3607 return 0;
3475} 3608}
3476 3609
3477static int kvm_vm_ioctl_reinject(struct kvm *kvm, 3610static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3556,6 +3689,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3556 kvm->arch.disabled_quirks = cap->args[0]; 3689 kvm->arch.disabled_quirks = cap->args[0];
3557 r = 0; 3690 r = 0;
3558 break; 3691 break;
3692 case KVM_CAP_SPLIT_IRQCHIP: {
3693 mutex_lock(&kvm->lock);
3694 r = -EINVAL;
3695 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
3696 goto split_irqchip_unlock;
3697 r = -EEXIST;
3698 if (irqchip_in_kernel(kvm))
3699 goto split_irqchip_unlock;
3700 if (atomic_read(&kvm->online_vcpus))
3701 goto split_irqchip_unlock;
3702 r = kvm_setup_empty_irq_routing(kvm);
3703 if (r)
3704 goto split_irqchip_unlock;
3705 /* Pairs with irqchip_in_kernel. */
3706 smp_wmb();
3707 kvm->arch.irqchip_split = true;
3708 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
3709 r = 0;
3710split_irqchip_unlock:
3711 mutex_unlock(&kvm->lock);
3712 break;
3713 }
3559 default: 3714 default:
3560 r = -EINVAL; 3715 r = -EINVAL;
3561 break; 3716 break;
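A hypothetical user-space sketch of enabling the capability added above (vm_fd and the pin count are assumptions): the cap must be enabled on a fresh VM, before any vCPU or in-kernel irqchip exists, which is what the -EEXIST checks enforce.

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_split_irqchip(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_SPLIT_IRQCHIP;
	cap.args[0] = 24;	/* IOAPIC pins reserved for the user-space IOAPIC */

	/* issued on the VM fd; must precede KVM_CREATE_IRQCHIP and vCPU creation */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}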
@@ -3669,7 +3824,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3669 } 3824 }
3670 3825
3671 r = -ENXIO; 3826 r = -ENXIO;
3672 if (!irqchip_in_kernel(kvm)) 3827 if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
3673 goto get_irqchip_out; 3828 goto get_irqchip_out;
3674 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 3829 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3675 if (r) 3830 if (r)
@@ -3693,7 +3848,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3693 } 3848 }
3694 3849
3695 r = -ENXIO; 3850 r = -ENXIO;
3696 if (!irqchip_in_kernel(kvm)) 3851 if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
3697 goto set_irqchip_out; 3852 goto set_irqchip_out;
3698 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 3853 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3699 if (r) 3854 if (r)
@@ -4060,6 +4215,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4060 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 4215 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
4061} 4216}
4062 4217
4218static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
4219 unsigned long addr, void *val, unsigned int bytes)
4220{
4221 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4222 int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
4223
4224 return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
4225}
4226
4063int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, 4227int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4064 gva_t addr, void *val, 4228 gva_t addr, void *val,
4065 unsigned int bytes, 4229 unsigned int bytes,
@@ -4795,6 +4959,7 @@ static const struct x86_emulate_ops emulate_ops = {
4795 .write_gpr = emulator_write_gpr, 4959 .write_gpr = emulator_write_gpr,
4796 .read_std = kvm_read_guest_virt_system, 4960 .read_std = kvm_read_guest_virt_system,
4797 .write_std = kvm_write_guest_virt_system, 4961 .write_std = kvm_write_guest_virt_system,
4962 .read_phys = kvm_read_guest_phys_system,
4798 .fetch = kvm_fetch_guest_virt, 4963 .fetch = kvm_fetch_guest_virt,
4799 .read_emulated = emulator_read_emulated, 4964 .read_emulated = emulator_read_emulated,
4800 .write_emulated = emulator_write_emulated, 4965 .write_emulated = emulator_write_emulated,
@@ -5667,7 +5832,7 @@ void kvm_arch_exit(void)
5667int kvm_vcpu_halt(struct kvm_vcpu *vcpu) 5832int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
5668{ 5833{
5669 ++vcpu->stat.halt_exits; 5834 ++vcpu->stat.halt_exits;
5670 if (irqchip_in_kernel(vcpu->kvm)) { 5835 if (lapic_in_kernel(vcpu)) {
5671 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 5836 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5672 return 1; 5837 return 1;
5673 } else { 5838 } else {
@@ -5766,17 +5931,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5766 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 5931 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5767} 5932}
5768 5933
5769/*
5770 * Check if userspace requested an interrupt window, and that the
5771 * interrupt window is open.
5772 *
5773 * No need to exit to userspace if we already have an interrupt queued.
5774 */
5775static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 5934static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5776{ 5935{
5777 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 5936 return vcpu->run->request_interrupt_window &&
5778 vcpu->run->request_interrupt_window && 5937 likely(!pic_in_kernel(vcpu->kvm));
5779 kvm_arch_interrupt_allowed(vcpu));
5780} 5938}
5781 5939
5782static void post_kvm_run_save(struct kvm_vcpu *vcpu) 5940static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5787,13 +5945,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5787 kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; 5945 kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
5788 kvm_run->cr8 = kvm_get_cr8(vcpu); 5946 kvm_run->cr8 = kvm_get_cr8(vcpu);
5789 kvm_run->apic_base = kvm_get_apic_base(vcpu); 5947 kvm_run->apic_base = kvm_get_apic_base(vcpu);
5790 if (irqchip_in_kernel(vcpu->kvm)) 5948 kvm_run->ready_for_interrupt_injection =
5791 kvm_run->ready_for_interrupt_injection = 1; 5949 pic_in_kernel(vcpu->kvm) ||
5792 else 5950 kvm_vcpu_ready_for_interrupt_injection(vcpu);
5793 kvm_run->ready_for_interrupt_injection =
5794 kvm_arch_interrupt_allowed(vcpu) &&
5795 !kvm_cpu_has_interrupt(vcpu) &&
5796 !kvm_event_needs_reinjection(vcpu);
5797} 5951}
5798 5952
5799static void update_cr8_intercept(struct kvm_vcpu *vcpu) 5953static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6144,18 +6298,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
6144 6298
6145static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 6299static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6146{ 6300{
6147 u64 eoi_exit_bitmap[4];
6148 u32 tmr[8];
6149
6150 if (!kvm_apic_hw_enabled(vcpu->arch.apic)) 6301 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
6151 return; 6302 return;
6152 6303
6153 memset(eoi_exit_bitmap, 0, 32); 6304 memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
6154 memset(tmr, 0, 32);
6155 6305
6156 kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); 6306 if (irqchip_split(vcpu->kvm))
6157 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); 6307 kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
6158 kvm_apic_update_tmr(vcpu, tmr); 6308 else {
6309 kvm_x86_ops->sync_pir_to_irr(vcpu);
6310 kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
6311 }
6312 kvm_x86_ops->load_eoi_exitmap(vcpu);
6159} 6313}
6160 6314
6161static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) 6315static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6168,7 +6322,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
6168{ 6322{
6169 struct page *page = NULL; 6323 struct page *page = NULL;
6170 6324
6171 if (!irqchip_in_kernel(vcpu->kvm)) 6325 if (!lapic_in_kernel(vcpu))
6172 return; 6326 return;
6173 6327
6174 if (!kvm_x86_ops->set_apic_access_page_addr) 6328 if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6206,8 +6360,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
6206static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 6360static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6207{ 6361{
6208 int r; 6362 int r;
6209 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 6363 bool req_int_win =
6210 vcpu->run->request_interrupt_window; 6364 dm_request_for_irq_injection(vcpu) &&
6365 kvm_cpu_accept_dm_intr(vcpu);
6366
6211 bool req_immediate_exit = false; 6367 bool req_immediate_exit = false;
6212 6368
6213 if (vcpu->requests) { 6369 if (vcpu->requests) {
@@ -6258,6 +6414,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6258 kvm_pmu_handle_event(vcpu); 6414 kvm_pmu_handle_event(vcpu);
6259 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 6415 if (kvm_check_request(KVM_REQ_PMI, vcpu))
6260 kvm_pmu_deliver_pmi(vcpu); 6416 kvm_pmu_deliver_pmi(vcpu);
6417 if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
6418 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
6419 if (test_bit(vcpu->arch.pending_ioapic_eoi,
6420 (void *) vcpu->arch.eoi_exit_bitmap)) {
6421 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
6422 vcpu->run->eoi.vector =
6423 vcpu->arch.pending_ioapic_eoi;
6424 r = 0;
6425 goto out;
6426 }
6427 }
6261 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) 6428 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
6262 vcpu_scan_ioapic(vcpu); 6429 vcpu_scan_ioapic(vcpu);
6263 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) 6430 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6268,6 +6435,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6268 r = 0; 6435 r = 0;
6269 goto out; 6436 goto out;
6270 } 6437 }
6438 if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
6439 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
6440 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
6441 r = 0;
6442 goto out;
6443 }
6444 }
6445
6446 /*
6447 * KVM_REQ_EVENT is not set when posted interrupts are set by
6448 * VT-d hardware, so we have to update RVI unconditionally.
6449 */
6450 if (kvm_lapic_enabled(vcpu)) {
6451 /*
6452 * Update architecture specific hints for APIC
6453 * virtual interrupt delivery.
6454 */
6455 if (kvm_x86_ops->hwapic_irr_update)
6456 kvm_x86_ops->hwapic_irr_update(vcpu,
6457 kvm_lapic_find_highest_irr(vcpu));
6271 } 6458 }
6272 6459
6273 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 6460 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6286,13 +6473,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6286 kvm_x86_ops->enable_irq_window(vcpu); 6473 kvm_x86_ops->enable_irq_window(vcpu);
6287 6474
6288 if (kvm_lapic_enabled(vcpu)) { 6475 if (kvm_lapic_enabled(vcpu)) {
6289 /*
6290 * Update architecture specific hints for APIC
6291 * virtual interrupt delivery.
6292 */
6293 if (kvm_x86_ops->hwapic_irr_update)
6294 kvm_x86_ops->hwapic_irr_update(vcpu,
6295 kvm_lapic_find_highest_irr(vcpu));
6296 update_cr8_intercept(vcpu); 6476 update_cr8_intercept(vcpu);
6297 kvm_lapic_sync_to_vapic(vcpu); 6477 kvm_lapic_sync_to_vapic(vcpu);
6298 } 6478 }
@@ -6376,8 +6556,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6376 if (hw_breakpoint_active()) 6556 if (hw_breakpoint_active())
6377 hw_breakpoint_restore(); 6557 hw_breakpoint_restore();
6378 6558
6379 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 6559 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
6380 rdtsc());
6381 6560
6382 vcpu->mode = OUTSIDE_GUEST_MODE; 6561 vcpu->mode = OUTSIDE_GUEST_MODE;
6383 smp_wmb(); 6562 smp_wmb();
@@ -6428,10 +6607,15 @@ out:
6428 6607
6429static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) 6608static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
6430{ 6609{
6431 if (!kvm_arch_vcpu_runnable(vcpu)) { 6610 if (!kvm_arch_vcpu_runnable(vcpu) &&
6611 (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
6432 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6612 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6433 kvm_vcpu_block(vcpu); 6613 kvm_vcpu_block(vcpu);
6434 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6614 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6615
6616 if (kvm_x86_ops->post_block)
6617 kvm_x86_ops->post_block(vcpu);
6618
6435 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) 6619 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
6436 return 1; 6620 return 1;
6437 } 6621 }
@@ -6468,10 +6652,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
6468 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6652 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6469 6653
6470 for (;;) { 6654 for (;;) {
6471 if (kvm_vcpu_running(vcpu)) 6655 if (kvm_vcpu_running(vcpu)) {
6472 r = vcpu_enter_guest(vcpu); 6656 r = vcpu_enter_guest(vcpu);
6473 else 6657 } else {
6474 r = vcpu_block(kvm, vcpu); 6658 r = vcpu_block(kvm, vcpu);
6659 }
6660
6475 if (r <= 0) 6661 if (r <= 0)
6476 break; 6662 break;
6477 6663
@@ -6479,9 +6665,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
6479 if (kvm_cpu_has_pending_timer(vcpu)) 6665 if (kvm_cpu_has_pending_timer(vcpu))
6480 kvm_inject_pending_timer_irqs(vcpu); 6666 kvm_inject_pending_timer_irqs(vcpu);
6481 6667
6482 if (dm_request_for_irq_injection(vcpu)) { 6668 if (dm_request_for_irq_injection(vcpu) &&
6483 r = -EINTR; 6669 kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
6484 vcpu->run->exit_reason = KVM_EXIT_INTR; 6670 r = 0;
6671 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
6485 ++vcpu->stat.request_irq_exits; 6672 ++vcpu->stat.request_irq_exits;
6486 break; 6673 break;
6487 } 6674 }
@@ -6608,7 +6795,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6608 } 6795 }
6609 6796
6610 /* re-sync apic's tpr */ 6797 /* re-sync apic's tpr */
6611 if (!irqchip_in_kernel(vcpu->kvm)) { 6798 if (!lapic_in_kernel(vcpu)) {
6612 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 6799 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
6613 r = -EINVAL; 6800 r = -EINVAL;
6614 goto out; 6801 goto out;
@@ -6932,7 +7119,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6932 */ 7119 */
6933 kvm_set_rflags(vcpu, rflags); 7120 kvm_set_rflags(vcpu, rflags);
6934 7121
6935 kvm_x86_ops->update_db_bp_intercept(vcpu); 7122 kvm_x86_ops->update_bp_intercept(vcpu);
6936 7123
6937 r = 0; 7124 r = 0;
6938 7125
@@ -7281,6 +7468,20 @@ int kvm_arch_hardware_setup(void)
7281 if (r != 0) 7468 if (r != 0)
7282 return r; 7469 return r;
7283 7470
7471 if (kvm_has_tsc_control) {
7472 /*
7473 * Make sure the user can only configure tsc_khz values that
7474 * fit into a signed integer.
7475 * A min value is not calculated because it will always
7476 * be 1 on all machines.
7477 */
7478 u64 max = min(0x7fffffffULL,
7479 __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
7480 kvm_max_guest_tsc_khz = max;
7481
7482 kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
7483 }
7484
7284 kvm_init_msr_list(); 7485 kvm_init_msr_list();
7285 return 0; 7486 return 0;
7286} 7487}
@@ -7308,7 +7509,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
7308 7509
7309bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) 7510bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
7310{ 7511{
7311 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); 7512 return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
7312} 7513}
7313 7514
7314struct static_key kvm_no_apic_vcpu __read_mostly; 7515struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7377,6 +7578,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7377 kvm_async_pf_hash_reset(vcpu); 7578 kvm_async_pf_hash_reset(vcpu);
7378 kvm_pmu_init(vcpu); 7579 kvm_pmu_init(vcpu);
7379 7580
7581 vcpu->arch.pending_external_vector = -1;
7582
7380 return 0; 7583 return 0;
7381 7584
7382fail_free_mce_banks: 7585fail_free_mce_banks:
@@ -7402,7 +7605,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
7402 kvm_mmu_destroy(vcpu); 7605 kvm_mmu_destroy(vcpu);
7403 srcu_read_unlock(&vcpu->kvm->srcu, idx); 7606 srcu_read_unlock(&vcpu->kvm->srcu, idx);
7404 free_page((unsigned long)vcpu->arch.pio_data); 7607 free_page((unsigned long)vcpu->arch.pio_data);
7405 if (!irqchip_in_kernel(vcpu->kvm)) 7608 if (!lapic_in_kernel(vcpu))
7406 static_key_slow_dec(&kvm_no_apic_vcpu); 7609 static_key_slow_dec(&kvm_no_apic_vcpu);
7407} 7610}
7408 7611
@@ -8029,7 +8232,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
8029} 8232}
8030EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); 8233EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
8031 8234
8235int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
8236 struct irq_bypass_producer *prod)
8237{
8238 struct kvm_kernel_irqfd *irqfd =
8239 container_of(cons, struct kvm_kernel_irqfd, consumer);
8240
8241 if (kvm_x86_ops->update_pi_irte) {
8242 irqfd->producer = prod;
8243 return kvm_x86_ops->update_pi_irte(irqfd->kvm,
8244 prod->irq, irqfd->gsi, 1);
8245 }
8246
8247 return -EINVAL;
8248}
8249
8250void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
8251 struct irq_bypass_producer *prod)
8252{
8253 int ret;
8254 struct kvm_kernel_irqfd *irqfd =
8255 container_of(cons, struct kvm_kernel_irqfd, consumer);
8256
8257 if (!kvm_x86_ops->update_pi_irte) {
8258 WARN_ON(irqfd->producer != NULL);
8259 return;
8260 }
8261
8262 WARN_ON(irqfd->producer != prod);
8263 irqfd->producer = NULL;
8264
8265 /*
8266 * When the producer of a consumer is unregistered, we change back to
8267 * remapped mode, so we can re-use the current implementation
8268 * when the irq is masked/disabled or the consumer side (KVM
8269 * in this case) doesn't want to receive the interrupts.
8270 */
8271 ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
8272 if (ret)
8273 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
8274 " fails: %d\n", irqfd->consumer.token, ret);
8275}
8276
8277int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
8278 uint32_t guest_irq, bool set)
8279{
8280 if (!kvm_x86_ops->update_pi_irte)
8281 return -EINVAL;
8282
8283 return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
8284}
8285
8032EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 8286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
8287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
8033EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 8288EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
8034EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 8289EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
8035EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 8290EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8044,3 +8299,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
8044EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 8299EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
8045EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); 8300EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
8046EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); 8301EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
8302EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 1bf417e9cc13..0f1c6fc3ddd8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -89,7 +89,7 @@ static struct addr_marker address_markers[] = {
89 { 0/* VMALLOC_START */, "vmalloc() Area" }, 89 { 0/* VMALLOC_START */, "vmalloc() Area" },
90 { 0/*VMALLOC_END*/, "vmalloc() End" }, 90 { 0/*VMALLOC_END*/, "vmalloc() End" },
91# ifdef CONFIG_HIGHMEM 91# ifdef CONFIG_HIGHMEM
92 { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, 92 { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
93# endif 93# endif
94 { 0/*FIXADDR_START*/, "Fixmap Area" }, 94 { 0/*FIXADDR_START*/, "Fixmap Area" },
95#endif 95#endif
@@ -358,6 +358,21 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
358#define pgd_none(a) pud_none(__pud(pgd_val(a))) 358#define pgd_none(a) pud_none(__pud(pgd_val(a)))
359#endif 359#endif
360 360
361#ifdef CONFIG_X86_64
362static inline bool is_hypervisor_range(int idx)
363{
364 /*
365 * ffff800000000000 - ffff87ffffffffff is reserved for
366 * the hypervisor.
367 */
368 return paravirt_enabled() &&
369 (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
370 (idx < pgd_index(__PAGE_OFFSET));
371}
372#else
373static inline bool is_hypervisor_range(int idx) { return false; }
374#endif
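For reference, a worked example of what this helper skips, assuming the stock 4-level layout of this kernel (PGDIR_SHIFT == 39, PTRS_PER_PGD == 512, __PAGE_OFFSET == 0xffff880000000000):

/*
 * pgd_index(__PAGE_OFFSET) = (0xffff880000000000 >> 39) & 511 = 272
 *
 * so indices 256..271 are treated as hypervisor-owned: 16 PGD slots of
 * 512 GB each, i.e. the 8 TB hole ffff800000000000 - ffff87ffffffffff
 * named in the comment above.
 */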
375
361static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 376static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
362 bool checkwx) 377 bool checkwx)
363{ 378{
@@ -381,7 +396,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
381 396
382 for (i = 0; i < PTRS_PER_PGD; i++) { 397 for (i = 0; i < PTRS_PER_PGD; i++) {
383 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 398 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
384 if (!pgd_none(*start)) { 399 if (!pgd_none(*start) && !is_hypervisor_range(i)) {
385 if (pgd_large(*start) || !pgd_present(*start)) { 400 if (pgd_large(*start) || !pgd_present(*start)) {
386 prot = pgd_flags(*start); 401 prot = pgd_flags(*start);
387 note_page(m, &st, __pgprot(prot), 1); 402 note_page(m, &st, __pgprot(prot), 1);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index eecb207a2037..a6d739258137 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,20 +104,6 @@ void __kunmap_atomic(void *kvaddr)
104} 104}
105EXPORT_SYMBOL(__kunmap_atomic); 105EXPORT_SYMBOL(__kunmap_atomic);
106 106
107struct page *kmap_atomic_to_page(void *ptr)
108{
109 unsigned long idx, vaddr = (unsigned long)ptr;
110 pte_t *pte;
111
112 if (vaddr < FIXADDR_START)
113 return virt_to_page(ptr);
114
115 idx = virt_to_fix(vaddr);
116 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
117 return pte_page(*pte);
118}
119EXPORT_SYMBOL(kmap_atomic_to_page);
120
121void __init set_highmem_pages_init(void) 107void __init set_highmem_pages_init(void)
122{ 108{
123 struct zone *zone; 109 struct zone *zone;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1f37cb2b56a9..493f54172b4a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -354,7 +354,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
354 } 354 }
355 355
356 for (i = 0; i < nr_range; i++) 356 for (i = 0; i < nr_range; i++)
357 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", 357 pr_debug(" [mem %#010lx-%#010lx] page %s\n",
358 mr[i].start, mr[i].end - 1, 358 mr[i].start, mr[i].end - 1,
359 page_size_string(&mr[i])); 359 page_size_string(&mr[i]));
360 360
@@ -401,7 +401,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
401 unsigned long ret = 0; 401 unsigned long ret = 0;
402 int nr_range, i; 402 int nr_range, i;
403 403
404 pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", 404 pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
405 start, end - 1); 405 start, end - 1);
406 406
407 memset(mr, 0, sizeof(mr)); 407 memset(mr, 0, sizeof(mr));
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5ed62eff31bd..ec081fe0ce2c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1270,7 +1270,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1270 /* check to see if we have contiguous blocks */ 1270 /* check to see if we have contiguous blocks */
1271 if (p_end != p || node_start != node) { 1271 if (p_end != p || node_start != node) {
1272 if (p_start) 1272 if (p_start)
1273 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", 1273 pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1274 addr_start, addr_end-1, p_start, p_end-1, node_start); 1274 addr_start, addr_end-1, p_start, p_end-1, node_start);
1275 addr_start = addr; 1275 addr_start = addr;
1276 node_start = node; 1276 node_start = node;
@@ -1368,7 +1368,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
1368void __meminit vmemmap_populate_print_last(void) 1368void __meminit vmemmap_populate_print_last(void)
1369{ 1369{
1370 if (p_start) { 1370 if (p_start) {
1371 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", 1371 pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1372 addr_start, addr_end-1, p_start, p_end-1, node_start); 1372 addr_start, addr_end-1, p_start, p_end-1, node_start);
1373 p_start = NULL; 1373 p_start = NULL;
1374 p_end = NULL; 1374 p_end = NULL;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9ce5da27b136..d470cf219a2d 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -126,5 +126,5 @@ void __init kasan_init(void)
126 __flush_tlb_all(); 126 __flush_tlb_all();
127 init_task.kasan_depth = 0; 127 init_task.kasan_depth = 0;
128 128
129 pr_info("Kernel address sanitizer initialized\n"); 129 pr_info("KernelAddressSanitizer initialized\n");
130} 130}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index b0ae85f90f10..b2fd67da1701 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
101 switch (type) { 101 switch (type) {
102 case REG_TYPE_RM: 102 case REG_TYPE_RM:
103 regno = X86_MODRM_RM(insn->modrm.value); 103 regno = X86_MODRM_RM(insn->modrm.value);
104 if (X86_REX_B(insn->rex_prefix.value) == 1) 104 if (X86_REX_B(insn->rex_prefix.value))
105 regno += 8; 105 regno += 8;
106 break; 106 break;
107 107
108 case REG_TYPE_INDEX: 108 case REG_TYPE_INDEX:
109 regno = X86_SIB_INDEX(insn->sib.value); 109 regno = X86_SIB_INDEX(insn->sib.value);
110 if (X86_REX_X(insn->rex_prefix.value) == 1) 110 if (X86_REX_X(insn->rex_prefix.value))
111 regno += 8; 111 regno += 8;
112 break; 112 break;
113 113
114 case REG_TYPE_BASE: 114 case REG_TYPE_BASE:
115 regno = X86_SIB_BASE(insn->sib.value); 115 regno = X86_SIB_BASE(insn->sib.value);
116 if (X86_REX_B(insn->rex_prefix.value) == 1) 116 if (X86_REX_B(insn->rex_prefix.value))
117 regno += 8; 117 regno += 8;
118 break; 118 break;
119 119
@@ -586,6 +586,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
586} 586}
587 587
588/* 588/*
589 * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
590 * we might run off the end of the bounds table if we are on
591 * a 64-bit kernel and try to get 8 bytes.
592 */
593int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
594 long __user *bd_entry_ptr)
595{
596 u32 bd_entry_32;
597 int ret;
598
599 if (is_64bit_mm(mm))
600 return get_user(*bd_entry_ret, bd_entry_ptr);
601
602 /*
603 * Note that get_user() uses the type of the *pointer* to
604 * establish the size of the get, not the destination.
605 */
606 ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
607 *bd_entry_ret = bd_entry_32;
608 return ret;
609}
610
611/*
589 * Get the base of bounds tables pointed by specific bounds 612 * Get the base of bounds tables pointed by specific bounds
590 * directory entry. 613 * directory entry.
591 */ 614 */
@@ -605,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm,
605 int need_write = 0; 628 int need_write = 0;
606 629
607 pagefault_disable(); 630 pagefault_disable();
608 ret = get_user(bd_entry, bd_entry_ptr); 631 ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
609 pagefault_enable(); 632 pagefault_enable();
610 if (!ret) 633 if (!ret)
611 break; 634 break;
@@ -700,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
700 */ 723 */
701static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) 724static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
702{ 725{
703 unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); 726 unsigned long long virt_space;
704 if (is_64bit_mm(mm)) 727 unsigned long long GB = (1ULL << 30);
705 return virt_space / MPX_BD_NR_ENTRIES_64; 728
706 else 729 /*
707 return virt_space / MPX_BD_NR_ENTRIES_32; 730 * This covers 32-bit emulation as well as 32-bit kernels
731 * running on 64-bit hardware.
732 */
733 if (!is_64bit_mm(mm))
734 return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
735
736 /*
737 * 'x86_virt_bits' returns what the hardware is capable
738 * of, and returns the full >32-bit address space when
739 * running 32-bit kernels on 64-bit hardware.
740 */
741 virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
742 return virt_space / MPX_BD_NR_ENTRIES_64;
708} 743}
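A worked example of the per-directory-entry coverage computed above, under the MPX layout this code assumes (2^20 four-byte directory entries on 32-bit, 2^28 eight-byte entries on 64-bit, and x86_virt_bits == 48 on typical 64-bit hardware):

/*
 * 32-bit mm:  4 GB / 2^20 entries = 4 KB of virtual space per BD entry
 * 64-bit mm:  2^48 / 2^28 entries = 1 MB of virtual space per BD entry
 */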
709 744
710/* 745/*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 70efcd0940f9..75991979f667 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1109,7 +1109,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
1109 bpf_flush_icache(header, image + proglen); 1109 bpf_flush_icache(header, image + proglen);
1110 set_memory_ro((unsigned long)header, header->pages); 1110 set_memory_ro((unsigned long)header, header->pages);
1111 prog->bpf_func = (void *)image; 1111 prog->bpf_func = (void *)image;
1112 prog->jited = true; 1112 prog->jited = 1;
1113 } 1113 }
1114out: 1114out:
1115 kfree(addrs); 1115 kfree(addrs);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index ff9911707160..3cd69832d7f4 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,16 +4,15 @@
4#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/dmi.h> 5#include <linux/dmi.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/pci-acpi.h>
7#include <asm/numa.h> 8#include <asm/numa.h>
8#include <asm/pci_x86.h> 9#include <asm/pci_x86.h>
9 10
10struct pci_root_info { 11struct pci_root_info {
11 struct acpi_device *bridge; 12 struct acpi_pci_root_info common;
12 char name[16];
13 struct pci_sysdata sd; 13 struct pci_sysdata sd;
14#ifdef CONFIG_PCI_MMCONFIG 14#ifdef CONFIG_PCI_MMCONFIG
15 bool mcfg_added; 15 bool mcfg_added;
16 u16 segment;
17 u8 start_bus; 16 u8 start_bus;
18 u8 end_bus; 17 u8 end_bus;
19#endif 18#endif
@@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr)
178 return 0; 177 return 0;
179} 178}
180 179
181static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, 180static int setup_mcfg_map(struct acpi_pci_root_info *ci)
182 u8 end, phys_addr_t addr)
183{ 181{
184 int result; 182 int result, seg;
185 struct device *dev = &info->bridge->dev; 183 struct pci_root_info *info;
184 struct acpi_pci_root *root = ci->root;
185 struct device *dev = &ci->bridge->dev;
186 186
187 info->start_bus = start; 187 info = container_of(ci, struct pci_root_info, common);
188 info->end_bus = end; 188 info->start_bus = (u8)root->secondary.start;
189 info->end_bus = (u8)root->secondary.end;
189 info->mcfg_added = false; 190 info->mcfg_added = false;
191 seg = info->sd.domain;
190 192
191 /* return success if MMCFG is not in use */ 193 /* return success if MMCFG is not in use */
192 if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) 194 if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
@@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
195 if (!(pci_probe & PCI_PROBE_MMCONF)) 197 if (!(pci_probe & PCI_PROBE_MMCONF))
196 return check_segment(seg, dev, "MMCONFIG is disabled,"); 198 return check_segment(seg, dev, "MMCONFIG is disabled,");
197 199
198 result = pci_mmconfig_insert(dev, seg, start, end, addr); 200 result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus,
201 root->mcfg_addr);
199 if (result == 0) { 202 if (result == 0) {
200 /* enable MMCFG if it hasn't been enabled yet */ 203 /* enable MMCFG if it hasn't been enabled yet */
201 if (raw_pci_ext_ops == NULL) 204 if (raw_pci_ext_ops == NULL)
@@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
208 return 0; 211 return 0;
209} 212}
210 213
211static void teardown_mcfg_map(struct pci_root_info *info) 214static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
212{ 215{
216 struct pci_root_info *info;
217
218 info = container_of(ci, struct pci_root_info, common);
213 if (info->mcfg_added) { 219 if (info->mcfg_added) {
214 pci_mmconfig_delete(info->segment, info->start_bus, 220 pci_mmconfig_delete(info->sd.domain,
215 info->end_bus); 221 info->start_bus, info->end_bus);
216 info->mcfg_added = false; 222 info->mcfg_added = false;
217 } 223 }
218} 224}
219#else 225#else
220static int setup_mcfg_map(struct pci_root_info *info, 226static int setup_mcfg_map(struct acpi_pci_root_info *ci)
221 u16 seg, u8 start, u8 end,
222 phys_addr_t addr)
223{ 227{
224 return 0; 228 return 0;
225} 229}
226static void teardown_mcfg_map(struct pci_root_info *info) 230
231static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
227{ 232{
228} 233}
229#endif 234#endif
230 235
231static void validate_resources(struct device *dev, struct list_head *crs_res, 236static int pci_acpi_root_get_node(struct acpi_pci_root *root)
232 unsigned long type)
233{ 237{
234 LIST_HEAD(list); 238 int busnum = root->secondary.start;
235 struct resource *res1, *res2, *root = NULL; 239 struct acpi_device *device = root->device;
236 struct resource_entry *tmp, *entry, *entry2; 240 int node = acpi_get_node(device->handle);
237
238 BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0);
239 root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource;
240
241 list_splice_init(crs_res, &list);
242 resource_list_for_each_entry_safe(entry, tmp, &list) {
243 bool free = false;
244 resource_size_t end;
245
246 res1 = entry->res;
247 if (!(res1->flags & type))
248 goto next;
249
250 /* Exclude non-addressable range or non-addressable portion */
251 end = min(res1->end, root->end);
252 if (end <= res1->start) {
253 dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n",
254 res1);
255 free = true;
256 goto next;
257 } else if (res1->end != end) {
258 dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n",
259 res1, (unsigned long long)end + 1,
260 (unsigned long long)res1->end);
261 res1->end = end;
262 }
263
264 resource_list_for_each_entry(entry2, crs_res) {
265 res2 = entry2->res;
266 if (!(res2->flags & type))
267 continue;
268
269 /*
270 * I don't like throwing away windows because then
271 * our resources no longer match the ACPI _CRS, but
272 * the kernel resource tree doesn't allow overlaps.
273 */
274 if (resource_overlaps(res1, res2)) {
275 res2->start = min(res1->start, res2->start);
276 res2->end = max(res1->end, res2->end);
277 dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n",
278 res2, res1);
279 free = true;
280 goto next;
281 }
282 }
283 241
284next: 242 if (node == NUMA_NO_NODE) {
285 resource_list_del(entry); 243 node = x86_pci_root_bus_node(busnum);
286 if (free) 244 if (node != 0 && node != NUMA_NO_NODE)
287 resource_list_free_entry(entry); 245 dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
288 else 246 node);
289 resource_list_add_tail(entry, crs_res);
290 } 247 }
248 if (node != NUMA_NO_NODE && !node_online(node))
249 node = NUMA_NO_NODE;
250
251 return node;
291} 252}
292 253
293static void add_resources(struct pci_root_info *info, 254static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci)
294 struct list_head *resources,
295 struct list_head *crs_res)
296{ 255{
297 struct resource_entry *entry, *tmp; 256 return setup_mcfg_map(ci);
298 struct resource *res, *conflict, *root = NULL;
299
300 validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM);
301 validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO);
302
303 resource_list_for_each_entry_safe(entry, tmp, crs_res) {
304 res = entry->res;
305 if (res->flags & IORESOURCE_MEM)
306 root = &iomem_resource;
307 else if (res->flags & IORESOURCE_IO)
308 root = &ioport_resource;
309 else
310 BUG_ON(res);
311
312 conflict = insert_resource_conflict(root, res);
313 if (conflict) {
314 dev_info(&info->bridge->dev,
315 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
316 res, conflict->name, conflict);
317 resource_list_destroy_entry(entry);
318 }
319 }
320
321 list_splice_tail(crs_res, resources);
322} 257}
323 258
324static void release_pci_root_info(struct pci_host_bridge *bridge) 259static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci)
325{ 260{
326 struct resource *res; 261 teardown_mcfg_map(ci);
327 struct resource_entry *entry; 262 kfree(container_of(ci, struct pci_root_info, common));
328 struct pci_root_info *info = bridge->release_data;
329
330 resource_list_for_each_entry(entry, &bridge->windows) {
331 res = entry->res;
332 if (res->parent &&
333 (res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
334 release_resource(res);
335 }
336
337 teardown_mcfg_map(info);
338 kfree(info);
339} 263}
340 264
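Both teardown_mcfg_map() and pci_acpi_root_release_info() above recover the x86-private state from the generic acpi_pci_root_info they are handed via container_of(). Pieced together from the fields this file touches (the real declaration sits earlier in acpi.c and is not shown in these hunks), the wrapper struct looks roughly like the sketch below; embedding the generic struct means no back-pointer has to be stored.

	/* Rough reconstruction for illustration, not the verbatim declaration: */
	struct pci_root_info {
		struct acpi_pci_root_info common;	/* state owned by the ACPI core */
		struct pci_sysdata sd;			/* x86 sysdata: domain/node/companion */
	#ifdef CONFIG_PCI_MMCONFIG			/* ifdef placement assumed */
		bool mcfg_added;
		u8 start_bus;
		u8 end_bus;
	#endif
	};

	/* container_of() walks back from the embedded member to the
	 * enclosing allocation: */
	static struct pci_root_info *to_pci_root_info(struct acpi_pci_root_info *ci)
	{
		return container_of(ci, struct pci_root_info, common);
	}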
341/* 265/*
@@ -358,50 +282,47 @@ static bool resource_is_pcicfg_ioport(struct resource *res)
358 res->start == 0xCF8 && res->end == 0xCFF; 282 res->start == 0xCF8 && res->end == 0xCFF;
359} 283}
360 284
361static void probe_pci_root_info(struct pci_root_info *info, 285static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
362 struct acpi_device *device,
363 int busnum, int domain,
364 struct list_head *list)
365{ 286{
366 int ret; 287 struct acpi_device *device = ci->bridge;
288 int busnum = ci->root->secondary.start;
367 struct resource_entry *entry, *tmp; 289 struct resource_entry *entry, *tmp;
290 int status;
368 291
369 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); 292 status = acpi_pci_probe_root_resources(ci);
370 info->bridge = device; 293 if (pci_use_crs) {
371 ret = acpi_dev_get_resources(device, list, 294 resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
372 acpi_dev_filter_resource_type_cb, 295 if (resource_is_pcicfg_ioport(entry->res))
373 (void *)(IORESOURCE_IO | IORESOURCE_MEM));
374 if (ret < 0)
375 dev_warn(&device->dev,
376 "failed to parse _CRS method, error code %d\n", ret);
377 else if (ret == 0)
378 dev_dbg(&device->dev,
379 "no IO and memory resources present in _CRS\n");
380 else
381 resource_list_for_each_entry_safe(entry, tmp, list) {
382 if ((entry->res->flags & IORESOURCE_DISABLED) ||
383 resource_is_pcicfg_ioport(entry->res))
384 resource_list_destroy_entry(entry); 296 resource_list_destroy_entry(entry);
385 else 297 return status;
386 entry->res->name = info->name; 298 }
387 } 299
300 resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
301 dev_printk(KERN_DEBUG, &device->dev,
302 "host bridge window %pR (ignored)\n", entry->res);
303 resource_list_destroy_entry(entry);
304 }
305 x86_pci_root_bus_resources(busnum, &ci->resources);
306
307 return 0;
388} 308}
389 309
310static struct acpi_pci_root_ops acpi_pci_root_ops = {
311 .pci_ops = &pci_root_ops,
312 .init_info = pci_acpi_root_init_info,
313 .release_info = pci_acpi_root_release_info,
314 .prepare_resources = pci_acpi_root_prepare_resources,
315};
316
390struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) 317struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
391{ 318{
392 struct acpi_device *device = root->device;
393 struct pci_root_info *info;
394 int domain = root->segment; 319 int domain = root->segment;
395 int busnum = root->secondary.start; 320 int busnum = root->secondary.start;
396 struct resource_entry *res_entry; 321 int node = pci_acpi_root_get_node(root);
397 LIST_HEAD(crs_res);
398 LIST_HEAD(resources);
399 struct pci_bus *bus; 322 struct pci_bus *bus;
400 struct pci_sysdata *sd;
401 int node;
402 323
403 if (pci_ignore_seg) 324 if (pci_ignore_seg)
404 domain = 0; 325 root->segment = domain = 0;
405 326
406 if (domain && !pci_domains_supported) { 327 if (domain && !pci_domains_supported) {
407 printk(KERN_WARNING "pci_bus %04x:%02x: " 328 printk(KERN_WARNING "pci_bus %04x:%02x: "
@@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
410 return NULL; 331 return NULL;
411 } 332 }
412 333
413 node = acpi_get_node(device->handle);
414 if (node == NUMA_NO_NODE) {
415 node = x86_pci_root_bus_node(busnum);
416 if (node != 0 && node != NUMA_NO_NODE)
417 dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
418 node);
419 }
420
421 if (node != NUMA_NO_NODE && !node_online(node))
422 node = NUMA_NO_NODE;
423
424 info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
425 if (!info) {
426 printk(KERN_WARNING "pci_bus %04x:%02x: "
427 "ignored (out of memory)\n", domain, busnum);
428 return NULL;
429 }
430
431 sd = &info->sd;
432 sd->domain = domain;
433 sd->node = node;
434 sd->companion = device;
435
436 bus = pci_find_bus(domain, busnum); 334 bus = pci_find_bus(domain, busnum);
437 if (bus) { 335 if (bus) {
438 /* 336 /*
439 * If the desired bus has been scanned already, replace 337 * If the desired bus has been scanned already, replace
440 * its bus->sysdata. 338 * its bus->sysdata.
441 */ 339 */
442 memcpy(bus->sysdata, sd, sizeof(*sd)); 340 struct pci_sysdata sd = {
443 kfree(info); 341 .domain = domain,
444 } else { 342 .node = node,
445 /* insert busn res at first */ 343 .companion = root->device
446 pci_add_resource(&resources, &root->secondary); 344 };
447 345
448 /* 346 memcpy(bus->sysdata, &sd, sizeof(sd));
449 * _CRS with no apertures is normal, so only fall back to 347 } else {
450 * defaults or native bridge info if we're ignoring _CRS. 348 struct pci_root_info *info;
451 */ 349
452 probe_pci_root_info(info, device, busnum, domain, &crs_res); 350 info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
453 if (pci_use_crs) { 351 if (!info)
454 add_resources(info, &resources, &crs_res); 352 dev_err(&root->device->dev,
455 } else { 353 "pci_bus %04x:%02x: ignored (out of memory)\n",
456 resource_list_for_each_entry(res_entry, &crs_res) 354 domain, busnum);
457 dev_printk(KERN_DEBUG, &device->dev, 355 else {
458 "host bridge window %pR (ignored)\n", 356 info->sd.domain = domain;
459 res_entry->res); 357 info->sd.node = node;
460 resource_list_free(&crs_res); 358 info->sd.companion = root->device;
461 x86_pci_root_bus_resources(busnum, &resources); 359 bus = acpi_pci_root_create(root, &acpi_pci_root_ops,
462 } 360 &info->common, &info->sd);
463
464 if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
465 (u8)root->secondary.end, root->mcfg_addr))
466 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
467 sd, &resources);
468
469 if (bus) {
470 pci_scan_child_bus(bus);
471 pci_set_host_bridge_release(
472 to_pci_host_bridge(bus->bridge),
473 release_pci_root_info, info);
474 } else {
475 resource_list_free(&resources);
476 teardown_mcfg_map(info);
477 kfree(info);
478 } 361 }
479 } 362 }
480 363
@@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
487 pcie_bus_configure_settings(child); 370 pcie_bus_configure_settings(child);
488 } 371 }
489 372
490 if (bus && node != NUMA_NO_NODE)
491 dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node);
492
493 return bus; 373 return bus;
494} 374}
495 375
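With the conversion complete, the new-bus branch of pci_acpi_scan_root() amounts to filling in the acpi_pci_root_ops table plus a per-root pci_root_info and handing both to the generic acpi_pci_root_create() helper. The fragment below is a hypothetical condensation of what the hunks above already do; the wrapper function name is invented for illustration and the usual kernel headers are assumed.

	static struct pci_bus *x86_acpi_root_create(struct acpi_pci_root *root,
						    int domain, int node)
	{
		struct pci_root_info *info;

		info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
		if (!info)
			return NULL;

		info->sd.domain    = domain;
		info->sd.node      = node;
		info->sd.companion = root->device;

		/* The generic helper presumably invokes the init_info /
		 * prepare_resources / release_info callbacks registered in
		 * acpi_pci_root_ops at the appropriate points. */
		return acpi_pci_root_create(root, &acpi_pci_root_ops,
					    &info->common, &info->sd);
	}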
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 7bcf06a7cd12..6eb3c8af96e2 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -50,18 +50,9 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
50 if (!found) 50 if (!found)
51 pci_add_resource(resources, &info->busn); 51 pci_add_resource(resources, &info->busn);
52 52
53 list_for_each_entry(root_res, &info->resources, list) { 53 list_for_each_entry(root_res, &info->resources, list)
54 struct resource *res; 54 pci_add_resource(resources, &root_res->res);
55 struct resource *root;
56 55
57 res = &root_res->res;
58 pci_add_resource(resources, res);
59 if (res->flags & IORESOURCE_IO)
60 root = &ioport_resource;
61 else
62 root = &iomem_resource;
63 insert_resource(root, res);
64 }
65 return; 56 return;
66 57
67default_resources: 58default_resources:
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dc78a4a9a466..eccd4d99e6a4 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -675,6 +675,14 @@ int pcibios_add_device(struct pci_dev *dev)
675 675
676int pcibios_alloc_irq(struct pci_dev *dev) 676int pcibios_alloc_irq(struct pci_dev *dev)
677{ 677{
678 /*
679 * If the PCI device was already claimed by core code and has
680 * MSI enabled, probing of the pcibios IRQ will overwrite
681 * dev->irq. So bail out if MSI is already enabled.
682 */
683 if (pci_dev_msi_enabled(dev))
684 return -EBUSY;
685
678 return pcibios_enable_irq(dev); 686 return pcibios_enable_irq(dev);
679} 687}
680 688
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 5b662c0faf8c..ea6f3802c17b 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -54,7 +54,7 @@ void pcibios_scan_specific_bus(int busn)
54} 54}
55EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus); 55EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
56 56
57int __init pci_subsys_init(void) 57static int __init pci_subsys_init(void)
58{ 58{
59 /* 59 /*
 60	 * The init function returns a non-zero value when 60	 * The init function returns a non-zero value when
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 06934a8a4872..e5f854ce2d72 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
211 if (err) 211 if (err)
212 return 1; 212 return 1;
213 213
214 err = convert_fxsr_from_user(&fpx, sc.fpstate); 214 err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate);
215 if (err) 215 if (err)
216 return 1; 216 return 1;
217 217
@@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
227 { 227 {
228 struct user_i387_struct fp; 228 struct user_i387_struct fp;
229 229
230 err = copy_from_user(&fp, sc.fpstate, 230 err = copy_from_user(&fp, (void *)sc.fpstate,
231 sizeof(struct user_i387_struct)); 231 sizeof(struct user_i387_struct));
232 if (err) 232 if (err)
233 return 1; 233 return 1;
@@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
291#endif 291#endif
292#undef PUTREG 292#undef PUTREG
293 sc.oldmask = mask; 293 sc.oldmask = mask;
294 sc.fpstate = to_fp; 294 sc.fpstate = (unsigned long)to_fp;
295 295
296 err = copy_to_user(to, &sc, sizeof(struct sigcontext)); 296 err = copy_to_user(to, &sc, sizeof(struct sigcontext));
297 if (err) 297 if (err)
@@ -468,12 +468,10 @@ long sys_sigreturn(void)
468 struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); 468 struct sigframe __user *frame = (struct sigframe __user *)(sp - 8);
469 sigset_t set; 469 sigset_t set;
470 struct sigcontext __user *sc = &frame->sc; 470 struct sigcontext __user *sc = &frame->sc;
471 unsigned long __user *oldmask = &sc->oldmask;
472 unsigned long __user *extramask = frame->extramask;
473 int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); 471 int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
474 472
475 if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || 473 if (copy_from_user(&set.sig[0], (void *)sc->oldmask, sizeof(set.sig[0])) ||
476 copy_from_user(&set.sig[1], extramask, sig_size)) 474 copy_from_user(&set.sig[1], frame->extramask, sig_size))
477 goto segfault; 475 goto segfault;
478 476
479 set_current_blocked(&set); 477 set_current_blocked(&set);
@@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
505{ 503{
506 struct rt_sigframe __user *frame; 504 struct rt_sigframe __user *frame;
507 int err = 0, sig = ksig->sig; 505 int err = 0, sig = ksig->sig;
506 unsigned long fp_to;
508 507
509 frame = (struct rt_sigframe __user *) 508 frame = (struct rt_sigframe __user *)
510 round_down(stack_top - sizeof(struct rt_sigframe), 16); 509 round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
526 err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); 525 err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs));
527 err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, 526 err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs,
528 set->sig[0]); 527 set->sig[0]);
529 err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); 528
529 fp_to = (unsigned long)&frame->fpstate;
530
531 err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate);
530 if (sizeof(*set) == 16) { 532 if (sizeof(*set) == 16) {
531 err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 533 err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
532 err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 534 err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S
index b972649d3a18..98816804e131 100644
--- a/arch/x86/um/stub_32.S
+++ b/arch/x86/um/stub_32.S
@@ -1,6 +1,5 @@
1#include <as-layout.h> 1#include <as-layout.h>
2 2
3 .globl syscall_stub
4.section .__syscall_stub, "ax" 3.section .__syscall_stub, "ax"
5 4
6 .globl batch_syscall_stub 5 .globl batch_syscall_stub
diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S
index 7160b20172d0..ba914b3b8cc4 100644
--- a/arch/x86/um/stub_64.S
+++ b/arch/x86/um/stub_64.S
@@ -1,25 +1,9 @@
1#include <as-layout.h> 1#include <as-layout.h>
2 2
3 .globl syscall_stub
4.section .__syscall_stub, "ax" 3.section .__syscall_stub, "ax"
5syscall_stub:
6 syscall
7 /* We don't have 64-bit constants, so this constructs the address
8 * we need.
9 */
10 movq $(STUB_DATA >> 32), %rbx
11 salq $32, %rbx
12 movq $(STUB_DATA & 0xffffffff), %rcx
13 or %rcx, %rbx
14 movq %rax, (%rbx)
15 int3
16
17 .globl batch_syscall_stub 4 .globl batch_syscall_stub
18batch_syscall_stub: 5batch_syscall_stub:
19 mov $(STUB_DATA >> 32), %rbx 6 mov $(STUB_DATA), %rbx
20 sal $32, %rbx
21 mov $(STUB_DATA & 0xffffffff), %rax
22 or %rax, %rbx
23 /* load pointer to first operation */ 7 /* load pointer to first operation */
24 mov %rbx, %rsp 8 mov %rbx, %rsp
25 add $0x10, %rsp 9 add $0x10, %rsp
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2745e8ae93f3..4334e511cfc8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,7 @@
75#include <asm/mwait.h> 75#include <asm/mwait.h>
76#include <asm/pci_x86.h> 76#include <asm/pci_x86.h>
77#include <asm/pat.h> 77#include <asm/pat.h>
78#include <asm/cpu.h>
78 79
79#ifdef CONFIG_ACPI 80#ifdef CONFIG_ACPI
80#include <linux/acpi.h> 81#include <linux/acpi.h>
@@ -1892,3 +1893,17 @@ const struct hypervisor_x86 x86_hyper_xen = {
1892 .set_cpu_features = xen_set_cpu_features, 1893 .set_cpu_features = xen_set_cpu_features,
1893}; 1894};
1894EXPORT_SYMBOL(x86_hyper_xen); 1895EXPORT_SYMBOL(x86_hyper_xen);
1896
1897#ifdef CONFIG_HOTPLUG_CPU
1898void xen_arch_register_cpu(int num)
1899{
1900 arch_register_cpu(num);
1901}
1902EXPORT_SYMBOL(xen_arch_register_cpu);
1903
1904void xen_arch_unregister_cpu(int num)
1905{
1906 arch_unregister_cpu(num);
1907}
1908EXPORT_SYMBOL(xen_arch_unregister_cpu);
1909#endif
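xen_arch_register_cpu()/xen_arch_unregister_cpu() are thin exported wrappers, presumably so that Xen code which may be built modular can reach arch_register_cpu()/arch_unregister_cpu(), which are not exported themselves. A hypothetical caller (not part of this diff) would use them along these lines:

	/* Hypothetical vCPU hotplug handler, for illustration only: */
	static void vcpu_hotplug(unsigned int cpu, bool online)
	{
		if (online)
			xen_arch_register_cpu(cpu);
		else
			xen_arch_unregister_cpu(cpu);
	}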
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 1580e7a5a4cf..e079500b17f3 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void)
133 kfree(pages); 133 kfree(pages);
134 return -ENOMEM; 134 return -ENOMEM;
135 } 135 }
136 rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); 136 rc = alloc_xenballooned_pages(nr_grant_frames, pages);
137 if (rc) { 137 if (rc) {
138 pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, 138 pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
139 nr_grant_frames, rc); 139 nr_grant_frames, rc);
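alloc_xenballooned_pages() now takes just a count and a page array; the old third argument that selected lowmem versus highmem pages (passed as 0, i.e. lowmem, here) is gone. A minimal usage sketch, assuming <xen/balloon.h> and the matching free_xenballooned_pages() helper:

	#include <xen/balloon.h>

	static int with_ballooned_pages(int nr)
	{
		struct page *pages[16];
		int rc;

		if (nr > 16)
			return -EINVAL;

		rc = alloc_xenballooned_pages(nr, pages);	/* new two-argument form */
		if (rc)
			return rc;

		/* ... map or grant the pages here ... */

		free_xenballooned_pages(nr, pages);		/* hand them back */
		return 0;
	}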
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 41ee3e25fcce..c913ca4f6958 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2494,14 +2494,9 @@ void __init xen_init_mmu_ops(void)
2494{ 2494{
2495 x86_init.paging.pagetable_init = xen_pagetable_init; 2495 x86_init.paging.pagetable_init = xen_pagetable_init;
2496 2496
2497 /* Optimization - we can use the HVM one but it has no idea which 2497 if (xen_feature(XENFEAT_auto_translated_physmap))
2498 * VCPUs are descheduled - which means that it will needlessly IPI
2499 * them. Xen knows so let it do the job.
2500 */
2501 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2502 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
2503 return; 2498 return;
2504 } 2499
2505 pv_mmu_ops = xen_mmu_ops; 2500 pv_mmu_ops = xen_mmu_ops;
2506 2501
2507 memset(dummy_mapping, 0xff, PAGE_SIZE); 2502 memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2887,6 +2882,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
2887 addr += range; 2882 addr += range;
2888 if (err_ptr) 2883 if (err_ptr)
2889 err_ptr += batch; 2884 err_ptr += batch;
2885 cond_resched();
2890 } 2886 }
2891out: 2887out:
2892 2888
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 660b3cfef234..cab9f766bb06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
530 * the new pages are installed with cmpxchg; if we lose the race then 530 * the new pages are installed with cmpxchg; if we lose the race then
531 * simply free the page we allocated and use the one that's there. 531 * simply free the page we allocated and use the one that's there.
532 */ 532 */
533static bool alloc_p2m(unsigned long pfn) 533int xen_alloc_p2m_entry(unsigned long pfn)
534{ 534{
535 unsigned topidx; 535 unsigned topidx;
536 unsigned long *top_mfn_p, *mid_mfn; 536 unsigned long *top_mfn_p, *mid_mfn;
@@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn)
540 unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); 540 unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
541 unsigned long p2m_pfn; 541 unsigned long p2m_pfn;
542 542
543 if (xen_feature(XENFEAT_auto_translated_physmap))
544 return 0;
545
543 ptep = lookup_address(addr, &level); 546 ptep = lookup_address(addr, &level);
544 BUG_ON(!ptep || level != PG_LEVEL_4K); 547 BUG_ON(!ptep || level != PG_LEVEL_4K);
545 pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); 548 pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn)
548 /* PMD level is missing, allocate a new one */ 551 /* PMD level is missing, allocate a new one */
549 ptep = alloc_p2m_pmd(addr, pte_pg); 552 ptep = alloc_p2m_pmd(addr, pte_pg);
550 if (!ptep) 553 if (!ptep)
551 return false; 554 return -ENOMEM;
552 } 555 }
553 556
554 if (p2m_top_mfn && pfn < MAX_P2M_PFN) { 557 if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
@@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn)
566 569
567 mid_mfn = alloc_p2m_page(); 570 mid_mfn = alloc_p2m_page();
568 if (!mid_mfn) 571 if (!mid_mfn)
569 return false; 572 return -ENOMEM;
570 573
571 p2m_mid_mfn_init(mid_mfn, p2m_missing); 574 p2m_mid_mfn_init(mid_mfn, p2m_missing);
572 575
@@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn)
592 595
593 p2m = alloc_p2m_page(); 596 p2m = alloc_p2m_page();
594 if (!p2m) 597 if (!p2m)
595 return false; 598 return -ENOMEM;
596 599
597 if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) 600 if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
598 p2m_init(p2m); 601 p2m_init(p2m);
@@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn)
625 HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; 628 HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
626 } 629 }
627 630
628 return true; 631 return 0;
629} 632}
633EXPORT_SYMBOL(xen_alloc_p2m_entry);
630 634
631unsigned long __init set_phys_range_identity(unsigned long pfn_s, 635unsigned long __init set_phys_range_identity(unsigned long pfn_s,
632 unsigned long pfn_e) 636 unsigned long pfn_e)
@@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
688bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 692bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
689{ 693{
690 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 694 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
691 if (!alloc_p2m(pfn)) 695 int ret;
696
697 ret = xen_alloc_p2m_entry(pfn);
698 if (ret < 0)
692 return false; 699 return false;
693 700
694 return __set_phys_to_machine(pfn, mfn); 701 return __set_phys_to_machine(pfn, mfn);
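alloc_p2m() becomes xen_alloc_p2m_entry(): exported and switched from bool to the usual 0/-errno convention, which is why set_phys_to_machine() above now checks ret < 0 instead of !ret. A hypothetical out-of-file caller could preallocate entries for a pfn range along these lines; the helper name is invented, and the include reflects where the new declaration is assumed to live.

	#include <asm/xen/page.h>	/* assumed location of xen_alloc_p2m_entry() */

	/* Hypothetical helper: make sure p2m entries exist for [start_pfn,
	 * start_pfn + nr) up front, so later mapping updates cannot fail
	 * with -ENOMEM. For auto-translated guests this is a no-op, since
	 * xen_alloc_p2m_entry() returns 0 immediately. */
	static int prealloc_p2m_range(unsigned long start_pfn, unsigned long nr)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < start_pfn + nr; pfn++) {
			int ret = xen_alloc_p2m_entry(pfn);

			if (ret < 0)
				return ret;
		}
		return 0;
	}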
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 63320b6d35bc..7ab29518a3b9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
212 e_pfn = PFN_DOWN(entry->addr + entry->size); 212 e_pfn = PFN_DOWN(entry->addr + entry->size);
213 213
214 /* We only care about E820 after this */ 214 /* We only care about E820 after this */
215 if (e_pfn < *min_pfn) 215 if (e_pfn <= *min_pfn)
216 continue; 216 continue;
217 217
218 s_pfn = PFN_UP(entry->addr); 218 s_pfn = PFN_UP(entry->addr);
@@ -829,6 +829,8 @@ char * __init xen_memory_setup(void)
829 addr = xen_e820_map[0].addr; 829 addr = xen_e820_map[0].addr;
830 size = xen_e820_map[0].size; 830 size = xen_e820_map[0].size;
831 while (i < xen_e820_map_entries) { 831 while (i < xen_e820_map_entries) {
832 bool discard = false;
833
832 chunk_size = size; 834 chunk_size = size;
833 type = xen_e820_map[i].type; 835 type = xen_e820_map[i].type;
834 836
@@ -843,10 +845,11 @@ char * __init xen_memory_setup(void)
843 xen_add_extra_mem(pfn_s, n_pfns); 845 xen_add_extra_mem(pfn_s, n_pfns);
844 xen_max_p2m_pfn = pfn_s + n_pfns; 846 xen_max_p2m_pfn = pfn_s + n_pfns;
845 } else 847 } else
846 type = E820_UNUSABLE; 848 discard = true;
847 } 849 }
848 850
849 xen_align_and_add_e820_region(addr, chunk_size, type); 851 if (!discard)
852 xen_align_and_add_e820_region(addr, chunk_size, type);
850 853
851 addr += chunk_size; 854 addr += chunk_size;
852 size -= chunk_size; 855 size -= chunk_size;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index feddabdab448..3705eabd7e22 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -68,26 +68,16 @@ static void xen_pv_post_suspend(int suspend_cancelled)
68 68
69void xen_arch_pre_suspend(void) 69void xen_arch_pre_suspend(void)
70{ 70{
71 int cpu;
72
73 for_each_online_cpu(cpu)
74 xen_pmu_finish(cpu);
75
76 if (xen_pv_domain()) 71 if (xen_pv_domain())
77 xen_pv_pre_suspend(); 72 xen_pv_pre_suspend();
78} 73}
79 74
80void xen_arch_post_suspend(int cancelled) 75void xen_arch_post_suspend(int cancelled)
81{ 76{
82 int cpu;
83
84 if (xen_pv_domain()) 77 if (xen_pv_domain())
85 xen_pv_post_suspend(cancelled); 78 xen_pv_post_suspend(cancelled);
86 else 79 else
87 xen_hvm_post_suspend(cancelled); 80 xen_hvm_post_suspend(cancelled);
88
89 for_each_online_cpu(cpu)
90 xen_pmu_init(cpu);
91} 81}
92 82
93static void xen_vcpu_notify_restore(void *data) 83static void xen_vcpu_notify_restore(void *data)
@@ -106,10 +96,20 @@ static void xen_vcpu_notify_suspend(void *data)
106 96
107void xen_arch_resume(void) 97void xen_arch_resume(void)
108{ 98{
99 int cpu;
100
109 on_each_cpu(xen_vcpu_notify_restore, NULL, 1); 101 on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
102
103 for_each_online_cpu(cpu)
104 xen_pmu_init(cpu);
110} 105}
111 106
112void xen_arch_suspend(void) 107void xen_arch_suspend(void)
113{ 108{
109 int cpu;
110
111 for_each_online_cpu(cpu)
112 xen_pmu_finish(cpu);
113
114 on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); 114 on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
115} 115}