author		Linus Torvalds <torvalds@linux-foundation.org>	2012-07-24 15:01:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-24 15:01:20 -0400
commit		5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (patch)
tree		d1fc25d9650d3ac24591bba6f5e2e7a1afc54796 /arch
parent		3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (diff)
parent		1a577b72475d161b6677c05abe57301362023bb2 (diff)
Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
 "Highlights include
  - full big real mode emulation on pre-Westmere Intel hosts (can be
    disabled with emulate_invalid_guest_state=0)
  - relatively small ppc and s390 updates
  - PCID/INVPCID support in guests
  - EOI avoidance; 3.6 guests should perform better on 3.6 hosts on
    interrupt intensive workloads
  - Lockless write faults during live migration
  - EPT accessed/dirty bits support for new Intel processors"

Fix up conflicts in:
 - Documentation/virtual/kvm/api.txt: Stupid subchapter numbering, added
   next to each other.
 - arch/powerpc/kvm/booke_interrupts.S: PPC asm changes clashing with the
   KVM fixes
 - arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c: Duplicated commits
   through the kvm tree and the s390 tree, with subsequent edits in the
   KVM tree.

* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
  KVM: fix race with level interrupts
  x86, hyper: fix build with !CONFIG_KVM_GUEST
  Revert "apic: fix kvm build on UP without IOAPIC"
  KVM guest: switch to apic_set_eoi_write, apic_write
  apic: add apic_set_eoi_write for PV use
  KVM: VMX: Implement PCID/INVPCID for guests with EPT
  KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
  KVM: PPC: Critical interrupt emulation support
  KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
  KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
  KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
  KVM: PPC: bookehv64: Add support for std/ld emulation.
  booke: Added crit/mc exception handler for e500v2
  booke/bookehv: Add host crit-watchdog exception support
  KVM: MMU: document mmu-lock and fast page fault
  KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
  KVM: MMU: trace fast page fault
  KVM: MMU: fast path of handling guest page fault
  KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
  KVM: MMU: fold tlb flush judgement into mmu_spte_update
  ...
Diffstat (limited to 'arch')
-rw-r--r-- arch/ia64/include/asm/kvm.h | 1
-rw-r--r-- arch/ia64/kvm/Kconfig | 1
-rw-r--r-- arch/powerpc/include/asm/epapr_hcalls.h | 2
-rw-r--r-- arch/powerpc/include/asm/hw_irq.h | 2
-rw-r--r-- arch/powerpc/include/asm/kvm_book3s_64.h | 7
-rw-r--r-- arch/powerpc/include/asm/kvm_host.h | 6
-rw-r--r-- arch/powerpc/include/asm/kvm_ppc.h | 3
-rw-r--r-- arch/powerpc/kernel/Makefile | 1
-rw-r--r-- arch/powerpc/kernel/epapr_hcalls.S | 25
-rw-r--r-- arch/powerpc/kernel/epapr_paravirt.c | 52
-rw-r--r-- arch/powerpc/kernel/kvm.c | 28
-rw-r--r-- arch/powerpc/kernel/kvm_emul.S | 12
-rw-r--r-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 123
-rw-r--r-- arch/powerpc/kvm/book3s_hv.c | 40
-rw-r--r-- arch/powerpc/kvm/book3s_hv_builtin.c | 5
-rw-r--r-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 15
-rw-r--r-- arch/powerpc/kvm/booke.c | 26
-rw-r--r-- arch/powerpc/kvm/booke_emulate.c | 28
-rw-r--r-- arch/powerpc/kvm/booke_interrupts.S | 55
-rw-r--r-- arch/powerpc/kvm/bookehv_interrupts.S | 2
-rw-r--r-- arch/powerpc/kvm/e500_emulate.c | 3
-rw-r--r-- arch/powerpc/kvm/e500mc.c | 8
-rw-r--r-- arch/powerpc/kvm/emulate.c | 16
-rw-r--r-- arch/powerpc/kvm/powerpc.c | 18
-rw-r--r-- arch/powerpc/platforms/Kconfig | 9
-rw-r--r-- arch/s390/include/asm/sclp.h | 2
-rw-r--r-- arch/s390/include/asm/sigp.h | 1
-rw-r--r-- arch/s390/kernel/setup.c | 12
-rw-r--r-- arch/s390/kvm/kvm-s390.c | 1
-rw-r--r-- arch/s390/kvm/sigp.c | 77
-rw-r--r-- arch/x86/include/asm/apic.h | 3
-rw-r--r-- arch/x86/include/asm/bitops.h | 7
-rw-r--r-- arch/x86/include/asm/hypervisor.h | 1
-rw-r--r-- arch/x86/include/asm/kvm.h | 1
-rw-r--r-- arch/x86/include/asm/kvm_emulate.h | 6
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 31
-rw-r--r-- arch/x86/include/asm/kvm_para.h | 7
-rw-r--r-- arch/x86/include/asm/processor-flags.h | 2
-rw-r--r-- arch/x86/include/asm/vmx.h | 6
-rw-r--r-- arch/x86/kernel/apic/apic.c | 17
-rw-r--r-- arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r-- arch/x86/kernel/kvm.c | 64
-rw-r--r-- arch/x86/kvm/cpuid.c | 46
-rw-r--r-- arch/x86/kvm/cpuid.h | 9
-rw-r--r-- arch/x86/kvm/emulate.c | 273
-rw-r--r-- arch/x86/kvm/i8259.c | 17
-rw-r--r-- arch/x86/kvm/lapic.c | 194
-rw-r--r-- arch/x86/kvm/lapic.h | 11
-rw-r--r-- arch/x86/kvm/mmu.c | 359
-rw-r--r-- arch/x86/kvm/mmutrace.h | 45
-rw-r--r-- arch/x86/kvm/paging_tmpl.h | 3
-rw-r--r-- arch/x86/kvm/svm.c | 12
-rw-r--r-- arch/x86/kvm/trace.h | 34
-rw-r--r-- arch/x86/kvm/vmx.c | 189
-rw-r--r-- arch/x86/kvm/x86.c | 123
55 files changed, 1594 insertions, 450 deletions
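
Of note among the powerpc changes below: arch/powerpc/kvm/powerpc.c gains a KVM_PPC_ALLOCATE_HTAB vm ioctl (advertised via KVM_CAP_PPC_ALLOC_HTAB) that reads a u32 hashed-page-table order from userspace, (re)allocates the guest HPT through kvmppc_alloc_reset_hpt(), and writes back the order actually used. A minimal, hypothetical userspace sketch of calling it follows; the ioctl and capability names come from this diff, while the helper function, the chosen order, and the assumption of an already-created VM file descriptor are illustrative only:

/* Hypothetical example: request a guest hashed page table via the
 * KVM_PPC_ALLOCATE_HTAB vm ioctl added in this merge. Error handling is
 * minimal; vm_fd is assumed to be an already-created KVM VM descriptor. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int allocate_guest_hpt(int vm_fd, unsigned int order)
{
	__u32 htab_order = order;	/* desired order, e.g. 26 for a 64 MB HPT */

	if (ioctl(vm_fd, KVM_PPC_ALLOCATE_HTAB, &htab_order) < 0) {
		perror("KVM_PPC_ALLOCATE_HTAB");
		return -1;
	}
	/* On success the kernel writes back the order it actually allocated. */
	printf("guest HPT allocated, order %u\n", htab_order);
	return 0;
}

Because kvmppc_alloc_hpt() clamps requests below PPC_MIN_HPT_ORDER (18, i.e. 256 kB) and may fall back to successively smaller allocations, a caller should trust the value written back rather than the value it asked for.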
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index b9f82c84f093..ec6c6b301238 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -26,6 +26,7 @@
26 26
27/* Select x86 specific features in <linux/kvm.h> */ 27/* Select x86 specific features in <linux/kvm.h> */
28#define __KVM_HAVE_IOAPIC 28#define __KVM_HAVE_IOAPIC
29#define __KVM_HAVE_IRQ_LINE
29#define __KVM_HAVE_DEVICE_ASSIGNMENT 30#define __KVM_HAVE_DEVICE_ASSIGNMENT
30 31
31/* Architectural interrupt line count. */ 32/* Architectural interrupt line count. */
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 9806e55f91be..df5351e3eed7 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
19 19
20config KVM 20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on BROKEN
22 depends on HAVE_KVM && MODULES && EXPERIMENTAL 23 depends on HAVE_KVM && MODULES && EXPERIMENTAL
23 # for device assignment: 24 # for device assignment:
24 depends on PCI 25 depends on PCI
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index 976835d8f22e..bf2c06c33871 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -153,6 +153,8 @@
153#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" 153#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5"
154#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" 154#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4"
155 155
156extern bool epapr_paravirt_enabled;
157extern u32 epapr_hypercall_start[];
156 158
157/* 159/*
158 * We use "uintptr_t" to define a register because it's guaranteed to be a 160 * We use "uintptr_t" to define a register because it's guaranteed to be a
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 0554ab062bdc..e45c4947a772 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -34,6 +34,8 @@ extern void __replay_interrupt(unsigned int vector);
34 34
35extern void timer_interrupt(struct pt_regs *); 35extern void timer_interrupt(struct pt_regs *);
36extern void performance_monitor_exception(struct pt_regs *regs); 36extern void performance_monitor_exception(struct pt_regs *regs);
37extern void WatchdogException(struct pt_regs *regs);
38extern void unknown_exception(struct pt_regs *regs);
37 39
38#ifdef CONFIG_PPC64 40#ifdef CONFIG_PPC64
39#include <asm/paca.h> 41#include <asm/paca.h>
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index b0c08b142770..0dd1d86d3e31 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
36#define SPAPR_TCE_SHIFT 12 36#define SPAPR_TCE_SHIFT 12
37 37
38#ifdef CONFIG_KVM_BOOK3S_64_HV 38#ifdef CONFIG_KVM_BOOK3S_64_HV
39/* For now use fixed-size 16MB page table */ 39#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
40#define HPT_ORDER 24 40extern int kvm_hpt_order; /* order of preallocated HPTs */
41#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */
42#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */
43#define HPT_HASH_MASK (HPT_NPTEG - 1)
44#endif 41#endif
45 42
46#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ 43#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d848cdc49715..50ea12fd7bf5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -237,6 +237,10 @@ struct kvm_arch {
237 unsigned long vrma_slb_v; 237 unsigned long vrma_slb_v;
238 int rma_setup_done; 238 int rma_setup_done;
239 int using_mmu_notifiers; 239 int using_mmu_notifiers;
240 u32 hpt_order;
241 atomic_t vcpus_running;
242 unsigned long hpt_npte;
243 unsigned long hpt_mask;
240 spinlock_t slot_phys_lock; 244 spinlock_t slot_phys_lock;
241 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; 245 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
242 int slot_npages[KVM_MEM_SLOTS_NUM]; 246 int slot_npages[KVM_MEM_SLOTS_NUM];
@@ -414,7 +418,9 @@ struct kvm_vcpu_arch {
414 ulong mcsrr1; 418 ulong mcsrr1;
415 ulong mcsr; 419 ulong mcsr;
416 u32 dec; 420 u32 dec;
421#ifdef CONFIG_BOOKE
417 u32 decar; 422 u32 decar;
423#endif
418 u32 tbl; 424 u32 tbl;
419 u32 tbu; 425 u32 tbu;
420 u32 tcr; 426 u32 tcr;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f68c22fa2fce..0124937a23b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
119extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); 119extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
120extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); 120extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
121 121
122extern long kvmppc_alloc_hpt(struct kvm *kvm); 122extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
123extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
123extern void kvmppc_free_hpt(struct kvm *kvm); 124extern void kvmppc_free_hpt(struct kvm *kvm);
124extern long kvmppc_prepare_vrma(struct kvm *kvm, 125extern long kvmppc_prepare_vrma(struct kvm *kvm,
125 struct kvm_userspace_memory_region *mem); 126 struct kvm_userspace_memory_region *mem);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 83afacd3ba7b..bb282dd81612 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
128obj-y += ppc_save_regs.o 128obj-y += ppc_save_regs.o
129endif 129endif
130 130
131obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o
131obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o 132obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
132 133
133# Disable GCOV in odd or sensitive code 134# Disable GCOV in odd or sensitive code
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
new file mode 100644
index 000000000000..697b390ebfd8
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) 2012 Freescale Semiconductor, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/threads.h>
11#include <asm/reg.h>
12#include <asm/page.h>
13#include <asm/cputable.h>
14#include <asm/thread_info.h>
15#include <asm/ppc_asm.h>
16#include <asm/asm-offsets.h>
17
18/* Hypercall entry point. Will be patched with device tree instructions. */
19.global epapr_hypercall_start
20epapr_hypercall_start:
21 li r3, -1
22 nop
23 nop
24 nop
25 blr
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
new file mode 100644
index 000000000000..028aeae370b6
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,52 @@
1/*
2 * ePAPR para-virtualization support.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2, as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 *
17 * Copyright (C) 2012 Freescale Semiconductor, Inc.
18 */
19
20#include <linux/of.h>
21#include <asm/epapr_hcalls.h>
22#include <asm/cacheflush.h>
23#include <asm/code-patching.h>
24
25bool epapr_paravirt_enabled;
26
27static int __init epapr_paravirt_init(void)
28{
29 struct device_node *hyper_node;
30 const u32 *insts;
31 int len, i;
32
33 hyper_node = of_find_node_by_path("/hypervisor");
34 if (!hyper_node)
35 return -ENODEV;
36
37 insts = of_get_property(hyper_node, "hcall-instructions", &len);
38 if (!insts)
39 return -ENODEV;
40
41 if (len % 4 || len > (4 * 4))
42 return -ENODEV;
43
44 for (i = 0; i < (len / 4); i++)
45 patch_instruction(epapr_hypercall_start + i, insts[i]);
46
47 epapr_paravirt_enabled = true;
48
49 return 0;
50}
51
52early_initcall(epapr_paravirt_init);
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 02c167db6ba0..867db1de8949 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -31,6 +31,7 @@
31#include <asm/cacheflush.h> 31#include <asm/cacheflush.h>
32#include <asm/disassemble.h> 32#include <asm/disassemble.h>
33#include <asm/ppc-opcode.h> 33#include <asm/ppc-opcode.h>
34#include <asm/epapr_hcalls.h>
34 35
35#define KVM_MAGIC_PAGE (-4096L) 36#define KVM_MAGIC_PAGE (-4096L)
36#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) 37#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -726,7 +727,7 @@ unsigned long kvm_hypercall(unsigned long *in,
726 unsigned long register r11 asm("r11") = nr; 727 unsigned long register r11 asm("r11") = nr;
727 unsigned long register r12 asm("r12"); 728 unsigned long register r12 asm("r12");
728 729
729 asm volatile("bl kvm_hypercall_start" 730 asm volatile("bl epapr_hypercall_start"
730 : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), 731 : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
731 "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), 732 "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
732 "=r"(r12) 733 "=r"(r12)
@@ -747,29 +748,6 @@ unsigned long kvm_hypercall(unsigned long *in,
747} 748}
748EXPORT_SYMBOL_GPL(kvm_hypercall); 749EXPORT_SYMBOL_GPL(kvm_hypercall);
749 750
750static int kvm_para_setup(void)
751{
752 extern u32 kvm_hypercall_start;
753 struct device_node *hyper_node;
754 u32 *insts;
755 int len, i;
756
757 hyper_node = of_find_node_by_path("/hypervisor");
758 if (!hyper_node)
759 return -1;
760
761 insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
762 if (len % 4)
763 return -1;
764 if (len > (4 * 4))
765 return -1;
766
767 for (i = 0; i < (len / 4); i++)
768 kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
769
770 return 0;
771}
772
773static __init void kvm_free_tmp(void) 751static __init void kvm_free_tmp(void)
774{ 752{
775 unsigned long start, end; 753 unsigned long start, end;
@@ -791,7 +769,7 @@ static int __init kvm_guest_init(void)
791 if (!kvm_para_available()) 769 if (!kvm_para_available())
792 goto free_tmp; 770 goto free_tmp;
793 771
794 if (kvm_para_setup()) 772 if (!epapr_paravirt_enabled)
795 goto free_tmp; 773 goto free_tmp;
796 774
797 if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) 775 if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index e291cf3cf954..e100ff324a85 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -24,16 +24,6 @@
24#include <asm/page.h> 24#include <asm/page.h>
25#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
26 26
27/* Hypercall entry point. Will be patched with device tree instructions. */
28
29.global kvm_hypercall_start
30kvm_hypercall_start:
31 li r3, -1
32 nop
33 nop
34 nop
35 blr
36
37#define KVM_MAGIC_PAGE (-4096) 27#define KVM_MAGIC_PAGE (-4096)
38 28
39#ifdef CONFIG_64BIT 29#ifdef CONFIG_64BIT
@@ -132,7 +122,7 @@ kvm_emulate_mtmsrd_len:
132 .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 122 .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
133 123
134 124
135#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI) 125#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
136#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS 126#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
137 127
138.global kvm_emulate_mtmsr 128.global kvm_emulate_mtmsr
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 80a577517584..d03eb6f7b058 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,56 +37,121 @@
37/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ 37/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
38#define MAX_LPID_970 63 38#define MAX_LPID_970 63
39 39
40long kvmppc_alloc_hpt(struct kvm *kvm) 40/* Power architecture requires HPT is at least 256kB */
41#define PPC_MIN_HPT_ORDER 18
42
43long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
41{ 44{
42 unsigned long hpt; 45 unsigned long hpt;
43 long lpid;
44 struct revmap_entry *rev; 46 struct revmap_entry *rev;
45 struct kvmppc_linear_info *li; 47 struct kvmppc_linear_info *li;
48 long order = kvm_hpt_order;
46 49
47 /* Allocate guest's hashed page table */ 50 if (htab_orderp) {
48 li = kvm_alloc_hpt(); 51 order = *htab_orderp;
49 if (li) { 52 if (order < PPC_MIN_HPT_ORDER)
50 /* using preallocated memory */ 53 order = PPC_MIN_HPT_ORDER;
51 hpt = (ulong)li->base_virt; 54 }
52 kvm->arch.hpt_li = li; 55
53 } else { 56 /*
54 /* using dynamic memory */ 57 * If the user wants a different size from default,
58 * try first to allocate it from the kernel page allocator.
59 */
60 hpt = 0;
61 if (order != kvm_hpt_order) {
55 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| 62 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
56 __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); 63 __GFP_NOWARN, order - PAGE_SHIFT);
64 if (!hpt)
65 --order;
57 } 66 }
58 67
68 /* Next try to allocate from the preallocated pool */
59 if (!hpt) { 69 if (!hpt) {
60 pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); 70 li = kvm_alloc_hpt();
61 return -ENOMEM; 71 if (li) {
72 hpt = (ulong)li->base_virt;
73 kvm->arch.hpt_li = li;
74 order = kvm_hpt_order;
75 }
62 } 76 }
77
78 /* Lastly try successively smaller sizes from the page allocator */
79 while (!hpt && order > PPC_MIN_HPT_ORDER) {
80 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
81 __GFP_NOWARN, order - PAGE_SHIFT);
82 if (!hpt)
83 --order;
84 }
85
86 if (!hpt)
87 return -ENOMEM;
88
63 kvm->arch.hpt_virt = hpt; 89 kvm->arch.hpt_virt = hpt;
90 kvm->arch.hpt_order = order;
91 /* HPTEs are 2**4 bytes long */
92 kvm->arch.hpt_npte = 1ul << (order - 4);
93 /* 128 (2**7) bytes in each HPTEG */
94 kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
64 95
65 /* Allocate reverse map array */ 96 /* Allocate reverse map array */
66 rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); 97 rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
67 if (!rev) { 98 if (!rev) {
68 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); 99 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
69 goto out_freehpt; 100 goto out_freehpt;
70 } 101 }
71 kvm->arch.revmap = rev; 102 kvm->arch.revmap = rev;
103 kvm->arch.sdr1 = __pa(hpt) | (order - 18);
72 104
73 lpid = kvmppc_alloc_lpid(); 105 pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
74 if (lpid < 0) 106 hpt, order, kvm->arch.lpid);
75 goto out_freeboth;
76 107
77 kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); 108 if (htab_orderp)
78 kvm->arch.lpid = lpid; 109 *htab_orderp = order;
79
80 pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
81 return 0; 110 return 0;
82 111
83 out_freeboth:
84 vfree(rev);
85 out_freehpt: 112 out_freehpt:
86 free_pages(hpt, HPT_ORDER - PAGE_SHIFT); 113 if (kvm->arch.hpt_li)
114 kvm_release_hpt(kvm->arch.hpt_li);
115 else
116 free_pages(hpt, order - PAGE_SHIFT);
87 return -ENOMEM; 117 return -ENOMEM;
88} 118}
89 119
120long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
121{
122 long err = -EBUSY;
123 long order;
124
125 mutex_lock(&kvm->lock);
126 if (kvm->arch.rma_setup_done) {
127 kvm->arch.rma_setup_done = 0;
128 /* order rma_setup_done vs. vcpus_running */
129 smp_mb();
130 if (atomic_read(&kvm->arch.vcpus_running)) {
131 kvm->arch.rma_setup_done = 1;
132 goto out;
133 }
134 }
135 if (kvm->arch.hpt_virt) {
136 order = kvm->arch.hpt_order;
137 /* Set the entire HPT to 0, i.e. invalid HPTEs */
138 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
139 /*
140 * Set the whole last_vcpu array to an invalid vcpu number.
141 * This ensures that each vcpu will flush its TLB on next entry.
142 */
143 memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
144 *htab_orderp = order;
145 err = 0;
146 } else {
147 err = kvmppc_alloc_hpt(kvm, htab_orderp);
148 order = *htab_orderp;
149 }
150 out:
151 mutex_unlock(&kvm->lock);
152 return err;
153}
154
90void kvmppc_free_hpt(struct kvm *kvm) 155void kvmppc_free_hpt(struct kvm *kvm)
91{ 156{
92 kvmppc_free_lpid(kvm->arch.lpid); 157 kvmppc_free_lpid(kvm->arch.lpid);
@@ -94,7 +159,8 @@ void kvmppc_free_hpt(struct kvm *kvm)
94 if (kvm->arch.hpt_li) 159 if (kvm->arch.hpt_li)
95 kvm_release_hpt(kvm->arch.hpt_li); 160 kvm_release_hpt(kvm->arch.hpt_li);
96 else 161 else
97 free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); 162 free_pages(kvm->arch.hpt_virt,
163 kvm->arch.hpt_order - PAGE_SHIFT);
98} 164}
99 165
100/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 166/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -119,6 +185,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
119 unsigned long psize; 185 unsigned long psize;
120 unsigned long hp0, hp1; 186 unsigned long hp0, hp1;
121 long ret; 187 long ret;
188 struct kvm *kvm = vcpu->kvm;
122 189
123 psize = 1ul << porder; 190 psize = 1ul << porder;
124 npages = memslot->npages >> (porder - PAGE_SHIFT); 191 npages = memslot->npages >> (porder - PAGE_SHIFT);
@@ -127,8 +194,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
127 if (npages > 1ul << (40 - porder)) 194 if (npages > 1ul << (40 - porder))
128 npages = 1ul << (40 - porder); 195 npages = 1ul << (40 - porder);
129 /* Can't use more than 1 HPTE per HPTEG */ 196 /* Can't use more than 1 HPTE per HPTEG */
130 if (npages > HPT_NPTEG) 197 if (npages > kvm->arch.hpt_mask + 1)
131 npages = HPT_NPTEG; 198 npages = kvm->arch.hpt_mask + 1;
132 199
133 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 200 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
134 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 201 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -138,7 +205,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
138 for (i = 0; i < npages; ++i) { 205 for (i = 0; i < npages; ++i) {
139 addr = i << porder; 206 addr = i << porder;
140 /* can't use hpt_hash since va > 64 bits */ 207 /* can't use hpt_hash since va > 64 bits */
141 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; 208 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
142 /* 209 /*
143 * We assume that the hash table is empty and no 210 * We assume that the hash table is empty and no
144 * vcpus are using it at this stage. Since we create 211 * vcpus are using it at this stage. Since we create
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3abe1b86e583..83e929e66f9d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -56,7 +56,7 @@
56/* #define EXIT_DEBUG_INT */ 56/* #define EXIT_DEBUG_INT */
57 57
58static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 58static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
59static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); 59static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
60 60
61void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 61void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
62{ 62{
@@ -1104,11 +1104,15 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1104 return -EINTR; 1104 return -EINTR;
1105 } 1105 }
1106 1106
1107 /* On the first time here, set up VRMA or RMA */ 1107 atomic_inc(&vcpu->kvm->arch.vcpus_running);
1108 /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
1109 smp_mb();
1110
1111 /* On the first time here, set up HTAB and VRMA or RMA */
1108 if (!vcpu->kvm->arch.rma_setup_done) { 1112 if (!vcpu->kvm->arch.rma_setup_done) {
1109 r = kvmppc_hv_setup_rma(vcpu); 1113 r = kvmppc_hv_setup_htab_rma(vcpu);
1110 if (r) 1114 if (r)
1111 return r; 1115 goto out;
1112 } 1116 }
1113 1117
1114 flush_fp_to_thread(current); 1118 flush_fp_to_thread(current);
@@ -1126,6 +1130,9 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1126 kvmppc_core_prepare_to_enter(vcpu); 1130 kvmppc_core_prepare_to_enter(vcpu);
1127 } 1131 }
1128 } while (r == RESUME_GUEST); 1132 } while (r == RESUME_GUEST);
1133
1134 out:
1135 atomic_dec(&vcpu->kvm->arch.vcpus_running);
1129 return r; 1136 return r;
1130} 1137}
1131 1138
@@ -1341,7 +1348,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
1341{ 1348{
1342} 1349}
1343 1350
1344static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) 1351static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1345{ 1352{
1346 int err = 0; 1353 int err = 0;
1347 struct kvm *kvm = vcpu->kvm; 1354 struct kvm *kvm = vcpu->kvm;
@@ -1360,6 +1367,15 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
1360 if (kvm->arch.rma_setup_done) 1367 if (kvm->arch.rma_setup_done)
1361 goto out; /* another vcpu beat us to it */ 1368 goto out; /* another vcpu beat us to it */
1362 1369
1370 /* Allocate hashed page table (if not done already) and reset it */
1371 if (!kvm->arch.hpt_virt) {
1372 err = kvmppc_alloc_hpt(kvm, NULL);
1373 if (err) {
1374 pr_err("KVM: Couldn't alloc HPT\n");
1375 goto out;
1376 }
1377 }
1378
1363 /* Look up the memslot for guest physical address 0 */ 1379 /* Look up the memslot for guest physical address 0 */
1364 memslot = gfn_to_memslot(kvm, 0); 1380 memslot = gfn_to_memslot(kvm, 0);
1365 1381
@@ -1471,13 +1487,14 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
1471 1487
1472int kvmppc_core_init_vm(struct kvm *kvm) 1488int kvmppc_core_init_vm(struct kvm *kvm)
1473{ 1489{
1474 long r; 1490 unsigned long lpcr, lpid;
1475 unsigned long lpcr;
1476 1491
1477 /* Allocate hashed page table */ 1492 /* Allocate the guest's logical partition ID */
1478 r = kvmppc_alloc_hpt(kvm); 1493
1479 if (r) 1494 lpid = kvmppc_alloc_lpid();
1480 return r; 1495 if (lpid < 0)
1496 return -ENOMEM;
1497 kvm->arch.lpid = lpid;
1481 1498
1482 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1499 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1483 1500
@@ -1487,7 +1504,6 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1487 1504
1488 if (cpu_has_feature(CPU_FTR_ARCH_201)) { 1505 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
1489 /* PPC970; HID4 is effectively the LPCR */ 1506 /* PPC970; HID4 is effectively the LPCR */
1490 unsigned long lpid = kvm->arch.lpid;
1491 kvm->arch.host_lpid = 0; 1507 kvm->arch.host_lpid = 0;
1492 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); 1508 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
1493 lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); 1509 lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index e1b60f56f2a1..fb4eac290fef 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,9 @@ static void __init kvm_linear_init_one(ulong size, int count, int type);
25static struct kvmppc_linear_info *kvm_alloc_linear(int type); 25static struct kvmppc_linear_info *kvm_alloc_linear(int type);
26static void kvm_release_linear(struct kvmppc_linear_info *ri); 26static void kvm_release_linear(struct kvmppc_linear_info *ri);
27 27
28int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER;
29EXPORT_SYMBOL_GPL(kvm_hpt_order);
30
28/*************** RMA *************/ 31/*************** RMA *************/
29 32
30/* 33/*
@@ -209,7 +212,7 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri)
209void __init kvm_linear_init(void) 212void __init kvm_linear_init(void)
210{ 213{
211 /* HPT */ 214 /* HPT */
212 kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); 215 kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT);
213 216
214 /* RMA */ 217 /* RMA */
215 /* Only do this on PPC970 in HV mode */ 218 /* Only do this on PPC970 in HV mode */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index cec4daddbf31..5c70d19494f9 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -237,7 +237,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
237 237
238 /* Find and lock the HPTEG slot to use */ 238 /* Find and lock the HPTEG slot to use */
239 do_insert: 239 do_insert:
240 if (pte_index >= HPT_NPTE) 240 if (pte_index >= kvm->arch.hpt_npte)
241 return H_PARAMETER; 241 return H_PARAMETER;
242 if (likely((flags & H_EXACT) == 0)) { 242 if (likely((flags & H_EXACT) == 0)) {
243 pte_index &= ~7UL; 243 pte_index &= ~7UL;
@@ -352,7 +352,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
352 unsigned long v, r, rb; 352 unsigned long v, r, rb;
353 struct revmap_entry *rev; 353 struct revmap_entry *rev;
354 354
355 if (pte_index >= HPT_NPTE) 355 if (pte_index >= kvm->arch.hpt_npte)
356 return H_PARAMETER; 356 return H_PARAMETER;
357 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 357 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
358 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 358 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
@@ -419,7 +419,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
419 i = 4; 419 i = 4;
420 break; 420 break;
421 } 421 }
422 if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { 422 if (req != 1 || flags == 3 ||
423 pte_index >= kvm->arch.hpt_npte) {
423 /* parameter error */ 424 /* parameter error */
424 args[j] = ((0xa0 | flags) << 56) + pte_index; 425 args[j] = ((0xa0 | flags) << 56) + pte_index;
425 ret = H_PARAMETER; 426 ret = H_PARAMETER;
@@ -521,7 +522,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
521 struct revmap_entry *rev; 522 struct revmap_entry *rev;
522 unsigned long v, r, rb, mask, bits; 523 unsigned long v, r, rb, mask, bits;
523 524
524 if (pte_index >= HPT_NPTE) 525 if (pte_index >= kvm->arch.hpt_npte)
525 return H_PARAMETER; 526 return H_PARAMETER;
526 527
527 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 528 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
@@ -583,7 +584,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
583 int i, n = 1; 584 int i, n = 1;
584 struct revmap_entry *rev = NULL; 585 struct revmap_entry *rev = NULL;
585 586
586 if (pte_index >= HPT_NPTE) 587 if (pte_index >= kvm->arch.hpt_npte)
587 return H_PARAMETER; 588 return H_PARAMETER;
588 if (flags & H_READ_4) { 589 if (flags & H_READ_4) {
589 pte_index &= ~3; 590 pte_index &= ~3;
@@ -678,7 +679,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
678 somask = (1UL << 28) - 1; 679 somask = (1UL << 28) - 1;
679 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; 680 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
680 } 681 }
681 hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; 682 hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
682 avpn = slb_v & ~(somask >> 16); /* also includes B */ 683 avpn = slb_v & ~(somask >> 16); /* also includes B */
683 avpn |= (eaddr & somask) >> 16; 684 avpn |= (eaddr & somask) >> 16;
684 685
@@ -723,7 +724,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
723 if (val & HPTE_V_SECONDARY) 724 if (val & HPTE_V_SECONDARY)
724 break; 725 break;
725 val |= HPTE_V_SECONDARY; 726 val |= HPTE_V_SECONDARY;
726 hash = hash ^ HPT_HASH_MASK; 727 hash = hash ^ kvm->arch.hpt_mask;
727 } 728 }
728 return -1; 729 return -1;
729} 730}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 72f13f4a06e0..d25a097c852b 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -612,6 +612,12 @@ static void kvmppc_fill_pt_regs(struct pt_regs *regs)
612 regs->link = lr; 612 regs->link = lr;
613} 613}
614 614
615/*
616 * For interrupts needed to be handled by host interrupt handlers,
617 * corresponding host handler are called from here in similar way
618 * (but not exact) as they are called from low level handler
619 * (such as from arch/powerpc/kernel/head_fsl_booke.S).
620 */
615static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, 621static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
616 unsigned int exit_nr) 622 unsigned int exit_nr)
617{ 623{
@@ -639,6 +645,17 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
639 kvmppc_fill_pt_regs(&regs); 645 kvmppc_fill_pt_regs(&regs);
640 performance_monitor_exception(&regs); 646 performance_monitor_exception(&regs);
641 break; 647 break;
648 case BOOKE_INTERRUPT_WATCHDOG:
649 kvmppc_fill_pt_regs(&regs);
650#ifdef CONFIG_BOOKE_WDT
651 WatchdogException(&regs);
652#else
653 unknown_exception(&regs);
654#endif
655 break;
656 case BOOKE_INTERRUPT_CRITICAL:
657 unknown_exception(&regs);
658 break;
642 } 659 }
643} 660}
644 661
@@ -683,6 +700,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
683 r = RESUME_GUEST; 700 r = RESUME_GUEST;
684 break; 701 break;
685 702
703 case BOOKE_INTERRUPT_WATCHDOG:
704 r = RESUME_GUEST;
705 break;
706
686 case BOOKE_INTERRUPT_DOORBELL: 707 case BOOKE_INTERRUPT_DOORBELL:
687 kvmppc_account_exit(vcpu, DBELL_EXITS); 708 kvmppc_account_exit(vcpu, DBELL_EXITS);
688 r = RESUME_GUEST; 709 r = RESUME_GUEST;
@@ -1267,6 +1288,11 @@ void kvmppc_decrementer_func(unsigned long data)
1267{ 1288{
1268 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; 1289 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
1269 1290
1291 if (vcpu->arch.tcr & TCR_ARE) {
1292 vcpu->arch.dec = vcpu->arch.decar;
1293 kvmppc_emulate_dec(vcpu);
1294 }
1295
1270 kvmppc_set_tsr_bits(vcpu, TSR_DIS); 1296 kvmppc_set_tsr_bits(vcpu, TSR_DIS);
1271} 1297}
1272 1298
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 6c76397f2af4..12834bb608ab 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -24,6 +24,7 @@
24#include "booke.h" 24#include "booke.h"
25 25
26#define OP_19_XOP_RFI 50 26#define OP_19_XOP_RFI 50
27#define OP_19_XOP_RFCI 51
27 28
28#define OP_31_XOP_MFMSR 83 29#define OP_31_XOP_MFMSR 83
29#define OP_31_XOP_WRTEE 131 30#define OP_31_XOP_WRTEE 131
@@ -36,6 +37,12 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
36 kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); 37 kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
37} 38}
38 39
40static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu)
41{
42 vcpu->arch.pc = vcpu->arch.csrr0;
43 kvmppc_set_msr(vcpu, vcpu->arch.csrr1);
44}
45
39int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 46int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
40 unsigned int inst, int *advance) 47 unsigned int inst, int *advance)
41{ 48{
@@ -52,6 +59,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
52 *advance = 0; 59 *advance = 0;
53 break; 60 break;
54 61
62 case OP_19_XOP_RFCI:
63 kvmppc_emul_rfci(vcpu);
64 kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS);
65 *advance = 0;
66 break;
67
55 default: 68 default:
56 emulated = EMULATE_FAIL; 69 emulated = EMULATE_FAIL;
57 break; 70 break;
@@ -113,6 +126,12 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
113 case SPRN_ESR: 126 case SPRN_ESR:
114 vcpu->arch.shared->esr = spr_val; 127 vcpu->arch.shared->esr = spr_val;
115 break; 128 break;
129 case SPRN_CSRR0:
130 vcpu->arch.csrr0 = spr_val;
131 break;
132 case SPRN_CSRR1:
133 vcpu->arch.csrr1 = spr_val;
134 break;
116 case SPRN_DBCR0: 135 case SPRN_DBCR0:
117 vcpu->arch.dbcr0 = spr_val; 136 vcpu->arch.dbcr0 = spr_val;
118 break; 137 break;
@@ -129,6 +148,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
129 kvmppc_set_tcr(vcpu, spr_val); 148 kvmppc_set_tcr(vcpu, spr_val);
130 break; 149 break;
131 150
151 case SPRN_DECAR:
152 vcpu->arch.decar = spr_val;
153 break;
132 /* 154 /*
133 * Note: SPRG4-7 are user-readable. 155 * Note: SPRG4-7 are user-readable.
134 * These values are loaded into the real SPRGs when resuming the 156 * These values are loaded into the real SPRGs when resuming the
@@ -229,6 +251,12 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
229 case SPRN_ESR: 251 case SPRN_ESR:
230 *spr_val = vcpu->arch.shared->esr; 252 *spr_val = vcpu->arch.shared->esr;
231 break; 253 break;
254 case SPRN_CSRR0:
255 *spr_val = vcpu->arch.csrr0;
256 break;
257 case SPRN_CSRR1:
258 *spr_val = vcpu->arch.csrr1;
259 break;
232 case SPRN_DBCR0: 260 case SPRN_DBCR0:
233 *spr_val = vcpu->arch.dbcr0; 261 *spr_val = vcpu->arch.dbcr0;
234 break; 262 break;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 8fd4b2a0911b..bb46b32f9813 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -52,16 +52,21 @@
52 (1<<BOOKE_INTERRUPT_PROGRAM) | \ 52 (1<<BOOKE_INTERRUPT_PROGRAM) | \
53 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 53 (1<<BOOKE_INTERRUPT_DTLB_MISS))
54 54
55.macro KVM_HANDLER ivor_nr 55.macro KVM_HANDLER ivor_nr scratch srr0
56_GLOBAL(kvmppc_handler_\ivor_nr) 56_GLOBAL(kvmppc_handler_\ivor_nr)
57 /* Get pointer to vcpu and record exit number. */ 57 /* Get pointer to vcpu and record exit number. */
58 mtspr SPRN_SPRG_WSCRATCH0, r4 58 mtspr \scratch , r4
59 mfspr r4, SPRN_SPRG_RVCPU 59 mfspr r4, SPRN_SPRG_RVCPU
60 stw r3, VCPU_GPR(R3)(r4)
60 stw r5, VCPU_GPR(R5)(r4) 61 stw r5, VCPU_GPR(R5)(r4)
61 stw r6, VCPU_GPR(R6)(r4) 62 stw r6, VCPU_GPR(R6)(r4)
63 mfspr r3, \scratch
62 mfctr r5 64 mfctr r5
63 lis r6, kvmppc_resume_host@h 65 stw r3, VCPU_GPR(R4)(r4)
64 stw r5, VCPU_CTR(r4) 66 stw r5, VCPU_CTR(r4)
67 mfspr r3, \srr0
68 lis r6, kvmppc_resume_host@h
69 stw r3, VCPU_PC(r4)
65 li r5, \ivor_nr 70 li r5, \ivor_nr
66 ori r6, r6, kvmppc_resume_host@l 71 ori r6, r6, kvmppc_resume_host@l
67 mtctr r6 72 mtctr r6
@@ -69,37 +74,35 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
69.endm 74.endm
70 75
71_GLOBAL(kvmppc_handlers_start) 76_GLOBAL(kvmppc_handlers_start)
72KVM_HANDLER BOOKE_INTERRUPT_CRITICAL 77KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
73KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK 78KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
74KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE 79KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
75KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE 80KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
76KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL 81KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
77KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT 82KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
78KVM_HANDLER BOOKE_INTERRUPT_PROGRAM 83KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0
79KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL 84KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
80KVM_HANDLER BOOKE_INTERRUPT_SYSCALL 85KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
81KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL 86KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
82KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER 87KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0
83KVM_HANDLER BOOKE_INTERRUPT_FIT 88KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
84KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG 89KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
85KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS 90KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
86KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS 91KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
87KVM_HANDLER BOOKE_INTERRUPT_DEBUG 92KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
88KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL 93KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
89KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA 94KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
90KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND 95KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
91 96
92_GLOBAL(kvmppc_handler_len) 97_GLOBAL(kvmppc_handler_len)
93 .long kvmppc_handler_1 - kvmppc_handler_0 98 .long kvmppc_handler_1 - kvmppc_handler_0
94 99
95
96/* Registers: 100/* Registers:
97 * SPRG_SCRATCH0: guest r4 101 * SPRG_SCRATCH0: guest r4
98 * r4: vcpu pointer 102 * r4: vcpu pointer
99 * r5: KVM exit number 103 * r5: KVM exit number
100 */ 104 */
101_GLOBAL(kvmppc_resume_host) 105_GLOBAL(kvmppc_resume_host)
102 stw r3, VCPU_GPR(R3)(r4)
103 mfcr r3 106 mfcr r3
104 stw r3, VCPU_CR(r4) 107 stw r3, VCPU_CR(r4)
105 stw r7, VCPU_GPR(R7)(r4) 108 stw r7, VCPU_GPR(R7)(r4)
@@ -180,10 +183,6 @@ _GLOBAL(kvmppc_resume_host)
180 stw r3, VCPU_LR(r4) 183 stw r3, VCPU_LR(r4)
181 mfxer r3 184 mfxer r3
182 stw r3, VCPU_XER(r4) 185 stw r3, VCPU_XER(r4)
183 mfspr r3, SPRN_SPRG_RSCRATCH0
184 stw r3, VCPU_GPR(R4)(r4)
185 mfspr r3, SPRN_SRR0
186 stw r3, VCPU_PC(r4)
187 186
188 /* Restore host stack pointer and PID before IVPR, since the host 187 /* Restore host stack pointer and PID before IVPR, since the host
189 * exception handlers use them. */ 188 * exception handlers use them. */
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 1685dc43bcf2..d28c2d43ac1b 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -262,7 +262,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
262kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ 262kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
263 SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 263 SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
264kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ 264kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
265 SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR) 265 SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
266kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR 266kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
267kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 267kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
268kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ 268kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 8b99e076dc81..e04b0ef55ce0 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
269 *spr_val = vcpu->arch.shared->mas7_3 >> 32; 269 *spr_val = vcpu->arch.shared->mas7_3 >> 32;
270 break; 270 break;
271#endif 271#endif
272 case SPRN_DECAR:
273 *spr_val = vcpu->arch.decar;
274 break;
272 case SPRN_TLB0CFG: 275 case SPRN_TLB0CFG:
273 *spr_val = vcpu->arch.tlbcfg[0]; 276 *spr_val = vcpu->arch.tlbcfg[0];
274 break; 277 break;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index fe6c1de6b701..1f89d26e65fb 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved. 2 * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved.
3 * 3 *
4 * Author: Varun Sethi, <varun.sethi@freescale.com> 4 * Author: Varun Sethi, <varun.sethi@freescale.com>
5 * 5 *
@@ -57,7 +57,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
57 struct kvm_book3e_206_tlb_entry *gtlbe) 57 struct kvm_book3e_206_tlb_entry *gtlbe)
58{ 58{
59 unsigned int tid, ts; 59 unsigned int tid, ts;
60 u32 val, eaddr, lpid; 60 gva_t eaddr;
61 u32 val, lpid;
61 unsigned long flags; 62 unsigned long flags;
62 63
63 ts = get_tlb_ts(gtlbe); 64 ts = get_tlb_ts(gtlbe);
@@ -183,6 +184,9 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
183 184
184 vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ 185 vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
185 SPRN_EPCR_DUVD; 186 SPRN_EPCR_DUVD;
187#ifdef CONFIG_64BIT
188 vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM;
189#endif
186 vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; 190 vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP;
187 vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); 191 vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT);
188 vcpu->arch.epsc = vcpu->arch.eplc; 192 vcpu->arch.epsc = vcpu->arch.eplc;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index f90e86dea7a2..ee04abaefe23 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -59,11 +59,13 @@
59#define OP_31_XOP_STHBRX 918 59#define OP_31_XOP_STHBRX 918
60 60
61#define OP_LWZ 32 61#define OP_LWZ 32
62#define OP_LD 58
62#define OP_LWZU 33 63#define OP_LWZU 33
63#define OP_LBZ 34 64#define OP_LBZ 34
64#define OP_LBZU 35 65#define OP_LBZU 35
65#define OP_STW 36 66#define OP_STW 36
66#define OP_STWU 37 67#define OP_STWU 37
68#define OP_STD 62
67#define OP_STB 38 69#define OP_STB 38
68#define OP_STBU 39 70#define OP_STBU 39
69#define OP_LHZ 40 71#define OP_LHZ 40
@@ -392,6 +394,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
392 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); 394 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
393 break; 395 break;
394 396
397 /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */
398 case OP_LD:
399 rt = get_rt(inst);
400 emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
401 break;
402
395 case OP_LWZU: 403 case OP_LWZU:
396 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); 404 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
397 kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); 405 kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
@@ -412,6 +420,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
412 4, 1); 420 4, 1);
413 break; 421 break;
414 422
423 /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */
424 case OP_STD:
425 rs = get_rs(inst);
426 emulated = kvmppc_handle_store(run, vcpu,
427 kvmppc_get_gpr(vcpu, rs),
428 8, 1);
429 break;
430
415 case OP_STWU: 431 case OP_STWU:
416 emulated = kvmppc_handle_store(run, vcpu, 432 emulated = kvmppc_handle_store(run, vcpu,
417 kvmppc_get_gpr(vcpu, rs), 433 kvmppc_get_gpr(vcpu, rs),
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1493c8de947b..87f4dc886076 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -246,6 +246,7 @@ int kvm_dev_ioctl_check_extension(long ext)
246#endif 246#endif
247#ifdef CONFIG_PPC_BOOK3S_64 247#ifdef CONFIG_PPC_BOOK3S_64
248 case KVM_CAP_SPAPR_TCE: 248 case KVM_CAP_SPAPR_TCE:
249 case KVM_CAP_PPC_ALLOC_HTAB:
249 r = 1; 250 r = 1;
250 break; 251 break;
251#endif /* CONFIG_PPC_BOOK3S_64 */ 252#endif /* CONFIG_PPC_BOOK3S_64 */
@@ -802,6 +803,23 @@ long kvm_arch_vm_ioctl(struct file *filp,
802 r = -EFAULT; 803 r = -EFAULT;
803 break; 804 break;
804 } 805 }
806
807 case KVM_PPC_ALLOCATE_HTAB: {
808 struct kvm *kvm = filp->private_data;
809 u32 htab_order;
810
811 r = -EFAULT;
812 if (get_user(htab_order, (u32 __user *)argp))
813 break;
814 r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
815 if (r)
816 break;
817 r = -EFAULT;
818 if (put_user(htab_order, (u32 __user *)argp))
819 break;
820 r = 0;
821 break;
822 }
805#endif /* CONFIG_KVM_BOOK3S_64_HV */ 823#endif /* CONFIG_KVM_BOOK3S_64_HV */
806 824
807#ifdef CONFIG_PPC_BOOK3S_64 825#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index a35ca44ade66..e7a896acd982 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -25,6 +25,7 @@ source "arch/powerpc/platforms/wsp/Kconfig"
25config KVM_GUEST 25config KVM_GUEST
26 bool "KVM Guest support" 26 bool "KVM Guest support"
27 default n 27 default n
28 select EPAPR_PARAVIRT
28 ---help--- 29 ---help---
29 This option enables various optimizations for running under the KVM 30 This option enables various optimizations for running under the KVM
30 hypervisor. Overhead for the kernel when not running inside KVM should 31 hypervisor. Overhead for the kernel when not running inside KVM should
@@ -32,6 +33,14 @@ config KVM_GUEST
32 33
33 In case of doubt, say Y 34 In case of doubt, say Y
34 35
36config EPAPR_PARAVIRT
37 bool "ePAPR para-virtualization support"
38 default n
39 help
40 Enables ePAPR para-virtualization support for guests.
41
42 In case of doubt, say Y
43
35config PPC_NATIVE 44config PPC_NATIVE
36 bool 45 bool
37 depends on 6xx || PPC64 46 depends on 6xx || PPC64
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 8685d1fb8b75..e62a555557ee 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -53,5 +53,7 @@ int sclp_chp_configure(struct chp_id chpid);
53int sclp_chp_deconfigure(struct chp_id chpid); 53int sclp_chp_deconfigure(struct chp_id chpid);
54int sclp_chp_read_info(struct sclp_chp_info *info); 54int sclp_chp_read_info(struct sclp_chp_info *info);
55void sclp_get_ipl_info(struct sclp_ipl_info *info); 55void sclp_get_ipl_info(struct sclp_ipl_info *info);
56bool sclp_has_linemode(void);
57bool sclp_has_vt220(void);
56 58
57#endif /* _ASM_S390_SCLP_H */ 59#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index 7306270b5b84..5a87d16d3e7c 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -24,6 +24,7 @@
24 24
25#define SIGP_STATUS_CHECK_STOP 0x00000010UL 25#define SIGP_STATUS_CHECK_STOP 0x00000010UL
26#define SIGP_STATUS_STOPPED 0x00000040UL 26#define SIGP_STATUS_STOPPED 0x00000040UL
27#define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL
27#define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL 28#define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL
28#define SIGP_STATUS_INCORRECT_STATE 0x00000200UL 29#define SIGP_STATUS_INCORRECT_STATE 0x00000200UL
29#define SIGP_STATUS_NOT_RUNNING 0x00000400UL 30#define SIGP_STATUS_NOT_RUNNING 0x00000400UL
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 34d75b50526c..743c0f32fe3b 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -61,6 +61,7 @@
61#include <asm/kvm_virtio.h> 61#include <asm/kvm_virtio.h>
62#include <asm/diag.h> 62#include <asm/diag.h>
63#include <asm/os_info.h> 63#include <asm/os_info.h>
64#include <asm/sclp.h>
64#include "entry.h" 65#include "entry.h"
65 66
66long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | 67long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY |
@@ -136,9 +137,14 @@ __setup("condev=", condev_setup);
136 137
137static void __init set_preferred_console(void) 138static void __init set_preferred_console(void)
138{ 139{
139 if (MACHINE_IS_KVM) 140 if (MACHINE_IS_KVM) {
140 add_preferred_console("hvc", 0, NULL); 141 if (sclp_has_vt220())
141 else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) 142 add_preferred_console("ttyS", 1, NULL);
143 else if (sclp_has_linemode())
144 add_preferred_console("ttyS", 0, NULL);
145 else
146 add_preferred_console("hvc", 0, NULL);
147 } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP)
142 add_preferred_console("ttyS", 0, NULL); 148 add_preferred_console("ttyS", 0, NULL);
143 else if (CONSOLE_IS_3270) 149 else if (CONSOLE_IS_3270)
144 add_preferred_console("tty3270", 0, NULL); 150 add_preferred_console("tty3270", 0, NULL);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c552d1f4103f..d470ccbfabae 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -347,6 +347,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
347 vcpu->arch.guest_fpregs.fpc = 0; 347 vcpu->arch.guest_fpregs.fpc = 0;
348 asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); 348 asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
349 vcpu->arch.sie_block->gbea = 1; 349 vcpu->arch.sie_block->gbea = 1;
350 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
350} 351}
351 352
352int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 353int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 1ab2ce1611c5..56f80e1f98f7 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -26,19 +26,23 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
26 int rc; 26 int rc;
27 27
28 if (cpu_addr >= KVM_MAX_VCPUS) 28 if (cpu_addr >= KVM_MAX_VCPUS)
29 return 3; /* not operational */ 29 return SIGP_CC_NOT_OPERATIONAL;
30 30
31 spin_lock(&fi->lock); 31 spin_lock(&fi->lock);
32 if (fi->local_int[cpu_addr] == NULL) 32 if (fi->local_int[cpu_addr] == NULL)
33 rc = 3; /* not operational */ 33 rc = SIGP_CC_NOT_OPERATIONAL;
34 else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) 34 else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags)
35 & CPUSTAT_STOPPED)) { 35 & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
36 *reg &= 0xffffffff00000000UL; 36 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
37 rc = 1; /* status stored */ 37 else {
38 } else {
39 *reg &= 0xffffffff00000000UL; 38 *reg &= 0xffffffff00000000UL;
40 *reg |= SIGP_STATUS_STOPPED; 39 if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
41 rc = 1; /* status stored */ 40 & CPUSTAT_ECALL_PEND)
41 *reg |= SIGP_STATUS_EXT_CALL_PENDING;
42 if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
43 & CPUSTAT_STOPPED)
44 *reg |= SIGP_STATUS_STOPPED;
45 rc = SIGP_CC_STATUS_STORED;
42 } 46 }
43 spin_unlock(&fi->lock); 47 spin_unlock(&fi->lock);
44 48
@@ -54,7 +58,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
54 int rc; 58 int rc;
55 59
56 if (cpu_addr >= KVM_MAX_VCPUS) 60 if (cpu_addr >= KVM_MAX_VCPUS)
57 return 3; /* not operational */ 61 return SIGP_CC_NOT_OPERATIONAL;
58 62
59 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 63 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
60 if (!inti) 64 if (!inti)
@@ -66,7 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
66 spin_lock(&fi->lock); 70 spin_lock(&fi->lock);
67 li = fi->local_int[cpu_addr]; 71 li = fi->local_int[cpu_addr];
68 if (li == NULL) { 72 if (li == NULL) {
69 rc = 3; /* not operational */ 73 rc = SIGP_CC_NOT_OPERATIONAL;
70 kfree(inti); 74 kfree(inti);
71 goto unlock; 75 goto unlock;
72 } 76 }
@@ -77,7 +81,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
77 if (waitqueue_active(&li->wq)) 81 if (waitqueue_active(&li->wq))
78 wake_up_interruptible(&li->wq); 82 wake_up_interruptible(&li->wq);
79 spin_unlock_bh(&li->lock); 83 spin_unlock_bh(&li->lock);
80 rc = 0; /* order accepted */ 84 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
81 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); 85 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
82unlock: 86unlock:
83 spin_unlock(&fi->lock); 87 spin_unlock(&fi->lock);
@@ -92,7 +96,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
92 int rc; 96 int rc;
93 97
94 if (cpu_addr >= KVM_MAX_VCPUS) 98 if (cpu_addr >= KVM_MAX_VCPUS)
95 return 3; /* not operational */ 99 return SIGP_CC_NOT_OPERATIONAL;
96 100
97 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 101 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
98 if (!inti) 102 if (!inti)
@@ -104,7 +108,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
104 spin_lock(&fi->lock); 108 spin_lock(&fi->lock);
105 li = fi->local_int[cpu_addr]; 109 li = fi->local_int[cpu_addr];
106 if (li == NULL) { 110 if (li == NULL) {
107 rc = 3; /* not operational */ 111 rc = SIGP_CC_NOT_OPERATIONAL;
108 kfree(inti); 112 kfree(inti);
109 goto unlock; 113 goto unlock;
110 } 114 }
@@ -115,7 +119,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
115 if (waitqueue_active(&li->wq)) 119 if (waitqueue_active(&li->wq))
116 wake_up_interruptible(&li->wq); 120 wake_up_interruptible(&li->wq);
117 spin_unlock_bh(&li->lock); 121 spin_unlock_bh(&li->lock);
118 rc = 0; /* order accepted */ 122 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
119 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); 123 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
120unlock: 124unlock:
121 spin_unlock(&fi->lock); 125 spin_unlock(&fi->lock);
@@ -143,7 +147,7 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
143out: 147out:
144 spin_unlock_bh(&li->lock); 148 spin_unlock_bh(&li->lock);
145 149
146 return 0; /* order accepted */ 150 return SIGP_CC_ORDER_CODE_ACCEPTED;
147} 151}
148 152
149static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) 153static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
@@ -153,12 +157,12 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
153 int rc; 157 int rc;
154 158
155 if (cpu_addr >= KVM_MAX_VCPUS) 159 if (cpu_addr >= KVM_MAX_VCPUS)
156 return 3; /* not operational */ 160 return SIGP_CC_NOT_OPERATIONAL;
157 161
158 spin_lock(&fi->lock); 162 spin_lock(&fi->lock);
159 li = fi->local_int[cpu_addr]; 163 li = fi->local_int[cpu_addr];
160 if (li == NULL) { 164 if (li == NULL) {
161 rc = 3; /* not operational */ 165 rc = SIGP_CC_NOT_OPERATIONAL;
162 goto unlock; 166 goto unlock;
163 } 167 }
164 168
@@ -182,11 +186,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
182 186
183 switch (parameter & 0xff) { 187 switch (parameter & 0xff) {
184 case 0: 188 case 0:
185 rc = 3; /* not operational */ 189 rc = SIGP_CC_NOT_OPERATIONAL;
186 break; 190 break;
187 case 1: 191 case 1:
188 case 2: 192 case 2:
189 rc = 0; /* order accepted */ 193 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
190 break; 194 break;
191 default: 195 default:
192 rc = -EOPNOTSUPP; 196 rc = -EOPNOTSUPP;
@@ -207,21 +211,23 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
207 address = address & 0x7fffe000u; 211 address = address & 0x7fffe000u;
208 if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || 212 if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
209 copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { 213 copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) {
214 *reg &= 0xffffffff00000000UL;
210 *reg |= SIGP_STATUS_INVALID_PARAMETER; 215 *reg |= SIGP_STATUS_INVALID_PARAMETER;
211 return 1; /* invalid parameter */ 216 return SIGP_CC_STATUS_STORED;
212 } 217 }
213 218
214 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 219 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
215 if (!inti) 220 if (!inti)
216 return 2; /* busy */ 221 return SIGP_CC_BUSY;
217 222
218 spin_lock(&fi->lock); 223 spin_lock(&fi->lock);
219 if (cpu_addr < KVM_MAX_VCPUS) 224 if (cpu_addr < KVM_MAX_VCPUS)
220 li = fi->local_int[cpu_addr]; 225 li = fi->local_int[cpu_addr];
221 226
222 if (li == NULL) { 227 if (li == NULL) {
223 rc = 1; /* incorrect state */ 228 *reg &= 0xffffffff00000000UL;
224 *reg &= SIGP_STATUS_INCORRECT_STATE; 229 *reg |= SIGP_STATUS_INCORRECT_STATE;
230 rc = SIGP_CC_STATUS_STORED;
225 kfree(inti); 231 kfree(inti);
226 goto out_fi; 232 goto out_fi;
227 } 233 }
@@ -229,8 +235,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
229 spin_lock_bh(&li->lock); 235 spin_lock_bh(&li->lock);
230 /* cpu must be in stopped state */ 236 /* cpu must be in stopped state */
231 if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { 237 if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
232 rc = 1; /* incorrect state */ 238 *reg &= 0xffffffff00000000UL;
233 *reg &= SIGP_STATUS_INCORRECT_STATE; 239 *reg |= SIGP_STATUS_INCORRECT_STATE;
240 rc = SIGP_CC_STATUS_STORED;
234 kfree(inti); 241 kfree(inti);
235 goto out_li; 242 goto out_li;
236 } 243 }
@@ -242,7 +249,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
242 atomic_set(&li->active, 1); 249 atomic_set(&li->active, 1);
243 if (waitqueue_active(&li->wq)) 250 if (waitqueue_active(&li->wq))
244 wake_up_interruptible(&li->wq); 251 wake_up_interruptible(&li->wq);
245 rc = 0; /* order accepted */ 252 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
246 253
247 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); 254 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
248out_li: 255out_li:
@@ -259,21 +266,21 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
259 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 266 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
260 267
261 if (cpu_addr >= KVM_MAX_VCPUS) 268 if (cpu_addr >= KVM_MAX_VCPUS)
262 return 3; /* not operational */ 269 return SIGP_CC_NOT_OPERATIONAL;
263 270
264 spin_lock(&fi->lock); 271 spin_lock(&fi->lock);
265 if (fi->local_int[cpu_addr] == NULL) 272 if (fi->local_int[cpu_addr] == NULL)
266 rc = 3; /* not operational */ 273 rc = SIGP_CC_NOT_OPERATIONAL;
267 else { 274 else {
268 if (atomic_read(fi->local_int[cpu_addr]->cpuflags) 275 if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
269 & CPUSTAT_RUNNING) { 276 & CPUSTAT_RUNNING) {
270 /* running */ 277 /* running */
271 rc = 1; 278 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
272 } else { 279 } else {
273 /* not running */ 280 /* not running */
274 *reg &= 0xffffffff00000000UL; 281 *reg &= 0xffffffff00000000UL;
275 *reg |= SIGP_STATUS_NOT_RUNNING; 282 *reg |= SIGP_STATUS_NOT_RUNNING;
276 rc = 0; 283 rc = SIGP_CC_STATUS_STORED;
277 } 284 }
278 } 285 }
279 spin_unlock(&fi->lock); 286 spin_unlock(&fi->lock);
@@ -286,23 +293,23 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
286 293
287static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) 294static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr)
288{ 295{
289 int rc = 0;
290 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 296 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
291 struct kvm_s390_local_interrupt *li; 297 struct kvm_s390_local_interrupt *li;
298 int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
292 299
293 if (cpu_addr >= KVM_MAX_VCPUS) 300 if (cpu_addr >= KVM_MAX_VCPUS)
294 return 3; /* not operational */ 301 return SIGP_CC_NOT_OPERATIONAL;
295 302
296 spin_lock(&fi->lock); 303 spin_lock(&fi->lock);
297 li = fi->local_int[cpu_addr]; 304 li = fi->local_int[cpu_addr];
298 if (li == NULL) { 305 if (li == NULL) {
299 rc = 3; /* not operational */ 306 rc = SIGP_CC_NOT_OPERATIONAL;
300 goto out; 307 goto out;
301 } 308 }
302 309
303 spin_lock_bh(&li->lock); 310 spin_lock_bh(&li->lock);
304 if (li->action_bits & ACTION_STOP_ON_STOP) 311 if (li->action_bits & ACTION_STOP_ON_STOP)
305 rc = 2; /* busy */ 312 rc = SIGP_CC_BUSY;
306 else 313 else
307 VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", 314 VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace",
308 cpu_addr); 315 cpu_addr);
@@ -377,7 +384,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
377 case SIGP_RESTART: 384 case SIGP_RESTART:
378 vcpu->stat.instruction_sigp_restart++; 385 vcpu->stat.instruction_sigp_restart++;
379 rc = __sigp_restart(vcpu, cpu_addr); 386 rc = __sigp_restart(vcpu, cpu_addr);
380 if (rc == 2) /* busy */ 387 if (rc == SIGP_CC_BUSY)
381 break; 388 break;
382 /* user space must know about restart */ 389 /* user space must know about restart */
383 default: 390 default:
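
The sigp.c hunks above replace the bare condition-code literals 0-3 with named SIGP_CC_* constants. A minimal sketch of the definitions this assumes (presumably in arch/s390/include/asm/sigp.h, which is listed in the diffstat but not shown in this hunk), inferred purely from the one-to-one replacements above:

/* Sketch inferred from the replacements above, not a quote of the header. */
#define SIGP_CC_ORDER_CODE_ACCEPTED	0
#define SIGP_CC_STATUS_STORED		1
#define SIGP_CC_BUSY			2
#define SIGP_CC_NOT_OPERATIONAL	3
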
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 88093c1d44fd..3ea51a84a0e4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -465,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)
465 return apic->safe_wait_icr_idle(); 465 return apic->safe_wait_icr_idle();
466} 466}
467 467
468extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
469
468#else /* CONFIG_X86_LOCAL_APIC */ 470#else /* CONFIG_X86_LOCAL_APIC */
469 471
470static inline u32 apic_read(u32 reg) { return 0; } 472static inline u32 apic_read(u32 reg) { return 0; }
@@ -474,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }
474static inline void apic_icr_write(u32 low, u32 high) { } 476static inline void apic_icr_write(u32 low, u32 high) { }
475static inline void apic_wait_icr_idle(void) { } 477static inline void apic_wait_icr_idle(void) { }
476static inline u32 safe_apic_wait_icr_idle(void) { return 0; } 478static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
479static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
477 480
478#endif /* CONFIG_X86_LOCAL_APIC */ 481#endif /* CONFIG_X86_LOCAL_APIC */
479 482
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index a6983b277220..72f5009deb5a 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
264 * This operation is non-atomic and can be reordered. 264 * This operation is non-atomic and can be reordered.
265 * If two examples of this operation race, one can appear to succeed 265 * If two examples of this operation race, one can appear to succeed
266 * but actually fail. You must protect multiple accesses with a lock. 266 * but actually fail. You must protect multiple accesses with a lock.
267 *
268 * Note: the operation is performed atomically with respect to
269 * the local CPU, but not other CPUs. Portable code should not
270 * rely on this behaviour.
271 * KVM relies on this behaviour on x86 for modifying memory that is also
272 * accessed from a hypervisor on the same CPU if running in a VM: don't change
273 * this without also updating arch/x86/kernel/kvm.c
267 */ 274 */
268static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 275static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
269{ 276{
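
The new kernel-doc note pins down a subtlety the PV-EOI code depends on: __test_and_clear_bit() is atomic with respect to the local CPU even without a LOCK prefix. A minimal sketch of that dependency, mirroring the kvm_guest_apic_eoi_write() hunk further down (pv_eoi_word stands in for the per-cpu kvm_apic_eoi flag declared there):

/*
 * Sketch of what the comment protects: one locally-atomic RMW clears the
 * PV-EOI flag that the host only reads from the same CPU, so no LOCK prefix
 * and no memory barrier are needed.
 */
static void pv_eoi_write_side(unsigned long *pv_eoi_word)
{
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, pv_eoi_word))
		return;				/* host has already seen the EOI */
	apic_write(APIC_EOI, APIC_EOI_ACK);	/* fall back to a real APIC write */
}
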
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 7a15153c675d..b518c7509933 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
49extern const struct hypervisor_x86 x86_hyper_vmware; 49extern const struct hypervisor_x86 x86_hyper_vmware;
50extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 50extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
51extern const struct hypervisor_x86 x86_hyper_xen_hvm; 51extern const struct hypervisor_x86 x86_hyper_xen_hvm;
52extern const struct hypervisor_x86 x86_hyper_kvm;
52 53
53static inline bool hypervisor_x2apic_available(void) 54static inline bool hypervisor_x2apic_available(void)
54{ 55{
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index e7d1c194d272..246617efd67f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -12,6 +12,7 @@
12/* Select x86 specific features in <linux/kvm.h> */ 12/* Select x86 specific features in <linux/kvm.h> */
13#define __KVM_HAVE_PIT 13#define __KVM_HAVE_PIT
14#define __KVM_HAVE_IOAPIC 14#define __KVM_HAVE_IOAPIC
15#define __KVM_HAVE_IRQ_LINE
15#define __KVM_HAVE_DEVICE_ASSIGNMENT 16#define __KVM_HAVE_DEVICE_ASSIGNMENT
16#define __KVM_HAVE_MSI 17#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 18#define __KVM_HAVE_USER_NMI
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1ac46c22dd50..c764f43b71c5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -192,8 +192,8 @@ struct x86_emulate_ops {
192 struct x86_instruction_info *info, 192 struct x86_instruction_info *info,
193 enum x86_intercept_stage stage); 193 enum x86_intercept_stage stage);
194 194
195 bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 195 void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
196 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 196 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
197}; 197};
198 198
199typedef u32 __attribute__((vector_size(16))) sse128_t; 199typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt {
280 u8 modrm_seg; 280 u8 modrm_seg;
281 bool rip_relative; 281 bool rip_relative;
282 unsigned long _eip; 282 unsigned long _eip;
283 struct operand memop;
283 /* Fields above regs are cleared together. */ 284 /* Fields above regs are cleared together. */
284 unsigned long regs[NR_VCPU_REGS]; 285 unsigned long regs[NR_VCPU_REGS];
285 struct operand memop;
286 struct operand *memopp; 286 struct operand *memopp;
287 struct fetch_cache fetch; 287 struct fetch_cache fetch;
288 struct read_cache io_read; 288 struct read_cache io_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2da88c0cda14..09155d64cf7e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
48 48
49#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 49#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
50#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 50#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
51#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
51#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 52#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
52 0xFFFFFF0000000000ULL) 53 0xFFFFFF0000000000ULL)
53#define CR4_RESERVED_BITS \ 54#define CR4_RESERVED_BITS \
54 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 55 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
55 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 56 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
56 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 57 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
57 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ 58 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
58 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 59 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
59 60
@@ -175,6 +176,13 @@ enum {
175 176
176/* apic attention bits */ 177/* apic attention bits */
177#define KVM_APIC_CHECK_VAPIC 0 178#define KVM_APIC_CHECK_VAPIC 0
179/*
180 * The following bit is set with PV-EOI, unset on EOI.
181 * We detect PV-EOI changes by guest by comparing
182 * this bit with PV-EOI in guest memory.
183 * See the implementation in apic_update_pv_eoi.
184 */
185#define KVM_APIC_PV_EOI_PENDING 1
178 186
179/* 187/*
180 * We don't want allocation failures within the mmu code, so we preallocate 188 * We don't want allocation failures within the mmu code, so we preallocate
@@ -484,6 +492,11 @@ struct kvm_vcpu_arch {
484 u64 length; 492 u64 length;
485 u64 status; 493 u64 status;
486 } osvw; 494 } osvw;
495
496 struct {
497 u64 msr_val;
498 struct gfn_to_hva_cache data;
499 } pv_eoi;
487}; 500};
488 501
489struct kvm_lpage_info { 502struct kvm_lpage_info {
@@ -661,6 +674,7 @@ struct kvm_x86_ops {
661 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 674 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
662 int (*get_lpage_level)(void); 675 int (*get_lpage_level)(void);
663 bool (*rdtscp_supported)(void); 676 bool (*rdtscp_supported)(void);
677 bool (*invpcid_supported)(void);
664 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); 678 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
665 679
666 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 680 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
802void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); 816void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
803bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 817bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
804 818
805int kvm_pic_set_irq(void *opaque, int irq, int level); 819static inline int __kvm_irq_line_state(unsigned long *irq_state,
820 int irq_source_id, int level)
821{
822 /* Logical OR for level trig interrupt */
823 if (level)
824 __set_bit(irq_source_id, irq_state);
825 else
826 __clear_bit(irq_source_id, irq_state);
827
828 return !!(*irq_state);
829}
830
831int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
832void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
806 833
807void kvm_inject_nmi(struct kvm_vcpu *vcpu); 834void kvm_inject_nmi(struct kvm_vcpu *vcpu);
808 835
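
__kvm_irq_line_state() keeps one bit per irq_source_id and returns the logical OR of all of them, so a level-triggered line only drops once every source has deasserted it. A minimal usage sketch, modelled on the kvm_pic_set_irq() change in the i8259.c hunk below (example_set_line itself is hypothetical; s->irq_states[] and pic_set_irq1() are the ones that hunk uses):

/* Hypothetical caller, locking omitted. */
static int example_set_line(struct kvm_pic *s, int irq,
			    int irq_source_id, int level)
{
	int line = __kvm_irq_line_state(&s->irq_states[irq],
					irq_source_id, level);

	/* 'line' stays 1 while any source still drives the pin high. */
	return pic_set_irq1(&s->pics[irq >> 3], irq & 7, line);
}
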
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 63ab1661d00e..2f7712e08b1e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5 24#define KVM_FEATURE_STEAL_TIME 5
25#define KVM_FEATURE_PV_EOI 6
25 26
26/* The last 8 bits are used to indicate how to interpret the flags field 27/* The last 8 bits are used to indicate how to interpret the flags field
27 * in pvclock structure. If no bits are set, all flags are ignored. 28 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
37#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 38#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
38#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 39#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
39#define MSR_KVM_STEAL_TIME 0x4b564d03 40#define MSR_KVM_STEAL_TIME 0x4b564d03
41#define MSR_KVM_PV_EOI_EN 0x4b564d04
40 42
41struct kvm_steal_time { 43struct kvm_steal_time {
42 __u64 steal; 44 __u64 steal;
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data {
89 __u32 enabled; 91 __u32 enabled;
90}; 92};
91 93
94#define KVM_PV_EOI_BIT 0
95#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
96#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
97#define KVM_PV_EOI_DISABLED 0x0
98
92#ifdef __KERNEL__ 99#ifdef __KERNEL__
93#include <asm/processor.h> 100#include <asm/processor.h>
94 101
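
MSR_KVM_PV_EOI_EN takes the physical address of a guest flag word with the enable bit (bit 0) set, following the layout of the other KVM paravirt MSRs. A minimal guest-side sketch of the handshake, mirroring the kvm_guest_cpu_init() hunk further down (kvm_apic_eoi is the per-cpu word declared there; KVM_MSR_ENABLED carries the same bit-0 meaning as KVM_PV_EOI_ENABLED):

static void __cpuinit pv_eoi_enable(void)
{
	unsigned long pa;

	if (!kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		return;
	__get_cpu_var(kvm_apic_eoi) = 0;
	/* Bit 0 switches the feature on; the remaining bits are the
	 * physical address of the per-cpu flag word. */
	pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
	wrmsrl(MSR_KVM_PV_EOI_EN, pa);
}
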
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f8ab3eaad128..aea1d1d848c7 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
44 */ 44 */
45#define X86_CR3_PWT 0x00000008 /* Page Write Through */ 45#define X86_CR3_PWT 0x00000008 /* Page Write Through */
46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ 46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
47#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
47 48
48/* 49/*
49 * Intel CPU features in CR4 50 * Intel CPU features in CR4
@@ -61,6 +62,7 @@
61#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 62#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
62#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 63#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
63#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ 64#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
65#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */
64#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 66#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
65#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 67#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
66 68
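
Once a guest may set CR4.PCIDE, bits 0-11 of CR3 carry a PCID instead of being reserved, which is what the new CR3_PCID_ENABLED_RESERVED_BITS mask in kvm_host.h above is for. A hedged sketch of the kind of CR3 validity check this implies for 64-bit guests (the real check lives in kvm_set_cr3() in x86.c, which is outside this section):

/* Illustrative only. */
static bool cr3_reserved_bits_set(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		/* bits 0-11 are now a PCID; only the high bits stay reserved */
		return cr3 & CR3_PCID_ENABLED_RESERVED_BITS;
	return cr3 & CR3_L_MODE_RESERVED_BITS;
}
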
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 31f180c21ce9..74fcb963595b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
63#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
63 64
64 65
65#define PIN_BASED_EXT_INTR_MASK 0x00000001 66#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
281#define EXIT_REASON_EPT_MISCONFIG 49 282#define EXIT_REASON_EPT_MISCONFIG 49
282#define EXIT_REASON_WBINVD 54 283#define EXIT_REASON_WBINVD 54
283#define EXIT_REASON_XSETBV 55 284#define EXIT_REASON_XSETBV 55
285#define EXIT_REASON_INVPCID 58
284 286
285/* 287/*
286 * Interruption-information format 288 * Interruption-information format
@@ -404,6 +406,7 @@ enum vmcs_field {
404#define VMX_EPTP_WB_BIT (1ull << 14) 406#define VMX_EPTP_WB_BIT (1ull << 14)
405#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 407#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
406#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 408#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
409#define VMX_EPT_AD_BIT (1ull << 21)
407#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 410#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
408#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 411#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
409#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 412#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -415,11 +418,14 @@ enum vmcs_field {
415#define VMX_EPT_MAX_GAW 0x4 418#define VMX_EPT_MAX_GAW 0x4
416#define VMX_EPT_MT_EPTE_SHIFT 3 419#define VMX_EPT_MT_EPTE_SHIFT 3
417#define VMX_EPT_GAW_EPTP_SHIFT 3 420#define VMX_EPT_GAW_EPTP_SHIFT 3
421#define VMX_EPT_AD_ENABLE_BIT (1ull << 6)
418#define VMX_EPT_DEFAULT_MT 0x6ull 422#define VMX_EPT_DEFAULT_MT 0x6ull
419#define VMX_EPT_READABLE_MASK 0x1ull 423#define VMX_EPT_READABLE_MASK 0x1ull
420#define VMX_EPT_WRITABLE_MASK 0x2ull 424#define VMX_EPT_WRITABLE_MASK 0x2ull
421#define VMX_EPT_EXECUTABLE_MASK 0x4ull 425#define VMX_EPT_EXECUTABLE_MASK 0x4ull
422#define VMX_EPT_IPAT_BIT (1ull << 6) 426#define VMX_EPT_IPAT_BIT (1ull << 6)
427#define VMX_EPT_ACCESS_BIT (1ull << 8)
428#define VMX_EPT_DIRTY_BIT (1ull << 9)
423 429
424#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 430#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
425 431
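
VMX_EPT_AD_BIT is the capability bit reported in the EPT/VPID capability MSR, while VMX_EPT_AD_ENABLE_BIT (bit 6 of the EPT pointer) actually switches accessed/dirty tracking on; once enabled, the CPU maintains VMX_EPT_ACCESS_BIT and VMX_EPT_DIRTY_BIT in each EPT entry. A hedged sketch of how the EPTP is expected to be built with the feature on (the vmx.c side, including the assumed enable_ept_ad_bits knob and construct_eptp() helper, is not part of this section):

/* Sketch only; enable_ept_ad_bits is assumed from arch/x86/kvm/vmx.c. */
static u64 sketch_construct_eptp(unsigned long root_hpa)
{
	u64 eptp = VMX_EPT_DEFAULT_MT |
		   VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;

	if (enable_ept_ad_bits)		/* set when the CPU reports VMX_EPT_AD_BIT */
		eptp |= VMX_EPT_AD_ENABLE_BIT;

	return eptp | (root_hpa & PAGE_MASK);
}
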
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c421512ca5eb..98e24131ff3a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2143,6 +2143,23 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
2143} 2143}
2144 2144
2145/* 2145/*
2146 * Override the generic EOI implementation with an optimized version.
2147 * Only called during early boot when only one CPU is active and with
2148 * interrupts disabled, so we know this does not race with actual APIC driver
2149 * use.
2150 */
2151void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
2152{
2153 struct apic **drv;
2154
2155 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
2156 /* Should happen once for each apic */
2157 WARN_ON((*drv)->eoi_write == eoi_write);
2158 (*drv)->eoi_write = eoi_write;
2159 }
2160}
2161
2162/*
2146 * Power management 2163 * Power management
2147 */ 2164 */
2148#ifdef CONFIG_PM 2165#ifdef CONFIG_PM
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb0743..a8f8fa9769d6 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
37#endif 37#endif
38 &x86_hyper_vmware, 38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv, 39 &x86_hyper_ms_hyperv,
40#ifdef CONFIG_KVM_GUEST
41 &x86_hyper_kvm,
42#endif
40}; 43};
41 44
42const struct hypervisor_x86 *x86_hyper; 45const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e554e5ad2fe8..c1d61ee4b4f1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,9 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41#include <asm/idle.h> 41#include <asm/idle.h>
42#include <asm/apic.h>
43#include <asm/apicdef.h>
44#include <asm/hypervisor.h>
42 45
43static int kvmapf = 1; 46static int kvmapf = 1;
44 47
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)
283 cpu, __pa(st)); 286 cpu, __pa(st));
284} 287}
285 288
289static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
290
291static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
292{
293 /**
294 * This relies on __test_and_clear_bit to modify the memory
295 * in a way that is atomic with respect to the local CPU.
296 * The hypervisor only accesses this memory from the local CPU so
297 * there's no need for lock or memory barriers.
298 * An optimization barrier is implied in apic write.
299 */
300 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
301 return;
302 apic_write(APIC_EOI, APIC_EOI_ACK);
303}
304
286void __cpuinit kvm_guest_cpu_init(void) 305void __cpuinit kvm_guest_cpu_init(void)
287{ 306{
288 if (!kvm_para_available()) 307 if (!kvm_para_available())
@@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void)
300 smp_processor_id()); 319 smp_processor_id());
301 } 320 }
302 321
322 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
323 unsigned long pa;
324 /* Size alignment is implied but just to make it explicit. */
325 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
326 __get_cpu_var(kvm_apic_eoi) = 0;
327 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
328 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
329 }
330
303 if (has_steal_clock) 331 if (has_steal_clock)
304 kvm_register_steal_time(); 332 kvm_register_steal_time();
305} 333}
306 334
307static void kvm_pv_disable_apf(void *unused) 335static void kvm_pv_disable_apf(void)
308{ 336{
309 if (!__get_cpu_var(apf_reason).enabled) 337 if (!__get_cpu_var(apf_reason).enabled)
310 return; 338 return;
@@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused)
316 smp_processor_id()); 344 smp_processor_id());
317} 345}
318 346
347static void kvm_pv_guest_cpu_reboot(void *unused)
348{
349 /*
350 * We disable PV EOI before we load a new kernel by kexec,
351 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
352 * New kernel can re-enable when it boots.
353 */
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf();
357}
358
319static int kvm_pv_reboot_notify(struct notifier_block *nb, 359static int kvm_pv_reboot_notify(struct notifier_block *nb,
320 unsigned long code, void *unused) 360 unsigned long code, void *unused)
321{ 361{
322 if (code == SYS_RESTART) 362 if (code == SYS_RESTART)
323 on_each_cpu(kvm_pv_disable_apf, NULL, 1); 363 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
324 return NOTIFY_DONE; 364 return NOTIFY_DONE;
325} 365}
326 366
@@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
371static void kvm_guest_cpu_offline(void *dummy) 411static void kvm_guest_cpu_offline(void *dummy)
372{ 412{
373 kvm_disable_steal_time(); 413 kvm_disable_steal_time();
374 kvm_pv_disable_apf(NULL); 414 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
415 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
416 kvm_pv_disable_apf();
375 apf_task_wake_all(); 417 apf_task_wake_all();
376} 418}
377 419
@@ -424,6 +466,9 @@ void __init kvm_guest_init(void)
424 pv_time_ops.steal_clock = kvm_steal_clock; 466 pv_time_ops.steal_clock = kvm_steal_clock;
425 } 467 }
426 468
469 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
470 apic_set_eoi_write(kvm_guest_apic_eoi_write);
471
427#ifdef CONFIG_SMP 472#ifdef CONFIG_SMP
428 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 473 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
429 register_cpu_notifier(&kvm_cpu_notifier); 474 register_cpu_notifier(&kvm_cpu_notifier);
@@ -432,6 +477,19 @@ void __init kvm_guest_init(void)
432#endif 477#endif
433} 478}
434 479
480static bool __init kvm_detect(void)
481{
482 if (!kvm_para_available())
483 return false;
484 return true;
485}
486
487const struct hypervisor_x86 x86_hyper_kvm __refconst = {
488 .name = "KVM",
489 .detect = kvm_detect,
490};
491EXPORT_SYMBOL_GPL(x86_hyper_kvm);
492
435static __init int activate_jump_labels(void) 493static __init int activate_jump_labels(void)
436{ 494{
437 if (has_steal_clock) { 495 if (has_steal_clock) {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7df1c6d839fb..0595f1397b7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
201 unsigned f_lm = 0; 201 unsigned f_lm = 0;
202#endif 202#endif
203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; 203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
204 unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
204 205
205 /* cpuid 1.edx */ 206 /* cpuid 1.edx */
206 const u32 kvm_supported_word0_x86_features = 207 const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
228 0 /* DS-CPL, VMX, SMX, EST */ | 229 0 /* DS-CPL, VMX, SMX, EST */ |
229 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 230 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
230 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | 231 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
231 0 /* Reserved, DCA */ | F(XMM4_1) | 232 F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
232 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 233 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
233 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 234 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
234 F(F16C) | F(RDRAND); 235 F(F16C) | F(RDRAND);
@@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
248 /* cpuid 7.0.ebx */ 249 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features = 250 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 251 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
251 F(BMI2) | F(ERMS) | F(RTM); 252 F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
252 253
253 /* all calls to cpuid_count() should be made on the same cpu */ 254 /* all calls to cpuid_count() should be made on the same cpu */
254 get_cpu(); 255 get_cpu();
@@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
409 (1 << KVM_FEATURE_NOP_IO_DELAY) | 410 (1 << KVM_FEATURE_NOP_IO_DELAY) |
410 (1 << KVM_FEATURE_CLOCKSOURCE2) | 411 (1 << KVM_FEATURE_CLOCKSOURCE2) |
411 (1 << KVM_FEATURE_ASYNC_PF) | 412 (1 << KVM_FEATURE_ASYNC_PF) |
413 (1 << KVM_FEATURE_PV_EOI) |
412 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 414 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
413 415
414 if (sched_info_on()) 416 if (sched_info_on())
@@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
639 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 641 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
640} 642}
641 643
642void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 644void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
643{ 645{
644 u32 function, index; 646 u32 function = *eax, index = *ecx;
645 struct kvm_cpuid_entry2 *best; 647 struct kvm_cpuid_entry2 *best;
646 648
647 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
648 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
649 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
650 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
651 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
652 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
653 best = kvm_find_cpuid_entry(vcpu, function, index); 649 best = kvm_find_cpuid_entry(vcpu, function, index);
654 650
655 if (!best) 651 if (!best)
656 best = check_cpuid_limit(vcpu, function, index); 652 best = check_cpuid_limit(vcpu, function, index);
657 653
658 if (best) { 654 if (best) {
659 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 655 *eax = best->eax;
660 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 656 *ebx = best->ebx;
661 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 657 *ecx = best->ecx;
662 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 658 *edx = best->edx;
663 } 659 } else
660 *eax = *ebx = *ecx = *edx = 0;
661}
662
663void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
664{
665 u32 function, eax, ebx, ecx, edx;
666
667 function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
668 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
669 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
670 kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
671 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
672 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
673 kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
664 kvm_x86_ops->skip_emulated_instruction(vcpu); 674 kvm_x86_ops->skip_emulated_instruction(vcpu);
665 trace_kvm_cpuid(function, 675 trace_kvm_cpuid(function, eax, ebx, ecx, edx);
666 kvm_register_read(vcpu, VCPU_REGS_RAX),
667 kvm_register_read(vcpu, VCPU_REGS_RBX),
668 kvm_register_read(vcpu, VCPU_REGS_RCX),
669 kvm_register_read(vcpu, VCPU_REGS_RDX));
670} 676}
671EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 677EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..a10e46016851 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
18 struct kvm_cpuid2 *cpuid, 18 struct kvm_cpuid2 *cpuid,
19 struct kvm_cpuid_entry2 __user *entries); 19 struct kvm_cpuid_entry2 __user *entries);
20void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
20 21
21 22
22static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 23static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
51 return best && (best->ecx & bit(X86_FEATURE_OSVW)); 52 return best && (best->ecx & bit(X86_FEATURE_OSVW));
52} 53}
53 54
55static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
56{
57 struct kvm_cpuid_entry2 *best;
58
59 best = kvm_find_cpuid_entry(vcpu, 1, 0);
60 return best && (best->ecx & bit(X86_FEATURE_PCID));
61}
62
54#endif 63#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f95d242ee9f7..97d9a9914ba8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
433 return ctxt->ops->intercept(ctxt, &info, stage); 433 return ctxt->ops->intercept(ctxt, &info, stage);
434} 434}
435 435
436static void assign_masked(ulong *dest, ulong src, ulong mask)
437{
438 *dest = (*dest & ~mask) | (src & mask);
439}
440
436static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 441static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
437{ 442{
438 return (1UL << (ctxt->ad_bytes << 3)) - 1; 443 return (1UL << (ctxt->ad_bytes << 3)) - 1;
439} 444}
440 445
446static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
447{
448 u16 sel;
449 struct desc_struct ss;
450
451 if (ctxt->mode == X86EMUL_MODE_PROT64)
452 return ~0UL;
453 ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
454 return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
455}
456
457static int stack_size(struct x86_emulate_ctxt *ctxt)
458{
459 return (__fls(stack_mask(ctxt)) + 1) >> 3;
460}
461
441/* Access/update address held in a register, based on addressing mode. */ 462/* Access/update address held in a register, based on addressing mode. */
442static inline unsigned long 463static inline unsigned long
443address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) 464address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
@@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
958 op->orig_val = op->val; 979 op->orig_val = op->val;
959} 980}
960 981
982static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
983{
984 if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
985 ctxt->modrm_seg = VCPU_SREG_SS;
986}
987
961static int decode_modrm(struct x86_emulate_ctxt *ctxt, 988static int decode_modrm(struct x86_emulate_ctxt *ctxt,
962 struct operand *op) 989 struct operand *op)
963{ 990{
@@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1061 1088
1062 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1089 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
1063 modrm_ea += insn_fetch(s32, ctxt); 1090 modrm_ea += insn_fetch(s32, ctxt);
1064 else 1091 else {
1065 modrm_ea += ctxt->regs[base_reg]; 1092 modrm_ea += ctxt->regs[base_reg];
1093 adjust_modrm_seg(ctxt, base_reg);
1094 }
1066 if (index_reg != 4) 1095 if (index_reg != 4)
1067 modrm_ea += ctxt->regs[index_reg] << scale; 1096 modrm_ea += ctxt->regs[index_reg] << scale;
1068 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1097 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
1069 if (ctxt->mode == X86EMUL_MODE_PROT64) 1098 if (ctxt->mode == X86EMUL_MODE_PROT64)
1070 ctxt->rip_relative = 1; 1099 ctxt->rip_relative = 1;
1071 } else 1100 } else {
1072 modrm_ea += ctxt->regs[ctxt->modrm_rm]; 1101 base_reg = ctxt->modrm_rm;
1102 modrm_ea += ctxt->regs[base_reg];
1103 adjust_modrm_seg(ctxt, base_reg);
1104 }
1073 switch (ctxt->modrm_mod) { 1105 switch (ctxt->modrm_mod) {
1074 case 0: 1106 case 0:
1075 if (ctxt->modrm_rm == 5) 1107 if (ctxt->modrm_rm == 5)
@@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1264 1296
1265/* allowed just for 8 bytes segments */ 1297/* allowed just for 8 bytes segments */
1266static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1298static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1267 u16 selector, struct desc_struct *desc) 1299 u16 selector, struct desc_struct *desc,
1300 ulong *desc_addr_p)
1268{ 1301{
1269 struct desc_ptr dt; 1302 struct desc_ptr dt;
1270 u16 index = selector >> 3; 1303 u16 index = selector >> 3;
@@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1275 if (dt.size < index * 8 + 7) 1308 if (dt.size < index * 8 + 7)
1276 return emulate_gp(ctxt, selector & 0xfffc); 1309 return emulate_gp(ctxt, selector & 0xfffc);
1277 1310
1278 addr = dt.address + index * 8; 1311 *desc_addr_p = addr = dt.address + index * 8;
1279 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, 1312 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1280 &ctxt->exception); 1313 &ctxt->exception);
1281} 1314}
@@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1302static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1335static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1303 u16 selector, int seg) 1336 u16 selector, int seg)
1304{ 1337{
1305 struct desc_struct seg_desc; 1338 struct desc_struct seg_desc, old_desc;
1306 u8 dpl, rpl, cpl; 1339 u8 dpl, rpl, cpl;
1307 unsigned err_vec = GP_VECTOR; 1340 unsigned err_vec = GP_VECTOR;
1308 u32 err_code = 0; 1341 u32 err_code = 0;
1309 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1342 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1343 ulong desc_addr;
1310 int ret; 1344 int ret;
1311 1345
1312 memset(&seg_desc, 0, sizeof seg_desc); 1346 memset(&seg_desc, 0, sizeof seg_desc);
@@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1324 goto load; 1358 goto load;
1325 } 1359 }
1326 1360
1327 /* NULL selector is not valid for TR, CS and SS */ 1361 rpl = selector & 3;
1328 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 1362 cpl = ctxt->ops->cpl(ctxt);
1363
1364 /* NULL selector is not valid for TR, CS and SS (except for long mode) */
1365 if ((seg == VCPU_SREG_CS
1366 || (seg == VCPU_SREG_SS
1367 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
1368 || seg == VCPU_SREG_TR)
1329 && null_selector) 1369 && null_selector)
1330 goto exception; 1370 goto exception;
1331 1371
@@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1336 if (null_selector) /* for NULL selector skip all following checks */ 1376 if (null_selector) /* for NULL selector skip all following checks */
1337 goto load; 1377 goto load;
1338 1378
1339 ret = read_segment_descriptor(ctxt, selector, &seg_desc); 1379 ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
1340 if (ret != X86EMUL_CONTINUE) 1380 if (ret != X86EMUL_CONTINUE)
1341 return ret; 1381 return ret;
1342 1382
@@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1352 goto exception; 1392 goto exception;
1353 } 1393 }
1354 1394
1355 rpl = selector & 3;
1356 dpl = seg_desc.dpl; 1395 dpl = seg_desc.dpl;
1357 cpl = ctxt->ops->cpl(ctxt);
1358 1396
1359 switch (seg) { 1397 switch (seg) {
1360 case VCPU_SREG_SS: 1398 case VCPU_SREG_SS:
@@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1384 case VCPU_SREG_TR: 1422 case VCPU_SREG_TR:
1385 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) 1423 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1386 goto exception; 1424 goto exception;
1425 old_desc = seg_desc;
1426 seg_desc.type |= 2; /* busy */
1427 ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
1428 sizeof(seg_desc), &ctxt->exception);
1429 if (ret != X86EMUL_CONTINUE)
1430 return ret;
1387 break; 1431 break;
1388 case VCPU_SREG_LDTR: 1432 case VCPU_SREG_LDTR:
1389 if (seg_desc.s || seg_desc.type != 2) 1433 if (seg_desc.s || seg_desc.type != 2)
@@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1474 return X86EMUL_CONTINUE; 1518 return X86EMUL_CONTINUE;
1475} 1519}
1476 1520
1477static int em_push(struct x86_emulate_ctxt *ctxt) 1521static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1478{ 1522{
1479 struct segmented_address addr; 1523 struct segmented_address addr;
1480 1524
1481 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); 1525 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
1482 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1526 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1483 addr.seg = VCPU_SREG_SS; 1527 addr.seg = VCPU_SREG_SS;
1484 1528
1529 return segmented_write(ctxt, addr, data, bytes);
1530}
1531
1532static int em_push(struct x86_emulate_ctxt *ctxt)
1533{
1485 /* Disable writeback. */ 1534 /* Disable writeback. */
1486 ctxt->dst.type = OP_NONE; 1535 ctxt->dst.type = OP_NONE;
1487 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); 1536 return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
1488} 1537}
1489 1538
1490static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1539static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
1556 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1605 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1557} 1606}
1558 1607
1608static int em_enter(struct x86_emulate_ctxt *ctxt)
1609{
1610 int rc;
1611 unsigned frame_size = ctxt->src.val;
1612 unsigned nesting_level = ctxt->src2.val & 31;
1613
1614 if (nesting_level)
1615 return X86EMUL_UNHANDLEABLE;
1616
1617 rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
1618 if (rc != X86EMUL_CONTINUE)
1619 return rc;
1620 assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
1621 stack_mask(ctxt));
1622 assign_masked(&ctxt->regs[VCPU_REGS_RSP],
1623 ctxt->regs[VCPU_REGS_RSP] - frame_size,
1624 stack_mask(ctxt));
1625 return X86EMUL_CONTINUE;
1626}
1627
1628static int em_leave(struct x86_emulate_ctxt *ctxt)
1629{
1630 assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
1631 stack_mask(ctxt));
1632 return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
1633}
1634
1559static int em_push_sreg(struct x86_emulate_ctxt *ctxt) 1635static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
1560{ 1636{
1561 int seg = ctxt->src2.val; 1637 int seg = ctxt->src2.val;
@@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
1993 u32 eax, ebx, ecx, edx; 2069 u32 eax, ebx, ecx, edx;
1994 2070
1995 eax = ecx = 0; 2071 eax = ecx = 0;
1996 return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) 2072 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
1997 && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2073 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
1998 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2074 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
1999 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; 2075 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
2000} 2076}
@@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2013 2089
2014 eax = 0x00000000; 2090 eax = 0x00000000;
2015 ecx = 0x00000000; 2091 ecx = 0x00000000;
2016 if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { 2092 ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
2017 /* 2093 /*
2018 * Intel ("GenuineIntel") 2094 * Intel ("GenuineIntel")
2019 * remark: Intel CPUs only support "syscall" in 64bit 2095 * remark: Intel CPUs only support "syscall" in 64bit
2020 * longmode. Also an 64bit guest with a 2096 * longmode. Also an 64bit guest with a
2021 * 32bit compat-app running will #UD !! While this 2097 * 32bit compat-app running will #UD !! While this
2022 * behaviour can be fixed (by emulating) into AMD 2098 * behaviour can be fixed (by emulating) into AMD
2023 * response - CPUs of AMD can't behave like Intel. 2099 * response - CPUs of AMD can't behave like Intel.
2024 */ 2100 */
2025 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && 2101 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
2026 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && 2102 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
2027 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) 2103 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
2028 return false; 2104 return false;
2029 2105
2030 /* AMD ("AuthenticAMD") */ 2106 /* AMD ("AuthenticAMD") */
2031 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && 2107 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
2032 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && 2108 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
2033 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 2109 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
2034 return true; 2110 return true;
2035 2111
2036 /* AMD ("AMDisbetter!") */ 2112 /* AMD ("AMDisbetter!") */
2037 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && 2113 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
2038 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && 2114 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
2039 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) 2115 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
2040 return true; 2116 return true;
2041 }
2042 2117
2043 /* default: (not Intel, not AMD), apply Intel's stricter rules... */ 2118 /* default: (not Intel, not AMD), apply Intel's stricter rules... */
2044 return false; 2119 return false;
@@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2547 ulong old_tss_base = 2622 ulong old_tss_base =
2548 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); 2623 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2549 u32 desc_limit; 2624 u32 desc_limit;
2625 ulong desc_addr;
2550 2626
2551 /* FIXME: old_tss_base == ~0 ? */ 2627 /* FIXME: old_tss_base == ~0 ? */
2552 2628
2553 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); 2629 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
2554 if (ret != X86EMUL_CONTINUE) 2630 if (ret != X86EMUL_CONTINUE)
2555 return ret; 2631 return ret;
2556 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); 2632 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
2557 if (ret != X86EMUL_CONTINUE) 2633 if (ret != X86EMUL_CONTINUE)
2558 return ret; 2634 return ret;
2559 2635
@@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2948 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 3024 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2949} 3025}
2950 3026
3027static int em_lldt(struct x86_emulate_ctxt *ctxt)
3028{
3029 u16 sel = ctxt->src.val;
3030
3031 /* Disable writeback. */
3032 ctxt->dst.type = OP_NONE;
3033 return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
3034}
3035
3036static int em_ltr(struct x86_emulate_ctxt *ctxt)
3037{
3038 u16 sel = ctxt->src.val;
3039
3040 /* Disable writeback. */
3041 ctxt->dst.type = OP_NONE;
3042 return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
3043}
3044
2951static int em_invlpg(struct x86_emulate_ctxt *ctxt) 3045static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2952{ 3046{
2953 int rc; 3047 int rc;
@@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2989 return X86EMUL_CONTINUE; 3083 return X86EMUL_CONTINUE;
2990} 3084}
2991 3085
3086static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
3087 void (*get)(struct x86_emulate_ctxt *ctxt,
3088 struct desc_ptr *ptr))
3089{
3090 struct desc_ptr desc_ptr;
3091
3092 if (ctxt->mode == X86EMUL_MODE_PROT64)
3093 ctxt->op_bytes = 8;
3094 get(ctxt, &desc_ptr);
3095 if (ctxt->op_bytes == 2) {
3096 ctxt->op_bytes = 4;
3097 desc_ptr.address &= 0x00ffffff;
3098 }
3099 /* Disable writeback. */
3100 ctxt->dst.type = OP_NONE;
3101 return segmented_write(ctxt, ctxt->dst.addr.mem,
3102 &desc_ptr, 2 + ctxt->op_bytes);
3103}
3104
3105static int em_sgdt(struct x86_emulate_ctxt *ctxt)
3106{
3107 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
3108}
3109
3110static int em_sidt(struct x86_emulate_ctxt *ctxt)
3111{
3112 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
3113}
3114
2992static int em_lgdt(struct x86_emulate_ctxt *ctxt) 3115static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2993{ 3116{
2994 struct desc_ptr desc_ptr; 3117 struct desc_ptr desc_ptr;
2995 int rc; 3118 int rc;
2996 3119
3120 if (ctxt->mode == X86EMUL_MODE_PROT64)
3121 ctxt->op_bytes = 8;
2997 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3122 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2998 &desc_ptr.size, &desc_ptr.address, 3123 &desc_ptr.size, &desc_ptr.address,
2999 ctxt->op_bytes); 3124 ctxt->op_bytes);
@@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
3021 struct desc_ptr desc_ptr; 3146 struct desc_ptr desc_ptr;
3022 int rc; 3147 int rc;
3023 3148
3149 if (ctxt->mode == X86EMUL_MODE_PROT64)
3150 ctxt->op_bytes = 8;
3024 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3151 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
3025 &desc_ptr.size, &desc_ptr.address, 3152 &desc_ptr.size, &desc_ptr.address,
3026 ctxt->op_bytes); 3153 ctxt->op_bytes);
@@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt)
3143 return X86EMUL_CONTINUE; 3270 return X86EMUL_CONTINUE;
3144} 3271}
3145 3272
3273static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3274{
3275 u32 eax, ebx, ecx, edx;
3276
3277 eax = ctxt->regs[VCPU_REGS_RAX];
3278 ecx = ctxt->regs[VCPU_REGS_RCX];
3279 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
3280 ctxt->regs[VCPU_REGS_RAX] = eax;
3281 ctxt->regs[VCPU_REGS_RBX] = ebx;
3282 ctxt->regs[VCPU_REGS_RCX] = ecx;
3283 ctxt->regs[VCPU_REGS_RDX] = edx;
3284 return X86EMUL_CONTINUE;
3285}
3286
3287static int em_lahf(struct x86_emulate_ctxt *ctxt)
3288{
3289 ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
3290 ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
3291 return X86EMUL_CONTINUE;
3292}
3293
3294static int em_bswap(struct x86_emulate_ctxt *ctxt)
3295{
3296 switch (ctxt->op_bytes) {
3297#ifdef CONFIG_X86_64
3298 case 8:
3299 asm("bswap %0" : "+r"(ctxt->dst.val));
3300 break;
3301#endif
3302 default:
3303 asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
3304 break;
3305 }
3306 return X86EMUL_CONTINUE;
3307}
3308
3146static bool valid_cr(int nr) 3309static bool valid_cr(int nr)
3147{ 3310{
3148 switch (nr) { 3311 switch (nr) {
@@ -3424,14 +3587,14 @@ static struct opcode group5[] = {
3424static struct opcode group6[] = { 3587static struct opcode group6[] = {
3425 DI(Prot, sldt), 3588 DI(Prot, sldt),
3426 DI(Prot, str), 3589 DI(Prot, str),
3427 DI(Prot | Priv, lldt), 3590 II(Prot | Priv | SrcMem16, em_lldt, lldt),
3428 DI(Prot | Priv, ltr), 3591 II(Prot | Priv | SrcMem16, em_ltr, ltr),
3429 N, N, N, N, 3592 N, N, N, N,
3430}; 3593};
3431 3594
3432static struct group_dual group7 = { { 3595static struct group_dual group7 = { {
3433 DI(Mov | DstMem | Priv, sgdt), 3596 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3434 DI(Mov | DstMem | Priv, sidt), 3597 II(Mov | DstMem | Priv, em_sidt, sidt),
3435 II(SrcMem | Priv, em_lgdt, lgdt), 3598 II(SrcMem | Priv, em_lgdt, lgdt),
3436 II(SrcMem | Priv, em_lidt, lidt), 3599 II(SrcMem | Priv, em_lidt, lidt),
3437 II(SrcNone | DstMem | Mov, em_smsw, smsw), N, 3600 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = {
3538 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3701 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3539 I(SrcImmFAddr | No64, em_call_far), N, 3702 I(SrcImmFAddr | No64, em_call_far), N,
3540 II(ImplicitOps | Stack, em_pushf, pushf), 3703 II(ImplicitOps | Stack, em_pushf, pushf),
3541 II(ImplicitOps | Stack, em_popf, popf), N, N, 3704 II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
3542 /* 0xA0 - 0xA7 */ 3705 /* 0xA0 - 0xA7 */
3543 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3706 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3544 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3707 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = {
3561 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 3724 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
3562 G(ByteOp, group11), G(0, group11), 3725 G(ByteOp, group11), G(0, group11),
3563 /* 0xC8 - 0xCF */ 3726 /* 0xC8 - 0xCF */
3564 N, N, N, I(ImplicitOps | Stack, em_ret_far), 3727 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
3728 N, I(ImplicitOps | Stack, em_ret_far),
3565 D(ImplicitOps), DI(SrcImmByte, intn), 3729 D(ImplicitOps), DI(SrcImmByte, intn),
3566 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3730 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3567 /* 0xD0 - 0xD7 */ 3731 /* 0xD0 - 0xD7 */
@@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = {
3635 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3799 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3636 /* 0xA0 - 0xA7 */ 3800 /* 0xA0 - 0xA7 */
3637 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3801 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3638 DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3802 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
3639 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3803 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3640 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3804 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3641 /* 0xA8 - 0xAF */ 3805 /* 0xA8 - 0xAF */
@@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = {
3658 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 3822 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3659 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 3823 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
3660 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3824 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3661 /* 0xC0 - 0xCF */ 3825 /* 0xC0 - 0xC7 */
3662 D2bv(DstMem | SrcReg | ModRM | Lock), 3826 D2bv(DstMem | SrcReg | ModRM | Lock),
3663 N, D(DstMem | SrcReg | ModRM | Mov), 3827 N, D(DstMem | SrcReg | ModRM | Mov),
3664 N, N, N, GD(0, &group9), 3828 N, N, N, GD(0, &group9),
3665 N, N, N, N, N, N, N, N, 3829 /* 0xC8 - 0xCF */
3830 X8(I(DstReg, em_bswap)),
3666 /* 0xD0 - 0xDF */ 3831 /* 0xD0 - 0xDF */
3667 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3832 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3668 /* 0xE0 - 0xEF */ 3833 /* 0xE0 - 0xEF */
@@ -4426,12 +4591,12 @@ twobyte_insn:
4426 break; 4591 break;
4427 case 0xb6 ... 0xb7: /* movzx */ 4592 case 0xb6 ... 0xb7: /* movzx */
4428 ctxt->dst.bytes = ctxt->op_bytes; 4593 ctxt->dst.bytes = ctxt->op_bytes;
4429 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4594 ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
4430 : (u16) ctxt->src.val; 4595 : (u16) ctxt->src.val;
4431 break; 4596 break;
4432 case 0xbe ... 0xbf: /* movsx */ 4597 case 0xbe ... 0xbf: /* movsx */
4433 ctxt->dst.bytes = ctxt->op_bytes; 4598 ctxt->dst.bytes = ctxt->op_bytes;
4434 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4599 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
4435 (s16) ctxt->src.val; 4600 (s16) ctxt->src.val;
4436 break; 4601 break;
4437 case 0xc0 ... 0xc1: /* xadd */ 4602 case 0xc0 ... 0xc1: /* xadd */
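The movzx/movsx hunks above key the extension width off the actual source operand size (ctxt->src.bytes) rather than the ByteOp decode flag. A minimal user-space sketch of that zero- versus sign-extension step, with illustrative names only (not the emulator's real interface):

/* Standalone sketch: width-dependent zero/sign extension as done for
 * movzx/movsx above.  Helper names are hypothetical. */
#include <stdint.h>
#include <stdio.h>

static uint64_t emulate_movzx(uint64_t src, unsigned src_bytes)
{
        /* movzx: zero-extend an 8- or 16-bit source */
        return (src_bytes == 1) ? (uint8_t)src : (uint16_t)src;
}

static int64_t emulate_movsx(uint64_t src, unsigned src_bytes)
{
        /* movsx: sign-extend an 8- or 16-bit source */
        return (src_bytes == 1) ? (int8_t)src : (int16_t)src;
}

int main(void)
{
        printf("movzx 0x80, 1 byte    -> %llu\n",
               (unsigned long long)emulate_movzx(0x80, 1));   /* 128    */
        printf("movsx 0x80, 1 byte    -> %lld\n",
               (long long)emulate_movsx(0x80, 1));            /* -128   */
        printf("movsx 0x8000, 2 bytes -> %lld\n",
               (long long)emulate_movsx(0x8000, 2));          /* -32768 */
        return 0;
}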
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa4a2be..1df8fb9e1d5d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)
188 pic_unlock(s); 188 pic_unlock(s);
189} 189}
190 190
191int kvm_pic_set_irq(void *opaque, int irq, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 struct kvm_pic *s = opaque;
194 int ret = -1; 193 int ret = -1;
195 194
196 pic_lock(s); 195 pic_lock(s);
197 if (irq >= 0 && irq < PIC_NUM_PINS) { 196 if (irq >= 0 && irq < PIC_NUM_PINS) {
198 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
198 irq_source_id, level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 pic_update_irq(s); 200 pic_update_irq(s);
200 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 s->pics[irq >> 3].imr, ret == 0); 202 s->pics[irq >> 3].imr, ret == 0);
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
205 return ret; 206 return ret;
206} 207}
207 208
209void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
210{
211 int i;
212
213 pic_lock(s);
214 for (i = 0; i < PIC_NUM_PINS; i++)
215 __clear_bit(irq_source_id, &s->irq_states[i]);
216 pic_unlock(s);
217}
218
208/* 219/*
209 * acknowledge interrupt 'irq' 220 * acknowledge interrupt 'irq'
210 */ 221 */
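kvm_pic_set_irq now takes an irq_source_id and derives the effective pin level from the per-source state tracked in s->irq_states (via __kvm_irq_line_state), so one source dropping its line no longer masks another source that still drives it. A rough standalone sketch of that folding, not the kernel implementation:

/* Sketch: per-source IRQ line state folded into one effective level,
 * loosely modelled on __kvm_irq_line_state().  Hypothetical code. */
#include <stdio.h>

static unsigned long irq_states[16];    /* one bitmask of sources per pin */

static int set_irq_line(int pin, int source_id, int level)
{
        if (level)
                irq_states[pin] |= 1UL << source_id;
        else
                irq_states[pin] &= ~(1UL << source_id);
        /* the pin stays asserted while any source still drives it high */
        return irq_states[pin] != 0;
}

int main(void)
{
        printf("%d\n", set_irq_line(3, 0, 1));  /* 1: source 0 asserts  */
        printf("%d\n", set_irq_line(3, 1, 1));  /* 1: source 1 asserts  */
        printf("%d\n", set_irq_line(3, 0, 0));  /* 1: source 1 still up */
        printf("%d\n", set_irq_line(3, 1, 0));  /* 0: line released     */
        return 0;
}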
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93c15743f1ee..ce878788a39f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
108} 108}
109 109
110static inline int __apic_test_and_set_vector(int vec, void *bitmap)
111{
112 return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
113}
114
115static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
116{
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118}
119
110static inline int apic_hw_enabled(struct kvm_lapic *apic) 120static inline int apic_hw_enabled(struct kvm_lapic *apic)
111{ 121{
112 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; 122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
@@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap)
210 return fls(word[word_offset << 2]) - 1 + (word_offset << 5); 220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
211} 221}
212 222
223static u8 count_vectors(void *bitmap)
224{
225 u32 *word = bitmap;
226 int word_offset;
227 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
229 count += hweight32(word[word_offset << 2]);
230 return count;
231}
232
213static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 233static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
214{ 234{
215 apic->irr_pending = true; 235 apic->irr_pending = true;
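count_vectors() above walks the 256-bit register as eight 32-bit words, striding by four u32s per step because the APIC register page places each word on its own 16-byte boundary. A standalone illustration of the same walk, assuming that layout and using __builtin_popcount in place of hweight32:

/* Sketch of count_vectors(): count set bits in a 256-bit APIC register
 * spread over eight words at a 16-byte stride.  Illustration only. */
#include <stdint.h>
#include <stdio.h>

#define MAX_APIC_VECTOR 256

static unsigned count_vectors(const uint32_t *reg)
{
        unsigned count = 0;
        int word_offset;

        for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
                count += __builtin_popcount(reg[word_offset << 2]);
        return count;
}

int main(void)
{
        uint32_t isr[32] = { 0 };       /* 8 words, 16-byte stride */

        isr[0]      = 1u << 3;          /* vector 3   */
        isr[7 << 2] = 1u << 31;         /* vector 255 */
        printf("%u vectors in service\n", count_vectors(isr));  /* 2 */
        return 0;
}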
@@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
242 apic->irr_pending = true; 262 apic->irr_pending = true;
243} 263}
244 264
265static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
266{
267 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
268 ++apic->isr_count;
269 BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
270 /*
271 * ISR (in service register) bit is set when injecting an interrupt.
272 * The highest vector is injected. Thus the latest bit set matches
273 * the highest bit in ISR.
274 */
275 apic->highest_isr_cache = vec;
276}
277
278static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
279{
280 if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
281 --apic->isr_count;
282 BUG_ON(apic->isr_count < 0);
283 apic->highest_isr_cache = -1;
284}
285
245int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
246{ 287{
247 struct kvm_lapic *apic = vcpu->arch.apic; 288 struct kvm_lapic *apic = vcpu->arch.apic;
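apic_set_isr()/apic_clear_isr() maintain two pieces of bookkeeping consumed later by apic_find_highest_isr(): a population count, so an empty ISR is detected without scanning, and a highest-vector cache, valid because interrupts are injected highest-vector first. A simplified standalone model of that idea, with hypothetical types and names:

/* Sketch: ISR bookkeeping with a count and a highest-vector cache so the
 * common lookup avoids scanning all 256 bits.  Not the kernel structures. */
#include <stdio.h>
#include <string.h>

struct isr_state {
        unsigned char bits[32];         /* 256 vectors                     */
        int count;                      /* number of bits set              */
        int highest_cache;              /* -1 means "unknown, must rescan" */
};

static void isr_set(struct isr_state *s, int vec)
{
        if (!(s->bits[vec / 8] & (1 << (vec % 8)))) {
                s->bits[vec / 8] |= 1 << (vec % 8);
                s->count++;
        }
        /* the highest pending vector is injected first, so the latest
         * bit set is also the highest one in service */
        s->highest_cache = vec;
}

static int isr_find_highest(const struct isr_state *s)
{
        int vec;

        if (!s->count)
                return -1;
        if (s->highest_cache != -1)
                return s->highest_cache;
        for (vec = 255; vec >= 0; vec--)            /* slow path */
                if (s->bits[vec / 8] & (1 << (vec % 8)))
                        return vec;
        return -1;
}

int main(void)
{
        struct isr_state s;

        memset(&s, 0, sizeof(s));
        s.highest_cache = -1;
        isr_set(&s, 0x31);
        printf("highest ISR vector: %d\n", isr_find_highest(&s));  /* 49 */
        return 0;
}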
@@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
270 irq->level, irq->trig_mode); 311 irq->level, irq->trig_mode);
271} 312}
272 313
314static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
315{
316
317 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
318 sizeof(val));
319}
320
321static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
322{
323
324 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
325 sizeof(*val));
326}
327
328static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
329{
330 return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
331}
332
333static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
334{
335 u8 val;
336 if (pv_eoi_get_user(vcpu, &val) < 0)
337 apic_debug("Can't read EOI MSR value: 0x%llx\n",
338 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
339 return val & 0x1;
340}
341
342static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
343{
344 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
345 apic_debug("Can't set EOI MSR value: 0x%llx\n",
346 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
347 return;
348 }
349 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
350}
351
352static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
353{
354 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
355 apic_debug("Can't clear EOI MSR value: 0x%llx\n",
356 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
357 return;
358 }
359 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
360}
361
273static inline int apic_find_highest_isr(struct kvm_lapic *apic) 362static inline int apic_find_highest_isr(struct kvm_lapic *apic)
274{ 363{
275 int result; 364 int result;
365 if (!apic->isr_count)
366 return -1;
367 if (likely(apic->highest_isr_cache != -1))
368 return apic->highest_isr_cache;
276 369
277 result = find_highest_vector(apic->regs + APIC_ISR); 370 result = find_highest_vector(apic->regs + APIC_ISR);
278 ASSERT(result == -1 || result >= 16); 371 ASSERT(result == -1 || result >= 16);
@@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
482 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 575 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
483} 576}
484 577
485static void apic_set_eoi(struct kvm_lapic *apic) 578static int apic_set_eoi(struct kvm_lapic *apic)
486{ 579{
487 int vector = apic_find_highest_isr(apic); 580 int vector = apic_find_highest_isr(apic);
581
582 trace_kvm_eoi(apic, vector);
583
488 /* 584 /*
 489 * Not every EOI write will have a corresponding ISR bit set; 585 * Not every EOI write will have a corresponding ISR bit set;
 490 * one example is when the kernel checks the timer on setup_IO_APIC 586 * one example is when the kernel checks the timer on setup_IO_APIC
491 */ 587 */
492 if (vector == -1) 588 if (vector == -1)
493 return; 589 return vector;
494 590
495 apic_clear_vector(vector, apic->regs + APIC_ISR); 591 apic_clear_isr(vector, apic);
496 apic_update_ppr(apic); 592 apic_update_ppr(apic);
497 593
498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
@@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
505 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 601 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
506 } 602 }
507 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 603 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
604 return vector;
508} 605}
509 606
510static void apic_send_ipi(struct kvm_lapic *apic) 607static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1081 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1178 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1082 } 1179 }
1083 apic->irr_pending = false; 1180 apic->irr_pending = false;
1181 apic->isr_count = 0;
1182 apic->highest_isr_cache = -1;
1084 update_divide_count(apic); 1183 update_divide_count(apic);
1085 atomic_set(&apic->lapic_timer.pending, 0); 1184 atomic_set(&apic->lapic_timer.pending, 0);
1086 if (kvm_vcpu_is_bsp(vcpu)) 1185 if (kvm_vcpu_is_bsp(vcpu))
1087 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
1187 vcpu->arch.pv_eoi.msr_val = 0;
1088 apic_update_ppr(apic); 1188 apic_update_ppr(apic);
1089 1189
1090 vcpu->arch.apic_arb_prio = 0; 1190 vcpu->arch.apic_arb_prio = 0;
@@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1248 if (vector == -1) 1348 if (vector == -1)
1249 return -1; 1349 return -1;
1250 1350
1251 apic_set_vector(vector, apic->regs + APIC_ISR); 1351 apic_set_isr(vector, apic);
1252 apic_update_ppr(apic); 1352 apic_update_ppr(apic);
1253 apic_clear_irr(vector, apic); 1353 apic_clear_irr(vector, apic);
1254 return vector; 1354 return vector;
@@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1267 update_divide_count(apic); 1367 update_divide_count(apic);
1268 start_apic_timer(apic); 1368 start_apic_timer(apic);
1269 apic->irr_pending = true; 1369 apic->irr_pending = true;
1370 apic->isr_count = count_vectors(apic->regs + APIC_ISR);
1371 apic->highest_isr_cache = -1;
1270 kvm_make_request(KVM_REQ_EVENT, vcpu); 1372 kvm_make_request(KVM_REQ_EVENT, vcpu);
1271} 1373}
1272 1374
@@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1283 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1284} 1386}
1285 1387
1388/*
1389 * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
1390 *
1392 * Detect whether the guest triggered PV EOI since the
1393 * last entry. If yes, set EOI on the guest's behalf.
1393 * Clear PV EOI in guest memory in any case.
1394 */
1395static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
1396 struct kvm_lapic *apic)
1397{
1398 bool pending;
1399 int vector;
1400 /*
1401 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
1402 * and KVM_PV_EOI_ENABLED in guest memory as follows:
1403 *
1404 * KVM_APIC_PV_EOI_PENDING is unset:
1405 * -> host disabled PV EOI.
1406 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
1407 * -> host enabled PV EOI, guest did not execute EOI yet.
1408 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
1409 * -> host enabled PV EOI, guest executed EOI.
1410 */
1411 BUG_ON(!pv_eoi_enabled(vcpu));
1412 pending = pv_eoi_get_pending(vcpu);
1413 /*
1414 * Clear pending bit in any case: it will be set again on vmentry.
1415 * While this might not be ideal from a performance point of view,
1416 * this makes sure pv eoi is only enabled when we know it's safe.
1417 */
1418 pv_eoi_clr_pending(vcpu);
1419 if (pending)
1420 return;
1421 vector = apic_set_eoi(apic);
1422 trace_kvm_pv_eoi(apic, vector);
1423}
1424
1286void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1425void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1287{ 1426{
1288 u32 data; 1427 u32 data;
1289 void *vapic; 1428 void *vapic;
1290 1429
1430 if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
1431 apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
1432
1291 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1433 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1292 return; 1434 return;
1293 1435
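The vmexit-side handler above implements the protocol spelled out in its comment: the host arms a byte in guest memory before entry; a guest that EOIs through the PV path clears that byte and skips the EOI register write, so on exit a cleared byte means the host must complete the EOI itself, while a still-set byte means the guest has not EOI'd yet. A tiny standalone model of that decision (hypothetical types, not the kernel code):

/* Sketch of the vmexit-side PV EOI decision.  Standalone model only. */
#include <stdbool.h>
#include <stdio.h>

struct pv_eoi_model {
        bool host_pending;      /* KVM_APIC_PV_EOI_PENDING on the host side    */
        bool guest_byte;        /* the KVM_PV_EOI_ENABLED byte in guest memory */
};

/* returns true when the host has to perform the EOI on the guest's behalf */
static bool sync_from_guest(struct pv_eoi_model *m)
{
        bool guest_did_pv_eoi = m->host_pending && !m->guest_byte;

        m->host_pending = false;        /* re-armed on the next vmentry */
        m->guest_byte = false;
        return guest_did_pv_eoi;
}

int main(void)
{
        struct pv_eoi_model m = { .host_pending = true, .guest_byte = false };

        printf("complete EOI for guest: %s\n",
               sync_from_guest(&m) ? "yes" : "no");     /* yes */
        return 0;
}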
@@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1298 apic_set_tpr(vcpu->arch.apic, data & 0xff); 1440 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1299} 1441}
1300 1442
1443/*
1444 * apic_sync_pv_eoi_to_guest - called before vmentry
1445 *
1446 * Detect whether it's safe to enable PV EOI and
1447 * if yes do so.
1448 */
1449static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
1450 struct kvm_lapic *apic)
1451{
1452 if (!pv_eoi_enabled(vcpu) ||
1453 /* IRR set or many bits in ISR: could be nested. */
1454 apic->irr_pending ||
1455 /* Cache not set: could be safe but we don't bother. */
1456 apic->highest_isr_cache == -1 ||
1457 /* Need EOI to update ioapic. */
1458 kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
1459 /*
1460 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
1461 * so we need not do anything here.
1462 */
1463 return;
1464 }
1465
1466 pv_eoi_set_pending(apic->vcpu);
1467}
1468
1301void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) 1469void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1302{ 1470{
1303 u32 data, tpr; 1471 u32 data, tpr;
1304 int max_irr, max_isr; 1472 int max_irr, max_isr;
1305 struct kvm_lapic *apic; 1473 struct kvm_lapic *apic = vcpu->arch.apic;
1306 void *vapic; 1474 void *vapic;
1307 1475
1476 apic_sync_pv_eoi_to_guest(vcpu, apic);
1477
1308 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1309 return; 1479 return;
1310 1480
1311 apic = vcpu->arch.apic;
1312 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1313 max_irr = apic_find_highest_irr(apic); 1482 max_irr = apic_find_highest_irr(apic);
1314 if (max_irr < 0) 1483 if (max_irr < 0)
@@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1394 1563
1395 return 0; 1564 return 0;
1396} 1565}
1566
1567int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1568{
1569 u64 addr = data & ~KVM_MSR_ENABLED;
1570 if (!IS_ALIGNED(addr, 4))
1571 return 1;
1572
1573 vcpu->arch.pv_eoi.msr_val = data;
1574 if (!pv_eoi_enabled(vcpu))
1575 return 0;
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr);
1578}
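kvm_lapic_enable_pv_eoi() treats the MSR value as a guest address with the enable flag folded into bit 0, and rejects addresses that are not 4-byte aligned. A small standalone sketch of that decoding (constant and function names are made up):

/* Sketch of the PV EOI MSR layout: bit 0 enables the feature, the rest is
 * the guest address of the EOI byte, which must be 4-byte aligned. */
#include <stdint.h>
#include <stdio.h>

#define PV_EOI_ENABLED_BIT 0x1ULL

static int parse_pv_eoi_msr(uint64_t data, uint64_t *addr, int *enabled)
{
        *addr = data & ~PV_EOI_ENABLED_BIT;
        *enabled = !!(data & PV_EOI_ENABLED_BIT);
        if (*addr & 0x3)                /* IS_ALIGNED(addr, 4) in the kernel */
                return -1;              /* reject the write                  */
        return 0;
}

int main(void)
{
        uint64_t addr;
        int enabled;

        if (parse_pv_eoi_msr(0x1000 | PV_EOI_ENABLED_BIT, &addr, &enabled) == 0)
                printf("addr=%#llx enabled=%d\n",
                       (unsigned long long)addr, enabled);
        return 0;
}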
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce2575d09..4af5405ae1e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,6 +13,15 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 /* Number of bits set in ISR. */
17 s16 isr_count;
18 /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
19 int highest_isr_cache;
20 /**
21 * APIC register page. The layout matches the register layout seen by
22 * the guest 1:1, because it is accessed by the vmx microcode.
23 * Note: Only one register, the TPR, is used by the microcode.
24 */
16 void *regs; 25 void *regs;
17 gpa_t vapic_addr; 26 gpa_t vapic_addr;
18 struct page *vapic_page; 27 struct page *vapic_page;
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
60{ 69{
61 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; 70 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
62} 71}
72
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
63#endif 74#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57e168e27b5b..01ca00423938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
90 90
91#define PTE_PREFETCH_NUM 8 91#define PTE_PREFETCH_NUM 8
92 92
93#define PT_FIRST_AVAIL_BITS_SHIFT 9 93#define PT_FIRST_AVAIL_BITS_SHIFT 10
94#define PT64_SECOND_AVAIL_BITS_SHIFT 52 94#define PT64_SECOND_AVAIL_BITS_SHIFT 52
95 95
96#define PT64_LEVEL_BITS 9 96#define PT64_LEVEL_BITS 9
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
145#define CREATE_TRACE_POINTS 145#define CREATE_TRACE_POINTS
146#include "mmutrace.h" 146#include "mmutrace.h"
147 147
148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
149#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
149 150
150#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 151#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
151 152
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
188static u64 __read_mostly shadow_mmio_mask; 189static u64 __read_mostly shadow_mmio_mask;
189 190
190static void mmu_spte_set(u64 *sptep, u64 spte); 191static void mmu_spte_set(u64 *sptep, u64 spte);
192static void mmu_free_roots(struct kvm_vcpu *vcpu);
191 193
192void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 194void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
193{ 195{
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
444} 446}
445#endif 447#endif
446 448
449static bool spte_is_locklessly_modifiable(u64 spte)
450{
451 return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
452}
453
447static bool spte_has_volatile_bits(u64 spte) 454static bool spte_has_volatile_bits(u64 spte)
448{ 455{
456 /*
 457 * Always update the spte atomically if it can be updated
 458 * out of mmu-lock: this ensures the dirty bit is not lost and
 459 * also gives us a stable is_writable_pte(), so that a needed
 460 * tlb flush is not missed.
461 */
462 if (spte_is_locklessly_modifiable(spte))
463 return true;
464
449 if (!shadow_accessed_mask) 465 if (!shadow_accessed_mask)
450 return false; 466 return false;
451 467
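The two software-available SPTE bits defined earlier feed spte_is_locklessly_modifiable(): an spte may have its W bit restored without mmu_lock only while both the host-writable and mmu-writable marks are still present. A standalone bit-twiddling sketch of that test (values mirror the defines above):

/* Sketch of the lockless-modifiable test on the two software spte bits. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE  (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

static bool spte_is_locklessly_modifiable(uint64_t spte)
{
        /* true only when *both* bits are set in the spte */
        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

int main(void)
{
        printf("%d\n", spte_is_locklessly_modifiable(
                       SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));          /* 1 */
        printf("%d\n", spte_is_locklessly_modifiable(SPTE_HOST_WRITEABLE)); /* 0 */
        return 0;
}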
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
478 494
479/* Rules for using mmu_spte_update: 495/* Rules for using mmu_spte_update:
 480 * Update the state bits; it means the mapped pfn is not changed. 496 * Update the state bits; it means the mapped pfn is not changed.
497 *
498 * Whenever we overwrite a writable spte with a read-only one we
499 * should flush remote TLBs. Otherwise rmap_write_protect
500 * will find a read-only spte, even though the writable spte
 501 * might be cached in a CPU's TLB; the return value indicates this
 502 * case.
481 */ 503 */
482static void mmu_spte_update(u64 *sptep, u64 new_spte) 504static bool mmu_spte_update(u64 *sptep, u64 new_spte)
483{ 505{
484 u64 mask, old_spte = *sptep; 506 u64 old_spte = *sptep;
507 bool ret = false;
485 508
486 WARN_ON(!is_rmap_spte(new_spte)); 509 WARN_ON(!is_rmap_spte(new_spte));
487 510
488 if (!is_shadow_present_pte(old_spte)) 511 if (!is_shadow_present_pte(old_spte)) {
489 return mmu_spte_set(sptep, new_spte); 512 mmu_spte_set(sptep, new_spte);
490 513 return ret;
491 new_spte |= old_spte & shadow_dirty_mask; 514 }
492
493 mask = shadow_accessed_mask;
494 if (is_writable_pte(old_spte))
495 mask |= shadow_dirty_mask;
496 515
497 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 516 if (!spte_has_volatile_bits(old_spte))
498 __update_clear_spte_fast(sptep, new_spte); 517 __update_clear_spte_fast(sptep, new_spte);
499 else 518 else
500 old_spte = __update_clear_spte_slow(sptep, new_spte); 519 old_spte = __update_clear_spte_slow(sptep, new_spte);
501 520
521 /*
 522 * An spte updated out of mmu-lock is safe, since
 523 * we always update it atomically; see the comments in
 524 * spte_has_volatile_bits().
525 */
526 if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
527 ret = true;
528
502 if (!shadow_accessed_mask) 529 if (!shadow_accessed_mask)
503 return; 530 return ret;
504 531
505 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 532 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
506 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 533 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
507 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 534 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
508 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 535 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
536
537 return ret;
509} 538}
510 539
511/* 540/*
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
652 mmu_page_header_cache); 681 mmu_page_header_cache);
653} 682}
654 683
655static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 684static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
656 size_t size)
657{ 685{
658 void *p; 686 void *p;
659 687
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
664 692
665static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 693static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
666{ 694{
667 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, 695 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
668 sizeof(struct pte_list_desc));
669} 696}
670 697
671static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) 698static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
1051 rmap_remove(kvm, sptep); 1078 rmap_remove(kvm, sptep);
1052} 1079}
1053 1080
1054static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) 1081
1082static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1083{
1084 if (is_large_pte(*sptep)) {
1085 WARN_ON(page_header(__pa(sptep))->role.level ==
1086 PT_PAGE_TABLE_LEVEL);
1087 drop_spte(kvm, sptep);
1088 --kvm->stat.lpages;
1089 return true;
1090 }
1091
1092 return false;
1093}
1094
1095static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1096{
1097 if (__drop_large_spte(vcpu->kvm, sptep))
1098 kvm_flush_remote_tlbs(vcpu->kvm);
1099}
1100
1101/*
1102 * Write-protect the specified @sptep; @pt_protect indicates whether the
1103 * spte write-protection is caused by protecting a shadow page table.
1104 * @flush indicates whether the tlb needs to be flushed.
1105 *
1106 * Note: write protection is different between dirty logging and spte
1107 * protection:
1108 * - for dirty logging, the spte can be set to writable at any time if
1109 * its dirty bitmap is properly set.
1110 * - for spte protection, the spte can be made writable only after
1111 * unsync-ing the shadow page.
1112 *
1113 * Return true if the spte is dropped.
1114 */
1115static bool
1116spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1117{
1118 u64 spte = *sptep;
1119
1120 if (!is_writable_pte(spte) &&
1121 !(pt_protect && spte_is_locklessly_modifiable(spte)))
1122 return false;
1123
1124 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1125
1126 if (__drop_large_spte(kvm, sptep)) {
1127 *flush |= true;
1128 return true;
1129 }
1130
1131 if (pt_protect)
1132 spte &= ~SPTE_MMU_WRITEABLE;
1133 spte = spte & ~PT_WRITABLE_MASK;
1134
1135 *flush |= mmu_spte_update(sptep, spte);
1136 return false;
1137}
1138
1139static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1140 int level, bool pt_protect)
1055{ 1141{
1056 u64 *sptep; 1142 u64 *sptep;
1057 struct rmap_iterator iter; 1143 struct rmap_iterator iter;
1058 int write_protected = 0; 1144 bool flush = false;
1059 1145
1060 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1146 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1061 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1147 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1062 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1148 if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
1063
1064 if (!is_writable_pte(*sptep)) {
1065 sptep = rmap_get_next(&iter);
1066 continue;
1067 }
1068
1069 if (level == PT_PAGE_TABLE_LEVEL) {
1070 mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
1071 sptep = rmap_get_next(&iter);
1072 } else {
1073 BUG_ON(!is_large_pte(*sptep));
1074 drop_spte(kvm, sptep);
1075 --kvm->stat.lpages;
1076 sptep = rmap_get_first(*rmapp, &iter); 1149 sptep = rmap_get_first(*rmapp, &iter);
1150 continue;
1077 } 1151 }
1078 1152
1079 write_protected = 1; 1153 sptep = rmap_get_next(&iter);
1080 } 1154 }
1081 1155
1082 return write_protected; 1156 return flush;
1083} 1157}
1084 1158
1085/** 1159/**
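spte_write_protect() above separates the two protection flavours described in its comment: dirty logging clears only the hardware W bit, leaving SPTE_MMU_WRITEABLE in place so the fast page fault path can restore writability locklessly, whereas page-table protection (@pt_protect) also strips SPTE_MMU_WRITEABLE and forces the slow path. A standalone sketch of that distinction, with bit positions assumed as above:

/* Sketch: dirty-log write protection stays fast-fixable, pt protection
 * does not.  Illustration only, not the kernel function. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK    (1ULL << 1)
#define SPTE_HOST_WRITEABLE (1ULL << 10)
#define SPTE_MMU_WRITEABLE  (1ULL << 11)

static uint64_t write_protect(uint64_t spte, bool pt_protect)
{
        if (pt_protect)
                spte &= ~SPTE_MMU_WRITEABLE;
        return spte & ~PT_WRITABLE_MASK;
}

static bool can_fast_fix(uint64_t spte)
{
        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

int main(void)
{
        uint64_t spte = PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;

        printf("dirty-log protect, fast-fixable: %d\n",
               can_fast_fix(write_protect(spte, false)));       /* 1 */
        printf("pt protect, fast-fixable:        %d\n",
               can_fast_fix(write_protect(spte, true)));        /* 0 */
        return 0;
}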
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1100 1174
1101 while (mask) { 1175 while (mask) {
1102 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
1103 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); 1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1104 1178
1105 /* clear the first set bit */ 1179 /* clear the first set bit */
1106 mask &= mask - 1; 1180 mask &= mask - 1;
1107 } 1181 }
1108} 1182}
1109 1183
1110static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1184static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1111{ 1185{
1112 struct kvm_memory_slot *slot; 1186 struct kvm_memory_slot *slot;
1113 unsigned long *rmapp; 1187 unsigned long *rmapp;
1114 int i; 1188 int i;
1115 int write_protected = 0; 1189 bool write_protected = false;
1116 1190
1117 slot = gfn_to_memslot(kvm, gfn); 1191 slot = gfn_to_memslot(kvm, gfn);
1118 1192
1119 for (i = PT_PAGE_TABLE_LEVEL; 1193 for (i = PT_PAGE_TABLE_LEVEL;
1120 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1194 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1121 rmapp = __gfn_to_rmap(gfn, i, slot); 1195 rmapp = __gfn_to_rmap(gfn, i, slot);
1122 write_protected |= __rmap_write_protect(kvm, rmapp, i); 1196 write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
1123 } 1197 }
1124 1198
1125 return write_protected; 1199 return write_protected;
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1238 unsigned long data) 1312 unsigned long data)
1239{ 1313{
1240 u64 *sptep; 1314 u64 *sptep;
1241 struct rmap_iterator iter; 1315 struct rmap_iterator uninitialized_var(iter);
1242 int young = 0; 1316 int young = 0;
1243 1317
1244 /* 1318 /*
1245 * Emulate the accessed bit for EPT, by checking if this page has 1319 * In the absence of EPT accessed and dirty bit support,
1320 * emulate the accessed bit for EPT, by checking if this page has
1246 * an EPT mapping, and clearing it if it does. On the next access, 1321 * an EPT mapping, and clearing it if it does. On the next access,
1247 * a new EPT mapping will be established. 1322 * a new EPT mapping will be established.
1248 * This has some overhead, but not as much as the cost of swapping 1323 * This has some overhead, but not as much as the cost of swapping
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1253 1328
1254 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1255 sptep = rmap_get_next(&iter)) { 1330 sptep = rmap_get_next(&iter)) {
1256 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1331 BUG_ON(!is_shadow_present_pte(*sptep));
1257 1332
1258 if (*sptep & PT_ACCESSED_MASK) { 1333 if (*sptep & shadow_accessed_mask) {
1259 young = 1; 1334 young = 1;
1260 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); 1335 clear_bit((ffs(shadow_accessed_mask) - 1),
1336 (unsigned long *)sptep);
1261 } 1337 }
1262 } 1338 }
1263 1339
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1281 1357
1282 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1358 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1283 sptep = rmap_get_next(&iter)) { 1359 sptep = rmap_get_next(&iter)) {
1284 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1360 BUG_ON(!is_shadow_present_pte(*sptep));
1285 1361
1286 if (*sptep & PT_ACCESSED_MASK) { 1362 if (*sptep & shadow_accessed_mask) {
1287 young = 1; 1363 young = 1;
1288 break; 1364 break;
1289 } 1365 }
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1401 u64 *parent_pte, int direct) 1477 u64 *parent_pte, int direct)
1402{ 1478{
1403 struct kvm_mmu_page *sp; 1479 struct kvm_mmu_page *sp;
1404 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, 1480 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1405 sizeof *sp); 1481 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1406 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1407 if (!direct) 1482 if (!direct)
1408 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 1483 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1409 PAGE_SIZE);
1410 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1484 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1411 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1485 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1412 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); 1486 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1701 1775
1702 kvm_mmu_pages_init(parent, &parents, &pages); 1776 kvm_mmu_pages_init(parent, &parents, &pages);
1703 while (mmu_unsync_walk(parent, &pages)) { 1777 while (mmu_unsync_walk(parent, &pages)) {
1704 int protected = 0; 1778 bool protected = false;
1705 1779
1706 for_each_sp(pages, sp, parents, i) 1780 for_each_sp(pages, sp, parents, i)
1707 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 1781 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1866 mmu_spte_set(sptep, spte); 1940 mmu_spte_set(sptep, spte);
1867} 1941}
1868 1942
1869static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1870{
1871 if (is_large_pte(*sptep)) {
1872 drop_spte(vcpu->kvm, sptep);
1873 --vcpu->kvm->stat.lpages;
1874 kvm_flush_remote_tlbs(vcpu->kvm);
1875 }
1876}
1877
1878static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1943static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1879 unsigned direct_access) 1944 unsigned direct_access)
1880{ 1945{
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2243 gfn_t gfn, pfn_t pfn, bool speculative, 2308 gfn_t gfn, pfn_t pfn, bool speculative,
2244 bool can_unsync, bool host_writable) 2309 bool can_unsync, bool host_writable)
2245{ 2310{
2246 u64 spte, entry = *sptep; 2311 u64 spte;
2247 int ret = 0; 2312 int ret = 0;
2248 2313
2249 if (set_mmio_spte(sptep, gfn, pfn, pte_access)) 2314 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2257 spte |= shadow_x_mask; 2322 spte |= shadow_x_mask;
2258 else 2323 else
2259 spte |= shadow_nx_mask; 2324 spte |= shadow_nx_mask;
2325
2260 if (pte_access & ACC_USER_MASK) 2326 if (pte_access & ACC_USER_MASK)
2261 spte |= shadow_user_mask; 2327 spte |= shadow_user_mask;
2328
2262 if (level > PT_PAGE_TABLE_LEVEL) 2329 if (level > PT_PAGE_TABLE_LEVEL)
2263 spte |= PT_PAGE_SIZE_MASK; 2330 spte |= PT_PAGE_SIZE_MASK;
2264 if (tdp_enabled) 2331 if (tdp_enabled)
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2283 goto done; 2350 goto done;
2284 } 2351 }
2285 2352
2286 spte |= PT_WRITABLE_MASK; 2353 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2287 2354
2288 if (!vcpu->arch.mmu.direct_map 2355 if (!vcpu->arch.mmu.direct_map
2289 && !(pte_access & ACC_WRITE_MASK)) { 2356 && !(pte_access & ACC_WRITE_MASK)) {
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2312 __func__, gfn); 2379 __func__, gfn);
2313 ret = 1; 2380 ret = 1;
2314 pte_access &= ~ACC_WRITE_MASK; 2381 pte_access &= ~ACC_WRITE_MASK;
2315 if (is_writable_pte(spte)) 2382 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2316 spte &= ~PT_WRITABLE_MASK;
2317 } 2383 }
2318 } 2384 }
2319 2385
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2321 mark_page_dirty(vcpu->kvm, gfn); 2387 mark_page_dirty(vcpu->kvm, gfn);
2322 2388
2323set_pte: 2389set_pte:
2324 mmu_spte_update(sptep, spte); 2390 if (mmu_spte_update(sptep, spte))
2325 /*
2326 * If we overwrite a writable spte with a read-only one we
2327 * should flush remote TLBs. Otherwise rmap_write_protect
2328 * will find a read-only spte, even though the writable spte
2329 * might be cached on a CPU's TLB.
2330 */
2331 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2332 kvm_flush_remote_tlbs(vcpu->kvm); 2391 kvm_flush_remote_tlbs(vcpu->kvm);
2333done: 2392done:
2334 return ret; 2393 return ret;
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2403 2462
2404static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2405{ 2464{
2465 mmu_free_roots(vcpu);
2406} 2466}
2407 2467
2408static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2468static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2625,18 +2685,116 @@ exit:
2625 return ret; 2685 return ret;
2626} 2686}
2627 2687
2688static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
2689{
2690 /*
2691 * #PF can be fast only if the shadow page table is present and it
2692 * is caused by write-protection; that means we just need to change the
2693 * W bit of the spte, which can be done out of mmu-lock.
2694 */
2695 if (!(error_code & PFERR_PRESENT_MASK) ||
2696 !(error_code & PFERR_WRITE_MASK))
2697 return false;
2698
2699 return true;
2700}
2701
2702static bool
2703fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
2704{
2705 struct kvm_mmu_page *sp = page_header(__pa(sptep));
2706 gfn_t gfn;
2707
2708 WARN_ON(!sp->role.direct);
2709
2710 /*
2711 * The gfn of a direct spte is stable since it is calculated
2712 * from sp->gfn.
2713 */
2714 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2715
2716 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2717 mark_page_dirty(vcpu->kvm, gfn);
2718
2719 return true;
2720}
2721
2722/*
2723 * Return value:
2724 * - true: let the vcpu access the same address again.
2725 * - false: let the real page fault path fix it.
2726 */
2727static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2728 u32 error_code)
2729{
2730 struct kvm_shadow_walk_iterator iterator;
2731 bool ret = false;
2732 u64 spte = 0ull;
2733
2734 if (!page_fault_can_be_fast(vcpu, error_code))
2735 return false;
2736
2737 walk_shadow_page_lockless_begin(vcpu);
2738 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
2739 if (!is_shadow_present_pte(spte) || iterator.level < level)
2740 break;
2741
2742 /*
2743 * If the mapping has been changed, let the vcpu fault on the
2744 * same address again.
2745 */
2746 if (!is_rmap_spte(spte)) {
2747 ret = true;
2748 goto exit;
2749 }
2750
2751 if (!is_last_spte(spte, level))
2752 goto exit;
2753
2754 /*
2755 * Check if it is a spurious fault caused by a lazily flushed TLB.
2756 *
2757 * Need not check the access of upper level table entries since
2758 * they are always ACC_ALL.
2759 */
2760 if (is_writable_pte(spte)) {
2761 ret = true;
2762 goto exit;
2763 }
2764
2765 /*
2766 * Currently, to simplify the code, only an spte write-protected
2767 * by dirty logging can be fast-fixed.
2768 */
2769 if (!spte_is_locklessly_modifiable(spte))
2770 goto exit;
2771
2772 /*
2773 * Currently, fast page fault only works for direct mapping since
2774 * the gfn is not stable for indirect shadow page.
2775 * See Documentation/virtual/kvm/locking.txt to get more detail.
2776 */
2777 ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
2778exit:
2779 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2780 spte, ret);
2781 walk_shadow_page_lockless_end(vcpu);
2782
2783 return ret;
2784}
2785
2628static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2786static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2629 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2787 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2630 2788
2631static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, 2789static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2632 bool prefault) 2790 gfn_t gfn, bool prefault)
2633{ 2791{
2634 int r; 2792 int r;
2635 int level; 2793 int level;
2636 int force_pt_level; 2794 int force_pt_level;
2637 pfn_t pfn; 2795 pfn_t pfn;
2638 unsigned long mmu_seq; 2796 unsigned long mmu_seq;
2639 bool map_writable; 2797 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2640 2798
2641 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2799 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2642 if (likely(!force_pt_level)) { 2800 if (likely(!force_pt_level)) {
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2653 } else 2811 } else
2654 level = PT_PAGE_TABLE_LEVEL; 2812 level = PT_PAGE_TABLE_LEVEL;
2655 2813
2814 if (fast_page_fault(vcpu, v, level, error_code))
2815 return 0;
2816
2656 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2817 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2657 smp_rmb(); 2818 smp_rmb();
2658 2819
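The core of fast_page_fault() is fast_pf_fix_direct_spte(): it restores the W bit with a single cmpxchg so that, if someone holding mmu_lock changed the spte in the meantime, the update simply fails and the vcpu retries the access. A minimal standalone sketch of that step, using a GCC/Clang atomic builtin in place of the kernel's cmpxchg64:

/* Sketch of the lockless W-bit fix at the heart of the fast page fault. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ULL << 1)

static bool fast_fix_spte(uint64_t *sptep, uint64_t old_spte)
{
        /* succeeds only if nobody modified the spte since we sampled it */
        return __sync_bool_compare_and_swap(sptep, old_spte,
                                            old_spte | PT_WRITABLE_MASK);
}

int main(void)
{
        uint64_t spte = 0x1234000ULL;   /* a read-only, present spte (made up) */

        printf("fixed=%d writable=%d\n",
               fast_fix_spte(&spte, 0x1234000ULL),
               !!(spte & PT_WRITABLE_MASK));
        return 0;
}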
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3041 gfn = gva >> PAGE_SHIFT; 3202 gfn = gva >> PAGE_SHIFT;
3042 3203
3043 return nonpaging_map(vcpu, gva & PAGE_MASK, 3204 return nonpaging_map(vcpu, gva & PAGE_MASK,
3044 error_code & PFERR_WRITE_MASK, gfn, prefault); 3205 error_code, gfn, prefault);
3045} 3206}
3046 3207
3047static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 3208static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3121 } else 3282 } else
3122 level = PT_PAGE_TABLE_LEVEL; 3283 level = PT_PAGE_TABLE_LEVEL;
3123 3284
3285 if (fast_page_fault(vcpu, gpa, level, error_code))
3286 return 0;
3287
3124 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3288 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3125 smp_rmb(); 3289 smp_rmb();
3126 3290
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3885void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4049void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3886{ 4050{
3887 struct kvm_mmu_page *sp; 4051 struct kvm_mmu_page *sp;
4052 bool flush = false;
3888 4053
3889 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4054 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3890 int i; 4055 int i;
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3899 !is_last_spte(pt[i], sp->role.level)) 4064 !is_last_spte(pt[i], sp->role.level))
3900 continue; 4065 continue;
3901 4066
3902 if (is_large_pte(pt[i])) { 4067 spte_write_protect(kvm, &pt[i], &flush, false);
3903 drop_spte(kvm, &pt[i]);
3904 --kvm->stat.lpages;
3905 continue;
3906 }
3907
3908 /* avoid RMW */
3909 if (is_writable_pte(pt[i]))
3910 mmu_spte_update(&pt[i],
3911 pt[i] & ~PT_WRITABLE_MASK);
3912 } 4068 }
3913 } 4069 }
3914 kvm_flush_remote_tlbs(kvm); 4070 kvm_flush_remote_tlbs(kvm);
@@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3945static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4101static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3946{ 4102{
3947 struct kvm *kvm; 4103 struct kvm *kvm;
3948 struct kvm *kvm_freed = NULL;
3949 int nr_to_scan = sc->nr_to_scan; 4104 int nr_to_scan = sc->nr_to_scan;
3950 4105
3951 if (nr_to_scan == 0) 4106 if (nr_to_scan == 0)
@@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3957 int idx; 4112 int idx;
3958 LIST_HEAD(invalid_list); 4113 LIST_HEAD(invalid_list);
3959 4114
4115 /*
4116 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
 4117 * here. We may skip a VM instance erroneously, but we do not
4118 * want to shrink a VM that only started to populate its MMU
4119 * anyway.
4120 */
4121 if (kvm->arch.n_used_mmu_pages > 0) {
4122 if (!nr_to_scan--)
4123 break;
4124 continue;
4125 }
4126
3960 idx = srcu_read_lock(&kvm->srcu); 4127 idx = srcu_read_lock(&kvm->srcu);
3961 spin_lock(&kvm->mmu_lock); 4128 spin_lock(&kvm->mmu_lock);
3962 if (!kvm_freed && nr_to_scan > 0 &&
3963 kvm->arch.n_used_mmu_pages > 0) {
3964 kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3965 &invalid_list);
3966 kvm_freed = kvm;
3967 }
3968 nr_to_scan--;
3969 4129
4130 kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
3970 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4131 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4132
3971 spin_unlock(&kvm->mmu_lock); 4133 spin_unlock(&kvm->mmu_lock);
3972 srcu_read_unlock(&kvm->srcu, idx); 4134 srcu_read_unlock(&kvm->srcu, idx);
4135
4136 list_move_tail(&kvm->vm_list, &vm_list);
4137 break;
3973 } 4138 }
3974 if (kvm_freed)
3975 list_move_tail(&kvm_freed->vm_list, &vm_list);
3976 4139
3977 raw_spin_unlock(&kvm_lock); 4140 raw_spin_unlock(&kvm_lock);
3978 4141
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322a..cd6e98333ba3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
54 */ 54 */
55TRACE_EVENT( 55TRACE_EVENT(
56 kvm_mmu_pagetable_walk, 56 kvm_mmu_pagetable_walk,
57 TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), 57 TP_PROTO(u64 addr, u32 pferr),
58 TP_ARGS(addr, write_fault, user_fault, fetch_fault), 58 TP_ARGS(addr, pferr),
59 59
60 TP_STRUCT__entry( 60 TP_STRUCT__entry(
61 __field(__u64, addr) 61 __field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
64 64
65 TP_fast_assign( 65 TP_fast_assign(
66 __entry->addr = addr; 66 __entry->addr = addr;
67 __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) 67 __entry->pferr = pferr;
68 | (!!fetch_fault << 4);
69 ), 68 ),
70 69
71 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, 70 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, 242 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access) 243 __entry->access)
245); 244);
245
246#define __spte_satisfied(__spte) \
247 (__entry->retry && is_writable_pte(__entry->__spte))
248
249TRACE_EVENT(
250 fast_page_fault,
251 TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
252 u64 *sptep, u64 old_spte, bool retry),
253 TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
254
255 TP_STRUCT__entry(
256 __field(int, vcpu_id)
257 __field(gva_t, gva)
258 __field(u32, error_code)
259 __field(u64 *, sptep)
260 __field(u64, old_spte)
261 __field(u64, new_spte)
262 __field(bool, retry)
263 ),
264
265 TP_fast_assign(
266 __entry->vcpu_id = vcpu->vcpu_id;
267 __entry->gva = gva;
268 __entry->error_code = error_code;
269 __entry->sptep = sptep;
270 __entry->old_spte = old_spte;
271 __entry->new_spte = *sptep;
272 __entry->retry = retry;
273 ),
274
275 TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
276 " new %llx spurious %d fixed %d", __entry->vcpu_id,
277 __entry->gva, __print_flags(__entry->error_code, "|",
278 kvm_mmu_trace_pferr_flags), __entry->sptep,
279 __entry->old_spte, __entry->new_spte,
280 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
281 )
282);
246#endif /* _TRACE_KVMMMU_H */ 283#endif /* _TRACE_KVMMMU_H */
247 284
248#undef TRACE_INCLUDE_PATH 285#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f970937ef1..bb7cf01cae76 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
154 const int fetch_fault = access & PFERR_FETCH_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 155 u16 errcode = 0;
156 156
157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, access);
158 fetch_fault);
159retry_walk: 158retry_walk:
160 eperm = false; 159 eperm = false;
161 walker->level = mmu->root_level; 160 walker->level = mmu->root_level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f75af406b268..baead950d6c8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3185 break; 3185 break;
3186 case MSR_IA32_DEBUGCTLMSR: 3186 case MSR_IA32_DEBUGCTLMSR:
3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3188 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3188 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3189 __func__, data); 3189 __func__, data);
3190 break; 3190 break;
3191 } 3191 }
3192 if (data & DEBUGCTL_RESERVED_BITS) 3192 if (data & DEBUGCTL_RESERVED_BITS)
@@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3205 case MSR_VM_CR: 3205 case MSR_VM_CR:
3206 return svm_set_vm_cr(vcpu, data); 3206 return svm_set_vm_cr(vcpu, data);
3207 case MSR_VM_IGNNE: 3207 case MSR_VM_IGNNE:
3208 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3208 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3209 break; 3209 break;
3210 default: 3210 default:
3211 return kvm_set_msr_common(vcpu, ecx, data); 3211 return kvm_set_msr_common(vcpu, ecx, data);
@@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
4044 return false; 4044 return false;
4045} 4045}
4046 4046
4047static bool svm_invpcid_supported(void)
4048{
4049 return false;
4050}
4051
4047static bool svm_has_wbinvd_exit(void) 4052static bool svm_has_wbinvd_exit(void)
4048{ 4053{
4049 return true; 4054 return true;
@@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4312 .cpuid_update = svm_cpuid_update, 4317 .cpuid_update = svm_cpuid_update,
4313 4318
4314 .rdtscp_supported = svm_rdtscp_supported, 4319 .rdtscp_supported = svm_rdtscp_supported,
4320 .invpcid_supported = svm_invpcid_supported,
4315 4321
4316 .set_supported_cpuid = svm_set_supported_cpuid, 4322 .set_supported_cpuid = svm_set_supported_cpuid,
4317 4323
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 62d02e3c3ed6..a71faf727ff3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
517 __entry->coalesced ? " (coalesced)" : "") 517 __entry->coalesced ? " (coalesced)" : "")
518); 518);
519 519
520TRACE_EVENT(kvm_eoi,
521 TP_PROTO(struct kvm_lapic *apic, int vector),
522 TP_ARGS(apic, vector),
523
524 TP_STRUCT__entry(
525 __field( __u32, apicid )
526 __field( int, vector )
527 ),
528
529 TP_fast_assign(
530 __entry->apicid = apic->vcpu->vcpu_id;
531 __entry->vector = vector;
532 ),
533
534 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
535);
536
537TRACE_EVENT(kvm_pv_eoi,
538 TP_PROTO(struct kvm_lapic *apic, int vector),
539 TP_ARGS(apic, vector),
540
541 TP_STRUCT__entry(
542 __field( __u32, apicid )
543 __field( int, vector )
544 ),
545
546 TP_fast_assign(
547 __entry->apicid = apic->vcpu->vcpu_id;
548 __entry->vector = vector;
549 ),
550
551 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
552);
553
520/* 554/*
521 * Tracepoint for nested VMRUN 555 * Tracepoint for nested VMRUN
522 */ 556 */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32eb58866292..c39b60707e02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;
71module_param_named(unrestricted_guest, 71module_param_named(unrestricted_guest,
72 enable_unrestricted_guest, bool, S_IRUGO); 72 enable_unrestricted_guest, bool, S_IRUGO);
73 73
74static bool __read_mostly emulate_invalid_guest_state = 0; 74static bool __read_mostly enable_ept_ad_bits = 1;
75module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
76
77static bool __read_mostly emulate_invalid_guest_state = true;
75module_param(emulate_invalid_guest_state, bool, S_IRUGO); 78module_param(emulate_invalid_guest_state, bool, S_IRUGO);
76 79
77static bool __read_mostly vmm_exclusive = 1; 80static bool __read_mostly vmm_exclusive = 1;
@@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
615static void kvm_cpu_vmxoff(void); 618static void kvm_cpu_vmxoff(void);
616static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 619static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
617static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 620static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
621static void vmx_set_segment(struct kvm_vcpu *vcpu,
622 struct kvm_segment *var, int seg);
623static void vmx_get_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg);
618 625
619static DEFINE_PER_CPU(struct vmcs *, vmxarea); 626static DEFINE_PER_CPU(struct vmcs *, vmxarea);
620static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 627static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
789 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 796 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
790} 797}
791 798
799static inline bool cpu_has_vmx_ept_ad_bits(void)
800{
801 return vmx_capability.ept & VMX_EPT_AD_BIT;
802}
803
792static inline bool cpu_has_vmx_invept_individual_addr(void) 804static inline bool cpu_has_vmx_invept_individual_addr(void)
793{ 805{
794 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 806 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
849 SECONDARY_EXEC_RDTSCP; 861 SECONDARY_EXEC_RDTSCP;
850} 862}
851 863
864static inline bool cpu_has_vmx_invpcid(void)
865{
866 return vmcs_config.cpu_based_2nd_exec_ctrl &
867 SECONDARY_EXEC_ENABLE_INVPCID;
868}
869
852static inline bool cpu_has_virtual_nmis(void) 870static inline bool cpu_has_virtual_nmis(void)
853{ 871{
854 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 872 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)
1739 return cpu_has_vmx_rdtscp(); 1757 return cpu_has_vmx_rdtscp();
1740} 1758}
1741 1759
1760static bool vmx_invpcid_supported(void)
1761{
1762 return cpu_has_vmx_invpcid() && enable_ept;
1763}
1764
1742/* 1765/*
1743 * Swap MSR entry in host/guest MSR entry array. 1766 * Swap MSR entry in host/guest MSR entry array.
1744 */ 1767 */
@@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2458 SECONDARY_EXEC_ENABLE_EPT | 2481 SECONDARY_EXEC_ENABLE_EPT |
2459 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2482 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2460 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2483 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2461 SECONDARY_EXEC_RDTSCP; 2484 SECONDARY_EXEC_RDTSCP |
2485 SECONDARY_EXEC_ENABLE_INVPCID;
2462 if (adjust_vmx_controls(min2, opt2, 2486 if (adjust_vmx_controls(min2, opt2,
2463 MSR_IA32_VMX_PROCBASED_CTLS2, 2487 MSR_IA32_VMX_PROCBASED_CTLS2,
2464 &_cpu_based_2nd_exec_control) < 0) 2488 &_cpu_based_2nd_exec_control) < 0)
@@ -2645,8 +2669,12 @@ static __init int hardware_setup(void)
2645 !cpu_has_vmx_ept_4levels()) { 2669 !cpu_has_vmx_ept_4levels()) {
2646 enable_ept = 0; 2670 enable_ept = 0;
2647 enable_unrestricted_guest = 0; 2671 enable_unrestricted_guest = 0;
2672 enable_ept_ad_bits = 0;
2648 } 2673 }
2649 2674
2675 if (!cpu_has_vmx_ept_ad_bits())
2676 enable_ept_ad_bits = 0;
2677
2650 if (!cpu_has_vmx_unrestricted_guest()) 2678 if (!cpu_has_vmx_unrestricted_guest())
2651 enable_unrestricted_guest = 0; 2679 enable_unrestricted_guest = 0;
2652 2680
@@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2770{ 2798{
2771 unsigned long flags; 2799 unsigned long flags;
2772 struct vcpu_vmx *vmx = to_vmx(vcpu); 2800 struct vcpu_vmx *vmx = to_vmx(vcpu);
2801 struct kvm_segment var;
2773 2802
2774 if (enable_unrestricted_guest) 2803 if (enable_unrestricted_guest)
2775 return; 2804 return;
@@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2813 if (emulate_invalid_guest_state) 2842 if (emulate_invalid_guest_state)
2814 goto continue_rmode; 2843 goto continue_rmode;
2815 2844
2816 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 2845 vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
2817 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 2846 vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
2818 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 2847
2848 vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
2849 vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
2850
2851 vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
2852 vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
2853
2854 vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
2855 vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
2819 2856
2820 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 2857 vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
2821 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 2858 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2822 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
2823 vmcs_writel(GUEST_CS_BASE, 0xf0000);
2824 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
2825 2859
2826 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); 2860 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2827 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); 2861 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2828 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
2829 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
2830 2862
2831continue_rmode: 2863continue_rmode:
2832 kvm_mmu_reset_context(vcpu); 2864 kvm_mmu_reset_context(vcpu);
@@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa)
3027 /* TODO write the value reading from MSR */ 3059 /* TODO write the value reading from MSR */
3028 eptp = VMX_EPT_DEFAULT_MT | 3060 eptp = VMX_EPT_DEFAULT_MT |
3029 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3061 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3062 if (enable_ept_ad_bits)
3063 eptp |= VMX_EPT_AD_ENABLE_BIT;
3030 eptp |= (root_hpa & PAGE_MASK); 3064 eptp |= (root_hpa & PAGE_MASK);
3031 3065
3032 return eptp; 3066 return eptp;
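construct_eptp() now folds in the accessed/dirty enable bit when enable_ept_ad_bits is set. A standalone sketch of how the EPT pointer is assembled, with field positions as in the defines used above (memory type in bits 2:0, page-walk length minus one in bits 5:3, A/D enable in bit 6, root table address above):

/* Sketch of EPTP assembly including the new A/D-enable bit. */
#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB      6ULL            /* write-back memory type          */
#define EPTP_WALK_LEN_4 (3ULL << 3)     /* 4-level page walk: (4 - 1) << 3 */
#define EPTP_AD_ENABLE  (1ULL << 6)     /* enable EPT accessed/dirty bits  */
#define PAGE_MASK_4K    (~0xfffULL)

static uint64_t construct_eptp(uint64_t root_hpa, int ad_bits)
{
        uint64_t eptp = EPTP_MT_WB | EPTP_WALK_LEN_4;

        if (ad_bits)
                eptp |= EPTP_AD_ENABLE;
        return eptp | (root_hpa & PAGE_MASK_4K);
}

int main(void)
{
        printf("eptp = %#llx\n",
               (unsigned long long)construct_eptp(0x12345000ULL, 1));
        return 0;
}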
@@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3153 3187
3154static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3188static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3155{ 3189{
3190 struct vcpu_vmx *vmx = to_vmx(vcpu);
3191
3192 /*
3193 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3194 * fail; use the cache instead.
3195 */
3196 if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
3197 return vmx->cpl;
3198 }
3199
3156 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3200 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3157 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3201 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3158 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); 3202 vmx->cpl = __vmx_get_cpl(vcpu);
3159 } 3203 }
3160 return to_vmx(vcpu)->cpl; 3204
3205 return vmx->cpl;
3161} 3206}
3162 3207
3163 3208
@@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3165{ 3210{
3166 u32 ar; 3211 u32 ar;
3167 3212
3168 if (var->unusable) 3213 if (var->unusable || !var->present)
3169 ar = 1 << 16; 3214 ar = 1 << 16;
3170 else { 3215 else {
3171 ar = var->type & 15; 3216 ar = var->type & 15;
@@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3177 ar |= (var->db & 1) << 14; 3222 ar |= (var->db & 1) << 14;
3178 ar |= (var->g & 1) << 15; 3223 ar |= (var->g & 1) << 15;
3179 } 3224 }
3180 if (ar == 0) /* a 0 value means unusable */
3181 ar = AR_UNUSABLE_MASK;
3182 3225
3183 return ar; 3226 return ar;
3184} 3227}
@@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3229 3272
3230 vmcs_write32(sf->ar_bytes, ar); 3273 vmcs_write32(sf->ar_bytes, ar);
3231 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3274 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275
3276 /*
 3277 * Fix segments for a real mode guest on hosts that don't have
 3278 * "unrestricted_mode" or where it was disabled.
 3279 * This is done to allow migration of guests from hosts with
 3280 * unrestricted guest support (such as Westmere) to older hosts
 3281 * that don't have it (such as Nehalem).
3282 */
3283 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
3284 switch (seg) {
3285 case VCPU_SREG_CS:
3286 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3287 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3288 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3289 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3290 vmcs_write16(GUEST_CS_SELECTOR,
3291 vmcs_readl(GUEST_CS_BASE) >> 4);
3292 break;
3293 case VCPU_SREG_ES:
3294 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3295 break;
3296 case VCPU_SREG_DS:
3297 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3298 break;
3299 case VCPU_SREG_GS:
3300 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3301 break;
3302 case VCPU_SREG_FS:
3303 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
3304 break;
3305 case VCPU_SREG_SS:
3306 vmcs_write16(GUEST_SS_SELECTOR,
3307 vmcs_readl(GUEST_SS_BASE) >> 4);
3308 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3309 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3310 break;
3311 }
3312 }
3232} 3313}
3233 3314
3234static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3315static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
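The real-mode fix-ups above keep selector and base consistent: in real mode the effective segment base is always selector << 4, so once the base is representable the selector can be recovered as base >> 4 (which is why the architectural reset alias base 0xffff0000 is first rewritten to 0xf0000). A tiny sketch, runnable in user space:

#include <stdint.h>
#include <assert.h>

/* Real-mode segmentation: base is always selector * 16. */
static uint16_t selector_from_base(uint32_t base)
{
	return (uint16_t)(base >> 4);
}

int main(void)
{
	assert(selector_from_base(0xf0000) == 0xf000);	/* rewritten reset CS */
	assert(selector_from_base(0xb8000) == 0xb800);	/* VGA text segment   */
	return 0;
}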
@@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3731 if (!enable_ept) { 3812 if (!enable_ept) {
3732 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 3813 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3733 enable_unrestricted_guest = 0; 3814 enable_unrestricted_guest = 0;
3815 /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
3816 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
3734 } 3817 }
3735 if (!enable_unrestricted_guest) 3818 if (!enable_unrestricted_guest)
3736 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3819 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
4489 break; 4572 break;
4490 } 4573 }
4491 vcpu->run->exit_reason = 0; 4574 vcpu->run->exit_reason = 0;
4492 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4575 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
4493 (int)(exit_qualification >> 4) & 3, cr); 4576 (int)(exit_qualification >> 4) & 3, cr);
4494 return 0; 4577 return 0;
4495} 4578}
@@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4769{ 4852{
4770 unsigned long exit_qualification; 4853 unsigned long exit_qualification;
4771 gpa_t gpa; 4854 gpa_t gpa;
4855 u32 error_code;
4772 int gla_validity; 4856 int gla_validity;
4773 4857
4774 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4858 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4793 4877
4794 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4878 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4795 trace_kvm_page_fault(gpa, exit_qualification); 4879 trace_kvm_page_fault(gpa, exit_qualification);
4796 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); 4880
4881 /* Is it a write fault? */
4882 error_code = exit_qualification & (1U << 1);
4883 /* Is the EPT page-table entry present? */
4884 error_code |= (exit_qualification >> 3) & 0x1;
4885
4886 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
4797} 4887}
4798 4888
4799static u64 ept_rsvd_mask(u64 spte, int level) 4889static u64 ept_rsvd_mask(u64 spte, int level)
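The EPT-violation handler above now folds the exit qualification into the MMU's page-fault error-code convention: bit 1 of the qualification says the guest attempted a write, and bit 3 says the translation was readable, i.e. at least present. A minimal sketch of that mapping, with assumed PFERR-style names:

#include <stdint.h>

#define SKETCH_PFERR_PRESENT	(1u << 0)
#define SKETCH_PFERR_WRITE	(1u << 1)

/* Map an EPT-violation exit qualification to a #PF-style error code. */
static uint32_t sketch_ept_error_code(uint64_t exit_qualification)
{
	uint32_t error_code = 0;

	if (exit_qualification & (1u << 1))	/* guest write access       */
		error_code |= SKETCH_PFERR_WRITE;
	if (exit_qualification & (1u << 3))	/* EPT entry was readable   */
		error_code |= SKETCH_PFERR_PRESENT;

	return error_code;
}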
@@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4908 int ret = 1; 4998 int ret = 1;
4909 u32 cpu_exec_ctrl; 4999 u32 cpu_exec_ctrl;
4910 bool intr_window_requested; 5000 bool intr_window_requested;
5001 unsigned count = 130;
4911 5002
4912 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5003 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4913 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5004 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
4914 5005
4915 while (!guest_state_valid(vcpu)) { 5006 while (!guest_state_valid(vcpu) && count-- != 0) {
4916 if (intr_window_requested 5007 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
4917 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
4918 return handle_interrupt_window(&vmx->vcpu); 5008 return handle_interrupt_window(&vmx->vcpu);
4919 5009
5010 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5011 return 1;
5012
4920 err = emulate_instruction(vcpu, 0); 5013 err = emulate_instruction(vcpu, 0);
4921 5014
4922 if (err == EMULATE_DO_MMIO) { 5015 if (err == EMULATE_DO_MMIO) {
@@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4924 goto out; 5017 goto out;
4925 } 5018 }
4926 5019
4927 if (err != EMULATE_DONE) 5020 if (err != EMULATE_DONE) {
5021 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5022 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5023 vcpu->run->internal.ndata = 0;
4928 return 0; 5024 return 0;
5025 }
4929 5026
4930 if (signal_pending(current)) 5027 if (signal_pending(current))
4931 goto out; 5028 goto out;
@@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4933 schedule(); 5030 schedule();
4934 } 5031 }
4935 5032
4936 vmx->emulation_required = 0; 5033 vmx->emulation_required = !guest_state_valid(vcpu);
4937out: 5034out:
4938 return ret; 5035 return ret;
4939} 5036}
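handle_invalid_guest_state() now bounds the emulation loop (130 iterations), bails out when an interrupt window or pending event needs service, reports emulation failures as KVM_EXIT_INTERNAL_ERROR, and recomputes emulation_required on the way out instead of assuming the state became valid. A generic sketch of that bounded-retry shape, not the kernel code:

#include <stdbool.h>

/* Generic bounded retry loop with cooperative bail-out points. */
static int run_until_valid(bool (*state_valid)(void), int (*step)(void),
			   bool (*must_yield)(void))
{
	int budget = 130;			/* matches the cap added above */

	while (!state_valid() && budget-- > 0) {
		if (must_yield())		/* interrupt window, pending event */
			return 1;
		if (step() < 0)			/* emulation failed */
			return -1;
	}
	return 0;				/* caller rechecks state_valid() */
}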
@@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
6467 } 6564 }
6468 } 6565 }
6469 } 6566 }
6567
6568 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6569 /* Expose INVPCID to the guest only when PCID is also exposed */
6570 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6571 if (vmx_invpcid_supported() &&
6572 best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
6573 guest_cpuid_has_pcid(vcpu)) {
6574 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
6575 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6576 exec_control);
6577 } else {
6578 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6579 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6580 exec_control);
6581 if (best)
6582 best->ecx &= ~bit(X86_FEATURE_INVPCID);
6583 }
6470} 6584}
6471 6585
6472static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6586static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7201 .cpuid_update = vmx_cpuid_update, 7315 .cpuid_update = vmx_cpuid_update,
7202 7316
7203 .rdtscp_supported = vmx_rdtscp_supported, 7317 .rdtscp_supported = vmx_rdtscp_supported,
7318 .invpcid_supported = vmx_invpcid_supported,
7204 7319
7205 .set_supported_cpuid = vmx_set_supported_cpuid, 7320 .set_supported_cpuid = vmx_set_supported_cpuid,
7206 7321
@@ -7230,23 +7345,21 @@ static int __init vmx_init(void)
7230 if (!vmx_io_bitmap_a) 7345 if (!vmx_io_bitmap_a)
7231 return -ENOMEM; 7346 return -ENOMEM;
7232 7347
7348 r = -ENOMEM;
7349
7233 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 7350 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
7234 if (!vmx_io_bitmap_b) { 7351 if (!vmx_io_bitmap_b)
7235 r = -ENOMEM;
7236 goto out; 7352 goto out;
7237 }
7238 7353
7239 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 7354 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
7240 if (!vmx_msr_bitmap_legacy) { 7355 if (!vmx_msr_bitmap_legacy)
7241 r = -ENOMEM;
7242 goto out1; 7356 goto out1;
7243 } 7357
7244 7358
7245 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7359 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7246 if (!vmx_msr_bitmap_longmode) { 7360 if (!vmx_msr_bitmap_longmode)
7247 r = -ENOMEM;
7248 goto out2; 7361 goto out2;
7249 } 7362
7250 7363
7251 /* 7364 /*
7252 * Allow direct access to the PC debug port (it is often used for I/O 7365 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7275,8 +7388,10 @@ static int __init vmx_init(void)
7275 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7388 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7276 7389
7277 if (enable_ept) { 7390 if (enable_ept) {
7278 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7391 kvm_mmu_set_mask_ptes(0ull,
7279 VMX_EPT_EXECUTABLE_MASK); 7392 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
7393 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
7394 0ull, VMX_EPT_EXECUTABLE_MASK);
7280 ept_set_mmio_spte_mask(); 7395 ept_set_mmio_spte_mask();
7281 kvm_enable_tdp(); 7396 kvm_enable_tdp();
7282 } else 7397 } else
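When EPT accessed/dirty bits are enabled, the kvm_mmu_set_mask_ptes() call above tells the generic MMU which SPTE bits the hardware will update; for EPT leaf entries these are bits 8 (accessed) and 9 (dirty) per the Intel SDM. A small sketch of the mask selection, with illustrative names and values not taken from this diff:

#include <stdint.h>

/* EPT leaf-entry flag positions when A/D logging is enabled (Intel SDM). */
#define SKETCH_EPT_ACCESSED_BIT	(1ull << 8)
#define SKETCH_EPT_DIRTY_BIT	(1ull << 9)

static void sketch_choose_ad_masks(int ad_enabled, uint64_t *accessed_mask,
				   uint64_t *dirty_mask)
{
	*accessed_mask = ad_enabled ? SKETCH_EPT_ACCESSED_BIT : 0;
	*dirty_mask    = ad_enabled ? SKETCH_EPT_DIRTY_BIT : 0;
}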
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be6d54929fa7..59b59508ff07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
528 return 1; 528 return 1;
529 } 529 }
530 530
531 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
532 return 1;
533
531 kvm_x86_ops->set_cr0(vcpu, cr0); 534 kvm_x86_ops->set_cr0(vcpu, cr0);
532 535
533 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 536 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
604 kvm_read_cr3(vcpu))) 607 kvm_read_cr3(vcpu)))
605 return 1; 608 return 1;
606 609
610 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
611 if (!guest_cpuid_has_pcid(vcpu))
612 return 1;
613
614 /* PCID cannot be enabled when CR3[11:0] != 0 or EFER.LMA = 0 */
615 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
616 return 1;
617 }
618
607 if (kvm_x86_ops->set_cr4(vcpu, cr4)) 619 if (kvm_x86_ops->set_cr4(vcpu, cr4))
608 return 1; 620 return 1;
609 621
610 if ((cr4 ^ old_cr4) & pdptr_bits) 622 if (((cr4 ^ old_cr4) & pdptr_bits) ||
623 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
611 kvm_mmu_reset_context(vcpu); 624 kvm_mmu_reset_context(vcpu);
612 625
613 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 626 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
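The CR4 path above only lets a guest set CR4.PCIDE when it advertises PCID in CPUID, is in long mode, and has CR3[11:0] clear; clearing PCIDE is treated like a paging-mode change and resets the MMU context. A minimal validation sketch under those assumptions (names and the bit-17 value are illustrative):

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_CR4_PCIDE	(1ull << 17)	/* CR4.PCIDE bit position */
#define SKETCH_CR3_PCID_MASK	0xfffull	/* CR3 bits 11:0          */

/* Returns true if flipping CR4.PCIDE on is architecturally allowed. */
static bool sketch_can_enable_pcide(uint64_t cr3, bool guest_has_pcid,
				    bool long_mode_active)
{
	if (!guest_has_pcid)
		return false;
	if (!long_mode_active)
		return false;
	if (cr3 & SKETCH_CR3_PCID_MASK)		/* PCID must start out as 0 */
		return false;
	return true;
}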
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
626 } 639 }
627 640
628 if (is_long_mode(vcpu)) { 641 if (is_long_mode(vcpu)) {
629 if (cr3 & CR3_L_MODE_RESERVED_BITS) 642 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
630 return 1; 643 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
644 return 1;
645 } else
646 if (cr3 & CR3_L_MODE_RESERVED_BITS)
647 return 1;
631 } else { 648 } else {
632 if (is_pae(vcpu)) { 649 if (is_pae(vcpu)) {
633 if (cr3 & CR3_PAE_RESERVED_BITS) 650 if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -795,6 +812,7 @@ static u32 msrs_to_save[] = {
795 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
796 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 813 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
797 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 814 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
815 MSR_KVM_PV_EOI_EN,
798 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 816 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
799 MSR_STAR, 817 MSR_STAR,
800#ifdef CONFIG_X86_64 818#ifdef CONFIG_X86_64
@@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1437 break; 1455 break;
1438 } 1456 }
1439 default: 1457 default:
1440 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1458 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1441 "data 0x%llx\n", msr, data); 1459 "data 0x%llx\n", msr, data);
1442 return 1; 1460 return 1;
1443 } 1461 }
1444 return 0; 1462 return 0;
@@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1470 case HV_X64_MSR_TPR: 1488 case HV_X64_MSR_TPR:
1471 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1489 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1472 default: 1490 default:
1473 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1491 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1474 "data 0x%llx\n", msr, data); 1492 "data 0x%llx\n", msr, data);
1475 return 1; 1493 return 1;
1476 } 1494 }
1477 1495
@@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1569 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */ 1570 data &= ~(u64)0x8; /* ignore TLB cache disable */
1553 if (data != 0) { 1571 if (data != 0) {
1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1572 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1555 data); 1573 data);
1556 return 1; 1574 return 1;
1557 } 1575 }
1558 break; 1576 break;
1559 case MSR_FAM10H_MMIO_CONF_BASE: 1577 case MSR_FAM10H_MMIO_CONF_BASE:
1560 if (data != 0) { 1578 if (data != 0) {
1561 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1579 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1562 "0x%llx\n", data); 1580 "0x%llx\n", data);
1563 return 1; 1581 return 1;
1564 } 1582 }
1565 break; 1583 break;
@@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1574 thus reserved and should throw a #GP */ 1592 thus reserved and should throw a #GP */
1575 return 1; 1593 return 1;
1576 } 1594 }
1577 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1595 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1578 __func__, data); 1596 __func__, data);
1579 break; 1597 break;
1580 case MSR_IA32_UCODE_REV: 1598 case MSR_IA32_UCODE_REV:
1581 case MSR_IA32_UCODE_WRITE: 1599 case MSR_IA32_UCODE_WRITE:
@@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1653 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 1671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1654 1672
1655 break; 1673 break;
1674 case MSR_KVM_PV_EOI_EN:
1675 if (kvm_lapic_enable_pv_eoi(vcpu, data))
1676 return 1;
1677 break;
1656 1678
1657 case MSR_IA32_MCG_CTL: 1679 case MSR_IA32_MCG_CTL:
1658 case MSR_IA32_MCG_STATUS: 1680 case MSR_IA32_MCG_STATUS:
@@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1671 case MSR_K7_EVNTSEL2: 1693 case MSR_K7_EVNTSEL2:
1672 case MSR_K7_EVNTSEL3: 1694 case MSR_K7_EVNTSEL3:
1673 if (data != 0) 1695 if (data != 0)
1674 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1696 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1675 "0x%x data 0x%llx\n", msr, data); 1697 "0x%x data 0x%llx\n", msr, data);
1676 break; 1698 break;
1677 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1699 /* at least RHEL 4 unconditionally writes to the perfctr registers,
1678 * so we ignore writes to make it happy. 1700 * so we ignore writes to make it happy.
@@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1681 case MSR_K7_PERFCTR1: 1703 case MSR_K7_PERFCTR1:
1682 case MSR_K7_PERFCTR2: 1704 case MSR_K7_PERFCTR2:
1683 case MSR_K7_PERFCTR3: 1705 case MSR_K7_PERFCTR3:
1684 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1706 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1685 "0x%x data 0x%llx\n", msr, data); 1707 "0x%x data 0x%llx\n", msr, data);
1686 break; 1708 break;
1687 case MSR_P6_PERFCTR0: 1709 case MSR_P6_PERFCTR0:
1688 case MSR_P6_PERFCTR1: 1710 case MSR_P6_PERFCTR1:
@@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1693 return kvm_pmu_set_msr(vcpu, msr, data); 1715 return kvm_pmu_set_msr(vcpu, msr, data);
1694 1716
1695 if (pr || data != 0) 1717 if (pr || data != 0)
1696 pr_unimpl(vcpu, "disabled perfctr wrmsr: " 1718 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
1697 "0x%x data 0x%llx\n", msr, data); 1719 "0x%x data 0x%llx\n", msr, data);
1698 break; 1720 break;
1699 case MSR_K7_CLK_CTL: 1721 case MSR_K7_CLK_CTL:
1700 /* 1722 /*
@@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1720 /* Drop writes to this legacy MSR -- see rdmsr 1742 /* Drop writes to this legacy MSR -- see rdmsr
1721 * counterpart for further detail. 1743 * counterpart for further detail.
1722 */ 1744 */
1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1745 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1724 break; 1746 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH: 1747 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu)) 1748 if (!guest_cpuid_has_osvw(vcpu))
@@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1738 if (kvm_pmu_msr(vcpu, msr)) 1760 if (kvm_pmu_msr(vcpu, msr))
1739 return kvm_pmu_set_msr(vcpu, msr, data); 1761 return kvm_pmu_set_msr(vcpu, msr, data);
1740 if (!ignore_msrs) { 1762 if (!ignore_msrs) {
1741 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1763 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1742 msr, data); 1764 msr, data);
1743 return 1; 1765 return 1;
1744 } else { 1766 } else {
1745 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1767 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1746 msr, data); 1768 msr, data);
1747 break; 1769 break;
1748 } 1770 }
1749 } 1771 }
@@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 data = kvm->arch.hv_hypercall; 1868 data = kvm->arch.hv_hypercall;
1847 break; 1869 break;
1848 default: 1870 default:
1849 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1871 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1850 return 1; 1872 return 1;
1851 } 1873 }
1852 1874
@@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1877 data = vcpu->arch.hv_vapic; 1899 data = vcpu->arch.hv_vapic;
1878 break; 1900 break;
1879 default: 1901 default:
1880 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1902 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1881 return 1; 1903 return 1;
1882 } 1904 }
1883 *pdata = data; 1905 *pdata = data;
@@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2030 if (kvm_pmu_msr(vcpu, msr)) 2052 if (kvm_pmu_msr(vcpu, msr))
2031 return kvm_pmu_get_msr(vcpu, msr, pdata); 2053 return kvm_pmu_get_msr(vcpu, msr, pdata);
2032 if (!ignore_msrs) { 2054 if (!ignore_msrs) {
2033 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2055 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2034 return 1; 2056 return 1;
2035 } else { 2057 } else {
2036 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2058 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2037 data = 0; 2059 data = 0;
2038 } 2060 }
2039 break; 2061 break;
@@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4116 value = kvm_get_cr8(vcpu); 4138 value = kvm_get_cr8(vcpu);
4117 break; 4139 break;
4118 default: 4140 default:
4119 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4141 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4120 return 0; 4142 return 0;
4121 } 4143 }
4122 4144
@@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4145 res = kvm_set_cr8(vcpu, val); 4167 res = kvm_set_cr8(vcpu, val);
4146 break; 4168 break;
4147 default: 4169 default:
4148 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4170 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4149 res = -1; 4171 res = -1;
4150 } 4172 }
4151 4173
@@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4297 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 4319 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4298} 4320}
4299 4321
4300static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 4322static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4301 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 4323 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4302{ 4324{
4303 struct kvm_cpuid_entry2 *cpuid = NULL; 4325 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4304
4305 if (eax && ecx)
4306 cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
4307 *eax, *ecx);
4308
4309 if (cpuid) {
4310 *eax = cpuid->eax;
4311 *ecx = cpuid->ecx;
4312 if (ebx)
4313 *ebx = cpuid->ebx;
4314 if (edx)
4315 *edx = cpuid->edx;
4316 return true;
4317 }
4318
4319 return false;
4320} 4326}
4321 4327
4322static struct x86_emulate_ops emulate_ops = { 4328static struct x86_emulate_ops emulate_ops = {
@@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5296 5302
5297 r = kvm_mmu_reload(vcpu); 5303 r = kvm_mmu_reload(vcpu);
5298 if (unlikely(r)) { 5304 if (unlikely(r)) {
5299 kvm_x86_ops->cancel_injection(vcpu); 5305 goto cancel_injection;
5300 goto out;
5301 } 5306 }
5302 5307
5303 preempt_disable(); 5308 preempt_disable();
@@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5322 smp_wmb(); 5327 smp_wmb();
5323 local_irq_enable(); 5328 local_irq_enable();
5324 preempt_enable(); 5329 preempt_enable();
5325 kvm_x86_ops->cancel_injection(vcpu);
5326 r = 1; 5330 r = 1;
5327 goto out; 5331 goto cancel_injection;
5328 } 5332 }
5329 5333
5330 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5334 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5388 if (unlikely(vcpu->arch.tsc_always_catchup)) 5392 if (unlikely(vcpu->arch.tsc_always_catchup))
5389 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5393 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5390 5394
5391 kvm_lapic_sync_from_vapic(vcpu); 5395 if (vcpu->arch.apic_attention)
5396 kvm_lapic_sync_from_vapic(vcpu);
5392 5397
5393 r = kvm_x86_ops->handle_exit(vcpu); 5398 r = kvm_x86_ops->handle_exit(vcpu);
5399 return r;
5400
5401cancel_injection:
5402 kvm_x86_ops->cancel_injection(vcpu);
5403 if (unlikely(vcpu->arch.apic_attention))
5404 kvm_lapic_sync_from_vapic(vcpu);
5394out: 5405out:
5395 return r; 5406 return r;
5396} 5407}
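vcpu_enter_guest() now funnels its early-exit paths through a single cancel_injection label that undoes the queued event injection and, if the vAPIC was being attended to, resyncs it; the normal exit path likewise touches the vAPIC only when apic_attention is set. A generic sketch of that single-unwind-label shape:

/* Generic shape of the single-unwind-label pattern used above. */
static int sketch_enter_guest(int (*prepare)(void), int (*run)(void),
			      void (*cancel)(void), void (*sync)(void))
{
	int r;

	r = prepare();
	if (r)
		goto cancel_injection;	/* undo queued work on every early exit */

	r = run();
	return r;

cancel_injection:
	cancel();
	sync();				/* keep side state consistent */
	return r;
}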
@@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6304 6315
6305 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6316 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6306 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { 6317 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6307 vfree(free->arch.lpage_info[i]); 6318 kvm_kvfree(free->arch.lpage_info[i]);
6308 free->arch.lpage_info[i] = NULL; 6319 free->arch.lpage_info[i] = NULL;
6309 } 6320 }
6310 } 6321 }
@@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6323 slot->base_gfn, level) + 1; 6334 slot->base_gfn, level) + 1;
6324 6335
6325 slot->arch.lpage_info[i] = 6336 slot->arch.lpage_info[i] =
6326 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); 6337 kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6327 if (!slot->arch.lpage_info[i]) 6338 if (!slot->arch.lpage_info[i])
6328 goto out_free; 6339 goto out_free;
6329 6340
@@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6350 6361
6351out_free: 6362out_free:
6352 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6363 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6353 vfree(slot->arch.lpage_info[i]); 6364 kvm_kvfree(slot->arch.lpage_info[i]);
6354 slot->arch.lpage_info[i] = NULL; 6365 slot->arch.lpage_info[i] = NULL;
6355 } 6366 }
6356 return -ENOMEM; 6367 return -ENOMEM;
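The memslot lpage_info arrays now go through kvm_kvzalloc()/kvm_kvfree(), i.e. helpers that take small allocations from the slab allocator and fall back to vmalloc for large ones, freeing with whichever allocator the pointer came from. A minimal kernel-style sketch of such a pair (not necessarily the exact helpers introduced by this series):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Small allocations come from the slab; big ones fall back to vmalloc. */
static void *sketch_kvzalloc(unsigned long size)
{
	if (size > PAGE_SIZE)
		return vzalloc(size);
	return kzalloc(size, GFP_KERNEL);
}

/* Free with whichever allocator the pointer came from. */
static void sketch_kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}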